https://www.kaggle.com/datasets/iarunava/cell-images-for-detecting-malaria
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# From layers we import Conv2D, MaxPool2D, Dropout, Flatten, and Dense.
# From callbacks we import EarlyStopping.
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Conv2D, MaxPool2D, Dropout, Flatten, Dense
from tensorflow.keras.callbacks import EarlyStopping
!wget https://data.lhncbc.nlm.nih.gov/public/Malaria/cell_images.zip
!unzip cell_images.zip
import os

# Walk the extracted dataset and print, for every directory, its path, its
# sub-directories, and the number of files it contains.
for root, dirs, files in os.walk("./cell_images/"):
    print(root, dirs, len(files))
>>>>
./cell_images/ ['Parasitized', 'Uninfected'] 0
./cell_images/Parasitized [] 13780
./cell_images/Uninfected [] 13780
import glob

# Gather the PNG paths of both classes: `upics` holds the uninfected cells,
# `apics` the parasitized ones.
png_pattern = './cell_images/{}/*.png'
upics = glob.glob(png_pattern.format('Uninfected'))
apics = glob.glob(png_pattern.format('Parasitized'))

# Notebook-style check: image count and first path of each class.
len(upics), upics[0], len(apics), apics[0]
>>>>
(13779,
'./cell_images/Uninfected/C174P135NThinF_IMG_20151127_135311_cell_143.png',
13779,
'./cell_images/Parasitized/C168P129ThinF_IMG_20151118_154126_cell_159.png')
# Preview the first image of each class: `upics_0` is the first uninfected
# path, `apics_0` the first parasitized path.
upics_0, apics_0 = upics[0], apics[0]

# Uninfected sample
upics_0_img = plt.imread(upics_0)
plt.imshow(upics_0_img)

# Parasitized sample
apics_0_img = plt.imread(apics_0)
plt.imshow(apics_0_img)
import cv2
# Show the first nine uninfected cells in a 3x3 grid; each panel is titled
# with the class label and the image shape.
plt.figure(figsize=(8, 8))
labels = 'Uninfected'
for i, images in enumerate(upics[:9]):
    plt.subplot(3, 3, i + 1)
    # cv2.imread returns channels in BGR order while matplotlib expects RGB,
    # so convert before displaying to keep the colours faithful.
    img = cv2.cvtColor(cv2.imread(images), cv2.COLOR_BGR2RGB)
    plt.imshow(img)
    plt.title(f'{labels} {img.shape}')
    plt.axis('off')
# Show the first nine parasitized cells in a 3x3 grid; each panel is titled
# with the class label and the image shape.
plt.figure(figsize=(8, 8))
labels = "Infected"
for i, images in enumerate(apics[:9]):
    plt.subplot(3, 3, i + 1)
    # cv2.imread returns channels in BGR order while matplotlib expects RGB,
    # so convert before displaying to keep the colours faithful.
    img = cv2.cvtColor(cv2.imread(images), cv2.COLOR_BGR2RGB)
    plt.imshow(img)
    plt.title(f'{labels} {img.shape}')
    plt.axis("off")
from tensorflow.keras.preprocessing.image import ImageDataGenerator
# validation_split=0.2 divides the images into training and validation
# subsets at an 8:2 ratio; rescale applies a 1/255 factor to pixel values.
datagen = ImageDataGenerator(validation_split=0.2, rescale=1 / 255.0)

# Size that images are resized to when loaded (used as target_size below).
width, height = 16, 16
Load the images with flow_from_directory.
Create the training dataset.
Because this is a binary classification problem, set class_mode to 'binary'.
class_mode : One of 'categorical', 'binary', 'sparse',
'input', or None.
Default : 'categorical'.
subset : Subset of data {'training' or 'validation'}
# Training subset of the split, read from disk in batches of 64 with binary
# class labels.
trainDatagen = datagen.flow_from_directory(
    'cell_images/',
    subset='training',
    target_size=(height, width),
    batch_size=64,
    class_mode='binary',
)
>>>>
Found 22048 images belonging to 2 classes.
# Number of classes detected by the training generator.
trainDatagen.num_classes
>>>> 2
# Array of integer class indices held by the training generator.
trainDatagen.classes
>>>>
array([0, 0, 0, ..., 1, 1, 1], dtype=int32)
# {'Parasitized' : 0, 'Uninfected' : 1}
# 0: infected, 1: uninfected
trainDatagen.class_indices
>>>>
{'Parasitized': 0, 'Uninfected': 1}
Create the validation dataset.
# Validation subset of the split, read with the same settings as the training
# generator (batches of 64, binary class labels).
valDatagen = datagen.flow_from_directory(
    'cell_images/',
    subset='validation',
    target_size=(height, width),
    batch_size=64,
    class_mode='binary',
)
# CNN for binary classification: two (Conv2D x2 + MaxPool2D) stages, dropout,
# then a small fully-connected head ending in one sigmoid unit.
model = Sequential([
    # Input layer
    Conv2D(16, (3, 3), padding='valid', activation='relu',
           input_shape=(height, width, 3)),
    Conv2D(16, (3, 3), padding='same', activation='relu'),
    MaxPool2D(pool_size=(2, 2), strides=1),

    Conv2D(16, (3, 3), padding='same', activation='relu'),
    Conv2D(16, (3, 3), padding='same', activation='relu'),
    MaxPool2D(pool_size=(2, 2), strides=1),

    # Replace 20% of the activations with 0 to reduce overfitting.
    Dropout(0.2),

    # Fully-connected layers
    Flatten(),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),

    # Output layer => binary classification
    Dense(1, activation='sigmoid'),
])

# Print the layer-by-layer summary.
model.summary()
# Visualize the layers with plot_model from tensorflow.keras.utils.
from tensorflow.keras.utils import plot_model
plot_model(model)
# Configure training: Adam optimizer, binary cross-entropy loss (the model
# ends in a single sigmoid unit), and accuracy as the reported metric.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
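Batch: the group of samples fed to the model at one time during training.
Epoch: the number of times the model has trained on the entire dataset.
Step: (for model training) the number of batches that have been trained on.
EarlyStopping: stops training when performance no longer improves.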
# Stop training once val_loss has not improved for 5 consecutive epochs.
# An earlier definition with patience=2 was overwritten immediately by this
# one, so only the effective configuration is kept.
early_stop = EarlyStopping(monitor='val_loss', patience=5)
# Train for up to 1000 epochs, validating on valDatagen after each epoch; the
# early-stopping callback normally ends the run much sooner.
# `callbacks` is documented as a list of callbacks, so wrap the single one.
history = model.fit(trainDatagen, epochs=1000, validation_data=valDatagen,
                    callbacks=[early_stop])
# Collect the metrics recorded by fit() into a DataFrame and peek at the
# last rows.
df_hist = pd.DataFrame(history.history)
df_hist.tail()

# Plot training vs. validation curves: loss first, then accuracy.
for curve_pair in (['loss', 'val_loss'], ['accuracy', 'val_accuracy']):
    df_hist[curve_pair].plot()