https://www.kaggle.com/datasets/vijaygiitk/multiclass-weather-dataset
The dataset contains 6-folders: 5-folders having each category of images and one with the alien-test having the images of all categories. It also consist a csv file having the labels for the images in alien-test folder.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
import os
import tqdm as tqdm
import cv2
# mount
from google.colab import drive
drive.mount('/gdrive')
with open('/gdrive/My Drive/data/dataset2', 'w') as f:
f.write('Hello Google Drive!')
!cat '/gdrive/My Drive/data/dataset2'
!unzip archive.zip
import os
root_dir = '/gdrive/My Drive/data/dataset'
image_label = os.listdir(root_dir)
image_label.remove('test.csv')
image_label
>>>>
['cloudy', 'shine', 'rainy', 'alien_test', 'foggy', 'sunrise']
import glob
fig, axes = plt.subplots(nrow=1, ncols=len(image_label), figsize=(20, 5))
for i, img_label in enumerate(image_label):
wfiles = glob.glob(f"{root_dir}/{img_label}/*")
wfiles = sorted(wfiles)
print(wfiles[0])
img = plt.imread(wfiles[1])
axes[i].imread(img)
axes[i].set_title(img_label)
def img_read_resize(img_path):
img = cv2.imread(img_path)
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
img = cv2.resize(img, (120, 120))
return img
img_path = f"{root_dir}/cloudy/cloudy131.jpg"
print(img_read_resize(img_path).shape)
# 이미지 사이즈 변경 확인
plt.imread(img_resize(img_path))
def img_folder_read(img_label):
img_files = []
labels = []
wfiles = glob.glob(f"{root_dir}/{img_label}/*")
wfiles = sorted(wfiles)
for w_img in wfiles:
try:
img_files.append(img_read_resize(w_img))
labels.append(img_label)
except:
continue
return img_files, labels
img_label = 'shine'
img_files, labels = img_folder_read(img_label)
len(img_files), len(labels), img_files[0].shape, labels[0]
>>>>
(249, 249, (120, 120, 3), 'shine')
image_label
>>>>
['cloudy', 'shine', 'rainy', 'alien_test', 'foggy', 'sunrise']
x_train_img = []
x_test_img = []
y_train_img = []
y_test_img = []
for img_label in tqdm.tqdm(image_label):
x_temp, y_temp = img.folder_read(img_label)
if img_label == 'alien_test':
x_test_img.extend(x_temp)
y_test_img.extend(y_temp)
else:
x_train_img.extend(x_temp)
y_train_img.extend(y_temp)
len(x_train_img), len(y_train_img), len(x_test_img), len(y_test_img)
>>>>
100%|█ █ █ █ █ █ █ █ █ █| 6/6 [00:22<00:00, 3.68s/it]
(1498, 1498, 30, 30)
# train 데이터 미리보기
plt.imshow(x_train_img[170])
plt.title(y_train_img[170])
# test 데이터 미리보기
plt.imshow(x_test_img[6])
plt.title(y_test_img[0])
x_train_arr = np.array(x_train_img)
y_train_arr = np.array(y_train_img)
x_test_arr = np.array(x_test_img)
y_test_arr = np.array(y_test_img)
x_train_arr.shape, y_train_arr.shape, x_test_arr.shape, y_test_arr.shape
>>>>
((1498, 120, 120, 3), (1498,), (30, 120, 120, 3), (30,))
from sklearn.model_selection import train_test_split
x_train_raw, x_valid_raw, y_train_raw, y_valid_raw = train_test_split(
x_train_arr, y_train_arr, test_size=0.33, random_state=42, stratify=y_train_arr)
x_train_raw.shape, x_valid_raw.shape, y_train_raw.shape, y_valid_raw.shape
>>>>
((1003, 120, 120, 3), (495, 120, 120, 3), (1003,), (495,))
x_train = x_train_raw / 255
x_valid = x_valid_raw / 255
x_test = x_test_arr / 255
x_train[0].max(), x_valid[0].max(), x_test[0].max()
>>>>
(1.0, 1.0, 1.0)
y_train_arr
>>>>
array(['cloudy', 'cloudy', 'cloudy', ..., 'sunrise', 'sunrise', 'sunrise'],
dtype='<U7')
from sklearn.preprocessing import LabelBinarizer
lb = LabelBinarizer()
lb.fit(y_train_arr)
print(lb.classes_)
y_train = lb.transform(y_train_raw)
y_valid = lb.transform(y_valid_raw)
y_train.shape, y_valid.shape
>>>>
['cloudy' 'foggy' 'rainy' 'shine' 'sunrise']
((1003, 5), (495, 5))
lb.classes_
>>>>
array(['cloudy', 'foggy', 'rainy', 'shine', 'sunrise'], dtype='<U7')
# 정답 인코딩이 제대로 되어있는지 확인
# 클래스가 train, valid 균일하게 나눠졌는지 확인
# pd.Series 형태로 구성해 준 이유는 np.array 형태이기 때문
pd.Series(y_train_raw).value_counts(1)
>>>>
sunrise 0.233300
cloudy 0.200399
foggy 0.200399
rainy 0.199402
shine 0.166500
dtype: float64
pd.Series(y_valid_raw).value_counts(1)
>>>>
sunrise 0.234343
rainy 0.200000
cloudy 0.200000
foggy 0.200000
shine 0.165657
dtype: float64
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPool2D, Dense, Flatten, Dropout
y_train.shape
>>>>
(1003, 5)
# 예측할 class의 수가 shape의 열의 개수가 된다.
n_class = y_train.shape[1]
n_class
>>>>
5
model = Sequential()
model.add(Conv2D(filter=16, kernel_size=(3, 3),
activation='relu', input_shape=x_train[o].shape))
model.add(MaxPool2D(2, 2))
model.add(Dropout(0.2))
model.add(Conv2D(filters=64, kernel_size=(3,3), padding='same',
activation='relu'))
model.add(MaxPool2D(pool_size=(2,2), strides=1))
model.add(Dropout(0.2))
# Flatten => 1차원 형태로 변환하고 Fully-connected Layer로 전달하기 위해
model.add(Flatten())
model.add(Dense(units=64, activation="relu"))
model.add(Dense(units=32, activation='relu'))
# 출력층
model.add(Dense(n_class, activation='softmax'))
model.summary()
>>>>
Model: "sequential_25"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
conv2d_26 (Conv2D) (None, 118, 118, 16) 448
max_pooling2d_26 (MaxPoolin (None, 59, 59, 16) 0
g2D)
dropout_23 (Dropout) (None, 59, 59, 16) 0
conv2d_27 (Conv2D) (None, 59, 59, 64) 9280
max_pooling2d_27 (MaxPoolin (None, 58, 58, 64) 0
g2D)
dropout_24 (Dropout) (None, 58, 58, 64) 0
flatten_12 (Flatten) (None, 215296) 0
dense_38 (Dense) (None, 64) 13779008
dense_39 (Dense) (None, 32) 2080
dense_40 (Dense) (None, 5) 165
=================================================================
Total params: 13,790,981
Trainable params: 13,790,981
Non-trainable params: 0
_________________________________________________________________
model.compile(optimizer='adam',
loss='categorical_crossentropy',
metrics=['accuracy']
)
from tensorflow.keras.callbacks import EarlyStopping
earlystop = EarlyStopping(monitor="val_accuracy", patience=5, verbose=1)
history = model.fit(x_train, y_train, validation_data=(x_valid, y_valid),
epochs=20, callbacks=[earlystop])
df_hist = pd.DataFrame(history.history)
df_hist.tail
>>>>
<bound method NDFrame.tail of
loss accuracy val_loss val_accuracy
0 3.025001 0.258225 1.388153 0.418182
1 1.199404 0.444666 1.196450 0.569697
2 0.952285 0.622134 1.196582 0.519192
3 0.758515 0.705882 0.905438 0.692929
4 0.666997 0.752742 0.809039 0.713131
5 0.579274 0.768694 0.895901 0.692929
6 0.446963 0.828514 0.775644 0.729293
7 0.380925 0.853440 0.712370 0.763636
8 0.350297 0.879362 0.925306 0.662626
9 0.288178 0.897308 0.833619 0.737374
10 0.266054 0.899302 0.738016 0.759596
11 0.175380 0.941176 0.797875 0.777778
12 0.211277 0.922233 0.812663 0.783838
13 0.157780 0.947159 1.045204 0.745455
14 0.123782 0.955135 0.879894 0.777778
15 0.088632 0.972084 0.945401 0.787879
16 0.057009 0.982054 1.058660 0.793939
17 0.062206 0.975075 0.959297 0.795960
18 0.070612 0.976072 1.113115 0.769697
19 0.059526 0.980060 1.009745 0.800000>
df_hist[['accuracy', 'val_accuracy']].plot()
df_hist[['loss', 'val_loss']].plot()
# softmax로 출력했기 때문에 합이 1이 되는 값으로 출력
y_pred = model.predict(x_test)
y_pred[0]
>>>>
array([4.6532657e-02, 9.2098981e-01, 3.1598058e-02, 3.0585946e-04,
5.7366164e-04], dtype=float32)
# TF로 예측한 값을 csv파일과 비교
test = pd.read_csv(f'{root_dir}/test.csv')
y_test = test['labels']
y_predict = np.argmax(y_pred, axis=1)
y_predict[:5]
>>>>
array([1, 0, 0, 1, 1])
(y_test == y_predict).mean()
>>>>
0.8
test.head()
>>>>
Image_id labels
0 Cloud_1.png 0
1 Cloud_2.jpg 0
2 Cloud_3.jpeg 0
3 Cloud_4.jpg 0
4 foggy_1.jpg 1
fig, axes = plt.subplots(nrows=6, ncols=5, figsize=(20, 20))
for i tcsv in test.iterrows():
col = i % 5
row = i // 5
Image_id = tcsv['Image_id']
img_label = tcsv['labels']
img = plt.imread(f'{root_dir}/alien_test/{Image_id}')
color = 'red'
if img_label == y_predict[i]:
color = 'blue'
axes[row, col].imshow(img)
axes[row, col].set_title(f"test : {class_name[img_label]}, predict : {class_name[y_predict[i]]}', color=color)