1970년대 보스턴 지역의 주택 가격을 예측하는 회귀(연속적인 값) 문제
# Load the Boston housing regression dataset that ships with Keras.
from tensorflow.keras.datasets.boston_housing import load_data

# test_split=0.2 holds out 20% of the samples as the test set; seed=777 is the
# seed handed to the loader so the same split is produced on every run.
(X_train, y_train), (X_test, y_test) = load_data(path='boston_housing.npz', test_split=0.2, seed=777)

# Inspect the shape of the training features (displayed when run as a cell).
X_train.shape
# Recorded output: X_train.shape -> (404, 13)
import numpy as np

# 1) Feature preprocessing -> standardization: subtract the mean from each
#    value, then divide by the standard deviation.
# Both statistics are taken along axis 0 (over all rows, one value per feature)
# of the training split and are reused unchanged for the test split.
mean = X_train.mean(axis=0)
std = X_train.std(axis=0)
X_train, X_test = (X_train - mean) / std, (X_test - mean) / std
from sklearn.model_selection import train_test_split

# 2) Split the training data into a training set and a validation set.
# test_size=0.33 sends 33% of the rows to the validation set; random_state=777
# fixes the shuffle so the split is the same on every run.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.33, random_state=777)
# Import from tensorflow.keras (not the standalone keras package) so the model
# uses the same Keras implementation as the tensorflow.keras dataset loader.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Fully connected regression network over the 13 input features.
model = Sequential([
    Dense(64, activation='relu', input_shape=(13,)),
    Dense(32, activation='relu'),
    Dense(1),  # single output value; Dense's default activation is linear
])

# MSE is the training loss; 'mae' and 'mse' are also tracked as metrics so the
# history exposes 'mae'/'mse' and their 'val_'-prefixed counterparts.
model.compile(optimizer='adam', loss='mse', metrics=['mae', 'mse'])

# Train for 300 epochs, evaluating on the validation split after each epoch.
history = model.fit(X_train, y_train, epochs=300, validation_data=(X_val, y_val))
import matplotlib.pyplot as plt

# Per-epoch curves recorded by fit(); when validation data is supplied the
# validation values are stored under keys carrying the 'val_' prefix.
his_dict = history.history
mse, val_mse = his_dict['mse'], his_dict['val_mse']
mae, val_mae = his_dict['mae'], his_dict['val_mae']
epochs = range(1, len(mse) + 1)

# One row, two panels: MSE (the loss) on the left, MAE on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
panels = (
    (ax1, 'mse', mse, val_mse),
    (ax2, 'mae', mae, val_mae),
)
for axis, metric, train_curve, val_curve in panels:
    # Training curve in blue, validation curve in orange.
    axis.plot(epochs, train_curve, color='blue', label='train_' + metric)
    axis.plot(epochs, val_curve, color='orange', label='val_' + metric)
    axis.set_title('train and val ' + metric)
    axis.set_xlabel('epochs')
    axis.set_ylabel(metric)
    axis.legend()

plt.show()
# Report the loss and metrics on the held-out test split.
model.evaluate(X_test, y_test)

# Flatten the predictions to 1-D so they can be scattered against y_test.
test_predictions = model.predict(X_test).flatten()

plt.scatter(y_test, test_predictions)
plt.xlabel('True Values [Price]')
plt.ylabel('Predictions [Price]')

# Use the same scale on both axes, then a square plotting area.
for axis_mode in ('equal', 'square'):
    plt.axis(mode := axis_mode) if False else plt.axis(axis_mode)

# Start both axes at 0 while keeping their current upper limits.
x_upper = plt.xlim()[1]
plt.xlim([0, x_upper])
y_upper = plt.ylim()[1]
plt.ylim([0, y_upper])

# Diagonal reference line: points on it are perfect predictions.
_ = plt.plot([-100, 100], [-100, 100])
K-Fold 사용하기
- 데이터 개수가 적을 경우 성능을 향상시킬 수 있는 좋은 방법 : 교차검증
- 결과를 확인해보면 모든 모델이 전부 좋은 성능을 가지지는 않음
-> 각 폴드에서 사용한 학습, 검증 데이터가 다르기 때문
-> 두번째 모델은 상대적으로 테스트 데이터와 더 비슷한 분포의 데이터를 학습했다고 볼 수 있음
- 이 때문에 최종 학습 결과는 평균을 사용
# K-Fold cross-validation on the Boston housing data: train one model per fold
# and average the test-set MAE over all folds.
from tensorflow.keras.datasets.boston_housing import load_data
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import numpy as np
from sklearn.model_selection import KFold

(x_train, y_train), (x_test, y_test) = load_data(path='boston_housing.npz', test_split=0.2, seed=777)

# Standardize the data with the training-split statistics.
mean = np.mean(x_train, axis=0)
std = np.std(x_train, axis=0)
# Everything up to this point is the same as in the single-split version.
x_train = (x_train - mean) / std
x_test = (x_test - mean) / std

# ----------------------------------------
# Run K-Fold.
k = 3

# Split the given dataset into k equal parts. With k = 3 the training set
# (404 samples) is cut into three parts: one serves as the validation set and
# the other two as the training set.
kfold = KFold(n_splits=k)


def get_model():
    """Build and compile a fresh regression network.

    Defined as a function so that every fold can start from a newly
    constructed model instead of reusing one that was already trained.

    Returns:
        A compiled Sequential model (adam optimizer, MSE loss, MAE metric).
    """
    model = Sequential()
    model.add(Dense(64, activation='relu', input_shape=(13,)))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1))
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model


# Collects the test-set MAE obtained by the model of each fold.
mae_list = []

# Repeat k times, once per fold.
for train_index, val_index in kfold.split(x_train):
    # NOTE: KFold is built without shuffle=True, so the folds are consecutive
    # index ranges, not random ones. Pass shuffle=True (with a random_state)
    # to KFold if randomly drawn folds are wanted.
    x_train_fold, x_val_fold = x_train[train_index], x_train[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

    # Get a new, untrained model for this fold.
    model = get_model()
    model.fit(x_train_fold, y_train_fold, epochs=300,
              validation_data=(x_val_fold, y_val_fold))

    # evaluate() returns [loss, mae]; only the MAE is kept.
    _, test_mae = model.evaluate(x_test, y_test)
    mae_list.append(test_mae)

# Final score: mean test MAE across the k folds.
print(np.mean(mae_list))
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Dense
from tensorflow.keras.optimizers import RMSprop
import warnings
# Suppress all warnings for the rest of the script.
warnings.filterwarnings('ignore')

# Fetch the Auto MPG data file and get the local path it was stored at.
dataset_path = tf.keras.utils.get_file(
    "auto-mpg.data",
    "http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data",
)

# Columns: fuel economy (MPG), cylinders, displacement, horsepower, weight,
# acceleration, model year, country of origin (1: USA, 2: Europe, 3: Japan).
column_names = [
    'MPG', 'Cylinders', 'Displacement', 'Horsepower',
    'Weight', 'Acceleration', 'Model Year', 'Origin',
]

# '?' marks missing values; text after a tab is treated as a comment; fields
# are space-separated and spaces right after a delimiter are skipped.
raw_dataset = pd.read_csv(
    dataset_path,
    names=column_names,
    na_values='?',
    comment='\t',
    sep=' ',
    skipinitialspace=True,
)

# Work on a copy so the raw frame stays untouched.
dataset = raw_dataset.copy()
๐
- na_values = ํน์ ๊ธฐํธ(์ฌ๊ธฐ์๋ '?') ๋ค์ null ์ฒ๋ฆฌ
- comment = '\t' ๋ถํฐ๋์ค๋ ํน์ ๋ฌธ์๋ ์ฃผ์์ผ๋ก ๊ฐ์ฃผํ์ฌ ์ฝ์ง ์์
- sep = ์ด๋ค ๊ธฐ์ค์ผ๋ก ์ฝ์ด๋ค์ผ์ง
- skipinitialspace : True์ ๊ฒฝ์ฐ, delimiter์ ์งํ์ ์๋ ๊ณต๋ฐฑ์ ๋ฌด์
# Missing-value count per column, before and after dropping incomplete rows.
dataset.isna().sum()
dataset = dataset.dropna()
dataset.isna().sum()

# Origin (country of manufacture) is a numeric code that is really categorical:
# pop the column out of the frame and one-hot encode it as 1.0/0.0 columns.
origin = dataset.pop('Origin')
for origin_code, country in ((1, 'USA'), (2, 'Europe'), (3, 'Japan')):
    dataset[country] = (origin == origin_code) * 1.0

# Randomly sample 80% of the rows for training; the rows whose index was not
# sampled (the remaining 20%) become the test set.
train_dataset = dataset.sample(frac=0.8, random_state=0)
test_dataset = dataset.drop(train_dataset.index)

# Separate the target column (MPG) from the features.
train_labels = train_dataset.pop('MPG')
test_labels = test_dataset.pop('MPG')
# Show the training features (displayed when run as a notebook cell).
train_dataset

# Column-wise statistics of the training features. ddof=0 (population standard
# deviation) is what np.std uses by default, unlike DataFrame.std's own default.
mean = train_dataset.mean(axis=0)
std = train_dataset.std(axis=0, ddof=0)

# Standardize both splits with the training statistics only.
train_dataset = (train_dataset - mean) / std
test_dataset = (test_dataset - mean) / std

# Preview the first standardized rows.
train_dataset.head()
몇 개의 feature -> len(train_dataset.keys())
# Fix the random seeds for reproducibility. np.random.seed only seeds NumPy's
# generator; tf.keras.utils.set_random_seed additionally seeds Python's
# `random` module and TensorFlow, which Keras relies on for weight
# initialization and shuffling. (Requires TensorFlow >= 2.7.)
np.random.seed(7)
tf.keras.utils.set_random_seed(7)

# The input width is the number of feature columns in the training frame.
n_features = len(train_dataset.keys())

# Two hidden ReLU layers followed by a single-unit output layer.
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=[n_features]))
model.add(Dense(64, activation='relu'))
model.add(Dense(1))  # default activation is linear

# Configure: RMSprop with learning rate 0.001, MSE loss, MAE/MSE as metrics.
model.compile(optimizer=RMSprop(0.001), loss='mse', metrics=['mae', 'mse'])

# Train for 500 epochs (no validation data is passed here).
history = model.fit(train_dataset, train_labels, epochs=500)
import matplotlib.pyplot as plt

# Per-epoch training curves recorded by fit(); this run had no validation
# data, so only the training metrics are plotted.
his_dict = history.history
mse = his_dict['mse']
mae = his_dict['mae']
epochs = range(1, len(mse) + 1)

# One row, two panels: training MSE on the left, training MAE on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
for axis, metric, train_curve in ((ax1, 'mse', mse), (ax2, 'mae', mae)):
    axis.plot(epochs, train_curve, color='blue', label='train_' + metric)
    axis.set_title('train ' + metric)
    axis.set_xlabel('epochs')
    axis.set_ylabel(metric)
    axis.legend()

plt.show()
# Evaluate on the test split; the values come back in the order
# loss, then the compiled metrics ('mae', 'mse').
loss, mae, mse = model.evaluate(test_dataset, test_labels)

# The message reads "Test set mean absolute error: <mae> MPG". The literal was
# restored to a single line after encoding damage had split it across lines.
print('테스트셋의 평균 절대 오차 : {:5.2f} MPG'.format(mae))

# Flatten the predictions to 1-D so they can be scattered against the labels.
test_predictions = model.predict(test_dataset).flatten()

plt.scatter(test_labels, test_predictions)
plt.xlabel('True Values [MPG]')
plt.ylabel('Predictions [MPG]')

# Same scale on both axes, then a square plotting area.
plt.axis('equal')
plt.axis('square')

# Start both axes at 0 while keeping their current upper limits.
plt.xlim([0, plt.xlim()[1]])
plt.ylim([0, plt.ylim()[1]])

# Diagonal reference line: points on it are perfect predictions.
_ = plt.plot([-100, 100], [-100, 100])
๐
์ฐ๋น๊ฐ ๋์์ง์๋ก ์ ํ๋๊ฐ ๋จ์ด์ง
์ฐ๋น๊ฐ ๋ฎ์ ๋ชจ๋ธ๋ค์ feature๋ค์ด ์ ๋ฐ์๋จ