👌 Regression Problems 👌

parkeu · September 29, 2022

🏠 Boston Housing Price Prediction

Problem Definition

A regression problem (predicting a continuous value): predict housing prices in the Boston area in the 1970s.

Downloading the Boston housing price dataset

from tensorflow.keras.datasets.boston_housing import load_data
(X_train, y_train), (X_test, y_test) = load_data(path='boston_housing.npz', test_split = 0.2, seed=777)
X_train.shape  # (404, 13)

Data preprocessing and creating a validation dataset

import numpy as np

# 1) Feature preprocessing -> standardization: subtract the mean from each value, then divide by the standard deviation
mean = np.mean(X_train, axis=0)  # per-feature mean over all rows
std = np.std(X_train, axis=0)    # per-feature standard deviation

X_train = (X_train - mean) / std 
X_test = (X_test - mean) / std
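The same standardization can also be done with scikit-learn's StandardScaler, fitted on the training data only so that the test set is scaled with the training statistics. A minimal sketch of the equivalent (the *_std variable names are just for illustration, and it assumes the raw, unscaled arrays):

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)  # learns mean/std from the training data only
X_test_std = scaler.transform(X_test)        # reuses the training statistics on the test data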

from sklearn.model_selection import train_test_split
# 2) Split into a training set and a validation set

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.33, random_state=777)

Building the model

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(13,)))
model.add(Dense(32, activation='relu'))
model.add(Dense(1)) # single output value; the default activation is linear
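To inspect the network that was just built, Keras' summary method can be called; a quick optional check that is not in the original post:

model.summary()  # prints each layer's output shape and parameter count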

Compiling and training the model

model.compile(optimizer='adam', loss='mse', metrics=['mae', 'mse'])
history = model.fit(X_train, y_train, epochs=300, validation_data=(X_val, y_val))

Plotting the training results

import matplotlib.pyplot as plt

his_dict = history.history
mse = his_dict['mse']
val_mse = his_dict['val_mse'] # metrics computed on the validation data get the 'val_' prefix

epochs = range(1, len(mse) + 1)
fig = plt.figure(figsize = (10, 5))

# plot training and validation loss (MSE)
ax1 = fig.add_subplot(1, 2, 1)
ax1.plot(epochs, mse, color = 'blue', label = 'train_mse')
ax1.plot(epochs, val_mse, color = 'orange', label = 'val_mse')
ax1.set_title('train and val mse')
ax1.set_xlabel('epochs')
ax1.set_ylabel('mse')
ax1.legend()

mae = his_dict['mae']
val_mae = his_dict['val_mae']

# plot training and validation MAE
ax2 = fig.add_subplot(1, 2, 2)
ax2.plot(epochs, mae, color = 'blue', label = 'train_mae')
ax2.plot(epochs, val_mae, color = 'orange', label = 'val_mae')
ax2.set_title('train and val mae')
ax2.set_xlabel('epochs')
ax2.set_ylabel('mae')
ax2.legend()

plt.show()


Evaluating the model

model.evaluate(X_test, y_test)
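Because the model was compiled with metrics=['mae', 'mse'], evaluate returns the loss followed by those metrics in the same order. A small usage sketch (the variable names are only for illustration):

test_loss, test_mae, test_mse = model.evaluate(X_test, y_test)
print('Test MAE: {:.2f}'.format(test_mae))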

Plotting the model's predictions

test_predictions = model.predict(X_test).flatten()

plt.scatter(y_test, test_predictions)
plt.xlabel('True Values [Price]')
plt.ylabel('Predictions [Price]')
plt.axis('equal')
plt.axis('square')
plt.xlim([0,plt.xlim()[1]])
plt.ylim([0,plt.ylim()[1]])
_ = plt.plot([-100, 100], [-100, 100])


Using K-Fold cross-validation

  • Cross-validation is a good way to improve performance when there is only a small amount of data.
  • Looking at the results, not every fold's model performs equally well
    -> because each fold trains and validates on different data
    -> e.g. the second model may simply have trained on data whose distribution is closer to that of the test set
  • For this reason, the final result is reported as the average over the folds.
from tensorflow.keras.datasets.boston_housing import load_data
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

import numpy as np
from sklearn.model_selection import KFold

(x_train, y_train), (x_test, y_test) = load_data(path='boston_housing.npz',
                                                 test_split=0.2,
                                                 seed=777)

# Standardize the data
mean = np.mean(x_train, axis=0)
std = np.std(x_train, axis=0)
# Everything up to here is the same as before.
x_train = (x_train - mean) / std
x_test = (x_test - mean) / std

#----------------------------------------
# Now run K-Fold.
k = 3

# Split the given dataset into k equal parts.
# Here k = 3, so the training set (404 samples) is split into thirds:
# one part is used for validation and the other two for training.
kfold = KFold(n_splits=k)

# Define a function that returns a fresh model, so a new one can be built for each fold.

def get_model():
    model = Sequential()
    model.add(Dense(64, activation = 'relu', input_shape = (13, )))
    model.add(Dense(32, activation = 'relu')) 
    model.add(Dense(1))   

    model.compile(optimizer = 'adam', loss = 'mse', metrics = ['mae'])

    return model

mae_list = [] # list that will collect the test-set MAE from each fold

# Repeat k times.
for train_index, val_index in kfold.split(x_train):
    # kfold.split yields index arrays for the training and validation folds.
    # Note: KFold does not shuffle by default; pass shuffle=True (with a random_state) for randomized folds.
    x_train_fold, x_val_fold = x_train[train_index], x_train[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

    # Build a fresh model for this fold.
    model = get_model()

    model.fit(x_train_fold, y_train_fold, epochs = 300, validation_data = (x_val_fold, y_val_fold))

    _, test_mae = model.evaluate(x_test, y_test)
    mae_list.append(test_mae)

print(np.mean(mae_list)) 
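As noted in the bullets above, the folds do not all perform equally well; printing the per-fold scores next to the mean makes that visible. A minimal sketch:

# Per-fold test MAE, plus the mean and spread across folds
print('fold MAE:', [round(m, 3) for m in mae_list])
print('mean MAE: {:.3f}, std: {:.3f}'.format(np.mean(mae_list), np.std(mae_list)))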

🚙 Auto MPG (Fuel Efficiency) Prediction

Model architecture

  • The goal of regression is to predict a continuous output value, such as a price or a probability.
  • Build a model that predicts the fuel efficiency (MPG) of cars from the late 1970s and early 1980s.
  • The second layer is a Dense layer with 64 units and relu activation; the final output Dense layer has a single neuron (the MPG prediction) with linear activation.

Training configuration

  • loss='mse', optimizer=RMSprop(0.001), metrics=['mae','mse']

Importing libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import RMSprop

import warnings
warnings.filterwarnings('ignore')

Preparing the data

dataset_path = tf.keras.utils.get_file("auto-mpg.data", "http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data")

# MPG, cylinders, displacement, horsepower, weight, acceleration, model year, origin (1: USA, 2: Europe, 3: Japan)
column_names = ['MPG','Cylinders','Displacement','Horsepower','Weight', 'Acceleration', 'Model Year', 'Origin']


raw_dataset = pd.read_csv(dataset_path, names=column_names, na_values='?', comment='\t', sep=' ', skipinitialspace=True)

dataset = raw_dataset.copy()

👀

  • na_values: treat the given marker (here '?') as missing (NaN)
  • comment='\t': anything from a tab character to the end of the line is treated as a comment and not read
  • sep: the delimiter used to split fields (here a space)
  • skipinitialspace: if True, whitespace right after the delimiter is ignored (a toy example follows below)
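A toy example of how these read_csv options behave, using made-up data instead of the real file:

import io
import pandas as pd

sample = "18.0  8   ?  3504.0\tsome car name\n15.0  8  165.0  3693.0\n"
toy = pd.read_csv(io.StringIO(sample),
                  names=['MPG', 'Cylinders', 'Horsepower', 'Weight'],
                  na_values='?',          # '?' is read as NaN
                  comment='\t',           # everything after a tab is ignored
                  sep=' ',                # fields are space-separated
                  skipinitialspace=True)  # extra spaces after a separator are skipped
print(toy)  # the '?' shows up as NaN and the text after the tab is gone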


Data preprocessing

dataset.isna().sum()        # Horsepower has 6 missing values
dataset = dataset.dropna()  # drop the rows with missing values
dataset.isna().sum()        # no missing values remain

# Origin is a numeric country code -> treat it as categorical and one-hot encode it
origin = dataset.pop('Origin')  # remove the column and keep it as a Series
dataset['USA'] = (origin == 1) * 1.0
dataset['Europe'] = (origin == 2) * 1.0
dataset['Japan'] = (origin == 3) * 1.0
dataset.head()
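The same one-hot encoding can also be written with pandas' get_dummies. This is a sketch of an equivalent alternative, not what the code above does; it reuses the popped origin Series:

# Map the numeric codes to country names, then one-hot encode them.
origin_named = origin.map({1: 'USA', 2: 'Europe', 3: 'Japan'})
one_hot = pd.get_dummies(origin_named).astype('float64')
one_hot.head()  # USA / Europe / Japan columns, matching the ones built by hand above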

Splitting into training and test data

train_dataset = dataset.sample(frac=0.8, random_state=0)  # randomly sample 80% of the rows
test_dataset = dataset.drop(train_dataset.index)          # the remaining 20% become the test set
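A quick sanity check (not in the original post) that the two parts are complementary and roughly 80% / 20% of the rows:

print(len(train_dataset), len(test_dataset), len(dataset))  # the two splits add up to the full dataset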

Separating X (features, independent variables) from y (labels, dependent variables)

train_labels = train_dataset.pop('MPG')
test_labels = test_dataset.pop('MPG')
train_dataset  # check that the MPG column has been removed from the features

Standardizing the data

mean = np.mean(train_dataset, axis=0)
std = np.std(train_dataset, axis=0)

train_dataset = (train_dataset - mean) / std
test_dataset = (test_dataset - mean) / std

train_dataset.head()


Building the model

Number of input features: len(train_dataset.keys())

np.random.seed(7)

model = Sequential()
model.add(Dense(64, activation='relu', input_shape=[len(train_dataset.keys())]))
model.add(Dense(64, activation='relu'))
model.add(Dense(1)) # single output; the default activation is linear

Compiling and training

# compile
model.compile(optimizer=RMSprop(0.001), loss='mse', metrics=['mae','mse'])
# train
history = model.fit(train_dataset, train_labels, epochs=500)

Plotting the training results

import matplotlib.pyplot as plt

his_dict = history.history
mse = his_dict['mse']

epochs = range(1, len(mse) + 1)
fig = plt.figure(figsize = (10, 5))

# plot the training loss (MSE)
ax1 = fig.add_subplot(1, 2, 1)
ax1.plot(epochs, mse, color = 'blue', label = 'train_mse')
ax1.set_title('train mse')
ax1.set_xlabel('epochs')
ax1.set_ylabel('mse')
ax1.legend()

mae = his_dict['mae']

# plot the training MAE
ax2 = fig.add_subplot(1, 2, 2)
ax2.plot(epochs, mae, color = 'blue', label = 'train_mae')
ax2.set_title('train mae')
ax2.set_xlabel('epochs')
ax2.set_ylabel('mae')
ax2.legend()

plt.show()


Evaluating the model

loss, mae, mse = model.evaluate(test_dataset, test_labels)
print('Mean absolute error on the test set: {:5.2f} MPG'.format(mae))

Plotting the model's predictions

test_predictions = model.predict(test_dataset).flatten()

plt.scatter(test_labels, test_predictions)
plt.xlabel('True Values [MPG]')
plt.ylabel('Predictions [MPG]')
plt.axis('equal')
plt.axis('square')
plt.xlim([0,plt.xlim()[1]])
plt.ylim([0,plt.ylim()[1]])
_ = plt.plot([-100, 100], [-100, 100])

👀
The higher the fuel efficiency, the less accurate the predictions become.
For cars with low fuel efficiency, the features are reflected well in the predictions.
