

주어진 데이터를 활용하여 타이타닉호에 탑승한 승객들의 생존여부(survival)을 예측하는 모델을 구현한다.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report
# Raw data loading: read the Titanic training CSV into `df` and preview it.
train_csv_path = '/content/drive/MyDrive/KDT/data/Titanic/train.csv'
df = pd.read_csv(train_csv_path)
# NOTE(review): `display` is not imported in this file - presumably the
# IPython/Colab notebook built-in; confirm before running as a plain script.
display(df)

# Build `df1` without the columns that are not used as model inputs;
# `df` itself is left unmodified.
unused_columns = ['PassengerId', 'Name', 'Ticket', 'Fare', 'Cabin']
df1 = df.drop(columns=unused_columns)
์ฐ์ , ํ์์๊ฑฐ๋ ์๋ฏธ๊ฐ ์ค๋ณต๋๋ feature๋ฅผ ์ญ์ ํ์๋ค. ์ฐ์ PassengerId, Name, Ticket๊ณผ ๊ฐ์ด ์์กด์ฌ๋ถ์ ์๊ด์๋ feature๋ฅผ ์ญ์ ํ๊ณ Pclass์ ๋น์ทํ ์๋ฏธ๋ฅผ ๊ฐ์ง Fare์ ์ญ์ ํ์๋ค. ๋ํ ๊ฒฐ์ธก์น๊ฐ ์ ์ฒด ๋ฐ์ดํฐ์ ์ฝ 70%๊ฐ ๋๋ Cabin์ ๊ฒฝ์ฐ ๋น๊ต์ ์ ์ ๋ฐ์ดํฐ์์ ์์๋ก ๊ฒฐ์ธก์น๋ฅผ ๋์ฒดํ๊ฒ ๋๋ฉด ์ ์ฒด ๋ฐ์ดํฐ์ ์๊ณก์ด ๋ฐ์ํ ์๋ ์์ผ๋ฏ๋ก ์ญ์ ํ์๋ค.
# Combine SibSp and Parch into a single `Family` column on `df1`,
# then create `df2` without the two source columns.
df1['Family'] = df1['SibSp'].add(df1['Parch'])
df2 = df1.drop(columns=['SibSp', 'Parch'])
ํจ๊ป ํ์นํ ํ์ ์๋งค, ๋ฐฐ์ฐ์์ ์๋ฅผ ๋ด๊ณ ์๋ SibSp์ ํจ๊ป ํ์นํ ๋ถ๋ชจ, ์์์ ์๋ฅผ ๋ด๊ณ ์๋ Parch๋ ์๋ฏธ๊ฐ ๋น์ทํ๊ธฐ์ ์ด ๋์ ๋ํด์ ์๋ก์ด ์ปฌ๋ผ(Family)๋ฅผ ์ถ๊ฐํ์๋ค.
# Encode Sex numerically: 0 for 'female', 1 for every other value.
df2['Sex'] = np.where(df2['Sex'] != 'female', 1, 0)
# Encode the port of embarkation as an integer code; values that are not
# keys of the mapping (including missing ones) become NaN.
embarked_mapping = dict(S=0, C=1, Q=2)
df2['Embarked'] = df2['Embarked'].map(embarked_mapping)
성별을 담고 있는 이진 데이터 Sex 컬럼을 모델이 처리할 수 있도록 여자는 0, 남자는 1로 변환하였고, 승객들이 어디서 탑승하였는지를 담고 있는 Embarked 컬럼 또한 S(Southampton)는 0, C(Cherbourg)는 1, Q(Queenstown)은 2로 범주형 데이터로 처리해주었다.
# Print the column summary of `df2`. The output captured from one run is kept
# below: it shows Age (714 non-null) and Embarked (889 non-null) still
# contain missing values out of 891 rows.
df2.info()
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 891 entries, 0 to 890
# Data columns (total 6 columns):
# # Column Non-Null Count Dtype
# --- ------ -------------- -----
# 0 Survived 891 non-null int64
# 1 Pclass 891 non-null int64
# 2 Sex 891 non-null int64
# 3 Age 714 non-null float64
# 4 Embarked 889 non-null float64
# 5 Family 891 non-null int64
# dtypes: float64(2), int64(4)
# memory usage: 41.9 KB
ํ์ฌ๊น์ง ์ฒ๋ฆฌํ DataFrame์ ๋ค์๊ณผ ๊ฐ์ด ๊ตฌ์ฑ๋์ด์๊ณ , info()๋ฅผ ๋ณด๋ฉด ์์ง Age์ Embarked์ด์ ๊ฒฐ์ธก์น๊ฐ ๋จ์์๋ ๊ฒ์ ๋ณผ ์ ์๋ค.
# Impute missing values: Age gets the column median, Embarked gets the
# value of the preceding row (forward fill).
age_median = df2['Age'].median()
df2['Age'] = df2['Age'].fillna(age_median)
df2['Embarked'] = df2['Embarked'].ffill()
๋ฐ๋ผ์ ๊ฒฐ์ธก์น๊ฐ ์๋์ ์ผ๋ก ๋ง์ Age์ด๊ณผ ๊ฐ์ ๊ฒฝ์ฐ๋ median(์ค์๊ฐ)์ผ๋ก ์ฑ์์ฃผ์๊ณ , ๊ฒฐ์ธก์น๊ฐ 2๊ฐ ๋ฐ์ ์กด์ฌํ์ง ์์ Embarked์ด์ ๊ฐ ๊ฒฐ์ธก์น์ ์์ ์๋ ๋ฐ์ดํฐ๋ฅผ ๊ฐ์ ธ์ ์ฑ์์ฃผ์๋ค.
# Box plot of the imputed Age column, used to inspect outliers visually.
age_values = df2['Age'].to_numpy()
plt.boxplot(age_values)
plt.show()

๋ค๋ฅธ Feature๋ค์ ๋ชจ๋ ์ด์ง, ๋ฒ์ฃผํ ๋ฐ์ดํฐ๋ฅผ ๋ด๊ณ ์๋๋ฐ ๋ฐํด, Age๋ ์ฐ์์ ์ธ ์ค์๊ฐ์ ๊ฐ์ง๋ ๋ฐ์ดํฐ์ด๊ธฐ์ ์ด์์น๋ฅผ ๋จผ์ ํ์ธํด ์ฃผ์๋ค.
확인 결과, 몇개의 이상치가 발견되었지만 모두 실존가능한 나이라고 판단을 내릴 수 있었다. 이에 따라 이상치는 따로 대체 또는 삭제 처리를 하지 않았다.
# Recode Age into five ordinal groups, stored as floats:
#   0: age < 8, 1: 8 <= age < 20, 2: 20 <= age < 50,
#   3: 50 <= age < 80, 4: age >= 80
age = df2['Age']
age_edges = [8, 20, 50, 80]
age_group = pd.Series(np.digitize(age, age_edges), index=age.index).astype(float)
# A missing age matches none of the ranges, so it is kept as NaN.
df2['Age'] = age_group.where(age.notna())
df2['Age'].value_counts()
승객들의 생존 여부는 나이에 따라 생존 확률이 달라질 것이라고 판단하였다. 이에 따라 승객들의 연령대에 따라 범주형 데이터로 구간화 처리를 해주었다.
# Separate the features (every column except Survived) from the target.
x_data = df2.drop(columns='Survived').to_numpy()
t_data = df2['Survived'].to_numpy()
# Min-Max scale the features. NOTE(review): the scaler is fitted on the
# complete feature matrix, i.e. before any train/test split is made.
scaler = MinMaxScaler()
x_data_norm = scaler.fit_transform(x_data)
๊ฐ feature๋ง๋ค ๋ฐ์ดํฐ์ scale์ด ๋ค๋ฅด๊ธฐ ๋๋ฌธ์ ๋ ๋ฆฝ๋ณ์์ ์ข ์๋ณ์๋ฅผ ๋๋๊ณ ๋ ๋ฆฝ๋ณ์์ ๋ํด Min-Max Scaling ์ฒ๋ฆฌ๋ฅผ ํ์๋ค.
# Hold out 20% of the samples for evaluation, stratified on the target.
# No random_state is passed, so the split differs from run to run.
split_parts = train_test_split(
    x_data_norm,
    t_data,
    test_size=0.2,
    stratify=t_data,
)
x_data_train_norm, x_data_test_norm, t_data_train, t_data_test = split_parts
모델 학습 후 모델 검증을 위해 학습데이터와 테스트 데이터를 나누어주었다.
# Small fully connected binary classifier: 5 input features ->
# Dense(64, relu) -> Dense(128, relu) -> Dense(1, sigmoid).
model = Sequential([
    Flatten(input_shape=(5,)),
    Dense(units=64, activation='relu'),
    Dense(units=128, activation='relu'),
    Dense(units=1, activation='sigmoid'),
])

model.compile(
    optimizer=Adam(learning_rate=1e-2),
    loss='binary_crossentropy',
    metrics=['acc'],
)

# Stop once val_loss has not improved for 5 epochs and restore the best weights.
es_callback = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
    verbose=1,
)

# 20% of the training split is set aside as validation data for early stopping.
model.fit(
    x_data_train_norm,
    t_data_train,
    epochs=1000,
    validation_split=0.2,
    batch_size=100,
    callbacks=[es_callback],
    verbose=1,
)
이진 로지스틱 모델이기 때문에 활성화 함수로 'sigmoid' 함수를, 손실 함수로 'binary_crossentropy' 함수를 사용하여 매우 간단한 DNN 모델을 구현하였다.
# Predict on the held-out split, threshold the sigmoid output at 0.5 to get
# 0/1 labels, flatten to 1-D and print the per-class metrics.
test_split_probabilities = model.predict(x_data_test_norm)
result = np.where(test_split_probabilities >= 0.5, 1, 0).ravel()
print(classification_report(t_data_test, result))
# Output captured from one run:
# precision recall f1-score support
# 0 0.79 0.94 0.85 110
# 1 0.85 0.59 0.70 69
# accuracy 0.80 179
# macro avg 0.82 0.77 0.78 179
# weighted avg 0.81 0.80 0.80 179
๋ชจ๋ธ ํ๊ฐ ๊ฒฐ๊ณผ F1 Score๊ฐ 0.8๋ก ์ถ๋ ฅ๋ ๊ฒ์ ํ์ธํ ์ ์์๋ค.
# Test data loading
test_df = pd.read_csv('/content/drive/MyDrive/KDT/data/Titanic/test.csv')

# Test data preprocessing: drop the columns that are not model inputs
test_df1 = test_df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Fare', 'Cabin'])

# Family = SibSp + Parch
test_df1['Family'] = test_df1['SibSp'].add(test_df1['Parch'])
test_df2 = test_df1.drop(columns=['SibSp', 'Parch'])

# Convert Sex: 0 for 'female', 1 otherwise
test_df2['Sex'] = np.where(test_df2['Sex'] != 'female', 1, 0)

# Convert Embarked to integer codes
embarked_mapping = dict(S=0, C=1, Q=2)
test_df2['Embarked'] = test_df2['Embarked'].map(embarked_mapping)

# Missing-value handling: median of the test-set ages, forward fill for Embarked
test_age_median = test_df2['Age'].median()
test_df2['Age'] = test_df2['Age'].fillna(test_age_median)
test_df2['Embarked'] = test_df2['Embarked'].ffill()

# Outlier check
plt.boxplot(test_df2['Age'].to_numpy())
plt.show()

# Age binning: 0: <8, 1: [8, 20), 2: [20, 50), 3: [50, 80), 4: >=80
test_age = test_df2['Age']
test_age_group = pd.Series(
    np.digitize(test_age, [8, 20, 50, 80]), index=test_age.index
).astype(float)
# A missing age matches none of the ranges, so it is kept as NaN.
test_df2['Age'] = test_age_group.where(test_age.notna())

# Normalization, reusing the already fitted `scaler`
test_data_norm = scaler.transform(test_df2.to_numpy())

# Prediction: threshold the sigmoid output at 0.5 and flatten to 1-D
test_probabilities = model.predict(test_data_norm)
test_result = np.where(test_probabilities >= 0.5, 1, 0).ravel()
# Submission data loading: read the sample submission file so its existing
# columns can be reused.
submission = pd.read_csv('/content/drive/MyDrive/KDT/data/Titanic/gender_submission.csv')
# Insert the predicted labels into the 'Survived' column and export the
# result as a CSV without the index column.
submission['Survived'] = test_result
submission.to_csv('Titanic_DNN.csv', index=False)
테스트 데이터를 가져와서 훈련 데이터와 똑같이 전처리를 진행하였고, 예측 결과를 제출 데이터에 삽입하여 제출하였다.

์ต๊ทผ์ ๊ณต๋ถํ ๋งค์ฐ ๊ฐ๋จํ DNN๋ชจ๋ธ์ ๊ตฌํํ์ฌ ์ฌ๋ฌ๊ฐ์ง ๋ฐ์ดํฐ ์ ์ฒ๋ฆฌ๋ฅผ ์ฐ์ตํ๊ธฐ ์ํด ๊ฐ์ฅ ์ ๋ช ํ Titanic Data Set์ ํ์ฉํ์ฌ ๋จธ์ ๋ฌ๋ ๋ชจ๋ธ ๊ตฌํ์ ์ฐ์ตํด ๋ณด์๋ค. ๋ฐ์ดํฐ ์์ด ๋ถ์กฑํ ํ๋ ์๊ฒ ์ง๋ง, ๊ฒฐ๊ณผ๊ฐ ์ข์ง ์์ ๋ ์ข์ ๋ฐฉ๋ฒ์ด ์์์ง ๋ ๊ณ ๋ฏผํ๊ณ ๊ณต๋ถํด์ผ๊ฒ ๋ค.
https://www.kaggle.com/competitions/titanic