import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore')
train_data = pd.read_csv("./train.csv")
print(train_data.shape)
train_data
test_data = pd.read_csv("./test.csv")
print(test_data.shape)
test_data
train_data.info() #데이터 자료형 확인
train_data.isna().sum() #데이터 결측치 확인
train_data.describe() #수치형 데이터 확인
train_data.describe(include=['O']) #범주형 데이터 확인
train_data.hist(figsize=(10, 9))
plt.show()
# 결측치 처리
train_data['CryoSleep'] = train_data['CryoSleep'].fillna(0)
train_data['VIP'] = train_data['VIP'].fillna(0)
train_data['Cabin'] = train_data['Cabin'].fillna(train_data['Cabin'].mode()[0])
train_data['HomePlanet'] = train_data['HomePlanet'].fillna(train_data['HomePlanet'].mode()[0])
train_data['Destination'] = train_data['Destination'].fillna(train_data['Destination'].mode()[0])
train_data['ShoppingMall'] = train_data['ShoppingMall'].fillna(train_data['ShoppingMall'].median())
train_data['VRDeck'] = train_data['VRDeck'].fillna(train_data['VRDeck'].median())
train_data['FoodCourt'] = train_data['FoodCourt'].fillna(train_data['FoodCourt'].median())
train_data['Spa'] = train_data['Spa'].fillna(train_data['Spa'].median())
train_data['RoomService'] = train_data['RoomService'].fillna(train_data['RoomService'].median())
train_data['Age'] = train_data['Age'].fillna(train_data['Age'].median())
# Cabin 분할
cab = train_data["Cabin"].apply(lambda x: x.split("/"))
train_data["Cab_1"] = cab.apply(lambda x: x[0])
train_data["Cab_3"] = cab.apply(lambda x: x[2])
train_data["Cab_2"] = cab.apply(lambda x: float(x[1]))
#필요 없는 column drop
train_data.drop(['PassengerId', 'Name', 'Cabin'], axis = 1, inplace = True)
# bool형 int로 변경
train_data["VIP"] = train_data["VIP"].astype(int)
train_data["CryoSleep"] = train_data["CryoSleep"].astype(int)
train_data.isna().sum() #데이터 결측치 확인
train_data[['VIP', 'Transported']].groupby(['VIP'], as_index=False).mean().sort_values(by='Transported')
arrived = train_data[train_data['Transported'] == 1]['VIP']. value_counts()
not_arrived = train_data[train_data['Transported'] == 0]['VIP']. value_counts()
df = pd. DataFrame([arrived, not_arrived])
df. index = ['arrived','not_arrived']
df. plot(kind= 'bar',stacked= True , figsize= (6,4))
plt.title('VIP')
plt. show()
train_data[['CryoSleep', 'Transported']].groupby(['CryoSleep'], as_index=False).mean().sort_values(by='Transported')
arrived = train_data[train_data['Transported'] == 1]['CryoSleep']. value_counts()
not_arrived = train_data[train_data['Transported'] == 0]['CryoSleep']. value_counts()
df = pd. DataFrame([arrived, not_arrived])
df. index = ['arrived','not_arrived']
df. plot(kind= 'bar',stacked= False, figsize= (6,4))
plt.title('CryoSleep')
plt. show()
train_data[['HomePlanet', 'Transported']].groupby(['HomePlanet'], as_index=False).mean().sort_values(by='Transported')
arrived = train_data[train_data['Transported'] == 1]['HomePlanet']. value_counts()
not_arrived = train_data[train_data['Transported'] == 0]['HomePlanet']. value_counts()
df = pd. DataFrame([arrived, not_arrived])
df.index = ['arrived','not_arrived']
df.plot(kind= 'bar',stacked= False , figsize= (6,4))
plt.title('HomePlanet')
plt. show()
train_data[['Destination', 'Transported']].groupby(['Destination'], as_index=False).mean().sort_values(by='Transported')
arrived = train_data[train_data['Transported'] == 1]['Destination']. value_counts()
not_arrived = train_data[train_data['Transported'] == 0]['Destination']. value_counts()
df = pd. DataFrame([arrived, not_arrived])
df. index = ['arrived','not_arrived']
df. plot(kind= 'bar',stacked= False , figsize= (8,4))
plt.title('Destination')
plt. show()
train_data.loc[(train_data['Age'] >= 0) & (train_data['Age'] < 10), 'Age'] = 0
train_data.loc[(train_data['Age'] >= 10) & (train_data['Age'] < 20), 'Age'] = 10
train_data.loc[(train_data['Age'] >= 20) & (train_data['Age'] < 30), 'Age'] = 20
train_data.loc[(train_data['Age'] >= 30) & (train_data['Age'] < 40), 'Age'] = 30
train_data.loc[(train_data['Age'] >= 40) & (train_data['Age'] < 50), 'Age'] = 40
train_data.loc[(train_data['Age'] >= 50) & (train_data['Age'] < 60), 'Age'] = 50
train_data.loc[(train_data['Age'] >= 60) & (train_data['Age'] < 70), 'Age'] = 60
train_data.loc[(train_data['Age'] >= 70) & (train_data['Age'] < 80), 'Age'] = 70
Arrived = train_data[train_data['Transported'] == 1]['Age']. value_counts()
Not_Arrived = train_data[train_data['Transported'] == 0]['Age']. value_counts()
df = pd. DataFrame([Arrived, Not_Arrived])
df. index = ['Arrived','Not_Arrived']
df. plot(kind= 'bar',stacked= False , figsize= (6,4)). legend(loc = 'lower center')
plt.title('Age')
plt. show()
arrived = train_data[train_data['Transported'] == 1]['Cab_1']. value_counts()
not_arrived = train_data[train_data['Transported'] == 0]['Cab_1']. value_counts()
df = pd. DataFrame([arrived, not_arrived])
df. index = ['arrived','not_arrived']
df. plot(kind= 'bar',stacked= False , figsize= (8,4))
plt.title('Cab_1')
plt. show()
arrived = train_data[train_data['Transported'] == 1]['Cab_3']. value_counts()
not_arrived = train_data[train_data['Transported'] == 0]['Cab_3']. value_counts()
df = pd. DataFrame([arrived, not_arrived])
df. index = ['arrived','not_arrived']
df. plot(kind= 'bar',stacked= False , figsize= (8,4))
plt.title('Cab_3')
plt. show()
plt.figure(figsize = (16, 12))
x = sns.heatmap(train_data.corr(), cmap = 'Blues', linewidths = '0.1',annot = True)
input_data = train_data.drop(['Transported'], axis = 1)
encoding_train_data = pd.get_dummies(input_data)
encoding_train_data
encoding_train_data.columns
analysis = pd.merge(encoding_train_data, train_data['Transported'], left_index = True, right_index=True)
sns.pairplot(analysis[['CryoSleep', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Transported']],hue='Transported', palette='husl', markers=['o','s'])
plt.show()
sns.pairplot(analysis[['Spa', 'VRDeck', 'Destination_55 Cancri e', 'Destination_TRAPPIST-1e', 'Destination_PSO J318.5-22', 'Transported']],hue='Transported', palette='husl', markers=['o','s'])
plt.show()
sns.pairplot(analysis[['HomePlanet_Earth', 'HomePlanet_Europa', 'HomePlanet_Mars', 'Cab_2', 'Transported']],hue='Transported', palette='husl', markers=['o','s'])
plt.show()
sns.pairplot(analysis[['Cab_1_A','Cab_1_B', 'Cab_1_C', 'Cab_1_D', 'Cab_1_E', 'Transported']],hue='Transported', palette='husl', markers=['o','s'])
plt.show()
sns.pairplot(analysis[['Cab_1_F', 'Cab_1_G', 'Cab_1_T', 'Cab_3_P', 'Cab_3_S', 'Transported']],hue='Transported', palette='husl', markers=['o','s'])
plt.show()
y_target = train_data['Transported']
x_data = encoding_train_data.drop(['Cab_1_A', 'Cab_1_G', 'Cab_1_T', 'Cab_1_D', 'Cab_1_C', 'VIP', 'HomePlanet_Mars', 'Destination_PSO J318.5-22'], axis = 1)
pca = PCA(n_components = 10)
pca.fit(encoding_train_data)
pca_train_data = pca.transform(encoding_train_data)
pca_train_data
pca_cols = []
for i in range(0, 10) :
x = 'pca_col' + str(i)
pca_cols.append(x)
pca_cols
pca_df = pd.DataFrame(pca_train_data, columns = pca_cols)
pca_df
x_train, x_test, y_train, y_test = train_test_split(pca_df, y_target, test_size = 0.2, random_state = 0)
params = { 'n_estimators' : [50, 100, 200, 300, 400],
'max_depth' : [4, 6, 8, 10, 12, 16],
'min_samples_leaf' : [4, 8, 12, 16, 20], #과적합 제어
'min_samples_split' : [4, 8, 12, 16, 20] #과적합 제어
}
grid_re_clf = RandomForestClassifier(random_state = 5, n_jobs = -1)
grid_cv = GridSearchCV(grid_re_clf, param_grid = params, cv = 5, n_jobs = -1)
grid_cv.fit(x_train, y_train)
print('최적 하이퍼 파라미터: ', grid_cv.best_params_)
print('최고 예측 정확도: {:.4f}'.format(grid_cv.best_score_))
# 모델 학습
pca_model = RandomForestClassifier(n_estimators=100, random_state=5,
max_depth=10, min_samples_leaf = 4, min_samples_split = 16, oob_score=True)
pca_model.fit(x_train, y_train)
pred = pca_model.predict(x_test)
# 평가
print("훈련 세트 정확도: {:.3f}".format(pca_model.score(x_train, y_train)) )
print("테스트 세트 정확도: {:.3f}".format(pca_model.score(x_test, y_test)) )
print("OOB 샘플의 정확도: {:.3f}".format(pca_model.oob_score_) )
y_target = train_data['Transported']
x_data = encoding_train_data.drop(['Cab_1_A', 'Cab_1_G', 'Cab_1_T', 'Cab_1_D', 'Cab_1_C', 'VIP', 'HomePlanet_Mars', 'Destination_PSO J318.5-22'], axis = 1)
x_train, x_test, y_train, y_test = train_test_split(x_data, y_target, test_size = 0.15, random_state = 0)
params = { 'n_estimators' : [50, 100, 200, 300, 400],
'max_depth' : [4, 6, 8, 10, 12, 16],
'min_samples_leaf' : [4, 8, 12, 16, 20], #과적합 제어
'min_samples_split' : [4, 8, 12, 16, 20] #과적합 제어
}
grid_re_clf = RandomForestClassifier(random_state = 5, n_jobs = -1)
grid_cv = GridSearchCV(grid_re_clf, param_grid = params, cv = 5, n_jobs = -1)
grid_cv.fit(x_train, y_train)
print('최적 하이퍼 파라미터: ', grid_cv.best_params_)
print('최고 예측 정확도: {:.4f}'.format(grid_cv.best_score_))
# 모델 학습
model = RandomForestClassifier(n_estimators = 200, random_state=5, max_depth = 16, min_samples_leaf = 8, min_samples_split = 4, oob_score=True)
model.fit(x_train, y_train)
pred = model.predict(x_test)
# 평가
print("훈련 세트 정확도: {:.3f}".format(model.score(x_train, y_train)) )
print("테스트 세트 정확도: {:.3f}".format(model.score(x_test, y_test)) )
print("OOB 샘플의 정확도: {:.3f}".format(model.oob_score_) )
# feature importance
ftr_importances_values = model.feature_importances_
ftr_importances = pd.Series(ftr_importances_values, index = x_train.columns)
ftr_sort = ftr_importances.sort_values(ascending=False)
plt.figure(figsize=(12,10))
plt.title('Feature Importances')
sns.barplot(x=ftr_sort, y=ftr_sort.index)
plt.show()
test_data.isna().sum() #데이터 결측치 확인
# 결측치 처리
test_data['CryoSleep'] = test_data['CryoSleep'].fillna(0)
test_data['VIP'] = test_data['VIP'].fillna(0)
test_data['Cabin'] = test_data['Cabin'].fillna(test_data['Cabin'].mode()[0])
test_data['HomePlanet'] = test_data['HomePlanet'].fillna(test_data['HomePlanet'].mode()[0])
test_data['Destination'] = test_data['Destination'].fillna(test_data['Destination'].mode()[0])
test_data['ShoppingMall'] = test_data['ShoppingMall'].fillna(test_data['ShoppingMall'].median())
test_data['VRDeck'] = test_data['VRDeck'].fillna(test_data['VRDeck'].median())
test_data['FoodCourt'] = test_data['FoodCourt'].fillna(test_data['FoodCourt'].median())
test_data['Spa'] = test_data['Spa'].fillna(test_data['Spa'].median())
test_data['RoomService'] = test_data['RoomService'].fillna(test_data['RoomService'].median())
test_data['Age'] = test_data['Age'].fillna(test_data['Age'].median())
# Cabin 분할
cab = test_data["Cabin"].apply(lambda x: x.split("/"))
test_data["Cab_1"] = cab.apply(lambda x: x[0])
test_data["Cab_3"] = cab.apply(lambda x: x[2])
test_data["Cab_2"] = cab.apply(lambda x: float(x[1]))
#필요 없는 column drop
test_data.drop(['PassengerId', 'Name', 'Cabin'], axis = 1, inplace = True)
# bool형 int로 변경
test_data["VIP"] = test_data["VIP"].astype(int)
test_data["CryoSleep"] = test_data["CryoSleep"].astype(int)
encoding_test_data = pd.get_dummies(test_data)
test_x_data = encoding_test_data.drop(['Cab_1_A', 'Cab_1_G', 'Cab_1_T', 'Cab_1_D', 'Cab_1_C', 'VIP', 'HomePlanet_Mars', 'Destination_PSO J318.5-22'], axis = 1)
pca = PCA(n_components = 10)
pca.fit(encoding_test_data)
pca_test_data = pca.transform(encoding_test_data)
pca_cols = []
for i in range(0, 10) :
x = 'pca_col' + str(i)
pca_cols.append(x)
pca_df = pd.DataFrame(pca_test_data, columns = pca_cols)
pca_rf_pred = pca_model.predict(pca_df)
sample_submission = pd.read_csv('./sample_submission.csv')
sample_submission['Transported'] = pca_rf_pred
sample_submission.to_csv('submission.csv',index = False)
sample_submission.head()
rf_predict = model.predict(test_x_data)
sample_submission = pd.read_csv('./sample_submission.csv')
sample_submission['Transported'] = rf_predict
sample_submission.to_csv('CKH_submission_2.csv',index = False)
sample_submission.head()