import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from google.colab import drive
# Mount Google Drive so the CSV files stored there are reachable from Colab.
drive.mount('/content/drive')

# Folder that holds train.csv and test.csv.
file_path = '/content/drive/My Drive/내배캠 실습/ML/titanic/'

# Load both splits in one expression; the unpacking order follows the tuple
# of file names.
train_df, test_df = (
    pd.read_csv(file_path + csv_name) for csv_name in ('train.csv', 'test.csv')
)

# Quick look at the first rows of the test split.
test_df.head(3)

# Summary statistics for every column (numeric and non-numeric) of the
# training split.
train_df.describe(include='all')

📍 전처리
1. SibSp와 Parch 합쳐서 Family 컬럼 만들기
2. 이상치, 결측치 처리
3. 숫자형 데이터 스케일링
4. 범주형 데이터 인코딩
- 독립 변수:
성별(Sex), 나이(Age), 요금(Fare), 가족(Family), 등급(Pclass), 정박항(Embarked)
- 종속 변수:
생존 여부(Survived)
# Work on a copy: the preprocessing functions assign new columns onto the
# frame they receive, and train_df itself should not be modified by them.
train_df_2 = train_df.copy()
def get_family(df):
    """Add a ``Family`` column to *df* and return the same frame.

    ``Family`` is ``SibSp + Parch + 1`` (the ``+ 1`` presumably counts the
    passenger themselves). The column is written onto the frame that was
    passed in, so the caller's object is modified; it is also returned so the
    call can be chained.
    """
    relatives = df['SibSp'].add(df['Parch'])
    df['Family'] = relatives.add(1)
    return df
# Check that the function works: get_family writes the Family column onto
# train_df_2 itself and returns it, so head() shows the new column.
get_family(train_df_2).head(3)

# Pairwise scatter plots of the numeric features, used to eyeball outliers.
sns.pairplot(train_df_2[['Age', 'Fare', 'Family']])

# Keep only rows with Fare below 512 (drops the extreme fares).
# The explicit .copy() makes train_df_2 an independent frame rather than a
# slice of the previous one, so the column assignments performed by the later
# preprocessing functions do not raise SettingWithCopyWarning.
train_df_2 = train_df_2[train_df_2['Fare'] < 512].copy()
def get_non_missing(df, ref_df=None):
    """Fill missing ``Age``, ``Fare`` and ``Embarked`` values of *df*.

    ``Age`` and ``Fare`` are filled with the column means of a reference
    frame, so the same fill values are applied to whichever frame is passed
    in. ``Embarked`` is filled with the constant ``'S'``.

    Args:
        df: Frame to clean. It is modified in place.
        ref_df: Frame the ``Age``/``Fare`` means are computed from. When
            omitted, the module-level ``train_df_2`` is used, which is the
            behaviour this function had before the parameter existed.

    Returns:
        The same ``df`` object, with the three columns filled.
    """
    if ref_df is None:
        # Looked up at call time, so it reflects the current training frame.
        ref_df = train_df_2

    age_mean = ref_df['Age'].mean()
    fare_mean = ref_df['Fare'].mean()

    df['Age'] = df['Age'].fillna(age_mean)
    # Not needed for the train data, but the test data has missing Fare
    # values, so the fill is applied here as well.
    df['Fare'] = df['Fare'].fillna(fare_mean)
    df['Embarked'] = df['Embarked'].fillna('S')
    return df
# Fill the missing values of the training frame (the function assigns onto the
# frame it receives) and print its info() summary.
get_non_missing(train_df_2).info()
def get_numeric_sc(df, ref_df=None):
    """Add scaled versions of the numeric columns to *df*.

    ``Fare`` is standardised into ``Fare_sd_sc``; ``Age`` and ``Family`` are
    min-max scaled into ``Age_mm_sc`` and ``Family_mm_sc``. The scalers are
    fitted on a reference frame and then applied to ``df``, so every frame
    passed in is scaled with the same parameters.

    Args:
        df: Frame to transform. It is modified in place; it must already
            contain ``Fare``, ``Age`` and ``Family``.
        ref_df: Frame the scalers are fitted on. When omitted, the
            module-level ``train_df_2`` is used, which is the behaviour this
            function had before the parameter existed.

    Returns:
        The same ``df`` object with the three scaled columns added.
    """
    # Imported locally because the file does not import scikit-learn at the top.
    from sklearn.preprocessing import MinMaxScaler, StandardScaler

    if ref_df is None:
        ref_df = train_df_2

    # StandardScaler for Fare, MinMaxScaler for Age and Family.
    sd_sc = StandardScaler().fit(ref_df[['Fare']])
    mm_sc = MinMaxScaler().fit(ref_df[['Age', 'Family']])

    df['Fare_sd_sc'] = sd_sc.transform(df[['Fare']])
    df[['Age_mm_sc', 'Family_mm_sc']] = mm_sc.transform(df[['Age', 'Family']])
    return df
# Add the scaled numeric columns to the training frame; the function assigns
# them onto the frame it receives, so the return value is not rebound here.
get_numeric_sc(train_df_2)
def get_category(df, ref_df=None):
    """Encode the categorical columns of *df* and return the resulting frame.

    ``Pclass`` and ``Sex`` are label-encoded into ``Pclass_le`` and
    ``Sex_le``. ``Embarked`` is one-hot encoded; the new columns are named by
    ``OneHotEncoder.get_feature_names_out()``. All encoders are fitted on a
    reference frame and then applied to ``df``.

    Args:
        df: Frame to encode. ``Pclass_le`` and ``Sex_le`` are assigned onto
            this object before a new frame is built.
        ref_df: Frame the encoders are fitted on. When omitted, the
            module-level ``train_df_2`` is used, which is the behaviour this
            function had before the parameter existed.

    Returns:
        A new frame: ``df`` after ``reset_index()`` (the old index is kept as
        an ``index`` column) with the one-hot columns concatenated on the
        right. Callers must rebind the result.
    """
    # Imported locally because the file does not import scikit-learn at the top.
    from sklearn.preprocessing import LabelEncoder, OneHotEncoder

    if ref_df is None:
        ref_df = train_df_2

    # LabelEncoder expects a 1-D target; passing a Series (not a one-column
    # DataFrame) avoids scikit-learn's DataConversionWarning.
    pclass_le = LabelEncoder().fit(ref_df['Pclass'])
    sex_le = LabelEncoder().fit(ref_df['Sex'])
    df['Pclass_le'] = pclass_le.transform(df['Pclass'])
    df['Sex_le'] = sex_le.transform(df['Sex'])

    # Reset the index so the rows line up with the freshly built one-hot
    # frame (which has a default RangeIndex) when concatenating column-wise.
    df = df.reset_index()

    oe = OneHotEncoder().fit(ref_df[['Embarked']])
    embarked_csr = oe.transform(df[['Embarked']])
    embarked_csr_df = pd.DataFrame(
        embarked_csr.toarray(), columns=oe.get_feature_names_out()
    )
    return pd.concat([df, embarked_csr_df], axis=1)
# get_category builds a new frame (reset_index + concat), so the result has to
# be rebound to train_df_2.
train_df_2 = get_category(train_df_2)
# Model inputs, in the order used for both fitting and prediction.
# The Embarked one-hot columns were previously listed as 'Embarked_C' three
# times, which fed the same column to the model three times and ignored the
# other ports. The names below assume the Embarked categories are C, Q and S
# (as produced by OneHotEncoder.get_feature_names_out in get_category).
FEATURE_COLS = [
    'Age_mm_sc', 'Fare_sd_sc', 'Family_mm_sc', 'Pclass_le',
    'Sex_le', 'Embarked_C', 'Embarked_Q', 'Embarked_S',
]


def get_model(df):
    """Fit a logistic-regression classifier on the preprocessed frame.

    Args:
        df: Frame containing every column in ``FEATURE_COLS`` plus the
            ``Survived`` target column.

    Returns:
        The fitted ``LogisticRegression`` estimator.
    """
    # Imported locally because the file does not import scikit-learn at the top.
    from sklearn.linear_model import LogisticRegression

    model_lor = LogisticRegression()
    X = df[FEATURE_COLS]
    # 1-D target: a one-column DataFrame triggers DataConversionWarning.
    y = df['Survived']
    return model_lor.fit(X, y)


model_output = get_model(train_df_2)

# Evaluate on the training data itself (no hold-out split is made here).
X = train_df_2[FEATURE_COLS]
y_pred = model_output.predict(X)

from sklearn.metrics import accuracy_score, f1_score
print(accuracy_score(train_df_2['Survived'], y_pred))
print(f1_score(train_df_2['Survived'], y_pred))
# NOTE: the outputs recorded earlier (0.8029279279279279 and
# 0.7311827956989247, presumably accuracy and F1 in print order) were produced
# with the duplicated 'Embarked_C' feature list; re-run to refresh them.

# Apply the same preprocessing steps to the test data, in the same order as
# for the training data, then predict with the fitted model.
test_df_2 = get_family(test_df)
test_df_2 = get_non_missing(test_df_2)
test_df_2 = get_numeric_sc(test_df_2)
test_df_2 = get_category(test_df_2)
test_X = test_df_2[FEATURE_COLS]
y_test_pred = model_output.predict(test_X)