데이터 불러오기

데이터 EDA
import matplotlib.pyplot as plt
import seaborn as sns
# Survival overview: share of each outcome (pie, left) next to the raw
# passenger counts per outcome (bars, right).
f, ax = plt.subplots(1, 2, figsize=(18, 8))

# explode pulls the second wedge slightly out of the pie; autopct prints
# each wedge's percentage with one decimal.
titanic['survived'].value_counts().plot.pie(
    explode=[0, 0.05],
    autopct='%1.1f%%',
    ax=ax[0],
    shadow=True,
)
ax[0].set_title('Pie plot_Survived')
# Blank out the y-axis label next to the pie. The earlier `set_label('')`
# call only changed the Axes' legend label and left the axis label visible.
ax[0].set_ylabel('')

sns.countplot(x='survived', data=titanic, ax=ax[1])
ax[1].set_title('Count plot_Survived')
plt.show()

- 성별에 따른 생존 상황 확인
- 그래프 확인 시 남성의 생존 가능성이 더 낮다
# Passenger counts per sex (left) and the same counts split by the
# `survived` value (right).
f, ax = plt.subplots(1, 2, figsize=(18, 8))

sns.countplot(x='sex', data=titanic, ax=ax[0])
ax[0].set_title('Count of passenger of sex')
# Blank out the y-axis label. The earlier `set_label('')` call only changed
# the Axes' legend label, which has no visible effect on this plot.
ax[0].set_ylabel('')

sns.countplot(x='sex', hue='survived', data=titanic, ax=ax[1])
ax[1].set_title('Sex : Survived and Unsurvived')
plt.show()

- 경제력 대비 생존율 확인 및 선실 등급별 성별 시각화
# Contingency table of cabin class (rows) against survival (columns);
# margins=True appends the row/column totals.
pd.crosstab(
    index=titanic['pclass'],
    columns=titanic['survived'],
    margins=True,
)

# Grid of age histograms: one row per cabin class, one column per sex.
grid = sns.FacetGrid(
    data=titanic,
    row='pclass',
    col='sex',
    height=4,
    aspect=2,
)
grid.map(plt.hist, 'age', bins=20, alpha=0.8)
grid.add_legend()

- 나이별 승객 현황
- 아이들과 20~30대가 많다
import plotly.express as px
# Interactive histogram of the passengers' ages.
fig = px.histogram(data_frame=titanic, x='age')
fig.show()

- 선실 등급별 생존 여부에 따른 연령 분포
- 선실 등급이 높으면 생존율도 높다
# Grid of age histograms: one row per cabin class, one column per
# `survived` value.
grid = sns.FacetGrid(
    data=titanic,
    row='pclass',
    col='survived',
    height=4,
    aspect=2,
)
grid.map(plt.hist, 'age', bins=20, alpha=0.5)
grid.add_legend()

# Bucket ages into labelled groups stored in a new `age_cat` column.
# include_lowest=True makes the first bucket also contain its left edge (0).
age_edges = [0, 7, 15, 30, 60, 100]
age_names = ['baby', 'teen', 'young', 'adult', 'old']
titanic['age_cat'] = pd.cut(
    titanic['age'],
    bins=age_edges,
    labels=age_names,
    include_lowest=True,
)
titanic.head()

# Three side-by-side bar plots of `survived` against cabin class, age
# bucket and sex, drawn into a 1x3 subplot layout.
plt.figure(figsize=(12, 4))
for position, column in enumerate(('pclass', 'age_cat', 'sex'), start=1):
    plt.subplot(1, 3, position)
    sns.barplot(x=column, y='survived', data=titanic)

import re
# Pull each passenger's honorific out of the `name` column. The pattern
# matches ", <word>." with an optional second word (e.g. ", the Countess.").
# A raw string is used so the regex escapes are not treated as (invalid)
# Python string escapes.
title_pattern = re.compile(r'\,\s\w+(\s\w+)?\.')
titles = []
for passenger_name in titanic['name']:
    matched = title_pattern.search(passenger_name).group()
    print(matched)
    # Strip the leading ", " and the trailing "." so only the title is kept.
    titles.append(matched[2:-1])
# The statements that follow read titanic['title'], so the extracted
# titles are stored as that column.
titanic['title'] = titles

titanic['title'].unique()

# Fold spelling variants into 'Miss'/'Mrs', then collapse the titles listed
# in Rare_f / Rare_m into the two buckets 'Rare_f' and 'Rare_m'.
Rare_f = ['Dona', 'Dr', 'Lady', 'the Countess']
Rare_m = ['Capt', 'Col', 'Don', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Master']
title_map = {'Ms': 'Miss', 'Mlle': 'Miss', 'Mme': 'Mrs'}
title_map.update(dict.fromkeys(Rare_f, 'Rare_f'))
title_map.update(dict.fromkeys(Rare_m, 'Rare_m'))
titanic['title'] = titanic['title'].replace(title_map)
titanic['title'].unique()

# Mean of `survived` for each title, returned with `title` as a regular
# column rather than the index.
titanic.groupby('title', as_index=False)['survived'].mean()

머신러닝 생존자 예측을 위한 구조확인

LabelEncoder를 통한 gender 구분
from sklearn.preprocessing import LabelEncoder
# Encode the `sex` column as integer labels in a new `gender` column.
# The fitted encoder is kept in `le`.
le = LabelEncoder()
titanic['gender'] = le.fit_transform(titanic['sex'])
titanic.head()

결측치 삭제 및 상관관계 확인
# Missing-value count per column before cleaning.
titanic.isna().sum()

# Keep only the rows where both `age` and `fare` are present, then recount.
titanic = titanic.dropna(subset=['age', 'fare'])
titanic.isna().sum()

# Pairwise correlations of the numeric columns, rounded to one decimal
# and drawn as an annotated blue-white-red heatmap.
numeric_corr = titanic.corr(numeric_only=True)
correlation_matrix = numeric_corr.round(1)
sns.heatmap(correlation_matrix, cmap='bwr', annot=True)

특성 선택 및 데이터 분할
- 선택할 특성은 'pclass', 'age', 'sibsp'(함께 탑승한 형제 또는 배우자 수), 'parch'(함께 탑승한 부모 또는 자녀 수), 'fare'(티켓 비용), 'gender'
from sklearn.model_selection import train_test_split
# Model inputs (X) and the target (y), split 80/20 into train and test
# sets with a fixed seed so the split is reproducible.
feature_columns = ['pclass', 'age', 'sibsp', 'parch', 'fare', 'gender']
X = titanic[feature_columns]
y = titanic['survived']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=13,
)
DecisionTree 학습 및 예상 데이터(디카프리오, 윈슬릿)로 생존 예측
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
# Fit a depth-limited decision tree on the training split and print its
# accuracy on the held-out test split.
dt = DecisionTreeClassifier(max_depth=4, random_state=13).fit(X_train, y_train)
pred = dt.predict(X_test)
test_accuracy = accuracy_score(y_test, pred)
print(test_accuracy)

import numpy as np
import pandas as pd

# Two made-up passengers, with values in the training column order:
# pclass, age, sibsp, parch, fare, gender.
dicaprio = np.array([[3, 18, 0, 0, 5, 1]])
winslet = np.array([[1, 16, 1, 1, 100, 0]])

# The tree was fitted on a DataFrame, so wrap each sample with the training
# column names before predicting. Passing a bare ndarray makes scikit-learn
# warn about missing feature names and silently relies on positional order.
for label, passenger in (('Dicaprio : ', dicaprio), ('Winslet : ', winslet)):
    labelled = pd.DataFrame(passenger, columns=X_train.columns)
    # [0, 1] is the first (only) sample's probability for the second class
    # (presumably survived == 1, given a 0/1 target; verify via dt.classes_).
    print(label, dt.predict_proba(labelled)[0, 1])
