1. 타이타닉 탑승자 데이터 가져오기
# Load the Titanic passenger dataset (an Excel workbook fetched over HTTPS)
# into the `titanic` DataFrame used by the rest of the script.
import pandas as pd

titanic_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/titanic.xls'
titanic = pd.read_excel(io=titanic_url)
![](https://velog.velcdn.com/images/rkdgg0/post/6a24aa8d-a0c6-4f3c-8c2e-c3f073e3e68e/image.png)
2. 탐색적 데이터 분석 (EDA)
# Share of survivors vs. non-survivors as a pie chart.
# 'survived' encoding: 1 = survived, 0 = died.
import matplotlib.pyplot as plt
import seaborn as sns

survival_counts = titanic['survived'].value_counts()
survival_counts.plot.pie(
    autopct='%1.1f%%',
    shadow=True,
    explode=[0, 0.05],  # offset the second wedge slightly from the centre
)
![](https://velog.velcdn.com/images/rkdgg0/post/6d99c009-5969-4bec-ab5a-5d72c32b0286/image.png)
# Survival by sex: passenger counts per sex (left panel) next to the same
# counts split by the 'survived' flag (right panel).
f, ax = plt.subplots(nrows=1, ncols=2, figsize=(16, 8))

sns.countplot(data=titanic, x='sex', ax=ax[0])
ax[0].set(title='count of passengers of sex', ylabel='')

sns.countplot(data=titanic, x='sex', hue='survived', ax=ax[1])
ax[1].set(title='sex : survived')
# -> Men were less likely to survive.
![](https://velog.velcdn.com/images/rkdgg0/post/25cfb86e-b06c-4189-b266-70cb12eb1363/image.png)
# Survival by cabin class (used here as a proxy for economic status).
pd.crosstab(
    index=titanic['pclass'],
    columns=titanic['survived'],
    margins=True,  # also report row/column totals
)
# -> First-class passengers were more likely to survive, and so were women.
# -> So it is worth checking whether first class carried many women.
![](https://velog.velcdn.com/images/rkdgg0/post/a7426774-243f-41cf-9c20-b08fdc935a65/image.png)
# Age histograms for every (cabin class, sex) combination.
grid = sns.FacetGrid(
    data=titanic,
    row='pclass',
    col='sex',
    height=4,
    aspect=2,
)
grid.map(plt.hist, 'age', bins=20, alpha=0.8)
grid.add_legend();
# -> Third class is mostly men, especially men in their twenties.
![](https://velog.velcdn.com/images/rkdgg0/post/0a92ad42-6ad2-41fd-8c21-b642fd480cbf/image.png)
# Interactive histogram of the passengers' ages.
import plotly.express as px

fig = px.histogram(data_frame=titanic, x='age')
fig.show()
![](https://velog.velcdn.com/images/rkdgg0/post/307dc671-fbd9-46b3-9bcd-e89f86677331/image.png)
# Age histograms per cabin class, split by survival outcome.
# Columns are faceted on 'survived' (one panel per outcome for each class);
# faceting on the continuous 'age' column would instead create a separate
# panel for every distinct age value.
grid = sns.FacetGrid(titanic, row='pclass', col='survived', height=4, aspect=2)
grid.map(plt.hist, 'age', alpha=0.5, bins=20)
grid.add_legend();
![](https://velog.velcdn.com/images/rkdgg0/post/5235b03f-1916-4271-a100-50bf953e14e6/image.png)
# Bucket ages into five labelled categories, stored in a new 'age_cat' column.
age_bin_edges = [0, 7, 15, 30, 60, 100]
age_bin_labels = ['baby', 'teen', 'young', 'adult', 'old']
titanic['age_cat'] = pd.cut(titanic['age'], bins=age_bin_edges, labels=age_bin_labels)
# Survival by cabin class, age category and sex, drawn side by side in one
# figure (one bar chart per grouping column).
plt.figure(figsize=(14, 6))
for position, column in enumerate(['pclass', 'age_cat', 'sex'], start=1):
    plt.subplot(1, 3, position)
    sns.barplot(data=titanic, x=column, y='survived')
plt.show()
![](https://velog.velcdn.com/images/rkdgg0/post/9e13c7a0-5e2f-4512-b228-b5221f81c560/image.png)
# Age distribution of survivors vs. non-survivors, women on the left panel
# and men on the right.
# sns.histplot is used instead of sns.distplot(..., kde=False): distplot is
# deprecated in seaborn, and histplot draws the same count histogram.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 6))
women = titanic[titanic['sex'] == 'female']
men = titanic[titanic['sex'] == 'male']

for panel, panel_title, group in zip(axes, ['female', 'male'], [women, men]):
    # Two semi-transparent histograms overlaid on the same axes.
    ax = sns.histplot(group[group['survived'] == 1]['age'], bins=20,
                      label='survived', ax=panel, alpha=0.4)
    ax = sns.histplot(group[group['survived'] == 0]['age'], bins=40,
                      label='not survived', ax=panel, alpha=0.4)
    ax.legend()
    ax.set_title(panel_title)
![](https://velog.velcdn.com/images/rkdgg0/post/01a82a64-bd41-4b33-aad0-e8806365db63/image.png)
# A passenger's social title can be read from the 'name' column: the pattern
# captures the one or two words that sit between ", " and the following ".".
import re

# Raw string so that \s, \w and \. are regex escapes rather than (invalid)
# Python string escapes. Compiled once instead of once per row.
title_pattern = re.compile(r',\s(\w+(?:\s\w+)?)\.')

# Like the row-by-row version, this raises AttributeError if a name does not
# match the pattern.
title = [title_pattern.search(name).group(1) for name in titanic['name']]
titanic['title'] = title

pd.crosstab(titanic['title'], titanic['sex'])
![](https://velog.velcdn.com/images/rkdgg0/post/8edcb25d-99e7-44f7-a189-1ed35f0ce928/image.png)
# Tidy up the titles: fold alternate forms into Miss/Mrs, then collapse the
# uncommon titles into one label per list.
titanic['title'] = titanic['title'].replace('Mlle', 'Miss')
titanic['title'] = titanic['title'].replace('Ms', 'Miss')
titanic['title'] = titanic['title'].replace('Mme', 'Mrs')

Rare_f = ['Dona', 'Lady', 'the Countess']
Rare_m = ['Capt', 'Col', 'Don', 'Major', 'Rev', 'Sir', 'Dr', 'Master', 'Jonkheer']
# Every title in a list is replaced by that list's group label.
titanic['title'] = titanic['title'].replace(Rare_f, 'Rare_f')
titanic['title'] = titanic['title'].replace(Rare_m, 'Rare_m')

# Survival rate per title.
titanic[['title', 'survived']].groupby(['title'], as_index=False).mean()
![](https://velog.velcdn.com/images/rkdgg0/post/8ebf3020-bf69-4c92-af35-1dba34c564e8/image.png)
3. 머신러닝을 이용한 생존자 예측
# Encode the 'sex' column as an integer 'gender' column for the model
# (male -> 1, female -> 0).
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
titanic['gender'] = le.fit_transform(titanic['sex'])
# Drop rows with a missing 'age' or 'fare' before modelling.
titanic = titanic.dropna(subset=['age', 'fare'])
# Select the feature columns and the target, then split into train/test sets.
from sklearn.model_selection import train_test_split

X = titanic[['pclass', 'age', 'sibsp', 'parch', 'fare', 'gender']]
y = titanic['survived']

# Hold out 20% of the rows for testing and train on the remaining 80%.
# (test_size=0.8 would train on only a fifth of the data.)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=13
)
1) Decision Tree
# Fit a depth-limited decision tree on the training split and print its
# accuracy on the test split.
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(random_state=13, max_depth=4)
dt.fit(X_train, y_train)

pred = dt.predict(X_test)
test_accuracy = accuracy_score(y_test, pred)
print(test_accuracy)
# Recorded output -> 0.7655502392344498
2) 생존율 확인 (가상의 인물을 설정하여 생존율 구하기)
# Predicted survival probability for two hypothetical passengers.
import numpy as np

# Feature order expected by the tree:
# ['pclass', 'age', 'sibsp', 'parch', 'fare', 'gender']
feature_names = ['pclass', 'age', 'sibsp', 'parch', 'fare', 'gender']

# The tree was fitted on a DataFrame, so each sample is wrapped in a DataFrame
# with the same column names; passing a bare ndarray makes scikit-learn warn
# that X does not have valid feature names.
dicaprio = np.array([[3, 18, 0, 0, 5, 1]])
dicaprio_features = pd.DataFrame(dicaprio, columns=feature_names)
# Column 1 of predict_proba is the probability of class 1 (survived).
print('Dicaprio : ', dt.predict_proba(dicaprio_features)[0, 1])
# Recorded output -> Dicaprio survival probability: 0.22950819672131148

winslet = np.array([[1, 16, 1, 1, 100, 0]])
winslet_features = pd.DataFrame(winslet, columns=feature_names)
print('Winslet : ', dt.predict_proba(winslet_features)[0, 1])
# Recorded output -> Winslet survival probability: 1.0