📢 오늘 공부 목표
정말 디카프리오는 살 수 없었을까 ..🤨
#!pip install plotly_express
import pandas as pd
# Load the Titanic passenger records from the Excel workbook 'titanic.xls'
# (path is relative to the current working directory).
titanic = pd.read_excel('titanic.xls')
import matplotlib.pyplot as plt
import seaborn as sns

# Canvas with one row and two columns: pie chart on the left, count plot on
# the right.
f, ax = plt.subplots(1, 2, figsize=(18, 8))
pie_ax, count_ax = ax

# Share of each 'survived' value, drawn as a pie chart.
survival_counts = titanic['survived'].value_counts()
survival_counts.plot.pie(
    ax=pie_ax,           # draw on the first axes
    explode=[0, 0.05],   # pull the second wedge slightly outwards
    shadow=True,         # drop shadow under the pie
    autopct='%1.1f%%',   # print the percentage on each wedge
)
pie_ax.set_ylabel('')
pie_ax.set_title('Pie Plot - Survived')

# Absolute counts of each 'survived' value.
sns.countplot(x='survived', data=titanic, ax=count_ax)
count_ax.set_title('Count Plot - Survived')

plt.show()
# Survival status by sex.
# Canvas with one row and two columns: passenger counts per sex on the left,
# the same counts split by the 'survived' flag on the right.
f, ax = plt.subplots(1, 2, figsize=(18, 8))
left_ax, right_ax = ax

sns.countplot(x='sex', data=titanic, ax=left_ax)
left_ax.set_ylabel('')
left_ax.set_title('Count of Passengers of Sex')

sns.countplot(x='sex', hue='survived', data=titanic, ax=right_ax)
right_ax.set_title('Sex : Survived and Unsurvived')

plt.show()
📢 남성의 생존 가능성이 더 낮다.
# Contingency table of cabin class (rows) against survival (columns);
# margins=True appends the row/column totals.
pd.crosstab(
    index=titanic['pclass'],
    columns=titanic['survived'],
    margins=True,
)
📢 선실 등급이 높을수록 생존율이 높았다.
# Age histograms on a grid with one row per cabin class and one column per sex.
hist_options = {
    'alpha': .8,  # bar opacity
    'bins': 20,   # number of age bins
}
grid = sns.FacetGrid(titanic, row='pclass', col='sex', height=4, aspect=2)
grid.map(plt.hist, 'age', **hist_options)
grid.add_legend()
📢 3등실에 남자가 많았다.
import plotly.express as px

# Interactive histogram of the 'age' column.
fig = px.histogram(data_frame=titanic, x='age')
fig.show()
# Age histograms on a grid with one row per survival outcome and one column
# per cabin class.
grid = sns.FacetGrid(
    data=titanic,
    row='survived',
    col='pclass',
    height=4,
    aspect=1,
)
# 20 age bins, half-transparent bars.
grid.map(plt.hist, 'age', bins=20, alpha=.5)
grid.add_legend()
# Bucket ages into five labelled bands; include_lowest=True makes the first
# band include its left edge (age 0).
age_edges = [0, 7, 15, 30, 60, 100]
age_labels = ['baby', 'teen', 'young', 'adult', 'old']
titanic['age_category'] = pd.cut(
    titanic['age'],
    bins=age_edges,
    labels=age_labels,
    include_lowest=True,
)

# Three side-by-side bar plots (1 row x 3 columns) of the mean 'survived'
# value per cabin class, per age band and per sex.
plt.figure(figsize=(12, 4))
for position, column in enumerate(['pclass', 'age_category', 'sex'], start=1):
    plt.subplot(1, 3, position)
    sns.barplot(x=column, y='survived', data=titanic)
plt.subplots_adjust(top=1, bottom=0.1, left=0.1, right=1, hspace=0.5, wspace=0.5)
📢 어릴수록, 선실 등급이 높을수록, 여성일수록 생존율이 높다.
# Age distribution of survivors vs. non-survivors, one panel per sex.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 6))
women = titanic[titanic['sex'] == 'female']
men = titanic[titanic['sex'] == 'male']

# sns.distplot is deprecated; sns.histplot is its histogram replacement and
# plots counts by default, matching distplot(kde=False).
# alpha=0.4 keeps the two overlaid histograms see-through.
# Legend labels fixed from the misspelled 'survied' / 'not_survied'.
for passengers, ax, panel_title in ((women, axes[0], 'Female'),
                                    (men, axes[1], 'Male')):
    for outcome, label in ((1, 'survived'), (0, 'not_survived')):
        sns.histplot(
            passengers.loc[passengers['survived'] == outcome, 'age'],
            bins=20,
            label=label,
            alpha=0.4,
            kde=False,
            ax=ax,
        )
    ax.legend()
    ax.set_title(panel_title)
import re

# Pull the honorific out of each passenger name: the one or two words that sit
# between ", " and the following "." are captured (e.g. "Mr", "the Countess").
# Written as a raw string so the backslashes reach the regex engine unchanged
# (the non-raw form relies on invalid string escapes such as '\,' and '\s').
TITLE_PATTERN = r',\s(\w+(?:\s\w+)?)\.'

# Vectorised extraction of the first match per name; a name without a match
# yields NaN instead of raising on `None.group()`.
titanic['title'] = titanic['name'].str.extract(TITLE_PATTERN, expand=False)
titanic.head()
# Collapse spelling variants and rare honorifics into a few groups.
# NOTE(review): 'Mme' usually abbreviates "Madame"; it is mapped to 'Miss'
# here exactly as in the original analysis - confirm this is intended.
title_groups = {'Mlle': 'Miss', 'Mme': 'Miss', 'Ms': 'Miss'}

# Female nobility
Rare_f = ['Dona', 'Lady', 'the Countess']
# Male nobility
# NOTE(review): 'Master' is grouped with the male nobility titles - confirm.
Rare_m = ['Capt', 'Col', 'Don', 'Dr', 'Jonkheer',
          'Major', 'Master', 'Rev', 'Sir']

title_groups.update(dict.fromkeys(Rare_f, 'Rare_f'))
title_groups.update(dict.fromkeys(Rare_m, 'Rare_m'))

# Apply every exact-match substitution in a single pass.
titanic['title'] = titanic['title'].replace(title_groups)

# Mean of 'survived' per title group.
titanic[['title', 'survived']].groupby(['title'], as_index=False).mean()
📢 신분을 떠나 여성의 생존율이 더 높았다.
귀족 여성 > 일반 여성 > 귀족 남성 > 일반 남성
📌 머신 러닝을 위해서 컬럼은 모두 숫자로 바꿔야 함
from sklearn.preprocessing import LabelEncoder

# LabelEncoder maps text labels to integers. Fit it on the 'sex' column and
# store the encoded values in a new 'gender' column.
le = LabelEncoder()
titanic['gender'] = le.fit_transform(titanic['sex'])

# Keep only the rows where both 'age' and 'fare' are present.
titanic = titanic.dropna(subset=['age', 'fare'])
from sklearn.model_selection import train_test_split

# Feature matrix and target vector.
feature_columns = ['pclass', 'age', 'sibsp', 'parch', 'fare', 'gender']
X = titanic[feature_columns]
y = titanic['survived']

# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=13
)
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Train a depth-limited decision tree on the training split
# (fit() returns the fitted estimator itself).
dt = DecisionTreeClassifier(max_depth=4, random_state=13).fit(X_train, y_train)

# Predict on the held-out split.
pred = dt.predict(X_test)

# Fraction of test rows predicted correctly.
accuracy_score(y_true=y_test, y_pred=pred)
➡️0.7655502392344498
['pclass','age', 'sibsp', 'parch','fare','gender']
= 3등실, 18살, 동승한 형제자매·배우자 0명, 동승한 부모·자녀 0명, 요금 5, 남성(gender=1)
import numpy as np

# Hypothetical passenger, in the model's feature order
# ['pclass', 'age', 'sibsp', 'parch', 'fare', 'gender'].
# LabelEncoder sorts its classes, so 'female' -> 0 and 'male' -> 1.
dicaprio = np.array([[3, 18, 0, 0, 5, 1]])

# The tree was fitted on a DataFrame, so pass the row with the same column
# names; a bare ndarray makes recent scikit-learn versions warn that X has no
# valid feature names.
dicaprio_features = pd.DataFrame(dicaprio, columns=X.columns)

dt.predict_proba(dicaprio_features)
# Recorded output: array([[0.83271375, 0.16728625]])
# Columns follow dt.classes_ (sorted labels):
# [P(survived == 0), P(survived == 1)], i.e. NOT surviving comes first.

dt.predict_proba(dicaprio_features)[0, 1]
# Recorded output: 0.16728624535315986 -> probability of survival
['pclass','age', 'sibsp', 'parch','fare','gender']
= 1등실, 16살, 동승한 형제자매·배우자 1명, 동승한 부모·자녀 1명, 요금 100달러, 여성(gender=0)
# Hypothetical passenger, in the model's feature order
# ['pclass', 'age', 'sibsp', 'parch', 'fare', 'gender'].
winslet = np.array([[1, 16, 1, 1, 100, 0]])

# Pass the row with the training column names so scikit-learn does not warn
# about missing feature names (the tree was fitted on a DataFrame).
winslet_features = pd.DataFrame(winslet, columns=X.columns)

# Column 1 of predict_proba is the probability of class 1 (survived).
dt.predict_proba(winslet_features)[0, 1]
# Recorded output: 1.0 -> probability of survival