titanic_url = "https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/titanic.xls"
titanic = pd.read_excel(titanic_url)
titanic.head()

f, ax = plt.subplots(1, 2, figsize=(16, 8))
titanic['survived'].value_counts().plot.pie(ax=ax[0], autopct='%1.1f%%', shadow=True, explode=[0, 0.05])
ax[0].set_title('Pie plot - survived')
ax[0].set_ylabel('')
sns.countplot(x='survived', data=titanic, ax=ax[1])
ax[1].set_title('count plot - survived')
plt.show()

f, ax = plt.subplots(1, 2, figsize=(16, 8))
sns.countplot(x='sex', data=titanic, ax=ax[0])
ax[0].set_title('Count of passengers by sex')
ax[0].set_ylabel('')
sns.countplot(x='sex', hue='survived', data=titanic, ax=ax[1])
ax[1].set_title('sex : survived')
plt.show()

pd.crosstab(titanic['pclass'], titanic['survived'], margins=True)
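
# A follow-up sketch (not in the original walkthrough): normalizing the same
# crosstab by row shows the survival rate within each passenger class directly.
pd.crosstab(titanic['pclass'], titanic['survived'], normalize='index')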

grid = sns.FacetGrid(titanic, row='pclass', col='sex', height=4, aspect=2)
grid.map(plt.hist, 'age', alpha=0.8, bins=20)
grid.add_legend()

import plotly.express as px
fig = px.histogram(titanic, x='age')
fig.show()

grid = sns.FacetGrid(titanic, row='pclass', col='survived', height=4, aspect=2)
grid.map(plt.hist, 'age', alpha=0.5, bins=20)
grid.add_legend();

titanic['age_cat'] = pd.cut(titanic['age'], bins=[0, 7, 15, 30, 60, 100],
                            include_lowest=True,
                            labels=['baby', 'teen', 'young', 'adult', 'old'])
titanic.head()
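
# A quick numeric check (a sketch): survival rate within each age bin,
# mirroring what the bar plot below visualizes.
titanic.groupby('age_cat', observed=True)['survived'].mean()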

plt.figure(figsize=(14, 6))
plt.subplot(131)
sns.barplot(x='pclass', y='survived', data=titanic)
plt.subplot(132)
sns.barplot(x='age_cat', y='survived', data=titanic)
plt.subplot(133)
sns.barplot(x='sex', y='survived', data=titanic)
plt.show();

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 6))
women = titanic[titanic['sex'] == 'female']
men = titanic[titanic['sex'] == 'male']
ax = sns.histplot(women[women['survived'] == 1]['age'], bins=20,
                  label='survived', ax=axes[0], kde=False)
ax = sns.histplot(women[women['survived'] == 0]['age'], bins=40,
                  label='not_survived', ax=axes[0], kde=False)
ax.legend(); ax.set_title('Female')
ax = sns.histplot(men[men['survived'] == 1]['age'], bins=20,
                  label='survived', ax=axes[1], kde=False)
ax = sns.histplot(men[men['survived'] == 0]['age'], bins=40,
                  label='not_survived', ax=axes[1], kde=False)
ax.legend(); ax.set_title('Male')
plt.show()

for idx, dataset in titanic.iterrows():
    print(dataset['name'])

import re

title = []
for idx, dataset in titanic.iterrows():
    tmp = dataset['name']
    title.append(re.search(r',\s\w+(\s\w+)?\.', tmp).group()[2:-1])
titanic['title'] = title
titanic.head()
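
# An equivalent vectorized alternative (a sketch; titles_vec is a name introduced
# here, not part of the original walkthrough): str.extract pulls the same title
# pattern out of the name column without looping over rows with iterrows.
titles_vec = titanic['name'].str.extract(r',\s(\w+(?:\s\w+)?)\.', expand=False)
print((titles_vec == titanic['title']).all())  # expected True if both patterns agree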

pd.crosstab(titanic['title'], titanic['sex'])

titanic['title'].unique()
# output: array(['Miss', 'Master', 'Mr', 'Mrs', 'Col', 'Mme', 'Dr', 'Major', 'Capt',
#                'Lady', 'Sir', 'Mlle', 'Dona', 'Jonkheer', 'the Countess', 'Don',
#                'Rev', 'Ms'], dtype=object)
titanic['title'] = titanic['title'].replace('Mlle', 'Miss')
titanic['title'] = titanic['title'].replace('Ms', 'Miss')
titanic['title'] = titanic['title'].replace('Mme', 'Mrs')
Rare_f = ['Dona', 'Lady', 'the Countess']
Rare_m = ['Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Master']
for each in Rare_f:
    titanic['title'] = titanic['title'].replace(each, 'Rare_f')
for each in Rare_m:
    titanic['title'] = titanic['title'].replace(each, 'Rare_m')
titanic['title'].unique()
# output: array(['Miss', 'Rare_m', 'Mr', 'Mrs', 'Rare_f'], dtype=object)
titanic[['title', 'survived']].groupby(['title'], as_index=False).mean()


from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(titanic['sex'])
le.classes_
# output: array(['female', 'male'], dtype=object)
titanic['gender'] = le.transform(titanic['sex'])
titanic.head()
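
# Optional sanity check (a sketch): LabelEncoder assigns integer codes in sorted
# order of the classes, and inverse_transform maps the codes back to the labels.
print(dict(zip(le.classes_, le.transform(le.classes_))))  # female -> 0, male -> 1
print(le.inverse_transform([0, 1]))                       # back to ['female' 'male']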

titanic = titanic[titanic['age'].notnull()]
titanic = titanic[titanic['fare'].notnull()]
titanic.info()

from sklearn.model_selection import train_test_split
X = titanic[['pclass', 'age', 'sibsp', 'parch', 'fare', 'gender']]
y = titanic['survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
dt = DecisionTreeClassifier(max_depth=4, random_state=13)
dt.fit(X_train, y_train)
pred = dt.predict(X_test)
print(accuracy_score(y_test, pred))
# output: 0.7655502392344498
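
# A quick robustness check (a sketch, not part of the original walkthrough):
# 5-fold cross-validation on the same features reports a spread of accuracies
# instead of a single train/test split score.
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(DecisionTreeClassifier(max_depth=4, random_state=13), X, y, cv=5)
print(cv_scores.mean(), cv_scores.std())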
# DiCaprio: pclass 3, age 18, sibsp 0, parch 0, fare 5, gender 1 (male)
dicaprio = np.array([[3, 18, 0, 0, 5, 1]])
print('dicaprio : ', dt.predict_proba(dicaprio)[0, 1])
# output: dicaprio :  0.16728624535315986
# Winslet: pclass 1, age 16, sibsp 1, parch 1, fare 100, gender 0 (female)
winslet = np.array([[1, 16, 1, 1, 100, 0]])
print('winslet : ', dt.predict_proba(winslet)[0, 1])
# output: winslet :  1.0
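
# Because the tree was fit on a DataFrame, recent scikit-learn versions warn when
# predict_proba receives a bare NumPy array. A minimal sketch that passes the same
# values with matching feature names instead (winslet_df is a new name used here):
winslet_df = pd.DataFrame([[1, 16, 1, 1, 100, 0]], columns=X.columns)
print('winslet : ', dt.predict_proba(winslet_df)[0, 1])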
"이 글은 제로베이스 데이터 취업 스쿨의 강의 자료 일부를 발췌하여 작성되었습니다.”