# Survival overview: share of each 'survived' value (pie) beside the raw counts (bars).
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(18, 8))
pie_ax, count_ax = ax

survival_counts = titanic['survived'].value_counts()
# explode offsets the second wedge slightly; autopct prints each share with one decimal.
survival_counts.plot.pie(ax=pie_ax, explode=[0, 0.05], shadow=True, autopct='%.1f%%')
pie_ax.set_title('Pie plot - survived')
pie_ax.set_ylabel('')  # blank out the y-axis label on the pie

sns.countplot(data=titanic, x='survived', ax=count_ax)
count_ax.set_title('Count plot - survived');
# Passenger counts by sex (left) and the same counts split by 'survived' (right).
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(18, 8))
totals_ax, split_ax = ax

sns.countplot(data=titanic, x='sex', ax=totals_ax)
totals_ax.set_title('Count of passengers of sex')
totals_ax.set_ylabel('')

sns.countplot(data=titanic, x='sex', hue='survived', ax=split_ax)
split_ax.set_title('Sex : Survived')

plt.show()
# Cross-tabulate 'pclass' against 'survived'; margins=True adds the row/column totals.
pd.crosstab(index=titanic['pclass'], columns=titanic['survived'], margins=True)
# Grid of age histograms: one row per 'pclass' value, one column per 'sex' value.
hist_options = {'alpha': 0.8, 'bins': 20}
grid = sns.FacetGrid(data=titanic, row='pclass', col='sex', height=4, aspect=2)
grid.map(plt.hist, 'age', **hist_options)
grid.add_legend();
# What is FacetGrid? (Reference: https://velog.io/@qw2397/ML)
# Observation: not many women appear among the first-class passengers.
import plotly.express as px

# Interactive histogram of the 'age' column; note this rebinds `fig` to a plotly figure.
fig = px.histogram(data_frame=titanic, x='age')
fig.show()
# Grid of age histograms: one row per 'pclass' value, one column per 'survived' value.
survival_hist_options = {'alpha': 0.8, 'bins': 20}
grid = sns.FacetGrid(data=titanic, row='pclass', col='survived', height=4, aspect=2)
grid.map(plt.hist, 'age', **survival_hist_options)
grid.add_legend();
# Bucket 'age' into five labelled ranges. include_lowest=True makes the first
# bin closed on the left so that an age of exactly 0 is still categorised.
age_edges = [0, 7, 15, 30, 60, 100]
age_labels = ['baby', 'teen', 'young', 'adult', 'old']
titanic['age_cat'] = pd.cut(
    titanic['age'],
    bins=age_edges,
    labels=age_labels,
    include_lowest=True,
)
titanic.head()
# Mean of 'survived' per category, drawn side by side in a 1x3 layout
# for 'pclass', 'age_cat' and 'sex'.
plt.figure(figsize=(12, 4))
for position, column in enumerate(['pclass', 'age_cat', 'sex'], start=1):
    plt.subplot(1, 3, position)
    sns.barplot(data=titanic, x=column, y='survived')
# Age histograms of survivors vs. non-survivors, one panel per sex.
#
# sns.distplot is deprecated in seaborn and scheduled for removal; histplot is
# its documented replacement for the histogram-only (kde=False) case. The two
# panels were copy-pasted in the original, so they are drawn in one loop here.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 6))
women = titanic[titanic['sex'] == 'female']
men = titanic[titanic['sex'] == 'male']

for ax, passengers, panel_title in zip(axes, (women, men), ('female', 'Male')):
    survived_age = passengers.loc[passengers['survived'] == 1, 'age']
    not_survived_age = passengers.loc[passengers['survived'] == 0, 'age']
    # Semi-transparent bars keep both overlaid distributions visible.
    sns.histplot(survived_age, bins=20, label='survived', ax=ax, alpha=0.4)
    sns.histplot(not_survived_age, bins=40, label='not_survived', ax=ax, alpha=0.4)
    ax.legend()
    ax.set_title(panel_title)

plt.show()
import re

# Pull the honorific out of 'name': the one or two words that sit between
# ", " and the following "." (this also captures two-word forms).
#
# Changes from the original loop:
#   * the pattern is a raw string - the non-raw version relied on invalid
#     escape sequences (\, \s \w \.), which newer Python versions warn about;
#   * a capture group replaces the manual [2:-1] slice of the whole match;
#   * str.extract is vectorised and yields NaN for a name that does not match,
#     where re.search(...).group() raised AttributeError.
TITLE_PATTERN = r',\s(\w+(?:\s\w+)?)\.'

title = titanic['name'].str.extract(TITLE_PATTERN, expand=False)
titanic['title'] = title
titanic.head()
# Collapse spelling variants and infrequent honorifics into a few groups,
# then show the mean of 'survived' for each resulting title.
Rare_f = ['Dona', 'Dr', 'Lady', 'the Countess']
Rare_m = ['Capt', 'Col', 'Don', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Master']

# Single lookup table: variant spellings first, then both "rare" buckets.
title_aliases = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
title_aliases.update(dict.fromkeys(Rare_f, 'Rare_f'))
title_aliases.update(dict.fromkeys(Rare_m, 'Rare_m'))

# replace() matches whole values, so 'Don' does not touch 'Dona'.
titanic['title'] = titanic['title'].replace(title_aliases)
titanic[['title', 'survived']].groupby(['title'], as_index=False).mean()
from sklearn.preprocessing import LabelEncoder

# Encode the string 'sex' column as integer codes in a new 'gender' column.
# `le` stays fitted so the mapping can be inspected or inverted later.
le = LabelEncoder()
titanic['gender'] = le.fit_transform(titanic['sex'])
titanic.head()
# Keep only rows where both 'age' and 'fare' are known.
titanic = titanic.dropna(subset=['age', 'fare'])

# Correlate numeric (and boolean) columns only. Calling corr() on the whole
# frame raises on pandas >= 2.0 because of the string columns (name, sex,
# title, ...); selecting the dtypes explicitly reproduces the older implicit
# numeric-only behaviour and works on every pandas version.
numeric_columns = titanic.select_dtypes(include=['number', 'bool'])
correlation_matirx = numeric_columns.corr().round(1)
sns.heatmap(correlation_matirx, annot=True, cmap='bwr')
from sklearn.model_selection import train_test_split

# Feature matrix and target; 20% of the rows are held out for testing and the
# seed is fixed so the split is reproducible.
feature_columns = ['pclass', 'age', 'sibsp', 'parch', 'fare', 'gender']
X = titanic[feature_columns]
y = titanic['survived']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=13
)
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

# Fit a depth-limited decision tree on the training split and print its
# accuracy on the held-out split.
clf = DecisionTreeClassifier(max_depth=4, random_state=13).fit(X_train, y_train)
pred = clf.predict(X_test)
test_accuracy = accuracy_score(y_test, pred)
print(test_accuracy)
# One hypothetical passenger, in the same column order as X:
# pclass=3, age=18, sibsp=0, parch=0, fare=5, gender=1.
# NOTE(review): LabelEncoder numbers classes in sorted order, so gender=1 is
# presumably 'male' - confirm against le.classes_.
persona = np.array([[3, 18, 0, 0, 5, 1]])

# The classifier was fitted on a DataFrame, so predict on one with the same
# column names: this avoids scikit-learn's "X does not have valid feature
# names" warning and makes the value-to-column mapping explicit.
persona_frame = pd.DataFrame(persona, columns=X.columns)

# Probability of the positive class (survived == 1) for this passenger.
clf.predict_proba(persona_frame)[0, 1]