Titanic - Machine Learning from Disaster
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv
train = pd.read_csv('/kaggle/input/titanic/train.csv')
train.head()
| | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
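The preview already shows NaN in Age and Cabin. A quick sketch to count missing values per column, which motivates the NaN-free feature set chosen later:

```python
# Count missing values per column; Age, Cabin, and Embarked have gaps
print(train.isnull().sum())
```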
train.columns
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
dtype='object')
test = pd.read_csv('/kaggle/input/titanic/test.csv')
test.head()
| | PassengerId | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 0 | 892 | 3 | Kelly, Mr. James | male | 34.5 | 0 | 0 | 330911 | 7.8292 | NaN | Q |
| 1 | 893 | 3 | Wilkes, Mrs. James (Ellen Needs) | female | 47.0 | 1 | 0 | 363272 | 7.0000 | NaN | S |
| 2 | 894 | 2 | Myles, Mr. Thomas Francis | male | 62.0 | 0 | 0 | 240276 | 9.6875 | NaN | Q |
| 3 | 895 | 3 | Wirz, Mr. Albert | male | 27.0 | 0 | 0 | 315154 | 8.6625 | NaN | S |
| 4 | 896 | 3 | Hirvonen, Mrs. Alexander (Helga E Lindqvist) | female | 22.0 | 1 | 1 | 3101298 | 12.2875 | NaN | S |
test.columns
Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
'Ticket', 'Fare', 'Cabin', 'Embarked'],
dtype='object')
women = train.loc[train["Sex"] == "female"]["Survived"]
women
1 1
2 1
3 1
8 1
9 1
..
880 1
882 0
885 0
887 1
888 0
Name: Survived, Length: 314, dtype: int64
# Since Survived holds only 1s and 0s, sum can be used instead of a count
# to get the percentage who survived
rate = (sum(women) / len(women)) * 100
rate
74.20382165605095
men = train.loc[train["Sex"] == "male"]["Survived"]
men
0 0
4 0
5 0
6 0
7 0
..
883 0
884 0
886 0
889 1
890 0
Name: Survived, Length: 577, dtype: int64
rate_men = (sum(men) / len(men)) * 100
rate_men
18.890814558058924
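Since Survived holds only 0s and 1s, its mean is exactly the survival rate, so both numbers above can be read off in one line. A sketch equivalent to the two computations:

```python
# Mean of a 0/1 column = fraction of survivors, per sex
print(train.groupby("Sex")["Survived"].mean() * 100)
```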
Build decision trees over the features.
# RandomForestClassifier: an ensemble of decision trees whose votes are combined into one prediction
from sklearn.ensemble import RandomForestClassifier
y = train["Survived"]
y
0 0
1 1
2 1
3 1
4 0
..
886 0
887 1
888 0
889 1
890 0
Name: Survived, Length: 891, dtype: int64
train.columns
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
dtype='object')
# Pick out the feature columns to train on
# features = ["Pclass","Sex","SibSp","Parch"]
features = "Pclass, Sex, SibSp, Parch".split(", ")
features
['Pclass', 'Sex', 'SibSp', 'Parch']
# One-hot encode the selected columns of train (Sex expands to Sex_female/Sex_male)
X = pd.get_dummies(train[features])
X
| | Pclass | SibSp | Parch | Sex_female | Sex_male |
| --- | --- | --- | --- | --- | --- |
| 0 | 3 | 1 | 0 | False | True |
| 1 | 1 | 1 | 0 | True | False |
| 2 | 3 | 0 | 0 | True | False |
| 3 | 1 | 1 | 0 | True | False |
| 4 | 3 | 0 | 0 | False | True |
| ... | ... | ... | ... | ... | ... |
| 886 | 2 | 0 | 0 | False | True |
| 887 | 1 | 0 | 0 | True | False |
| 888 | 3 | 1 | 2 | True | False |
| 889 | 1 | 0 | 0 | False | True |
| 890 | 3 | 0 | 0 | False | True |

891 rows × 5 columns
X_test = pd.get_dummies(test[features])
X_test
| | Pclass | SibSp | Parch | Sex_female | Sex_male |
| --- | --- | --- | --- | --- | --- |
| 0 | 3 | 0 | 0 | False | True |
| 1 | 3 | 1 | 0 | True | False |
| 2 | 2 | 0 | 0 | False | True |
| 3 | 3 | 0 | 0 | False | True |
| 4 | 3 | 1 | 1 | True | False |
| ... | ... | ... | ... | ... | ... |
| 413 | 3 | 0 | 0 | False | True |
| 414 | 1 | 0 | 0 | True | False |
| 415 | 3 | 0 | 0 | False | True |
| 416 | 3 | 0 | 0 | False | True |
| 417 | 3 | 1 | 1 | False | True |

418 rows × 5 columns
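Note that `pd.get_dummies` encodes train and test independently, so if a category appeared in only one split the two column sets would differ. A defensive sketch (not strictly needed here, since both splits contain the same categories):

```python
# Align test columns to the training columns; any column missing from
# test would be added and filled with 0
X_test = X_test.reindex(columns=X.columns, fill_value=0)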
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
# max_depth matches the number of encoded feature columns, so a single path
# down a tree probably never needs to split on the same feature twice
model
RandomForestClassifier(max_depth=5, random_state=1)
model.__dict__
{'estimator': DecisionTreeClassifier(),
'n_estimators': 100,
'estimator_params': ('criterion',
'max_depth',
'min_samples_split',
'min_samples_leaf',
'min_weight_fraction_leaf',
'max_features',
'max_leaf_nodes',
'min_impurity_decrease',
'random_state',
'ccp_alpha'),
'base_estimator': 'deprecated',
'bootstrap': True,
'oob_score': False,
'n_jobs': None,
'random_state': 1,
'verbose': 0,
'warm_start': False,
'class_weight': None,
'max_samples': None,
'criterion': 'gini',
'max_depth': 5,
'min_samples_split': 2,
'min_samples_leaf': 1,
'min_weight_fraction_leaf': 0.0,
'max_features': 'sqrt',
'max_leaf_nodes': None,
'min_impurity_decrease': 0.0,
'ccp_alpha': 0.0}
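`__dict__` dumps internals; the supported way to read an estimator's hyperparameters is `get_params()` (a minimal sketch):

```python
# Same hyperparameters via sklearn's public API
print(model.get_params())
```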
# Fit the forest on the training data
model.fit(X, y)
RandomForestClassifier(max_depth=5, random_state=1)
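Kaggle hides the test labels, so any accuracy estimate has to come from the training data. A minimal sketch using 5-fold cross-validation (scores will vary with the splits):

```python
from sklearn.model_selection import cross_val_score

# Mean accuracy over 5 train/validation splits of the training data
scores = cross_val_score(model, X, y, cv=5)
print(scores.mean(), scores.std())
```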
# Test how well it does: predict on the test set
predicts = model.predict(X_test)
predicts  # predicted values: 1 = survived, 0 = did not survive
array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0,
0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0,
1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0])
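To see which columns the forest leaned on, the fitted model exposes `feature_importances_`; a short sketch (exact values depend on the fit):

```python
# Impurity-based importance per encoded feature, largest first
importances = pd.Series(model.feature_importances_, index=X.columns)
print(importances.sort_values(ascending=False))
```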
output = pd.DataFrame({"PassengerId": test["PassengerId"],"Survived": predicts})
output
| | PassengerId | Survived |
| --- | --- | --- |
| 0 | 892 | 0 |
| 1 | 893 | 1 |
| 2 | 894 | 0 |
| 3 | 895 | 0 |
| 4 | 896 | 1 |
| ... | ... | ... |
| 413 | 1305 | 0 |
| 414 | 1306 | 1 |
| 415 | 1307 | 0 |
| 416 | 1308 | 0 |
| 417 | 1309 | 0 |

418 rows × 2 columns
output.to_csv("submit.csv", index=False)
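Before uploading, a quick sanity check (a sketch) that the file matches the expected submission format of 418 rows with columns PassengerId and Survived:

```python
# Re-read the submission and verify its shape and header
check = pd.read_csv("submit.csv")
assert check.shape == (418, 2)
assert list(check.columns) == ["PassengerId", "Survived"]
```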