Machine Learning - Practice Titanic

화이티 ·2023년 12월 17일

Machine Learning

목록 보기

6/23

🚤Titanic 실습

1. 문제정의
2. 데이터 수집
3. 데이터 전처리
 - PassengerId 삭제
 - Embarked 결측치 채우기
 - Fare 결측치 채우기
 - Age 결측치 채우기
 - Cabin 결측치 채우기
4. 탐색적 데이터 분석
 - Cabin 시각화
 - Pclass 시각화
 - Cabin과 Pclass 시각화
 - Sex 시각화
 - Embarked 시각화
 - Sex, Age, Survived 시각화
 - Sex, Fare, Survived 시각화

# 타입 변경 : Name, Sex, Ticket, Cabin, Embarked
5. 모델 선택 및 hyper parameter
* knn, decision tree모델 사용하기
* 새로운 모델 사용해보기(ensemble계열 모델) > randomForest, AdaBoost
6. 학습
7. 평가 및 예측

1. 문제정의

titanic 데이터를 사용해서 생존자 와 사망자 예측해보기

2. 데이터 수집

kaggle train,test데이터 download
import library

# 필요한 library 불러오기
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier  # random-forest model used in the modelling steps
from sklearn.metrics import accuracy_score #평가를 진행할 때 정확도 측정
from sklearn.model_selection import cross_val_score  # k-fold cross-validation scoring
from sklearn.model_selection import train_test_split #훈련과 테스트용 셋트 분리
from sklearn.neighbors import KNeighborsClassifier  # k-nearest-neighbours model
from sklearn.tree import DecisionTreeClassifier #결정트리모델 가져오기

#1. data를 로드하기
train = pd.read_csv('./data/Titanic/train.csv')
test = pd.read_csv('./data/Titanic/test.csv')
print(train.shape)
print(test.shape)

3. data 전처리

결측치 처리
이상치 처리

# 결측치:age, cabin, embarked
# type 변경: name, sex, ticket, cabin, embarked
train.info()

# 결측치:age, cabin, fare
# type 변경: name, sex, ticket, cabin, embarked
test.info()

# 정답 컬럼 분리
y_train = train['Survived']

passengerID삭제

도움이 안될거같아

# PassengerId is only a row identifier, so remove it from both frames.
# Dropping it from train alone leaves a PassengerId column that is NaN for
# every training row once train and test are concatenated for encoding.
train.drop('PassengerId', axis=1, inplace=True)
test.drop('PassengerId', axis=1, inplace=True)

Embarked 결측치 채우기

탑승한 항구(Embarked)의 결측치 채우기

train['Embarked'].value_counts()

# 최빈값으로 결축치 채우기
train['Embarked'] = train['Embarked'].fillna('S')

train.info()

**Fare결측치 채우기**

test['Fare'].describe()

# Fill the missing Fare with 14.45 (value read off the describe() output
# above -- presumably the median; confirm).
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: the chained in-place form is deprecated in pandas and is not
# guaranteed to modify `test` itself.
test['Fare'] = test['Fare'].fillna(14.45)

test.info()

Age결측치 채우기

단순 통계치가 아니라 다른 컬럼 간의 상관관계를 이용해보자

# Check the correlation between columns (Pearson correlation coefficient),
# a numeric measure of the linear relationship between two variables:
#   - values that rise together      -> positive, up to +1
#   - one rises while the other falls -> negative, down to -1
#   - close to 0                      -> little or no linear relationship
# numeric_only=True restricts the computation to numeric columns; train still
# holds text columns (Name, Sex, Ticket, ...) at this point, and pandas >= 2.0
# raises on them instead of silently skipping them.
train.corr(numeric_only=True)

# Lookup table: median Age for every (Pclass, Sex) combination.
# A row whose Age is missing is filled from this table using the row's own
# Pclass and Sex; a row that already has an Age keeps it unchanged.
age_table = train.groupby(['Pclass', 'Sex'])[['Age']].median()

# apply
# 행이나 열 단위로 복잡한 작업(커스텀 함수)을 할 떄 사용
# 결측치 채우는 함수 만들기
def fill_age(data, table=None):
    """Return the Age to use for a single passenger row.

    Intended for ``DataFrame.apply(fill_age, axis=1)``.

    Args:
        data: One row (one passenger) exposing 'Age', 'Pclass' and 'Sex'.
        table: Optional lookup table indexed by (Pclass, Sex) with an 'Age'
            column. Defaults to the module-level ``age_table``.

    Returns:
        The row's own Age when it is present; otherwise the table's Age for
        the row's (Pclass, Sex) pair.
    """
    if table is None:
        table = age_table
    # pd.isna also handles None / non-float values, where np.isnan raises.
    if pd.isna(data['Age']):
        # Label-based lookup of the 'Age' cell. Indexing the looked-up row
        # with [0] relies on positional access to a label-indexed Series,
        # which pandas has deprecated.
        return table.loc[(data['Pclass'], data['Sex']), 'Age']
    # Age is present, so use it as is.
    return data['Age']

# aplly는 한 행 (axis =1)씩 데이터를 가져와사 fill_age함수에 집어넣음
# fill_age () => 함수를 사용하겠다
# fill_age = > 함수를  가져오겠다
train['Age'] = train.apply(fill_age, axis =1)
train['Age']

train.info()

test['Age'] = test.apply(fill_age, axis = 1)
test['Age']

test.info()

4. 탐색적 데이터 분석

데이터를 더 자세하게 살펴보자
통계치, 그래프
정답(생존 유무)과 얼마나 연관이 있는가

# 그래프 그리는 library
# matplotlib : 사용하기 어려운편, 자세하게 사용할 수 있음
# seaborn: 사용하기 쉬운편, 자세하게 사용하기는 힘듬
import seaborn as sns

**Cabin 시각화**

#data = 사용할 데이터
#x:x축에 사용할 데이터
# hue = 데이터 분리 기준
# 결측치로 채운 n데이터가 생존과 사만이 분리가 되어있어서 데이터로 사용하자

sns.countplot(data = train, x = 'Cabin', hue = 'Survived')

**Pclass시각화**

# 1등급에 탑승한 사람은 많이 생존
# 3 등급에 탑승한 사람은 많이 사망
sns.countplot(data = train, x = 'Pclass', hue = 'Survived')

**Cabin 과 pclass시각화**

# n구역은 사망자의 비율이 높고, 3등급 객실 사람이 많다
# 2, a, b, c, r구역은 1등급만 존재(좋고 비싼 구역이지 않을까)
sns.countplot(data= train, x = 'Cabin', hue = 'Pclass')

**Gender시각화**

sns.countplot(data= train, x = 'Sex', hue = 'Survived')

**Embarked 살펴보기**

sns.countplot(data= train, x = 'Embarked', hue = 'Survived')

sns.countplot(data= train, x = 'Embarked', hue = 'Pclass')

Sex, Age, Survived 시각화

데이터의 분포를 확인

sns.violinplot(data = train, x = 'Sex', y='Age', hue = 'Survived', split = True)
#20대가 많이 탑승했다
# 남아는 많이 살았고 여아는 많이 죽었다>남아선호사상

sns.violinplot(data = train, x = 'Sex', y='Fare', hue = 'Survived', split = True)
# 요금이 저럼할수록 죽은 사람이 더 많다

Family_size 만들기

SibSp + Parch + 1 (나 자신)
특성공학 : 컬럼에 연산을 통해서 의미있는 새로운 정보를 추출하는 행위

# Family size = SibSp + Parch + 1 (the passenger themself).
# Each frame is computed from its own columns: deriving test's value from
# train's columns would align on the row index and give test passengers the
# family size of unrelated train passengers.
train['Family_size'] = train['SibSp'] + train['Parch'] + 1
test['Family_size'] = test['SibSp'] + test['Parch'] + 1

sns.countplot(data=train, x='Family_size', hue = 'Survived')

# Observations from the Family_size count plot:
#   size 1      -> higher share of deaths    -> 'Alone'
#   size 2 to 4 -> higher share of survivors -> 'Small'
#   size 5+     -> higher share of deaths    -> 'Large'
# Binning: turn the numeric value into a categorical one.
# pd.cut maps each value to the label of the interval it falls in.
# bins: interval edges, i.e. (0, 1], (1, 4], (4, 20] with pd.cut's defaults
bins = [0,1,4,20]
# labels: one name per interval
labels = ['Alone','Small','Large']

train_cut = pd.cut(train['Family_size'], bins = bins, labels=labels)
test_cut = pd.cut(test['Family_size'], bins = bins, labels=labels)

# Store the categorical group next to the numeric Family_size column.
train['Family_Group']=train_cut
test['Family_Group']=test_cut

Name column 살펴보기

이름은 자체 특성상 거의 모든 값이 다름
영어이름은 중간에 호칭이 존재
특성공학을 통해 반정형 데이터로 변경

#split: 매개변수값을 기준으로 앞과 뒤로 나눠준다
#strip: 문자열 맨 앞과 뒤에 있는 공백 제거
train['Name'][0].split(',')[1].split('.')[0].strip()

# apply 함수를 사용하기위한 커스텀 함수 제작
# 이름을 하나씻 집어넣고 정해진 함수 사용
def split_title(name):
    """Extract the title (e.g. 'Mr') from a 'Surname, Title. Given names' string.

    Takes the segment after the first comma, keeps what precedes its first
    period, and strips the surrounding whitespace.
    """
    after_surname = name.split(',')[1]
    title, _, _ = after_surname.partition('.')
    return title.strip()

train['Title']=train['Name'].apply(split_title)
test['Title']=test['Name'].apply(split_title)

# 범주가 너무 많다
# 1 or 2개인 데이터들이 많다 > 과대적합 유발 > 처리를 해줘야함
# 1 or 2개인 데이터들이 많다 > other로 합치기
train['Title'].value_counts()

convert_name_dic = {
    'Mr' : 'Mr' , 
    'Mrs' : 'Mrs', 
    'Miss' : 'Miss', 
    'Master' : 'Master', 
    'Don' : 'Other', 
    'Rev' : 'Rev', 
    'Dr': 'Dr', 
    'Mme' : 'Other', 
    'Ms' : 'Other',
    'Major' : 'Other',
    'Lady' : 'Other',
    'Sir' : 'Other',
    'Mlle' : 'Other',
    'Col' : 'Other', 
    'Capt' : 'Other', 
    'the Countess' : 'Other',
    'Jonkheer' : 'Other',
    'Dona' : 'Other'
}

train['Title']=train['Title'].map(convert_name_dic)
train['Title'].value_counts()

test['Title']=test['Title'].map(convert_name_dic)
test['Title'].value_counts()

**Ticket column 확인해보기**

# 문자열 끝엔 최소 4자리 숫자가 있다 >한자리 숫자만 있는 경우도 존재

train['Ticket'].unique()

# 공통점 없고 다른 탑승객과 똑같은 티켓을 가진 사람이 약 200명 
# ticket데이터는 생존과 연관이 없어보임>삭제
train.drop('Ticket',axis =1, inplace = True)
test.drop('Ticket',axis =1, inplace = True)

쓸모없는 데이터 삭제

survived: 이미 y_train으로 빼돔
name: 중간 호칭을 title로 뺌

train.drop('Survived', axis =1, inplace = True)
train.drop('Name', axis =1, inplace = True)
test.drop('Name', axis =1, inplace = True)

글자데이터 > 숫자데이터로 변경

원핫인코딩을 사용

train, test를 합쳐서 인코딩한다 > 두 데이터의 컬럼 값이 서로 다를 수 있다

> 값이 달라지면 생성되는 컬럼이 달라진다
> 

```python
combined = pd.concat([train,test], ignore_index = True)
combined
```



```python
categorical_feature = ['Sex','Cabin','Embarked','Family_Group', 'Title']
```

```python
#원핫인코딩
one_hot = pd.get_dummies(combined[categorical_feature])
one_hot.shape
```

```python
total = pd.concat([combined, one_hot], axis = 1)
total.shape
```

```python
total.drop(categorical_feature,axis =1, inplace = True)
total.shape
```

****x_train, x_test 나누기****

```python
# Split the encoded frame back into its two parts. `combined` was built as
# train followed by test, so the first len(train) rows are the training rows
# and the remainder are the test rows. Using len(train) instead of a literal
# row count keeps the split correct if the training file changes size.
n_train = len(train)
x_train = total.iloc[:n_train]
x_test = total.iloc[n_train:]
```

# **5. 모델 선택 및 하이퍼 파라미터 튜닝**

- knn, decision tree 사용해보기

```python
knn = KNeighborsClassifier()
```

```python
rf = RandomForestClassifier(n_estimators = 50, max_features = 0.5)
```

```python
knn = KNeighborsClassifier(n_neighbors = 10)
```

```python
tree = DecisionTreeClassifier()
```

****6. 학습****

```python
rf.fit(x_train,y_train)
```

```python
knn.fit(x_train,y_train)
```

```python
tree.fit(x_train,y_train)
```

7. 평가 및 예측

```python
knn.score(x_train,y_train)
```

```python
rf.score(x_train,y_train)
```

```python
tree.score(x_train,y_train)
```

```python
pre_knn = knn.predict(x_test)
```

```python
pre_tree = tree.predict(x_test)
```

```python
pre_rf = rf.predict(x_test)
pre_rf
```

8. Export

```python
# 예측값을 gender_submission파일에 적고 업로드하기
sub = pd.read_csv('./data/Titanic/gender_submission.csv')
```

```python
sub['Survived'] = pre_knn 
sub.to_csv('knn_pre.csv', index = False)
```

```python
sub['Survived'] = pre_tree 
sub.to_csv('tree_pre.csv', index = False)
```

```python
sub['Survived'] = pre_rf 
sub.to_csv('rf_pre.csv', index = False)
```

# RANDOM FOREST - Bagging

```python
# 5~7단계 모음
# 5단계 모델 생성 및 하이퍼 파라미터 튜닝
rf = RandomForestClassifier(n_estimators = 90, max_features = 0.5, max_depth=5,
    min_samples_split=70,
    min_samples_leaf=20)
# 6단계 학습
rf.fit(x_train,y_train)
# 7단계 평가
cross_val_score(rf,x_train,y_train,cv=5).mean()
```

```python
pre_rf = rf.predict(x_test)
sub = pd.read_csv('./data/Titanic/gender_submission.csv')
sub['Survived'] = pre_rf 
sub.to_csv('rf_pre.csv', index = False)
```

```python
# randomForest 모델 중요도 결과
num_feature = len(x_train.columns)
name_feature = x_train.columns
feature_importance = rf.feature_importances_
plt.barh(range(num_feature),feature_importance)
plt.yticks(np.arange(num_feature),name_feature)
plt.xlabel('importance')
plt.ylabel('attr')
plt.show()
```



# GRID SEARCH CV - 하이퍼 파라미터 튜닝

```python
from sklearn.model_selection import GridSearchCV
# Candidate values per random-forest hyperparameter; GridSearchCV evaluates
# every combination with 5-fold cross-validation.
param = {'n_estimators': [30, 40, 50, 60],
         # Fractions of the features considered at each split. `0,3` was a
         # typo for 0.3 that put the invalid value 0 (and a stray 3) in the grid.
         'max_features': [0.3, 0.4, 0.5, 0.6],
         'max_depth': [3, 4, 5, 6],
         'min_samples_split': [10, 20, 25, 30],
         'min_samples_leaf': [10, 20, 25, 30]}
grid_search = GridSearchCV(RandomForestClassifier(), param, cv=5)
grid_search.fit(x_train, y_train)
```

```python
grid_search.best_score_
```

```python
# 가장 좋았을때의 hyper parameter
grid_search.best_params_
```

```python
# 예측
prd_rf_grid = grid_search.predict(x_test)
sub = pd.read_csv('./data/Titanic/gender_submission.csv')
sub['Survived'] = prd_rf_grid 
sub.to_csv('rf_grid_pre.csv', index = False)
```

```python
# Narrower grid for a second search pass.
# NOTE(review): 68 breaks the 36/../44 progression and looks like a typo for
# 38 -- left unchanged because it is still a valid value; confirm.
param = {'n_estimators': [36, 68, 40, 42, 44],
         # Fractions of the features considered at each split. `0,5` was a
         # typo for 0.5 that put the invalid value 0 (and a stray 5) in the grid.
         'max_features': [0.5, 0.55, 0.6, 0.65],
         'max_depth': [3, 4, 5, 6, 7],
         'min_samples_split': [16, 18, 20, 22, 24],
         'min_samples_leaf': [16, 18, 20, 22, 24]}
grid_search = GridSearchCV(RandomForestClassifier(), param, cv=5)
grid_search.fit(x_train, y_train)
```

# ADA BOOSTing

Gradient Boosting and XGBoost use a similar approach to build the sequence of models, where each subsequent model tries to minimize the error of the previous model. In contrast, AdaBoost uses a different approach, where each subsequent model tries to focus on the samples that were misclassified by the previous model.

AdaBoost tiến hành train các mô hình mới dựa trên việc đánh lại trọng số cho các điểm dữ liệu hiện tại, nhằm giúp các mô hình mới có thể tập trung hơn vào các mẫu dữ liệu đang bị học sai, từ đó làm giảm giá trị loss của mô hình.

```python
from sklearn.ensemble import AdaBoostClassifier
# Build the AdaBoost model with explicit values for the two main knobs.
# A bare `*` (the keyword-only marker from the class signature) is a syntax
# error inside a call, and `base_estimator` is deprecated, so neither is
# passed; estimator, algorithm and random_state stay at the library defaults.
abc = AdaBoostClassifier(n_estimators=50, learning_rate=1.0)
```

```python
abc.fit(x_train,y_train)
cross_val_score(abc,x_train,y_train,cv=5).mean()
```

```python
pre_abc = abc.predict(x_test)
sub = pd.read_csv('./data/Titanic/gender_submission.csv')
sub['Survived'] = pre_abc
sub.to_csv('abc_pre.csv', index = False)
```

```python
# Decision TRee 기반 모델들은
abc.feature_importances_
import matplotlib.pyplot as plt
# 특성의 객수
num_feature = len(x_train.columns)
# 특성의 이름
name_feature = x_train.columns
# 특성 중요도
feature_importance = abc.feature_importances_
#AdaBoost 모델 중요도 결과
import numpy as np
plt.barh(range(num_feature),feature_importance)

plt.yticks(np.arange(num_feature),name_feature)
plt.xlabel('importance')
plt.ylabel('attr')
plt.show()
```


# Gradient Boosting
```python

from sklearn.ensemble import GradientBoostingClassifier
#GradientBoostingClassifier학습
gbc = GradientBoostingClassifier(n_estimators = 50, learning_rate = 0.15)


```

```python
gbc.fit(x_train,y_train)
cross_val_score(gbc,x_train,y_train,cv=5).mean()

#GradientBoostingClassifier 평가 > kaggle업로드
pre_gbc = gbc.predict(x_test)
sub = pd.read_csv('./data/Titanic/gender_submission.csv')
sub['Survived'] = pre_gbc
sub.to_csv('gbc_pre.csv', index = False)

#GradientBoostingClassifier 주요도 그래프 그리기
num_feature = len(x_train.columns)
name_feature = x_train.columns
feature_importance = gbc.feature_importances_
plt.barh(range(num_feature),feature_importance)
plt.yticks(np.arange(num_feature),name_feature)
plt.xlabel('importance')
plt.ylabel('attr')
plt.show()
```