Kaggle - Prediction of H1N1 vaccination

seongyong·2021년 4월 12일
2

Project

목록 보기
3/6

h1n1 캐글 대회

작성코드

라이브러리 import

#프로파일링 설치

# !pip install -U pandas-profiling

# graphviz 설치
# !pip install graphviz

import numpy as np
import pandas as pd

from pandas_profiling import ProfileReport
import pandas_profiling

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV

import seaborn as sns
import matplotlib.pyplot as plt
import graphviz

EDA

# Load the data.

# Label column name, kept as a one-element list so the same object can be
# used for DataFrame column selection and for drop().
target = ['vacc_h1n1_f']

# Attach the label column to the training features with an index-to-index
# merge. Assumes train.csv and train_labels.csv list their rows in the same
# order -- TODO confirm.
train_val = pd.merge(
    pd.read_csv('train.csv'),
    pd.read_csv('train_labels.csv')[target[0]],
    left_index=True,
    right_index=True,
)
test = pd.read_csv('test.csv')

# Show a profiling report (minimal mode) inline in the notebook.
pandas_profiling.ProfileReport(train_val, minimal=True).to_notebook_iframe()


# Accuracy of the baseline model.

# Class proportions of the label; normalize=True reports fractions that sum
# to 1 instead of raw counts.
print('class 비율 : \n', train_val[target].value_counts(normalize=True))

# The baseline predicts class 0 for every row (presumably the majority
# class -- check against the proportions printed above).
y_pred = [0] * len(train_val)

acc = accuracy_score(y_true=train_val[target], y_pred=y_pred)

print('\n기준모델의 accuracy : ', round(acc, 2))


전처리 및 Feature engineering

# Preprocessing and feature engineering.

# Three clean-up steps, applied in this order:
#   1. remove duplicated rows,
#   2. remove the high-cardinality features,
#   3. remove every row that still contains a missing value.
# An alternative to step 3 would be to fill each column with its mode
# (train_val[col].mode()[0]); a SimpleImputer gives the same effect.
train_val = (
    train_val
    .drop_duplicates()
    .drop(columns=['state', 'employment_occupation', 'employment_industry'])
    .dropna(how='any')
)

# Turn categorical values into ordinal integer codes.
# The one encoder is refit on every column (the label column included), so
# after the loop `le` only remembers the mapping of the last column.
le = LabelEncoder()

cols = train_val.columns.tolist()

for name in cols:
    train_val[name] = le.fit(train_val[name]).transform(train_val[name])

# Split into features / label, then into train and validation sets.

X_train_val = train_val.drop(columns=target)
y_train_val = train_val[target]

# 80 % train / 20 % validation, with a fixed seed for a repeatable split.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.2,
    random_state=1,
)

모델링

# Search for the best hyper-parameters.

sample_thresholds = [50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]

param_grid = {
    'min_samples_split': sample_thresholds,
    'min_samples_leaf': sample_thresholds,
    'max_depth': [10, 30, 50],
}

# random_state=1 matches the final tree and makes repeated searches return
# the same result. n_jobs=-1 spreads the candidate fits over all CPU cores
# without changing the outcome.
grid = GridSearchCV(
    DecisionTreeClassifier(random_state=1),
    param_grid,
    cv=5,
    n_jobs=-1,
)

# Search on the training split only: X_val / y_val are used later as the
# hold-out set, so they must not take part in choosing the hyper-parameters.
grid.fit(X_train, y_train)

print('best parameters : \n', grid.best_params_)


# Decision-tree pipeline with fixed hyper-parameters.
# An imputing step, SimpleImputer(missing_values=np.nan,
# strategy='most_frequent'), could be placed in front of the tree if rows
# with missing values were kept instead of dropped.
pipe = make_pipeline(
    DecisionTreeClassifier(
        min_samples_split=50,   # a node needs at least this many samples to be split
        min_samples_leaf=50,    # every leaf must keep at least this many samples
        max_depth=30,           # maximum depth of the tree
        random_state=1,
        criterion='entropy',
    )
)

# Fit on the training split only. Fitting on X_train_val would put the
# validation rows into the training data and turn the "validation" scores
# below into training scores.
pipe.fit(X_train, y_train)

print('training accuracy : ', pipe.score(X_train, y_train))
print('validation accuracy : ', pipe.score(X_val, y_val))


# F1 score on the hold-out validation set.

y_pred = pipe.predict(X_val)

print('validation f1 score : ', f1_score(y_val, y_pred))

# The metrics are recorded, so refit the same pipeline on every labelled
# row. The classifier inside `pipe` then is the full-data model for anything
# that reads it afterwards.
pipe.fit(X_train_val, y_train_val)


# Visualize the tree.

# Pull the fitted classifier out of the pipeline; make_pipeline names each
# step after its lowercased class name.
model_dt = pipe.named_steps['decisiontreeclassifier']

# out_file=None makes export_graphviz return the DOT source as a string.
# Only the top 3 levels are drawn. Class labels 'no' / 'yes' are assumed to
# line up with label values 0 / 1 -- TODO confirm.
dot_data = export_graphviz(
    model_dt,
    out_file=None,
    max_depth=3,
    feature_names=X_train.columns,
    class_names=['no', 'yes'],
    filled=True,
    proportion=True,
)

graphviz.Source(dot_data)


# Visualize feature importances.

# Importance of each feature, labelled with its column name and sorted
# ascending.
importances = pd.Series(data=model_dt.feature_importances_, index=X_train.columns)
importances = importances.sort_values()

plt.figure(figsize=(10, 8))
importances.plot(kind='barh');

예측 데이터 캐글 제출 준비

# Prepare the test data.

imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

cols = test.columns.tolist()

# Replace missing values with each column's most frequent value. The imputer
# returns a plain array, so the DataFrame is rebuilt with the original
# column names.
test = pd.DataFrame(data=imp.fit_transform(test), columns=cols)

# Label encoding: `le` is refit on each test column in turn.
# NOTE(review): the integer codes here are derived from the test values, not
# from the values seen during training -- verify that both sides produce the
# same mapping for every column.
for name in cols:
    test[name] = le.fit(test[name]).transform(test[name])

# Drop the unneeded (high-cardinality) columns.
test = test.drop(columns=['state', 'employment_occupation', 'employment_industry'])

# Predict the class of every test row.
y_pred = model_dt.predict(test)

# Submission table: the row position becomes an 'id' index next to the
# predicted label column.
result = (
    pd.DataFrame(y_pred, columns=['vacc_h1n1_f'])
    .reset_index(drop=False)
    .rename(columns={'index': 'id'})
    .set_index('id')
)

# Write the submission file.
result.to_csv('submission3_0412.csv')

추가 학습내용

# Select only the columns of specific dtypes.
# NOTE(review): `df` is not defined in the visible code; presumably a
# placeholder for any DataFrame.
selected_cols = df.select_dtypes(include = ['float', 'object'])

# Number of distinct values (cardinality) of each selected column.
selected_cols.nunique()

# Use every CPU core.
# NOTE(review): LogisticRegression is not imported in the visible code;
# presumably sklearn.linear_model.LogisticRegression -- confirm.
LogisticRegression(n_jobs=-1)

0개의 댓글