🛳️ 타이타닉 생존자 예측 🛳️

parkeu·2022년 9월 21일
0

ABC부트캠프

목록 보기
25/55

🐼 μ€€λΉ„

# ν•œκΈ€ 깨짐 방지
import matplotlib as mpl
import matplotlib.pyplot as plt

%config InlineBackend.figure_format = 'retina'

!apt -qq -y install fonts-nanum

import matplotlib.font_manager as fm
fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
font = fm.FontProperties(fname=fontpath, size=9)
plt.rc('font', family='NanumBarunGothic') 
mpl.font_manager._rebuild()

🚣‍♀️ KNeighborsClassifier 타이타닉 생존자 예측

데이터 준비

import pandas as pd
import seaborn as sns

# Load the 'titanic' example dataset through seaborn into a DataFrame.
df = sns.load_dataset(name='titanic')
# Bare expression: the notebook renders the frame as this cell's output.
df

데이터 탐색


πŸ‘€ NaN 값이 λ§Žμ€ Column이 λ§ŽμŒμ„ μ•Œμˆ˜ μžˆλ‹€.

NaN 값이 λ§Žμ€ deck(객싀 데크 μœ„μΉ˜), embark_town(μŠΉμ„  λ„μ‹œ) μ—΄(컬럼) μ‚­μ œ

# Drop the 'deck' and 'embark_town' columns (the notebook discards them as
# NaN-heavy), then print the column summary of the reduced frame.
rdf = df.drop(columns=['deck', 'embark_town'])
rdf.info()

age 컬럼이 NaN인 모든 행(rows) 삭제 891->714(177건 삭제)

# 2) Discard every row whose 'age' is NaN; per the notebook this takes the
#    frame from 891 to 714 rows (177 removed).
rdf = rdf.dropna(subset=['age'])
rdf.info()

👀 모든 데이터가 714로 맞춰짐


학습에 필요한 컬럼(feature) 추출

생존여부(survived), 객실등급(pclass), 성별(sex), 나이(age), 같이 탑승한 형제/자매수(sibsp), 부모 자녀수(parch)

# 생쑴여뢀(survived), 객싀등급(pclass), 성별(sex), λ‚˜μ΄(age), 같이 νƒ‘μŠΉν•œ ν˜•μ œ/자맀수(sibsp), λΆ€λͺ¨ μžλ…€μˆ˜(parch)
rdf = rdf[['survived','pclass','sex','age','sibsp','parch']]
rdf.head()

성별(sex)에 원핫인코딩

# 성별 컬럼의 κ°’(λ²”μ£Όν˜•)을 λͺ¨λΈμ΄ 인식할 수 μžˆλ„λ‘ μˆ«μžν˜•μœΌλ‘œ λ³€κ²½
# 남성 -> 0, μ—¬μ„± -> 1 (1에 더 κ°€μ€‘μΉ˜κ°€ 뢀여될 수 μžˆμœΌλ―€λ‘œ λ‹€λ₯Έ ν˜•νƒœ ν•„μš”!)
# 원핫인코딩 -> male [1, 0], female [0, 1] 
onehot_sex = pd.get_dummies(rdf['sex']) # 컬럼 ν˜•νƒœλ‘œ 제곡
onehot_sex

👀 원핫인코딩 결과를 원래의 데이터셋에 합치기 !

# Append the one-hot indicator columns to the right of the feature frame.
ndf = pd.concat([rdf, onehot_sex], axis='columns')
ndf.head()

👀 이제 필요 없는 sex 컬럼 삭제

# The original 'sex' column is now redundant; remove it from ndf in place.
ndf.drop(columns=['sex'], inplace=True)
ndf.head()


데이터셋 λΆ„λ¦¬ν•˜κΈ°

from sklearn.model_selection import train_test_split

# 1) Split ndf into the answer sheet y (target / label / dependent variable)
#    and the question sheet X (data / features / independent variables):
#    'survived' becomes y, the remaining six columns become X.
y = ndf['survived']
X = ndf.loc[:, ['pclass', 'age', 'sibsp', 'parch', 'female', 'male']]
# X.shape -> (714, 6)
# y.shape -> (714,)

# 2) Hold out 30% of the rows as the test set (70:30 split) with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=7,
)

이웃의 수(결정경계)에 따른 성능평가

from sklearn.neighbors import KNeighborsClassifier

# Accuracy recorded for each candidate neighbour count, in the same order as
# n_neighbors_settings.
train_scores = []
test_scores = []

n_neighbors_settings = range(1, 11)
# Fit one classifier per n_neighbors value from 1 to 10 and store accuracies.
for n_neighbor in n_neighbors_settings:
    # Build and train the model.
    clf = KNeighborsClassifier(n_neighbors=n_neighbor)
    clf.fit(X_train, y_train)
    # Accuracy on the training set.
    train_scores.append(clf.score(X_train, y_train))
    # Accuracy on the test set.
    test_scores.append(clf.score(X_test, y_test))

# Plot training vs. test accuracy against the number of neighbours.
# The Korean labels below were stored as mis-decoded UTF-8 (mojibake) and are
# restored here: 'training accuracy', 'test accuracy', 'accuracy',
# 'number of neighbours'.
plt.figure(dpi=100)
plt.plot(n_neighbors_settings, train_scores, label='훈련정확도')
plt.plot(n_neighbors_settings, test_scores, label='테스트정확도')
plt.ylabel('정확도')
plt.xlabel('이웃의 수')
plt.legend()
plt.show()


👀 스윗스팟이 7임을 알 수 있다.


모델 성능 평가

# Configure and train the final model.
clf = KNeighborsClassifier(n_neighbors=7)  # sweet spot read off the accuracy plot
clf.fit(X_train, y_train)
# Predict labels for the held-out test set.
y_pred = clf.predict(X_test)

# Compute the evaluation metrics.
from sklearn import metrics

# accuracy, precision, recall, f1
# The header text was stored as mis-decoded UTF-8 (mojibake); it is restored
# to the intended Korean ('test performance evaluation'). The neighbour count
# is read from the fitted estimator so it cannot drift from the model above.
print(f'테스트 성능 평가 n_neighbors={clf.n_neighbors}')
print('accuracy : ', metrics.accuracy_score(y_test, y_pred))
print('precision : ', metrics.precision_score(y_test, y_pred))
print('recall : ', metrics.recall_score(y_test, y_pred))
print('f1 : ', metrics.f1_score(y_test, y_pred))

profile
배고파용.

0개의 댓글