📌 Preparation
# Prevent Korean text in plots from rendering as broken glyphs
import matplotlib as mpl
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = 'retina'

!apt -qq -y install fonts-nanum

import matplotlib.font_manager as fm
fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
font = fm.FontProperties(fname=fontpath, size=9)
plt.rc('font', family='NanumBarunGothic')
mpl.font_manager._rebuild()
import pandas as pd
import seaborn as sns
df = sns.load_dataset('titanic')
df
📌 We can see that several columns contain a lot of NaN values.
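📌 A quick check of the per-column NaN counts (a minimal pandas sketch; nothing beyond the loaded df is assumed):
df.isnull().sum()  # number of missing values in each column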
# 1) Drop unneeded columns: deck (mostly NaN) and embark_town (duplicates embarked)
rdf = df.drop(['deck', 'embark_town'], axis=1)
rdf.info()
# 2) Drop every row where age is NaN: 891 -> 714 (177 rows removed)
rdf = rdf.dropna(subset=['age'], how='any', axis=0)
rdf.info()
📌 Every column now has 714 non-null entries.
# Keep survival (survived), cabin class (pclass), sex, age, number of siblings/spouses aboard (sibsp), and number of parents/children aboard (parch)
rdf = rdf[['survived','pclass','sex','age','sibsp','parch']]
rdf.head()
# Convert the categorical sex column to numbers the model can work with
# male -> 0, female -> 1 would implicitly give more weight to 1, so a different representation is needed!
# One-hot encoding -> male [1, 0], female [0, 1]
onehot_sex = pd.get_dummies(rdf['sex'])  # returns one column per category
onehot_sex
📌 Merge the one-hot encoding result back into the original dataset!
ndf = pd.concat([rdf,onehot_sex], axis=1)
ndf.head()
📌 Drop the sex column, which is no longer needed
ndf.drop(['sex'], axis=1, inplace=True)
ndf.head()
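📌 For reference, the same two columns could also be produced with scikit-learn's OneHotEncoder instead of pd.get_dummies (a sketch; the names enc, sex_encoded, and sex_df are illustrative):
from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder()                                      # illustrative alternative to pd.get_dummies
sex_encoded = enc.fit_transform(rdf[['sex']]).toarray()    # dense (714, 2) array of 0/1 values
sex_df = pd.DataFrame(sex_encoded, columns=enc.categories_[0], index=rdf.index)
sex_df.head()                                              # female / male columns, matching onehot_sex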
# 1) Split ndf into the answers (target, label, dependent variable) y and the questions (data, features, independent variables) X
# survived -> y, the remaining 6 feature columns -> X
X = ndf[['pclass','age','sibsp','parch','female','male']]
y = ndf['survived']
# X.shape -> (714,6)
# y.shape -> (714, )
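📌 The shapes noted in the comments can be verified directly (a trivial sketch):
print(X.shape, y.shape)  # expected: (714, 6) (714,)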
# 2) Split into train and test sets (70:30)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=7)
from sklearn.neighbors import KNeighborsClassifier
# Lists to store accuracy for each number of neighbors
train_scores = []
test_scores = []
n_neighbors_settings = range(1,11)
# Increase n_neighbors from 1 to 10, train, and record the accuracy each time
for n_neighbor in n_neighbors_settings:
    # Build and train the model
    clf = KNeighborsClassifier(n_neighbors=n_neighbor)
    clf.fit(X_train, y_train)
    # Record training-set accuracy
    train_scores.append(clf.score(X_train, y_train))
    # Record test-set accuracy
    test_scores.append(clf.score(X_test, y_test))
# Plot the training vs. test accuracy comparison
plt.figure(dpi=100)
plt.plot(n_neighbors_settings, train_scores, label='train accuracy')
plt.plot(n_neighbors_settings, test_scores, label='test accuracy')
plt.ylabel('accuracy')
plt.xlabel('n_neighbors')
plt.legend()
plt.show()
📌 The sweet spot is at n_neighbors = 7.
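📌 The same sweet spot can also be read off programmatically (a sketch; best_idx and best_k are illustrative names, and the result depends on this particular split):
import numpy as np

best_idx = int(np.argmax(test_scores))      # index of the highest test accuracy
best_k = n_neighbors_settings[best_idx]     # corresponding n_neighbors value
print('best n_neighbors:', best_k, '/ test accuracy:', test_scores[best_idx])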
# Build and train the model
clf = KNeighborsClassifier(n_neighbors=7)  # the sweet spot in the chart above is 7
clf.fit(X_train, y_train)
# Make predictions
y_pred = clf.predict(X_test)
# Compute model performance metrics
from sklearn import metrics
# accuracy, precision, recall, f1
print('Test set performance evaluation, n_neighbors=7')
print('accuracy : ', metrics.accuracy_score(y_test, y_pred))
print('precision : ', metrics.precision_score(y_test, y_pred))
print('recall : ', metrics.recall_score(y_test, y_pred))
print('f1 : ', metrics.f1_score(y_test, y_pred))
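📌 For a fuller picture, the same sklearn.metrics module also provides a confusion matrix and a per-class report (a sketch based on the predictions above):
print(metrics.confusion_matrix(y_test, y_pred))        # rows: actual, columns: predicted
print(metrics.classification_report(y_test, y_pred))   # precision/recall/f1 for each class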