y-hat : the predicted value
y : the ground-truth label
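For context (a standard definition the notes assume but do not spell out), both linear classifiers below predict by thresholding a weighted sum of the features:

\hat{y} =
\begin{cases}
1 & \text{if } w_1 x_1 + w_2 x_2 + \dots + w_p x_p + b > 0 \\
0 & \text{otherwise}
\end{cases}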
import mglearn
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import warnings
warnings.filterwarnings('ignore')
plt.rc('font', family ='NanumBarunGothic')
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['figure.dpi'] = 100
X, y = mglearn.datasets.make_forge()
fig, axes = plt.subplots(1, 2, figsize=(10, 3))
for model, ax in zip([LinearSVC(max_iter=5000), LogisticRegression()], axes):
    clf = model.fit(X, y)
    mglearn.plots.plot_2d_separator(clf, X, fill=False, eps=0.5,
                                    ax=ax, alpha=.7)
    mglearn.discrete_scatter(X[:, 0], X[:, 1], y, ax=ax)
    ax.set_title(clf.__class__.__name__)
    ax.set_xlabel("Feature 0")
    ax.set_ylabel("Feature 1")
axes[0].legend()
plt.show()
C = 1 is the regularization setting (constrain how hard the model fits the training data -> avoid overfitting); it is the default value.
Lowering C, e.g. 0.01 or 0.001 -> stronger regularization -> more generalization -> risk of underfitting
Raising C, e.g. 10, 100, or 1000 -> weaker regularization -> risk of overfitting
mglearn.plots.plot_linear_svc_regularization()
- Performance comparison across settings of C, which controls the regularization strength
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
# Load the data
cancer = load_breast_cancer()
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, stratify=cancer.target, random_state=7)
📌 stratify=cancer.target : preserves the class ratio in both splits
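A quick way to see what stratify buys us: the class proportions stay (almost) identical across the full set and both splits. A minimal sketch (the np.bincount check below is my addition, not part of the original notes):

import numpy as np

# Class proportions should match across the full set, train split, and test split
for name, labels in [('full', cancer.target), ('train', y_train), ('test', y_test)]:
    counts = np.bincount(labels)
    print(name, counts / counts.sum())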
from sklearn.linear_model import LogisticRegression
# C = 1
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
print('---------- default (C=1) ----------')
print('logreg training set score : {:.8f}'.format(logreg.score(X_train, y_train)))
print('logreg test set score : {:.8f}'.format(logreg.score(X_test, y_test)))
print('-------- stronger regularization --------')
logreg001 = LogisticRegression(C=0.01).fit(X_train, y_train)
print('logreg001 training set score : {:.8f}'.format(logreg001.score(X_train, y_train)))
print('logreg001 test set score : {:.8f}'.format(logreg001.score(X_test, y_test)))
print('-------- weaker regularization --------')
logreg100 = LogisticRegression(C=100).fit(X_train, y_train)
print('logreg100 training set score : {:.8f}'.format(logreg100.score(X_train, y_train)))
print('logreg100 test set score : {:.8f}'.format(logreg100.score(X_test, y_test)))
📌 Judging by these results, picking the less regularized model (higher C) is the better choice here!
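Rather than eyeballing three values, C can also be chosen by cross-validation. A minimal sketch (GridSearchCV and the grid below are my additions, not part of the original notes; max_iter is raised as an assumption so the solver converges):

from sklearn.model_selection import GridSearchCV

# Search a small logarithmic grid of C values with 5-fold cross-validation
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
grid = GridSearchCV(LogisticRegression(max_iter=5000), param_grid, cv=5)
grid.fit(X_train, y_train)
print('best C :', grid.best_params_)
print('best CV score : {:.8f}'.format(grid.best_score_))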
# Inspect the feature weights under L2 regularization (the default penalty)
plt.plot(logreg100.coef_.T, '^', label="C=100")
plt.plot(logreg.coef_.T, 'o', label="C=1")
plt.plot(logreg001.coef_.T, 'v', label="C=0.01")
plt.xticks(range(cancer.data.shape[1]), cancer.feature_names, rotation=90)
xlims = plt.xlim()
plt.hlines(0, xlims[0], xlims[1])
plt.xlim(xlims)
plt.ylim(-5, 5)
plt.xlabel("Feature")
plt.ylabel("Coefficient magnitude")
plt.legend()
plt.show()
print('---------- default (C=1) ----------')
logreg_l1 = LogisticRegression(penalty='l1', solver='liblinear').fit(X_train, y_train)
print('logreg_l1 training set score : {:.8f}'.format(logreg_l1.score(X_train, y_train)))
print('logreg_l1 test set score : {:.8f}'.format(logreg_l1.score(X_test, y_test)))
print('-------- stronger regularization --------')
logreg001 = LogisticRegression(C=0.01, penalty='l1', solver='liblinear').fit(X_train, y_train)
print('logreg001 training set score : {:.8f}'.format(logreg001.score(X_train, y_train)))
print('logreg001 test set score : {:.8f}'.format(logreg001.score(X_test, y_test)))
print('-------- weaker regularization --------')
logreg100 = LogisticRegression(C=100, penalty='l1', solver='liblinear').fit(X_train, y_train)
print('logreg100 training set score : {:.8f}'.format(logreg100.score(X_train, y_train)))
print('logreg100 test set score : {:.8f}'.format(logreg100.score(X_test, y_test)))
# Inspect the feature weights under L1 regularization
plt.plot(logreg100.coef_.T, '^', label="C=100")
plt.plot(logreg_l1.coef_.T, 'o', label="C=1")
plt.plot(logreg001.coef_.T, 'v', label="C=0.01")
plt.xticks(range(cancer.data.shape[1]), cancer.feature_names, rotation=90)
xlims = plt.xlim()
plt.hlines(0, xlims[0], xlims[1])
plt.xlim(xlims)
plt.ylim(-5, 5)
plt.xlabel("Feature")
plt.ylabel("Coefficient magnitude")
plt.legend()
plt.show()
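One useful property of L1 visible in this plot: many coefficients are driven exactly to zero, so each model effectively selects a subset of features. A quick check (a minimal sketch, assuming the three L1 models above are still in scope):

import numpy as np

# Count the features each L1-regularized model actually uses (nonzero coefficients)
for name, model in [('C=100', logreg100), ('C=1', logreg_l1), ('C=0.01', logreg001)]:
    print(name, 'uses', np.sum(model.coef_ != 0), 'of', cancer.data.shape[1], 'features')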
- Each column of the DataFrame, representing one attribute of the dataset, is stored as a column vector
💼 Preparation
Problem definition: use an SVM to build a binary classification model that predicts Titanic survivors (1) and deaths (0)
Import libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
Prevent broken Korean characters in plots
import matplotlib as mpl
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = 'retina'
!apt -qq -y install fonts-nanum
import matplotlib.font_manager as fm
fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
font = fm.FontProperties(fname=fontpath, size=9)
plt.rc('font', family='NanumBarunGothic')
mpl.font_manager._rebuild()  # note: _rebuild() is a private API, removed in newer matplotlib releases
# Prepare the data
df = sns.load_dataset("titanic")
# Inspect the data
df.head()
attrs = df.columns
plt.figure(figsize=(20,20), dpi=200)
for i, feature in enumerate(attrs):
    plt.subplot(5, 5, i+1)
    sns.countplot(data=df, x=feature, hue='survived')
    sns.despine()
📊 Look at the plots and decide which features to keep and which to drop
df.isna().sum()
# 1) Drop columns with many NaNs and duplicated columns -> deck (many NaNs), embark_town (duplicates embarked)
rdf = df.drop(['deck','embark_town'], axis=1)
rdf.info()
📌 The columns are gone
# 2) Drop rows missing data in the age column -> only the 177 rows where age is NaN are removed
rdf = rdf.dropna(subset=['age'], how='any', axis=0)
rdf.info()
📌 The row count shrank after the deletion
# How should the 2 NaNs in embarked (port of embarkation) be filled? -> use the most frequent port
most_freq = rdf['embarked'].value_counts(dropna=True).idxmax()
most_freq # -> 'S'
# The answer is 'S', so fill the NaN values in the embarked column with the most frequent port
rdf['embarked'].fillna(most_freq, inplace=True)
rdf.isna().sum()
📌 We can see there are no NaN values left!
# 4) Select the columns needed for training
# survival, cabin class, sex, age, siblings/spouses aboard, parents/children aboard, port of embarkation
ndf = rdf[['survived','pclass','sex','age','sibsp','parch','embarked']]
ndf.head()
# 5) Convert string-valued columns -> encoding -> one-hot encoding (turns categorical data into numbers a machine learning model can consume)
# e.g. male [1,0], female [0,1]
# e.g. S [1,0,0], C [0,1,0], Q [0,0,1]
# 5-1) one-hot encoding
onehot_sex = pd.get_dummies(ndf['sex'])
onehot_embarked = pd.get_dummies(ndf['embarked'])
# 5-2) Concatenate onto the ndf DataFrame
ndf = pd.concat([ndf, onehot_sex],axis=1)
ndf = pd.concat([ndf, onehot_embarked],axis=1)
# 6) Drop the original columns
ndf.drop(['sex','embarked'], axis=1, inplace=True)
ndf
X = ndf[['pclass', 'age', 'sibsp', 'parch', 'female', 'male', 'C', 'Q', 'S']]
y = ndf['survived']
# Scale X (the features / independent variables) -> StandardScaler standardizes each feature to mean 0 and standard deviation 1
# (note: squeezing values into the 0~1 range would be MinMaxScaler, not StandardScaler)
from sklearn import preprocessing
X = preprocessing.StandardScaler().fit(X).transform(X)
X
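A quick sanity check (a minimal sketch, not in the original notes) that standardization worked: after StandardScaler, every column should have mean ~0 and standard deviation ~1.

import numpy as np

# Each feature should now be centered at 0 with unit spread
print('means :', np.round(X.mean(axis=0), 6))
print('stds  :', np.round(X.std(axis=0), 6))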
# Split into train and test sets (7:3)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y , test_size=0.3, random_state=7)
print('train shape : ', X_train.shape)
print('test shape : ', X_test.shape)
from sklearn import svm
# Create the model object with kernel='rbf'
# The kernel maps samples into a vector space -> linear, polynomial (poly), Gaussian RBF (rbf), sigmoid
svm_model = svm.SVC(kernel = 'rbf')
# Train the model
svm_model.fit(X_train, y_train)
print('training set score : {:.8f}'.format(svm_model.score(X_train, y_train)))
print('test set score : {:.8f}'.format(svm_model.score(X_test, y_test)))
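The kernel choice above can be sanity-checked by simply trying each option listed in the comment. A minimal sketch (my addition; the original notes only fit 'rbf'):

# Compare test accuracy across the four kernels
for kernel in ['linear', 'poly', 'rbf', 'sigmoid']:
    model = svm.SVC(kernel=kernel).fit(X_train, y_train)
    print('{:8s} test set score : {:.8f}'.format(kernel, model.score(X_test, y_test)))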
# Evaluate the trained model
from sklearn import metrics
y_pred = svm_model.predict(X_test)  # predict on the held-out test set
print('accuracy : ', metrics.accuracy_score(y_test, y_pred))
print('precision : ', metrics.precision_score(y_test, y_pred))
print('recall : ', metrics.recall_score(y_test, y_pred))
print('f1s : ', metrics.f1_score(y_test, y_pred))
accuracy : 0.8046511627906977
precision : 0.873015873015873
recall : 0.6179775280898876
f1s : 0.7236842105263157
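Precision being much higher than recall here means the model rarely mislabels a death as a survivor, but misses a fair share of actual survivors. The confusion matrix makes this concrete (a minimal sketch, not in the original notes):

from sklearn.metrics import confusion_matrix

# Rows: actual class (0=died, 1=survived); columns: predicted class
print(confusion_matrix(y_test, y_pred))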