# --- Section 1: breast-cancer classification with a decision tree (UCI dataset) ---
from sklearn import preprocessing                     # scaling / preprocessing utilities
from sklearn.model_selection import train_test_split  # train/test split
from sklearn import tree                              # decision-tree model
from sklearn import metrics                           # evaluation metrics
import pandas as pd
import numpy as np

# Wisconsin breast-cancer dataset; the CSV has no header row.
uci_path = ('https://archive.ics.uci.edu/ml/machine-learning-databases/'
            'breast-cancer-wisconsin/breast-cancer-wisconsin.data')
df = pd.read_csv(uci_path, header=None)
df.head()
# Assign column names (the dataset ships without a header row).
df.columns = ['id', 'clump', 'cell_size', 'cell_shape', 'adhesion', 'epithlial',
              'bare_nuclei', 'chromatin', 'normal_nucleoli', 'mitoses', 'class']
df.head()

# df.info() shows 699 rows; every column is numeric except 'bare_nuclei',
# which is object dtype. ML models need numeric input, so it is cleaned below.

# 1) inspect the unique values of 'bare_nuclei'
df['bare_nuclei'].unique()
# 2) replace '?' with NaN and count the missing entries.
#    Plain assignment instead of inplace=True: in-place replace on a
#    column Series is deprecated in recent pandas; the result is identical.
df['bare_nuclei'] = df['bare_nuclei'].replace('?', np.nan)
df['bare_nuclei'].isna().sum()
# 3) drop the rows with NaN and re-check the row count
df.dropna(subset=['bare_nuclei'], axis=0, inplace=True)
df.info()
# 4) cast 'bare_nuclei' to an integer dtype
df['bare_nuclei'] = df['bare_nuclei'].astype('int64')
df.info()
# Feature matrix (nine cytology features) and target column 'class'.
X = df[['clump', 'cell_size', 'cell_shape', 'adhesion', 'epithlial',
        'bare_nuclei', 'chromatin', 'normal_nucleoli', 'mitoses']]
y = df['class']
X
# Standardize the features, then split 70/30 with a fixed seed.
X = preprocessing.StandardScaler().fit(X).transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=7)
print('train shape', X_train.shape)
print('test shape', X_test.shape)
# Entropy-based decision tree; depth capped at 5 to limit overfitting.
tree_model = tree.DecisionTreeClassifier(criterion='entropy', max_depth=5)
tree_model.fit(X_train, y_train)
y_pred = tree_model.predict(X_test)
print('훈련 세트 정확도: {:.2f}%'.format(tree_model.score(X_train, y_train)*100))
print('테스트 세트 정확도: {:.2f}%'.format(tree_model.score(X_test, y_test)*100))
tree_report = metrics.classification_report(y_test, y_pred)
print(tree_report)
from sklearn.tree import export_graphviz  # tree visualization

# Export the fitted tree to Graphviz DOT format.
# NOTE(review): export_graphviz expects class_names in ascending order of the
# class labels — confirm ['악성', '양성'] matches the sorted target values.
export_graphviz(tree_model, out_file='tree.dot',
                class_names=['악성', '양성'],
                feature_names=df.columns[1:10], impurity=False, filled=True)
import graphviz
# Read the DOT text back and render it inline (notebook `display`).
with open('tree.dot') as f:
    dot_graph = f.read()
display(graphviz.Source(dot_graph))
# plt is used in this cell but was only imported further down in the
# original file — import it here so the script runs top to bottom.
import matplotlib.pyplot as plt

train_scores = []
test_scores = []
max_depth = np.arange(1, 10, 1)  # candidate depths 1..9 (arange excludes 10)
for n in max_depth:
    # fit a fresh tree at each depth
    tree_model = tree.DecisionTreeClassifier(criterion='entropy', max_depth=n).fit(X_train, y_train)
    # record training-set accuracy
    train_scores.append(tree_model.score(X_train, y_train))
    # record test-set accuracy
    test_scores.append(tree_model.score(X_test, y_test))
# plot train vs. test accuracy across depths
plt.figure(dpi=150)
plt.plot(max_depth, train_scores, label='훈련 정확도')
plt.plot(max_depth, test_scores, label='테스트 정확도')
plt.ylabel('정확도')
plt.xlabel('가지개수')
plt.legend()
plt.show()
# --- Section 2: heart-attack classification, comparing four models ---
import numpy as np                     # data handling
import pandas as pd
import matplotlib.pyplot as plt        # visualization
import plotly.express as px
import seaborn as sns
from sklearn.model_selection import train_test_split     # dataset split
from sklearn.preprocessing import StandardScaler         # normalization
from sklearn.linear_model import LogisticRegression      # logistic-regression model
from sklearn.tree import DecisionTreeClassifier          # decision-tree classifier
from sklearn.ensemble import RandomForestClassifier      # random-forest classifier (ensemble)
from sklearn.ensemble import GradientBoostingClassifier  # gradient-boosting classifier (ensemble)
from sklearn import metrics                              # evaluation metrics

df = pd.read_csv('/content/heart.csv')
df.head()
# inspect the distinct levels of a few columns
df['sex'].unique()
df['cp'].unique()
df['fbs'].unique()
# columns treated as categorical features
categorical_var = ['sex', 'cp', 'fbs', 'restecg', 'exng', 'slp', 'caa', 'thall']
df[categorical_var] = df[categorical_var].astype('category')
# numeric feature names, excluding the target (last column)
# (name kept as-is — 'numberic_var' is referenced by later cells)
numberic_var = [i for i in df.columns if i not in categorical_var][:-1]
X = df.iloc[:, :-1]  # all rows, every column except the last (target)
y = df['output']
#๋ฒ์ฃผํ ๋ฐ์ดํฐ ์ ์ฒ๋ฆฌ
temp_X = pd.get_dummies(X[categorical_var])
X_modified = pd.concat([X, temp_X], axis =1)
X_modified.drop(categorical_var, axis=1, inplace =True)
X_modified
#์์นํ ๋ณ์ ์ ์ฒ๋ฆฌ
X_modified[numberic_var] = StandardScaler().fit(X_modified[numberic_var]).transform(X_modified[numberic_var])
X_train, X_test, y_train, y_test = train_test_split(X_modified, y, test_size=0.2, random_state = 7)
print('train shape', X_train.shape)
print('test shape', X_test.shape)
# 1. Logistic-regression baseline
logreg = LogisticRegression().fit(X_train, y_train)
print('훈련 세트 정확도: {:.5f}%'.format(logreg.score(X_train, y_train)*100))
print('테스트 세트 정확도: {:.5f}%'.format(logreg.score(X_test, y_test)*100))
# 2. Decision tree with pruning hyperparameters.
#    Variable renamed from `tree`, which shadowed the sklearn.tree module
#    imported at the top of the file.
tree_clf = DecisionTreeClassifier(max_depth=5, min_samples_leaf=20,
                                  min_samples_split=40).fit(X_train, y_train)
print('훈련 세트 정확도: {:.5f}%'.format(tree_clf.score(X_train, y_train)*100))
print('테스트 세트 정확도: {:.5f}%'.format(tree_clf.score(X_test, y_test)*100))
# 3. Random forest — unlike a single decision tree, each split considers
#    a random subset of features.
#    Variable renamed from `random`, which shadowed the stdlib module name.
forest = RandomForestClassifier(n_estimators=300, random_state=7).fit(X_train, y_train)
print('훈련 세트 정확도: {:.8f}%'.format(forest.score(X_train, y_train)*100))
print('테스트 세트 정확도: {:.8f}%'.format(forest.score(X_test, y_test)*100))
# 4. Gradient boosting with a shallow base tree and small learning rate
boost = GradientBoostingClassifier(max_depth=3, learning_rate=0.05).fit(X_train, y_train)
print('훈련 세트 정확도: {:.8f}%'.format(boost.score(X_train, y_train)*100))
# better generalization: training accuracy is lower than the forest's
print('테스트 세트 정확도: {:.8f}%'.format(boost.score(X_test, y_test)*100))