import pandas as pd
# Load the wine dataset from the remote CSV.
wine = pd.read_csv('https://bit.ly/wine_csv_data')

# Quick inspection of the frame. info() prints its summary itself; the values
# returned by head() and describe() are only shown when an interactive
# session echoes them.
wine.head()
wine.info()
wine.describe()

# Features are the three measurement columns; labels come from 'class'.
data = wine.loc[:, ['alcohol', 'sugar', 'pH']].to_numpy()
target = wine.loc[:, 'class'].to_numpy()

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the seed is fixed so the split is
# repeatable.
(train_input, test_input,
 train_target, test_target) = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=42,
)
print(train_input.shape, test_input.shape)
(5197, 3) (1300, 3) // train_set이 test_set의 약 4배 (test_size=0.2)
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits.
ss = StandardScaler()
train_scaled = ss.fit_transform(train_input)
test_scaled = ss.transform(test_input)

from sklearn.linear_model import LogisticRegression

# Fit a logistic regression on the standardized features. fit() returns the
# estimator, so construction and training are chained.
lr = LogisticRegression().fit(train_scaled, train_target)

# Score on the training split, then on the held-out split.
print(lr.score(train_scaled, train_target))
print(lr.score(test_scaled, test_target))
# Learned coefficients and intercept of the fitted model.
print(lr.coef_, lr.intercept_)
0.7808350971714451
0.7776923076923077
[[ 0.51270274 1.6733911 -0.68767781]] [1.81777902]
-> alcohol, sugar가 높을수록 화이트 와인, pH가 높을수록 레드 와인
from sklearn.tree import DecisionTreeClassifier

# Decision tree with no depth limit set; the seed is fixed for repeatable
# results.
dt = DecisionTreeClassifier(random_state=42).fit(train_scaled, train_target)

# Score on the training split, then on the held-out split.
print(dt.score(train_scaled, train_target))
print(dt.score(test_scaled, test_target))
0.996921300750433
0.8592307692307692
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

# First figure: draw the entire fitted tree.
plt.figure(figsize=(10, 7))
plot_tree(dt)
plt.show()

# Second figure: limit the drawing to depth 1, with color-filled nodes and
# the feature names shown in place of column indices.
plt.figure(figsize=(10, 7))
plot_tree(
    dt,
    feature_names=['alcohol', 'sugar', 'pH'],
    filled=True,
    max_depth=1,
)
plt.show()
-> sugar를 기준으로 샘플이 나누어짐
# Refit the tree with its depth capped at 3, using the same seed as before.
dt = DecisionTreeClassifier(max_depth=3, random_state=42).fit(
    train_scaled, train_target
)

# Score on the training split, then on the held-out split.
print(dt.score(train_scaled, train_target))
print(dt.score(test_scaled, test_target))
0.8454877814123533 // 과대적합 완화
0.8415384615384616
# Show the fitted tree's importance value for each input feature.
print(dt.feature_importances_)
[0.12345626 0.86862934 0.0079144 ]
-> 순서대로 alcohol, sugar, pH의 특성 중요도 (sugar가 약 0.87로 가장 높음)