# Tag each dataset with its wine color (red = 1, white = 0), stack them into a
# single frame, and inspect the quality distribution — overall and per color.
red_wine['color'] = 1
white_wine['color'] = 0
wine = pd.concat([red_wine, white_wine])

import plotly.express as px

# Overall quality distribution.
fig = px.histogram(wine, x='quality')
fig.show()

# Quality distribution broken down by wine color.
fig = px.histogram(wine, x='quality', color='color')
fig.show()
# Separate features from the color label and hold out 20% of the rows for
# testing the red/white classifier.
X = wine.drop(['color'], axis=1)
y = wine['color']

from sklearn.model_selection import train_test_split
import numpy as np
import plotly.graph_objects as go

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=13)

# Overlay the quality histograms of the two splits to confirm that train and
# test cover the data in similar proportions.
fig = go.Figure()
for label, subset in (('Train', X_train), ('Test', X_test)):
    fig.add_trace(go.Histogram(x=subset['quality'], name=label))
fig.update_layout(barmode='overlay')
fig.update_traces(opacity=0.75)
fig.show()
# update_layout: controls how the traces are drawn (barmode: stack, overlay, etc.)
# (reference: https://plotly.com/python/reference/layout/)
# The train and test sets contain the data in similar proportions.
# Fit a shallow (depth-2) decision tree to predict wine color, then report
# accuracy on both splits to check for over/underfitting.
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

clf = DecisionTreeClassifier(max_depth=2, random_state=13)
clf.fit(X_train, y_train)

y_pred_tr = clf.predict(X_train)
y_pred_test = clf.predict(X_test)
print('Train ACC : ', accuracy_score(y_train, y_pred_tr))
print('Test ACC : ', accuracy_score(y_test, y_pred_test))
# >>> output:
# Train ACC : 0.9553588608812776
# Test ACC : 0.9569230769230769
# Box plots of three features on very different scales — motivates trying
# feature scaling below.
fig = go.Figure()
for column in ('fixed acidity', 'chlorides', 'quality'):
    fig.add_trace(go.Box(y=X[column], name=column))
fig.show()
# Rescale the features two ways, then retrain on the MinMax-scaled version.
from sklearn.preprocessing import MinMaxScaler, StandardScaler

mms = MinMaxScaler()
ss = StandardScaler()
X_ss = ss.fit_transform(X)
X_mms = mms.fit_transform(X)

# Wrap the scaled arrays back into DataFrames, keeping the original columns.
X_ss_pd = pd.DataFrame(X_ss, columns=X.columns)
X_mms_pd = pd.DataFrame(X_mms, columns=X.columns)

# Same split / model / evaluation as before, on the MinMax-scaled features.
X_train, X_test, y_train, y_test = train_test_split(
    X_mms_pd, y, test_size=0.2, random_state=13)
clf = DecisionTreeClassifier(max_depth=2, random_state=13)
clf.fit(X_train, y_train)
y_pred_tr = clf.predict(X_train)
y_pred_test = clf.predict(X_test)
print('Train ACC : ', accuracy_score(y_train, y_pred_tr))
print('Test ACC : ', accuracy_score(y_test, y_pred_test))
# >>> output:
# Train ACC : 0.9553588608812776
# Test ACC : 0.9569230769230769
# Repeat with the standard-scaled features. Decision-tree splits are
# scale-invariant, so the accuracy should match the unscaled runs exactly.
X_train, X_test, y_train, y_test = train_test_split(
    X_ss_pd, y, test_size=0.2, random_state=13)
clf = DecisionTreeClassifier(max_depth=2, random_state=13)
clf.fit(X_train, y_train)
y_pred_tr = clf.predict(X_train)
y_pred_test = clf.predict(X_test)
print('Train ACC : ', accuracy_score(y_train, y_pred_tr))
print('Test ACC : ', accuracy_score(y_test, y_pred_test))
# >>> output:
# Train ACC : 0.9553588608812776
# Test ACC : 0.9569230769230769
dict(zip(X_train.columns, clf.feature_importances_))
# >>> output:
# {'fixed acidity': 0.0,
#  'volatile acidity': 0.0,
#  'citric acid': 0.0,
#  'residual sugar': 0.0,
#  'chlorides': 0.24230360549660776,
#  'free sulfur dioxide': 0.0,
#  'total sulfur dioxide': 0.7576963945033922,
#  'density': 0.0,
#  'pH': 0.0,
#  'sulphates': 0.0,
#  'alcohol': 0.0,
#  'quality': 0.0}
# New task: a binary "taste" label (1 when quality > 5, else 0), predicted
# from the remaining features. quality itself is dropped alongside taste so
# the target is not leaked into the inputs.
wine['taste'] = [int(grade > 5) for grade in wine['quality']]
X = wine.drop(['taste', 'quality'], axis=1)
y = wine['taste']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=13)

clf = DecisionTreeClassifier(max_depth=2, random_state=13)
clf.fit(X_train, y_train)

y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)
print(f'Train ACC : {accuracy_score(y_train, y_pred_train)}')
print(f'test ACC : {accuracy_score(y_test, y_pred_test)}')
# >>> output:
# Train ACC : 0.7294593034442948
# test ACC : 0.7161538461538461
# Visualize the fitted depth-2 taste tree with colored, rounded nodes.
import matplotlib.pyplot as plt
import sklearn.tree as tree

plt.figure(figsize=(12, 8))
tree.plot_tree(
    clf,
    feature_names=X.columns,
    rounded=True,
    filled=True,
);