feature_importances_ : feature별 중요도 반환
wine dataset : color 분류
#데이터 불러오기
import pandas as pd
wine = pd.read_csv('data/wine.csv')
#데이터 분리
X = wine.drop(columns='color')
y = wine['color']
##quality를 Label Encoding 처리
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(['A','B','C'])
X['quality'] = le.transform(X['quality'])
#train/test 분리
from sklearn.model_selection import train_test_split
X_train, X_test, y_train,y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=0)
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(random_state=0)
tree.fit(X_train, y_train)
#학습 한 tree정보
tree.get_depth() # depth, level을 조회 >>max_depth
tree.get_n_leaves() #leaf node의 개수
#함수로 평가결과 조회
from metrics import print_metrics_classification
print_metrics_classification(y_train, tree.predict(X_train), tree.predict_proba(X_train)[:,1], 'Train set 평가결과')
print_metrics_classification(y_test, tree.predict(X_test), tree.predict_proba(X_test)[:,1], 'Test set 평가결과')
#각 feature(컬럼)의 중요도(점수)
tree.feature_importances_
### graphviz를 이용해 tree 구조 시각화
from sklearn.tree import export_graphviz
from graphviz import Source
graph = Source(export_graphviz(tree, #학습한 DecisionTree 모델
feature_names = X_train.columns,
class_names=['White', 'Red'],
rounded = True,
filled=True
))
GridSearchCV
max_depth
max_leaf_nodes
min_samples_leaf
max_features
-기준: 정확도
-feature_importances_ 확인
-tree 구조 확인(graphviz)
from sklearn.model_selection import GridSearchCV
params = {
"max_depth":range(1,14),
"max_leaf_nodes":range(10,34),
'min_samples_leaf':range(10,1000,50),
'max_features':range(1,13)
}
gs = GridSearchCV(DecisionTreeClassifier(random_state=0),
params,
scoring="accuracy",
cv=5,
n_jobs=-1)
gs.fit(X_train, y_train)
print('best_score:', gs.best_score_)
print('best param:', gs.best_params_)
best_model = gs.best_estimator_
fi = pd.Series(best_model.feature_importances_, index=X.columns)
fi.sort_values(ascending=False)
graph = Source(export_graphviz(best_model,
feature_names = X.columns,
class_names=['White', 'Red'],
filled=True,
rounded=True))
-회귀 (Regression)
import pandas as pd
from sklearn.model_selection import train_test_split
def get_boston_dataset(path='data/boston_hosing.csv', test_size=0.25):
df = pd.read_csv(path)
X = df.drop(columns='MEDV')
y = df['MEDV']
dataset = train_test_split(X, y, test_size=test_size, random_state=0)
return dataset
from dataset import get_boston_dataset
X_train, X_test, y_train, y_test = get_boston_dataset()
X_train.shape, X_test.shape
#모델링
from sklearn.tree import DecisionTreeRegressor
tree_rg = DecisionTreeRegressor(max_depth=2, random_state=0)
tree_rg.fit(X_train, y_train)
graph2 = Source(export_graphviz(tree_rg,
feature_names=X_train.columns,
filled=True,
rounded=True))
#데이터 분리
X_train, X_test, y_train, y_test = get_wine_dataset()
#모델 생성, 학습, 검증
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=500, #DecisionTree 모델 개수(최소 200개 이상)
max_features=10, #sampling할 feature의 개수, default: 'auto' - sqrt(전체feature수)
max_depth=5, #500개의 decisiontree에 공통적으로 적용될 hyper parameter
random_state=0,
n_jobs=-1
)
#학습
rfc.fit(X_train,y_train)
#검증
pred_train = rfc.predict(X_train)
pred_test = rfc.predict(X_test)
proba_pos_train = rfc.predict_proba(X_train)[:,1]
proba_pos_test = rfc.predict_proba(X_test)[:,1]
print_metrics_classification(y_train,pred_train, proba_pos_train,'train set')
print_metrics_classification(y_test,pred_test, proba_pos_test,'test set')
#feature importance
fi_rf = pd.Series(rfc.feature_importances_, index=X_train.columns)
fi_rf = fi_rf.sort_values(ascending=False)
-GridSearchCV (RandomForest)
#데이터 로드
(X_train, X_test, y_train, y_test), feature_names = get_breast_cancer_dataset() #scaling=True)
print(X_train.shape, X_test.shape)
print(X_train.mean(axis=0))
print(X_train.std(axis=0))
#모델 생성
params = {
"n_estimators":range(100,501,100),
"max_features":range(1,31),
'max_depth':range(1,6,1),
'min_samples_leaf':range(4,50)
}
rfc = RandomForestClassifier(random_state=0)
gs = GridSearchCV(rfc, params, scoring='accuracy', cv=4, n_jobs=-1)
#학습
gs.fit(X_train,y_train)
print(gs.best_score_)
print(gs.best_params_)
result_df = pd.DataFrame(gs.cv_results_).sort_values('rank_test_score')
best_model = gs.best_estimator_
fi = pd.Series(best_model.feature_importances_, index=feature_names).sort_values(ascending=False)
pred_test = best_model.predict(X_test)
print_metrics_classification(y_test, pred_test)