- X : 소요분 / 방송구분 / 판매단가 / ARS 금액 /수수료율 / '방송요일' / '방송월'
- Y : '목표달성여부'
- 학습 : 검증 = 8 : 2
- 특성 공학 기법 (결측처리(평균,최빈값) + 스케일링&인코딩)
- 알고리즘 (Decision Tree 알고리즘 / 하이퍼파라미터 튜닝 X)
- 평가
✔︎ X,Y 설정
# Target: goal-achievement flag; features: broadcast/sales attributes.
feature_cols = ['소요분', '방송구분', '판매단가', 'ARS금액', '수수료율', '방송요일', '방송월']
Y = df1['목표달성여부']
X = df1[feature_cols]
✔︎ 훈련을 위한 필요 라이브러리 불러오기
# Split into training and validation data
from sklearn.model_selection import train_test_split
# Build a pipeline that runs feature engineering together with training
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
# Feature-engineering steps (imputation, scaling, encoding)
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
# Algorithm
from sklearn.tree import DecisionTreeClassifier
# Cross-validation + hyperparameter tuning
from sklearn.model_selection import GridSearchCV
# Evaluation
from sklearn.metrics import classification_report
✔︎ 학습데이터와 검증데이터 분할
# Hold out 20% of the rows as a validation set; fixed seed for reproducibility.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=1234
)
✔︎ 연속형 변수와 범주형 변수 구분
# Column lists by dtype. select_dtypes is the idiomatic way to do this and,
# unlike describe(include='object'), does not raise when no column of the
# requested dtype exists — it just returns an empty Index.
numeric_list = X.select_dtypes(include='number').columns    # numeric feature columns
category_list = X.select_dtypes(include='object').columns   # categorical feature columns
✔︎ 각 데이터 타입 별 파이프라인 구축
# Numeric features: mean-impute missing values, then scale into [0, 1].
numeric_pipe = make_pipeline(
    SimpleImputer(strategy='mean'),
    MinMaxScaler(),
)
# Categorical features: impute with the most frequent value, then one-hot
# encode; categories unseen during fit are encoded as all zeros.
category_pipe = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore'),
)
# Route each column group through its matching preprocessing pipeline.
preprocess_pipe = make_column_transformer(
    (numeric_pipe, numeric_list),
    (category_pipe, category_list),
)
✔︎ 특성공학 + 학습
# Chain preprocessing with the classifier so feature engineering is learned
# only from the training split, then fit the whole thing.
tree_clf = DecisionTreeClassifier()
model_pipe = make_pipeline(preprocess_pipe, tree_clf)
model_pipe.fit(X_train, Y_train)
✔︎ 하이퍼 파라미터 튜닝
# Search grid for the tree step inside the pipeline (sklearn requires the
# lowercase step-name prefix 'decisiontreeclassifier__').
hyperparameter = {
    'decisiontreeclassifier__max_depth': range(5, 11),
    'decisiontreeclassifier__min_samples_split': range(5, 11),
}
# 6 x 6 = 36 parameter combinations x 3 CV folds = 108 fits in total.
# NOTE(review): scoring='f1' assumes a binary target with positive label 1 —
# confirm '목표달성여부' is encoded as 0/1.
grid_model = GridSearchCV(model_pipe, param_grid=hyperparameter,
                          cv=3, scoring='f1', n_jobs=-1)
grid_model.fit(X_train, Y_train)  # stray pasted text removed from this line
best_model = grid_model.best_estimator_
✔︎ 평가를 위한 평가함수 정의
def eval_func1(model):
    """Print classification reports for the training and validation splits.

    Indentation restored (it was lost in the paste). Relies on the
    module-level X_train/X_test/Y_train/Y_test created by train_test_split.

    :param model: fitted estimator exposing .predict()
    """
    Y_train_pred = model.predict(X_train)
    Y_test_pred = model.predict(X_test)
    print('학습 성능')
    print(classification_report(Y_train, Y_train_pred))
    print('일반화 성능')
    print(classification_report(Y_test, Y_test_pred))
✔︎ 함수를 활용하여 평가
# Report train vs. validation performance of the tuned model.
eval_func1(best_model)
>>> 학습 성능
precision recall f1-score support
0 0.69 0.92 0.79 13042
1 0.71 0.32 0.44 7984
accuracy 0.69 21026
macro avg 0.70 0.62 0.61 21026
weighted avg 0.70 0.69 0.66 21026
일반화 성능
precision recall f1-score support
0 0.68 0.90 0.77 3291
1 0.62 0.29 0.39 1966
accuracy 0.67 5257
macro avg 0.65 0.59 0.58 5257
weighted avg 0.66 0.67 0.63 5257
✔︎ 사용할 데이터 불러오기 (질병 데이터)
# Load the disease dataset (tumor measurements per image).
df2 = pd.read_csv('12_Data.csv')
df2.head(2)  # Diagnosis: B = normal (benign), M = cancer (malignant)
>>> Image ID Diagnosis Mean Radius Mean Perimeter Mean Area Mean Texture Mean Smoothness Mean Compactness Mean Concavity Mean Concave Points ... SE Radius SE Perimeter SE Area SE Texture SE Smoothness SE Compactness SE Concavity SE Concave Points SE Symmetry SE Fractal Dim
0 842302 M 17.99 122.8 1001.0 10.38 0.12 0.27760 0.3001 0.1471 ... 1.0950 8.589 153.40 0.9053 0.0064 0.0490 0.0537 0.0159 0.0300 0.0062
1 842517 M 20.57 132.9 1326.0 17.77 0.08 0.07864 0.0869 0.0702 ... 0.5435 3.398 74.08 0.7339 0.0052 0.0131 0.0186 0.0134 0.0139 0.0035
✔︎ ['Diagnosis'] 컬럼 확인
# Class-balance check: the target is imbalanced (B 357 vs. M 212 per output).
df2['Diagnosis'].value_counts()
>>> Diagnosis
B 357
M 212
Name: count, dtype: int64
✔︎ Mean Radius 값과 Mean Concavity 에 따라 M/B 값이 어떻게 바뀌는지 확인
# Fixed class colors: malignant (M) in red, benign (B) in blue.
color_mapping = {'M': 'red', 'B': 'blue'}
fig1 = px.scatter(
    df2,
    x='Mean Radius',
    y='Mean Concavity',
    color='Diagnosis',
    color_discrete_map=color_mapping,
)
fig1  # source data, before any resampling
✔︎ 라이브러리 불러오기
from imblearn.under_sampling import RandomUnderSampler
✔︎ X,Y 설정
# Two-feature X and the diagnosis label for the resampling experiments.
sample_features = ['Mean Radius', 'Mean Concavity']
X = df2[sample_features]
Y = df2['Diagnosis']
✔︎ Random Undersampling
# Random undersampling: drop majority-class rows until classes balance.
under_sampler = RandomUnderSampler()
X_resamp, Y_resamp = under_sampler.fit_resample(X, Y)
# Attach the label column so the resampled frame can be plotted by class.
X_resamp['Target'] = Y_resamp
from plotly.subplots import make_subplots
# Same scatter as fig1, but drawn from the undersampled data.
fig2 = px.scatter(
    X_resamp,
    x='Mean Radius',
    y='Mean Concavity',
    color='Target',
    color_discrete_map=color_mapping,
)
✔︎ Subplot 생성
# 1x2 canvas: original data on the left, resampled data on the right.
figure = make_subplots(rows=1, cols=2, subplot_titles=('Source', 'Resample'))
✔︎ 각각의 그래프를 subplot에 추가
def figure_func():
    """Show the original and undersampled scatter plots side by side.

    Indentation restored (it was lost in the paste). Reads module-level
    df2 and X_resamp; displays the figure as a side effect.
    """
    color_mapping = {'M': 'red', 'B': 'blue'}
    # Left panel: full source data, colored by diagnosis.
    fig1 = px.scatter(df2, x='Mean Radius', y='Mean Concavity',
                      color='Diagnosis', color_discrete_map=color_mapping)
    # Right panel: resampled data, colored by the attached Target label.
    fig2 = px.scatter(X_resamp, x='Mean Radius', y='Mean Concavity',
                      color='Target', color_discrete_map=color_mapping)
    figure = make_subplots(rows=1, cols=2, subplot_titles=('Source', 'Resample'))
    # Copy each trace into its subplot cell.
    for trace in fig1.data:
        figure.add_trace(trace, row=1, col=1)
    for trace in fig2.data:
        figure.add_trace(trace, row=1, col=2)
    figure.show()
figure_func()
from imblearn.under_sampling import TomekLinks
# Tomek links: remove majority samples that form cross-class
# nearest-neighbor pairs, cleaning the class boundary.
tomek = TomekLinks()
X_resamp, Y_resamp = tomek.fit_resample(X, Y)
X_resamp['Target'] = Y_resamp
Y_resamp.value_counts()
>>> Diagnosis
B 342
M 212
Name: count, dtype: int64
from imblearn.under_sampling import EditedNearestNeighbours
# ENN: drop samples whose neighborhood disagrees with their own label.
enn = EditedNearestNeighbours()
X_resamp, Y_resamp = enn.fit_resample(X, Y)
X_resamp['Target'] = Y_resamp
Y_resamp.value_counts()
>>> Diagnosis
B 294
M 212
Name: count, dtype: int64
from imblearn.over_sampling import RandomOverSampler
# Random oversampling: duplicate minority rows until classes balance.
over_sampler = RandomOverSampler()
X_resamp, Y_resamp = over_sampler.fit_resample(X, Y)
X_resamp['Target'] = Y_resamp
Y_resamp.value_counts()
>>> Diagnosis
M 357
B 357
Name: count, dtype: int64
from imblearn.over_sampling import ADASYN
# ADASYN: synthesize new minority samples, weighted toward regions
# that are harder to learn.
adasyn = ADASYN()
X_resamp, Y_resamp = adasyn.fit_resample(X, Y)
X_resamp['Target'] = Y_resamp
Y_resamp.value_counts()
>>> Diagnosis
M 362
B 357
Name: count, dtype: int64
Combining Sampling : Under Sampling + Over Sampling
from imblearn.combine import SMOTETomek
# Combined strategy: SMOTE oversampling followed by Tomek-link cleaning.
smote_tomek = SMOTETomek()
X_resamp, Y_resamp = smote_tomek.fit_resample(X, Y)
X_resamp['Target'] = Y_resamp
Y_resamp.value_counts()
>>> Diagnosis
M 342
B 342
Name: count, dtype: int64
✔︎ 사용 데이터 불러오기
- 정형외과 병원 / 디스크 수술 후 환자들의 데이터
# Orthopedic hospital data: patients after disc surgery.
df2 = pd.read_csv('15_Data.csv')
✔︎ X,Y 설정
# Target: surgery-failure flag; features: patient attributes.
surgery_features = ['연령', '체중', '신장', '수술기법', '통증기간(월)', '헤모글로빈수치']
Y = df2['수술실패여부']
X = df2[surgery_features]
✔︎ 학습데이터셋과 검증데이터셋 나누기
# Default split: 25% of the rows go to the validation set.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=1234)
numeric_list = X.describe().columns                     # numeric feature columns
category_list = X.describe(include='object').columns    # categorical feature columns
# imblearn's make_pipeline shadows sklearn's on purpose: only the imblearn
# pipeline accepts a sampler (SMOTE) as an intermediate step.
from imblearn.pipeline import make_pipeline
numeric_pipe = make_pipeline(SimpleImputer(strategy='mean'), MinMaxScaler())
# handle_unknown='ignore' added for consistency with the earlier encoder:
# unseen categories at predict time become all-zero vectors instead of raising.
category_pipe = make_pipeline(SimpleImputer(strategy='most_frequent'),
                              OneHotEncoder(handle_unknown='ignore'))
✔︎ 파이프라인 구성
# Fix: SMOTE is used below but was never imported anywhere in this file,
# which raises NameError at this point.
from imblearn.over_sampling import SMOTE

preprocess_pipe2 = make_column_transformer((preprocess_numeric := numeric_pipe, numeric_list),
                                           (category_pipe, category_list))
# Pipeline order: preprocess -> oversample the minority class (applied only
# during fit, never at predict time) -> decision tree.
model_pipe2 = make_pipeline(preprocess_pipe2, SMOTE(), DecisionTreeClassifier())
✔︎ 학습
# Reuse the hyperparameter grid defined earlier; the best model is picked
# by F1 score over 3 cross-validation folds.
grid_model = GridSearchCV(
    model_pipe2,
    param_grid=hyperparameter,
    scoring='f1',
    cv=3,
    n_jobs=-1,
)
grid_model.fit(X_train, Y_train)
best_model = grid_model.best_estimator_
✔︎ 평가
# Compare train vs. validation reports with the shared helper.
eval_func1(best_model)