- ML_1016_01_feature_engineering.ipynb
Feature Scaling
- StandardScaler : standardization (zero mean, unit variance); commonly used with neural network models
- MinMaxScaler : scales features to the 0~1 range
- RobustScaler : uses median/IQR, so it is robust to outliers
- PowerTransformer : stabilizes variance and minimizes skewness; supports the Box-Cox and Yeo-Johnson transforms
from sklearn.preprocessing import PowerTransformer

# Yeo-Johnson handles zero/negative values; Box-Cox requires strictly positive data
pt = PowerTransformer(method='yeo-johnson')
X_train[num_features] = pt.fit_transform(X_train[num_features])  # fit on train only
X_test[num_features] = pt.transform(X_test[num_features])        # reuse train statistics
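The remaining scalers share the same fit/transform API; a minimal sketch, assuming the same X_train, X_test, and num_features as above:

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

# Each scaler is fit on the training split only, then applied to both splits
for scaler in (StandardScaler(), MinMaxScaler(), RobustScaler()):
    X_tr = scaler.fit_transform(X_train[num_features])  # learn scaling statistics from train
    X_te = scaler.transform(X_test[num_features])       # apply the same statistics to test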
Reference: "All about Feature Scaling"
Feature Selection
- Model-based feature selection
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

# threshold=None keeps features whose importance exceeds the mean importance
select1 = SelectFromModel(RandomForestClassifier(random_state=0), threshold=None)
X_train_sc3_fs1 = select1.fit(X_train_sc3, y_train).transform(X_train_sc3)
print("X_train_sc3.shape: {}, X_train_sc3_fs1.shape: {}".format(X_train_sc3.shape, X_train_sc3_fs1.shape))

# Visualize which features were kept (black = selected)
mask = select1.get_support()
plt.matshow(mask.reshape(1, -1), cmap="gray_r")
plt.show()

X_test_sc3_fs1 = select1.transform(X_test_sc3)
svm.fit(X_train_sc3_fs1, y_train).score(X_test_sc3_fs1, y_test)
- Univariate feature selection
  - Based on statistical tests
  - Measures the statistical significance between y and each individual feature
  - Mainly useful for linear models
from sklearn.feature_selection import SelectKBest

# Keep the 10 highest-scoring features (default score_func: f_classif, the ANOVA F-test)
select2 = SelectKBest(k=10)
X_train_sc3_fs2 = select2.fit_transform(X_train_sc3, y_train)
X_train_sc3_fs2.shape

# Visualize the selected features (black = selected)
mask = select2.get_support()
plt.matshow(mask.reshape(1, -1), cmap="gray_r")
plt.show()

X_test_sc3_fs2 = select2.transform(X_test_sc3)
svm.fit(X_train_sc3_fs2, y_train).score(X_test_sc3_fs2, y_test)
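The F-test only captures linear dependence; mutual information is a common drop-in alternative for nonlinear relationships. A minimal sketch with the same variables (a variation, not part of the original notebook):

from sklearn.feature_selection import SelectKBest, mutual_info_classif

# mutual_info_classif scores nonlinear dependence between each feature and y
select_mi = SelectKBest(score_func=mutual_info_classif, k=10)
X_train_mi = select_mi.fit_transform(X_train_sc3, y_train)
X_test_mi = select_mi.transform(X_test_sc3)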
- Recursive feature elimination
from sklearn.feature_selection import RFE

# Repeatedly refit the estimator, dropping the least important feature (step=1) until 10 remain
select3 = RFE(estimator=RandomForestClassifier(random_state=0), n_features_to_select=10, step=1)
X_train_sc3_fs3 = select3.fit_transform(X_train_sc3, y_train)
X_train_sc3_fs3.shape

X_test_sc3_fs3 = select3.transform(X_test_sc3)
svm.fit(X_train_sc3_fs3, y_train).score(X_test_sc3_fs3, y_test)
- Feature selection considering multicollinearity (correlation)
from collections import Counter
import pandas as pd

# Collect every feature pair with its correlation coefficient
correlation_matrix = df.corr()
pairs = []
for i in range(len(correlation_matrix.columns)):
    for j in range(i + 1, len(correlation_matrix.columns)):
        pairs.append((correlation_matrix.columns[i], correlation_matrix.columns[j], correlation_matrix.iloc[i, j]))

# Sort by absolute correlation so strong negative correlations are caught too
pairs = sorted(pairs, key=lambda x: -abs(x[2]))
result_df = pd.DataFrame(pairs, columns=['col1', 'col2', 'correlation'])

# Columns appearing in highly correlated pairs (|r| > 0.7)
high_corr = result_df['correlation'].abs() > 0.7
high_corr_cols = result_df.loc[high_corr, 'col1'].tolist() + result_df.loc[high_corr, 'col2'].tolist()

# Drop the 10 columns most frequently involved in highly correlated pairs
C = Counter(high_corr_cols)
d_cols = pd.DataFrame(C.most_common()[:10]).iloc[:, 0].tolist()
df_ = df.drop(columns=d_cols)
Feature Generation
- Automatically generating polynomial and interaction features
from sklearn.preprocessing import PolynomialFeatures

# interaction_only=True would generate only cross terms (no squares);
# here we use degree-2 polynomials without the constant bias column
poly = PolynomialFeatures(2, include_bias=False)
X_train_sc3_poly = poly.fit_transform(X_train_sc3)
X_test_sc3_poly = poly.transform(X_test_sc3)
print(X_train_sc3_poly.shape, X_test_sc3_poly.shape)
svm.fit(X_train_sc3_poly, y_train).score(X_test_sc3_poly, y_test)

# The expanded feature space is large, so select the best 20 before refitting
select2 = SelectKBest(k=20)
X_train_sc3_poly_fs2 = select2.fit(X_train_sc3_poly, y_train).transform(X_train_sc3_poly)
X_test_sc3_poly_fs2 = select2.transform(X_test_sc3_poly)
print(X_train_sc3_poly_fs2.shape)
svm.fit(X_train_sc3_poly_fs2, y_train).score(X_test_sc3_poly_fs2, y_test)

# Names of the selected polynomial features (get_feature_names_out in recent scikit-learn)
mask = select2.get_support()
np.array(poly.get_feature_names_out())[mask]
- Ratio features : divide one important numerical feature by another (a sketch follows the aggregation examples below)
- Product features : among the top features by feature importance, multiply numerical features together and add the result
- Addition or subtraction features : create new features by adding or subtracting important features
- Aggregation features : built from a combination of a categorical and a numerical feature; use each category group's mean, median, variance, and standard deviation as features
# Total ATM withdrawals per customer as an aggregation feature
group_object = credit_card.groupby(by=['SK_ID_CURR'])['AMT_DRAWINGS_ATM_CURRENT'].agg('sum').reset_index()
# Rolling-window aggregation: purchase amount and count over the last 3/6/12 months
# (Korean column names: 최근{m}개월_구매금액 = amount, 최근{m}개월_구매건수 = count, over the last m months)
for m in [3, 6, 12]:
    start = str(pd.to_datetime(tr.tran_date.max()) - pd.offsets.MonthBegin(m))
    f = tr.query('tran_date >= @start').groupby('cust_id')['amount'].agg([
        (f'최근{m}개월_구매금액', np.sum),
        (f'최근{m}개월_구매건수', np.size)
    ]).reset_index()
    display(f)
    features = features.merge(f, how='left')
features
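A minimal sketch of the ratio/product/addition/subtraction features listed above, using hypothetical columns feat_a and feat_b as stand-ins for two important numerical features:

# feat_a and feat_b are placeholders for two important numerical features
df['a_div_b'] = df['feat_a'] / (df['feat_b'] + 1e-9)  # ratio (epsilon avoids division by zero)
df['a_times_b'] = df['feat_a'] * df['feat_b']         # product
df['a_plus_b'] = df['feat_a'] + df['feat_b']          # addition
df['a_minus_b'] = df['feat_a'] - df['feat_b']         # subtraction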
- Clustering-based feature generation
For a classification problem:
import seaborn as sns
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import v_measure_score

# Search for the k whose clusters align best with the class labels (V-measure)
results = []
for k in range(2, 51):
    kmeans = MiniBatchKMeans(n_clusters=k)
    pred = kmeans.fit_predict(x_train)
    results.append(v_measure_score(y_train, pred))
sns.lineplot(x=range(2, 51), y=results);
sns.lineplot(x=range(2, 10), y=results[:8]);  # zoom in on small k

# Profile the chosen clusters (col/idx: feature names and index of x_train)
kmeans = MiniBatchKMeans(n_clusters=3)
clusters = pd.DataFrame(x_train, columns=col, index=idx).join(y_train)
clusters['group'] = kmeans.fit_predict(x_train)
clusters.groupby('group').mean().T
# Compare each cluster's mean profile against the overall mean
c_summary = pd.DataFrame(clusters.mean(), columns=['overall'])
c_summary = c_summary.join(clusters.groupby('group').mean().T)
c_diff = c_summary.subtract(c_summary['overall'], axis=0)
c_diff['overall'] = c_summary['overall']
c_diff
churn_prob = clusters.groupby('group')[['churn']].mean().reset_index()
| group | churn |
|---|---|
| 0 | 0.119900 |
| 1 | 0.143210 |
| 2 | 0.515434 |
from sklearn.metrics import roc_auc_score

# Append the cluster label as an extra feature
x_train = np.hstack((x_train, kmeans.predict(x_train).reshape(len(x_train), 1)))
x_test = np.hstack((x_test, kmeans.predict(x_test).reshape(len(x_test), 1)))

rf = RandomForestClassifier(max_depth=10)
rf.fit(x_train, y_train)
rf_pred = rf.predict_proba(x_test)
roc_auc_score(y_test, rf_pred[:, 1])  # probability of the positive class