What I learned

One hot encoding

import pandas as pd

pd.get_dummies(data, prefix='X')       # if no columns are specified, every categorical column is one-hot encoded
pd.get_dummies(data, drop_first=True)  # dummy coding: drops the first level of each categorical column

from category_encoders import OneHotEncoder

encoder = OneHotEncoder(use_cat_names=True)  # use_cat_names=True keeps the original category values in the new column names

# Difference: get_dummies appends the generated dummy columns at the end of the frame,
# while the category_encoders OneHotEncoder seems to keep columns in their original order.
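A minimal sketch of that difference on a made-up toy frame (the 'city'/'price' columns here are hypothetical):

import pandas as pd
from category_encoders import OneHotEncoder

data = pd.DataFrame({'city': ['Seoul', 'Busan', 'Seoul'], 'price': [3, 1, 2]})  # toy data

dummies = pd.get_dummies(data)                                   # dummy columns appended after 'price'
encoded = OneHotEncoder(use_cat_names=True).fit_transform(data)  # dummy columns stay where 'city' was

print(dummies.columns.tolist())  # e.g. ['price', 'city_Busan', 'city_Seoul']
print(encoded.columns.tolist())  # e.g. ['city_Seoul', 'city_Busan', 'price']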

Pandas profiling

import pandas_profiling
from pandas_profiling import ProfileReport

df.profile_report()  # both of these calls produce the same report
ProfileReport(df)
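A short sketch for saving the report to disk; the title string and file name here are made up (ProfileReport objects expose a to_file method):

report = ProfileReport(df, title='EDA Report')  # title is optional
report.to_file('report.html')                   # writes a standalone HTML report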

Distribution plot

sns.displot   # newer figure-level function (seaborn 0.11+)
sns.distplot  # deprecated; replaced by displot (figure-level) and histplot (axes-level)
# They look similar, but displot creates its own figure, so axes-level tweaks
# (like passing an existing ax) don't take effect the way they do with distplot.
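A minimal sketch of the two current options, assuming a DataFrame df with a 'price' column:

import seaborn as sns
import matplotlib.pyplot as plt

sns.displot(df['price'], kde=True)  # figure-level: creates and manages its own figure
plt.show()

fig, ax = plt.subplots()
sns.histplot(df['price'], kde=True, ax=ax)  # axes-level drop-in replacement for distplot
plt.show()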

Removing outliers with np.percentile

import numpy as np

# np.percentile takes q on a 0-100 scale, so 0.5 / 99.5 trims the bottom and
# top 0.5% symmetrically (0.05 would be the much smaller 0.05th percentile)
df = df[(df['price'] >= np.percentile(df['price'], 0.5)) &
        (df['price'] <= np.percentile(df['price'], 99.5))]
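A quick check of the 0-100 scale with made-up numbers:

import numpy as np

prices = np.arange(1, 1001)         # hypothetical values 1..1000
print(np.percentile(prices, 0.5))   # 5.995 -> the 0.5th percentile
print(np.percentile(prices, 99.5))  # 995.005 -> the 99.5th percentile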

K best feature selection

from sklearn.feature_selection import f_regression, SelectKBest

selector = SelectKBest(score_func=f_regression, k=10)  # scores each feature with the F statistic and its p-value

X_train_selected = selector.fit_transform(X_train, y_train)  # fit on the training data and select
X_test_selected = selector.transform(X_test)                 # apply the same selection to the test data

mark = selector.get_support()   # boolean mask of the selected features (only valid after fitting)
all_names = X_train.columns     # use the feature columns, not the full df, so the mask lines up
select_names = all_names[mark]  # names of the k selected features
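After fitting, the per-feature scores are also available; a short sketch of turning them into a table (the variable names are mine):

import pandas as pd

score_table = pd.DataFrame({
    'feature': X_train.columns,
    'F score': selector.scores_,   # F statistic per feature
    'p-value': selector.pvalues_,  # corresponding p-values
}).sort_values('F score', ascending=False)

print(score_table.head(10))  # with k=10, the top 10 rows are the selected features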

Ridge

from sklearn.linear_model import LinearRegression, Ridge, RidgeCV
from sklearn.metrics import mean_absolute_error, r2_score

ridge = Ridge(alpha=alpha)  # Ridge takes a single alpha, not alphas;
                            # normalize= was removed in scikit-learn 1.2, so scale features beforehand if needed
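A minimal sketch that fits Ridge at a few alphas by hand and compares scores (RidgeCV below automates this); X_test_selected and y_test are assumed from the earlier split:

for alpha in [0.001, 0.01, 0.1, 1]:
    ridge = Ridge(alpha=alpha)
    ridge.fit(X_train_selected, y_train)
    y_pred = ridge.predict(X_test_selected)
    print(f'alpha={alpha}: MAE={mean_absolute_error(y_test, y_pred):.3f}, '
          f'R^2={r2_score(y_test, y_pred):.3f}')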

RidgeCV

alphas = [0.001, 0.01, 0.1, 1]  # candidate alphas; recent scikit-learn requires alpha > 0 (alpha=0 is plain OLS anyway)

ridge = RidgeCV(alphas=alphas, cv=5)  # normalize= was removed in scikit-learn 1.2
ridge.fit(X_train_selected, y_train)
print('best alpha : ', ridge.alpha_)
print('best score : ', ridge.best_score_)  # CV score at the best alpha
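After fit, RidgeCV has already been refit on the full training data at the best alpha, so it can predict directly; y_test is assumed from the earlier split:

y_pred = ridge.predict(X_test_selected)
print('test MAE :', mean_absolute_error(y_test, y_pred))
print('test R^2 :', r2_score(y_test, y_pred))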
