import pandas as pd
import numpy as np
import os
# Use the train data from the House Price dataset (path and file point at train.csv)
data = pd.read_csv(os.path.join(path, file), index_col=None)
data.head()
data.shape # (1460, 81)
data.info() # the Alley, FireplaceQu, PoolQC, Fence, MiscFeature columns have many missing values
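These columns can also be picked out programmatically rather than read off data.info(); a minimal sketch (the 40% cutoff is an assumed threshold, not from the original):
data.columns[data.isna().mean() > 0.4].tolist() # columns whose NaN ratio exceeds an assumed 40% cutoff
# on the standard train.csv this should yield the same five columns listed below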
na_feat = ['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature'] # columns with many missing values
data.drop(na_feat, inplace=True, axis=1) # with inplace=True the method replaces the existing df instead of returning a new one
data.shape # (1460, 76)
# Set the target feature
y = data['SalePrice']
data.drop(['Id', 'SalePrice'], axis=1, inplace=True)
for i in data.columns[data.isna().sum() > 0]:
    if data[i].dtype == 'object':
        data[i] = data[i].fillna('None') # categorical: fill with the string 'None'
    else:
        data[i] = data[i].fillna(-1) # numerical: fill with the sentinel -1
Output
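A quick sanity check that the loop above filled every missing value:
data.isna().sum().sum() # 0 if no NaN remains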
# numerical features
numerical_f = list(data.columns[data.dtypes!='object'])
# select the items to exclude from the numerical features (they are converted to object below)
numerical_f.remove('YrSold')
numerical_f.remove('YearBuilt')
numerical_f.remove('YearRemodAdd')
# convert these numerical features to object
data['YrSold'] = data['YrSold'].astype('object')
data['YearBuilt'] = data['YearBuilt'].astype('object')
data['YearRemodAdd'] = data['YearRemodAdd'].astype('object')
# Scaling
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
data[numerical_f] = scaler.fit_transform(data[numerical_f])
data.head()
Output
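MinMaxScaler maps each feature x to (x - min) / (max - min), so the scaled columns should now lie in [0, 1]; a quick check:
print(data[numerical_f].min().min(), data[numerical_f].max().max()) # expected: 0.0 1.0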
categorical_f = list(data.columns[data.dtypes=='object'])
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
for i in categorical_f:
    data[i] = encoder.fit_transform(data[i]) # fit_transform refits the encoder for each column
data.head()
Output
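Note that LabelEncoder imposes an arbitrary integer ordering on the categories, which linear models will treat as meaningful. A common alternative (not used here) is one-hot encoding; a sketch, left commented out so the original pipeline is unchanged:
# data = pd.get_dummies(data, columns=categorical_f) # one-hot encoding; would increase the column count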
from sklearn.model_selection import train_test_split
print(data.shape, y.shape) # (1460, 74) (1460,)
x_train, x_test, y_train, y_test = train_test_split(data, y, test_size=0.2, random_state=0)
print('train data: ', x_train.shape, y_train.shape) # train data: (1168, 74) (1168,)
print('test data: ', x_test.shape, y_test.shape) # test data: (292, 74) (292,)
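The metric snippets below assume y_t (true values) and y_pred (model predictions) already exist; a minimal toy example so they run on their own (the numbers are purely illustrative assumptions):
y_t = np.array([200000, 150000, 250000, 175000, 300000, 120000, 220000, 180000]) # illustrative true prices
y_pred = np.array([210000, 140000, 240000, 185000, 280000, 130000, 225000, 170000]) # illustrative predictions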
from sklearn.metrics import mean_squared_error
mean_squared_error(y_t, y_pred) # MSE: mean squared error
Output
from sklearn.metrics import mean_squared_error
mean_squared_error(y_t, y_pred)**0.5 # RMSE: root mean squared error
Output
from sklearn.metrics import mean_squared_log_error
mean_squared_log_error(y_t, y_pred)**0.5 # RMSLE: root mean squared log error
Output
from sklearn.metrics import r2_score
r2_score(y_t, y_pred) # a negative score means the model is less accurate than simply predicting the mean
Output
n : number of samples, p : number of independent variables
SST (total variation) = SSR (explained variation) + SSE (unexplained variation), so R2 = SSR/SST = 1 - SSE/SST; adjusted R2 additionally penalizes the number of predictors:
R2 = r2_score(y_t, y_pred)
n = len(y_t) # as n grows, the influence of p shrinks
p = 5 # example value; as p grows, adjusted R2 shrinks
adj_R2 = 1-((1-R2)*(n-1)/(n-p-1))
adj_R2
Output
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(x_train, y_train)
print('n coefficient :', len(lr.coef_))
print('x_train shape[1] :', x_train.shape[1])
print('lr.intercept_ : ', lr.intercept_)
print('lr.coef_ : ', lr.coef_) # slope estimates obtained from the data (beta1, beta2, ...); lr.intercept_ above is beta0
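To see which feature each coefficient belongs to, the estimates can be paired with the column names; a minimal sketch:
coef = pd.Series(lr.coef_, index=x_train.columns)
coef.reindex(coef.abs().sort_values(ascending=False).index).head(10) # largest-magnitude coefficients first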
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
def regression_metrics(true, pred):
    print('MAE : ', mean_absolute_error(true, pred))
    print('MSE : ', mean_squared_error(true, pred))
    print('RMSE : ', mean_squared_error(true, pred)**0.5)
    print('R-squared : ', r2_score(true, pred)) # share of variance explained (closer to 1 is better)
lr_predict = lr.predict(x_test)
regression_metrics(y_test, lr_predict)
Output
import statsmodels.api as sm
X = sm.add_constant(x_train)
sm_lr = sm.OLS(y_train, X)
sm_lr = sm_lr.fit()
print(sm_lr.summary())
Output
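The fitted results object also exposes the p-values directly; a quick sketch listing the coefficients significant at the 5% level:
sm_lr.pvalues[sm_lr.pvalues < 0.05].sort_values() # features significant at alpha = 0.05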
X_t = sm.add_constant(x_test)
sm_lr_predict = sm_lr.predict(X_t)
regression_metrics(y_test, sm_lr_predict)
Output
from sklearn.linear_model import Lasso, Ridge, ElasticNet
alphas = [10, 1, 0.1, 0.01]
for alpha in alphas:
    lasso = Lasso(alpha=alpha, random_state=0)
    lasso.fit(x_train, y_train)
    pred = lasso.predict(x_test)
    print(f'alpha = {alpha}')
    regression_metrics(y_test, pred)
    print("\n")
Output
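The L1 penalty drives some coefficients exactly to zero, which is what distinguishes Lasso from Ridge; a quick check on the last model fitted in the loop (alpha = 0.01):
print(np.sum(lasso.coef_ == 0), 'of', len(lasso.coef_), 'coefficients were zeroed out')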
alphas = [10, 1, 0.1, 0.01]
for alpha in alphas:
    ridge = Ridge(alpha=alpha, random_state=0)
    ridge.fit(x_train, y_train)
    pred = ridge.predict(x_test)
    print(f'alpha = {alpha}')
    regression_metrics(y_test, pred)
    print("\n")
Output
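Instead of looping over alphas by hand, sklearn's RidgeCV can select alpha by cross-validation; a sketch (not part of the original workflow):
from sklearn.linear_model import RidgeCV
ridge_cv = RidgeCV(alphas=[10, 1, 0.1, 0.01]).fit(x_train, y_train)
print('best alpha :', ridge_cv.alpha_)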
ratios = [0.2, 0.5, 0.8]
for ratio in ratios:
    en = ElasticNet(alpha=1, l1_ratio=ratio, random_state=0) # alpha default = 1.0; l1_ratio=1 is pure Lasso, 0 is pure Ridge
    en.fit(x_train, y_train)
    pred = en.predict(x_test)
    print(f'ratio = {ratio}')
    regression_metrics(y_test, pred)
    print("\n")
Output
from sklearn.linear_model import LogisticRegression
logistic = LogisticRegression()
logistic.fit(x_train, y_train)
logistic_pred = logistic.predict_proba(x_test) # predict_proba returns, for each sample, the probability of belonging to each class as values between 0 and 1
np.min(logistic_pred)
np.max(logistic_pred)
Output
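Keep in mind that LogisticRegression is a classifier, so fitting it on the raw SalePrice treats every distinct price as its own class. A more typical use binarizes the target first; a sketch (the above-median cutoff is an assumption for illustration):
y_bin = (y_train > y_train.median()).astype(int) # 1 if the house sold above the median price
clf = LogisticRegression(max_iter=1000)
clf.fit(x_train, y_bin)
clf.predict_proba(x_test)[:, 1] # probability of the 'above median' class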
from sklearn.ensemble import AdaBoostRegressor
adaboost = AdaBoostRegressor()
adaboost.fit(x_train, y_train)
adaboost_pred = adaboost.predict(x_test)
regression_metrics(y_test, adaboost_pred)
Output
from sklearn.ensemble import GradientBoostingRegressor
GBM = GradientBoostingRegressor()
GBM.fit(x_train, y_train)
GBM_pred = GBM.predict(x_test)
regression_metrics(y_test, GBM_pred)
Output
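Tree ensembles also report per-feature importances, which help explain what drives the predictions; a quick sketch:
pd.Series(GBM.feature_importances_, index=x_train.columns).nlargest(10) # ten most important features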
from xgboost import XGBRegressor
XGB = XGBRegressor(n_estimators=10, random_state=0) # 'tree_method' : 'gpu_hist'
XGB.fit(x_train, y_train, eval_set=[(x_test, y_test)], early_stopping_rounds=4, eval_metric='rmse')
XGB_pred = XGB.predict(x_test)
regression_metrics(y_test, XGB_pred)
Output
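Newer xgboost releases moved early_stopping_rounds and eval_metric from fit() to the constructor; if the call above raises an error on your install, this variant should be equivalent (the exact version cutoff is an assumption):
XGB = XGBRegressor(n_estimators=10, random_state=0, early_stopping_rounds=4, eval_metric='rmse')
XGB.fit(x_train, y_train, eval_set=[(x_test, y_test)])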
from lightgbm import LGBMRegressor
lgbm = LGBMRegressor(n_estimators=10, random_state=0) # 'device_type' : 'gpu'
lgbm.fit(x_train, y_train, eval_set=[(x_test, y_test)], early_stopping_rounds=4, eval_metric='rmse')
lgbm_pred = lgbm.predict(x_test)
regression_metrics(y_test, lgbm_pred)
Output
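Similarly, lightgbm 4.x dropped the fit-time early_stopping_rounds argument in favor of callbacks; a hedged equivalent for newer installs:
import lightgbm
lgbm = LGBMRegressor(n_estimators=10, random_state=0)
lgbm.fit(x_train, y_train, eval_set=[(x_test, y_test)], eval_metric='rmse',
         callbacks=[lightgbm.early_stopping(stopping_rounds=4)]) # early stopping via callback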
from catboost import CatBoostRegressor
cat = CatBoostRegressor(n_estimators=10, random_state=0, loss_function='RMSE') # 'task_type' : 'GPU'
cat.fit(x_train, y_train, eval_set=[(x_test, y_test)], early_stopping_rounds=4)
cat_pred = cat.predict(x_test)
regression_metrics(y_test, cat_pred)
Output
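To wrap up, the test-set RMSE of every fitted model can be collected into one table using the prediction variables defined above; a minimal sketch:
preds = {'Linear': lr_predict, 'AdaBoost': adaboost_pred, 'GBM': GBM_pred,
         'XGBoost': XGB_pred, 'LightGBM': lgbm_pred, 'CatBoost': cat_pred}
pd.Series({name: mean_squared_error(y_test, p)**0.5 for name, p in preds.items()},
          name='RMSE').sort_values() # lower is better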