import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Load the diabetes dataset from the mounted Google Drive folder.
df = pd.read_csv('/content/drive/MyDrive/edwith/프로젝트로 배우는 데이터 사이언스/diabetes.csv')
print(df.shape)
df.head()

# Rows with a positive Insulin reading become the training set; rows with
# Insulin <= 0 become the "test" set whose Insulin value is predicted later.
df['Insulin'].gt(0).value_counts()
train = df.loc[df['Insulin'].gt(0)].copy()
test = df.loc[df['Insulin'].le(0)].copy()
train.shape, test.shape

# Every column except the label is used as a feature.
label_name = 'Insulin'
feature_names = [column for column in test.columns if column != label_name]
feature_names

X_train, y_train = train[feature_names], train[label_name]
X_test, y_test = test[feature_names], test[label_name]
print(
    'Train set의 shape: {}, {}\nTest set의 shape: {}, {}'.format(
        X_train.shape, y_train.shape, X_test.shape, y_test.shape
    )
)
from sklearn.model_selection import cross_val_predict
from sklearn.tree import DecisionTreeRegressor

# Baseline model: a decision tree regressor with default hyper-parameters.
model = DecisionTreeRegressor()
model
# Training only requires calling .fit with the training features and labels.
model.fit(X_train, y_train)

# 5-fold cross-validated predictions for the training rows.
y_pred = cross_val_predict(model, X_train, y_train, cv=5, verbose=2, n_jobs=-1)
y_pred
# Number of predictions that match the true value exactly.
(y_pred == y_train).sum()

# Actual vs. predicted Insulin. x/y are passed by keyword because recent
# seaborn releases no longer accept them positionally.
plt.figure(figsize=(10, 8))
sns.regplot(x=y_train, y=y_pred)
plt.show()
# Compare the density of the true labels with that of the cross-validated
# predictions. kdeplot replaces the deprecated distplot(..., hist=False).
plt.figure(figsize=(10, 8))
sns.kdeplot(x=y_train, label='train')
sns.kdeplot(x=y_pred, label='pred')
plt.legend()
plt.show()

# Absolute error per training row and its mean (MAE).
error = abs(y_train - y_pred)
print('MAE: {}'.format(error.mean()))

# Distribution of the absolute errors. histplot with a KDE overlay replaces
# the deprecated distplot; the figure is shown explicitly so it also renders
# when this code runs as a plain script.
plt.figure(figsize=(10, 5))
sns.histplot(x=error, kde=True, stat='density')
plt.show()

# Root mean squared error, derived from the same absolute errors.
RMSE = np.sqrt((error ** 2).mean())
print('RMSE: {}'.format(RMSE))
from sklearn.tree import plot_tree

# Draw the fitted decision tree on a large canvas, with filled nodes.
plt.figure(figsize=(20, 20))
tree = plot_tree(
    model,
    filled=True,
    fontsize=10,
    feature_names=feature_names,
)

# Horizontal bar chart of the fitted model's feature importances.
plt.figure(figsize=(10, 8))
sns.barplot(y=feature_names, x=model.feature_importances_)
plt.show()
# Predict Insulin for the rows that had no positive reading.
y_test = model.predict(X_test)
y_test[:5]

# Summary statistics of the observed Insulin values per Outcome group.
train.groupby('Outcome')['Insulin'].agg(['mean', 'median'])
train.groupby('Outcome')['Insulin'].describe()

plt.figure(figsize=(10, 8))
sns.barplot(x='Outcome', y='Insulin', data=train)
plt.show()

# Store the predictions in the test frame and repeat the per-Outcome summary
# so the predicted values can be compared with the observed ones.
test['Insulin'] = y_test
test.groupby('Outcome')['Insulin'].describe()

plt.figure(figsize=(10, 8))
sns.barplot(x='Outcome', y='Insulin', data=test)
plt.show()
# Distribution of the observed Insulin values. histplot with a KDE overlay
# replaces the deprecated distplot.
plt.figure(figsize=(10, 8))
sns.histplot(x=train.Insulin, kde=True, stat='density')
plt.show()

# Horizontal box plot. The series is passed as x= because newer seaborn
# treats the first positional argument as `data`, not as the x variable.
plt.figure(figsize=(10, 4))
sns.boxplot(x=train.Insulin)
plt.show()

# Upper outlier fence: third quartile + 1.5 * inter-quartile range.
desc = train.Insulin.describe()
desc
IQR = desc.loc['75%'] - desc.loc['25%']
OUT = desc.loc['75%'] + 1.5 * IQR
# Shape of the subset of training rows lying above the fence.
train[train.Insulin > OUT].shape
# Keep only training rows whose Insulin is below 600, then rebuild X/y.
train = train[train.Insulin < 600]
X_train = train[feature_names]
y_train = train[label_name]

# Re-evaluate the model with 5-fold cross-validated predictions.
y_pred = cross_val_predict(model, X_train, y_train, cv=5, verbose=2, n_jobs=-1)
print('맞은 개수는:', sum(y_pred == y_train))

# x/y are passed by keyword because recent seaborn releases no longer accept
# them positionally.
plt.figure(figsize=(10, 8))
sns.regplot(x=y_train, y=y_pred)
plt.show()

error = abs(y_train - y_pred)
print('MAE:', error.mean())

# histplot with a KDE overlay replaces the deprecated distplot.
plt.figure(figsize=(10, 5))
sns.histplot(x=error, kde=True, stat='density')
plt.show()

RMSE = np.sqrt((error ** 2).mean())
print('600을 넘는 값들을 이상치로 제거한 후의 RMSE:{}'.format(RMSE))
# Drop every training row at or above the IQR-based upper fence (OUT),
# then rebuild X/y from the filtered frame.
train = train[train.Insulin < OUT]
X_train = train[feature_names]
y_train = train[label_name]

# Re-evaluate the model with 5-fold cross-validated predictions.
y_pred = cross_val_predict(model, X_train, y_train, cv=5, verbose=2, n_jobs=-1)
print('맞은 개수는:', sum(y_pred == y_train))

# x/y are passed by keyword because recent seaborn releases no longer accept
# them positionally.
plt.figure(figsize=(10, 8))
sns.regplot(x=y_train, y=y_pred)
plt.show()

# Density of true labels vs. predictions. kdeplot replaces the deprecated
# distplot(..., hist=False).
plt.figure(figsize=(10, 5))
sns.kdeplot(x=y_train, label='Train')
sns.kdeplot(x=y_pred, label='Predict')
plt.legend()
plt.show()

# MAE and RMSE computed from the absolute errors.
error = abs(y_pred - y_train)
mae = error.mean()
rmse = np.sqrt((error ** 2).mean())
print('Outlier를 모두 제거한 후의 MAE: {}'.format(mae))
print('Outlier를 모두 제거한 후의 RMSE: {}'.format(rmse))
# Per-Outcome summary of the observed Insulin values in the current train set.
train.groupby('Outcome')['Insulin'].describe()

# Refit the model on the current training data and predict the test rows.
model.fit(X_train, y_train)
y_test = model.predict(X_test)

# Store the new predictions and summarise them per Outcome group.
test['Insulin'] = y_test
test.groupby('Outcome')['Insulin'].describe()

# Feature importances of the refitted model.
plt.figure(figsize=(10, 8))
sns.barplot(y=feature_names, x=model.feature_importances_)
plt.show()
from sklearn.model_selection import RandomizedSearchCV

# Candidate hyper-parameter values are drawn from a dedicated, seeded
# generator so that the search space (and therefore the search result) is the
# same on every run; random_state on the search alone only fixes which of the
# candidates get sampled, not the candidates themselves.
search_rng = np.random.RandomState(42)
max_depth = search_rng.randint(5, 30, 10)
max_features = search_rng.uniform(.3, 1., 10)
param_distributions = {'max_depth': max_depth, 'max_features': max_features}

# Randomized search over the decision tree: 10 sampled settings, 5-fold CV,
# scored with the estimator's default scorer (scoring=None).
regressor = RandomizedSearchCV(
    model,
    param_distributions,
    random_state=42,
    n_iter=10,
    scoring=None,
    cv=5,
    verbose=2,
)
regressor.fit(X_train, y_train)
regressor.best_estimator_
regressor.best_score_
regressor.cv_results_
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Tree-based regressors to compare, each with a fixed random_state.
estimators = [
    DecisionTreeRegressor(random_state=42),
    RandomForestRegressor(random_state=42),
    GradientBoostingRegressor(random_state=42),
]
estimators

# Quick look at the class names of the candidate models.
results = [estimator.__class__.__name__ for estimator in estimators]
pd.DataFrame(results)

# Seeded generator so the candidate hyper-parameter values are reproducible.
search_rng = np.random.RandomState(42)

# Run a randomized hyper-parameter search for every estimator and collect one
# result row per model.
results = []
for estimator in estimators:
    model_name = estimator.__class__.__name__

    max_depth = search_rng.randint(5, 30, 10)
    max_features = search_rng.uniform(0.3, 1.0, 10)
    param_distributions = {'max_depth': max_depth, 'max_features': max_features}

    # Only the ensemble models take an n_estimators parameter.
    if model_name != 'DecisionTreeRegressor':
        n_estimators = search_rng.randint(100, 500, 10)
        param_distributions['n_estimators'] = n_estimators

    regressor = RandomizedSearchCV(
        estimator,
        param_distributions,
        n_iter=10,
        cv=5,
        verbose=2,
        random_state=42,
    )
    regressor.fit(X_train, y_train)

    results.append([
        model_name,                 # model name
        regressor.best_params_,     # best hyper-parameters found
        regressor.best_estimator_,  # estimator refitted with the best parameters
        regressor.best_score_,      # mean cross-validated score of the best setting
        regressor.cv_results_,      # full record of the search
    ])

# One row per model, in the same order as `estimators`.
df_cv = pd.DataFrame(results)
df_cv.columns = ['model', 'params', 'estimator', 'score', 'cv_result']
df_cv
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_predict

# Pick the estimator with the highest cross-validated score instead of relying
# on a hard-coded row position, so the choice follows the search results.
best_estimator = df_cv.loc[df_cv['score'].idxmax(), 'estimator']
best_estimator
best_estimator.fit(X_train, y_train)

# 5-fold cross-validated predictions of the selected estimator.
y_predict = cross_val_predict(best_estimator, X_train, y_train, cv=5, verbose=2, n_jobs=-1)
y_predict[:5]

# x/y are passed by keyword because recent seaborn releases no longer accept
# them positionally.
plt.figure(figsize=(10, 8))
sns.regplot(x=y_train, y=y_predict)
plt.show()

r2_score(y_train, y_predict)

# Density of true labels vs. predictions. kdeplot replaces the deprecated
# distplot(..., hist=False).
plt.figure(figsize=(10, 8))
sns.kdeplot(x=y_train, label='train')
sns.kdeplot(x=y_predict, label='pred')
plt.legend()
plt.show()

# MAE and RMSE computed from the absolute errors.
error = abs(y_train - y_predict)
mae = error.mean()
rmse = np.sqrt((error ** 2).mean())
print('MAE: {}, RMSE: {}'.format(mae, rmse))

# Predict Insulin for the test rows with the selected estimator and summarise
# the predictions per Outcome group.
y_test = best_estimator.predict(X_test)
test['Insulin'] = y_test
test.groupby('Outcome')['Insulin'].describe()

plt.figure(figsize=(10, 8))
sns.barplot(x=best_estimator.feature_importances_, y=feature_names)
plt.show()
# The predictions are floats. If the Insulin column was parsed as integers,
# assigning floats into it through .loc is a deprecated incompatible-dtype
# assignment in recent pandas, so the column is converted up front.
df['Insulin'] = df['Insulin'].astype(float)

# Fill exactly the rows that were split off into `test` (Insulin <= 0);
# the assignment aligns on the index shared by df and test.
df.loc[df['Insulin'] <= 0, 'Insulin'] = test['Insulin']

# Save df to the current working directory as diabets_fill_insulin.csv,
# then read the file back to check what was written.
df.to_csv('diabets_fill_insulin.csv', index=False)
pd.read_csv('diabets_fill_insulin.csv')