
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Load the diabetes dataset from the mounted Drive folder and peek at it.
df = pd.read_csv(
    '/content/drive/MyDrive/edwith/프로젝트로 배우는 데이터 사이언스/diabetes.csv'
)
print(df.shape)
df.head()

# How many rows have a positive Insulin value versus the rest.
df['Insulin'].gt(0).value_counts()

# Rows with a positive Insulin form the training set; the remaining rows
# (Insulin <= 0) are the ones whose Insulin gets predicted later on.
train = df.loc[df['Insulin'] > 0].copy()
test = df.loc[df['Insulin'] <= 0].copy()
train.shape, test.shape

# The target column; every other column is used as a feature.
label_name = 'Insulin'
feature_names = [column for column in test.columns if column != label_name]
feature_names

# Feature matrix / target vector for both splits.
X_train, y_train = train[feature_names], train[label_name]
X_test, y_test = test[feature_names], test[label_name]
shapes = [part.shape for part in (X_train, y_train, X_test, y_test)]
print('Train set의 shape: {}, {}\nTest set의 shape: {}, {}'.format(*shapes))

from sklearn.tree import DecisionTreeRegressor

# Baseline regressor for predicting Insulin. random_state is fixed (same seed
# as the estimators compared later in this file) so repeated runs build the
# same tree and the error metrics printed below are comparable between runs.
model = DecisionTreeRegressor(random_state=42)
model

# Fit on the rows that have a known Insulin value.
model.fit(X_train, y_train)

# Just call .fit() to train the model.
from sklearn.model_selection import cross_val_predict

# 5-fold cross-validated predictions for the training rows: each row is
# predicted by a clone of `model` fitted on the other folds.
y_pred = cross_val_predict(model, X_train, y_train, cv=5, verbose=2, n_jobs=-1)
y_pred

# Number of predictions that equal the actual value exactly.
(y_pred == y_train).sum()

# Actual vs. predicted values with a fitted regression line.
# x and y are passed by keyword: seaborn >= 0.12 only accepts `data`
# positionally, so the positional form raises TypeError there.
plt.figure(figsize=(10, 8))
sns.regplot(x=y_train, y=y_pred)
plt.show()

# .
# Overlay the density curves of the actual and the cross-validated values.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is
# kept here so the plots stay exactly as they were.
plt.figure(figsize=(10, 8))
for values, legend_label in ((y_train, 'train'), (y_pred, 'pred')):
    sns.distplot(values, hist=False, label=legend_label)
plt.legend()
plt.show()

# Per-row absolute error and its mean (MAE).
error = (y_train - y_pred).abs()
print('MAE: {}'.format(error.mean()))

# Distribution of the absolute errors.
sns.distplot(error)

# Root of the mean squared error (RMSE).
RMSE = np.sqrt(np.mean(error ** 2))
print('RMSE: {}'.format(RMSE))

from sklearn.tree import plot_tree

# Draw the fitted tree, labelling the split nodes with the feature names.
plt.figure(figsize=(20, 20))
tree_options = dict(feature_names=feature_names, filled=True, fontsize=10)
tree = plot_tree(model, **tree_options)

# Horizontal bars: one feature per row, bar length = its importance.
importances = model.feature_importances_
plt.figure(figsize=(10, 8))
sns.barplot(x=importances, y=feature_names)
plt.show()

# Predict Insulin for the rows that had no positive value and peek at the
# first five predictions.
y_test = model.predict(X_test)
y_test[:5]

# Insulin statistics per Outcome group in the training rows.
train_insulin_by_outcome = train.groupby('Outcome')['Insulin']
train_insulin_by_outcome.agg(['mean', 'median'])

train_insulin_by_outcome.describe()

plt.figure(figsize=(10, 8))
sns.barplot(x='Outcome', y='Insulin', data=train)
plt.show()

# Write the predictions into the test rows and look at the same statistics.
test['Insulin'] = y_test
test_insulin_by_outcome = test.groupby('Outcome')['Insulin']
test_insulin_by_outcome.describe()

plt.figure(figsize=(10, 8))
sns.barplot(x='Outcome', y='Insulin', data=test)
plt.show()

# Distribution of the known Insulin values.
plt.figure(figsize=(10, 8))
sns.distplot(train.Insulin)
plt.show()

# Horizontal box plot of the same values. The Series is passed as `x=`:
# seaborn >= 0.12 treats a lone positional argument as `data`, which would
# change the orientation of the plot.
plt.figure(figsize=(10, 4))
sns.boxplot(x=train.Insulin)
plt.show()

# Summary statistics; the quartiles are read from here below.
desc = train.Insulin.describe()
desc

# Upper outlier fence: third quartile + 1.5 * inter-quartile range.
IQR = desc.loc['75%'] - desc.loc['25%']
OUT = desc.loc['75%'] + 1.5 * IQR

# Shape of the subset of training rows lying above the fence.
train[train.Insulin > OUT].shape

# Keep only training rows with Insulin below 600 and rebuild the inputs.
train = train[train.Insulin < 600]
X_train = train[feature_names]
y_train = train[label_name]

# Re-run the 5-fold cross-validated prediction on the reduced training set.
y_pred = cross_val_predict(model, X_train, y_train, cv=5, verbose=2, n_jobs=-1)
print('맞은 개수는:', (y_pred == y_train).sum())

# Actual vs. predicted. x/y are passed by keyword because seaborn >= 0.12
# rejects them as positional arguments.
plt.figure(figsize=(10, 8))
sns.regplot(x=y_train, y=y_pred)
plt.show()

# Per-row absolute error and its mean (MAE).
error = abs(y_train - y_pred)
print('MAE:', error.mean())

plt.figure(figsize=(10, 5))
sns.distplot(error)
plt.show()

# Root of the mean squared error after the cut at 600.
RMSE = np.sqrt((error ** 2).mean())
print('600을 넘는 값들을 이상치로 제거한 후의 RMSE:{}'.format(RMSE))

# Keep only training rows below the upper fence OUT and rebuild the inputs.
train = train[train.Insulin < OUT]
X_train = train[feature_names]
y_train = train[label_name]

# Re-run the 5-fold cross-validated prediction on the reduced training set.
y_pred = cross_val_predict(model, X_train, y_train, cv=5, verbose=2, n_jobs=-1)
print('맞은 개수는:', (y_pred == y_train).sum())

# Actual vs. predicted. x/y are passed by keyword because seaborn >= 0.12
# rejects them as positional arguments.
plt.figure(figsize=(10, 8))
sns.regplot(x=y_train, y=y_pred)
plt.show()

# Density curves of the actual and the predicted values.
plt.figure(figsize=(10, 5))
sns.distplot(y_train, label='Train', hist=False)
sns.distplot(y_pred, label='Predict', hist=False)
plt.legend()
plt.show()

# Error metrics after removing the rows above the fence.
error = abs(y_pred - y_train)
mae = error.mean()
rmse = np.sqrt((error ** 2).mean())
print('Outlier를 모두 제거한 후의 MAE: {}'.format(mae))
print('Outlier를 모두 제거한 후의 RMSE: {}'.format(rmse))

# Insulin statistics per Outcome group in the current training rows.
train.groupby('Outcome')['Insulin'].describe()

# Refit on the current training rows, then predict Insulin for the test rows.
model.fit(X_train, y_train)
y_test = model.predict(X_test)
test['Insulin'] = y_test
test.groupby('Outcome')['Insulin'].describe()

# Feature importances of the refitted model.
importances = model.feature_importances_
plt.figure(figsize=(10, 8))
sns.barplot(x=importances, y=feature_names)
plt.show()

from sklearn.model_selection import RandomizedSearchCV

# Candidate hyper-parameter values. They are drawn from a seeded generator so
# the candidate pool is the same on every run; with the unseeded global RNG
# the search was not reproducible even though random_state is fixed below.
rng = np.random.RandomState(42)
max_depth = rng.randint(5, 30, 10)
max_features = rng.uniform(.3, 1., 10)
param_distributions = {'max_depth': max_depth, 'max_features': max_features}

# Randomized search: 10 sampled settings, each scored with 5-fold CV.
# scoring=None means the estimator's own score method is used.
regressor = RandomizedSearchCV(model, param_distributions, random_state=42,
                               n_iter=10, scoring=None, cv=5, verbose=2)
regressor.fit(X_train, y_train)

regressor.best_estimator_

regressor.best_score_

regressor.cv_results_

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Candidate model families to compare, all with the same fixed seed.
estimators = [
    DecisionTreeRegressor(random_state=42),
    RandomForestRegressor(random_state=42),
    GradientBoostingRegressor(random_state=42),
]
estimators

# Class name of each candidate, shown as a one-column table.
results = [type(estimator).__name__ for estimator in estimators]
pd.DataFrame(results)

# Run a randomized hyper-parameter search for every candidate estimator and
# collect one summary row per estimator.
# Candidate values come from a seeded generator so the whole comparison is
# reproducible from run to run.
rng = np.random.RandomState(42)
results = []
for estimator in estimators:
    model_name = type(estimator).__name__
    max_depth = rng.randint(5, 30, 10)
    max_features = rng.uniform(0.3, 1.0, 10)
    param_distributions = {'max_depth': max_depth, 'max_features': max_features}
    # Only the ensemble models get an n_estimators candidate list.
    if model_name != 'DecisionTreeRegressor':
        n_estimators = rng.randint(100, 500, 10)
        param_distributions['n_estimators'] = n_estimators
    regressor = RandomizedSearchCV(estimator, param_distributions, n_iter=10,
                                   cv=5, verbose=2, random_state=42)
    regressor.fit(X_train, y_train)
    result = [
        model_name,                  # model name
        regressor.best_params_,      # best parameters found
        regressor.best_estimator_,   # best fitted model
        regressor.best_score_,       # mean CV score of the best model
        regressor.cv_results_,       # details of the whole search
    ]
    results.append(result)

# One row per estimator.
df_cv = pd.DataFrame(results, columns=['model', 'params', 'estimator', 'score', 'cv_result'])
df_cv

# Pick the estimator with the highest cross-validated score instead of a
# hard-coded row, so the choice follows the actual search results.
best_estimator = df_cv.loc[df_cv['score'].idxmax(), 'estimator']
best_estimator

best_estimator.fit(X_train, y_train)

from sklearn.model_selection import cross_val_predict

# 5-fold cross-validated predictions of the chosen model on the training rows.
y_predict = cross_val_predict(best_estimator, X_train, y_train, cv=5, verbose=2, n_jobs=-1)
y_predict[:5]

# Actual vs. predicted for the chosen model. x/y are passed by keyword
# because seaborn >= 0.12 rejects them as positional arguments.
plt.figure(figsize=(10, 8))
sns.regplot(x=y_train, y=y_predict)
plt.show()

from sklearn.metrics import r2_score

# Coefficient of determination of the cross-validated predictions.
r2_score(y_train, y_predict)

# Density curves of the actual and the predicted values.
plt.figure(figsize=(10, 8))
sns.distplot(y_train, hist=False, label='train')
sns.distplot(y_predict, hist=False, label='pred')
plt.legend()
plt.show()

# Mean absolute error and root mean squared error.
error = abs(y_train - y_predict)
mae = error.mean()
rmse = np.sqrt((error ** 2).mean())
print('MAE: {}, RMSE: {}'.format(mae, rmse))

# Predict Insulin for the test rows with the chosen model.
y_test = best_estimator.predict(X_test)
test['Insulin'] = y_test
test.groupby('Outcome')['Insulin'].describe()

# Feature importances of the chosen model.
plt.figure(figsize=(10, 8))
sns.barplot(x=best_estimator.feature_importances_, y=feature_names)
plt.show()

# Write the predictions back into the full frame.
# The column is cast to float first: the predictions are floats, and writing
# them into part of an integer column relies on a silent upcast that
# pandas >= 2.1 deprecates (presumably Insulin is read as int -- the cast is
# a no-op otherwise).
df['Insulin'] = df['Insulin'].astype(float)
# Target exactly the rows `test` was built from (Insulin <= 0), so the filled
# rows always match the predicted rows.
df.loc[test.index, 'Insulin'] = test['Insulin']

# Save df to the current working directory as diabets_fill_insulin.csv and
# read it back to check the result.
df.to_csv('diabets_fill_insulin.csv', index=False)
pd.read_csv('diabets_fill_insulin.csv')
