import seaborn as sns
# Draw one scatter plot per target column, each on its own figure
# (figsize=(4, 4), dpi=100), always with "ICV" on the x axis.
for y_column in ("Age", "lbankssts"):
    plt.figure(figsize=(4, 4), dpi=100)
    sns.scatterplot(data=df, x="ICV", y=y_column)
# Split the table by the value of the "Sex" column: rows coded 1 go to `man`,
# rows coded 2 go to `woman`. "Age" is the regression target; every column
# except the ones listed in `non_feature_cols` is kept as a predictor.
non_feature_cols = ['ID', 'Age', 'Sex', 'ICV']
sex_codes = df['Sex'].values

man = df[sex_codes == 1]
woman = df[sex_codes == 2]

y_man = man['Age']
y_woman = woman['Age']

X_man = man.drop(columns=non_feature_cols)
X_woman = woman.drop(columns=non_feature_cols)
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score

# Hold out 10% of each group BEFORE scaling. Fitting the scaler on all rows
# (as was done previously) lets the validation rows influence the mean and
# variance used for standardisation, which leaks information into the
# "valid-set score" reported later. The split depends only on the number of
# rows and random_state, so the same rows are held out as before.
X_man_train, X_man_valid, y_man_train, y_man_valid = train_test_split(
    X_man, y_man, test_size=0.1, shuffle=True, random_state=123)
X_woman_train, X_woman_valid, y_woman_train, y_woman_valid = train_test_split(
    X_woman, y_woman, test_size=0.1, shuffle=True, random_state=123)

# One scaler per group, fitted on that group's training rows only.
std_man = StandardScaler().fit(X_man_train)
x_man_train = std_man.transform(X_man_train)
x_man_valid = std_man.transform(X_man_valid)

std_woman = StandardScaler().fit(X_woman_train)
x_woman_train = std_woman.transform(X_woman_train)
x_woman_valid = std_woman.transform(X_woman_valid)

# Names kept for backward compatibility with code that may still use them:
# the fully transformed groups, and `std`, which previously ended up fitted
# on the female group.
x_man = std_man.transform(X_man)
x_woman = std_woman.transform(X_woman)
std = std_woman
1. Age, ID, Sex를 제외한 모든 feature를 사용하는 방법
1.1 데이터를 변형하지 않고 사용하는 방법
1.2 ICV 데이터 변형
- ICV 데이터 제거
- 전체 데이터를 ICV로 나누는 방법
- 대뇌 피질 두께만을 ICV로 나누는 방법
- 반지름을 구하여 데이터를 나누는 방법
from sklearn import linear_model
from sklearn.model_selection import GridSearchCV, RepeatedKFold
# Grid-search the Lasso penalty `alpha` on the male training split, scoring
# each candidate with 5-fold cross-validation repeated 5 times.
param_grid = {'alpha': np.arange(0.01, 1, 0.01)}
reg = linear_model.Lasso(max_iter=10000, random_state=1)
repeated_cv = RepeatedKFold(n_splits=5, n_repeats=5, random_state=123)
grid = GridSearchCV(
    estimator=reg,
    param_grid=param_grid,
    cv=repeated_cv,
    return_train_score=True,  # train scores are needed for the curve plotted later
)
grid.fit(x_man_train, y_man_train)

# Report the selected alpha and the score on the held-out male validation split.
print(f"best parameters: {grid.best_params_}")
valid_score = grid.score(x_man_valid, y_man_valid)
print(f"valid-set score: {valid_score:.3f}")
best parameters: {'alpha': 0.33}
valid-set score: 0.775
# --- CV curve: mean train/test score of the male Lasso search versus alpha ---
results = pd.DataFrame(grid.cv_results_)
cv_ax = results.plot('param_alpha', 'mean_train_score')
results.plot('param_alpha', 'mean_test_score', ax=cv_ax)

# The alpha column is cast to float64 before being used as x values for shading.
alpha_values = results.param_alpha.astype(np.float64)
for score_kind in ('train', 'test'):
    score_mean = results[f'mean_{score_kind}_score']
    score_sd = results[f'std_{score_kind}_score']
    # Shade a band of one standard deviation around the mean score.
    cv_ax.fill_between(alpha_values, score_mean + score_sd,
                       score_mean - score_sd, alpha=0.2)
cv_ax.legend()

from sklearn.model_selection import cross_val_predict

# --- Out-of-fold predictions with the selected alpha ---
# Take alpha from the search itself instead of a constant copied from its
# printed output, so this stays correct when the data or the grid changes.
reg = linear_model.Lasso(max_iter=10000, random_state=1,
                         alpha=grid.best_params_['alpha'])
y_man_pred = cross_val_predict(reg, x_man_train, y_man_train, cv=5)

# Draw on a fresh figure; otherwise, when run top to bottom as a script, the
# scatter would be drawn onto the CV-curve axes created above.
plt.figure()
plt.plot([15, 85], [15, 85], color='k')  # identity line: prediction == target
plt.scatter(y_man_pred, y_man_train, alpha=.5, s=4)
plt.xlabel('Predicted age (cross-validated)')
plt.ylabel('Actual age')
# Grid-search the Lasso penalty `alpha` on the female training split, scoring
# each candidate with 5-fold cross-validation repeated 5 times.
param_grid = {'alpha': np.arange(0.01, 1, 0.01)}
reg = linear_model.Lasso(max_iter=10000, random_state=1)
repeated_cv = RepeatedKFold(n_splits=5, n_repeats=5, random_state=123)
grid = GridSearchCV(
    estimator=reg,
    param_grid=param_grid,
    cv=repeated_cv,
    return_train_score=True,  # train scores are needed for the curve plotted later
)
grid.fit(x_woman_train, y_woman_train)

# Report the selected alpha and the score on the held-out female validation split.
print(f"best parameters: {grid.best_params_}")
valid_score = grid.score(x_woman_valid, y_woman_valid)
print(f"valid-set score: {valid_score:.3f}")
best parameters: {'alpha': 0.26}
valid-set score: 0.573
# --- CV curve: mean train/test score of the female Lasso search versus alpha ---
results = pd.DataFrame(grid.cv_results_)
cv_ax = results.plot('param_alpha', 'mean_train_score')
results.plot('param_alpha', 'mean_test_score', ax=cv_ax)

# The alpha column is cast to float64 before being used as x values for shading.
alpha_values = results.param_alpha.astype(np.float64)
for score_kind in ('train', 'test'):
    score_mean = results[f'mean_{score_kind}_score']
    score_sd = results[f'std_{score_kind}_score']
    # Shade a band of one standard deviation around the mean score.
    cv_ax.fill_between(alpha_values, score_mean + score_sd,
                       score_mean - score_sd, alpha=0.2)
cv_ax.legend()

# --- Out-of-fold predictions with the selected alpha ---
# Take alpha from the search itself instead of a constant copied from its
# printed output, so this stays correct when the data or the grid changes.
reg = linear_model.Lasso(max_iter=10000, random_state=1,
                         alpha=grid.best_params_['alpha'])
y_woman_pred = cross_val_predict(reg, x_woman_train, y_woman_train, cv=5)

# Draw on a fresh figure; otherwise, when run top to bottom as a script, the
# scatter would be drawn onto the CV-curve axes created above.
plt.figure()
plt.plot([15, 85], [15, 85], color='k')  # identity line: prediction == target
plt.scatter(y_woman_pred, y_woman_train, alpha=.5, s=4)
plt.xlabel('Predicted age (cross-validated)')
plt.ylabel('Actual age')
# Grid-search the Ridge penalty `alpha` on the male training split, scoring
# each candidate with 5-fold cross-validation repeated 5 times.
param_grid = {'alpha': np.arange(50, 60, 0.1)}
reg = linear_model.Ridge(max_iter=10000)
repeated_cv = RepeatedKFold(n_splits=5, n_repeats=5, random_state=123)
grid2 = GridSearchCV(
    estimator=reg,
    param_grid=param_grid,
    cv=repeated_cv,
    return_train_score=True,  # train scores are needed for the curve plotted later
)
grid2.fit(x_man_train, y_man_train)

# Report the selected alpha and the score on the held-out male validation split.
print(f"best parameters: {grid2.best_params_}")
valid_score = grid2.score(x_man_valid, y_man_valid)
print(f"valid-set score: {valid_score:.3f}")
best parameters: {'alpha': 56.30000000000009}
valid-set score: 0.772
# --- CV curve: mean train/test score of the male Ridge search versus alpha ---
results = pd.DataFrame(grid2.cv_results_)
cv_ax = results.plot('param_alpha', 'mean_train_score')
results.plot('param_alpha', 'mean_test_score', ax=cv_ax)

# The alpha column is cast to float64 before being used as x values for shading.
alpha_values = results.param_alpha.astype(np.float64)
for score_kind in ('train', 'test'):
    score_mean = results[f'mean_{score_kind}_score']
    score_sd = results[f'std_{score_kind}_score']
    # Shade a band of one standard deviation around the mean score.
    cv_ax.fill_between(alpha_values, score_mean + score_sd,
                       score_mean - score_sd, alpha=0.2)
cv_ax.legend()

# --- Out-of-fold predictions with the selected alpha ---
# Take alpha from the search itself instead of a constant copied from its
# printed output, so this stays correct when the data or the grid changes.
reg = linear_model.Ridge(alpha=grid2.best_params_['alpha'])
y_man_pred = cross_val_predict(reg, x_man_train, y_man_train, cv=5)

# Draw on a fresh figure; otherwise, when run top to bottom as a script, the
# scatter would be drawn onto the CV-curve axes created above.
plt.figure()
plt.plot([15, 85], [15, 85], color='k')  # identity line: prediction == target
plt.scatter(y_man_pred, y_man_train, alpha=.5, s=4)
plt.xlabel('Predicted age (cross-validated)')
plt.ylabel('Actual age')
# Grid-search the Ridge penalty `alpha` on the female training split, scoring
# each candidate with 5-fold cross-validation repeated 5 times.
param_grid = {'alpha': np.arange(40, 50, 0.01)}
reg = linear_model.Ridge()
repeated_cv = RepeatedKFold(n_splits=5, n_repeats=5, random_state=123)
grid = GridSearchCV(
    estimator=reg,
    param_grid=param_grid,
    cv=repeated_cv,
    return_train_score=True,  # train scores are needed for the curve plotted later
)
grid.fit(x_woman_train, y_woman_train)

# Report the selected alpha and the score on the held-out female validation split.
print(f"best parameters: {grid.best_params_}")
valid_score = grid.score(x_woman_valid, y_woman_valid)
print(f"valid-set score: {valid_score:.3f}")
best parameters: {'alpha': 46.36999999999873}
valid-set score: 0.552
# --- CV curve: mean train/test score of the female Ridge search versus alpha ---
results = pd.DataFrame(grid.cv_results_)
cv_ax = results.plot('param_alpha', 'mean_train_score')
results.plot('param_alpha', 'mean_test_score', ax=cv_ax)

# The alpha column is cast to float64 before being used as x values for shading.
alpha_values = results.param_alpha.astype(np.float64)
for score_kind in ('train', 'test'):
    score_mean = results[f'mean_{score_kind}_score']
    score_sd = results[f'std_{score_kind}_score']
    # Shade a band of one standard deviation around the mean score.
    cv_ax.fill_between(alpha_values, score_mean + score_sd,
                       score_mean - score_sd, alpha=0.2)
cv_ax.legend()

# --- Out-of-fold predictions with the selected alpha ---
# Take alpha from the search itself. The previously hand-copied constant
# (46.3) did not match the alpha the search actually selected (about 46.37).
reg = linear_model.Ridge(alpha=grid.best_params_['alpha'])
y_woman_pred = cross_val_predict(reg, x_woman_train, y_woman_train, cv=5)

# Draw on a fresh figure; otherwise, when run top to bottom as a script, the
# scatter would be drawn onto the CV-curve axes created above.
plt.figure()
plt.plot([15, 85], [15, 85], color='k')  # identity line: prediction == target
plt.scatter(y_woman_pred, y_woman_train, alpha=.5, s=4)
plt.xlabel('Predicted age (cross-validated)')
plt.ylabel('Actual age')
from sklearn import svm
# Grid-search the SVR parameter `C` on the male training split, scoring
# each candidate with 5-fold cross-validation repeated 5 times.
param_grid = {'C': np.arange(10, 20, 0.1)}
reg = svm.SVR()
repeated_cv = RepeatedKFold(n_splits=5, n_repeats=5, random_state=123)
grid3 = GridSearchCV(
    estimator=reg,
    param_grid=param_grid,
    cv=repeated_cv,
    return_train_score=True,  # train scores are needed for the curve plotted later
)
grid3.fit(x_man_train, y_man_train)

# Report the selected C and the score on the held-out male validation split.
print(f"best parameters: {grid3.best_params_}")
valid_score = grid3.score(x_man_valid, y_man_valid)
print(f"valid-set score: {valid_score:.3f}")
best parameters: {'C': 15.999999999999979}
valid-set score: 0.587
# --- CV curve: mean train/test score of the male SVR search versus C ---
results = pd.DataFrame(grid3.cv_results_)
cv_ax = results.plot('param_C', 'mean_train_score')
results.plot('param_C', 'mean_test_score', ax=cv_ax)

# The C column is cast to float64 before being used as x values for shading.
c_values = results.param_C.astype(np.float64)
for score_kind in ('train', 'test'):
    score_mean = results[f'mean_{score_kind}_score']
    score_sd = results[f'std_{score_kind}_score']
    # Shade a band of one standard deviation around the mean score.
    cv_ax.fill_between(c_values, score_mean + score_sd,
                       score_mean - score_sd, alpha=0.2)
cv_ax.legend()

# --- Out-of-fold predictions with the selected C ---
# Take C from the search itself instead of a constant copied from its
# printed output, so this stays correct when the data or the grid changes.
reg = svm.SVR(C=grid3.best_params_['C'])
y_man_pred = cross_val_predict(reg, x_man_train, y_man_train, cv=5)

# Draw on a fresh figure; otherwise, when run top to bottom as a script, the
# scatter would be drawn onto the CV-curve axes created above.
plt.figure()
plt.plot([15, 85], [15, 85], color='k')  # identity line: prediction == target
plt.scatter(y_man_pred, y_man_train, alpha=.5, s=4)
plt.xlabel('Predicted age (cross-validated)')
plt.ylabel('Actual age')
# Grid-search the SVR parameter `C` on the female training split, scoring
# each candidate with 5-fold cross-validation repeated 5 times.
param_grid3 = {'C': np.arange(20, 25, 0.1)}
reg = svm.SVR()
repeated_cv = RepeatedKFold(n_splits=5, n_repeats=5, random_state=123)
grid3 = GridSearchCV(
    estimator=reg,
    param_grid=param_grid3,
    cv=repeated_cv,
    return_train_score=True,  # train scores are needed for the curve plotted later
)
grid3.fit(x_woman_train, y_woman_train)

# Report the selected C and the score on the held-out female validation split.
print(f"best parameters: {grid3.best_params_}")
valid_score = grid3.score(x_woman_valid, y_woman_valid)
print(f"valid-set score: {valid_score:.3f}")
best parameters: {'C': 21.700000000000024}
valid-set score: 0.620
# --- CV curve: mean train/test score of the female SVR search versus C ---
results = pd.DataFrame(grid3.cv_results_)
cv_ax = results.plot('param_C', 'mean_train_score')
results.plot('param_C', 'mean_test_score', ax=cv_ax)

# The C column is cast to float64 before being used as x values for shading.
c_values = results.param_C.astype(np.float64)
for score_kind in ('train', 'test'):
    score_mean = results[f'mean_{score_kind}_score']
    score_sd = results[f'std_{score_kind}_score']
    # Shade a band of one standard deviation around the mean score.
    cv_ax.fill_between(c_values, score_mean + score_sd,
                       score_mean - score_sd, alpha=0.2)
cv_ax.legend()

# --- Out-of-fold predictions with the selected C ---
# Take C from the search itself instead of a constant copied from its
# printed output, so this stays correct when the data or the grid changes.
reg = svm.SVR(C=grid3.best_params_['C'])
y_woman_pred = cross_val_predict(reg, x_woman_train, y_woman_train, cv=5)

# Draw on a fresh figure; otherwise, when run top to bottom as a script, the
# scatter would be drawn onto the CV-curve axes created above.
plt.figure()
plt.plot([15, 85], [15, 85], color='k')  # identity line: prediction == target
plt.scatter(y_woman_pred, y_woman_train, alpha=.5, s=4)
plt.xlabel('Predicted age (cross-validated)')
plt.ylabel('Actual age')