예전에 공부했던 내용을 복습하는 차원에서 기록을 남긴다. 잊기 쉬운 내용을 나중에 다시 확인하기 위한 것이므로 최대한 자세하게 서술해 보자.
import pandas as pd
from datetime import datetime
# Path of the training CSV to read.
iowa_file_path = '../input/home-data-for-ml-course/train.csv'
# Load the file into a DataFrame.
home_data = pd.read_csv(iowa_file_path)
# Compute the summary statistics once and print them explicitly: a bare
# `home_data.describe()` expression only shows output at the end of a
# notebook cell and is silently discarded when run as a script.
hdd = home_data.describe()
print(hdd)
# Average lot size, rounded to the nearest integer.
avg_lot_size = round(hdd.loc['mean', 'LotArea'])
# Age of the newest home: current year minus the latest build year.
newest_home_age = datetime.now().year - round(hdd.loc['max', 'YearBuilt'])
from sklearn.tree import DecisionTreeRegressor
# Prediction target: the SalePrice column.
y = home_data['SalePrice']
print(y)
# Columns used as model inputs.
feature_names = [
    'LotArea',
    'YearBuilt',
    '1stFlrSF',
    '2ndFlrSF',
    'FullBath',
    'BedroomAbvGr',
    'TotRmsAbvGrd',
]
# Feature matrix restricted to the columns listed above.
X = home_data[feature_names]
# A fixed random_state keeps the fitted tree reproducible between runs.
iowa_model = DecisionTreeRegressor(random_state=1)
# Train on the full data set.
iowa_model.fit(X, y)
# Predictions for the same rows the model was trained on.
predictions = iowa_model.predict(X)
cf) DecisionTreeRegressor : https://velog.io/@eueueuu/Decision-Tree
# Import the train_test_split function and uncomment
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
# Split features and target into training and validation parts; the fixed
# random_state makes the split reproducible.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
# Specify the model.
iowa_model = DecisionTreeRegressor(random_state=1)
# Fit on the training part only, so the validation rows stay unseen.
iowa_model.fit(train_X, train_y)
# Predict for every validation observation.
val_predictions = iowa_model.predict(val_X)
# Show the first few validation predictions ...
print(val_predictions[:5])
# ... next to the first few actual prices from the validation data.
print(val_y.head().tolist())
# mean_absolute_error is declared as (y_true, y_pred); pass the actual
# prices first, matching the call made inside get_mae.
val_mae = mean_absolute_error(val_y, val_predictions)
print(val_mae)
cf)
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation MAE of a decision tree capped at *max_leaf_nodes*.

    A DecisionTreeRegressor (random_state=0) is fitted on
    ``train_X``/``train_y`` and scored against ``val_y`` using its
    predictions for ``val_X``.

    Args:
        max_leaf_nodes: Passed to DecisionTreeRegressor as ``max_leaf_nodes``.
        train_X: Training features.
        val_X: Validation features.
        train_y: Training target values.
        val_y: Validation target values.

    Returns:
        The mean absolute error between ``val_y`` and the predictions.
    """
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    return mean_absolute_error(val_y, preds_val)
candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]
# Score every candidate tree size on the validation split, printing each
# MAE as it is computed.
mae_by_size = {}
for num in candidate_max_leaf_nodes:
    mae_by_size[num] = get_mae(num, train_X, val_X, train_y, val_y)
    print(mae_by_size[num])
# Pick the candidate with the lowest validation MAE instead of hard-coding
# the value read off the printed output (ties go to the earliest candidate).
best_tree_size = min(mae_by_size, key=mae_by_size.get)
# Build the final model with the selected size ...
final_model = DecisionTreeRegressor(max_leaf_nodes=best_tree_size, random_state=0)
# ... and fit it on all of the data now that the size has been chosen.
final_model.fit(X, y)
from sklearn.ensemble import RandomForestRegressor
# Define the random forest; random_state=1 keeps the result reproducible.
rf_model = RandomForestRegressor(random_state=1)
# Fit on the training split.
rf_model.fit(train_X, train_y)
# Validation MAE of the random forest. mean_absolute_error is declared as
# (y_true, y_pred), so the actual prices go first, matching get_mae.
rf_val_predictions = rf_model.predict(val_X)
rf_val_mae = mean_absolute_error(val_y, rf_val_predictions)
print(f"Validation MAE for Random Forest Model: {rf_val_mae}")
cf) Random Forest:
(출처) Kaggle Course