
2) 회귀 Regression (정답, 즉 라벨이 연속적인 것)


2) 차원 축소


# Toy dataset: five (x, y) points used to demonstrate simple linear regression.
import pandas as pd
import matplotlib.pyplot as plt

df = pd.DataFrame({'x' : [1., 2., 3., 4., 5.], 'y' : [1., 3., 4., 6., 5.]})
df  # notebook-style display of the frame (no effect when run as a script)

# Quick look at the raw points before fitting anything.
plt.scatter(df['x'], df['y'])
plt.grid()
plt.show()

# Fit ordinary least squares on the toy data.
from sklearn.linear_model import LinearRegression

X = df[['x']]  # feature matrix must be 2-D: (n_samples, n_features)
y = df['y']    # target vector
# fit() returns the estimator itself, so fitting can be chained in one line.
reg = LinearRegression().fit(X, y)

import numpy as np
from sklearn.metrics import mean_absolute_error

# Evaluate the fit on the training points themselves.
pred = reg.predict(X)

# BUG FIX: the original computed np.sqrt(mean_absolute_error(...)), which is
# neither MAE nor RMSE. MAE is simply the mean of |y - pred| and takes no
# square root — the sqrt belongs to RMSE, i.e. sqrt of the mean *squared* error.
mae = mean_absolute_error(y, pred)
mae

# Actual-vs-predicted diagnostic plot; points on the red y = x line are
# perfectly predicted.
plt.scatter(y, pred)
plt.xlabel("Actual")
plt.ylabel("Predicted")
plt.title("Real vs Predicted")
plt.plot([0, 6], [0, 6], 'r')
plt.grid()
plt.show()

여러 개의 특성: 변수가 여러 개 있으면 Multivariate Linear Regression 문제로 일반화
행렬식으로 표현(입력 변수가 4개인 경우)
-> 벡터의 선형회귀 문제로 변환 가능
# Load the Boston housing table: whitespace-delimited text with no header row,
# so column names are supplied explicitly.
from pandas import read_csv

column_names = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'PRICE',
]
# NOTE(review): 'data 주소' ("data address") is a placeholder from the lecture
# notes — point it at the actual dataset location before running.
url = 'data 주소'
boston_pd = read_csv(url, names=column_names, header=None, delimiter=r"\s+")
boston_pd.head()

# Interactive histogram of the regression target (house price).
import plotly.express as px

price_hist = px.histogram(boston_pd, x="PRICE")
price_hist.show()

# Correlation heatmap: which features move together, and with PRICE?
import matplotlib.pyplot as plt
import seaborn as sns

corr_mat = boston_pd.corr().round(1)
sns.set(rc={'figure.figsize': (10, 8)})
sns.heatmap(corr_mat, annot=True, cmap='bwr')

# Side-by-side regression plots of RM and LSTAT against PRICE.
sns.set_style('darkgrid')
sns.set(rc={'figure.figsize': (12, 6)})
fig, axes = plt.subplots(ncols=2)
sns.regplot(data=boston_pd, x="RM", y="PRICE", ax=axes[0])
sns.regplot(data=boston_pd, x="LSTAT", y="PRICE", ax=axes[1])

# Hold out 20% of the rows as a test set and fit OLS on the remainder.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

X = boston_pd.drop("PRICE", axis=1)  # every feature except the target
y = boston_pd["PRICE"]               # target vector
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=13)
reg = LinearRegression().fit(X_train, y_train)

import numpy as np
from sklearn.metrics import mean_squared_error

# RMSE on train vs test; a large gap between the two would suggest overfitting.
pred_tr = reg.predict(X_train)
pred_test = reg.predict(X_test)
rmse_tr = np.sqrt(mean_squared_error(y_train, pred_tr))
rmse_test = np.sqrt(mean_squared_error(y_test, pred_test))
print("RMSE of Train Data : ", rmse_tr)
print("RMSE of Test Data : ", rmse_test)
# FIX: the two lines below were pasted console output, which is a SyntaxError
# in a .py file — kept as comments for reference.
# RMSE of Train Data : 4.642806069019824
# RMSE of Test Data : 4.931352584146697

# Actual-vs-predicted diagnostic; the red line is y = x (perfect prediction).
plt.scatter(y_test, pred_test)
plt.xlabel("Actual House Prices ($1000)")
plt.ylabel("Predicted Prices")
plt.title("Real vs Predicted")
plt.plot([0, 48], [0, 48], 'r')
plt.show()

# Refit after dropping LSTAT to see how much that single feature contributes
# to predictive accuracy (RMSE rises when it is removed).
X = boston_pd.drop(["PRICE", "LSTAT"], axis=1)
y = boston_pd["PRICE"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=13)
reg = LinearRegression()
reg.fit(X_train, y_train)

pred_tr = reg.predict(X_train)
pred_test = reg.predict(X_test)
rmse_tr = np.sqrt(mean_squared_error(y_train, pred_tr))
rmse_test = np.sqrt(mean_squared_error(y_test, pred_test))
print("RMSE of Train Data : ", rmse_tr)
print("RMSE of Test Data : ", rmse_test)
# FIX: pasted console output commented out (it was a SyntaxError in a .py file):
# RMSE of Train Data : 5.165137874244863
# RMSE of Test Data : 5.295595032597148

plt.scatter(y_test, pred_test)
plt.xlabel("Actual House Prices ($1000)")
plt.ylabel("Predicted Prices")
plt.title("Real vs Predicted")
plt.plot([0, 48], [0, 48], 'r')  # y = x reference line
plt.show()

# Decision-tree regressor on the same split (still without LSTAT), for
# comparison against the linear model.
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree

reg_dt = DecisionTreeRegressor(max_depth=9, random_state=13)
reg_dt.fit(X_train, y_train)
y_pred_dt = reg_dt.predict(X_test)
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_dt))
print("RMSE of Test Data : ", rmse_test)
# FIX: pasted console output commented out (it was a SyntaxError in a .py file):
# RMSE of Test Data : 6.473200592640149

# Visualize the fitted tree. FIX: the original passed the full 14-entry
# column_names list although the model was trained on only 12 features
# (PRICE and LSTAT were dropped); using the training frame's own columns is
# correct by construction and robust to sklearn versions that validate length.
fig = plt.figure(figsize=(15, 8))
_ = tree.plot_tree(reg_dt, feature_names=list(X_train.columns), filled=True)

plt.scatter(y_test, y_pred_dt)
plt.xlabel("Actual House Prices ($1000)")
plt.ylabel("Predicted Prices")
plt.title("Real vs Predicted")
plt.plot([0, 48], [0, 48], 'r')  # y = x reference line
plt.show()

이 글은 제로베이스 데이터 취업 스쿨의 강의 자료 일부를 발췌하여 작성되었습니다