import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from math import sqrt
from sklearn import preprocessing
from sklearn import datasets
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Load the Boston housing data into feature / target frames.
# `load_boston` was removed in scikit-learn 1.2 (accessing it raises
# ImportError there), so fall back to rebuilding the same table from the
# original source file when the loader is unavailable.
try:
    dataset = datasets.load_boston()
except (AttributeError, ImportError):
    # The raw file stores each record on two physical lines after a
    # 22-line header: 11 values on the first line, 3 on the second
    # (the last of which is the MEDV target).
    raw_df = pd.read_csv(
        "http://lib.stat.cmu.edu/datasets/boston",
        sep=r"\s+",
        skiprows=22,
        header=None,
    )
    boston_data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
    boston_target = raw_df.values[1::2, 2]
    boston_features = [
        'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
        'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
    ]
else:
    boston_data = dataset.data
    boston_target = dataset.target
    boston_features = dataset.feature_names

df_x = pd.DataFrame(boston_data, columns=boston_features)
df_y = pd.DataFrame(boston_target, columns=['MEDV'])
df = pd.concat([df_x, df_y], axis=1)

# Pairwise scatter plots of every feature and the target.
sns.pairplot(df)
plt.show()
# Split first, then scale: fitting the scaler on the full data set would let
# test-set minima/maxima leak into the training features.
# A fixed random_state makes the split (and every number below) reproducible.
train_X, test_X, train_y, test_y = train_test_split(
    df_x, df_y, test_size=0.3, random_state=42
)
print(len(train_X), len(test_X), len(train_y), len(test_y))

# CHAS is left out of the scaled columns.
scale_columns = ['CRIM', 'ZN', 'INDUS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']
min_max_scaler = preprocessing.MinMaxScaler()

# Work on explicit copies so the column assignment below never writes into
# (or warns about) a view of df_x; df_x itself is left unscaled.
train_X = train_X.copy()
test_X = test_X.copy()

# Learn the min/max from the training rows only, then apply the same
# transform to the held-out rows (test values may fall outside [0, 1]).
train_X[scale_columns] = min_max_scaler.fit_transform(train_X[scale_columns])
test_X[scale_columns] = min_max_scaler.transform(test_X[scale_columns])
# Fit an OLS model on all features and report its test-set RMSE.
# statsmodels' OLS does not add an intercept on its own, so add the constant
# column explicitly; has_constant='add' forces the column onto both matrices
# even if some feature happens to be constant within one split.
train_X_const = sm.add_constant(train_X, has_constant='add')
test_X_const = sm.add_constant(test_X, has_constant='add')

m_reg = sm.OLS(train_y, train_X_const).fit()
print(m_reg.summary())

# Keep predictions and targets as (n, 1) arrays so they line up in the
# metric call and in later plotting.
pred_y = np.array(m_reg.predict(test_X_const)).reshape(-1, 1)
test_y = np.array(test_y)

# Root mean squared error on the held-out rows.
print(sqrt(mean_squared_error(test_y, pred_y)))
# Multicollinearity diagnostics on the training features:
# an annotated correlation heatmap followed by a per-feature VIF table.
plt.figure(figsize=(12, 10))
sns.heatmap(train_X.corr(), annot=True, cmap='RdYlBu')
plt.show()

# NOTE(review): the VIFs are computed on the feature matrix as-is, without an
# intercept column; this may inflate the values - confirm whether a constant
# should be added before calling variance_inflation_factor.
vif = pd.DataFrame({
    "VIF Factor": [
        variance_inflation_factor(train_X.values, i)
        for i in range(train_X.shape[1])
    ],
    "features": train_X.columns,
})
# A bare `vif` expression displays nothing when run as a script; print it.
print(vif)
# Re-run the diagnostics and the regression with the TAX column removed.
# NOTE(review): TAX is a hard-coded choice, presumably based on the VIF table
# computed on the full feature set - confirm.
tmp_train_X1 = train_X.drop('TAX', axis=1)

vif = pd.DataFrame({
    "VIF Factor": [
        variance_inflation_factor(tmp_train_X1.values, i)
        for i in range(tmp_train_X1.shape[1])
    ],
    "features": tmp_train_X1.columns,
})
# Show the recomputed table so the effect of dropping TAX is visible.
print(vif)

# The reduced frame already holds exactly the columns listed in `vif`,
# so there is no need to re-select them from train_X.
train_x_tmp = tmp_train_X1

# statsmodels' OLS does not add an intercept by default; add it explicitly.
m_reg = sm.OLS(train_y, sm.add_constant(train_x_tmp, has_constant='add')).fit()
print(m_reg.summary())
# Overlay the stored predictions (pred_y) and the held-out targets (test_y)
# on one line chart, indexed by test-row position.
for series, series_label in ((pred_y, "pred"), (test_y, "true")):
    plt.plot(series, label=series_label)
plt.legend()
plt.show()