import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.metrics import RocCurveDisplay, roc_auc_score
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
# Load the raw CSV. The file has no header row, so pandas names the columns
# by integer position and every column below is addressed by its index.
# NOTE(review): the path suggests the UCI "Adult" dataset - confirm.
data = pd.read_csv('C:/Users/ddi05/Class_05/adult/adult.data', header=None)
data.head()

# Integer-encode the selected columns in place. A single encoder is refit on
# each column in turn, so after the loop it only holds the classes of the
# last column (14); per-column inverse transforms are not possible with it.
encoder = LabelEncoder()
categorical_columns = [1, 3, 5, 6, 7, 8, 9, 13, 14]
for column in categorical_columns:
    data[column] = encoder.fit_transform(data[column])
data

# Min-max scale columns 0 and 4 together in one call (each column is still
# scaled independently by MinMaxScaler).
# NOTE: the scaler is fit on the full dataset, i.e. before the later
# train_test_split call, so test-set minima/maxima influence the training
# features. Fit on the training split only if strict isolation is required.
scaler = MinMaxScaler()
scaled_columns = [0, 4]
data[scaled_columns] = scaler.fit_transform(data[scaled_columns])
data
# Column 14 is the prediction target; the feature matrix is a hand-picked
# subset of the remaining columns.
y = data[14]
X = data[[0, 1, 3, 4, 7, 8, 9]]

# Stratify on y so both splits keep the same class proportions; the fixed
# seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=10
)

# Depth-limited decision tree, fitted on the training split.
dt = DecisionTreeClassifier(max_depth=3)
dt.fit(X_train, y_train)
pred1 = dt.predict(X_test)

# For classifiers, score() reports mean accuracy.
print('train score ', dt.score(X_train, y_train))
print('test score', dt.score(X_test, y_test))
plot_tree(dt)

# A ROC curve needs continuous scores to sweep the decision threshold.
# Passing the hard 0/1 predictions (pred1) would collapse the curve to a
# single operating point, so let the display pull scores from the estimator.
RocCurveDisplay.from_estimator(dt, X_test, y_test)

# Hard class predictions alongside the original row index of each test row.
pd.DataFrame({'index' : X_test.index, 'pred' : pred1})
# Fit several classifiers on the same train/test split and print the
# train and test accuracy of each for a side-by-side comparison.
models = [
    DecisionTreeClassifier(max_depth=3),
    LogisticRegression(solver='liblinear'),
    GaussianNB(),
]
model_names = ['DecisionTree', 'logistic_reg', 'Naive Bayes']
for model, model_name in zip(models, model_names):
    model.fit(X_train, y_train)
    # Test-set predictions of the most recently fitted model stay available
    # as `pred` after the loop finishes.
    pred = model.predict(X_test)
    print('--------------', model_name, '---------------')
    print('Train score', model.score(X_train, y_train))
    print('Test score', model.score(X_test, y_test))