Machine Learning - American income analysis

화이티 ·2023년 12월 22일
0

Machine Learning

목록 보기
17/23
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split #훈련과 테스트용 셋트 분리
from sklearn.metrics import accuracy_score #평가를 진행할 때 정확도 측정
from sklearn.tree import DecisionTreeClassifier #결정트리모델 가져오기
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
# Load the two data splits from the current working directory.
test = pd.read_csv('./test.csv')
train = pd.read_csv('./train.csv')

# Report the (rows, columns) shape of each split, training split first.
for frame in (train, test):
    print(frame.shape)

# For every object-typed training column, print its distinct values and
# how many of them there are, followed by a separator line.
for column_name, column in train.items():
    if column.dtype != 'object':
        continue
    distinct = column.unique()
    print(f'{column_name}\n {distinct}({len(distinct)})개')
    print('-----------------------------')
# workclass: relabel the ' ?' placeholder as ' Other'. Every other listed
# label maps to itself; a value that is not listed becomes NaN after .map().
title = [
    ' Private', ' Self-emp-not-inc', ' Local-gov', ' ?', ' State-gov',
    ' Self-emp-inc', ' Federal-gov', ' Without-pay', ' Never-worked',
]
cover_title = [' Other' if label == ' ?' else label for label in title]
title_dict = dict(zip(title, cover_title))
for frame in (train, test):
    frame['workclass'] = frame['workclass'].map(title_dict)
# occupation: relabel the ' ?' placeholder as ' Other'. Every other listed
# label maps to itself; a value that is not listed becomes NaN after .map().
title1 = [
    ' Machine-op-inspct', ' Other-service', ' Handlers-cleaners',
    ' Tech-support', ' Transport-moving', ' Farming-fishing', ' Prof-specialty',
    ' Priv-house-serv', ' Adm-clerical', ' Protective-serv', ' Exec-managerial',
    ' ?', ' Craft-repair', ' Sales', ' Armed-Forces',
]
title_dict1 = {name: (' Other' if name == ' ?' else name) for name in title1}
cover_title1 = list(title_dict1.values())
train['occupation'] = train['occupation'].map(title_dict1)
test['occupation'] = test['occupation'].map(title_dict1)

# Bare expressions kept from the notebook: they only display something in an
# interactive session and have no effect when run as a script.
test['workclass'].value_counts()
test['occupation']
# Remove the 'education' column from both splits, in place.
for frame in (train, test):
    frame.drop(columns=['education'], inplace=True)
# marital-status: reduce the seven listed labels to two, ' married' and
# ' not-married'. A value that is not listed becomes NaN after .map().
marial_status = [
    ' Divorced', ' Never-married', ' Married-civ-spouse', ' Separated',
    ' Married-spouse-absent', ' Widowed', ' Married-AF-spouse',
]
# Only these two labels count as married; note that ' Married-spouse-absent'
# is deliberately grouped with ' not-married'.
married_labels = (' Married-civ-spouse', ' Married-AF-spouse')
convert_marial_status = [
    ' married' if status in married_labels else ' not-married'
    for status in marial_status
]
marital_status_dict = dict(zip(marial_status, convert_marial_status))
for frame in (train, test):
    frame['marital-status'] = frame['marital-status'].map(marital_status_dict)

# Bare expression kept from the notebook (interactive display only).
train['marital-status']
# relationship: merge ' Husband' and ' Wife' into a single ' Married' label and
# keep the other listed labels unchanged. A value that is not listed becomes
# NaN after .map().
#
# Every element must be followed by a comma: adjacent string literals are
# concatenated by Python, so a missing comma after ' Own-child' would create
# the bogus key ' Own-child Wife' and leave both ' Own-child' and ' Wife'
# unmapped (NaN).
title = [
    ' Not-in-family', ' Husband', ' Unmarried', ' Other-relative', ' Own-child',
    ' Wife',
]
cover_title = [
    ' Not-in-family', ' Married', ' Unmarried', ' Other-relative', ' Own-child',
    ' Married',
]
title_dict = dict(zip(title, cover_title))
train['relationship'] = train['relationship'].map(title_dict)
test['relationship'] = test['relationship'].map(title_dict)
# workclass: collapse the detailed employer labels into coarser groups
# (government variants -> ' gov', unpaid / never worked -> ' unem').
#
# ' Other' is the label that the earlier ' ?' replacement produces; it has to
# be listed here as well, otherwise .map() turns those rows into NaN.
#
# NOTE(review): ' Self-emp-inc' maps to ' Self-empc', which keeps it distinct
# from ' Self-emp'. This may be a typo for ' Self-emp'; left unchanged --
# confirm the intended grouping.
title = [
    ' Private', ' Self-emp-not-inc', ' Local-gov', ' State-gov',
    ' Self-emp-inc', ' Federal-gov', ' Without-pay', ' Never-worked',
    ' Other',
]
cover_title = [
    ' Private', ' Self-emp', ' gov', ' gov',
    ' Self-empc', ' gov', ' unem', ' unem',
    ' Other',
]
title_dict = dict(zip(title, cover_title))
train['workclass'] = train['workclass'].map(title_dict)
test['workclass'] = test['workclass'].map(title_dict)
# Compress the two capital columns with log(1 + x).
#
# The transform is written straight into `train` / `test`, the frames every
# later step reads. Writing it into a frame built with
# pd.DataFrame(data=train) is unreliable: depending on the pandas version the
# new frame may or may not share its columns with `train`, so the transformed
# values might never reach the model.
skewed = ['capital-gain', 'capital-loss']
train[skewed] = np.log1p(train[skewed])
test[skewed] = np.log1p(test[skewed])

# Keep the historical names available; they refer to the same frames.
train2 = train
test2 = test
# native-country: replace each country with 10 x the mean of 'income' over the
# training rows from that country.
# Assumes 'income' is numeric (the mean is taken over it) -- TODO confirm.
pt2 = train.pivot_table(values='income',
                        index=['native-country'],
                        aggfunc='mean')
national = pt2.sort_values(by='income', ascending=False)
native_country = national.index
# Take the values from the same (sorted) frame as the keys. Pairing the sorted
# index with the unsorted column would assign each country the mean of a
# different country.
conver_native_country = national['income'] * 10
native_country_dict = dict(zip(native_country, conver_native_country))
# Bare expression kept from the notebook (interactive display only).
native_country_dict

# Countries with no entry in the dictionary (e.g. present only in the test
# split) would become NaN after .map(); fall back to the overall training mean
# on the same x10 scale.
unseen_country_value = train['income'].mean() * 10
train['native-country'] = (
    train['native-country'].map(native_country_dict).fillna(unseen_country_value)
)
test['native-country'] = (
    test['native-country'].map(native_country_dict).fillna(unseen_country_value)
)
# education-num: cut into four bands with edges 0, 4, 8, 12, 16,
# labelled 'e1' .. 'e4'.
bins = list(range(0, 17, 4))
labels = [f'e{band}' for band in range(1, 5)]
for frame in (train, test):
    frame['education-num'] = pd.cut(frame['education-num'], bins=bins, labels=labels)

# age: cut into five 20-year bands with edges 0, 20, ..., 100, each labelled
# by its lower edge ('age0', 'age20', ..., 'age80').
bins = list(range(0, 101, 20))
labels = [f'age{lower}' for lower in range(0, 100, 20)]
for frame in (train, test):
    frame['age'] = pd.cut(frame['age'], bins=bins, labels=labels)
# Features are every column up to and including 'native-country';
# the target is the 'income' column.
X_train = train.loc[:, :'native-country']
y_train = train['income']

# One-hot encode the non-numeric columns.
X_train = pd.get_dummies(X_train)
# Encoding the two splits separately can give different columns (a category
# missing from one split produces no indicator column there). Align the test
# matrix to the training columns: indicators absent from the test split are
# filled with 0 and columns unknown to the training matrix are dropped, so a
# model fitted on X_train can be applied to X_test.
X_test = pd.get_dummies(test).reindex(columns=X_train.columns, fill_value=0)

MinMax Scaler

from sklearn.preprocessing import MinMaxScaler

# Scale every feature with a min-max scaler.
#
# The scaler is fitted once, on the training matrix only, and the same fitted
# scaler is then applied to both splits. Calling transform() before fit()
# raises NotFittedError, and fitting on the test matrix would put the two
# splits on different scales (and leak test statistics into preprocessing).
mm_scaler = MinMaxScaler()
mm_scaler.fit(X_train)
mm_transform_X_train = mm_scaler.transform(X_train)
mm_transform_X_test = mm_scaler.transform(X_test)

# transform() returns plain arrays; wrap them back into DataFrames that carry
# the original column names.
X_train = pd.DataFrame(mm_transform_X_train, columns=X_train.columns)
X_test = pd.DataFrame(mm_transform_X_test, columns=X_test.columns)

Logistic Regression

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Fit a logistic regression with default settings on the training data.
lr = LogisticRegression()
lr.fit(X_train, y_train)

# Mean of the 5-fold cross-validation scores. Bare expression kept from the
# notebook: the value is only shown in an interactive session.
lr_fold_scores = cross_val_score(lr, X_train, y_train, cv=5)
lr_fold_scores.mean()

# Predict on the test features and write them into the 'income' column of the
# sample submission, saved as result.csv without the index.
pre_lr = lr.predict(X_test)
result = pd.read_csv('./sample_submission.csv')
result['income'] = pre_lr
result.to_csv('result.csv', index=False)

Gradient Boosting

from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with 50 estimators and a learning rate of 0.15.
gbc_settings = {'n_estimators': 50, 'learning_rate': 0.15}
gbc = GradientBoostingClassifier(**gbc_settings)
gbc.fit(X_train, y_train)

# Mean of the 5-fold cross-validation scores. Bare expression kept from the
# notebook: the value is only shown in an interactive session.
gbc_fold_scores = cross_val_score(gbc, X_train, y_train, cv=5)
gbc_fold_scores.mean()

# Predict on the test features and save them in the sample-submission layout.
pre_gbc = gbc.predict(X_test)
result = pd.read_csv('./sample_submission.csv')
result['income'] = pre_gbc
result.to_csv('result_gbc.csv', index=False)

Support Vector Machines (SVM)

from sklearn.svm import LinearSVC

# Linear support vector classifier with default settings.
svm = LinearSVC()
svm.fit(X_train, y_train)

# Mean of the 5-fold cross-validation scores. Bare expression kept from the
# notebook: the value is only shown in an interactive session.
svm_fold_scores = cross_val_score(svm, X_train, y_train, cv=5)
svm_fold_scores.mean()

# Predict on the test features and save them in the sample-submission layout.
pre_svm = svm.predict(X_test)
result = pd.read_csv('./sample_submission.csv')
result['income'] = pre_svm
result.to_csv('result_svm.csv', index=False)
profile
열심히 공부합시다! The best is yet to come! 💜

0개의 댓글