[Dacon/machine learning] 와인 품질(Quality) 분류 연습

hottogi·2022년 11월 2일
0

출처: https://dacon.io/competitions/open/235610/codeshare/4221?page=1&dtype=recent
공유된 코드에 기반한 연습용 게시물입니다.

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
plt.style.use('fivethirtyeight')

# Load the competition files: labelled training data, unlabelled test data,
# and the submission template that the predictions are written into later.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sample_submisson = pd.read_csv("sample_submission.csv")

# Drop the 'index' column so it is never used as a feature.
train = train.drop(['index'], axis=1)
train.head(5)

train.shape, test.shape

train.info()
train.describe(include='all')

# NOTE(review): imported for its side effect -- presumably this is what makes
# DataFrame.profile_report() available; the alias `pp` itself is not used.
import pandas_profiling as pp
train.profile_report()

# Class balance of the target.
print(train['quality'].value_counts())
sns.countplot(x=train['quality']);
# The plot shows the distribution of 'quality', so the title says so
# (it previously read "dist. of type").
plt.title("dist. of quality", fontfamily='serif', fontsize=12);

# Every non-object column except the target is treated as a numeric feature.
numerical_columns = train.select_dtypes(exclude='object').columns.tolist()
numerical_columns.remove('quality')


def show_dist_plot(df, columns, target='quality'):
    """Plot each feature against the target as a strip plot and a violin plot.

    One new figure with two side-by-side axes is created per entry in
    ``columns``: the left axis holds a strip plot (points coloured by the
    target value), the right axis a violin plot of the same feature.

    Args:
        df: DataFrame containing ``target`` and every name in ``columns``.
        columns: Iterable of feature column names to plot.
        target: Name of the column used for the x axis and the hue.
            Defaults to ``'quality'``, the column this function was
            originally hard-wired to.
    """
    for column in columns:
        _, (strip_ax, violin_ax) = plt.subplots(1, 2, figsize=(16, 4))
        sns.stripplot(data=df, x=target, y=column, hue=target, ax=strip_ax)
        sns.violinplot(data=df, x=target, y=column, ax=violin_ax)


show_dist_plot(train, numerical_columns)

plt.figure(figsize=(18, 8))
# Correlate numeric columns only: 'type' is not label-encoded until later in
# the script, and DataFrame.corr() raises on non-numeric columns in
# pandas >= 2.0 (older versions silently dropped them, so the heatmap is the
# same on those versions).
corr = train.select_dtypes(include='number').corr()
sns.heatmap(corr, annot=True, square=False, vmin=-.6, vmax=1.0);

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.metrics import plot_roc_curve,accuracy_score, confusion_matrix, plot_confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

# Standardise the numeric feature columns; `ss` keeps the statistics
# learned from the training data.
ss = StandardScaler()
ss.fit(train[numerical_columns])
train[numerical_columns] = ss.transform(train[numerical_columns])

# Replace the values of 'type' with integer codes
# (pd.factorize numbers the distinct values in order of first appearance).
type_codes = pd.factorize(train['type'])[0]
train['type'] = type_codes

train.head(3)

# Separate the target from the features.
y = train['quality']
X = train.drop(columns=['quality'])

# Hold out 20% of the rows for evaluation; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

X_train.shape, X_test.shape

def Model(model):
    """Fit ``model`` on the training split and report hold-out performance.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``. After fitting, prints the estimator's ``score`` on the test
    split and draws its confusion matrix on that same split.

    Args:
        model: An unfitted (or refittable) scikit-learn style classifier
            exposing ``fit`` and ``score``. It is fitted in place.

    Returns:
        None. The results are printed and plotted.
    """
    # Local import so this function does not depend on the file-level import
    # list: `plot_confusion_matrix` was deprecated in scikit-learn 1.0 and
    # removed in 1.2; `ConfusionMatrixDisplay.from_estimator` replaces it.
    from sklearn.metrics import ConfusionMatrixDisplay

    model.fit(X_train, y_train)

    # Score once; the earlier version scored the test split three times and
    # built a prediction and confusion matrix that were never used.
    score = model.score(X_test, y_test)
    print("Testing Score\n", score)

    ConfusionMatrixDisplay.from_estimator(model, X_test, y_test, cmap='OrRd')


rf = RandomForestClassifier()
# Model() fits the estimator itself, so a separate rf.fit() beforehand would
# only train a forest that is immediately discarded.
Model(rf)

X_train.shape, test.shape

test = test.drop(['index'], axis=1)

# Standardscaler
# Reuse `ss`, the scaler already fitted on the training data. Fitting a new
# scaler on the test set would standardise it with different means/variances
# than the features the model was trained on.
test[numerical_columns] = ss.transform(test[numerical_columns])

# factorize
# pd.factorize numbers categories in order of first appearance, so running it
# independently on test can assign different codes than on train. The
# training column has already been overwritten with integer codes, so the
# original label order is recovered from the training file and applied here.
# Labels not present in training become -1.
train_type_order = (
    pd.read_csv("train.csv", usecols=['type'])['type'].dropna().unique()
)
test['type'] = (
    pd.Categorical(test['type'], categories=train_type_order)
    .codes.astype('int64')
)

test.head(3)

# Feed the columns in the same order the model was fitted with.
final_pred = rf.predict(test[X_train.columns])
sample_submisson['quality'] = final_pred
sample_submisson.to_csv("submission.csv", index=False)
profile

0개의 댓글

관련 채용 정보