출처: https://dacon.io/competitions/open/235610/codeshare/4221?page=1&dtype=recent
공유된 코드에 기반한 연습용 게시물입니다.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
# Apply the FiveThirtyEight matplotlib style to every figure drawn below.
plt.style.use('fivethirtyeight')

# Load the competition files from the current working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sample_submisson = pd.read_csv("sample_submission.csv")

# Remove the 'index' column from the training frame before any analysis.
train = train.drop(columns=['index'])

# First look at the training data (notebook-style bare expressions).
train.head(5)
train.shape, test.shape
train.info()
train.describe(include='all')
import pandas_profiling as pp
# Build the profiling report for the training frame.
train.profile_report()

# Class balance of the target column.
print(train['quality'].value_counts())
sns.countplot(x=train['quality']);
# The chart counts 'quality' values, so the title names that column
# (it previously read "dist. of type").
plt.title("dist. of quality", fontfamily='serif', fontsize=12);

# Names of all non-object columns, with the target itself removed; these are
# the columns that get plotted and scaled later on.
numerical_columns = train.select_dtypes(exclude='object').columns.tolist()
numerical_columns.remove('quality')
def show_dist_plot(df, columns):
    """Plot each given column against 'quality' as a strip plot and a violin plot.

    For every name in ``columns`` a new figure with two side-by-side axes is
    created: the left one holds a strip plot whose points are coloured by
    quality, the right one a violin plot of the same column.

    Args:
        df: DataFrame containing a 'quality' column and every name in ``columns``.
        columns: Iterable of column names to visualise.
    """
    for feature in columns:
        _, (strip_ax, violin_ax) = plt.subplots(1, 2, figsize=(16, 4))
        quality = df['quality']
        sns.stripplot(x=quality, y=df[feature], ax=strip_ax, hue=quality)
        sns.violinplot(data=df, x='quality', y=feature, ax=violin_ax)
# Per-feature distribution plots against the target.
show_dist_plot(train, numerical_columns)

plt.figure(figsize=(18, 8))
# Correlate only the non-object columns. 'type' is presumably still a string
# column at this point, and DataFrame.corr() raises on non-numeric data in
# pandas 2.x instead of silently dropping it as older versions did.
corr = train.select_dtypes(exclude='object').corr()
sns.heatmap(corr, annot=True, square=False, vmin=-.6, vmax=1.0);
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.metrics import plot_roc_curve,accuracy_score, confusion_matrix, plot_confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
# --- Preprocess the training data -------------------------------------------
# Fit the scaler on the training features. The SAME fitted scaler is reused
# for the competition test set further down, so both sets share one scale.
ss = StandardScaler()
train[numerical_columns] = ss.fit_transform(train[numerical_columns])

# Encode 'type' as integer codes and keep the category order, so the test set
# can later be encoded with exactly the same category -> code mapping.
train['type'], type_categories = pd.factorize(train['type'])
train.head(3)

# Hold out 20% of the training data for evaluation.
X = train.drop(['quality'], axis=1)
y = train.quality
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)
X_train.shape, X_test.shape


def Model(model):
    """Fit ``model`` on the training split and report its hold-out performance.

    The estimator is fitted in place on the module-level ``X_train``/``y_train``.
    The value of ``model.score`` on ``X_test``/``y_test`` is printed and a
    confusion matrix for the hold-out split is plotted.

    Args:
        model: An unfitted (or refittable) scikit-learn style classifier.
    """
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    print("Testing Score\n", score)
    plot_confusion_matrix(model, X_test, y_test, cmap='OrRd')


# --- Train and evaluate ------------------------------------------------------
# Model() fits the estimator itself, so no separate fit call is needed first.
rf = RandomForestClassifier()
Model(rf)

# --- Preprocess the competition test set and predict ------------------------
X_train.shape, test.shape
test = test.drop(['index'], axis=1)

# Scale with the statistics learned from the training data (transform only);
# re-fitting on the test set would put it on a different scale than the one
# the model was trained on.
test[numerical_columns] = ss.transform(test[numerical_columns])

# Reuse the training category order. pd.factorize numbers categories by order
# of first appearance, so factorizing the test set on its own could assign a
# different code to the same category. Values unseen in training map to -1.
test['type'] = type_categories.get_indexer(test['type'])
test.head(3)

# Align the columns with the feature order used for fitting before predicting.
final_pred = rf.predict(test[X.columns])
sample_submisson['quality'] = final_pred
sample_submisson.to_csv("submission.csv", index=False)