# Load the bundled iris dataset and explore the returned container object.
from sklearn.datasets import load_iris
iris_data = load_iris()
type(iris_data)  # per sklearn docs this is a dict-like Bunch
iris_data        # full contents: data, target, feature names, metadata
# Because the object is dict-like, the usual mapping API works on it.
iris_data.keys()
iris_data.values()
iris_data.items()
type(iris_data.feature_names)  # feature names are held as a plain Python list
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
# Deliberately flawed evaluation: the tree is scored on the very rows it
# was trained on, so the accuracy it reports is misleadingly optimistic.
iris = load_iris()
df_clf = DecisionTreeClassifier()
df_clf.fit(iris.data,iris.target)   # fit on ALL samples
pred = df_clf.predict(iris.data)    # predict on the SAME samples
accuracy_score(iris.target,pred)    # inflated "training accuracy"
from sklearn.model_selection import train_test_split
# Proper evaluation: hold out 20% of the rows as an unseen test set.
# random_state pins the shuffle so the split is reproducible.
X_train,X_test,y_train,y_test = train_test_split(iris.data,
                                                 iris.target,
                                                 test_size=0.2,
                                                 random_state=121)
df_clf.fit(X_train,y_train)
pred = df_clf.predict(X_test)
accuracy_score(y_test,pred)  # accuracy on held-out data only
from sklearn.model_selection import KFold
import numpy as np
# 5-fold cross-validation written out "by hand" with KFold.
iris=load_iris()
features = iris.data
label = iris.target
dt_clf = DecisionTreeClassifier(random_state=156)
# NOTE(review): shuffle=True without random_state makes the fold
# composition — and therefore the printed accuracies — differ per run.
kfold = KFold(n_splits=5,shuffle=True)
cv_accuracy = []   # collects one accuracy score per fold
#features.shape
n_iter = 0
# split() yields (train_index, test_index) row-index arrays per fold.
for train_index,test_index in kfold.split(features):
    # print(train_index)
    # print(test_index)
    X_train,X_test = features[train_index],features[test_index]
    y_train,y_test = label[train_index],label[test_index]
    dt_clf.fit(X_train,y_train)
    pred = dt_clf.predict(X_test)
    n_iter += 1
    accuracy = np.round(accuracy_score(y_test,pred),4)
    train_size= X_train.shape[0]
    test_size = X_test.shape[0]
    # message: "{fold} accuracy / training-set size / validation-set size"
    print(f'{n_iter}회 정확도:{accuracy} 학습데이터크기:{train_size} 검증데이터크기:{test_size}')
    cv_accuracy.append(accuracy)
# Mean accuracy across the 5 folds.
print(np.mean(cv_accuracy))
import pandas as pd
# Put the iris features into a DataFrame and attach the target as 'label'.
iris_df = pd.DataFrame(data=iris.data,columns=iris.feature_names)
iris_df.head(2)
iris_df['label'] = iris.target
iris_df.head(1)
# Class distribution of the target (iris ships with balanced classes).
iris_df['label'].value_counts()
# Inspect how plain KFold distributes the class labels across folds.
# NOTE(review): with shuffle=True the folds come out roughly balanced;
# the classic demonstration of KFold's weakness on class-ordered data
# uses the default shuffle=False — confirm which behavior was intended.
kfold = KFold(n_splits=3,shuffle=True)
n_iter=0
for train_index,test_index in kfold.split(iris_df):
    n_iter += 1
    label_train = iris_df['label'].iloc[train_index]
    label_test = iris_df['label'].iloc[test_index]
    print(n_iter,'번째')   # "{n}-th fold"
    # "training label distribution" / "validation label distribution"
    print('학습레이블 데이터 분포\n',label_train.value_counts())
    print('검증레이블 데이터 분포\n',label_test.value_counts())
from sklearn.model_selection import StratifiedKFold
# StratifiedKFold keeps the class proportions the same in every fold;
# unlike KFold, its split() needs the labels as a second argument.
kfold = StratifiedKFold(n_splits=3)
n_iter=0
for train_index,test_index in kfold.split(iris_df,iris_df['label']):
    n_iter += 1
    label_train = iris_df['label'].iloc[train_index]
    label_test = iris_df['label'].iloc[test_index]
    print(n_iter,'번째')   # "{n}-th fold"
    # "training label distribution" / "validation label distribution"
    print('학습레이블 데이터 분포\n',label_train.value_counts())
    print('검증레이블 데이터 분포\n',label_test.value_counts())
# Same manual CV loop as above, now with stratified folds.
iris=load_iris()
features = iris.data
label = iris.target
dt_clf = DecisionTreeClassifier(random_state=156)
kfold = StratifiedKFold(n_splits=5)
cv_accuracy = []   # per-fold accuracy scores
#features.shape
n_iter = 0
# Stratified split() needs both the features and the labels.
for train_index,test_index in kfold.split(features,label):
    # print(train_index)
    # print(test_index)
    X_train,X_test = features[train_index],features[test_index]
    y_train,y_test = label[train_index],label[test_index]
    dt_clf.fit(X_train,y_train)
    pred = dt_clf.predict(X_test)
    n_iter += 1
    accuracy = np.round(accuracy_score(y_test,pred),4)
    train_size= X_train.shape[0]
    test_size = X_test.shape[0]
    # message: "{fold} accuracy / training-set size / validation-set size"
    print(f'{n_iter}회 정확도:{accuracy} 학습데이터크기:{train_size} 검증데이터크기:{test_size}')
    cv_accuracy.append(accuracy)
# Mean accuracy across the 5 stratified folds.
print(np.mean(cv_accuracy))
from sklearn.model_selection import cross_val_score,cross_validate
# cross_val_score wraps the whole fold/fit/score loop in a single call
# (per sklearn docs, classifiers get stratified folds by default).
iris=load_iris()
features = iris.data
label = iris.target
dt_clf = DecisionTreeClassifier(random_state=156)
scores = cross_val_score(dt_clf,features,label,scoring='accuracy',cv=3)
print(scores)                        # per-fold accuracies
print(np.round(np.mean(scores),4))   # their mean, rounded to 4 places
# Hyper-parameter grid: 3 depths x 2 split thresholds = 6 candidates.
grid_parameters = {'max_depth':[1,2,3],
                   'min_samples_split':[2,3]}
from sklearn.model_selection import GridSearchCV
iris= load_iris()
X_train,X_test,y_train,y_test = train_test_split(iris.data,
                                                 iris.target,
                                                 test_size=0.2,
                                                 random_state=121)
dtree = DecisionTreeClassifier()
# 6 candidates x cv=3 folds = 18 fits; refit=True retrains the best
# candidate on the full training set once the search finishes.
grid_dtree = GridSearchCV(dtree,param_grid=grid_parameters,cv=3,refit=True)
grid_dtree.fit(X_train,y_train)
# cv_results_ records per-candidate fold scores, means, and ranks.
scores_df = pd.DataFrame(grid_dtree.cv_results_)
scores_df.columns
scores_df[['params','mean_test_score','rank_test_score','split0_test_score', 'split1_test_score', 'split2_test_score']]
grid_dtree.best_params_   # winning parameter combination
grid_dtree.best_score_    # its mean cross-validated accuracy
# best_estimator_ exists because refit=True was set above.
estimator = grid_dtree.best_estimator_
pred = estimator.predict(X_test)
accuracy_score(y_test,pred)   # final check on the held-out test set
사이킷런의 머신러닝 알고리즘은 문자열 값을 허용하지 않고 오직 숫자만 허용하므로, 문자열 값은 인코딩을 통해 숫자형으로 변환해야 한다.
문자열 피처는 일반적으로 카테고리형 피처(코드 값으로 표현)와 텍스트형 피처(피처 벡터화로 처리)를 의미한다.
레이블 인코딩과 원-핫 인코딩
from sklearn.preprocessing import LabelEncoder
# Label encoding: map each distinct string to a small integer code.
items=['TV', '냉장고', '전자레인지', '컴퓨터', '선풍기', '믹서', '믹서']
encoder = LabelEncoder()
encoder.fit(items)
encoder.transform(items) # convert each item to its integer code (label encoding)
encoder.classes_   # index i holds the original string behind code i
encoder.inverse_transform([0, 1, 4, 5, 3, 2, 2]) # map integer codes back to the original strings
from sklearn.preprocessing import OneHotEncoder
import numpy as np
# One-hot encoding via sklearn: label-encode to integers first, then
# expand each integer into a binary indicator column.
items=['TV', '냉장고', '전자레인지', '컴퓨터', '선풍기', '선풍기', '믹서', '믹서']
encoder = LabelEncoder()
encoder.fit(items)
labels = encoder.transform(items)
print(type(labels))
print(labels.shape)   # 1-D array of 8 codes
print(labels.ndim)
labels = labels.reshape(-1, 1) # reshape into one column — OneHotEncoder expects 2-D input
print(labels)
oh_encoder = OneHotEncoder()
oh_encoder.fit(labels)
oh_labels = oh_encoder.transform(labels)
# transform() returns a sparse matrix; toarray() shows it densely.
print(oh_labels.toarray())
print(oh_labels.shape)   # 8 rows x one column per distinct item
oh_labels
import pandas as pd
# pandas' get_dummies one-hot encodes string columns directly, without
# the label-encode + reshape steps that sklearn's OneHotEncoder requires.
appliance_names = ['TV', '냉장고', '전자레인지', '컴퓨터', '선풍기', '선풍기', '믹서', '믹서']
df = pd.DataFrame({'items': appliance_names})
pd.get_dummies(df)
from sklearn.datasets import load_iris
iris = load_iris()
iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
iris_df.head(2)
# Per-feature mean and variance before scaling — the features sit on
# different scales, which motivates min-max normalization.
iris_df.mean()
iris_df.var()
from sklearn.preprocessing import MinMaxScaler
# Create the MinMaxScaler object (default output range is [0, 1]).
scaler = MinMaxScaler()
# Transform the dataset with MinMaxScaler: call fit() then transform().
scaler.fit(iris_df)
iris_scaled = scaler.transform(iris_df)
# transform() returns a NumPy ndarray, so convert back to a DataFrame.
iris_df_scaled = pd.DataFrame(data=iris_scaled, columns=iris.feature_names)
# message: "feature minimums" — all 0 after min-max scaling
print('feature들의 최솟값')
print(iris_df_scaled.min())
# message: "feature maximums" — all 1 after min-max scaling
print('\nfeature들의 최댓값')
print(iris_df_scaled.max())
Titanic 생존자 예측
전처리 → 데이터 탐색(탐색적 데이터 분석, EDA)
# Titanic survivor prediction: load the training CSV and inspect it.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
titanic_df = pd.read_csv('titanic_train.csv')
titanic_df.head(2)
titanic_df.info()   # dtypes and non-null counts per column
# Fill missing values: the column mean for Age, and the sentinel string
# 'N' for the categorical Cabin/Embarked columns.
# Assigning the result back (instead of fillna(..., inplace=True) on the
# column selection) avoids pandas' chained-assignment pitfall: under
# copy-on-write (pandas 2+/3.0) the inplace form operates on a temporary
# and can silently fail to update the frame.
titanic_df['Age'] = titanic_df['Age'].fillna(titanic_df['Age'].mean())
titanic_df['Cabin'] = titanic_df['Cabin'].fillna('N')
titanic_df['Embarked'] = titanic_df['Embarked'].fillna('N')  # 'N' is just a placeholder string
titanic_df.isnull().sum()        # per-column count of remaining NaNs (True/False sum)
titanic_df.isnull().sum().sum()  # grand total of NaNs — should now be 0
# Category counts for the main discrete columns.
titanic_df['Sex'].value_counts()
titanic_df['Cabin'].value_counts()
titanic_df['Embarked'].value_counts()
# Keep only the first character of Cabin (the deck letter).
titanic_df['Cabin'] = titanic_df['Cabin'].str[:1]
# Survival counts broken down by sex.
titanic_df.groupby(['Sex','Survived'])['Survived'].count()
sns.barplot(x='Sex', y='Survived', data=titanic_df)   # mean survival rate per sex
plt.figure(figsize=(10,6))
# Survival rate by passenger class, split by sex.
sns.barplot(x='Pclass', y='Survived', hue='Sex', data=titanic_df)
titanic_df.columns
def get_category(age):
    """Map a numeric age to a life-stage category label.

    Ages at or below -1 are treated as 'Unknown' (the sentinel range for
    missing/invalid ages); everything above 60 is 'Elderly'.
    """
    # Ordered (inclusive upper bound, label) pairs; first match wins.
    bands = [
        (-1, 'Unknown'),
        (5, 'Baby'),
        (12, 'Child'),
        (18, 'Teenager'),
        (25, 'Student'),
        (35, 'Young Adult'),
        (60, 'Adult'),
    ]
    for upper, label in bands:
        if age <= upper:
            return label
    return 'Elderly'
# Bucket each passenger's age into a category, then plot the survival
# rate per bucket, split by sex.
titanic_df['Age_cat'] = titanic_df['Age'].apply(get_category)
plt.figure(figsize=(10,5))
group_names=['Unknown','Baby','Child','Teenager','Student','Young Adult', 'Adult', 'Elderly']
sns.barplot(x='Age_cat', y='Survived', hue='Sex', data=titanic_df, order=group_names) # order= sorts the x-axis by group_names