3일차 요약
- 오전
- 넘파이 관련 공부
- 머신 러닝 개요
- 오후
- 머신 러닝 개념 공부
# Basic ndarray creation
arr = [1, 2, 3, 4, 5]
a = np.array(arr)
arr = np.arange(12)     # like Python's range(), but returns an ndarray
arr = arr.reshape(3, 4)
# Initializing with zeros
np.zeros(3)
np.zeros_like(arr)      # fixed: np.zeros(arr) raises TypeError when arr is a 2-D array
# Initializing with ones
np.ones(3)
np.ones_like(arr)
a = np.arange(12)
a = a.reshape(3, 4)
# Flattening back to a 1-D vector
a.reshape(12)
a.reshape(-1)
a.reshape(3, -1)        # 3 rows; numpy infers the number of columns
a.reshape(3, 4, -1)
# argmax(): index of the largest element (ties -> first occurrence)
a = np.array([1,4,4,5,234])
a.argmax()
a = np.arange(12).reshape(3,-1)
# fancy indexing: select rows 0 and 2 by passing a list of indices
a[[0,2]]
# boolean indexing: keep only the positions whose mask entry is True
a = np.arange(5)
idx = [True, False, False, True, False]
a[idx]
# broadcasting: the scalar 1 is added to every element
a =np.array([3,4,5])
a+1
# Broadcasting rules: shapes are aligned from the trailing axis.
x = np.arange(6).reshape(3, 2)
y = np.arange(6).reshape(2, 3)
z = np.array([1, 2, 3])
# x + z  -> fails: (3, 2) and (3,) do not broadcast (ValueError)
# y + z  -> works: (2, 3) + (3,) broadcasts along the last axis
X = np.array([[1,1],[2,2]])
Y = np.array([[3,3],[4,4]])
np.concatenate((X,Y), axis = 1)  # joins on an existing axis -> shape (2, 4)
np.stack((X,Y))                  # adds a new leading axis -> shape (2, 2, 2)
1. 데이터 세트 준비
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Download the dataset. "!" is an IPython/Jupyter shell escape and is a
# syntax error in plain Python, so it is kept here as a comment:
# !wget https://raw.githubusercontent.com/devdio/datasets/main/citrus.csv
citrus = pd.read_csv('./citrus.csv')
2. train, test 데이터 분리
# Shuffle the rows (frac=1 returns every row, in random order).
# Fixed: the data was loaded as `citrus`, so `df = df.sample(...)` was a NameError.
df = citrus.sample(frac=1)
# Split off the test data using DataFrame slicing: first 80% train, last 20% test.
X = df[['diameter', 'weight','red','green','blue']]
y = df['name']
split = int(len(X) * 0.8)
X_train = X[:split]   # fixed typo: was "X_trian"
X_test = X[split:]    # fixed typo: was "X_Test"
y_train = y[:split]
y_test = y[split:]
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
# My own version: the same split done with scikit-learn's helper.
# shuffle=True randomizes the rows, so no separate shuffling step is needed;
# works straight from `citrus` (the frame loaded above).
from sklearn.model_selection import train_test_split
X = citrus[['diameter', 'weight','red','green','blue']]
y = citrus['name']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=1004)
3. 데이터 전처리
# Scaling: standardize each column with the z-score.
def z_score_scaling(df):
    """Return a copy of *df* with every column standardized to
    zero mean and unit (sample) standard deviation.

    The input frame is left untouched (the original mutated it in place).

    NOTE(review): the statistics are recomputed from whatever frame is
    passed in, so calling this on X_test scales with *test* statistics
    instead of the training mean/std — confirm that this is intended.
    """
    scaled = df.copy()  # avoid mutating the caller's DataFrame
    for col in scaled.columns:
        mu = scaled[col].mean()
        std = scaled[col].std()
        scaled[col] = (scaled[col] - mu) / std
    return scaled
# Integer-encode the target classes: orange -> 1, grapefruit -> 0.
label = dict(orange=1, grapefruit=0)
y_train = y_train.map(label)
4. 학습:knn 알고리즘
# k-NN classifier: 5 nearest neighbours under the euclidean metric,
# each neighbour weighted by inverse distance.
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(metric='euclidean', weights='distance', n_neighbors=5)
knn.fit(X_train.values, y_train.values)
5. 평가
# Apply the same preprocessing to the test split, then predict.
X_test = z_score_scaling(X_test)
y_test = y_test.map(label)
y_pred = knn.predict(X_test.values)
# Accuracy = fraction of correct predictions.
# Fixed typo: "np,sum(...)" built a tuple (np, sum(...)/len(...)) instead of np.sum(...).
np.sum(y_pred == y_test) / len(y_test)
# Confusion matrix: rows are the true labels, columns the predictions.
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)  # argument order must be (y_true, y_pred)
s = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
s.set(xlabel='Prediction', ylabel='Actual')
plt.show()
# Score the classifier with the four standard binary-classification metrics.
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

acc = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'accuracy: {acc}')
print(f'recall: {recall}')
print(f'precision: {precision}')
print(f'f1 score: {f1}')
6. 모델 튜닝
from tqdm import tqdm

# Sweep k over 3..99 and record the accuracy at each value.
# NOTE(review): this tunes against the test set, which leaks information;
# a validation split or cross-validation would be preferable — confirm intent.
score = []
for k in tqdm(range(3, 100)):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train.values, y_train.values)
    y_pred = knn.predict(X_test.values)
    acc = accuracy_score(y_test, y_pred)
    score.append(acc)

plt.plot(range(3, 100), score)
plt.show()
# If the best accuracy occurred at k = n, retrain with n.
# score[i] holds the accuracy for k = i + 3 (the sweep started at k=3),
# so the best k is argmax + 3.  (The original "+ 4" was off by one.)
bestk = np.array(score).argmax() + 3
best_knn = KNeighborsClassifier(n_neighbors = bestk)
best_knn = best_knn.fit(X_train.values, y_train.values)