Loading Tabular Datasets from Text Files
# Jupyter shell escape: preview the first lines of the raw CSV file on disk.
!head data/iris.csv
pandas를 이용한 상위 5항목 시각화
import pandas as pd

# Read the iris CSV into a DataFrame and display its first five rows.
df = pd.read_csv('data/iris.csv')
df.head()
preprocessing
# Encode the string class labels as integers (setosa=0, versicolor=1, virginica=2).
d = {
    'Iris-setosa': 0,
    'Iris-versicolor': 1,
    'Iris-virginica': 2,
}

# Reload a fresh copy of the data, then map the Species column to its codes.
df = pd.read_csv('data/iris.csv')
df['Species'] = df['Species'].map(d)
시각화
#conda install mlxtend --channel conda-forge
#pip install mlxtend
%matplotlib inline
import matplotlib.pyplot as plt
from mlxtend.data import iris_data
from mlxtend.plotting import scatterplotmatrix
names = df.columns[1:5]
fig, axes = scatterplotmatrix(X[y==0], figsize=(10, 8), alpha=0.5)
fig, axes = scatterplotmatrix(X[y==1], fig_axes=(fig, axes), alpha=0.5)
fig, axes = scatterplotmatrix(X[y==2], fig_axes=(fig, axes), alpha=0.5, names=names)
plt.tight_layout()
plt.legend(labels=['Setosa', 'Versicolor', 'Virginica'])
plt.savefig('images/eda.pdf')
plt.show()
Splitting a Dataset into Train, Validation, and Test Subsets
# Stratified 64% train / 16% validation / 20% test split.
# FIX: np was used below but numpy was never imported in this file.
import numpy as np
from sklearn.model_selection import train_test_split

# First carve off the final 20% as the test set, stratified so each split
# keeps the original class proportions.
X_temp, X_test, y_temp, y_test = \
    train_test_split(X, y, test_size=0.2,
                     shuffle=True, random_state=123, stratify=y)
np.bincount(y_temp)

# Then split the remaining 80% again: test_size=0.2 of the remainder gives
# 16% of the full data for validation and 64% for training.
X_train, X_valid, y_train, y_valid = \
    train_test_split(X_temp, y_temp, test_size=0.2,
                     shuffle=True, random_state=123, stratify=y_temp)

print('Train size', X_train.shape, 'class proportions', np.bincount(y_train))
print('Valid size', X_valid.shape, 'class proportions', np.bincount(y_valid))
print('Test size', X_test.shape, 'class proportions', np.bincount(y_test))
분류 시각화
# Fit a 3-nearest-neighbor classifier on the two petal features and
# visualize its decision regions in that 2-D plane.
from sklearn.neighbors import KNeighborsClassifier
from mlxtend.plotting import plot_decision_regions

knn_model = KNeighborsClassifier(n_neighbors=3)

# Columns 2 and 3 of the feature matrix are the petal measurements;
# restricting to two features keeps the plot two-dimensional.
X_petal = X_train[:, 2:]
knn_model.fit(X_petal, y_train)

plot_decision_regions(X_petal, y_train, knn_model)
plt.xlabel('petal length[cm]')
plt.ylabel('petal width[cm]')
plt.savefig('images/decisionreg.pdf')
plt.show()
Scaling
Normalization -- Min-max scaling
- $x_{norm}^{[i]} = \dfrac{x^{[i]} - x_{min}}{x_{max} - x_{min}}$
Standardization
- $x_{std}^{[i]} = \dfrac{x^{[i]} - \mu_x}{\sigma_x}$
적용
# Standardize every split using statistics computed on the TRAINING set
# only, so no information leaks from validation/test into preprocessing.
mu = X_train.mean(axis=0)
sigma = X_train.std(axis=0)

X_train_std = (X_train - mu) / sigma
X_valid_std = (X_valid - mu) / sigma
X_test_std = (X_test - mu) / sigma
sklearn
# Same standardization via scikit-learn: fit the scaler on the training
# split only, then apply the identical transform to all three splits.
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
#scaler = MinMaxScaler()  # alternative: min-max scaling instead

scaler.fit(X_train)

X_train_std = scaler.transform(X_train)
X_valid_std = scaler.transform(X_valid)
X_test_std = scaler.transform(X_test)

X_test_std
categorical data
- 데이터를 다루기 위해서는 mapping과정이 필요하다.
- categorical data를 각각에 상응하는 numerical data로 변환한다.
- ex)
# Example: ordinal-encode a categorical 'size' column with an explicit
# value mapping (M < L < XXL).
mapping_dict = {
    'M': 2,
    'L': 3,
    'XXL': 5,
}
df['size'] = df['size'].map(mapping_dict)
or
# Alternative: let LabelEncoder assign integer codes 0..n-1 automatically
# (codes follow the sorted order of the unique label values).
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['classlabel'] = le.fit_transform(df['classlabel'])
Missing data
- 데이터 수집 비동의 등 여러가지 이유로 누락된 데이터를 제거한다.
# Handle missing data by dropping incomplete rows or columns.
# NOTE(review): neither result is assigned back to df (dropna returns a new
# DataFrame) — these calls only display the result; df itself is unchanged.
df.dropna(axis=0)
# drop rows that contain any missing value
df.dropna(axis=1)
# drop columns that contain any missing value