# Create the working directory
export ML_PATH="$HOME/ml"
mkdir -p "$ML_PATH"
# Check the pip version
pip --version
# Install/upgrade virtualenv for the current user
# (fix: original comment claimed this upgrades pip — it installs virtualenv)
python -m pip install --user -U virtualenv
# Create an isolated environment (abort if cd fails instead of
# creating the env in the wrong directory)
cd "$ML_PATH" || exit 1
virtualenv env
# Activate the virtual environment
source env/bin/activate
# Install the packages used below
pip install jupyter scikit-learn
# Register the environment's kernel with Jupyter
python -m ipykernel install --user --name=python3
# Launch Jupyter
jupyter notebook
import os
import tarfile
import urllib
import urllib.request

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Base URL of the handson-ml2 data repository and local dataset locations
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download housing.tgz from housing_url and extract it into housing_path."""
    # Fix: the original body had lost all indentation (notebook copy-paste)
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    # Requires 'import urllib.request' — 'import urllib' alone does not load the submodule
    urllib.request.urlretrieve(housing_url, tgz_path)
    # Context manager guarantees the archive is closed even if extraction raises
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)

def load_housing_data(housing_path=HOUSING_PATH):
    """Load housing.csv from housing_path into a pandas DataFrame."""
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)
housing = load_housing_data()
# Show the first 5 rows of the DataFrame
housing.head()
# Dtypes, non-null counts, memory usage — quick missing-value check
housing.info()
# Summary statistics for every numeric column
housing.describe()
# Category counts for the categorical feature
housing["ocean_proximity"].value_counts()
# Histogram of every numeric feature to inspect the distributions
housing.hist(bins=50, figsize=(20,15))
plt.show()
from sklearn.model_selection import train_test_split
# Random 80/20 split; random_state pins the shuffle for reproducibility
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
from sklearn.model_selection import StratifiedShuffleSplit

# Bucket median_income into 5 categories so the split can preserve
# the income distribution (stratified sampling)
housing["income_cat"] = pd.cut(housing["median_income"],
                               bins=[0, 1.5, 3, 4.5, 6, np.inf],
                               labels=[1, 2, 3, 4, 5])
# Fix: original passed 'ramdom_state' (typo) — the keyword is random_state
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_idx, test_idx in split.split(housing, housing["income_cat"]):
    train_set = housing.loc[train_idx]
    test_set = housing.loc[test_idx]
# Drop the helper column once the split is done
for set_ in (train_set, test_set):
    set_.drop("income_cat", axis=1, inplace=True)
# Geographic scatter plot: circle size ~ population, color ~ median house value
housing.plot(kind='scatter', x='longitude', y='latitude', alpha=0.4,
s=housing['population']/100, label='population', figsize=(10,7),
c='median_house_value', cmap=plt.get_cmap('jet'), colorbar=True,
sharex=False)
plt.legend()
plt.show()
# Pairwise correlation of the features
# NOTE(review): on pandas >= 2.0 this raises for the non-numeric
# 'ocean_proximity' column unless numeric_only=True — confirm pandas version
corr_matrix = housing.corr()
from pandas.plotting import scatter_matrix
attributes = ["median_house_value", "median_income", "total_rooms",
"housing_median_age"]
# Scatter plots for every pair of the selected attributes
scatter_matrix(housing[attributes], figsize=(12,8))
plt.show()
# [bad, average, good, excellent]처럼 순서가 있는 범주형 데이터는 순서 인코딩을 써도 괜찮다.
# fit()은 훈련 세트에만 적용해야 함
# 결측값 처리
# --- Handling missing values: three options ---
housing.dropna(subset=['total_bedrooms'])  # option 1: drop rows (returns a copy)
housing.drop('total_bedrooms', axis=1)     # option 2: drop the feature (returns a copy)
# Option 3: fill with the median
# Fix: 'median' was used without being defined — compute it first
median = housing['total_bedrooms'].median()
housing['total_bedrooms'].fillna(median, inplace=True)

# Scikit-Learn equivalent of option 3
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')  # replace missing values with the median
imputer.fit(housing_num)                    # learn the per-feature medians
imputer.statistics_                         # the learned statistic for each feature
X = imputer.transform(housing_num)          # apply the imputation
# --- Label (ordinal) encoding ---
from sklearn.preprocessing import OrdinalEncoder
ordinal_encoder = OrdinalEncoder()
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
# Fix: the fitted attribute is 'categories_' (trailing underscore), not 'categories'
ordinal_encoder.categories_  # list of categories per feature
# --- One-hot encoding ---
from sklearn.preprocessing import OneHotEncoder
cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot.toarray()  # output is a scipy sparse (CSR) matrix; densify to a NumPy array
cat_encoder.categories_
# 커스텀 변환기: fit(), transform(), fit_transform() 메소드를 구현한 파이썬 클래스를 만들면 됨
# fit_transform()은 TransformerMixin을 상속하면 자동으로 생성됨
# BaseEstimator 상속 시 하이퍼파라미터 튜닝에 필요한 get_params()와 set_params()를 얻음
from sklearn.base import BaseEstimator, TransformerMixin
# Column indices in the NumPy array produced from the housing DataFrame
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends derived ratio features to the input array.

    Always adds rooms_per_household and population_per_household;
    also adds bedrooms_per_room when add_bedrooms_per_room is True.
    """
    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        # Nothing to learn; return self so the transformer works in a Pipeline
        return self

    def transform(self, X):
        # Fix: original signature took 'x' while the body used 'X' (NameError)
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            # Fix: original tested 'self.add_bed_rooms_per_room' (typo) and
            # never appended bedrooms_per_room to the returned array
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
                         bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]
# Fix: keyword must match __init__'s parameter name 'add_bedrooms_per_room'
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)
# 파이프라인의 마지막 단계를 제외한 모든 단계는 fit_transform() 메소드를 보유한 변환기/추정기여야 함
# 파이프라인의 fit() 메소드 호출 시 모든 변환기의 fit_transform() 메소드를 순서대로 호출하며,
# 각 단계의 출력을 다음 단계의 입력으로 보냄. 마지막 추정기는 fit()만 수행
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
# Pipeline for the numeric features: impute -> add derived attributes -> standardize
num_pipeline = Pipeline([
('imputer', SimpleImputer(strategy='median')),
('attribs_adder', CombinedAttributesAdder()),
('std_scaler', StandardScaler())
])
housing_num_tr = num_pipeline.fit_transform(housing_num)
# ColumnTransformer applies each sub-pipeline to a specific subset of columns
# Fix: 'house_num' was a typo for 'housing_num', 'oneHotEncoder' for 'OneHotEncoder'
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs)
])
housing_prepared = full_pipeline.fit_transform(housing)
from sklearn.linear_model import LinearRegression
# Train a linear regression model on the prepared training data
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)
# New data must pass through the SAME fitted pipeline before predicting
X = full_pipeline.transform(data)
lin_reg.predict(X)
from sklearn.metrics import mean_squared_error
# RMSE of the model on the training set
preds = lin_reg.predict(housing_prepared)
# Fix: original passed undefined 'housing_predictions'; predictions are in 'preds'
lin_mse = mean_squared_error(housing_labels, preds)
lin_rmse = np.sqrt(lin_mse)
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation; sklearn scorers are "greater is better",
# so MSE comes back negated — negate again before taking the square root
scores = cross_val_score(lin_reg, housing_prepared, housing_labels,
scoring="neg_mean_squared_error", cv=10)
rmse_scores = np.sqrt(-scores)
import joblib
joblib.dump(my_model, "my_model.pkl")  # save the model to disk
my_model_loaded = joblib.load("my_model.pkl")  # load the model back
from sklearn.model_selection import GridSearchCV
# Fix: RandomForestRegressor was used below without being imported
from sklearn.ensemble import RandomForestRegressor

# Two grids are searched: 3*4 = 12 combos, plus 1*2*3 = 6 combos without bootstrap
param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}
]
forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(housing_prepared, housing_labels)
grid_search.best_params_     # best hyperparameter combination
# Fix: the attribute is 'best_estimator_' (singular), not 'best_estimators_'
grid_search.best_estimator_  # estimator refit with the best parameters
grid_search.cv_results_      # per-combination results
# 테스트 세트에는 fit_transform() 대신 transform()을 사용해야 함!!
from scipy import stats
# 95% confidence interval for the generalization RMSE via a t-distribution
confidence = .95
squared_errors = (final_predictions - y_test) ** 2
# Fix: the original was missing the closing parenthesis of np.sqrt(...)
np.sqrt(stats.t.interval(confidence, len(squared_errors) - 1,
                         loc=squared_errors.mean(),
                         scale=stats.sem(squared_errors)))
# 최종 모델은 predict() 메소드를 통해 예측 수행
# 참고: Hands-on Machine Learning with Scikit-Learn, Keras & Tensorflow 2 - 오렐리앙 제롱