Ex1
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Load the housing data and print its structure (columns, dtypes, non-null counts).
housing = pd.read_csv('housing.csv')
housing.info()
print()

# Look at the text column: its distinct values, then how often each one occurs.
proximity = housing['ocean_proximity']
print(proximity.unique())
print(proximity.value_counts(), end='\n\n')

# Summary statistics for the numeric columns.
print(housing.describe(), end='\n\n')

# Bucket median_income into five categories labelled 1-5; the last bucket is
# open-ended (upper edge is infinity). income_cat is used further down as the
# stratification key for the train/test split.
income_bin_edges = [0., 1.5, 3.0, 4.5, 6.0, np.inf]
housing['income_cat'] = pd.cut(
    housing['median_income'],
    bins=income_bin_edges,
    labels=[1, 2, 3, 4, 5],
)
print(housing.head(), end='\n\n')

# Number of rows that landed in each income category.
print(housing['income_cat'].value_counts(), end='\n\n')
from sklearn.model_selection import StratifiedShuffleSplit

# One 80/20 shuffle split stratified on income_cat; random_state pins the shuffle
# so repeated runs produce the same split.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) pair of index arrays.
train_idx, test_idx = next(split.split(housing, housing['income_cat']))

# split() yields positional row indices, so select with .iloc. Label-based .loc
# only gives the same rows while the frame has a default 0..n-1 index.
# .copy() makes each subset an independent frame, so the in-place column drop
# performed on them afterwards cannot affect (or warn about) `housing`.
strat_train_set = housing.iloc[train_idx].copy()
strat_test_set = housing.iloc[test_idx].copy()

# Print the share of each income category in the full data, the test set and the
# train set. Near-identical proportions across the three confirm that the
# stratified split preserved the income distribution (little sampling bias).
print('original data\n', housing['income_cat'].value_counts()/len(housing))
print()
print('test data\n', strat_test_set['income_cat'].value_counts()/len(strat_test_set))
print()
print('train data\n', strat_train_set['income_cat'].value_counts()/len(strat_train_set))
# income_cat was created only to drive the stratified split. Remove it from both
# the training and the test set so it is not carried along as a feature.
for subset in (strat_train_set, strat_test_set):
    subset.drop(columns='income_cat', inplace=True)

# Continue with a copy of the training set, so that changes made to `housing`
# leave strat_train_set itself untouched.
housing = strat_train_set.copy()
housing.info()
print()
# Let pandas print up to 20 columns before truncating the display.
pd.set_option('display.max_columns', 20)

# Separate the predictors from the target column (median_house_value).
housing = strat_train_set.drop(columns='median_house_value')
housing_labels = strat_train_set['median_house_value'].copy()

# First five rows that contain at least one missing value.
has_missing = housing.isnull().any(axis=1)
sample_incomplete_rows = housing[has_missing].head()
print(sample_incomplete_rows)

# Three ways of dealing with missing total_bedrooms values. Each result is only
# printed for comparison; `housing` itself is not modified here.
# Option 1: discard the rows where total_bedrooms is missing.
print(housing.dropna(subset=['total_bedrooms']), end='\n\n')

# Option 2: discard the total_bedrooms column altogether.
print(housing.drop(columns='total_bedrooms'), end='\n\n')

# Option 3: fill the gaps with the column median (shown on the sample rows).
bedrooms_median = housing['total_bedrooms'].median()
print(sample_incomplete_rows['total_bedrooms'].fillna(bedrooms_median), end='\n\n')
from sklearn.impute import SimpleImputer

# The median strategy is applied to the numeric columns only, so the text
# column ocean_proximity is set aside first.
imputer = SimpleImputer(strategy='median')
housing_num = housing.drop(columns='ocean_proximity')

# Fit the imputer on the numeric columns and fill the missing values in one step.
imputed_values = imputer.fit_transform(housing_num)

# Wrap the imputed values in a DataFrame that carries housing_num's column
# names and row index.
housing_tr = pd.DataFrame(
    imputed_values,
    columns=housing_num.columns,
    index=housing_num.index,
)
housing_tr.info()
print()
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

# Single-column frame holding the categorical feature.
housing_cat = housing[['ocean_proximity']]

# Ordinal encoding: each category becomes one numeric code. Print the first ten
# encoded rows and the category list the encoder learned.
ordinal_encoder = OrdinalEncoder()
ordinal_encoder.fit(housing_cat)
housing_cat_encoded = ordinal_encoder.transform(housing_cat)
print(housing_cat_encoded[:10])
print(ordinal_encoder.categories_, end='\n\n')

# One-hot encoding: one indicator column per category. The result is printed as
# returned by the encoder and then again as a dense array via toarray().
cat_encoder = OneHotEncoder()
housing_cat_onehot = cat_encoder.fit_transform(housing_cat)
print(housing_cat_onehot)
print(housing_cat_onehot.toarray())
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Numeric preprocessing: fill missing values with the median, then standardize.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler()),
]
num_pipeline = Pipeline(numeric_steps)
housing_num_tr = num_pipeline.fit_transform(housing_num)
print(housing_num_tr, end='\n\n')

# Column names routed to each branch of the combined transformer.
num_attribs = housing_num.columns.tolist()
print(num_attribs)
cat_attribs = ['ocean_proximity']

# Apply the numeric pipeline to the numeric columns and one-hot encoding to the
# categorical column, producing a single prepared feature matrix.
full_pipeline = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_attribs),
        ('cat', OneHotEncoder(), cat_attribs),
    ]
)
housing_prepared = full_pipeline.fit_transform(housing)
print(housing_prepared)

# First prepared row: the scaled numeric values followed by the one-hot columns.
print(housing_prepared[0, :])
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 longitude 20640 non-null float64
1 latitude 20640 non-null float64
2 housing_median_age 20640 non-null float64
3 total_rooms 20640 non-null float64
4 total_bedrooms 20433 non-null float64
5 population 20640 non-null float64
6 households 20640 non-null float64
7 median_income 20640 non-null float64
8 median_house_value 20640 non-null float64
9 ocean_proximity 20640 non-null object
dtypes: float64(9), object(1)
memory usage: 1.6+ MB
['NEAR BAY' '<1H OCEAN' 'INLAND' 'NEAR OCEAN' 'ISLAND']
ocean_proximity
<1H OCEAN 9136
INLAND 6551
NEAR OCEAN 2658
NEAR BAY 2290
ISLAND 5
Name: count, dtype: int64
longitude latitude ... median_income median_house_value
count 20640.000000 20640.000000 ... 20640.000000 20640.000000
mean -119.569704 35.631861 ... 3.870671 206855.816909
std 2.003532 2.135952 ... 1.899822 115395.615874
min -124.350000 32.540000 ... 0.499900 14999.000000
25% -121.800000 33.930000 ... 2.563400 119600.000000
50% -118.490000 34.260000 ... 3.534800 179700.000000
75% -118.010000 37.710000 ... 4.743250 264725.000000
max -114.310000 41.950000 ... 15.000100 500001.000000
[8 rows x 9 columns]
longitude latitude ... ocean_proximity income_cat
0 -122.23 37.88 ... NEAR BAY 5
1 -122.22 37.86 ... NEAR BAY 5
2 -122.24 37.85 ... NEAR BAY 5
3 -122.25 37.85 ... NEAR BAY 4
4 -122.25 37.85 ... NEAR BAY 3
[5 rows x 11 columns]
income_cat
3 7236
2 6581
4 3639
5 2362
1 822
Name: count, dtype: int64
original data
income_cat
3 0.350581
2 0.318847
4 0.176308
5 0.114438
1 0.039826
Name: count, dtype: float64
test data
income_cat
3 0.350533
2 0.318798
4 0.176357
5 0.114341
1 0.039971
Name: count, dtype: float64
train data
income_cat
3 0.350594
2 0.318859
4 0.176296
5 0.114462
1 0.039789
Name: count, dtype: float64
<class 'pandas.core.frame.DataFrame'>
Index: 16512 entries, 12655 to 19773
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 longitude 16512 non-null float64
1 latitude 16512 non-null float64
2 housing_median_age 16512 non-null float64
3 total_rooms 16512 non-null float64
4 total_bedrooms 16354 non-null float64
5 population 16512 non-null float64
6 households 16512 non-null float64
7 median_income 16512 non-null float64
8 median_house_value 16512 non-null float64
9 ocean_proximity 16512 non-null object
dtypes: float64(9), object(1)
memory usage: 1.4+ MB
longitude latitude housing_median_age total_rooms total_bedrooms \
1606 -122.08 37.88 26.0 2947.0 NaN
10915 -117.87 33.73 45.0 2264.0 NaN
19150 -122.70 38.35 14.0 2313.0 NaN
4186 -118.23 34.13 48.0 1308.0 NaN
16885 -122.40 37.58 26.0 3281.0 NaN
population households median_income ocean_proximity
1606 825.0 626.0 2.9330 NEAR BAY
10915 1970.0 499.0 3.4193 <1H OCEAN
19150 954.0 397.0 3.7813 <1H OCEAN
4186 835.0 294.0 4.2891 <1H OCEAN
16885 1145.0 480.0 6.3580 NEAR OCEAN
longitude latitude housing_median_age total_rooms total_bedrooms \
12655 -121.46 38.52 29.0 3873.0 797.0
15502 -117.23 33.09 7.0 5320.0 855.0
2908 -119.04 35.37 44.0 1618.0 310.0
14053 -117.13 32.75 24.0 1877.0 519.0
20496 -118.70 34.28 27.0 3536.0 646.0
... ... ... ... ... ...
15174 -117.07 33.03 14.0 6665.0 1231.0
12661 -121.42 38.51 15.0 7901.0 1422.0
19263 -122.72 38.44 48.0 707.0 166.0
19140 -122.70 38.31 14.0 3155.0 580.0
19773 -122.14 39.97 27.0 1079.0 222.0
population households median_income ocean_proximity
12655 2237.0 706.0 2.1736 INLAND
15502 2015.0 768.0 6.3373 NEAR OCEAN
2908 667.0 300.0 2.8750 INLAND
14053 898.0 483.0 2.2264 NEAR OCEAN
20496 1837.0 580.0 4.4964 <1H OCEAN
... ... ... ... ...
15174 2026.0 1001.0 5.0900 <1H OCEAN
12661 4769.0 1418.0 2.8139 INLAND
19263 458.0 172.0 3.1797 <1H OCEAN
19140 1208.0 501.0 4.1964 <1H OCEAN
19773 625.0 197.0 3.1319 INLAND
[16354 rows x 9 columns]
longitude latitude housing_median_age total_rooms population \
12655 -121.46 38.52 29.0 3873.0 2237.0
15502 -117.23 33.09 7.0 5320.0 2015.0
2908 -119.04 35.37 44.0 1618.0 667.0
14053 -117.13 32.75 24.0 1877.0 898.0
20496 -118.70 34.28 27.0 3536.0 1837.0
... ... ... ... ... ...
15174 -117.07 33.03 14.0 6665.0 2026.0
12661 -121.42 38.51 15.0 7901.0 4769.0
19263 -122.72 38.44 48.0 707.0 458.0
19140 -122.70 38.31 14.0 3155.0 1208.0
19773 -122.14 39.97 27.0 1079.0 625.0
households median_income ocean_proximity
12655 706.0 2.1736 INLAND
15502 768.0 6.3373 NEAR OCEAN
2908 300.0 2.8750 INLAND
14053 483.0 2.2264 NEAR OCEAN
20496 580.0 4.4964 <1H OCEAN
... ... ... ...
15174 1001.0 5.0900 <1H OCEAN
12661 1418.0 2.8139 INLAND
19263 172.0 3.1797 <1H OCEAN
19140 501.0 4.1964 <1H OCEAN
19773 197.0 3.1319 INLAND
[16512 rows x 8 columns]
1606 433.0
10915 433.0
19150 433.0
4186 433.0
16885 433.0
Name: total_bedrooms, dtype: float64
<class 'pandas.core.frame.DataFrame'>
Index: 16512 entries, 12655 to 19773
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 longitude 16512 non-null float64
1 latitude 16512 non-null float64
2 housing_median_age 16512 non-null float64
3 total_rooms 16512 non-null float64
4 total_bedrooms 16512 non-null float64
5 population 16512 non-null float64
6 households 16512 non-null float64
7 median_income 16512 non-null float64
dtypes: float64(8)
memory usage: 1.1 MB
[[1.]
[4.]
[1.]
[4.]
[0.]
[3.]
[0.]
[0.]
[0.]
[0.]]
[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
dtype=object)]
(0, 1) 1.0
(1, 4) 1.0
(2, 1) 1.0
(3, 4) 1.0
(4, 0) 1.0
(5, 3) 1.0
(6, 0) 1.0
(7, 0) 1.0
(8, 0) 1.0
(9, 0) 1.0
(10, 1) 1.0
(11, 0) 1.0
(12, 1) 1.0
(13, 1) 1.0
(14, 4) 1.0
(15, 0) 1.0
(16, 0) 1.0
(17, 0) 1.0
(18, 3) 1.0
(19, 0) 1.0
(20, 1) 1.0
(21, 3) 1.0
(22, 1) 1.0
(23, 0) 1.0
(24, 1) 1.0
: :
(16487, 1) 1.0
(16488, 0) 1.0
(16489, 4) 1.0
(16490, 4) 1.0
(16491, 1) 1.0
(16492, 1) 1.0
(16493, 0) 1.0
(16494, 0) 1.0
(16495, 0) 1.0
(16496, 1) 1.0
(16497, 0) 1.0
(16498, 4) 1.0
(16499, 0) 1.0
(16500, 0) 1.0
(16501, 1) 1.0
(16502, 1) 1.0
(16503, 1) 1.0
(16504, 1) 1.0
(16505, 0) 1.0
(16506, 0) 1.0
(16507, 0) 1.0
(16508, 1) 1.0
(16509, 0) 1.0
(16510, 0) 1.0
(16511, 1) 1.0
[[0. 1. 0. 0. 0.]
[0. 0. 0. 0. 1.]
[0. 1. 0. 0. 0.]
...
[1. 0. 0. 0. 0.]
[1. 0. 0. 0. 0.]
[0. 1. 0. 0. 0.]]
[[-0.94135046 1.34743822 0.02756357 ... 0.73260236 0.55628602
-0.8936472 ]
[ 1.17178212 -1.19243966 -1.72201763 ... 0.53361152 0.72131799
1.292168 ]
[ 0.26758118 -0.1259716 1.22045984 ... -0.67467519 -0.52440722
-0.52543365]
...
[-1.5707942 1.31001828 1.53856552 ... -0.86201341 -0.86511838
-0.36547546]
[-1.56080303 1.2492109 -1.1653327 ... -0.18974707 0.01061579
0.16826095]
[-1.28105026 2.02567448 -0.13148926 ... -0.71232211 -0.79857323
-0.390569 ]]
['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income']
[[-0.94135046 1.34743822 0.02756357 ... 0. 0.
0. ]
[ 1.17178212 -1.19243966 -1.72201763 ... 0. 0.
1. ]
[ 0.26758118 -0.1259716 1.22045984 ... 0. 0.
0. ]
...
[-1.5707942 1.31001828 1.53856552 ... 0. 0.
0. ]
[-1.56080303 1.2492109 -1.1653327 ... 0. 0.
0. ]
[-1.28105026 2.02567448 -0.13148926 ... 0. 0.
0. ]]
[-0.94135046 1.34743822 0.02756357 0.58477745 0.64037127 0.73260236
0.55628602 -0.8936472 0. 1. 0. 0.
0. ]