딥러닝(비지도학습)

짬그브·2025년 4월 2일

Ex1

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the housing dataset and print its structure (column dtypes, non-null counts).
housing = pd.read_csv('housing.csv')
housing.info()
print()

# Inspect the categorical column: distinct values first, then how often each occurs.
proximity = housing['ocean_proximity']
print(proximity.unique())
print(proximity.value_counts())
print()

# Summary statistics (count, mean, std, quartiles, ...) of the numeric columns.
numeric_summary = housing.describe()
print(numeric_summary)
print()
# Optional histogram of every numeric column; left disabled.
# housing.hist(bins=50, figsize=(20,15))
# plt.tight_layout()
# plt.show()

# Bucket the continuous median_income column into five labelled categories.
# With pd.cut's default right-closed intervals the buckets are
# (0, 1.5] -> 1, (1.5, 3.0] -> 2, (3.0, 4.5] -> 3, (4.5, 6.0] -> 4, (6.0, inf] -> 5.
income_bin_edges = [0., 1.5, 3.0, 4.5, 6.0, np.inf]
income_bin_labels = [1, 2, 3, 4, 5]
housing['income_cat'] = pd.cut(
    housing['median_income'],
    bins=income_bin_edges,
    labels=income_bin_labels,
)

print(housing.head())
print()
# Number of rows that fall into each income category.
income_counts = housing['income_cat'].value_counts()
print(income_counts)
# Optional histogram of the new category column; left disabled.
# housing['income_cat'].hist()
# plt.show()
print()

from sklearn.model_selection import StratifiedShuffleSplit

# n_splits=1     : produce a single train/test pair.
# test_size=0.2  : 20% of the rows go to the test set, the remaining 80% to training.
# random_state=42: fixed seed so every run yields the same split.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# split() yields *positional* row indices, so rows are selected with .iloc
# rather than the label-based .loc; this stays correct even when the frame's
# index is not the default 0..n-1 range. Only one split is requested, so the
# first (and only) pair is taken directly instead of looping.
train_idx, test_idx = next(split.split(housing, housing['income_cat']))

# Explicit copies make both subsets independent DataFrames, so later in-place
# modifications of them do not act on a selection of `housing`.
strat_train_set = housing.iloc[train_idx].copy()
strat_test_set = housing.iloc[test_idx].copy()

# Sanity check: print the share of each income category in the full dataset,
# the test set and the training set. Near-identical proportions across the
# three show that the stratified split preserved the income distribution
# (i.e. sampling bias was kept to a minimum).
print('original data\n', housing['income_cat'].value_counts()/len(housing))
print()
print('test data\n', strat_test_set['income_cat'].value_counts()/len(strat_test_set))
print()
print('train data\n', strat_train_set['income_cat'].value_counts()/len(strat_train_set))

# income_cat was created only to drive the stratified split; remove it from
# both subsets so it is not carried along as a feature.
for subset in (strat_train_set, strat_test_set):
    subset.drop(columns='income_cat', inplace=True)

# Work on a copy of the training set so the split result itself is not
# modified by the exploration/preprocessing steps that follow.
housing = strat_train_set.copy()
housing.info()
print()

# Show up to 20 columns when printing a DataFrame (useful for wide frames).
pd.set_option('display.max_columns', 20)

# Separate the target (median_house_value) from the predictors.
# .copy() detaches the label Series from the training frame.
housing_labels = strat_train_set['median_house_value'].copy()
housing = strat_train_set.drop(columns='median_house_value')

# First few rows that contain at least one missing value.
rows_with_nan = housing.isnull().any(axis=1)
sample_incomplete_rows = housing[rows_with_nan].head()
print(sample_incomplete_rows)

# Option 1: drop the rows whose total_bedrooms is missing (few NaNs).
print(housing.dropna(subset=['total_bedrooms']))
print()
# Option 2: drop the total_bedrooms column altogether (many NaNs).
print(housing.drop(columns='total_bedrooms'))
print()
# Option 3: fill the gaps with the column median (moderate number of NaNs).
bedrooms_median = housing['total_bedrooms'].median()
print(sample_incomplete_rows['total_bedrooms'].fillna(bedrooms_median))
print()

from sklearn.impute import SimpleImputer

# The median strategy is applied to the numeric columns only, so the text
# column ocean_proximity is set aside first.
housing_num = housing.drop(columns='ocean_proximity')

# Learn each column's median, then fill the missing values with it.
imputer = SimpleImputer(strategy='median')
imputer.fit(housing_num)
imputed_values = imputer.transform(housing_num)

# Re-attach the original column names and row index to the imputed values.
housing_tr = pd.DataFrame(
    imputed_values,
    index=housing_num.index,
    columns=housing_num.columns,
)

housing_tr.info()
print()

# Double brackets keep the selection a one-column DataFrame rather than a Series.
housing_cat = housing[['ocean_proximity']]

from sklearn.preprocessing import OrdinalEncoder

# Map each category to an integer code; categories_ lists the categories
# in the order of the codes assigned to them.
ordinal_encoder = OrdinalEncoder()
ordinal_encoder.fit(housing_cat)
housing_cat_encoded = ordinal_encoder.transform(housing_cat)

first_ten_codes = housing_cat_encoded[:10]
print(first_ten_codes)
print(ordinal_encoder.categories_)
print()

from sklearn.preprocessing import OneHotEncoder

# One-hot encode the category column: one binary column per category.
cat_encoder = OneHotEncoder()
housing_cat_onehot = cat_encoder.fit_transform(housing_cat)

# Print the encoder output as returned, then its dense array form.
print(housing_cat_onehot)
dense_onehot = housing_cat_onehot.toarray()
print(dense_onehot)

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Numeric preprocessing chain: fill missing values with the column median,
# then standardize each column.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)

housing_num_tr = num_pipeline.fit_transform(housing_num)
print(housing_num_tr)
print()

from sklearn.compose import ColumnTransformer

# Column names routed to each branch of the combined transformer.
num_attribs = housing_num.columns.tolist()
print(num_attribs)
cat_attribs = ['ocean_proximity']

# Numeric columns go through num_pipeline; the category column is one-hot
# encoded. The outputs of both branches are concatenated column-wise.
full_pipeline = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_attribs),
        ('cat', OneHotEncoder(), cat_attribs),
    ]
)

housing_prepared = full_pipeline.fit_transform(housing)
print(housing_prepared)
# First prepared row: the numeric features followed by the one-hot columns.
first_row = housing_prepared[0, :]
print(first_row)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB

['NEAR BAY' '<1H OCEAN' 'INLAND' 'NEAR OCEAN' 'ISLAND']
ocean_proximity
<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: count, dtype: int64

          longitude      latitude  ...  median_income  median_house_value
count  20640.000000  20640.000000  ...   20640.000000        20640.000000
mean    -119.569704     35.631861  ...       3.870671       206855.816909
std        2.003532      2.135952  ...       1.899822       115395.615874
min     -124.350000     32.540000  ...       0.499900        14999.000000
25%     -121.800000     33.930000  ...       2.563400       119600.000000
50%     -118.490000     34.260000  ...       3.534800       179700.000000
75%     -118.010000     37.710000  ...       4.743250       264725.000000
max     -114.310000     41.950000  ...      15.000100       500001.000000

[8 rows x 9 columns]

   longitude  latitude  ...  ocean_proximity  income_cat
0    -122.23     37.88  ...         NEAR BAY           5
1    -122.22     37.86  ...         NEAR BAY           5
2    -122.24     37.85  ...         NEAR BAY           5
3    -122.25     37.85  ...         NEAR BAY           4
4    -122.25     37.85  ...         NEAR BAY           3

[5 rows x 11 columns]

income_cat
3    7236
2    6581
4    3639
5    2362
1     822
Name: count, dtype: int64

original data
 income_cat
3    0.350581
2    0.318847
4    0.176308
5    0.114438
1    0.039826
Name: count, dtype: float64

test data
 income_cat
3    0.350533
2    0.318798
4    0.176357
5    0.114341
1    0.039971
Name: count, dtype: float64

train data
 income_cat
3    0.350594
2    0.318859
4    0.176296
5    0.114462
1    0.039789
Name: count, dtype: float64
<class 'pandas.core.frame.DataFrame'>
Index: 16512 entries, 12655 to 19773
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           16512 non-null  float64
 1   latitude            16512 non-null  float64
 2   housing_median_age  16512 non-null  float64
 3   total_rooms         16512 non-null  float64
 4   total_bedrooms      16354 non-null  float64
 5   population          16512 non-null  float64
 6   households          16512 non-null  float64
 7   median_income       16512 non-null  float64
 8   median_house_value  16512 non-null  float64
 9   ocean_proximity     16512 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.4+ MB

       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
1606     -122.08     37.88                26.0       2947.0             NaN   
10915    -117.87     33.73                45.0       2264.0             NaN   
19150    -122.70     38.35                14.0       2313.0             NaN   
4186     -118.23     34.13                48.0       1308.0             NaN   
16885    -122.40     37.58                26.0       3281.0             NaN   

       population  households  median_income ocean_proximity  
1606        825.0       626.0         2.9330        NEAR BAY  
10915      1970.0       499.0         3.4193       <1H OCEAN  
19150       954.0       397.0         3.7813       <1H OCEAN  
4186        835.0       294.0         4.2891       <1H OCEAN  
16885      1145.0       480.0         6.3580      NEAR OCEAN  
       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
12655    -121.46     38.52                29.0       3873.0           797.0   
15502    -117.23     33.09                 7.0       5320.0           855.0   
2908     -119.04     35.37                44.0       1618.0           310.0   
14053    -117.13     32.75                24.0       1877.0           519.0   
20496    -118.70     34.28                27.0       3536.0           646.0   
...          ...       ...                 ...          ...             ...   
15174    -117.07     33.03                14.0       6665.0          1231.0   
12661    -121.42     38.51                15.0       7901.0          1422.0   
19263    -122.72     38.44                48.0        707.0           166.0   
19140    -122.70     38.31                14.0       3155.0           580.0   
19773    -122.14     39.97                27.0       1079.0           222.0   

       population  households  median_income ocean_proximity  
12655      2237.0       706.0         2.1736          INLAND  
15502      2015.0       768.0         6.3373      NEAR OCEAN  
2908        667.0       300.0         2.8750          INLAND  
14053       898.0       483.0         2.2264      NEAR OCEAN  
20496      1837.0       580.0         4.4964       <1H OCEAN  
...           ...         ...            ...             ...  
15174      2026.0      1001.0         5.0900       <1H OCEAN  
12661      4769.0      1418.0         2.8139          INLAND  
19263       458.0       172.0         3.1797       <1H OCEAN  
19140      1208.0       501.0         4.1964       <1H OCEAN  
19773       625.0       197.0         3.1319          INLAND  

[16354 rows x 9 columns]

       longitude  latitude  housing_median_age  total_rooms  population  \
12655    -121.46     38.52                29.0       3873.0      2237.0   
15502    -117.23     33.09                 7.0       5320.0      2015.0   
2908     -119.04     35.37                44.0       1618.0       667.0   
14053    -117.13     32.75                24.0       1877.0       898.0   
20496    -118.70     34.28                27.0       3536.0      1837.0   
...          ...       ...                 ...          ...         ...   
15174    -117.07     33.03                14.0       6665.0      2026.0   
12661    -121.42     38.51                15.0       7901.0      4769.0   
19263    -122.72     38.44                48.0        707.0       458.0   
19140    -122.70     38.31                14.0       3155.0      1208.0   
19773    -122.14     39.97                27.0       1079.0       625.0   

       households  median_income ocean_proximity  
12655       706.0         2.1736          INLAND  
15502       768.0         6.3373      NEAR OCEAN  
2908        300.0         2.8750          INLAND  
14053       483.0         2.2264      NEAR OCEAN  
20496       580.0         4.4964       <1H OCEAN  
...           ...            ...             ...  
15174      1001.0         5.0900       <1H OCEAN  
12661      1418.0         2.8139          INLAND  
19263       172.0         3.1797       <1H OCEAN  
19140       501.0         4.1964       <1H OCEAN  
19773       197.0         3.1319          INLAND  

[16512 rows x 8 columns]

1606     433.0
10915    433.0
19150    433.0
4186     433.0
16885    433.0
Name: total_bedrooms, dtype: float64

<class 'pandas.core.frame.DataFrame'>
Index: 16512 entries, 12655 to 19773
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           16512 non-null  float64
 1   latitude            16512 non-null  float64
 2   housing_median_age  16512 non-null  float64
 3   total_rooms         16512 non-null  float64
 4   total_bedrooms      16512 non-null  float64
 5   population          16512 non-null  float64
 6   households          16512 non-null  float64
 7   median_income       16512 non-null  float64
dtypes: float64(8)
memory usage: 1.1 MB

[[1.]
 [4.]
 [1.]
 [4.]
 [0.]
 [3.]
 [0.]
 [0.]
 [0.]
 [0.]]
[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
      dtype=object)]

  (0, 1)	1.0
  (1, 4)	1.0
  (2, 1)	1.0
  (3, 4)	1.0
  (4, 0)	1.0
  (5, 3)	1.0
  (6, 0)	1.0
  (7, 0)	1.0
  (8, 0)	1.0
  (9, 0)	1.0
  (10, 1)	1.0
  (11, 0)	1.0
  (12, 1)	1.0
  (13, 1)	1.0
  (14, 4)	1.0
  (15, 0)	1.0
  (16, 0)	1.0
  (17, 0)	1.0
  (18, 3)	1.0
  (19, 0)	1.0
  (20, 1)	1.0
  (21, 3)	1.0
  (22, 1)	1.0
  (23, 0)	1.0
  (24, 1)	1.0
  :	:
  (16487, 1)	1.0
  (16488, 0)	1.0
  (16489, 4)	1.0
  (16490, 4)	1.0
  (16491, 1)	1.0
  (16492, 1)	1.0
  (16493, 0)	1.0
  (16494, 0)	1.0
  (16495, 0)	1.0
  (16496, 1)	1.0
  (16497, 0)	1.0
  (16498, 4)	1.0
  (16499, 0)	1.0
  (16500, 0)	1.0
  (16501, 1)	1.0
  (16502, 1)	1.0
  (16503, 1)	1.0
  (16504, 1)	1.0
  (16505, 0)	1.0
  (16506, 0)	1.0
  (16507, 0)	1.0
  (16508, 1)	1.0
  (16509, 0)	1.0
  (16510, 0)	1.0
  (16511, 1)	1.0
[[0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1.]
 [0. 1. 0. 0. 0.]
 ...
 [1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]]
[[-0.94135046  1.34743822  0.02756357 ...  0.73260236  0.55628602
  -0.8936472 ]
 [ 1.17178212 -1.19243966 -1.72201763 ...  0.53361152  0.72131799
   1.292168  ]
 [ 0.26758118 -0.1259716   1.22045984 ... -0.67467519 -0.52440722
  -0.52543365]
 ...
 [-1.5707942   1.31001828  1.53856552 ... -0.86201341 -0.86511838
  -0.36547546]
 [-1.56080303  1.2492109  -1.1653327  ... -0.18974707  0.01061579
   0.16826095]
 [-1.28105026  2.02567448 -0.13148926 ... -0.71232211 -0.79857323
  -0.390569  ]]

['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income']
[[-0.94135046  1.34743822  0.02756357 ...  0.          0.
   0.        ]
 [ 1.17178212 -1.19243966 -1.72201763 ...  0.          0.
   1.        ]
 [ 0.26758118 -0.1259716   1.22045984 ...  0.          0.
   0.        ]
 ...
 [-1.5707942   1.31001828  1.53856552 ...  0.          0.
   0.        ]
 [-1.56080303  1.2492109  -1.1653327  ...  0.          0.
   0.        ]
 [-1.28105026  2.02567448 -0.13148926 ...  0.          0.
   0.        ]]
[-0.94135046  1.34743822  0.02756357  0.58477745  0.64037127  0.73260236
  0.55628602 -0.8936472   0.          1.          0.          0.
  0.        ]

짬그브

+AI to AI+

이전 포스트

파이토치 (RNN 예제)

다음 포스트

딥러닝(비지도학습)

Ex1

파이토치 (RNN 예제)

딥러닝(차원축소)

0개의 댓글