import pandas as pd
# Raw CSV of the housing dataset hosted in the handson-ml2 GitHub repository.
HANDS2_HOUSEPRICE = "https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/housing/housing.csv"

# Download the CSV and parse it into a DataFrame.
df_data = pd.read_csv(filepath_or_buffer=HANDS2_HOUSEPRICE)

# Peek at the first rows.
df_data.head()

# Summary statistics for every column, non-numeric ones included.
df_data.describe(include="all")

# Summary statistics for the median-income column alone.
df_data["median_income"].describe()

# Summary statistics using the default column selection.
df_data.describe()
# 실행 결과, 컬럼 longitude와 같이 mean과 50%(median) 간 차이가 작은 경우와 total_rooms, population과 같이 차이가 크게 나는 경우를 볼 수 있다.
# Density plot of the median-income column.
df_data["median_income"].plot.density()

# Cumulative sum of the incomes sorted ascending, drawn against a fresh
# 0..n-1 index so the x axis is the rank rather than the original row label.
(
    df_data["median_income"]
    .sort_values()
    .cumsum()
    .reset_index(drop=True)
    .plot()
)
from pandas import Series
def min_max_scaling(series: Series) -> Series:
    """Rescale *series* linearly so that its minimum maps to 0 and its maximum to 1.

    Args:
        series: Numeric values to rescale. The input is not modified.

    Returns:
        A new Series with the same index holding ``(x - min) / (max - min)``.
        When every value is identical the range is zero; the result is then
        all zeros instead of the NaN a division by zero would produce.
    """
    lowest = series.min()
    span = series.max() - lowest
    if span == 0:
        # Constant input: divide by 1 so the output is 0.0 rather than NaN.
        span = 1
    return (series - lowest) / span
from pandas import Series
def normalize(series: Series) -> Series:
    """Standardize *series* to zero mean and unit standard deviation (z-score).

    Args:
        series: Numeric values to standardize. The input is not modified.

    Returns:
        A new Series with the same index holding ``(x - mean) / std``, where
        ``std`` is ``Series.std()`` with its default settings. When the
        standard deviation is exactly zero (all values equal) the result is
        all zeros instead of the NaN a division by zero would produce.
    """
    centered = series - series.mean()
    spread = series.std()
    if spread == 0:
        # Constant input: divide by 1 so the output is 0.0 rather than NaN.
        spread = 1
    return centered / spread
import pandas as pd
# Number of intervals to split the median income into.
BIN = 5
# Pool of integer labels available for the intervals.
CAT = range(10)

# Discretize median income into BIN intervals.
# `pd.cut` takes the interval count through `bins` (`q` is a `pd.qcut` keyword)
# and requires exactly one label per interval, so only the first BIN labels
# from CAT are passed.
pd.cut(df_data['median_income'], bins=BIN, labels=list(CAT)[:BIN])

# Box plot of median income to inspect its spread and outliers.
df_data['median_income'].plot(kind='box')
from typing import Tuple
from pandas import Series
def include_outlier(
    series: Series, bounds: Tuple[float, float] = (0.1, 0.9)
) -> Series:
    """Keep every row but cap outliers at the given quantiles.

    Values below the lower quantile are raised to it and values above the
    upper quantile are lowered to it; everything in between is untouched.

    Args:
        series: Numeric values to cap. The input is not modified.
        bounds: ``(lower, upper)`` quantile levels, each a fraction in [0, 1].

    Returns:
        A new Series with the same index and length as *series*.
    """
    lower, upper = bounds
    floor, ceiling = series.quantile(lower), series.quantile(upper)
    # `clip` returns a new Series and handles the dtype change itself, unlike
    # assigning float quantiles into an integer Series through a boolean mask.
    return series.clip(lower=floor, upper=ceiling)
def include_outlier2(series: Series) -> Series:
    """Keep every row but cap outliers at the 1.5 * IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``. Values outside
    a fence are replaced by that fence value, so capped values never jump
    past in-range values and the ordering of the data is preserved.

    Args:
        series: Numeric values to cap. The input is not modified.

    Returns:
        A new Series with the same index and length as *series*.
    """
    q1, q3 = series.quantile(0.25), series.quantile(0.75)
    whisker = 1.5 * (q3 - q1)
    # Cap at the same thresholds used to detect the outliers, mirroring how
    # include_outlier caps at its own quantile bounds.
    return series.clip(lower=q1 - whisker, upper=q3 + whisker)
def drop_outlier(
    series: Series, bounds: Tuple[float, float] = (0.1, 0.9)
) -> Series:
    """Remove the rows whose value lies outside the given quantiles.

    Args:
        series: Numeric values to filter. The input is not modified.
        bounds: ``(lower, upper)`` quantile levels, each a fraction in [0, 1].

    Returns:
        A new Series holding only the values between the two quantiles
        (both ends inclusive), keeping their original index labels.
    """
    lower, upper = bounds
    start, end = series.quantile(lower), series.quantile(upper)
    # Boolean indexing already yields a new Series, so no defensive copy of
    # the input is needed.
    return series[series.between(start, end)]
def drop_outlier2(series: Series) -> Series:
    """Remove the rows whose value lies outside the 1.5 * IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``.

    Args:
        series: Numeric values to filter. The input is not modified.

    Returns:
        A new Series holding only the values between the two fences (both
        ends inclusive), keeping their original index labels.
    """
    q1, q3 = series.quantile(0.25), series.quantile(0.75)
    whisker = 1.5 * (q3 - q1)
    # Boolean indexing already yields a new Series, so no defensive copy of
    # the input is needed.
    return series[series.between(q1 - whisker, q3 + whisker)]
# 아웃라이어 처리 함수 적용
# Cap median income at its 10th/90th percentiles and summarise the result.
include_outlier(series=df_data["median_income"], bounds=(0.1, 0.9)).describe()

# Keep only the median-income rows lying between the 10th and 90th percentiles.
drop_outlier(series=df_data["median_income"], bounds=(0.1, 0.9))