# 오늘은 300제 문제를 풀면서
# 회귀 관련 문제들을 정리하겠습니다.
# First look at the raw frame. These bare expressions only display a result in
# an interactive / notebook session.
df.isna().sum()  # number of nulls per column
df.columns  # 26 columns according to the original note

# Columns that are not needed for the analysis. The original note left an
# unfilled placeholder here ("exclude the unnecessary ones"), which was not
# valid Python. An empty list drops nothing.
unused_columns = []  # TODO: list the real column names to drop
df.drop(columns=unused_columns, inplace=True)

# Replace the model year with the vehicle's age relative to the reference year.
reference_year = 2021
df['age'] = reference_year - df['year']
df.drop(columns='year', inplace=True)
df.columns  # the original note repeats "26" here; not verified after the drops
def _countplot_with_missing(frame, column, missing_label='n/a'):
    """Draw a frequency-ordered count plot of ``frame[column]``.

    Missing values are replaced by ``missing_label`` so they get their own bar.
    Only the plotted column is filled, instead of filling the whole frame twice.

    Returns the newly created matplotlib figure.
    """
    filled = frame[[column]].fillna(missing_label)
    figure = plt.figure(figsize=(8, 12))
    sns.countplot(
        x=column,
        data=filled,
        order=filled[column].value_counts().index,
    )
    return figure


# manufacturer: 43 distinct values according to the original note
df['manufacturer'].value_counts()
fig = _countplot_with_missing(df, 'manufacturer')

# vehicle model: 31520 distinct values according to the original note
df['model'].value_counts()
fig = _countplot_with_missing(df, 'model')

# condition
# NOTE(review): the original spelled this column 'codition'; assumed to be a
# typo for 'condition' - confirm against the dataset's column names.
df['condition'].value_counts()
fig = _countplot_with_missing(df, 'condition')

# cylinders
# NOTE(review): the original spelled this column 'cynlinders'; assumed to be a
# typo for 'cylinders' - confirm against the dataset's column names.
df['cylinders'].value_counts()
fig = _countplot_with_missing(df, 'cylinders')

# transmission
df['transmission'].value_counts()
fig = _countplot_with_missing(df, 'transmission')
# Each plot gets its own figure so it does not draw over the previous one.

# price: histogram, box plot and rug plot
plt.figure()
sns.histplot(x='price', data=df)
plt.figure()
sns.boxplot(x='price', data=df)  # was the non-existent `sns.bosplot`
plt.figure()
sns.rugplot(x='price', data=df, height=1)

# odometer
plt.figure()
sns.histplot(x='odometer', data=df)

# age: default binning, then 18 bins with a KDE curve overlaid
plt.figure()
sns.histplot(x='age', data=df)
plt.figure()
sns.histplot(x='age', data=df, bins=18, kde=True)
# Price by manufacturer.
# Original note: "not feasible for now" - presumably there are still too many
# manufacturer categories for a readable box plot at this point; TODO confirm.
# Only the manufacturer column is filled: df.fillna('n/a') would also write the
# string 'n/a' into any numeric column that contains NaN.
plt.figure()
sns.boxplot(
    x='manufacturer',
    y='price',
    data=df.assign(manufacturer=df['manufacturer'].fillna('n/a')),
)
df['manufacturer'].fillna('others').value_counts()
# Collapse rare manufacturers into a single 'others' category.
col = 'manufacturer'
counts = df[col].fillna('others').value_counts()  # most frequent first

# Plot the frequencies in rank order to judge where to cut off.
plt.figure()
plt.plot(range(len(counts)), counts)
plt.grid()

# Keep the n_categorical most frequent labels; every label past that rank
# becomes 'others'. Assumes the column holds string labels - TODO confirm.
n_categorical = 10
counts_index = counts.index[n_categorical:]
df[col] = df[col].mask(df[col].isin(counts_index), 'others')

# Assign the result back instead of calling fillna(inplace=True) on df[col]:
# the chained in-place form may act on a temporary copy and leave df unchanged.
df[col] = df[col].fillna('others')

# Merge the literal label 'other' into the same bucket.
df.loc[df[col] == 'other', col] = 'others'
# Price distribution before trimming.
fig = plt.figure(figsize=(8, 12))
sns.rugplot(x='price', data=df, height=1)

# Trim price outliers: keep rows strictly between the 10th and 99th percentile.
price_1 = df['price'].quantile(0.99)  # upper cutoff (drops the top 1%)
price_2 = df['price'].quantile(0.1)  # lower cutoff (drops the bottom 10%)
# Each comparison is parenthesised because `&` binds tighter than `<` / `>`.
df = df[(df['price'] < price_1) & (df['price'] > price_2)]
df.describe()

# Price by manufacturer on the trimmed data.
fig = plt.figure(figsize=(14, 5))
sns.boxplot(x='manufacturer', y='price', data=df)
# Correlation heat map. Original note: "check by absolute value" - the plotted
# coefficients are signed, so read their magnitude.
# Restrict to numeric columns first: on pandas >= 2.0, DataFrame.corr() raises
# when the frame still contains string columns.
plt.figure()
sns.heatmap(
    df.select_dtypes(include='number').corr(),
    annot=True,
    cmap='YlOrRd',
)
# Numeric features: standardise odometer and age.
# NOTE(review): the scaler is fitted on every row of df; if a train/test split
# happens later, fit it on the training rows only - confirm.
x_num = df[['odometer', 'age']]
scaler = StandardScaler()
x_scaled = pd.DataFrame(
    scaler.fit_transform(x_num),
    index=x_num.index,
    columns=x_num.columns,
)

# Categorical features: one-hot encode every remaining column except the target.
x_cat = df.drop(['price', 'odometer', 'age'], axis=1)  # was `pd.drop`
x_cat = pd.get_dummies(x_cat)

# Final design matrix and target.
x = pd.concat([x_scaled, x_cat], axis=1)
y = df['price']  # was `pd['price']`

x.head()
x.shape  # attribute, not a method: `x.shape()` raises TypeError
# Check for remaining NaNs. The original note suggests imputing them if needed,
# e.g. with 0.0 or with the column mean of 'age'.
x.isna().sum()
# Fit a gradient-boosted regressor and score it on the held-out set.
# NOTE(review): x_train / x_test / y_train / y_test are not defined in the
# visible code; presumably they come from a train/test split of x and y - confirm.
model = XGBRegressor()  # was assigned to the misspelled name `mdoel`
model.fit(x_train, y_train)
pred = model.predict(x_test)

mae = mean_absolute_error(y_test, pred)
print(mae)
# NOTE(review): this is the square root of the MAE, not an RMSE; if RMSE was
# intended, take the square root of the mean squared error instead.
print(sqrt(mae))
# Predicted vs. actual price; the red diagonal is the line pred == actual.
plt.figure()
plt.scatter(x=y_test, y=pred, alpha=0.005)
plt.plot([0, 60000], [0, 60000], 'r-')

# Same comparison as a 2-D histogram. histplot is a seaborn function;
# matplotlib.pyplot has no `histplot`.
plt.figure()
sns.histplot(x=y_test, y=pred)
plt.plot([0, 60000], [0, 60000], 'r-')

# Relative error in percent; values of 600% and above are left out of the
# histogram and the x axis is limited to +/-100%.
err = (pred - y_test) / y_test * 100
plt.figure()
plt.hist(err[err < 600], bins=12)
plt.xlabel('error (%)')
plt.xlim(-100, 100)
plt.grid()

# Absolute error in dollars. The original divided by y_test here as well,
# which plotted a unitless ratio under the 'error ($)' label.
err = pred - y_test
plt.figure()
plt.hist(err, bins=12)
plt.xlabel('error ($)')
plt.grid()