- feature engineering에서의 feature extraction
- 기존 feature에 기반하여 새로운 feature 생성
📜 과정
◽ 데이터 로드
** 타이타닉 데이터셋 다운로드
import numpy as np
import pandas as pd
# Directory holding the Titanic CSV files, relative to the working directory.
DATA_PATH = '../data/titanic/'
# Read the training CSV, then lower-case every column label in one vectorised call.
df = pd.read_csv(DATA_PATH + 'train.csv')
df.columns = df.columns.str.lower()
◽ 데이터 확인
# Print each column's dtype and non-null count for the raw frame.
df.info()
# Summary statistics for every column, numeric and non-numeric alike.
df.describe(include="all")
# Preview the first rows.
df.head()
◽ 데이터 분리
from sklearn.model_selection import train_test_split
# Fixed seed so the split is reproducible.
SEED = 0
# Hold out 20% of the rows. The whole frame is split, so both parts keep
# every column (the 'survived' label included).
x_train, x_test = train_test_split(df, test_size=0.2, random_state=SEED)
# Replace the row labels inherited from df with a fresh 0..n-1 index.
x_train = x_train.reset_index(drop=True)
x_test = x_test.reset_index(drop=True)
print(x_train.shape, x_test.shape)
(712, 12) (179, 12)
◽ Data Cleaning
필요없는 데이터 제거
# Compare the number of distinct ids with the row count; when the two match,
# the column is a pure row identifier.
print(x_train['passengerid'].nunique(), x_train.shape[0])
# Remove the identifier column from both splits in place.
for frame in (x_train, x_test):
    frame.drop(columns='passengerid', inplace=True)
결측치 처리 (제거 및 대체)
# Count missing values per column, most incomplete column first.
print(x_train.isnull().sum().sort_values(ascending=False))

# Drop 'cabin' from both splits instead of imputing it.
x_train = x_train.drop('cabin', axis=1)
x_test = x_test.drop('cabin', axis=1)

# Impute 'age' with the training median. The statistic is computed once,
# before either split is filled, so the test split receives a value derived
# from the untouched training column and the code mirrors the 'embarked'
# handling below.
age_median = x_train['age'].median()
x_train['age'] = x_train['age'].fillna(age_median)
x_test['age'] = x_test['age'].fillna(age_median)

# Impute 'embarked' with the most frequent training value.
embarked_mode = x_train['embarked'].mode().values[0]
x_train['embarked'] = x_train['embarked'].fillna(embarked_mode)
x_test['embarked'] = x_test['embarked'].fillna(embarked_mode)

# Confirm that no column reports missing values any more.
x_train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 712 entries, 0 to 711
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 survived 712 non-null int64
1 pclass 712 non-null int64
2 name 712 non-null object
3 sex 712 non-null object
4 age 712 non-null float64
5 sibsp 712 non-null int64
6 parch 712 non-null int64
7 ticket 712 non-null object
8 fare 712 non-null float64
9 embarked 712 non-null object
dtypes: float64(2), int64(4), object(4)
memory usage: 55.8+ KB
수치형 데이터 타입 변환
# Render every float with exactly three decimals in displayed frames.
pd.set_option('display.float_format', '{:.3f}'.format)
# Keep only the numeric columns. NOTE: this summarises the full frame `df`,
# not the training split.
df_number = df.select_dtypes(include=np.number)
df_number.describe()
| passengerid | survived | pclass | age | sibsp | parch | fare |
---|
count | 891.000 | 891.000 | 891.000 | 714.000 | 891.000 | 891.000 | 891.000 |
mean | 446.000 | 0.384 | 2.309 | 29.699 | 0.523 | 0.382 | 32.204 |
std | 257.354 | 0.487 | 0.836 | 14.526 | 1.103 | 0.806 | 49.693 |
min | 1.000 | 0.000 | 1.000 | 0.420 | 0.000 | 0.000 | 0.000 |
25% | 223.500 | 0.000 | 2.000 | 20.125 | 0.000 | 0.000 | 7.910 |
50% | 446.000 | 0.000 | 3.000 | 28.000 | 0.000 | 0.000 | 14.454 |
75% | 668.500 | 1.000 | 3.000 | 38.000 | 1.000 | 0.000 | 31.000 |
max | 891.000 | 1.000 | 3.000 | 80.000 | 8.000 | 6.000 | 512.329 |
# Target dtype for each numeric column; applied to both splits in the order listed.
# 'age' is cast to an integer type, so any fractional part is discarded.
# 'category' is requested per frame, so each split derives its own categories.
dtype_by_column = {
    'survived': 'int32',
    'age': 'int32',
    'fare': 'float32',
    'pclass': 'category',
    'sibsp': 'category',
    'parch': 'category',
}
for column, dtype in dtype_by_column.items():
    x_train[column] = x_train[column].astype(dtype)
    x_test[column] = x_test[column].astype(dtype)
범주형 데이터 타입 변환
# Collect the training columns stored with the generic object dtype and summarise them
# (count / unique / top / freq).
df_object = x_train.select_dtypes(include='object')
df_object.describe()
| name | sex | ticket | embarked |
---|
count | 712 | 712 | 712 | 712 |
unique | 712 | 2 | 569 | 3 |
top | Boulos, Mrs. Joseph (Sultana) | male | CA. 2343 | S |
freq | 1 | 465 | 7 | 519 |
# Low-cardinality text columns become pandas categoricals in both splits.
for column in ('sex', 'embarked'):
    x_train[column] = x_train[column].astype('category')
    x_test[column] = x_test[column].astype('category')
중간 점검 (데이터 형식)
# Print the dtypes and non-null counts of the training split to check the conversions so far.
x_train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 712 entries, 0 to 711
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 passengerid 712 non-null int64
1 survived 712 non-null int32
2 pclass 712 non-null category
3 name 712 non-null object
4 sex 712 non-null category
5 age 712 non-null int32
6 sibsp 712 non-null category
7 parch 712 non-null category
8 ticket 712 non-null object
9 fare 712 non-null float32
10 embarked 712 non-null category
dtypes: category(5), float32(1), int32(2), int64(1), object(2)
memory usage: 29.7+ KB
문자열
# Select the columns that are still object-typed in the training split and summarise them.
df_object = x_train.select_dtypes(include=object)
df_object.describe()
| name | ticket |
---|
count | 712 | 712 |
unique | 712 | 569 |
top | Boulos, Mrs. Joseph (Sultana) | CA. 2343 |
freq | 1 | 7 |
** 공백 제거
# Trim leading and trailing whitespace from the free-text columns of both splits.
for column in ('name', 'ticket'):
    x_train[column] = x_train[column].map(str.strip)
    x_test[column] = x_test[column].map(str.strip)
** 문자열 포함 여부
# Honorific tokens that may appear in a passenger name, mapped to a Korean
# description of each title. add_designation() only uses the keys.
# NOTE(review): some descriptions look loose (e.g. 'Don.' is normally a Spanish
# honorific rather than "professor") — confirm the intended labels.
dict_designation = {
    'Mr.': '남성',
    'Master.': '남성',
    'Sir.': '남성',
    'Miss.': '미혼 여성',
    'Mrs.': '기혼 여성',
    'Ms.': '미혼/기혼 여성',
    'Lady.': '숙녀',
    'Mlle.': '아가씨',
    'Dr.': '의사',
    'Rev.': '목사',
    'Major.': '계급',
    'Don.': '교수',
    'Col.': '군인',
    'Capt.': '군인',
    'Mme.': '영부인',
    'Countess.': '백작부인',
    'Jonkheer.': '귀족',
}


def add_designation(name):
    """Return the honorific contained in a passenger name.

    Every key of ``dict_designation`` is searched for in ``name``. When more
    than one key occurs, the key that appears earliest in the name is
    returned, so the result does not depend on the ordering of the dictionary
    (e.g. ``'Doe, Mrs. John (Miss. Jane)'`` yields ``'Mrs.'``).

    Args:
        name: Full passenger name, e.g. ``'Boulos, Mrs. Joseph (Sultana)'``.

    Returns:
        The matching key including its trailing period, or ``'unknown'`` when
        no key occurs in the name or ``name`` is not a string.
    """
    if not isinstance(name, str):
        # Non-string input (for example a NaN placeholder) carries no title.
        return 'unknown'

    designation = 'unknown'
    earliest = len(name)  # any real match starts before this position
    for key in dict_designation:
        position = name.find(key)
        # Strict '<' keeps the first dictionary key on a tie.
        if 0 <= position < earliest:
            designation = key
            earliest = position
    return designation
# Derive the title column for both splits by passing the helper straight to map(),
# then preview it next to the raw name.
x_train['designation'] = x_train['name'].map(add_designation)
x_test['designation'] = x_test['name'].map(add_designation)
x_train[['name', 'designation']].head()
| name | designation |
---|
0 | Boulos, Mrs. Joseph (Sultana) | Mrs. |
1 | Kvillner, Mr. Johan Henrik Johannesson | Mr. |
2 | Mallet, Mr. Albert | Mr. |
3 | Betros, Mr. Tannous | Mr. |
4 | Windelov, Mr. Einar | Mr. |
** 문자열 분리
# The family name is everything before the first comma of the full name.
x_train['last_name'] = x_train['name'].map(lambda full_name: full_name.partition(',')[0])
x_test['last_name'] = x_test['name'].map(lambda full_name: full_name.partition(',')[0])
x_train[['name', 'last_name']].head()
| name | last_name |
---|
0 | Boulos, Mrs. Joseph (Sultana) | Boulos |
1 | Kvillner, Mr. Johan Henrik Johannesson | Kvillner |
2 | Mallet, Mr. Albert | Mallet |
3 | Betros, Mr. Tannous | Betros |
4 | Windelov, Mr. Einar | Windelov |
집계
** pivot_table
# Mean fare per passenger class, computed on the training split only.
# reset_index() turns the 'pclass' index back into an ordinary column.
df_pivot = x_train.pivot_table(index='pclass', values='fare', aggfunc='mean').reset_index()
df_pivot.columns = ['pclass', 'fare_mean_by_pclass']
df_pivot
| pclass | fare_mean_by_pclass |
---|
0 | 1 | 83.370 |
1 | 2 | 20.776 |
2 | 3 | 13.839 |
그룹
# One aggregation per column: survival rate, and the number of distinct
# sibsp / parch values observed within each passenger class.
agg_dict = {
    'survived': 'mean',
    'sibsp': 'nunique',
    'parch': 'nunique',
}
df_groupby = (
    x_train
    .groupby('pclass')
    .agg(agg_dict)
    .reset_index()
)
# Rename the result columns so each one states what it aggregates.
df_groupby.columns = [
    'pclass',
    'survived_by_pclass',
    'len_sibsp_by_pclass',
    'len_parch_by_pclass',
]
df_groupby
| pclass | survived_by_pclass | len_sibsp_by_pclass | len_parch_by_pclass |
---|
0 | 1 | 0.607362 | 4 | 4 |
1 | 2 | 0.483444 | 4 | 4 |
2 | 3 | 0.241206 | 7 | 7 |
시계열 데이터
- 날짜 형식의 컬럼이 존재할 때 사용
- (▼ titanic 데이터 아님)
Column | Non-Null Count | Dtype |
---|
date | 142524 non-null | datetime64[ns] |
# Preview the date column (df_cinemaTicket is not created anywhere in this excerpt).
df_cinemaTicket[['date']].head()
| date |
---|
0 | 2018-05-05 |
1 | 2018-05-05 |
2 | 2018-05-05 |
3 | 2018-05-05 |
4 | 2018-05-05 |
# Print the first three values of each calendar component exposed by the .dt accessor,
# in the same order as the outputs shown below.
for part in ('year', 'month', 'day', 'weekday', 'dayofyear', 'quarter'):
    print(getattr(df_cinemaTicket['date'].dt, part).head(3))
0 2018
1 2018
2 2018
Name: date, dtype: int64
0 5
1 5
2 5
Name: date, dtype: int64
0 5
1 5
2 5
Name: date, dtype: int64
0 5
1 5
2 5
Name: date, dtype: int64
0 125
1 125
2 125
Name: date, dtype: int64
0 2
1 2
2 2
Name: date, dtype: int64