import pandas as pd
# Data source: https://www.kaggle.com/hesh97/titanicdataset-traincsv/data
# Load the Titanic training set from the parent directory into a DataFrame.
train_data = pd.read_csv('../train.csv')
# Preview the first rows (the value is only displayed in an interactive session).
train_data.head()
NaN 값 확인
# Summarize each column's dtype and non-null count; a count below the row total reveals NaNs.
train_data.info()
>>> <class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 PassengerId 891 non-null int64
1 Survived 891 non-null int64
2 Pclass 891 non-null int64
3 Name 891 non-null object
4 Sex 891 non-null object
5 Age 714 non-null float64
6 SibSp 891 non-null int64
7 Parch 891 non-null int64
8 Ticket 891 non-null object
9 Fare 891 non-null float64
10 Cabin 204 non-null object
11 Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
# Element-wise missing-value mask for the whole frame (True where the value is NaN).
train_data.isna()
# Same check restricted to the Age column; yields a boolean Series.
train_data['Age'].isna()
train_data['Age'].isna()
>>> 0 False
1 False
2 False
3 False
4 False
...
886 False
887 False
888 True
889 False
890 False
Name: Age, Length: 891, dtype: bool
NaN 처리 방법
- 데이터에서 삭제
- 다른 값으로 치환
NaN 데이터 삭제하기
# Drop every row that contains at least one NaN; returns a new frame, train_data is untouched.
train_data.dropna()
# Drop only the rows where Age or Cabin is NaN.
train_data.dropna(subset=['Age', 'Cabin'])
# Drop columns (rather than rows) that contain any NaN.
train_data.dropna(axis=1)
# Alternative to dropping: replace missing ages with the mean of the observed ages.
train_data['Age'].fillna(train_data['Age'].mean())
>>> 0 22.000000
1 38.000000
2 26.000000
3 35.000000
4 35.000000
...
886 27.000000
887 19.000000
888 29.699118
889 26.000000
890 32.000000
Name: Age, Length: 891, dtype: float64
# Mean age of passengers who survived (Survived == 1); mean() skips NaN ages.
mean1 = train_data.loc[train_data['Survived'] == 1, 'Age'].mean()
# Mean age of passengers who did not survive (Survived == 0).
mean0 = train_data.loc[train_data['Survived'] == 0, 'Age'].mean()
print(mean1, mean0)
>>> 28.343689655172415 30.62617924528302
# Preview only: fillna returns a new Series per outcome group, so train_data is not changed here.
train_data.loc[train_data['Survived'] == 1, 'Age'].fillna(mean1)
train_data.loc[train_data['Survived'] == 0, 'Age'].fillna(mean0)
>>> 0 22.000000
4 35.000000
5 30.626179
6 54.000000
7 2.000000
...
884 25.000000
885 39.000000
886 27.000000
888 30.626179
890 32.000000
Name: Age, Length: 549, dtype: float64
# Write the group-specific mean age back into train_data, one outcome group at a time
# (survivors first, then non-survivors).
for outcome, group_mean in ((1, mean1), (0, mean0)):
    in_group = train_data['Survived'] == outcome
    train_data.loc[in_group, 'Age'] = train_data.loc[in_group, 'Age'].fillna(group_mean)
import pandas as pd
# Data source: https://www.kaggle.com/hesh97/titanicdataset-traincsv/data
# Load the Titanic training set from the parent directory into a DataFrame.
train_data = pd.read_csv('../train.csv')
# Preview the first rows (the value is only displayed in an interactive session).
train_data.head()
info함수로 각 변수의 데이터 타입 확인
# List every column with its non-null count and dtype to see which variables are numeric vs. object.
train_data.info()
>>> <class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId 891 non-null int64
Survived 891 non-null int64
Pclass 891 non-null int64
Name 891 non-null object
Sex 891 non-null object
Age 714 non-null float64
SibSp 891 non-null int64
Parch 891 non-null int64
Ticket 891 non-null object
Fare 891 non-null float64
Cabin 204 non-null object
Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
숫자형(Numerical Type) 데이터
범주형(Categorical Type) 데이터
import pandas as pd
# Data source: https://www.kaggle.com/hesh97/titanicdataset-traincsv/data
# Load the Titanic training set from the parent directory into a DataFrame.
train_data = pd.read_csv('../train.csv')
# Preview the first rows (the value is only displayed in an interactive session).
train_data.head()
Pclass 변수 변환하기
# Show column dtypes and non-null counts.
# NOTE(review): the output recorded below already lists Pclass as object, so it was
# presumably captured after the astype(str) conversion further down — confirm cell order.
train_data.info()
>>> <class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId 891 non-null int64
Survived 891 non-null int64
Pclass 891 non-null object
Name 891 non-null object
Sex 891 non-null object
Age 714 non-null float64
SibSp 891 non-null int64
Parch 891 non-null int64
Ticket 891 non-null object
Fare 891 non-null float64
Cabin 204 non-null object
Embarked 889 non-null object
dtypes: float64(2), int64(4), object(6)
memory usage: 83.7+ KB
# Cast the integer class labels to strings, which makes the column's dtype object;
# presumably so Pclass is handled as a categorical rather than a numeric variable — confirm.
train_data['Pclass'] = train_data['Pclass'].astype(str)
Age 변수 변환하기
import math
def age_categorize(age):
    """Bucket an age into its decade, using -1 for a missing age.

    Args:
        age: Age as a number; ``None`` or NaN marks a missing value.

    Returns:
        The lower bound of the 10-year band containing ``age`` (for
        example 20 for 22.0, 0 for 0.42), or -1 when the age is missing.
    """
    # Treat None like NaN; math.isnan(None) would raise TypeError.
    if age is None or math.isnan(age):
        return -1
    # Floor to the start of the decade: 38 -> 3 -> 30.
    return math.floor(age / 10) * 10
# Display the current frame (interactive sessions only).
train_data
# Bucket every age into its decade; the resulting Series is shown but not stored in train_data here.
train_data['Age'].apply(age_categorize)
>>> 0 20
1 30
2 20
3 30
4 30
..
886 20
887 10
888 -1
889 20
890 30
Name: Age, Length: 891, dtype: int64
머신러닝과 데이터 분석 A-Z 올인원 패키지 Online. 👉 https://bit.ly/3cB3C8y