# Number of people per credit grade, by gender (with a 'Total' margin).
pd.crosstab(index=train['gender'], columns=train['credit'],
            margins=True, margins_name='Total')
# Share of each credit grade within each gender (normalized per row).
pd.crosstab(index=train['gender'], columns=train['credit'],
            normalize='index')
# Number of people per credit grade, by age group (with a 'Total' margin).
pd.crosstab(index=train['age_group'], columns=train['credit'],
            margins=True, margins_name='Total')
# Share of each credit grade within each age group (normalized per row).
pd.crosstab(index=train['age_group'], columns=train['credit'],
            normalize='index')
# Number of people per credit grade, by years of use (with a 'Total' margin).
pd.crosstab(index=train['used_years'], columns=train['credit'],
            margins=True, margins_name='Total')
# Share of each credit grade within each years-of-use value.
pd.crosstab(index=train['used_years'], columns=train['credit'],
            normalize='index')
# Number of people per credit grade, by gender and age group.
pd.crosstab(index=[train['gender'], train['age_group']],
            columns=train['credit'])
# Share of each credit grade within each (gender, age group) pair.
pd.crosstab(index=[train['gender'], train['age_group']],
            columns=train['credit'], normalize='index')
# Share of each credit grade within each (age group, years of use) pair.
pd.crosstab(index=[train['age_group'], train['used_years']],
            columns=train['credit'], normalize='index')
# Heatmaps of the credit-grade share by years of use, one panel per age group.
# For every age group: filter the rows, then build a row-normalized crosstab
# of (age_group, used_years) against credit.

## 20s
twenty = train[train['age_group'] == '20s']
twentyUsedYear = pd.crosstab(index=[twenty['age_group'], twenty['used_years']],
                             columns=twenty['credit'], normalize='index')
## 30s
thirty = train[train['age_group'] == '30s']
thirtyUsedYear = pd.crosstab(index=[thirty['age_group'], thirty['used_years']],
                             columns=thirty['credit'], normalize='index')
## 40s
forty = train[train['age_group'] == '40s']
fortyUsedYear = pd.crosstab(index=[forty['age_group'], forty['used_years']],
                            columns=forty['credit'], normalize='index')
## 50s
fifty = train[train['age_group'] == '50s']
fiftyUsedYear = pd.crosstab(index=[fifty['age_group'], fifty['used_years']],
                            columns=fifty['credit'], normalize='index')
## 60s
sixty = train[train['age_group'] == '60s']
sixtyUsedYear = pd.crosstab(index=[sixty['age_group'], sixty['used_years']],
                            columns=sixty['credit'], normalize='index')

# Set the figure size, then draw the five tables into a 2x3 grid
# (the sixth slot stays empty). All panels share the same color range
# (vmin/vmax) so they can be compared with each other.
plt.figure(figsize=(20, 10))
_ratio_tables = (twentyUsedYear, thirtyUsedYear, fortyUsedYear,
                 fiftyUsedYear, sixtyUsedYear)
for _slot, _ratio_table in enumerate(_ratio_tables, start=1):
    plt.subplot(2, 3, _slot)
    sns.heatmap(_ratio_table, vmin=0, vmax=0.8,
                linewidths=1, cmap="seismic_r", cbar=False)
plt.show()
# Number of people per credit grade, by real-estate ownership.
pd.crosstab(index=train['reality'], columns=train['credit'],
            margins=True, margins_name='Total')
# Share of each credit grade, by real-estate ownership.
pd.crosstab(index=train['reality'], columns=train['credit'],
            normalize='index')
# Number of people per credit grade, by house type.
pd.crosstab(index=train['house_type'], columns=train['credit'],
            margins=True, margins_name='Total')
# Share of each credit grade, by house type.
pd.crosstab(index=train['house_type'], columns=train['credit'],
            normalize='index')
# Number of people per credit grade, by car ownership.
pd.crosstab(index=train['car'], columns=train['credit'],
            margins=True, margins_name='Total')
# Share of each credit grade, by car ownership.
pd.crosstab(index=train['car'], columns=train['credit'],
            normalize='index')
# Share of each credit grade by car ownership, restricted to people in their 20s.
pd.crosstab(index=twenty['car'], columns=twenty['credit'],
            normalize='index')
# Same 20s subset, broken down by car ownership and years of use: counts ...
pd.crosstab(index=[twenty['car'], twenty['used_years']],
            columns=twenty['credit'], margins=True, margins_name='Total')
# ... and row-normalized shares.
pd.crosstab(index=[twenty['car'], twenty['used_years']],
            columns=twenty['credit'], normalize='index')
# Number of people per credit grade, by family type.
pd.crosstab(index=train['family_type'], columns=train['credit'],
            margins=True, margins_name='Total')
# Share of each credit grade, by family type.
pd.crosstab(index=train['family_type'], columns=train['credit'],
            normalize='index')
# Family type against age group: counts with a 'Total' margin ...
pd.crosstab(index=train['family_type'], columns=train['age_group'],
            margins=True, margins_name='Total')
# ... and the family-type distribution within each age group
# (normalized per column).
pd.crosstab(index=train['family_type'], columns=train['age_group'],
            normalize='columns')
# Number of people per credit grade, by family size.
# `margins` is a boolean flag; the label of the margin row/column is set
# separately through `margins_name`.
pd.crosstab(train['family_size'], train['credit'], margins=True, margins_name='Total')
# Share of each credit grade for family sizes below 5.
pd.crosstab(train['family_size'][train['family_size'] < 5], train['credit'], normalize='index')
# Number of people per credit grade, by number of children.
pd.crosstab(train['child_num'], train['credit'], margins=True, margins_name='Total')

# The remaining tables only look at people with fewer than 4 children.
# The filtered Series keeps the original row labels, which the crosstabs
# below rely on to line it up with the unfiltered columns.
_few_children = train['child_num'][train['child_num'] < 4]

# Share of each credit grade, by number of children (fewer than 4).
pd.crosstab(_few_children, train['credit'], normalize='index')
# Number of people per family type, by number of children (fewer than 4).
pd.crosstab(_few_children, train['family_type'], margins=True, margins_name='Total')
# Number of people per credit grade, by family type and number of children
# (fewer than 4).
pd.crosstab([train['family_type'], _few_children],
            train['credit'], margins=True, margins_name='Total')
# Number of people per credit grade, by income type.
# `margins` is a boolean flag; the margin label comes from `margins_name`.
pd.crosstab(train['income_type'], train['credit'], margins=True, margins_name='Total')
# Share of each credit grade, by income type.
pd.crosstab(train['income_type'], train['credit'], normalize='index')
# Income quartiles: share of each credit grade per quartile.
pd.crosstab(train['income_quartile'], train['credit'], normalize='index')
# Credit grade by income quartile. The x variable must be the quartile
# column (4 bins), matching the crosstab above; plotting the quintile column
# with 4 bins would merge unrelated groups.
sns.displot(x='income_quartile', hue='credit', multiple='stack', col='credit', bins=4, data=train)
# Income quintiles: share of each credit grade per quintile.
pd.crosstab(train['income_quintile'], train['credit'], normalize='index')
# Credit grade by income quintile.
sns.displot(x='income_quintile', hue='credit', multiple='stack', col='credit', bins=5, data=train)
# Income deciles: share of each credit grade per decile.
pd.crosstab(train['income_decile'], train['credit'], normalize='index')
# Credit grade by income decile.
sns.displot(x='income_decile', hue='credit', multiple='stack', col='credit', bins=10, data=train)
# Summary statistics of the years-worked column.
train.loc[:, 'worked_year'].describe()
## A value of -1 denotes an unemployed person.
count 26457.000000
mean 5.441736
std 6.581967
min -1.000000
25% 1.000000
50% 4.000000
75% 8.000000
max 43.000000
Name: worked_year, dtype: float64
# Split the data by credit grade so each grade gets its own panel.
creditZero = train[train['credit'] == 0]
creditOne = train[train['credit'] == 1]
creditTwo = train[train['credit'] == 2]

plt.figure(figsize=(25, 8))

# One histogram of years worked per credit grade (0, 1, 2), laid out
# side by side in a 1x3 grid.
_credit_axes = []
_panels = ((0, creditZero), (1, creditOne), (2, creditTwo))
for _slot, (_grade, _subset) in enumerate(_panels, start=1):
    plt.subplot(1, 3, _slot)
    _ax = sns.histplot(data=_subset, x='worked_year', hue='credit', bins=43)
    _ax.set_xlabel('Worked Years', fontsize=13)
    _ax.set_ylabel('Number of People', fontsize=13)
    _ax.set_title(f"Credit {_grade}'s Number of People by Worked Years", fontsize=16)
    _credit_axes.append(_ax)

# Keep the per-grade plot handles available under their usual names.
creditZeroPlot, creditOnePlot, creditTwoPlot = _credit_axes
plt.show()
따라서 신용등급은 근무 연수와 무관하게 결정될 수 있다는 가설을 세울 수 있다.
# For each credit grade (0, 1, 2): a separate figure showing the number of
# people by years worked, stacked by gender.
_gender_axes = []
for _grade, _subset in ((0, creditZero), (1, creditOne), (2, creditTwo)):
    plt.figure(figsize=(12, 7))
    _ax = sns.histplot(data=_subset, x='worked_year', hue='gender',
                       multiple='stack', bins=45)
    _ax.set_xlabel('Worked Years', fontsize=13)
    _ax.set_ylabel('Number of People', fontsize=13)
    _ax.set_title(f"Credit {_grade}'s Number of Men and Women by Worked Years", fontsize=16)
    plt.show()
    _gender_axes.append(_ax)

# Keep the per-grade plot handles available under their usual names.
genderCreditZero, genderCreditOne, genderCreditTwo = _gender_axes
# Inspect the rows whose worked_year is -1.
train.loc[train['worked_year'] == -1].info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4438 entries, 14 to 26443
Data columns (total 22 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 gender 4438 non-null object
1 car 4438 non-null object
2 reality 4438 non-null object
3 child_num 4438 non-null int64
4 income_total 4438 non-null float64
5 income_type 4438 non-null object
6 edu_type 4438 non-null object
7 family_type 4438 non-null object
8 house_type 4438 non-null object
9 DAYS_BIRTH 4438 non-null int64
10 DAYS_EMPLOYED 4438 non-null int64
11 occyp_type 0 non-null object
12 family_size 4438 non-null float64
13 begin_month 4438 non-null float64
14 credit 4438 non-null float64
15 income_quartile 4438 non-null float64
16 income_quintile 4438 non-null float64
17 income_decile 4438 non-null float64
18 age 4438 non-null int64
19 age_group 4438 non-null object
20 used_years 4438 non-null int64
21 worked_year 4438 non-null int64
dtypes: float64(7), int64(6), object(9)
memory usage: 797.5+ KB
# Inspect the rows whose income type is 'Pensioner'.
train.loc[train['income_type'] == 'Pensioner'].info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4449 entries, 14 to 26443
Data columns (total 22 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 gender 4449 non-null object
1 car 4449 non-null object
2 reality 4449 non-null object
3 child_num 4449 non-null int64
4 income_total 4449 non-null float64
5 income_type 4449 non-null object
6 edu_type 4449 non-null object
7 family_type 4449 non-null object
8 house_type 4449 non-null object
9 DAYS_BIRTH 4449 non-null int64
10 DAYS_EMPLOYED 4449 non-null int64
11 occyp_type 9 non-null object
12 family_size 4449 non-null float64
13 begin_month 4449 non-null float64
14 credit 4449 non-null float64
15 income_quartile 4449 non-null float64
16 income_quintile 4449 non-null float64
17 income_decile 4449 non-null float64
18 age 4449 non-null int64
19 age_group 4449 non-null object
20 used_years 4449 non-null int64
21 worked_year 4449 non-null int64
dtypes: float64(7), int64(6), object(9)
memory usage: 799.4+ KB
# Inspect the rows that are both worked_year == -1 and income type 'Pensioner'.
_is_unemployed_pensioner = (train['worked_year'] == -1) & (train['income_type'] == 'Pensioner')
train[_is_unemployed_pensioner].info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4438 entries, 14 to 26443
Data columns (total 22 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 gender 4438 non-null object
1 car 4438 non-null object
2 reality 4438 non-null object
3 child_num 4438 non-null int64
4 income_total 4438 non-null float64
5 income_type 4438 non-null object
6 edu_type 4438 non-null object
7 family_type 4438 non-null object
8 house_type 4438 non-null object
9 DAYS_BIRTH 4438 non-null int64
10 DAYS_EMPLOYED 4438 non-null int64
11 occyp_type 0 non-null object
12 family_size 4438 non-null float64
13 begin_month 4438 non-null float64
14 credit 4438 non-null float64
15 income_quartile 4438 non-null float64
16 income_quintile 4438 non-null float64
17 income_decile 4438 non-null float64
18 age 4438 non-null int64
19 age_group 4438 non-null object
20 used_years 4438 non-null int64
21 worked_year 4438 non-null int64
dtypes: float64(7), int64(6), object(9)
memory usage: 797.5+ KB
결측치 처리
# Label the occupation of every row with worked_year == -1 as 'Unempolyed'.
#
# The write goes through a single `.loc[row_mask, column]` assignment.
# Chained indexing (`train['occyp_type'][mask] = ...`) may write to a
# temporary copy: it raises SettingWithCopyWarning and does nothing at all
# when pandas Copy-on-Write is enabled.
#
# One mask is enough: rows that are both worked_year == -1 and income type
# 'Pensioner' are a subset of the worked_year == -1 rows, so they receive
# the same label here.
#
# NOTE(review): 'Unempolyed' looks like a misspelling of 'Unemployed'. It is
# kept unchanged because it is a data value that other code may match on;
# confirm before renaming.
train.loc[train['worked_year'] == -1, 'occyp_type'] = 'Unempolyed'
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26457 entries, 0 to 26456
Data columns (total 22 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 gender 26457 non-null object
1 car 26457 non-null object
2 reality 26457 non-null object
3 child_num 26457 non-null int64
4 income_total 26457 non-null float64
5 income_type 26457 non-null object
6 edu_type 26457 non-null object
7 family_type 26457 non-null object
8 house_type 26457 non-null object
9 DAYS_BIRTH 26457 non-null int64
10 DAYS_EMPLOYED 26457 non-null int64
11 occyp_type 22724 non-null object
12 family_size 26457 non-null float64
13 begin_month 26457 non-null float64
14 credit 26457 non-null float64
15 income_quartile 26457 non-null float64
16 income_quintile 26457 non-null float64
17 income_decile 26457 non-null float64
18 age 26457 non-null int64
19 age_group 26457 non-null object
20 used_years 26457 non-null int64
21 worked_year 26457 non-null int64
dtypes: float64(7), int64(6), object(9)
memory usage: 4.4+ MB
# Drop, in place, every row that still has at least one missing value,
# then re-check the column summary.
train.dropna(axis='index', how='any', inplace=True)
train.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 22724 entries, 1 to 26456
Data columns (total 22 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 gender 22724 non-null object
1 car 22724 non-null object
2 reality 22724 non-null object
3 child_num 22724 non-null int64
4 income_total 22724 non-null float64
5 income_type 22724 non-null object
6 edu_type 22724 non-null object
7 family_type 22724 non-null object
8 house_type 22724 non-null object
9 DAYS_BIRTH 22724 non-null int64
10 DAYS_EMPLOYED 22724 non-null int64
11 occyp_type 22724 non-null object
12 family_size 22724 non-null float64
13 begin_month 22724 non-null float64
14 credit 22724 non-null float64
15 income_quartile 22724 non-null float64
16 income_quintile 22724 non-null float64
17 income_decile 22724 non-null float64
18 age 22724 non-null int64
19 age_group 22724 non-null object
20 used_years 22724 non-null int64
21 worked_year 22724 non-null int64
dtypes: float64(7), int64(6), object(9)
memory usage: 4.0+ MB
# Cast the credit column to object dtype.
# NOTE(review): presumably so that the grade is treated as a category rather
# than a number when used as `hue` — confirm.
train['credit'] = train['credit'].astype(object)

# Stacked count of people per occupation type, colored by credit grade.
plt.figure(figsize=(12, 7))
occypPlot = sns.histplot(data=train, x='occyp_type', hue='credit',
                         multiple='stack', bins=19)
# Occupation names are long, so rotate the tick labels to keep them readable.
plt.xticks(rotation=90)
# NOTE(review): the title spells "Occpy" while the axis label spells "Occyp";
# both strings are kept as they were.
occypPlot.set_title("Credit Rank Number of People by Occpy Type", fontsize=16)
occypPlot.set_xlabel('Occyp Type', fontsize=13)
occypPlot.set_ylabel('Number of People', fontsize=13)
plt.show()

# Share of each credit grade within each occupation type.
pd.crosstab(index=train['occyp_type'], columns=train['credit'],
            normalize='index')