Kaggle: Home Credit Default Risk 세 번째 모델 학습 및 성능 평가

코드싸개·2021년 1월 8일
1

prev_application 데이터 세트 기반의 EDA와 Feature Engineering 수행 후 학습 모델 생성/평가

라이브러리 및 데이터 세트 로딩. 이전 application 데이터의 FE 함수 복사

import numpy as np # 1.18.5
import pandas as pd # 0.25.1
import gc
import time
import matplotlib.pyplot as plt # 3.2.2
import seaborn as sns # 0.10.1
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 200)
app_train = pd.read_csv('application_train.csv')
app_test = pd.read_csv('application_test.csv')
def get_apps_dataset():
    app_train = pd.read_csv('application_train.csv')
    app_test = pd.read_csv('application_test.csv')
    apps = pd.concat([app_train, app_test])
    
    return apps

apps = get_apps_dataset()
<ipython-input-3-da4fa99e2eb3>:4: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

  apps = pd.concat([app_train, app_test])

이전 application 데이터의 feature engineering 함수 복사

def get_apps_processed(apps):
    
    # EXT_SOURCE_X FEATURE 가공
    apps['APPS_EXT_SOURCE_MEAN'] = apps[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1)
    apps['APPS_EXT_SOURCE_STD'] = apps[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].std(axis=1)
    apps['APPS_EXT_SOURCE_STD'] = apps['APPS_EXT_SOURCE_STD'].fillna(apps['APPS_EXT_SOURCE_STD'].mean())
    
    # AMT_CREDIT 비율로 Feature 가공
    apps['APPS_ANNUITY_CREDIT_RATIO'] = apps['AMT_ANNUITY']/apps['AMT_CREDIT']
    apps['APPS_GOODS_CREDIT_RATIO'] = apps['AMT_GOODS_PRICE']/apps['AMT_CREDIT']
    
    # AMT_INCOME_TOTAL 비율로 Feature 가공
    apps['APPS_ANNUITY_INCOME_RATIO'] = apps['AMT_ANNUITY']/apps['AMT_INCOME_TOTAL']
    apps['APPS_CREDIT_INCOME_RATIO'] = apps['AMT_CREDIT']/apps['AMT_INCOME_TOTAL']
    apps['APPS_GOODS_INCOME_RATIO'] = apps['AMT_GOODS_PRICE']/apps['AMT_INCOME_TOTAL']
    apps['APPS_CNT_FAM_INCOME_RATIO'] = apps['AMT_INCOME_TOTAL']/apps['CNT_FAM_MEMBERS']
    
    # DAYS_BIRTH, DAYS_EMPLOYED 비율로 Feature 가공
    apps['APPS_EMPLOYED_BIRTH_RATIO'] = apps['DAYS_EMPLOYED']/apps['DAYS_BIRTH']
    apps['APPS_INCOME_EMPLOYED_RATIO'] = apps['AMT_INCOME_TOTAL']/apps['DAYS_EMPLOYED']
    apps['APPS_INCOME_BIRTH_RATIO'] = apps['AMT_INCOME_TOTAL']/apps['DAYS_BIRTH']
    apps['APPS_CAR_BIRTH_RATIO'] = apps['OWN_CAR_AGE'] / apps['DAYS_BIRTH']
    apps['APPS_CAR_EMPLOYED_RATIO'] = apps['OWN_CAR_AGE'] / apps['DAYS_EMPLOYED']
    
    return apps

previous 데이터 로딩

prev = pd.read_csv('previous_application.csv')
print(prev.shape, apps.shape)
(1670214, 37) (356255, 122)

application와 previous outer 조인하고 누락된 집합들 확인.

prev.head()
SK_ID_PREV SK_ID_CURR NAME_CONTRACT_TYPE AMT_ANNUITY AMT_APPLICATION AMT_CREDIT AMT_DOWN_PAYMENT AMT_GOODS_PRICE WEEKDAY_APPR_PROCESS_START HOUR_APPR_PROCESS_START FLAG_LAST_APPL_PER_CONTRACT NFLAG_LAST_APPL_IN_DAY RATE_DOWN_PAYMENT RATE_INTEREST_PRIMARY RATE_INTEREST_PRIVILEGED NAME_CASH_LOAN_PURPOSE NAME_CONTRACT_STATUS DAYS_DECISION NAME_PAYMENT_TYPE CODE_REJECT_REASON NAME_TYPE_SUITE NAME_CLIENT_TYPE NAME_GOODS_CATEGORY NAME_PORTFOLIO NAME_PRODUCT_TYPE CHANNEL_TYPE SELLERPLACE_AREA NAME_SELLER_INDUSTRY CNT_PAYMENT NAME_YIELD_GROUP PRODUCT_COMBINATION DAYS_FIRST_DRAWING DAYS_FIRST_DUE DAYS_LAST_DUE_1ST_VERSION DAYS_LAST_DUE DAYS_TERMINATION NFLAG_INSURED_ON_APPROVAL
0 2030495 271877 Consumer loans 1730.430 17145.0 17145.0 0.0 17145.0 SATURDAY 15 Y 1 0.0 0.182832 0.867336 XAP Approved -73 Cash through the bank XAP NaN Repeater Mobile POS XNA Country-wide 35 Connectivity 12.0 middle POS mobile with interest 365243.0 -42.0 300.0 -42.0 -37.0 0.0
1 2802425 108129 Cash loans 25188.615 607500.0 679671.0 NaN 607500.0 THURSDAY 11 Y 1 NaN NaN NaN XNA Approved -164 XNA XAP Unaccompanied Repeater XNA Cash x-sell Contact center -1 XNA 36.0 low_action Cash X-Sell: low 365243.0 -134.0 916.0 365243.0 365243.0 1.0
2 2523466 122040 Cash loans 15060.735 112500.0 136444.5 NaN 112500.0 TUESDAY 11 Y 1 NaN NaN NaN XNA Approved -301 Cash through the bank XAP Spouse, partner Repeater XNA Cash x-sell Credit and cash offices -1 XNA 12.0 high Cash X-Sell: high 365243.0 -271.0 59.0 365243.0 365243.0 1.0
3 2819243 176158 Cash loans 47041.335 450000.0 470790.0 NaN 450000.0 MONDAY 7 Y 1 NaN NaN NaN XNA Approved -512 Cash through the bank XAP NaN Repeater XNA Cash x-sell Credit and cash offices -1 XNA 12.0 middle Cash X-Sell: middle 365243.0 -482.0 -152.0 -182.0 -177.0 1.0
4 1784265 202054 Cash loans 31924.395 337500.0 404055.0 NaN 337500.0 THURSDAY 9 Y 1 NaN NaN NaN Repairs Refused -781 Cash through the bank HC NaN Repeater XNA Cash walk-in Credit and cash offices -1 XNA 24.0 high Cash Street: high NaN NaN NaN NaN NaN NaN
prev_app_outer = prev.merge(apps['SK_ID_CURR'], on='SK_ID_CURR', how='outer', indicator=True)
SK_ID_PREV SK_ID_CURR NAME_CONTRACT_TYPE AMT_ANNUITY AMT_APPLICATION AMT_CREDIT AMT_DOWN_PAYMENT AMT_GOODS_PRICE WEEKDAY_APPR_PROCESS_START HOUR_APPR_PROCESS_START FLAG_LAST_APPL_PER_CONTRACT NFLAG_LAST_APPL_IN_DAY RATE_DOWN_PAYMENT RATE_INTEREST_PRIMARY RATE_INTEREST_PRIVILEGED NAME_CASH_LOAN_PURPOSE NAME_CONTRACT_STATUS DAYS_DECISION NAME_PAYMENT_TYPE CODE_REJECT_REASON NAME_TYPE_SUITE NAME_CLIENT_TYPE NAME_GOODS_CATEGORY NAME_PORTFOLIO NAME_PRODUCT_TYPE CHANNEL_TYPE SELLERPLACE_AREA NAME_SELLER_INDUSTRY CNT_PAYMENT NAME_YIELD_GROUP PRODUCT_COMBINATION DAYS_FIRST_DRAWING DAYS_FIRST_DUE DAYS_LAST_DUE_1ST_VERSION DAYS_LAST_DUE DAYS_TERMINATION NFLAG_INSURED_ON_APPROVAL _merge
0 2030495.0 271877 Consumer loans 1730.430 17145.0 17145.0 0.0 17145.0 SATURDAY 15.0 Y 1.0 0.000000 0.182832 0.867336 XAP Approved -73.0 Cash through the bank XAP NaN Repeater Mobile POS XNA Country-wide 35.0 Connectivity 12.0 middle POS mobile with interest 365243.0 -42.0 300.0 -42.0 -37.0 0.0 both
1 1696966.0 271877 Consumer loans 68258.655 1800000.0 1754721.0 180000.0 1800000.0 SATURDAY 18.0 Y 1.0 0.101325 NaN NaN XAP Refused -472.0 Cash through the bank SCO NaN Repeater Clothing and Accessories POS XNA Regional / Local 55.0 Furniture 36.0 low_normal POS industry with interest NaN NaN NaN NaN NaN NaN both
2 2154916.0 271877 Consumer loans 12417.390 108400.5 119848.5 0.0 108400.5 SUNDAY 14.0 Y 1.0 0.000000 NaN NaN XAP Approved -548.0 Cash through the bank XAP NaN New Furniture POS XNA Stone 196.0 Furniture 12.0 middle POS industry with interest 365243.0 -512.0 -182.0 -392.0 -387.0 0.0 both
3 2802425.0 108129 Cash loans 25188.615 607500.0 679671.0 NaN 607500.0 THURSDAY 11.0 Y 1.0 NaN NaN NaN XNA Approved -164.0 XNA XAP Unaccompanied Repeater XNA Cash x-sell Contact center -1.0 XNA 36.0 low_action Cash X-Sell: low 365243.0 -134.0 916.0 365243.0 365243.0 1.0 both
4 1536272.0 108129 Cash loans 21709.125 450000.0 512370.0 NaN 450000.0 WEDNESDAY 9.0 Y 1.0 NaN NaN NaN XNA Approved -515.0 XNA XAP NaN Repeater XNA Cash x-sell AP+ (Cash loan) 6.0 XNA 36.0 low_normal Cash X-Sell: low 365243.0 -485.0 565.0 -155.0 -147.0 1.0 both
prev_app_outer['_merge'].value_counts()
both          1670214
right_only      17398
left_only           0
Name: _merge, dtype: int64

previous 컬럼 설명

Table컬럼명컬럼 대분류컬럼 중분류컬럼 설명
previous_application.csvSK_ID_PREV대출고유ID과거 대출 고유 ID
previous_application.csvSK_ID_CURR대출고유ID현재 대출 고유 ID
previous_application.csvNAME_CONTRACT_TYPE대출대출 유형대출 유형
previous_application.csvAMT_ANNUITY대출대출 금액월 대출 지급액
previous_application.csvAMT_APPLICATION대출대출 금액대출 신청 금액
previous_application.csvAMT_CREDIT대출대출 금액대출금액(허가)
previous_application.csvAMT_DOWN_PAYMENT대출대출 금액대출 시 납부한 선금액
previous_application.csvAMT_GOODS_PRICE대출대출 금액소비자 대출상품액
previous_application.csvWEEKDAY_APPR_PROCESS_START고객행동대출 신청 시작 요일
previous_application.csvHOUR_APPR_PROCESS_START고객행동대출 신청 시작 시간대
previous_application.csvFLAG_LAST_APPL_PER_CONTRACT고객행동이전 계약의 마지막 대출 신청 여부
previous_application.csvNFLAG_LAST_APPL_IN_DAY고객행동하루중 마지막 대출 신청 여부(하루에 여러 번 대출 신청했을 경우)
previous_application.csvNFLAG_MICRO_CASH대출대출 유형소액 대출 여부
previous_application.csvRATE_DOWN_PAYMENT대출대출 금액선금 비율(정규화됨)
previous_application.csvRATE_INTEREST_PRIMARY대출대출 금액이자율
previous_application.csvRATE_INTEREST_PRIVILEGED대출대출 금액이자율
previous_application.csvNAME_CASH_LOAN_PURPOSE대출대출 유형현금 대출 목적
previous_application.csvNAME_CONTRACT_STATUS대출대출 상태대출 상태(허가, 취소)
previous_application.csvDAYS_DECISION대출대출 상태과거 신청 대비 현재 신청 결정 기간
previous_application.csvNAME_PAYMENT_TYPE대출대출 유형과거 대출 신청의 납부 방법
previous_application.csvCODE_REJECT_REASON대출대출 상태과거 신청 거절 사유
previous_application.csvNAME_TYPE_SUITE고객행동(추천)동행 고객
previous_application.csvNAME_CLIENT_TYPE고객행동신규 고객 또는 기존 대출 고객 여부
previous_application.csvNAME_GOODS_CATEGORY대출대출 유형대출 상품 중분류 유형
previous_application.csvNAME_PORTFOLIO대출대출 유형현금대출/POS/CAR 대출 유형
previous_application.csvNAME_PRODUCT_TYPE채널판매 유형고객이 찾아온 대출인가, 영업 대출인가
previous_application.csvCHANNEL_TYPE채널채널 유형채널 유형
previous_application.csvSELLERPLACE_AREA채널채널 유형판매자 판매 지역
previous_application.csvNAME_SELLER_INDUSTRY채널채널 유형판매자 Industry
previous_application.csvCNT_PAYMENT대출대출 금액이전 대출 신청의 대출금액 관련 Term
previous_application.csvNAME_YIELD_GROUP대출대출 금액집단 금리 적용 유형
previous_application.csvPRODUCT_COMBINATION대출대출 유형이전 대출 결합 상품
previous_application.csvDAYS_FIRST_DRAWING대출상태신청날짜부터 최초 대출 지급까지의 일자
previous_application.csvDAYS_FIRST_DUE대출상태신청날짜부터 마감일까지의 일자
previous_application.csvDAYS_LAST_DUE_1ST_VERSION대출상태신청날짜부터 첫 만기일까지의 일자
previous_application.csvDAYS_LAST_DUE대출상태신청날짜부터 마지막 만기일까지의 일자
previous_application.csvDAYS_TERMINATION대출상태현 대출 신청일자 대비 대출 예상 종료 일자
previous_application.csvNFLAG_INSURED_ON_APPROVAL대출상태대출 신청 시 보험가입 요청여부

previous 컬럼과 Null 값 조사

prev.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1670214 entries, 0 to 1670213
Data columns (total 37 columns):
SK_ID_PREV                     1670214 non-null int64
SK_ID_CURR                     1670214 non-null int64
NAME_CONTRACT_TYPE             1670214 non-null object
AMT_ANNUITY                    1297979 non-null float64
AMT_APPLICATION                1670214 non-null float64
AMT_CREDIT                     1670213 non-null float64
AMT_DOWN_PAYMENT               774370 non-null float64
AMT_GOODS_PRICE                1284699 non-null float64
WEEKDAY_APPR_PROCESS_START     1670214 non-null object
HOUR_APPR_PROCESS_START        1670214 non-null int64
FLAG_LAST_APPL_PER_CONTRACT    1670214 non-null object
NFLAG_LAST_APPL_IN_DAY         1670214 non-null int64
RATE_DOWN_PAYMENT              774370 non-null float64
RATE_INTEREST_PRIMARY          5951 non-null float64
RATE_INTEREST_PRIVILEGED       5951 non-null float64
NAME_CASH_LOAN_PURPOSE         1670214 non-null object
NAME_CONTRACT_STATUS           1670214 non-null object
DAYS_DECISION                  1670214 non-null int64
NAME_PAYMENT_TYPE              1670214 non-null object
CODE_REJECT_REASON             1670214 non-null object
NAME_TYPE_SUITE                849809 non-null object
NAME_CLIENT_TYPE               1670214 non-null object
NAME_GOODS_CATEGORY            1670214 non-null object
NAME_PORTFOLIO                 1670214 non-null object
NAME_PRODUCT_TYPE              1670214 non-null object
CHANNEL_TYPE                   1670214 non-null object
SELLERPLACE_AREA               1670214 non-null int64
NAME_SELLER_INDUSTRY           1670214 non-null object
CNT_PAYMENT                    1297984 non-null float64
NAME_YIELD_GROUP               1670214 non-null object
PRODUCT_COMBINATION            1669868 non-null object
DAYS_FIRST_DRAWING             997149 non-null float64
DAYS_FIRST_DUE                 997149 non-null float64
DAYS_LAST_DUE_1ST_VERSION      997149 non-null float64
DAYS_LAST_DUE                  997149 non-null float64
DAYS_TERMINATION               997149 non-null float64
NFLAG_INSURED_ON_APPROVAL      997149 non-null float64
dtypes: float64(15), int64(6), object(16)
memory usage: 471.5+ MB

주요 컬럼 EDA 수행

SK_ID_CURR당 평균 SK_ID_PREV 건수 구하기

  • groupby 로 평균 건수 구함.
  • boxplot으로 시각화
prev.groupby('SK_ID_CURR')['SK_ID_CURR'].count().mean()
4.928964135313716
sns.boxplot(prev.groupby('SK_ID_CURR')['SK_ID_CURR'].count())

숫자형 피처들의 Histogram을 TARGET 유형에 따라 비교

  • application_train의 TARGET 값을 가져오기 위해 prev와 app_train을 inner join 후 TARGET 유형에 따라 비교
  • 숫자형 컬럼명 필터링
app_prev = prev.merge(app_train[['SK_ID_CURR', 'TARGET']], on='SK_ID_CURR', how='left')
app_prev.shape
(1670214, 38)
def show_hist_by_target(df, columns):
    cond_1 = (df['TARGET'] == 1)
    cond_0 = (df['TARGET'] == 0)
    
    for column in columns:
        fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(12, 4), squeeze=False)
        sns.violinplot(x='TARGET', y=column, data=df, ax=axs[0][0] )
        sns.distplot(df[cond_0][column], ax=axs[0][1], label='0', color='blue')
        sns.distplot(df[cond_1][column], ax=axs[0][1], label='1', color='red')   
num_columns = app_prev.dtypes[app_prev.dtypes != 'object'].index.tolist()
num_columns = [ column for column in num_columns if column not in ['SK_ID_PREV', 'SK_ID_CURR', 'TARGET']]
print(num_columns)
['AMT_ANNUITY', 'AMT_APPLICATION', 'AMT_CREDIT', 'AMT_DOWN_PAYMENT', 'AMT_GOODS_PRICE', 'HOUR_APPR_PROCESS_START', 'NFLAG_LAST_APPL_IN_DAY', 'RATE_DOWN_PAYMENT', 'RATE_INTEREST_PRIMARY', 'RATE_INTEREST_PRIVILEGED', 'DAYS_DECISION', 'SELLERPLACE_AREA', 'CNT_PAYMENT', 'DAYS_FIRST_DRAWING', 'DAYS_FIRST_DUE', 'DAYS_LAST_DUE_1ST_VERSION', 'DAYS_LAST_DUE', 'DAYS_TERMINATION', 'NFLAG_INSURED_ON_APPROVAL']
show_hist_by_target(app_prev, num_columns)
/Users/andylee/opt/anaconda3/lib/python3.8/site-packages/seaborn/distributions.py:369: UserWarning: Default bandwidth for data is 0; skipping density estimation.
  warnings.warn(msg, UserWarning)
/Users/andylee/opt/anaconda3/lib/python3.8/site-packages/seaborn/distributions.py:369: UserWarning: Default bandwidth for data is 0; skipping density estimation.
  warnings.warn(msg, UserWarning)
/Users/andylee/opt/anaconda3/lib/python3.8/site-packages/seaborn/distributions.py:369: UserWarning: Default bandwidth for data is 0; skipping density estimation.
  warnings.warn(msg, UserWarning)
/Users/andylee/opt/anaconda3/lib/python3.8/site-packages/seaborn/distributions.py:369: UserWarning: Default bandwidth for data is 0; skipping density estimation.
  warnings.warn(msg, UserWarning)

  • AMT_ANNUITY, AMT_CREDIT, AMT_APPLICATION, AMT_GOODS_CREDIT는 TARGET=1일 경우에 소액 비율이 약간 높음(큰 차이는 아님)
  • RATE_DOWN_PAYMENT는 큰 차이 없음.
  • RATE_INTEREST_PRIMARY, RATE_INTEREST_PRIVILEGED 는 NULL값이 매우 많아서 판단 어려움
  • DAYS_DECISION은 TARGET=1일 때 0에 가까운(최근일)값이 약간 더 많음.
  • DAYS_FIRST_DRAWING, DAYS_FIRST_DUE, DAYS_LAST_DUE_1ST_VERSION, DAYS_LAST_DUE, DAYS_TERMINATION은 365243 값이 매우 많음.

Category 피처들의 Histogram을 TARGET 유형에 따라 비교

object_columns = app_prev.dtypes[app_prev.dtypes=='object'].index.tolist()
object_columns
['NAME_CONTRACT_TYPE',
 'WEEKDAY_APPR_PROCESS_START',
 'FLAG_LAST_APPL_PER_CONTRACT',
 'NAME_CASH_LOAN_PURPOSE',
 'NAME_CONTRACT_STATUS',
 'NAME_PAYMENT_TYPE',
 'CODE_REJECT_REASON',
 'NAME_TYPE_SUITE',
 'NAME_CLIENT_TYPE',
 'NAME_GOODS_CATEGORY',
 'NAME_PORTFOLIO',
 'NAME_PRODUCT_TYPE',
 'CHANNEL_TYPE',
 'NAME_SELLER_INDUSTRY',
 'NAME_YIELD_GROUP',
 'PRODUCT_COMBINATION']
def show_category_by_target(df, columns):
    for column in columns:
        chart = sns.catplot(x=column, col="TARGET", data=df, kind="count")
        chart.set_xticklabels(rotation=65)
        
show_category_by_target(app_prev, object_columns)

  • NAME_CONTRACT_TYPE은 TARGET=1일때 CASH_LOAN의 비중이 약간 높음
  • NAME_CONTRACT_STATUS(대출허가상태)는 TARGET=1일때 상대적으로 TARGET=0 대비 (당연히) Refused의 비율이 높음.
  • NAME_PAYMENT_TYPE(대출납부방법)는 TARGET=1일때 상대적으로 TARGET=0 대비 XNA의 비율이 약간 높음.

prev 데이터 세트 feature engineering 수행.

SQL 대비 Pandas groupby 사용 로직 비교

SQL로 SK_ID_CURR별 건수, 평균 AMT_CREDIT, 최대 AMT_CREDIT, 최소 AMT_CREDIT 구하기

select sk_id_curr, count(*), avg(amt_credit) , max(amt_credit), min(amt_credit) from previous group by sk_id_curr

pandas groupby 단일 aggregation 함수 사용

  • groupby SK_ID_CURR
  • SK_ID_CURR별 건수, AMT_CREDIT에 대한 평균, 최대 값
prev.groupby('SK_ID_CURR')
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fcfc8bfba90>
prev_group = prev.groupby('SK_ID_CURR')
prev_agg = pd.DataFrame()
prev_agg['CNT'] = prev_group['SK_ID_CURR'].count()
prev_agg['AVG_CREDIT'] = prev_group['AMT_CREDIT'].mean()
prev_agg['MAX_CREDIT'] = prev_group['AMT_CREDIT'].max()
prev_agg['MIN_CREDIT'] = prev_group['AMT_CREDIT'].min()

prev_agg.head()
CNT AVG_CREDIT MAX_CREDIT MIN_CREDIT
SK_ID_CURR
100001 1 23787.00 23787.0 23787.0
100002 1 179055.00 179055.0 179055.0
100003 3 484191.00 1035882.0 68053.5
100004 1 20106.00 20106.0 20106.0
100005 2 20076.75 40153.5 0.0

groupby agg()함수를 이용하여 여러개의 aggregation 함수 적용

prev_group = prev.groupby('SK_ID_CURR')
prev_agg1 = prev_group['AMT_CREDIT'].agg(['mean', 'max', 'min'])
prev_agg2 = prev_group['AMT_ANNUITY'].agg(['mean', 'max', 'min'])

prev_agg = prev_agg1.merge(prev_agg2, on='SK_ID_CURR', how='inner')
prev_agg.head()
mean_x max_x min_x mean_y max_y min_y
SK_ID_CURR
100001 23787.00 23787.0 23787.0 3951.000 3951.000 3951.000
100002 179055.00 179055.0 179055.0 9251.775 9251.775 9251.775
100003 484191.00 1035882.0 68053.5 56553.990 98356.995 6737.310
100004 20106.00 20106.0 20106.0 5357.250 5357.250 5357.250
100005 20076.75 40153.5 0.0 4813.200 4813.200 4813.200

agg()에 dictionary를 이용하여 groupby 적용

agg_dict = {
    'SK_ID_CURR':['count'],
    'AMT_CREDIT':['mean', 'max', 'sum'],
    'AMT_ANNUITY':['mean', 'max', 'sum'], 
    'AMT_APPLICATION':['mean', 'max', 'sum'],
    'AMT_DOWN_PAYMENT':['mean', 'max', 'sum'],
    'AMT_GOODS_PRICE':['mean', 'max', 'sum']
}
prev_group = prev.groupby('SK_ID_CURR')
prev_amt_agg = prev_group.agg(agg_dict)
prev_amt_agg.head()
SK_ID_CURR AMT_CREDIT AMT_ANNUITY AMT_APPLICATION AMT_DOWN_PAYMENT AMT_GOODS_PRICE
count mean max sum mean max sum mean max sum mean max sum mean max sum
SK_ID_CURR
100001 1 23787.00 23787.0 23787.0 3951.000 3951.000 3951.000 24835.50 24835.5 24835.5 2520.0 2520.0 2520.0 24835.5 24835.5 24835.5
100002 1 179055.00 179055.0 179055.0 9251.775 9251.775 9251.775 179055.00 179055.0 179055.0 0.0 0.0 0.0 179055.0 179055.0 179055.0
100003 3 484191.00 1035882.0 1452573.0 56553.990 98356.995 169661.970 435436.50 900000.0 1306309.5 3442.5 6885.0 6885.0 435436.5 900000.0 1306309.5
100004 1 20106.00 20106.0 20106.0 5357.250 5357.250 5357.250 24282.00 24282.0 24282.0 4860.0 4860.0 4860.0 24282.0 24282.0 24282.0
100005 2 20076.75 40153.5 40153.5 4813.200 4813.200 4813.200 22308.75 44617.5 44617.5 4464.0 4464.0 4464.0 44617.5 44617.5 44617.5

grouby agg 로 만들어진 Multi index 컬럼 변경.

  • MultiIndex로 되어 있는 컬럼명 확인
  • MultiIndex 컬럼명을 _로 연결하여 컬럼명 변경.

칼럼 인덱스가 계층 구조로 되어있어서 예를 들면 AMT_CREDIT_MEAN 이런식으로 바꾸려 함

prev_amt_agg.columns
MultiIndex([(      'SK_ID_CURR', 'count'),
            (      'AMT_CREDIT',  'mean'),
            (      'AMT_CREDIT',   'max'),
            (      'AMT_CREDIT',   'sum'),
            (     'AMT_ANNUITY',  'mean'),
            (     'AMT_ANNUITY',   'max'),
            (     'AMT_ANNUITY',   'sum'),
            ( 'AMT_APPLICATION',  'mean'),
            ( 'AMT_APPLICATION',   'max'),
            ( 'AMT_APPLICATION',   'sum'),
            ('AMT_DOWN_PAYMENT',  'mean'),
            ('AMT_DOWN_PAYMENT',   'max'),
            ('AMT_DOWN_PAYMENT',   'sum'),
            ( 'AMT_GOODS_PRICE',  'mean'),
            ( 'AMT_GOODS_PRICE',   'max'),
            ( 'AMT_GOODS_PRICE',   'sum')],
           )
# [(column[0]+'_'+column[1].upper()) for column in prev_amt_agg.columns]
prev_amt_agg.columns = ['PREV_' + ('_').join(column).upper() for column in prev_amt_agg.columns.ravel()]
prev_amt_agg.head()
PREV_SK_ID_CURR_COUNT PREV_AMT_CREDIT_MEAN PREV_AMT_CREDIT_MAX PREV_AMT_CREDIT_SUM PREV_AMT_ANNUITY_MEAN PREV_AMT_ANNUITY_MAX PREV_AMT_ANNUITY_SUM PREV_AMT_APPLICATION_MEAN PREV_AMT_APPLICATION_MAX PREV_AMT_APPLICATION_SUM PREV_AMT_DOWN_PAYMENT_MEAN PREV_AMT_DOWN_PAYMENT_MAX PREV_AMT_DOWN_PAYMENT_SUM PREV_AMT_GOODS_PRICE_MEAN PREV_AMT_GOODS_PRICE_MAX PREV_AMT_GOODS_PRICE_SUM
SK_ID_CURR
100001 1 23787.00 23787.0 23787.0 3951.000 3951.000 3951.000 24835.50 24835.5 24835.5 2520.0 2520.0 2520.0 24835.5 24835.5 24835.5
100002 1 179055.00 179055.0 179055.0 9251.775 9251.775 9251.775 179055.00 179055.0 179055.0 0.0 0.0 0.0 179055.0 179055.0 179055.0
100003 3 484191.00 1035882.0 1452573.0 56553.990 98356.995 169661.970 435436.50 900000.0 1306309.5 3442.5 6885.0 6885.0 435436.5 900000.0 1306309.5
100004 1 20106.00 20106.0 20106.0 5357.250 5357.250 5357.250 24282.00 24282.0 24282.0 4860.0 4860.0 4860.0 24282.0 24282.0 24282.0
100005 2 20076.75 40153.5 40153.5 4813.200 4813.200 4813.200 22308.75 44617.5 44617.5 4464.0 4464.0 4464.0 44617.5 44617.5 44617.5

prev 피처 가공. 대출 신청액 대비 다른 금액 차이 및 비율 생성.

prev['PREV_CREDIT_DIFF'] = prev['AMT_APPLICATION'] - prev['AMT_CREDIT']
prev['PREV_GOODS_DIFF'] = prev['AMT_APPLICATION'] - prev['AMT_GOODS_PRICE']
prev['PREV_CREDIT_APPL_RATIO'] = prev['AMT_CREDIT']/prev['AMT_APPLICATION']
prev['PREV_ANNUITY_APPL_RATIO'] = prev['AMT_ANNUITY']/prev['AMT_APPLICATION']
prev['PREV_GOODS_APPL_RATIO'] = prev['AMT_GOODS_PRICE']/prev['AMT_APPLICATION']

DAYS_XXX 피처의 365243 을 NULL로 변환하고, 첫번째 만기일과 마지막 만기일까지의 기간 가공

prev['DAYS_FIRST_DRAWING'].replace(365243, np.nan, inplace=True)
prev['DAYS_FIRST_DUE'].replace(365243, np.nan, inplace= True)
prev['DAYS_LAST_DUE_1ST_VERSION'].replace(365243, np.nan, inplace= True)
prev['DAYS_LAST_DUE'].replace(365243, np.nan, inplace= True)
prev['DAYS_TERMINATION'].replace(365243, np.nan, inplace= True)
# 첫번째 만기일과 마지막 만기일까지의 기간
prev['PREV_DAYS_LAST_DUE_DIFF'] = prev['DAYS_LAST_DUE_1ST_VERSION'] - prev['DAYS_LAST_DUE']

기존 이자율 관련 컬럼이 null이 많아서 새롭게 간단한 이자율을 대출 금액과 대출 금액 납부 횟수를 기반으로 계산

prev.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1670214 entries, 0 to 1670213
Data columns (total 43 columns):
SK_ID_PREV                     1670214 non-null int64
SK_ID_CURR                     1670214 non-null int64
NAME_CONTRACT_TYPE             1670214 non-null object
AMT_ANNUITY                    1297979 non-null float64
AMT_APPLICATION                1670214 non-null float64
AMT_CREDIT                     1670213 non-null float64
AMT_DOWN_PAYMENT               774370 non-null float64
AMT_GOODS_PRICE                1284699 non-null float64
WEEKDAY_APPR_PROCESS_START     1670214 non-null object
HOUR_APPR_PROCESS_START        1670214 non-null int64
FLAG_LAST_APPL_PER_CONTRACT    1670214 non-null object
NFLAG_LAST_APPL_IN_DAY         1670214 non-null int64
RATE_DOWN_PAYMENT              774370 non-null float64
RATE_INTEREST_PRIMARY          5951 non-null float64
RATE_INTEREST_PRIVILEGED       5951 non-null float64
NAME_CASH_LOAN_PURPOSE         1670214 non-null object
NAME_CONTRACT_STATUS           1670214 non-null object
DAYS_DECISION                  1670214 non-null int64
NAME_PAYMENT_TYPE              1670214 non-null object
CODE_REJECT_REASON             1670214 non-null object
NAME_TYPE_SUITE                849809 non-null object
NAME_CLIENT_TYPE               1670214 non-null object
NAME_GOODS_CATEGORY            1670214 non-null object
NAME_PORTFOLIO                 1670214 non-null object
NAME_PRODUCT_TYPE              1670214 non-null object
CHANNEL_TYPE                   1670214 non-null object
SELLERPLACE_AREA               1670214 non-null int64
NAME_SELLER_INDUSTRY           1670214 non-null object
CNT_PAYMENT                    1297984 non-null float64
NAME_YIELD_GROUP               1670214 non-null object
PRODUCT_COMBINATION            1669868 non-null object
DAYS_FIRST_DRAWING             62705 non-null float64
DAYS_FIRST_DUE                 956504 non-null float64
DAYS_LAST_DUE_1ST_VERSION      903285 non-null float64
DAYS_LAST_DUE                  785928 non-null float64
DAYS_TERMINATION               771236 non-null float64
NFLAG_INSURED_ON_APPROVAL      997149 non-null float64
PREV_CREDIT_DIFF               1670213 non-null float64
PREV_GOODS_DIFF                1284699 non-null float64
PREV_CREDIT_APPL_RATIO         1333446 non-null float64
PREV_ANNUITY_APPL_RATIO        1296344 non-null float64
PREV_GOODS_APPL_RATIO          1277830 non-null float64
PREV_DAYS_LAST_DUE_DIFF        761917 non-null float64
dtypes: float64(21), int64(6), object(16)
memory usage: 547.9+ MB

CNT_PAYMENT 는 전체 납부해야하는 횟수, AMT_ANNUITY 월 별 납부해야하는 금액 -> 이것으로 간단하게 이자율 계산

all_pay = prev['AMT_ANNUITY'] * prev['CNT_PAYMENT']
prev['PREV_INTERESTS_RATE'] = (all_pay/prev['AMT_CREDIT'] - 1)/prev['CNT_PAYMENT']
prev.iloc[:, -7:].head(10)
PREV_CREDIT_DIFF PREV_GOODS_DIFF PREV_CREDIT_APPL_RATIO PREV_ANNUITY_APPL_RATIO PREV_GOODS_APPL_RATIO PREV_DAYS_LAST_DUE_DIFF PREV_INTERESTS_RATE
0 0.0 0.0 1.000000 0.100929 1.0 342.0 0.017596
1 -72171.0 0.0 1.118800 0.041463 1.0 NaN 0.009282
2 -23944.5 0.0 1.212840 0.133873 1.0 NaN 0.027047
3 -20790.0 0.0 1.046200 0.104536 1.0 30.0 0.016587
4 -66555.0 0.0 1.197200 0.094591 1.0 NaN 0.037343
5 -25573.5 0.0 1.081186 0.075251 1.0 0.0 0.014044
6 0.0 NaN NaN NaN NaN NaN NaN
7 0.0 NaN NaN NaN NaN NaN NaN
8 0.0 NaN NaN NaN NaN NaN NaN
9 0.0 NaN NaN NaN NaN NaN NaN

기존 대출 금액, 대출 상태 관련 피처들과 이들을 가공하여 만들어진 새로운 컬럼들로 aggregation 수행.

agg_dict = {
     # 기존 컬럼. 
    'SK_ID_CURR':['count'],
    'AMT_CREDIT':['mean', 'max', 'sum'],
    'AMT_ANNUITY':['mean', 'max', 'sum'], 
    'AMT_APPLICATION':['mean', 'max', 'sum'],
    'AMT_DOWN_PAYMENT':['mean', 'max', 'sum'],
    'AMT_GOODS_PRICE':['mean', 'max', 'sum'],
    'RATE_DOWN_PAYMENT': ['min', 'max', 'mean'],
    'DAYS_DECISION': ['min', 'max', 'mean'],
    'CNT_PAYMENT': ['mean', 'sum'],
    # 가공 컬럼
    'PREV_CREDIT_DIFF':['mean', 'max', 'sum'], 
    'PREV_CREDIT_APPL_RATIO':['mean', 'max'],
    'PREV_GOODS_DIFF':['mean', 'max', 'sum'],
    'PREV_GOODS_APPL_RATIO':['mean', 'max'],
    'PREV_DAYS_LAST_DUE_DIFF':['mean', 'max', 'sum'],
    'PREV_INTERESTS_RATE':['mean', 'max']
}

prev_group = prev.groupby('SK_ID_CURR')
prev_amt_agg = prev_group.agg(agg_dict)
prev_amt_agg.columns = ['PREV_'+('_').join(column).upper() for column in prev_amt_agg.columns.ravel()]
prev_amt_agg.head(10)
PREV_SK_ID_CURR_COUNT PREV_AMT_CREDIT_MEAN PREV_AMT_CREDIT_MAX PREV_AMT_CREDIT_SUM PREV_AMT_ANNUITY_MEAN PREV_AMT_ANNUITY_MAX PREV_AMT_ANNUITY_SUM PREV_AMT_APPLICATION_MEAN PREV_AMT_APPLICATION_MAX PREV_AMT_APPLICATION_SUM PREV_AMT_DOWN_PAYMENT_MEAN PREV_AMT_DOWN_PAYMENT_MAX PREV_AMT_DOWN_PAYMENT_SUM PREV_AMT_GOODS_PRICE_MEAN PREV_AMT_GOODS_PRICE_MAX PREV_AMT_GOODS_PRICE_SUM PREV_RATE_DOWN_PAYMENT_MIN PREV_RATE_DOWN_PAYMENT_MAX PREV_RATE_DOWN_PAYMENT_MEAN PREV_DAYS_DECISION_MIN PREV_DAYS_DECISION_MAX PREV_DAYS_DECISION_MEAN PREV_CNT_PAYMENT_MEAN PREV_CNT_PAYMENT_SUM PREV_PREV_CREDIT_DIFF_MEAN PREV_PREV_CREDIT_DIFF_MAX PREV_PREV_CREDIT_DIFF_SUM PREV_PREV_CREDIT_APPL_RATIO_MEAN PREV_PREV_CREDIT_APPL_RATIO_MAX PREV_PREV_GOODS_DIFF_MEAN PREV_PREV_GOODS_DIFF_MAX PREV_PREV_GOODS_DIFF_SUM PREV_PREV_GOODS_APPL_RATIO_MEAN PREV_PREV_GOODS_APPL_RATIO_MAX PREV_PREV_DAYS_LAST_DUE_DIFF_MEAN PREV_PREV_DAYS_LAST_DUE_DIFF_MAX PREV_PREV_DAYS_LAST_DUE_DIFF_SUM PREV_PREV_INTERESTS_RATE_MEAN PREV_PREV_INTERESTS_RATE_MAX
SK_ID_CURR
100001 1 23787.000000 23787.0 23787.0 3951.000000 3951.000 3951.000 24835.500000 24835.5 24835.50 2520.000000 2520.0 2520.00 24835.500000 24835.5 24835.50 0.104326 0.104326 0.104326 -1740 -1740 -1740.000000 8.000000 8.0 1048.500000 1048.5 1048.50 0.957782 0.957782 0.0 0.0 0.0 1.0 1.0 120.0 120.0 120.0 0.041099 0.041099
100002 1 179055.000000 179055.0 179055.0 9251.775000 9251.775 9251.775 179055.000000 179055.0 179055.00 0.000000 0.0 0.00 179055.000000 179055.0 179055.00 0.000000 0.000000 0.000000 -606 -606 -606.000000 24.000000 24.0 0.000000 0.0 0.00 1.000000 1.000000 0.0 0.0 0.0 1.0 1.0 150.0 150.0 150.0 0.010003 0.010003
100003 3 484191.000000 1035882.0 1452573.0 56553.990000 98356.995 169661.970 435436.500000 900000.0 1306309.50 3442.500000 6885.0 6885.00 435436.500000 900000.0 1306309.50 0.000000 0.100061 0.050030 -2341 -746 -1305.000000 10.000000 30.0 -48754.500000 756.0 -146263.50 1.057664 1.150980 0.0 0.0 0.0 1.0 1.0 50.0 150.0 150.0 0.015272 0.018533
100004 1 20106.000000 20106.0 20106.0 5357.250000 5357.250 5357.250 24282.000000 24282.0 24282.00 4860.000000 4860.0 4860.00 24282.000000 24282.0 24282.00 0.212008 0.212008 0.212008 -815 -815 -815.000000 4.000000 4.0 4176.000000 4176.0 4176.00 0.828021 0.828021 0.0 0.0 0.0 1.0 1.0 30.0 30.0 30.0 0.016450 0.016450
100005 2 20076.750000 40153.5 40153.5 4813.200000 4813.200 4813.200 22308.750000 44617.5 44617.50 4464.000000 4464.0 4464.00 44617.500000 44617.5 44617.50 0.108964 0.108964 0.108964 -757 -315 -536.000000 12.000000 12.0 2232.000000 4464.0 4464.00 0.899950 0.899950 0.0 0.0 0.0 1.0 1.0 90.0 90.0 90.0 0.036537 0.036537
100006 9 291695.500000 906615.0 2625259.5 23651.175000 39954.510 141907.050 272203.260000 688500.0 2449829.34 34840.170000 66987.0 69680.34 408304.890000 688500.0 2449829.34 0.108994 0.217830 0.163412 -617 -181 -272.444444 23.000000 138.0 -19492.240000 66987.0 -175430.16 1.012684 1.316797 0.0 0.0 0.0 1.0 1.0 810.0 1410.0 1620.0 -inf 0.027424
100007 6 166638.750000 284400.0 999832.5 12278.805000 22678.785 73672.830 150530.250000 247500.0 903181.50 3390.750000 3676.5 6781.50 150530.250000 247500.0 903181.50 0.100143 0.218890 0.159516 -2357 -374 -1222.833333 20.666667 124.0 -16108.500000 2560.5 -96651.00 1.046356 1.264000 0.0 0.0 0.0 1.0 1.0 7.5 30.0 30.0 0.026538 0.036164
100008 5 162767.700000 501975.0 813838.5 15839.696250 25309.575 63358.785 155701.800000 450000.0 778509.00 5548.500000 12145.5 16645.50 194627.250000 450000.0 778509.00 0.000000 0.110243 0.073051 -2536 -82 -1192.000000 14.000000 56.0 -7065.900000 12145.5 -35329.50 0.978569 1.115500 0.0 0.0 0.0 1.0 1.0 165.0 660.0 660.0 0.018055 0.034034
100009 7 70137.642857 98239.5 490963.5 10051.412143 17341.605 70359.885 76741.714286 110160.0 537192.00 9203.142857 22032.0 64422.00 76741.714286 110160.0 537192.00 0.000000 0.209525 0.126602 -1562 -74 -719.285714 8.000000 56.0 6604.071429 17671.5 46228.50 0.916226 1.000000 0.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.019536 0.030703
100010 1 260811.000000 260811.0 260811.0 27463.410000 27463.410 27463.410 247212.000000 247212.0 247212.00 0.000000 0.0 0.00 247212.000000 247212.0 247212.00 0.000000 0.000000 0.000000 -1070 -1070 -1070.000000 10.000000 10.0 -13599.000000 -13599.0 -13599.00 1.055009 1.055009 0.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.005300 0.005300
prev_amt_agg.shape
(338857, 39)

SK_ID_CURR별로 NAME_CONTRACT_STATUS가 Refused 일 경우의 건수 및 과거 대출건 대비 비율

Group by 기준 컬럼 기반에서 다른 컬럼들의 기준에 따라 세분화된 aggregation 수행.

prev['NAME_CONTRACT_STATUS'].value_counts()
Approved        1036781
Canceled         316319
Refused          290678
Unused offer      26436
Name: NAME_CONTRACT_STATUS, dtype: int64

SQL Group by Case when 과 pandas의 차이

SK_ID_CURR레벨로 groupby 된 count와 name_contract_status가 Refused일 때의 count

select sk_id_curr, cnt_refused/cnt
from
(
    select sk_id_curr, count(*) cnt, count(case when name_contract_status == 'Refused' end) cnt_refused
    from previous group by sk_id_curr
) 

Pandas는 원 DataFrame 에 groupby 적용된 DataFrame 과 세부기준으로 filtering 된 DataFrame에 groupby 적용된 DataFrame 을 조인하여 생성.

  • NAME_CONTRACT_STATUS == 'Refused' 세부 기준으로 filtering 및 filtering 된 DataFrame에 groupby 적용
  • groupby 완료 후 기존 prev_amt_agg와 조인
  • 효율적인 오류 방지를 위해서 groupby 시 적용후 groupby key값을 DataFrame의 Index가 아닌 일반 컬럼으로 변경.
cond_refused = (prev['NAME_CONTRACT_STATUS'] == 'Refused')
prev_refused = prev[cond_refused]
prev_refused.shape, prev.shape
((290678, 44), (1670214, 44))
prev_refused_agg = prev_refused.groupby('SK_ID_CURR')['SK_ID_CURR'].count()
prev_refused_agg.shape, prev_amt_agg.shape
((118277,), (338857, 39))
prev_refused_agg
SK_ID_CURR
100006     1
100011     1
100027     1
100030    10
100035     8
          ..
456244     1
456247     1
456249     1
456250     1
456255     2
Name: SK_ID_CURR, Length: 118277, dtype: int64
# 칼럼과 인덱스 이름이 같아서 안된다. 일부러 error 냈음
prev_amt_agg.merge(prev_refused_agg, how='left', on='SK_ID_CURR')
---------------------------------------------------------------------------

ValueError                                Traceback (most recent call last)

<ipython-input-54-3bccc0d949fd> in <module>
----> 1 prev_amt_agg.merge(prev_refused_agg, how='left', on='SK_ID_CURR')


~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/frame.py in merge(self, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, copy, indicator, validate)
   7319         from pandas.core.reshape.merge import merge
   7320 
-> 7321         return merge(
   7322             self,
   7323             right,


~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/reshape/merge.py in merge(left, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, copy, indicator, validate)
     66     validate=None,
     67 ):
---> 68     op = _MergeOperation(
     69         left,
     70         right,


~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/reshape/merge.py in __init__(self, left, right, how, on, left_on, right_on, axis, left_index, right_index, sort, suffixes, copy, indicator, validate)
    624             self.right_join_keys,
    625             self.join_names,
--> 626         ) = self._get_merge_keys()
    627 
    628         # validate the merge keys dtypes. We may need to coerce


~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/reshape/merge.py in _get_merge_keys(self)
    973                     if not is_rkey(rk):
    974                         if rk is not None:
--> 975                             right_keys.append(right._get_label_or_level_values(rk))
    976                         else:
    977                             # work-around for merge_asof(right_index=True)


~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/generic.py in _get_label_or_level_values(self, key, axis)
   1767 
   1768         if self._is_label_reference(key, axis=axis):
-> 1769             self._check_label_or_level_ambiguity(key, axis=axis)
   1770             values = self.xs(key, axis=other_axes[0])._values
   1771         elif self._is_level_reference(key, axis=axis):


~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/generic.py in _check_label_or_level_ambiguity(self, key, axis)
   1727                 label_type=label_type,
   1728             )
-> 1729             raise ValueError(msg)
   1730 
   1731     def _get_label_or_level_values(self, key, axis=0):


ValueError: 'SK_ID_CURR' is both an index level and a column label, which is ambiguous.
pd.DataFrame(prev_refused_agg)
SK_ID_CURR
SK_ID_CURR
100006 1
100011 1
100027 1
100030 10
100035 8
... ...
456244 1
456247 1
456249 1
456250 1
456255 2

118277 rows × 1 columns

prev_refused_agg.reset_index(name='PREV_REFUSED_COUNT')
SK_ID_CURR PREV_REFUSED_COUNT
0 100006 1
1 100011 1
2 100027 1
3 100030 10
4 100035 8
... ... ...
118272 456244 1
118273 456247 1
118274 456249 1
118275 456250 1
118276 456255 2

118277 rows × 2 columns

prev_refused_agg = prev_refused_agg.reset_index(name='PREV_REFUSED_COUNT')
prev_amt_agg = prev_amt_agg.reset_index()

prev_amt_refused_agg = prev_amt_agg.merge(prev_refused_agg, on='SK_ID_CURR', how='left')
prev_amt_refused_agg.head(10)
SK_ID_CURR PREV_SK_ID_CURR_COUNT PREV_AMT_CREDIT_MEAN PREV_AMT_CREDIT_MAX PREV_AMT_CREDIT_SUM PREV_AMT_ANNUITY_MEAN PREV_AMT_ANNUITY_MAX PREV_AMT_ANNUITY_SUM PREV_AMT_APPLICATION_MEAN PREV_AMT_APPLICATION_MAX PREV_AMT_APPLICATION_SUM PREV_AMT_DOWN_PAYMENT_MEAN PREV_AMT_DOWN_PAYMENT_MAX PREV_AMT_DOWN_PAYMENT_SUM PREV_AMT_GOODS_PRICE_MEAN PREV_AMT_GOODS_PRICE_MAX PREV_AMT_GOODS_PRICE_SUM PREV_RATE_DOWN_PAYMENT_MIN PREV_RATE_DOWN_PAYMENT_MAX PREV_RATE_DOWN_PAYMENT_MEAN PREV_DAYS_DECISION_MIN PREV_DAYS_DECISION_MAX PREV_DAYS_DECISION_MEAN PREV_CNT_PAYMENT_MEAN PREV_CNT_PAYMENT_SUM PREV_PREV_CREDIT_DIFF_MEAN PREV_PREV_CREDIT_DIFF_MAX PREV_PREV_CREDIT_DIFF_SUM PREV_PREV_CREDIT_APPL_RATIO_MEAN PREV_PREV_CREDIT_APPL_RATIO_MAX PREV_PREV_GOODS_DIFF_MEAN PREV_PREV_GOODS_DIFF_MAX PREV_PREV_GOODS_DIFF_SUM PREV_PREV_GOODS_APPL_RATIO_MEAN PREV_PREV_GOODS_APPL_RATIO_MAX PREV_PREV_DAYS_LAST_DUE_DIFF_MEAN PREV_PREV_DAYS_LAST_DUE_DIFF_MAX PREV_PREV_DAYS_LAST_DUE_DIFF_SUM PREV_PREV_INTERESTS_RATE_MEAN PREV_PREV_INTERESTS_RATE_MAX PREV_REFUSED_COUNT
0 100001 1 23787.000000 23787.0 23787.0 3951.000000 3951.000 3951.000 24835.500000 24835.5 24835.50 2520.000000 2520.0 2520.00 24835.500000 24835.5 24835.50 0.104326 0.104326 0.104326 -1740 -1740 -1740.000000 8.000000 8.0 1048.500000 1048.5 1048.50 0.957782 0.957782 0.0 0.0 0.0 1.0 1.0 120.0 120.0 120.0 0.041099 0.041099 NaN
1 100002 1 179055.000000 179055.0 179055.0 9251.775000 9251.775 9251.775 179055.000000 179055.0 179055.00 0.000000 0.0 0.00 179055.000000 179055.0 179055.00 0.000000 0.000000 0.000000 -606 -606 -606.000000 24.000000 24.0 0.000000 0.0 0.00 1.000000 1.000000 0.0 0.0 0.0 1.0 1.0 150.0 150.0 150.0 0.010003 0.010003 NaN
2 100003 3 484191.000000 1035882.0 1452573.0 56553.990000 98356.995 169661.970 435436.500000 900000.0 1306309.50 3442.500000 6885.0 6885.00 435436.500000 900000.0 1306309.50 0.000000 0.100061 0.050030 -2341 -746 -1305.000000 10.000000 30.0 -48754.500000 756.0 -146263.50 1.057664 1.150980 0.0 0.0 0.0 1.0 1.0 50.0 150.0 150.0 0.015272 0.018533 NaN
3 100004 1 20106.000000 20106.0 20106.0 5357.250000 5357.250 5357.250 24282.000000 24282.0 24282.00 4860.000000 4860.0 4860.00 24282.000000 24282.0 24282.00 0.212008 0.212008 0.212008 -815 -815 -815.000000 4.000000 4.0 4176.000000 4176.0 4176.00 0.828021 0.828021 0.0 0.0 0.0 1.0 1.0 30.0 30.0 30.0 0.016450 0.016450 NaN
4 100005 2 20076.750000 40153.5 40153.5 4813.200000 4813.200 4813.200 22308.750000 44617.5 44617.50 4464.000000 4464.0 4464.00 44617.500000 44617.5 44617.50 0.108964 0.108964 0.108964 -757 -315 -536.000000 12.000000 12.0 2232.000000 4464.0 4464.00 0.899950 0.899950 0.0 0.0 0.0 1.0 1.0 90.0 90.0 90.0 0.036537 0.036537 NaN
5 100006 9 291695.500000 906615.0 2625259.5 23651.175000 39954.510 141907.050 272203.260000 688500.0 2449829.34 34840.170000 66987.0 69680.34 408304.890000 688500.0 2449829.34 0.108994 0.217830 0.163412 -617 -181 -272.444444 23.000000 138.0 -19492.240000 66987.0 -175430.16 1.012684 1.316797 0.0 0.0 0.0 1.0 1.0 810.0 1410.0 1620.0 -inf 0.027424 1.0
6 100007 6 166638.750000 284400.0 999832.5 12278.805000 22678.785 73672.830 150530.250000 247500.0 903181.50 3390.750000 3676.5 6781.50 150530.250000 247500.0 903181.50 0.100143 0.218890 0.159516 -2357 -374 -1222.833333 20.666667 124.0 -16108.500000 2560.5 -96651.00 1.046356 1.264000 0.0 0.0 0.0 1.0 1.0 7.5 30.0 30.0 0.026538 0.036164 NaN
7 100008 5 162767.700000 501975.0 813838.5 15839.696250 25309.575 63358.785 155701.800000 450000.0 778509.00 5548.500000 12145.5 16645.50 194627.250000 450000.0 778509.00 0.000000 0.110243 0.073051 -2536 -82 -1192.000000 14.000000 56.0 -7065.900000 12145.5 -35329.50 0.978569 1.115500 0.0 0.0 0.0 1.0 1.0 165.0 660.0 660.0 0.018055 0.034034 NaN
8 100009 7 70137.642857 98239.5 490963.5 10051.412143 17341.605 70359.885 76741.714286 110160.0 537192.00 9203.142857 22032.0 64422.00 76741.714286 110160.0 537192.00 0.000000 0.209525 0.126602 -1562 -74 -719.285714 8.000000 56.0 6604.071429 17671.5 46228.50 0.916226 1.000000 0.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.019536 0.030703 NaN
9 100010 1 260811.000000 260811.0 260811.0 27463.410000 27463.410 27463.410 247212.000000 247212.0 247212.00 0.000000 0.0 0.00 247212.000000 247212.0 247212.00 0.000000 0.000000 0.000000 -1070 -1070 -1070.000000 10.000000 10.0 -13599.000000 -13599.0 -13599.00 1.055009 1.055009 0.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.005300 0.005300 NaN
prev_amt_refused_agg['PREV_REFUSED_COUNT'].value_counts(dropna=False)
NaN     220580
1.0      54616
2.0      26793
3.0      14025
4.0       8243
5.0       4859
6.0       3147
7.0       2069
8.0       1247
9.0        871
10.0       656
11.0       437
12.0       340
13.0       233
14.0       157
15.0       149
16.0        94
17.0        62
18.0        54
19.0        48
20.0        28
21.0        25
22.0        21
24.0        18
25.0        12
23.0        11
26.0        11
27.0         8
28.0         5
29.0         5
32.0         4
41.0         3
31.0         3
37.0         3
30.0         3
39.0         2
35.0         2
34.0         2
33.0         2
36.0         2
68.0         1
47.0         1
43.0         1
54.0         1
51.0         1
64.0         1
58.0         1
Name: PREV_REFUSED_COUNT, dtype: int64

계산된 PREV_REFUSED_COUNT 중 Null값은 0 으로 변경하고 SK_ID_CURR 개별 건수 대비 PREV_REFUSED_COUNT 비율 계산

prev_amt_refused_agg = prev_amt_refused_agg.fillna(0)
prev_amt_refused_agg['PREV_REFUSE_RATIO'] = prev_amt_refused_agg['PREV_REFUSED_COUNT'] / prev_amt_refused_agg['PREV_SK_ID_CURR_COUNT']
prev_amt_refused_agg.head(10)
SK_ID_CURR PREV_SK_ID_CURR_COUNT PREV_AMT_CREDIT_MEAN PREV_AMT_CREDIT_MAX PREV_AMT_CREDIT_SUM PREV_AMT_ANNUITY_MEAN PREV_AMT_ANNUITY_MAX PREV_AMT_ANNUITY_SUM PREV_AMT_APPLICATION_MEAN PREV_AMT_APPLICATION_MAX PREV_AMT_APPLICATION_SUM PREV_AMT_DOWN_PAYMENT_MEAN PREV_AMT_DOWN_PAYMENT_MAX PREV_AMT_DOWN_PAYMENT_SUM PREV_AMT_GOODS_PRICE_MEAN PREV_AMT_GOODS_PRICE_MAX PREV_AMT_GOODS_PRICE_SUM PREV_RATE_DOWN_PAYMENT_MIN PREV_RATE_DOWN_PAYMENT_MAX PREV_RATE_DOWN_PAYMENT_MEAN PREV_DAYS_DECISION_MIN PREV_DAYS_DECISION_MAX PREV_DAYS_DECISION_MEAN PREV_CNT_PAYMENT_MEAN PREV_CNT_PAYMENT_SUM PREV_PREV_CREDIT_DIFF_MEAN PREV_PREV_CREDIT_DIFF_MAX PREV_PREV_CREDIT_DIFF_SUM PREV_PREV_CREDIT_APPL_RATIO_MEAN PREV_PREV_CREDIT_APPL_RATIO_MAX PREV_PREV_GOODS_DIFF_MEAN PREV_PREV_GOODS_DIFF_MAX PREV_PREV_GOODS_DIFF_SUM PREV_PREV_GOODS_APPL_RATIO_MEAN PREV_PREV_GOODS_APPL_RATIO_MAX PREV_PREV_DAYS_LAST_DUE_DIFF_MEAN PREV_PREV_DAYS_LAST_DUE_DIFF_MAX PREV_PREV_DAYS_LAST_DUE_DIFF_SUM PREV_PREV_INTERESTS_RATE_MEAN PREV_PREV_INTERESTS_RATE_MAX PREV_REFUSED_COUNT PREV_REFUSE_RATIO
0 100001 1 23787.000000 23787.0 23787.0 3951.000000 3951.000 3951.000 24835.500000 24835.5 24835.50 2520.000000 2520.0 2520.00 24835.500000 24835.5 24835.50 0.104326 0.104326 0.104326 -1740 -1740 -1740.000000 8.000000 8.0 1048.500000 1048.5 1048.50 0.957782 0.957782 0.0 0.0 0.0 1.0 1.0 120.0 120.0 120.0 0.041099 0.041099 0.0 0.000000
1 100002 1 179055.000000 179055.0 179055.0 9251.775000 9251.775 9251.775 179055.000000 179055.0 179055.00 0.000000 0.0 0.00 179055.000000 179055.0 179055.00 0.000000 0.000000 0.000000 -606 -606 -606.000000 24.000000 24.0 0.000000 0.0 0.00 1.000000 1.000000 0.0 0.0 0.0 1.0 1.0 150.0 150.0 150.0 0.010003 0.010003 0.0 0.000000
2 100003 3 484191.000000 1035882.0 1452573.0 56553.990000 98356.995 169661.970 435436.500000 900000.0 1306309.50 3442.500000 6885.0 6885.00 435436.500000 900000.0 1306309.50 0.000000 0.100061 0.050030 -2341 -746 -1305.000000 10.000000 30.0 -48754.500000 756.0 -146263.50 1.057664 1.150980 0.0 0.0 0.0 1.0 1.0 50.0 150.0 150.0 0.015272 0.018533 0.0 0.000000
3 100004 1 20106.000000 20106.0 20106.0 5357.250000 5357.250 5357.250 24282.000000 24282.0 24282.00 4860.000000 4860.0 4860.00 24282.000000 24282.0 24282.00 0.212008 0.212008 0.212008 -815 -815 -815.000000 4.000000 4.0 4176.000000 4176.0 4176.00 0.828021 0.828021 0.0 0.0 0.0 1.0 1.0 30.0 30.0 30.0 0.016450 0.016450 0.0 0.000000
4 100005 2 20076.750000 40153.5 40153.5 4813.200000 4813.200 4813.200 22308.750000 44617.5 44617.50 4464.000000 4464.0 4464.00 44617.500000 44617.5 44617.50 0.108964 0.108964 0.108964 -757 -315 -536.000000 12.000000 12.0 2232.000000 4464.0 4464.00 0.899950 0.899950 0.0 0.0 0.0 1.0 1.0 90.0 90.0 90.0 0.036537 0.036537 0.0 0.000000
5 100006 9 291695.500000 906615.0 2625259.5 23651.175000 39954.510 141907.050 272203.260000 688500.0 2449829.34 34840.170000 66987.0 69680.34 408304.890000 688500.0 2449829.34 0.108994 0.217830 0.163412 -617 -181 -272.444444 23.000000 138.0 -19492.240000 66987.0 -175430.16 1.012684 1.316797 0.0 0.0 0.0 1.0 1.0 810.0 1410.0 1620.0 -inf 0.027424 1.0 0.111111
6 100007 6 166638.750000 284400.0 999832.5 12278.805000 22678.785 73672.830 150530.250000 247500.0 903181.50 3390.750000 3676.5 6781.50 150530.250000 247500.0 903181.50 0.100143 0.218890 0.159516 -2357 -374 -1222.833333 20.666667 124.0 -16108.500000 2560.5 -96651.00 1.046356 1.264000 0.0 0.0 0.0 1.0 1.0 7.5 30.0 30.0 0.026538 0.036164 0.0 0.000000
7 100008 5 162767.700000 501975.0 813838.5 15839.696250 25309.575 63358.785 155701.800000 450000.0 778509.00 5548.500000 12145.5 16645.50 194627.250000 450000.0 778509.00 0.000000 0.110243 0.073051 -2536 -82 -1192.000000 14.000000 56.0 -7065.900000 12145.5 -35329.50 0.978569 1.115500 0.0 0.0 0.0 1.0 1.0 165.0 660.0 660.0 0.018055 0.034034 0.0 0.000000
8 100009 7 70137.642857 98239.5 490963.5 10051.412143 17341.605 70359.885 76741.714286 110160.0 537192.00 9203.142857 22032.0 64422.00 76741.714286 110160.0 537192.00 0.000000 0.209525 0.126602 -1562 -74 -719.285714 8.000000 56.0 6604.071429 17671.5 46228.50 0.916226 1.000000 0.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.019536 0.030703 0.0 0.000000
9 100010 1 260811.000000 260811.0 260811.0 27463.410000 27463.410 27463.410 247212.000000 247212.0 247212.00 0.000000 0.0 0.00 247212.000000 247212.0 247212.00 0.000000 0.000000 0.000000 -1070 -1070 -1070.000000 10.000000 10.0 -13599.000000 -13599.0 -13599.00 1.055009 1.055009 0.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.005300 0.005300 0.0 0.000000

세부 레벨 groupby 와 unstack()을 이용하여 SQL Group by Case when 구현. 세부 조건이 2개 이상일때

SQL 일 경우
SELECT COUNT(CASE WHEN == 'Approved' END) , COUNT(CASE WHEN == 'Refused' END) FROM PREV GROUP BY SK_ID_CURR

Pandas로 수행

prev_refused_appr_group = prev[prev['NAME_CONTRACT_STATUS'].isin(['Approved', 'Refused'])].groupby(['SK_ID_CURR', 'NAME_CONTRACT_STATUS'])
prev_refused_appr_agg = prev_refused_appr_group['SK_ID_CURR'].count().unstack()
prev_refused_appr_agg.head(10)
NAME_CONTRACT_STATUS Approved Refused
SK_ID_CURR
100001 1.0 NaN
100002 1.0 NaN
100003 3.0 NaN
100004 1.0 NaN
100005 1.0 NaN
100006 5.0 1.0
100007 6.0 NaN
100008 4.0 NaN
100009 7.0 NaN
100010 1.0 NaN

컬럼명 변경, Null 처리, 그리고 기존의 prev_amt_agg와 조인 후 데이터 가공

prev_refused_appr_agg = prev_refused_appr_agg.fillna(0)
prev_refused_appr_agg.columns = ['PREV_APPROVED_COUNT', 'PREV_REFUSED_COUNT']
prev_refused_appr_agg = prev_refused_appr_agg.reset_index()
prev_refused_appr_agg.head(10)
SK_ID_CURR PREV_APPROVED_COUNT PREV_REFUSED_COUNT
0 100001 1.0 0.0
1 100002 1.0 0.0
2 100003 3.0 0.0
3 100004 1.0 0.0
4 100005 1.0 0.0
5 100006 5.0 1.0
6 100007 6.0 0.0
7 100008 4.0 0.0
8 100009 7.0 0.0
9 100010 1.0 0.0
prev_amt_agg.head()
SK_ID_CURR PREV_SK_ID_CURR_COUNT PREV_AMT_CREDIT_MEAN PREV_AMT_CREDIT_MAX PREV_AMT_CREDIT_SUM PREV_AMT_ANNUITY_MEAN PREV_AMT_ANNUITY_MAX PREV_AMT_ANNUITY_SUM PREV_AMT_APPLICATION_MEAN PREV_AMT_APPLICATION_MAX PREV_AMT_APPLICATION_SUM PREV_AMT_DOWN_PAYMENT_MEAN PREV_AMT_DOWN_PAYMENT_MAX PREV_AMT_DOWN_PAYMENT_SUM PREV_AMT_GOODS_PRICE_MEAN PREV_AMT_GOODS_PRICE_MAX PREV_AMT_GOODS_PRICE_SUM PREV_RATE_DOWN_PAYMENT_MIN PREV_RATE_DOWN_PAYMENT_MAX PREV_RATE_DOWN_PAYMENT_MEAN PREV_DAYS_DECISION_MIN PREV_DAYS_DECISION_MAX PREV_DAYS_DECISION_MEAN PREV_CNT_PAYMENT_MEAN PREV_CNT_PAYMENT_SUM PREV_PREV_CREDIT_DIFF_MEAN PREV_PREV_CREDIT_DIFF_MAX PREV_PREV_CREDIT_DIFF_SUM PREV_PREV_CREDIT_APPL_RATIO_MEAN PREV_PREV_CREDIT_APPL_RATIO_MAX PREV_PREV_GOODS_DIFF_MEAN PREV_PREV_GOODS_DIFF_MAX PREV_PREV_GOODS_DIFF_SUM PREV_PREV_GOODS_APPL_RATIO_MEAN PREV_PREV_GOODS_APPL_RATIO_MAX PREV_PREV_DAYS_LAST_DUE_DIFF_MEAN PREV_PREV_DAYS_LAST_DUE_DIFF_MAX PREV_PREV_DAYS_LAST_DUE_DIFF_SUM PREV_PREV_INTERESTS_RATE_MEAN PREV_PREV_INTERESTS_RATE_MAX
0 100001 1 23787.00 23787.0 23787.0 3951.000 3951.000 3951.000 24835.50 24835.5 24835.5 2520.0 2520.0 2520.0 24835.5 24835.5 24835.5 0.104326 0.104326 0.104326 -1740 -1740 -1740.0 8.0 8.0 1048.5 1048.5 1048.5 0.957782 0.957782 0.0 0.0 0.0 1.0 1.0 120.0 120.0 120.0 0.041099 0.041099
1 100002 1 179055.00 179055.0 179055.0 9251.775 9251.775 9251.775 179055.00 179055.0 179055.0 0.0 0.0 0.0 179055.0 179055.0 179055.0 0.000000 0.000000 0.000000 -606 -606 -606.0 24.0 24.0 0.0 0.0 0.0 1.000000 1.000000 0.0 0.0 0.0 1.0 1.0 150.0 150.0 150.0 0.010003 0.010003
2 100003 3 484191.00 1035882.0 1452573.0 56553.990 98356.995 169661.970 435436.50 900000.0 1306309.5 3442.5 6885.0 6885.0 435436.5 900000.0 1306309.5 0.000000 0.100061 0.050030 -2341 -746 -1305.0 10.0 30.0 -48754.5 756.0 -146263.5 1.057664 1.150980 0.0 0.0 0.0 1.0 1.0 50.0 150.0 150.0 0.015272 0.018533
3 100004 1 20106.00 20106.0 20106.0 5357.250 5357.250 5357.250 24282.00 24282.0 24282.0 4860.0 4860.0 4860.0 24282.0 24282.0 24282.0 0.212008 0.212008 0.212008 -815 -815 -815.0 4.0 4.0 4176.0 4176.0 4176.0 0.828021 0.828021 0.0 0.0 0.0 1.0 1.0 30.0 30.0 30.0 0.016450 0.016450
4 100005 2 20076.75 40153.5 40153.5 4813.200 4813.200 4813.200 22308.75 44617.5 44617.5 4464.0 4464.0 4464.0 44617.5 44617.5 44617.5 0.108964 0.108964 0.108964 -757 -315 -536.0 12.0 12.0 2232.0 4464.0 4464.0 0.899950 0.899950 0.0 0.0 0.0 1.0 1.0 90.0 90.0 90.0 0.036537 0.036537
# prev_amt_agg와 조인. prev_amt_agg와 prev_refused_appr_agg 모두 SK_ID_CURR을 INDEX로 가지고 있음. 
prev_agg = prev_amt_agg.merge(prev_refused_appr_agg, on='SK_ID_CURR', how='left')
# SK_ID_CURR별 과거 대출건수 대비 APPROVED_COUNT 및 REFUSED_COUNT 비율 생성. 
prev_agg['PREV_REFUSED_RATIO'] = prev_agg['PREV_REFUSED_COUNT']/prev_agg['PREV_SK_ID_CURR_COUNT']
prev_agg['PREV_APPROVED_RATIO'] = prev_agg['PREV_APPROVED_COUNT']/prev_agg['PREV_SK_ID_CURR_COUNT']
# 'PREV_REFUSED_COUNT', 'PREV_APPROVED_COUNT' 컬럼 drop 
prev_agg = prev_agg.drop(['PREV_REFUSED_COUNT', 'PREV_APPROVED_COUNT'], axis=1)
# prev_amt_agg와 prev_refused_appr_agg INDEX인 SK_ID_CURR이 조인 후 정식 컬럼으로 생성됨. 
prev_agg.head(30)
SK_ID_CURR PREV_SK_ID_CURR_COUNT PREV_AMT_CREDIT_MEAN PREV_AMT_CREDIT_MAX PREV_AMT_CREDIT_SUM PREV_AMT_ANNUITY_MEAN PREV_AMT_ANNUITY_MAX PREV_AMT_ANNUITY_SUM PREV_AMT_APPLICATION_MEAN PREV_AMT_APPLICATION_MAX PREV_AMT_APPLICATION_SUM PREV_AMT_DOWN_PAYMENT_MEAN PREV_AMT_DOWN_PAYMENT_MAX PREV_AMT_DOWN_PAYMENT_SUM PREV_AMT_GOODS_PRICE_MEAN PREV_AMT_GOODS_PRICE_MAX PREV_AMT_GOODS_PRICE_SUM PREV_RATE_DOWN_PAYMENT_MIN PREV_RATE_DOWN_PAYMENT_MAX PREV_RATE_DOWN_PAYMENT_MEAN PREV_DAYS_DECISION_MIN PREV_DAYS_DECISION_MAX PREV_DAYS_DECISION_MEAN PREV_CNT_PAYMENT_MEAN PREV_CNT_PAYMENT_SUM PREV_PREV_CREDIT_DIFF_MEAN PREV_PREV_CREDIT_DIFF_MAX PREV_PREV_CREDIT_DIFF_SUM PREV_PREV_CREDIT_APPL_RATIO_MEAN PREV_PREV_CREDIT_APPL_RATIO_MAX PREV_PREV_GOODS_DIFF_MEAN PREV_PREV_GOODS_DIFF_MAX PREV_PREV_GOODS_DIFF_SUM PREV_PREV_GOODS_APPL_RATIO_MEAN PREV_PREV_GOODS_APPL_RATIO_MAX PREV_PREV_DAYS_LAST_DUE_DIFF_MEAN PREV_PREV_DAYS_LAST_DUE_DIFF_MAX PREV_PREV_DAYS_LAST_DUE_DIFF_SUM PREV_PREV_INTERESTS_RATE_MEAN PREV_PREV_INTERESTS_RATE_MAX PREV_REFUSED_RATIO PREV_APPROVED_RATIO
0 100001 1 23787.000000 23787.0 23787.000 3951.000000 3951.000 3951.000 24835.500000 24835.5 24835.500 2520.000000 2520.0 2520.00 24835.500000 24835.5 24835.500 0.104326 0.104326 0.104326 -1740 -1740 -1740.000000 8.000000 8.0 1048.500000 1048.5 1048.50 0.957782 0.957782 0.0 0.0 0.0 1.0 1.0 120.00 120.0 120.0 0.041099 0.041099 0.000000 1.000000
1 100002 1 179055.000000 179055.0 179055.000 9251.775000 9251.775 9251.775 179055.000000 179055.0 179055.000 0.000000 0.0 0.00 179055.000000 179055.0 179055.000 0.000000 0.000000 0.000000 -606 -606 -606.000000 24.000000 24.0 0.000000 0.0 0.00 1.000000 1.000000 0.0 0.0 0.0 1.0 1.0 150.00 150.0 150.0 0.010003 0.010003 0.000000 1.000000
2 100003 3 484191.000000 1035882.0 1452573.000 56553.990000 98356.995 169661.970 435436.500000 900000.0 1306309.500 3442.500000 6885.0 6885.00 435436.500000 900000.0 1306309.500 0.000000 0.100061 0.050030 -2341 -746 -1305.000000 10.000000 30.0 -48754.500000 756.0 -146263.50 1.057664 1.150980 0.0 0.0 0.0 1.0 1.0 50.00 150.0 150.0 0.015272 0.018533 0.000000 1.000000
3 100004 1 20106.000000 20106.0 20106.000 5357.250000 5357.250 5357.250 24282.000000 24282.0 24282.000 4860.000000 4860.0 4860.00 24282.000000 24282.0 24282.000 0.212008 0.212008 0.212008 -815 -815 -815.000000 4.000000 4.0 4176.000000 4176.0 4176.00 0.828021 0.828021 0.0 0.0 0.0 1.0 1.0 30.00 30.0 30.0 0.016450 0.016450 0.000000 1.000000
4 100005 2 20076.750000 40153.5 40153.500 4813.200000 4813.200 4813.200 22308.750000 44617.5 44617.500 4464.000000 4464.0 4464.00 44617.500000 44617.5 44617.500 0.108964 0.108964 0.108964 -757 -315 -536.000000 12.000000 12.0 2232.000000 4464.0 4464.00 0.899950 0.899950 0.0 0.0 0.0 1.0 1.0 90.00 90.0 90.0 0.036537 0.036537 0.000000 0.500000
5 100006 9 291695.500000 906615.0 2625259.500 23651.175000 39954.510 141907.050 272203.260000 688500.0 2449829.340 34840.170000 66987.0 69680.34 408304.890000 688500.0 2449829.340 0.108994 0.217830 0.163412 -617 -181 -272.444444 23.000000 138.0 -19492.240000 66987.0 -175430.16 1.012684 1.316797 0.0 0.0 0.0 1.0 1.0 810.00 1410.0 1620.0 -inf 0.027424 0.111111 0.555556
6 100007 6 166638.750000 284400.0 999832.500 12278.805000 22678.785 73672.830 150530.250000 247500.0 903181.500 3390.750000 3676.5 6781.50 150530.250000 247500.0 903181.500 0.100143 0.218890 0.159516 -2357 -374 -1222.833333 20.666667 124.0 -16108.500000 2560.5 -96651.00 1.046356 1.264000 0.0 0.0 0.0 1.0 1.0 7.50 30.0 30.0 0.026538 0.036164 0.000000 1.000000
7 100008 5 162767.700000 501975.0 813838.500 15839.696250 25309.575 63358.785 155701.800000 450000.0 778509.000 5548.500000 12145.5 16645.50 194627.250000 450000.0 778509.000 0.000000 0.110243 0.073051 -2536 -82 -1192.000000 14.000000 56.0 -7065.900000 12145.5 -35329.50 0.978569 1.115500 0.0 0.0 0.0 1.0 1.0 165.00 660.0 660.0 0.018055 0.034034 0.000000 0.800000
8 100009 7 70137.642857 98239.5 490963.500 10051.412143 17341.605 70359.885 76741.714286 110160.0 537192.000 9203.142857 22032.0 64422.00 76741.714286 110160.0 537192.000 0.000000 0.209525 0.126602 -1562 -74 -719.285714 8.000000 56.0 6604.071429 17671.5 46228.50 0.916226 1.000000 0.0 0.0 0.0 1.0 1.0 0.00 0.0 0.0 0.019536 0.030703 0.000000 1.000000
9 100010 1 260811.000000 260811.0 260811.000 27463.410000 27463.410 27463.410 247212.000000 247212.0 247212.000 0.000000 0.0 0.00 247212.000000 247212.0 247212.000 0.000000 0.000000 0.000000 -1070 -1070 -1070.000000 10.000000 10.0 -13599.000000 -13599.0 -13599.00 1.055009 1.055009 0.0 0.0 0.0 1.0 1.0 0.00 0.0 0.0 0.005300 0.005300 0.000000 1.000000
10 100011 4 261840.375000 732915.0 1047361.500 18303.195000 31295.250 54909.585 202732.875000 675000.0 810931.500 6797.250000 13594.5 13594.50 270310.500000 675000.0 810931.500 0.000000 0.100010 0.050005 -2508 -1162 -1784.500000 14.000000 42.0 -59107.500000 1485.0 -236430.00 inf inf 0.0 0.0 0.0 1.0 1.0 0.00 0.0 0.0 -inf 0.025367 0.250000 0.750000
11 100012 4 74119.500000 158508.0 296478.000 7894.155000 11188.035 23682.465 60930.000000 135000.0 243720.000 0.000000 0.0 0.00 81240.000000 135000.0 243720.000 0.000000 0.000000 0.000000 -1673 -107 -779.750000 18.000000 54.0 -13189.500000 0.0 -52758.00 1.236566 1.269700 0.0 0.0 0.0 1.0 1.0 100.00 180.0 300.0 0.033372 0.043775 0.000000 0.750000
12 100013 4 146134.125000 512370.0 584536.500 11478.195000 23153.985 34434.585 130871.250000 450000.0 523485.000 3375.000000 6750.0 6750.00 174495.000000 450000.0 523485.000 0.000000 0.134434 0.067217 -1999 -222 -837.500000 17.333333 52.0 -15262.875000 3096.0 -61051.50 1.052363 1.138600 0.0 0.0 0.0 1.0 1.0 200.00 570.0 600.0 0.027617 0.036400 0.000000 0.750000
13 100014 2 102834.000000 131868.0 205668.000 12806.550000 14045.625 25613.100 96536.250000 119272.5 193072.500 0.000000 0.0 0.00 96536.250000 119272.5 193072.500 0.000000 0.000000 0.000000 -844 -102 -473.000000 9.000000 18.0 -6297.750000 0.0 -12595.50 1.052801 1.105603 0.0 0.0 0.0 1.0 1.0 0.00 0.0 0.0 0.014020 0.023653 0.000000 1.000000
14 100015 1 67077.000000 67077.0 67077.000 7666.920000 7666.920 7666.920 68850.000000 68850.0 68850.000 6885.000000 6885.0 6885.00 68850.000000 68850.0 68850.000 0.101382 0.101382 0.101382 -2396 -2396 -2396.000000 10.000000 10.0 1773.000000 1773.0 1773.00 0.974248 0.974248 0.0 0.0 0.0 1.0 1.0 0.00 0.0 0.0 0.014300 0.014300 0.000000 1.000000
15 100016 4 106221.375000 232200.0 424885.500 8593.155000 14480.460 34372.620 120745.125000 290250.0 482980.500 15690.375000 58050.0 62761.50 120745.125000 290250.0 482980.500 0.000000 0.217818 0.081682 -2370 -1049 -1779.500000 17.000000 68.0 14523.750000 58050.0 58095.00 0.949761 1.099045 0.0 0.0 0.0 1.0 1.0 70.00 210.0 210.0 0.029580 0.033600 0.000000 1.000000
16 100017 2 175565.250000 198409.5 351130.500 14375.407500 16967.295 28750.815 176665.500000 195291.0 353331.000 19152.000000 22500.0 38304.00 176665.500000 195291.0 353331.000 0.102133 0.110926 0.106529 -2441 -612 -1526.500000 17.000000 34.0 1100.250000 5319.0 2200.50 0.991156 1.015968 0.0 0.0 0.0 1.0 1.0 0.00 0.0 0.0 0.014412 0.017723 0.000000 1.000000
17 100018 4 285989.591250 536364.0 1143958.365 38669.703750 94906.035 154678.815 298363.466250 571774.5 1193453.865 14295.375000 57181.5 57181.50 298363.466250 571774.5 1193453.865 0.000000 0.108917 0.027229 -1648 -188 -755.250000 10.000000 40.0 12373.875000 57181.5 49495.50 1.014609 1.158442 0.0 0.0 0.0 1.0 1.0 -9.00 0.0 -18.0 0.018384 0.038257 0.000000 1.000000
18 100019 1 104683.500000 104683.5 104683.500 10518.615000 10518.615 10518.615 84136.500000 84136.5 84136.500 0.000000 0.0 0.00 84136.500000 84136.5 84136.500 0.000000 0.000000 0.000000 -925 -925 -925.000000 12.000000 12.0 -20547.000000 -20547.0 -20547.00 1.244210 1.244210 0.0 0.0 0.0 1.0 1.0 150.00 150.0 150.0 0.017147 0.017147 0.000000 1.000000
19 100020 2 41706.000000 48325.5 83412.000 7254.517500 8779.770 14509.035 39206.250000 45900.0 78412.500 0.000000 0.0 0.00 39206.250000 45900.0 78412.500 0.000000 0.000000 0.000000 -472 -243 -357.500000 7.000000 14.0 -2499.750000 -2425.5 -4999.50 1.066006 1.079170 0.0 0.0 0.0 1.0 1.0 0.00 0.0 0.0 0.026651 0.038290 0.000000 1.000000
20 100021 6 153615.750000 675000.0 921694.500 10686.165000 33750.000 64116.990 40961.250000 99189.0 245767.500 3770.100000 9922.5 18850.50 49153.500000 99189.0 245767.500 0.000000 0.433539 0.106606 -2811 -261 -943.833333 7.000000 42.0 -112654.500000 8928.0 -675927.00 inf inf 0.0 0.0 0.0 1.0 1.0 90.00 180.0 270.0 -inf 0.034033 0.000000 1.000000
21 100022 1 103203.000000 103203.0 103203.000 6765.975000 6765.975 6765.975 89091.000000 89091.0 89091.000 0.000000 0.0 0.00 89091.000000 89091.0 89091.000 0.000000 0.000000 0.000000 -239 -239 -239.000000 18.000000 18.0 -14112.000000 -14112.0 -14112.00 1.158400 1.158400 0.0 0.0 0.0 1.0 1.0 NaN NaN 0.0 0.010004 0.010004 0.000000 1.000000
22 100023 4 113517.000000 239242.5 454068.000 8895.892500 16822.440 35583.570 98746.875000 180000.0 394987.500 8435.250000 9000.0 16870.50 98746.875000 180000.0 394987.500 0.095959 0.101380 0.098669 -2424 -262 -1215.750000 12.500000 50.0 -14770.125000 2025.0 -59080.50 1.080951 1.329125 0.0 0.0 0.0 1.0 1.0 240.00 600.0 720.0 -inf 0.028649 0.000000 1.000000
23 100025 8 295425.781875 1042560.0 2363406.255 27645.595714 55678.095 193519.170 259795.344375 900000.0 2078362.755 5845.500000 23382.0 23382.00 296908.965000 900000.0 2078362.755 0.000000 0.423065 0.105766 -2060 -7 -731.000000 13.714286 96.0 -35630.437500 21645.0 -285043.50 1.073930 1.269700 0.0 0.0 0.0 1.0 1.0 174.25 367.0 697.0 0.022884 0.027871 0.000000 0.875000
24 100026 3 292281.000000 744498.0 876843.000 33193.665000 42778.845 66387.330 269115.000000 675000.0 807345.000 0.000000 0.0 0.00 403672.500000 675000.0 807345.000 0.000000 0.000000 0.000000 -1427 -152 -682.333333 21.000000 42.0 -23166.000000 0.0 -69498.00 1.051480 1.102960 0.0 0.0 0.0 1.0 1.0 510.00 1020.0 1020.0 0.020701 0.029682 0.000000 0.666667
25 100027 4 173795.625000 239850.0 695182.500 24067.755000 28077.570 72203.265 163621.125000 226984.5 654484.500 0.000000 0.0 0.00 218161.500000 226984.5 654484.500 0.000000 0.000000 0.000000 -795 -181 -407.000000 11.333333 34.0 -10174.500000 0.0 -40698.00 1.062332 1.066000 0.0 0.0 0.0 1.0 1.0 0.00 0.0 0.0 0.014849 0.017250 0.250000 0.500000
26 100028 5 92920.500000 225000.0 464602.500 8091.585000 11250.000 24274.755 49207.500000 130765.5 246037.500 3750.000000 11250.0 11250.00 82012.500000 130765.5 246037.500 0.000000 0.173124 0.057708 -1805 -531 -1124.200000 11.333333 34.0 -43713.000000 6435.0 -218565.00 inf inf 0.0 0.0 0.0 1.0 1.0 75.00 150.0 150.0 -inf 0.017550 0.000000 0.600000
27 100029 1 126000.000000 126000.0 126000.000 22891.680000 22891.680 22891.680 126000.000000 126000.0 126000.000 0.000000 0.0 0.00 126000.000000 126000.0 126000.000 0.000000 0.000000 0.000000 -324 -324 -324.000000 6.000000 6.0 0.000000 0.0 0.00 1.000000 1.000000 0.0 0.0 0.0 1.0 1.0 60.00 60.0 60.0 0.015013 0.015013 0.000000 1.000000
28 100030 12 32534.643750 49734.0 390415.725 4731.041250 9035.685 37848.330 33497.268750 47241.0 401967.225 1262.625000 6714.0 15151.50 33497.268750 47241.0 401967.225 0.000000 0.210919 0.044804 -2856 -378 -1981.083333 8.625000 69.0 962.625000 5607.0 11551.50 0.965475 1.052772 0.0 0.0 0.0 1.0 1.0 60.00 90.0 120.0 0.017772 0.037233 0.833333 0.166667
29 100032 1 42255.000000 42255.0 42255.000 7910.145000 7910.145 7910.145 69255.000000 69255.0 69255.000 27000.000000 27000.0 27000.00 69255.000000 69255.0 69255.000 0.424597 0.424597 0.424597 -1497 -1497 -1497.000000 6.000000 6.0 27000.000000 27000.0 27000.00 0.610136 0.610136 0.0 0.0 0.0 1.0 1.0 0.00 0.0 0.0 0.020534 0.020534 0.000000 1.000000

가공된 최종 데이터 세트 생성

이전에 application 데이터 세트의 feature engineering 수행 후 새롭게 previous 데이터 세트로 가공된 데이터를 조인.

apps_all =  get_apps_processed(apps)
print(apps_all.shape, prev_agg.shape)
apps_all = apps_all.merge(prev_agg, on='SK_ID_CURR', how='left')
print(apps_all.shape)
(356255, 135) (338857, 42)
(356255, 176)
apps_all.head()
AMT_ANNUITY AMT_CREDIT AMT_GOODS_PRICE AMT_INCOME_TOTAL AMT_REQ_CREDIT_BUREAU_DAY AMT_REQ_CREDIT_BUREAU_HOUR AMT_REQ_CREDIT_BUREAU_MON AMT_REQ_CREDIT_BUREAU_QRT AMT_REQ_CREDIT_BUREAU_WEEK AMT_REQ_CREDIT_BUREAU_YEAR APARTMENTS_AVG APARTMENTS_MEDI APARTMENTS_MODE BASEMENTAREA_AVG BASEMENTAREA_MEDI BASEMENTAREA_MODE CNT_CHILDREN CNT_FAM_MEMBERS CODE_GENDER COMMONAREA_AVG COMMONAREA_MEDI COMMONAREA_MODE DAYS_BIRTH DAYS_EMPLOYED DAYS_ID_PUBLISH DAYS_LAST_PHONE_CHANGE DAYS_REGISTRATION DEF_30_CNT_SOCIAL_CIRCLE DEF_60_CNT_SOCIAL_CIRCLE ELEVATORS_AVG ELEVATORS_MEDI ELEVATORS_MODE EMERGENCYSTATE_MODE ENTRANCES_AVG ENTRANCES_MEDI ENTRANCES_MODE EXT_SOURCE_1 EXT_SOURCE_2 EXT_SOURCE_3 FLAG_CONT_MOBILE FLAG_DOCUMENT_10 FLAG_DOCUMENT_11 FLAG_DOCUMENT_12 FLAG_DOCUMENT_13 FLAG_DOCUMENT_14 FLAG_DOCUMENT_15 FLAG_DOCUMENT_16 FLAG_DOCUMENT_17 FLAG_DOCUMENT_18 FLAG_DOCUMENT_19 FLAG_DOCUMENT_2 FLAG_DOCUMENT_20 FLAG_DOCUMENT_21 FLAG_DOCUMENT_3 FLAG_DOCUMENT_4 FLAG_DOCUMENT_5 FLAG_DOCUMENT_6 FLAG_DOCUMENT_7 FLAG_DOCUMENT_8 FLAG_DOCUMENT_9 FLAG_EMAIL FLAG_EMP_PHONE FLAG_MOBIL FLAG_OWN_CAR FLAG_OWN_REALTY FLAG_PHONE FLAG_WORK_PHONE FLOORSMAX_AVG FLOORSMAX_MEDI FLOORSMAX_MODE FLOORSMIN_AVG FLOORSMIN_MEDI FLOORSMIN_MODE FONDKAPREMONT_MODE HOUR_APPR_PROCESS_START HOUSETYPE_MODE LANDAREA_AVG LANDAREA_MEDI LANDAREA_MODE LIVE_CITY_NOT_WORK_CITY LIVE_REGION_NOT_WORK_REGION LIVINGAPARTMENTS_AVG LIVINGAPARTMENTS_MEDI LIVINGAPARTMENTS_MODE LIVINGAREA_AVG LIVINGAREA_MEDI LIVINGAREA_MODE NAME_CONTRACT_TYPE NAME_EDUCATION_TYPE NAME_FAMILY_STATUS NAME_HOUSING_TYPE NAME_INCOME_TYPE NAME_TYPE_SUITE NONLIVINGAPARTMENTS_AVG NONLIVINGAPARTMENTS_MEDI NONLIVINGAPARTMENTS_MODE NONLIVINGAREA_AVG NONLIVINGAREA_MEDI NONLIVINGAREA_MODE OBS_30_CNT_SOCIAL_CIRCLE OBS_60_CNT_SOCIAL_CIRCLE OCCUPATION_TYPE ORGANIZATION_TYPE OWN_CAR_AGE REGION_POPULATION_RELATIVE REGION_RATING_CLIENT REGION_RATING_CLIENT_W_CITY REG_CITY_NOT_LIVE_CITY REG_CITY_NOT_WORK_CITY REG_REGION_NOT_LIVE_REGION REG_REGION_NOT_WORK_REGION SK_ID_CURR TARGET TOTALAREA_MODE WALLSMATERIAL_MODE WEEKDAY_APPR_PROCESS_START YEARS_BEGINEXPLUATATION_AVG YEARS_BEGINEXPLUATATION_MEDI YEARS_BEGINEXPLUATATION_MODE YEARS_BUILD_AVG YEARS_BUILD_MEDI YEARS_BUILD_MODE APPS_EXT_SOURCE_MEAN APPS_EXT_SOURCE_STD APPS_ANNUITY_CREDIT_RATIO APPS_GOODS_CREDIT_RATIO APPS_ANNUITY_INCOME_RATIO APPS_CREDIT_INCOME_RATIO APPS_GOODS_INCOME_RATIO APPS_CNT_FAM_INCOME_RATIO APPS_EMPLOYED_BIRTH_RATIO APPS_INCOME_EMPLOYED_RATIO APPS_INCOME_BIRTH_RATIO APPS_CAR_BIRTH_RATIO APPS_CAR_EMPLOYED_RATIO PREV_SK_ID_CURR_COUNT PREV_AMT_CREDIT_MEAN PREV_AMT_CREDIT_MAX PREV_AMT_CREDIT_SUM PREV_AMT_ANNUITY_MEAN PREV_AMT_ANNUITY_MAX PREV_AMT_ANNUITY_SUM PREV_AMT_APPLICATION_MEAN PREV_AMT_APPLICATION_MAX PREV_AMT_APPLICATION_SUM PREV_AMT_DOWN_PAYMENT_MEAN PREV_AMT_DOWN_PAYMENT_MAX PREV_AMT_DOWN_PAYMENT_SUM PREV_AMT_GOODS_PRICE_MEAN PREV_AMT_GOODS_PRICE_MAX PREV_AMT_GOODS_PRICE_SUM PREV_RATE_DOWN_PAYMENT_MIN PREV_RATE_DOWN_PAYMENT_MAX PREV_RATE_DOWN_PAYMENT_MEAN PREV_DAYS_DECISION_MIN PREV_DAYS_DECISION_MAX PREV_DAYS_DECISION_MEAN PREV_CNT_PAYMENT_MEAN PREV_CNT_PAYMENT_SUM PREV_PREV_CREDIT_DIFF_MEAN PREV_PREV_CREDIT_DIFF_MAX PREV_PREV_CREDIT_DIFF_SUM PREV_PREV_CREDIT_APPL_RATIO_MEAN PREV_PREV_CREDIT_APPL_RATIO_MAX PREV_PREV_GOODS_DIFF_MEAN PREV_PREV_GOODS_DIFF_MAX PREV_PREV_GOODS_DIFF_SUM PREV_PREV_GOODS_APPL_RATIO_MEAN PREV_PREV_GOODS_APPL_RATIO_MAX PREV_PREV_DAYS_LAST_DUE_DIFF_MEAN PREV_PREV_DAYS_LAST_DUE_DIFF_MAX PREV_PREV_DAYS_LAST_DUE_DIFF_SUM PREV_PREV_INTERESTS_RATE_MEAN PREV_PREV_INTERESTS_RATE_MAX PREV_REFUSED_RATIO PREV_APPROVED_RATIO
0 24700.5 406597.5 351000.0 202500.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0247 0.0250 0.0252 0.0369 0.0369 0.0383 0 1.0 M 0.0143 0.0144 0.0144 -9461 -637 -2120 -1134.0 -3648.0 2.0 2.0 0.00 0.00 0.0000 No 0.0690 0.0690 0.0690 0.083037 0.262949 0.139376 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 N Y 1 0 0.0833 0.0833 0.0833 0.1250 0.1250 0.1250 reg oper account 10 block of flats 0.0369 0.0375 0.0377 0 0 0.0202 0.0205 0.022 0.0190 0.0193 0.0198 Cash loans Secondary / secondary special Single / not married House / apartment Working Unaccompanied 0.0000 0.0000 0.0 0.0000 0.00 0.0 2.0 2.0 Laborers Business Entity Type 3 NaN 0.018801 2 2 0 0 0 0 100002 1.0 0.0149 Stone, brick WEDNESDAY 0.9722 0.9722 0.9722 0.6192 0.6243 0.6341 0.161787 0.092026 0.060749 0.863262 0.121978 2.007889 1.733333 202500.0 0.067329 -317.896389 -21.403657 NaN NaN 1.0 179055.00 179055.0 179055.0 9251.775 9251.775 9251.775 179055.00 179055.0 179055.00 0.00 0.0 0.00 179055.00 179055.0 179055.00 0.000000 0.000000 0.000000 -606.0 -606.0 -606.000000 24.000000 24.0 0.00 0.0 0.00 1.000000 1.000000 0.0 0.0 0.0 1.0 1.0 150.0 150.0 150.0 0.010003 0.010003 0.000000 1.000000
1 35698.5 1293502.5 1129500.0 270000.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0959 0.0968 0.0924 0.0529 0.0529 0.0538 0 2.0 F 0.0605 0.0608 0.0497 -16765 -1188 -291 -828.0 -1186.0 0.0 0.0 0.08 0.08 0.0806 No 0.0345 0.0345 0.0345 0.311267 0.622246 NaN 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 N N 1 0 0.2917 0.2917 0.2917 0.3333 0.3333 0.3333 reg oper account 11 block of flats 0.0130 0.0132 0.0128 0 0 0.0773 0.0787 0.079 0.0549 0.0558 0.0554 Cash loans Higher education Married House / apartment State servant Family 0.0039 0.0039 0.0 0.0098 0.01 0.0 1.0 1.0 Core staff School NaN 0.003541 1 1 0 0 0 0 100003 0.0 0.0714 Block MONDAY 0.9851 0.9851 0.9851 0.7960 0.7987 0.8040 0.466757 0.219895 0.027598 0.873211 0.132217 4.790750 4.183333 135000.0 0.070862 -227.272727 -16.104981 NaN NaN 3.0 484191.00 1035882.0 1452573.0 56553.990 98356.995 169661.970 435436.50 900000.0 1306309.50 3442.50 6885.0 6885.00 435436.50 900000.0 1306309.50 0.000000 0.100061 0.050030 -2341.0 -746.0 -1305.000000 10.000000 30.0 -48754.50 756.0 -146263.50 1.057664 1.150980 0.0 0.0 0.0 1.0 1.0 50.0 150.0 150.0 0.015272 0.018533 0.000000 1.000000
2 6750.0 135000.0 135000.0 67500.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN NaN NaN NaN NaN NaN 0 1.0 M NaN NaN NaN -19046 -225 -2531 -815.0 -4260.0 0.0 0.0 NaN NaN NaN NaN NaN NaN NaN NaN 0.555912 0.729567 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 Y Y 1 1 NaN NaN NaN NaN NaN NaN NaN 9 NaN NaN NaN NaN 0 0 NaN NaN NaN NaN NaN NaN Revolving loans Secondary / secondary special Single / not married House / apartment Working Unaccompanied NaN NaN NaN NaN NaN NaN 0.0 0.0 Laborers Government 26.0 0.010032 2 2 0 0 0 0 100004 0.0 NaN NaN MONDAY NaN NaN NaN NaN NaN NaN 0.642739 0.122792 0.050000 1.000000 0.100000 2.000000 2.000000 67500.0 0.011814 -300.000000 -3.544051 -0.001365 -0.115556 1.0 20106.00 20106.0 20106.0 5357.250 5357.250 5357.250 24282.00 24282.0 24282.00 4860.00 4860.0 4860.00 24282.00 24282.0 24282.00 0.212008 0.212008 0.212008 -815.0 -815.0 -815.000000 4.000000 4.0 4176.00 4176.0 4176.00 0.828021 0.828021 0.0 0.0 0.0 1.0 1.0 30.0 30.0 30.0 0.016450 0.016450 0.000000 1.000000
3 29686.5 312682.5 297000.0 135000.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0 2.0 F NaN NaN NaN -19005 -3039 -2437 -617.0 -9833.0 0.0 0.0 NaN NaN NaN NaN NaN NaN NaN NaN 0.650442 NaN 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 N Y 0 0 NaN NaN NaN NaN NaN NaN NaN 17 NaN NaN NaN NaN 0 0 NaN NaN NaN NaN NaN NaN Cash loans Secondary / secondary special Civil marriage House / apartment Working Unaccompanied NaN NaN NaN NaN NaN NaN 2.0 2.0 Laborers Business Entity Type 3 NaN 0.008019 2 2 0 0 0 0 100006 0.0 NaN NaN WEDNESDAY NaN NaN NaN NaN NaN NaN 0.650442 0.151008 0.094941 0.949845 0.219900 2.316167 2.200000 67500.0 0.159905 -44.422507 -7.103394 NaN NaN 9.0 291695.50 906615.0 2625259.5 23651.175 39954.510 141907.050 272203.26 688500.0 2449829.34 34840.17 66987.0 69680.34 408304.89 688500.0 2449829.34 0.108994 0.217830 0.163412 -617.0 -181.0 -272.444444 23.000000 138.0 -19492.24 66987.0 -175430.16 1.012684 1.316797 0.0 0.0 0.0 1.0 1.0 810.0 1410.0 1620.0 -inf 0.027424 0.111111 0.555556
4 21865.5 513000.0 513000.0 121500.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN NaN NaN NaN NaN NaN 0 1.0 M NaN NaN NaN -19932 -3038 -3458 -1106.0 -4311.0 0.0 0.0 NaN NaN NaN NaN NaN NaN NaN NaN 0.322738 NaN 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 N Y 0 0 NaN NaN NaN NaN NaN NaN NaN 11 NaN NaN NaN NaN 1 0 NaN NaN NaN NaN NaN NaN Cash loans Secondary / secondary special Single / not married House / apartment Working Unaccompanied NaN NaN NaN NaN NaN NaN 0.0 0.0 Core staff Religion NaN 0.028663 2 2 0 1 0 0 100007 0.0 NaN NaN THURSDAY NaN NaN NaN NaN NaN NaN 0.322738 0.151008 0.042623 1.000000 0.179963 4.222222 4.222222 121500.0 0.152418 -39.993417 -6.095725 NaN NaN 6.0 166638.75 284400.0 999832.5 12278.805 22678.785 73672.830 150530.25 247500.0 903181.50 3390.75 3676.5 6781.50 150530.25 247500.0 903181.50 0.100143 0.218890 0.159516 -2357.0 -374.0 -1222.833333 20.666667 124.0 -16108.50 2560.5 -96651.00 1.046356 1.264000 0.0 0.0 0.0 1.0 1.0 7.5 30.0 30.0 0.026538 0.036164 0.000000 1.000000
apps_all.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 356255 entries, 0 to 356254
Columns: 176 entries, AMT_ANNUITY to PREV_APPROVED_RATIO
dtypes: float64(120), int64(40), object(16)
memory usage: 481.1+ MB

데이터 레이블 인코딩, NULL값은 LightGBM 내부에서 처리하도록 특별한 변경하지 않음.

object_columns = apps_all.dtypes[apps_all.dtypes == 'object'].index.tolist()
for column in object_columns:
    apps_all[column] = pd.factorize(apps_all[column])[0]
apps_all.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 356255 entries, 0 to 356254
Columns: 176 entries, AMT_ANNUITY to PREV_APPROVED_RATIO
dtypes: float64(120), int64(56)
memory usage: 481.1 MB

학습 데이터와 테스트 데이터 다시 분리

apps_all_train = apps_all[~apps_all['TARGET'].isnull()]
apps_all_test = apps_all[apps_all['TARGET'].isnull()]

apps_all_test = apps_all_test.drop('TARGET', axis=1)
apps_all_train.columns.tolist()
['AMT_ANNUITY',
 'AMT_CREDIT',
 'AMT_GOODS_PRICE',
 'AMT_INCOME_TOTAL',
 'AMT_REQ_CREDIT_BUREAU_DAY',
 'AMT_REQ_CREDIT_BUREAU_HOUR',
 'AMT_REQ_CREDIT_BUREAU_MON',
 'AMT_REQ_CREDIT_BUREAU_QRT',
 'AMT_REQ_CREDIT_BUREAU_WEEK',
 'AMT_REQ_CREDIT_BUREAU_YEAR',
 'APARTMENTS_AVG',
 'APARTMENTS_MEDI',
 'APARTMENTS_MODE',
 'BASEMENTAREA_AVG',
 'BASEMENTAREA_MEDI',
 'BASEMENTAREA_MODE',
 'CNT_CHILDREN',
 'CNT_FAM_MEMBERS',
 'CODE_GENDER',
 'COMMONAREA_AVG',
 'COMMONAREA_MEDI',
 'COMMONAREA_MODE',
 'DAYS_BIRTH',
 'DAYS_EMPLOYED',
 'DAYS_ID_PUBLISH',
 'DAYS_LAST_PHONE_CHANGE',
 'DAYS_REGISTRATION',
 'DEF_30_CNT_SOCIAL_CIRCLE',
 'DEF_60_CNT_SOCIAL_CIRCLE',
 'ELEVATORS_AVG',
 'ELEVATORS_MEDI',
 'ELEVATORS_MODE',
 'EMERGENCYSTATE_MODE',
 'ENTRANCES_AVG',
 'ENTRANCES_MEDI',
 'ENTRANCES_MODE',
 'EXT_SOURCE_1',
 'EXT_SOURCE_2',
 'EXT_SOURCE_3',
 'FLAG_CONT_MOBILE',
 'FLAG_DOCUMENT_10',
 'FLAG_DOCUMENT_11',
 'FLAG_DOCUMENT_12',
 'FLAG_DOCUMENT_13',
 'FLAG_DOCUMENT_14',
 'FLAG_DOCUMENT_15',
 'FLAG_DOCUMENT_16',
 'FLAG_DOCUMENT_17',
 'FLAG_DOCUMENT_18',
 'FLAG_DOCUMENT_19',
 'FLAG_DOCUMENT_2',
 'FLAG_DOCUMENT_20',
 'FLAG_DOCUMENT_21',
 'FLAG_DOCUMENT_3',
 'FLAG_DOCUMENT_4',
 'FLAG_DOCUMENT_5',
 'FLAG_DOCUMENT_6',
 'FLAG_DOCUMENT_7',
 'FLAG_DOCUMENT_8',
 'FLAG_DOCUMENT_9',
 'FLAG_EMAIL',
 'FLAG_EMP_PHONE',
 'FLAG_MOBIL',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'FLAG_PHONE',
 'FLAG_WORK_PHONE',
 'FLOORSMAX_AVG',
 'FLOORSMAX_MEDI',
 'FLOORSMAX_MODE',
 'FLOORSMIN_AVG',
 'FLOORSMIN_MEDI',
 'FLOORSMIN_MODE',
 'FONDKAPREMONT_MODE',
 'HOUR_APPR_PROCESS_START',
 'HOUSETYPE_MODE',
 'LANDAREA_AVG',
 'LANDAREA_MEDI',
 'LANDAREA_MODE',
 'LIVE_CITY_NOT_WORK_CITY',
 'LIVE_REGION_NOT_WORK_REGION',
 'LIVINGAPARTMENTS_AVG',
 'LIVINGAPARTMENTS_MEDI',
 'LIVINGAPARTMENTS_MODE',
 'LIVINGAREA_AVG',
 'LIVINGAREA_MEDI',
 'LIVINGAREA_MODE',
 'NAME_CONTRACT_TYPE',
 'NAME_EDUCATION_TYPE',
 'NAME_FAMILY_STATUS',
 'NAME_HOUSING_TYPE',
 'NAME_INCOME_TYPE',
 'NAME_TYPE_SUITE',
 'NONLIVINGAPARTMENTS_AVG',
 'NONLIVINGAPARTMENTS_MEDI',
 'NONLIVINGAPARTMENTS_MODE',
 'NONLIVINGAREA_AVG',
 'NONLIVINGAREA_MEDI',
 'NONLIVINGAREA_MODE',
 'OBS_30_CNT_SOCIAL_CIRCLE',
 'OBS_60_CNT_SOCIAL_CIRCLE',
 'OCCUPATION_TYPE',
 'ORGANIZATION_TYPE',
 'OWN_CAR_AGE',
 'REGION_POPULATION_RELATIVE',
 'REGION_RATING_CLIENT',
 'REGION_RATING_CLIENT_W_CITY',
 'REG_CITY_NOT_LIVE_CITY',
 'REG_CITY_NOT_WORK_CITY',
 'REG_REGION_NOT_LIVE_REGION',
 'REG_REGION_NOT_WORK_REGION',
 'SK_ID_CURR',
 'TARGET',
 'TOTALAREA_MODE',
 'WALLSMATERIAL_MODE',
 'WEEKDAY_APPR_PROCESS_START',
 'YEARS_BEGINEXPLUATATION_AVG',
 'YEARS_BEGINEXPLUATATION_MEDI',
 'YEARS_BEGINEXPLUATATION_MODE',
 'YEARS_BUILD_AVG',
 'YEARS_BUILD_MEDI',
 'YEARS_BUILD_MODE',
 'APPS_EXT_SOURCE_MEAN',
 'APPS_EXT_SOURCE_STD',
 'APPS_ANNUITY_CREDIT_RATIO',
 'APPS_GOODS_CREDIT_RATIO',
 'APPS_ANNUITY_INCOME_RATIO',
 'APPS_CREDIT_INCOME_RATIO',
 'APPS_GOODS_INCOME_RATIO',
 'APPS_CNT_FAM_INCOME_RATIO',
 'APPS_EMPLOYED_BIRTH_RATIO',
 'APPS_INCOME_EMPLOYED_RATIO',
 'APPS_INCOME_BIRTH_RATIO',
 'APPS_CAR_BIRTH_RATIO',
 'APPS_CAR_EMPLOYED_RATIO',
 'PREV_SK_ID_CURR_COUNT',
 'PREV_AMT_CREDIT_MEAN',
 'PREV_AMT_CREDIT_MAX',
 'PREV_AMT_CREDIT_SUM',
 'PREV_AMT_ANNUITY_MEAN',
 'PREV_AMT_ANNUITY_MAX',
 'PREV_AMT_ANNUITY_SUM',
 'PREV_AMT_APPLICATION_MEAN',
 'PREV_AMT_APPLICATION_MAX',
 'PREV_AMT_APPLICATION_SUM',
 'PREV_AMT_DOWN_PAYMENT_MEAN',
 'PREV_AMT_DOWN_PAYMENT_MAX',
 'PREV_AMT_DOWN_PAYMENT_SUM',
 'PREV_AMT_GOODS_PRICE_MEAN',
 'PREV_AMT_GOODS_PRICE_MAX',
 'PREV_AMT_GOODS_PRICE_SUM',
 'PREV_RATE_DOWN_PAYMENT_MIN',
 'PREV_RATE_DOWN_PAYMENT_MAX',
 'PREV_RATE_DOWN_PAYMENT_MEAN',
 'PREV_DAYS_DECISION_MIN',
 'PREV_DAYS_DECISION_MAX',
 'PREV_DAYS_DECISION_MEAN',
 'PREV_CNT_PAYMENT_MEAN',
 'PREV_CNT_PAYMENT_SUM',
 'PREV_PREV_CREDIT_DIFF_MEAN',
 'PREV_PREV_CREDIT_DIFF_MAX',
 'PREV_PREV_CREDIT_DIFF_SUM',
 'PREV_PREV_CREDIT_APPL_RATIO_MEAN',
 'PREV_PREV_CREDIT_APPL_RATIO_MAX',
 'PREV_PREV_GOODS_DIFF_MEAN',
 'PREV_PREV_GOODS_DIFF_MAX',
 'PREV_PREV_GOODS_DIFF_SUM',
 'PREV_PREV_GOODS_APPL_RATIO_MEAN',
 'PREV_PREV_GOODS_APPL_RATIO_MAX',
 'PREV_PREV_DAYS_LAST_DUE_DIFF_MEAN',
 'PREV_PREV_DAYS_LAST_DUE_DIFF_MAX',
 'PREV_PREV_DAYS_LAST_DUE_DIFF_SUM',
 'PREV_PREV_INTERESTS_RATE_MEAN',
 'PREV_PREV_INTERESTS_RATE_MAX',
 'PREV_REFUSED_RATIO',
 'PREV_APPROVED_RATIO']

학습 데이터를 검증 데이터로 분리하고 LGBM Classifier로 학습 수행.

from sklearn.model_selection import train_test_split

ftr_app = apps_all_train.drop(['SK_ID_CURR', 'TARGET'], axis=1)
target_app = apps_all_train['TARGET']

train_x, valid_x, train_y, valid_y = train_test_split(ftr_app, target_app, test_size=0.3, random_state=2020)
train_x.shape, valid_x.shape
((215257, 174), (92254, 174))
from lightgbm import LGBMClassifier

clf = LGBMClassifier(
        n_jobs=-1,
        n_estimators=1000,
        learning_rate=0.02,
        num_leaves=32,
        subsample=0.8,
        max_depth=12,
        silent=-1,
        verbose=-1
        )

clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], eval_metric= 'auc', verbose= 100, 
        early_stopping_rounds= 50)
Training until validation scores don't improve for 50 rounds
[100]	training's auc: 0.76611	training's binary_logloss: 0.245964	valid_1's auc: 0.753701	valid_1's binary_logloss: 0.248684
[200]	training's auc: 0.789025	training's binary_logloss: 0.237779	valid_1's auc: 0.766044	valid_1's binary_logloss: 0.244034
[300]	training's auc: 0.804705	training's binary_logloss: 0.232402	valid_1's auc: 0.770975	valid_1's binary_logloss: 0.242307
[400]	training's auc: 0.81739	training's binary_logloss: 0.228102	valid_1's auc: 0.773264	valid_1's binary_logloss: 0.241518
[500]	training's auc: 0.828598	training's binary_logloss: 0.224327	valid_1's auc: 0.773917	valid_1's binary_logloss: 0.241253
[600]	training's auc: 0.839337	training's binary_logloss: 0.220751	valid_1's auc: 0.77427	valid_1's binary_logloss: 0.241111
[700]	training's auc: 0.848905	training's binary_logloss: 0.217397	valid_1's auc: 0.774436	valid_1's binary_logloss: 0.241006
[800]	training's auc: 0.857485	training's binary_logloss: 0.214237	valid_1's auc: 0.774647	valid_1's binary_logloss: 0.240922
[900]	training's auc: 0.866027	training's binary_logloss: 0.211076	valid_1's auc: 0.774776	valid_1's binary_logloss: 0.24087
Early stopping, best iteration is:
[853]	training's auc: 0.862238	training's binary_logloss: 0.212526	valid_1's auc: 0.774799	valid_1's binary_logloss: 0.240865





LGBMClassifier(learning_rate=0.02, max_depth=12, n_estimators=1000,
               num_leaves=32, silent=-1, subsample=0.8, verbose=-1)
from lightgbm import plot_importance

plot_importance(clf, figsize=(16, 32))

학습된 Classifier를 이용하여 테스트 데이터 예측하고 결과를 Kaggle로 Submit 수행.

preds = clf.predict_proba(apps_all_test.drop('SK_ID_CURR', axis=1))[:, 1 ]
apps_all_test['TARGET'] = preds
apps_all_test[['SK_ID_CURR', 'TARGET']].to_csv('prev_baseline_01.csv', index=False)
차수Private ScorePublic Score설명
1차0.740880.74448application 데이터 세트 기본 preprocessing
2차0.754580.75882application 데이터 세트 Feature Engineering
3차0.763960.77579previous 데이터 세트 Feature Engineering

지금까지 만든 로직을 별도의 함수로 생성.

from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier

# 신청금액과 실제 대출액, 상품금액과의 차이 비율, 만기일 차이 비교, 이자율 계산등의 주요 컬럼 가공 생산. 
def get_prev_processed(prev):
    # 대출 신청 금액과 실제 대출액/대출 상품금액 차이 및 비율
    prev['PREV_CREDIT_DIFF'] = prev['AMT_APPLICATION'] - prev['AMT_CREDIT']
    prev['PREV_GOODS_DIFF'] = prev['AMT_APPLICATION'] - prev['AMT_GOODS_PRICE']
    prev['PREV_CREDIT_APPL_RATIO'] = prev['AMT_CREDIT']/prev['AMT_APPLICATION']
    # prev['PREV_ANNUITY_APPL_RATIO'] = prev['AMT_ANNUITY']/prev['AMT_APPLICATION']
    prev['PREV_GOODS_APPL_RATIO'] = prev['AMT_GOODS_PRICE']/prev['AMT_APPLICATION']
    
    prev['DAYS_FIRST_DRAWING'].replace(365243, np.nan, inplace= True)
    prev['DAYS_FIRST_DUE'].replace(365243, np.nan, inplace= True)
    prev['DAYS_LAST_DUE_1ST_VERSION'].replace(365243, np.nan, inplace= True)
    prev['DAYS_LAST_DUE'].replace(365243, np.nan, inplace= True)
    prev['DAYS_TERMINATION'].replace(365243, np.nan, inplace= True)
    # 첫번째 만기일과 마지막 만기일까지의 기간
    prev['PREV_DAYS_LAST_DUE_DIFF'] = prev['DAYS_LAST_DUE_1ST_VERSION'] - prev['DAYS_LAST_DUE']
    # 매월 납부 금액과 납부 횟수 곱해서 전체 납부 금액 구함. 
    all_pay = prev['AMT_ANNUITY'] * prev['CNT_PAYMENT']
    # 전체 납부 금액 대비 AMT_CREDIT 비율을 구하고 여기에 다시 납부횟수로 나누어서 이자율 계산. 
    prev['PREV_INTERESTS_RATE'] = (all_pay/prev['AMT_CREDIT'] - 1)/prev['CNT_PAYMENT']
        
    return prev

# 기존 컬럼및 위에서 가공된 신규 컬럼들에 대해서 SK_ID_CURR 레벨로 Aggregation 수행.  
def get_prev_amt_agg(prev):

    agg_dict = {
         # 기존 주요 컬럼들을 SK_ID_CURR 레벨로 Aggregation 수행. . 
        'SK_ID_CURR':['count'],
        'AMT_CREDIT':['mean', 'max', 'sum'],
        'AMT_ANNUITY':['mean', 'max', 'sum'], 
        'AMT_APPLICATION':['mean', 'max', 'sum'],
        'AMT_DOWN_PAYMENT':['mean', 'max', 'sum'],
        'AMT_GOODS_PRICE':['mean', 'max', 'sum'],
        'RATE_DOWN_PAYMENT': ['min', 'max', 'mean'],
        'DAYS_DECISION': ['min', 'max', 'mean'],
        'CNT_PAYMENT': ['mean', 'sum'],
        # 신규 가공 컬럼들을 SK_ID_CURR 레벨로 Aggregation 수행. .
        'PREV_CREDIT_DIFF':['mean', 'max', 'sum'], 
        'PREV_CREDIT_APPL_RATIO':['mean', 'max'],
        'PREV_GOODS_DIFF':['mean', 'max', 'sum'],
        'PREV_GOODS_APPL_RATIO':['mean', 'max'],
        'PREV_DAYS_LAST_DUE_DIFF':['mean', 'max', 'sum'],
        'PREV_INTERESTS_RATE':['mean', 'max']
    }

    prev_group = prev.groupby('SK_ID_CURR')
    prev_amt_agg = prev_group.agg(agg_dict)

    # multi index 컬럼을 '_'로 연결하여 컬럼명 변경
    prev_amt_agg.columns = ["PREV_"+ "_".join(x).upper() for x in prev_amt_agg.columns.ravel()]
    
    # 'SK_ID_CURR'로 조인하기 위해 SK_ID_CURR을 컬럼으로 변환  
    prev_amt_agg = prev_amt_agg.reset_index()
    
    return prev_amt_agg

# NAME_CONTRACT_STATUS의 SK_ID_CURR별 Approved, Refused의 건수 계산.  
def get_prev_refused_appr_agg(prev):
    # 원래 groupby 컬럼 + 세부 기준 컬럼으로 groupby 수행. 세분화된 레벨로 aggregation 수행 한 뒤에 unstack()으로 컬럼레벨로 변형. 
    prev_refused_appr_group = prev[prev['NAME_CONTRACT_STATUS'].isin(['Approved', 'Refused'])].groupby([ 'SK_ID_CURR', 'NAME_CONTRACT_STATUS'])
    prev_refused_appr_agg = prev_refused_appr_group['SK_ID_CURR'].count().unstack()
    # 컬럼명 변경. 
    prev_refused_appr_agg.columns = ['PREV_APPROVED_COUNT', 'PREV_REFUSED_COUNT' ]
    # NaN값은 모두 0으로 변경. 
    prev_refused_appr_agg = prev_refused_appr_agg.fillna(0)
    
    # 'SK_ID_CURR'로 조인하기 위해 SK_ID_CURR을 컬럼으로 변환  
    prev_refused_appr_agg = prev_refused_appr_agg.reset_index()
    
    return prev_refused_appr_agg

    
# 앞에서 구한 prev_amt_agg와 prev_refused_appr_agg를 조인하고 SK_ID_CURR별 APPROVED_COUNT 및 REFUSED_COUNT 비율 생성
def get_prev_agg(prev):
    prev = get_prev_processed(prev)
    prev_amt_agg = get_prev_amt_agg(prev)
    prev_refused_appr_agg = get_prev_refused_appr_agg(prev)
    
    # prev_amt_agg와 조인. 
    prev_agg = prev_amt_agg.merge(prev_refused_appr_agg, on='SK_ID_CURR', how='left')
    # SK_ID_CURR별 과거 대출건수 대비 APPROVED_COUNT 및 REFUSED_COUNT 비율 생성. 
    prev_agg['PREV_REFUSED_RATIO'] = prev_agg['PREV_REFUSED_COUNT']/prev_agg['PREV_SK_ID_CURR_COUNT']
    prev_agg['PREV_APPROVED_RATIO'] = prev_agg['PREV_APPROVED_COUNT']/prev_agg['PREV_SK_ID_CURR_COUNT']
    # 'PREV_REFUSED_COUNT', 'PREV_APPROVED_COUNT' 컬럼 drop 
    prev_agg = prev_agg.drop(['PREV_REFUSED_COUNT', 'PREV_APPROVED_COUNT'], axis=1)
    
    return prev_agg

# apps와 previous 데이터 세트를 SK_ID_CURR레벨로 다양한 컬럼이 aggregation되어 있는 prev_agg 조인
def get_apps_all_with_prev_agg(apps, prev):
    apps_all =  get_apps_processed(apps)
    prev_agg = get_prev_agg(prev)
    print('prev_agg shape:', prev_agg.shape)
    print('apps_all before merge shape:', apps_all.shape)
    apps_all = apps_all.merge(prev_agg, on='SK_ID_CURR', how='left')
    print('apps_all after merge with prev_agg shape:', apps_all.shape)
    
    return apps_all

# Label 인코딩 수행. 
def get_apps_all_encoded(apps_all):
    object_columns = apps_all.dtypes[apps_all.dtypes == 'object'].index.tolist()
    for column in object_columns:
        apps_all[column] = pd.factorize(apps_all[column])[0]
    
    return apps_all

# 학습 데이터와 테스트 데이터 세트 분리. 
def get_apps_all_train_test(apps_all):
    apps_all_train = apps_all[~apps_all['TARGET'].isnull()]
    apps_all_test = apps_all[apps_all['TARGET'].isnull()]

    apps_all_test = apps_all_test.drop('TARGET', axis=1)
    
    return apps_all_train, apps_all_test

# 학습 수행. 
def train_apps_all(apps_all_train):
    ftr_app = apps_all_train.drop(['SK_ID_CURR', 'TARGET'], axis=1)
    target_app = apps_all_train['TARGET']

    train_x, valid_x, train_y, valid_y = train_test_split(ftr_app, target_app, test_size=0.3, random_state=2020)
    print('train shape:', train_x.shape, 'valid shape:', valid_x.shape)
    clf = LGBMClassifier(
            n_jobs=-1,
            n_estimators=1000,
            learning_rate=0.02,
            num_leaves=32,
            subsample=0.8,
            max_depth=12,
            silent=-1,
            verbose=-1
                )

    clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], eval_metric= 'auc', verbose= 100, 
                early_stopping_rounds= 100)
    
    return clf

함수를 호출하여 재학습 및 평가

def get_dataset():
    app_train = pd.read_csv('application_train.csv')
    app_test = pd.read_csv('application_test.csv')
    
    apps = pd.concat([app_train, app_test])
    prev = pd.read_csv('previous_application.csv')
    return apps, prev
apps, prev = get_dataset()
apps_all = get_apps_all_with_prev_agg(apps, prev)
apps_all = get_apps_all_encoded(apps_all)
apps_all_train, apps_all_test = get_apps_all_train_test(apps_all)
clf = train_apps_all(apps_all_train)
prev_agg shape: (338857, 42)
apps_all before merge shape: (356255, 135)
apps_all after merge with prev_agg shape: (356255, 176)
train shape: (215257, 174) valid shape: (92254, 174)
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.76611	training's binary_logloss: 0.245964	valid_1's auc: 0.753701	valid_1's binary_logloss: 0.248684
[200]	training's auc: 0.789025	training's binary_logloss: 0.237779	valid_1's auc: 0.766044	valid_1's binary_logloss: 0.244034
[300]	training's auc: 0.804705	training's binary_logloss: 0.232402	valid_1's auc: 0.770975	valid_1's binary_logloss: 0.242307
[400]	training's auc: 0.81739	training's binary_logloss: 0.228102	valid_1's auc: 0.773264	valid_1's binary_logloss: 0.241518
[500]	training's auc: 0.828598	training's binary_logloss: 0.224327	valid_1's auc: 0.773917	valid_1's binary_logloss: 0.241253
[600]	training's auc: 0.839337	training's binary_logloss: 0.220751	valid_1's auc: 0.77427	valid_1's binary_logloss: 0.241111
[700]	training's auc: 0.848905	training's binary_logloss: 0.217397	valid_1's auc: 0.774436	valid_1's binary_logloss: 0.241006
[800]	training's auc: 0.857485	training's binary_logloss: 0.214237	valid_1's auc: 0.774647	valid_1's binary_logloss: 0.240922
[900]	training's auc: 0.866027	training's binary_logloss: 0.211076	valid_1's auc: 0.774776	valid_1's binary_logloss: 0.24087
[1000]	training's auc: 0.873512	training's binary_logloss: 0.208114	valid_1's auc: 0.774799	valid_1's binary_logloss: 0.240861
Did not meet early stopping. Best iteration is:
[1000]	training's auc: 0.873512	training's binary_logloss: 0.208114	valid_1's auc: 0.774799	valid_1's binary_logloss: 0.240861
preds = clf.predict_proba(apps_all_test.drop(['SK_ID_CURR'], axis=1))[:, 1 ]
apps_all_test['TARGET'] = preds
apps_all_test[['SK_ID_CURR', 'TARGET']].to_csv('prev_baseline_02.csv', index=False)
차수Private ScorePublic Score설명
1차0.740880.74448application 데이터 세트 기본 preprocessing
2차0.754580.75882application 데이터 세트 Feature Engineering
3차0.763960.77579previous 데이터 세트 Feature Engineering
4차0.764200.77583previous 데이터 세트 Feature Engineering 함수화

Source: inflearn / 캐글 Advanced 머신러닝 실전 박치기

profile
데이터 분석 공부용 벨로그

0개의 댓글