Z-Score(표준점수): (관측값 - 데이터의 평균) / 데이터의 표준편차, 즉 z = (x - μ) / σ
# Inspect the column names, non-null counts, and dtypes of cc_df
cc_df.info()
# 출력:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 491134 entries, 0 to 491133
Data columns (total 22 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 trans_date_trans_time 491134 non-null object
1 cc_num 491134 non-null int64
2 merchant 491134 non-null object
3 category 491134 non-null object
4 amt 491134 non-null float64
5 first 491134 non-null object
6 last 491134 non-null object
7 gender 491134 non-null object
8 street 491134 non-null object
9 city 491134 non-null object
10 state 491134 non-null object
11 zip 491134 non-null int64
12 lat 491134 non-null float64
13 long 491134 non-null float64
14 city_pop 491134 non-null int64
15 job 491134 non-null object
16 dob 491134 non-null object
17 trans_num 491134 non-null object
18 unix_time 491134 non-null int64
19 merch_lat 491134 non-null float64
20 merch_long 491134 non-null float64
21 is_fraud 491134 non-null int64
# For each cc_num value, compute the mean and standard deviation of amt and
# store the result in amt_info (columns: cc_num, mean, std).
# cc_num: account number
# amt: amount
amt_info = cc_df.groupby('cc_num')['amt'].agg(['mean', 'std']).reset_index()
# Merge cc_df with amt_info on the cc_num column so every transaction row
# carries its account-level mean/std.
cc_df = cc_df.merge(amt_info, on='cc_num', how='left')
# Account-level z-score: (amt - account mean) / account std.
# NOTE: pandas' std is the sample standard deviation, so a group with a
# single transaction has std NaN (and therefore a NaN z-score); a group whose
# amounts are all identical has std 0 and yields NaN/inf.
cc_df['amt_z'] = (cc_df['amt'] - cc_df['mean']) / cc_df['std']
# For each (cc_num, category) pair, compute the mean and std of amt.
cat_info = cc_df.groupby(['cc_num', 'category'])['amt'].agg(['mean', 'std']).reset_index()
# Merge cc_df with cat_info. cc_df already has 'mean'/'std' columns from the
# account-level merge above; without explicit suffixes pandas would rename
# both sides to mean_x/mean_y (std_x/std_y) and the lookup below would fail.
# Keep the account-level names unchanged and tag the category-level ones.
cc_df = cc_df.merge(
    cat_info,
    on=['cc_num', 'category'],
    how='left',
    suffixes=('', '_cat'),
)
# Category-level z-score of the payment amount:
# (amt - mean within account & category) / std within account & category.
cc_df['cat_amt_z'] = (cc_df['amt'] - cc_df['mean_cat']) / cc_df['std_cat']