import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.graphics.mosaicplot import mosaic
from scipy import stats as spst
import statsmodels.api as sm
import warnings
# Suppress every warning for the rest of the session. NOTE(review): this also
# hides pandas chained-assignment and library deprecation warnings.
warnings.filterwarnings(action='ignore')
# Mobile churn data: download the CSV and discard the columns that are not
# used anywhere in this analysis.
data_path = "https://raw.githubusercontent.com/DA4BAM/dataset/master/mobile_NA2.csv"
unused_columns = ['id', 'REPORTED_USAGE_LEVEL', 'OVER_15MINS_CALLS_PER_MONTH']
mobile = pd.read_csv(data_path).drop(columns=unused_columns)
mobile.head()
COLLEGE | INCOME | OVERAGE | LEFTOVER | HOUSE | HANDSET_PRICE | AVERAGE_CALL_DURATION | REPORTED_SATISFACTION | CONSIDERING_CHANGE_OF_PLAN | CHURN | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 47711 | 183 | 17 | 730589.0 | 192 | 5 | unsat | considering | 0 |
1 | 0 | 74132 | 191 | 43 | 535092.0 | 349 | 2 | unsat | no | 1 |
2 | 1 | 150419 | 0 | 14 | 204004.0 | 682 | 6 | unsat | considering | 0 |
3 | 0 | 159567 | 0 | 58 | 281969.0 | 634 | 1 | very_unsat | never_thought | 0 |
4 | 1 | 23392 | 0 | 0 | 216707.0 | 233 | 15 | unsat | no | 1 |
변수 명 | 내용 | 구분 |
---|---|---|
COLLEGE | 대학졸업 여부(1,0) | |
INCOME | 연 수입액(달러) | |
OVERAGE | 월 초과사용 시간(분) | |
LEFTOVER | 월 사용 잔여시간(%) | |
HOUSE | 집 가격(달러) | |
HANDSET_PRICE | 핸드폰 가격(달러) | |
AVERAGE_CALL_DURATION | 평균 통화시간(분) | |
REPORTED_SATISFACTION | 만족도 설문('very_unsat', 'unsat', 'avg', 'sat', 'very_sat' ) | |
CONSIDERING_CHANGE_OF_PLAN | 변경 계획 설문('never_thought', 'no', 'perhaps', 'considering', 'actively_looking_into_it') | |
CHURN | 이탈여부(1 : 이탈, 0 : 잔류) | Label |
① 변수의 비즈니스 의미
② 숫자? 범주?
③ 전처리 사항
④ 기초통계량
⑤ 분포
⑥ 위 정보로부터 파악한 내용(비즈니스!) 정리
⑦ 추가 분석해볼 사항 도출
변수의 비즈니스 의미: 고객의 대학졸업 여부(1, 0)
범주형 데이터
전처리 사항:
# 4. Basic statistics: count per category, then each count divided by the
#    total number of rows.
college_counts = mobile['COLLEGE'].value_counts()
print(college_counts)
print('-' * 50)
print(college_counts / len(mobile['COLLEGE']))
1 9049
0 8951
Name: COLLEGE, dtype: int64
--------------------------------------------------
1 0.502722
0 0.497278
Name: COLLEGE, dtype: float64
# 5. Distribution: bar chart of the counts (left) and pie chart of the
#    shares (right).
cnt = mobile['COLLEGE'].value_counts()
levels, freqs = cnt.index, cnt.values
_, (bar_ax, pie_ax) = plt.subplots(1, 2)
sns.barplot(x=levels, y=freqs, ax=bar_ax)
pie_ax.pie(freqs, labels=levels, autopct='%.2f%%')
plt.show()
# 4. Basic statistics
income_summary = mobile['INCOME'].describe()
print(income_summary)

# 5. Distribution: histogram (left) and kernel density estimate (right)
fig, (hist_ax, kde_ax) = plt.subplots(1, 2, figsize=(16, 4))
sns.histplot(data=mobile, x='INCOME', bins=30, ax=hist_ax)
sns.kdeplot(data=mobile, x='INCOME', ax=kde_ax)
fig.tight_layout()
plt.show()
count 18000.000000
mean 80314.400278
std 41703.420130
min 20007.000000
25% 42151.500000
50% 75501.500000
75% 116082.250000
max 159983.000000
Name: INCOME, dtype: float64
# 4. Basic statistics
handset_summary = mobile['HANDSET_PRICE'].describe()
print(handset_summary)

# 5. Distribution: histogram (left) and kernel density estimate (right)
fig, (hist_ax, kde_ax) = plt.subplots(1, 2, figsize=(16, 4))
sns.histplot(data=mobile, x='HANDSET_PRICE', bins=30, ax=hist_ax)
sns.kdeplot(data=mobile, x='HANDSET_PRICE', ax=kde_ax)
fig.tight_layout()
plt.show()
count 18000.000000
mean 390.096667
std 214.239639
min 130.000000
25% 219.000000
50% 326.000000
75% 536.000000
max 899.000000
Name: HANDSET_PRICE, dtype: float64
# 4. Basic statistics
duration_summary = mobile['AVERAGE_CALL_DURATION'].describe()
print(duration_summary)

# 5. Distribution: histogram (left) and kernel density estimate (right)
fig, (hist_ax, kde_ax) = plt.subplots(1, 2, figsize=(16, 4))
sns.histplot(data=mobile, x='AVERAGE_CALL_DURATION', bins=30, ax=hist_ax)
sns.kdeplot(data=mobile, x='AVERAGE_CALL_DURATION', ax=kde_ax)
fig.tight_layout()
plt.show()
count 18000.000000
mean 5.990944
std 4.398894
min 1.000000
25% 2.000000
50% 5.000000
75% 10.000000
max 15.000000
Name: AVERAGE_CALL_DURATION, dtype: float64
# 4. Basic statistics: count per answer, then each count divided by the total
#    number of rows.
cnt = mobile['REPORTED_SATISFACTION'].value_counts()
print(cnt)
print('-' * 50)
print(cnt / len(mobile['REPORTED_SATISFACTION']))

# 5. Distribution: bar chart of the counts (left) and pie chart (right)
fig, (bar_ax, pie_ax) = plt.subplots(1, 2, figsize=(16, 4))
sns.barplot(x=cnt.index, y=cnt.values, ax=bar_ax)
pie_ax.pie(cnt.values, labels=cnt.index, autopct='%.2f%%')
fig.tight_layout()
plt.show()
very_unsat 7072
very_sat 4562
unsat 3590
avg 1806
sat 932
Name: REPORTED_SATISFACTION, dtype: int64
--------------------------------------------------
very_unsat 0.392889
very_sat 0.253444
unsat 0.199444
avg 0.100333
sat 0.051778
Name: REPORTED_SATISFACTION, dtype: float64
# 4. Basic statistics: count per answer, then each count divided by the total
#    number of rows.
cnt = mobile['CONSIDERING_CHANGE_OF_PLAN'].value_counts()
print(cnt)
print('-' * 50)
print(cnt / len(mobile['CONSIDERING_CHANGE_OF_PLAN']))

# 5. Distribution: bar chart of the counts (left) and pie chart (right)
fig, (bar_ax, pie_ax) = plt.subplots(1, 2, figsize=(16, 4))
sns.barplot(x=cnt.index, y=cnt.values, ax=bar_ax)
pie_ax.pie(cnt.values, labels=cnt.index, autopct='%.2f%%')
fig.tight_layout()
plt.show()
considering 7141
actively_looking_into_it 4453
no 3644
never_thought 1804
perhaps 958
Name: CONSIDERING_CHANGE_OF_PLAN, dtype: int64
--------------------------------------------------
considering 0.396722
actively_looking_into_it 0.247389
no 0.202444
never_thought 0.100222
perhaps 0.053222
Name: CONSIDERING_CHANGE_OF_PLAN, dtype: float64
# 4. Basic statistics: count per class, then each count divided by the total
#    number of rows.
cnt = mobile['CHURN'].value_counts()
print(cnt)
print('-' * 50)
print(cnt / len(mobile['CHURN']))

# 5. Distribution: bar chart of the counts (left) and pie chart (right)
fig, (bar_ax, pie_ax) = plt.subplots(1, 2, figsize=(16, 4))
sns.barplot(x=cnt.index, y=cnt.values, ax=bar_ax)
pie_ax.pie(cnt.values, labels=cnt.index, autopct='%.2f%%')
fig.tight_layout()
plt.show()
0 9131
1 8869
Name: CHURN, dtype: int64
--------------------------------------------------
0 0.507278
1 0.492722
Name: CHURN, dtype: float64
target = 'CHURN'
feature = 'COLLEGE'

# Row-normalised cross table: share of each target class within every level
# of the feature.
table = pd.crosstab(mobile[feature], mobile[target], normalize='index')

# 1. Print the cross table
print('교차표\n', table)
print('-' * 100)

# 2. Visualisation: stacked bars per feature level.
# DataFrame.plot.bar opens its own figure, so the size has to be passed to it
# directly; a preceding plt.figure(figsize=...) only leaves an empty figure.
ax = table.plot.bar(stacked=True, figsize=(16, 4))
# Reference line at the overall share of target == 0 (1 - mean of the 0/1 target).
ax.axhline(1 - mobile[target].mean(), color='r')
plt.show()
교차표
CHURN 0 1
COLLEGE
0 0.515697 0.484303
1 0.498950 0.501050
----------------------------------------------------------------------------------------------------
<Figure size 1152x288 with 0 Axes>
# Mosaic plot of feature vs. target, with a red reference line at
# 1 - mean(target) computed over all rows.
reference_level = 1 - mobile[target].mean()
mosaic(mobile, [feature, target])
plt.axhline(reference_level, color='r')
plt.show()
# 3. Hypothesis test (chi-square test of independence)
# chi2_contingency expects observed frequencies. Feeding it the row-normalised
# proportion table makes every row (almost) identical in scale and yields a
# meaningless statistic of ~0 with p-value ~1, so build a count table here.
observed = pd.crosstab(mobile[feature], mobile[target])
result = spst.chi2_contingency(observed)
print('카이제곱통계량', result[0])
print('p-value', result[1])
print('기대빈도\n',result[3])
카이제곱통계량 0.0
p-value 1.0
기대빈도
[[0.50732337 0.49267663]
[0.50732337 0.49267663]]
# Next categorical feature to compare against the target.
feature = 'REPORTED_SATISFACTION'
# Bare expression: shows the DataFrame when it is the last line of a notebook cell.
mobile
COLLEGE | INCOME | OVERAGE | LEFTOVER | HOUSE | HANDSET_PRICE | AVERAGE_CALL_DURATION | REPORTED_SATISFACTION | CONSIDERING_CHANGE_OF_PLAN | CHURN | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 47711 | 183 | 17 | 730589.0 | 192 | 5 | unsat | considering | 0 |
1 | 0 | 74132 | 191 | 43 | 535092.0 | 349 | 2 | unsat | no | 1 |
2 | 1 | 150419 | 0 | 14 | 204004.0 | 682 | 6 | unsat | considering | 0 |
3 | 0 | 159567 | 0 | 58 | 281969.0 | 634 | 1 | very_unsat | never_thought | 0 |
4 | 1 | 23392 | 0 | 0 | 216707.0 | 233 | 15 | unsat | no | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
17995 | 0 | 31079 | 200 | 40 | NaN | 181 | 11 | unsat | actively_looking_into_it | 0 |
17996 | 0 | 137194 | 63 | 0 | NaN | 787 | 11 | very_unsat | no | 1 |
17997 | 0 | 82931 | 0 | 77 | NaN | 232 | 2 | very_unsat | never_thought | 0 |
17998 | 0 | 124391 | 0 | 19 | NaN | 862 | 2 | unsat | no | 0 |
17999 | 0 | 104849 | 223 | 5 | NaN | 463 | 5 | very_unsat | considering | 1 |
18000 rows × 10 columns
# Dummy coding: 1 for 'very_sat' / 'sat', 0 for every other answer.
# Rows with a missing answer are dropped first. .copy() makes `temp` an
# independent frame so adding a column does not write into a slice of `mobile`.
temp = mobile.loc[mobile[feature].notnull()].copy()
temp['Satisfaction'] = temp[feature].isin(['very_sat', 'sat']).astype(int)

feature = 'Satisfaction'

# Row-normalised cross table: share of each target class within every level
# of the dummy variable.
table = pd.crosstab(temp[feature], temp[target], normalize='index')

# 1. Print the cross table
print('교차표\n', table)
print('-' * 100)

# 2. Visualisation: stacked bars per dummy level.
# The figure size is passed to plot.bar itself because pandas opens its own
# figure; a preceding plt.figure(figsize=...) only leaves an empty figure.
ax = table.plot.bar(stacked=True, figsize=(16, 4))
# Reference line at the overall share of target == 0. The mean must be taken
# over the target column, not over the dummy feature.
ax.axhline(1 - temp[target].mean(), color='r')
plt.show()
교차표
CHURN 0 1
Satisfaction
0 0.504251 0.495749
1 0.514379 0.485621
----------------------------------------------------------------------------------------------------
<Figure size 1152x288 with 0 Axes>
# Mosaic plot of feature vs. target on the filtered frame, with a red
# reference line at 1 - mean(target).
reference_level = 1 - temp[target].mean()
mosaic(temp, [feature, target])
plt.axhline(reference_level, color='r')
plt.show()
# 3. Hypothesis test (chi-square test of independence)
# chi2_contingency expects observed frequencies. The row-normalised proportion
# table gives a meaningless statistic of ~0 with p-value ~1, so the test is run
# on a count table built from the same filtered frame.
observed = pd.crosstab(temp[feature], temp[target])
result = spst.chi2_contingency(observed)
print('카이제곱통계량', result[0])
print('p-value', result[1])
print('기대빈도\n',result[3])
카이제곱통계량 0.0
p-value 1.0
기대빈도
[[0.5093151 0.4906849]
[0.5093151 0.4906849]]
feature = 'CONSIDERING_CHANGE_OF_PLAN'

# Dummy coding: 1 for 'actively_looking_into_it' / 'considering', 0 for every
# other answer. Rows with a missing answer are dropped first. .copy() makes
# `temp` an independent frame so adding a column does not write into a slice
# of `mobile`.
temp = mobile.loc[mobile[feature].notnull()].copy()
temp['CONSIDERING'] = (
    temp[feature].isin(['actively_looking_into_it', 'considering']).astype(int)
)

feature = 'CONSIDERING'

# Row-normalised cross table: share of each target class within every level
# of the dummy variable.
table = pd.crosstab(temp[feature], temp[target], normalize='index')

# 1. Print the cross table
print('교차표\n', table)
print('-' * 100)

# 2. Visualisation: stacked bars per dummy level.
# The figure size is passed to plot.bar itself because pandas opens its own
# figure; a preceding plt.figure(figsize=...) only leaves an empty figure.
ax = table.plot.bar(stacked=True, figsize=(16, 4))
# Reference line at the overall share of target == 0. The mean must be taken
# over the target column, not over the dummy feature.
ax.axhline(1 - temp[target].mean(), color='r')
plt.show()
교차표
CHURN 0 1
CONSIDERING
0 0.500312 0.499688
1 0.511126 0.488874
----------------------------------------------------------------------------------------------------
<Figure size 1152x288 with 0 Axes>
# Mosaic plot of feature vs. target on the filtered frame, with a red
# reference line at 1 - mean(target).
reference_level = 1 - temp[target].mean()
mosaic(temp, [feature, target])
plt.axhline(reference_level, color='r')
plt.show()
# 3. Hypothesis test (chi-square test of independence)
# chi2_contingency expects observed frequencies. The row-normalised proportion
# table gives a meaningless statistic of ~0 with p-value ~1, so the test is run
# on a count table built from the same filtered frame.
observed = pd.crosstab(temp[feature], temp[target])
result = spst.chi2_contingency(observed)
print('카이제곱통계량', result[0])
print('p-value', result[1])
print('기대빈도\n',result[3])
카이제곱통계량 0.0
p-value 1.0
기대빈도
[[0.50571933 0.49428067]
[0.50571933 0.49428067]]
feature = 'INCOME'

# 1. Visualisation: per-class histogram (left) and per-class density (right).
# common_norm=False normalises each class's density separately.
plt.figure(figsize = (16, 4))
plt.subplot(1, 2, 1)
sns.histplot(x = feature, data = mobile, hue = target)
plt.subplot(1, 2, 2)
sns.kdeplot(x = feature, data = mobile, hue = target, common_norm = False)
plt.tight_layout()
plt.show()

# 2. Logistic regression of the target on the single feature.
# statsmodels does not add an intercept automatically; without one the fitted
# probability is forced to 0.5 at feature == 0 and the slope's p-value no
# longer tests the association. Rows with missing values are dropped because
# Logit does not handle NaN by default.
fit_data = mobile[[feature, target]].dropna()
model = sm.Logit(fit_data[target], sm.add_constant(fit_data[[feature]]))
result = model.fit()
# Report only the feature's p-value (the intercept's is not of interest).
print(result.pvalues[[feature]])
Optimization terminated successfully.
Current function value: 0.692683
Iterations 2
INCOME 0.000044
dtype: float64
feature = 'HANDSET_PRICE'

# 1. Visualisation: per-class histogram (left) and per-class density (right).
# common_norm=False normalises each class's density separately.
plt.figure(figsize = (16, 4))
plt.subplot(1, 2, 1)
sns.histplot(x = feature, data = mobile, hue = target)
plt.subplot(1, 2, 2)
sns.kdeplot(x = feature, data = mobile, hue = target, common_norm = False)
plt.tight_layout()
plt.show()

# 2. Logistic regression of the target on the single feature.
# Fit on `mobile` (the same data as the plots) rather than on the leftover
# `temp` frame, which was filtered on an unrelated column in an earlier step.
# Rows with missing values are dropped because Logit does not handle NaN by
# default. statsmodels does not add an intercept automatically, so one is
# added explicitly; otherwise the slope's p-value does not test the association.
fit_data = mobile[[feature, target]].dropna()
model = sm.Logit(fit_data[target], sm.add_constant(fit_data[[feature]]))
result = model.fit()
# Report only the feature's p-value (the intercept's is not of interest).
print(result.pvalues[[feature]])
Optimization terminated successfully.
Current function value: 0.692730
Iterations 3
HANDSET_PRICE 0.000107
dtype: float64
feature = 'AVERAGE_CALL_DURATION'

# 1. Visualisation: per-class histogram (left) and per-class density (right).
# common_norm=False normalises each class's density separately.
plt.figure(figsize = (16, 4))
plt.subplot(1, 2, 1)
sns.histplot(x = feature, data = mobile, hue = target)
plt.subplot(1, 2, 2)
sns.kdeplot(x = feature, data = mobile, hue = target, common_norm = False)
plt.tight_layout()
plt.show()

# 2. Logistic regression of the target on the single feature.
# Fit on `mobile` (the same data as the plots) rather than on the leftover
# `temp` frame, which was filtered on an unrelated column in an earlier step.
# Rows with missing values are dropped because Logit does not handle NaN by
# default. statsmodels does not add an intercept automatically, so one is
# added explicitly; otherwise the slope's p-value does not test the association.
fit_data = mobile[[feature, target]].dropna()
model = sm.Logit(fit_data[target], sm.add_constant(fit_data[[feature]]))
result = model.fit()
# Report only the feature's p-value (the intercept's is not of interest).
print(result.pvalues[[feature]])
Optimization terminated successfully.
Current function value: 0.693027
Iterations 3
AVERAGE_CALL_DURATION 0.03783
dtype: float64
INCOME, HANDSET_PRICE
AVERAGE_CALL_DURATION
REPORTED_SATISFACTION, COLLEGE, CONSIDERING_CHANGE_OF_PLAN