AARRR은 시장 진입 단계에 맞는 특정 지표를 기준으로 우리 서비스의 상태를 가늠할 수 있는 효율적인 기준이다.
수많은 데이터 중 현 시점에서 가장 핵심적인 지표에 집중할 수 있게 함으로써, 분석할 리소스가 충분하지 않은 스타트업에게 매력적인 프레임워크이다.
https://www.kaggle.com/datasets/mashlyn/online-retail-ii-uci
# AARRR analysis of the Online Retail II (UCI) dataset — environment setup and load.
import pandas as pd
import numpy as np
import seaborn as sns
import datetime as dt
import matplotlib.pyplot as plt
import koreanize_matplotlib  # registers Korean-capable fonts so hangul plot labels render
%config InlineBackend.figure_format = 'retina'
# Load the data
df = pd.read_csv("data/online_retail.csv")
df.shape  # (rows, columns) — per the captured output: (541909, 8)
>>>>
(541909, 8)
# Quick structural sanity check of the raw frame.
df.head(2)
df.tail(2)
df.info()  # dtypes + non-null counts; Description and CustomerID contain NaNs
>>>>
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 InvoiceNo 541909 non-null object
1 StockCode 541909 non-null object
2 Description 540455 non-null object
3 Quantity 541909 non-null int64
4 InvoiceDate 541909 non-null object
5 UnitPrice 541909 non-null float64
6 CustomerID 406829 non-null float64
7 Country 541909 non-null object
dtypes: float64(2), int64(1), object(5)
memory usage: 33.1+ MB
# Numeric summary — note negative Quantity/UnitPrice minimums (cancellations/adjustments).
df.describe()
>>>>
Quantity UnitPrice CustomerID
count 541909.000000 541909.000000 406829.000000
mean 9.552250 4.611114 15287.690570
std 218.081158 96.759853 1713.600303
min -80995.000000 -11062.060000 12346.000000
25% 1.000000 1.250000 13953.000000
50% 3.000000 2.080000 15152.000000
75% 10.000000 4.130000 16791.000000
max 80995.000000 38970.000000 18287.000000
# Summary of the object-dtype (string) columns: cardinality and most frequent values.
df.describe(include="O")
>>>>
InvoiceNo StockCode Description \
count 541909 541909 540455
unique 25900 4070 4223
top 573585 85123A WHITE HANGING HEART T-LIGHT HOLDER
freq 1114 2313 2369
InvoiceDate Country
count 541909 541909
unique 23260 38
top 2011-10-31 14:41:00 United Kingdom
freq 1114 495478
# Missing-value counts per column.
df.isnull().sum()
>>>>
InvoiceNo 0
StockCode 0
Description 1454
Quantity 0
InvoiceDate 0
UnitPrice 0
CustomerID 135080
Country 0
dtype: int64
# Missing-value ratio per column (percent)
df.isnull().mean() * 100
>>>>
InvoiceNo 0.000000
StockCode 0.000000
Description 0.268311
Quantity 0.000000
InvoiceDate 0.000000
UnitPrice 0.000000
CustomerID 24.926694
Country 0.000000
TotalPrice 0.000000
dtype: float64
# 결측치 시각화
plt.figure(figsize=(12, 4))
sns.heatmap(data=df.isnull(), cmap="BuPu")
df.hist(figsize=(10, 5), bins=100);
df["TotalPrice"] = df["Quantity"]*df["UnitPrice"]
df.head()
df.loc[df["CustomerID"].isnull(), "Country"].value_counts()
>>>>
United Kingdom 133600
EIRE 711
Hong Kong 288
Unspecified 202
Switzerland 125
France 66
Israel 47
Portugal 39
Bahrain 2
Name: Country, dtype: int64
# Country breakdown for rows that DO have a CustomerID, for comparison.
df.loc[df["CustomerID"].notnull(), "Country"].value_counts()
>>>>
United Kingdom 361878
Germany 9495
France 8491
EIRE 7485
Spain 2533
Netherlands 2371
Belgium 2069
Switzerland 1877
Portugal 1480
Australia 1259
Norway 1086
Italy 803
Channel Islands 758
Finland 695
Cyprus 622
Sweden 462
Austria 401
Denmark 389
Japan 358
Poland 341
USA 291
Israel 250
Unspecified 244
Singapore 229
Iceland 182
Canada 151
Greece 146
Malta 127
United Arab Emirates 68
European Community 61
RSA 58
Lebanon 45
Lithuania 35
Brazil 32
Czech Republic 30
Bahrain 17
Saudi Arabia 10
Name: Country, dtype: int64
# Top-10 countries by total revenue, with the per-line mean for context.
df.groupby("Country")["TotalPrice"].agg(['mean',
'sum']).nlargest(10, 'sum').style.format("{:,.0f}")
>>>>
mean sum
Country
United Kingdom 17 8,187,806
Netherlands 120 284,662
EIRE 32 263,277
Germany 23 221,698
France 23 197,404
Australia 109 137,077
Switzerland 28 56,385
Spain 22 54,775
Belgium 20 40,911
Sweden 79 36,596
# Top-10 stock codes by number of order lines, with total quantity and revenue.
# Fixes two transcription bugs in the original cell: the mismatched bracket in
# groupby(['StockCode') and the column typo "InvoiceNO" (the agg result column
# is "InvoiceNo", as the captured output and the later cancel_stock cell show).
stock_sale = df.groupby(['StockCode']).agg({"InvoiceNo": "count",
                                            "Quantity": "sum",
                                            "TotalPrice": "sum",
                                            }).nlargest(10, "InvoiceNo")
stock_sale
>>>>
InvoiceNo Quantity TotalPrice
StockCode
85123A 2313 38830 97894.50
22423 2203 12980 164762.19
85099B 2159 47363 92356.03
47566 1727 18022 98302.98
20725 1639 18979 35187.31
84879 1502 36221 58959.73
22720 1477 7286 37413.44
22197 1476 56450 50987.47
21212 1385 36039 21059.72
20727 1350 12112 22219.01
# Look up one Description per top-selling StockCode (descriptions are not
# strictly unique per code, so keep the first occurrence).
stock_desc = df.loc[df['StockCode'].isin(stock_sale.index),
["StockCode", "Description"]].drop_duplicates(
"StockCode", keep='first').set_index("StockCode")
stock_desc
# Attach the description to the top-10 table (aligned on the StockCode index).
stock_sale['Desc'] = stock_desc
stock_sale
# Negative quantities mark cancelled lines; flag them.
df[df["Quantity"] < 0].head(2)
df["Cancel"] = df["Quantity"] < 0
df.head()
# Cancellation rate per customer (mean of the boolean flag).
df.groupby("CustomerID")["Cancel"].mean()
>>>>
CustomerID
12346.0 0.500000
12347.0 0.000000
12348.0 0.000000
12349.0 0.000000
12350.0 0.000000
...
18280.0 0.000000
18281.0 0.000000
18282.0 0.076923
18283.0 0.000000
18287.0 0.000000
# Same cancellation rates, shown as normalized True/False proportions per customer.
df.groupby("CustomerID")["Cancel"].value_counts(normalize=True)
>>>>
CustomerID Cancel
12346.0 False 0.500000
True 0.500000
12347.0 False 1.000000
12348.0 False 1.000000
12349.0 False 1.000000
...
18281.0 False 1.000000
18282.0 False 0.923077
True 0.076923
18283.0 False 1.000000
18287.0 False 1.000000
# Top-10 customers by number of non-cancelled lines (False column after unstack).
df.groupby("CustomerID")["Cancel"].value_counts().unstack().nlargest(10, False)
>>>>
Cancel False True
CustomerID
17841.0 7847.0 136.0
14911.0 5677.0 226.0
14096.0 5111.0 17.0
12748.0 4596.0 46.0
14606.0 2700.0 82.0
15311.0 2379.0 112.0
14646.0 2080.0 5.0
13089.0 1818.0 39.0
13263.0 1677.0 NaN
14298.0 1637.0 3.0
# Per-product line count and cancellation rate; inspect the 10 best sellers.
cancel_stock = df.groupby(['StockCode']).agg({"InvoiceNo":"count",
"Cancel": "mean"})
cancel_stock.nlargest(10, "InvoiceNo")
>>>>
InvoiceNo Cancel
StockCode
85123A 2313 0.018591
22423 2203 0.083522
85099B 2159 0.020380
47566 1727 0.011581
20725 1639 0.026846
84879 1502 0.008655
22720 1477 0.051456
22197 1476 0.033875
21212 1385 0.010830
20727 1350 0.016296
# Per-country line count and cancellation rate; top 20 by cancellation rate.
# NOTE(review): small-sample countries (e.g. 10 lines) dominate this ranking.
cancel_country = df.groupby("Country").agg({"InvoiceNo": "count",
"Cancel": "mean"})
cancel_country.nlargest(20, "Cancel")
>>>>
InvoiceNo Cancel
Country
USA 291 0.384880
Czech Republic 30 0.166667
Malta 127 0.118110
Japan 358 0.103352
Saudi Arabia 10 0.100000
Australia 1259 0.058777
Italy 803 0.056040
Bahrain 19 0.052632
Germany 9495 0.047709
EIRE 8196 0.036847
Poland 341 0.032258
Singapore 229 0.030568
Sweden 462 0.023810
Denmark 389 0.023136
Spain 2533 0.018950
United Kingdom 495478 0.018552
Belgium 2069 0.018366
Switzerland 2002 0.017483
France 8557 0.017413
European Community 61 0.016393
# Parse the invoice timestamp and derive calendar features for time-based EDA.
df["InvoiceDate"] = pd.to_datetime(df["InvoiceDate"])
df["InvoiceYear"] = df["InvoiceDate"].dt.year
df["InvoiceMonth"] = df["InvoiceDate"].dt.month
df["InvoiceDay"] = df["InvoiceDate"].dt.day
df["InvoiceDow"] = df["InvoiceDate"].dt.dayofweek  # Monday=0 ... Sunday=6
df.head(1)
# Year-month key, e.g. "2011-03" (first 7 chars of the ISO timestamp string).
df['InvoiceYM'] = df['InvoiceDate'].astype(str).str[:7]
df.head()
df["InvoiceTime"] = df["InvoiceDate"].dt.time
df["InvoiceHour"] = df["InvoiceDate"].dt.hour
df.head(1)
# Purchase-volume distributions over time.
sns.countplot(data=df, x='InvoiceYear')
sns.countplot(data=df, x='InvoiceMonth')
plt.figure(figsize=(12, 4))
sns.countplot(data=df, x='InvoiceYM')
sns.countplot(data=df, x='InvoiceDow')
sns.countplot(data=df, x='InvoiceDow', hue='Cancel')
plt.title("요일별 구매 취소")
sns.countplot(data=df[df["Cancel"] == True], x='InvoiceDow')
# Korean weekday labels Mon..Sun for nicer axis ticks.
day_name = [w for w in "월화수목금토일"]
# The data contains no Saturday transactions, so drop that label
day_name.remove("토")
dow_count = df['InvoiceDow'].value_counts().sort_index()
dow_count.index = day_name  # 6 counts (dow 0-4, 6) map onto the 6 labels
dow_count.plot.bar(rot=0, figsize=(7, 3))
plt.figure(figsize=(12, 4))
sns.countplot(data=df, x='InvoiceHour')
plt.figure(figsize=(12, 4))
# Line counts per hour (estimator='count' ignores the y values themselves).
sns.pointplot(data=df, x="InvoiceHour", y='TotalPrice',
estimator='count', errorbar=None)
# Hour-of-day x weekday purchase counts. The crosstab's index is InvoiceHour
# (15 distinct hours) and its columns are InvoiceDow (6 weekdays), so the
# weekday labels belong on the COLUMNS. The original assigned day_name to
# .index, which would raise a length mismatch (15 rows vs 6 labels); the
# captured output confirms the labels were intended for the columns.
hour_dow = pd.crosstab(index=df['InvoiceHour'], columns=df['InvoiceDow'])
hour_dow.columns = day_name
hour_dow
>>>>
월 화 수 목 금 일
InvoiceHour
6 0 0 0 41 0 0
7 61 65 59 64 134 0
8 1641 1751 1826 1834 1857 0
9 7497 6440 5186 8060 7119 30
10 8872 9125 7904 9321 10215 3600
11 8926 10052 9973 8369 9589 10765
12 11801 14327 14527 13485 10942 13627
13 12756 12619 11901 12428 10488 12067
14 11502 11656 11006 11518 11737 10052
15 13977 16017 16510 11830 9043 10142
16 9529 11924 10768 11765 6438 4092
17 8136 5746 4849 5257 4521 0
18 413 2086 23 5439 13 0
19 0 0 15 3601 89 0
20 0 0 18 845 8 0
# Three views of the hour-by-weekday table: styled table, heatmap, area charts.
hour_dow.style.background_gradient(cmap='PuBuGn_r').format("{:,}")
sns.heatmap(hour_dow, cmap='BuPu', annot=True, fmt=',.0f')
hour_dow.plot.area(figsize=(12, 3))
hour_dow.plot.area(figsize=(12, 5), subplots=True);
# Keep only analyzable purchases: known customer, positive quantity and price.
df_valid = df[df['CustomerID'].notnull() & (df["Quantity"] > 0) & (df["UnitPrice"] > 0)].copy()
df.shape, df_valid.shape
>>>>
((541909, 17), (397884, 17))
# Drop duplicated rows
df_valid = df_valid.drop_duplicates()
df_valid.shape
>>>>
(392692, 17)
ARPU : Average Revenue Per User
- 가입한 서비스에 대해 가입자 1명이 특정 기간 동안 지출한 평균 금액
- ARPU = 매출 / 중복을 제외한 순수 활동 사용자 수
ARPPU : Average Revenue Per Paying User
- 지불 유저 1명당 한 달에 결제하는 평균 금액을 산정한 수치
# ARPPU: when counting by CustomerID use nunique, not count, so duplicate rows per customer are not double-counted
arppu = df_valid.groupby("InvoiceYM").agg({"TotalPrice":"sum", "CustomerID":"nunique"})
arppu.columns = ['sale_sum', 'customer_count']
# ARPPU = monthly revenue / distinct paying customers that month.
arppu["ARPPU"] = arppu['sale_sum'] / arppu['customer_count']
arppu.style.format("{:,.0f}")
arppu['ARPPU'].plot(figsize=(10, 3), title="Monthly ARPPU")
# Per-customer order counts and spend statistics.
cust_agg = df_valid.groupby(['CustomerID']).agg({"InvoiceNo":['nunique', 'count'],
'TotalPrice':['mean', 'sum']})
cust_agg.columns = ['nunique', 'count', 'mean', 'sum']
cust_agg
# MAU: distinct active customers per month.
mau = df_valid.groupby('InvoiceYM')['CustomerID'].agg('nunique')
mau.plot.bar(figsize=(10, 3), rot=30, title="MAU(Monthly Active User)")
# Monthly roll-up of activity, assortment breadth, customers, and revenue.
df_valid.groupby("InvoiceYM").agg({"InvoiceNo" : "count",
"StockCode": "nunique",
"CustomerID": "nunique",
"UnitPrice" : "sum",
"Quantity" : "sum",
"TotalPrice" : "sum"})
>>>>
InvoiceNo StockCode CustomerID UnitPrice Quantity TotalPrice
InvoiceYM
2010-12 25670 2411 885 80679.600 311048 570422.730
2011-01 20988 2121 741 66234.650 348473 568101.310
2011-02 19706 2124 758 62619.480 265027 446084.920
2011-03 26870 2234 974 87864.790 347582 594081.760
2011-04 22433 2217 856 78543.481 291366 468374.331
2011-05 28073 2219 1056 101500.910 372864 677355.150
2011-06 26926 2339 991 84602.660 363014 660046.050
2011-07 26580 2351 949 75454.521 367360 598962.901
2011-08 26790 2356 935 78877.090 397373 644051.040
2011-09 39669 2545 1266 118160.322 543652 950690.202
2011-10 48793 2622 1364 164084.090 591543 1035642.450
2011-11 63168 2695 1664 182340.090 665923 1156205.610
2011-12 17026 2173 615 46559.700 286777 517190.440
# Month-truncated date (day forced to 1) — the cohort time unit is the month.
df_valid['InvoiceDate1'] = pd.to_datetime(df_valid["InvoiceYM"])
df_valid['InvoiceDate1'].head()
>>>>
0 2010-12-01
1 2010-12-01
2 2010-12-01
3 2010-12-01
4 2010-12-01
-> 일자를 '1'로 통일한 이유는 월별 잔존율을 구하기 위해서 (월 단위)
# Spot-check that InvoiceDate1 is InvoiceDate truncated to the month.
df_valid[["InvoiceDate", "InvoiceDate1"]].sample(5)
>>>>
InvoiceDate InvoiceDate1
431356 2011-10-31 14:48:00 2011-10-01
7755 2010-12-05 11:40:00 2010-12-01
150305 2011-04-08 11:54:00 2011-04-01
285985 2011-08-01 13:31:00 2011-08-01
91562 2011-02-16 10:47:00 2011-02-01
# Each customer's first purchase month = their cohort month (broadcast per row).
df_valid["InvoiceDateMin"] = df_valid.groupby('CustomerID')['InvoiceDate1'].transform('min')
df_valid["InvoiceDateMin"]
>>>>
0 2010-12-01
1 2010-12-01
2 2010-12-01
3 2010-12-01
4 2010-12-01
...
541904 2011-08-01
541905 2011-08-01
541906 2011-08-01
541907 2011-08-01
541908 2011-08-01
df_valid[["CustomerID", "InvoiceDate", "InvoiceDateMin", "InvoiceDate1"]].sample(5)
# Months elapsed since the cohort month, split into year and month components.
year_diff = df_valid['InvoiceDate1'].dt.year - df_valid['InvoiceDateMin'].dt.year
year_diff
month_diff = df_valid['InvoiceDate1'].dt.month - df_valid['InvoiceDateMin'].dt.month
month_diff
# CohortIndex: 1 = the acquisition month itself, 2 = the next month, etc.
df_valid["CohortIndex"] = (year_diff * 12) + month_diff + 1
df_valid[['CustomerID', 'InvoiceDate1', 'InvoiceDateMin', 'CohortIndex']].sample(5)
df_valid['CohortIndex'].value_counts().sort_index()
>>>>
1 116857
2 27516
3 26727
4 26993
5 25165
6 26673
7 23462
8 23298
9 22751
10 22968
11 20098
12 23011
13 7173
# Distribution of cohort indices — the steep drop after 1 shows weak retention.
plt.figure(figsize=(12, 4))
sns.countplot(data=df_valid, x='CohortIndex')
➡️ 첫 달에만 구매하고 다음달 부터 구매하지 않는 사람이 많다.
➡️ 마케팅비 많이 쏟아서 고객을 유치했지만 유지가 잘 되지 않는 것으로 보여진다.
➡️ 휴면 고객을 위한 이벤트, 쿠폰 등이 적절한 시점에 있으면 도움이 되겠다는 계획을 세워볼 수 있다.
# Cohort table: distinct customers per (acquisition month, months-since-acquisition).
cohort_count = df_valid.groupby(['InvoiceDateMin', 'CohortIndex'])['CustomerID'].nunique().unstack()
cohort_count.index = cohort_count.index.astype(str)  # string labels plot cleaner
cohort_count
plt.figure(figsize=(12, 8))
sns.heatmap(cohort_count, annot=True, fmt=".0f", cmap="BuPu")
# Column 1 = cohort size (new customers acquired each month).
cohort_count[1].plot.bar(figsize=(16, 3), title="월별 신규 유입 수", rot=0)
# Retention rate: each row divided by its cohort size (column 1).
cohort_norm = cohort_count.div(cohort_count[1], axis=0)
cohort_norm
plt.figure(figsize=(12, 8))
sns.heatmap(cohort_norm, cmap="BuPu", annot=True, fmt=".2f")