SQL | 프로젝트 #6 리텐션

소리·2024년 7월 3일
0

코호트 분석 (고객 세분화 중 시간의 흐름에 따라 고객을 나누어보는 것)으로
리텐션 값(지속적으로 사용하는지, 핵심 가치를 꾸준히 경험하는지를 측정)을 얻는다.

데이터 가공

  1. purchase만 모아서 전체 월 합치기
  2. event_time 시간으로 바꾸기

진행 사항

  • 5개 월 데이터를 전체 합쳐 테이블 생성
CREATE TABLE purchase_t AS
SELECT 
    STR_TO_DATE(SUBSTRING(event_time, 1, 19), '%Y-%m-%d %H:%i:%s') AS event_time,
    event_type,
    product_id,
    category_id,
    category_code,
    brand,
    price,
    user_id,
    user_session
FROM (
    SELECT * FROM oct19 WHERE event_type = 'purchase'
    UNION ALL
    SELECT * FROM nov19 WHERE event_type = 'purchase'
    UNION ALL
    SELECT * FROM dec19 WHERE event_type = 'purchase'
    UNION ALL
    SELECT * FROM jan20 WHERE event_type = 'purchase'
    UNION ALL
    SELECT * FROM feb20 WHERE event_type = 'purchase'
) AS combined_data;
  • 전체 user_id 수 : 110518개
SELECT COUNT(DISTINCT user_id)
FROM purchase_t;
  • id 별 첫 구매 날짜 테이블 생성
CREATE TABLE first_month (
SELECT user_id
	, MIN(event_time) AS first_event_time
    , DATE_FORMAT(MIN(event_time), '%Y-%m-01') AS first_month
FROM purchase_t 
GROUP BY user_id
ORDER BY first_event_time);

SELECT *
FROM first_month;

  • 주문일자, 주문 아이디, 해당 아이디의 첫 주문 일자 테이블 생성
SELECT p.event_time AS order_date
	, p.user_id
    , f.first_month AS first_order_month
FROM purchase_t p
LEFT JOIN first_month f ON p.user_id = f.user_id;

  • 월 별 첫 구매 id 갯수
SELECT first_month, COUNT(DISTINCT user_id) AS user_counts
FROM first_month
GROUP BY first_month;

  • 리텐션 테이블
WITH a AS (
SELECT p.user_id
    , f.first_month AS first_order_month
	, p.event_time AS order_date
FROM purchase_t p
INNER JOIN first_month f ON p.user_id = f.user_id
)

SELECT 
    first_order_month,
    COUNT(DISTINCT user_id) AS month0,
    COUNT(DISTINCT CASE WHEN DATE_FORMAT(DATE_ADD(first_order_month, INTERVAL 1 MONTH), '%Y-%m') = DATE_FORMAT(order_date, '%Y-%m') THEN user_id END) AS month1,
    COUNT(DISTINCT CASE WHEN DATE_FORMAT(DATE_ADD(first_order_month, INTERVAL 2 MONTH), '%Y-%m') = DATE_FORMAT(order_date, '%Y-%m') THEN user_id END) AS month2,
    COUNT(DISTINCT CASE WHEN DATE_FORMAT(DATE_ADD(first_order_month, INTERVAL 3 MONTH), '%Y-%m') = DATE_FORMAT(order_date, '%Y-%m') THEN user_id END) AS month3,
    COUNT(DISTINCT CASE WHEN DATE_FORMAT(DATE_ADD(first_order_month, INTERVAL 4 MONTH), '%Y-%m') = DATE_FORMAT(order_date, '%Y-%m') THEN user_id END) AS month4
FROM a
GROUP BY first_order_month
ORDER BY first_order_month;

  • 비율


월별로 했을 경우 5개월 분량으로 자세히 확인하기 어려워 주차별로 수정

  • weekly를 확인할 수 있게 전면 수정

여기서 year을 제외하고 week만 했을 때 해가 달라지는 이슈 때문에 계산이 제대로 되지 않아 년도 + 주차로 계산하였다.

클래식 리텐션

-- id 별 첫 구매 주차 테이블 생성
CREATE TABLE first_week (
SELECT user_id
	, MIN(event_time) AS first_event_time
    , YEARWEEK(MIN(event_time), 1) AS first_order_week
FROM purchase_t 
GROUP BY user_id
ORDER BY first_event_time);

-- 클래식 리텐션
WITH all_weeks AS (
    SELECT 
        p.user_id,
        p.event_time AS order_date,
        YEARWEEK(p.event_time, 1) AS order_week,
        f.first_order_week,
        SUBSTRING(YEARWEEK(p.event_time, 1), 1, 4) AS order_year,
        SUBSTRING(YEARWEEK(p.event_time, 1), 5, 2) AS order_week_num,
        SUBSTRING(f.first_order_week, 1, 4) AS first_order_year,
        SUBSTRING(f.first_order_week, 5, 2) AS first_order_week_num
    FROM 
        purchase_t p
    INNER JOIN 
        first_week f ON p.user_id = f.user_id
)

SELECT 
    first_order_week,
    COUNT(DISTINCT CASE WHEN order_week = first_order_week THEN user_id END) AS week0,
    COUNT(DISTINCT CASE WHEN order_year * 52 + order_week_num = first_order_year * 52 + first_order_week_num + 1 THEN user_id END) AS week1,
    COUNT(DISTINCT CASE WHEN order_year * 52 + order_week_num = first_order_year * 52 + first_order_week_num + 2 THEN user_id END) AS week2,
    COUNT(DISTINCT CASE WHEN order_year * 52 + order_week_num = first_order_year * 52 + first_order_week_num + 3 THEN user_id END) AS week3,
    COUNT(DISTINCT CASE WHEN order_year * 52 + order_week_num = first_order_year * 52 + first_order_week_num + 4 THEN user_id END) AS week4,
    COUNT(DISTINCT CASE WHEN order_year * 52 + order_week_num = first_order_year * 52 + first_order_week_num + 5 THEN user_id END) AS week5,
    COUNT(DISTINCT CASE WHEN order_year * 52 + order_week_num = first_order_year * 52 + first_order_week_num + 6 THEN user_id END) AS week6,
    COUNT(DISTINCT CASE WHEN order_year * 52 + order_week_num = first_order_year * 52 + first_order_week_num + 7 THEN user_id END) AS week7,
    COUNT(DISTINCT CASE WHEN order_year * 52 + order_week_num = first_order_year * 52 + first_order_week_num + 8 THEN user_id END) AS week8,
    COUNT(DISTINCT CASE WHEN order_year * 52 + order_week_num = first_order_year * 52 + first_order_week_num + 9 THEN user_id END) AS week9,
    COUNT(DISTINCT CASE WHEN order_year * 52 + order_week_num = first_order_year * 52 + first_order_week_num + 10 THEN user_id END) AS week10,
    COUNT(DISTINCT CASE WHEN order_year * 52 + order_week_num = first_order_year * 52 + first_order_week_num + 11 THEN user_id END) AS week11,
    COUNT(DISTINCT CASE WHEN order_year * 52 + order_week_num = first_order_year * 52 + first_order_week_num + 12 THEN user_id END) AS week12,
    COUNT(DISTINCT CASE WHEN order_year * 52 + order_week_num = first_order_year * 52 + first_order_week_num + 13 THEN user_id END) AS week13,
    COUNT(DISTINCT CASE WHEN order_year * 52 + order_week_num = first_order_year * 52 + first_order_week_num + 14 THEN user_id END) AS week14,
    COUNT(DISTINCT CASE WHEN order_year * 52 + order_week_num = first_order_year * 52 + first_order_week_num + 15 THEN user_id END) AS week15,
    COUNT(DISTINCT CASE WHEN order_year * 52 + order_week_num = first_order_year * 52 + first_order_week_num + 16 THEN user_id END) AS week16,
    COUNT(DISTINCT CASE WHEN order_year * 52 + order_week_num = first_order_year * 52 + first_order_week_num + 17 THEN user_id END) AS week17,
    COUNT(DISTINCT CASE WHEN order_year * 52 + order_week_num = first_order_year * 52 + first_order_week_num + 18 THEN user_id END) AS week18,
    COUNT(DISTINCT CASE WHEN order_year * 52 + order_week_num = first_order_year * 52 + first_order_week_num + 19 THEN user_id END) AS week19,
    COUNT(DISTINCT CASE WHEN order_year * 52 + order_week_num = first_order_year * 52 + first_order_week_num + 20 THEN user_id END) AS week20,
    COUNT(DISTINCT CASE WHEN order_year * 52 + order_week_num = first_order_year * 52 + first_order_week_num + 21 THEN user_id END) AS week21,
    COUNT(DISTINCT CASE WHEN order_year * 52 + order_week_num = first_order_year * 52 + first_order_week_num + 22 THEN user_id END) AS week22
FROM 
    all_weeks
GROUP BY 
    first_order_week
ORDER BY 
    first_order_week;

10월부터 2월까지 총 22주가 존재한다.

  • 결과
    <일반 값>

    <비율>

<그래프>


롤링 리텐션

-- last_week 값이 필요한 관계로 추가 테이블 생성
CREATE TABLE first_and_last_week (
SELECT user_id
	, MIN(event_time) AS first_event_time
    , YEARWEEK(MIN(event_time), 1) AS first_order_week
	, MAX(event_time) AS last_event_time
    , YEARWEEK(MAX(event_time), 1) AS last_order_week
FROM purchase_t 
GROUP BY user_id);

WITH all_weeks AS (
    SELECT 
        p.user_id,
        p.event_time AS order_date,
        YEARWEEK(p.event_time, 1) AS order_week,
        f.first_order_week,
        SUBSTRING(YEARWEEK(p.event_time, 1), 1, 4) AS order_year,
        SUBSTRING(YEARWEEK(p.event_time, 1), 5, 2) AS order_week_num,
        SUBSTRING(f.first_order_week, 1, 4) AS first_order_year,
        SUBSTRING(f.first_order_week, 5, 2) AS first_order_week_num,
        f.last_order_week,
        SUBSTRING(f.last_order_week, 1, 4) AS last_order_year,
        SUBSTRING(f.last_order_week, 5, 2) AS last_order_week_num
    FROM 
        purchase_t p
    INNER JOIN 
        first_and_last_week f ON p.user_id = f.user_id
)

SELECT first_order_week,
    COUNT(DISTINCT CASE WHEN order_week = last_order_week THEN user_id END) AS week0,
    COUNT(DISTINCT CASE WHEN order_year * 52 + order_week_num + 1 <= last_order_year * 52 + last_order_week_num THEN user_id END) AS week1,
    COUNT(DISTINCT CASE WHEN order_year * 52 + order_week_num + 2 <= last_order_year * 52 + last_order_week_num THEN user_id END) AS week2,
    COUNT(DISTINCT CASE WHEN order_year * 52 + order_week_num + 3 <= last_order_year * 52 + last_order_week_num THEN user_id END) AS week3,
    COUNT(DISTINCT CASE WHEN order_year * 52 + order_week_num + 4 <= last_order_year * 52 + last_order_week_num THEN user_id END) AS week4,
    COUNT(DISTINCT CASE WHEN order_year * 52 + order_week_num + 5 <= last_order_year * 52 + last_order_week_num THEN user_id END) AS week5,
    COUNT(DISTINCT CASE WHEN order_year * 52 + order_week_num + 6 <= last_order_year * 52 + last_order_week_num THEN user_id END) AS week6,
    COUNT(DISTINCT CASE WHEN order_year * 52 + order_week_num + 7 <= last_order_year * 52 + last_order_week_num THEN user_id END) AS week7,
    COUNT(DISTINCT CASE WHEN order_year * 52 + order_week_num + 8 <= last_order_year * 52 + last_order_week_num THEN user_id END) AS week8,
    COUNT(DISTINCT CASE WHEN order_year * 52 + order_week_num + 9 <= last_order_year * 52 + last_order_week_num THEN user_id END) AS week9,
    COUNT(DISTINCT CASE WHEN order_year * 52 + order_week_num + 10 <= last_order_year * 52 + last_order_week_num THEN user_id END) AS week10,
    COUNT(DISTINCT CASE WHEN order_year * 52 + order_week_num + 11 <= last_order_year * 52 + last_order_week_num THEN user_id END) AS week11,
    COUNT(DISTINCT CASE WHEN order_year * 52 + order_week_num + 12 <= last_order_year * 52 + last_order_week_num THEN user_id END) AS week12,
    COUNT(DISTINCT CASE WHEN order_year * 52 + order_week_num + 13 <= last_order_year * 52 + last_order_week_num THEN user_id END) AS week13,
    COUNT(DISTINCT CASE WHEN order_year * 52 + order_week_num + 14 <= last_order_year * 52 + last_order_week_num THEN user_id END) AS week14,
    COUNT(DISTINCT CASE WHEN order_year * 52 + order_week_num + 15 <= last_order_year * 52 + last_order_week_num THEN user_id END) AS week15,
    COUNT(DISTINCT CASE WHEN order_year * 52 + order_week_num + 16 <= last_order_year * 52 + last_order_week_num THEN user_id END) AS week16,
    COUNT(DISTINCT CASE WHEN order_year * 52 + order_week_num + 17 <= last_order_year * 52 + last_order_week_num THEN user_id END) AS week17,
    COUNT(DISTINCT CASE WHEN order_year * 52 + order_week_num + 18 <= last_order_year * 52 + last_order_week_num THEN user_id END) AS week18,
    COUNT(DISTINCT CASE WHEN order_year * 52 + order_week_num + 19 <= last_order_year * 52 + last_order_week_num THEN user_id END) AS week19,
    COUNT(DISTINCT CASE WHEN order_year * 52 + order_week_num + 20 <= last_order_year * 52 + last_order_week_num THEN user_id END) AS week20,
    COUNT(DISTINCT CASE WHEN order_year * 52 + order_week_num + 21 <= last_order_year * 52 + last_order_week_num THEN user_id END) AS week21,
    COUNT(DISTINCT CASE WHEN order_year * 52 + order_week_num + 22 <= last_order_year * 52 + last_order_week_num THEN user_id END) AS week22
FROM 
    all_weeks
GROUP BY 
    first_order_week
ORDER BY 
    first_order_week;
  • 결과
    <일반 값>

    <비율>

<그래프>


(추가) 범위 리텐션 시도

  • 범위 리텐션 (range retention)은 주로 서비스 사용주기가 길거나 주기적인 서비스에서 많이 사용한다.

자세한 설명 참고

구간은 겹치지 않고 연달아 설정한다. Day 1-3 다음에 Day4-6 이 나오게, 기간을 묶어서 진행하기 때문에 노이즈에 강하다. 하루 접속하지 않아도 리텐션에 영향을 주지 않는다.

Range 리텐션 = Next Period에 방문한 사람 수 / 최초 Period에 방문한 사람 수

해당 데이터에서 평균 고객들의 재구매 시점을 파악했을 때 약 33일 정도 걸린다.

클래식 리텐션: 특정 기준일 이후 특정 시점에 얼마나 많은 사용자가 다시 활동했는지 측정.
범위 리텐션: 특정 주차에 시작한 사용자가 해당 주차를 포함하여 이후 주차까지 누적하여 활동했는지 여부를 측정 (특정 기간 내에 누적된 사용자의 수)

  • 예시
    클래식 리텐션
    첫 번째 구매 후 1개월에 다시 방문한 사용자 수.
    첫 번째 구매 후 2개월에 다시 방문한 사용자 수.
    범위 리텐션 (한 달 기간)
    첫 번째 구매 후 첫 번째 달 동안 방문한 사용자 수.
    첫 번째 구매 후 첫 번째 달부터 두 번째 달 동안 방문한 사용자 수.

  • 클래식 리텐션 장 단점
    -장점 : 간단한 개념, 눈에 띄는 값을 확인할 수 있다.
    -단점 : 일별 이벤트에 영향을 많이 받는다, 얼마나 자주 왔는지 반영되지 않는다.
  • 롤링 리텐션 장 단점
    -장점 : 이탈에 대한 정보를 얻을 수 있다.
    -단점 : 충성고객과 일시고객이 동일하게 고려된다, 시간이 지나 오지 않던 고객이 기간 후 오게되면 리텐션에 카운트 되서 숫자가 바뀔 수 있다.
    -그래서 기존리텐션 지표의 보조로 사용하는 경우가 많다.

  • 범위 리텐션 장 단점
    -클래식 리텐션과 비슷하지만 기간을 늘려서 살펴본다.
    -장점 : 개념이 간단하기 때문에 설명하기쉽다.
    -단점 : period 기간만큼 시간이 지나야 지표를 확인할 수 있다.
    -그래서 기간은 비즈니스에 맞게 선정하는 것이 중요하다.

참고

profile
데이터로 경로를 탐색합니다.

0개의 댓글