Google BigQuery 내에 데이터가 저장되어 있으므로, 내가 수행한 문제 풀이만 남겨 두었다.
-- 노드10-2 일별 매출액 구하기
-- orders 데이터 불러오기
-- SELECT * FROM oceanic-antler-456102-i6.modulabs.orders;
-- payments 데이터 불러오기
-- SELECT * FROM oceanic-antler-456102-i6.modulabs.payments;
-- 문제 해결 방안
-- TIMESTAMP를 날짜로 변경
-- order-id 기준으로 join
-- 날짜로 group by
WITH T0 as(
SELECT o.order_id, o.order_timestamp,substr(cast(o.order_timestamp as string ),1,10)as date,
p.value
from oceanic-antler-456102-i6.modulabs.orders as o
join oceanic-antler-456102-i6.modulabs.payments as p
on o.order_id = p.order_id
)
SELECT T0.date, sum(T0.value) as Date_Revenue
FROM T0
GROUP BY date
ORDER BY date
-- 노드10-3, PV,UV 계산
-- 페이지별 PV 구하기
-- 이건 page_url별로 단순 방문 수 group-by 하면 됨
-- SELECT v.page_url, count(page_url) as PV
-- FROM oceanic-antler-456102-i6.modulabs.visits as v
-- GROUP BY v.page_url;
-- 페이지별 Unique PV 구하기
-- SELECT v.page_url, count(distinct(v.user_id)) as UniquePV
-- FROM oceanic-antler-456102-i6.modulabs.visits as v
-- GROUP BY v.page_url;
-- 10-3-3 페이지별 visits
-- 1단계, lag 함수 활용?
-- SELECT
-- user_id,
-- page_url,
-- timestamp,
-- LAG(timestamp) OVER (PARTITION BY user_id, page_url ORDER BY timestamp) AS last_timestamp
-- FROM oceanic-antler-456102-i6.modulabs.visits;
-- 2단계, lag 함수 활용 결과를 임시테이블 저장(LAST SESSION)
-- WITH LAST_SESSION AS
-- (SELECT user_id, page_url,
-- timestamp, LAG(timestamp) OVER (PARTITION BY user_id, page_url ORDER BY timestamp) AS last_timestamp
-- FROM oceanic-antler-456102-i6.modulabs.visits
-- );
-- 3단계 SESSION DIFF라는 임시테이블 생성
-- timestamp 값과 last_timestamp 사이의 시간 차이를 계산
-- 시간차가 30분을 초과한 경우는 별도의 방문으로 집계하고, 시간차가 30분 이내인 경우는 동일 방문으로 집계
-- WITH LAST_SESSION AS
-- (
-- SELECT
-- user_id,
-- page_url,
-- timestamp,
-- LAG(timestamp) OVER (PARTITION BY user_id, page_url ORDER BY timestamp) AS last_timestamp
-- FROM
-- oceanic-antler-456102-i6.modulabs.visits
-- ),
-- SESSION_DIFF AS
-- (
-- SELECT
-- *,
-- TIMESTAMP_DIFF(timestamp, last_timestamp, MINUTE) AS diff_minutes,
-- CASE
-- WHEN last_timestamp IS NULL THEN 1
-- WHEN TIMESTAMP_DIFF(timestamp, last_timestamp, MINUTE) > 30 THEN 1
-- ELSE 0
-- END AS is_new_visit
-- FROM
-- LAST_SESSION
-- )
-- SELECT * FROM SESSION_DIFF;
-- 4단계
-- 최종적으로
-- SESSION_DIFF 테이블에 대하여 페이지 단위의 방문자 수(Visits)구하기
-- WITH LAST_SESSION AS
-- (
-- SELECT
-- user_id,
-- page_url,
-- timestamp,
-- LAG(timestamp) OVER (PARTITION BY user_id, page_url ORDER BY timestamp) AS last_timestamp
-- FROM oceanic-antler-456102-i6.modulabs.visits
-- ),
-- SESSION_DIFF AS
-- (
-- SELECT *,
-- TIMESTAMP_DIFF(timestamp, last_timestamp, MINUTE) AS diff_minutes,
-- CASE WHEN last_timestamp IS NULL THEN 1
-- WHEN TIMESTAMP_DIFF(timestamp,last_timestamp,MINUTE) > 30 THEN 1
-- ELSE 0
-- END AS is_new_visit
-- FROM LAST_SESSION
-- )
-- SELECT
-- page_url,
-- COUNTIF(is_new_visit = 1) AS session_based_uv
-- FROM
-- SESSION_DIFF
-- GROUP BY
-- page_url
-- ORDER BY
-- session_based_uv DESC;
-- 페이지별 unique visitors 구하기
WITH LAST_SESSION AS
(
SELECT
user_id,
page_url,
timestamp,
LAG(timestamp) OVER (PARTITION BY user_id, page_url ORDER BY timestamp) AS last_timestamp
FROM oceanic-antler-456102-i6.modulabs.visits
),
SESSION_DIFF AS
(
SELECT *,
TIMESTAMP_DIFF(timestamp, last_timestamp, MINUTE) AS diff_minutes,
CASE WHEN last_timestamp IS NULL THEN 1
WHEN TIMESTAMP_DIFF(timestamp,last_timestamp,MINUTE) > 30 THEN 1
ELSE 0
END AS is_new_visit
FROM LAST_SESSION
)
SELECT
page_url,
COUNT(DISTINCT user_id) AS UniqueVisitors -- 이 부분만 바뀜?
FROM
SESSION_DIFF
GROUP BY
page_url
ORDER BY
UniqueVisitors DESC;
-- 10-4 ARPU, ARPPU
-- 일단 ARPU라고 만든 테이블 불러와보기
-- SELECT * FROM
-- oceanic-antler-456102-i6.modulabs.arpu as arpu
-- LIMIT 5;
-- (문제와 상관없으나) 유저 숫자 구해볼까?
-- 전체 데이터 숫자 30개
-- 유저숫자 30명 (중복제거 염려 없음)... 실전은 이렇지 않겠지?
-- SELECT count(*)as Nrow,
-- count(user_id) as user_num
-- FROM oceanic-antler-456102-i6.modulabs.arpu as arpu;
-- (1단계) revenue가 0인 경우 / 0이상 구분하는 임시 테이블
-- WITH T0 AS
-- (
-- SELECT user_id, purchase_date, revenue,
-- CASE WHEN revenue > 0 THEN 'Y'
-- ELSE 'N'
-- END AS paying_user_yn
-- FROM oceanic-antler-456102-i6.modulabs.arpu as arpu
-- )
-- SELECT * FROM T0;
-- (2단계) paying 구분없이 전체평균 구하면? ARPU
WITH T0 AS
(
SELECT user_id, purchase_date, revenue,
CASE WHEN revenue > 0 THEN 'Y'
ELSE 'N'
END AS paying_user_yn
FROM oceanic-antler-456102-i6.modulabs.arpu as arpu
),
T1 AS
(
SELECT
COUNT(DISTINCT(user_id)) as total_users,
COUNT(DISTINCT CASE WHEN revenue > 0 THEN user_id END) AS paying_users,
SUM(revenue) AS total_revenue
FROM T0
)
SELECT
total_revenue / total_users AS ARPU,
total_revenue / paying_users AS ARPPU
FROM T1;
-- 노드10-5 funnel분석 --
-- funnel data 확인
-- SELECT * FROM oceanic-antler-456102-i6.modulabs.funnel
-- limit 5;
-- (1단계) action개수 확인
-- visits 18개, singup 8개, add_to_cart 11개, purchase 7개
-- SELECT f.action, count(f.action) FROM
-- oceanic-antler-456102-i6.modulabs.funnel as f
-- GROUP BY f.action;
-- (2단계) funnel 분석용으로 임시 테이블 작성
-- WITH FUNNEL_BASE AS
-- (
-- SELECT
-- COUNT(DISTINCT CASE WHEN action = 'visit' THEN user_id END ) AS visits,
-- COUNT(DISTINCT CASE WHEN action = 'signup' THEN user_id END ) AS signup,
-- COUNT(DISTINCT CASE WHEN action = 'add_to_cart'THEN user_id END ) AS add_to_carts,
-- COUNT(DISTINCT CASE WHEN action = 'purchase' THEN user_id END) AS purchases
-- FROM
-- oceanic-antler-456102-i6.modulabs.funnel
-- )
-- SELECT * FROM FUNNEL_BASE;
-- (3단계) 임시table활용, FUNNEL분석
WITH FUNNEL_BASE AS
(
SELECT
COUNT(DISTINCT CASE WHEN action = 'visit' THEN user_id END ) AS visits,
COUNT(DISTINCT CASE WHEN action = 'signup' THEN user_id END ) AS signup,
COUNT(DISTINCT CASE WHEN action = 'add_to_cart'THEN user_id END ) AS add_to_carts,
COUNT(DISTINCT CASE WHEN action = 'purchase' THEN user_id END) AS purchases
FROM
oceanic-antler-456102-i6.modulabs.funnel
)
SELECT
visits,
signup,
add_to_carts,
purchases,
ROUND(signup / visits, 4) AS SignupRate,
ROUND(add_to_carts / visits, 4) AS AddToCartsRate,
ROUND(purchases / visits, 4) AS PurchaseRate
FROM FUNNEL_BASE;
-- 노드10-6 리텐션 분석? --
-- (문제풀이와 무관 EDA)
-- 00-1 (데이터 확인)
-- SELECT * FROM oceanic-antler-456102-i6.modulabs.retention
-- LIMIT 5;
-- 00-2 중복 수 체크?
-- 30개 데이터, 13명의 distinct한 유저(retention 문제니까!)
-- SELECT count(*) as total_count,
-- count(distinct(user_id)) as distinct_user
-- FROM oceanic-antler-456102-i6.modulabs.retention;
-- (힌트 1단계)
-- user_id별로 가장 최초의 구매 일자(invoice_date)를 cohort_day라는 컬럼 명으로 구해 주세요.
-- user_id, invoice_date,sales_amount
-- SELECT user_id,
-- MIN(invoice_date) OVER (PARTITION BY user_id ) AS cohort_day,
-- FROM oceanic-antler-456102-i6.modulabs.retention;
-- (힌트 2단계)
-- DATE_DIFF(DATE(invoice_date), cohort_day, MONTH)를 사용하여 cohort_index를 생성해주세요
-- 또 DATE_TRUNC(cohort_day, MONTH)를 활용하여 cohort_group를 나눠주세요.
-- SELECT user_id,
-- MIN(invoice_date) OVER (PARTITION BY user_id) AS cohort_day,
-- -- cohort_index 생성
-- DATE_TRUNC(MIN(invoice_date) OVER (PARTITION BY user_id), MONTH) AS cohort_group,
-- -- cohort_group 생성
-- -- 현재 구매 시점과 최초 구매일의 월 차
-- DATE_DIFF(DATE(invoice_date), MIN(invoice_date) OVER (PARTITION BY user_id), MONTH) as cohort_index
-- FROM oceanic-antler-456102-i6.modulabs.retention;
-- (힌트3단계)
-- cohort_group, cohort_index를 기준으로 그룹핑을 해주세요.
-- 이후 각 그룹별 distinct user_id를 세어 user_count를 계산해 주세요.
-- (2단계) 데이터를 WITH로 묶어서 임시 테이블 생성하면 됨..
WITH cohort_data AS
(
SELECT user_id,
MIN(invoice_date) OVER (PARTITION BY user_id) AS cohort_day,
-- cohort_index 생성
DATE_TRUNC(MIN(invoice_date) OVER (PARTITION BY user_id), MONTH) AS cohort_group,
-- cohort_group 생성
-- 현재 구매 시점과 최초 구매일의 월 차
DATE_DIFF(DATE(invoice_date), MIN(invoice_date) OVER (PARTITION BY user_id), MONTH) as cohort_index
FROM oceanic-antler-456102-i6.modulabs.retention
)
SELECT
cohort_group,
cohort_index,
COUNT(DISTINCT(user_id)) AS user_count
FROM cohort_data
GROUP BY
cohort_group, cohort_index
ORDER BY
cohort_group, cohort_index;
금융권에 재직했을 때는 RFM 분석 자체를 직접 하지 않더라도, 고객을
다양한 관점에서 segmentation하는 경우가 있음. 금융업권(은행, 증권, 보험, 카드)에 따라 segmentation하는 방법은 당연히 상이할 수 밖에 없음.
-- 테이블 생성
-- SELECT * FROM oceanic-antler-456102-i6.modulabs.rfm
-- LIMIT 5;
-- (힌트1단계)
-- RFM이라는 이름의 임시 테이블을 생성하여, Recency, Frequency, Monetary라는 3개의 컬럼을 추가합니다.
-- Recency 는 유저 별로 가장 최근에 일어났던 구매 일자를 오늘 날짜에서 뺀 값으로 계산합니다.
-- 오늘 날짜를 반환하는 함수는 CURRENT_DATE()입니다.
-- Frequency 는 유저별로 구매한 주문 건수를 계산한 값이고,
-- Monetary 는 유저별로 구매한 총액을 집계한 값입니다.
-- WITH RFM AS
-- (
-- SELECT user_id,
-- DATE_DIFF(CURRENT_DATE(), DATE(MAX(order_date)), DAY) AS recency,
-- COUNT(user_id) AS frequency,
-- SUM(order_value) AS monetary
-- FROM oceanic-antler-456102-i6.modulabs.rfm
-- GROUP BY user_id
-- )
-- SELECT * FROM RFM;
-- (힌트2단계)
-- RFMScores라는 임시 테이블을 생성하여, Recency, Frequency, Monetary에 기반하여 1점에서 5점까지의 척도로 점수
-- RecencyScore, FrequencyScore, MonetaryScore
WITH RFM AS
(
SELECT user_id,
DATE_DIFF(CURRENT_DATE(), DATE(MAX(order_date)), DAY) AS recency,
COUNT(user_id) AS frequency,
SUM(order_value) AS monetary
FROM oceanic-antler-456102-i6.modulabs.rfm
GROUP BY user_id
),
RFM_SCORE AS
(
SELECT *,
NTILE(5) OVER (ORDER BY RFM.recency DESC) AS RecencyScore,
NTILE(5) OVER (ORDER BY RFM.frequency DESC) AS FrequencyScore,
NTILE(5) OVER (ORDER BY RFM.monetary DESC) AS MonetaryScore,
FROM RFM
)
SELECT
user_id,
recency,
frequency,
monetary,
RecencyScore,
FrequencyScore,
MonetaryScore,
RecencyScore + FrequencyScore + MonetaryScore AS RFMScore
FROM RFM_SCORE;