session_timestamp와 user_session_channel은 sessionid를 기준으로 1:1 관계
user_session_channel과 session_transaction은 N:1 관계. 즉, 유저는 거래 세션을 포함한 여러 세션을 가질 수 있고, 그 중 거래가 발생한 세션이 session_transaction 테이블에 저장되어 있다.
SELECT ts, channel
FROM raw_data.user_session_channel usc
JOIN raw_data.session_timestamp st ON usc.sessionid = st.sessionid
WHERE userid = 250
ORDER BY ts asc
limit 1;
SELECT ts, channel
FROM raw_data.user_session_channel usc
JOIN raw_data.session_timestamp st ON usc.sessionid = st.sessionid
WHERE userid = 250
ORDER BY ts desc
limit 1;
-> 각 유저별 정렬 기능을 수행하기 위해 ROW_NUMBER를 사용할 수 있다.1. 유저별로 시간순 일련번호를 매기고 싶다면?
2. 새로운 컬럼 seq추가 : 유저별로 레코드를 모으고 그 안에서 시간순으로 정렬한 후 사용자별로 1부터 번호를 부여한다.
3. ROW_NUMBER를 쓰면 2번을 구현할 수 있다.
ROW_NUMBER() OVER (partition by userid order by ts ASC) seq
with ts_seq as
(
select userid, ts, channel,
row_number() over (partition by userid order by ts asc) as seq_first,
row_number() over (partition by userid order by ts desc) as seq_last
from raw_data.user_session_channel usc, raw_data.session_timestamp st
where usc.sessionid = st.sessionid
)
select userid,
MAX(CASE WHEN seq_first = 1 THEN channel END) f_channel,
MAX(CASE WHEN seq_last = 1 THEN channel END) l_channel
from ts_seq
where seq_first = 1 or seq_last = 1
group by 1 ; -- 하나의 userid에 대해 한줄로 f_channel,l_channel을 출력하기 위해
WITH cte AS (
SELECT userid, channel,
(ROW_NUMBER() OVER (PARTITION BY usc.userid ORDER BY st.ts asc)) AS arn,
(ROW_NUMBER() OVER (PARTITION BY usc.userid ORDER BY st.ts desc)) AS drn
FROM raw_data.user_session_channel usc
JOIN raw_data.session_timestamp st ON usc.sessionid = st.sessionid
)
SELECT cte1.userid, cte1.channel AS first_touch, cte2.channel AS last_touch
FROM cte cte1
JOIN cte cte2 ON cte1.userid = cte2.userid and cte2.drn = 1
WHERE cte1.arn = 1
ORDER BY 1;
WITH Subquery를 빌딩블록으로 사용하여 SELF JOIN
SELECT DISTINCT A.userid,
FIRST_VALUE(A.channel) over(partition by A.userid order by B.ts
rows between unbounded preceding and unbounded following) AS First_Channel,
LAST_VALUE(A.channel) over(partition by A.userid order by B.ts
rows between unbounded preceding and unbounded following) AS Last_Channel
FROM raw_data.user_session_channel A
LEFT JOIN raw_data.session_timestamp B
ON A.sessionid = B.sessionid;
userid 별 그룹화, 그룹 내에서 ts순으로 정렬
rows between unbounded preceding and unbounded following
unbounded : 제약이 없다.
preceding : 앞으로 몇 개를 볼 것인지
following : 뒤로 몇 개를 볼 것인지
value 테이블 : 1, 2, 3, 4, 5
rolling sum : 6, 10, 15, 14, 12
SELECT
SUM(value)
OVER (order by value rows between 2 preceding and 2 following) AS rolling_sum
FROM raw_data.rows_test ;
만약, unbounded 옵션을 주게 되면 1, 2, 3, 4, 5 모든 위치에서 값이 15가 나오게 된다.
Gross revenue: Refund 포함한 매출
select userid, sum(amount) as gross_revenue
from raw_data.session_transaction as st
left join raw_data.session_timestamp as ts
on st.sessionid = ts.sessionid
left join raw_data.user_session_channel as usc
on ts.sessionid = usc.sessionid
group by userid
order by 2 desc
limit 10 ;
SELECT userID,
SUM(amount) gross_revenue,
SUM(CASE WHEN st.refunded is not TRUE THEN amount END) net_revenue
FROM raw_data.session_transaction st
JOIN raw_data.user_session_channel usc ON st.sessionid = usc.sessionid
GROUP BY 1
ORDER BY 2 DESC
LIMIT 10;
st.refunded is not TRUE
# 숙제 4 : 채널별/월별 매출액 테이블 만들기
%%sql
with paid as (
select *, to_char(stime.ts, 'YYYY-MM') as year_month,
case
when stran.sessionid is not null then 'paid'
else 'nonpaid'
end as paidUsers
from raw_data.user_session_channel as usc
left join raw_data.session_transaction as stran
on stran.sessionid = usc.sessionid
left join raw_data.session_timestamp as stime
on usc.sessionid = stime.sessionid
where paidUsers like 'paid')
select paid.year_month,
usc.channel,
count(distinct(usc.userid))as uniqueUsers,
count(distinct(case
when stran.sessionid is not null then 'paid'
else 'nonpaid'
end)) as paidUsers, -- end as paidUserss)) as paidUsers, ->
paid.paidUsers::float/nullif(uniqueUsers::float, 0) as conversionRate,
sum(stran.amount) as grossRevenue,
sum(case
when stran.refunded is not True then stran.amount -- like 'True'가 아니라 is not True로 해야 함
else NULL -- else 생략하면 자동으로 null로 채워지나? sum(stran.amount) 가 아니라 amount로,,,합은 앞단에서 했고, year_month, channel 별로 합산하기로 아래에서 group by를 통해
end) netRevenue -- end as 말고 end로 끝내기
from raw_data.session_timestamp as stime
left join raw_data.user_session_channel as usc
on stime.sessionid = usc.sessionid
left join raw_data.session_transaction as stran
on usc.sessionid = stran.sessionid,
paid
group by year_month, usc.channel
📌 error
column "paid.paidusers" must appear in the GROUP BY clause or be used in an aggregate function
SELECT LEFT(ts, 7) "month",
usc.channel,
COUNT(DISTINCT userid) uniqueUsers,
COUNT(DISTINCT (CASE WHEN amount is not NULL THEN userid END)) paidUsers,
-- amount가 null이 아닌 고객ID를 PaidUsers로 정의
ROUND(paidUsers::float*100/NULLIF(uniqueUsers, 0),2) conversionRate, -- 51.49%
SUM(amount) grossRevenue,
SUM(CASE WHEN refunded is not True THEN amount END) netRevenue
-- refunded가 True가 아닌 = False, Null인 row의 amount의 총합을 netRevenue로 정의
FROM raw_data.user_session_channel usc
LEFT JOIN raw_data.session_timestamp t ON t.sessionid = usc.sessionid
LEFT JOIN raw_data.session_transaction st ON st.sessionid = usc.sessionid
GROUP BY 1, 2
ORDER BY 1, 2;
ROUND(paidUsers::float*100/NULLIF(uniqueUsers, 0),2)