CTAS
- SELECT를 가지고 테이블 생성
- 간단하게 새로운 테이블을 만드는 방법
- 자주 사용하는 테이블을 미리 만들기
DROP TABLE IF EXISTS adhoc.jaehyun_session_summary;
CREATE TABLE adhoc.jaehyun_session_summary AS
SELECT B.*, A.ts FROM raw_data.session_timestamp A
JOIN raw_data.session_channel B ON A.sessionid = B.sessionid;
SELECT
FROM adhoc.jaehyun_session_summary
GROUP BY 1
ORDER BY 1
데이터 품질 확인 방법
중복된 레코드들 체크하기
SELECT COUNT(1)
FROM adhoc.jaehyun_session_summary
SELECT COUNT(1)
FROM (
SELECT DISTINCE userId, sessionId, ts, channel
FROM adhoc.jaehyun_session_summary
);
최근 데이터의 존재 여부 체크하기
SELECT MIN(ts), MAX(ts)
FROM adhoc.jaehyun_session_summary;
Primary key uniqueness가 지켜지는 지 체크하기
SELECT sessionId, COUNT(1)
FROM adhoc.jaehyun_session_summary
GROUP BY 1
ORDER BY 2 DESC
LIMIT 1;
값이 비어있는 컬럼들이 있는 지 체크하기
SELECT
COUNT(CASE WHEN sessionId is NULL THEN 1 END) sessionid_null_count,
COUNT(CASE WHEN userId is NULL THEN 1 END) userId_null_count,
COUNT(CASE WHEN ts is NULL THEN 1 END) ts_null_count,
COUNT(CASE WHEN channel is NULL THEN 1 END) channel_null_count,
FROM adhoc.jaehyun_session_summary;
숙제
- month
- channel
- uniqueUsers (총방문 사용자)
- paidUsers (구매 사용자: refund한 경우도 판매로 고려)
- conversionRate (구매 사용자 / 총방문 사용자), float
- grossRevenue (refund 포함)
- netRevenue (refund 미포함)
- 결과 집합
- 내가 가져오려고 하는 데이터가 어디에 있는가
- month -> session_timestamp.ts
- channel -> user_session_channel.channel
- uniqueUsers (총방문 사용자) -> user_session_channel.userId
- paidUsers (구매 사용자: refund한 경우도 판매로 고려) -> user_session_channel.userId, session_transaction.amount
- conversionRate (구매 사용자 / 총방문 사용자), float -> uniqueUsers, paidUsers
- grossRevenue (refund 포함) -> session_transaction.amount, refunded
- netRevenue (refund 미포함) -> session_transaction.amount, refunded
- 어떻게 연결할 것인가
- session_timestamp.sessionId inner join user_session_channel.sessionId
- session_transaction.sessionId left join user_session_channel.seesionId
- 필터링을 세분화하자
- 채널 별: group by channel
- 월 별: group by month
DROP TABLE IF EXISTS adhoc.jaehyun_session_summary;
CREATE TABLE adhoc.jaehyun_session_summary AS
SELECT
B.*,
A.ts,
C.refunded,
C.amount
FROM
raw_data.session_timestamp A
INNER JOIN raw_data.user_session_channel B ON A.sessionid = B.sessionid
LEFT JOIN raw_data.session_transaction C ON C.sessionid = A.sessionid;
SELECT
TO_CHAR(ts, 'YYYY-MM') AS month,
channel AS channel,
COUNT(DISTINCT(userId)) AS uniqueUsers,
COUNT(CASE WHEN amount > 0 THEN 1 END) AS paidUsers,
CAST(COUNT(CASE WHEN amount > 0 THEN 1 END) AS FLOAT) / CAST(COUNT(DISTINCT userId) AS FLOAT) AS conversionRate,
SUM(amount) AS grossRevenue,
SUM(CASE WHEN refunded = TRUE THEN amount ELSE 0 END) AS netRevenue
FROM adhoc.jaehyun_session_summary
GROUP BY 1,2;