SELECT ts, channel
FROM raw_data.user_session_channel usc
JOIN raw_data.session_timestamp st
ON usc.sessionId = st.sessionId
WHERE userId = 251
ORDER BY 1;
-- 방법 1 - CTE를 빌딩 블록으로 사용
-- first : 모든 사용자별 첫 번째(옛날) 채널 리턴
WITH first AS (
SELECT userId, ts, channel, ROW_NUMBER() OVER(PARTITION BY userId ORDER BY ts) seq
FROM raw_data.user_session_channel usc
JOIN raw_data.session_timestamp st
ON usc.sessionId = st.sessionId ),
-- last : 모든 사용자별 마지막(최근) 채널 리턴
last AS (
SELECT userId, ts, channel, ROW_NUMBER() OVER(PARTITION BY userId ORDER BY ts DESC) seq
FROM raw_data.user_session_channel usc
JOIN raw_data.session_timestamp st
ON usc.sessionId = st.sessionId )
-- 사용자별 처음 채널, 마지막 채널 리턴
SELECT frist.userId AS userId, first.channel AS first_channel, last.channel AS last_channel
FROM first
JOIN last
ON first.userId = last.userId AND last.seq = 1
WHERE first.seq = 1;
-- 방법 2 - JOIN 방식. 방법 1의 CTE 빌딩 블록을 FROM 문으로 넣어서 한 것과 동일.
SELECT first.userId AS userId, first.channel AS first_channel, last.channel AS last_channel
FROM (
SELECT userId, ts, channel, ROW_NUMBER() OVER(PARTITION BY userId ORDER BY ts) seq
FROM raw_data.user_session_channel usc
JOIN raw_data.session_timestamp st
ON usc.sessionId = st.sessionId ) first,
JOIN (
SELECT userId, ts, channel, ROW_NUMBER() OVER(PARTITION BY userId ORDER BY ts DESC) seq
FROM raw_data.user_session_channel usc
JOIN raw_data.session_timestamp st
ON usc.sessionId = st.sessionId ) last
ON first.userId = last.userId AND last.seq = 1
WHERE first.seq = 1;
-- 방법 3 - GROUP BY 방식 (복잡한 방법)
SELECT userId,
MAX(CASE WHEN rn1 = 1 THEN channel END) first_channel,
MAX(CASE WHEN rn2 = 1 THEN channel END) last_channel
FROM (
SELECT userId, channel,
(ROW_NUMBER() OVER(PARTITION BY usc.userId ORDER BY st.ts ASC)) AS rn1,
(ROW_NUMBER() OVER(PARTITION BY usc.userId ORDER BY st.ts DESC)) AS rn2
FROM raw_data.user_session_channel usc
JOIN raw_data.session_timestamp st
ON usc.sessionId = st.sessionId )
GROUP BY 1;
-- 방법 4 - FIRST_VALUE / LAST_VALUE 활용 (간단한 방법)
SELECT DISTINCT A.userId,
FIRST_VALUE(A.channel) OVER(PARTITION BY A.userId ORDER BY B.ts
-- default Attribute 필요.
rows between unbounded preceding and unbounded following) AS first_channel,
LAST_VALUE(A.channel) OVER(PARTITION BY A.userId ORDER BY B.ts
rows between unbounded preceding and unbounded following) AS last_channel
FROM raw_data.user_session_channel A
LEFT JOIN raw_data.session_timestamp B
ON A.sessionId = B.sessionId;
-- 방법 1 - GROUP BY 활용
SELECT userId, SUM(amount)
FROM raw_data.session_transaction st
LEFT JOIN raw_data.user_session_channel usc
ON st.sessionId = usc.sessionId
GROUP BY 1
ORDER BY 2 DESC
-- LIMIT 10의 경우의 문제점 : 동점자들에 대한 정보가 누락될 수 있다.
LIMIT 10;
-- 방법 2 - SUM OVER 활용. 앞의 방법과 달리 모든 userId가 선택됨.
SELECT DISTINCT usc.userId, SUM(amount) OVER(PARTITION BY usc.userId)
FROM raw_data.user_session_channel AS usc
JOIN raw_data.session_transaction AS revenue
ON revenue.sessionId = usc.sessionId
ORDER BY 2 DESC
LIMIT 10;
-- 방법 1.
SELECT month, ROUND((promoters-detractors)*100.0/NULLIF(total_count, 0)), 2) AS overall_nps
FROM (
SELECT LEFT(created, 7) AS month,
COUNT(CASE WHEN score >= 9 THEN 1 END) AS promoters,
COUNT(CASE WHEN score <= 6 THEN 1 END) AS detractors,
COUNT(CASE WHEN score > 6 AND score < 9 THEN 1 END) AS passives,
COUNT(1) AS total_count
FROM raw_data.nps
GROUP BY 1
-- ORDER BY 1
);
-- 방법 2.
SELECT LEFT(created, 7) AS month,
ROUND(SUM(CASE
WHEN score >= 9 THEN 1
WHEN score <= 6 THEN -1 END) * 100.0 / COUNT(1), 2)
FROM raw_data.nps
GROUP BY 1
ORDER BY 1;
Atomic하게 실행되어야 하는 SQL들을 묶어서 하나의 작업처럼 처리하는 방법입니다.
예) 은행 계좌 이체 (입금, 출금이 atmoic하게 실행되어야 함.)
-- 아래 두 명령어는 마치 하나의 명령어처럼 작동됨.
BEGIN
A의 계좌로부터 인출;
B의 계좌로 입금;
END;
-- BEGIN 전의 상태로 돌아가고 싶다면 ROLLBACK 실행.
트랜잭션 커밋 모드 : autocommit (postgresql)
Google Colab의 트랜잭션
psycopg2의 트랜잭션
- autocommit이라는 파라미터로 조절 가능.
DELETE FROM vs. TRUNCATE
DELETE FROM table_name
TRUNCATE table_name
UNION, EXCEPT, INTERSECT
CALESCE, NULLIF
LISTAGG
LAG
WINDOW 함수
JSON Parsing 함수
UNION (합집합)
EXCEPT(or MINUS) (차집합)
GROUP BY에서 사용되는 Aggregate 함수 중의 하나.
ex) 사용자 ID별로 채널을 순서대로 리스트:
-- 구분자 X
SELECT userId,
LISTAGG(channel) WITHIN GROUP (ORDER BY ts) channels
FROM raw_data.user_session_channel usc
JOIN raw_data.session_timestamp st
ON usc.sessionId = st.sessionId
GROUP BY 1
LIMIT 10;
-- 결과 예시
-- 68 YotubeGoogleInstagramYoutube...
-- 구분자 O
SELECT userId,
LISTAGG(channel, '->') WITHIN GROUP (ORDER BY ts) channels
FROM raw_data.user_session_channel usc
JOIN raw_data.session_timestamp st
ON usc.sessionId = st.sessionId
GROUP BY 1
LIMIT 10;
-- 결과 예시
-- 68 Yotube->Google->Instagram->Youtube...
Syntax: function(exp) OVER (PARTITION BY exp ORDER BY exp)
Useful functions:
-- 이전 채널 찾기
SELECT usc.*, st.ts,
LAG(channel, 1) OVER (PARTITION BY userId ORDER BY ts) prev_channel
FROM raw_data.user_session_channel usc
JOIN raw_data.session_timestamp st
ON usc.sessionId = st.sessionId
ORDER BY usc.userId, st.ts;
-- 다음 채널 찾기
SELECT usc.*, st.ts,
LAG(channel, 1) OVER (PARTITION BY userId ORDER BY ts DESC) next_channel
FROM raw_data.user_session_channel usc
JOIN raw_data.session_timestamp st
ON usc.sessionId = st.sessionId
ORDER BY usc.userId, st.ts;
JSON의 포맷을 이미 아는 상황에서 사용 가능한 함수
예제) JOSN_EXTRACT_PATH_TEXT