1. JOIN
2. SQL
JOIN이란?
SQL 조인은 두 개 혹은 그 이상의 테이블들을 공통 필드를 가지고 머지하는데 사용됨, 이는 스타 스키마로 구성된 테이블들로 분산되어 있던 정보를 통합하는데 사용됨.
JOIN시 고려해야할 점
중복 레코드가 없고 Primary Key의 uniqueness가 보장됨을 체크
-> 가장 중요!
조인하는 테이블들간의 관계를 명확하게 정의
One to one
이때, left join을 할지, inner join을 할지 잘 생각해봐야한다.
inner join의 경우 null이 생기지 않지만 그만큼 데이터의 소실이 생길 것이고,
left join의 경우 부분 집합에 해당하는 쪽에서 null값이 생길 것이기에..
One to many
-> 이 경우 중복이 있을 경우 증폭되므로 더욱 문제가 됨
Many to one
-> One to many와 사실상 동일
Many to many
-> 이런 경우는 많지 않으며 이는 one to one, one to many로 바꾸는 것이 가능하다면 변환하여 join하는 것이 덜 위험
어느 테이블을 베이스로 잡을지 결정해야함
JOIN의 종류
INNER JOIN
LEFT JOIN
FULL JOIN
e.g.)
SELECT * FROM raw_data.Vital v FULL JOIN raw_data.Alert a ON v.vitalid = a.vitalid;
CROSS JOIN (= CARTESIAN JOIN )
e.g.)
SELECT * FROM raw_data.Vital v CROSS JOIN raw_data.Alert a;
SELF JOIN
SELECT * FROM raw_data.Vital v1 JOIN raw_data.Vital v2 ON v1.vitalid = v2.vitalid;OUT JOIN (minus)
SELECT DISTINCT sessionid FROM raw_data.user_session_channel
MINUS
SELECT DISTINCT sessionid FROM raw_data.user_session_timestamp;
BOOLEAN 타입 처리
True or False
다음 2개는 동일한 표현
다음 2개는 동일한 표현인가?
NULL 비교
NULLIF
0으로 나누는 경우 divide by 0 에러 발생
NULLIF를 사용하여 0을 NULL로 변경
-> e.g.) NULLIF(uniqueUsers,0)
COALESCE
NULL 값을 다른 값으로 바꿔주는 함수
COALESCE(exp1, exp2, exp3, ...)
e.g)
SELECT
value1, value2
/*
value1이 NULL이면 value2를 확인하고
value2가 NULL이 아니면 value2를 반환,
value2도 NULL이면 0을 리턴
*/
COALESCE(value1, value2, 0)
FROM raw_data.count_test;
e.g.)
CREATE TABLE keeyong.test(
-- group은 예약키워드이기 때문에 에러 발생
group int primary key,
-- ''(single quote)를 이해 못하기에 에러 발생
'mailing address' varchar(32)
)
-- 수정
CREATE TABLE keeyong.test(
"group" int primary key,
"mailing address" varchar(32)
)
ROW_NUMBER vs. FIRST_VALUE/LAST_VALUE
사용자 251번의 시간순으로 봤을 때 첫 번째 채널과 마지막 채널은 무엇인가?
SELECT ts, channel
FROM raw_data.user_session_channel a
JOIN raw_data.session_timestamp b
ON a.sessionid = b.sessionid
WHERE userid = 251
ORDER BY 1
ROW_NUMBER를 이용해보자
ROW_NUMBER() OVER (PARTITION BY field1 ORDER BY field2) nne.g.)
ROW_NUMBER OVER (PARTITION BY userid ORDER BY ts) seq
내 풀이
%%sql
with temp as(
SELECT
row_number() over (partition by userid order by ts) nn,
count(1) over (partition by userid) max_cnt,
channel,
userid
FROM raw_data.user_session_channel a
JOIN raw_data.session_timestamp b
ON a.sessionid = b.sessionid
)
SELECT channel, userid
FROM temp
WHERE nn = 1 or nn = max_cnt;
with first as (
select userid, ts, channel, row_number() over(partition by userid order by ts) seq
from raw_data.user_session_channel usc
join raw_data.session_timestamp st ON usc.sessionid = st.sessionid
), last as(
select userid, ts, channel, row_number() over(partition by userid order by ts desc) seq
from raw_data.user_session_channel usc
join raw_data.session_timestamp st ON usc.sessionid = st.sessionid
)
select first.userid as userid, first.channel as first_channel, last.channel as last_channel
from first
join last on first.userid = last.userid and last.seq=1
where first.seq = 1;
select first.userid as userid, first.channel as first_channel, last.channel as last_channel
from(
select userid, ts, channel, row_number() over(partition by userid order by ts) seq
from raw_data.user_session_channel usc
join raw_data.session_timestamp st ON usc.sessionid = st.sessionid
) first
join(
select userid, ts, channel, row_number() over(partition by userid order by ts desc) seq
from raw_data.user_session_channel usc
join raw_data.session_timestamp st ON usc.sessionid = st.sessionid
) last
on first.userid = last.userid and last.seq=1
where first.seq = 1;
select userid,
-- 일종의 편법
-- max를 사용하든 min을 사용하든 상관없음
-- 그룹 별 첫번째 인덱스만을 띄우기 위함
max(case when rn1=1 then channel end) first_touch,
max(case when rn2=1 then channel end) last_touch,
from(
select userid,
channel,
(row_number() over (partition by usc.userid orderby st.ts asc)) as rn1,
(row_number() over (partition by usc.userid orderby st.ts desc)) as rn2
from raw_data.user_session_channel usc
join raw_data.session_timestamp st on usc.sessionid = st.sessionid
)
group by 1;
rows between unbounded preceding and unbounded following)select distinct
a.userid,
first_value(a.channel) over(partition by a.userid order by b.ts rows between unbounded preceding and unbounded following) as first_channel,
last_value(a.channel) over(partition by a.userid order by b.ts rows between unbounded preceding and unbounded following) as last_channel
from raw_data.user_session_channel a
left join raw_data.session_timestamp b on a.sessionid = b.sessionid;
사용 테이블
Gross revenue : Refund 포함한 매출
내 풀이
%%sql
SELECT
userid,
CASE WHEN SUM(amount) is NULL THEN 0
ELSE SUM(amount) END grossRevenue
FROM raw_data.user_session_channel usc
LEFT JOIN raw_data.session_timestamp t ON t.sessionid = usc.sessionid
LEFT JOIN raw_data.session_transaction st ON st.sessionid = usc.sessionid
GROUP BY 1
ORDER BY grossRevenue desc
LIMIT 10;
SELECT
userid,
SUM(amount)
FROM raw_data.user_session_channel usc
LEFT JOIN raw_data.session_transaction st ON st.sessionid = usc.sessionid
GROUP BY 1
ORDER BY 2 desc
LIMIT 10;
select
distinct usc.userid,
sum(amount) over(partition by usc.userid)
FROM raw_data.user_session_channel usc
JOIN raw_data.session_transaction st ON st.sessionid = usc.sessionid
ORDER BY 2 desc
LIMIT 10;
SELECT * FROM raw_data.nps LIMIT 10;
고객들이 0(의향 없음)에서 10(의향 아주 높음)
detractor(비추천자) : 0에서 6
passive(소극자) : 7이나 8점
promoter(홍보자) : 9나 10점
NPS = promoter 퍼센트 - detractor 퍼센트
내 풀이
%%sql
SELECT TO_CHAR(created_at, 'YYYY-MM') year_month,
ROUND(COUNT(CASE WHEN score between 0 and 6 THEN 1 END)*100./NULLIF(COUNT(1),0),2) detractor,
ROUND(COUNT(CASE WHEN score between 9 and 10 THEN 1 END)*100./NULLIF(COUNT(1),0),2) promoter,
promoter-detractor NPS
FROM raw_data.nps
GROUP BY 1
ORDER BY 1;
SELECT month,
round((promoters-detractors)::float/total_count*100,2) as overall_nps
FROM (
SELECT LEFT(created_at,7) as month,
count(case when score >= 9 then 1 end) as promoters,
count(case when score <= 6 then 1 end) as detractors,
count(1) as total_count
FROM raw_data.nps
GROUP BY 1
ORDER BY 1
)