1. GROUP BY와 AGGREGATE 함수
2. CTAS와 CTE
1. GROUP BY & Aggregate 함수
- 월별 세션수를 계산하는 SQL
- raw_data.session_timestamp를 사용 (sessionId와 ts 필드)
#
SELECT LEFT(ts, 7) AS mon,
COUNT(1) AS session_count
FROM raw_data.session_timestamp
GROUP BY 1 -- GROUP BY mon, GROUP BY LEFT(ts, 7)
ORDER BY 1 ;
SELECT channel,
COUNT(1) AS session_count,
COUNT(DISTINCT userId) AS user_count
FROM raw_data.user_session_channel
GROUP BY 1 -- GROUP BY channel
ORDER BY 2 DESC; -- ORDER BY session_count DESC
SELECT userId,
COUNT(1) AS count
FROM raw_data.user_session_channel
GROUP BY 1 -- GROUP BY userId
ORDER BY 2 DESC -- ORDER BY count DESC
LIMIT 1;
SELECT TO_CHAR(A.ts, 'YYYY-MM') AS month,
channel,
COUNT(DISTINCT B.userid) AS mau
FROM raw_data.session_timestamp A
JOIN raw_data.user_session_channel B ON A.sessionid = B.sessionid
GROUP BY 1, 2
ORDER BY 1 DESC, 2;
- TO_CHAR (A.ts, ‘YYYY-MM’)
- LEFT(A.ts, 7)
- DATE_TRUNC(‘month’, A.ts)
- SUBSTRING(A.ts, 1, 7)
필드/테이블 이름에 Alias 사용. AS는 필수가 아님.
- COUNT(DISTINCT B.userid) AS mau와
- COUNT(DISTINCT B.userid) mau는 동일
ORDER BY와 GROUP BY
- 포지션 번호 vs. 필드 이름
- GROUP BY 1 == GROUP BY month == GROUP BY TO_CHAR(A.ts, 'YYYY-MM')
SELECT TO_CHAR(A.ts, 'YYYY-MM') AS month,
channel,
COUNT(DISTINCT B.userid) AS mau
FROM raw_data.session_timestamp A
JOIN raw_data.user_session_channel B ON A.sessionid = B.sessionid
GROUP BY 1, 2
ORDER BY 1 DESC, 2;
1. CTAS: SELECT를 가지고 테이블 생성
2. 항상 시도해봐야하는 데이터 품질 확인 방법들
1. 중복된 레코드들 체크하기
- 다음 두 개의 카운트를 비교
SELECT COUNT(1)
FROM table;
SELECT COUNT(1)
FROM (
SELECT DISTINCT userId, sessionId, ts, channel
FROM table
);
- CTE를 사용해서 중복 제거 후 카운트 해보기
With ds AS (
SELECT DISTINCT userId, sessionId, ts, channel
FROM table
)
SELECT COUNT(1)
FROM ds;
2. 최근 데이터의 존재 여부 체크하기 (freshness)
SELECT MIN(ts), MAX(ts)
FROM table;
3. Primary key uniqueness가 지켜지는지 체크하기
SELECT sessionId, COUNT(1)
FROM table
GROUP BY 1
ORDER BY 2 DESC
LIMIT 1;
4. 값이 비어있는 컬럼들이 있는지 체크하기
SELECT
COUNT(CASE WHEN sessionId is NULL THEN 1 END) sessionid_null_count,
COUNT(CASE WHEN userId is NULL THEN 1 END) userid_null_count,
COUNT(CASE WHEN ts is NULL THEN 1 END) ts_null_count,
COUNT(CASE WHEN channel is NULL THEN 1 END) channel_null_count
FROM table;