๐Ÿ“’ SQL(3)

Kimdongkiยท2024๋…„ 4์›” 24์ผ

DB

๋ชฉ๋ก ๋ณด๊ธฐ
6/33

๐Ÿ“Œ GROUP BY & Aggregate

  • ํ…Œ์ด๋ธ”์˜ ๋ ˆ์ฝ”๋“œ๋ฅผ ๊ทธ๋ฃนํ•‘ํ•˜์—ฌ ๊ทธ๋ฃน๋ณ„๋กœ ๋‹ค์–‘ํ•œ ์ •๋ณด๋ฅผ ๊ณ„์‚ฐํ•œ๋‹ค.

  • ์ด๋Š” ๋‘ ๋‹จ๊ณ„๋กœ ์ด๋ฃจ์–ด์ ธ์žˆ๋‹ค.

    1. ๊ทธ๋ฃนํ•‘ํ•  ํ•„๋“œ๋ฅผ ๊ฒฐ์ •
      -> Group by๋กœ ์ง€์ •(ํ•„๋“œ ์ด๋ฆ„ or ํ•„๋“œ ์ผ๋ จ๋ฒˆํ˜ธ ์‚ฌ์šฉ)
    2. ๋‹ค์Œ ๊ทธ๋ฃน๋ณ„๋กœ ๊ณ„์‚ฐํ•  ๋‚ด์šฉ์„ ๊ฒฐ์ •
      -> Aggregateํ•จ์ˆ˜ ์‚ฌ์šฉ
      -> COUNT, SUM, AVG, MIN, MAX, LISTAGG,...
      ๋ณดํ†ต ํ•„๋“œ ์ด๋ฆ„์„ ์ง€์ •ํ•˜๋Š” ๊ฒƒ์ด ์ผ๋ฐ˜์ ์ด๋‹ค.(alias)
  • ์›”๋ณ„ ์„ธ์…˜์ˆ˜ -> ๋งŒ์•ฝ ์ผ๋ณ„์„ ์›ํ•œ๋‹ค๋ฉด LEFT(ts, 10)์œผ๋กœ ํ•˜๋ฉด ๋œ๋‹ค.

SELECT
	LEFT(ts, 7) AS mon,
    COUNT(1) AS session_count
FROM raw_data.session_timestamp
GROUP BY 1
ORDER BY 1;
  • ๊ฐ€์žฅ ๋งŽ์ด ์‚ฌ์šฉ๋œ ์ฑ„๋„
    • ์‚ฌ์šฉ์ž ๊ธฐ๋ฐ˜ or ์„ธ์…˜ ๊ธฐ๋ฐ˜
    • ํ•„์š”ํ•œ ์ •๋ณด -> ์ฑ„๋„ ์ •๋ณด, ์‚ฌ์šฉ์ž ์ •๋ณด or ์„ธ์…˜ ์ •๋ณด
    • ํ…Œ์ด๋ธ” ์„ ํƒ(Join ์—ฌ๋ถ€ ํ™•์ธ)
SELECT
	channel,
    COUNT(1) AS session_count,
    COUNT(DISTINCT userid) AS user_count
FROM raw_data.user_session_channel
GROUP BY 1
ORDER BY 2 DESC;
  • ๊ฐ€์žฅ ๋งŽ์€ ์„ธ์…˜์„ ๋งŒ๋“ค์–ด๋‚ธ ์‚ฌ์šฉ์ž ID
    • ํ•„์š”ํ•œ ์ •๋ณด -> ์„ธ์…˜ ์ •๋ณด, ์‚ฌ์šฉ์ž ์ •๋ณด
    • ํ…Œ์ด๋ธ” ์„ ํƒ(Join ์—ฌ๋ถ€ ํ™•์ธ)
SELECT
	userid,
    COUNT(1) AS count
FROM raw_data.user_session_channel
GROUP BY 1
ORDER BY 2 DESC
LIMIT 1;
  • ์›”๋ณ„ ์œ ๋‹ˆํฌํ•œ ์‚ฌ์šฉ์ž ์ˆ˜(MAU - Monthly Active User)
    • ํ•„์š”ํ•œ ์ •๋ณด -> ์‹œ๊ฐ„ ์ •๋ณด, ์‚ฌ์šฉ์ž ์ •๋ณด
    • ํ…Œ์ด๋ธ” ์„ ํƒ
      • user_session_channel(userid, sessionid, channel)
      • session_timestamp(sessionid, ts) -> ts = timestemp
        -> Join ํ•ด์•ผํ•จ
SELECT
	TO_CHAR(A.ts, 'YYYY-MM') AS month,
    COUNT(DISTINCT B.userid) AS mau
FROM raw_data.session_timestamp A
JOIN raw_data.user_session_channel B ON A.sessionid = B.sessionid
GROUP BY 1
ORDER BY 1 DESC;
  • TO_CHAR(A.ts, 'YYYY-MM')
    • Left(A.ts, 7)
    • DATE_TRUNC('month', A.ts)
    • SUBSTRING(A.ts, 1, 7)
  • ์›”๋ณ„ ์ฑ„๋„๋ณ„ ์œ ๋‹ˆํฌํ•œ ์‚ฌ์šฉ์ž ์ˆ˜
SELECT
	TO_CHAR(A.ts, 'YYYY-MM') AS month,
    channel,
    COUNT(DISTINCT B.userid) AS mau
FROM raw_data.session_timestamp A
JOIN raw_data.user_session_channel B ON A.sessionid = B.sessionid
GROUP BY 1, 2
ORDER BY 1 DESC, 2;
  • ์ฑ„๋„๋ณ„ ์›”๋ณ„ ๋งค์ถœ์•ก ํ…Œ์ด๋ธ” ๋งŒ๋“ค๊ธฐ
    • session_timestamp, user_session_channel, session_transaction ์‚ฌ์šฉ
    • ํ•„๋“œ ๊ตฌ์„ฑ
      • month
      • channel
      • uniqueUser(์ด ๋ฐฉ๋ฌธ ์‚ฌ์šฉ์ž)
      • paidUsers(๊ตฌ๋งค ์‚ฌ์šฉ์ž : refundํ•œ ๊ฒฝ์šฐ๋„ ํŒ๋งค๋กœ ๊ณ ๋ ค)
      • conversionRate(๊ตฌ๋งค ์‚ฌ์šฉ์ž / ์ด ๋ฐฉ๋ฌธ ์‚ฌ์šฉ์ž)
      • grossRevenue(refund ํฌํ•จ)
      • netRevenue(refund ์ œ์™ธ)
%%sql
CREATE TABLE adhoc.dongki_amount AS
SELECT
    TO_CHAR(A.ts, 'YYYY-MM') AS month,
    B.channel AS channel,
    COUNT(DISTINCT B.userid) AS uniqueUsers,
    COUNT(DISTINCT CASE WHEN C.refunded = 'false' THEN B.userid END) AS paidUsers,
    CASE
        WHEN COUNT(DISTINCT B.userid)=0 THEN 0
        ELSE ROUND(paidUsers * 100 / uniqueUsers, 1)
    END AS conversionRate,
    SUM(C.amount) AS grossRevenue,
    SUM(CASE WHEN C.refunded = 'true' THEN 0 ELSE C.amount END) AS netRevenue
FROM raw_data.session_timestamp A
JOIN raw_data.user_session_channel B ON A.sessionid = B.sessionid
JOIN raw_data.session_transaction C ON A.sessionid = C.sessionid
GROUP BY
    TO_CHAR(A.ts, 'YYYY-MM'),
    B.channel;

๐Ÿ“Œ CATS

  • SELECT๋ฅผ ๊ฐ€์ง€๊ณ  ํ…Œ์ด๋ธ”์„ ์ƒ์„ฑํ•œ๋‹ค.
  • ๊ฐ„๋‹จํ•˜๊ฒŒ ์ƒˆ๋กœ์šด ํ…Œ์ด๋ธ”์„ ๋งŒ๋“œ๋Š” ๋ฐฉ๋ฒ•
  • ์ž์ฃผ ์กฐ์ธํ•˜๋Š” ํ…Œ์ด๋ธ”๋“ค์ด ์žˆ๋‹ค๋ฉด CATS๋ฅผ ์‚ฌ์šฉํ•ด์„œ ์กฐ์ธํ•ด์ฃผ๋ฉด ํŽธ๋ฆฌํ•ด์ง„๋‹ค.
DROP TABLE IF EXISTS adhoc.kdk_session_summary;
CREATE TABLE adhoc.kdk_session_summary AS
SELECT B.*, A.ts FROM raw_data.session_timestamp A
JOIN raw_data.user_session_channel B ON A.session = B.sessionid;

๐Ÿ“Œ ๋ฐ์ดํ„ฐ ํ’ˆ์งˆ ํ™•์ธ ๋ฐฉ๋ฒ•

๐Ÿ“Œ ์ค‘๋ณต๋œ ๋ ˆ์ฝ”๋“œ ์ฒดํฌ

  1. ๋‹ค์Œ ๋‘ ๊ฐœ์˜ ์นด์šดํŠธ๋ฅผ ๋น„๊ต
SELECT COUNT(1)
FROM adhoc.kdk_session_summary;

SELECT COUNT(1)
FROM(
	SELECT DISTINCT userid, sessionid, ts, channel
    FROM adhoc.kdk_session_summary
);
  1. CTE๋ฅผ ์‚ฌ์šฉํ•ด์„œ ์ค‘๋ณต ์ œ๊ฑฐ ํ›„ ์นด์šดํŠธ
With ds AS(
	SELECT DISTINCT userid, sessionid, ts, channel
    FROM adhoc.kdk_session_summary
)

SELECT COUNT(1)
FROM ds

๐Ÿ“Œ ์ตœ๊ทผ ๋ฐ์ดํ„ฐ์˜ ์กด์žฌ ์—ฌ๋ถ€ ์ฒดํฌ(freshness)

SELECT MIN(ts), MAX(ts)
FROM adhoc.kdk_session_summary;

๐Ÿ“Œ Primary key uniqueness๊ฐ€ ์ง€์ผœ์ง€๋Š”์ง€ ์ฒดํฌ

SELECT sessionid, COUNT(1)
FROM adhoc.kdk_session_summary
GROUP BY 1
ORDER BY 2 DESC
LIMIT 1;

๐Ÿ“Œ ๊ฐ’์ด ๋น„์–ด์žˆ๋Š” ์ปฌ๋Ÿผ๋“ค์ด ์žˆ๋Š”์ง€ ์ฒดํฌ

SELECT
	COUNT(CASE WHEN sessionid is NULL THEN 1 END) sessionid_null_count,
    COUNT(CASE WHEN userid is NULL THEN 1 END) userid_null_count,
    COUNT(CASE WHEN ts is NULL THEN 1 END) ts_null_count,
    COUNT(CASE WHEN channel is NULL THEN 1 END) channel_null_count
FROM adhoc.kdk_session_summary;

0๊ฐœ์˜ ๋Œ“๊ธ€