선행 커멘드:
use qcc;
dim_years
은 2000년부터 2030년까지의 연도 데이터를 포함하고 있습니다.sales_order
테이블에 저장되어 있으며,customer
테이블에 저장되어 있습니다.customerid
로 연결됩니다.2000년부터 2030년까지의 연도 데이터.
컬럼 이름 | 데이터 타입 | 설명 |
---|---|---|
year | INT | 연도 (PK ) |
컬럼 이름 | 데이터 타입 | 설명 |
---|---|---|
salesorderid | INT | 주문 ID (PK ) |
unitprice | DECIMAL(19,4) | 제품 단가 |
orderqty | SMALLINT | 주문 수량 |
customerid | INT | 고객 ID (FK , 고객 테이블과 연결) |
orderdate | TIMESTAMP | 주문 날짜 |
컬럼 이름 | 데이터 타입 | 설명 |
---|---|---|
customerid | INT | 고객 ID (PK ) |
firstname | VARCHAR(100) | 고객 이름 (성) |
lastname | VARCHAR(100) | 고객 성 (이름) |
당신은 이커머스 회사 ’루팡(주)’의 고객 코호트 분석팀에 속해 있습니다.
이번 분석의 목표는 2011년 12월에 첫 주문을 한 고객에 대해,
여러 연도별 매출 관련 지표를 집계하는 것입니다.
분석해야 할 내용은 다음과 같습니다 :
첫 주문 날짜가 2011년 12월인 고객만 필터링합니다.
각 고객의 첫 주문과 마지막 주문 사이의 기간을 기준으로, 그 기간 동안의 연평균 총거래액(GMV)을 계산합니다. 소수점 2째자리까지 계산해주세요.
GMV = 제품 단가 (
unitPrice
) × 주문 수량 (orderQty
)
0
으로 처리하고 계산합니다.고객별 각 연도에서의 주문 수를 집계하고, 최댓값과 그 최대값을 발생시킨 연도를 찾습니다.
결과는 고객 ID(customerid
) 기준으로 오름차순으로 정렬합니다.
결과에는 다음과 같은 데이터가 포함되어야 합니다:
customerid
)customer_name
)avg_yearly_gmv
), 소수점 2째자리max_order_year
)max_order_count
)customerid | customer_name | avg_yearly_gmv | max_order_year | max_order_count |
---|---|---|---|---|
11387 | Megan Ramirez | 2319.6 | 2013 | 5 |
11388 | Joseph Martin | 2328.92 | 2013 | 6 |
11394 | George McDonald | 2040.49 | 2013 | 4 |
11398 | Colin Nath | 2028.51 | 2014 | 2 |
11400 | Franklin Raji | 2322.59 | 2013 | 7 |
# 첫 주문 날짜가 2011년 12월인 고객만 ▶ 해결
## sales_order DATE_FORMAT(orderdate, '%Y%m') = '201112'
# 각 고객의 첫 주문과 마지막 주문 사이의 기간을 기준으로, 그 기간 동안의 연평균 총거래액(GMV)을 계산
## customer와 sales_order JOIN
## GROUP BY customerid
## 첫 주문: MIN(orderdate), 마지막 주문: MAX(orderdate)
## 그 기간 동안의 연평균 총거래액(GMV)
### GROUP BY YEAR(orderdate)하고 제품 단가 (unitPrice) × 주문 수량 (orderQty)
### 특정 연도에 주문을 하지 않았다면, 그 연도의 GMV와 주문 수는 0으로 처리 → IFNULL
# 고객별 각 연도에서의 주문 수를 집계하고, 최대값과 그 최대값을 발생시킨 연도를 찾기
## 고객 B: 2011년 (주문 10개), 2012년 (주문 5개), 2013년 (주문 10개)
## 최대 주문수: 10개
## 최대 주문수가 발생한 연도: 2013년 (2011년과 같은 주문 수지만 더 최근의 연도) → RANK 연도 DESC
# 결과는 고객 ID(customerid) 기준으로 오름차순으로 정렬
# 출력해야 하는 것
## 고객 ID (customerid) ▶ 해결
## 고객 이름 (customer_name) = 고객이름 (firstName) + " " + 고객성 (lastName) ▶ 해결
## 연평균 매출 (avg_yearly_gmv), 소수점 2째자리 → ROUND( ,2)
## 최대 주문 수 발생 연도 (max_order_year)
## 최대 주문 수 (max_order_count)
# 2011년 12월 첫 주문
SELECT
DATE(orderdate)
FROM
customer c
JOIN sales_order s
USING(customerid)
WHERE
DATE_FORMAT(orderdate, '%Y%m') = '201112'
GROUP BY
1
ORDER BY
1
;
# 각 고객의 첫 주문과 마지막 주문 사이의 기간을 기준으로, 그 기간 동안의 연평균 총거래액(GMV)을 계산
WITH customer_dec AS(
SELECT
DISTINCT c.customerid AS customerid
, CONCAT(firstName, " ", c.lastName) AS customer_name
FROM
customer c
JOIN sales_order s
USING(customerid)
WHERE
DATE_FORMAT(orderdate, '%Y%m') = '201112'
)
SELECT
s.customerid
, c.customer_name
, YEAR(orderdate)
, SUM(unitPrice*orderQty) AS GMV
, COUNT(orderQty) AS cnt
FROM
sales_order s
JOIN customer_dec c
USING(customerid)
GROUP BY
s.customerid
, c.customer_name
, YEAR(orderdate)
;
# 연도별 gmv, ordercount
WITH customer_dec AS(
SELECT
DISTINCT c.customerid AS customerid
, CONCAT(firstName, " ", c.lastName) AS customer_name
FROM
customer c
JOIN sales_order s
USING(customerid)
WHERE
DATE_FORMAT(orderdate, '%Y%m') = '201112'
)
, customer_sales_year AS(
SELECT
s.customerid
, c.customer_name
, YEAR(orderdate) AS `year`
, SUM(unitPrice*orderQty) AS GMV
, COUNT(orderQty) AS cnt
FROM
sales_order s
JOIN customer_dec c
USING(customerid)
GROUP BY
s.customerid
, c.customer_name
, YEAR(orderdate)
)
SELECT
customerid
, customer_name
, y.`year`
, IFNULL(GMV, 0) AS GMV
, IFNULL(cnt, 0) AS cnt
FROM
dim_years y
LEFT JOIN customer_sales_year c
USING(`year`)
WHERE
customerid IS NOT NULL
;
# 최종 제출용
WITH customer_dec AS(
SELECT
DISTINCT c.customerid AS customerid
, CONCAT(firstName, " ", c.lastName) AS customer_name
FROM
customer c
JOIN sales_order s
USING(customerid)
WHERE
DATE_FORMAT(orderdate, '%Y%m') = '201112'
)
, customer_sales_year AS(
SELECT
s.customerid
, c.customer_name
, YEAR(orderdate) AS `year`
, SUM(unitPrice*orderQty) AS GMV
, COUNT(orderQty) AS cnt
FROM
sales_order s
JOIN customer_dec c
USING(customerid)
GROUP BY
s.customerid
, c.customer_name
, YEAR(orderdate)
)
, table_fillna AS(
SELECT
customerid
, customer_name
, y.`year`
, IFNULL(GMV, 0) AS GMV
, IFNULL(cnt, 0) AS cnt
FROM
dim_years y
LEFT JOIN customer_sales_year c
USING(`year`)
WHERE
customerid IS NOT NULL
)
, table_final AS (
SELECT
customerid
, customer_name
, `year`
, MAX(cnt) AS max_order_count
, ROUND(SUM(GMV)/COUNT(`year`), 2) AS avg_yearly_gmv
, RANK() OVER (PARTITION BY customerid ORDER BY `year` DESC) AS rnk_year
FROM
table_fillna
GROUP BY
customerid
, `year`
ORDER BY
customerid
)
, rnk_year AS(
SELECT
customerid
, `year` AS max_order_year
FROM
table_final
WHERE
rnk_year = 1
)
SELECT
f.customerid
, customer_name
, avg_yearly_gmv
, max_order_year
, max_order_count
FROM
table_final f
JOIN rnk_year y
USING(customerid)
GROUP BY
f.customerid
, customer_name
, avg_yearly_gmv
, max_order_year
ORDER BY
customerid
;
→ 마무리 못하고 시간이 다 되어서 그냥 제출했음😭
-- divide and conquer
WITH customer_order_range AS (
-- 첫 주문과 마지막 주문 연도 계산
-- 고객 이름 = 이름 + 성
-- Filter: 2011년 12월에 첫 주문한 고객
SELECT
c.customerid,
CONCAT(c.firstname, ' ', c.lastname) customer_name,
MIN(YEAR(so.orderdate)) AS first_order_year,
MAX(YEAR(so.orderdate)) AS last_order_year
FROM
qcc.customer c
INNER JOIN qcc.sales_order so ON c.customerid = so.customerid
GROUP BY c.customerid
HAVING DATE_FORMAT(MIN(so.orderdate), '%Y-%m') = '2011-12'
), customer_years AS (
-- 첫 주문과 마지막 주문 사이
SELECT
c.customerid,
c.customer_name,
dy.year
FROM
customer_order_range c
LEFT JOIN qcc.dim_years dy
ON dy.year BETWEEN c.first_order_year AND c.last_order_year
), customer_gmv AS (
-- 고객 별 연도 별 GMV, 주문수
SELECT
cy.customerid,
cy.customer_name,
cy.year,
COALESCE(SUM(so.unitprice * so.orderqty), 0) AS gmv,
COALESCE(SUM(so.orderqty), 0) AS orders
FROM
customer_years cy
LEFT JOIN qcc.sales_order so ON cy.customerid = so.customerid AND YEAR(so.orderdate) = cy.year
GROUP BY cy.customerid, cy.customer_name, cy.year
), avg_gmv_orders AS (
-- 고객 별 연평균 GMV
SELECT
customerid,
customer_name,
ROUND(AVG(gmv), 2) AS avg_yearly_gmv
FROM
customer_gmv
GROUP BY customerid, customer_name
), max_orders_per_year AS (
-- 고객 별 연 최대 주문 수, 최대 주문 수의 연도
SELECT customerid, customer_name, year, orders
FROM (
SELECT
customerid,
customer_name,
year,
orders,
ROW_NUMBER() OVER (PARTITION BY customerid ORDER BY orders DESC, year DESC) AS order_rank
FROM
customer_gmv
) a
WHERE order_rank = 1
)
SELECT
a.customerid,
a.customer_name,
a.avg_yearly_gmv,
m.year AS max_order_year,
m.orders AS max_order_count
FROM
avg_gmv_orders a
INNER JOIN
max_orders_per_year m ON a.customerid = m.customerid
ORDER BY
a.customerid;
use qcc;
select
c.customerid
, concat(c.firstname, ' ', c.lastname) customer_name
, min(year(orderdate)) first_order_year
, max(year(orderdate)) last_order_year
from
customer c
join sales_order o
on c.customerid = o.customerid
group by 1, 2
having min(date_format(o.orderdate, '%Y%m')) = 201112
select
year(so.orderdate) yr
, c.customerid
, SUM(so.unitprice * so.orderqty) as gmv
from
customer c
join sales_order so
on c.customerid = so.customerid
group by 1, 2
1+2
with target_customer as(
select
c.customerid
, concat(c.firstname, ' ', c.lastname) customer_name
, min(year(orderdate)) first_order_year
, max(year(orderdate)) last_order_year
from
customer c
join sales_order o
on c.customerid = o.customerid
group by 1, 2
having min(date_format(o.orderdate, '%Y%m')) = 201112
)
select
year(so.orderdate) yr
, c.customerid
, SUM(so.unitprice * so.orderqty) as gmv
from
customer c
join sales_order so
on c.customerid = so.customerid
join target_customer tc
on c.customerid = tc.customerid
group by 1, 2
※ 다른 방법으로:
고객별 마지막 year 와 2011과의 차이로 나누면, 미구입한 년도를 포함한 평균이 구해진다고 생각했습니다.
(1 + 0 + 0 + 3) / 4 = (1 + 3) /4
sum(GMV) / (lastyear - firstyear +1)
dim_year 활용할 때 cross join으로도 됨
WITH customer_order_range AS (
SELECT
c.customerid
, CONCAT(c.firstname, ' ', c.lastname) AS customer_name
, MIN(YEAR(so.orderdate)) AS first_order_year
, MAX(YEAR(so.orderdate)) AS last_order_year
FROM
qcc_3.customer AS c
JOIN qcc_3.sales_order AS so
USING(customerid)
GROUP BY
c.customerid
, CONCAT(c.firstname, ' ', c.lastname)
HAVING
DATE_FORMAT(MIN(so.orderdate), '%Y%m') = '201112'
)
, customer_order_years AS (
SELECT
c.customerid
, c.customer_name
, y.`year`
FROM
customer_order_range c
LEFT JOIN qcc_3.dim_years y
ON y.`year` BETWEEN first_order_year AND last_order_year
)
, customer_gmv AS (
SELECT
y.customerid
, y.customer_name
, y.`year`
, COALESCE(SUM(so.unitprice * so.orderqty), 0) AS gmv
, COALESCE(SUM(so.orderqty), 0) AS orders
FROM
customer_order_years y
LEFT JOIN qcc_3.sales_order so
ON y.customerid = so.customerid
AND YEAR(so.orderdate) = y.`year`
GROUP BY
y.customerid
, y.customer_name
, y.`year`
)
, customer_gmv_avg AS (
SELECT
customerid
, customer_name
, ROUND(AVG(gmv), 2) AS avg_yearly_gmv
FROM
customer_gmv
GROUP BY
customerid
, customer_name
)
, customer_max_rnk AS (
SELECT
customerid
, customer_name
, `year`
, orders
, ROW_NUMBER() OVER (PARTITION BY customerid ORDER BY orders DESC, `year` DESC) AS order_rnk
FROM
customer_gmv
)
, customer_year_orders_max AS (
SELECT
customerid
, customer_name
, `year`
, orders
FROM
customer_max_rnk
WHERE
order_rnk = 1
)
SELECT
ca.customerid
, ca.customer_name
, ca.avg_yearly_gmv
, cm.`year` AS max_order_year
, cm.orders AS max_order_count
FROM
customer_gmv_avg ca
JOIN customer_year_orders_max cm
USING(customerid)
ORDER BY
ca.customerid
;
WITH customer_dec_range AS ( -- 첫 주문과 마지막 주문 구해야 함!
SELECT
c.customerid
, CONCAT(c.firstname, ' ', c.lastname) AS customer_name
, MIN(YEAR(s.orderdate)) AS first_order_year
, MAX(YEAR(s.orderdate)) AS last_order_year
FROM
customer c
JOIN sales_order s
USING (customerid)
GROUP BY
c.customerid
, CONCAT(c.firstname, ' ', c.lastname)
HAVING
DATE_FORMAT(MIN(s.orderdate), '%Y%m') = '201112' -- 첫 주문 2011년 12월 고객만
)
, customer_sales_year AS ( -- 년도별 테이블 만들기
SELECT
c.customerid
, c.customer_name
, d.`year`
FROM
customer_dec_range c
LEFT JOIN dim_years d
ON d.`year` BETWEEN c.first_order_year AND c.last_order_year -- 핵심!
)
, customer_gmv_orders AS (
SELECT
c.customerid
, c.customer_name
, c.`year`
, COALESCE(SUM(s.unitPrice*s.orderQty), 0) AS GMV
, COALESCE(SUM(s.orderQty), 0) AS orders
FROM
customer_sales_year c
LEFT JOIN sales_order s
ON c.customerid = s.customerid
AND YEAR(s.orderdate) = c.`year`
GROUP BY
c.customerid
, c.customer_name
, c.`year`
)
, customer_avg_gmv AS (
SELECT
customerid
, customer_name
, ROUND(AVG(gmv), 2) AS avg_yearly_gmv
FROM
customer_gmv_orders
GROUP BY
customerid
, customer_name
)
, customer_max_year AS (
SELECT
customerid
, customer_name
, `year`
, orders
, ROW_NUMBER() OVER (PARTITION BY customerid ORDER BY orders DESC, `year` DESC) AS order_rnk
FROM
customer_gmv_orders
)
SELECT
a.customerid
, a.customer_name
, a.avg_yearly_gmv
, m.`year` AS max_order_year
, m.orders AS max_order_count
FROM
customer_avg_gmv a
JOIN customer_max_year m
USING(customerid)
WHERE
m.order_rnk = 1
ORDER BY
a.customerid
;