SELECT
job_id
, query
, user_email
, total_bytes_processed
, total_slot_ms
FROM `some-project`.INFORMATION_SCHEMA.JOBS_BY_PROJECT
WHERE EXTRACT(YEAR FROM creation_time) = 2019
ORDER BY total_bytes_processed DESC
LIMIT 5
SELECT의 대상을 명확히 하기
-- 두번째 쿼리는 첫번째 쿼리에 비해 3배 느리고 7배 비싸다.
SELECT
bike_id
, duration
FROM
`bigquery-public-data`.london_bicycles.cycle_hire
ORDER BY duration DESC
LIMIT 1
SELECT
*
FROM
`bigquery-public-data`.london_bicycles.cycle_hire
ORDER BY duration DESC
LIMIT 1
데이터 읽는 양 줄이기
-- 이름을 사용해 쿼리의 I/O 오버헤드를 줄인 쿼리
-- 위의 쿼리보다 아래의 쿼리가 30% 정도 속도가 향상되는데
-- start_station_id를 읽을 필요가 없기 때문에 제외시켰기 때문이다.
SELECT
MIN(start_station_name) AS start_station_name
, MIN(end_station_name) AS end_station_name
, APPROX_QUANTILES(duration, 10)[OFFSET(5)] AS typical_duration
, COUNT(duration) AS num_trips
FROM
`bigquery-public-data`.london_bicycles.cycle_hire
WHERE
start_station_id != end_station_id
GROUP BY
start_station_id, end_station_id
ORDER BY num_trips DESC
LIMIT 10
SELECT
start_station_name
, end_station_name
, APPROX_QUANTILES(duration, 10)[OFFSET(5)] AS typical_duration
, COUNT(duration) AS num_trips
FROM
`bigquery-public-data`.london_bicycles.cycle_hire
WHERE
start_station_name != end_station_name
GROUP BY
start_station_name, end_station_name
ORDER BY num_trips DESC
LIMIT 10
비용이 높은 연산 줄이기
WITH trip_distance AS (
SELECT
bike_id
, ST_Distance(ST_GeogPoint(s.longitude, s.latitude),
ST_GeogPoint(e.longitude, e.latitude)) AS distance
FROM
`bigquery-public-data`.london_bicycles.cycle_hire,
`bigquery-public-data`.london_bicycles.cycle_stations s,
`bigquery-public-data`.london_bicycles.cycle_stations e
WHERE
start_station_id = s.id
AND end_station_id = e.id
)
SELECT
bike_id
, SUM(distance)/1000 AS total_distance
FROM trip_distance
GROUP BY bike_id
ORDER BY total_distance DESC
LIMIT 5;
-- with 부분만 변경한다면 속도가 30% 향상됨
WITH stations AS (
SELECT
s.id AS start_id
, e.id AS end_id
, ST_Distance(ST_GeogPoint(s.longitude, s.latitude),
ST_GeogPoint(e.longitude, e.latitude)) AS distance
FROM
`bigquery-public-data`.london_bicycles.cycle_stations s,
`bigquery-public-data`.london_bicycles.cycle_stations e
)
, trip_distance AS (
SELECT
bike_id
, distance
FROM
`bigquery-public-data`.london_bicycles.cycle_hire,
`bigquery-public-data`.london_bicycles.cycle_stations s,
`bigquery-public-data`.london_bicycles.cycle_stations e
WHERE
start_station_id = s.id
AND end_station_id = e.id
)
SELECT
bike_id
, SUM(distance)/1000 AS total_distance
FROM trip_distance
GROUP BY bike_id
ORDER BY total_distance DESC
LIMIT 5;
구체화된 뷰를 이용한 캐싱 연산
WITH oneway AS (
SELECT EXTRACT(date FROM start_date) AS rental_date,
duration, start_station_name, end_station_name
FROM
`bigquery-public-data`.london_bicycles.cycle_hire
WHERE start_station_name != end_station_name
)
SELECT
rental_date, AVG(duration) AS avg_duration,
start_station_name, end_station_name
FROM oneway
WHERE rental_date BETWEEN '2015-01-01' AND '2015-01-07'
GROUP BY rental_date, start_station_name, end_station_name
-- 이 쿼리는 사용자가 다른 날짜를 선택할 때마다 다른 rental_date 기간을 선택하기 위해 데이터를 처리한다.
-- 이 경우 전체 테이블을 다시 스캔해서 집계값을 계산해야 하기 때문이다.
-- 하지만 다음과 같이 모든 날짜에 대한 구체화된 뷰를 만들어둘 수 있다.
CREATE OR REPLACE MATERIALIZED VIEW ch07eu.oneway_rentals -- 이부분 수정필요
AS
WITH oneway AS (
SELECT EXTRACT(date FROM start_date) AS rental_date,
duration, start_station_name, end_station_name
FROM
ch07eu.cycle_hire
WHERE start_station_name != end_station_name
)
SELECT
rental_date, AVG(duration) AS avg_duration,
start_station_name, end_station_name
FROM oneway
GROUP BY rental_date, start_station_name, end_station_name
뒤에서 이어짐