이어서 계속
이전쿼리 결과 캐싱하기
빅쿼리의 쿼리 결과를 임시 테이블에 자동으로 캐싱한다. 그리고 약 24시간 이내에 같은 쿼리를 제출하면 연산을 수행하지 않고 임시 테이블에서 결과를 제공한다. 캐시된 결과는 매우 빠르고 비용이 발생하지 않는다.
주의해야 할 상황
중간 결과 캐싱
WITH typical_trip AS (
SELECT
start_station_name
, end_station_name
, APPROX_QUANTILES(duration, 10)[OFFSET(5)] AS typical_duration
, COUNT(duration) AS num_trips
FROM
`bigquery-public-data`.london_bicycles.cycle_hire
GROUP BY
start_station_name, end_station_name
)
CREATE OR REPLACE TABLE ch07eu.typical_trip AS
SELECT
start_station_name
, end_station_name
, APPROX_QUANTILES(duration, 10)[OFFSET(5)] AS typical_duration
, COUNT(duration) AS num_trips
FROM
`bigquery-public-data`.london_bicycles.cycle_hire
GROUP BY
start_station_name, end_station_name
BI 엔진으로 쿼리 가속화
효율적으로 조인하기
-- 셀프 조인을 사용해 미국 전체의 남녀 아기 모두에게 많이 지어진 이름을 구하는 쿼리
WITH male_babies AS (
SELECT
name
, number AS num_babies
FROM `bigquery-public-data`.usa_names.usa_1910_current
WHERE gender = 'M'
),
female_babies AS (
SELECT
name
, number AS num_babies
FROM `bigquery-public-data`.usa_names.usa_1910_current
WHERE gender = 'F'
),
both_genders AS (
SELECT
name
, SUM(m.num_babies) + SUM(f.num_babies) AS num_babies
, SUM(m.num_babies) / (SUM(m.num_babies) + SUM(f.num_babies)) AS frac_male
FROM male_babies AS m
JOIN female_babies AS f
USING (name)
GROUP BY name
)
SELECT * FROM both_genders
WHERE frac_male BETWEEN 0.3 AND 0.7
ORDER BY num_babies DESC
LIMIT 5
-- 이는 실제 결과와 다르다. 아래의 코드는 테이블을 한 번만 읽고 셀프 조인을 수행하지 않는다.
-- 또한 속도도 2초밖에 걸리지 않았다.
WITH all_babies AS (
SELECT
name
, SUM(IF(gender = 'M', number, 0)) AS male_babies
, SUM(IF(gender = 'F', number, 0)) AS female_babies
FROM `bigquery-public-data.usa_names.usa_1910_current`
GROUP BY name
),
both_genders AS (
SELECT
name
, (male_babies + female_babies) AS num_babies
, SAFE_DIVIDE(male_babies, male_babies + female_babies) AS frac_male
FROM all_babies
WHERE male_babies > 0 AND female_babies > 0
)
SELECT * FROM both_genders
WHERE frac_male BETWEEN 0.3 and 0.7
ORDER BY num_babies DESC
limit 5
with all_names AS (
SELECT name, gender, SUM(number) AS num_babies
FROM `bigquery-public-data`.usa_names.usa_1910_current
GROUP BY name, gender
),
male_names AS (
SELECT name, num_babies
FROM all_names
WHERE gender = 'M'
),
female_names AS (
SELECT name, num_babies
FROM all_names
WHERE gender = 'F'
),
ratio AS (
SELECT
name
, (f.num_babies + m.num_babies) AS num_babies
, m.num_babies / (f.num_babies + m.num_babies) AS frac_male
FROM male_names AS m
JOIN female_names AS f
USING (name)
)
SELECT * from ratio
WHERE frac_male BETWEEN 0.3 and 0.7
ORDER BY num_babies DESC
LIMIT 5
WITH unused AS (
SELECT
bike_id
, start_station_name
, start_date
, end_date
, TIMESTAMP_DIFF(start_date, LAG(end_date) OVER (PARTITION BY bike_id ORDER BY start_date), SECOND) AS time_at_station
FROM `bigquery-public-data`.london_bicycles.cycle_hire
)
SELECT
start_station_name
, AVG(time_at_station) AS unused_seconds
FROM unused
GROUP BY start_station_name
ORDER BY unused_seconds ASC
LIMIT 5
5. 미리 계산된 값과 조인하기
워커의 과도한 작업 피하기
1. 큰 정렬 제한하기
2. 데이터 왜도
근사 집계 함수 사용하기
SELECT
COUNT(DISTINCT repo_name) AS num_repos
FROM `bigquery-public-data`.github_repos.commits, UNNEST(repo_name) AS repo_name
SELECT
APPROX_COUNT_DISTINCT(repo_name) AS num_repos
FROM `bigquery-public-data`.github_repos.commits, UNNEST(repo_name) AS repo_name
WITH sketch AS (
SELECT
HLL_COUNT.INIT(start_station_name) AS hll_start
, HLL_COUNT.INIT(end_station_name) AS hll_end
FROM `bigquery-public-data`.london_bicycles.cycle_hire
)
SELECT
HLL_COUNT.MERGE(hll_start) AS distinct_start
, HLL_COUNT.MERGE(hll_end) AS distinct_end
, HLL_COUNT.MERGE(hll_both) AS distinct_station
FROM sketch, UNNEST([hll_start, hll_end]) AS hll_both;
SELECT
APPROX_COUNT_DISTINCT(start_station_name) AS distinct_start
, APPROX_COUNT_DISTINCT(end_station_name) AS distinct_end
, APPROX_COUNT_DISTINCT(both_stations) AS distinct_station
FROM
`bigquery-public-data`.london_bicycles.cycle_hire
, UNNEST([start_station_name, end_station_name]) AS both_stations
뒤에서 이어짐