이어서 계속
사용자 정의 함수는 가능하면 SQL로 작성하는 것이 바람직이다.
빅쿼리는 SQL을 최적화하고 분산 처리해 효율적으로 실행하지만 자바스크립트 사용자 정의 함수는 이런 혜택을 받지 못한다.
자바스크립트 사용자 정의 함수에는 코드 크기, 출력의 크기, 한 쿼리에서 사용할 수 있는 자바스크립트 함수 수 등에 제한이 있기 때문이다.
자바스크립트 사용자 정의 함수는 SQL로 작성하기 어려운 복잡한 계산을 수행해야 하는 상황에 유용하며, 앞서 언급한 제한 사항에도 불구하고 편리한 대안으로 사용할 수 있다.
CREATE OR REPLACE FUNCTION ch08eu.computePrice(dur INT64)
RETURNS INT64
LANGUAGE js AS """
function factorial(n) {
return (n > 1) ? n * factorial(n - 1) : 1;
}
var nhours = 1 + Math.floor(dur/3600.0);
var f = factorial(nhours);
var discount = 0.8/(1+Math.pow(Math.E, -f));
return 3 + Math.floor(dur * (1-discount) * 0.0023)
""";
SELECT
duration, ch08eu.computePrice(duration) AS price
FROM
`bigquery-public-data`.london_bicycles.cycle_hire
LIMIT 5
CREATE OR REPLACE FUNCTION ch08eu.computePrice(dur INT64)
RETURNS INT64
LANGUAGE js AS """
function factorial(n) {
return (n > 1) ? n * factorial(n - 1) : 1;
}
var nhours = 1 + Math.floor(dur/3600.0);
var f = factorial(nhours);
var discount = 0.8/(1+Math.pow(Math.E, -f));
return 3 + Math.floor(dur * (1-discount) * 0.0023)
""";
SELECT
duration, ch08eu.computePrice(duration) AS price
FROM
`bigquery-public-data`.london_bicycles.cycle_hire
LIMIT 5
CREATE OR REPLACE TABLE ch08eu.typical_trip AS
SELECT
start_station_name
, end_station_name
, APPROX_QUANTILES(duration, 10)[OFFSET(5)] AS typical_duration
, COUNT(*) AS num_trips
FROM
`bigquery-public-data`.london_bicycles.cycle_hire
GROUP BY
start_station_name, end_station_name
;
CREATE OR REPLACE TABLE ch08eu.unusual_days AS
SELECT
EXTRACT (DATE FROM start_date) AS trip_date
, APPROX_QUANTILES(duration / typical_duration, 10)[OFFSET(5)] AS ratio
, COUNT(*) AS num_trips_on_day
FROM
`bigquery-public-data`.london_bicycles.cycle_hire AS hire
, ch08eu.typical_trip AS trip
WHERE
hire.start_station_name = trip.start_station_name
AND hire.end_station_name = trip.end_station_name
AND num_trips > 10
GROUP BY trip_date
HAVING num_trips_on_day > 10
ORDER BY ratio DESC
;
DROP TABLE ch08eu.typical_trip;
사실 스크립트가 필요한 상황의 상당 부분은 WITH 절, 조인, 상관된 서브 쿼리 또는 GROUP BY 절을 사용해도 해결할 수 있다. 스크립트를 작성하기 전에 단일 쿼리를 사용해 문제점을 해결할 수 있는지 고려해야한다. 대부분의 경우 단일 쿼리가 훨씬 효율적이다.
CREATE TEMPORARY TABLE typical_trip AS
SELECT
start_station_name
, end_station_name
, APPROX_QUANTILES(duration, 10)[OFFSET(5)] AS typical_duration
, COUNT(*) AS num_trips
FROM
`bigquery-public-data`.london_bicycles.cycle_hire
GROUP BY
start_station_name, end_station_name
;
CREATE OR REPLACE TABLE ch08eu.unusual_days AS
SELECT
EXTRACT (DATE FROM start_date) AS trip_date
, APPROX_QUANTILES(duration / typical_duration, 10)[OFFSET(5)] AS ratio
, COUNT(*) AS num_trips_on_day
FROM
`bigquery-public-data`.london_bicycles.cycle_hire AS hire
, typical_trip AS trip
WHERE
hire.start_station_name = trip.start_station_name
AND hire.end_station_name = trip.end_station_name
AND num_trips > 10
GROUP BY trip_date
HAVING num_trips_on_day > 10
ORDER BY ratio DESC
;
-- 변수
DECLARE PATTERN STRING DEFAULT '%Hyde%';
DECLARE stations ARRAY<STRING>;
DECLARE MIN_TRIPS_THRESH INT64 DEFAULT 100;
-- 쿼리 결과를 변수에 저장하는 쿼리
SET stations = (
SELECT
ARRAY_AGG(name)
FROM
`bigquery-public-data`.london_bicycles.cycle_stations
WHERE
name LIKE PATTERN
);
-- 대여 시간이 가장 긴 대여 건의 반납 대여소를 찾는 쿼리
SELECT
start_station_name
, end_station_name
, AVG(duration) AS avg_duration
, COUNT(duration) AS num_trips
FROM
`bigquery-public-data`.london_bicycles.cycle_hire
, UNNEST(stations) AS station
WHERE
start_station_name = station
GROUP BY start_station_name, end_station_name
HAVING num_trips > MIN_TRIPS_THRESH
ORDER BY avg_duration DESC
LIMIT 5
-- Variables
DECLARE PATTERN STRING DEFAULT '%Hyde%';
DECLARE MIN_TRIPS_THRESH INT64 DEFAULT 100;
DECLARE stations ARRAY<STRING>;
-- Find stations of interest
SET stations = (
SELECT
ARRAY_AGG(name)
FROM
`bigquery-public-data`.london_bicycles.cycle_stations
WHERE
name LIKE PATTERN
);
-- Loop through a number of thresholds
WHILE MIN_TRIPS_THRESH < 1000 DO
SELECT
start_station_name
, end_station_name
, AVG(duration) AS avg_duration
, COUNT(duration) AS num_trips
FROM
`bigquery-public-data`.london_bicycles.cycle_hire
, UNNEST(stations) AS station
WHERE
start_station_name = station
GROUP BY start_station_name, end_station_name
HAVING num_trips > MIN_TRIPS_THRESH
ORDER BY avg_duration DESC
LIMIT 5;
SET MIN_TRIPS_THRESH = MIN_TRIPS_THRESH * 2;
END WHILE
LOOP
IF MIN_TRIPS_THRESH >= 1000 THEN
BREAK;
END IF;
SELECT MIN_TRIPS_THRESH;
SET MIN_TRIPS_THRESH = MIN_TRIPS_THRESH * 2;
END LOOP;
BEGIN
DECLARE stations ARRAY<INT64>;
SET stations = (
SELECT
ARRAY_AGG(CAST(name AS INT64)) names
FROM
`bigquery-public-data`.london_bicycles.cycle_stations
WHERE
name LIKE '%Kings%'
);
EXCEPTION WHEN ERROR THEN
SELECT
@@error.message, -- 작동이 안됨.
@@error.stack-trace;
END;
SELECT
MIN(duration) AS min_dur
, MAX(duration) AS max_dur
, COUNT(duration) AS num_dur
, AVG(duration) AS avg_dur
, SUM(duration) AS total_dur
, STDDEV(duration) AS stddev_dur
, VARIANCE(duration) AS variance_dur
FROM
`bigquery-public-data`.london_bicycles.cycle_hire
SELECT
APPROX_QUANTILES(duration, 3)
FROM
`bigquery-public-data`.london_bicycles.cycle_hire