query = """
SELECT
start_station_name
, AVG(duration) as avg_duration
FROM
`bigquery-public-data`.london_bicycles.cycle_hire
WHERE
start_station_name LIKE CONCAT('%', @STATION, '%') -- 중요
AND duration BETWEEN @MIN_DURATION AND @MAX_DURATION
GROUP BY start_station_name
"""
query_params = [
bigquery.ScalarQueryParameter(
"STATION", "STRING", station_name),
bigquery.ScalarQueryParameter(
"MIN_DURATION", "FLOAT64", min_duration),
bigquery.ScalarQueryParameter(
"MAX_DURATION", "FLOAT64", max_duration),
]
def print_query_results(client,
station_name,
min_duration=0,
max_duration=84000):
job_config = bigquery.QueryJobConfig()
job_config.query_parameters = query_params
query_job = client.query(
query,
location="EU",
job_config=job_config,
)
for row in query_job:
print("{}: \t{}".format(
row.start_station_name, row.avg_duration))
client = bigquery.Client()
print_query_results(client, 'Kennington', 300)
print_query_results(client, 'Hyde Park', 600, 6000)
from google.cloud import bigquery
from datetime import datetime
from datetime import timedelta
import pytz
def print_query_results(client, mid_time):
start_time = mid_time - timedelta(minutes=30)
end_time = mid_time + timedelta(minutes=30)
query = """
SELECT
AVG(duration) as avg_duration
FROM
`bigquery-public-data`.london_bicycles.cycle_hire
WHERE
start_date BETWEEN @START_TIME AND @END_TIME
"""
query_params = [
bigquery.ScalarQueryParameter(
"START_TIME", "TIMESTAMP", start_time),
bigquery.ScalarQueryParameter(
"END_TIME", "TIMESTAMP", end_time),
]
job_config = bigquery.QueryJobConfig()
job_config.query_parameters = query_params
query_job = client.query(
query,
location="EU",
job_config=job_config,
)
for row in query_job:
print(row.avg_duration)
print("______________________")
# 아래의 쿼리는 특정 날을 표현하는 datetime 객체를 전달해 함수를 호출하는 과정이다.
client = bigquery.Client()
print_query_results(client, datetime(2021, 6, 12, 18, 38, tzinfo=pytz.UTC)
def print_query_results(client, params):
query = """
SELECT
start_station_name
, AVG(duration) as avg_duration
FROM
`bigquery-public-data`.london_bicycles.cycle_hire
WHERE
start_station_name LIKE CONCAT('%', ?, '%')
AND duration BETWEEN ? AND ?
GROUP BY start_station_name
"""
query_params = [
bigquery.ScalarQueryParameter(
None, "STRING", params[0]),
bigquery.ScalarQueryParameter(
None, "FLOAT64", params[1]),
bigquery.ScalarQueryParameter(
None, "FLOAT64", params[2]),
]
job_config = bigquery.QueryJobConfig()
job_config.query_parameters = query_params
query_job = client.query(
query,
location="EU",
job_config=job_config,
)
for row in query_job:
print("{}: \t{}".format(
row.start_station_name, row.avg_duration))
print("______________________")
client = bigquery.Client()
print_query_results(client, ['Kennington', 300, 84000])
print_query_results(client, ['Hyde Park', 600, 6000])
query = """
SELECT
start_station_id
, COUNT(*) AS num_trips
FROM
`bigquery-public-data`.london_bicycles.cycle_hire
WHERE
start_station_id IN UNNEST(@STATIONS) # 이부분 설정
AND duration BETWEEN @MIN_DURATION AND @MAX_DURATION
GROUP BY start_station_id
"""
query_params = [
bigquery.ArrayQueryParameter( # 이부분 Array 설정
'STATIONS', "INT64", ids),
bigquery.ScalarQueryParameter(
'MIN_DURATION', "FLOAT64", min_duration),
bigquery.ScalarQueryParameter(
'MAX_DURATION', "FLOAT64", max_duration),
]
# station 파라미터에 ID가 저장된 파이썬 배열을 전달
print_query_results(client, [270, 235, 62, 149], 300, 600)
CREATE TEMPORARY FUNCTION dayOfWeek(x TIMESTAMP) AS
(
['Sun','Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat']
[ORDINAL(EXTRACT(DAYOFWEEK FROM x))]
);
CREATE TEMPORARY FUNCTION getDate(x TIMESTAMP) AS
(
EXTRACT(DATE FROM x)
);
WITH overnight_trips AS (
SELECT
duration
, dayOfWeek(start_date) AS start_day
FROM
`bigquery-public-data`.london_bicycles.cycle_hire
WHERE
getDate(start_date) != getDate(end_date)
)
SELECT
start_day
, COUNT(*) AS num_overnight_rentals
, AVG(duration)/3600 AS avg_duration_hours
FROM
overnight_trips
GROUP BY
start_day
ORDER BY num_overnight_rentals DESC
이전 절의 함수는 임시 함수로 정의되었으므로 함수를 선언한 쿼리 내에서만 사용할 수 있다. 다른 쿼리에서 다시 사용하려면 함수를 복사해서 붙여넣어야 한다. 복사 후 붙여넣기는 본질적으로 오류가 발생하기 쉬운 작업이다.
함수를 여러 쿼리에서 재사용하고 싶다면 그 함수를 데이터셋에 저장하고 쿼리에서 참조하는 것이 바람직하다.
CREATE OR REPLACE FUNCTION ch08eu.dayOfWeek(x TIMESTAMP) AS
(
['Sun','Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat']
[ORDINAL(EXTRACT(DAYOFWEEK FROM x))]
);
SELECT
start_date,
COUNT(*) AS num_long_trips
FROM -- "첫 번째 FROM 절"
(SELECT
start_station_name
, duration
, EXTRACT(DATE FROM start_date) AS start_date
FROM
`bigquery-public-data`.london_bicycles.cycle_hire
WHERE
start_station_name = end_station_name) AS roundtrips
WHERE -- "바깥쪽의 WHERE"
duration > 2*(
SELECT
AVG(duration) AS avg_duration
FROM
`bigquery-public-data`.london_bicycles.cycle_hire
WHERE
start_station_name = end_station_name
AND roundtrips.start_station_name = start_station_name
)
GROUP BY start_date
ORDER BY num_long_trips DESC
LIMIT 5
WITH roundtrips AS (
SELECT
start_station_name
, duration
, EXTRACT(DATE FROM start_date) AS start_date
FROM
`bigquery-public-data`.london_bicycles.cycle_hire
WHERE
start_station_name = end_station_name
),
station_avg AS (
SELECT
start_station_name
, AVG(duration) as avg_duration
FROM
roundtrips
GROUP BY start_station_name
)
SELECT
start_date,
COUNT(*) AS num_long_trips
FROM
roundtrips
JOIN station_avg USING(start_station_name)
WHERE duration > 2*avg_duration
GROUP BY start_date
ORDER BY num_long_trips DESC
LIMIT 5
WITH params AS (
SELECT 600 AS DURATION_THRESH
)
SELECT
start_station_name
, COUNT(duration) AS num_trips
FROM
`bigquery-public-data`.london_bicycles.cycle_hire
, params
WHERE duration >= DURATION_THRESH
GROUP BY start_station_name
ORDER BY num_trips DESC
LIMIT 5
뒤에서 이어짐