pg_analytics is a PostgreSQL extension that brings DuckDB's powerful OLAP (Online Analytical Processing) engine into PostgreSQL.
With it, PostgreSQL can read and analyze Parquet, CSV, and JSON files directly.
In particular, pg_analytics uses DuckDB to access data stored in S3 buckets and expose it in PostgreSQL as foreign tables.
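For example, the general pattern is wrapper → server → foreign table, which the rest of this post walks through against S3. A minimal sketch of the same pattern against a local file, assuming the files option also accepts local filesystem paths (the path and object names here are hypothetical):
-- Hypothetical example: a local Parquet file through the same wrapper/server/foreign-table pattern
CREATE FOREIGN DATA WRAPPER local_parquet_wrapper
  HANDLER parquet_fdw_handler
  VALIDATOR parquet_fdw_validator;
CREATE SERVER local_parquet_server
  FOREIGN DATA WRAPPER local_parquet_wrapper;
CREATE FOREIGN TABLE local_trips ()
  SERVER local_parquet_server
  OPTIONS (files '/home/postgres/yellow_tripdata_2024-01.parquet');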
pg_analytics : https://github.com/paradedb/pg_analytics
duckdb : https://duckdb.org/docs/data/overview
OS : Rocky Linux 9 on WSL
DB : PostgreSQL 16.1.2
[root@DESKTOP-OUB301V postgresql]# dnf install ./pg_analytics_16-0.2.4-1PARADEDB.el9.x86_64.rpm
[postgres@DESKTOP-OUB301V ~]$ cat /var/lib/pgsql/16/data/postgresql.conf |grep 'shared_preload_libraries'
shared_preload_libraries = 'pg_analytics' # (change requires restart)
[postgres@DESKTOP-OUB301V ~]$ sudo systemctl restart postgresql-16
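After the restart, the preload setting can be confirmed from any session (standard PostgreSQL command):
-- Should return 'pg_analytics', as configured above
SHOW shared_preload_libraries;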
postgres=# create database s3test;
CREATE DATABASE
postgres=# \c s3test
You are now connected to database "s3test" as user "postgres".
s3test=# create extension pg_analytics ;
CREATE EXTENSION
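To double-check which version of the extension is installed, a standard catalog query works (it should match the 0.2.4 RPM installed above):
-- Should report 0.2.4, matching the installed RPM
SELECT extname, extversion FROM pg_extension WHERE extname = 'pg_analytics';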
-- S3 parquet test
CREATE FOREIGN DATA WRAPPER parquet_wrapper HANDLER parquet_fdw_handler VALIDATOR parquet_fdw_validator;
-- Provide S3 credentials
CREATE SERVER parquet_server FOREIGN DATA WRAPPER parquet_wrapper;
-- Create foreign table with auto schema creation
CREATE FOREIGN TABLE trips ()
SERVER parquet_server
OPTIONS (files 's3://paradedb-benchmarks/yellow_tripdata_2024-01.parquet');
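The paradedb-benchmarks bucket appears to be readable anonymously, so no credentials are attached here. For a private bucket, a USER MAPPING with S3 credentials would also be needed on this server, in the same form used for the CSV test further below (a sketch; key_id/secret/region are placeholders):
-- Only needed for private buckets; same options as in the CSV example later on
CREATE USER MAPPING FOR postgres
SERVER parquet_server
OPTIONS (
    type 'S3',
    key_id '****',
    secret '****',
    region 'ap-northeast-2'
);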
s3test=# \d trips
Foreign table "public.trips"
Column | Type | Collation | Nullable | Default | FDW options
-----------------------+-----------------------------+-----------+----------+---------+-------------
vendorid | integer | | | |
tpep_pickup_datetime | timestamp without time zone | | | |
tpep_dropoff_datetime | timestamp without time zone | | | |
passenger_count | bigint | | | |
trip_distance | double precision | | | |
ratecodeid | bigint | | | |
store_and_fwd_flag | character varying | | | |
pulocationid | integer | | | |
dolocationid | integer | | | |
payment_type | bigint | | | |
fare_amount | double precision | | | |
extra | double precision | | | |
mta_tax | double precision | | | |
tip_amount | double precision | | | |
tolls_amount | double precision | | | |
improvement_surcharge | double precision | | | |
total_amount | double precision | | | |
congestion_surcharge | double precision | | | |
airport_fee | double precision | | | |
Server: parquet_server
FDW options: (files 's3://paradedb-benchmarks/yellow_tripdata_2024-01.parquet')
-- Success! Now you can query the remote Parquet file like a regular Postgres table
SELECT COUNT(*) FROM trips;
count
---------
2964624
(1 row)
-- Check query performance
s3test=# SELECT count(*) FROM trips;
count
---------
2964624
(1 row)
Time: 2942.286 ms (00:02.942)
s3test=# SELECT * FROM trips limit 1;
vendorid | tpep_pickup_datetime | tpep_dropoff_datetime | passenger_count | trip_distance | ratecodeid | store_and_fwd_flag | pulocationid | dolocationid |
payment_type | fare_amount | extra | mta_tax | tip_amount | tolls_amount | improvement_surcharge | total_amount | congestion_surcharge | airport_fee
----------+----------------------+-----------------------+-----------------+---------------+------------+--------------------+--------------+--------------+
--------------+-------------+-------+---------+------------+--------------+-----------------------+--------------+----------------------+-------------
2 | 2024-01-01 00:57:55 | 2024-01-01 01:17:43 | 1 | 1.72 | 1 | N | 186 | 79 |
2 | 17.7 | 1 | 0.5 | 0 | 0 | 1 | 22.7 | 2.5 | 0
(1 row)
Time: 53570.829 ms (00:53.571)
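SELECT * has to pull every column of the Parquet file from S3, which is why it is so much slower than COUNT(*). Since Parquet is columnar and DuckDB can push projections down, selecting only the columns you need should be noticeably cheaper (a sketch, not measured here):
-- Fetch only two of the 19 columns instead of the full row
SELECT tpep_pickup_datetime, total_amount FROM trips LIMIT 1;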
s3test=# create table ctas_trips as select * from trips;
SELECT 2964624
Time: 33662.761 ms (00:33.663)
s3test=# select count(*) from ctas_trips ;
count
---------
2964624
(1 row)
Time: 447.115 ms
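Once the data is materialized locally with CTAS, it behaves like any ordinary PostgreSQL table, so indexes, constraints, and ANALYZE all apply (a sketch; the index column is just an example):
-- Regular Postgres features apply to the local copy
CREATE INDEX idx_ctas_trips_pickup ON ctas_trips (tpep_pickup_datetime);
ANALYZE ctas_trips;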
s3test=# create table csvtest (name text, num int);
CREATE TABLE
s3test=# insert into csvtest values ('jkjung',1);
INSERT 0 1
s3test=# insert into csvtest values ('kilee',2);
INSERT 0 1
s3test=# insert into csvtest values ('hmson',3);
INSERT 0 1
s3test=# copy csvtest to '/home/postgres/csvtest.csv';
COPY 3
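Note that plain COPY TO writes PostgreSQL's text format without a header line, which is why the foreign table below ends up with auto-generated column names (column0, column1). Exporting as CSV with a header might let the original column names carry over, assuming the CSV reader detects the header row (a sketch):
-- Hypothetical alternative: export as CSV with a header row
COPY csvtest TO '/home/postgres/csvtest.csv' WITH (FORMAT csv, HEADER);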
[postgres@DESKTOP-OUB301V ~]$ aws s3 cp csvtest.csv s3://test-parquet-jkjung
[postgres@DESKTOP-OUB301V ~]$ aws s3 ls s3://test-parquet-jkjung
2025-01-06 21:24:55 25 csvtest.csv
CREATE FOREIGN DATA WRAPPER csv_wrapper
HANDLER csv_fdw_handler
VALIDATOR csv_fdw_validator;
CREATE SERVER csv_server
FOREIGN DATA WRAPPER csv_wrapper;
CREATE USER MAPPING FOR postgres
SERVER csv_server
OPTIONS (
type 'S3',
key_id '****',
secret '****',
region 'ap-northeast-2'
);
CREATE FOREIGN TABLE s3csvtest ()
SERVER csv_server
OPTIONS (files 's3://test-parquet-jkjung/csvtest.csv');
s3test=# \d s3csvtest
Foreign table "public.s3csvtest"
Column | Type | Collation | Nullable | Default | FDW options
---------+-------------------+-----------+----------+---------+-------------
column0 | character varying | | | |
column1 | bigint | | | |
Server: csv_server
FDW options: (files 's3://test-parquet-jkjung/csvtest.csv')
s3test=# select * from s3csvtest ;
column0 | column1
---------+---------
jkjung | 1
kilee | 2
hmson | 3
(3 rows)
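When the test is done, the foreign objects can be removed with standard DROP statements (a sketch):
-- Tear down the CSV test objects
DROP FOREIGN TABLE s3csvtest;
DROP USER MAPPING FOR postgres SERVER csv_server;
DROP SERVER csv_server;
DROP FOREIGN DATA WRAPPER csv_wrapper;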