# Use the official PostgreSQL 12.20 image as the base image
FROM postgres:12.20
# Set environment variables for PostgreSQL
ENV POSTGRES_USER=postgres
ENV POSTGRES_PASSWORD=0000
ENV POSTGRES_DB=postgres
# Install build dependencies
RUN apt-get update && apt-get install -y \
    build-essential \
    libpq-dev \
    curl \
    git \
    postgresql-server-dev-12
# Install pgvector 0.7.0
RUN git clone --branch v0.7.0 https://github.com/pgvector/pgvector.git \
    && cd pgvector \
    && make \
    && make install
# Clean up to reduce the image size
RUN apt-get remove --purge -y build-essential git \
    && apt-get autoremove -y \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/* /pgvector
# Expose the default PostgreSQL port
EXPOSE 5432
# Build the image
docker build -t my-postgres:12.20-pgvector-0.7.0 .

# Run the container
docker run -d --name my-postgres-container -p 5432:5432 my-postgres:12.20-pgvector-0.7.0
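To check that the container is reachable, here is a minimal connection test (a sketch, assuming the credentials baked into the image above):

import psycopg2

# Connect with the credentials set in the Dockerfile and print the server version
conn = psycopg2.connect(host="localhost", port=5432, database="postgres",
                        user="postgres", password="0000")
cur = conn.cursor()
cur.execute("SELECT version();")
print(cur.fetchone()[0])
cur.close()
conn.close()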
import textwrap
from typing import List

import psycopg2


class VectorDB:
    def __init__(self):
        self._connection = None

    def connect(self):
        connection = psycopg2.connect(
            host="localhost",
            port=5432,
            database="postgres",
            user="postgres",
            password="0000",
        )
        cursor = connection.cursor()
        cursor.execute("SET TIMEZONE TO 'Asia/Seoul'")
        cursor.execute("SET search_path TO 'public'")
        cursor.close()
        self._connection = connection

    def close(self):
        if self._connection is not None:
            self._connection.close()
            self._connection = None

    def execute_query(self, query: str):
        self.execute_queries([query])

    def execute_queries(self, queries: List[str]):
        queries = [textwrap.dedent(query) for query in queries]
        connection = self.get_connection()
        for query in queries:
            if query.strip():
                cursor = connection.cursor()
                cursor.execute(query)
                connection.commit()
                cursor.close()

    def fetch_query(self, query: str):
        query = textwrap.dedent(query)
        connection = self.get_connection()
        cursor = connection.cursor()
        cursor.execute(query)
        result = cursor.fetchall()
        cursor.close()
        return result

    def explain_analyze_query(self, query: str):
        return self.fetch_query(f"EXPLAIN ANALYZE {query}")

    def get_connection(self):
        # Reuse the open connection, reconnecting only when needed
        if self._connection is None or self._connection.closed:
            self.connect()
        return self._connection
# Create the pgvector extension
vector = VectorDB()
query = """
CREATE EXTENSION vector;
"""
vector.execute_query(query)

# Create a table with a 256-dimensional vector column
query = """
CREATE TABLE public.items (
    id serial PRIMARY KEY,
    name TEXT,
    category_id INT,
    embedding vector(256)
);
"""
vector.execute_query(query)
Generate test data
import random

# Generate a random vector literal
def generate_random_vector(dimension: int) -> str:
    vector = [random.uniform(-1, 1) for _ in range(dimension)]  # random floats between -1.0 and 1.0
    vector_str = ','.join(map(str, vector))  # join into a comma-separated string
    return f"'[{vector_str}]'"

# Insert example rows (100,000 in the run below)
def insert_data(db: VectorDB, num_rows: int, dimension: int):
    queries = []
    for i in range(num_rows):
        name = f"Item {i + 1}"
        category_id = random.randint(1, 3)  # pick randomly from 1, 2, 3
        embedding = generate_random_vector(dimension)
        query = f"""
        INSERT INTO public.items (name, category_id, embedding)
        VALUES ('{name}', {category_id}, {embedding});
        """
        queries.append(query)
        # Batch execution: run 1,000 queries at a time
        if (i + 1) % 1000 == 0:
            db.execute_queries(queries)
            queries.clear()
    # Run any remaining queries
    if queries:
        db.execute_queries(queries)

# Main entry point
if __name__ == "__main__":
    vector_db = VectorDB()
    try:
        dimension = 256    # vector dimension
        num_rows = 100000  # number of rows to insert
        insert_data(vector_db, num_rows, dimension)
        print(f"Successfully inserted {num_rows} rows into the database.")
    finally:
        vector_db.close()
Basic storage operations: add a vector column, bulk load, upsert, update, and delete.

ALTER TABLE items ADD COLUMN embedding vector(3);
COPY items (embedding) FROM STDIN WITH (FORMAT BINARY);
INSERT INTO items (id, embedding) VALUES (1, '[1,2,3]'), (2, '[4,5,6]')
ON CONFLICT (id) DO UPDATE SET embedding = EXCLUDED.embedding;
UPDATE items SET embedding = '[1,2,3]' WHERE id = 1;
DELETE FROM items WHERE id = 1;
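From Python, the same upsert can be run with a parameterized query instead of string formatting; a minimal sketch using the VectorDB class above (upsert_item is a hypothetical helper):

# Hypothetical helper: upsert one row using psycopg2 placeholders
def upsert_item(db: VectorDB, item_id: int, embedding: list):
    literal = "[" + ",".join(map(str, embedding)) + "]"  # pgvector text form
    conn = db.get_connection()
    cursor = conn.cursor()
    cursor.execute(
        "INSERT INTO items (id, embedding) VALUES (%s, %s) "
        "ON CONFLICT (id) DO UPDATE SET embedding = EXCLUDED.embedding;",
        (item_id, literal),
    )
    conn.commit()
    cursor.close()

upsert_item(VectorDB(), 1, [1, 2, 3])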
# Example 256-dimensional vector: 0.1, 0.2, ..., 0.9, 0.10, 0.11, ..., 0.256
embedding_vector = [float(f"0.{i}") for i in range(1, 257)]
# Interpolate as a quoted literal; str(list) yields '[0.1, 0.2, ...]', which pgvector accepts
query = f"SELECT * FROM items ORDER BY embedding <-> '{embedding_vector}' LIMIT 5;"
Distance operators:
<-> - L2 distance
<#> - (negative) inner product
<=> - cosine distance
<+> - L1 distance (added in 0.7.0)

SELECT * FROM items WHERE id != 1 ORDER BY embedding <-> (SELECT embedding FROM items WHERE id = 1) LIMIT 5;
SELECT * FROM items WHERE embedding <-> '[3,1,2]' < 5;
SELECT embedding <-> '[3,1,2]' AS distance FROM items;
SELECT (embedding <#> '[3,1,2]') * -1 AS inner_product FROM items;
SELECT 1 - (embedding <=> '[3,1,2]') AS cosine_similarity FROM items;
SELECT AVG(embedding) FROM items;
SELECT category_id, AVG(embedding) FROM items GROUP BY category_id;
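These queries can also be run through the VectorDB class; a short sketch using the L2 operator (assuming the 3-dimensional example vectors from the snippets above):

vector = VectorDB()
# Five nearest neighbors by L2 distance, returning the distance as well
rows = vector.fetch_query(
    "SELECT id, embedding <-> '[3,1,2]' AS distance "
    "FROM items ORDER BY embedding <-> '[3,1,2]' LIMIT 5;"
)
for item_id, distance in rows:
    print(item_id, distance)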
Check index size:

SELECT pg_size_pretty(pg_relation_size('index_name'));

CREATE INDEX ON items USING hnsw (embedding vector_l2_ops);
CREATE INDEX ON items USING hnsw (embedding vector_ip_ops);
CREATE INDEX ON items USING hnsw (embedding vector_cosine_ops);
CREATE INDEX ON items USING hnsw (embedding vector_l1_ops);
CREATE INDEX ON items USING hnsw (embedding bit_hamming_ops);
CREATE INDEX ON items USING hnsw (embedding bit_jaccard_ops);
Types supported by HNSW indexes:
vector - up to 2,000 dimensions
halfvec - up to 4,000 dimensions (added in 0.7.0)
bit - up to 64,000 dimensions (added in 0.7.0)
sparsevec - up to 1,000 non-zero elements (added in 0.7.0)

HNSW build options:
m - the maximum number of connections per layer (default: 16)
ef_construction - the size of the dynamic candidate list used to construct the graph (default: 64)

CREATE INDEX ON items USING hnsw (embedding vector_l2_ops) WITH (m = 16, ef_construction = 64);
A higher ef_construction value slows index builds and inserts but improves recall.

SET hnsw.ef_search = 100;
BEGIN;
SET LOCAL hnsw.ef_search = 100;
SELECT ...
COMMIT;
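The same per-query scoping from psycopg2 relies on the implicit transaction the driver opens (a sketch, reusing the VectorDB connection from above):

conn = VectorDB().get_connection()
cursor = conn.cursor()
# psycopg2 has already opened a transaction, so SET LOCAL applies until commit
cursor.execute("SET LOCAL hnsw.ef_search = 100;")
cursor.execute("SELECT id FROM items ORDER BY embedding <-> '[3,1,2]' LIMIT 5;")
print(cursor.fetchall())
conn.commit()  # ef_search reverts to its previous value here
cursor.close()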
Use SET LOCAL inside a transaction to apply the setting to a single query.

SET maintenance_work_mem = '8GB';
NOTICE: hnsw graph no longer fits into maintenance_work_mem after 100000 tuples
DETAIL: Building will take significantly more time.
HINT: Increase maintenance_work_mem to speed up builds.
SET max_parallel_maintenance_workers = 7; -- plus leader
SELECT phase, round(100.0 * blocks_done / nullif(blocks_total, 0), 1) AS "%" FROM pg_stat_progress_create_index;
An HNSW index build runs in two phases.
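Build progress can be polled from Python as well; a minimal sketch using the VectorDB class and the standard pg_stat_progress_create_index view:

import time

vector = VectorDB()
# Poll index build progress once per second until no build is running
while True:
    rows = vector.fetch_query(
        "SELECT phase, round(100.0 * blocks_done / nullif(blocks_total, 0), 1) "
        "FROM pg_stat_progress_create_index;"
    )
    if not rows:
        break
    for phase, pct in rows:
        print(phase, pct)
    time.sleep(1)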
IVFFlat options:
lists - the number of clusters the vectors are divided into
probes - the number of lists to probe during a search

CREATE INDEX ON items USING ivfflat (embedding vector_l2_ops) WITH (lists = 100);
CREATE INDEX ON items USING ivfflat (embedding vector_ip_ops) WITH (lists = 100);
CREATE INDEX ON items USING ivfflat (embedding vector_cosine_ops) WITH (lists = 100);
CREATE INDEX ON items USING ivfflat (embedding bit_hamming_ops) WITH (lists = 100);
SET ivfflat.probes = 10;
BEGIN;
SET LOCAL ivfflat.probes = 10;
SELECT ...
COMMIT;
Use SET LOCAL inside a transaction to apply the setting to a single query.

SET max_parallel_maintenance_workers = 7; -- plus leader
SELECT phase, round(100.0 * tuples_done / nullif(tuples_total, 0), 1) AS "%" FROM pg_stat_progress_create_index;
An IVFFlat index build runs in four phases.
CREATE INDEX ON items (category_id);
CREATE INDEX ON items USING hnsw (embedding vector_l2_ops) WHERE (category_id = 123);
CREATE TABLE items (embedding vector(3), category_id int) PARTITION BY LIST(category_id);
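With LIST partitioning in place, one partition per category can be created; a minimal sketch via the VectorDB class (the partition name items_category_1 is hypothetical):

vector = VectorDB()
# Partition holding rows with category_id = 1; queries filtered on this
# category then only scan this partition
vector.execute_query("CREATE TABLE items_category_1 PARTITION OF items FOR VALUES IN (1);")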
Store half-precision vectors with the halfvec type:

CREATE TABLE items (id bigserial PRIMARY KEY, embedding halfvec(3));
CREATE INDEX ON items USING hnsw ((embedding::halfvec(3)) halfvec_l2_ops);
SELECT * FROM items ORDER BY embedding::halfvec(3) <-> '[1,2,3]' LIMIT 5;
CREATE TABLE items (id bigserial PRIMARY KEY, embedding bit(3));
INSERT INTO items (embedding) VALUES ('000'), ('111');
SELECT * FROM items ORDER BY embedding <~> '101' LIMIT 5;
SELECT * FROM items ORDER BY bit_count(embedding # '101') LIMIT 5;
The <%> operator (Jaccard distance) is also supported.

CREATE INDEX ON items USING hnsw ((binary_quantize(embedding)::bit(3)) bit_hamming_ops);
SELECT * FROM items ORDER BY binary_quantize(embedding)::bit(3) <~> binary_quantize('[1,-2,3]') LIMIT 5;
SELECT * FROM (
    SELECT * FROM items ORDER BY binary_quantize(embedding)::bit(3) <~> binary_quantize('[1,-2,3]') LIMIT 20
) AS candidates ORDER BY embedding <=> '[1,-2,3]' LIMIT 5;
CREATE TABLE items (id bigserial PRIMARY KEY, embedding sparsevec(5));
INSERT INTO items (embedding) VALUES ('{1:1,3:2,5:3}/5'), ('{1:4,3:5,5:6}/5');
SELECT * FROM items ORDER BY embedding <-> '{1:3,3:1,5:2}/5' LIMIT 5;
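The sparsevec text form is '{index1:value1,index2:value2,...}/dimensions' with 1-based indexes; a small helper for building it from Python (to_sparsevec is a hypothetical name):

# Hypothetical helper: format {index: value} pairs as a sparsevec literal
def to_sparsevec(elements: dict, dim: int) -> str:
    inner = ",".join(f"{i}:{v}" for i, v in sorted(elements.items()))
    return f"{{{inner}}}/{dim}"

print(to_sparsevec({1: 1.5, 3: 2.0, 5: 3.0}, 5))  # -> {1:1.5,3:2.0,5:3.0}/5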
Full-text search can be combined with vector search for hybrid retrieval (assuming content and textsearch columns):

SELECT id, content FROM items, plainto_tsquery('hello search') query
WHERE textsearch @@ query ORDER BY ts_rank_cd(textsearch, query) DESC LIMIT 5;
Indexing a subvector (only the leading dimensions) is also possible:

CREATE INDEX ON items USING hnsw ((subvector(embedding, 1, 3)::vector(3)) vector_cosine_ops);
SELECT * FROM items ORDER BY subvector(embedding, 1, 3)::vector(3) <=> subvector('[1,2,3,4,5]'::vector, 1, 3) LIMIT 5;
SELECT * FROM (
    SELECT * FROM items ORDER BY subvector(embedding, 1, 3)::vector(3) <=> subvector('[1,2,3,4,5]'::vector, 1, 3) LIMIT 20
) AS candidates ORDER BY embedding <=> '[1,2,3,4,5]' LIMIT 5;
SHOW config_file;
SHOW shared_buffers;
shared_buffers should typically be about 25% of the server's memory.

Use COPY for bulk loading:

COPY items (embedding) FROM STDIN WITH (FORMAT BINARY);
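From Python, a text-format COPY load might look like this (a sketch using psycopg2's copy_expert; binary format is faster but requires manual encoding):

import io

conn = VectorDB().get_connection()
cursor = conn.cursor()
# Each line of the buffer is one row in COPY text format
buffer = io.StringIO("[1,2,3]\n[4,5,6]\n")
cursor.copy_expert("COPY items (embedding) FROM STDIN", buffer)
conn.commit()
cursor.close()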
Build indexes concurrently to avoid blocking writes:

CREATE INDEX CONCURRENTLY ...
EXPLAIN ANALYZE SELECT * FROM items ORDER BY embedding <-> '[3,1,2]' LIMIT 5;
Debug with EXPLAIN ANALYZE; increase max_parallel_workers_per_gather to speed up exact (non-indexed) search.

SET max_parallel_workers_per_gather = 4;
SELECT * FROM items ORDER BY embedding <#> '[3,1,2]' LIMIT 5;
CREATE INDEX ON items USING ivfflat (embedding vector_l2_ops) WITH (lists = 1000);
REINDEX INDEX CONCURRENTLY index_name;
VACUUM table_name;
CREATE EXTENSION pg_stat_statements; -- requires pg_stat_statements in shared_preload_libraries
SELECT query, calls, ROUND((total_plan_time + total_exec_time) / calls) AS avg_time_ms,
ROUND((total_plan_time + total_exec_time) / 60000) AS total_time_min
FROM pg_stat_statements ORDER BY total_plan_time + total_exec_time DESC LIMIT 20;
BEGIN;
SET LOCAL enable_indexscan = off; -- use exact search
SELECT ...
COMMIT;
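One use for exact search is estimating index recall: compare the approximate result set against the exact one for the same query. A sketch (estimate_recall is a hypothetical helper built on the VectorDB class):

def estimate_recall(db: VectorDB, query_vec: str, k: int = 5) -> float:
    sql = f"SELECT id FROM items ORDER BY embedding <-> '{query_vec}' LIMIT {k};"
    approx = {row[0] for row in db.fetch_query(sql)}  # index scan (approximate)
    conn = db.get_connection()
    cursor = conn.cursor()
    cursor.execute("SET LOCAL enable_indexscan = off;")  # force exact search
    cursor.execute(sql)
    exact = {row[0] for row in cursor.fetchall()}
    conn.commit()  # SET LOCAL reverts here
    cursor.close()
    return len(approx & exact) / k  # fraction of true neighbors found

print(estimate_recall(VectorDB(), "[3,1,2]"))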
In Postgres, a non-partitioned table is limited to 32 TB by default.
pgvector writes to the WAL (write-ahead log), which enables replication and point-in-time recovery.
To index vectors with more than 2,000 dimensions, use half-precision indexing (up to 4,000 dimensions), use binary quantization (up to 64,000 dimensions), or reduce the dimensionality before indexing.
A given column can only hold vectors of the same dimension.
Vectors can be stored with more precision using double precision[] or numeric[] arrays:
CREATE TABLE items (id bigserial PRIMARY KEY, embedding double precision[]);
-- use {} instead of [] for Postgres arrays
INSERT INTO items (embedding) VALUES ('{1,2,3}'), ('{4,5,6}');
For a query to use an index, it must ORDER BY the distance in ascending order together with a LIMIT (ordering in descending order prevents index use).
-- index
ORDER BY embedding <=> '[3,1,2]' LIMIT 5;
-- no index
ORDER BY 1 - (embedding <=> '[3,1,2]') DESC LIMIT 5;
For HNSW, setting hnsw.ef_search to about twice the query LIMIT is recommended.
If the LIMIT exceeds 500, using IVFFlat is recommended instead.
NULL vectors, like zero vectors, are not indexed.
With IVFFlat, tune the number of lists probed per query with SET ivfflat.probes.
If the index was built while the table held too little data, drop it (and recreate it once more data is loaded).