DuckDB는 C++로 작성된 오픈소스 컬럼 기반 DB 관리 시스템으로, 인메모리(in-memory) OLAP에 최적화되어 있다는 특징이 있다.


# Demo: query in-memory Python objects with SQL.
# In every query below, the table name in the SQL text is the name of the
# Python variable defined just above it; DuckDB resolves that name to the
# in-scope object (documented as "replacement scans" in the DuckDB Python API).
# The query results are not stored or printed here.
import duckdb
# Pandas DataFrame: one column "a" holding the single value 42
import pandas as pd
pandas_df = pd.DataFrame({"a": [42]})
duckdb.sql("SELECT * FROM pandas_df")
# Polars DataFrame: same data, built with polars
import polars as pl
polars_df = pl.DataFrame({"a": [42]})
duckdb.sql("SELECT * FROM polars_df")
# PyArrow table: same data, built from a plain dict
import pyarrow as pa
arrow_table = pa.Table.from_pydict({"a": [42]})
duckdb.sql("SELECT * FROM arrow_table")
# Demo: run the same query five times and convert the result into a different
# Python-side format each time. The converted values are discarded here; assign
# or print them to inspect the output.
# NOTE(review): duckdb is already imported earlier in this file, so this
# repeated import is redundant (harmless, but could be dropped).
import duckdb
duckdb.sql("SELECT 42").fetchall() # Python objects
duckdb.sql("SELECT 42").df() # Pandas DataFrame
duckdb.sql("SELECT 42").pl() # Polars DataFrame
duckdb.sql("SELECT 42").arrow() # Arrow Table
duckdb.sql("SELECT 42").fetchnumpy() # NumPy Arrays
def import_data_directory_to_duckdb(data_path, db_path):
    """Load every CSV and Parquet file in a directory into a DuckDB database.

    Each file whose name ends in ``.csv`` or ``.parquet`` becomes one table,
    created with ``CREATE OR REPLACE TABLE``, so a table of the same name is
    overwritten. The table name is the file name without its extension, with
    any remaining dots replaced by underscores. One status line is printed per
    file; a file that fails to load is reported and the loop moves on.

    Args:
        data_path: Directory to scan for data files (not recursive).
        db_path: Database path handed to ``duckdb.connect`` (the database is
            created if it does not exist).

    Returns:
        None.
    """
    # Imported locally: the function needs ``os`` and no module-level
    # ``import os`` is visible in this file.
    import os

    # Connect to the DuckDB database (created if it does not exist).
    con = duckdb.connect(db_path)
    try:
        # Sorted so the import order -- and therefore which file wins when two
        # files map to the same table name -- does not depend on the OS.
        for file in sorted(os.listdir(data_path)):
            # Skip anything that is not a csv or parquet file.
            if not file.endswith(('.csv', '.parquet')):
                continue

            # Strip the extension, then replace remaining dots so the name
            # is not parsed as a schema-qualified identifier.
            file_name, _ = os.path.splitext(file)
            table_name = file_name.replace(".", "_")
            file_path = os.path.join(data_path, file)

            # Quote both values so file names containing spaces, hyphens,
            # leading digits or quote characters still yield valid SQL.
            quoted_table = '"' + table_name.replace('"', '""') + '"'
            quoted_path = "'" + file_path.replace("'", "''") + "'"

            try:
                # Create (or replace) the table and load the file into it.
                con.execute(
                    f"CREATE OR REPLACE TABLE {quoted_table} AS "
                    f"SELECT * FROM {quoted_path};"
                )
            except duckdb.CatalogException:
                # NOTE(review): with CREATE OR REPLACE an existing table is
                # replaced rather than rejected, so this message may not
                # describe the real catalog error -- consider printing it.
                print(f"The table: '{table_name}' already exists.")
            except Exception as e:
                # Best effort: report the failure and continue with the
                # next file instead of aborting the whole import.
                print(f"An error occurred: {e}")
            else:
                print(f"Inserted {file_name} into table {table_name}")
    finally:
        # Always release the connection, even if listing the directory fails.
        con.close()