Today I experimented with DuckDB, and I am super impressed by it! I created 1 billion rows of data (app_name, machine_id, timestamp), totaling 15.2 GB. It took my machine, an AMD Ryzen 7 4800H, over 12 hours to create the data, even with multiprocessing keeping all 16 logical cores busy.

import random
import traceback
from concurrent.futures import ProcessPoolExecutor
from datetime import datetime
from multiprocessing import cpu_count

import polars as pl
from faker import Faker

faker = Faker()

chunk_size = 1_000_000
num_chunks = 1000


def generate_data(chunk_size):
    # One chunk: 1 million rows of fake app names, machine ids and timestamps.
    app_names = [faker.name() for _ in range(chunk_size)]
    machine_ids = random.choices(range(1000), k=chunk_size)
    timestamps = [faker.date_time_this_decade() for _ in range(chunk_size)]
    data = {
        'app_name': app_names,
        'machine_id': machine_ids,
        'timestamp': timestamps,
    }
    return pl.DataFrame(data)


def create_parquet_file(chunk_index):
    try:
        print(f'Generating file {chunk_index} at {datetime.now()}...')
        df = generate_data(chunk_size)
        print(f'Data {chunk_index} generated. Writing to parquet...')
        df.write_parquet(f'app_{chunk_index}.parquet', compression='snappy')
        print(f'Parquet file {chunk_index} created at {datetime.now()}.')
    except Exception as e:
        print(f'Error in chunk {chunk_index}: {e}')
        traceback.print_exc()


def run_multiprocessing():
    # One worker process per logical core; each worker writes whole parquet files.
    max_workers = cpu_count()
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        list(executor.map(create_parquet_file, range(num_chunks)))


if __name__ == '__main__':
    run_multiprocessing()
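Once all the files are written, a quick sanity check with DuckDB can confirm the total row count. This is a minimal sketch of my own, assuming the app_N.parquet naming from the script above and that all 1,000 chunks were actually created:

import duckdb

# Count rows across every generated file via a glob.
total = duckdb.sql("SELECT COUNT(*) FROM read_parquet('app_*.parquet')").fetchone()[0]
print(f'{total:,} rows')  # expect 1,000,000,000 if every chunk was written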
A simple aggregation query took 0.1887 seconds on 1 million rows and 646.3240 seconds on 100 million rows.
import duckdb
import time

con = duckdb.connect(database=':memory:')

query = """
COPY(
    SELECT app_name, machine_id, DATEPART('day', timestamp) AS day, DATEPART('hour', timestamp) AS hour, COUNT(*) AS count
    FROM read_parquet('app_0.parquet')
    GROUP BY app_name, machine_id, day, hour
)
TO 'result.parquet' (FORMAT 'parquet')
"""

print("Executing query now...")
start = time.time()
con.execute(query)
print("Query executed in", time.time() - start, "seconds")
For all 1 billion rows, it took 795.925 seconds. For a more complex query with one CTE, I had to limit it to 100 million rows, as it was using up all of my disk space (700 GB).
import duckdb
import time

con = duckdb.connect(database=':memory:')

query = """
COPY(
    SELECT app_name, machine_id, DATEPART('day', timestamp) AS day, DATEPART('hour', timestamp) AS hour, COUNT(*) AS count
    FROM read_parquet('app_0.parquet')
    GROUP BY app_name, machine_id, day, hour
)
TO 'result1.parquet' (FORMAT 'parquet')
"""

print("Executing query now...")
start = time.time()
con.execute(query)
print("Query executed in", time.time() - start, "seconds")
Running the above query on 1 million rows took only 0.521 seconds.
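For reference, a query with one CTE over this schema could look something like the sketch below; the hourly CTE and the second-level roll-up are illustrative assumptions, not the exact query timed above.

query_cte = """
COPY(
    WITH hourly AS (
        SELECT app_name, machine_id, DATEPART('day', timestamp) AS day, DATEPART('hour', timestamp) AS hour, COUNT(*) AS cnt
        FROM read_parquet('app_0.parquet')
        GROUP BY app_name, machine_id, day, hour
    )
    -- Roll the hourly counts up to per-app, per-day totals.
    SELECT app_name, day, SUM(cnt) AS total
    FROM hourly
    GROUP BY app_name, day
)
TO 'result_cte.parquet' (FORMAT 'parquet')
"""
con.execute(query_cte)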