ffmerge

Jun 17, 2025

A tool to merge multiple files into one and convert between CSV, Parquet, JSONL, and JSON.

#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.12"
# dependencies = [
#     "duckdb",
#     "typer",
# ]
# ///
# SPDX-FileCopyrightText: 2025 kurt.town
# SPDX-License-Identifier: MIT

from enum import StrEnum
import duckdb
import logging
import os
import glob
import typer

class LogLevel(StrEnum):
    """Logging verbosity choices accepted by ``main``'s ``log_level`` parameter.

    The member value is handed unchanged to ``logging.basicConfig(level=...)``.
    """
    DEBUG = "DEBUG"
    INFO = "INFO"
    WARNING = "WARNING"
    ERROR = "ERROR"
    CRITICAL = "CRITICAL"

class Formats(StrEnum):
    """File formats the tool can read and write.

    Each value is the file extension (without the dot) that
    ``get_format_from_path`` maps to the member.
    """
    CSV = "csv"
    PARQUET = "parquet"
    JSONL = "jsonl"
    JSON = "json"

# Name of the DuckDB table function used to read each input format.
# ``main`` interpolates the name into its ``SELECT * FROM <fxn>(...)`` query.
FORMAT_TO_DDB_READ_FXN = {
    Formats.CSV: "read_csv",
    Formats.PARQUET: "read_parquet",
    Formats.JSONL: "read_ndjson",
    Formats.JSON: "read_json",
}

# Option clause appended to the ``COPY data TO '<path>'`` statement in ``main``
# for each output format.
# NOTE(review): JSONL and JSON differ only by ``ARRAY true``; presumably that
# makes DuckDB emit a single JSON array instead of newline-delimited records —
# confirm against the DuckDB COPY documentation.
FORMAT_TO_DDB_COPY_FORMAT_EXPR = {
    Formats.CSV: "(FORMAT CSV)",
    Formats.PARQUET: "(FORMAT PARQUET)",
    Formats.JSONL: "(FORMAT JSON)",
    Formats.JSON: "(FORMAT JSON, ARRAY true)",
}

def get_format_from_path(path: str) -> Formats | None:
    """Infer the file format from the extension of *path*.

    The extension is the text after the last ``.`` in *path*. It is compared
    case-insensitively with the values of ``Formats``, so ``out.CSV`` and
    ``out.csv`` resolve to the same member. Glob patterns such as
    ``data/*.parquet`` work too, since only the trailing extension is
    inspected.

    Args:
        path: A file path or glob pattern.

    Returns:
        The matching ``Formats`` member, or ``None`` when *path* contains no
        ``.`` or the extension is not a supported format.
    """
    # rpartition reports whether a dot was found at all, so a bare name such
    # as "csv" (no extension) is not mistaken for the extension "csv".
    _, dot, ext = path.rpartition(".")
    if not dot:
        return None
    try:
        # Formats values are lowercase, so normalise before the lookup.
        return Formats(ext.lower())
    except ValueError:
        return None

def _sql_string_literal(value: str) -> str:
    """Return *value* as a single-quoted SQL string literal, doubling embedded quotes."""
    return "'" + value.replace("'", "''") + "'"


def main(glob_pattern: str, output_path: str, remove: bool = False, log_level: LogLevel = LogLevel.INFO):
    """
    Merge multiple files into one file.\n
    Formats supported: CSV, Parquet, JSONL, JSON.\n
    Formats are determined by the file extension (.csv, .parquet, .jsonl, .json).\n

    Args:\n
    - GLOB_PATTERN: The glob pattern to match files to merge.\n
    - OUTPUT_PATH: The path to the output file.\n
    - remove: Whether to remove the files matched by the glob pattern after merging.\n
    - log_level: The logging level to use.\n
    """
    # NOTE: this function is handed to typer.run(), which presumably shows the
    # docstring above as the command's help text, so its layout (including the
    # explicit \n escapes) is deliberately left untouched.
    #
    # Flow: validate both formats, create the output directory, load every
    # matched file into an in-memory DuckDB table, COPY that table to the
    # output path, then optionally delete the inputs.
    # Exits with status 1 (SystemExit) when either format cannot be determined;
    # errors raised by DuckDB or the filesystem propagate to the caller.
    logging.basicConfig(level=log_level.value)

    input_format = get_format_from_path(glob_pattern)
    if input_format is None:
        logging.error(f"could not determine format from {glob_pattern}")
        # SystemExit is what the builtin exit() raises, without relying on the
        # `site` module having injected exit() into builtins.
        raise SystemExit(1)

    output_format = get_format_from_path(output_path)
    if output_format is None:
        logging.error(f"could not determine format from {output_path}")
        raise SystemExit(1)

    # Create the output directory only after validation, so a rejected
    # invocation does not leave an empty directory behind.
    basedir = os.path.dirname(output_path)
    if basedir:
        os.makedirs(basedir, exist_ok=True)

    # Snapshot the inputs BEFORE merging. Re-globbing afterwards could pick up
    # the freshly written output (when it matches the pattern) or files that
    # appeared in the meantime and were never merged.
    # NOTE(review): Python's glob may match a different set of files than
    # DuckDB's own globbing for patterns such as `**` — confirm before relying
    # on --remove with recursive patterns.
    files_to_remove: list[str] = []
    if remove:
        output_real = os.path.realpath(output_path)
        files_to_remove = [
            f for f in glob.glob(glob_pattern)
            if os.path.realpath(f) != output_real  # never delete the merged result
        ]

    ddb_read_fxn = FORMAT_TO_DDB_READ_FXN[input_format]
    ddb_format_expr = FORMAT_TO_DDB_COPY_FORMAT_EXPR[output_format]

    # Paths are embedded as SQL string literals; escape single quotes so a
    # path containing ' cannot break (or alter) the statement.
    glob_literal = _sql_string_literal(glob_pattern)
    output_literal = _sql_string_literal(output_path)

    con = duckdb.connect()
    try:
        con.execute(
            f"CREATE TABLE data AS SELECT * FROM {ddb_read_fxn}({glob_literal}, union_by_name = true)"
        )
        con.execute(f"COPY data TO {output_literal} {ddb_format_expr}")
    finally:
        # Release the connection even when reading or writing fails.
        con.close()

    logging.info(f"merged files from {glob_pattern} into {output_path}")

    if remove:
        for f in files_to_remove:
            os.remove(f)
            logging.debug(f"removed {f}")
        logging.info(f"removed files matched by the glob pattern {glob_pattern}")

# Script entry point: hand ``main`` to typer, which runs it as the command-line
# program when this file is executed directly (not when imported).
if __name__ == "__main__":
    typer.run(main)
RSS
https://kurt.town/posts/feed.xml