# databuild/examples/podcast_reviews/extract_podcasts_job.py

#!/usr/bin/env python3

import sys
import json
import os
import sqlite3
import duckdb
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Any

def main():
    """Entry point: route the first CLI argument to its sub-command handler.

    ``config`` is forwarded to ``handle_config`` and ``exec`` to
    ``handle_exec``; both receive every argument that follows the command.
    A missing or unrecognised command prints a message to stderr and exits
    with status 1.
    """
    argv = sys.argv

    # No sub-command at all: show usage and bail out.
    if len(argv) < 2:
        print("Usage: extract_podcasts_job.py {config|exec} [args...]", file=sys.stderr)
        sys.exit(1)

    command = argv[1]

    # Reject anything that is not one of the two supported sub-commands.
    if command not in ("config", "exec"):
        print(f"Unknown command: {command}", file=sys.stderr)
        sys.exit(1)

    handler = handle_config if command == "config" else handle_exec
    handler(argv[2:])

def handle_config(args):
    """Print the job configuration for the requested partition as JSON.

    Args:
        args: Arguments following the ``config`` command; the first one is
            the partition ref. Only ``podcasts/all`` is accepted because this
            job produces a single partition holding all podcast metadata.

    On success a one-line JSON document with a single entry under
    ``configs`` is written to stdout. A missing or unsupported partition ref
    prints a message to stderr and exits with status 1.
    """
    if not args:
        print("Config mode requires partition ref", file=sys.stderr)
        sys.exit(1)

    partition_ref = args[0]

    # The job only knows how to build the one catch-all partition.
    if partition_ref != "podcasts/all":
        print(f"Invalid partition ref: {partition_ref}. Expected 'podcasts/all'", file=sys.stderr)
        sys.exit(1)

    # Key order is kept stable so the emitted JSON text does not change.
    job_config = {
        "outputs": [{"str": partition_ref}],
        "inputs": [],
        "args": [],
        "env": {"PARTITION_REF": partition_ref},
    }

    print(json.dumps({"configs": [job_config]}))

def handle_exec(args):
    """Run the extraction and write the parquet output plus a manifest.

    The partition ref is read from the ``PARTITION_REF`` environment
    variable (default ``podcasts/all``). The SQLite source is looked up at
    the fixed ``/tmp/databuild_test`` location first and falls back to the
    relative ``data/ingest/database.sqlite`` path when that file is absent.
    ``podcasts.parquet`` and ``manifest.json`` are written into the fixed
    output directory.

    Args:
        args: Arguments following the ``exec`` command. Currently unused;
            kept so both sub-command handlers share the same signature.

    Any failure is reported on stderr and the process exits with status 1.
    """
    partition_ref = os.getenv('PARTITION_REF', 'podcasts/all')

    # Database paths
    db_path = "/tmp/databuild_test/examples/podcast_reviews/data/ingest/database.sqlite"
    if not os.path.exists(db_path):
        # Fallback to relative path for development
        db_path = "data/ingest/database.sqlite"

    # Output path
    output_dir = Path("/tmp/databuild_test/examples/podcast_reviews/podcasts")
    output_file = output_dir / "podcasts.parquet"

    try:
        # Creating the directory inside the try block routes a failure here
        # through the same stderr / exit-code reporting as the extraction.
        output_dir.mkdir(parents=True, exist_ok=True)

        # Record the start BEFORE doing the work; sampling it afterwards
        # would make start_time and end_time practically identical.
        start_time = datetime.now().isoformat()

        # Extract all podcasts with their categories
        extract_podcasts_with_categories(db_path, str(output_file))

        end_time = datetime.now().isoformat()

        print("Successfully extracted podcast metadata")
        print(f"Output written to: {output_file}")

        # Create manifest
        manifest = {
            "outputs": [{"str": partition_ref}],
            "inputs": [],
            "start_time": start_time,
            "end_time": end_time,
            "task": {
                "job": {"label": "//examples/podcast_reviews:extract_podcasts_job"},
                "config": {
                    "outputs": [{"str": partition_ref}],
                    "inputs": [],
                    "args": [],
                    "env": {"PARTITION_REF": partition_ref}
                }
            }
        }

        manifest_file = output_dir / "manifest.json"
        with open(manifest_file, 'w', encoding='utf-8') as f:
            json.dump(manifest, f, indent=2)

    except Exception as e:
        # Top-level boundary of the job: report and signal failure to the caller.
        print(f"Error extracting podcasts: {e}", file=sys.stderr)
        sys.exit(1)

def extract_podcasts_with_categories(db_path: str, output_file: str):
    """Extract all podcasts with their categories and save as parquet.

    The ``podcasts`` and ``categories`` tables of the SQLite database are
    read through DuckDB's ``sqlite_scan`` and joined on ``podcast_id``. The
    result has one row per ``podcast_id`` with these columns:
    ``podcast_id``, ``itunes_id``, ``slug``, ``itunes_url``, ``title``,
    ``primary_category`` (the alphabetically first category, NULL when the
    podcast has none) and ``all_categories`` (all categories sorted and
    joined with ``|``). Rows are ordered by ``title``.

    Args:
        db_path: Path to the source SQLite database file.
        output_file: Path of the parquet file to write.

    Raises:
        FileNotFoundError: If ``db_path`` is not an existing file.
        Exception: Any error raised by DuckDB while loading the parquet
            extension, running the query or writing the file propagates.
    """
    # Fail early with a clear error. No separate sqlite3 connection is opened:
    # it was never used, and sqlite3.connect() would silently create an empty
    # database file at a missing path.
    if not os.path.isfile(db_path):
        raise FileNotFoundError(f"SQLite database not found: {db_path}")

    # Connect to DuckDB for processing (in-memory database)
    duckdb_conn = duckdb.connect()

    try:
        # Try to install and load parquet extension, but don't fail if it's already installed
        try:
            duckdb_conn.execute("INSTALL parquet")
        except Exception:
            # Deliberate best effort: a real problem surfaces at LOAD below.
            pass

        duckdb_conn.execute("LOAD parquet")

        # Query to get podcasts with categories (handling multiple categories per podcast)
        query = """
        WITH podcast_categories AS (
            SELECT
                p.podcast_id,
                p.itunes_id,
                p.slug,
                p.itunes_url,
                p.title,
                c.category,
                ROW_NUMBER() OVER (PARTITION BY p.podcast_id ORDER BY c.category) as category_rank
            FROM sqlite_scan(?, 'podcasts') p
            LEFT JOIN sqlite_scan(?, 'categories') c ON p.podcast_id = c.podcast_id
        ),
        primary_categories AS (
            SELECT
                podcast_id,
                itunes_id,
                slug,
                itunes_url,
                title,
                category as primary_category
            FROM podcast_categories
            WHERE category_rank = 1
        ),
        all_categories AS (
            SELECT
                podcast_id,
                STRING_AGG(category, '|' ORDER BY category) as all_categories
            FROM podcast_categories
            WHERE category IS NOT NULL
            GROUP BY podcast_id
        )
        SELECT
            pc.podcast_id,
            pc.itunes_id,
            pc.slug,
            pc.itunes_url,
            pc.title,
            pc.primary_category,
            COALESCE(ac.all_categories, pc.primary_category) as all_categories
        FROM primary_categories pc
        LEFT JOIN all_categories ac ON pc.podcast_id = ac.podcast_id
        ORDER BY pc.title
        """

        # The target path is embedded as a SQL string literal, so any single
        # quote in it must be doubled to keep the statement well-formed.
        escaped_output = output_file.replace("'", "''")

        # Execute query and save to parquet
        duckdb_conn.execute(
            f"COPY ({query}) TO '{escaped_output}' (FORMAT PARQUET)",
            [db_path, db_path],
        )

        # Get row count for logging (counts the source table, not the file)
        count_result = duckdb_conn.execute(
            "SELECT COUNT(*) FROM sqlite_scan(?, 'podcasts')",
            [db_path]
        ).fetchone()

        row_count = count_result[0] if count_result else 0
        print(f"Extracted {row_count} podcasts with category information")

    finally:
        duckdb_conn.close()

# Run the CLI dispatcher only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()