#!/usr/bin/env python3
"""Extract podcast metadata from the ingest SQLite database into Parquet.

The script exposes two sub-commands:

* ``config <partition_ref>`` prints the job configuration as JSON on stdout.
* ``exec`` runs the extraction, writing ``podcasts.parquet`` and a
  ``manifest.json`` into the output directory.
"""

import sys
import json
import os
import sqlite3  # noqa: F401 - no longer used here; retained for backward compatibility
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Any

# The only partition this job knows how to produce.
EXPECTED_PARTITION = "podcasts/all"

# Label recorded in the manifest's task description.
JOB_LABEL = "//examples/podcast_reviews:extract_podcasts_job"

# Input database: absolute location first, relative path as development fallback.
PRIMARY_DB_PATH = "/tmp/databuild_test/examples/podcast_reviews/data/ingest/database.sqlite"
FALLBACK_DB_PATH = "data/ingest/database.sqlite"

# Directory that receives podcasts.parquet and manifest.json.
OUTPUT_DIR = "/tmp/databuild_test/examples/podcast_reviews/podcasts"


def main() -> None:
    """Dispatch to the ``config`` or ``exec`` sub-command named in ``sys.argv``.

    Exits with status 1 when no sub-command is given or it is unknown.
    """
    if len(sys.argv) < 2:
        print("Usage: extract_podcasts_job.py {config|exec} [args...]", file=sys.stderr)
        sys.exit(1)

    command = sys.argv[1]
    if command == "config":
        handle_config(sys.argv[2:])
    elif command == "exec":
        handle_exec(sys.argv[2:])
    else:
        print(f"Unknown command: {command}", file=sys.stderr)
        sys.exit(1)


def handle_config(args: List[str]) -> None:
    """Print the job configuration for the requested partition as JSON.

    Args:
        args: Command-line arguments after ``config``; ``args[0]`` is the
            partition ref, which must be ``podcasts/all``.

    Exits with status 1 when the partition ref is missing or unsupported.
    """
    if len(args) < 1:
        print("Config mode requires partition ref", file=sys.stderr)
        sys.exit(1)

    partition_ref = args[0]

    # This job produces a single partition with all podcast metadata.
    if partition_ref != EXPECTED_PARTITION:
        print(
            f"Invalid partition ref: {partition_ref}. Expected 'podcasts/all'",
            file=sys.stderr,
        )
        sys.exit(1)

    config = {
        "configs": [
            {
                "outputs": [{"str": partition_ref}],
                "inputs": [],
                "args": [],
                "env": {"PARTITION_REF": partition_ref},
            }
        ]
    }
    print(json.dumps(config))


def _build_manifest(
    partition_ref: str, start_time: datetime, end_time: datetime
) -> Dict[str, Any]:
    """Return the manifest dict describing one run of this job."""
    return {
        "outputs": [{"str": partition_ref}],
        "inputs": [],
        "start_time": start_time.isoformat(),
        "end_time": end_time.isoformat(),
        "task": {
            "job": {"label": JOB_LABEL},
            "config": {
                "outputs": [{"str": partition_ref}],
                "inputs": [],
                "args": [],
                "env": {"PARTITION_REF": partition_ref},
            },
        },
    }


def handle_exec(args: List[str]) -> None:
    """Run the extraction and write the Parquet file plus its manifest.

    The partition ref is read from the ``PARTITION_REF`` environment variable
    (default ``podcasts/all``). Any exception raised during extraction or
    manifest writing is reported on stderr and turns into exit status 1.

    Args:
        args: Command-line arguments after ``exec``; currently unused.
    """
    partition_ref = os.getenv("PARTITION_REF", EXPECTED_PARTITION)

    # Prefer the absolute location; fall back to the relative path for development.
    db_path = PRIMARY_DB_PATH
    if not os.path.exists(db_path):
        db_path = FALLBACK_DB_PATH

    output_dir = Path(OUTPUT_DIR)
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / "podcasts.parquet"

    try:
        # Take the start timestamp *before* the work so the manifest records
        # the real duration instead of two identical post-run timestamps.
        start_time = datetime.now()
        extract_podcasts_with_categories(db_path, str(output_file))
        end_time = datetime.now()

        print("Successfully extracted podcast metadata")
        print(f"Output written to: {output_file}")

        manifest = _build_manifest(partition_ref, start_time, end_time)
        manifest_file = output_dir / "manifest.json"
        with open(manifest_file, "w") as f:
            json.dump(manifest, f, indent=2)
    except Exception as e:
        print(f"Error extracting podcasts: {e}", file=sys.stderr)
        sys.exit(1)


def extract_podcasts_with_categories(db_path: str, output_file: str) -> None:
    """Extract all podcasts with their categories and save them as Parquet.

    The output has one row per ``podcast_id`` with the columns ``podcast_id``,
    ``itunes_id``, ``slug``, ``itunes_url``, ``title``, ``primary_category``
    (the alphabetically first category, NULL when the podcast has none) and
    ``all_categories`` (all categories sorted and joined with ``|``), ordered
    by title.

    Args:
        db_path: Path to the SQLite database holding the ``podcasts`` and
            ``categories`` tables.
        output_file: Destination path of the Parquet file.

    Raises:
        FileNotFoundError: If ``db_path`` does not exist.
    """
    # Fail fast with a clear error. The previous implementation opened an
    # unused sqlite3 connection here, which could leave an empty database
    # file behind when the path was missing.
    if not os.path.exists(db_path):
        raise FileNotFoundError(f"SQLite database not found: {db_path}")

    # Imported lazily so that `config` mode does not need the DuckDB engine.
    import duckdb

    duckdb_conn = duckdb.connect()
    try:
        # Best effort: installing may fail (e.g. already installed); the LOAD
        # below is what actually has to succeed.
        try:
            duckdb_conn.execute("INSTALL parquet")
        except Exception:
            pass
        duckdb_conn.execute("LOAD parquet")

        # NOTE(review): sqlite_scan comes from DuckDB's SQLite extension, which
        # is not installed/loaded explicitly here - presumably it is autoloaded;
        # confirm for the DuckDB version in use.
        # Query to get podcasts with categories (multiple categories per podcast).
        query = """
            WITH podcast_categories AS (
                SELECT
                    p.podcast_id,
                    p.itunes_id,
                    p.slug,
                    p.itunes_url,
                    p.title,
                    c.category,
                    ROW_NUMBER() OVER (PARTITION BY p.podcast_id ORDER BY c.category) as category_rank
                FROM sqlite_scan(?, 'podcasts') p
                LEFT JOIN sqlite_scan(?, 'categories') c
                    ON p.podcast_id = c.podcast_id
            ),
            primary_categories AS (
                SELECT
                    podcast_id,
                    itunes_id,
                    slug,
                    itunes_url,
                    title,
                    category as primary_category
                FROM podcast_categories
                WHERE category_rank = 1
            ),
            all_categories AS (
                SELECT
                    podcast_id,
                    STRING_AGG(category, '|' ORDER BY category) as all_categories
                FROM podcast_categories
                WHERE category IS NOT NULL
                GROUP BY podcast_id
            )
            SELECT
                pc.podcast_id,
                pc.itunes_id,
                pc.slug,
                pc.itunes_url,
                pc.title,
                pc.primary_category,
                COALESCE(ac.all_categories, pc.primary_category) as all_categories
            FROM primary_categories pc
            LEFT JOIN all_categories ac
                ON pc.podcast_id = ac.podcast_id
            ORDER BY pc.title
        """

        # The target path is spliced into a SQL string literal, so double any
        # single quotes to keep the statement well-formed.
        escaped_output = output_file.replace("'", "''")
        duckdb_conn.execute(
            f"COPY ({query}) TO '{escaped_output}' (FORMAT PARQUET)",
            [db_path, db_path],
        )

        # Row count for logging; counts the source `podcasts` table.
        count_result = duckdb_conn.execute(
            "SELECT COUNT(*) FROM sqlite_scan(?, 'podcasts')", [db_path]
        ).fetchone()
        row_count = count_result[0] if count_result else 0
        print(f"Extracted {row_count} podcasts with category information")
    finally:
        duckdb_conn.close()


if __name__ == "__main__":
    main()