databuild/examples/podcast_reviews/extract_podcasts_job.py
2025-06-30 22:15:48 -07:00

179 lines
No EOL
5.6 KiB
Python

#!/usr/bin/env python3
import sys
import json
import os
import sqlite3
import duckdb
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Any
def main():
    """Dispatch the command line to the ``config`` or ``exec`` handler.

    The first argument in ``sys.argv`` selects the mode; every argument after
    it is forwarded unchanged to the selected handler.

    Exits with status 1 (after printing a message to stderr) when no mode is
    given or the mode is not recognised.
    """
    if len(sys.argv) < 2:
        print("Usage: extract_podcasts_job.py {config|exec} [args...]", file=sys.stderr)
        sys.exit(1)

    command = sys.argv[1]
    handler_args = sys.argv[2:]

    if command == "config":
        handle_config(handler_args)
    elif command == "exec":
        handle_exec(handler_args)
    else:
        print(f"Unknown command: {command}", file=sys.stderr)
        sys.exit(1)
def handle_config(args):
    """Print the job configuration for the requested partition as JSON.

    Args:
        args: Arguments following the ``config`` keyword. The first element
            is the partition reference; any further elements are ignored.

    The only partition reference accepted is ``podcasts/all``. On success a
    single JSON document is written to stdout containing one config entry
    with the partition as its sole output, no inputs, no args, and the
    partition reference exported through the ``PARTITION_REF`` environment
    variable.

    Exits with status 1 (after printing a message to stderr) when the
    partition reference is missing or is not ``podcasts/all``.
    """
    if not args:
        print("Config mode requires partition ref", file=sys.stderr)
        sys.exit(1)

    partition_ref = args[0]

    # This job produces a single partition with all podcast metadata
    if partition_ref != "podcasts/all":
        print(f"Invalid partition ref: {partition_ref}. Expected 'podcasts/all'", file=sys.stderr)
        sys.exit(1)

    config = {
        "configs": [{
            "outputs": [{"str": partition_ref}],
            "inputs": [],
            "args": [],
            "env": {
                "PARTITION_REF": partition_ref,
            },
        }]
    }
    print(json.dumps(config))
def handle_exec(args):
    """Run the extraction and write the parquet output plus a JSON manifest.

    Args:
        args: Arguments following the ``exec`` keyword. Currently unused; the
            partition reference is read from the ``PARTITION_REF``
            environment variable (default ``podcasts/all``).

    The SQLite source is looked up at a fixed path under
    ``/tmp/databuild_test``; when that file does not exist the relative path
    ``data/ingest/database.sqlite`` is used instead. Output goes to
    ``podcasts.parquet`` and ``manifest.json`` in a fixed output directory,
    which is created if necessary.

    Exits with status 1 (after printing the error to stderr) when the
    extraction or the manifest write raises.
    """
    partition_ref = os.getenv('PARTITION_REF', 'podcasts/all')

    # Database paths
    db_path = "/tmp/databuild_test/examples/podcast_reviews/data/ingest/database.sqlite"
    if not os.path.exists(db_path):
        # Fallback to relative path for development
        db_path = "data/ingest/database.sqlite"

    # Output path
    output_dir = Path("/tmp/databuild_test/examples/podcast_reviews/podcasts")
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / "podcasts.parquet"

    try:
        # Sample the start time *before* the work so the manifest records the
        # real duration of the extraction rather than two identical instants.
        start_time = datetime.now().isoformat()

        # Extract all podcasts with their categories
        extract_podcasts_with_categories(db_path, str(output_file))

        end_time = datetime.now().isoformat()

        print("Successfully extracted podcast metadata")
        print(f"Output written to: {output_file}")

        # Create manifest
        manifest = {
            "outputs": [{"str": partition_ref}],
            "inputs": [],
            "start_time": start_time,
            "end_time": end_time,
            "task": {
                "job": {"label": "//examples/podcast_reviews:extract_podcasts_job"},
                "config": {
                    "outputs": [{"str": partition_ref}],
                    "inputs": [],
                    "args": [],
                    "env": {"PARTITION_REF": partition_ref},
                },
            },
        }

        manifest_file = output_dir / "manifest.json"
        with open(manifest_file, 'w', encoding="utf-8") as f:
            json.dump(manifest, f, indent=2)

    except Exception as e:
        # Top-level boundary for the exec mode: report and signal failure
        # through the exit status.
        print(f"Error extracting podcasts: {e}", file=sys.stderr)
        sys.exit(1)
def extract_podcasts_with_categories(db_path: str, output_file: str):
    """Extract all podcasts with their categories and save as parquet.

    Reads the ``podcasts`` and ``categories`` tables from the SQLite database
    through DuckDB's ``sqlite_scan`` and writes one row per podcast with:

    * ``podcast_id``, ``itunes_id``, ``slug``, ``itunes_url``, ``title``
    * ``primary_category`` - the alphabetically first category of the
      podcast (NULL when it has none)
    * ``all_categories`` - every category joined with ``|`` in alphabetical
      order (NULL when it has none)

    Rows are ordered by title.

    Args:
        db_path: Path to the SQLite database file.
        output_file: Destination path of the parquet file.

    Raises:
        FileNotFoundError: If ``db_path`` does not point to an existing file.
        Exception: Any error raised by DuckDB while loading the extension,
            running the query or writing the file propagates to the caller.
    """
    # Fail early with a clear message. No sqlite3 connection is opened here:
    # all reads go through DuckDB, and sqlite3.connect() on a missing path
    # can create an empty database file as a side effect.
    if not os.path.isfile(db_path):
        raise FileNotFoundError(f"SQLite database not found: {db_path}")

    # Connect to DuckDB for processing (in-memory database)
    duckdb_conn = duckdb.connect()

    try:
        # Try to install and load parquet extension, but don't fail if it's
        # already installed; a genuinely unusable extension surfaces in LOAD.
        try:
            duckdb_conn.execute("INSTALL parquet")
        except Exception:
            pass  # Extension might already be installed
        duckdb_conn.execute("LOAD parquet")

        # NOTE(review): sqlite_scan comes from DuckDB's sqlite extension,
        # which is not loaded explicitly here - presumably this relies on
        # extension autoloading; confirm for the deployed DuckDB version.

        # Query to get podcasts with categories (handling multiple categories
        # per podcast). The LEFT JOIN keeps podcasts without any category.
        query = """
            WITH podcast_categories AS (
                SELECT
                    p.podcast_id,
                    p.itunes_id,
                    p.slug,
                    p.itunes_url,
                    p.title,
                    c.category,
                    ROW_NUMBER() OVER (PARTITION BY p.podcast_id ORDER BY c.category) as category_rank
                FROM sqlite_scan(?, 'podcasts') p
                LEFT JOIN sqlite_scan(?, 'categories') c ON p.podcast_id = c.podcast_id
            ),
            primary_categories AS (
                SELECT
                    podcast_id,
                    itunes_id,
                    slug,
                    itunes_url,
                    title,
                    category as primary_category
                FROM podcast_categories
                WHERE category_rank = 1
            ),
            all_categories AS (
                SELECT
                    podcast_id,
                    STRING_AGG(category, '|' ORDER BY category) as all_categories
                FROM podcast_categories
                WHERE category IS NOT NULL
                GROUP BY podcast_id
            )
            SELECT
                pc.podcast_id,
                pc.itunes_id,
                pc.slug,
                pc.itunes_url,
                pc.title,
                pc.primary_category,
                COALESCE(ac.all_categories, pc.primary_category) as all_categories
            FROM primary_categories pc
            LEFT JOIN all_categories ac ON pc.podcast_id = ac.podcast_id
            ORDER BY pc.title
        """

        # The destination is embedded as a SQL string literal, so single
        # quotes in the path must be doubled to keep the statement valid.
        escaped_output = output_file.replace("'", "''")

        # Execute query and save to parquet
        duckdb_conn.execute(
            f"COPY ({query}) TO '{escaped_output}' (FORMAT PARQUET)",
            [db_path, db_path],
        )

        # Get row count for logging. This counts the source podcasts table,
        # not the rows of the written parquet file.
        count_result = duckdb_conn.execute(
            "SELECT COUNT(*) FROM sqlite_scan(?, 'podcasts')",
            [db_path]
        ).fetchone()

        row_count = count_result[0] if count_result else 0
        print(f"Extracted {row_count} podcasts with category information")

    finally:
        duckdb_conn.close()
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()