#!/usr/bin/env python3
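"""Build job for the podcast_reviews example: extracts all podcast metadata
(with categories) from the SQLite ingest database into a single parquet
partition, 'podcasts/all'.

Usage:
    extract_podcasts_job.py config <partition_ref>   # print the job config as JSON
    extract_podcasts_job.py exec                     # write podcasts.parquet + manifest.json
"""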

import sys
import json
import os
import sqlite3
import duckdb
from datetime import datetime
from pathlib import Path


def main():
    if len(sys.argv) < 2:
        print("Usage: extract_podcasts_job.py {config|exec} [args...]", file=sys.stderr)
        sys.exit(1)

    command = sys.argv[1]

    if command == "config":
        handle_config(sys.argv[2:])
    elif command == "exec":
        handle_exec(sys.argv[2:])
    else:
        print(f"Unknown command: {command}", file=sys.stderr)
        sys.exit(1)


def handle_config(args):
    if len(args) < 1:
        print("Config mode requires partition ref", file=sys.stderr)
        sys.exit(1)

    partition_ref = args[0]

    # This job produces a single partition with all podcast metadata
    if partition_ref != "podcasts/all":
        print(f"Invalid partition ref: {partition_ref}. Expected 'podcasts/all'", file=sys.stderr)
        sys.exit(1)

    config = {
        "configs": [{
            "outputs": [{"str": partition_ref}],
            "inputs": [],
            "args": [],
            "env": {
                "PARTITION_REF": partition_ref
            }
        }]
    }
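
    # For "podcasts/all" the printed config is a single task with no inputs:
    #   {"configs": [{"outputs": [{"str": "podcasts/all"}], "inputs": [],
    #    "args": [], "env": {"PARTITION_REF": "podcasts/all"}}]}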
    print(json.dumps(config))


def handle_exec(args):
    partition_ref = os.getenv('PARTITION_REF', 'podcasts/all')

    # Database paths
    db_path = "/tmp/databuild_test/examples/podcast_reviews/data/ingest/database.sqlite"
    if not os.path.exists(db_path):
        # Fallback to relative path for development
        db_path = "data/ingest/database.sqlite"

    # Output path
    output_dir = Path("/tmp/databuild_test/examples/podcast_reviews/podcasts")
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / "podcasts.parquet"

    try:
        # Capture the start time before extraction so the manifest records the
        # actual duration instead of two identical timestamps
        start_time = datetime.now().isoformat()

        # Extract all podcasts with their categories
        extract_podcasts_with_categories(db_path, str(output_file))

        end_time = datetime.now().isoformat()

        print("Successfully extracted podcast metadata")
        print(f"Output written to: {output_file}")

        # Create manifest
        manifest = {
            "outputs": [{"str": partition_ref}],
            "inputs": [],
            "start_time": start_time,
            "end_time": end_time,
            "task": {
                "job": {"label": "//examples/podcast_reviews:extract_podcasts_job"},
                "config": {
                    "outputs": [{"str": partition_ref}],
                    "inputs": [],
                    "args": [],
                    "env": {"PARTITION_REF": partition_ref}
                }
            }
        }

        manifest_file = output_dir / "manifest.json"
        with open(manifest_file, 'w') as f:
            json.dump(manifest, f, indent=2)

    except Exception as e:
        print(f"Error extracting podcasts: {e}", file=sys.stderr)
        sys.exit(1)


def extract_podcasts_with_categories(db_path: str, output_file: str):
    """Extract all podcasts with their categories and save as parquet."""

    # Connect to SQLite
    sqlite_conn = sqlite3.connect(db_path)

    # Connect to DuckDB for processing
    duckdb_conn = duckdb.connect()

    try:
        # Try to install and load the parquet extension, but don't fail if it's already installed
        try:
            duckdb_conn.execute("INSTALL parquet")
        except Exception:
            pass  # Extension might already be installed

        duckdb_conn.execute("LOAD parquet")

        # sqlite_scan() comes from DuckDB's SQLite scanner extension; recent
        # DuckDB versions can autoload it, but install/load explicitly so the
        # job also works where autoloading is unavailable
        try:
            duckdb_conn.execute("INSTALL sqlite")
        except Exception:
            pass  # Extension might already be installed

        duckdb_conn.execute("LOAD sqlite")

        # Query to get podcasts with categories (handling multiple categories per podcast)
        query = """
            -- One row per (podcast, category), ranked alphabetically within each podcast
            WITH podcast_categories AS (
                SELECT
                    p.podcast_id,
                    p.itunes_id,
                    p.slug,
                    p.itunes_url,
                    p.title,
                    c.category,
                    ROW_NUMBER() OVER (PARTITION BY p.podcast_id ORDER BY c.category) as category_rank
                FROM sqlite_scan(?, 'podcasts') p
                LEFT JOIN sqlite_scan(?, 'categories') c ON p.podcast_id = c.podcast_id
            ),
            -- The alphabetically first category becomes the primary category
            primary_categories AS (
                SELECT
                    podcast_id,
                    itunes_id,
                    slug,
                    itunes_url,
                    title,
                    category as primary_category
                FROM podcast_categories
                WHERE category_rank = 1
            ),
            -- All categories per podcast, pipe-delimited
            all_categories AS (
                SELECT
                    podcast_id,
                    STRING_AGG(category, '|' ORDER BY category) as all_categories
                FROM podcast_categories
                WHERE category IS NOT NULL
                GROUP BY podcast_id
            )
            SELECT
                pc.podcast_id,
                pc.itunes_id,
                pc.slug,
                pc.itunes_url,
                pc.title,
                pc.primary_category,
                COALESCE(ac.all_categories, pc.primary_category) as all_categories
            FROM primary_categories pc
            LEFT JOIN all_categories ac ON pc.podcast_id = ac.podcast_id
            ORDER BY pc.title
        """

        # Execute query and save to parquet; both sqlite_scan placeholders bind db_path
        duckdb_conn.execute(f"COPY ({query}) TO '{output_file}' (FORMAT PARQUET)", [db_path, db_path])

        # Get row count for logging
        count_result = duckdb_conn.execute(
            "SELECT COUNT(*) FROM sqlite_scan(?, 'podcasts')",
            [db_path]
        ).fetchone()

        row_count = count_result[0] if count_result else 0
        print(f"Extracted {row_count} podcasts with category information")

    finally:
        sqlite_conn.close()
        duckdb_conn.close()


if __name__ == "__main__":
    main()