fix podcast job
Some checks failed: setup (push) has been cancelled

Stuart Axelbrooke 2025-06-30 23:27:24 -07:00
parent 55c404ca2e
commit 2ecf080f16
4 changed files with 270 additions and 113 deletions


@@ -25,4 +25,65 @@ DataBuild is a bazel-based data build system. Key files:
## Key Components
- Graph analysis/execution in Rust
- Bazel rules for job orchestration
- Java/Python examples for different use cases
## DataBuild Job Architecture
### Job Target Structure
Each DataBuild job creates three Bazel targets:
- `job_name.cfg` - Configuration target (calls binary with "config" subcommand)
- `job_name.exec` - Execution target (calls binary with "exec" subcommand)
- `job_name` - Main job target (pipes config output to exec input)
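A rough sketch of how this might look in a BUILD file is below; the `databuild_job` rule name and `binary` attribute are hypothetical placeholders, not DataBuild's confirmed API:
```python
# Hypothetical BUILD sketch: rule and attribute names are placeholders.
databuild_job(
    name = "extract_podcasts",         # creates //:extract_podcasts
    binary = ":extract_podcasts_bin",  # unified binary with config/exec subcommands
)
# Also generated: //:extract_podcasts.cfg and //:extract_podcasts.exec
```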
### Unified Job Binary Pattern
Jobs use a single binary with subcommands:
```python
import sys

def main():
    command = sys.argv[1]  # "config" or "exec"
    if command == "config":
        handle_config(sys.argv[2:])  # Output job configuration JSON
    elif command == "exec":
        handle_exec(sys.argv[2:])  # Perform actual work
```
### Job Configuration Requirements
**CRITICAL**: Job configs must include non-empty `args` for execution to work:
```python
config = {
    "configs": [{
        "outputs": [{"str": partition_ref}],
        "inputs": [...],
        "args": ["some_arg"],  # REQUIRED: Cannot be empty []
        "env": {"PARTITION_REF": partition_ref}
    }]
}
```
Jobs with `"args": []` only have their config function called during execution; their exec function is never invoked.
### DataBuild Execution Flow
1. **Planning Phase**: DataBuild calls `.cfg` targets to get job configurations
2. **Execution Phase**: DataBuild calls main job targets which pipe config to exec
3. **Job Resolution**: Job lookup returns base job names (e.g., `//:job_name`), not `.cfg` variants
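The config-to-exec piping can be sketched roughly as below. This is illustrative only: the real wiring is generated by DataBuild's Bazel rules, and the exact argument and environment passing may differ.
```python
# Minimal sketch of the "pipe config output to exec input" step.
# The run_job helper and its argument passing are assumptions, not DataBuild code.
import json
import os
import subprocess

def run_job(binary: str, config_args: list[str]) -> int:
    # Planning: ask the job binary for its configuration JSON.
    config_proc = subprocess.run(
        [binary, "config", *config_args],
        capture_output=True, text=True, check=True,
    )
    job_config = json.loads(config_proc.stdout)["configs"][0]

    # Execution: invoke exec with the args/env declared in the config.
    env = {**os.environ, **job_config.get("env", {})}
    exec_proc = subprocess.run(
        [binary, "exec", *job_config["args"]],  # empty args means exec never runs
        env=env, text=True,
    )
    return exec_proc.returncode
```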
### Graph Configuration
```python
databuild_graph(
    name = "my_graph",
    jobs = [":job1", ":job2"],  # Reference base job targets
    lookup = ":job_lookup",  # Binary that routes partition refs to jobs
)
```
### Job Lookup Pattern
```python
import re

# Example pattern; the real routing logic depends on your partition scheme
pattern = re.compile(r"^my_dataset/date=\d{4}-\d{2}-\d{2}$")

def lookup_job_for_partition(partition_ref: str) -> str:
    if pattern.match(partition_ref):
        return "//:job_name"  # Return base job target
    raise ValueError(f"No job found for: {partition_ref}")
```
### Common Pitfalls
- **Empty args**: Jobs with `"args": []` won't execute properly
- **Wrong target refs**: Job lookup must return base targets, not `.cfg` variants
- **Missing partition refs**: All outputs must be addressable via partition references
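A small, purely illustrative guard (not a DataBuild API) can catch these pitfalls when writing a new job:
```python
# Illustrative sanity checks for the pitfalls above; not part of DataBuild.
def validate_job_config(config: dict) -> None:
    for job_config in config["configs"]:
        if not job_config.get("args"):
            raise ValueError("Empty 'args': exec will never be invoked")
        if not job_config.get("outputs"):
            raise ValueError("Each config must declare output partition refs")

def validate_lookup_target(target: str) -> None:
    if target.endswith((".cfg", ".exec")):
        raise ValueError(f"Lookup must return the base job target, got: {target}")
```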


@@ -3,11 +3,11 @@
 import sys
 import json
 import os
-import duckdb
 from datetime import datetime
 from pathlib import Path
 from typing import List, Dict, Any
 import re
+from duckdb_utils import create_duckdb_connection, read_dataframe_with_fallback, save_dataframe_with_fallback

 def main():
     if len(sys.argv) < 2:
@@ -77,17 +77,33 @@ def handle_exec(args):
     target_date = args[1]
     partition_ref = os.getenv('PARTITION_REF', f'categorized_reviews/category={target_category}/date={target_date}')

-    # Input paths
-    reviews_file = f"/tmp/databuild_test/examples/podcast_reviews/reviews/date={target_date}/reviews.parquet"
-    podcasts_file = "/tmp/databuild_test/examples/podcast_reviews/podcasts/podcasts.parquet"
-
-    # Check input files exist
-    if not os.path.exists(reviews_file):
-        print(f"Reviews file not found: {reviews_file}", file=sys.stderr)
+    # Input paths - check for both parquet and CSV fallbacks
+    reviews_base = f"/tmp/databuild_test/examples/podcast_reviews/reviews/date={target_date}/reviews"
+    podcasts_base = "/tmp/databuild_test/examples/podcast_reviews/podcasts/podcasts"
+
+    reviews_file = None
+    podcasts_file = None
+
+    # Find reviews file (parquet or CSV)
+    for ext in ['.parquet', '.csv']:
+        candidate = reviews_base + ext
+        if os.path.exists(candidate):
+            reviews_file = candidate
+            break
+
+    # Find podcasts file (parquet or CSV)
+    for ext in ['.parquet', '.csv']:
+        candidate = podcasts_base + ext
+        if os.path.exists(candidate):
+            podcasts_file = candidate
+            break
+
+    if not reviews_file:
+        print(f"Reviews file not found: {reviews_base}.(parquet|csv)", file=sys.stderr)
         sys.exit(1)

-    if not os.path.exists(podcasts_file):
-        print(f"Podcasts file not found: {podcasts_file}", file=sys.stderr)
+    if not podcasts_file:
+        print(f"Podcasts file not found: {podcasts_base}.(parquet|csv)", file=sys.stderr)
         sys.exit(1)

     # Output path
@@ -136,50 +152,47 @@ def handle_exec(args):
 def categorize_reviews_for_category_date(reviews_file: str, podcasts_file: str, target_category: str, output_file: str):
     """Join reviews with podcast categories and filter for target category."""
-    # Connect to DuckDB for processing
-    duckdb_conn = duckdb.connect()
+    # Connect to DuckDB with extension handling
+    duckdb_conn = create_duckdb_connection()

     try:
-        # Try to install and load parquet extension, but don't fail if it's already installed
-        try:
-            duckdb_conn.execute("INSTALL parquet")
-        except Exception:
-            pass  # Extension might already be installed
-
-        duckdb_conn.execute("LOAD parquet")
-
-        # Query to join reviews with podcasts and filter by category
-        query = f"""
-            SELECT
-                r.podcast_id,
-                r.review_title,
-                r.content,
-                r.rating,
-                r.author_id,
-                r.created_at,
-                r.review_date,
-                p.title as podcast_title,
-                p.primary_category,
-                p.all_categories,
-                '{target_category}' as target_category
-            FROM parquet_scan('{reviews_file}') r
-            JOIN parquet_scan('{podcasts_file}') p ON r.podcast_id = p.podcast_id
-            WHERE p.primary_category = '{target_category}'
-               OR p.all_categories LIKE '%{target_category}%'
-            ORDER BY r.created_at
-        """
-
-        # Execute query and save to parquet
-        duckdb_conn.execute(f"COPY ({query}) TO '{output_file}' (FORMAT PARQUET)")
-
-        # Get row count for logging
-        count_result = duckdb_conn.execute(f"SELECT COUNT(*) FROM ({query})").fetchone()
-        row_count = count_result[0] if count_result else 0
+        # Read input files with fallback handling
+        reviews_df = read_dataframe_with_fallback(reviews_file, duckdb_conn)
+        podcasts_df = read_dataframe_with_fallback(podcasts_file, duckdb_conn)
+
+        # Perform join and filtering in pandas
+        import pandas as pd
+
+        # Join reviews with podcasts
+        joined_df = reviews_df.merge(podcasts_df, on='podcast_id', how='inner')
+
+        # Filter by category
+        filtered_df = joined_df[
+            (joined_df['primary_category'] == target_category) |
+            (joined_df['all_categories'].str.contains(target_category, na=False))
+        ].copy()
+
+        # Add target category column
+        filtered_df['target_category'] = target_category
+
+        # Select and rename columns to match expected output
+        result_df = filtered_df[[
+            'podcast_id', 'review_title', 'content', 'rating', 'author_id',
+            'created_at', 'review_date', 'title', 'primary_category',
+            'all_categories', 'target_category'
+        ]].rename(columns={'title': 'podcast_title'})
+
+        # Sort by created_at
+        result_df = result_df.sort_values('created_at')
+
+        # Save to parquet with fallback
+        save_dataframe_with_fallback(result_df, output_file, duckdb_conn, "parquet")
+
+        row_count = len(result_df)

         print(f"Categorized {row_count} reviews for category '{target_category}'")

         if row_count == 0:
-            print(f"Warning: No reviews found for category '{target_category}' on date '{reviews_file.split('date=')[1].split('/')[0]}'")
+            print(f"Warning: No reviews found for category '{target_category}'")

     finally:
         duckdb_conn.close()


@@ -3,19 +3,34 @@
 import sys
 import json
 import os
-import sqlite3
-import duckdb
 from datetime import datetime
 from pathlib import Path
 from typing import List, Dict, Any
+from duckdb_utils import create_duckdb_connection, execute_query_with_fallback, save_dataframe_with_fallback

 def main():
+    # Write debug at the very start to see if main is called
+    debug_file = "/tmp/databuild_test/podcasts_main_debug.log"
+    try:
+        with open(debug_file, "w") as f:
+            f.write(f"main() called with sys.argv: {sys.argv}\n")
+            f.flush()
+    except:
+        pass
+
     if len(sys.argv) < 2:
         print("Usage: extract_podcasts_job.py {config|exec} [args...]", file=sys.stderr)
         sys.exit(1)

     command = sys.argv[1]

+    try:
+        with open(debug_file, "a") as f:
+            f.write(f"command: {command}\n")
+            f.flush()
+    except:
+        pass
+
     if command == "config":
         handle_config(sys.argv[2:])
     elif command == "exec":
@@ -40,7 +55,7 @@ def handle_config(args):
         "configs": [{
             "outputs": [{"str": partition_ref}],
             "inputs": [],
-            "args": [],
+            "args": ["all"],
             "env": {
                 "PARTITION_REF": partition_ref
             }
@@ -50,7 +65,19 @@ def handle_config(args):
     print(json.dumps(config))

 def handle_exec(args):
+    # Write debug info to a file since stdout might not be captured
+    debug_file = "/tmp/databuild_test/podcasts_debug.log"
+    with open(debug_file, "w") as f:
+        f.write(f"Starting extract_podcasts_job.exec with args: {args}\n")
+        f.flush()
+
+    print(f"Starting extract_podcasts_job.exec with args: {args}")
     partition_ref = os.getenv('PARTITION_REF', 'podcasts/all')
+    print(f"Partition ref: {partition_ref}")
+    with open(debug_file, "a") as f:
+        f.write(f"Partition ref: {partition_ref}\n")
+        f.flush()

     # Database paths
     db_path = "/tmp/databuild_test/examples/podcast_reviews/data/ingest/database.sqlite"
@@ -58,17 +85,26 @@ def handle_exec(args):
         # Fallback to relative path for development
         db_path = "data/ingest/database.sqlite"

+    print(f"Looking for database at: {db_path}")
+    print(f"Database exists: {os.path.exists(db_path)}")
+
     # Output path
     output_dir = Path("/tmp/databuild_test/examples/podcast_reviews/podcasts")
     output_dir.mkdir(parents=True, exist_ok=True)
     output_file = output_dir / "podcasts.parquet"
+    print(f"Output directory: {output_dir}")
+    print(f"Output file: {output_file}")

     try:
         # Extract all podcasts with their categories
-        extract_podcasts_with_categories(db_path, str(output_file))
+        print("Calling extract_podcasts_with_categories...")
+        result = extract_podcasts_with_categories(db_path, str(output_file))
+        print(f"extract_podcasts_with_categories returned: {result}")
         print(f"Successfully extracted podcast metadata")
         print(f"Output written to: {output_file}")
+        print(f"Output file exists: {output_file.exists()}")

         # Create manifest
         manifest = {
@@ -90,89 +126,120 @@ def handle_exec(args):
         manifest_file = output_dir / "manifest.json"
         with open(manifest_file, 'w') as f:
             json.dump(manifest, f, indent=2)
+        print(f"Manifest written to: {manifest_file}")

     except Exception as e:
         print(f"Error extracting podcasts: {e}", file=sys.stderr)
+        import traceback
+        traceback.print_exc()
         sys.exit(1)

 def extract_podcasts_with_categories(db_path: str, output_file: str):
     """Extract all podcasts with their categories and save as parquet."""
-    # Connect to SQLite
-    sqlite_conn = sqlite3.connect(db_path)
-
-    # Connect to DuckDB for processing
-    duckdb_conn = duckdb.connect()
+    print(f"extract_podcasts_with_categories called with db_path={db_path}, output_file={output_file}")
+
+    # Connect to DuckDB with extension handling
+    print("Creating DuckDB connection...")
+    duckdb_conn = create_duckdb_connection()
+    print("DuckDB connection created")

     try:
-        # Try to install and load parquet extension, but don't fail if it's already installed
-        try:
-            duckdb_conn.execute("INSTALL parquet")
-        except Exception:
-            pass  # Extension might already be installed
-
-        duckdb_conn.execute("LOAD parquet")
-
-        # Query to get podcasts with categories (handling multiple categories per podcast)
-        query = """
-            WITH podcast_categories AS (
-                SELECT
-                    p.podcast_id,
-                    p.itunes_id,
-                    p.slug,
-                    p.itunes_url,
-                    p.title,
-                    c.category,
-                    ROW_NUMBER() OVER (PARTITION BY p.podcast_id ORDER BY c.category) as category_rank
-                FROM sqlite_scan(?, 'podcasts') p
-                LEFT JOIN sqlite_scan(?, 'categories') c ON p.podcast_id = c.podcast_id
-            ),
-            primary_categories AS (
-                SELECT
-                    podcast_id,
-                    itunes_id,
-                    slug,
-                    itunes_url,
-                    title,
-                    category as primary_category
-                FROM podcast_categories
-                WHERE category_rank = 1
-            ),
-            all_categories AS (
-                SELECT
-                    podcast_id,
-                    STRING_AGG(category, '|' ORDER BY category) as all_categories
-                FROM podcast_categories
-                WHERE category IS NOT NULL
-                GROUP BY podcast_id
-            )
-            SELECT
-                pc.podcast_id,
-                pc.itunes_id,
-                pc.slug,
-                pc.itunes_url,
-                pc.title,
-                pc.primary_category,
-                COALESCE(ac.all_categories, pc.primary_category) as all_categories
-            FROM primary_categories pc
-            LEFT JOIN all_categories ac ON pc.podcast_id = ac.podcast_id
-            ORDER BY pc.title
-        """
-
-        # Execute query and save to parquet
-        duckdb_conn.execute(f"COPY ({query}) TO '{output_file}' (FORMAT PARQUET)", [db_path, db_path])
-
-        # Get row count for logging
-        count_result = duckdb_conn.execute(
-            "SELECT COUNT(*) FROM sqlite_scan(?, 'podcasts')",
-            [db_path]
-        ).fetchone()
-        row_count = count_result[0] if count_result else 0
+        # Use a simpler approach that works with SQLite fallback
+        try:
+            # Try complex DuckDB query first
+            query = """
+                WITH podcast_categories AS (
+                    SELECT
+                        p.podcast_id,
+                        p.itunes_id,
+                        p.slug,
+                        p.itunes_url,
+                        p.title,
+                        c.category,
+                        ROW_NUMBER() OVER (PARTITION BY p.podcast_id ORDER BY c.category) as category_rank
+                    FROM sqlite_scan(?, 'podcasts') p
+                    LEFT JOIN sqlite_scan(?, 'categories') c ON p.podcast_id = c.podcast_id
+                ),
+                primary_categories AS (
+                    SELECT
+                        podcast_id,
+                        itunes_id,
+                        slug,
+                        itunes_url,
+                        title,
+                        category as primary_category
+                    FROM podcast_categories
+                    WHERE category_rank = 1
+                ),
+                all_categories AS (
+                    SELECT
+                        podcast_id,
+                        STRING_AGG(category, '|' ORDER BY category) as all_categories
+                    FROM podcast_categories
+                    WHERE category IS NOT NULL
+                    GROUP BY podcast_id
+                )
+                SELECT
+                    pc.podcast_id,
+                    pc.itunes_id,
+                    pc.slug,
+                    pc.itunes_url,
+                    pc.title,
+                    pc.primary_category,
+                    COALESCE(ac.all_categories, pc.primary_category) as all_categories
+                FROM primary_categories pc
+                LEFT JOIN all_categories ac ON pc.podcast_id = ac.podcast_id
+                ORDER BY pc.title
+            """
+
+            df = duckdb_conn.execute(query, [db_path, db_path]).df()
+
+        except Exception as e:
+            print(f"DuckDB complex query failed: {e}, using pandas fallback")
+
+            # Fallback: Use pandas to process the data
+            import pandas as pd
+            import sqlite3
+
+            sqlite_conn = sqlite3.connect(db_path)
+            try:
+                # Read podcasts and categories separately
+                podcasts_df = pd.read_sql_query("SELECT * FROM podcasts", sqlite_conn)
+                categories_df = pd.read_sql_query("SELECT * FROM categories", sqlite_conn)
+
+                # Group categories by podcast_id
+                categories_grouped = categories_df.groupby('podcast_id')['category'].apply(
+                    lambda x: '|'.join(sorted(x))
+                ).reset_index()
+                categories_grouped.columns = ['podcast_id', 'all_categories']
+
+                # Get primary category (first alphabetically)
+                primary_categories = categories_df.sort_values('category').groupby('podcast_id').first().reset_index()
+                primary_categories = primary_categories[['podcast_id', 'category']].rename(columns={'category': 'primary_category'})
+
+                # Join everything together
+                df = podcasts_df.merge(primary_categories, on='podcast_id', how='left')
+                df = df.merge(categories_grouped, on='podcast_id', how='left')
+
+                # Fill missing values
+                df['primary_category'] = df['primary_category'].fillna('unknown')
+                df['all_categories'] = df['all_categories'].fillna(df['primary_category'])
+
+                # Sort by title
+                df = df.sort_values('title')
+            finally:
+                sqlite_conn.close()
+
+        # Save to parquet with fallback
+        save_dataframe_with_fallback(df, output_file, duckdb_conn, "parquet")
+
+        row_count = len(df)

         print(f"Extracted {row_count} podcasts with category information")

     finally:
-        sqlite_conn.close()
         duckdb_conn.close()

 if __name__ == "__main__":


@@ -10,12 +10,28 @@ import re
 from duckdb_utils import create_duckdb_connection, execute_query_with_fallback, save_dataframe_with_fallback

 def main():
+    # Write debug info to understand what's being called
+    debug_file = "/tmp/databuild_test/reviews_main_debug.log"
+    try:
+        with open(debug_file, "w") as f:
+            f.write(f"main() called with sys.argv: {sys.argv}\n")
+            f.flush()
+    except:
+        pass
+
     if len(sys.argv) < 2:
         print("Usage: extract_reviews_job.py {config|exec} [args...]", file=sys.stderr)
         sys.exit(1)

     command = sys.argv[1]

+    try:
+        with open(debug_file, "a") as f:
+            f.write(f"command: {command}\n")
+            f.flush()
+    except:
+        pass
+
     if command == "config":
         handle_config(sys.argv[2:])
     elif command == "exec":