fix podcast job

This commit is contained in:
Stuart Axelbrooke 2025-06-30 23:27:24 -07:00
parent 55c404ca2e
commit 2ecf080f16
4 changed files with 270 additions and 113 deletions

@@ -26,3 +26,64 @@ DataBuild is a bazel-based data build system. Key files:
- Graph analysis/execution in Rust
- Bazel rules for job orchestration
- Java/Python examples for different use cases
## DataBuild Job Architecture
### Job Target Structure
Each DataBuild job creates three Bazel targets:
- `job_name.cfg` - Configuration target (invokes the binary with the `config` subcommand)
- `job_name.exec` - Execution target (invokes the binary with the `exec` subcommand)
- `job_name` - Main job target (pipes the config output into the exec input)
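A minimal BUILD-file sketch of how a single job declaration maps onto these three targets (the `databuild_job` macro name and its `binary` attribute are assumptions for illustration; only `databuild_graph` is confirmed below):
```python
# Sketch only: `databuild_job` and `binary` are assumed names, not confirmed API.
# Declaring one job yields :extract_podcasts, :extract_podcasts.cfg, and :extract_podcasts.exec.
databuild_job(
    name = "extract_podcasts",
    binary = ":extract_podcasts_bin",  # unified binary exposing "config" and "exec" subcommands
)
```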
### Unified Job Binary Pattern
Jobs use a single binary with subcommands:
```python
def main():
    command = sys.argv[1]  # "config" or "exec"
    if command == "config":
        handle_config(sys.argv[2:])  # Output job configuration JSON
    elif command == "exec":
        handle_exec(sys.argv[2:])  # Perform actual work
```
### Job Configuration Requirements
**CRITICAL**: Job configs must include non-empty `args` for execution to work:
```python
config = {
    "configs": [{
        "outputs": [{"str": partition_ref}],
        "inputs": [...],
        "args": ["some_arg"],  # REQUIRED: Cannot be empty []
        "env": {"PARTITION_REF": partition_ref}
    }]
}
```
Jobs with `"args": []` only have their config function invoked at execution time; the exec step never runs.
### DataBuild Execution Flow
1. **Planning Phase**: DataBuild calls `.cfg` targets to get job configurations
2. **Execution Phase**: DataBuild calls main job targets which pipe config to exec
3. **Job Resolution**: Job lookup returns base job names (e.g., `//:job_name`), not `.cfg` variants
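As a rough sketch of steps 1-2, the piping that the main job target performs can be pictured as below (illustrative only, not DataBuild's actual wiring; `run_job` and the single-config assumption are mine):
```python
import json
import os
import subprocess

def run_job(binary: str) -> None:
    # Planning: the .cfg path -- ask the unified binary for its configuration JSON
    cfg_json = subprocess.run([binary, "config"], check=True,
                              capture_output=True, text=True).stdout
    cfg = json.loads(cfg_json)["configs"][0]  # assumes a single config entry

    # Execution: the .exec path -- forward the declared args and env
    env = {**os.environ, **cfg.get("env", {})}
    subprocess.run([binary, "exec", *cfg["args"]], check=True, env=env)
```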
### Graph Configuration
```python
databuild_graph(
    name = "my_graph",
    jobs = [":job1", ":job2"],  # Reference base job targets
    lookup = ":job_lookup",     # Binary that routes partition refs to jobs
)
```
### Job Lookup Pattern
```python
import re

# Example pattern; in practice each job matches its own partition-ref scheme
pattern = re.compile(r"^my_dataset/")

def lookup_job_for_partition(partition_ref: str) -> str:
    if pattern.match(partition_ref):
        return "//:job_name"  # Return base job target
    raise ValueError(f"No job found for: {partition_ref}")
```
### Common Pitfalls
- **Empty args**: Jobs with `"args": []` won't execute properly
- **Wrong target refs**: Job lookup must return base targets, not `.cfg` variants
- **Missing partition refs**: All outputs must be addressable via partition references
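A small sanity check over a generated config catches the first and third pitfalls before a job is ever scheduled (a sketch against the config format shown above; `validate_config` is not a DataBuild API):
```python
def validate_config(cfg: dict) -> None:
    """Reject configs that would silently skip exec or emit unaddressable outputs."""
    for entry in cfg["configs"]:
        if not entry.get("args"):
            raise ValueError("args must be non-empty, otherwise only config runs and exec is skipped")
        if not entry.get("outputs"):
            raise ValueError("each config entry must declare at least one output partition ref")
```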

@@ -3,11 +3,11 @@
import sys
import json
import os
import duckdb
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Any
import re
from duckdb_utils import create_duckdb_connection, read_dataframe_with_fallback, save_dataframe_with_fallback
def main():
if len(sys.argv) < 2:
@@ -77,17 +77,33 @@ def handle_exec(args):
target_date = args[1]
partition_ref = os.getenv('PARTITION_REF', f'categorized_reviews/category={target_category}/date={target_date}')
# Input paths
reviews_file = f"/tmp/databuild_test/examples/podcast_reviews/reviews/date={target_date}/reviews.parquet"
podcasts_file = "/tmp/databuild_test/examples/podcast_reviews/podcasts/podcasts.parquet"
# Input paths - check for both parquet and CSV fallbacks
reviews_base = f"/tmp/databuild_test/examples/podcast_reviews/reviews/date={target_date}/reviews"
podcasts_base = "/tmp/databuild_test/examples/podcast_reviews/podcasts/podcasts"
# Check input files exist
if not os.path.exists(reviews_file):
print(f"Reviews file not found: {reviews_file}", file=sys.stderr)
reviews_file = None
podcasts_file = None
# Find reviews file (parquet or CSV)
for ext in ['.parquet', '.csv']:
candidate = reviews_base + ext
if os.path.exists(candidate):
reviews_file = candidate
break
# Find podcasts file (parquet or CSV)
for ext in ['.parquet', '.csv']:
candidate = podcasts_base + ext
if os.path.exists(candidate):
podcasts_file = candidate
break
if not reviews_file:
print(f"Reviews file not found: {reviews_base}.(parquet|csv)", file=sys.stderr)
sys.exit(1)
if not os.path.exists(podcasts_file):
print(f"Podcasts file not found: {podcasts_file}", file=sys.stderr)
if not podcasts_file:
print(f"Podcasts file not found: {podcasts_base}.(parquet|csv)", file=sys.stderr)
sys.exit(1)
# Output path
@@ -136,50 +152,47 @@ def handle_exec(args):
def categorize_reviews_for_category_date(reviews_file: str, podcasts_file: str, target_category: str, output_file: str):
"""Join reviews with podcast categories and filter for target category."""
# Connect to DuckDB for processing
duckdb_conn = duckdb.connect()
# Connect to DuckDB with extension handling
duckdb_conn = create_duckdb_connection()
try:
# Try to install and load parquet extension, but don't fail if it's already installed
try:
duckdb_conn.execute("INSTALL parquet")
except Exception:
pass # Extension might already be installed
# Read input files with fallback handling
reviews_df = read_dataframe_with_fallback(reviews_file, duckdb_conn)
podcasts_df = read_dataframe_with_fallback(podcasts_file, duckdb_conn)
duckdb_conn.execute("LOAD parquet")
# Perform join and filtering in pandas
import pandas as pd
# Query to join reviews with podcasts and filter by category
query = f"""
SELECT
r.podcast_id,
r.review_title,
r.content,
r.rating,
r.author_id,
r.created_at,
r.review_date,
p.title as podcast_title,
p.primary_category,
p.all_categories,
'{target_category}' as target_category
FROM parquet_scan('{reviews_file}') r
JOIN parquet_scan('{podcasts_file}') p ON r.podcast_id = p.podcast_id
WHERE p.primary_category = '{target_category}'
OR p.all_categories LIKE '%{target_category}%'
ORDER BY r.created_at
"""
# Join reviews with podcasts
joined_df = reviews_df.merge(podcasts_df, on='podcast_id', how='inner')
# Execute query and save to parquet
duckdb_conn.execute(f"COPY ({query}) TO '{output_file}' (FORMAT PARQUET)")
# Filter by category
filtered_df = joined_df[
(joined_df['primary_category'] == target_category) |
(joined_df['all_categories'].str.contains(target_category, na=False))
].copy()
# Get row count for logging
count_result = duckdb_conn.execute(f"SELECT COUNT(*) FROM ({query})").fetchone()
# Add target category column
filtered_df['target_category'] = target_category
row_count = count_result[0] if count_result else 0
# Select and rename columns to match expected output
result_df = filtered_df[[
'podcast_id', 'review_title', 'content', 'rating', 'author_id',
'created_at', 'review_date', 'title', 'primary_category',
'all_categories', 'target_category'
]].rename(columns={'title': 'podcast_title'})
# Sort by created_at
result_df = result_df.sort_values('created_at')
# Save to parquet with fallback
save_dataframe_with_fallback(result_df, output_file, duckdb_conn, "parquet")
row_count = len(result_df)
print(f"Categorized {row_count} reviews for category '{target_category}'")
if row_count == 0:
print(f"Warning: No reviews found for category '{target_category}' on date '{reviews_file.split('date=')[1].split('/')[0]}'")
print(f"Warning: No reviews found for category '{target_category}'")
finally:
duckdb_conn.close()
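The fallback paths above rely on helpers from `duckdb_utils`, which is not part of this diff. A minimal sketch of what those helpers might look like, with signatures inferred from the call sites (the real module may differ):
```python
# duckdb_utils.py -- sketch only; signatures inferred from this commit's call sites
import duckdb
import pandas as pd

def create_duckdb_connection():
    """Open an in-memory DuckDB connection, tolerating a missing parquet extension."""
    conn = duckdb.connect()
    try:
        conn.execute("INSTALL parquet")
        conn.execute("LOAD parquet")
    except Exception:
        pass  # extension may already be installed or unavailable; callers fall back to pandas
    return conn

def read_dataframe_with_fallback(path: str, conn) -> pd.DataFrame:
    """Read parquet via DuckDB when possible, otherwise fall back to pandas (CSV or parquet)."""
    try:
        return conn.execute(f"SELECT * FROM parquet_scan('{path}')").df()
    except Exception:
        return pd.read_csv(path) if str(path).endswith(".csv") else pd.read_parquet(path)

def save_dataframe_with_fallback(df: pd.DataFrame, path: str, conn, fmt: str = "parquet") -> None:
    """Write parquet if the environment supports it, otherwise write CSV beside the requested path."""
    try:
        df.to_parquet(path)
    except Exception:
        df.to_csv(str(path).replace(".parquet", ".csv"), index=False)
```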

@@ -3,19 +3,34 @@
import sys
import json
import os
import sqlite3
import duckdb
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Any
from duckdb_utils import create_duckdb_connection, execute_query_with_fallback, save_dataframe_with_fallback
def main():
# Write debug at the very start to see if main is called
debug_file = "/tmp/databuild_test/podcasts_main_debug.log"
try:
with open(debug_file, "w") as f:
f.write(f"main() called with sys.argv: {sys.argv}\n")
f.flush()
except:
pass
if len(sys.argv) < 2:
print("Usage: extract_podcasts_job.py {config|exec} [args...]", file=sys.stderr)
sys.exit(1)
command = sys.argv[1]
try:
with open(debug_file, "a") as f:
f.write(f"command: {command}\n")
f.flush()
except:
pass
if command == "config":
handle_config(sys.argv[2:])
elif command == "exec":
@@ -40,7 +55,7 @@ def handle_config(args):
"configs": [{
"outputs": [{"str": partition_ref}],
"inputs": [],
"args": [],
"args": ["all"],
"env": {
"PARTITION_REF": partition_ref
}
@@ -50,7 +65,19 @@ def handle_config(args):
print(json.dumps(config))
def handle_exec(args):
# Write debug info to a file since stdout might not be captured
debug_file = "/tmp/databuild_test/podcasts_debug.log"
with open(debug_file, "w") as f:
f.write(f"Starting extract_podcasts_job.exec with args: {args}\n")
f.flush()
print(f"Starting extract_podcasts_job.exec with args: {args}")
partition_ref = os.getenv('PARTITION_REF', 'podcasts/all')
print(f"Partition ref: {partition_ref}")
with open(debug_file, "a") as f:
f.write(f"Partition ref: {partition_ref}\n")
f.flush()
# Database paths
db_path = "/tmp/databuild_test/examples/podcast_reviews/data/ingest/database.sqlite"
@@ -58,17 +85,26 @@ def handle_exec(args):
# Fallback to relative path for development
db_path = "data/ingest/database.sqlite"
print(f"Looking for database at: {db_path}")
print(f"Database exists: {os.path.exists(db_path)}")
# Output path
output_dir = Path("/tmp/databuild_test/examples/podcast_reviews/podcasts")
output_dir.mkdir(parents=True, exist_ok=True)
output_file = output_dir / "podcasts.parquet"
print(f"Output directory: {output_dir}")
print(f"Output file: {output_file}")
try:
# Extract all podcasts with their categories
extract_podcasts_with_categories(db_path, str(output_file))
print("Calling extract_podcasts_with_categories...")
result = extract_podcasts_with_categories(db_path, str(output_file))
print(f"extract_podcasts_with_categories returned: {result}")
print(f"Successfully extracted podcast metadata")
print(f"Output written to: {output_file}")
print(f"Output file exists: {output_file.exists()}")
# Create manifest
manifest = {
@@ -91,29 +127,28 @@ def handle_exec(args):
with open(manifest_file, 'w') as f:
json.dump(manifest, f, indent=2)
print(f"Manifest written to: {manifest_file}")
except Exception as e:
print(f"Error extracting podcasts: {e}", file=sys.stderr)
import traceback
traceback.print_exc()
sys.exit(1)
def extract_podcasts_with_categories(db_path: str, output_file: str):
"""Extract all podcasts with their categories and save as parquet."""
# Connect to SQLite
sqlite_conn = sqlite3.connect(db_path)
print(f"extract_podcasts_with_categories called with db_path={db_path}, output_file={output_file}")
# Connect to DuckDB for processing
duckdb_conn = duckdb.connect()
# Connect to DuckDB with extension handling
print("Creating DuckDB connection...")
duckdb_conn = create_duckdb_connection()
print("DuckDB connection created")
try:
# Try to install and load parquet extension, but don't fail if it's already installed
# Use a simpler approach that works with SQLite fallback
try:
duckdb_conn.execute("INSTALL parquet")
except Exception:
pass # Extension might already be installed
duckdb_conn.execute("LOAD parquet")
# Query to get podcasts with categories (handling multiple categories per podcast)
# Try complex DuckDB query first
query = """
WITH podcast_categories AS (
SELECT
@@ -159,20 +194,52 @@ def extract_podcasts_with_categories(db_path: str, output_file: str):
ORDER BY pc.title
"""
# Execute query and save to parquet
duckdb_conn.execute(f"COPY ({query}) TO '{output_file}' (FORMAT PARQUET)", [db_path, db_path])
df = duckdb_conn.execute(query, [db_path, db_path]).df()
# Get row count for logging
count_result = duckdb_conn.execute(
"SELECT COUNT(*) FROM sqlite_scan(?, 'podcasts')",
[db_path]
).fetchone()
except Exception as e:
print(f"DuckDB complex query failed: {e}, using pandas fallback")
row_count = count_result[0] if count_result else 0
print(f"Extracted {row_count} podcasts with category information")
# Fallback: Use pandas to process the data
import pandas as pd
import sqlite3
sqlite_conn = sqlite3.connect(db_path)
try:
# Read podcasts and categories separately
podcasts_df = pd.read_sql_query("SELECT * FROM podcasts", sqlite_conn)
categories_df = pd.read_sql_query("SELECT * FROM categories", sqlite_conn)
# Group categories by podcast_id
categories_grouped = categories_df.groupby('podcast_id')['category'].apply(
lambda x: '|'.join(sorted(x))
).reset_index()
categories_grouped.columns = ['podcast_id', 'all_categories']
# Get primary category (first alphabetically)
primary_categories = categories_df.sort_values('category').groupby('podcast_id').first().reset_index()
primary_categories = primary_categories[['podcast_id', 'category']].rename(columns={'category': 'primary_category'})
# Join everything together
df = podcasts_df.merge(primary_categories, on='podcast_id', how='left')
df = df.merge(categories_grouped, on='podcast_id', how='left')
# Fill missing values
df['primary_category'] = df['primary_category'].fillna('unknown')
df['all_categories'] = df['all_categories'].fillna(df['primary_category'])
# Sort by title
df = df.sort_values('title')
finally:
sqlite_conn.close()
# Save to parquet with fallback
save_dataframe_with_fallback(df, output_file, duckdb_conn, "parquet")
row_count = len(df)
print(f"Extracted {row_count} podcasts with category information")
finally:
duckdb_conn.close()
if __name__ == "__main__":

@@ -10,12 +10,28 @@ import re
from duckdb_utils import create_duckdb_connection, execute_query_with_fallback, save_dataframe_with_fallback
def main():
# Write debug info to understand what's being called
debug_file = "/tmp/databuild_test/reviews_main_debug.log"
try:
with open(debug_file, "w") as f:
f.write(f"main() called with sys.argv: {sys.argv}\n")
f.flush()
except:
pass
if len(sys.argv) < 2:
print("Usage: extract_reviews_job.py {config|exec} [args...]", file=sys.stderr)
sys.exit(1)
command = sys.argv[1]
try:
with open(debug_file, "a") as f:
f.write(f"command: {command}\n")
f.flush()
except:
pass
if command == "config":
handle_config(sys.argv[2:])
elif command == "exec":