parent 55c404ca2e
commit 2ecf080f16
4 changed files with 270 additions and 113 deletions

CLAUDE.md (63)
@@ -25,4 +25,65 @@ DataBuild is a bazel-based data build system. Key files:

## Key Components

- Graph analysis/execution in Rust
- Bazel rules for job orchestration
- Java/Python examples for different use cases
## DataBuild Job Architecture

### Job Target Structure

Each DataBuild job creates three Bazel targets:

- `job_name.cfg` - Configuration target (calls binary with "config" subcommand)
- `job_name.exec` - Execution target (calls binary with "exec" subcommand)
- `job_name` - Main job target (pipes config output to exec input)
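As a sketch only (the job macro itself is not shown in this diff, so `databuild_job` and its attributes below are hypothetical), a single declaration fans out into those three targets:

```python
# Hypothetical BUILD snippet: the real rule/macro name and attributes are not
# shown in this commit; only the three generated targets are documented above.
databuild_job(
    name = "extract_podcasts_job",          # //:extract_podcasts_job
    binary = ":extract_podcasts_job_bin",   # unified binary with config/exec subcommands
)

# Generated targets:
#   //:extract_podcasts_job.cfg   -> runs `<binary> config`
#   //:extract_podcasts_job.exec  -> runs `<binary> exec`
#   //:extract_podcasts_job       -> pipes .cfg output into .exec
```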
### Unified Job Binary Pattern

Jobs use a single binary with subcommands:

```python
def main():
    command = sys.argv[1]  # "config" or "exec"
    if command == "config":
        handle_config(sys.argv[2:])  # Output job configuration JSON
    elif command == "exec":
        handle_exec(sys.argv[2:])  # Perform actual work
```
### Job Configuration Requirements

**CRITICAL**: Job configs must include non-empty `args` for execution to work:

```python
config = {
    "configs": [{
        "outputs": [{"str": partition_ref}],
        "inputs": [...],
        "args": ["some_arg"],  # REQUIRED: Cannot be empty []
        "env": {"PARTITION_REF": partition_ref}
    }]
}
```

For a job with `"args": []`, only its config function is called during execution; the exec step never runs.
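As a concrete instance from this same commit, the podcasts extraction job's config handler emits roughly the following (a simplified sketch of `handle_config` in extract_podcasts_job.py):

```python
import json

def handle_config(args):
    # Simplified sketch of extract_podcasts_job.py's config output after this
    # commit: one config entry whose args is non-empty so exec actually runs.
    partition_ref = "podcasts/all"
    config = {
        "configs": [{
            "outputs": [{"str": partition_ref}],
            "inputs": [],
            "args": ["all"],  # was [] before this commit, which skipped exec
            "env": {"PARTITION_REF": partition_ref},
        }]
    }
    print(json.dumps(config))
```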
### DataBuild Execution Flow

1. **Planning Phase**: DataBuild calls `.cfg` targets to get job configurations
2. **Execution Phase**: DataBuild calls main job targets which pipe config to exec
3. **Job Resolution**: Job lookup returns base job names (e.g., `//:job_name`), not `.cfg` variants
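Outside of Bazel, the same config-then-exec handshake can be sketched in a few lines. This is illustrative only, not DataBuild's actual runner; it assumes the config JSON shape documented above:

```python
import json
import os
import subprocess

def run_job(job_binary: str) -> None:
    """Illustrative only: mimic what the main job target does with .cfg and .exec."""
    # Planning: ask the unified binary for its configuration JSON.
    config_json = subprocess.run(
        [job_binary, "config"], check=True, capture_output=True, text=True
    ).stdout
    config = json.loads(config_json)

    # Execution: invoke the exec subcommand once per config entry,
    # forwarding its args and env (e.g. PARTITION_REF).
    for entry in config["configs"]:
        subprocess.run(
            [job_binary, "exec", *entry["args"]],
            env={**os.environ, **entry.get("env", {})},
            check=True,
        )
```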
### Graph Configuration

```python
databuild_graph(
    name = "my_graph",
    jobs = [":job1", ":job2"],  # Reference base job targets
    lookup = ":job_lookup",  # Binary that routes partition refs to jobs
)
```

### Job Lookup Pattern

```python
def lookup_job_for_partition(partition_ref: str) -> str:
    if pattern.match(partition_ref):
        return "//:job_name"  # Return base job target
    raise ValueError(f"No job found for: {partition_ref}")
```
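Applied to the partition refs that appear elsewhere in this commit (`podcasts/all` and `categorized_reviews/category=.../date=...`), a lookup might route them like this; the target labels and regex are illustrative assumptions, not taken from the repository:

```python
import re

# Illustrative routing for the partition refs used in this commit;
# the Bazel labels below are assumptions.
CATEGORIZED_REVIEWS = re.compile(r"^categorized_reviews/category=[^/]+/date=[^/]+$")

def lookup_job_for_partition(partition_ref: str) -> str:
    if partition_ref == "podcasts/all":
        return "//:extract_podcasts_job"
    if CATEGORIZED_REVIEWS.match(partition_ref):
        return "//:categorize_reviews_job"
    raise ValueError(f"No job found for: {partition_ref}")
```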
### Common Pitfalls

- **Empty args**: Jobs with `"args": []` won't execute properly
- **Wrong target refs**: Job lookup must return base targets, not `.cfg` variants
- **Missing partition refs**: All outputs must be addressable via partition references
@@ -3,11 +3,11 @@
 import sys
 import json
 import os
-import duckdb
 from datetime import datetime
 from pathlib import Path
 from typing import List, Dict, Any
 import re
+from duckdb_utils import create_duckdb_connection, read_dataframe_with_fallback, save_dataframe_with_fallback

 def main():
     if len(sys.argv) < 2:
@@ -77,17 +77,33 @@ def handle_exec(args):
     target_date = args[1]
     partition_ref = os.getenv('PARTITION_REF', f'categorized_reviews/category={target_category}/date={target_date}')

-    # Input paths
-    reviews_file = f"/tmp/databuild_test/examples/podcast_reviews/reviews/date={target_date}/reviews.parquet"
-    podcasts_file = "/tmp/databuild_test/examples/podcast_reviews/podcasts/podcasts.parquet"
+    # Input paths - check for both parquet and CSV fallbacks
+    reviews_base = f"/tmp/databuild_test/examples/podcast_reviews/reviews/date={target_date}/reviews"
+    podcasts_base = "/tmp/databuild_test/examples/podcast_reviews/podcasts/podcasts"

-    # Check input files exist
-    if not os.path.exists(reviews_file):
-        print(f"Reviews file not found: {reviews_file}", file=sys.stderr)
+    reviews_file = None
+    podcasts_file = None
+
+    # Find reviews file (parquet or CSV)
+    for ext in ['.parquet', '.csv']:
+        candidate = reviews_base + ext
+        if os.path.exists(candidate):
+            reviews_file = candidate
+            break
+
+    # Find podcasts file (parquet or CSV)
+    for ext in ['.parquet', '.csv']:
+        candidate = podcasts_base + ext
+        if os.path.exists(candidate):
+            podcasts_file = candidate
+            break
+
+    if not reviews_file:
+        print(f"Reviews file not found: {reviews_base}.(parquet|csv)", file=sys.stderr)
         sys.exit(1)

-    if not os.path.exists(podcasts_file):
-        print(f"Podcasts file not found: {podcasts_file}", file=sys.stderr)
+    if not podcasts_file:
+        print(f"Podcasts file not found: {podcasts_base}.(parquet|csv)", file=sys.stderr)
         sys.exit(1)

     # Output path
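The two discovery loops above are identical apart from the base path; if more inputs grow the same parquet-or-CSV fallback, they could share a small helper (a sketch, not part of this commit):

```python
import os
from typing import Optional

def find_input_file(base: str, exts=(".parquet", ".csv")) -> Optional[str]:
    """Return the first existing `base + ext`, or None if no candidate exists."""
    for ext in exts:
        candidate = base + ext
        if os.path.exists(candidate):
            return candidate
    return None

# Hypothetical usage in handle_exec:
#   reviews_file = find_input_file(reviews_base)
#   podcasts_file = find_input_file(podcasts_base)
```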
@@ -136,50 +152,47 @@ def handle_exec(args):
 def categorize_reviews_for_category_date(reviews_file: str, podcasts_file: str, target_category: str, output_file: str):
     """Join reviews with podcast categories and filter for target category."""

-    # Connect to DuckDB for processing
-    duckdb_conn = duckdb.connect()
+    # Connect to DuckDB with extension handling
+    duckdb_conn = create_duckdb_connection()

     try:
-        # Try to install and load parquet extension, but don't fail if it's already installed
-        try:
-            duckdb_conn.execute("INSTALL parquet")
-        except Exception:
-            pass  # Extension might already be installed
-
-        duckdb_conn.execute("LOAD parquet")
-
-        # Query to join reviews with podcasts and filter by category
-        query = f"""
-        SELECT
-            r.podcast_id,
-            r.review_title,
-            r.content,
-            r.rating,
-            r.author_id,
-            r.created_at,
-            r.review_date,
-            p.title as podcast_title,
-            p.primary_category,
-            p.all_categories,
-            '{target_category}' as target_category
-        FROM parquet_scan('{reviews_file}') r
-        JOIN parquet_scan('{podcasts_file}') p ON r.podcast_id = p.podcast_id
-        WHERE p.primary_category = '{target_category}'
-           OR p.all_categories LIKE '%{target_category}%'
-        ORDER BY r.created_at
-        """
-
-        # Execute query and save to parquet
-        duckdb_conn.execute(f"COPY ({query}) TO '{output_file}' (FORMAT PARQUET)")
-
-        # Get row count for logging
-        count_result = duckdb_conn.execute(f"SELECT COUNT(*) FROM ({query})").fetchone()
-
-        row_count = count_result[0] if count_result else 0
+        # Read input files with fallback handling
+        reviews_df = read_dataframe_with_fallback(reviews_file, duckdb_conn)
+        podcasts_df = read_dataframe_with_fallback(podcasts_file, duckdb_conn)
+
+        # Perform join and filtering in pandas
+        import pandas as pd
+
+        # Join reviews with podcasts
+        joined_df = reviews_df.merge(podcasts_df, on='podcast_id', how='inner')
+
+        # Filter by category
+        filtered_df = joined_df[
+            (joined_df['primary_category'] == target_category) |
+            (joined_df['all_categories'].str.contains(target_category, na=False))
+        ].copy()
+
+        # Add target category column
+        filtered_df['target_category'] = target_category
+
+        # Select and rename columns to match expected output
+        result_df = filtered_df[[
+            'podcast_id', 'review_title', 'content', 'rating', 'author_id',
+            'created_at', 'review_date', 'title', 'primary_category',
+            'all_categories', 'target_category'
+        ]].rename(columns={'title': 'podcast_title'})
+
+        # Sort by created_at
+        result_df = result_df.sort_values('created_at')
+
+        # Save to parquet with fallback
+        save_dataframe_with_fallback(result_df, output_file, duckdb_conn, "parquet")
+
+        row_count = len(result_df)
         print(f"Categorized {row_count} reviews for category '{target_category}'")

         if row_count == 0:
-            print(f"Warning: No reviews found for category '{target_category}' on date '{reviews_file.split('date=')[1].split('/')[0]}'")
+            print(f"Warning: No reviews found for category '{target_category}'")

     finally:
         duckdb_conn.close()
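duckdb_utils itself is not part of this diff, so its helpers' behavior can only be inferred from their call sites. A plausible sketch, with signatures matched to how they are used above (the real module may differ):

```python
# Sketch only: duckdb_utils is not included in this commit. Signatures are
# inferred from the call sites; the actual implementation may differ.
import duckdb
import pandas as pd

def create_duckdb_connection():
    conn = duckdb.connect()
    for stmt in ("INSTALL parquet", "LOAD parquet"):
        try:
            conn.execute(stmt)
        except Exception:
            pass  # extension may already be bundled, or unavailable entirely
    return conn

def read_dataframe_with_fallback(path, conn) -> pd.DataFrame:
    # CSV inputs come from an upstream CSV fallback; read them directly.
    if str(path).endswith(".csv"):
        return pd.read_csv(path)
    try:
        return conn.execute(f"SELECT * FROM parquet_scan('{path}')").df()
    except Exception:
        return pd.read_parquet(path)

def save_dataframe_with_fallback(df: pd.DataFrame, path, conn, fmt: str = "parquet"):
    # conn is unused in this sketch; kept to mirror the observed call signature.
    if fmt == "parquet":
        try:
            df.to_parquet(path, index=False)
            return
        except Exception:
            pass  # e.g. no pyarrow/fastparquet available
    # Fallback (or fmt == "csv"): write CSV alongside the requested path.
    df.to_csv(str(path).rsplit(".", 1)[0] + ".csv", index=False)
```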
@@ -3,19 +3,34 @@
 import sys
 import json
 import os
-import sqlite3
-import duckdb
 from datetime import datetime
 from pathlib import Path
 from typing import List, Dict, Any
+from duckdb_utils import create_duckdb_connection, execute_query_with_fallback, save_dataframe_with_fallback

 def main():
+    # Write debug at the very start to see if main is called
+    debug_file = "/tmp/databuild_test/podcasts_main_debug.log"
+    try:
+        with open(debug_file, "w") as f:
+            f.write(f"main() called with sys.argv: {sys.argv}\n")
+            f.flush()
+    except:
+        pass
+
     if len(sys.argv) < 2:
         print("Usage: extract_podcasts_job.py {config|exec} [args...]", file=sys.stderr)
         sys.exit(1)

     command = sys.argv[1]

+    try:
+        with open(debug_file, "a") as f:
+            f.write(f"command: {command}\n")
+            f.flush()
+    except:
+        pass
+
     if command == "config":
         handle_config(sys.argv[2:])
     elif command == "exec":
@@ -40,7 +55,7 @@ def handle_config(args):
         "configs": [{
             "outputs": [{"str": partition_ref}],
             "inputs": [],
-            "args": [],
+            "args": ["all"],
             "env": {
                 "PARTITION_REF": partition_ref
             }
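The only behavioral change in this hunk is the non-empty `args`; with it, the config emitted for the default `podcasts/all` partition is now, roughly:

```python
# The config JSON this handler emits after the change, shown for the
# default partition_ref "podcasts/all".
{
    "configs": [{
        "outputs": [{"str": "podcasts/all"}],
        "inputs": [],
        "args": ["all"],  # previously [], which meant the exec step was never invoked
        "env": {"PARTITION_REF": "podcasts/all"}
    }]
}
```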
@@ -50,7 +65,19 @@ def handle_config(args):
     print(json.dumps(config))

 def handle_exec(args):
+    # Write debug info to a file since stdout might not be captured
+    debug_file = "/tmp/databuild_test/podcasts_debug.log"
+    with open(debug_file, "w") as f:
+        f.write(f"Starting extract_podcasts_job.exec with args: {args}\n")
+        f.flush()
+
+    print(f"Starting extract_podcasts_job.exec with args: {args}")
     partition_ref = os.getenv('PARTITION_REF', 'podcasts/all')
+    print(f"Partition ref: {partition_ref}")
+
+    with open(debug_file, "a") as f:
+        f.write(f"Partition ref: {partition_ref}\n")
+        f.flush()
+
     # Database paths
     db_path = "/tmp/databuild_test/examples/podcast_reviews/data/ingest/database.sqlite"
@@ -58,17 +85,26 @@ def handle_exec(args):
         # Fallback to relative path for development
         db_path = "data/ingest/database.sqlite"

+    print(f"Looking for database at: {db_path}")
+    print(f"Database exists: {os.path.exists(db_path)}")
+
     # Output path
     output_dir = Path("/tmp/databuild_test/examples/podcast_reviews/podcasts")
     output_dir.mkdir(parents=True, exist_ok=True)
     output_file = output_dir / "podcasts.parquet"

+    print(f"Output directory: {output_dir}")
+    print(f"Output file: {output_file}")
+
     try:
         # Extract all podcasts with their categories
-        extract_podcasts_with_categories(db_path, str(output_file))
+        print("Calling extract_podcasts_with_categories...")
+        result = extract_podcasts_with_categories(db_path, str(output_file))
+        print(f"extract_podcasts_with_categories returned: {result}")
+
         print(f"Successfully extracted podcast metadata")
         print(f"Output written to: {output_file}")
+        print(f"Output file exists: {output_file.exists()}")

         # Create manifest
         manifest = {
@@ -90,89 +126,120 @@ def handle_exec(args):
         manifest_file = output_dir / "manifest.json"
         with open(manifest_file, 'w') as f:
             json.dump(manifest, f, indent=2)

+        print(f"Manifest written to: {manifest_file}")
+
     except Exception as e:
         print(f"Error extracting podcasts: {e}", file=sys.stderr)
+        import traceback
+        traceback.print_exc()
         sys.exit(1)

 def extract_podcasts_with_categories(db_path: str, output_file: str):
     """Extract all podcasts with their categories and save as parquet."""

-    # Connect to SQLite
-    sqlite_conn = sqlite3.connect(db_path)
-
-    # Connect to DuckDB for processing
-    duckdb_conn = duckdb.connect()
+    print(f"extract_podcasts_with_categories called with db_path={db_path}, output_file={output_file}")
+
+    # Connect to DuckDB with extension handling
+    print("Creating DuckDB connection...")
+    duckdb_conn = create_duckdb_connection()
+    print("DuckDB connection created")

     try:
-        # Try to install and load parquet extension, but don't fail if it's already installed
-        try:
-            duckdb_conn.execute("INSTALL parquet")
-        except Exception:
-            pass  # Extension might already be installed
-
-        duckdb_conn.execute("LOAD parquet")
-
-        # Query to get podcasts with categories (handling multiple categories per podcast)
-        query = """
-        WITH podcast_categories AS (
-            SELECT
-                p.podcast_id,
-                p.itunes_id,
-                p.slug,
-                p.itunes_url,
-                p.title,
-                c.category,
-                ROW_NUMBER() OVER (PARTITION BY p.podcast_id ORDER BY c.category) as category_rank
-            FROM sqlite_scan(?, 'podcasts') p
-            LEFT JOIN sqlite_scan(?, 'categories') c ON p.podcast_id = c.podcast_id
-        ),
-        primary_categories AS (
-            SELECT
-                podcast_id,
-                itunes_id,
-                slug,
-                itunes_url,
-                title,
-                category as primary_category
-            FROM podcast_categories
-            WHERE category_rank = 1
-        ),
-        all_categories AS (
-            SELECT
-                podcast_id,
-                STRING_AGG(category, '|' ORDER BY category) as all_categories
-            FROM podcast_categories
-            WHERE category IS NOT NULL
-            GROUP BY podcast_id
-        )
-        SELECT
-            pc.podcast_id,
-            pc.itunes_id,
-            pc.slug,
-            pc.itunes_url,
-            pc.title,
-            pc.primary_category,
-            COALESCE(ac.all_categories, pc.primary_category) as all_categories
-        FROM primary_categories pc
-        LEFT JOIN all_categories ac ON pc.podcast_id = ac.podcast_id
-        ORDER BY pc.title
-        """
-
-        # Execute query and save to parquet
-        duckdb_conn.execute(f"COPY ({query}) TO '{output_file}' (FORMAT PARQUET)", [db_path, db_path])
-
-        # Get row count for logging
-        count_result = duckdb_conn.execute(
-            "SELECT COUNT(*) FROM sqlite_scan(?, 'podcasts')",
-            [db_path]
-        ).fetchone()
-
-        row_count = count_result[0] if count_result else 0
+        # Use a simpler approach that works with SQLite fallback
+        try:
+            # Try complex DuckDB query first
+            query = """
+            WITH podcast_categories AS (
+                SELECT
+                    p.podcast_id,
+                    p.itunes_id,
+                    p.slug,
+                    p.itunes_url,
+                    p.title,
+                    c.category,
+                    ROW_NUMBER() OVER (PARTITION BY p.podcast_id ORDER BY c.category) as category_rank
+                FROM sqlite_scan(?, 'podcasts') p
+                LEFT JOIN sqlite_scan(?, 'categories') c ON p.podcast_id = c.podcast_id
+            ),
+            primary_categories AS (
+                SELECT
+                    podcast_id,
+                    itunes_id,
+                    slug,
+                    itunes_url,
+                    title,
+                    category as primary_category
+                FROM podcast_categories
+                WHERE category_rank = 1
+            ),
+            all_categories AS (
+                SELECT
+                    podcast_id,
+                    STRING_AGG(category, '|' ORDER BY category) as all_categories
+                FROM podcast_categories
+                WHERE category IS NOT NULL
+                GROUP BY podcast_id
+            )
+            SELECT
+                pc.podcast_id,
+                pc.itunes_id,
+                pc.slug,
+                pc.itunes_url,
+                pc.title,
+                pc.primary_category,
+                COALESCE(ac.all_categories, pc.primary_category) as all_categories
+            FROM primary_categories pc
+            LEFT JOIN all_categories ac ON pc.podcast_id = ac.podcast_id
+            ORDER BY pc.title
+            """
+
+            df = duckdb_conn.execute(query, [db_path, db_path]).df()
+
+        except Exception as e:
+            print(f"DuckDB complex query failed: {e}, using pandas fallback")
+
+            # Fallback: Use pandas to process the data
+            import pandas as pd
+            import sqlite3
+
+            sqlite_conn = sqlite3.connect(db_path)
+            try:
+                # Read podcasts and categories separately
+                podcasts_df = pd.read_sql_query("SELECT * FROM podcasts", sqlite_conn)
+                categories_df = pd.read_sql_query("SELECT * FROM categories", sqlite_conn)

+                # Group categories by podcast_id
+                categories_grouped = categories_df.groupby('podcast_id')['category'].apply(
+                    lambda x: '|'.join(sorted(x))
+                ).reset_index()
+                categories_grouped.columns = ['podcast_id', 'all_categories']
+
+                # Get primary category (first alphabetically)
+                primary_categories = categories_df.sort_values('category').groupby('podcast_id').first().reset_index()
+                primary_categories = primary_categories[['podcast_id', 'category']].rename(columns={'category': 'primary_category'})
+
+                # Join everything together
+                df = podcasts_df.merge(primary_categories, on='podcast_id', how='left')
+                df = df.merge(categories_grouped, on='podcast_id', how='left')
+
+                # Fill missing values
+                df['primary_category'] = df['primary_category'].fillna('unknown')
+                df['all_categories'] = df['all_categories'].fillna(df['primary_category'])
+
+                # Sort by title
+                df = df.sort_values('title')
+
+            finally:
+                sqlite_conn.close()
+
+        # Save to parquet with fallback
+        save_dataframe_with_fallback(df, output_file, duckdb_conn, "parquet")
+
+        row_count = len(df)
         print(f"Extracted {row_count} podcasts with category information")

     finally:
-        sqlite_conn.close()
         duckdb_conn.close()

 if __name__ == "__main__":
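A quick toy-data illustration of what the pandas fallback's category handling produces (the data below is invented, not from the repository):

```python
import pandas as pd

# Toy rows, mirroring the fallback path above: primary_category is the first
# category alphabetically, all_categories is a '|'-joined sorted list.
categories_df = pd.DataFrame({
    "podcast_id": [1, 1, 2],
    "category": ["tech", "news", "comedy"],
})

all_cats = (categories_df.groupby("podcast_id")["category"]
            .apply(lambda x: "|".join(sorted(x)))
            .reset_index(name="all_categories"))

primary = (categories_df.sort_values("category")
           .groupby("podcast_id").first().reset_index()
           .rename(columns={"category": "primary_category"}))

print(all_cats)   # podcast 1 -> "news|tech", podcast 2 -> "comedy"
print(primary)    # podcast 1 -> "news",      podcast 2 -> "comedy"
```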
@@ -10,12 +10,28 @@ import re
 from duckdb_utils import create_duckdb_connection, execute_query_with_fallback, save_dataframe_with_fallback

 def main():
+    # Write debug info to understand what's being called
+    debug_file = "/tmp/databuild_test/reviews_main_debug.log"
+    try:
+        with open(debug_file, "w") as f:
+            f.write(f"main() called with sys.argv: {sys.argv}\n")
+            f.flush()
+    except:
+        pass
+
     if len(sys.argv) < 2:
         print("Usage: extract_reviews_job.py {config|exec} [args...]", file=sys.stderr)
         sys.exit(1)

     command = sys.argv[1]

+    try:
+        with open(debug_file, "a") as f:
+            f.write(f"command: {command}\n")
+            f.flush()
+    except:
+        pass
+
     if command == "config":
         handle_config(sys.argv[2:])
     elif command == "exec":
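The same best-effort write-to-/tmp debug pattern now appears in all three job scripts; if it is kept beyond debugging, it could be centralized into a tiny helper (a sketch, not part of this commit):

```python
def debug_log(path: str, message: str, mode: str = "a") -> None:
    """Best-effort debug logging: never let logging break the job."""
    try:
        with open(path, mode) as f:
            f.write(message + "\n")
            f.flush()
    except OSError:
        pass

# Hypothetical usage in main():
#   debug_log("/tmp/databuild_test/reviews_main_debug.log",
#             f"main() called with sys.argv: {sys.argv}", mode="w")
```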