Merge branch 'single-target'

# Conflicts:
#	CLAUDE.md
Stuart Axelbrooke 2025-07-02 21:39:49 -07:00
commit f9cacf6491
39 changed files with 4089 additions and 340 deletions

.gitignore

@ -6,3 +6,4 @@ databuild.iml
examples/podcast_reviews/data
.bazelbsp
.aider*
.venv


@ -13,6 +13,8 @@ DataBuild is a bazel-based data build system. Key files:
# Remote testing
./scripts/bb_remote_test_all
# Do not try to `bazel test //examples/basic_graph/...`, as this will not work.
```
## Project Structure
@ -23,4 +25,65 @@ DataBuild is a bazel-based data build system. Key files:
## Key Components
- Graph analysis/execution in Rust
- Bazel rules for job orchestration
- Java/Python examples for different use cases
- Java/Python examples for different use cases
## DataBuild Job Architecture
### Job Target Structure
Each DataBuild job creates three Bazel targets (see the example after this list):
- `job_name.cfg` - Configuration target (calls binary with "config" subcommand)
- `job_name.exec` - Execution target (calls binary with "exec" subcommand)
- `job_name` - Main job target (pipes config output to exec input)
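For example, one of the test jobs in this commit is declared as a single `databuild_job` backed by one unified binary; the `.cfg` and `.exec` targets are generated from it automatically:
```python
load("@databuild//databuild:rules.bzl", "databuild_job")

databuild_job(
    name = "test_job",             # also generates test_job.cfg and test_job.exec
    binary = ":test_job_binary",   # one binary handling both "config" and "exec"
    visibility = ["//visibility:public"],
)

sh_binary(
    name = "test_job_binary",
    srcs = ["unified_job.sh"],
)
```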
### Unified Job Binary Pattern
Jobs use a single binary with subcommands:
```python
def main():
    command = sys.argv[1]  # "config" or "exec"
    if command == "config":
        handle_config(sys.argv[2:])  # Output job configuration JSON
    elif command == "exec":
        handle_exec(sys.argv[2:])  # Perform actual work
```
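Under the hood, the generated wrapper scripts supply the subcommand: per the template and `rules.bzl` changes below, the `.cfg` wrapper sets `EXECUTABLE_SUBCOMMAND="config"` and the `.exec` wrapper sets `EXECUTE_SUBCOMMAND="exec"` before invoking the binary, so the binary itself only has to dispatch on its first argument.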
### Job Configuration Requirements
**CRITICAL**: Job configs must include non-empty `args` for execution to work:
```python
config = {
    "configs": [{
        "outputs": [{"str": partition_ref}],
        "inputs": [...],
        "args": ["some_arg"],  # REQUIRED: Cannot be empty []
        "env": {"PARTITION_REF": partition_ref}
    }]
}
```
Jobs configured with `"args": []` will only have their config step invoked during graph execution; the exec step is never run.
### DataBuild Execution Flow
1. **Planning Phase**: DataBuild calls `.cfg` targets to get job configurations
2. **Execution Phase**: DataBuild calls main job targets, which pipe config output to exec input (see the pipeline sketch below)
3. **Job Resolution**: Job lookup returns base job names (e.g., `//:job_name`), not `.cfg` variants
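The basic_graph test script updated in this commit exercises this flow by hand, piping a job's config output through `jq` into its exec target:
```
generate_number_job.cfg /tmp/databuild_test/examples/basic_graph/generated_number/pippin \
  | jq -c ".configs[0]" \
  | generate_number_job.exec
```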
### Graph Configuration
```python
databuild_graph(
    name = "my_graph",
    jobs = [":job1", ":job2"],  # Reference base job targets
    lookup = ":job_lookup",     # Binary that routes partition refs to jobs
)
```
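The podcast_reviews example added in this commit follows the same shape, wiring six jobs and a Python `job_lookup` binary into one graph:
```python
databuild_graph(
    name = "podcast_reviews_graph",
    jobs = [
        ":extract_reviews_job",
        ":extract_podcasts_job",
        ":categorize_reviews_job",
        ":phrase_modeling_job",
        ":phrase_stats_job",
        ":daily_summary_job",
    ],
    lookup = ":job_lookup",
    visibility = ["//visibility:public"],
)
```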
### Job Lookup Pattern
```python
def lookup_job_for_partition(partition_ref: str) -> str:
    # `pattern` is assumed to be a precompiled regex matching the refs this job can build
    if pattern.match(partition_ref):
        return "//:job_name"  # Return base job target
    raise ValueError(f"No job found for: {partition_ref}")
```
### Common Pitfalls
- **Empty args**: Jobs with `"args": []` won't execute properly
- **Wrong target refs**: Job lookup must return base targets, not `.cfg` variants
- **Missing partition refs**: All outputs must be addressable via partition references
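Putting these pieces together, a minimal unified job in Python might look like the sketch below. This is not taken verbatim from the repository; the partition-ref handling, output payload, and helper names are illustrative assumptions in the spirit of the example jobs in this commit:
```python
#!/usr/bin/env python3
"""Illustrative unified DataBuild job: one binary, dispatched on a config/exec subcommand."""
import json
import os
import sys


def handle_config(args):
    """Emit a JobConfigureResponse ({"configs": [...]}) with non-empty args."""
    configs = []
    for partition_ref in args:
        configs.append({
            "outputs": [{"str": partition_ref}],
            "inputs": [],                      # add {"dep_type": 1, "partition_ref": {"str": "..."}} for MATERIALIZE deps
            "args": ["build", partition_ref],  # must not be empty
            "env": {"PARTITION_REF": partition_ref},
        })
    print(json.dumps({"configs": configs}))


def handle_exec(args):
    """Do the real work; here we just write a placeholder file at the partition ref path."""
    partition_ref = os.environ.get("PARTITION_REF", "")
    if not partition_ref and args:
        partition_ref = args[-1]
    if not partition_ref:
        sys.exit("exec mode requires PARTITION_REF or a partition ref argument")
    os.makedirs(os.path.dirname(partition_ref) or ".", exist_ok=True)
    with open(partition_ref, "w") as f:
        f.write("done")


def main():
    if len(sys.argv) < 2 or sys.argv[1] not in ("config", "exec"):
        print("Usage: unified_job.py {config|exec} [args...]", file=sys.stderr)
        sys.exit(1)
    if sys.argv[1] == "config":
        handle_config(sys.argv[2:])
    else:
        handle_exec(sys.argv[2:])


if __name__ == "__main__":
    main()
```
Pointing a `databuild_job`'s `binary` attribute at a `py_binary` built from a file like this yields the `.cfg`, `.exec`, and main targets described above.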


@ -187,7 +187,7 @@
},
"@@pybind11_bazel+//:python_configure.bzl%extension": {
"general": {
"bzlTransitiveDigest": "d4N/SZrl3ONcmzE98rcV0Fsro0iUbjNQFTIiLiGuH+k=",
"bzlTransitiveDigest": "OMjJ8aOAn337bDg7jdyvF/juIrC2PpUcX6Dnf+nhcF0=",
"usagesDigest": "fycyB39YnXIJkfWCIXLUKJMZzANcuLy9ZE73hRucjFk=",
"recordedFileInputs": {
"@@pybind11_bazel+//MODULE.bazel": "88af1c246226d87e65be78ed49ecd1e6f5e98648558c14ce99176da041dc378e"
@ -221,7 +221,7 @@
},
"@@rules_fuzzing+//fuzzing/private:extensions.bzl%non_module_dependencies": {
"general": {
"bzlTransitiveDigest": "mGiTB79hRNjmeDTQdzkpCHyzXhErMbufeAmySBt7s5s=",
"bzlTransitiveDigest": "lxvzPQyluk241QRYY81nZHOcv5Id/5U2y6dp42qibis=",
"usagesDigest": "wy6ISK6UOcBEjj/mvJ/S3WeXoO67X+1llb9yPyFtPgc=",
"recordedFileInputs": {},
"recordedDirentsInputs": {},
@ -525,7 +525,7 @@
},
"@@rules_python+//python/private/pypi:pip.bzl%pip_internal": {
"general": {
"bzlTransitiveDigest": "sCGUUdVOVATRPlKd1IJea1kfLmtsYsAZdVI5HkdAUQo=",
"bzlTransitiveDigest": "bKQjDjomeeeh547JZoDNozPUkVrO368PlWs0shDGtJU=",
"usagesDigest": "OLoIStnzNObNalKEMRq99FqenhPGLFZ5utVLV4sz7OI=",
"recordedFileInputs": {
"@@rules_python+//tools/publish/requirements_darwin.txt": "2994136eab7e57b083c3de76faf46f70fad130bc8e7360a7fed2b288b69e79dc",
@ -4171,7 +4171,7 @@
},
"@@rules_rust+//crate_universe/private:internal_extensions.bzl%cu_nr": {
"general": {
"bzlTransitiveDigest": "7DC2ciVAva/LfjqxrbJs5WDxaCDqDaPY4HXXZriW120=",
"bzlTransitiveDigest": "mDJ0pT/rBCHMm7FzlOzh9Qng+sXi1kyQXEU8TahWqRc=",
"usagesDigest": "Pr9/2PR9/ujuo94SXikpx+fg31V4bDKobC10YJu+z5I=",
"recordedFileInputs": {},
"recordedDirentsInputs": {},


@ -42,11 +42,12 @@ fn configure(job_label: &str, output_refs: &[String]) -> Result<Vec<Task>, Strin
// Parse the job configurations
let stdout = String::from_utf8_lossy(&output.stdout);
let job_configs: Vec<JobConfig> = serde_json::from_str(&stdout)
let job_configure_response: JobConfigureResponse = serde_json::from_str(&stdout)
.map_err(|e| {
error!("Error parsing job configs for {}: {}. `{}`", job_label, e, stdout);
format!("Failed to parse job configs: {}", e)
})?;
let job_configs = job_configure_response.configs;
// Create tasks
let tasks: Vec<Task> = job_configs.into_iter()
@ -232,11 +233,11 @@ fn plan(output_refs: &[String]) -> Result<JobGraph, String> {
let mut new_unhandled_count = 0;
for task in &new_nodes {
for input in &task.config.inputs {
if input.dep_type == DataDepType::Materialize {
if !unhandled_refs.contains(&input.reference) {
if input.dep_type == 1 { // MATERIALIZE = 1
if !unhandled_refs.contains(&input.partition_ref.str) {
new_unhandled_count += 1;
}
unhandled_refs.insert(input.reference.clone());
unhandled_refs.insert(input.partition_ref.str.clone());
}
}
}
@ -276,18 +277,19 @@ fn generate_mermaid_diagram(graph: &JobGraph) -> String {
// Process each task in the graph
for task in &graph.nodes {
// Create a unique ID for this job+outputs combination
let outputs_key = task.config.outputs.join("_");
let outputs_strs: Vec<String> = task.config.outputs.iter().map(|o| o.str.clone()).collect();
let outputs_key = outputs_strs.join("_");
let mut job_node_id = format!("job_{}", task.job_label.replace("//", "_"));
job_node_id = job_node_id.replace(":", "_");
job_node_id = format!("{}_{}", job_node_id, outputs_key.replace("/", "_"));
job_node_id = job_node_id.replace(":", "_").replace("=", "_").replace("?", "_").replace(" ", "_");
job_node_id = format!("{}_{}", job_node_id, outputs_key.replace("/", "_").replace("=", "_"));
// Create a descriptive label that includes both job label and outputs
let job_label = &task.job_label;
let outputs_label = if !task.config.outputs.is_empty() {
if task.config.outputs.len() == 1 {
format!(" [{}]", task.config.outputs[0])
format!(" [{}]", task.config.outputs[0].str)
} else {
format!(" [{}, ...]", task.config.outputs[0])
format!(" [{}, ...]", task.config.outputs[0].str)
}
} else {
String::new()
@ -307,11 +309,11 @@ fn generate_mermaid_diagram(graph: &JobGraph) -> String {
// Process inputs (dependencies)
for input in &task.config.inputs {
let ref_node_id = format!("ref_{}", input.reference.replace("/", "_"));
let ref_node_id = format!("ref_{}", input.partition_ref.str.replace("/", "_").replace("=", "_"));
// Add the partition ref node if not already added
if !added_refs.contains(&ref_node_id) {
let node_class = if is_output_ref.contains(&input.reference) {
let node_class = if is_output_ref.contains(&input.partition_ref.str) {
"outputPartition"
} else {
"partition"
@ -321,14 +323,14 @@ fn generate_mermaid_diagram(graph: &JobGraph) -> String {
mermaid.push_str(&format!(
" {}[(\"{}\")]:::{}\n",
ref_node_id,
input.reference,
input.partition_ref.str.replace("/", "_").replace("=", "_"),
node_class
));
added_refs.insert(ref_node_id.clone());
}
// Add the edge from input to job
if input.dep_type == DataDepType::Materialize {
if input.dep_type == 1 { // MATERIALIZE = 1
// Solid line for materialize dependencies
mermaid.push_str(&format!(" {} --> {}\n", ref_node_id, job_node_id));
} else {
@ -339,11 +341,11 @@ fn generate_mermaid_diagram(graph: &JobGraph) -> String {
// Process outputs
for output in &task.config.outputs {
let ref_node_id = format!("ref_{}", output.replace("/", "_"));
let ref_node_id = format!("ref_{}", output.str.replace("/", "_").replace("=", "_"));
// Add the partition ref node if not already added
if !added_refs.contains(&ref_node_id) {
let node_class = if is_output_ref.contains(output) {
let node_class = if is_output_ref.contains(&output.str) {
"outputPartition"
} else {
"partition"
@ -353,7 +355,7 @@ fn generate_mermaid_diagram(graph: &JobGraph) -> String {
mermaid.push_str(&format!(
" {}[(\"Partition: {}\")]:::{}\n",
ref_node_id,
output,
output.str,
node_class
));
added_refs.insert(ref_node_id.clone());


@ -1,4 +1,4 @@
use structs::{DataDepType, JobConfig, JobGraph, Task};
use structs::{JobGraph, Task};
use crossbeam_channel::{Receiver, Sender};
use log::{debug, error, info, warn};
use serde::{Deserialize, Serialize};
@ -40,10 +40,10 @@ fn get_task_key(task: &Task) -> String {
key_parts.push(task.job_label.clone());
for input_dep in &task.config.inputs {
key_parts.push(format!("input:{}", input_dep.reference));
key_parts.push(format!("input:{}", input_dep.partition_ref.str));
}
for output_ref in &task.config.outputs {
key_parts.push(format!("output:{}", output_ref));
key_parts.push(format!("output:{}", output_ref.str));
}
key_parts.join("|")
}
@ -242,13 +242,21 @@ fn worker(
}
fn is_task_ready(task: &Task, completed_outputs: &HashSet<String>) -> bool {
let mut missing_deps = Vec::new();
for dep in &task.config.inputs {
if dep.dep_type == DataDepType::Materialize {
if !completed_outputs.contains(&dep.reference) {
return false;
if dep.dep_type == 1 { // MATERIALIZE = 1
if !completed_outputs.contains(&dep.partition_ref.str) {
missing_deps.push(&dep.partition_ref.str);
}
}
}
if !missing_deps.is_empty() {
debug!("Task {} not ready - missing dependencies: {:?}", task.job_label, missing_deps);
return false;
}
true
}
@ -341,7 +349,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
if result.success {
if let Some(original_task) = original_tasks_by_key.get(&result.task_key) {
for output_ref in &original_task.config.outputs {
completed_outputs.insert(output_ref.clone());
completed_outputs.insert(output_ref.str.clone());
}
}
} else {
@ -379,6 +387,34 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
// 4. Periodic logging
if last_log_time.elapsed() >= LOG_INTERVAL {
log_status_summary(&task_states, &original_tasks_by_key);
// Debug: Check for deadlock (pending tasks with no running tasks)
let has_pending = task_states.values().any(|s| *s == TaskState::Pending);
if has_pending && active_tasks_count == 0 {
warn!("Potential deadlock detected: {} pending tasks with no running tasks",
task_states.values().filter(|s| **s == TaskState::Pending).count());
// Log details of pending tasks and their preconditions
for (key, state) in &task_states {
if *state == TaskState::Pending {
if let Some(task) = original_tasks_by_key.get(key) {
warn!("Pending task: {} ({})", task.job_label, key);
warn!(" Required inputs:");
for dep in &task.config.inputs {
if dep.dep_type == 1 { // MATERIALIZE = 1
let available = completed_outputs.contains(&dep.partition_ref.str);
warn!(" {} - {}", dep.partition_ref.str, if available { "AVAILABLE" } else { "MISSING" });
}
}
warn!(" Produces outputs:");
for output in &task.config.outputs {
warn!(" {}", output.str);
}
}
}
}
}
last_log_time = Instant::now();
}


@ -3,6 +3,8 @@ set -e
%{RUNFILES_PREFIX}
%{PREFIX}
EXECUTE_BINARY="$(rlocation "_main/$(basename "%{EXECUTE_PATH}")")"
JQ="$(rlocation "databuild+/databuild/runtime/$(basename "%{JQ_PATH}")")"
@ -44,4 +46,8 @@ while IFS= read -r arg; do
done < <("$JQ" -r '.args[]' "$CONFIG_FILE")
# Run the execution with both environment variables (already set) and arguments
exec "$EXECUTE_BINARY" "${ARGS[@]}"
if [[ -n "${EXECUTE_SUBCOMMAND:-}" ]]; then
exec "$EXECUTE_BINARY" "${EXECUTE_SUBCOMMAND}" "${ARGS[@]}"
else
exec "$EXECUTE_BINARY" "${ARGS[@]}"
fi


@ -30,30 +30,26 @@ source "${RUNFILES_DIR:-/dev/null}/$f" 2>/dev/null || \
def databuild_job(
name,
configure,
execute,
binary,
visibility = None):
"""Creates a DataBuild job target with configuration and execution capabilities.
Args:
name: Name of the job target
configure: Target that implements the configuration logic
execute: Target that implements the execution logic
deps: List of other job_targets this job depends on
binary: Single binary target that handles both config and exec via subcommands
visibility: Visibility specification
**kwargs: Additional attributes to pass to the underlying rule
"""
# Single binary approach - use subcommands
_databuild_job_cfg_rule(
name = name + ".cfg",
configure = configure,
configure = binary,
visibility = visibility,
)
# Create the main rule that serves as a provider for other targets
_databuild_job_exec_rule(
name = name + ".exec",
execute = execute,
execute = binary,
visibility = visibility,
)
@ -76,7 +72,7 @@ def _databuild_job_cfg_impl(ctx):
substitutions = {
"%{EXECUTABLE_PATH}": configure_path,
"%{RUNFILES_PREFIX}": RUNFILES_PREFIX,
"%{PREFIX}": "",
"%{PREFIX}": "EXECUTABLE_SUBCOMMAND=\"config\"\n",
},
is_executable = True,
)
@ -132,6 +128,7 @@ def _databuild_job_exec_impl(ctx):
"%{JQ_PATH}": jq_path,
"%{EXECUTE_PATH}": execute_path,
"%{RUNFILES_PREFIX}": RUNFILES_PREFIX,
"%{PREFIX}": "EXECUTE_SUBCOMMAND=\"exec\"\n",
},
is_executable = True,
)


@ -8,4 +8,8 @@ set -e
EXECUTABLE_BINARY="$(rlocation "_main/$(basename "%{EXECUTABLE_PATH}")")"
# Run the configuration
exec "${EXECUTABLE_BINARY}" "$@"
if [[ -n "${EXECUTABLE_SUBCOMMAND:-}" ]]; then
exec "${EXECUTABLE_BINARY}" "${EXECUTABLE_SUBCOMMAND}" "$@"
else
exec "${EXECUTABLE_BINARY}" "$@"
fi


@ -2,42 +2,53 @@ use std::collections::HashMap;
use serde::{Deserialize, Serialize};
use std::str::FromStr;
// Data structures that mirror the Go implementation
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum DataDepType {
Query,
Materialize,
// Data structures that follow the protobuf specification exactly
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PartitionRef {
#[serde(rename = "str")]
pub str: String,
}
impl FromStr for DataDepType {
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum DepType {
#[serde(rename = "QUERY")]
Query = 0,
#[serde(rename = "MATERIALIZE")]
Materialize = 1,
}
impl FromStr for DepType {
type Err = String;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s.to_lowercase().as_str() {
"query" => Ok(DataDepType::Query),
"materialize" => Ok(DataDepType::Materialize),
_ => Err(format!("Unknown DataDepType: {}", s)),
match s.to_uppercase().as_str() {
"QUERY" => Ok(DepType::Query),
"MATERIALIZE" => Ok(DepType::Materialize),
_ => Err(format!("Unknown DepType: {}", s)),
}
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DataDep {
#[serde(rename = "depType")]
pub dep_type: DataDepType,
#[serde(rename = "ref")]
pub reference: String,
pub dep_type: u32, // Protobuf enums are serialized as integers
pub partition_ref: PartitionRef,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct JobConfig {
pub inputs: Vec<DataDep>,
pub outputs: Vec<String>,
pub outputs: Vec<PartitionRef>,
pub args: Vec<String>,
pub env: HashMap<String, String>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct JobConfigureResponse {
pub configs: Vec<JobConfig>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Task {
#[serde(rename = "jobLabel")]


@ -35,54 +35,34 @@ py_binary(
databuild_job(
name = "generate_number_job",
configure = ":generate_number_configure",
execute = ":generate_number_execute",
binary = ":generate_number_binary",
visibility = ["//visibility:public"],
)
java_binary(
name = "generate_number_configure",
srcs = glob(["*.java"]),
create_executable = True,
main_class = "com.databuild.examples.basic_graph.GenerateConfigure",
name = "generate_number_binary",
srcs = ["UnifiedGenerateNumber.java"],
main_class = "com.databuild.examples.basic_graph.UnifiedGenerateNumber",
deps = [
"@maven//:com_fasterxml_jackson_core_jackson_annotations",
"@maven//:com_fasterxml_jackson_core_jackson_core",
"@maven//:com_fasterxml_jackson_core_jackson_databind",
"@maven//:com_fasterxml_jackson_module_jackson_module_jsonSchema",
],
)
java_binary(
name = "generate_number_execute",
srcs = glob(["GenerateExecute.java"]),
main_class = "com.databuild.examples.basic_graph.GenerateExecute",
)
databuild_job(
name = "sum_job",
configure = ":sum_configure",
execute = ":sum_execute",
binary = ":sum_binary",
visibility = ["//visibility:public"],
)
java_binary(
name = "sum_configure",
srcs = glob(["*.java"]),
main_class = "com.databuild.examples.basic_graph.SumConfigure",
name = "sum_binary",
srcs = ["UnifiedSum.java"],
main_class = "com.databuild.examples.basic_graph.UnifiedSum",
deps = [
"@maven//:com_fasterxml_jackson_core_jackson_annotations",
"@maven//:com_fasterxml_jackson_core_jackson_core",
"@maven//:com_fasterxml_jackson_core_jackson_databind",
"@maven//:com_fasterxml_jackson_module_jackson_module_jsonSchema",
],
)
java_binary(
name = "sum_execute",
srcs = glob([
"SumExecute.java",
"GenerateExecute.java",
]),
main_class = "com.databuild.examples.basic_graph.SumExecute",
)
)


@ -1,46 +0,0 @@
package com.databuild.examples.basic_graph;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.JsonNode;
import java.util.ArrayList;
import java.util.List;
import java.io.File;
import java.util.Arrays;
import java.util.Collections;
/**
* Configure class for generating a random number.
* This class creates a job configuration for generating a random number based on the partition ref.
*/
public class GenerateConfigure {
public static void main(String[] args) {
if (args.length < 1) {
System.err.println("Error: Partition ref is required");
System.exit(1);
}
List<JobConfig> configList = new ArrayList<>();
// Process each partition ref from input arguments
Arrays.stream(args).forEach(partitionRef -> {
// Create and populate JobConfig object
JobConfig config = new JobConfig();
config.outputs = Collections.singletonList(partitionRef);
config.args = Arrays.asList(partitionRef);
// inputs and env are already initialized as empty collections in the constructor
configList.add(config);
});
try {
ObjectMapper mapper = new ObjectMapper();
// Convert config list to JsonNode and serialize
JsonNode configNode = mapper.valueToTree(configList);
String jsonConfig = mapper.writeValueAsString(configNode);
System.out.println(jsonConfig);
} catch (Exception e) {
System.err.println("Error: Failed to validate or serialize config: " + e.getMessage());
System.exit(1);
}
}
}


@ -1,70 +0,0 @@
package com.databuild.examples.basic_graph;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Random;
/**
* Execute class for generating a random number.
* This class generates a random number based on the partition ref.
*/
public class GenerateExecute {
public static String BASE_PATH = "/tmp/databuild_test/examples/basic_graph/";
public static void main(String[] args) {
if (args.length < 1) {
System.err.println("Error: Partition ref (output path) is required");
System.exit(1);
}
String partitionRef = args[0];
try {
// Create a hash of the partition ref to use as a seed
MessageDigest md = MessageDigest.getInstance("MD5");
byte[] hashBytes = md.digest(partitionRef.getBytes(StandardCharsets.UTF_8));
// Convert the first 8 bytes of the hash to a long to use as a seed
long seed = 0;
for (int i = 0; i < Math.min(8, hashBytes.length); i++) {
seed = (seed << 8) | (hashBytes[i] & 0xff);
}
// Create a random number generator with the seed
Random random = new Random(seed);
// Generate a random number
int randomNumber = random.nextInt(100);
// Write the random number to the output file
// Ensure dir exists
java.io.File parent = new java.io.File(partitionRef).getParentFile();
if (parent != null) {
parent.mkdirs();
}
try (FileWriter writer = new FileWriter(partitionRef)) {
writer.write("Random number for partition " + partitionRef + ": " + randomNumber);
}
System.out.println("Generated random number " + randomNumber + " for partition " + partitionRef);
// Write the random number to the output file
String outputPath = partitionRef;
System.out.println("Writing random number " + randomNumber + " to " + outputPath);
// Ensure dir exists
new java.io.File(outputPath).getParentFile().mkdirs();
// Write number (overwrite)
try (FileWriter writer = new FileWriter(outputPath)) {
writer.write(String.valueOf(randomNumber));
}
} catch (NoSuchAlgorithmException | IOException e) {
System.err.println("Error: " + e.getMessage());
e.printStackTrace();
System.exit(1);
}
}
}


@ -1,58 +0,0 @@
package com.databuild.examples.basic_graph;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.JsonNode;
import java.io.File;
import java.util.Arrays;
import java.util.Collections;
import java.util.stream.Collectors;
import static com.databuild.examples.basic_graph.GenerateExecute.BASE_PATH;
/**
* Configure class for generating a random number.
* This class creates a job configuration for generating a random number based on the partition ref.
*/
public class SumConfigure {
public static void main(String[] args) {
if (args.length != 1) {
System.err.println("Error: Must provide exactly one partition ref as an argument");
System.exit(1);
}
String partitionRef = args[0];
String[] pathParts = partitionRef.split("/");
String[] upstreams = Arrays.stream(pathParts[pathParts.length - 1].split("_"))
.map(part -> BASE_PATH + "generated_number/" + part)
.toArray(String[]::new);
// Create and populate JobConfig object
JobConfig config = new JobConfig();
config.outputs = Collections.singletonList(BASE_PATH + "sum/" +partitionRef);
config.inputs = Arrays.stream(upstreams)
.map(upstream -> {
DataDep dep = new DataDep();
dep.depType = "materialize";
dep.ref = upstream;
return dep;
})
.collect(Collectors.toList());
config.args = Arrays.asList(upstreams);
// Create a hashmap for env with {"OUTPUT_REF": "foo"}
config.env = Collections.singletonMap("OUTPUT_REF", args[0]);
// inputs and env are already initialized as empty collections in the constructor
try {
ObjectMapper mapper = new ObjectMapper();
// Convert config to JsonNode and serialize
JsonNode configNode = mapper.valueToTree(Collections.singletonList(config));
String jsonConfig = mapper.writeValueAsString(configNode);
System.out.println(jsonConfig);
} catch (Exception e) {
System.err.println("Error: Failed to validate or serialize config: " + e.getMessage());
System.exit(1);
}
}
}


@ -1,44 +0,0 @@
package com.databuild.examples.basic_graph;
import static com.databuild.examples.basic_graph.GenerateExecute.BASE_PATH;
public class SumExecute {
public static void main(String[] args) {
if (args.length < 1) {
System.err.println("Error: Partition ref (output path) is required");
System.exit(1);
}
// Get output ref from env var OUTPUT_REF
String outputRef = System.getenv("OUTPUT_REF");
// For each arg, load it from the file system and add it to the sum
int sum = 0;
for (String partitionRef : args) {
try {
String path = partitionRef;
int partitionValue = Integer.parseInt(new String(java.nio.file.Files.readAllBytes(java.nio.file.Paths.get(path))));
System.out.println("Summing partition " + partitionRef + " with value " + partitionValue);
sum += partitionValue;
} catch (Exception e) {
System.err.println("Error: Failed to read partition " + partitionRef + ": " + e.getMessage());
e.printStackTrace();
}
}
System.out.println("Sum of " + args.length + " partitions: " + sum);
// Write the sum to the output file
String outPath = outputRef;
System.out.println("Writing sum " + sum + " to " + outPath);
java.io.File parent = new java.io.File(outPath).getParentFile();
if (parent != null) {
parent.mkdirs();
}
try (java.io.FileWriter writer = new java.io.FileWriter(outPath)) {
writer.write(String.valueOf(sum));
} catch (Exception e) {
System.err.println("Error: Failed to write sum to " + outputRef + ": " + e.getMessage());
}
}
}


@ -0,0 +1,117 @@
package com.databuild.examples.basic_graph;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.JsonNode;
import java.util.ArrayList;
import java.util.List;
import java.io.File;
import java.util.Arrays;
import java.util.Collections;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Random;
/**
* Unified job that handles both configuration and execution via subcommands.
*/
public class UnifiedGenerateNumber {
public static String BASE_PATH = "/tmp/databuild_test/examples/basic_graph/";
public static void main(String[] args) {
if (args.length < 1) {
System.err.println("Usage: UnifiedGenerateNumber {config|exec} [args...]");
System.exit(1);
}
String command = args[0];
switch (command) {
case "config":
handleConfig(Arrays.copyOfRange(args, 1, args.length));
break;
case "exec":
handleExec(Arrays.copyOfRange(args, 1, args.length));
break;
default:
System.err.println("Unknown command: " + command);
System.err.println("Usage: UnifiedGenerateNumber {config|exec} [args...]");
System.exit(1);
}
}
private static void handleConfig(String[] args) {
if (args.length < 1) {
System.err.println("Config mode requires partition ref");
System.exit(1);
}
String partitionRef = args[0];
try {
ObjectMapper mapper = new ObjectMapper();
// Create job configuration
var config = mapper.createObjectNode();
// Create outputs as PartitionRef objects
var outputs = mapper.createArrayNode();
var outputPartRef = mapper.createObjectNode();
outputPartRef.put("str", partitionRef);
outputs.add(outputPartRef);
config.set("outputs", outputs);
config.set("inputs", mapper.createArrayNode());
config.set("args", mapper.createArrayNode().add("will").add("generate").add(partitionRef));
config.set("env", mapper.createObjectNode().put("PARTITION_REF", partitionRef));
var response = mapper.createObjectNode();
response.set("configs", mapper.createArrayNode().add(config));
System.out.println(mapper.writeValueAsString(response));
} catch (Exception e) {
System.err.println("Error creating config: " + e.getMessage());
System.exit(1);
}
}
private static void handleExec(String[] args) {
if (args.length < 3) {
System.err.println("Execute mode requires: will generate <partition_ref>");
System.exit(1);
}
String partitionRef = args[2];
try {
// Generate a random number based on the partition ref
MessageDigest md = MessageDigest.getInstance("SHA-256");
byte[] hash = md.digest(partitionRef.getBytes(StandardCharsets.UTF_8));
long seed = 0;
for (int i = 0; i < 8; i++) {
seed = (seed << 8) | (hash[i] & 0xFF);
}
Random random = new Random(seed);
int randomNumber = random.nextInt(100) + 1;
// Write to file - partitionRef is the full path
File outputFile = new File(partitionRef);
File outputDir = outputFile.getParentFile();
if (outputDir != null) {
outputDir.mkdirs();
}
try (FileWriter writer = new FileWriter(outputFile)) {
writer.write(String.valueOf(randomNumber));
}
System.out.println("Generated number " + randomNumber + " for partition " + partitionRef);
} catch (Exception e) {
System.err.println("Error in execution: " + e.getMessage());
System.exit(1);
}
}
}


@ -0,0 +1,137 @@
package com.databuild.examples.basic_graph;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.JsonNode;
import java.io.File;
import java.util.Arrays;
import java.util.Collections;
import java.util.stream.Collectors;
import java.io.FileWriter;
import java.io.IOException;
// import static com.databuild.examples.basic_graph.GenerateExecute.BASE_PATH;
/**
* Unified sum job that handles both configuration and execution via subcommands.
*/
public class UnifiedSum {
public static String BASE_PATH = "/tmp/databuild_test/examples/basic_graph/";
public static void main(String[] args) {
if (args.length < 1) {
System.err.println("Usage: UnifiedSum {config|exec} [args...]");
System.exit(1);
}
String command = args[0];
switch (command) {
case "config":
handleConfig(Arrays.copyOfRange(args, 1, args.length));
break;
case "exec":
handleExec(Arrays.copyOfRange(args, 1, args.length));
break;
default:
System.err.println("Unknown command: " + command);
System.err.println("Usage: UnifiedSum {config|exec} [args...]");
System.exit(1);
}
}
private static void handleConfig(String[] args) {
if (args.length != 1) {
System.err.println("Config mode requires exactly one partition ref");
System.exit(1);
}
String partitionRef = args[0];
String[] pathParts = partitionRef.split("/");
String[] upstreams = Arrays.stream(pathParts[pathParts.length - 1].split("_"))
.map(part -> BASE_PATH + "generated_number/" + part)
.toArray(String[]::new);
try {
ObjectMapper mapper = new ObjectMapper();
// Create data dependencies
var inputs = mapper.createArrayNode();
for (String upstream : upstreams) {
var dataDep = mapper.createObjectNode();
dataDep.put("dep_type", 0); // QUERY
var partRef = mapper.createObjectNode();
partRef.put("str", upstream);
dataDep.set("partition_ref", partRef);
inputs.add(dataDep);
}
// Create job configuration
var config = mapper.createObjectNode();
// Create outputs as PartitionRef objects
var outputs = mapper.createArrayNode();
var outputPartRef = mapper.createObjectNode();
outputPartRef.put("str", partitionRef);
outputs.add(outputPartRef);
config.set("outputs", outputs);
config.set("inputs", inputs);
var argsArray = mapper.createArrayNode();
for (String upstream : upstreams) {
argsArray.add(upstream);
}
config.set("args", argsArray);
config.set("env", mapper.createObjectNode().put("OUTPUT_REF", partitionRef));
var response = mapper.createObjectNode();
response.set("configs", mapper.createArrayNode().add(config));
System.out.println(mapper.writeValueAsString(response));
} catch (Exception e) {
System.err.println("Error creating config: " + e.getMessage());
System.exit(1);
}
}
private static void handleExec(String[] args) {
// Get output ref from env var OUTPUT_REF
String outputRef = System.getenv("OUTPUT_REF");
if (outputRef == null) {
System.err.println("Error: OUTPUT_REF environment variable is required");
System.exit(1);
}
// For each arg, load it from the file system and add it to the sum
int sum = 0;
for (String partitionRef : args) {
try {
String path = partitionRef;
int partitionValue = Integer.parseInt(new String(java.nio.file.Files.readAllBytes(java.nio.file.Paths.get(path))));
System.out.println("Summing partition " + partitionRef + " with value " + partitionValue);
sum += partitionValue;
} catch (Exception e) {
System.err.println("Error: Failed to read partition " + partitionRef + ": " + e.getMessage());
e.printStackTrace();
System.exit(1);
}
}
System.out.println("Sum of " + args.length + " partitions: " + sum);
// Write the sum to the output file
try {
File outputDir = new File(outputRef).getParentFile();
if (outputDir != null) {
outputDir.mkdirs();
}
try (FileWriter writer = new FileWriter(outputRef)) {
writer.write(String.valueOf(sum));
}
System.out.println("Wrote sum " + sum + " to " + outputRef);
} catch (Exception e) {
System.err.println("Error writing output: " + e.getMessage());
System.exit(1);
}
}
}


@ -5,11 +5,11 @@ set -e
generate_number_job.cfg /tmp/databuild_test/examples/basic_graph/generated_number/pippin /tmp/databuild_test/examples/basic_graph/generated_number/salem /tmp/databuild_test/examples/basic_graph/generated_number/sadie
# Test run
generate_number_job.cfg /tmp/databuild_test/examples/basic_graph/generated_number/pippin | jq -c ".[0]" | generate_number_job.exec
generate_number_job.cfg /tmp/databuild_test/examples/basic_graph/generated_number/pippin | jq -c ".configs[0]" | generate_number_job.exec
# Validate that contents of pippin is 83
if [[ "$(cat /tmp/databuild_test/examples/basic_graph/generated_number/pippin)" != "83" ]]; then
echo "Assertion failed: File does not contain 83"
# Validate that contents of pippin is 1 (deterministic based on SHA-256 hash)
if [[ "$(cat /tmp/databuild_test/examples/basic_graph/generated_number/pippin)" != "1" ]]; then
echo "Assertion failed: File does not contain 1"
cat /tmp/databuild_test/examples/basic_graph/generated_number/pippin
exit 1
fi


@ -13,7 +13,7 @@ echo -n 83 > /tmp/databuild_test/examples/basic_graph/generated_number/pippin
echo -n 34 > /tmp/databuild_test/examples/basic_graph/generated_number/salem
echo -n 19 > /tmp/databuild_test/examples/basic_graph/generated_number/sadie
sum_job.cfg /tmp/databuild_test/examples/basic_graph/sum/pippin_salem_sadie | jq -c ".[0]" | sum_job.exec
sum_job.cfg /tmp/databuild_test/examples/basic_graph/sum/pippin_salem_sadie | jq -c ".configs[0]" | sum_job.exec
# Validate that contents of output is 136
if [[ "$(cat /tmp/databuild_test/examples/basic_graph/sum/pippin_salem_sadie)" != "136" ]]; then


@ -2,17 +2,11 @@ load("@databuild//databuild:rules.bzl", "databuild_job")
databuild_job(
name = "test_job",
configure = ":test_job_configure",
execute = ":test_job_execute",
binary = ":test_job_binary",
visibility = ["//visibility:public"],
)
sh_binary(
name = "test_job_configure",
srcs = ["configure.sh"],
)
sh_binary(
name = "test_job_execute",
srcs = ["execute.sh"],
name = "test_job_binary",
srcs = ["unified_job.sh"],
)


@ -1,2 +0,0 @@
# Create a test job config
echo "{\"configs\":[{\"outputs\":[\"$1\"],\"inputs\":[],\"args\":[\"will\", \"build\", \"$1\"],\"env\":{\"foo\":\"bar\"}}]}"


@ -1,3 +0,0 @@
echo 'EXECUTE!'
echo "foo=$foo"
echo "args=$@"


@ -0,0 +1,21 @@
#!/bin/bash
# Simple unified job that handles both config and exec via subcommands
case "${1:-}" in
"config")
# Configuration mode - output job config JSON
partition_ref="${2:-}"
echo "{\"configs\":[{\"outputs\":[{\"str\":\"${partition_ref}\"}],\"inputs\":[],\"args\":[\"will\", \"build\", \"${partition_ref}\"],\"env\":{\"foo\":\"bar\"}}]}"
;;
"exec")
# Execution mode - run the job
echo 'EXECUTE UNIFIED!'
echo "foo=$foo"
echo "args=$@"
;;
*)
echo "Usage: $0 {config|exec} [args...]"
exit 1
;;
esac


@ -1,5 +1,5 @@
load("//:py_repl.bzl", "py_repl")
load("@databuild//databuild:rules.bzl", "databuild_job")
load("@databuild//databuild:rules.bzl", "databuild_job", "databuild_graph")
load("@rules_python//python:pip.bzl", "compile_pip_requirements")
load("@pypi//:requirements.bzl", "requirement")
@ -9,22 +9,174 @@ compile_pip_requirements(
requirements_txt = "requirements_lock.txt",
)
# Podcast Reviews Graph
databuild_graph(
name = "podcast_reviews_graph",
jobs = [
":extract_reviews_job",
":extract_podcasts_job",
":categorize_reviews_job",
":phrase_modeling_job",
":phrase_stats_job",
":daily_summary_job",
],
lookup = ":job_lookup",
visibility = ["//visibility:public"],
)
py_binary(
name = "job_lookup",
srcs = ["job_lookup.py"],
main = "job_lookup.py",
)
# Extract Reviews Job
databuild_job(
name = "extract_reviews_job",
binary = ":extract_reviews_binary",
visibility = ["//visibility:public"],
)
py_binary(
name = "extract_reviews_binary",
srcs = ["extract_reviews_job.py", "duckdb_utils.py"],
main = "extract_reviews_job.py",
deps = [
requirement("duckdb"),
requirement("pydantic"),
requirement("pandas"),
requirement("pyarrow"),
],
)
# Extract Podcasts Job
databuild_job(
name = "extract_podcasts_job",
binary = ":extract_podcasts_binary",
visibility = ["//visibility:public"],
)
py_binary(
name = "extract_podcasts_binary",
srcs = ["extract_podcasts_job.py", "duckdb_utils.py"],
main = "extract_podcasts_job.py",
deps = [
requirement("duckdb"),
requirement("pydantic"),
requirement("pandas"),
requirement("pyarrow"),
],
)
# Categorize Reviews Job
databuild_job(
name = "categorize_reviews_job",
binary = ":categorize_reviews_binary",
visibility = ["//visibility:public"],
)
py_binary(
name = "categorize_reviews_binary",
srcs = ["categorize_reviews_job.py", "duckdb_utils.py"],
main = "categorize_reviews_job.py",
deps = [
requirement("duckdb"),
requirement("pydantic"),
requirement("pandas"),
requirement("pyarrow"),
],
)
# Phrase Modeling Job
databuild_job(
name = "phrase_modeling_job",
binary = ":phrase_modeling_binary",
visibility = ["//visibility:public"],
)
py_binary(
name = "phrase_modeling_binary",
srcs = ["phrase_modeling_job.py", "duckdb_utils.py"],
main = "phrase_modeling_job.py",
deps = [
requirement("duckdb"),
requirement("pydantic"),
requirement("pandas"),
requirement("pyarrow"),
],
)
# Phrase Stats Job
databuild_job(
name = "phrase_stats_job",
binary = ":phrase_stats_binary",
visibility = ["//visibility:public"],
)
py_binary(
name = "phrase_stats_binary",
srcs = ["phrase_stats_job.py", "duckdb_utils.py"],
main = "phrase_stats_job.py",
deps = [
requirement("duckdb"),
requirement("pydantic"),
requirement("pandas"),
requirement("pyarrow"),
],
)
# Daily Summary Job
databuild_job(
name = "daily_summary_job",
binary = ":daily_summary_binary",
visibility = ["//visibility:public"],
)
py_binary(
name = "daily_summary_binary",
srcs = ["daily_summary_job.py", "duckdb_utils.py"],
main = "daily_summary_job.py",
deps = [
requirement("duckdb"),
requirement("pydantic"),
requirement("pandas"),
requirement("pyarrow"),
],
)
# Legacy test job (kept for compatibility)
databuild_job(
name = "test_job",
configure = ":test_job_configure",
execute = ":test_job_execute",
binary = ":test_job_binary",
)
py_binary(
name = "test_job_configure",
srcs = ["configure.py"],
main = "configure.py",
name = "test_job_binary",
srcs = ["unified_job.py"],
main = "unified_job.py",
)
# Test target
py_binary(
name = "test_job_execute",
srcs = ["execute.py"],
main = "execute.py",
name = "test_jobs",
srcs = [
"test_jobs.py",
"extract_reviews_job.py",
"extract_podcasts_job.py",
"categorize_reviews_job.py",
"phrase_modeling_job.py",
"phrase_stats_job.py",
"daily_summary_job.py",
"job_lookup.py",
"duckdb_utils.py",
],
main = "test_jobs.py",
deps = [
requirement("duckdb"),
requirement("pydantic"),
requirement("pandas"),
requirement("pyarrow"),
],
)
py_repl(

File diff suppressed because one or more lines are too long


@ -21,3 +21,7 @@ flowchart LR
## Input Data
Get it from [here](https://www.kaggle.com/datasets/thoughtvector/podcastreviews/versions/28?select=database.sqlite)! (and put it in `examples/podcast_reviews/data/ingest/database.sqlite`)
## `phrase` Dependency
This relies on [`soaxelbrooke/phrase`](https://github.com/soaxelbrooke/phrase) for phrase extraction - check out its [releases](https://github.com/soaxelbrooke/phrase/releases) to get a relevant binary.


@ -0,0 +1,202 @@
#!/usr/bin/env python3
import sys
import json
import os
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Any
import re
from duckdb_utils import create_duckdb_connection, read_dataframe_with_fallback, save_dataframe_with_fallback
def main():
if len(sys.argv) < 2:
print("Usage: categorize_reviews_job.py {config|exec} [args...]", file=sys.stderr)
sys.exit(1)
command = sys.argv[1]
if command == "config":
handle_config(sys.argv[2:])
elif command == "exec":
handle_exec(sys.argv[2:])
else:
print(f"Unknown command: {command}", file=sys.stderr)
sys.exit(1)
def parse_partition_ref(partition_ref: str) -> Dict[str, str]:
"""Parse partition ref like 'categorized_reviews/category=comedy/date=2020-01-01' into components."""
match = re.match(r'categorized_reviews/category=([^/]+)/date=(\d{4}-\d{2}-\d{2})', partition_ref)
if not match:
raise ValueError(f"Invalid partition ref format: {partition_ref}")
return {"category": match.group(1), "date": match.group(2)}
def handle_config(args):
if len(args) < 1:
print("Config mode requires partition ref", file=sys.stderr)
sys.exit(1)
configs = []
# Process each partition reference
for partition_ref in args:
try:
parsed = parse_partition_ref(partition_ref)
category = parsed["category"]
date_str = parsed["date"]
except ValueError as e:
print(f"Error parsing partition ref: {e}", file=sys.stderr)
sys.exit(1)
# Dependencies: reviews for the date and podcast metadata
reviews_ref = f"reviews/date={date_str}"
podcasts_ref = "podcasts/all"
configs.append({
"outputs": [{"str": partition_ref}],
"inputs": [
{"dep_type": 1, "partition_ref": {"str": reviews_ref}},
{"dep_type": 1, "partition_ref": {"str": podcasts_ref}}
],
"args": [category, date_str],
"env": {
"PARTITION_REF": partition_ref,
"TARGET_CATEGORY": category,
"TARGET_DATE": date_str
}
})
config = {"configs": configs}
print(json.dumps(config))
def handle_exec(args):
if len(args) < 2:
print("Exec mode requires category and date arguments", file=sys.stderr)
sys.exit(1)
target_category = args[0]
target_date = args[1]
partition_ref = os.getenv('PARTITION_REF', f'categorized_reviews/category={target_category}/date={target_date}')
# Input paths - check for both parquet and CSV fallbacks
reviews_base = f"/tmp/databuild_test/examples/podcast_reviews/reviews/date={target_date}/reviews"
podcasts_base = "/tmp/databuild_test/examples/podcast_reviews/podcasts/podcasts"
reviews_file = None
podcasts_file = None
# Find reviews file (parquet or CSV)
for ext in ['.parquet', '.csv']:
candidate = reviews_base + ext
if os.path.exists(candidate):
reviews_file = candidate
break
# Find podcasts file (parquet or CSV)
for ext in ['.parquet', '.csv']:
candidate = podcasts_base + ext
if os.path.exists(candidate):
podcasts_file = candidate
break
if not reviews_file:
print(f"Reviews file not found: {reviews_base}.(parquet|csv)", file=sys.stderr)
sys.exit(1)
if not podcasts_file:
print(f"Podcasts file not found: {podcasts_base}.(parquet|csv)", file=sys.stderr)
sys.exit(1)
# Output path
output_dir = Path(f"/tmp/databuild_test/examples/podcast_reviews/categorized_reviews/category={target_category}/date={target_date}")
output_dir.mkdir(parents=True, exist_ok=True)
output_file = output_dir / "categorized_reviews.parquet"
try:
# Categorize reviews by joining with podcast metadata
categorize_reviews_for_category_date(reviews_file, podcasts_file, target_category, str(output_file))
print(f"Successfully categorized reviews for category {target_category} on {target_date}")
print(f"Output written to: {output_file}")
# Create manifest
manifest = {
"outputs": [{"str": partition_ref}],
"inputs": [
{"str": f"reviews/date={target_date}"},
{"str": "podcasts/all"}
],
"start_time": datetime.now().isoformat(),
"end_time": datetime.now().isoformat(),
"task": {
"job": {"label": "//examples/podcast_reviews:categorize_reviews_job"},
"config": {
"outputs": [{"str": partition_ref}],
"inputs": [
{"dep_type": 1, "partition_ref": {"str": f"reviews/date={target_date}"}},
{"dep_type": 1, "partition_ref": {"str": "podcasts/all"}}
],
"args": [target_category, target_date],
"env": {"PARTITION_REF": partition_ref, "TARGET_CATEGORY": target_category, "TARGET_DATE": target_date}
}
}
}
manifest_file = output_dir / "manifest.json"
with open(manifest_file, 'w') as f:
json.dump(manifest, f, indent=2)
except Exception as e:
print(f"Error categorizing reviews: {e}", file=sys.stderr)
sys.exit(1)
def categorize_reviews_for_category_date(reviews_file: str, podcasts_file: str, target_category: str, output_file: str):
"""Join reviews with podcast categories and filter for target category."""
# Connect to DuckDB with extension handling
duckdb_conn = create_duckdb_connection()
try:
# Read input files with fallback handling
reviews_df = read_dataframe_with_fallback(reviews_file, duckdb_conn)
podcasts_df = read_dataframe_with_fallback(podcasts_file, duckdb_conn)
# Perform join and filtering in pandas
import pandas as pd
# Join reviews with podcasts
joined_df = reviews_df.merge(podcasts_df, on='podcast_id', how='inner')
# Filter by category
filtered_df = joined_df[
(joined_df['primary_category'] == target_category) |
(joined_df['all_categories'].str.contains(target_category, na=False))
].copy()
# Add target category column
filtered_df['target_category'] = target_category
# Select and rename columns to match expected output
result_df = filtered_df[[
'podcast_id', 'review_title', 'content', 'rating', 'author_id',
'created_at', 'review_date', 'title', 'primary_category',
'all_categories', 'target_category'
]].rename(columns={'title': 'podcast_title'})
# Sort by created_at
result_df = result_df.sort_values('created_at')
# Save to parquet with fallback
save_dataframe_with_fallback(result_df, output_file, duckdb_conn, "parquet")
row_count = len(result_df)
print(f"Categorized {row_count} reviews for category '{target_category}'")
if row_count == 0:
print(f"Warning: No reviews found for category '{target_category}'")
finally:
duckdb_conn.close()
if __name__ == "__main__":
main()


@ -1,2 +0,0 @@
print("Hello, gorgeous.")


@ -0,0 +1,313 @@
#!/usr/bin/env python3
import sys
import json
import os
import duckdb
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Any
import re
def main():
if len(sys.argv) < 2:
print("Usage: daily_summary_job.py {config|exec} [args...]", file=sys.stderr)
sys.exit(1)
command = sys.argv[1]
if command == "config":
handle_config(sys.argv[2:])
elif command == "exec":
handle_exec(sys.argv[2:])
else:
print(f"Unknown command: {command}", file=sys.stderr)
sys.exit(1)
def parse_partition_ref(partition_ref: str) -> Dict[str, str]:
"""Parse partition ref like 'daily_summaries/category=comedy/date=2020-01-01' into components."""
match = re.match(r'daily_summaries/category=([^/]+)/date=(\d{4}-\d{2}-\d{2})', partition_ref)
if not match:
raise ValueError(f"Invalid partition ref format: {partition_ref}")
return {"category": match.group(1), "date": match.group(2)}
def handle_config(args):
if len(args) < 1:
print("Config mode requires partition ref", file=sys.stderr)
sys.exit(1)
configs = []
# Process each partition reference
for partition_ref in args:
try:
parsed = parse_partition_ref(partition_ref)
category = parsed["category"]
date_str = parsed["date"]
except ValueError as e:
print(f"Error parsing partition ref: {e}", file=sys.stderr)
sys.exit(1)
# Dependencies: phrase stats and categorized reviews for the category and date
phrase_stats_ref = f"phrase_stats/category={category}/date={date_str}"
categorized_reviews_ref = f"categorized_reviews/category={category}/date={date_str}"
configs.append({
"outputs": [{"str": partition_ref}],
"inputs": [
{"dep_type": 1, "partition_ref": {"str": phrase_stats_ref}},
{"dep_type": 1, "partition_ref": {"str": categorized_reviews_ref}}
],
"args": [category, date_str],
"env": {
"PARTITION_REF": partition_ref,
"TARGET_CATEGORY": category,
"TARGET_DATE": date_str
}
})
config = {"configs": configs}
print(json.dumps(config))
def handle_exec(args):
if len(args) < 2:
print("Exec mode requires category and date arguments", file=sys.stderr)
sys.exit(1)
target_category = args[0]
target_date = args[1]
partition_ref = os.getenv('PARTITION_REF', f'daily_summaries/category={target_category}/date={target_date}')
# Input paths
phrase_stats_file = f"/tmp/databuild_test/examples/podcast_reviews/phrase_stats/category={target_category}/date={target_date}/phrase_stats.parquet"
categorized_reviews_file = f"/tmp/databuild_test/examples/podcast_reviews/categorized_reviews/category={target_category}/date={target_date}/categorized_reviews.parquet"
# Check input files exist
if not os.path.exists(phrase_stats_file):
print(f"Phrase stats file not found: {phrase_stats_file}", file=sys.stderr)
sys.exit(1)
if not os.path.exists(categorized_reviews_file):
print(f"Categorized reviews file not found: {categorized_reviews_file}", file=sys.stderr)
sys.exit(1)
# Output path
output_dir = Path(f"/tmp/databuild_test/examples/podcast_reviews/daily_summaries/category={target_category}/date={target_date}")
output_dir.mkdir(parents=True, exist_ok=True)
output_file = output_dir / "daily_summary.parquet"
try:
# Generate daily summary combining phrase stats and recent reviews
generate_daily_summary_for_category_date(
phrase_stats_file,
categorized_reviews_file,
target_category,
target_date,
str(output_file)
)
print(f"Successfully generated daily summary for category {target_category} on {target_date}")
print(f"Output written to: {output_file}")
# Create manifest
manifest = {
"outputs": [{"str": partition_ref}],
"inputs": [
{"str": f"phrase_stats/category={target_category}/date={target_date}"},
{"str": f"categorized_reviews/category={target_category}/date={target_date}"}
],
"start_time": datetime.now().isoformat(),
"end_time": datetime.now().isoformat(),
"task": {
"job": {"label": "//examples/podcast_reviews:daily_summary_job"},
"config": {
"outputs": [{"str": partition_ref}],
"inputs": [
{"dep_type": 1, "partition_ref": {"str": f"phrase_stats/category={target_category}/date={target_date}"}},
{"dep_type": 1, "partition_ref": {"str": f"categorized_reviews/category={target_category}/date={target_date}"}}
],
"args": [target_category, target_date],
"env": {"PARTITION_REF": partition_ref, "TARGET_CATEGORY": target_category, "TARGET_DATE": target_date}
}
}
}
manifest_file = output_dir / "manifest.json"
with open(manifest_file, 'w') as f:
json.dump(manifest, f, indent=2)
except Exception as e:
print(f"Error generating daily summary: {e}", file=sys.stderr)
sys.exit(1)
def generate_daily_summary_for_category_date(
phrase_stats_file: str,
categorized_reviews_file: str,
target_category: str,
target_date: str,
output_file: str
):
"""Generate daily summary combining top phrases and recent reviews."""
# Connect to DuckDB for processing
duckdb_conn = duckdb.connect()
try:
# Try to install and load parquet extension, but don't fail if it's already installed
try:
duckdb_conn.execute("INSTALL parquet")
except Exception:
pass # Extension might already be installed
duckdb_conn.execute("LOAD parquet")
# Check if we have data
phrase_count = duckdb_conn.execute(f"SELECT COUNT(*) FROM parquet_scan('{phrase_stats_file}')").fetchone()[0]
review_count = duckdb_conn.execute(f"SELECT COUNT(*) FROM parquet_scan('{categorized_reviews_file}')").fetchone()[0]
if phrase_count == 0 and review_count == 0:
print(f"No data found, creating empty daily summary")
create_empty_daily_summary(target_category, target_date, output_file, duckdb_conn)
return
# Query to generate comprehensive daily summary
query = f"""
WITH top_phrases_per_podcast AS (
SELECT
podcast_id,
podcast_title,
ngram,
count as phrase_count,
avg_rating as phrase_avg_rating,
weighted_score,
ROW_NUMBER() OVER (PARTITION BY podcast_id ORDER BY weighted_score DESC) as phrase_rank
FROM parquet_scan('{phrase_stats_file}')
WHERE ngram IS NOT NULL
),
podcast_phrase_summary AS (
SELECT
podcast_id,
podcast_title,
STRING_AGG(ngram, '; ' ORDER BY weighted_score DESC) as top_phrases,
COUNT(*) as total_phrases,
AVG(phrase_avg_rating) as avg_phrase_rating,
SUM(weighted_score) as total_phrase_score
FROM top_phrases_per_podcast
WHERE phrase_rank <= 5 -- Top 5 phrases per podcast
GROUP BY podcast_id, podcast_title
),
podcast_review_summary AS (
SELECT
podcast_id,
podcast_title,
COUNT(*) as review_count,
AVG(rating::FLOAT) as avg_rating,
MIN(rating) as min_rating,
MAX(rating) as max_rating,
COUNT(CASE WHEN rating >= 4 THEN 1 END) as positive_reviews,
COUNT(CASE WHEN rating <= 2 THEN 1 END) as negative_reviews,
STRING_AGG(
CASE WHEN rating <= 2 AND length(content) > 20
THEN substring(content, 1, 200) || '...'
ELSE NULL
END,
' | '
ORDER BY rating ASC, length(content) DESC
) as sample_negative_reviews
FROM parquet_scan('{categorized_reviews_file}')
WHERE podcast_title IS NOT NULL
GROUP BY podcast_id, podcast_title
),
daily_summary AS (
SELECT
'{target_date}' as date,
'{target_category}' as category,
COALESCE(pps.podcast_id, prs.podcast_id) as podcast_id,
COALESCE(pps.podcast_title, prs.podcast_title) as podcast_title,
COALESCE(prs.review_count, 0) as review_count,
COALESCE(prs.avg_rating, 0.0) as avg_rating,
COALESCE(prs.positive_reviews, 0) as positive_reviews,
COALESCE(prs.negative_reviews, 0) as negative_reviews,
COALESCE(pps.top_phrases, 'No significant phrases') as top_phrases,
COALESCE(pps.total_phrases, 0) as total_phrases,
COALESCE(pps.avg_phrase_rating, 0.0) as avg_phrase_rating,
COALESCE(pps.total_phrase_score, 0.0) as total_phrase_score,
prs.sample_negative_reviews,
CASE
WHEN prs.avg_rating >= 4.0 AND pps.avg_phrase_rating >= 4.0 THEN 'Highly Positive'
WHEN prs.avg_rating >= 3.5 THEN 'Positive'
WHEN prs.avg_rating >= 2.5 THEN 'Mixed'
WHEN prs.avg_rating >= 1.5 THEN 'Negative'
ELSE 'Highly Negative'
END as sentiment_category,
(prs.review_count * prs.avg_rating * 0.6 + pps.total_phrase_score * 0.4) as overall_score
FROM podcast_phrase_summary pps
FULL OUTER JOIN podcast_review_summary prs
ON pps.podcast_id = prs.podcast_id
WHERE COALESCE(prs.review_count, 0) > 0 OR COALESCE(pps.total_phrases, 0) > 0
)
SELECT
date,
category,
podcast_id,
podcast_title,
review_count,
avg_rating,
positive_reviews,
negative_reviews,
top_phrases,
total_phrases,
avg_phrase_rating,
total_phrase_score,
sample_negative_reviews,
sentiment_category,
overall_score
FROM daily_summary
ORDER BY overall_score DESC, review_count DESC
"""
# Execute query and save to parquet
duckdb_conn.execute(f"COPY ({query}) TO '{output_file}' (FORMAT PARQUET)")
# Get row count for logging
count_result = duckdb_conn.execute(f"SELECT COUNT(*) FROM ({query})").fetchone()
row_count = count_result[0] if count_result else 0
print(f"Generated daily summary for {row_count} podcasts")
if row_count == 0:
print(f"Warning: No summary data generated for category '{target_category}' on date '{target_date}'")
create_empty_daily_summary(target_category, target_date, output_file, duckdb_conn)
finally:
duckdb_conn.close()
def create_empty_daily_summary(category: str, date: str, output_file: str, duckdb_conn):
"""Create empty daily summary parquet file with correct schema."""
duckdb_conn.execute("DROP TABLE IF EXISTS empty_daily_summary")
duckdb_conn.execute("""
CREATE TABLE empty_daily_summary (
date VARCHAR,
category VARCHAR,
podcast_id VARCHAR,
podcast_title VARCHAR,
review_count BIGINT,
avg_rating DOUBLE,
positive_reviews BIGINT,
negative_reviews BIGINT,
top_phrases VARCHAR,
total_phrases BIGINT,
avg_phrase_rating DOUBLE,
total_phrase_score DOUBLE,
sample_negative_reviews VARCHAR,
sentiment_category VARCHAR,
overall_score DOUBLE
)
""")
duckdb_conn.execute(f"COPY empty_daily_summary TO '{output_file}' (FORMAT PARQUET)")
print("Created empty daily summary file")
if __name__ == "__main__":
main()


@ -0,0 +1,184 @@
#!/usr/bin/env python3
"""
Centralized DuckDB utilities for handling extension issues in isolated environments.
"""
import duckdb
import sqlite3
import pandas as pd
from pathlib import Path
from typing import Any, Dict, List, Optional
import warnings
def create_duckdb_connection(enable_extensions: bool = True) -> duckdb.DuckDBPyConnection:
"""
Create a DuckDB connection with proper extension handling for isolated environments.
Args:
enable_extensions: Whether to try enabling extensions (may fail in isolated environments)
Returns:
DuckDB connection object
"""
conn = duckdb.connect()
if enable_extensions:
with warnings.catch_warnings():
warnings.simplefilter("ignore")
try:
# Try to enable extensions, but don't fail if not available
conn.execute("SET autoinstall_known_extensions=1")
conn.execute("SET autoload_known_extensions=1")
except Exception:
# Extensions not available, will use fallback methods
pass
return conn
def sqlite_to_dataframe(sqlite_path: str, query: str, params: Optional[List[Any]] = None) -> pd.DataFrame:
"""
Execute a SQLite query and return results as a pandas DataFrame.
This is a fallback when DuckDB's sqlite_scan doesn't work.
Args:
sqlite_path: Path to SQLite database
query: SQL query to execute
params: Query parameters
Returns:
DataFrame with query results
"""
conn = sqlite3.connect(sqlite_path)
try:
if params:
df = pd.read_sql_query(query, conn, params=params)
else:
df = pd.read_sql_query(query, conn)
return df
finally:
conn.close()
def execute_query_with_fallback(
duckdb_conn: duckdb.DuckDBPyConnection,
sqlite_path: str,
query: str,
params: Optional[List[Any]] = None,
use_sqlite_scan: bool = True
) -> pd.DataFrame:
"""
Execute a query using DuckDB's sqlite_scan if available, otherwise fall back to direct SQLite access.
Args:
duckdb_conn: DuckDB connection
sqlite_path: Path to SQLite database
query: SQL query (should work with both sqlite_scan and direct SQLite)
params: Query parameters
use_sqlite_scan: Whether to try sqlite_scan first
Returns:
DataFrame with query results
"""
if use_sqlite_scan:
try:
# Try using DuckDB's sqlite_scan
if params:
result = duckdb_conn.execute(query, params).df()
else:
result = duckdb_conn.execute(query).df()
return result
except Exception as e:
print(f"sqlite_scan failed: {e}, falling back to direct SQLite access")
# Fallback: Use direct SQLite access
# Convert DuckDB sqlite_scan query to regular SQLite query
fallback_query = query.replace("sqlite_scan(?, 'reviews')", "reviews")
fallback_query = fallback_query.replace("sqlite_scan(?, 'podcasts')", "podcasts")
fallback_query = fallback_query.replace("sqlite_scan(?, 'categories')", "categories")
# Remove the sqlite_path parameter since we're connecting directly
if params and len(params) > 0 and params[0] == sqlite_path:
fallback_params = params[1:]
else:
fallback_params = params
return sqlite_to_dataframe(sqlite_path, fallback_query, fallback_params)
def save_dataframe_with_fallback(
df: pd.DataFrame,
output_path: str,
duckdb_conn: Optional[duckdb.DuckDBPyConnection] = None,
format: str = "parquet"
) -> None:
"""
Save a DataFrame to the specified format, with fallback options if DuckDB extensions fail.
Args:
df: DataFrame to save
output_path: Output file path
duckdb_conn: Optional DuckDB connection (for parquet)
format: Output format ('parquet' or 'csv')
"""
output_path = Path(output_path)
if format.lower() == "parquet":
try:
if duckdb_conn:
# Try using DuckDB to write parquet
duckdb_conn.register('temp_df', df)
duckdb_conn.execute(f"COPY temp_df TO '{output_path}' (FORMAT PARQUET)")
return
except Exception as e:
print(f"DuckDB parquet write failed: {e}, falling back to pandas")
try:
# Fallback to pandas parquet (requires pyarrow)
df.to_parquet(output_path, index=False)
return
except Exception as e:
print(f"Pandas parquet write failed: {e}, falling back to CSV")
# Change extension to CSV and fall through
output_path = output_path.with_suffix('.csv')
format = "csv"
if format.lower() == "csv":
df.to_csv(output_path, index=False)
def read_dataframe_with_fallback(
file_path: str,
duckdb_conn: Optional[duckdb.DuckDBPyConnection] = None
) -> pd.DataFrame:
"""
Read a DataFrame from file with fallback options.
Args:
file_path: Path to input file
duckdb_conn: Optional DuckDB connection
Returns:
DataFrame with file contents
"""
file_path = Path(file_path)
if file_path.suffix.lower() == '.parquet':
try:
if duckdb_conn:
# Try using DuckDB to read parquet
return duckdb_conn.execute(f"SELECT * FROM parquet_scan('{file_path}')").df()
except Exception:
pass
try:
# Fallback to pandas
return pd.read_parquet(file_path)
except Exception:
# Try CSV fallback
csv_path = file_path.with_suffix('.csv')
if csv_path.exists():
return pd.read_csv(csv_path)
raise
elif file_path.suffix.lower() == '.csv':
return pd.read_csv(file_path)
else:
raise ValueError(f"Unsupported file format: {file_path.suffix}")

View file

@ -1,2 +0,0 @@
print("What a time to be alive.")

View file

@ -0,0 +1,246 @@
#!/usr/bin/env python3
import sys
import json
import os
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Any
from duckdb_utils import create_duckdb_connection, execute_query_with_fallback, save_dataframe_with_fallback
def main():
# Write debug at the very start to see if main is called
debug_file = "/tmp/databuild_test/podcasts_main_debug.log"
try:
with open(debug_file, "w") as f:
f.write(f"main() called with sys.argv: {sys.argv}\n")
f.flush()
    except Exception:
pass
if len(sys.argv) < 2:
print("Usage: extract_podcasts_job.py {config|exec} [args...]", file=sys.stderr)
sys.exit(1)
command = sys.argv[1]
try:
with open(debug_file, "a") as f:
f.write(f"command: {command}\n")
f.flush()
    except Exception:
pass
if command == "config":
handle_config(sys.argv[2:])
elif command == "exec":
handle_exec(sys.argv[2:])
else:
print(f"Unknown command: {command}", file=sys.stderr)
sys.exit(1)
def handle_config(args):
if len(args) < 1:
print("Config mode requires partition ref", file=sys.stderr)
sys.exit(1)
partition_ref = args[0]
# This job produces a single partition with all podcast metadata
if partition_ref != "podcasts/all":
print(f"Invalid partition ref: {partition_ref}. Expected 'podcasts/all'", file=sys.stderr)
sys.exit(1)
config = {
"configs": [{
"outputs": [{"str": partition_ref}],
"inputs": [],
"args": ["all"],
"env": {
"PARTITION_REF": partition_ref
}
}]
}
print(json.dumps(config))
def handle_exec(args):
# Write debug info to a file since stdout might not be captured
debug_file = "/tmp/databuild_test/podcasts_debug.log"
with open(debug_file, "w") as f:
f.write(f"Starting extract_podcasts_job.exec with args: {args}\n")
f.flush()
print(f"Starting extract_podcasts_job.exec with args: {args}")
partition_ref = os.getenv('PARTITION_REF', 'podcasts/all')
print(f"Partition ref: {partition_ref}")
with open(debug_file, "a") as f:
f.write(f"Partition ref: {partition_ref}\n")
f.flush()
# Database paths
db_path = "/tmp/databuild_test/examples/podcast_reviews/data/ingest/database.sqlite"
if not os.path.exists(db_path):
# Fallback to relative path for development
db_path = "data/ingest/database.sqlite"
print(f"Looking for database at: {db_path}")
print(f"Database exists: {os.path.exists(db_path)}")
# Output path
output_dir = Path("/tmp/databuild_test/examples/podcast_reviews/podcasts")
output_dir.mkdir(parents=True, exist_ok=True)
output_file = output_dir / "podcasts.parquet"
print(f"Output directory: {output_dir}")
print(f"Output file: {output_file}")
try:
# Extract all podcasts with their categories
print("Calling extract_podcasts_with_categories...")
result = extract_podcasts_with_categories(db_path, str(output_file))
print(f"extract_podcasts_with_categories returned: {result}")
print(f"Successfully extracted podcast metadata")
print(f"Output written to: {output_file}")
print(f"Output file exists: {output_file.exists()}")
# Create manifest
manifest = {
"outputs": [{"str": partition_ref}],
"inputs": [],
"start_time": datetime.now().isoformat(),
"end_time": datetime.now().isoformat(),
"task": {
"job": {"label": "//examples/podcast_reviews:extract_podcasts_job"},
"config": {
"outputs": [{"str": partition_ref}],
"inputs": [],
"args": [],
"env": {"PARTITION_REF": partition_ref}
}
}
}
manifest_file = output_dir / "manifest.json"
with open(manifest_file, 'w') as f:
json.dump(manifest, f, indent=2)
print(f"Manifest written to: {manifest_file}")
except Exception as e:
print(f"Error extracting podcasts: {e}", file=sys.stderr)
import traceback
traceback.print_exc()
sys.exit(1)
def extract_podcasts_with_categories(db_path: str, output_file: str):
"""Extract all podcasts with their categories and save as parquet."""
print(f"extract_podcasts_with_categories called with db_path={db_path}, output_file={output_file}")
# Connect to DuckDB with extension handling
print("Creating DuckDB connection...")
duckdb_conn = create_duckdb_connection()
print("DuckDB connection created")
try:
# Use a simpler approach that works with SQLite fallback
try:
# Try complex DuckDB query first
query = """
WITH podcast_categories AS (
SELECT
p.podcast_id,
p.itunes_id,
p.slug,
p.itunes_url,
p.title,
c.category,
ROW_NUMBER() OVER (PARTITION BY p.podcast_id ORDER BY c.category) as category_rank
FROM sqlite_scan(?, 'podcasts') p
LEFT JOIN sqlite_scan(?, 'categories') c ON p.podcast_id = c.podcast_id
),
primary_categories AS (
SELECT
podcast_id,
itunes_id,
slug,
itunes_url,
title,
category as primary_category
FROM podcast_categories
WHERE category_rank = 1
),
all_categories AS (
SELECT
podcast_id,
STRING_AGG(category, '|' ORDER BY category) as all_categories
FROM podcast_categories
WHERE category IS NOT NULL
GROUP BY podcast_id
)
SELECT
pc.podcast_id,
pc.itunes_id,
pc.slug,
pc.itunes_url,
pc.title,
pc.primary_category,
COALESCE(ac.all_categories, pc.primary_category) as all_categories
FROM primary_categories pc
LEFT JOIN all_categories ac ON pc.podcast_id = ac.podcast_id
ORDER BY pc.title
"""
df = duckdb_conn.execute(query, [db_path, db_path]).df()
except Exception as e:
print(f"DuckDB complex query failed: {e}, using pandas fallback")
# Fallback: Use pandas to process the data
import pandas as pd
import sqlite3
sqlite_conn = sqlite3.connect(db_path)
try:
# Read podcasts and categories separately
podcasts_df = pd.read_sql_query("SELECT * FROM podcasts", sqlite_conn)
categories_df = pd.read_sql_query("SELECT * FROM categories", sqlite_conn)
# Group categories by podcast_id
categories_grouped = categories_df.groupby('podcast_id')['category'].apply(
lambda x: '|'.join(sorted(x))
).reset_index()
categories_grouped.columns = ['podcast_id', 'all_categories']
# Get primary category (first alphabetically)
primary_categories = categories_df.sort_values('category').groupby('podcast_id').first().reset_index()
primary_categories = primary_categories[['podcast_id', 'category']].rename(columns={'category': 'primary_category'})
# Join everything together
df = podcasts_df.merge(primary_categories, on='podcast_id', how='left')
df = df.merge(categories_grouped, on='podcast_id', how='left')
# Fill missing values
df['primary_category'] = df['primary_category'].fillna('unknown')
df['all_categories'] = df['all_categories'].fillna(df['primary_category'])
# Sort by title
df = df.sort_values('title')
finally:
sqlite_conn.close()
# Save to parquet with fallback
save_dataframe_with_fallback(df, output_file, duckdb_conn, "parquet")
row_count = len(df)
print(f"Extracted {row_count} podcasts with category information")
finally:
duckdb_conn.close()
if __name__ == "__main__":
main()
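To make the pandas fallback in `extract_podcasts_with_categories` easier to follow, here is the same join logic run on two tiny in-memory frames (made-up rows, not data from the example database):

```python
import pandas as pd

podcasts_df = pd.DataFrame({
    "podcast_id": ["p1", "p2"],
    "title": ["Alpha Show", "Beta Cast"],
})
categories_df = pd.DataFrame({
    "podcast_id": ["p1", "p1", "p2"],
    "category": ["news", "comedy", "tech"],
})

# All categories per podcast, pipe-delimited and alphabetically sorted.
categories_grouped = categories_df.groupby("podcast_id")["category"].apply(
    lambda x: "|".join(sorted(x))
).reset_index()
categories_grouped.columns = ["podcast_id", "all_categories"]

# Primary category = alphabetically first category for each podcast.
primary_categories = (
    categories_df.sort_values("category")
    .groupby("podcast_id")
    .first()
    .reset_index()[["podcast_id", "category"]]
    .rename(columns={"category": "primary_category"})
)

df = podcasts_df.merge(primary_categories, on="podcast_id", how="left")
df = df.merge(categories_grouped, on="podcast_id", how="left")
print(df)
# p1 -> primary_category="comedy", all_categories="comedy|news"
# p2 -> primary_category="tech",   all_categories="tech"
```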

View file

@ -0,0 +1,170 @@
#!/usr/bin/env python3
import sys
import json
import os
from datetime import datetime, date
from pathlib import Path
from typing import List, Dict, Any
import re
from duckdb_utils import create_duckdb_connection, execute_query_with_fallback, save_dataframe_with_fallback
def main():
# Write debug info to understand what's being called
debug_file = "/tmp/databuild_test/reviews_main_debug.log"
try:
with open(debug_file, "w") as f:
f.write(f"main() called with sys.argv: {sys.argv}\n")
f.flush()
    except Exception:
pass
if len(sys.argv) < 2:
print("Usage: extract_reviews_job.py {config|exec} [args...]", file=sys.stderr)
sys.exit(1)
command = sys.argv[1]
try:
with open(debug_file, "a") as f:
f.write(f"command: {command}\n")
f.flush()
    except Exception:
pass
if command == "config":
handle_config(sys.argv[2:])
elif command == "exec":
handle_exec(sys.argv[2:])
else:
print(f"Unknown command: {command}", file=sys.stderr)
sys.exit(1)
def parse_partition_ref(partition_ref: str) -> Dict[str, str]:
"""Parse partition ref like 'reviews/date=2020-01-01' into components."""
match = re.match(r'reviews/date=(\d{4}-\d{2}-\d{2})', partition_ref)
if not match:
raise ValueError(f"Invalid partition ref format: {partition_ref}")
return {"date": match.group(1)}
def handle_config(args):
if len(args) < 1:
print("Config mode requires partition ref", file=sys.stderr)
sys.exit(1)
configs = []
# Process each partition reference
for partition_ref in args:
try:
parsed = parse_partition_ref(partition_ref)
date_str = parsed["date"]
except ValueError as e:
print(f"Error parsing partition ref: {e}", file=sys.stderr)
sys.exit(1)
configs.append({
"outputs": [{"str": partition_ref}],
"inputs": [],
"args": [date_str],
"env": {
"PARTITION_REF": partition_ref,
"TARGET_DATE": date_str
}
})
config = {"configs": configs}
print(json.dumps(config))
def handle_exec(args):
if len(args) < 1:
print("Exec mode requires date argument", file=sys.stderr)
sys.exit(1)
target_date = args[0]
partition_ref = os.getenv('PARTITION_REF', f'reviews/date={target_date}')
# Database paths
db_path = "/tmp/databuild_test/examples/podcast_reviews/data/ingest/database.sqlite"
if not os.path.exists(db_path):
# Fallback to relative path for development
db_path = "data/ingest/database.sqlite"
# Output path
output_dir = Path(f"/tmp/databuild_test/examples/podcast_reviews/reviews/date={target_date}")
output_dir.mkdir(parents=True, exist_ok=True)
output_file = output_dir / "reviews.parquet"
try:
# Extract reviews for the target date
extract_reviews_for_date(db_path, target_date, str(output_file))
print(f"Successfully extracted reviews for {target_date}")
print(f"Output written to: {output_file}")
# Create manifest
manifest = {
"outputs": [{"str": partition_ref}],
"inputs": [],
"start_time": datetime.now().isoformat(),
"end_time": datetime.now().isoformat(),
"task": {
"job": {"label": "//examples/podcast_reviews:extract_reviews_job"},
"config": {
"outputs": [{"str": partition_ref}],
"inputs": [],
"args": [target_date],
"env": {"PARTITION_REF": partition_ref, "TARGET_DATE": target_date}
}
}
}
manifest_file = output_dir / "manifest.json"
with open(manifest_file, 'w') as f:
json.dump(manifest, f, indent=2)
except Exception as e:
print(f"Error extracting reviews: {e}", file=sys.stderr)
sys.exit(1)
def extract_reviews_for_date(db_path: str, target_date: str, output_file: str):
"""Extract reviews for a specific date and save as parquet."""
# Connect to DuckDB with extension handling
duckdb_conn = create_duckdb_connection()
try:
# Query reviews for the target date
query = """
SELECT
podcast_id,
title as review_title,
content,
rating,
author_id,
created_at,
DATE(created_at) as review_date
FROM sqlite_scan(?, 'reviews')
WHERE DATE(created_at) = ?
ORDER BY created_at
"""
# Execute query with fallback handling
df = execute_query_with_fallback(
duckdb_conn,
db_path,
query,
[db_path, target_date]
)
# Save to parquet with fallback
save_dataframe_with_fallback(df, output_file, duckdb_conn, "parquet")
row_count = len(df)
print(f"Extracted {row_count} reviews for date {target_date}")
finally:
duckdb_conn.close()
if __name__ == "__main__":
main()
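A hedged usage sketch, mirroring what the tests later in this change do: run the config subcommand as a subprocess and inspect the emitted JSON (assumes the job's dependencies such as duckdb and pandas are installed and the script is invoked from its own directory):

```python
import json
import subprocess
import sys

result = subprocess.run(
    [sys.executable, "extract_reviews_job.py", "config", "reviews/date=2020-01-01"],
    capture_output=True, text=True, check=True,
)
config = json.loads(result.stdout)
# One config per requested partition ref, with args carrying the parsed date.
assert config["configs"][0]["outputs"] == [{"str": "reviews/date=2020-01-01"}]
assert config["configs"][0]["args"] == ["2020-01-01"]
print(json.dumps(config, indent=2))
```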

View file

@ -0,0 +1,60 @@
#!/usr/bin/env python3
import sys
import json
import re
from collections import defaultdict
def main():
if len(sys.argv) < 2:
print("Usage: job_lookup.py partition_ref [partition_ref...]", file=sys.stderr)
sys.exit(1)
partition_refs = sys.argv[1:]
try:
result = defaultdict(list)
for partition_ref in partition_refs:
job_label = lookup_job_for_partition(partition_ref)
result[job_label].append(partition_ref)
# Output in the format expected by DataBuild
print(json.dumps({k: v for k, v in result.items() if v}))
except Exception as e:
print(f"Error in job lookup: {e}", file=sys.stderr)
sys.exit(1)
def lookup_job_for_partition(partition_ref: str) -> str:
"""Determine which job produces a given partition reference."""
# Extract reviews by date: reviews/date=YYYY-MM-DD
if re.match(r'reviews/date=\d{4}-\d{2}-\d{2}', partition_ref):
return "//:extract_reviews_job"
# Extract all podcasts: podcasts/all
if partition_ref == "podcasts/all":
return "//:extract_podcasts_job"
# Categorized reviews: categorized_reviews/category=CATEGORY/date=YYYY-MM-DD
if re.match(r'categorized_reviews/category=[^/]+/date=\d{4}-\d{2}-\d{2}', partition_ref):
return "//:categorize_reviews_job"
# Phrase models: phrase_models/category=CATEGORY/date=YYYY-MM-DD
if re.match(r'phrase_models/category=[^/]+/date=\d{4}-\d{2}-\d{2}', partition_ref):
return "//:phrase_modeling_job"
# Phrase statistics: phrase_stats/category=CATEGORY/date=YYYY-MM-DD
if re.match(r'phrase_stats/category=[^/]+/date=\d{4}-\d{2}-\d{2}', partition_ref):
return "//:phrase_stats_job"
# Daily summaries: daily_summaries/category=CATEGORY/date=YYYY-MM-DD
if re.match(r'daily_summaries/category=[^/]+/date=\d{4}-\d{2}-\d{2}', partition_ref):
return "//:daily_summary_job"
# If no match found, raise an error
raise ValueError(f"No job found for partition reference: {partition_ref}")
if __name__ == "__main__":
main()
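A small sketch of the grouped output this lookup produces when given several partition refs at once; it reuses `lookup_job_for_partition` directly rather than going through the CLI:

```python
import json
from collections import defaultdict

from job_lookup import lookup_job_for_partition

refs = [
    "reviews/date=2020-01-01",
    "reviews/date=2020-01-02",
    "podcasts/all",
]
grouped = defaultdict(list)
for ref in refs:
    grouped[lookup_job_for_partition(ref)].append(ref)

print(json.dumps(dict(grouped)))
# {"//:extract_reviews_job": ["reviews/date=2020-01-01", "reviews/date=2020-01-02"],
#  "//:extract_podcasts_job": ["podcasts/all"]}
```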

View file

@ -0,0 +1,353 @@
#!/usr/bin/env python3
import sys
import json
import os
import duckdb
import subprocess
import tempfile
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Any
import re
def main():
if len(sys.argv) < 2:
print("Usage: phrase_modeling_job.py {config|exec} [args...]", file=sys.stderr)
sys.exit(1)
command = sys.argv[1]
if command == "config":
handle_config(sys.argv[2:])
elif command == "exec":
handle_exec(sys.argv[2:])
else:
print(f"Unknown command: {command}", file=sys.stderr)
sys.exit(1)
def parse_partition_ref(partition_ref: str) -> Dict[str, str]:
"""Parse partition ref like 'phrase_models/category=comedy/date=2020-01-01' into components."""
match = re.match(r'phrase_models/category=([^/]+)/date=(\d{4}-\d{2}-\d{2})', partition_ref)
if not match:
raise ValueError(f"Invalid partition ref format: {partition_ref}")
return {"category": match.group(1), "date": match.group(2)}
def handle_config(args):
if len(args) < 1:
print("Config mode requires partition ref", file=sys.stderr)
sys.exit(1)
configs = []
# Process each partition reference
for partition_ref in args:
try:
parsed = parse_partition_ref(partition_ref)
category = parsed["category"]
date_str = parsed["date"]
except ValueError as e:
print(f"Error parsing partition ref: {e}", file=sys.stderr)
sys.exit(1)
# Dependencies: categorized reviews for the category and date
categorized_reviews_ref = f"categorized_reviews/category={category}/date={date_str}"
configs.append({
"outputs": [{"str": partition_ref}],
"inputs": [
{"dep_type": 1, "partition_ref": {"str": categorized_reviews_ref}}
],
"args": [category, date_str],
"env": {
"PARTITION_REF": partition_ref,
"TARGET_CATEGORY": category,
"TARGET_DATE": date_str
}
})
config = {"configs": configs}
print(json.dumps(config))
def handle_exec(args):
if len(args) < 2:
print("Exec mode requires category and date arguments", file=sys.stderr)
sys.exit(1)
target_category = args[0]
target_date = args[1]
partition_ref = os.getenv('PARTITION_REF', f'phrase_models/category={target_category}/date={target_date}')
# Input path
categorized_reviews_file = f"/tmp/databuild_test/examples/podcast_reviews/categorized_reviews/category={target_category}/date={target_date}/categorized_reviews.parquet"
# Check input file exists
if not os.path.exists(categorized_reviews_file):
print(f"Categorized reviews file not found: {categorized_reviews_file}", file=sys.stderr)
sys.exit(1)
# Output path
output_dir = Path(f"/tmp/databuild_test/examples/podcast_reviews/phrase_models/category={target_category}/date={target_date}")
output_dir.mkdir(parents=True, exist_ok=True)
output_file = output_dir / "phrase_models.parquet"
try:
# Extract phrases using phrase modeling
extract_phrases_for_category_date(categorized_reviews_file, target_category, target_date, str(output_file))
print(f"Successfully extracted phrases for category {target_category} on {target_date}")
print(f"Output written to: {output_file}")
# Create manifest
manifest = {
"outputs": [{"str": partition_ref}],
"inputs": [
{"str": f"categorized_reviews/category={target_category}/date={target_date}"}
],
"start_time": datetime.now().isoformat(),
"end_time": datetime.now().isoformat(),
"task": {
"job": {"label": "//examples/podcast_reviews:phrase_modeling_job"},
"config": {
"outputs": [{"str": partition_ref}],
"inputs": [
{"dep_type": 1, "partition_ref": {"str": f"categorized_reviews/category={target_category}/date={target_date}"}}
],
"args": [target_category, target_date],
"env": {"PARTITION_REF": partition_ref, "TARGET_CATEGORY": target_category, "TARGET_DATE": target_date}
}
}
}
manifest_file = output_dir / "manifest.json"
with open(manifest_file, 'w') as f:
json.dump(manifest, f, indent=2)
except Exception as e:
print(f"Error extracting phrases: {e}", file=sys.stderr)
sys.exit(1)
def extract_phrases_for_category_date(categorized_reviews_file: str, target_category: str, target_date: str, output_file: str):
"""Extract phrases from categorized reviews using phrase binary or simple ngram extraction."""
# Connect to DuckDB for processing
duckdb_conn = duckdb.connect()
try:
# Try to install and load parquet extension, but don't fail if it's already installed
try:
duckdb_conn.execute("INSTALL parquet")
except Exception:
pass # Extension might already be installed
duckdb_conn.execute("LOAD parquet")
# Check if phrase binary is available
phrase_binary = find_phrase_binary()
if phrase_binary:
# Use external phrase binary
phrases = extract_with_phrase_binary(categorized_reviews_file, phrase_binary)
else:
print("Warning: phrase binary not found, using simple ngram extraction")
# Fallback to simple ngram extraction
phrases = extract_simple_ngrams(categorized_reviews_file, duckdb_conn)
# Convert phrases to structured data and save
if phrases:
save_phrases_to_parquet(phrases, target_category, target_date, output_file, duckdb_conn)
else:
# Create empty parquet file with correct schema
create_empty_phrase_parquet(target_category, target_date, output_file, duckdb_conn)
finally:
duckdb_conn.close()
def find_phrase_binary() -> str:
"""Find phrase binary in common locations."""
possible_paths = [
"/usr/local/bin/phrase",
"/usr/bin/phrase",
"./phrase",
"../phrase",
os.path.expanduser("~/bin/phrase")
]
for path in possible_paths:
if os.path.exists(path) and os.access(path, os.X_OK):
return path
return None
def extract_with_phrase_binary(categorized_reviews_file: str, phrase_binary: str) -> List[Dict[str, Any]]:
"""Extract phrases using the external phrase binary."""
# Read review content to temporary file
duckdb_conn = duckdb.connect()
try:
# Try to install and load parquet extension, but don't fail if it's already installed
try:
duckdb_conn.execute("INSTALL parquet")
except Exception:
pass # Extension might already be installed
duckdb_conn.execute("LOAD parquet")
# Extract review content
content_query = f"SELECT content FROM parquet_scan('{categorized_reviews_file}') WHERE content IS NOT NULL AND content != ''"
results = duckdb_conn.execute(content_query).fetchall()
if not results:
return []
# Write content to temporary file
with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as temp_file:
for (content,) in results:
temp_file.write(content.strip() + '\n')
temp_file_path = temp_file.name
try:
# Run phrase binary
cmd = [phrase_binary, "--input", temp_file_path, "--output-format", "json"]
result = subprocess.run(cmd, capture_output=True, text=True, timeout=300)
if result.returncode != 0:
print(f"Phrase binary failed: {result.stderr}", file=sys.stderr)
return []
# Parse JSON output
phrases_data = json.loads(result.stdout) if result.stdout.strip() else []
return phrases_data
finally:
# Clean up temp file
os.unlink(temp_file_path)
finally:
duckdb_conn.close()
def extract_simple_ngrams(categorized_reviews_file: str, duckdb_conn) -> List[Dict[str, Any]]:
"""Simple ngram extraction using SQL as fallback."""
# Simple phrase extraction using SQL
query = f"""
WITH word_tokens AS (
SELECT
unnest(string_split(lower(regexp_replace(content, '[^a-zA-Z0-9\\s]', '', 'g')), ' ')) as word,
podcast_id,
rating
FROM parquet_scan('{categorized_reviews_file}')
WHERE content IS NOT NULL AND content != ''
),
bigrams AS (
SELECT
word || ' ' || lead(word) OVER (PARTITION BY podcast_id ORDER BY rowid) as ngram,
rating
FROM (SELECT *, row_number() OVER () as rowid FROM word_tokens) t
WHERE word IS NOT NULL AND word != ''
),
phrase_stats AS (
SELECT
ngram,
COUNT(*) as frequency,
AVG(rating::FLOAT) as avg_rating,
MIN(rating) as min_rating,
MAX(rating) as max_rating
FROM bigrams
            WHERE ngram IS NOT NULL AND ngram LIKE '% %'  -- keep only true two-word phrases
GROUP BY ngram
HAVING COUNT(*) >= 3 -- Only phrases that appear at least 3 times
)
SELECT
ngram,
frequency,
avg_rating,
min_rating,
max_rating,
CASE
WHEN avg_rating >= 4.0 THEN frequency * avg_rating * 0.8
WHEN avg_rating <= 2.0 THEN frequency * (5.0 - avg_rating) * 0.8
ELSE frequency * 0.3
END as score
FROM phrase_stats
ORDER BY score DESC
LIMIT 1000
"""
try:
results = duckdb_conn.execute(query).fetchall()
phrases = []
for row in results:
ngram, frequency, avg_rating, min_rating, max_rating, score = row
phrases.append({
"ngram": ngram,
"frequency": frequency,
"avg_rating": float(avg_rating) if avg_rating else 0.0,
"min_rating": min_rating,
"max_rating": max_rating,
"score": float(score) if score else 0.0,
"hash": hash(ngram) % (2**31) # Simple hash for compatibility
})
return phrases
except Exception as e:
print(f"Error in simple ngram extraction: {e}", file=sys.stderr)
return []
def save_phrases_to_parquet(phrases: List[Dict[str, Any]], category: str, date: str, output_file: str, duckdb_conn):
"""Save phrases to parquet file."""
if not phrases:
create_empty_phrase_parquet(category, date, output_file, duckdb_conn)
return
# Create temporary table with phrases
duckdb_conn.execute("DROP TABLE IF EXISTS temp_phrases")
duckdb_conn.execute("""
CREATE TABLE temp_phrases (
date VARCHAR,
category VARCHAR,
hash BIGINT,
ngram VARCHAR,
score DOUBLE
)
""")
# Insert phrase data
for phrase in phrases:
duckdb_conn.execute("""
INSERT INTO temp_phrases VALUES (?, ?, ?, ?, ?)
""", [
date,
category,
phrase.get("hash", hash(phrase.get("ngram", "")) % (2**31)),
phrase.get("ngram", ""),
phrase.get("score", 0.0)
])
# Save to parquet
duckdb_conn.execute(f"COPY temp_phrases TO '{output_file}' (FORMAT PARQUET)")
print(f"Saved {len(phrases)} phrases to parquet file")
def create_empty_phrase_parquet(category: str, date: str, output_file: str, duckdb_conn):
"""Create empty parquet file with correct schema."""
duckdb_conn.execute("DROP TABLE IF EXISTS empty_phrases")
duckdb_conn.execute("""
CREATE TABLE empty_phrases (
date VARCHAR,
category VARCHAR,
hash BIGINT,
ngram VARCHAR,
score DOUBLE
)
""")
duckdb_conn.execute(f"COPY empty_phrases TO '{output_file}' (FORMAT PARQUET)")
print("Created empty phrase models file (no phrases extracted)")
if __name__ == "__main__":
main()
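The scoring branch in `extract_simple_ngrams` reduces to a small formula; restated here in plain Python for clarity (a paraphrase of the SQL CASE expression, not separate code in this change):

```python
def phrase_score(frequency: int, avg_rating: float) -> float:
    """Mirror of the CASE expression in extract_simple_ngrams:
    strongly positive or strongly negative phrases are boosted,
    middling ones are damped."""
    if avg_rating >= 4.0:
        return frequency * avg_rating * 0.8
    if avg_rating <= 2.0:
        return frequency * (5.0 - avg_rating) * 0.8
    return frequency * 0.3

# A frequent 4.5-star phrase outranks an equally frequent 3-star one.
print(phrase_score(10, 4.5))  # 36.0
print(phrase_score(10, 1.5))  # 28.0
print(phrase_score(10, 3.0))  # 3.0
```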

View file

@ -0,0 +1,263 @@
#!/usr/bin/env python3
import sys
import json
import os
import duckdb
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Any
import re
def main():
if len(sys.argv) < 2:
print("Usage: phrase_stats_job.py {config|exec} [args...]", file=sys.stderr)
sys.exit(1)
command = sys.argv[1]
if command == "config":
handle_config(sys.argv[2:])
elif command == "exec":
handle_exec(sys.argv[2:])
else:
print(f"Unknown command: {command}", file=sys.stderr)
sys.exit(1)
def parse_partition_ref(partition_ref: str) -> Dict[str, str]:
"""Parse partition ref like 'phrase_stats/category=comedy/date=2020-01-01' into components."""
match = re.match(r'phrase_stats/category=([^/]+)/date=(\d{4}-\d{2}-\d{2})', partition_ref)
if not match:
raise ValueError(f"Invalid partition ref format: {partition_ref}")
return {"category": match.group(1), "date": match.group(2)}
def handle_config(args):
if len(args) < 1:
print("Config mode requires partition ref", file=sys.stderr)
sys.exit(1)
configs = []
# Process each partition reference
for partition_ref in args:
try:
parsed = parse_partition_ref(partition_ref)
category = parsed["category"]
date_str = parsed["date"]
except ValueError as e:
print(f"Error parsing partition ref: {e}", file=sys.stderr)
sys.exit(1)
# Dependencies: phrase models and categorized reviews for the category and date
phrase_models_ref = f"phrase_models/category={category}/date={date_str}"
categorized_reviews_ref = f"categorized_reviews/category={category}/date={date_str}"
configs.append({
"outputs": [{"str": partition_ref}],
"inputs": [
{"dep_type": 1, "partition_ref": {"str": phrase_models_ref}},
{"dep_type": 1, "partition_ref": {"str": categorized_reviews_ref}}
],
"args": [category, date_str],
"env": {
"PARTITION_REF": partition_ref,
"TARGET_CATEGORY": category,
"TARGET_DATE": date_str
}
})
config = {"configs": configs}
print(json.dumps(config))
def handle_exec(args):
if len(args) < 2:
print("Exec mode requires category and date arguments", file=sys.stderr)
sys.exit(1)
target_category = args[0]
target_date = args[1]
partition_ref = os.getenv('PARTITION_REF', f'phrase_stats/category={target_category}/date={target_date}')
# Input paths
phrase_models_file = f"/tmp/databuild_test/examples/podcast_reviews/phrase_models/category={target_category}/date={target_date}/phrase_models.parquet"
categorized_reviews_file = f"/tmp/databuild_test/examples/podcast_reviews/categorized_reviews/category={target_category}/date={target_date}/categorized_reviews.parquet"
# Check input files exist
if not os.path.exists(phrase_models_file):
print(f"Phrase models file not found: {phrase_models_file}", file=sys.stderr)
sys.exit(1)
if not os.path.exists(categorized_reviews_file):
print(f"Categorized reviews file not found: {categorized_reviews_file}", file=sys.stderr)
sys.exit(1)
# Output path
output_dir = Path(f"/tmp/databuild_test/examples/podcast_reviews/phrase_stats/category={target_category}/date={target_date}")
output_dir.mkdir(parents=True, exist_ok=True)
output_file = output_dir / "phrase_stats.parquet"
try:
# Calculate phrase statistics per podcast
calculate_phrase_stats_for_category_date(
phrase_models_file,
categorized_reviews_file,
target_category,
target_date,
str(output_file)
)
print(f"Successfully calculated phrase stats for category {target_category} on {target_date}")
print(f"Output written to: {output_file}")
# Create manifest
manifest = {
"outputs": [{"str": partition_ref}],
"inputs": [
{"str": f"phrase_models/category={target_category}/date={target_date}"},
{"str": f"categorized_reviews/category={target_category}/date={target_date}"}
],
"start_time": datetime.now().isoformat(),
"end_time": datetime.now().isoformat(),
"task": {
"job": {"label": "//examples/podcast_reviews:phrase_stats_job"},
"config": {
"outputs": [{"str": partition_ref}],
"inputs": [
{"dep_type": 1, "partition_ref": {"str": f"phrase_models/category={target_category}/date={target_date}"}},
{"dep_type": 1, "partition_ref": {"str": f"categorized_reviews/category={target_category}/date={target_date}"}}
],
"args": [target_category, target_date],
"env": {"PARTITION_REF": partition_ref, "TARGET_CATEGORY": target_category, "TARGET_DATE": target_date}
}
}
}
manifest_file = output_dir / "manifest.json"
with open(manifest_file, 'w') as f:
json.dump(manifest, f, indent=2)
except Exception as e:
print(f"Error calculating phrase stats: {e}", file=sys.stderr)
sys.exit(1)
def calculate_phrase_stats_for_category_date(
phrase_models_file: str,
categorized_reviews_file: str,
target_category: str,
target_date: str,
output_file: str
):
"""Calculate phrase statistics per podcast by joining phrase models with reviews."""
# Connect to DuckDB for processing
duckdb_conn = duckdb.connect()
try:
# Try to install and load parquet extension, but don't fail if it's already installed
try:
duckdb_conn.execute("INSTALL parquet")
except Exception:
pass # Extension might already be installed
duckdb_conn.execute("LOAD parquet")
# Check if we have phrase models
phrase_count = duckdb_conn.execute(f"SELECT COUNT(*) FROM parquet_scan('{phrase_models_file}')").fetchone()[0]
if phrase_count == 0:
print(f"No phrase models found, creating empty phrase stats")
create_empty_phrase_stats(target_category, target_date, output_file, duckdb_conn)
return
# Query to calculate phrase statistics per podcast
query = f"""
WITH phrase_matches AS (
SELECT
r.podcast_id,
r.podcast_title,
r.rating,
r.content,
p.ngram,
p.score as phrase_score
FROM parquet_scan('{categorized_reviews_file}') r
JOIN parquet_scan('{phrase_models_file}') p
ON lower(r.content) LIKE '%' || lower(p.ngram) || '%'
WHERE r.content IS NOT NULL
AND r.content != ''
AND p.ngram IS NOT NULL
AND p.ngram != ''
),
podcast_phrase_stats AS (
SELECT
'{target_date}' as date,
'{target_category}' as category,
podcast_id,
podcast_title,
ngram,
COUNT(*) as count,
AVG(rating::FLOAT) as avg_rating,
MIN(rating) as min_rating,
MAX(rating) as max_rating,
AVG(phrase_score) as avg_phrase_score,
COUNT(*) * AVG(phrase_score) * AVG(rating::FLOAT) / 5.0 as weighted_score
FROM phrase_matches
GROUP BY podcast_id, podcast_title, ngram
HAVING COUNT(*) >= 2 -- Only include phrases that appear at least twice per podcast
)
SELECT
date,
category,
podcast_id,
podcast_title,
ngram,
count,
avg_rating,
min_rating,
max_rating,
avg_phrase_score,
weighted_score
FROM podcast_phrase_stats
ORDER BY weighted_score DESC, count DESC
"""
# Execute query and save to parquet
duckdb_conn.execute(f"COPY ({query}) TO '{output_file}' (FORMAT PARQUET)")
# Get row count for logging
count_result = duckdb_conn.execute(f"SELECT COUNT(*) FROM ({query})").fetchone()
row_count = count_result[0] if count_result else 0
print(f"Calculated phrase statistics for {row_count} podcast-phrase combinations")
if row_count == 0:
print(f"Warning: No phrase matches found for category '{target_category}' on date '{target_date}'")
create_empty_phrase_stats(target_category, target_date, output_file, duckdb_conn)
finally:
duckdb_conn.close()
def create_empty_phrase_stats(category: str, date: str, output_file: str, duckdb_conn):
"""Create empty phrase stats parquet file with correct schema."""
duckdb_conn.execute("DROP TABLE IF EXISTS empty_phrase_stats")
duckdb_conn.execute("""
CREATE TABLE empty_phrase_stats (
date VARCHAR,
category VARCHAR,
podcast_id VARCHAR,
podcast_title VARCHAR,
ngram VARCHAR,
count BIGINT,
avg_rating DOUBLE,
min_rating INTEGER,
max_rating INTEGER,
avg_phrase_score DOUBLE,
weighted_score DOUBLE
)
""")
duckdb_conn.execute(f"COPY empty_phrase_stats TO '{output_file}' (FORMAT PARQUET)")
print("Created empty phrase stats file")
if __name__ == "__main__":
main()
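Likewise, the `weighted_score` used to rank podcast-phrase pairs above is a simple product; a plain-Python restatement of the SQL expression:

```python
def weighted_score(count: int, avg_phrase_score: float, avg_rating: float) -> float:
    """count * avg_phrase_score * avg_rating / 5.0, as in podcast_phrase_stats."""
    return count * avg_phrase_score * avg_rating / 5.0

# A phrase matched 4 times with phrase score 36.0 and average rating 4.5:
print(weighted_score(4, 36.0, 4.5))  # 129.6
```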

View file

@ -1,2 +1,4 @@
duckdb==1.2.2
pydantic==2.11.3
pydantic==2.11.3
pandas>=2.0.0
pyarrow>=10.0.0

View file

@ -61,6 +61,160 @@ duckdb==1.2.2 \
--hash=sha256:fb9a2c77236fae079185a990434cb9d8432902488ba990235c702fc2692d2dcd \
--hash=sha256:fd9c434127fd1575694e1cf19a393bed301f5d6e80b4bcdae80caa368a61a678
# via -r requirements.in
numpy==2.3.1 \
--hash=sha256:0025048b3c1557a20bc80d06fdeb8cc7fc193721484cca82b2cfa072fec71a93 \
--hash=sha256:010ce9b4f00d5c036053ca684c77441f2f2c934fd23bee058b4d6f196efd8280 \
--hash=sha256:0bb3a4a61e1d327e035275d2a993c96fa786e4913aa089843e6a2d9dd205c66a \
--hash=sha256:0c4d9e0a8368db90f93bd192bfa771ace63137c3488d198ee21dfb8e7771916e \
--hash=sha256:15aa4c392ac396e2ad3d0a2680c0f0dee420f9fed14eef09bdb9450ee6dcb7b7 \
--hash=sha256:18703df6c4a4fee55fd3d6e5a253d01c5d33a295409b03fda0c86b3ca2ff41a1 \
--hash=sha256:1ec9ae20a4226da374362cca3c62cd753faf2f951440b0e3b98e93c235441d2b \
--hash=sha256:23ab05b2d241f76cb883ce8b9a93a680752fbfcbd51c50eff0b88b979e471d8c \
--hash=sha256:25a1992b0a3fdcdaec9f552ef10d8103186f5397ab45e2d25f8ac51b1a6b97e8 \
--hash=sha256:2959d8f268f3d8ee402b04a9ec4bb7604555aeacf78b360dc4ec27f1d508177d \
--hash=sha256:2a809637460e88a113e186e87f228d74ae2852a2e0c44de275263376f17b5bdc \
--hash=sha256:2fb86b7e58f9ac50e1e9dd1290154107e47d1eef23a0ae9145ded06ea606f992 \
--hash=sha256:36890eb9e9d2081137bd78d29050ba63b8dab95dff7912eadf1185e80074b2a0 \
--hash=sha256:39bff12c076812595c3a306f22bfe49919c5513aa1e0e70fac756a0be7c2a2b8 \
--hash=sha256:467db865b392168ceb1ef1ffa6f5a86e62468c43e0cfb4ab6da667ede10e58db \
--hash=sha256:4e602e1b8682c2b833af89ba641ad4176053aaa50f5cacda1a27004352dde943 \
--hash=sha256:5902660491bd7a48b2ec16c23ccb9124b8abfd9583c5fdfa123fe6b421e03de1 \
--hash=sha256:5ccb7336eaf0e77c1635b232c141846493a588ec9ea777a7c24d7166bb8533ae \
--hash=sha256:5f1b8f26d1086835f442286c1d9b64bb3974b0b1e41bb105358fd07d20872952 \
--hash=sha256:6269b9edfe32912584ec496d91b00b6d34282ca1d07eb10e82dfc780907d6c2e \
--hash=sha256:6ea9e48336a402551f52cd8f593343699003d2353daa4b72ce8d34f66b722070 \
--hash=sha256:762e0c0c6b56bdedfef9a8e1d4538556438288c4276901ea008ae44091954e29 \
--hash=sha256:7be91b2239af2658653c5bb6f1b8bccafaf08226a258caf78ce44710a0160d30 \
--hash=sha256:7dea630156d39b02a63c18f508f85010230409db5b2927ba59c8ba4ab3e8272e \
--hash=sha256:867ef172a0976aaa1f1d1b63cf2090de8b636a7674607d514505fb7276ab08fc \
--hash=sha256:8d5ee6eec45f08ce507a6570e06f2f879b374a552087a4179ea7838edbcbfa42 \
--hash=sha256:8e333040d069eba1652fb08962ec5b76af7f2c7bce1df7e1418c8055cf776f25 \
--hash=sha256:a5ee121b60aa509679b682819c602579e1df14a5b07fe95671c8849aad8f2115 \
--hash=sha256:a780033466159c2270531e2b8ac063704592a0bc62ec4a1b991c7c40705eb0e8 \
--hash=sha256:a894f3816eb17b29e4783e5873f92faf55b710c2519e5c351767c51f79d8526d \
--hash=sha256:a8b740f5579ae4585831b3cf0e3b0425c667274f82a484866d2adf9570539369 \
--hash=sha256:ad506d4b09e684394c42c966ec1527f6ebc25da7f4da4b1b056606ffe446b8a3 \
--hash=sha256:afed2ce4a84f6b0fc6c1ce734ff368cbf5a5e24e8954a338f3bdffa0718adffb \
--hash=sha256:b0b5397374f32ec0649dd98c652a1798192042e715df918c20672c62fb52d4b8 \
--hash=sha256:bada6058dd886061f10ea15f230ccf7dfff40572e99fef440a4a857c8728c9c0 \
--hash=sha256:c4913079974eeb5c16ccfd2b1f09354b8fed7e0d6f2cab933104a09a6419b1ee \
--hash=sha256:c5bdf2015ccfcee8253fb8be695516ac4457c743473a43290fd36eba6a1777eb \
--hash=sha256:c6e0bf9d1a2f50d2b65a7cf56db37c095af17b59f6c132396f7c6d5dd76484df \
--hash=sha256:ce2ce9e5de4703a673e705183f64fd5da5bf36e7beddcb63a25ee2286e71ca48 \
--hash=sha256:cfecc7822543abdea6de08758091da655ea2210b8ffa1faf116b940693d3df76 \
--hash=sha256:d4580adadc53311b163444f877e0789f1c8861e2698f6b2a4ca852fda154f3ff \
--hash=sha256:d70f20df7f08b90a2062c1f07737dd340adccf2068d0f1b9b3d56e2038979fee \
--hash=sha256:e344eb79dab01f1e838ebb67aab09965fb271d6da6b00adda26328ac27d4a66e \
--hash=sha256:e610832418a2bc09d974cc9fecebfa51e9532d6190223bc5ef6a7402ebf3b5cb \
--hash=sha256:e772dda20a6002ef7061713dc1e2585bc1b534e7909b2030b5a46dae8ff077ab \
--hash=sha256:e7cbf5a5eafd8d230a3ce356d892512185230e4781a361229bd902ff403bc660 \
--hash=sha256:eabd7e8740d494ce2b4ea0ff05afa1b7b291e978c0ae075487c51e8bd93c0c68 \
--hash=sha256:ebb8603d45bc86bbd5edb0d63e52c5fd9e7945d3a503b77e486bd88dde67a19b \
--hash=sha256:ec0bdafa906f95adc9a0c6f26a4871fa753f25caaa0e032578a30457bff0af6a \
--hash=sha256:eccb9a159db9aed60800187bc47a6d3451553f0e1b08b068d8b277ddfbb9b244 \
--hash=sha256:ee8340cb48c9b7a5899d1149eece41ca535513a9698098edbade2a8e7a84da77
# via pandas
pandas==2.3.0 \
--hash=sha256:034abd6f3db8b9880aaee98f4f5d4dbec7c4829938463ec046517220b2f8574e \
--hash=sha256:094e271a15b579650ebf4c5155c05dcd2a14fd4fdd72cf4854b2f7ad31ea30be \
--hash=sha256:14a0cc77b0f089d2d2ffe3007db58f170dae9b9f54e569b299db871a3ab5bf46 \
--hash=sha256:1a881bc1309f3fce34696d07b00f13335c41f5f5a8770a33b09ebe23261cfc67 \
--hash=sha256:1d2b33e68d0ce64e26a4acc2e72d747292084f4e8db4c847c6f5f6cbe56ed6d8 \
--hash=sha256:213cd63c43263dbb522c1f8a7c9d072e25900f6975596f883f4bebd77295d4f3 \
--hash=sha256:23c2b2dc5213810208ca0b80b8666670eb4660bbfd9d45f58592cc4ddcfd62e1 \
--hash=sha256:2c7e2fc25f89a49a11599ec1e76821322439d90820108309bf42130d2f36c983 \
--hash=sha256:2eb4728a18dcd2908c7fccf74a982e241b467d178724545a48d0caf534b38ebf \
--hash=sha256:34600ab34ebf1131a7613a260a61dbe8b62c188ec0ea4c296da7c9a06b004133 \
--hash=sha256:39ff73ec07be5e90330cc6ff5705c651ace83374189dcdcb46e6ff54b4a72cd6 \
--hash=sha256:404d681c698e3c8a40a61d0cd9412cc7364ab9a9cc6e144ae2992e11a2e77a20 \
--hash=sha256:40cecc4ea5abd2921682b57532baea5588cc5f80f0231c624056b146887274d2 \
--hash=sha256:430a63bae10b5086995db1b02694996336e5a8ac9a96b4200572b413dfdfccb9 \
--hash=sha256:4930255e28ff5545e2ca404637bcc56f031893142773b3468dc021c6c32a1390 \
--hash=sha256:6021910b086b3ca756755e86ddc64e0ddafd5e58e076c72cb1585162e5ad259b \
--hash=sha256:625466edd01d43b75b1883a64d859168e4556261a5035b32f9d743b67ef44634 \
--hash=sha256:75651c14fde635e680496148a8526b328e09fe0572d9ae9b638648c46a544ba3 \
--hash=sha256:84141f722d45d0c2a89544dd29d35b3abfc13d2250ed7e68394eda7564bd6324 \
--hash=sha256:8adff9f138fc614347ff33812046787f7d43b3cef7c0f0171b3340cae333f6ca \
--hash=sha256:951805d146922aed8357e4cc5671b8b0b9be1027f0619cea132a9f3f65f2f09c \
--hash=sha256:9efc0acbbffb5236fbdf0409c04edce96bec4bdaa649d49985427bd1ec73e085 \
--hash=sha256:9ff730713d4c4f2f1c860e36c005c7cefc1c7c80c21c0688fd605aa43c9fcf09 \
--hash=sha256:a6872d695c896f00df46b71648eea332279ef4077a409e2fe94220208b6bb675 \
--hash=sha256:b198687ca9c8529662213538a9bb1e60fa0bf0f6af89292eb68fea28743fcd5a \
--hash=sha256:b9d8c3187be7479ea5c3d30c32a5d73d62a621166675063b2edd21bc47614027 \
--hash=sha256:ba24af48643b12ffe49b27065d3babd52702d95ab70f50e1b34f71ca703e2c0d \
--hash=sha256:bb32dc743b52467d488e7a7c8039b821da2826a9ba4f85b89ea95274f863280f \
--hash=sha256:bb3be958022198531eb7ec2008cfc78c5b1eed51af8600c6c5d9160d89d8d249 \
--hash=sha256:bf5be867a0541a9fb47a4be0c5790a4bccd5b77b92f0a59eeec9375fafc2aa14 \
--hash=sha256:c06f6f144ad0a1bf84699aeea7eff6068ca5c63ceb404798198af7eb86082e33 \
--hash=sha256:c6da97aeb6a6d233fb6b17986234cc723b396b50a3c6804776351994f2a658fd \
--hash=sha256:e0f51973ba93a9f97185049326d75b942b9aeb472bec616a129806facb129ebb \
--hash=sha256:e1991bbb96f4050b09b5f811253c4f3cf05ee89a589379aa36cd623f21a31d6f \
--hash=sha256:e5f08eb9a445d07720776df6e641975665c9ea12c9d8a331e0f6890f2dcd76ef \
--hash=sha256:e78ad363ddb873a631e92a3c063ade1ecfb34cae71e9a2be6ad100f875ac1042 \
--hash=sha256:ed16339bc354a73e0a609df36d256672c7d296f3f767ac07257801aa064ff73c \
--hash=sha256:f4dd97c19bd06bc557ad787a15b6489d2614ddaab5d104a0310eb314c724b2d2 \
--hash=sha256:f925f1ef673b4bd0271b1809b72b3270384f2b7d9d14a189b12b7fc02574d575 \
--hash=sha256:f95a2aef32614ed86216d3c450ab12a4e82084e8102e355707a1d96e33d51c34 \
--hash=sha256:fa07e138b3f6c04addfeaf56cc7fdb96c3b68a3fe5e5401251f231fce40a0d7a \
--hash=sha256:fa35c266c8cd1a67d75971a1912b185b492d257092bdd2709bbdebe574ed228d
# via -r requirements.in
pyarrow==20.0.0 \
--hash=sha256:00138f79ee1b5aca81e2bdedb91e3739b987245e11fa3c826f9e57c5d102fb75 \
--hash=sha256:11529a2283cb1f6271d7c23e4a8f9f8b7fd173f7360776b668e509d712a02eec \
--hash=sha256:15aa1b3b2587e74328a730457068dc6c89e6dcbf438d4369f572af9d320a25ee \
--hash=sha256:1bcbe471ef3349be7714261dea28fe280db574f9d0f77eeccc195a2d161fd861 \
--hash=sha256:204a846dca751428991346976b914d6d2a82ae5b8316a6ed99789ebf976551e6 \
--hash=sha256:211d5e84cecc640c7a3ab900f930aaff5cd2702177e0d562d426fb7c4f737781 \
--hash=sha256:24ca380585444cb2a31324c546a9a56abbe87e26069189e14bdba19c86c049f0 \
--hash=sha256:2c3a01f313ffe27ac4126f4c2e5ea0f36a5fc6ab51f8726cf41fee4b256680bd \
--hash=sha256:30b3051b7975801c1e1d387e17c588d8ab05ced9b1e14eec57915f79869b5031 \
--hash=sha256:3346babb516f4b6fd790da99b98bed9708e3f02e734c84971faccb20736848dc \
--hash=sha256:3e1f8a47f4b4ae4c69c4d702cfbdfe4d41e18e5c7ef6f1bb1c50918c1e81c57b \
--hash=sha256:4250e28a22302ce8692d3a0e8ec9d9dde54ec00d237cff4dfa9c1fbf79e472a8 \
--hash=sha256:4680f01ecd86e0dd63e39eb5cd59ef9ff24a9d166db328679e36c108dc993d4c \
--hash=sha256:4a8b029a07956b8d7bd742ffca25374dd3f634b35e46cc7a7c3fa4c75b297191 \
--hash=sha256:4ba3cf4182828be7a896cbd232aa8dd6a31bd1f9e32776cc3796c012855e1199 \
--hash=sha256:5605919fbe67a7948c1f03b9f3727d82846c053cd2ce9303ace791855923fd20 \
--hash=sha256:5f0fb1041267e9968c6d0d2ce3ff92e3928b243e2b6d11eeb84d9ac547308232 \
--hash=sha256:6102b4864d77102dbbb72965618e204e550135a940c2534711d5ffa787df2a5a \
--hash=sha256:6415a0d0174487456ddc9beaead703d0ded5966129fa4fd3114d76b5d1c5ceae \
--hash=sha256:6bb830757103a6cb300a04610e08d9636f0cd223d32f388418ea893a3e655f1c \
--hash=sha256:6fc1499ed3b4b57ee4e090e1cea6eb3584793fe3d1b4297bbf53f09b434991a5 \
--hash=sha256:75a51a5b0eef32727a247707d4755322cb970be7e935172b6a3a9f9ae98404ba \
--hash=sha256:7a3a5dcf54286e6141d5114522cf31dd67a9e7c9133d150799f30ee302a7a1ab \
--hash=sha256:7f4c8534e2ff059765647aa69b75d6543f9fef59e2cd4c6d18015192565d2b70 \
--hash=sha256:82f1ee5133bd8f49d31be1299dc07f585136679666b502540db854968576faf9 \
--hash=sha256:851c6a8260ad387caf82d2bbf54759130534723e37083111d4ed481cb253cc0d \
--hash=sha256:89e030dc58fc760e4010148e6ff164d2f44441490280ef1e97a542375e41058e \
--hash=sha256:95b330059ddfdc591a3225f2d272123be26c8fa76e8c9ee1a77aad507361cfdb \
--hash=sha256:96d6a0a37d9c98be08f5ed6a10831d88d52cac7b13f5287f1e0f625a0de8062b \
--hash=sha256:96e37f0766ecb4514a899d9a3554fadda770fb57ddf42b63d80f14bc20aa7db3 \
--hash=sha256:97c8dc984ed09cb07d618d57d8d4b67a5100a30c3818c2fb0b04599f0da2de7b \
--hash=sha256:991f85b48a8a5e839b2128590ce07611fae48a904cae6cab1f089c5955b57eb5 \
--hash=sha256:9965a050048ab02409fb7cbbefeedba04d3d67f2cc899eff505cc084345959ca \
--hash=sha256:9b71daf534f4745818f96c214dbc1e6124d7daf059167330b610fc69b6f3d3e3 \
--hash=sha256:a15532e77b94c61efadde86d10957950392999503b3616b2ffcef7621a002893 \
--hash=sha256:a18a14baef7d7ae49247e75641fd8bcbb39f44ed49a9fc4ec2f65d5031aa3b96 \
--hash=sha256:a1f60dc14658efaa927f8214734f6a01a806d7690be4b3232ba526836d216122 \
--hash=sha256:a2791f69ad72addd33510fec7bb14ee06c2a448e06b649e264c094c5b5f7ce28 \
--hash=sha256:a5704f29a74b81673d266e5ec1fe376f060627c2e42c5c7651288ed4b0db29e9 \
--hash=sha256:a6ad3e7758ecf559900261a4df985662df54fb7fdb55e8e3b3aa99b23d526b62 \
--hash=sha256:aa0d288143a8585806e3cc7c39566407aab646fb9ece164609dac1cfff45f6ae \
--hash=sha256:b6953f0114f8d6f3d905d98e987d0924dabce59c3cda380bdfaa25a6201563b4 \
--hash=sha256:b8ff87cc837601532cc8242d2f7e09b4e02404de1b797aee747dd4ba4bd6313f \
--hash=sha256:c7dd06fd7d7b410ca5dc839cc9d485d2bc4ae5240851bcd45d85105cc90a47d7 \
--hash=sha256:ca151afa4f9b7bc45bcc791eb9a89e90a9eb2772767d0b1e5389609c7d03db63 \
--hash=sha256:cb497649e505dc36542d0e68eca1a3c94ecbe9799cb67b578b55f2441a247fbc \
--hash=sha256:d5382de8dc34c943249b01c19110783d0d64b207167c728461add1ecc2db88e4 \
--hash=sha256:db53390eaf8a4dab4dbd6d93c85c5cf002db24902dbff0ca7d988beb5c9dd15b \
--hash=sha256:dd43f58037443af715f34f1322c782ec463a3c8a94a85fdb2d987ceb5658e061 \
--hash=sha256:e22f80b97a271f0a7d9cd07394a7d348f80d3ac63ed7cc38b6d1b696ab3b2619 \
--hash=sha256:e724a3fd23ae5b9c010e7be857f4405ed5e679db5c93e66204db1a69f733936a \
--hash=sha256:e8b88758f9303fa5a83d6c90e176714b2fd3852e776fc2d7e42a22dd6c2fb368 \
--hash=sha256:f2d67ac28f57a362f1a2c1e6fa98bfe2f03230f7e15927aecd067433b1e70ce8 \
--hash=sha256:f3b117b922af5e4c6b9a9115825726cac7d8b1421c37c2b5e24fbacc8930612c \
--hash=sha256:febc4a913592573c8d5805091a6c2b5064c8bd6e002131f01061797d91c783c1
# via -r requirements.in
pydantic==2.11.3 \
--hash=sha256:7471657138c16adad9322fe3070c0116dd6c3ad8d649300e3cbdfe91f4db4ec3 \
--hash=sha256:a082753436a07f9ba1289c6ffa01cd93db3548776088aa917cc43b63f68fa60f
@ -166,6 +320,18 @@ pydantic-core==2.33.1 \
--hash=sha256:fc903512177361e868bc1f5b80ac8c8a6e05fcdd574a5fb5ffeac5a9982b9e89 \
--hash=sha256:fe44d56aa0b00d66640aa84a3cbe80b7a3ccdc6f0b1ca71090696a6d4777c091
# via pydantic
python-dateutil==2.9.0.post0 \
--hash=sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3 \
--hash=sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427
# via pandas
pytz==2025.2 \
--hash=sha256:360b9e3dbb49a209c21ad61809c7fb453643e048b38924c765813546746e81c3 \
--hash=sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00
# via pandas
six==1.17.0 \
--hash=sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274 \
--hash=sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81
# via python-dateutil
typing-extensions==4.13.2 \
--hash=sha256:a439e7c04b49fec3e5d3e2beaa21755cadbbdc391694e28ccdd36ca4a1408f8c \
--hash=sha256:e6c81219bd689f51865d9e372991c540bda33a0379d5573cddb9a3a23f7caaef
@ -177,3 +343,7 @@ typing-inspection==0.4.0 \
--hash=sha256:50e72559fcd2a6367a19f7a7e610e6afcb9fac940c650290eed893d61386832f \
--hash=sha256:9765c87de36671694a67904bf2c96e395be9c6439bb6c87b5142569dcdd65122
# via pydantic
tzdata==2025.2 \
--hash=sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8 \
--hash=sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9
# via pandas

View file

@ -0,0 +1,212 @@
#!/usr/bin/env python3
"""
Basic tests for podcast reviews jobs.
Tests the configuration and basic execution flow of each job.
"""
import os
import sys
import subprocess
import tempfile
import shutil
import json
from pathlib import Path
def run_job_config(job_script: str, partition_ref: str):
"""Run a job in config mode and return the parsed config."""
cmd = [sys.executable, job_script, "config", partition_ref]
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode != 0:
raise Exception(f"Config failed for {job_script}: {result.stderr}")
return json.loads(result.stdout)
def test_extract_reviews_job():
"""Test extract_reviews_job configuration."""
print("Testing extract_reviews_job...")
config = run_job_config("extract_reviews_job.py", "reviews/date=2020-01-01")
assert len(config["configs"]) == 1
job_config = config["configs"][0]
assert job_config["outputs"] == [{"str": "reviews/date=2020-01-01"}]
assert job_config["inputs"] == []
assert job_config["args"] == ["2020-01-01"]
assert job_config["env"]["TARGET_DATE"] == "2020-01-01"
print("✓ extract_reviews_job config test passed")
def test_extract_podcasts_job():
"""Test extract_podcasts_job configuration."""
print("Testing extract_podcasts_job...")
config = run_job_config("extract_podcasts_job.py", "podcasts/all")
assert len(config["configs"]) == 1
job_config = config["configs"][0]
assert job_config["outputs"] == [{"str": "podcasts/all"}]
assert job_config["inputs"] == []
assert job_config["args"] == []
assert job_config["env"]["PARTITION_REF"] == "podcasts/all"
print("✓ extract_podcasts_job config test passed")
def test_categorize_reviews_job():
"""Test categorize_reviews_job configuration."""
print("Testing categorize_reviews_job...")
config = run_job_config("categorize_reviews_job.py", "categorized_reviews/category=comedy/date=2020-01-01")
assert len(config["configs"]) == 1
job_config = config["configs"][0]
assert job_config["outputs"] == [{"str": "categorized_reviews/category=comedy/date=2020-01-01"}]
assert len(job_config["inputs"]) == 2
input_refs = [inp["partition_ref"]["str"] for inp in job_config["inputs"]]
assert "reviews/date=2020-01-01" in input_refs
assert "podcasts/all" in input_refs
assert job_config["args"] == ["comedy", "2020-01-01"]
assert job_config["env"]["TARGET_CATEGORY"] == "comedy"
assert job_config["env"]["TARGET_DATE"] == "2020-01-01"
print("✓ categorize_reviews_job config test passed")
def test_phrase_modeling_job():
"""Test phrase_modeling_job configuration."""
print("Testing phrase_modeling_job...")
config = run_job_config("phrase_modeling_job.py", "phrase_models/category=comedy/date=2020-01-01")
assert len(config["configs"]) == 1
job_config = config["configs"][0]
assert job_config["outputs"] == [{"str": "phrase_models/category=comedy/date=2020-01-01"}]
assert len(job_config["inputs"]) == 1
assert job_config["inputs"][0]["partition_ref"]["str"] == "categorized_reviews/category=comedy/date=2020-01-01"
assert job_config["args"] == ["comedy", "2020-01-01"]
assert job_config["env"]["TARGET_CATEGORY"] == "comedy"
assert job_config["env"]["TARGET_DATE"] == "2020-01-01"
print("✓ phrase_modeling_job config test passed")
def test_phrase_stats_job():
"""Test phrase_stats_job configuration."""
print("Testing phrase_stats_job...")
config = run_job_config("phrase_stats_job.py", "phrase_stats/category=comedy/date=2020-01-01")
assert len(config["configs"]) == 1
job_config = config["configs"][0]
assert job_config["outputs"] == [{"str": "phrase_stats/category=comedy/date=2020-01-01"}]
assert len(job_config["inputs"]) == 2
input_refs = [inp["partition_ref"]["str"] for inp in job_config["inputs"]]
assert "phrase_models/category=comedy/date=2020-01-01" in input_refs
assert "categorized_reviews/category=comedy/date=2020-01-01" in input_refs
assert job_config["args"] == ["comedy", "2020-01-01"]
assert job_config["env"]["TARGET_CATEGORY"] == "comedy"
assert job_config["env"]["TARGET_DATE"] == "2020-01-01"
print("✓ phrase_stats_job config test passed")
def test_daily_summary_job():
"""Test daily_summary_job configuration."""
print("Testing daily_summary_job...")
config = run_job_config("daily_summary_job.py", "daily_summaries/category=comedy/date=2020-01-01")
assert len(config["configs"]) == 1
job_config = config["configs"][0]
assert job_config["outputs"] == [{"str": "daily_summaries/category=comedy/date=2020-01-01"}]
assert len(job_config["inputs"]) == 2
input_refs = [inp["partition_ref"]["str"] for inp in job_config["inputs"]]
assert "phrase_stats/category=comedy/date=2020-01-01" in input_refs
assert "categorized_reviews/category=comedy/date=2020-01-01" in input_refs
assert job_config["args"] == ["comedy", "2020-01-01"]
assert job_config["env"]["TARGET_CATEGORY"] == "comedy"
assert job_config["env"]["TARGET_DATE"] == "2020-01-01"
print("✓ daily_summary_job config test passed")
def test_job_lookup():
"""Test job_lookup functionality."""
print("Testing job_lookup...")
test_cases = [
("reviews/date=2020-01-01", ":extract_reviews_job"),
("podcasts/all", ":extract_podcasts_job"),
("categorized_reviews/category=comedy/date=2020-01-01", ":categorize_reviews_job"),
("phrase_models/category=comedy/date=2020-01-01", ":phrase_modeling_job"),
("phrase_stats/category=comedy/date=2020-01-01", ":phrase_stats_job"),
("daily_summaries/category=comedy/date=2020-01-01", ":daily_summary_job"),
]
for partition_ref, expected_job_label in test_cases:
cmd = [sys.executable, "job_lookup.py", partition_ref]
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode != 0:
raise Exception(f"Job lookup failed for {partition_ref}: {result.stderr}")
response = json.loads(result.stdout)
# New format: {job_label: [partition_refs]}
assert expected_job_label in response
assert partition_ref in response[expected_job_label]
print("✓ job_lookup test passed")
def test_invalid_partition_refs():
"""Test that invalid partition refs are handled properly."""
print("Testing invalid partition refs...")
invalid_refs = [
"invalid/ref",
"reviews/date=invalid-date",
"categorized_reviews/category=/date=2020-01-01", # missing category
"unknown/partition=ref",
]
for invalid_ref in invalid_refs:
cmd = [sys.executable, "job_lookup.py", invalid_ref]
result = subprocess.run(cmd, capture_output=True, text=True)
# Should fail for invalid refs
assert result.returncode != 0, f"Expected failure for invalid ref: {invalid_ref}"
print("✓ invalid partition refs test passed")
def main():
"""Run all tests."""
print("Running podcast reviews job tests...")
print("=" * 50)
try:
test_extract_reviews_job()
test_extract_podcasts_job()
test_categorize_reviews_job()
test_phrase_modeling_job()
test_phrase_stats_job()
test_daily_summary_job()
test_job_lookup()
test_invalid_partition_refs()
print("=" * 50)
print("All tests passed! ✅")
except Exception as e:
print(f"Test failed: {e}")
sys.exit(1)
if __name__ == "__main__":
main()

View file

@ -0,0 +1,47 @@
#!/usr/bin/env python3
import sys
import json
import os
def main():
if len(sys.argv) < 2:
print("Usage: unified_job.py {config|exec} [args...]", file=sys.stderr)
sys.exit(1)
command = sys.argv[1]
if command == "config":
handle_config(sys.argv[2:])
elif command == "exec":
handle_exec(sys.argv[2:])
else:
print(f"Unknown command: {command}", file=sys.stderr)
print("Usage: unified_job.py {config|exec} [args...]", file=sys.stderr)
sys.exit(1)
def handle_config(args):
if len(args) < 1:
print("Config mode requires partition ref", file=sys.stderr)
sys.exit(1)
partition_ref = args[0]
config = {
"configs": [{
"outputs": [{"str": partition_ref}],
"inputs": [],
"args": ["Hello", "gorgeous", partition_ref],
"env": {"PARTITION_REF": partition_ref}
}]
}
print(json.dumps(config))
def handle_exec(args):
print("What a time to be alive.")
print(f"Partition ref: {os.getenv('PARTITION_REF', 'unknown')}")
print(f"Args: {args}")
if __name__ == "__main__":
    main()
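For completeness, a hedged sketch of invoking this toy job's config subcommand and checking its output (the partition ref is chosen arbitrarily):

```python
import json
import subprocess
import sys

result = subprocess.run(
    [sys.executable, "unified_job.py", "config", "demo/partition"],
    capture_output=True, text=True, check=True,
)
config = json.loads(result.stdout)
assert config["configs"][0]["args"] == ["Hello", "gorgeous", "demo/partition"]
print(json.dumps(config, indent=2))
```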