Merge branch 'single-target'

# Conflicts:
#	CLAUDE.md
Stuart Axelbrooke 2025-07-02 21:39:49 -07:00
commit f9cacf6491
39 changed files with 4089 additions and 340 deletions

.gitignore

@ -6,3 +6,4 @@ databuild.iml
examples/podcast_reviews/data
.bazelbsp
.aider*
.venv


@ -13,6 +13,8 @@ DataBuild is a bazel-based data build system. Key files:
# Remote testing
./scripts/bb_remote_test_all
# Do not try to `bazel test //examples/basic_graph/...`, as this will not work.
```
## Project Structure
@ -23,4 +25,65 @@ DataBuild is a bazel-based data build system. Key files:
## Key Components
- Graph analysis/execution in Rust
- Bazel rules for job orchestration
- Java/Python examples for different use cases
- Java/Python examples for different use cases
## DataBuild Job Architecture
### Job Target Structure
Each DataBuild job creates three Bazel targets (see the example after this list):
- `job_name.cfg` - Configuration target (calls binary with "config" subcommand)
- `job_name.exec` - Execution target (calls binary with "exec" subcommand)
- `job_name` - Main job target (pipes config output to exec input)
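For example, one of the test jobs in this commit is declared as a single `databuild_job` backed by one unified binary; the `.cfg` and `.exec` targets are generated from it automatically:
```python
load("@databuild//databuild:rules.bzl", "databuild_job")

databuild_job(
    name = "test_job",             # also generates test_job.cfg and test_job.exec
    binary = ":test_job_binary",   # one binary handling both "config" and "exec"
    visibility = ["//visibility:public"],
)

sh_binary(
    name = "test_job_binary",
    srcs = ["unified_job.sh"],
)
```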
### Unified Job Binary Pattern
Jobs use a single binary with subcommands:
```python
def main():
    command = sys.argv[1]  # "config" or "exec"
    if command == "config":
        handle_config(sys.argv[2:])  # Output job configuration JSON
    elif command == "exec":
        handle_exec(sys.argv[2:])  # Perform actual work
```
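Under the hood, the generated wrapper scripts supply the subcommand: per the template and `rules.bzl` changes below, the `.cfg` wrapper sets `EXECUTABLE_SUBCOMMAND="config"` and the `.exec` wrapper sets `EXECUTE_SUBCOMMAND="exec"` before invoking the binary, so the binary itself only has to dispatch on its first argument.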
### Job Configuration Requirements
**CRITICAL**: Job configs must include non-empty `args` for execution to work:
```python
config = {
    "configs": [{
        "outputs": [{"str": partition_ref}],
        "inputs": [...],
        "args": ["some_arg"],  # REQUIRED: Cannot be empty []
        "env": {"PARTITION_REF": partition_ref}
    }]
}
```
Jobs configured with `"args": []` will only have their config step invoked during graph execution; the exec step is never run.
### DataBuild Execution Flow
1. **Planning Phase**: DataBuild calls `.cfg` targets to get job configurations
2. **Execution Phase**: DataBuild calls main job targets, which pipe config output to exec input (see the pipeline sketch below)
3. **Job Resolution**: Job lookup returns base job names (e.g., `//:job_name`), not `.cfg` variants
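The basic_graph test script updated in this commit exercises this flow by hand, piping a job's config output through `jq` into its exec target:
```
generate_number_job.cfg /tmp/databuild_test/examples/basic_graph/generated_number/pippin \
  | jq -c ".configs[0]" \
  | generate_number_job.exec
```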
### Graph Configuration
```python
databuild_graph(
    name = "my_graph",
    jobs = [":job1", ":job2"],  # Reference base job targets
    lookup = ":job_lookup",     # Binary that routes partition refs to jobs
)
```
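The podcast_reviews example added in this commit follows the same shape, wiring six jobs and a Python `job_lookup` binary into one graph:
```python
databuild_graph(
    name = "podcast_reviews_graph",
    jobs = [
        ":extract_reviews_job",
        ":extract_podcasts_job",
        ":categorize_reviews_job",
        ":phrase_modeling_job",
        ":phrase_stats_job",
        ":daily_summary_job",
    ],
    lookup = ":job_lookup",
    visibility = ["//visibility:public"],
)
```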
### Job Lookup Pattern
```python
def lookup_job_for_partition(partition_ref: str) -> str:
    # `pattern` is assumed to be a precompiled regex matching the refs this job can build
    if pattern.match(partition_ref):
        return "//:job_name"  # Return base job target
    raise ValueError(f"No job found for: {partition_ref}")
```
### Common Pitfalls
- **Empty args**: Jobs with `"args": []` won't execute properly
- **Wrong target refs**: Job lookup must return base targets, not `.cfg` variants
- **Missing partition refs**: All outputs must be addressable via partition references
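Putting these pieces together, a minimal unified job in Python might look like the sketch below. This is not taken verbatim from the repository; the partition-ref handling, output payload, and helper names are illustrative assumptions in the spirit of the example jobs in this commit:
```python
#!/usr/bin/env python3
"""Illustrative unified DataBuild job: one binary, dispatched on a config/exec subcommand."""
import json
import os
import sys


def handle_config(args):
    """Emit a JobConfigureResponse ({"configs": [...]}) with non-empty args."""
    configs = []
    for partition_ref in args:
        configs.append({
            "outputs": [{"str": partition_ref}],
            "inputs": [],                      # add {"dep_type": 1, "partition_ref": {"str": "..."}} for MATERIALIZE deps
            "args": ["build", partition_ref],  # must not be empty
            "env": {"PARTITION_REF": partition_ref},
        })
    print(json.dumps({"configs": configs}))


def handle_exec(args):
    """Do the real work; here we just write a placeholder file at the partition ref path."""
    partition_ref = os.environ.get("PARTITION_REF", "")
    if not partition_ref and args:
        partition_ref = args[-1]
    if not partition_ref:
        sys.exit("exec mode requires PARTITION_REF or a partition ref argument")
    os.makedirs(os.path.dirname(partition_ref) or ".", exist_ok=True)
    with open(partition_ref, "w") as f:
        f.write("done")


def main():
    if len(sys.argv) < 2 or sys.argv[1] not in ("config", "exec"):
        print("Usage: unified_job.py {config|exec} [args...]", file=sys.stderr)
        sys.exit(1)
    if sys.argv[1] == "config":
        handle_config(sys.argv[2:])
    else:
        handle_exec(sys.argv[2:])


if __name__ == "__main__":
    main()
```
Pointing a `databuild_job`'s `binary` attribute at a `py_binary` built from a file like this yields the `.cfg`, `.exec`, and main targets described above.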


@ -187,7 +187,7 @@
},
"@@pybind11_bazel+//:python_configure.bzl%extension": {
"general": {
"bzlTransitiveDigest": "d4N/SZrl3ONcmzE98rcV0Fsro0iUbjNQFTIiLiGuH+k=",
"bzlTransitiveDigest": "OMjJ8aOAn337bDg7jdyvF/juIrC2PpUcX6Dnf+nhcF0=",
"usagesDigest": "fycyB39YnXIJkfWCIXLUKJMZzANcuLy9ZE73hRucjFk=",
"recordedFileInputs": {
"@@pybind11_bazel+//MODULE.bazel": "88af1c246226d87e65be78ed49ecd1e6f5e98648558c14ce99176da041dc378e"
@ -221,7 +221,7 @@
},
"@@rules_fuzzing+//fuzzing/private:extensions.bzl%non_module_dependencies": {
"general": {
"bzlTransitiveDigest": "mGiTB79hRNjmeDTQdzkpCHyzXhErMbufeAmySBt7s5s=",
"bzlTransitiveDigest": "lxvzPQyluk241QRYY81nZHOcv5Id/5U2y6dp42qibis=",
"usagesDigest": "wy6ISK6UOcBEjj/mvJ/S3WeXoO67X+1llb9yPyFtPgc=",
"recordedFileInputs": {},
"recordedDirentsInputs": {},
@ -525,7 +525,7 @@
},
"@@rules_python+//python/private/pypi:pip.bzl%pip_internal": {
"general": {
"bzlTransitiveDigest": "sCGUUdVOVATRPlKd1IJea1kfLmtsYsAZdVI5HkdAUQo=",
"bzlTransitiveDigest": "bKQjDjomeeeh547JZoDNozPUkVrO368PlWs0shDGtJU=",
"usagesDigest": "OLoIStnzNObNalKEMRq99FqenhPGLFZ5utVLV4sz7OI=",
"recordedFileInputs": {
"@@rules_python+//tools/publish/requirements_darwin.txt": "2994136eab7e57b083c3de76faf46f70fad130bc8e7360a7fed2b288b69e79dc",
@ -4171,7 +4171,7 @@
},
"@@rules_rust+//crate_universe/private:internal_extensions.bzl%cu_nr": {
"general": {
"bzlTransitiveDigest": "7DC2ciVAva/LfjqxrbJs5WDxaCDqDaPY4HXXZriW120=",
"bzlTransitiveDigest": "mDJ0pT/rBCHMm7FzlOzh9Qng+sXi1kyQXEU8TahWqRc=",
"usagesDigest": "Pr9/2PR9/ujuo94SXikpx+fg31V4bDKobC10YJu+z5I=",
"recordedFileInputs": {},
"recordedDirentsInputs": {},


@ -42,11 +42,12 @@ fn configure(job_label: &str, output_refs: &[String]) -> Result<Vec<Task>, Strin
// Parse the job configurations
let stdout = String::from_utf8_lossy(&output.stdout);
let job_configs: Vec<JobConfig> = serde_json::from_str(&stdout)
let job_configure_response: JobConfigureResponse = serde_json::from_str(&stdout)
.map_err(|e| {
error!("Error parsing job configs for {}: {}. `{}`", job_label, e, stdout);
format!("Failed to parse job configs: {}", e)
})?;
let job_configs = job_configure_response.configs;
// Create tasks
let tasks: Vec<Task> = job_configs.into_iter()
@ -232,11 +233,11 @@ fn plan(output_refs: &[String]) -> Result<JobGraph, String> {
let mut new_unhandled_count = 0;
for task in &new_nodes {
for input in &task.config.inputs {
if input.dep_type == DataDepType::Materialize {
if !unhandled_refs.contains(&input.reference) {
if input.dep_type == 1 { // MATERIALIZE = 1
if !unhandled_refs.contains(&input.partition_ref.str) {
new_unhandled_count += 1;
}
unhandled_refs.insert(input.reference.clone());
unhandled_refs.insert(input.partition_ref.str.clone());
}
}
}
@ -276,18 +277,19 @@ fn generate_mermaid_diagram(graph: &JobGraph) -> String {
// Process each task in the graph
for task in &graph.nodes {
// Create a unique ID for this job+outputs combination
let outputs_key = task.config.outputs.join("_");
let outputs_strs: Vec<String> = task.config.outputs.iter().map(|o| o.str.clone()).collect();
let outputs_key = outputs_strs.join("_");
let mut job_node_id = format!("job_{}", task.job_label.replace("//", "_"));
job_node_id = job_node_id.replace(":", "_");
job_node_id = format!("{}_{}", job_node_id, outputs_key.replace("/", "_"));
job_node_id = job_node_id.replace(":", "_").replace("=", "_").replace("?", "_").replace(" ", "_");
job_node_id = format!("{}_{}", job_node_id, outputs_key.replace("/", "_").replace("=", "_"));
// Create a descriptive label that includes both job label and outputs
let job_label = &task.job_label;
let outputs_label = if !task.config.outputs.is_empty() {
if task.config.outputs.len() == 1 {
format!(" [{}]", task.config.outputs[0])
format!(" [{}]", task.config.outputs[0].str)
} else {
format!(" [{}, ...]", task.config.outputs[0])
format!(" [{}, ...]", task.config.outputs[0].str)
}
} else {
String::new()
@ -307,11 +309,11 @@ fn generate_mermaid_diagram(graph: &JobGraph) -> String {
// Process inputs (dependencies)
for input in &task.config.inputs {
let ref_node_id = format!("ref_{}", input.reference.replace("/", "_"));
let ref_node_id = format!("ref_{}", input.partition_ref.str.replace("/", "_").replace("=", "_"));
// Add the partition ref node if not already added
if !added_refs.contains(&ref_node_id) {
let node_class = if is_output_ref.contains(&input.reference) {
let node_class = if is_output_ref.contains(&input.partition_ref.str) {
"outputPartition"
} else {
"partition"
@ -321,14 +323,14 @@ fn generate_mermaid_diagram(graph: &JobGraph) -> String {
mermaid.push_str(&format!(
" {}[(\"{}\")]:::{}\n",
ref_node_id,
input.reference,
input.partition_ref.str.replace("/", "_").replace("=", "_"),
node_class
));
added_refs.insert(ref_node_id.clone());
}
// Add the edge from input to job
if input.dep_type == DataDepType::Materialize {
if input.dep_type == 1 { // MATERIALIZE = 1
// Solid line for materialize dependencies
mermaid.push_str(&format!(" {} --> {}\n", ref_node_id, job_node_id));
} else {
@ -339,11 +341,11 @@ fn generate_mermaid_diagram(graph: &JobGraph) -> String {
// Process outputs
for output in &task.config.outputs {
let ref_node_id = format!("ref_{}", output.replace("/", "_"));
let ref_node_id = format!("ref_{}", output.str.replace("/", "_").replace("=", "_"));
// Add the partition ref node if not already added
if !added_refs.contains(&ref_node_id) {
let node_class = if is_output_ref.contains(output) {
let node_class = if is_output_ref.contains(&output.str) {
"outputPartition"
} else {
"partition"
@ -353,7 +355,7 @@ fn generate_mermaid_diagram(graph: &JobGraph) -> String {
mermaid.push_str(&format!(
" {}[(\"Partition: {}\")]:::{}\n",
ref_node_id,
output,
output.str,
node_class
));
added_refs.insert(ref_node_id.clone());


@ -1,4 +1,4 @@
use structs::{DataDepType, JobConfig, JobGraph, Task};
use structs::{JobGraph, Task};
use crossbeam_channel::{Receiver, Sender};
use log::{debug, error, info, warn};
use serde::{Deserialize, Serialize};
@ -40,10 +40,10 @@ fn get_task_key(task: &Task) -> String {
key_parts.push(task.job_label.clone());
for input_dep in &task.config.inputs {
key_parts.push(format!("input:{}", input_dep.reference));
key_parts.push(format!("input:{}", input_dep.partition_ref.str));
}
for output_ref in &task.config.outputs {
key_parts.push(format!("output:{}", output_ref));
key_parts.push(format!("output:{}", output_ref.str));
}
key_parts.join("|")
}
@ -242,13 +242,21 @@ fn worker(
}
fn is_task_ready(task: &Task, completed_outputs: &HashSet<String>) -> bool {
let mut missing_deps = Vec::new();
for dep in &task.config.inputs {
if dep.dep_type == DataDepType::Materialize {
if !completed_outputs.contains(&dep.reference) {
return false;
if dep.dep_type == 1 { // MATERIALIZE = 1
if !completed_outputs.contains(&dep.partition_ref.str) {
missing_deps.push(&dep.partition_ref.str);
}
}
}
if !missing_deps.is_empty() {
debug!("Task {} not ready - missing dependencies: {:?}", task.job_label, missing_deps);
return false;
}
true
}
@ -341,7 +349,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
if result.success {
if let Some(original_task) = original_tasks_by_key.get(&result.task_key) {
for output_ref in &original_task.config.outputs {
completed_outputs.insert(output_ref.clone());
completed_outputs.insert(output_ref.str.clone());
}
}
} else {
@ -379,6 +387,34 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
// 4. Periodic logging
if last_log_time.elapsed() >= LOG_INTERVAL {
log_status_summary(&task_states, &original_tasks_by_key);
// Debug: Check for deadlock (pending tasks with no running tasks)
let has_pending = task_states.values().any(|s| *s == TaskState::Pending);
if has_pending && active_tasks_count == 0 {
warn!("Potential deadlock detected: {} pending tasks with no running tasks",
task_states.values().filter(|s| **s == TaskState::Pending).count());
// Log details of pending tasks and their preconditions
for (key, state) in &task_states {
if *state == TaskState::Pending {
if let Some(task) = original_tasks_by_key.get(key) {
warn!("Pending task: {} ({})", task.job_label, key);
warn!(" Required inputs:");
for dep in &task.config.inputs {
if dep.dep_type == 1 { // MATERIALIZE = 1
let available = completed_outputs.contains(&dep.partition_ref.str);
warn!(" {} - {}", dep.partition_ref.str, if available { "AVAILABLE" } else { "MISSING" });
}
}
warn!(" Produces outputs:");
for output in &task.config.outputs {
warn!(" {}", output.str);
}
}
}
}
}
last_log_time = Instant::now();
}


@ -3,6 +3,8 @@ set -e
%{RUNFILES_PREFIX}
%{PREFIX}
EXECUTE_BINARY="$(rlocation "_main/$(basename "%{EXECUTE_PATH}")")"
JQ="$(rlocation "databuild+/databuild/runtime/$(basename "%{JQ_PATH}")")"
@ -44,4 +46,8 @@ while IFS= read -r arg; do
done < <("$JQ" -r '.args[]' "$CONFIG_FILE")
# Run the execution with both environment variables (already set) and arguments
exec "$EXECUTE_BINARY" "${ARGS[@]}"
if [[ -n "${EXECUTE_SUBCOMMAND:-}" ]]; then
exec "$EXECUTE_BINARY" "${EXECUTE_SUBCOMMAND}" "${ARGS[@]}"
else
exec "$EXECUTE_BINARY" "${ARGS[@]}"
fi


@ -30,30 +30,26 @@ source "${RUNFILES_DIR:-/dev/null}/$f" 2>/dev/null || \
def databuild_job(
name,
configure,
execute,
binary,
visibility = None):
"""Creates a DataBuild job target with configuration and execution capabilities.
Args:
name: Name of the job target
configure: Target that implements the configuration logic
execute: Target that implements the execution logic
deps: List of other job_targets this job depends on
binary: Single binary target that handles both config and exec via subcommands
visibility: Visibility specification
**kwargs: Additional attributes to pass to the underlying rule
"""
# Single binary approach - use subcommands
_databuild_job_cfg_rule(
name = name + ".cfg",
configure = configure,
configure = binary,
visibility = visibility,
)
# Create the main rule that serves as a provider for other targets
_databuild_job_exec_rule(
name = name + ".exec",
execute = execute,
execute = binary,
visibility = visibility,
)
@ -76,7 +72,7 @@ def _databuild_job_cfg_impl(ctx):
substitutions = {
"%{EXECUTABLE_PATH}": configure_path,
"%{RUNFILES_PREFIX}": RUNFILES_PREFIX,
"%{PREFIX}": "",
"%{PREFIX}": "EXECUTABLE_SUBCOMMAND=\"config\"\n",
},
is_executable = True,
)
@ -132,6 +128,7 @@ def _databuild_job_exec_impl(ctx):
"%{JQ_PATH}": jq_path,
"%{EXECUTE_PATH}": execute_path,
"%{RUNFILES_PREFIX}": RUNFILES_PREFIX,
"%{PREFIX}": "EXECUTE_SUBCOMMAND=\"exec\"\n",
},
is_executable = True,
)


@ -8,4 +8,8 @@ set -e
EXECUTABLE_BINARY="$(rlocation "_main/$(basename "%{EXECUTABLE_PATH}")")"
# Run the configuration
exec "${EXECUTABLE_BINARY}" "$@"
if [[ -n "${EXECUTABLE_SUBCOMMAND:-}" ]]; then
exec "${EXECUTABLE_BINARY}" "${EXECUTABLE_SUBCOMMAND}" "$@"
else
exec "${EXECUTABLE_BINARY}" "$@"
fi


@ -2,42 +2,53 @@ use std::collections::HashMap;
use serde::{Deserialize, Serialize};
use std::str::FromStr;
// Data structures that mirror the Go implementation
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum DataDepType {
Query,
Materialize,
// Data structures that follow the protobuf specification exactly
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PartitionRef {
#[serde(rename = "str")]
pub str: String,
}
impl FromStr for DataDepType {
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum DepType {
#[serde(rename = "QUERY")]
Query = 0,
#[serde(rename = "MATERIALIZE")]
Materialize = 1,
}
impl FromStr for DepType {
type Err = String;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s.to_lowercase().as_str() {
"query" => Ok(DataDepType::Query),
"materialize" => Ok(DataDepType::Materialize),
_ => Err(format!("Unknown DataDepType: {}", s)),
match s.to_uppercase().as_str() {
"QUERY" => Ok(DepType::Query),
"MATERIALIZE" => Ok(DepType::Materialize),
_ => Err(format!("Unknown DepType: {}", s)),
}
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DataDep {
#[serde(rename = "depType")]
pub dep_type: DataDepType,
#[serde(rename = "ref")]
pub reference: String,
pub dep_type: u32, // Protobuf enums are serialized as integers
pub partition_ref: PartitionRef,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct JobConfig {
pub inputs: Vec<DataDep>,
pub outputs: Vec<String>,
pub outputs: Vec<PartitionRef>,
pub args: Vec<String>,
pub env: HashMap<String, String>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct JobConfigureResponse {
pub configs: Vec<JobConfig>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Task {
#[serde(rename = "jobLabel")]


@ -35,54 +35,34 @@ py_binary(
databuild_job(
name = "generate_number_job",
configure = ":generate_number_configure",
execute = ":generate_number_execute",
binary = ":generate_number_binary",
visibility = ["//visibility:public"],
)
java_binary(
name = "generate_number_configure",
srcs = glob(["*.java"]),
create_executable = True,
main_class = "com.databuild.examples.basic_graph.GenerateConfigure",
name = "generate_number_binary",
srcs = ["UnifiedGenerateNumber.java"],
main_class = "com.databuild.examples.basic_graph.UnifiedGenerateNumber",
deps = [
"@maven//:com_fasterxml_jackson_core_jackson_annotations",
"@maven//:com_fasterxml_jackson_core_jackson_core",
"@maven//:com_fasterxml_jackson_core_jackson_databind",
"@maven//:com_fasterxml_jackson_module_jackson_module_jsonSchema",
],
)
java_binary(
name = "generate_number_execute",
srcs = glob(["GenerateExecute.java"]),
main_class = "com.databuild.examples.basic_graph.GenerateExecute",
)
databuild_job(
name = "sum_job",
configure = ":sum_configure",
execute = ":sum_execute",
binary = ":sum_binary",
visibility = ["//visibility:public"],
)
java_binary(
name = "sum_configure",
srcs = glob(["*.java"]),
main_class = "com.databuild.examples.basic_graph.SumConfigure",
name = "sum_binary",
srcs = ["UnifiedSum.java"],
main_class = "com.databuild.examples.basic_graph.UnifiedSum",
deps = [
"@maven//:com_fasterxml_jackson_core_jackson_annotations",
"@maven//:com_fasterxml_jackson_core_jackson_core",
"@maven//:com_fasterxml_jackson_core_jackson_databind",
"@maven//:com_fasterxml_jackson_module_jackson_module_jsonSchema",
],
)
java_binary(
name = "sum_execute",
srcs = glob([
"SumExecute.java",
"GenerateExecute.java",
]),
main_class = "com.databuild.examples.basic_graph.SumExecute",
)
)


@ -1,46 +0,0 @@
package com.databuild.examples.basic_graph;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.JsonNode;
import java.util.ArrayList;
import java.util.List;
import java.io.File;
import java.util.Arrays;
import java.util.Collections;
/**
* Configure class for generating a random number.
* This class creates a job configuration for generating a random number based on the partition ref.
*/
public class GenerateConfigure {
public static void main(String[] args) {
if (args.length < 1) {
System.err.println("Error: Partition ref is required");
System.exit(1);
}
List<JobConfig> configList = new ArrayList<>();
// Process each partition ref from input arguments
Arrays.stream(args).forEach(partitionRef -> {
// Create and populate JobConfig object
JobConfig config = new JobConfig();
config.outputs = Collections.singletonList(partitionRef);
config.args = Arrays.asList(partitionRef);
// inputs and env are already initialized as empty collections in the constructor
configList.add(config);
});
try {
ObjectMapper mapper = new ObjectMapper();
// Convert config list to JsonNode and serialize
JsonNode configNode = mapper.valueToTree(configList);
String jsonConfig = mapper.writeValueAsString(configNode);
System.out.println(jsonConfig);
} catch (Exception e) {
System.err.println("Error: Failed to validate or serialize config: " + e.getMessage());
System.exit(1);
}
}
}


@ -1,70 +0,0 @@
package com.databuild.examples.basic_graph;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Random;
/**
* Execute class for generating a random number.
* This class generates a random number based on the partition ref.
*/
public class GenerateExecute {
public static String BASE_PATH = "/tmp/databuild_test/examples/basic_graph/";
public static void main(String[] args) {
if (args.length < 1) {
System.err.println("Error: Partition ref (output path) is required");
System.exit(1);
}
String partitionRef = args[0];
try {
// Create a hash of the partition ref to use as a seed
MessageDigest md = MessageDigest.getInstance("MD5");
byte[] hashBytes = md.digest(partitionRef.getBytes(StandardCharsets.UTF_8));
// Convert the first 8 bytes of the hash to a long to use as a seed
long seed = 0;
for (int i = 0; i < Math.min(8, hashBytes.length); i++) {
seed = (seed << 8) | (hashBytes[i] & 0xff);
}
// Create a random number generator with the seed
Random random = new Random(seed);
// Generate a random number
int randomNumber = random.nextInt(100);
// Write the random number to the output file
// Ensure dir exists
java.io.File parent = new java.io.File(partitionRef).getParentFile();
if (parent != null) {
parent.mkdirs();
}
try (FileWriter writer = new FileWriter(partitionRef)) {
writer.write("Random number for partition " + partitionRef + ": " + randomNumber);
}
System.out.println("Generated random number " + randomNumber + " for partition " + partitionRef);
// Write the random number to the output file
String outputPath = partitionRef;
System.out.println("Writing random number " + randomNumber + " to " + outputPath);
// Ensure dir exists
new java.io.File(outputPath).getParentFile().mkdirs();
// Write number (overwrite)
try (FileWriter writer = new FileWriter(outputPath)) {
writer.write(String.valueOf(randomNumber));
}
} catch (NoSuchAlgorithmException | IOException e) {
System.err.println("Error: " + e.getMessage());
e.printStackTrace();
System.exit(1);
}
}
}


@ -1,58 +0,0 @@
package com.databuild.examples.basic_graph;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.JsonNode;
import java.io.File;
import java.util.Arrays;
import java.util.Collections;
import java.util.stream.Collectors;
import static com.databuild.examples.basic_graph.GenerateExecute.BASE_PATH;
/**
* Configure class for generating a random number.
* This class creates a job configuration for generating a random number based on the partition ref.
*/
public class SumConfigure {
public static void main(String[] args) {
if (args.length != 1) {
System.err.println("Error: Must provide exactly one partition ref as an argument");
System.exit(1);
}
String partitionRef = args[0];
String[] pathParts = partitionRef.split("/");
String[] upstreams = Arrays.stream(pathParts[pathParts.length - 1].split("_"))
.map(part -> BASE_PATH + "generated_number/" + part)
.toArray(String[]::new);
// Create and populate JobConfig object
JobConfig config = new JobConfig();
config.outputs = Collections.singletonList(BASE_PATH + "sum/" +partitionRef);
config.inputs = Arrays.stream(upstreams)
.map(upstream -> {
DataDep dep = new DataDep();
dep.depType = "materialize";
dep.ref = upstream;
return dep;
})
.collect(Collectors.toList());
config.args = Arrays.asList(upstreams);
// Create a hashmap for env with {"OUTPUT_REF": "foo"}
config.env = Collections.singletonMap("OUTPUT_REF", args[0]);
// inputs and env are already initialized as empty collections in the constructor
try {
ObjectMapper mapper = new ObjectMapper();
// Convert config to JsonNode and serialize
JsonNode configNode = mapper.valueToTree(Collections.singletonList(config));
String jsonConfig = mapper.writeValueAsString(configNode);
System.out.println(jsonConfig);
} catch (Exception e) {
System.err.println("Error: Failed to validate or serialize config: " + e.getMessage());
System.exit(1);
}
}
}


@ -1,44 +0,0 @@
package com.databuild.examples.basic_graph;
import static com.databuild.examples.basic_graph.GenerateExecute.BASE_PATH;
public class SumExecute {
public static void main(String[] args) {
if (args.length < 1) {
System.err.println("Error: Partition ref (output path) is required");
System.exit(1);
}
// Get output ref from env var OUTPUT_REF
String outputRef = System.getenv("OUTPUT_REF");
// For each arg, load it from the file system and add it to the sum
int sum = 0;
for (String partitionRef : args) {
try {
String path = partitionRef;
int partitionValue = Integer.parseInt(new String(java.nio.file.Files.readAllBytes(java.nio.file.Paths.get(path))));
System.out.println("Summing partition " + partitionRef + " with value " + partitionValue);
sum += partitionValue;
} catch (Exception e) {
System.err.println("Error: Failed to read partition " + partitionRef + ": " + e.getMessage());
e.printStackTrace();
}
}
System.out.println("Sum of " + args.length + " partitions: " + sum);
// Write the sum to the output file
String outPath = outputRef;
System.out.println("Writing sum " + sum + " to " + outPath);
java.io.File parent = new java.io.File(outPath).getParentFile();
if (parent != null) {
parent.mkdirs();
}
try (java.io.FileWriter writer = new java.io.FileWriter(outPath)) {
writer.write(String.valueOf(sum));
} catch (Exception e) {
System.err.println("Error: Failed to write sum to " + outputRef + ": " + e.getMessage());
}
}
}


@ -0,0 +1,117 @@
package com.databuild.examples.basic_graph;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.JsonNode;
import java.util.ArrayList;
import java.util.List;
import java.io.File;
import java.util.Arrays;
import java.util.Collections;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Random;
/**
* Unified job that handles both configuration and execution via subcommands.
*/
public class UnifiedGenerateNumber {
public static String BASE_PATH = "/tmp/databuild_test/examples/basic_graph/";
public static void main(String[] args) {
if (args.length < 1) {
System.err.println("Usage: UnifiedGenerateNumber {config|exec} [args...]");
System.exit(1);
}
String command = args[0];
switch (command) {
case "config":
handleConfig(Arrays.copyOfRange(args, 1, args.length));
break;
case "exec":
handleExec(Arrays.copyOfRange(args, 1, args.length));
break;
default:
System.err.println("Unknown command: " + command);
System.err.println("Usage: UnifiedGenerateNumber {config|exec} [args...]");
System.exit(1);
}
}
private static void handleConfig(String[] args) {
if (args.length < 1) {
System.err.println("Config mode requires partition ref");
System.exit(1);
}
String partitionRef = args[0];
try {
ObjectMapper mapper = new ObjectMapper();
// Create job configuration
var config = mapper.createObjectNode();
// Create outputs as PartitionRef objects
var outputs = mapper.createArrayNode();
var outputPartRef = mapper.createObjectNode();
outputPartRef.put("str", partitionRef);
outputs.add(outputPartRef);
config.set("outputs", outputs);
config.set("inputs", mapper.createArrayNode());
config.set("args", mapper.createArrayNode().add("will").add("generate").add(partitionRef));
config.set("env", mapper.createObjectNode().put("PARTITION_REF", partitionRef));
var response = mapper.createObjectNode();
response.set("configs", mapper.createArrayNode().add(config));
System.out.println(mapper.writeValueAsString(response));
} catch (Exception e) {
System.err.println("Error creating config: " + e.getMessage());
System.exit(1);
}
}
private static void handleExec(String[] args) {
if (args.length < 3) {
System.err.println("Execute mode requires: will generate <partition_ref>");
System.exit(1);
}
String partitionRef = args[2];
try {
// Generate a random number based on the partition ref
MessageDigest md = MessageDigest.getInstance("SHA-256");
byte[] hash = md.digest(partitionRef.getBytes(StandardCharsets.UTF_8));
long seed = 0;
for (int i = 0; i < 8; i++) {
seed = (seed << 8) | (hash[i] & 0xFF);
}
Random random = new Random(seed);
int randomNumber = random.nextInt(100) + 1;
// Write to file - partitionRef is the full path
File outputFile = new File(partitionRef);
File outputDir = outputFile.getParentFile();
if (outputDir != null) {
outputDir.mkdirs();
}
try (FileWriter writer = new FileWriter(outputFile)) {
writer.write(String.valueOf(randomNumber));
}
System.out.println("Generated number " + randomNumber + " for partition " + partitionRef);
} catch (Exception e) {
System.err.println("Error in execution: " + e.getMessage());
System.exit(1);
}
}
}


@ -0,0 +1,137 @@
package com.databuild.examples.basic_graph;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.JsonNode;
import java.io.File;
import java.util.Arrays;
import java.util.Collections;
import java.util.stream.Collectors;
import java.io.FileWriter;
import java.io.IOException;
// import static com.databuild.examples.basic_graph.GenerateExecute.BASE_PATH;
/**
* Unified sum job that handles both configuration and execution via subcommands.
*/
public class UnifiedSum {
public static String BASE_PATH = "/tmp/databuild_test/examples/basic_graph/";
public static void main(String[] args) {
if (args.length < 1) {
System.err.println("Usage: UnifiedSum {config|exec} [args...]");
System.exit(1);
}
String command = args[0];
switch (command) {
case "config":
handleConfig(Arrays.copyOfRange(args, 1, args.length));
break;
case "exec":
handleExec(Arrays.copyOfRange(args, 1, args.length));
break;
default:
System.err.println("Unknown command: " + command);
System.err.println("Usage: UnifiedSum {config|exec} [args...]");
System.exit(1);
}
}
private static void handleConfig(String[] args) {
if (args.length != 1) {
System.err.println("Config mode requires exactly one partition ref");
System.exit(1);
}
String partitionRef = args[0];
String[] pathParts = partitionRef.split("/");
String[] upstreams = Arrays.stream(pathParts[pathParts.length - 1].split("_"))
.map(part -> BASE_PATH + "generated_number/" + part)
.toArray(String[]::new);
try {
ObjectMapper mapper = new ObjectMapper();
// Create data dependencies
var inputs = mapper.createArrayNode();
for (String upstream : upstreams) {
var dataDep = mapper.createObjectNode();
dataDep.put("dep_type", 0); // QUERY
var partRef = mapper.createObjectNode();
partRef.put("str", upstream);
dataDep.set("partition_ref", partRef);
inputs.add(dataDep);
}
// Create job configuration
var config = mapper.createObjectNode();
// Create outputs as PartitionRef objects
var outputs = mapper.createArrayNode();
var outputPartRef = mapper.createObjectNode();
outputPartRef.put("str", partitionRef);
outputs.add(outputPartRef);
config.set("outputs", outputs);
config.set("inputs", inputs);
var argsArray = mapper.createArrayNode();
for (String upstream : upstreams) {
argsArray.add(upstream);
}
config.set("args", argsArray);
config.set("env", mapper.createObjectNode().put("OUTPUT_REF", partitionRef));
var response = mapper.createObjectNode();
response.set("configs", mapper.createArrayNode().add(config));
System.out.println(mapper.writeValueAsString(response));
} catch (Exception e) {
System.err.println("Error creating config: " + e.getMessage());
System.exit(1);
}
}
private static void handleExec(String[] args) {
// Get output ref from env var OUTPUT_REF
String outputRef = System.getenv("OUTPUT_REF");
if (outputRef == null) {
System.err.println("Error: OUTPUT_REF environment variable is required");
System.exit(1);
}
// For each arg, load it from the file system and add it to the sum
int sum = 0;
for (String partitionRef : args) {
try {
String path = partitionRef;
int partitionValue = Integer.parseInt(new String(java.nio.file.Files.readAllBytes(java.nio.file.Paths.get(path))));
System.out.println("Summing partition " + partitionRef + " with value " + partitionValue);
sum += partitionValue;
} catch (Exception e) {
System.err.println("Error: Failed to read partition " + partitionRef + ": " + e.getMessage());
e.printStackTrace();
System.exit(1);
}
}
System.out.println("Sum of " + args.length + " partitions: " + sum);
// Write the sum to the output file
try {
File outputDir = new File(outputRef).getParentFile();
if (outputDir != null) {
outputDir.mkdirs();
}
try (FileWriter writer = new FileWriter(outputRef)) {
writer.write(String.valueOf(sum));
}
System.out.println("Wrote sum " + sum + " to " + outputRef);
} catch (Exception e) {
System.err.println("Error writing output: " + e.getMessage());
System.exit(1);
}
}
}


@ -5,11 +5,11 @@ set -e
generate_number_job.cfg /tmp/databuild_test/examples/basic_graph/generated_number/pippin /tmp/databuild_test/examples/basic_graph/generated_number/salem /tmp/databuild_test/examples/basic_graph/generated_number/sadie
# Test run
generate_number_job.cfg /tmp/databuild_test/examples/basic_graph/generated_number/pippin | jq -c ".[0]" | generate_number_job.exec
generate_number_job.cfg /tmp/databuild_test/examples/basic_graph/generated_number/pippin | jq -c ".configs[0]" | generate_number_job.exec
# Validate that contents of pippin is 83
if [[ "$(cat /tmp/databuild_test/examples/basic_graph/generated_number/pippin)" != "83" ]]; then
echo "Assertion failed: File does not contain 83"
# Validate that contents of pippin is 1 (deterministic based on SHA-256 hash)
if [[ "$(cat /tmp/databuild_test/examples/basic_graph/generated_number/pippin)" != "1" ]]; then
echo "Assertion failed: File does not contain 1"
cat /tmp/databuild_test/examples/basic_graph/generated_number/pippin
exit 1
fi


@ -13,7 +13,7 @@ echo -n 83 > /tmp/databuild_test/examples/basic_graph/generated_number/pippin
echo -n 34 > /tmp/databuild_test/examples/basic_graph/generated_number/salem
echo -n 19 > /tmp/databuild_test/examples/basic_graph/generated_number/sadie
sum_job.cfg /tmp/databuild_test/examples/basic_graph/sum/pippin_salem_sadie | jq -c ".[0]" | sum_job.exec
sum_job.cfg /tmp/databuild_test/examples/basic_graph/sum/pippin_salem_sadie | jq -c ".configs[0]" | sum_job.exec
# Validate that contents of output is 136
if [[ "$(cat /tmp/databuild_test/examples/basic_graph/sum/pippin_salem_sadie)" != "136" ]]; then


@ -2,17 +2,11 @@ load("@databuild//databuild:rules.bzl", "databuild_job")
databuild_job(
name = "test_job",
configure = ":test_job_configure",
execute = ":test_job_execute",
binary = ":test_job_binary",
visibility = ["//visibility:public"],
)
sh_binary(
name = "test_job_configure",
srcs = ["configure.sh"],
)
sh_binary(
name = "test_job_execute",
srcs = ["execute.sh"],
name = "test_job_binary",
srcs = ["unified_job.sh"],
)


@ -1,2 +0,0 @@
# Create a test job config
echo "{\"configs\":[{\"outputs\":[\"$1\"],\"inputs\":[],\"args\":[\"will\", \"build\", \"$1\"],\"env\":{\"foo\":\"bar\"}}]}"


@ -1,3 +0,0 @@
echo 'EXECUTE!'
echo "foo=$foo"
echo "args=$@"


@ -0,0 +1,21 @@
#!/bin/bash
# Simple unified job that handles both config and exec via subcommands
case "${1:-}" in
"config")
# Configuration mode - output job config JSON
partition_ref="${2:-}"
echo "{\"configs\":[{\"outputs\":[{\"str\":\"${partition_ref}\"}],\"inputs\":[],\"args\":[\"will\", \"build\", \"${partition_ref}\"],\"env\":{\"foo\":\"bar\"}}]}"
;;
"exec")
# Execution mode - run the job
echo 'EXECUTE UNIFIED!'
echo "foo=$foo"
echo "args=$@"
;;
*)
echo "Usage: $0 {config|exec} [args...]"
exit 1
;;
esac


@ -1,5 +1,5 @@
load("//:py_repl.bzl", "py_repl")
load("@databuild//databuild:rules.bzl", "databuild_job")
load("@databuild//databuild:rules.bzl", "databuild_job", "databuild_graph")
load("@rules_python//python:pip.bzl", "compile_pip_requirements")
load("@pypi//:requirements.bzl", "requirement")
@ -9,22 +9,174 @@ compile_pip_requirements(
requirements_txt = "requirements_lock.txt",
)
# Podcast Reviews Graph
databuild_graph(
name = "podcast_reviews_graph",
jobs = [
":extract_reviews_job",
":extract_podcasts_job",
":categorize_reviews_job",
":phrase_modeling_job",
":phrase_stats_job",
":daily_summary_job",
],
lookup = ":job_lookup",
visibility = ["//visibility:public"],
)
py_binary(
name = "job_lookup",
srcs = ["job_lookup.py"],
main = "job_lookup.py",
)
# Extract Reviews Job
databuild_job(
name = "extract_reviews_job",
binary = ":extract_reviews_binary",
visibility = ["//visibility:public"],
)
py_binary(
name = "extract_reviews_binary",
srcs = ["extract_reviews_job.py", "duckdb_utils.py"],
main = "extract_reviews_job.py",
deps = [
requirement("duckdb"),
requirement("pydantic"),
requirement("pandas"),
requirement("pyarrow"),
],
)
# Extract Podcasts Job
databuild_job(
name = "extract_podcasts_job",
binary = ":extract_podcasts_binary",
visibility = ["//visibility:public"],
)
py_binary(
name = "extract_podcasts_binary",
srcs = ["extract_podcasts_job.py", "duckdb_utils.py"],
main = "extract_podcasts_job.py",
deps = [
requirement("duckdb"),
requirement("pydantic"),
requirement("pandas"),
requirement("pyarrow"),
],
)
# Categorize Reviews Job
databuild_job(
name = "categorize_reviews_job",
binary = ":categorize_reviews_binary",
visibility = ["//visibility:public"],
)
py_binary(
name = "categorize_reviews_binary",
srcs = ["categorize_reviews_job.py", "duckdb_utils.py"],
main = "categorize_reviews_job.py",
deps = [
requirement("duckdb"),
requirement("pydantic"),
requirement("pandas"),
requirement("pyarrow"),
],
)
# Phrase Modeling Job
databuild_job(
name = "phrase_modeling_job",
binary = ":phrase_modeling_binary",
visibility = ["//visibility:public"],
)
py_binary(
name = "phrase_modeling_binary",
srcs = ["phrase_modeling_job.py", "duckdb_utils.py"],
main = "phrase_modeling_job.py",
deps = [
requirement("duckdb"),
requirement("pydantic"),
requirement("pandas"),
requirement("pyarrow"),
],
)
# Phrase Stats Job
databuild_job(
name = "phrase_stats_job",
binary = ":phrase_stats_binary",
visibility = ["//visibility:public"],
)
py_binary(
name = "phrase_stats_binary",
srcs = ["phrase_stats_job.py", "duckdb_utils.py"],
main = "phrase_stats_job.py",
deps = [
requirement("duckdb"),
requirement("pydantic"),
requirement("pandas"),
requirement("pyarrow"),
],
)
# Daily Summary Job
databuild_job(
name = "daily_summary_job",
binary = ":daily_summary_binary",
visibility = ["//visibility:public"],
)
py_binary(
name = "daily_summary_binary",
srcs = ["daily_summary_job.py", "duckdb_utils.py"],
main = "daily_summary_job.py",
deps = [
requirement("duckdb"),
requirement("pydantic"),
requirement("pandas"),
requirement("pyarrow"),
],
)
# Legacy test job (kept for compatibility)
databuild_job(
name = "test_job",
configure = ":test_job_configure",
execute = ":test_job_execute",
binary = ":test_job_binary",
)
py_binary(
name = "test_job_configure",
srcs = ["configure.py"],
main = "configure.py",
name = "test_job_binary",
srcs = ["unified_job.py"],
main = "unified_job.py",
)
# Test target
py_binary(
name = "test_job_execute",
srcs = ["execute.py"],
main = "execute.py",
name = "test_jobs",
srcs = [
"test_jobs.py",
"extract_reviews_job.py",
"extract_podcasts_job.py",
"categorize_reviews_job.py",
"phrase_modeling_job.py",
"phrase_stats_job.py",
"daily_summary_job.py",
"job_lookup.py",
"duckdb_utils.py",
],
main = "test_jobs.py",
deps = [
requirement("duckdb"),
requirement("pydantic"),
requirement("pandas"),
requirement("pyarrow"),
],
)
py_repl(

File diff suppressed because one or more lines are too long


@ -21,3 +21,7 @@ flowchart LR
## Input Data
Get it from [here](https://www.kaggle.com/datasets/thoughtvector/podcastreviews/versions/28?select=database.sqlite)! (and put it in `examples/podcast_reviews/data/ingest/database.sqlite`)
## `phrase` Dependency
This relies on [`soaxelbrooke/phrase`](https://github.com/soaxelbrooke/phrase) for phrase extraction - check out its [releases](https://github.com/soaxelbrooke/phrase/releases) to get a relevant binary.


@ -0,0 +1,202 @@
#!/usr/bin/env python3
import sys
import json
import os
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Any
import re
from duckdb_utils import create_duckdb_connection, read_dataframe_with_fallback, save_dataframe_with_fallback
def main():
if len(sys.argv) < 2:
print("Usage: categorize_reviews_job.py {config|exec} [args...]", file=sys.stderr)
sys.exit(1)
command = sys.argv[1]
if command == "config":
handle_config(sys.argv[2:])
elif command == "exec":
handle_exec(sys.argv[2:])
else:
print(f"Unknown command: {command}", file=sys.stderr)
sys.exit(1)
def parse_partition_ref(partition_ref: str) -> Dict[str, str]:
"""Parse partition ref like 'categorized_reviews/category=comedy/date=2020-01-01' into components."""
match = re.match(r'categorized_reviews/category=([^/]+)/date=(\d{4}-\d{2}-\d{2})', partition_ref)
if not match:
raise ValueError(f"Invalid partition ref format: {partition_ref}")
return {"category": match.group(1), "date": match.group(2)}
def handle_config(args):
if len(args) < 1:
print("Config mode requires partition ref", file=sys.stderr)
sys.exit(1)
configs = []
# Process each partition reference
for partition_ref in args:
try:
parsed = parse_partition_ref(partition_ref)
category = parsed["category"]
date_str = parsed["date"]
except ValueError as e:
print(f"Error parsing partition ref: {e}", file=sys.stderr)
sys.exit(1)
# Dependencies: reviews for the date and podcast metadata
reviews_ref = f"reviews/date={date_str}"
podcasts_ref = "podcasts/all"
configs.append({
"outputs": [{"str": partition_ref}],
"inputs": [
{"dep_type": 1, "partition_ref": {"str": reviews_ref}},
{"dep_type": 1, "partition_ref": {"str": podcasts_ref}}
],
"args": [category, date_str],
"env": {
"PARTITION_REF": partition_ref,
"TARGET_CATEGORY": category,
"TARGET_DATE": date_str
}
})
config = {"configs": configs}
print(json.dumps(config))
def handle_exec(args):
if len(args) < 2:
print("Exec mode requires category and date arguments", file=sys.stderr)
sys.exit(1)
target_category = args[0]
target_date = args[1]
partition_ref = os.getenv('PARTITION_REF', f'categorized_reviews/category={target_category}/date={target_date}')
# Input paths - check for both parquet and CSV fallbacks
reviews_base = f"/tmp/databuild_test/examples/podcast_reviews/reviews/date={target_date}/reviews"
podcasts_base = "/tmp/databuild_test/examples/podcast_reviews/podcasts/podcasts"
reviews_file = None
podcasts_file = None
# Find reviews file (parquet or CSV)
for ext in ['.parquet', '.csv']:
candidate = reviews_base + ext
if os.path.exists(candidate):
reviews_file = candidate
break
# Find podcasts file (parquet or CSV)
for ext in ['.parquet', '.csv']:
candidate = podcasts_base + ext
if os.path.exists(candidate):
podcasts_file = candidate
break
if not reviews_file:
print(f"Reviews file not found: {reviews_base}.(parquet|csv)", file=sys.stderr)
sys.exit(1)
if not podcasts_file:
print(f"Podcasts file not found: {podcasts_base}.(parquet|csv)", file=sys.stderr)
sys.exit(1)
# Output path
output_dir = Path(f"/tmp/databuild_test/examples/podcast_reviews/categorized_reviews/category={target_category}/date={target_date}")
output_dir.mkdir(parents=True, exist_ok=True)
output_file = output_dir / "categorized_reviews.parquet"
try:
# Categorize reviews by joining with podcast metadata
categorize_reviews_for_category_date(reviews_file, podcasts_file, target_category, str(output_file))
print(f"Successfully categorized reviews for category {target_category} on {target_date}")
print(f"Output written to: {output_file}")
# Create manifest
manifest = {
"outputs": [{"str": partition_ref}],
"inputs": [
{"str": f"reviews/date={target_date}"},
{"str": "podcasts/all"}
],
"start_time": datetime.now().isoformat(),
"end_time": datetime.now().isoformat(),
"task": {
"job": {"label": "//examples/podcast_reviews:categorize_reviews_job"},
"config": {
"outputs": [{"str": partition_ref}],
"inputs": [
{"dep_type": 1, "partition_ref": {"str": f"reviews/date={target_date}"}},
{"dep_type": 1, "partition_ref": {"str": "podcasts/all"}}
],
"args": [target_category, target_date],
"env": {"PARTITION_REF": partition_ref, "TARGET_CATEGORY": target_category, "TARGET_DATE": target_date}
}
}
}
manifest_file = output_dir / "manifest.json"
with open(manifest_file, 'w') as f:
json.dump(manifest, f, indent=2)
except Exception as e:
print(f"Error categorizing reviews: {e}", file=sys.stderr)
sys.exit(1)
def categorize_reviews_for_category_date(reviews_file: str, podcasts_file: str, target_category: str, output_file: str):
"""Join reviews with podcast categories and filter for target category."""
# Connect to DuckDB with extension handling
duckdb_conn = create_duckdb_connection()
try:
# Read input files with fallback handling
reviews_df = read_dataframe_with_fallback(reviews_file, duckdb_conn)
podcasts_df = read_dataframe_with_fallback(podcasts_file, duckdb_conn)
# Perform join and filtering in pandas
import pandas as pd
# Join reviews with podcasts
joined_df = reviews_df.merge(podcasts_df, on='podcast_id', how='inner')
# Filter by category
filtered_df = joined_df[
(joined_df['primary_category'] == target_category) |
(joined_df['all_categories'].str.contains(target_category, na=False))
].copy()
# Add target category column
filtered_df['target_category'] = target_category
# Select and rename columns to match expected output
result_df = filtered_df[[
'podcast_id', 'review_title', 'content', 'rating', 'author_id',
'created_at', 'review_date', 'title', 'primary_category',
'all_categories', 'target_category'
]].rename(columns={'title': 'podcast_title'})
# Sort by created_at
result_df = result_df.sort_values('created_at')
# Save to parquet with fallback
save_dataframe_with_fallback(result_df, output_file, duckdb_conn, "parquet")
row_count = len(result_df)
print(f"Categorized {row_count} reviews for category '{target_category}'")
if row_count == 0:
print(f"Warning: No reviews found for category '{target_category}'")
finally:
duckdb_conn.close()
if __name__ == "__main__":
main()


@ -1,2 +0,0 @@
print("Hello, gorgeous.")


@ -0,0 +1,313 @@
#!/usr/bin/env python3
import sys
import json
import os
import duckdb
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Any
import re
def main():
if len(sys.argv) < 2:
print("Usage: daily_summary_job.py {config|exec} [args...]", file=sys.stderr)
sys.exit(1)
command = sys.argv[1]
if command == "config":
handle_config(sys.argv[2:])
elif command == "exec":
handle_exec(sys.argv[2:])
else:
print(f"Unknown command: {command}", file=sys.stderr)
sys.exit(1)
def parse_partition_ref(partition_ref: str) -> Dict[str, str]:
"""Parse partition ref like 'daily_summaries/category=comedy/date=2020-01-01' into components."""
match = re.match(r'daily_summaries/category=([^/]+)/date=(\d{4}-\d{2}-\d{2})', partition_ref)
if not match:
raise ValueError(f"Invalid partition ref format: {partition_ref}")
return {"category": match.group(1), "date": match.group(2)}
def handle_config(args):
if len(args) < 1:
print("Config mode requires partition ref", file=sys.stderr)
sys.exit(1)
configs = []
# Process each partition reference
for partition_ref in args:
try:
parsed = parse_partition_ref(partition_ref)
category = parsed["category"]
date_str = parsed["date"]
except ValueError as e:
print(f"Error parsing partition ref: {e}", file=sys.stderr)
sys.exit(1)
# Dependencies: phrase stats and categorized reviews for the category and date
phrase_stats_ref = f"phrase_stats/category={category}/date={date_str}"
categorized_reviews_ref = f"categorized_reviews/category={category}/date={date_str}"
configs.append({
"outputs": [{"str": partition_ref}],
"inputs": [
{"dep_type": 1, "partition_ref": {"str": phrase_stats_ref}},
{"dep_type": 1, "partition_ref": {"str": categorized_reviews_ref}}
],
"args": [category, date_str],
"env": {
"PARTITION_REF": partition_ref,
"TARGET_CATEGORY": category,
"TARGET_DATE": date_str
}
})
config = {"configs": configs}
print(json.dumps(config))
def handle_exec(args):
if len(args) < 2:
print("Exec mode requires category and date arguments", file=sys.stderr)
sys.exit(1)
target_category = args[0]
target_date = args[1]
partition_ref = os.getenv('PARTITION_REF', f'daily_summaries/category={target_category}/date={target_date}')
# Input paths
phrase_stats_file = f"/tmp/databuild_test/examples/podcast_reviews/phrase_stats/category={target_category}/date={target_date}/phrase_stats.parquet"
categorized_reviews_file = f"/tmp/databuild_test/examples/podcast_reviews/categorized_reviews/category={target_category}/date={target_date}/categorized_reviews.parquet"
# Check input files exist
if not os.path.exists(phrase_stats_file):
print(f"Phrase stats file not found: {phrase_stats_file}", file=sys.stderr)
sys.exit(1)
if not os.path.exists(categorized_reviews_file):
print(f"Categorized reviews file not found: {categorized_reviews_file}", file=sys.stderr)
sys.exit(1)
# Output path
output_dir = Path(f"/tmp/databuild_test/examples/podcast_reviews/daily_summaries/category={target_category}/date={target_date}")
output_dir.mkdir(parents=True, exist_ok=True)
output_file = output_dir / "daily_summary.parquet"
try:
# Generate daily summary combining phrase stats and recent reviews
generate_daily_summary_for_category_date(
phrase_stats_file,
categorized_reviews_file,
target_category,
target_date,
str(output_file)
)
print(f"Successfully generated daily summary for category {target_category} on {target_date}")
print(f"Output written to: {output_file}")
# Create manifest
manifest = {
"outputs": [{"str": partition_ref}],
"inputs": [
{"str": f"phrase_stats/category={target_category}/date={target_date}"},
{"str": f"categorized_reviews/category={target_category}/date={target_date}"}
],
"start_time": datetime.now().isoformat(),
"end_time": datetime.now().isoformat(),
"task": {
"job": {"label": "//examples/podcast_reviews:daily_summary_job"},
"config": {
"outputs": [{"str": partition_ref}],
"inputs": [
{"dep_type": 1, "partition_ref": {"str": f"phrase_stats/category={target_category}/date={target_date}"}},
{"dep_type": 1, "partition_ref": {"str": f"categorized_reviews/category={target_category}/date={target_date}"}}
],
"args": [target_category, target_date],
"env": {"PARTITION_REF": partition_ref, "TARGET_CATEGORY": target_category, "TARGET_DATE": target_date}
}
}
}
manifest_file = output_dir / "manifest.json"
with open(manifest_file, 'w') as f:
json.dump(manifest, f, indent=2)
except Exception as e:
print(f"Error generating daily summary: {e}", file=sys.stderr)
sys.exit(1)
def generate_daily_summary_for_category_date(
phrase_stats_file: str,
categorized_reviews_file: str,
target_category: str,
target_date: str,
output_file: str
):
"""Generate daily summary combining top phrases and recent reviews."""
# Connect to DuckDB for processing
duckdb_conn = duckdb.connect()
try:
# Try to install and load parquet extension, but don't fail if it's already installed
try:
duckdb_conn.execute("INSTALL parquet")
except Exception:
pass # Extension might already be installed
duckdb_conn.execute("LOAD parquet")
# Check if we have data
phrase_count = duckdb_conn.execute(f"SELECT COUNT(*) FROM parquet_scan('{phrase_stats_file}')").fetchone()[0]
review_count = duckdb_conn.execute(f"SELECT COUNT(*) FROM parquet_scan('{categorized_reviews_file}')").fetchone()[0]
if phrase_count == 0 and review_count == 0:
print(f"No data found, creating empty daily summary")
create_empty_daily_summary(target_category, target_date, output_file, duckdb_conn)
return
# Query to generate comprehensive daily summary
query = f"""
WITH top_phrases_per_podcast AS (
SELECT
podcast_id,
podcast_title,
ngram,
count as phrase_count,
avg_rating as phrase_avg_rating,
weighted_score,
ROW_NUMBER() OVER (PARTITION BY podcast_id ORDER BY weighted_score DESC) as phrase_rank
FROM parquet_scan('{phrase_stats_file}')
WHERE ngram IS NOT NULL
),
podcast_phrase_summary AS (
SELECT
podcast_id,
podcast_title,
STRING_AGG(ngram, '; ' ORDER BY weighted_score DESC) as top_phrases,
COUNT(*) as total_phrases,
AVG(phrase_avg_rating) as avg_phrase_rating,
SUM(weighted_score) as total_phrase_score
FROM top_phrases_per_podcast
WHERE phrase_rank <= 5 -- Top 5 phrases per podcast
GROUP BY podcast_id, podcast_title
),
podcast_review_summary AS (
SELECT
podcast_id,
podcast_title,
COUNT(*) as review_count,
AVG(rating::FLOAT) as avg_rating,
MIN(rating) as min_rating,
MAX(rating) as max_rating,
COUNT(CASE WHEN rating >= 4 THEN 1 END) as positive_reviews,
COUNT(CASE WHEN rating <= 2 THEN 1 END) as negative_reviews,
STRING_AGG(
CASE WHEN rating <= 2 AND length(content) > 20
THEN substring(content, 1, 200) || '...'
ELSE NULL
END,
' | '
ORDER BY rating ASC, length(content) DESC
) as sample_negative_reviews
FROM parquet_scan('{categorized_reviews_file}')
WHERE podcast_title IS NOT NULL
GROUP BY podcast_id, podcast_title
),
daily_summary AS (
SELECT
'{target_date}' as date,
'{target_category}' as category,
COALESCE(pps.podcast_id, prs.podcast_id) as podcast_id,
COALESCE(pps.podcast_title, prs.podcast_title) as podcast_title,
COALESCE(prs.review_count, 0) as review_count,
COALESCE(prs.avg_rating, 0.0) as avg_rating,
COALESCE(prs.positive_reviews, 0) as positive_reviews,
COALESCE(prs.negative_reviews, 0) as negative_reviews,
COALESCE(pps.top_phrases, 'No significant phrases') as top_phrases,
COALESCE(pps.total_phrases, 0) as total_phrases,
COALESCE(pps.avg_phrase_rating, 0.0) as avg_phrase_rating,
COALESCE(pps.total_phrase_score, 0.0) as total_phrase_score,
prs.sample_negative_reviews,
CASE
WHEN prs.avg_rating >= 4.0 AND pps.avg_phrase_rating >= 4.0 THEN 'Highly Positive'
WHEN prs.avg_rating >= 3.5 THEN 'Positive'
WHEN prs.avg_rating >= 2.5 THEN 'Mixed'
WHEN prs.avg_rating >= 1.5 THEN 'Negative'
ELSE 'Highly Negative'
END as sentiment_category,
(prs.review_count * prs.avg_rating * 0.6 + pps.total_phrase_score * 0.4) as overall_score
FROM podcast_phrase_summary pps
FULL OUTER JOIN podcast_review_summary prs
ON pps.podcast_id = prs.podcast_id
WHERE COALESCE(prs.review_count, 0) > 0 OR COALESCE(pps.total_phrases, 0) > 0
)
SELECT
date,
category,
podcast_id,
podcast_title,
review_count,
avg_rating,
positive_reviews,
negative_reviews,
top_phrases,
total_phrases,
avg_phrase_rating,
total_phrase_score,
sample_negative_reviews,
sentiment_category,
overall_score
FROM daily_summary
ORDER BY overall_score DESC, review_count DESC
"""
# Execute query and save to parquet
duckdb_conn.execute(f"COPY ({query}) TO '{output_file}' (FORMAT PARQUET)")
# Get row count for logging
count_result = duckdb_conn.execute(f"SELECT COUNT(*) FROM ({query})").fetchone()
row_count = count_result[0] if count_result else 0
print(f"Generated daily summary for {row_count} podcasts")
if row_count == 0:
print(f"Warning: No summary data generated for category '{target_category}' on date '{target_date}'")
create_empty_daily_summary(target_category, target_date, output_file, duckdb_conn)
finally:
duckdb_conn.close()
def create_empty_daily_summary(category: str, date: str, output_file: str, duckdb_conn):
"""Create empty daily summary parquet file with correct schema."""
duckdb_conn.execute("DROP TABLE IF EXISTS empty_daily_summary")
duckdb_conn.execute("""
CREATE TABLE empty_daily_summary (
date VARCHAR,
category VARCHAR,
podcast_id VARCHAR,
podcast_title VARCHAR,
review_count BIGINT,
avg_rating DOUBLE,
positive_reviews BIGINT,
negative_reviews BIGINT,
top_phrases VARCHAR,
total_phrases BIGINT,
avg_phrase_rating DOUBLE,
total_phrase_score DOUBLE,
sample_negative_reviews VARCHAR,
sentiment_category VARCHAR,
overall_score DOUBLE
)
""")
duckdb_conn.execute(f"COPY empty_daily_summary TO '{output_file}' (FORMAT PARQUET)")
print("Created empty daily summary file")
if __name__ == "__main__":
main()


@ -0,0 +1,184 @@
#!/usr/bin/env python3
"""
Centralized DuckDB utilities for handling extension issues in isolated environments.
"""
import duckdb
import sqlite3
import pandas as pd
from pathlib import Path
from typing import Any, Dict, List, Optional
import warnings
def create_duckdb_connection(enable_extensions: bool = True) -> duckdb.DuckDBPyConnection:
"""
Create a DuckDB connection with proper extension handling for isolated environments.
Args:
enable_extensions: Whether to try enabling extensions (may fail in isolated environments)
Returns:
DuckDB connection object
"""
conn = duckdb.connect()
if enable_extensions:
with warnings.catch_warnings():
warnings.simplefilter("ignore")
try:
# Try to enable extensions, but don't fail if not available
conn.execute("SET autoinstall_known_extensions=1")
conn.execute("SET autoload_known_extensions=1")
except Exception:
# Extensions not available, will use fallback methods
pass
return conn
def sqlite_to_dataframe(sqlite_path: str, query: str, params: Optional[List[Any]] = None) -> pd.DataFrame:
"""
Execute a SQLite query and return results as a pandas DataFrame.
This is a fallback when DuckDB's sqlite_scan doesn't work.
Args:
sqlite_path: Path to SQLite database
query: SQL query to execute
params: Query parameters
Returns:
DataFrame with query results
"""
conn = sqlite3.connect(sqlite_path)
try:
if params:
df = pd.read_sql_query(query, conn, params=params)
else:
df = pd.read_sql_query(query, conn)
return df
finally:
conn.close()
def execute_query_with_fallback(
duckdb_conn: duckdb.DuckDBPyConnection,
sqlite_path: str,
query: str,
params: Optional[List[Any]] = None,
use_sqlite_scan: bool = True
) -> pd.DataFrame:
"""
Execute a query using DuckDB's sqlite_scan if available, otherwise fall back to direct SQLite access.
Args:
duckdb_conn: DuckDB connection
sqlite_path: Path to SQLite database
query: SQL query (should work with both sqlite_scan and direct SQLite)
params: Query parameters
use_sqlite_scan: Whether to try sqlite_scan first
Returns:
DataFrame with query results
"""
if use_sqlite_scan:
try:
# Try using DuckDB's sqlite_scan
if params:
result = duckdb_conn.execute(query, params).df()
else:
result = duckdb_conn.execute(query).df()
return result
except Exception as e:
print(f"sqlite_scan failed: {e}, falling back to direct SQLite access")
# Fallback: Use direct SQLite access
# Convert DuckDB sqlite_scan query to regular SQLite query
fallback_query = query.replace("sqlite_scan(?, 'reviews')", "reviews")
fallback_query = fallback_query.replace("sqlite_scan(?, 'podcasts')", "podcasts")
fallback_query = fallback_query.replace("sqlite_scan(?, 'categories')", "categories")
# Remove the sqlite_path parameter since we're connecting directly
if params and len(params) > 0 and params[0] == sqlite_path:
fallback_params = params[1:]
else:
fallback_params = params
return sqlite_to_dataframe(sqlite_path, fallback_query, fallback_params)
def save_dataframe_with_fallback(
df: pd.DataFrame,
output_path: str,
duckdb_conn: Optional[duckdb.DuckDBPyConnection] = None,
format: str = "parquet"
) -> None:
"""
Save a DataFrame to the specified format, with fallback options if DuckDB extensions fail.
Args:
df: DataFrame to save
output_path: Output file path
duckdb_conn: Optional DuckDB connection (for parquet)
format: Output format ('parquet' or 'csv')
"""
output_path = Path(output_path)
if format.lower() == "parquet":
try:
if duckdb_conn:
# Try using DuckDB to write parquet
duckdb_conn.register('temp_df', df)
duckdb_conn.execute(f"COPY temp_df TO '{output_path}' (FORMAT PARQUET)")
return
except Exception as e:
print(f"DuckDB parquet write failed: {e}, falling back to pandas")
try:
# Fallback to pandas parquet (requires pyarrow)
df.to_parquet(output_path, index=False)
return
except Exception as e:
print(f"Pandas parquet write failed: {e}, falling back to CSV")
# Change extension to CSV and fall through
output_path = output_path.with_suffix('.csv')
format = "csv"
if format.lower() == "csv":
df.to_csv(output_path, index=False)
def read_dataframe_with_fallback(
file_path: str,
duckdb_conn: Optional[duckdb.DuckDBPyConnection] = None
) -> pd.DataFrame:
"""
Read a DataFrame from file with fallback options.
Args:
file_path: Path to input file
duckdb_conn: Optional DuckDB connection
Returns:
DataFrame with file contents
"""
file_path = Path(file_path)
if file_path.suffix.lower() == '.parquet':
try:
if duckdb_conn:
# Try using DuckDB to read parquet
return duckdb_conn.execute(f"SELECT * FROM parquet_scan('{file_path}')").df()
except Exception:
pass
try:
# Fallback to pandas
return pd.read_parquet(file_path)
except Exception:
# Try CSV fallback
csv_path = file_path.with_suffix('.csv')
if csv_path.exists():
return pd.read_csv(csv_path)
raise
elif file_path.suffix.lower() == '.csv':
return pd.read_csv(file_path)
else:
raise ValueError(f"Unsupported file format: {file_path.suffix}")

View file

@ -1,2 +0,0 @@
print("What a time to be alive.")

View file

@ -0,0 +1,246 @@
#!/usr/bin/env python3
import sys
import json
import os
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Any
from duckdb_utils import create_duckdb_connection, execute_query_with_fallback, save_dataframe_with_fallback
def main():
# Write debug at the very start to see if main is called
debug_file = "/tmp/databuild_test/podcasts_main_debug.log"
try:
with open(debug_file, "w") as f:
f.write(f"main() called with sys.argv: {sys.argv}\n")
f.flush()
    except Exception:
pass
if len(sys.argv) < 2:
print("Usage: extract_podcasts_job.py {config|exec} [args...]", file=sys.stderr)
sys.exit(1)
command = sys.argv[1]
try:
with open(debug_file, "a") as f:
f.write(f"command: {command}\n")
f.flush()
    except Exception:
pass
if command == "config":
handle_config(sys.argv[2:])
elif command == "exec":
handle_exec(sys.argv[2:])
else:
print(f"Unknown command: {command}", file=sys.stderr)
sys.exit(1)
def handle_config(args):
if len(args) < 1:
print("Config mode requires partition ref", file=sys.stderr)
sys.exit(1)
partition_ref = args[0]
# This job produces a single partition with all podcast metadata
if partition_ref != "podcasts/all":
print(f"Invalid partition ref: {partition_ref}. Expected 'podcasts/all'", file=sys.stderr)
sys.exit(1)
config = {
"configs": [{
"outputs": [{"str": partition_ref}],
"inputs": [],
"args": ["all"],
"env": {
"PARTITION_REF": partition_ref
}
}]
}
print(json.dumps(config))
def handle_exec(args):
# Write debug info to a file since stdout might not be captured
debug_file = "/tmp/databuild_test/podcasts_debug.log"
with open(debug_file, "w") as f:
f.write(f"Starting extract_podcasts_job.exec with args: {args}\n")
f.flush()
print(f"Starting extract_podcasts_job.exec with args: {args}")
partition_ref = os.getenv('PARTITION_REF', 'podcasts/all')
print(f"Partition ref: {partition_ref}")
with open(debug_file, "a") as f:
f.write(f"Partition ref: {partition_ref}\n")
f.flush()
# Database paths
db_path = "/tmp/databuild_test/examples/podcast_reviews/data/ingest/database.sqlite"
if not os.path.exists(db_path):
# Fallback to relative path for development
db_path = "data/ingest/database.sqlite"
print(f"Looking for database at: {db_path}")
print(f"Database exists: {os.path.exists(db_path)}")
# Output path
output_dir = Path("/tmp/databuild_test/examples/podcast_reviews/podcasts")
output_dir.mkdir(parents=True, exist_ok=True)
output_file = output_dir / "podcasts.parquet"
print(f"Output directory: {output_dir}")
print(f"Output file: {output_file}")
try:
# Extract all podcasts with their categories
print("Calling extract_podcasts_with_categories...")
result = extract_podcasts_with_categories(db_path, str(output_file))
print(f"extract_podcasts_with_categories returned: {result}")
print(f"Successfully extracted podcast metadata")
print(f"Output written to: {output_file}")
print(f"Output file exists: {output_file.exists()}")
# Create manifest
manifest = {
"outputs": [{"str": partition_ref}],
"inputs": [],
"start_time": datetime.now().isoformat(),
"end_time": datetime.now().isoformat(),
"task": {
"job": {"label": "//examples/podcast_reviews:extract_podcasts_job"},
"config": {
"outputs": [{"str": partition_ref}],
"inputs": [],
"args": [],
"env": {"PARTITION_REF": partition_ref}
}
}
}
manifest_file = output_dir / "manifest.json"
with open(manifest_file, 'w') as f:
json.dump(manifest, f, indent=2)
print(f"Manifest written to: {manifest_file}")
except Exception as e:
print(f"Error extracting podcasts: {e}", file=sys.stderr)
import traceback
traceback.print_exc()
sys.exit(1)
def extract_podcasts_with_categories(db_path: str, output_file: str):
"""Extract all podcasts with their categories and save as parquet."""
print(f"extract_podcasts_with_categories called with db_path={db_path}, output_file={output_file}")
# Connect to DuckDB with extension handling
print("Creating DuckDB connection...")
duckdb_conn = create_duckdb_connection()
print("DuckDB connection created")
try:
# Use a simpler approach that works with SQLite fallback
try:
# Try complex DuckDB query first
query = """
WITH podcast_categories AS (
SELECT
p.podcast_id,
p.itunes_id,
p.slug,
p.itunes_url,
p.title,
c.category,
ROW_NUMBER() OVER (PARTITION BY p.podcast_id ORDER BY c.category) as category_rank
FROM sqlite_scan(?, 'podcasts') p
LEFT JOIN sqlite_scan(?, 'categories') c ON p.podcast_id = c.podcast_id
),
primary_categories AS (
SELECT
podcast_id,
itunes_id,
slug,
itunes_url,
title,
category as primary_category
FROM podcast_categories
WHERE category_rank = 1
),
all_categories AS (
SELECT
podcast_id,
STRING_AGG(category, '|' ORDER BY category) as all_categories
FROM podcast_categories
WHERE category IS NOT NULL
GROUP BY podcast_id
)
SELECT
pc.podcast_id,
pc.itunes_id,
pc.slug,
pc.itunes_url,
pc.title,
pc.primary_category,
COALESCE(ac.all_categories, pc.primary_category) as all_categories
FROM primary_categories pc
LEFT JOIN all_categories ac ON pc.podcast_id = ac.podcast_id
ORDER BY pc.title
"""
df = duckdb_conn.execute(query, [db_path, db_path]).df()
except Exception as e:
print(f"DuckDB complex query failed: {e}, using pandas fallback")
# Fallback: Use pandas to process the data
import pandas as pd
import sqlite3
sqlite_conn = sqlite3.connect(db_path)
try:
# Read podcasts and categories separately
podcasts_df = pd.read_sql_query("SELECT * FROM podcasts", sqlite_conn)
categories_df = pd.read_sql_query("SELECT * FROM categories", sqlite_conn)
# Group categories by podcast_id
categories_grouped = categories_df.groupby('podcast_id')['category'].apply(
lambda x: '|'.join(sorted(x))
).reset_index()
categories_grouped.columns = ['podcast_id', 'all_categories']
# Get primary category (first alphabetically)
primary_categories = categories_df.sort_values('category').groupby('podcast_id').first().reset_index()
primary_categories = primary_categories[['podcast_id', 'category']].rename(columns={'category': 'primary_category'})
# Join everything together
df = podcasts_df.merge(primary_categories, on='podcast_id', how='left')
df = df.merge(categories_grouped, on='podcast_id', how='left')
# Fill missing values
df['primary_category'] = df['primary_category'].fillna('unknown')
df['all_categories'] = df['all_categories'].fillna(df['primary_category'])
# Sort by title
df = df.sort_values('title')
finally:
sqlite_conn.close()
# Save to parquet with fallback
save_dataframe_with_fallback(df, output_file, duckdb_conn, "parquet")
row_count = len(df)
print(f"Extracted {row_count} podcasts with category information")
finally:
duckdb_conn.close()
if __name__ == "__main__":
main()
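To make the pandas fallback in `extract_podcasts_with_categories` easier to follow, here is the same join logic run on two tiny in-memory frames (made-up rows, not data from the example database):

```python
import pandas as pd

podcasts_df = pd.DataFrame({
    "podcast_id": ["p1", "p2"],
    "title": ["Alpha Show", "Beta Cast"],
})
categories_df = pd.DataFrame({
    "podcast_id": ["p1", "p1", "p2"],
    "category": ["news", "comedy", "tech"],
})

# All categories per podcast, pipe-delimited and alphabetically sorted.
categories_grouped = categories_df.groupby("podcast_id")["category"].apply(
    lambda x: "|".join(sorted(x))
).reset_index()
categories_grouped.columns = ["podcast_id", "all_categories"]

# Primary category = alphabetically first category for each podcast.
primary_categories = (
    categories_df.sort_values("category")
    .groupby("podcast_id")
    .first()
    .reset_index()[["podcast_id", "category"]]
    .rename(columns={"category": "primary_category"})
)

df = podcasts_df.merge(primary_categories, on="podcast_id", how="left")
df = df.merge(categories_grouped, on="podcast_id", how="left")
print(df)
# p1 -> primary_category="comedy", all_categories="comedy|news"
# p2 -> primary_category="tech",   all_categories="tech"
```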

View file

@ -0,0 +1,170 @@
#!/usr/bin/env python3
import sys
import json
import os
from datetime import datetime, date
from pathlib import Path
from typing import List, Dict, Any
import re
from duckdb_utils import create_duckdb_connection, execute_query_with_fallback, save_dataframe_with_fallback
def main():
# Write debug info to understand what's being called
debug_file = "/tmp/databuild_test/reviews_main_debug.log"
try:
with open(debug_file, "w") as f:
f.write(f"main() called with sys.argv: {sys.argv}\n")
f.flush()
    except Exception:
pass
if len(sys.argv) < 2:
print("Usage: extract_reviews_job.py {config|exec} [args...]", file=sys.stderr)
sys.exit(1)
command = sys.argv[1]
try:
with open(debug_file, "a") as f:
f.write(f"command: {command}\n")
f.flush()
    except Exception:
pass
if command == "config":
handle_config(sys.argv[2:])
elif command == "exec":
handle_exec(sys.argv[2:])
else:
print(f"Unknown command: {command}", file=sys.stderr)
sys.exit(1)
def parse_partition_ref(partition_ref: str) -> Dict[str, str]:
"""Parse partition ref like 'reviews/date=2020-01-01' into components."""
match = re.match(r'reviews/date=(\d{4}-\d{2}-\d{2})', partition_ref)
if not match:
raise ValueError(f"Invalid partition ref format: {partition_ref}")
return {"date": match.group(1)}
def handle_config(args):
if len(args) < 1:
print("Config mode requires partition ref", file=sys.stderr)
sys.exit(1)
configs = []
# Process each partition reference
for partition_ref in args:
try:
parsed = parse_partition_ref(partition_ref)
date_str = parsed["date"]
except ValueError as e:
print(f"Error parsing partition ref: {e}", file=sys.stderr)
sys.exit(1)
configs.append({
"outputs": [{"str": partition_ref}],
"inputs": [],
"args": [date_str],
"env": {
"PARTITION_REF": partition_ref,
"TARGET_DATE": date_str
}
})
config = {"configs": configs}
print(json.dumps(config))
def handle_exec(args):
if len(args) < 1:
print("Exec mode requires date argument", file=sys.stderr)
sys.exit(1)
target_date = args[0]
partition_ref = os.getenv('PARTITION_REF', f'reviews/date={target_date}')
# Database paths
db_path = "/tmp/databuild_test/examples/podcast_reviews/data/ingest/database.sqlite"
if not os.path.exists(db_path):
# Fallback to relative path for development
db_path = "data/ingest/database.sqlite"
# Output path
output_dir = Path(f"/tmp/databuild_test/examples/podcast_reviews/reviews/date={target_date}")
output_dir.mkdir(parents=True, exist_ok=True)
output_file = output_dir / "reviews.parquet"
try:
# Extract reviews for the target date
extract_reviews_for_date(db_path, target_date, str(output_file))
print(f"Successfully extracted reviews for {target_date}")
print(f"Output written to: {output_file}")
# Create manifest
manifest = {
"outputs": [{"str": partition_ref}],
"inputs": [],
"start_time": datetime.now().isoformat(),
"end_time": datetime.now().isoformat(),
"task": {
"job": {"label": "//examples/podcast_reviews:extract_reviews_job"},
"config": {
"outputs": [{"str": partition_ref}],
"inputs": [],
"args": [target_date],
"env": {"PARTITION_REF": partition_ref, "TARGET_DATE": target_date}
}
}
}
manifest_file = output_dir / "manifest.json"
with open(manifest_file, 'w') as f:
json.dump(manifest, f, indent=2)
except Exception as e:
print(f"Error extracting reviews: {e}", file=sys.stderr)
sys.exit(1)
def extract_reviews_for_date(db_path: str, target_date: str, output_file: str):
"""Extract reviews for a specific date and save as parquet."""
# Connect to DuckDB with extension handling
duckdb_conn = create_duckdb_connection()
try:
# Query reviews for the target date
query = """
SELECT
podcast_id,
title as review_title,
content,
rating,
author_id,
created_at,
DATE(created_at) as review_date
FROM sqlite_scan(?, 'reviews')
WHERE DATE(created_at) = ?
ORDER BY created_at
"""
# Execute query with fallback handling
df = execute_query_with_fallback(
duckdb_conn,
db_path,
query,
[db_path, target_date]
)
# Save to parquet with fallback
save_dataframe_with_fallback(df, output_file, duckdb_conn, "parquet")
row_count = len(df)
print(f"Extracted {row_count} reviews for date {target_date}")
finally:
duckdb_conn.close()
if __name__ == "__main__":
main()
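A hedged usage sketch, mirroring what the tests later in this change do: run the config subcommand as a subprocess and inspect the emitted JSON (assumes the job's dependencies such as duckdb and pandas are installed and the script is invoked from its own directory):

```python
import json
import subprocess
import sys

result = subprocess.run(
    [sys.executable, "extract_reviews_job.py", "config", "reviews/date=2020-01-01"],
    capture_output=True, text=True, check=True,
)
config = json.loads(result.stdout)
# One config per requested partition ref, with args carrying the parsed date.
assert config["configs"][0]["outputs"] == [{"str": "reviews/date=2020-01-01"}]
assert config["configs"][0]["args"] == ["2020-01-01"]
print(json.dumps(config, indent=2))
```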

View file

@ -0,0 +1,60 @@
#!/usr/bin/env python3
import sys
import json
import re
from collections import defaultdict
def main():
if len(sys.argv) < 2:
print("Usage: job_lookup.py partition_ref [partition_ref...]", file=sys.stderr)
sys.exit(1)
partition_refs = sys.argv[1:]
try:
result = defaultdict(list)
for partition_ref in partition_refs:
job_label = lookup_job_for_partition(partition_ref)
result[job_label].append(partition_ref)
# Output in the format expected by DataBuild
print(json.dumps({k: v for k, v in result.items() if v}))
except Exception as e:
print(f"Error in job lookup: {e}", file=sys.stderr)
sys.exit(1)
def lookup_job_for_partition(partition_ref: str) -> str:
"""Determine which job produces a given partition reference."""
# Extract reviews by date: reviews/date=YYYY-MM-DD
if re.match(r'reviews/date=\d{4}-\d{2}-\d{2}', partition_ref):
return "//:extract_reviews_job"
# Extract all podcasts: podcasts/all
if partition_ref == "podcasts/all":
return "//:extract_podcasts_job"
# Categorized reviews: categorized_reviews/category=CATEGORY/date=YYYY-MM-DD
if re.match(r'categorized_reviews/category=[^/]+/date=\d{4}-\d{2}-\d{2}', partition_ref):
return "//:categorize_reviews_job"
# Phrase models: phrase_models/category=CATEGORY/date=YYYY-MM-DD
if re.match(r'phrase_models/category=[^/]+/date=\d{4}-\d{2}-\d{2}', partition_ref):
return "//:phrase_modeling_job"
# Phrase statistics: phrase_stats/category=CATEGORY/date=YYYY-MM-DD
if re.match(r'phrase_stats/category=[^/]+/date=\d{4}-\d{2}-\d{2}', partition_ref):
return "//:phrase_stats_job"
# Daily summaries: daily_summaries/category=CATEGORY/date=YYYY-MM-DD
if re.match(r'daily_summaries/category=[^/]+/date=\d{4}-\d{2}-\d{2}', partition_ref):
return "//:daily_summary_job"
# If no match found, raise an error
raise ValueError(f"No job found for partition reference: {partition_ref}")
if __name__ == "__main__":
main()
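A small sketch of the grouped output this lookup produces when given several partition refs at once; it reuses `lookup_job_for_partition` directly rather than going through the CLI:

```python
import json
from collections import defaultdict

from job_lookup import lookup_job_for_partition

refs = [
    "reviews/date=2020-01-01",
    "reviews/date=2020-01-02",
    "podcasts/all",
]
grouped = defaultdict(list)
for ref in refs:
    grouped[lookup_job_for_partition(ref)].append(ref)

print(json.dumps(dict(grouped)))
# {"//:extract_reviews_job": ["reviews/date=2020-01-01", "reviews/date=2020-01-02"],
#  "//:extract_podcasts_job": ["podcasts/all"]}
```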

View file

@ -0,0 +1,353 @@
#!/usr/bin/env python3
import sys
import json
import os
import duckdb
import subprocess
import tempfile
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Any
import re
def main():
if len(sys.argv) < 2:
print("Usage: phrase_modeling_job.py {config|exec} [args...]", file=sys.stderr)
sys.exit(1)
command = sys.argv[1]
if command == "config":
handle_config(sys.argv[2:])
elif command == "exec":
handle_exec(sys.argv[2:])
else:
print(f"Unknown command: {command}", file=sys.stderr)
sys.exit(1)
def parse_partition_ref(partition_ref: str) -> Dict[str, str]:
"""Parse partition ref like 'phrase_models/category=comedy/date=2020-01-01' into components."""
match = re.match(r'phrase_models/category=([^/]+)/date=(\d{4}-\d{2}-\d{2})', partition_ref)
if not match:
raise ValueError(f"Invalid partition ref format: {partition_ref}")
return {"category": match.group(1), "date": match.group(2)}
def handle_config(args):
if len(args) < 1:
print("Config mode requires partition ref", file=sys.stderr)
sys.exit(1)
configs = []
# Process each partition reference
for partition_ref in args:
try:
parsed = parse_partition_ref(partition_ref)
category = parsed["category"]
date_str = parsed["date"]
except ValueError as e:
print(f"Error parsing partition ref: {e}", file=sys.stderr)
sys.exit(1)
# Dependencies: categorized reviews for the category and date
categorized_reviews_ref = f"categorized_reviews/category={category}/date={date_str}"
configs.append({
"outputs": [{"str": partition_ref}],
"inputs": [
{"dep_type": 1, "partition_ref": {"str": categorized_reviews_ref}}
],
"args": [category, date_str],
"env": {
"PARTITION_REF": partition_ref,
"TARGET_CATEGORY": category,
"TARGET_DATE": date_str
}
})
config = {"configs": configs}
print(json.dumps(config))
def handle_exec(args):
if len(args) < 2:
print("Exec mode requires category and date arguments", file=sys.stderr)
sys.exit(1)
target_category = args[0]
target_date = args[1]
partition_ref = os.getenv('PARTITION_REF', f'phrase_models/category={target_category}/date={target_date}')
# Input path
categorized_reviews_file = f"/tmp/databuild_test/examples/podcast_reviews/categorized_reviews/category={target_category}/date={target_date}/categorized_reviews.parquet"
# Check input file exists
if not os.path.exists(categorized_reviews_file):
print(f"Categorized reviews file not found: {categorized_reviews_file}", file=sys.stderr)
sys.exit(1)
# Output path
output_dir = Path(f"/tmp/databuild_test/examples/podcast_reviews/phrase_models/category={target_category}/date={target_date}")
output_dir.mkdir(parents=True, exist_ok=True)
output_file = output_dir / "phrase_models.parquet"
try:
# Extract phrases using phrase modeling
extract_phrases_for_category_date(categorized_reviews_file, target_category, target_date, str(output_file))
print(f"Successfully extracted phrases for category {target_category} on {target_date}")
print(f"Output written to: {output_file}")
# Create manifest
manifest = {
"outputs": [{"str": partition_ref}],
"inputs": [
{"str": f"categorized_reviews/category={target_category}/date={target_date}"}
],
"start_time": datetime.now().isoformat(),
"end_time": datetime.now().isoformat(),
"task": {
"job": {"label": "//examples/podcast_reviews:phrase_modeling_job"},
"config": {
"outputs": [{"str": partition_ref}],
"inputs": [
{"dep_type": 1, "partition_ref": {"str": f"categorized_reviews/category={target_category}/date={target_date}"}}
],
"args": [target_category, target_date],
"env": {"PARTITION_REF": partition_ref, "TARGET_CATEGORY": target_category, "TARGET_DATE": target_date}
}
}
}
manifest_file = output_dir / "manifest.json"
with open(manifest_file, 'w') as f:
json.dump(manifest, f, indent=2)
except Exception as e:
print(f"Error extracting phrases: {e}", file=sys.stderr)
sys.exit(1)
def extract_phrases_for_category_date(categorized_reviews_file: str, target_category: str, target_date: str, output_file: str):
"""Extract phrases from categorized reviews using phrase binary or simple ngram extraction."""
# Connect to DuckDB for processing
duckdb_conn = duckdb.connect()
try:
# Try to install and load parquet extension, but don't fail if it's already installed
try:
duckdb_conn.execute("INSTALL parquet")
except Exception:
pass # Extension might already be installed
duckdb_conn.execute("LOAD parquet")
# Check if phrase binary is available
phrase_binary = find_phrase_binary()
if phrase_binary:
# Use external phrase binary
phrases = extract_with_phrase_binary(categorized_reviews_file, phrase_binary)
else:
print("Warning: phrase binary not found, using simple ngram extraction")
# Fallback to simple ngram extraction
phrases = extract_simple_ngrams(categorized_reviews_file, duckdb_conn)
# Convert phrases to structured data and save
if phrases:
save_phrases_to_parquet(phrases, target_category, target_date, output_file, duckdb_conn)
else:
# Create empty parquet file with correct schema
create_empty_phrase_parquet(target_category, target_date, output_file, duckdb_conn)
finally:
duckdb_conn.close()
def find_phrase_binary() -> str:
"""Find phrase binary in common locations."""
possible_paths = [
"/usr/local/bin/phrase",
"/usr/bin/phrase",
"./phrase",
"../phrase",
os.path.expanduser("~/bin/phrase")
]
for path in possible_paths:
if os.path.exists(path) and os.access(path, os.X_OK):
return path
return None
def extract_with_phrase_binary(categorized_reviews_file: str, phrase_binary: str) -> List[Dict[str, Any]]:
"""Extract phrases using the external phrase binary."""
# Read review content to temporary file
duckdb_conn = duckdb.connect()
try:
# Try to install and load parquet extension, but don't fail if it's already installed
try:
duckdb_conn.execute("INSTALL parquet")
except Exception:
pass # Extension might already be installed
duckdb_conn.execute("LOAD parquet")
# Extract review content
content_query = f"SELECT content FROM parquet_scan('{categorized_reviews_file}') WHERE content IS NOT NULL AND content != ''"
results = duckdb_conn.execute(content_query).fetchall()
if not results:
return []
# Write content to temporary file
with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as temp_file:
for (content,) in results:
temp_file.write(content.strip() + '\n')
temp_file_path = temp_file.name
try:
# Run phrase binary
cmd = [phrase_binary, "--input", temp_file_path, "--output-format", "json"]
result = subprocess.run(cmd, capture_output=True, text=True, timeout=300)
if result.returncode != 0:
print(f"Phrase binary failed: {result.stderr}", file=sys.stderr)
return []
# Parse JSON output
phrases_data = json.loads(result.stdout) if result.stdout.strip() else []
return phrases_data
finally:
# Clean up temp file
os.unlink(temp_file_path)
finally:
duckdb_conn.close()
def extract_simple_ngrams(categorized_reviews_file: str, duckdb_conn) -> List[Dict[str, Any]]:
"""Simple ngram extraction using SQL as fallback."""
# Simple phrase extraction using SQL
query = f"""
WITH word_tokens AS (
SELECT
unnest(string_split(lower(regexp_replace(content, '[^a-zA-Z0-9\\s]', '', 'g')), ' ')) as word,
podcast_id,
rating
FROM parquet_scan('{categorized_reviews_file}')
WHERE content IS NOT NULL AND content != ''
),
bigrams AS (
SELECT
word || ' ' || lead(word) OVER (PARTITION BY podcast_id ORDER BY rowid) as ngram,
rating
FROM (SELECT *, row_number() OVER () as rowid FROM word_tokens) t
WHERE word IS NOT NULL AND word != ''
),
phrase_stats AS (
SELECT
ngram,
COUNT(*) as frequency,
AVG(rating::FLOAT) as avg_rating,
MIN(rating) as min_rating,
MAX(rating) as max_rating
FROM bigrams
            WHERE ngram IS NOT NULL AND ngram LIKE '% %'  -- keep only true two-word phrases
GROUP BY ngram
HAVING COUNT(*) >= 3 -- Only phrases that appear at least 3 times
)
SELECT
ngram,
frequency,
avg_rating,
min_rating,
max_rating,
CASE
WHEN avg_rating >= 4.0 THEN frequency * avg_rating * 0.8
WHEN avg_rating <= 2.0 THEN frequency * (5.0 - avg_rating) * 0.8
ELSE frequency * 0.3
END as score
FROM phrase_stats
ORDER BY score DESC
LIMIT 1000
"""
try:
results = duckdb_conn.execute(query).fetchall()
phrases = []
for row in results:
ngram, frequency, avg_rating, min_rating, max_rating, score = row
phrases.append({
"ngram": ngram,
"frequency": frequency,
"avg_rating": float(avg_rating) if avg_rating else 0.0,
"min_rating": min_rating,
"max_rating": max_rating,
"score": float(score) if score else 0.0,
"hash": hash(ngram) % (2**31) # Simple hash for compatibility
})
return phrases
except Exception as e:
print(f"Error in simple ngram extraction: {e}", file=sys.stderr)
return []
def save_phrases_to_parquet(phrases: List[Dict[str, Any]], category: str, date: str, output_file: str, duckdb_conn):
"""Save phrases to parquet file."""
if not phrases:
create_empty_phrase_parquet(category, date, output_file, duckdb_conn)
return
# Create temporary table with phrases
duckdb_conn.execute("DROP TABLE IF EXISTS temp_phrases")
duckdb_conn.execute("""
CREATE TABLE temp_phrases (
date VARCHAR,
category VARCHAR,
hash BIGINT,
ngram VARCHAR,
score DOUBLE
)
""")
# Insert phrase data
for phrase in phrases:
duckdb_conn.execute("""
INSERT INTO temp_phrases VALUES (?, ?, ?, ?, ?)
""", [
date,
category,
phrase.get("hash", hash(phrase.get("ngram", "")) % (2**31)),
phrase.get("ngram", ""),
phrase.get("score", 0.0)
])
# Save to parquet
duckdb_conn.execute(f"COPY temp_phrases TO '{output_file}' (FORMAT PARQUET)")
print(f"Saved {len(phrases)} phrases to parquet file")
def create_empty_phrase_parquet(category: str, date: str, output_file: str, duckdb_conn):
"""Create empty parquet file with correct schema."""
duckdb_conn.execute("DROP TABLE IF EXISTS empty_phrases")
duckdb_conn.execute("""
CREATE TABLE empty_phrases (
date VARCHAR,
category VARCHAR,
hash BIGINT,
ngram VARCHAR,
score DOUBLE
)
""")
duckdb_conn.execute(f"COPY empty_phrases TO '{output_file}' (FORMAT PARQUET)")
print("Created empty phrase models file (no phrases extracted)")
if __name__ == "__main__":
main()
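The scoring branch in `extract_simple_ngrams` reduces to a small formula; restated here in plain Python for clarity (a paraphrase of the SQL CASE expression, not separate code in this change):

```python
def phrase_score(frequency: int, avg_rating: float) -> float:
    """Mirror of the CASE expression in extract_simple_ngrams:
    strongly positive or strongly negative phrases are boosted,
    middling ones are damped."""
    if avg_rating >= 4.0:
        return frequency * avg_rating * 0.8
    if avg_rating <= 2.0:
        return frequency * (5.0 - avg_rating) * 0.8
    return frequency * 0.3

# A frequent 4.5-star phrase outranks an equally frequent 3-star one.
print(phrase_score(10, 4.5))  # 36.0
print(phrase_score(10, 1.5))  # 28.0
print(phrase_score(10, 3.0))  # 3.0
```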

View file

@ -0,0 +1,263 @@
#!/usr/bin/env python3
import sys
import json
import os
import duckdb
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Any
import re
def main():
if len(sys.argv) < 2:
print("Usage: phrase_stats_job.py {config|exec} [args...]", file=sys.stderr)
sys.exit(1)
command = sys.argv[1]
if command == "config":
handle_config(sys.argv[2:])
elif command == "exec":
handle_exec(sys.argv[2:])
else:
print(f"Unknown command: {command}", file=sys.stderr)
sys.exit(1)
def parse_partition_ref(partition_ref: str) -> Dict[str, str]:
"""Parse partition ref like 'phrase_stats/category=comedy/date=2020-01-01' into components."""
match = re.match(r'phrase_stats/category=([^/]+)/date=(\d{4}-\d{2}-\d{2})', partition_ref)
if not match:
raise ValueError(f"Invalid partition ref format: {partition_ref}")
return {"category": match.group(1), "date": match.group(2)}
def handle_config(args):
if len(args) < 1:
print("Config mode requires partition ref", file=sys.stderr)
sys.exit(1)
configs = []
# Process each partition reference
for partition_ref in args:
try:
parsed = parse_partition_ref(partition_ref)
category = parsed["category"]
date_str = parsed["date"]
except ValueError as e:
print(f"Error parsing partition ref: {e}", file=sys.stderr)
sys.exit(1)
# Dependencies: phrase models and categorized reviews for the category and date
phrase_models_ref = f"phrase_models/category={category}/date={date_str}"
categorized_reviews_ref = f"categorized_reviews/category={category}/date={date_str}"
configs.append({
"outputs": [{"str": partition_ref}],
"inputs": [
{"dep_type": 1, "partition_ref": {"str": phrase_models_ref}},
{"dep_type": 1, "partition_ref": {"str": categorized_reviews_ref}}
],
"args": [category, date_str],
"env": {
"PARTITION_REF": partition_ref,
"TARGET_CATEGORY": category,
"TARGET_DATE": date_str
}
})
config = {"configs": configs}
print(json.dumps(config))
def handle_exec(args):
if len(args) < 2:
print("Exec mode requires category and date arguments", file=sys.stderr)
sys.exit(1)
target_category = args[0]
target_date = args[1]
partition_ref = os.getenv('PARTITION_REF', f'phrase_stats/category={target_category}/date={target_date}')
# Input paths
phrase_models_file = f"/tmp/databuild_test/examples/podcast_reviews/phrase_models/category={target_category}/date={target_date}/phrase_models.parquet"
categorized_reviews_file = f"/tmp/databuild_test/examples/podcast_reviews/categorized_reviews/category={target_category}/date={target_date}/categorized_reviews.parquet"
# Check input files exist
if not os.path.exists(phrase_models_file):
print(f"Phrase models file not found: {phrase_models_file}", file=sys.stderr)
sys.exit(1)
if not os.path.exists(categorized_reviews_file):
print(f"Categorized reviews file not found: {categorized_reviews_file}", file=sys.stderr)
sys.exit(1)
# Output path
output_dir = Path(f"/tmp/databuild_test/examples/podcast_reviews/phrase_stats/category={target_category}/date={target_date}")
output_dir.mkdir(parents=True, exist_ok=True)
output_file = output_dir / "phrase_stats.parquet"
try:
# Calculate phrase statistics per podcast
calculate_phrase_stats_for_category_date(
phrase_models_file,
categorized_reviews_file,
target_category,
target_date,
str(output_file)
)
print(f"Successfully calculated phrase stats for category {target_category} on {target_date}")
print(f"Output written to: {output_file}")
# Create manifest
manifest = {
"outputs": [{"str": partition_ref}],
"inputs": [
{"str": f"phrase_models/category={target_category}/date={target_date}"},
{"str": f"categorized_reviews/category={target_category}/date={target_date}"}
],
"start_time": datetime.now().isoformat(),
"end_time": datetime.now().isoformat(),
"task": {
"job": {"label": "//examples/podcast_reviews:phrase_stats_job"},
"config": {
"outputs": [{"str": partition_ref}],
"inputs": [
{"dep_type": 1, "partition_ref": {"str": f"phrase_models/category={target_category}/date={target_date}"}},
{"dep_type": 1, "partition_ref": {"str": f"categorized_reviews/category={target_category}/date={target_date}"}}
],
"args": [target_category, target_date],
"env": {"PARTITION_REF": partition_ref, "TARGET_CATEGORY": target_category, "TARGET_DATE": target_date}
}
}
}
manifest_file = output_dir / "manifest.json"
with open(manifest_file, 'w') as f:
json.dump(manifest, f, indent=2)
except Exception as e:
print(f"Error calculating phrase stats: {e}", file=sys.stderr)
sys.exit(1)
def calculate_phrase_stats_for_category_date(
phrase_models_file: str,
categorized_reviews_file: str,
target_category: str,
target_date: str,
output_file: str
):
"""Calculate phrase statistics per podcast by joining phrase models with reviews."""
# Connect to DuckDB for processing
duckdb_conn = duckdb.connect()
try:
# Try to install and load parquet extension, but don't fail if it's already installed
try:
duckdb_conn.execute("INSTALL parquet")
except Exception:
pass # Extension might already be installed
duckdb_conn.execute("LOAD parquet")
# Check if we have phrase models
phrase_count = duckdb_conn.execute(f"SELECT COUNT(*) FROM parquet_scan('{phrase_models_file}')").fetchone()[0]
if phrase_count == 0:
print(f"No phrase models found, creating empty phrase stats")
create_empty_phrase_stats(target_category, target_date, output_file, duckdb_conn)
return
# Query to calculate phrase statistics per podcast
query = f"""
WITH phrase_matches AS (
SELECT
r.podcast_id,
r.podcast_title,
r.rating,
r.content,
p.ngram,
p.score as phrase_score
FROM parquet_scan('{categorized_reviews_file}') r
JOIN parquet_scan('{phrase_models_file}') p
ON lower(r.content) LIKE '%' || lower(p.ngram) || '%'
WHERE r.content IS NOT NULL
AND r.content != ''
AND p.ngram IS NOT NULL
AND p.ngram != ''
),
podcast_phrase_stats AS (
SELECT
'{target_date}' as date,
'{target_category}' as category,
podcast_id,
podcast_title,
ngram,
COUNT(*) as count,
AVG(rating::FLOAT) as avg_rating,
MIN(rating) as min_rating,
MAX(rating) as max_rating,
AVG(phrase_score) as avg_phrase_score,
COUNT(*) * AVG(phrase_score) * AVG(rating::FLOAT) / 5.0 as weighted_score
FROM phrase_matches
GROUP BY podcast_id, podcast_title, ngram
HAVING COUNT(*) >= 2 -- Only include phrases that appear at least twice per podcast
)
SELECT
date,
category,
podcast_id,
podcast_title,
ngram,
count,
avg_rating,
min_rating,
max_rating,
avg_phrase_score,
weighted_score
FROM podcast_phrase_stats
ORDER BY weighted_score DESC, count DESC
"""
# Execute query and save to parquet
duckdb_conn.execute(f"COPY ({query}) TO '{output_file}' (FORMAT PARQUET)")
# Get row count for logging
count_result = duckdb_conn.execute(f"SELECT COUNT(*) FROM ({query})").fetchone()
row_count = count_result[0] if count_result else 0
print(f"Calculated phrase statistics for {row_count} podcast-phrase combinations")
if row_count == 0:
print(f"Warning: No phrase matches found for category '{target_category}' on date '{target_date}'")
create_empty_phrase_stats(target_category, target_date, output_file, duckdb_conn)
finally:
duckdb_conn.close()
def create_empty_phrase_stats(category: str, date: str, output_file: str, duckdb_conn):
"""Create empty phrase stats parquet file with correct schema."""
duckdb_conn.execute("DROP TABLE IF EXISTS empty_phrase_stats")
duckdb_conn.execute("""
CREATE TABLE empty_phrase_stats (
date VARCHAR,
category VARCHAR,
podcast_id VARCHAR,
podcast_title VARCHAR,
ngram VARCHAR,
count BIGINT,
avg_rating DOUBLE,
min_rating INTEGER,
max_rating INTEGER,
avg_phrase_score DOUBLE,
weighted_score DOUBLE
)
""")
duckdb_conn.execute(f"COPY empty_phrase_stats TO '{output_file}' (FORMAT PARQUET)")
print("Created empty phrase stats file")
if __name__ == "__main__":
main()
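Likewise, the `weighted_score` used to rank podcast-phrase pairs above is a simple product; a plain-Python restatement of the SQL expression:

```python
def weighted_score(count: int, avg_phrase_score: float, avg_rating: float) -> float:
    """count * avg_phrase_score * avg_rating / 5.0, as in podcast_phrase_stats."""
    return count * avg_phrase_score * avg_rating / 5.0

# A phrase matched 4 times with phrase score 36.0 and average rating 4.5:
print(weighted_score(4, 36.0, 4.5))  # 129.6
```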

View file

@ -1,2 +1,4 @@
duckdb==1.2.2
pydantic==2.11.3
pydantic==2.11.3
pandas>=2.0.0
pyarrow>=10.0.0

View file

@ -61,6 +61,160 @@ duckdb==1.2.2 \
--hash=sha256:fb9a2c77236fae079185a990434cb9d8432902488ba990235c702fc2692d2dcd \
--hash=sha256:fd9c434127fd1575694e1cf19a393bed301f5d6e80b4bcdae80caa368a61a678
# via -r requirements.in
numpy==2.3.1 \
--hash=sha256:0025048b3c1557a20bc80d06fdeb8cc7fc193721484cca82b2cfa072fec71a93 \
--hash=sha256:010ce9b4f00d5c036053ca684c77441f2f2c934fd23bee058b4d6f196efd8280 \
--hash=sha256:0bb3a4a61e1d327e035275d2a993c96fa786e4913aa089843e6a2d9dd205c66a \
--hash=sha256:0c4d9e0a8368db90f93bd192bfa771ace63137c3488d198ee21dfb8e7771916e \
--hash=sha256:15aa4c392ac396e2ad3d0a2680c0f0dee420f9fed14eef09bdb9450ee6dcb7b7 \
--hash=sha256:18703df6c4a4fee55fd3d6e5a253d01c5d33a295409b03fda0c86b3ca2ff41a1 \
--hash=sha256:1ec9ae20a4226da374362cca3c62cd753faf2f951440b0e3b98e93c235441d2b \
--hash=sha256:23ab05b2d241f76cb883ce8b9a93a680752fbfcbd51c50eff0b88b979e471d8c \
--hash=sha256:25a1992b0a3fdcdaec9f552ef10d8103186f5397ab45e2d25f8ac51b1a6b97e8 \
--hash=sha256:2959d8f268f3d8ee402b04a9ec4bb7604555aeacf78b360dc4ec27f1d508177d \
--hash=sha256:2a809637460e88a113e186e87f228d74ae2852a2e0c44de275263376f17b5bdc \
--hash=sha256:2fb86b7e58f9ac50e1e9dd1290154107e47d1eef23a0ae9145ded06ea606f992 \
--hash=sha256:36890eb9e9d2081137bd78d29050ba63b8dab95dff7912eadf1185e80074b2a0 \
--hash=sha256:39bff12c076812595c3a306f22bfe49919c5513aa1e0e70fac756a0be7c2a2b8 \
--hash=sha256:467db865b392168ceb1ef1ffa6f5a86e62468c43e0cfb4ab6da667ede10e58db \
--hash=sha256:4e602e1b8682c2b833af89ba641ad4176053aaa50f5cacda1a27004352dde943 \
--hash=sha256:5902660491bd7a48b2ec16c23ccb9124b8abfd9583c5fdfa123fe6b421e03de1 \
--hash=sha256:5ccb7336eaf0e77c1635b232c141846493a588ec9ea777a7c24d7166bb8533ae \
--hash=sha256:5f1b8f26d1086835f442286c1d9b64bb3974b0b1e41bb105358fd07d20872952 \
--hash=sha256:6269b9edfe32912584ec496d91b00b6d34282ca1d07eb10e82dfc780907d6c2e \
--hash=sha256:6ea9e48336a402551f52cd8f593343699003d2353daa4b72ce8d34f66b722070 \
--hash=sha256:762e0c0c6b56bdedfef9a8e1d4538556438288c4276901ea008ae44091954e29 \
--hash=sha256:7be91b2239af2658653c5bb6f1b8bccafaf08226a258caf78ce44710a0160d30 \
--hash=sha256:7dea630156d39b02a63c18f508f85010230409db5b2927ba59c8ba4ab3e8272e \
--hash=sha256:867ef172a0976aaa1f1d1b63cf2090de8b636a7674607d514505fb7276ab08fc \
--hash=sha256:8d5ee6eec45f08ce507a6570e06f2f879b374a552087a4179ea7838edbcbfa42 \
--hash=sha256:8e333040d069eba1652fb08962ec5b76af7f2c7bce1df7e1418c8055cf776f25 \
--hash=sha256:a5ee121b60aa509679b682819c602579e1df14a5b07fe95671c8849aad8f2115 \
--hash=sha256:a780033466159c2270531e2b8ac063704592a0bc62ec4a1b991c7c40705eb0e8 \
--hash=sha256:a894f3816eb17b29e4783e5873f92faf55b710c2519e5c351767c51f79d8526d \
--hash=sha256:a8b740f5579ae4585831b3cf0e3b0425c667274f82a484866d2adf9570539369 \
--hash=sha256:ad506d4b09e684394c42c966ec1527f6ebc25da7f4da4b1b056606ffe446b8a3 \
--hash=sha256:afed2ce4a84f6b0fc6c1ce734ff368cbf5a5e24e8954a338f3bdffa0718adffb \
--hash=sha256:b0b5397374f32ec0649dd98c652a1798192042e715df918c20672c62fb52d4b8 \
--hash=sha256:bada6058dd886061f10ea15f230ccf7dfff40572e99fef440a4a857c8728c9c0 \
--hash=sha256:c4913079974eeb5c16ccfd2b1f09354b8fed7e0d6f2cab933104a09a6419b1ee \
--hash=sha256:c5bdf2015ccfcee8253fb8be695516ac4457c743473a43290fd36eba6a1777eb \
--hash=sha256:c6e0bf9d1a2f50d2b65a7cf56db37c095af17b59f6c132396f7c6d5dd76484df \
--hash=sha256:ce2ce9e5de4703a673e705183f64fd5da5bf36e7beddcb63a25ee2286e71ca48 \
--hash=sha256:cfecc7822543abdea6de08758091da655ea2210b8ffa1faf116b940693d3df76 \
--hash=sha256:d4580adadc53311b163444f877e0789f1c8861e2698f6b2a4ca852fda154f3ff \
--hash=sha256:d70f20df7f08b90a2062c1f07737dd340adccf2068d0f1b9b3d56e2038979fee \
--hash=sha256:e344eb79dab01f1e838ebb67aab09965fb271d6da6b00adda26328ac27d4a66e \
--hash=sha256:e610832418a2bc09d974cc9fecebfa51e9532d6190223bc5ef6a7402ebf3b5cb \
--hash=sha256:e772dda20a6002ef7061713dc1e2585bc1b534e7909b2030b5a46dae8ff077ab \
--hash=sha256:e7cbf5a5eafd8d230a3ce356d892512185230e4781a361229bd902ff403bc660 \
--hash=sha256:eabd7e8740d494ce2b4ea0ff05afa1b7b291e978c0ae075487c51e8bd93c0c68 \
--hash=sha256:ebb8603d45bc86bbd5edb0d63e52c5fd9e7945d3a503b77e486bd88dde67a19b \
--hash=sha256:ec0bdafa906f95adc9a0c6f26a4871fa753f25caaa0e032578a30457bff0af6a \
--hash=sha256:eccb9a159db9aed60800187bc47a6d3451553f0e1b08b068d8b277ddfbb9b244 \
--hash=sha256:ee8340cb48c9b7a5899d1149eece41ca535513a9698098edbade2a8e7a84da77
# via pandas
pandas==2.3.0 \
--hash=sha256:034abd6f3db8b9880aaee98f4f5d4dbec7c4829938463ec046517220b2f8574e \
--hash=sha256:094e271a15b579650ebf4c5155c05dcd2a14fd4fdd72cf4854b2f7ad31ea30be \
--hash=sha256:14a0cc77b0f089d2d2ffe3007db58f170dae9b9f54e569b299db871a3ab5bf46 \
--hash=sha256:1a881bc1309f3fce34696d07b00f13335c41f5f5a8770a33b09ebe23261cfc67 \
--hash=sha256:1d2b33e68d0ce64e26a4acc2e72d747292084f4e8db4c847c6f5f6cbe56ed6d8 \
--hash=sha256:213cd63c43263dbb522c1f8a7c9d072e25900f6975596f883f4bebd77295d4f3 \
--hash=sha256:23c2b2dc5213810208ca0b80b8666670eb4660bbfd9d45f58592cc4ddcfd62e1 \
--hash=sha256:2c7e2fc25f89a49a11599ec1e76821322439d90820108309bf42130d2f36c983 \
--hash=sha256:2eb4728a18dcd2908c7fccf74a982e241b467d178724545a48d0caf534b38ebf \
--hash=sha256:34600ab34ebf1131a7613a260a61dbe8b62c188ec0ea4c296da7c9a06b004133 \
--hash=sha256:39ff73ec07be5e90330cc6ff5705c651ace83374189dcdcb46e6ff54b4a72cd6 \
--hash=sha256:404d681c698e3c8a40a61d0cd9412cc7364ab9a9cc6e144ae2992e11a2e77a20 \
--hash=sha256:40cecc4ea5abd2921682b57532baea5588cc5f80f0231c624056b146887274d2 \
--hash=sha256:430a63bae10b5086995db1b02694996336e5a8ac9a96b4200572b413dfdfccb9 \
--hash=sha256:4930255e28ff5545e2ca404637bcc56f031893142773b3468dc021c6c32a1390 \
--hash=sha256:6021910b086b3ca756755e86ddc64e0ddafd5e58e076c72cb1585162e5ad259b \
--hash=sha256:625466edd01d43b75b1883a64d859168e4556261a5035b32f9d743b67ef44634 \
--hash=sha256:75651c14fde635e680496148a8526b328e09fe0572d9ae9b638648c46a544ba3 \
--hash=sha256:84141f722d45d0c2a89544dd29d35b3abfc13d2250ed7e68394eda7564bd6324 \
--hash=sha256:8adff9f138fc614347ff33812046787f7d43b3cef7c0f0171b3340cae333f6ca \
--hash=sha256:951805d146922aed8357e4cc5671b8b0b9be1027f0619cea132a9f3f65f2f09c \
--hash=sha256:9efc0acbbffb5236fbdf0409c04edce96bec4bdaa649d49985427bd1ec73e085 \
--hash=sha256:9ff730713d4c4f2f1c860e36c005c7cefc1c7c80c21c0688fd605aa43c9fcf09 \
--hash=sha256:a6872d695c896f00df46b71648eea332279ef4077a409e2fe94220208b6bb675 \
--hash=sha256:b198687ca9c8529662213538a9bb1e60fa0bf0f6af89292eb68fea28743fcd5a \
--hash=sha256:b9d8c3187be7479ea5c3d30c32a5d73d62a621166675063b2edd21bc47614027 \
--hash=sha256:ba24af48643b12ffe49b27065d3babd52702d95ab70f50e1b34f71ca703e2c0d \
--hash=sha256:bb32dc743b52467d488e7a7c8039b821da2826a9ba4f85b89ea95274f863280f \
--hash=sha256:bb3be958022198531eb7ec2008cfc78c5b1eed51af8600c6c5d9160d89d8d249 \
--hash=sha256:bf5be867a0541a9fb47a4be0c5790a4bccd5b77b92f0a59eeec9375fafc2aa14 \
--hash=sha256:c06f6f144ad0a1bf84699aeea7eff6068ca5c63ceb404798198af7eb86082e33 \
--hash=sha256:c6da97aeb6a6d233fb6b17986234cc723b396b50a3c6804776351994f2a658fd \
--hash=sha256:e0f51973ba93a9f97185049326d75b942b9aeb472bec616a129806facb129ebb \
--hash=sha256:e1991bbb96f4050b09b5f811253c4f3cf05ee89a589379aa36cd623f21a31d6f \
--hash=sha256:e5f08eb9a445d07720776df6e641975665c9ea12c9d8a331e0f6890f2dcd76ef \
--hash=sha256:e78ad363ddb873a631e92a3c063ade1ecfb34cae71e9a2be6ad100f875ac1042 \
--hash=sha256:ed16339bc354a73e0a609df36d256672c7d296f3f767ac07257801aa064ff73c \
--hash=sha256:f4dd97c19bd06bc557ad787a15b6489d2614ddaab5d104a0310eb314c724b2d2 \
--hash=sha256:f925f1ef673b4bd0271b1809b72b3270384f2b7d9d14a189b12b7fc02574d575 \
--hash=sha256:f95a2aef32614ed86216d3c450ab12a4e82084e8102e355707a1d96e33d51c34 \
--hash=sha256:fa07e138b3f6c04addfeaf56cc7fdb96c3b68a3fe5e5401251f231fce40a0d7a \
--hash=sha256:fa35c266c8cd1a67d75971a1912b185b492d257092bdd2709bbdebe574ed228d
# via -r requirements.in
pyarrow==20.0.0 \
--hash=sha256:00138f79ee1b5aca81e2bdedb91e3739b987245e11fa3c826f9e57c5d102fb75 \
--hash=sha256:11529a2283cb1f6271d7c23e4a8f9f8b7fd173f7360776b668e509d712a02eec \
--hash=sha256:15aa1b3b2587e74328a730457068dc6c89e6dcbf438d4369f572af9d320a25ee \
--hash=sha256:1bcbe471ef3349be7714261dea28fe280db574f9d0f77eeccc195a2d161fd861 \
--hash=sha256:204a846dca751428991346976b914d6d2a82ae5b8316a6ed99789ebf976551e6 \
--hash=sha256:211d5e84cecc640c7a3ab900f930aaff5cd2702177e0d562d426fb7c4f737781 \
--hash=sha256:24ca380585444cb2a31324c546a9a56abbe87e26069189e14bdba19c86c049f0 \
--hash=sha256:2c3a01f313ffe27ac4126f4c2e5ea0f36a5fc6ab51f8726cf41fee4b256680bd \
--hash=sha256:30b3051b7975801c1e1d387e17c588d8ab05ced9b1e14eec57915f79869b5031 \
--hash=sha256:3346babb516f4b6fd790da99b98bed9708e3f02e734c84971faccb20736848dc \
--hash=sha256:3e1f8a47f4b4ae4c69c4d702cfbdfe4d41e18e5c7ef6f1bb1c50918c1e81c57b \
--hash=sha256:4250e28a22302ce8692d3a0e8ec9d9dde54ec00d237cff4dfa9c1fbf79e472a8 \
--hash=sha256:4680f01ecd86e0dd63e39eb5cd59ef9ff24a9d166db328679e36c108dc993d4c \
--hash=sha256:4a8b029a07956b8d7bd742ffca25374dd3f634b35e46cc7a7c3fa4c75b297191 \
--hash=sha256:4ba3cf4182828be7a896cbd232aa8dd6a31bd1f9e32776cc3796c012855e1199 \
--hash=sha256:5605919fbe67a7948c1f03b9f3727d82846c053cd2ce9303ace791855923fd20 \
--hash=sha256:5f0fb1041267e9968c6d0d2ce3ff92e3928b243e2b6d11eeb84d9ac547308232 \
--hash=sha256:6102b4864d77102dbbb72965618e204e550135a940c2534711d5ffa787df2a5a \
--hash=sha256:6415a0d0174487456ddc9beaead703d0ded5966129fa4fd3114d76b5d1c5ceae \
--hash=sha256:6bb830757103a6cb300a04610e08d9636f0cd223d32f388418ea893a3e655f1c \
--hash=sha256:6fc1499ed3b4b57ee4e090e1cea6eb3584793fe3d1b4297bbf53f09b434991a5 \
--hash=sha256:75a51a5b0eef32727a247707d4755322cb970be7e935172b6a3a9f9ae98404ba \
--hash=sha256:7a3a5dcf54286e6141d5114522cf31dd67a9e7c9133d150799f30ee302a7a1ab \
--hash=sha256:7f4c8534e2ff059765647aa69b75d6543f9fef59e2cd4c6d18015192565d2b70 \
--hash=sha256:82f1ee5133bd8f49d31be1299dc07f585136679666b502540db854968576faf9 \
--hash=sha256:851c6a8260ad387caf82d2bbf54759130534723e37083111d4ed481cb253cc0d \
--hash=sha256:89e030dc58fc760e4010148e6ff164d2f44441490280ef1e97a542375e41058e \
--hash=sha256:95b330059ddfdc591a3225f2d272123be26c8fa76e8c9ee1a77aad507361cfdb \
--hash=sha256:96d6a0a37d9c98be08f5ed6a10831d88d52cac7b13f5287f1e0f625a0de8062b \
--hash=sha256:96e37f0766ecb4514a899d9a3554fadda770fb57ddf42b63d80f14bc20aa7db3 \
--hash=sha256:97c8dc984ed09cb07d618d57d8d4b67a5100a30c3818c2fb0b04599f0da2de7b \
--hash=sha256:991f85b48a8a5e839b2128590ce07611fae48a904cae6cab1f089c5955b57eb5 \
--hash=sha256:9965a050048ab02409fb7cbbefeedba04d3d67f2cc899eff505cc084345959ca \
--hash=sha256:9b71daf534f4745818f96c214dbc1e6124d7daf059167330b610fc69b6f3d3e3 \
--hash=sha256:a15532e77b94c61efadde86d10957950392999503b3616b2ffcef7621a002893 \
--hash=sha256:a18a14baef7d7ae49247e75641fd8bcbb39f44ed49a9fc4ec2f65d5031aa3b96 \
--hash=sha256:a1f60dc14658efaa927f8214734f6a01a806d7690be4b3232ba526836d216122 \
--hash=sha256:a2791f69ad72addd33510fec7bb14ee06c2a448e06b649e264c094c5b5f7ce28 \
--hash=sha256:a5704f29a74b81673d266e5ec1fe376f060627c2e42c5c7651288ed4b0db29e9 \
--hash=sha256:a6ad3e7758ecf559900261a4df985662df54fb7fdb55e8e3b3aa99b23d526b62 \
--hash=sha256:aa0d288143a8585806e3cc7c39566407aab646fb9ece164609dac1cfff45f6ae \
--hash=sha256:b6953f0114f8d6f3d905d98e987d0924dabce59c3cda380bdfaa25a6201563b4 \
--hash=sha256:b8ff87cc837601532cc8242d2f7e09b4e02404de1b797aee747dd4ba4bd6313f \
--hash=sha256:c7dd06fd7d7b410ca5dc839cc9d485d2bc4ae5240851bcd45d85105cc90a47d7 \
--hash=sha256:ca151afa4f9b7bc45bcc791eb9a89e90a9eb2772767d0b1e5389609c7d03db63 \
--hash=sha256:cb497649e505dc36542d0e68eca1a3c94ecbe9799cb67b578b55f2441a247fbc \
--hash=sha256:d5382de8dc34c943249b01c19110783d0d64b207167c728461add1ecc2db88e4 \
--hash=sha256:db53390eaf8a4dab4dbd6d93c85c5cf002db24902dbff0ca7d988beb5c9dd15b \
--hash=sha256:dd43f58037443af715f34f1322c782ec463a3c8a94a85fdb2d987ceb5658e061 \
--hash=sha256:e22f80b97a271f0a7d9cd07394a7d348f80d3ac63ed7cc38b6d1b696ab3b2619 \
--hash=sha256:e724a3fd23ae5b9c010e7be857f4405ed5e679db5c93e66204db1a69f733936a \
--hash=sha256:e8b88758f9303fa5a83d6c90e176714b2fd3852e776fc2d7e42a22dd6c2fb368 \
--hash=sha256:f2d67ac28f57a362f1a2c1e6fa98bfe2f03230f7e15927aecd067433b1e70ce8 \
--hash=sha256:f3b117b922af5e4c6b9a9115825726cac7d8b1421c37c2b5e24fbacc8930612c \
--hash=sha256:febc4a913592573c8d5805091a6c2b5064c8bd6e002131f01061797d91c783c1
# via -r requirements.in
pydantic==2.11.3 \
--hash=sha256:7471657138c16adad9322fe3070c0116dd6c3ad8d649300e3cbdfe91f4db4ec3 \
--hash=sha256:a082753436a07f9ba1289c6ffa01cd93db3548776088aa917cc43b63f68fa60f
@ -166,6 +320,18 @@ pydantic-core==2.33.1 \
--hash=sha256:fc903512177361e868bc1f5b80ac8c8a6e05fcdd574a5fb5ffeac5a9982b9e89 \
--hash=sha256:fe44d56aa0b00d66640aa84a3cbe80b7a3ccdc6f0b1ca71090696a6d4777c091
# via pydantic
python-dateutil==2.9.0.post0 \
--hash=sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3 \
--hash=sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427
# via pandas
pytz==2025.2 \
--hash=sha256:360b9e3dbb49a209c21ad61809c7fb453643e048b38924c765813546746e81c3 \
--hash=sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00
# via pandas
six==1.17.0 \
--hash=sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274 \
--hash=sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81
# via python-dateutil
typing-extensions==4.13.2 \
--hash=sha256:a439e7c04b49fec3e5d3e2beaa21755cadbbdc391694e28ccdd36ca4a1408f8c \
--hash=sha256:e6c81219bd689f51865d9e372991c540bda33a0379d5573cddb9a3a23f7caaef
@ -177,3 +343,7 @@ typing-inspection==0.4.0 \
--hash=sha256:50e72559fcd2a6367a19f7a7e610e6afcb9fac940c650290eed893d61386832f \
--hash=sha256:9765c87de36671694a67904bf2c96e395be9c6439bb6c87b5142569dcdd65122
# via pydantic
tzdata==2025.2 \
--hash=sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8 \
--hash=sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9
# via pandas

View file

@ -0,0 +1,212 @@
#!/usr/bin/env python3
"""
Basic tests for podcast reviews jobs.
Tests the configuration and basic execution flow of each job.
"""
import os
import sys
import subprocess
import tempfile
import shutil
import json
from pathlib import Path
def run_job_config(job_script: str, partition_ref: str):
"""Run a job in config mode and return the parsed config."""
cmd = [sys.executable, job_script, "config", partition_ref]
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode != 0:
raise Exception(f"Config failed for {job_script}: {result.stderr}")
return json.loads(result.stdout)
def test_extract_reviews_job():
"""Test extract_reviews_job configuration."""
print("Testing extract_reviews_job...")
config = run_job_config("extract_reviews_job.py", "reviews/date=2020-01-01")
assert len(config["configs"]) == 1
job_config = config["configs"][0]
assert job_config["outputs"] == [{"str": "reviews/date=2020-01-01"}]
assert job_config["inputs"] == []
assert job_config["args"] == ["2020-01-01"]
assert job_config["env"]["TARGET_DATE"] == "2020-01-01"
print("✓ extract_reviews_job config test passed")
def test_extract_podcasts_job():
"""Test extract_podcasts_job configuration."""
print("Testing extract_podcasts_job...")
config = run_job_config("extract_podcasts_job.py", "podcasts/all")
assert len(config["configs"]) == 1
job_config = config["configs"][0]
assert job_config["outputs"] == [{"str": "podcasts/all"}]
assert job_config["inputs"] == []
assert job_config["args"] == []
assert job_config["env"]["PARTITION_REF"] == "podcasts/all"
print("✓ extract_podcasts_job config test passed")
def test_categorize_reviews_job():
"""Test categorize_reviews_job configuration."""
print("Testing categorize_reviews_job...")
config = run_job_config("categorize_reviews_job.py", "categorized_reviews/category=comedy/date=2020-01-01")
assert len(config["configs"]) == 1
job_config = config["configs"][0]
assert job_config["outputs"] == [{"str": "categorized_reviews/category=comedy/date=2020-01-01"}]
assert len(job_config["inputs"]) == 2
input_refs = [inp["partition_ref"]["str"] for inp in job_config["inputs"]]
assert "reviews/date=2020-01-01" in input_refs
assert "podcasts/all" in input_refs
assert job_config["args"] == ["comedy", "2020-01-01"]
assert job_config["env"]["TARGET_CATEGORY"] == "comedy"
assert job_config["env"]["TARGET_DATE"] == "2020-01-01"
print("✓ categorize_reviews_job config test passed")
def test_phrase_modeling_job():
"""Test phrase_modeling_job configuration."""
print("Testing phrase_modeling_job...")
config = run_job_config("phrase_modeling_job.py", "phrase_models/category=comedy/date=2020-01-01")
assert len(config["configs"]) == 1
job_config = config["configs"][0]
assert job_config["outputs"] == [{"str": "phrase_models/category=comedy/date=2020-01-01"}]
assert len(job_config["inputs"]) == 1
assert job_config["inputs"][0]["partition_ref"]["str"] == "categorized_reviews/category=comedy/date=2020-01-01"
assert job_config["args"] == ["comedy", "2020-01-01"]
assert job_config["env"]["TARGET_CATEGORY"] == "comedy"
assert job_config["env"]["TARGET_DATE"] == "2020-01-01"
print("✓ phrase_modeling_job config test passed")
def test_phrase_stats_job():
"""Test phrase_stats_job configuration."""
print("Testing phrase_stats_job...")
config = run_job_config("phrase_stats_job.py", "phrase_stats/category=comedy/date=2020-01-01")
assert len(config["configs"]) == 1
job_config = config["configs"][0]
assert job_config["outputs"] == [{"str": "phrase_stats/category=comedy/date=2020-01-01"}]
assert len(job_config["inputs"]) == 2
input_refs = [inp["partition_ref"]["str"] for inp in job_config["inputs"]]
assert "phrase_models/category=comedy/date=2020-01-01" in input_refs
assert "categorized_reviews/category=comedy/date=2020-01-01" in input_refs
assert job_config["args"] == ["comedy", "2020-01-01"]
assert job_config["env"]["TARGET_CATEGORY"] == "comedy"
assert job_config["env"]["TARGET_DATE"] == "2020-01-01"
print("✓ phrase_stats_job config test passed")
def test_daily_summary_job():
"""Test daily_summary_job configuration."""
print("Testing daily_summary_job...")
config = run_job_config("daily_summary_job.py", "daily_summaries/category=comedy/date=2020-01-01")
assert len(config["configs"]) == 1
job_config = config["configs"][0]
assert job_config["outputs"] == [{"str": "daily_summaries/category=comedy/date=2020-01-01"}]
assert len(job_config["inputs"]) == 2
input_refs = [inp["partition_ref"]["str"] for inp in job_config["inputs"]]
assert "phrase_stats/category=comedy/date=2020-01-01" in input_refs
assert "categorized_reviews/category=comedy/date=2020-01-01" in input_refs
assert job_config["args"] == ["comedy", "2020-01-01"]
assert job_config["env"]["TARGET_CATEGORY"] == "comedy"
assert job_config["env"]["TARGET_DATE"] == "2020-01-01"
print("✓ daily_summary_job config test passed")
def test_job_lookup():
"""Test job_lookup functionality."""
print("Testing job_lookup...")
test_cases = [
("reviews/date=2020-01-01", ":extract_reviews_job"),
("podcasts/all", ":extract_podcasts_job"),
("categorized_reviews/category=comedy/date=2020-01-01", ":categorize_reviews_job"),
("phrase_models/category=comedy/date=2020-01-01", ":phrase_modeling_job"),
("phrase_stats/category=comedy/date=2020-01-01", ":phrase_stats_job"),
("daily_summaries/category=comedy/date=2020-01-01", ":daily_summary_job"),
]
for partition_ref, expected_job_label in test_cases:
cmd = [sys.executable, "job_lookup.py", partition_ref]
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode != 0:
raise Exception(f"Job lookup failed for {partition_ref}: {result.stderr}")
response = json.loads(result.stdout)
# New format: {job_label: [partition_refs]}
assert expected_job_label in response
assert partition_ref in response[expected_job_label]
print("✓ job_lookup test passed")
def test_invalid_partition_refs():
"""Test that invalid partition refs are handled properly."""
print("Testing invalid partition refs...")
invalid_refs = [
"invalid/ref",
"reviews/date=invalid-date",
"categorized_reviews/category=/date=2020-01-01", # missing category
"unknown/partition=ref",
]
for invalid_ref in invalid_refs:
cmd = [sys.executable, "job_lookup.py", invalid_ref]
result = subprocess.run(cmd, capture_output=True, text=True)
# Should fail for invalid refs
assert result.returncode != 0, f"Expected failure for invalid ref: {invalid_ref}"
print("✓ invalid partition refs test passed")
def main():
"""Run all tests."""
print("Running podcast reviews job tests...")
print("=" * 50)
try:
test_extract_reviews_job()
test_extract_podcasts_job()
test_categorize_reviews_job()
test_phrase_modeling_job()
test_phrase_stats_job()
test_daily_summary_job()
test_job_lookup()
test_invalid_partition_refs()
print("=" * 50)
print("All tests passed! ✅")
except Exception as e:
print(f"Test failed: {e}")
sys.exit(1)
if __name__ == "__main__":
main()

View file

@ -0,0 +1,47 @@
#!/usr/bin/env python3
import sys
import json
import os
def main():
if len(sys.argv) < 2:
print("Usage: unified_job.py {config|exec} [args...]", file=sys.stderr)
sys.exit(1)
command = sys.argv[1]
if command == "config":
handle_config(sys.argv[2:])
elif command == "exec":
handle_exec(sys.argv[2:])
else:
print(f"Unknown command: {command}", file=sys.stderr)
print("Usage: unified_job.py {config|exec} [args...]", file=sys.stderr)
sys.exit(1)
def handle_config(args):
if len(args) < 1:
print("Config mode requires partition ref", file=sys.stderr)
sys.exit(1)
partition_ref = args[0]
config = {
"configs": [{
"outputs": [{"str": partition_ref}],
"inputs": [],
"args": ["Hello", "gorgeous", partition_ref],
"env": {"PARTITION_REF": partition_ref}
}]
}
print(json.dumps(config))
def handle_exec(args):
print("What a time to be alive.")
print(f"Partition ref: {os.getenv('PARTITION_REF', 'unknown')}")
print(f"Args: {args}")
if __name__ == "__main__":
    main()
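For completeness, a hedged sketch of invoking this toy job's config subcommand and checking its output (the partition ref is chosen arbitrarily):

```python
import json
import subprocess
import sys

result = subprocess.run(
    [sys.executable, "unified_job.py", "config", "demo/partition"],
    capture_output=True, text=True, check=True,
)
config = json.loads(result.stdout)
assert config["configs"][0]["args"] == ["Hello", "gorgeous", "demo/partition"]
print(json.dumps(config, indent=2))
```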