Stuart Axelbrooke 2025-07-31 02:14:52 -07:00
parent 6d55d54267
commit 82e1d0eb26
21 changed files with 197 additions and 2873 deletions

View file

@@ -209,11 +209,11 @@ python.toolchain(
 pip = use_extension("@rules_python//python/extensions:pip.bzl", "pip")
 pip.parse(
-    hub_name = "pypi",
+    hub_name = "databuild_pypi",
     python_version = "3.13",
     requirements_lock = "//:requirements_lock.txt",
 )
-use_repo(pip, "pypi")
+use_repo(pip, "databuild_pypi")

 # OCI (Docker images)
 oci = use_extension("@rules_oci//oci:extensions.bzl", "oci")

View file

@@ -150,7 +150,7 @@ py_binary(
     srcs = ["proto_wrapper.py"],
     main = "proto_wrapper.py",
     deps = [
-        "@pypi//betterproto2_compiler",
+        "@databuild_pypi//betterproto2_compiler",
    ],
 )
@@ -175,7 +175,7 @@ $(location @com_google_protobuf//:protoc) --python_betterproto2_out=$(GENDIR)/da
         ":protoc-gen-python_betterproto2",
         "//:ruff_binary",
         "@com_google_protobuf//:protoc",
-        "@pypi//betterproto2_compiler",
+        "@databuild_pypi//betterproto2_compiler",
     ],
 )
@@ -187,8 +187,8 @@ py_library(
     ],
     visibility = ["//visibility:public"],
     deps = [
-        "@pypi//betterproto2_compiler",
-        "@pypi//grpcio",
-        "@pypi//pytest",
+        "@databuild_pypi//betterproto2_compiler",
+        "@databuild_pypi//grpcio",
+        "@databuild_pypi//pytest",
     ],
 )

View file

@@ -3,6 +3,6 @@ py_test(
     srcs = glob(["*.py"]),
     deps = [
         "//databuild/dsl/python:dsl",
-        "@pypi//pytest",
+        "@databuild_pypi//pytest",
     ],
 )

View file

@@ -79,8 +79,11 @@ fn resolve(output_refs: &[String]) -> Result<HashMap<String, Vec<String>>, String> {
         .map_err(|e| format!("Failed to execute job lookup: {}", e))?;
     if !output.status.success() {
         error!("Job lookup failed: {}", output.status);
         let stderr = String::from_utf8_lossy(&output.stderr);
         error!("Job lookup failed: {}", stderr);
+        error!("stderr: {}", stderr);
+        let stdout = String::from_utf8_lossy(&output.stdout);
+        error!("stdout: {}", stdout);
         return Err(format!("Failed to run job lookup: {}", stderr));
     }

View file

@@ -1 +1,11 @@
 from databuild.py_proto_out.databuild.v1 import *
+
+from betterproto2 import Casing, OutputFormat
+
+def to_dict(d) -> dict:
+    """Helper for creating proper dicts from protobuf derived dataclasses."""
+    return d.to_dict(
+        casing=Casing.SNAKE,
+        output_format=OutputFormat.PYTHON,
+        include_default_values=True
+    )
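For reference, the job entrypoints further down in this commit switch from `response.to_dict()` to this helper. A minimal usage sketch, assuming only the betterproto2-generated `JobConfigureResponse` exported by `databuild.proto` (the no-arg construction is illustrative):

    import json
    from databuild.proto import JobConfigureResponse, to_dict

    response = JobConfigureResponse()  # illustrative; real responses come from configure()
    # Emits snake_case keys, plain Python values, and default fields included
    print(json.dumps(to_dict(response)))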

View file

@@ -4,6 +4,8 @@ load("@rules_oci//oci:defs.bzl", "oci_image", "oci_load")
 RUNFILES_PREFIX = """
 # ================= BEGIN RUNFILES INIT =================
 SCRIPT_PATH="$(realpath "$0")"
+
+# TODO should this be extracted to shared init script
 # Get the directory where the script is located
 if [[ -z "${RUNFILES_DIR:-}" ]]; then
@@ -71,6 +73,7 @@ def _databuild_job_cfg_impl(ctx):
         output = script,
         substitutions = {
             "%{EXECUTABLE_PATH}": configure_path,
+            "%{EXECUTABLE_SHORT_PATH}": ctx.attr.configure.files_to_run.executable.short_path,
             "%{RUNFILES_PREFIX}": RUNFILES_PREFIX,
             "%{PREFIX}": "EXECUTABLE_SUBCOMMAND=\"config\"\n",
         },
@@ -331,6 +334,7 @@ def _databuild_graph_lookup_impl(ctx):
             "%{RUNFILES_PREFIX}": RUNFILES_PREFIX,
             "%{PREFIX}": "",
             "%{EXECUTABLE_PATH}": ctx.attr.lookup.files_to_run.executable.path,
+            "%{EXECUTABLE_SHORT_PATH}": ctx.attr.lookup.files_to_run.executable.short_path,
         },
         is_executable = True,
     )
@@ -399,6 +403,7 @@ export DATABUILD_JOB_LOOKUP_PATH=$(rlocation _main/{lookup_path})
         output = script,
         substitutions = {
             "%{EXECUTABLE_PATH}": ctx.attr._analyze.files_to_run.executable.path,
+            "%{EXECUTABLE_SHORT_PATH}": ctx.attr._analyze.files_to_run.executable.short_path,
             "%{RUNFILES_PREFIX}": RUNFILES_PREFIX,
             "%{PREFIX}": script_prefix,
         },

View file

@@ -5,7 +5,32 @@ set -e
 %{PREFIX}

-EXECUTABLE_BINARY="$(rlocation "_main/$(basename "%{EXECUTABLE_PATH}")")"
+# Check if rlocation function is available
+if ! type rlocation >/dev/null 2>&1; then
+    echo "Error: rlocation function not available. Runfiles may not be properly initialized." >&2
+    exit 1
+fi
+
+# Resolve the executable using rlocation
+EXECUTABLE_BINARY="$(rlocation "_main/%{EXECUTABLE_SHORT_PATH}")"
+
+# Check if rlocation returned something
+if [[ -z "${EXECUTABLE_BINARY}" ]]; then
+    echo "Error: rlocation returned empty result for '_main/%{EXECUTABLE_SHORT_PATH}'" >&2
+    exit 1
+fi
+
+# Check if the resolved binary exists
+if [[ ! -f "${EXECUTABLE_BINARY}" ]]; then
+    echo "Error: Resolved executable '${EXECUTABLE_BINARY}' does not exist" >&2
+    exit 1
+fi
+
+# Check if the resolved binary is executable
+if [[ ! -x "${EXECUTABLE_BINARY}" ]]; then
+    echo "Error: Resolved executable '${EXECUTABLE_BINARY}' is not executable" >&2
+    exit 1
+fi
+
 # Run the configuration
 if [[ -n "${EXECUTABLE_SUBCOMMAND:-}" ]]; then

View file

@@ -36,6 +36,17 @@ py_test(
     deps = [":job_src"],
 )

+py_test(
+    name = "test_graph_analysis",
+    srcs = ["graph/graph_test.py"],
+    main = "graph/graph_test.py",
+    data = [
+        ":bazel_graph.analyze",
+        ":bazel_graph_lookup",
+    ],
+    deps = [":job_src"],
+)
+
 # Bazel-defined
 ## Graph
 databuild_graph(
@@ -51,8 +62,8 @@ databuild_graph(
 py_binary(
     name = "bazel_graph_lookup",
-    srcs = ["lookup.py"],
-    main = "lookup.py",
+    srcs = ["graph/lookup.py"],
+    main = "graph/lookup.py",
 )

 ## Ingest Color Votes

View file

@@ -0,0 +1,91 @@
+#!/usr/bin/env python3
+"""
+Integration test for the databuild graph analysis.
+
+This test verifies that when we request color vote reports, the graph analyzer
+correctly identifies all upstream dependencies and jobs required.
+"""
+
+import subprocess
+import json
+import unittest
+import os
+from pathlib import Path
+
+
+class GraphAnalysisTest(unittest.TestCase):
+    def setUp(self):
+        # Determine the path to bazel_graph.analyze
+        # In bazel test, we need to find the executable in the runfiles
+        runfiles_dir = os.environ.get('RUNFILES_DIR')
+        test_srcdir = os.environ.get('TEST_SRCDIR')
+
+        possible_paths = []
+        if runfiles_dir:
+            possible_paths.append(os.path.join(runfiles_dir, '_main', 'databuild', 'test', 'app', 'bazel_graph.analyze'))
+            possible_paths.append(os.path.join(runfiles_dir, 'databuild', 'test', 'app', 'bazel_graph.analyze'))
+        if test_srcdir:
+            possible_paths.append(os.path.join(test_srcdir, '_main', 'databuild', 'test', 'app', 'bazel_graph.analyze'))
+            possible_paths.append(os.path.join(test_srcdir, 'databuild', 'test', 'app', 'bazel_graph.analyze'))
+
+        # Fallback for local testing
+        possible_paths.extend([
+            'bazel-bin/databuild/test/app/bazel_graph.analyze',
+            './bazel_graph.analyze'
+        ])
+
+        self.graph_analyze = None
+        for path in possible_paths:
+            if os.path.exists(path):
+                self.graph_analyze = path
+                break
+
+        # Ensure the executable exists
+        if not self.graph_analyze:
+            self.skipTest(f"Graph analyze executable not found in any of these paths: {possible_paths}")
+
+    def run_graph_analyze(self, partition_refs):
+        """Run graph.analyze with the given partition references."""
+        cmd = [self.graph_analyze] + partition_refs
+        result = subprocess.run(cmd, capture_output=True, text=True, cwd=os.getcwd())
+
+        if result.returncode != 0:
+            self.fail(f"Graph analyze failed with return code {result.returncode}.\nStdout: {result.stdout}\nStderr: {result.stderr}")
+
+        # Parse the JSON output
+        try:
+            return json.loads(result.stdout)
+        except json.JSONDecodeError as e:
+            self.fail(f"Failed to parse JSON output: {e}\nOutput: {result.stdout}")
+
+    def test_single_color_report_dependencies(self):
+        """Test dependencies for a single color vote report."""
+        partition_refs = ["color_vote_report/2024-01-15/red"]
+        result = self.run_graph_analyze(partition_refs)
+        self.assertIn('nodes', result)
+        # TODO expand
+
+    def test_multiple_color_reports_same_date(self):
+        """Test dependencies when requesting multiple colors for the same date."""
+        partition_refs = [
+            "color_vote_report/2024-01-15/red",
+            "color_vote_report/2024-01-15/blue"
+        ]
+        result = self.run_graph_analyze(partition_refs)
+        self.assertIn('nodes', result)
+        # TODO expand
+
+    def test_multiple_dates_dependencies(self):
+        """Test dependencies when requesting reports for different dates."""
+        partition_refs = [
+            "color_vote_report/2024-01-15/red",
+            "color_vote_report/2024-01-16/red"
+        ]
+        result = self.run_graph_analyze(partition_refs)
+        self.assertIn('nodes', result)
+        # TODO expand
+
+
+if __name__ == '__main__':
+    unittest.main()

View file

@@ -1,5 +1,8 @@
 #!/usr/bin/env python3
 from collections import defaultdict
 import sys
+import json
+
 LABEL_BASE = "//databuild/test/app"
@@ -21,3 +24,6 @@ if __name__ == "__main__":
     results = defaultdict(list)
     for raw_ref in sys.argv[1:]:
         results[lookup(raw_ref)].append(raw_ref)
+
+    # Output the results as JSON
+    print(json.dumps(dict(results)))
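With this change the lookup script emits a JSON object mapping each producing job to the partition refs it was asked about. An illustrative sketch of that stdout contract; the job label and ref below are hypothetical examples, not taken from this diff:

    import json
    from collections import defaultdict

    # Hypothetical mapping mirroring lookup.py's output shape: job label -> partition refs
    results = defaultdict(list)
    results["//databuild/test/app:ingest_color_votes"].append("daily_color_votes/2024-01-15/red")
    print(json.dumps(dict(results)))
    # {"//databuild/test/app:ingest_color_votes": ["daily_color_votes/2024-01-15/red"]}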

View file

@@ -1,4 +1,4 @@
-from databuild.proto import PartitionRef, JobConfigureResponse, JobConfig
+from databuild.proto import PartitionRef, JobConfigureResponse, JobConfig, DepType, DataDep
 from databuild.test.app.colors import COLORS
 from datetime import date
@@ -29,7 +29,7 @@ def configure(outputs: list[PartitionRef]) -> JobConfigureResponse:
         configs.append(JobConfig(
             outputs=[output],
-            inputs=inputs,
+            inputs=[DataDep(dep_type_code=DepType.MATERIALIZE, dep_type_name="materialize", partition_ref=ref) for ref in inputs],
             args=[],
             env={
                 "DATA_DATE": data_date,

View file

@@ -3,17 +3,18 @@
 import sys
 import os
 import json
-from databuild.proto import PartitionRef
+from databuild.proto import PartitionRef, to_dict
 from databuild.test.app.jobs.aggregate_color_votes.config import configure
 from databuild.test.app.jobs.aggregate_color_votes.execute import execute

 if __name__ == "__main__":
     if sys.argv[1] == "config":
         response = configure([
             PartitionRef(str=raw_ref)
             for raw_ref in sys.argv[2:]
         ])
-        print(json.dumps(response.to_dict()))
+        print(json.dumps(to_dict(response)))
     elif sys.argv[1] == "exec":
         execute(os.environ["DATA_DATE"], os.environ["AGGREGATE_TYPE"])
     else:
View file

@@ -18,7 +18,7 @@ class TestAggregateColorVotesConfig(unittest.TestCase):
         # Check that inputs are from daily_color_votes
         for i, color in enumerate(COLORS):
             expected_input = f"daily_color_votes/2024-01-15/{color}"
-            self.assertEqual(config.inputs[i].str, expected_input)
+            self.assertEqual(config.inputs[i].partition_ref.str, expected_input)

     def test_configure_weekly_votes(self):
         outputs = [PartitionRef(str="votes_1w/2024-01-21")]
@@ -31,7 +31,7 @@ class TestAggregateColorVotesConfig(unittest.TestCase):
         # Check that inputs are from color_votes_1w
         for i, color in enumerate(COLORS):
             expected_input = f"color_votes_1w/2024-01-21/{color}"
-            self.assertEqual(config.inputs[i].str, expected_input)
+            self.assertEqual(config.inputs[i].partition_ref.str, expected_input)

     def test_configure_monthly_votes(self):
         outputs = [PartitionRef(str="votes_1m/2024-01-31")]
@@ -44,7 +44,7 @@ class TestAggregateColorVotesConfig(unittest.TestCase):
         # Check that inputs are from color_votes_1m
         for i, color in enumerate(COLORS):
             expected_input = f"color_votes_1m/2024-01-31/{color}"
-            self.assertEqual(config.inputs[i].str, expected_input)
+            self.assertEqual(config.inputs[i].partition_ref.str, expected_input)

     def test_configure_multiple_outputs(self):
         outputs = [

View file

@@ -1,4 +1,4 @@
-from databuild.proto import PartitionRef, JobConfigureResponse, JobConfig
+from databuild.proto import PartitionRef, JobConfigureResponse, JobConfig, DataDep, DepType
 from datetime import date
 from collections import defaultdict
@@ -40,7 +40,7 @@ def configure(outputs: list[PartitionRef]) -> JobConfigureResponse:
     # Single job config for all outputs - pass output partition refs as args
     config = JobConfig(
         outputs=outputs,
-        inputs=inputs,
+        inputs=[DataDep(dep_type_code=DepType.MATERIALIZE, dep_type_name="materialize", partition_ref=ref) for ref in inputs],
         args=[output.str for output in outputs],
         env={}
     )

View file

@@ -3,9 +3,10 @@
 import sys
 import os
 import json
-from databuild.proto import PartitionRef
+from databuild.proto import PartitionRef, to_dict
 from databuild.test.app.jobs.color_vote_report_calc.config import configure
 from databuild.test.app.jobs.color_vote_report_calc.execute import execute
+from betterproto2 import Casing, OutputFormat

 if __name__ == "__main__":
     if sys.argv[1] == "config":
@@ -13,7 +14,7 @@ if __name__ == "__main__":
             PartitionRef(str=raw_ref)
             for raw_ref in sys.argv[2:]
         ])
-        print(json.dumps(response.to_dict()))
+        print(json.dumps(to_dict(response)))
     elif sys.argv[1] == "exec":
         execute(sys.argv[2:])
     else:

View file

@@ -21,7 +21,7 @@ class TestColorVoteReportCalcConfig(unittest.TestCase):
             "color_votes_1w/2024-01-15/red",
             "color_votes_1m/2024-01-15/red"
         ]
-        actual_inputs = [inp.str for inp in config.inputs]
+        actual_inputs = [inp.partition_ref.str for inp in config.inputs]
         for expected in expected_inputs:
             self.assertIn(expected, actual_inputs)
@@ -52,7 +52,7 @@ class TestColorVoteReportCalcConfig(unittest.TestCase):
         self.assertEqual(len(config.outputs), 2)

         # Should have total vote inputs for both dates
-        actual_inputs = [inp.str for inp in config.inputs]
+        actual_inputs = [inp.partition_ref.str for inp in config.inputs]
         self.assertIn("daily_votes/2024-01-15", actual_inputs)
         self.assertIn("daily_votes/2024-01-16", actual_inputs)

View file

@@ -3,9 +3,10 @@
 import sys
 import os
 import json
-from databuild.proto import PartitionRef
+from databuild.proto import PartitionRef, to_dict
 from databuild.test.app.jobs.ingest_color_votes.config import configure
 from databuild.test.app.jobs.ingest_color_votes.execute import execute
+from betterproto2 import Casing

 if __name__ == "__main__":
     if sys.argv[1] == "config":
@@ -13,7 +14,7 @@ if __name__ == "__main__":
             PartitionRef(str=raw_ref)
             for raw_ref in sys.argv[2:]
         ])
-        print(json.dumps(response.to_dict()))
+        print(json.dumps(to_dict(response)))
     elif sys.argv[1] == "exec":
         execute(os.environ["DATA_DATE"], os.environ["COLOR"])
     else:

View file

@@ -1,4 +1,4 @@
-from databuild.proto import PartitionRef, JobConfigureResponse, JobConfig
+from databuild.proto import PartitionRef, JobConfigureResponse, JobConfig, DepType, DataDep
 from datetime import date, timedelta
 from collections import defaultdict
@@ -41,7 +41,7 @@ def configure(outputs: list[PartitionRef]) -> JobConfigureResponse:
         configs.append(JobConfig(
             outputs=output_partitions,
-            inputs=inputs,
+            inputs=[DataDep(dep_type_code=DepType.MATERIALIZE, dep_type_name="materialize", partition_ref=ref) for ref in inputs],
             args=[],
             env=env
         ))

View file

@@ -3,9 +3,10 @@
 import sys
 import os
 import json
-from databuild.proto import PartitionRef
+from databuild.proto import PartitionRef, to_dict
 from databuild.test.app.jobs.trailing_color_votes.config import configure
 from databuild.test.app.jobs.trailing_color_votes.execute import execute
+from betterproto2 import Casing, OutputFormat

 if __name__ == "__main__":
     if sys.argv[1] == "config":
@@ -13,7 +14,7 @@ if __name__ == "__main__":
             PartitionRef(str=raw_ref)
             for raw_ref in sys.argv[2:]
         ])
-        print(json.dumps(response.to_dict()))
+        print(json.dumps(to_dict(response)))
     elif sys.argv[1] == "exec":
         execute(os.environ["DATA_DATE"], os.environ["COLOR"])
     else:

View file

@@ -25,29 +25,6 @@ pip.parse(
 )
 use_repo(pip, "pypi")

-# Rules OCI - necessary for producing a docker container
-bazel_dep(name = "rules_oci", version = "2.2.6")
-# For testing, we also recommend https://registry.bazel.build/modules/container_structure_test
-oci = use_extension("@rules_oci//oci:extensions.bzl", "oci")
-
-# Declare external images you need to pull, for example:
-oci.pull(
-    name = "debian",
-    image = "docker.io/library/python",
-    platforms = [
-        "linux/arm64/v8",
-        "linux/amd64",
-    ],
-    # 'latest' is not reproducible, but it's convenient.
-    # During the build we print a WARNING message that includes recommended 'digest' and 'platforms'
-    # values which you can use here in place of 'tag' to pin for reproducibility.
-    tag = "3.12-bookworm",
-)
-
-# For each oci.pull call, repeat the "name" here to expose them as dependencies.
-use_repo(oci, "debian", "debian_linux_amd64", "debian_linux_arm64_v8")
-
 # Platforms for specifying linux/arm
 bazel_dep(name = "platforms", version = "0.0.11")

File diff suppressed because one or more lines are too long