Big bump

parent 6d55d54267
commit 82e1d0eb26

21 changed files with 197 additions and 2873 deletions
@@ -209,11 +209,11 @@ python.toolchain(
 pip = use_extension("@rules_python//python/extensions:pip.bzl", "pip")
 pip.parse(
-    hub_name = "pypi",
+    hub_name = "databuild_pypi",
     python_version = "3.13",
     requirements_lock = "//:requirements_lock.txt",
 )
-use_repo(pip, "pypi")
+use_repo(pip, "databuild_pypi")
 
 # OCI (Docker images)
 oci = use_extension("@rules_oci//oci:extensions.bzl", "oci")
 
@@ -150,7 +150,7 @@ py_binary(
     srcs = ["proto_wrapper.py"],
     main = "proto_wrapper.py",
     deps = [
-        "@pypi//betterproto2_compiler",
+        "@databuild_pypi//betterproto2_compiler",
     ],
 )
 
@@ -175,7 +175,7 @@ $(location @com_google_protobuf//:protoc) --python_betterproto2_out=$(GENDIR)/da
         ":protoc-gen-python_betterproto2",
         "//:ruff_binary",
         "@com_google_protobuf//:protoc",
-        "@pypi//betterproto2_compiler",
+        "@databuild_pypi//betterproto2_compiler",
     ],
 )
 
@@ -187,8 +187,8 @@ py_library(
     ],
     visibility = ["//visibility:public"],
     deps = [
-        "@pypi//betterproto2_compiler",
-        "@pypi//grpcio",
-        "@pypi//pytest",
+        "@databuild_pypi//betterproto2_compiler",
+        "@databuild_pypi//grpcio",
+        "@databuild_pypi//pytest",
     ],
 )
 
@@ -3,6 +3,6 @@ py_test(
     srcs = glob(["*.py"]),
    deps = [
         "//databuild/dsl/python:dsl",
-        "@pypi//pytest",
+        "@databuild_pypi//pytest",
     ],
 )
 
@@ -79,8 +79,11 @@ fn resolve(output_refs: &[String]) -> Result<HashMap<String, Vec<String>>, Strin
         .map_err(|e| format!("Failed to execute job lookup: {}", e))?;
 
     if !output.status.success() {
+        error!("Job lookup failed: {}", output.status);
         let stderr = String::from_utf8_lossy(&output.stderr);
-        error!("Job lookup failed: {}", stderr);
+        error!("stderr: {}", stderr);
+        let stdout = String::from_utf8_lossy(&output.stdout);
+        error!("stdout: {}", stdout);
         return Err(format!("Failed to run job lookup: {}", stderr));
     }
 
@@ -1 +1,11 @@
 from databuild.py_proto_out.databuild.v1 import *
+from betterproto2 import Casing, OutputFormat
+
+
+def to_dict(d) -> dict:
+    """Helper for creating proper dicts from protobuf derived dataclasses."""
+    return d.to_dict(
+        casing=Casing.SNAKE,
+        output_format=OutputFormat.PYTHON,
+        include_default_values=True
+    )
 
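Note: a minimal usage sketch of the new to_dict helper, assuming a betterproto2-generated message such as PartitionRef from this repo; the exact dict contents depend on the generated fields.

    from databuild.proto import PartitionRef, to_dict

    # Defaults are included, keys stay snake_case, and values are plain
    # Python objects (OutputFormat.PYTHON), ready for json.dumps.
    ref = PartitionRef(str="daily_votes/2024-01-15")
    print(to_dict(ref))  # e.g. {'str': 'daily_votes/2024-01-15'}
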
@@ -4,6 +4,8 @@ load("@rules_oci//oci:defs.bzl", "oci_image", "oci_load")
 RUNFILES_PREFIX = """
 # ================= BEGIN RUNFILES INIT =================
 
+SCRIPT_PATH="$(realpath "$0")"
+
 # TODO should this be extracted to shared init script
 # Get the directory where the script is located
 if [[ -z "${RUNFILES_DIR:-}" ]]; then
 
@@ -71,6 +73,7 @@ def _databuild_job_cfg_impl(ctx):
         output = script,
         substitutions = {
             "%{EXECUTABLE_PATH}": configure_path,
+            "%{EXECUTABLE_SHORT_PATH}": ctx.attr.configure.files_to_run.executable.short_path,
             "%{RUNFILES_PREFIX}": RUNFILES_PREFIX,
             "%{PREFIX}": "EXECUTABLE_SUBCOMMAND=\"config\"\n",
         },
 
@@ -331,6 +334,7 @@ def _databuild_graph_lookup_impl(ctx):
             "%{RUNFILES_PREFIX}": RUNFILES_PREFIX,
             "%{PREFIX}": "",
             "%{EXECUTABLE_PATH}": ctx.attr.lookup.files_to_run.executable.path,
+            "%{EXECUTABLE_SHORT_PATH}": ctx.attr.lookup.files_to_run.executable.short_path,
         },
         is_executable = True,
     )
 
@@ -399,6 +403,7 @@ export DATABUILD_JOB_LOOKUP_PATH=$(rlocation _main/{lookup_path})
         output = script,
         substitutions = {
             "%{EXECUTABLE_PATH}": ctx.attr._analyze.files_to_run.executable.path,
+            "%{EXECUTABLE_SHORT_PATH}": ctx.attr._analyze.files_to_run.executable.short_path,
             "%{RUNFILES_PREFIX}": RUNFILES_PREFIX,
             "%{PREFIX}": script_prefix,
         },
 
@@ -5,7 +5,32 @@ set -e
 
 %{PREFIX}
 
-EXECUTABLE_BINARY="$(rlocation "_main/$(basename "%{EXECUTABLE_PATH}")")"
+# Check if rlocation function is available
+if ! type rlocation >/dev/null 2>&1; then
+    echo "Error: rlocation function not available. Runfiles may not be properly initialized." >&2
+    exit 1
+fi
+
+# Resolve the executable using rlocation
+EXECUTABLE_BINARY="$(rlocation "_main/%{EXECUTABLE_SHORT_PATH}")"
+
+# Check if rlocation returned something
+if [[ -z "${EXECUTABLE_BINARY}" ]]; then
+    echo "Error: rlocation returned empty result for '_main/%{EXECUTABLE_SHORT_PATH}'" >&2
+    exit 1
+fi
+
+# Check if the resolved binary exists
+if [[ ! -f "${EXECUTABLE_BINARY}" ]]; then
+    echo "Error: Resolved executable '${EXECUTABLE_BINARY}' does not exist" >&2
+    exit 1
+fi
+
+# Check if the resolved binary is executable
+if [[ ! -x "${EXECUTABLE_BINARY}" ]]; then
+    echo "Error: Resolved executable '${EXECUTABLE_BINARY}' is not executable" >&2
+    exit 1
+fi
 
 # Run the configuration
 if [[ -n "${EXECUTABLE_SUBCOMMAND:-}" ]]; then
 
@@ -36,6 +36,17 @@ py_test(
     deps = [":job_src"],
 )
 
+py_test(
+    name = "test_graph_analysis",
+    srcs = ["graph/graph_test.py"],
+    main = "graph/graph_test.py",
+    data = [
+        ":bazel_graph.analyze",
+        ":bazel_graph_lookup",
+    ],
+    deps = [":job_src"],
+)
+
 # Bazel-defined
 ## Graph
 databuild_graph(
 
@@ -51,8 +62,8 @@ databuild_graph(
 
 py_binary(
     name = "bazel_graph_lookup",
-    srcs = ["lookup.py"],
-    main = "lookup.py",
+    srcs = ["graph/lookup.py"],
+    main = "graph/lookup.py",
 )
 
 ## Ingest Color Votes
 
databuild/test/app/graph/graph_test.py (new file, 91 lines)
@@ -0,0 +1,91 @@
+#!/usr/bin/env python3
+"""
+Integration test for the databuild graph analysis.
+
+This test verifies that when we request color vote reports, the graph analyzer
+correctly identifies all upstream dependencies and jobs required.
+"""
+
+import subprocess
+import json
+import unittest
+import os
+from pathlib import Path
+
+
+class GraphAnalysisTest(unittest.TestCase):
+    def setUp(self):
+        # Determine the path to bazel_graph.analyze
+        # In bazel test, we need to find the executable in the runfiles
+        runfiles_dir = os.environ.get('RUNFILES_DIR')
+        test_srcdir = os.environ.get('TEST_SRCDIR')
+
+        possible_paths = []
+        if runfiles_dir:
+            possible_paths.append(os.path.join(runfiles_dir, '_main', 'databuild', 'test', 'app', 'bazel_graph.analyze'))
+            possible_paths.append(os.path.join(runfiles_dir, 'databuild', 'test', 'app', 'bazel_graph.analyze'))
+
+        if test_srcdir:
+            possible_paths.append(os.path.join(test_srcdir, '_main', 'databuild', 'test', 'app', 'bazel_graph.analyze'))
+            possible_paths.append(os.path.join(test_srcdir, 'databuild', 'test', 'app', 'bazel_graph.analyze'))
+
+        # Fallback for local testing
+        possible_paths.extend([
+            'bazel-bin/databuild/test/app/bazel_graph.analyze',
+            './bazel_graph.analyze'
+        ])
+
+        self.graph_analyze = None
+        for path in possible_paths:
+            if os.path.exists(path):
+                self.graph_analyze = path
+                break
+
+        # Ensure the executable exists
+        if not self.graph_analyze:
+            self.skipTest(f"Graph analyze executable not found in any of these paths: {possible_paths}")
+
+    def run_graph_analyze(self, partition_refs):
+        """Run graph.analyze with the given partition references."""
+        cmd = [self.graph_analyze] + partition_refs
+        result = subprocess.run(cmd, capture_output=True, text=True, cwd=os.getcwd())
+
+        if result.returncode != 0:
+            self.fail(f"Graph analyze failed with return code {result.returncode}.\nStdout: {result.stdout}\nStderr: {result.stderr}")
+
+        # Parse the JSON output
+        try:
+            return json.loads(result.stdout)
+        except json.JSONDecodeError as e:
+            self.fail(f"Failed to parse JSON output: {e}\nOutput: {result.stdout}")
+
+    def test_single_color_report_dependencies(self):
+        """Test dependencies for a single color vote report."""
+        partition_refs = ["color_vote_report/2024-01-15/red"]
+        result = self.run_graph_analyze(partition_refs)
+        self.assertIn('nodes', result)
+        # TODO expand
+
+    def test_multiple_color_reports_same_date(self):
+        """Test dependencies when requesting multiple colors for the same date."""
+        partition_refs = [
+            "color_vote_report/2024-01-15/red",
+            "color_vote_report/2024-01-15/blue"
+        ]
+        result = self.run_graph_analyze(partition_refs)
+        self.assertIn('nodes', result)
+        # TODO expand
+
+    def test_multiple_dates_dependencies(self):
+        """Test dependencies when requesting reports for different dates."""
+        partition_refs = [
+            "color_vote_report/2024-01-15/red",
+            "color_vote_report/2024-01-16/red"
+        ]
+        result = self.run_graph_analyze(partition_refs)
+        self.assertIn('nodes', result)
+        # TODO expand
+
+
+if __name__ == '__main__':
+    unittest.main()
 
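Note: the setUp above probes RUNFILES_DIR/TEST_SRCDIR and hard-coded fallbacks by hand; a possible alternative sketch, assuming the runfiles library from rules_python were added as a dependency, resolves the binary in one call:

    # Hypothetical alternative; assumes @rules_python//python/runfiles is a dep.
    from python.runfiles import runfiles

    r = runfiles.Create()
    path = r.Rlocation("_main/databuild/test/app/bazel_graph.analyze")
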
@@ -1,5 +1,8 @@
+#!/usr/bin/env python3
+
 from collections import defaultdict
 import sys
+import json
 
 LABEL_BASE = "//databuild/test/app"
 
@@ -21,3 +24,6 @@ if __name__ == "__main__":
     results = defaultdict(list)
     for raw_ref in sys.argv[1:]:
         results[lookup(raw_ref)].append(raw_ref)
+
+    # Output the results as JSON
+    print(json.dumps(dict(results)))
 
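Note: the lookup script now emits a JSON object on stdout (job -> partition refs), which the caller that shells out to it, such as the Rust resolver earlier in this commit, can parse; a hedged sketch of the output shape, using a hypothetical job label:

    import json
    from collections import defaultdict

    results = defaultdict(list)
    # lookup() maps each partition ref to the job that builds it.
    results["//databuild/test/app:ingest_color_votes"].append("daily_color_votes/2024-01-15/red")
    print(json.dumps(dict(results)))
    # {"//databuild/test/app:ingest_color_votes": ["daily_color_votes/2024-01-15/red"]}
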
@@ -1,4 +1,4 @@
-from databuild.proto import PartitionRef, JobConfigureResponse, JobConfig
+from databuild.proto import PartitionRef, JobConfigureResponse, JobConfig, DepType, DataDep
 from databuild.test.app.colors import COLORS
 from datetime import date
 
@@ -29,7 +29,7 @@ def configure(outputs: list[PartitionRef]) -> JobConfigureResponse:
 
     configs.append(JobConfig(
         outputs=[output],
-        inputs=inputs,
+        inputs=[DataDep(dep_type_code=DepType.MATERIALIZE, dep_type_name="materialize", partition_ref=ref) for ref in inputs],
         args=[],
         env={
             "DATA_DATE": data_date,
 
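Note: a sketch of the shape change above, assuming DataDep and DepType behave as the generated dataclasses their usage here implies; each bare input PartitionRef is now wrapped in a DataDep tagged as a materialize dependency, which is why the tests below read refs via .partition_ref.str:

    from databuild.proto import DataDep, DepType, PartitionRef

    inputs = [PartitionRef(str="daily_color_votes/2024-01-15/red")]  # hypothetical ref
    deps = [
        DataDep(dep_type_code=DepType.MATERIALIZE, dep_type_name="materialize", partition_ref=ref)
        for ref in inputs
    ]
    assert deps[0].partition_ref.str == "daily_color_votes/2024-01-15/red"
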
@@ -3,17 +3,18 @@
 import sys
 import os
 import json
-from databuild.proto import PartitionRef
+from databuild.proto import PartitionRef, to_dict
 from databuild.test.app.jobs.aggregate_color_votes.config import configure
 from databuild.test.app.jobs.aggregate_color_votes.execute import execute
+
 
 if __name__ == "__main__":
     if sys.argv[1] == "config":
         response = configure([
             PartitionRef(str=raw_ref)
             for raw_ref in sys.argv[2:]
         ])
-        print(json.dumps(response.to_dict()))
+        print(json.dumps(to_dict(response)))
     elif sys.argv[1] == "exec":
         execute(os.environ["DATA_DATE"], os.environ["AGGREGATE_TYPE"])
     else:
 
@@ -18,7 +18,7 @@ class TestAggregateColorVotesConfig(unittest.TestCase):
         # Check that inputs are from daily_color_votes
         for i, color in enumerate(COLORS):
             expected_input = f"daily_color_votes/2024-01-15/{color}"
-            self.assertEqual(config.inputs[i].str, expected_input)
+            self.assertEqual(config.inputs[i].partition_ref.str, expected_input)
 
     def test_configure_weekly_votes(self):
         outputs = [PartitionRef(str="votes_1w/2024-01-21")]
 
@@ -31,7 +31,7 @@ class TestAggregateColorVotesConfig(unittest.TestCase):
         # Check that inputs are from color_votes_1w
         for i, color in enumerate(COLORS):
             expected_input = f"color_votes_1w/2024-01-21/{color}"
-            self.assertEqual(config.inputs[i].str, expected_input)
+            self.assertEqual(config.inputs[i].partition_ref.str, expected_input)
 
     def test_configure_monthly_votes(self):
         outputs = [PartitionRef(str="votes_1m/2024-01-31")]
 
@@ -44,7 +44,7 @@ class TestAggregateColorVotesConfig(unittest.TestCase):
         # Check that inputs are from color_votes_1m
         for i, color in enumerate(COLORS):
             expected_input = f"color_votes_1m/2024-01-31/{color}"
-            self.assertEqual(config.inputs[i].str, expected_input)
+            self.assertEqual(config.inputs[i].partition_ref.str, expected_input)
 
     def test_configure_multiple_outputs(self):
         outputs = [
 
@@ -1,4 +1,4 @@
-from databuild.proto import PartitionRef, JobConfigureResponse, JobConfig
+from databuild.proto import PartitionRef, JobConfigureResponse, JobConfig, DataDep, DepType
 from datetime import date
 from collections import defaultdict
 
@@ -40,7 +40,7 @@ def configure(outputs: list[PartitionRef]) -> JobConfigureResponse:
     # Single job config for all outputs - pass output partition refs as args
     config = JobConfig(
         outputs=outputs,
-        inputs=inputs,
+        inputs=[DataDep(dep_type_code=DepType.MATERIALIZE, dep_type_name="materialize", partition_ref=ref) for ref in inputs],
         args=[output.str for output in outputs],
         env={}
     )
 
@@ -3,9 +3,10 @@
 import sys
 import os
 import json
-from databuild.proto import PartitionRef
+from databuild.proto import PartitionRef, to_dict
 from databuild.test.app.jobs.color_vote_report_calc.config import configure
 from databuild.test.app.jobs.color_vote_report_calc.execute import execute
+from betterproto2 import Casing, OutputFormat
 
 if __name__ == "__main__":
     if sys.argv[1] == "config":
 
@@ -13,7 +14,7 @@ if __name__ == "__main__":
             PartitionRef(str=raw_ref)
             for raw_ref in sys.argv[2:]
         ])
-        print(json.dumps(response.to_dict()))
+        print(json.dumps(to_dict(response)))
     elif sys.argv[1] == "exec":
         execute(sys.argv[2:])
     else:
 
@@ -21,7 +21,7 @@ class TestColorVoteReportCalcConfig(unittest.TestCase):
             "color_votes_1w/2024-01-15/red",
             "color_votes_1m/2024-01-15/red"
         ]
-        actual_inputs = [inp.str for inp in config.inputs]
+        actual_inputs = [inp.partition_ref.str for inp in config.inputs]
         for expected in expected_inputs:
             self.assertIn(expected, actual_inputs)
 
@@ -52,7 +52,7 @@ class TestColorVoteReportCalcConfig(unittest.TestCase):
         self.assertEqual(len(config.outputs), 2)
 
         # Should have total vote inputs for both dates
-        actual_inputs = [inp.str for inp in config.inputs]
+        actual_inputs = [inp.partition_ref.str for inp in config.inputs]
         self.assertIn("daily_votes/2024-01-15", actual_inputs)
         self.assertIn("daily_votes/2024-01-16", actual_inputs)
 
@@ -3,9 +3,10 @@
 import sys
 import os
 import json
-from databuild.proto import PartitionRef
+from databuild.proto import PartitionRef, to_dict
 from databuild.test.app.jobs.ingest_color_votes.config import configure
 from databuild.test.app.jobs.ingest_color_votes.execute import execute
+from betterproto2 import Casing
 
 if __name__ == "__main__":
     if sys.argv[1] == "config":
 
@@ -13,7 +14,7 @@ if __name__ == "__main__":
             PartitionRef(str=raw_ref)
             for raw_ref in sys.argv[2:]
         ])
-        print(json.dumps(response.to_dict()))
+        print(json.dumps(to_dict(response)))
     elif sys.argv[1] == "exec":
         execute(os.environ["DATA_DATE"], os.environ["COLOR"])
     else:
 
@@ -1,4 +1,4 @@
-from databuild.proto import PartitionRef, JobConfigureResponse, JobConfig
+from databuild.proto import PartitionRef, JobConfigureResponse, JobConfig, DepType, DataDep
 from datetime import date, timedelta
 from collections import defaultdict
 
@@ -41,7 +41,7 @@ def configure(outputs: list[PartitionRef]) -> JobConfigureResponse:
 
     configs.append(JobConfig(
         outputs=output_partitions,
-        inputs=inputs,
+        inputs=[DataDep(dep_type_code=DepType.MATERIALIZE, dep_type_name="materialize", partition_ref=ref) for ref in inputs],
         args=[],
         env=env
     ))
 
@@ -3,9 +3,10 @@
 import sys
 import os
 import json
-from databuild.proto import PartitionRef
+from databuild.proto import PartitionRef, to_dict
 from databuild.test.app.jobs.trailing_color_votes.config import configure
 from databuild.test.app.jobs.trailing_color_votes.execute import execute
+from betterproto2 import Casing, OutputFormat
 
 if __name__ == "__main__":
     if sys.argv[1] == "config":
 
@@ -13,7 +14,7 @@ if __name__ == "__main__":
             PartitionRef(str=raw_ref)
             for raw_ref in sys.argv[2:]
         ])
-        print(json.dumps(response.to_dict()))
+        print(json.dumps(to_dict(response)))
     elif sys.argv[1] == "exec":
         execute(os.environ["DATA_DATE"], os.environ["COLOR"])
     else:
 
@@ -25,29 +25,6 @@ pip.parse(
 )
 use_repo(pip, "pypi")
 
-# Rules OCI - necessary for producing a docker container
-bazel_dep(name = "rules_oci", version = "2.2.6")
-# For testing, we also recommend https://registry.bazel.build/modules/container_structure_test
-
-oci = use_extension("@rules_oci//oci:extensions.bzl", "oci")
-
-# Declare external images you need to pull, for example:
-oci.pull(
-    name = "debian",
-    image = "docker.io/library/python",
-    platforms = [
-        "linux/arm64/v8",
-        "linux/amd64",
-    ],
-    # 'latest' is not reproducible, but it's convenient.
-    # During the build we print a WARNING message that includes recommended 'digest' and 'platforms'
-    # values which you can use here in place of 'tag' to pin for reproducibility.
-    tag = "3.12-bookworm",
-)
-
-# For each oci.pull call, repeat the "name" here to expose them as dependencies.
-use_repo(oci, "debian", "debian_linux_amd64", "debian_linux_arm64_v8")
-
 # Platforms for specifying linux/arm
 bazel_dep(name = "platforms", version = "0.0.11")
 
File diff suppressed because one or more lines are too long