Stuart Axelbrooke 2025-07-31 02:14:52 -07:00
parent 6d55d54267
commit 82e1d0eb26
21 changed files with 197 additions and 2873 deletions

View file

@@ -209,11 +209,11 @@ python.toolchain(
 pip = use_extension("@rules_python//python/extensions:pip.bzl", "pip")
 pip.parse(
-    hub_name = "pypi",
+    hub_name = "databuild_pypi",
     python_version = "3.13",
     requirements_lock = "//:requirements_lock.txt",
 )
-use_repo(pip, "pypi")
+use_repo(pip, "databuild_pypi")

 # OCI (Docker images)
 oci = use_extension("@rules_oci//oci:extensions.bzl", "oci")

View file

@@ -150,7 +150,7 @@ py_binary(
     srcs = ["proto_wrapper.py"],
     main = "proto_wrapper.py",
     deps = [
-        "@pypi//betterproto2_compiler",
+        "@databuild_pypi//betterproto2_compiler",
    ],
 )
@@ -175,7 +175,7 @@ $(location @com_google_protobuf//:protoc) --python_betterproto2_out=$(GENDIR)/da
         ":protoc-gen-python_betterproto2",
         "//:ruff_binary",
         "@com_google_protobuf//:protoc",
-        "@pypi//betterproto2_compiler",
+        "@databuild_pypi//betterproto2_compiler",
     ],
 )
@@ -187,8 +187,8 @@ py_library(
     ],
     visibility = ["//visibility:public"],
     deps = [
-        "@pypi//betterproto2_compiler",
-        "@pypi//grpcio",
-        "@pypi//pytest",
+        "@databuild_pypi//betterproto2_compiler",
+        "@databuild_pypi//grpcio",
+        "@databuild_pypi//pytest",
     ],
 )

View file

@@ -3,6 +3,6 @@ py_test(
     srcs = glob(["*.py"]),
     deps = [
         "//databuild/dsl/python:dsl",
-        "@pypi//pytest",
+        "@databuild_pypi//pytest",
     ],
 )

View file

@@ -79,8 +79,11 @@ fn resolve(output_refs: &[String]) -> Result<HashMap<String, Vec<String>>, String> {
         .map_err(|e| format!("Failed to execute job lookup: {}", e))?;
     if !output.status.success() {
         error!("Job lookup failed: {}", output.status);
         let stderr = String::from_utf8_lossy(&output.stderr);
         error!("Job lookup failed: {}", stderr);
+        error!("stderr: {}", stderr);
+        let stdout = String::from_utf8_lossy(&output.stdout);
+        error!("stdout: {}", stdout);
         return Err(format!("Failed to run job lookup: {}", stderr));
     }

View file

@@ -1 +1,11 @@
 from databuild.py_proto_out.databuild.v1 import *
+
+from betterproto2 import Casing, OutputFormat
+
+def to_dict(d) -> dict:
+    """Helper for creating proper dicts from protobuf derived dataclasses."""
+    return d.to_dict(
+        casing=Casing.SNAKE,
+        output_format=OutputFormat.PYTHON,
+        include_default_values=True
+    )
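For reference, the job entrypoints further down in this commit switch from `response.to_dict()` to this helper. A minimal usage sketch, assuming only the betterproto2-generated `JobConfigureResponse` exported by `databuild.proto` (the no-arg construction is illustrative):

    import json
    from databuild.proto import JobConfigureResponse, to_dict

    response = JobConfigureResponse()  # illustrative; real responses come from configure()
    # Emits snake_case keys, plain Python values, and default fields included
    print(json.dumps(to_dict(response)))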

View file

@@ -4,6 +4,8 @@ load("@rules_oci//oci:defs.bzl", "oci_image", "oci_load")
 RUNFILES_PREFIX = """
 # ================= BEGIN RUNFILES INIT =================
 SCRIPT_PATH="$(realpath "$0")"
+
+# TODO should this be extracted to shared init script
 # Get the directory where the script is located
 if [[ -z "${RUNFILES_DIR:-}" ]]; then
@@ -71,6 +73,7 @@ def _databuild_job_cfg_impl(ctx):
         output = script,
         substitutions = {
             "%{EXECUTABLE_PATH}": configure_path,
+            "%{EXECUTABLE_SHORT_PATH}": ctx.attr.configure.files_to_run.executable.short_path,
             "%{RUNFILES_PREFIX}": RUNFILES_PREFIX,
             "%{PREFIX}": "EXECUTABLE_SUBCOMMAND=\"config\"\n",
         },
@@ -331,6 +334,7 @@ def _databuild_graph_lookup_impl(ctx):
             "%{RUNFILES_PREFIX}": RUNFILES_PREFIX,
             "%{PREFIX}": "",
             "%{EXECUTABLE_PATH}": ctx.attr.lookup.files_to_run.executable.path,
+            "%{EXECUTABLE_SHORT_PATH}": ctx.attr.lookup.files_to_run.executable.short_path,
         },
         is_executable = True,
     )
@@ -399,6 +403,7 @@ export DATABUILD_JOB_LOOKUP_PATH=$(rlocation _main/{lookup_path})
         output = script,
         substitutions = {
             "%{EXECUTABLE_PATH}": ctx.attr._analyze.files_to_run.executable.path,
+            "%{EXECUTABLE_SHORT_PATH}": ctx.attr._analyze.files_to_run.executable.short_path,
             "%{RUNFILES_PREFIX}": RUNFILES_PREFIX,
             "%{PREFIX}": script_prefix,
         },

View file

@@ -5,7 +5,32 @@ set -e
 %{PREFIX}

-EXECUTABLE_BINARY="$(rlocation "_main/$(basename "%{EXECUTABLE_PATH}")")"
+# Check if rlocation function is available
+if ! type rlocation >/dev/null 2>&1; then
+    echo "Error: rlocation function not available. Runfiles may not be properly initialized." >&2
+    exit 1
+fi
+
+# Resolve the executable using rlocation
+EXECUTABLE_BINARY="$(rlocation "_main/%{EXECUTABLE_SHORT_PATH}")"
+
+# Check if rlocation returned something
+if [[ -z "${EXECUTABLE_BINARY}" ]]; then
+    echo "Error: rlocation returned empty result for '_main/%{EXECUTABLE_SHORT_PATH}'" >&2
+    exit 1
+fi
+
+# Check if the resolved binary exists
+if [[ ! -f "${EXECUTABLE_BINARY}" ]]; then
+    echo "Error: Resolved executable '${EXECUTABLE_BINARY}' does not exist" >&2
+    exit 1
+fi
+
+# Check if the resolved binary is executable
+if [[ ! -x "${EXECUTABLE_BINARY}" ]]; then
+    echo "Error: Resolved executable '${EXECUTABLE_BINARY}' is not executable" >&2
+    exit 1
+fi
+
 # Run the configuration
 if [[ -n "${EXECUTABLE_SUBCOMMAND:-}" ]]; then

View file

@@ -36,6 +36,17 @@ py_test(
     deps = [":job_src"],
 )

+py_test(
+    name = "test_graph_analysis",
+    srcs = ["graph/graph_test.py"],
+    main = "graph/graph_test.py",
+    data = [
+        ":bazel_graph.analyze",
+        ":bazel_graph_lookup",
+    ],
+    deps = [":job_src"],
+)
+
 # Bazel-defined
 ## Graph
 databuild_graph(
@@ -51,8 +62,8 @@ databuild_graph(
 py_binary(
     name = "bazel_graph_lookup",
-    srcs = ["lookup.py"],
-    main = "lookup.py",
+    srcs = ["graph/lookup.py"],
+    main = "graph/lookup.py",
 )

 ## Ingest Color Votes

View file

@@ -0,0 +1,91 @@
+#!/usr/bin/env python3
+"""
+Integration test for the databuild graph analysis.
+
+This test verifies that when we request color vote reports, the graph analyzer
+correctly identifies all upstream dependencies and jobs required.
+"""
+
+import subprocess
+import json
+import unittest
+import os
+from pathlib import Path
+
+
+class GraphAnalysisTest(unittest.TestCase):
+    def setUp(self):
+        # Determine the path to bazel_graph.analyze
+        # In bazel test, we need to find the executable in the runfiles
+        runfiles_dir = os.environ.get('RUNFILES_DIR')
+        test_srcdir = os.environ.get('TEST_SRCDIR')
+
+        possible_paths = []
+        if runfiles_dir:
+            possible_paths.append(os.path.join(runfiles_dir, '_main', 'databuild', 'test', 'app', 'bazel_graph.analyze'))
+            possible_paths.append(os.path.join(runfiles_dir, 'databuild', 'test', 'app', 'bazel_graph.analyze'))
+        if test_srcdir:
+            possible_paths.append(os.path.join(test_srcdir, '_main', 'databuild', 'test', 'app', 'bazel_graph.analyze'))
+            possible_paths.append(os.path.join(test_srcdir, 'databuild', 'test', 'app', 'bazel_graph.analyze'))
+
+        # Fallback for local testing
+        possible_paths.extend([
+            'bazel-bin/databuild/test/app/bazel_graph.analyze',
+            './bazel_graph.analyze'
+        ])
+
+        self.graph_analyze = None
+        for path in possible_paths:
+            if os.path.exists(path):
+                self.graph_analyze = path
+                break
+
+        # Ensure the executable exists
+        if not self.graph_analyze:
+            self.skipTest(f"Graph analyze executable not found in any of these paths: {possible_paths}")
+
+    def run_graph_analyze(self, partition_refs):
+        """Run graph.analyze with the given partition references."""
+        cmd = [self.graph_analyze] + partition_refs
+        result = subprocess.run(cmd, capture_output=True, text=True, cwd=os.getcwd())
+
+        if result.returncode != 0:
+            self.fail(f"Graph analyze failed with return code {result.returncode}.\nStdout: {result.stdout}\nStderr: {result.stderr}")
+
+        # Parse the JSON output
+        try:
+            return json.loads(result.stdout)
+        except json.JSONDecodeError as e:
+            self.fail(f"Failed to parse JSON output: {e}\nOutput: {result.stdout}")
+
+    def test_single_color_report_dependencies(self):
+        """Test dependencies for a single color vote report."""
+        partition_refs = ["color_vote_report/2024-01-15/red"]
+        result = self.run_graph_analyze(partition_refs)
+        self.assertIn('nodes', result)
+        # TODO expand
+
+    def test_multiple_color_reports_same_date(self):
+        """Test dependencies when requesting multiple colors for the same date."""
+        partition_refs = [
+            "color_vote_report/2024-01-15/red",
+            "color_vote_report/2024-01-15/blue"
+        ]
+        result = self.run_graph_analyze(partition_refs)
+        self.assertIn('nodes', result)
+        # TODO expand
+
+    def test_multiple_dates_dependencies(self):
+        """Test dependencies when requesting reports for different dates."""
+        partition_refs = [
+            "color_vote_report/2024-01-15/red",
+            "color_vote_report/2024-01-16/red"
+        ]
+        result = self.run_graph_analyze(partition_refs)
+        self.assertIn('nodes', result)
+        # TODO expand
+
+
+if __name__ == '__main__':
+    unittest.main()

View file

@@ -1,5 +1,8 @@
 #!/usr/bin/env python3
 from collections import defaultdict
 import sys
+import json
+
 LABEL_BASE = "//databuild/test/app"
@@ -21,3 +24,6 @@ if __name__ == "__main__":
     results = defaultdict(list)
     for raw_ref in sys.argv[1:]:
         results[lookup(raw_ref)].append(raw_ref)
+
+    # Output the results as JSON
+    print(json.dumps(dict(results)))
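With this change the lookup script emits a JSON object mapping each producing job to the partition refs it was asked about. An illustrative sketch of that stdout contract; the job label and ref below are hypothetical examples, not taken from this diff:

    import json
    from collections import defaultdict

    # Hypothetical mapping mirroring lookup.py's output shape: job label -> partition refs
    results = defaultdict(list)
    results["//databuild/test/app:ingest_color_votes"].append("daily_color_votes/2024-01-15/red")
    print(json.dumps(dict(results)))
    # {"//databuild/test/app:ingest_color_votes": ["daily_color_votes/2024-01-15/red"]}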

View file

@@ -1,4 +1,4 @@
-from databuild.proto import PartitionRef, JobConfigureResponse, JobConfig
+from databuild.proto import PartitionRef, JobConfigureResponse, JobConfig, DepType, DataDep
 from databuild.test.app.colors import COLORS
 from datetime import date
@@ -29,7 +29,7 @@ def configure(outputs: list[PartitionRef]) -> JobConfigureResponse:
         configs.append(JobConfig(
             outputs=[output],
-            inputs=inputs,
+            inputs=[DataDep(dep_type_code=DepType.MATERIALIZE, dep_type_name="materialize", partition_ref=ref) for ref in inputs],
             args=[],
             env={
                 "DATA_DATE": data_date,

View file

@@ -3,17 +3,18 @@
 import sys
 import os
 import json
-from databuild.proto import PartitionRef
+from databuild.proto import PartitionRef, to_dict
 from databuild.test.app.jobs.aggregate_color_votes.config import configure
 from databuild.test.app.jobs.aggregate_color_votes.execute import execute

 if __name__ == "__main__":
     if sys.argv[1] == "config":
         response = configure([
             PartitionRef(str=raw_ref)
             for raw_ref in sys.argv[2:]
         ])
-        print(json.dumps(response.to_dict()))
+        print(json.dumps(to_dict(response)))
     elif sys.argv[1] == "exec":
         execute(os.environ["DATA_DATE"], os.environ["AGGREGATE_TYPE"])
     else:
View file

@@ -18,7 +18,7 @@ class TestAggregateColorVotesConfig(unittest.TestCase):
         # Check that inputs are from daily_color_votes
         for i, color in enumerate(COLORS):
             expected_input = f"daily_color_votes/2024-01-15/{color}"
-            self.assertEqual(config.inputs[i].str, expected_input)
+            self.assertEqual(config.inputs[i].partition_ref.str, expected_input)

     def test_configure_weekly_votes(self):
         outputs = [PartitionRef(str="votes_1w/2024-01-21")]
@@ -31,7 +31,7 @@ class TestAggregateColorVotesConfig(unittest.TestCase):
         # Check that inputs are from color_votes_1w
         for i, color in enumerate(COLORS):
             expected_input = f"color_votes_1w/2024-01-21/{color}"
-            self.assertEqual(config.inputs[i].str, expected_input)
+            self.assertEqual(config.inputs[i].partition_ref.str, expected_input)

     def test_configure_monthly_votes(self):
         outputs = [PartitionRef(str="votes_1m/2024-01-31")]
@@ -44,7 +44,7 @@ class TestAggregateColorVotesConfig(unittest.TestCase):
         # Check that inputs are from color_votes_1m
         for i, color in enumerate(COLORS):
             expected_input = f"color_votes_1m/2024-01-31/{color}"
-            self.assertEqual(config.inputs[i].str, expected_input)
+            self.assertEqual(config.inputs[i].partition_ref.str, expected_input)

     def test_configure_multiple_outputs(self):
         outputs = [

View file

@@ -1,4 +1,4 @@
-from databuild.proto import PartitionRef, JobConfigureResponse, JobConfig
+from databuild.proto import PartitionRef, JobConfigureResponse, JobConfig, DataDep, DepType
 from datetime import date
 from collections import defaultdict
@@ -40,7 +40,7 @@ def configure(outputs: list[PartitionRef]) -> JobConfigureResponse:
     # Single job config for all outputs - pass output partition refs as args
     config = JobConfig(
         outputs=outputs,
-        inputs=inputs,
+        inputs=[DataDep(dep_type_code=DepType.MATERIALIZE, dep_type_name="materialize", partition_ref=ref) for ref in inputs],
         args=[output.str for output in outputs],
         env={}
     )

View file

@@ -3,9 +3,10 @@
 import sys
 import os
 import json
-from databuild.proto import PartitionRef
+from databuild.proto import PartitionRef, to_dict
 from databuild.test.app.jobs.color_vote_report_calc.config import configure
 from databuild.test.app.jobs.color_vote_report_calc.execute import execute
+from betterproto2 import Casing, OutputFormat

 if __name__ == "__main__":
     if sys.argv[1] == "config":
@@ -13,7 +14,7 @@ if __name__ == "__main__":
             PartitionRef(str=raw_ref)
             for raw_ref in sys.argv[2:]
         ])
-        print(json.dumps(response.to_dict()))
+        print(json.dumps(to_dict(response)))
     elif sys.argv[1] == "exec":
         execute(sys.argv[2:])
     else:

View file

@@ -21,7 +21,7 @@ class TestColorVoteReportCalcConfig(unittest.TestCase):
             "color_votes_1w/2024-01-15/red",
             "color_votes_1m/2024-01-15/red"
         ]
-        actual_inputs = [inp.str for inp in config.inputs]
+        actual_inputs = [inp.partition_ref.str for inp in config.inputs]
         for expected in expected_inputs:
             self.assertIn(expected, actual_inputs)
@@ -52,7 +52,7 @@ class TestColorVoteReportCalcConfig(unittest.TestCase):
         self.assertEqual(len(config.outputs), 2)

         # Should have total vote inputs for both dates
-        actual_inputs = [inp.str for inp in config.inputs]
+        actual_inputs = [inp.partition_ref.str for inp in config.inputs]
         self.assertIn("daily_votes/2024-01-15", actual_inputs)
         self.assertIn("daily_votes/2024-01-16", actual_inputs)

View file

@@ -3,9 +3,10 @@
 import sys
 import os
 import json
-from databuild.proto import PartitionRef
+from databuild.proto import PartitionRef, to_dict
 from databuild.test.app.jobs.ingest_color_votes.config import configure
 from databuild.test.app.jobs.ingest_color_votes.execute import execute
+from betterproto2 import Casing

 if __name__ == "__main__":
     if sys.argv[1] == "config":
@@ -13,7 +14,7 @@ if __name__ == "__main__":
             PartitionRef(str=raw_ref)
             for raw_ref in sys.argv[2:]
         ])
-        print(json.dumps(response.to_dict()))
+        print(json.dumps(to_dict(response)))
     elif sys.argv[1] == "exec":
         execute(os.environ["DATA_DATE"], os.environ["COLOR"])
     else:

View file

@@ -1,4 +1,4 @@
-from databuild.proto import PartitionRef, JobConfigureResponse, JobConfig
+from databuild.proto import PartitionRef, JobConfigureResponse, JobConfig, DepType, DataDep
 from datetime import date, timedelta
 from collections import defaultdict
@@ -41,7 +41,7 @@ def configure(outputs: list[PartitionRef]) -> JobConfigureResponse:
         configs.append(JobConfig(
             outputs=output_partitions,
-            inputs=inputs,
+            inputs=[DataDep(dep_type_code=DepType.MATERIALIZE, dep_type_name="materialize", partition_ref=ref) for ref in inputs],
             args=[],
             env=env
         ))

View file

@@ -3,9 +3,10 @@
 import sys
 import os
 import json
-from databuild.proto import PartitionRef
+from databuild.proto import PartitionRef, to_dict
 from databuild.test.app.jobs.trailing_color_votes.config import configure
 from databuild.test.app.jobs.trailing_color_votes.execute import execute
+from betterproto2 import Casing, OutputFormat

 if __name__ == "__main__":
     if sys.argv[1] == "config":
@@ -13,7 +14,7 @@ if __name__ == "__main__":
             PartitionRef(str=raw_ref)
             for raw_ref in sys.argv[2:]
         ])
-        print(json.dumps(response.to_dict()))
+        print(json.dumps(to_dict(response)))
     elif sys.argv[1] == "exec":
         execute(os.environ["DATA_DATE"], os.environ["COLOR"])
     else:

View file

@@ -25,29 +25,6 @@ pip.parse(
 )
 use_repo(pip, "pypi")

-# Rules OCI - necessary for producing a docker container
-bazel_dep(name = "rules_oci", version = "2.2.6")
-# For testing, we also recommend https://registry.bazel.build/modules/container_structure_test
-oci = use_extension("@rules_oci//oci:extensions.bzl", "oci")
-
-# Declare external images you need to pull, for example:
-oci.pull(
-    name = "debian",
-    image = "docker.io/library/python",
-    platforms = [
-        "linux/arm64/v8",
-        "linux/amd64",
-    ],
-    # 'latest' is not reproducible, but it's convenient.
-    # During the build we print a WARNING message that includes recommended 'digest' and 'platforms'
-    # values which you can use here in place of 'tag' to pin for reproducibility.
-    tag = "3.12-bookworm",
-)
-
-# For each oci.pull call, repeat the "name" here to expose them as dependencies.
-use_repo(oci, "debian", "debian_linux_amd64", "debian_linux_arm64_v8")
-
 # Platforms for specifying linux/arm
 bazel_dep(name = "platforms", version = "0.0.11")

File diff suppressed because one or more lines are too long