From b3298e7213cd14ae3726a838dd00200d2df89fad Mon Sep 17 00:00:00 2001
From: Stuart Axelbrooke
Date: Sat, 16 Aug 2025 15:53:26 -0700
Subject: [PATCH] Add test app e2e test coverage for generated graph

---
 databuild/test/app/BUILD.bazel                |   8 +-
 databuild/test/app/bazel/BUILD.bazel          |   2 +-
 databuild/test/app/bazel/test_e2e.py          |  95 +++-------------
 .../test/app/dsl/claude-generated-dsl-test.md |   9 ++
 .../test/app/dsl/generated_test/BUILD.bazel   |   7 ++
 .../test/app/dsl/generated_test/test_e2e.py   |  37 +++++++
 databuild/test/app/e2e_test_common.py         | 103 ++++++++++++++++++
 7 files changed, 177 insertions(+), 84 deletions(-)
 create mode 100644 databuild/test/app/dsl/claude-generated-dsl-test.md
 create mode 100644 databuild/test/app/dsl/generated_test/BUILD.bazel
 create mode 100644 databuild/test/app/dsl/generated_test/test_e2e.py
 create mode 100644 databuild/test/app/e2e_test_common.py

diff --git a/databuild/test/app/BUILD.bazel b/databuild/test/app/BUILD.bazel
index b3606c7..705dd61 100644
--- a/databuild/test/app/BUILD.bazel
+++ b/databuild/test/app/BUILD.bazel
@@ -1,9 +1,15 @@
 py_library(
     name = "job_src",
-    srcs = glob(["**/*.py"]),
+    srcs = glob(["**/*.py"], exclude=["e2e_test_common.py"]),
     visibility = ["//visibility:public"],
     deps = [
         "//databuild:py_proto",
         "//databuild/dsl/python:dsl",
     ],
 )
+
+py_library(
+    name = "e2e_test_common",
+    srcs = ["e2e_test_common.py"],
+    visibility = ["//visibility:public"],
+)
diff --git a/databuild/test/app/bazel/BUILD.bazel b/databuild/test/app/bazel/BUILD.bazel
index ca87f55..08fa7b0 100644
--- a/databuild/test/app/bazel/BUILD.bazel
+++ b/databuild/test/app/bazel/BUILD.bazel
@@ -70,7 +70,7 @@ py_test(
     srcs = ["test_e2e.py"],
     data = [":bazel_graph.build"],
     main = "test_e2e.py",
-    deps = [":job_src"],
+    deps = ["//databuild/test/app:e2e_test_common"],
 )
 
 # Bazel-defined
diff --git a/databuild/test/app/bazel/test_e2e.py b/databuild/test/app/bazel/test_e2e.py
index b9e2688..189d2b1 100644
--- a/databuild/test/app/bazel/test_e2e.py
+++ b/databuild/test/app/bazel/test_e2e.py
@@ -5,102 +5,33 @@ End-to-end test for the bazel-defined test app.
 Tests the full pipeline: build execution -> output verification -> JSON validation.
 """
 
-import json
 import os
-import shutil
-import subprocess
-import time
-import unittest
-from pathlib import Path
+from databuild.test.app.e2e_test_common import DataBuildE2ETestBase
 
 
-class BazelE2ETest(unittest.TestCase):
-    """End-to-end test for the bazel test app."""
-
-    def setUp(self):
-        """Set up test environment."""
-        self.output_dir = Path("/tmp/data/color_votes_1w/2025-09-01/red")
-        self.output_file = self.output_dir / "data.json"
-        self.partition_ref = "color_votes_1w/2025-09-01/red"
-
-        # Clean up any existing test data
-        if self.output_dir.exists():
-            shutil.rmtree(self.output_dir)
-
-    def tearDown(self):
-        """Clean up test environment."""
-        if self.output_dir.exists():
-            shutil.rmtree(self.output_dir)
+class BazelE2ETest(DataBuildE2ETestBase):
+    """End-to-end test for the bazel-defined test app."""
 
     def test_end_to_end_execution(self):
         """Test full end-to-end execution of the bazel graph."""
-        # Find the graph.build binary (following pattern from graph_test.py)
-        runfiles_dir = os.environ.get("RUNFILES_DIR")
-        test_srcdir = os.environ.get("TEST_SRCDIR")
+        # Candidate locations of bazel_graph.build under RUNFILES_DIR / TEST_SRCDIR (with and without the _main prefix)
+        possible_paths = self.get_standard_runfiles_paths(
+            'databuild/test/app/bazel/bazel_graph.build'
+        )
 
-        possible_paths = []
-        if runfiles_dir:
-            possible_paths.append(os.path.join(runfiles_dir, '_main', 'databuild', 'test', 'app', 'bazel', 'bazel_graph.build'))
-            possible_paths.append(os.path.join(runfiles_dir, 'databuild', 'test', 'app', 'bazel', 'bazel_graph.build'))
-
-        if test_srcdir:
-            possible_paths.append(os.path.join(test_srcdir, '_main', 'databuild', 'test', 'app', 'bazel', 'bazel_graph.build'))
-            possible_paths.append(os.path.join(test_srcdir, 'databuild', 'test', 'app', 'bazel', 'bazel_graph.build'))
-
-        # Fallback for local testing
+        # Add relative fallback paths (resolved against the current directory) for local testing
         possible_paths.extend([
             'bazel-bin/databuild/test/app/bazel/bazel_graph.build',
             './bazel_graph.build'
         ])
 
-        graph_build_path = None
-        for path in possible_paths:
-            if os.path.exists(path):
-                graph_build_path = path
-                break
+        # Pick the first candidate that exists; the helper fails the test if none does
+        graph_build_path = self.find_graph_build_binary(possible_paths)
 
-        self.assertIsNotNone(graph_build_path,
-                             f"Graph build binary not found in any of: {possible_paths}")
-
-        # Record start time for file modification check
-        start_time = time.time()
-
-        # Execute the graph build (shell script)
-        result = subprocess.run(
-            ["bash", graph_build_path, self.partition_ref],
-            capture_output=True,
-            text=True
-        )
-
-        # Verify execution succeeded
-        self.assertEqual(result.returncode, 0,
-                         f"Graph build failed with stderr: {result.stderr}")
-
-        # Verify output file was created
-        self.assertTrue(self.output_file.exists(),
-                        f"Output file {self.output_file} was not created")
-
-        # Verify file was created recently (within 60 seconds)
-        file_mtime = os.path.getmtime(self.output_file)
-        time_diff = file_mtime - start_time
-        self.assertGreaterEqual(time_diff, -1,  # Allow 1 second clock skew
-                                f"File appears to be too old: {time_diff} seconds")
-        self.assertLessEqual(time_diff, 60,
-                             f"File creation took too long: {time_diff} seconds")
-
-        # Verify file contains valid JSON
-        with open(self.output_file, 'r') as f:
-            content = f.read()
-
-        try:
-            data = json.loads(content)
-        except json.JSONDecodeError as e:
-            self.fail(f"Output file does not contain valid JSON: {e}")
-
-        # Basic sanity check on JSON structure
-        self.assertIsInstance(data, (dict, list),
-                              "JSON should be an object or array")
+        # Run the build for the fixture partition and verify it exits 0 and writes fresh, valid JSON
+        self.execute_and_verify_graph_build(graph_build_path)
 
 
 if __name__ == '__main__':
+    import unittest
     unittest.main()
\ No newline at end of file
diff --git a/databuild/test/app/dsl/claude-generated-dsl-test.md b/databuild/test/app/dsl/claude-generated-dsl-test.md
new file
mode 100644
index 0000000..8683038
--- /dev/null
+++ b/databuild/test/app/dsl/claude-generated-dsl-test.md
@@ -0,0 +1,9 @@
+
+We can't write a direct `bazel test` for the DSL-generated graph, because:
+
+1. Bazel doesn't allow you to `bazel run graph.generate` to generate a BUILD.bazel that will be used in the same build.
+2. We don't want to leak test generation into the graph generation code (since tests here are app-specific).
+
+Instead, we need to use a two-phase process, where we rely on the graph already being generated here and containing a test, so that `bazel test //...` covers the generated source as well. This implies that the generated source is going to be checked into git (gasp, I know), and we need a mechanism to ensure it stays up to date. To achieve this, we'll create a test that asserts that the contents of the `generated` dir are exactly the same as the output of a new run of `graph.generate`.
+
+Our task is to implement this test that asserts equality between the two, e.g. the target could depend on `graph.generate`, and the test could run it and md5 the results, comparing them to the md5 of the existing generated dir.
diff --git a/databuild/test/app/dsl/generated_test/BUILD.bazel b/databuild/test/app/dsl/generated_test/BUILD.bazel
new file mode 100644
index 0000000..d03fb63
--- /dev/null
+++ b/databuild/test/app/dsl/generated_test/BUILD.bazel
@@ -0,0 +1,7 @@
+py_test(
+    name = "test_e2e",
+    srcs = ["test_e2e.py"],
+    data = ["//databuild/test/app/dsl/generated:dsl_graph.build"],
+    main = "test_e2e.py",
+    deps = ["//databuild/test/app:e2e_test_common"],
+)
\ No newline at end of file
diff --git a/databuild/test/app/dsl/generated_test/test_e2e.py b/databuild/test/app/dsl/generated_test/test_e2e.py
new file mode 100644
index 0000000..ebe9f80
--- /dev/null
+++ b/databuild/test/app/dsl/generated_test/test_e2e.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python3
+"""
+End-to-end test for the DSL-generated test app.
+
+Tests the full pipeline: build execution -> output verification -> JSON validation.
+"""
+
+import os
+from databuild.test.app.e2e_test_common import DataBuildE2ETestBase
+
+
+class DSLGeneratedE2ETest(DataBuildE2ETestBase):
+    """End-to-end test for the DSL-generated test app graph.
+
+    Fixture paths and the build/verify helpers come from DataBuildE2ETestBase.
+    """
+
+    def test_end_to_end_execution(self):
+        """Build the fixture partition with dsl_graph.build and verify the result."""
+        graph_target = 'databuild/test/app/dsl/generated/dsl_graph.build'
+
+        # Runfiles locations are tried first; the relative entries only matter
+        # when the test is run by hand for local testing.
+        candidates = self.get_standard_runfiles_paths(graph_target)
+        candidates += [
+            'bazel-bin/databuild/test/app/dsl/generated/dsl_graph.build',
+            './dsl_graph.build',
+        ]
+
+        # find_graph_build_binary fails the test when no candidate exists.
+        binary = self.find_graph_build_binary(candidates)
+        self.execute_and_verify_graph_build(binary)
+
+
+if __name__ == '__main__':
+    import unittest
+    unittest.main()
\ No newline at end of file
diff --git a/databuild/test/app/e2e_test_common.py b/databuild/test/app/e2e_test_common.py
new file mode 100644
index 0000000..00f3ad7
--- /dev/null
+++ b/databuild/test/app/e2e_test_common.py
@@ -0,0 +1,103 @@
+#!/usr/bin/env python3
+"""
+Common end-to-end test logic for DataBuild test apps.
+
+Provides shared functionality for testing both bazel-defined and DSL-generated graphs.
+"""
+
+import json
+import os
+import shutil
+import subprocess
+import time
+import unittest
+from pathlib import Path
+from typing import List, Optional
+
+
+class DataBuildE2ETestBase(unittest.TestCase):
+    """Shared fixture and helpers for DataBuild end-to-end graph tests.
+
+    Subclasses locate a graph build script and hand it to
+    ``execute_and_verify_graph_build``. Override ``PARTITION_REF`` or
+    ``DATA_ROOT`` to target a different partition or output location.
+    """
+
+    PARTITION_REF = "color_votes_1w/2025-09-01/red"
+    DATA_ROOT = Path("/tmp/data")
+
+    def setUp(self):
+        """Derive the output paths and start from a clean directory."""
+        self.partition_ref = self.PARTITION_REF
+        self.output_dir = self.DATA_ROOT / self.partition_ref
+        self.output_file = self.output_dir / "data.json"
+        self._remove_output_dir()
+
+    def tearDown(self):
+        """Remove the build output so later tests start clean."""
+        self._remove_output_dir()
+
+    def _remove_output_dir(self) -> None:
+        """Delete the partition's output directory if it exists."""
+        if self.output_dir.exists():
+            shutil.rmtree(self.output_dir)
+
+    def find_graph_build_binary(self, possible_paths: List[str]) -> str:
+        """Return the first existing path; fail the test if none exists."""
+        graph_build_path = next(filter(os.path.exists, possible_paths), None)
+        self.assertIsNotNone(graph_build_path,
+                             f"Graph build binary not found in any of: {possible_paths}")
+        return graph_build_path
+
+    def execute_and_verify_graph_build(self, graph_build_path: str) -> None:
+        """Run the build script for the fixture partition and check its output.
+
+        Asserts a zero exit status, that ``self.output_file`` exists with an
+        mtime between 1 second before and 60 seconds after the run started,
+        and that it holds a JSON object or array.
+        """
+        start_time = time.time()
+
+        # The graph build is a shell script, so it is run through bash.
+        result = subprocess.run(
+            ["bash", graph_build_path, self.partition_ref],
+            capture_output=True,
+            text=True
+        )
+        self.assertEqual(result.returncode, 0,
+                         f"Graph build failed with stderr: {result.stderr}")
+        self.assertTrue(self.output_file.exists(),
+                        f"Output file {self.output_file} was not created")
+
+        # setUp removes earlier output; this window additionally guards
+        # against a stale file. -1 allows for one second of clock skew.
+        time_diff = os.path.getmtime(self.output_file) - start_time
+        self.assertGreaterEqual(time_diff, -1,
+                                f"File appears to be too old: {time_diff} seconds")
+        self.assertLessEqual(time_diff, 60,
+                             f"File creation took too long: {time_diff} seconds")
+
+        # JSON text is UTF-8; do not depend on the locale's default encoding.
+        with open(self.output_file, 'r', encoding='utf-8') as f:
+            content = f.read()
+        try:
+            data = json.loads(content)
+        except json.JSONDecodeError as e:
+            self.fail(f"Output file does not contain valid JSON: {e}")
+        self.assertIsInstance(data, (dict, list),
+                              "JSON should be an object or array")
+
+    def get_standard_runfiles_paths(self, relative_path: str) -> List[str]:
+        """List candidate runfiles locations for ``relative_path``.
+
+        Each of ``RUNFILES_DIR`` and ``TEST_SRCDIR`` that is set contributes
+        ``<root>/_main/<path>`` and ``<root>/<path>``. Order is kept and
+        duplicates (both variables naming one directory) are dropped.
+        """
+        roots = (os.environ.get("RUNFILES_DIR"), os.environ.get("TEST_SRCDIR"))
+        candidates = [
+            os.path.join(root, *prefix, relative_path)
+            for root in roots if root
+            for prefix in (('_main',), ())
+        ]
+        return list(dict.fromkeys(candidates))