Add test app e2e test coverage for generated graph

2025-08-16 15:53:26 -07:00 · 2025-08-16 15:53:26 -07:00 · b3298e7213
commit b3298e7213
parent f92cfeb9b5
7 changed files with 177 additions and 84 deletions
--- a/databuild/test/app/BUILD.bazel
+++ b/databuild/test/app/BUILD.bazel
@ -1,9 +1,15 @@
 py_library(
    name = "job_src",
-    srcs = glob(["**/*.py"]),
+    srcs = glob(["**/*.py"], exclude=["e2e_test_common.py"]),
    visibility = ["//visibility:public"],
    deps = [
        "//databuild:py_proto",
        "//databuild/dsl/python:dsl",
    ],
 )
 py_library(
    name = "e2e_test_common",
    srcs = ["e2e_test_common.py"],
    visibility = ["//visibility:public"],
 )
--- a/databuild/test/app/bazel/BUILD.bazel
+++ b/databuild/test/app/bazel/BUILD.bazel
@ -70,7 +70,7 @@ py_test(
    srcs = ["test_e2e.py"],
    data = [":bazel_graph.build"],
    main = "test_e2e.py",
-    deps = [":job_src"],
+    deps = ["//databuild/test/app:e2e_test_common"],
 )
 # Bazel-defined
--- a/databuild/test/app/bazel/test_e2e.py
+++ b/databuild/test/app/bazel/test_e2e.py
@ -5,102 +5,33 @@ End-to-end test for the bazel-defined test app.
 Tests the full pipeline: build execution -> output verification -> JSON validation.
 """
 import json
 import os
-import shutil
+from databuild.test.app.e2e_test_common import DataBuildE2ETestBase
 import subprocess
 import time
 import unittest
 from pathlib import Path
-class BazelE2ETest(unittest.TestCase):
+class BazelE2ETest(DataBuildE2ETestBase):
-    """End-to-end test for the bazel test app."""
+    """End-to-end test for the bazel-defined test app."""
    def setUp(self):
        """Set up test environment."""
        self.output_dir = Path("/tmp/data/color_votes_1w/2025-09-01/red")
        self.output_file = self.output_dir / "data.json"
        self.partition_ref = "color_votes_1w/2025-09-01/red"
        # Clean up any existing test data
        if self.output_dir.exists():
            shutil.rmtree(self.output_dir)
    def tearDown(self):
        """Clean up test environment."""
        if self.output_dir.exists():
            shutil.rmtree(self.output_dir)
    def test_end_to_end_execution(self):
        """Test full end-to-end execution of the bazel graph."""
-        # Find the graph.build binary (following pattern from graph_test.py)
+        # Build possible paths for the bazel graph build binary
-        runfiles_dir = os.environ.get("RUNFILES_DIR")
+        possible_paths = self.get_standard_runfiles_paths(
-        test_srcdir = os.environ.get("TEST_SRCDIR")
+            'databuild/test/app/bazel/bazel_graph.build'
        )
-        possible_paths = []
+        # Add fallback paths for local testing
        if runfiles_dir:
            possible_paths.append(os.path.join(runfiles_dir, '_main', 'databuild', 'test', 'app', 'bazel', 'bazel_graph.build'))
            possible_paths.append(os.path.join(runfiles_dir, 'databuild', 'test', 'app', 'bazel', 'bazel_graph.build'))
        if test_srcdir:
            possible_paths.append(os.path.join(test_srcdir, '_main', 'databuild', 'test', 'app', 'bazel', 'bazel_graph.build'))
            possible_paths.append(os.path.join(test_srcdir, 'databuild', 'test', 'app', 'bazel', 'bazel_graph.build'))
        # Fallback for local testing
        possible_paths.extend([
            'bazel-bin/databuild/test/app/bazel/bazel_graph.build',
            './bazel_graph.build'
        ])
-        graph_build_path = None
+        # Find the graph build binary
-        for path in possible_paths:
+        graph_build_path = self.find_graph_build_binary(possible_paths)
            if os.path.exists(path):
                graph_build_path = path
                break
-        self.assertIsNotNone(graph_build_path, 
+        # Execute and verify the graph build
-                           f"Graph build binary not found in any of: {possible_paths}")
+        self.execute_and_verify_graph_build(graph_build_path)
        # Record start time for file modification check
        start_time = time.time()
        # Execute the graph build (shell script)
        result = subprocess.run(
            ["bash", graph_build_path, self.partition_ref],
            capture_output=True,
            text=True
        )
        # Verify execution succeeded
        self.assertEqual(result.returncode, 0, 
                        f"Graph build failed with stderr: {result.stderr}")
        # Verify output file was created
        self.assertTrue(self.output_file.exists(), 
                       f"Output file {self.output_file} was not created")
        # Verify file was created recently (within 60 seconds)
        file_mtime = os.path.getmtime(self.output_file)
        time_diff = file_mtime - start_time
        self.assertGreaterEqual(time_diff, -1,  # Allow 1 second clock skew
                               f"File appears to be too old: {time_diff} seconds")
        self.assertLessEqual(time_diff, 60,
                            f"File creation took too long: {time_diff} seconds")
        # Verify file contains valid JSON
        with open(self.output_file, 'r') as f:
            content = f.read()
        try:
            data = json.loads(content)
        except json.JSONDecodeError as e:
            self.fail(f"Output file does not contain valid JSON: {e}")
        # Basic sanity check on JSON structure
        self.assertIsInstance(data, (dict, list), 
                             "JSON should be an object or array")
 if __name__ == '__main__':
    import unittest
    unittest.main()
--- a/databuild/test/app/dsl/claude-generated-dsl-test.md
+++ b/databuild/test/app/dsl/claude-generated-dsl-test.md
@ -0,0 +1,9 @@
 We can't write a direct `bazel test` for the DSL-generated graph, because:
 1. Bazel doesn't allow you to `bazel run graph.generate` to generate a BUILD.bazel that will be used in the same build.
 2. We don't want to leak test generation into the graph generation code (since the tests here are app-specific).
 Instead, we need to use a two-phase process, where we rely on the graph already being generated here, and on it containing a test, so that `bazel test //...` covers the generated source as well. This implies that the generated source is going to be checked in to git (gasp, I know), and that we need a mechanism to ensure it stays up to date. To achieve this, we'll create a test that asserts that the contents of the `generated` dir are exactly the same as the output of a fresh run of `graph.generate`.
 Our task is to implement the test that asserts this equality. For example, the test target could depend on `graph.generate`, run it during the test, and compare the md5 of the results to the md5 of the existing `generated` dir.
--- a/databuild/test/app/dsl/generated_test/BUILD.bazel
+++ b/databuild/test/app/dsl/generated_test/BUILD.bazel
@ -0,0 +1,7 @@
 # End-to-end test target for the DSL-generated graph: runs test_e2e.py with the
 # generated dsl_graph.build target available as a runtime data dependency and
 # the shared e2e_test_common library as its only Python dependency.
 py_test(
    name = "test_e2e",
    srcs = ["test_e2e.py"],
    data = ["//databuild/test/app/dsl/generated:dsl_graph.build"],
    main = "test_e2e.py",
    deps = ["//databuild/test/app:e2e_test_common"],
 )
--- a/databuild/test/app/dsl/generated_test/test_e2e.py
+++ b/databuild/test/app/dsl/generated_test/test_e2e.py
@ -0,0 +1,37 @@
 #!/usr/bin/env python3
 """
 End-to-end test for the DSL-generated test app.
 Tests the full pipeline: build execution -> output verification -> JSON validation.
 """
 import os
 from databuild.test.app.e2e_test_common import DataBuildE2ETestBase
 class DSLGeneratedE2ETest(DataBuildE2ETestBase):
    """Runs the shared end-to-end checks against the DSL-generated graph."""
    def test_end_to_end_execution(self):
        """Locate the DSL-generated graph build script, run it and verify its output."""
        # Location of the build script relative to a runfiles root
        runfiles_target = 'databuild/test/app/dsl/generated/dsl_graph.build'
        # Tried last, for runs started outside of a runfiles tree
        local_fallbacks = [
            'bazel-bin/databuild/test/app/dsl/generated/dsl_graph.build',
            './dsl_graph.build',
        ]
        # Runfiles candidates keep priority over the local fallbacks
        candidates = self.get_standard_runfiles_paths(runfiles_target) + local_fallbacks
        build_script = self.find_graph_build_binary(candidates)
        # Shared helper performs the execution and all output assertions
        self.execute_and_verify_graph_build(build_script)
 # Run the test cases when this file is executed as a script (it is declared as
 # the `main` of its py_test target).
 if __name__ == '__main__':
    # Imported here because this module has no top-level unittest import
    import unittest
    unittest.main()
--- a/databuild/test/app/e2e_test_common.py
+++ b/databuild/test/app/e2e_test_common.py
@ -0,0 +1,103 @@
 #!/usr/bin/env python3
 """
 Common end-to-end test logic for DataBuild test apps.
 Provides shared functionality for testing both bazel-defined and DSL-generated graphs.
 """
 import json
 import os
 import shutil
 import subprocess
 import time
 import unittest
 from pathlib import Path
 from typing import List, Optional
 class DataBuildE2ETestBase(unittest.TestCase):
    """Base class for DataBuild end-to-end tests.
    Subclasses locate a graph build script, run it for ``PARTITION_REF`` and
    check that a fresh, valid JSON file exists afterwards at
    ``DATA_ROOT / PARTITION_REF / "data.json"``.
    """
    # Directory under which the partition's output directory is expected
    DATA_ROOT = Path("/tmp/data")
    # Partition reference handed to the graph build script; subclasses may override
    PARTITION_REF = "color_votes_1w/2025-09-01/red"
    def setUp(self):
        """Derive the expected output location and remove any stale output."""
        self.partition_ref = self.PARTITION_REF
        # The output directory mirrors the partition reference, so it is derived
        # from it instead of repeating the literal
        self.output_dir = self.DATA_ROOT / self.partition_ref
        self.output_file = self.output_dir / "data.json"
        # Clean up any existing test data so a leftover file cannot satisfy the checks
        if self.output_dir.exists():
            shutil.rmtree(self.output_dir)
    def tearDown(self):
        """Remove the output directory produced by the test, if any."""
        if self.output_dir.exists():
            shutil.rmtree(self.output_dir)
    def find_graph_build_binary(self, possible_paths: List[str]) -> str:
        """Return the first entry of *possible_paths* that exists on disk.
        Fails the test (raises ``AssertionError``) when no candidate exists.
        """
        graph_build_path = next(
            (path for path in possible_paths if os.path.exists(path)),
            None,
        )
        self.assertIsNotNone(graph_build_path,
                           f"Graph build binary not found in any of: {possible_paths}")
        return graph_build_path
    def execute_and_verify_graph_build(self, graph_build_path: str) -> None:
        """Run the graph build for ``self.partition_ref`` and verify its output.
        Checks, in order: the script exits with status 0, ``self.output_file``
        exists, its modification time lies between 1 second before and 60
        seconds after the start of the run, and its content parses as a JSON
        object or array.
        """
        # Record start time for file modification check
        start_time = time.time()
        # Execute the graph build (shell script)
        result = subprocess.run(
            ["bash", graph_build_path, self.partition_ref],
            capture_output=True,
            text=True
        )
        # Verify execution succeeded
        self.assertEqual(result.returncode, 0,
                        f"Graph build failed with stderr: {result.stderr}")
        # Verify output file was created
        self.assertTrue(self.output_file.exists(),
                       f"Output file {self.output_file} was not created")
        # Verify file was created recently (within 60 seconds)
        file_mtime = os.path.getmtime(self.output_file)
        time_diff = file_mtime - start_time
        self.assertGreaterEqual(time_diff, -1,  # Allow 1 second clock skew
                               f"File appears to be too old: {time_diff} seconds")
        self.assertLessEqual(time_diff, 60,
                            f"File creation took too long: {time_diff} seconds")
        # Verify file contains valid JSON. The file is decoded as UTF-8 explicitly
        # so the result does not depend on the platform's default text encoding;
        # undecodable bytes are reported as invalid JSON as well.
        try:
            with open(self.output_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (UnicodeDecodeError, json.JSONDecodeError) as e:
            self.fail(f"Output file does not contain valid JSON: {e}")
        # Basic sanity check on JSON structure
        self.assertIsInstance(data, (dict, list),
                             "JSON should be an object or array")
    def get_standard_runfiles_paths(self, relative_path: str) -> List[str]:
        """Return candidate locations of *relative_path* under the runfiles roots.
        For each of the ``RUNFILES_DIR`` and ``TEST_SRCDIR`` environment
        variables that is set and non-empty, two candidates are produced, in
        this order: ``<root>/_main/<relative_path>`` and
        ``<root>/<relative_path>``. The result is empty when neither is set.
        """
        possible_paths = []
        # RUNFILES_DIR candidates come before TEST_SRCDIR candidates
        for env_var in ("RUNFILES_DIR", "TEST_SRCDIR"):
            root = os.environ.get(env_var)
            if root:
                possible_paths.append(os.path.join(root, '_main', relative_path))
                possible_paths.append(os.path.join(root, relative_path))
        return possible_paths