From b3298e7213cd14ae3726a838dd00200d2df89fad Mon Sep 17 00:00:00 2001
From: Stuart Axelbrooke <stuart@axelbrooke.com>
Date: Sat, 16 Aug 2025 15:53:26 -0700
Subject: [PATCH] Add test app e2e test coverage for generated graph

---
 databuild/test/app/BUILD.bazel                |   8 +-
 databuild/test/app/bazel/BUILD.bazel          |   2 +-
 databuild/test/app/bazel/test_e2e.py          |  95 +++-------------
 .../test/app/dsl/claude-generated-dsl-test.md |   9 ++
 .../test/app/dsl/generated_test/BUILD.bazel   |   7 ++
 .../test/app/dsl/generated_test/test_e2e.py   |  37 +++++++
 databuild/test/app/e2e_test_common.py         | 103 ++++++++++++++++++
 7 files changed, 177 insertions(+), 84 deletions(-)
 create mode 100644 databuild/test/app/dsl/claude-generated-dsl-test.md
 create mode 100644 databuild/test/app/dsl/generated_test/BUILD.bazel
 create mode 100644 databuild/test/app/dsl/generated_test/test_e2e.py
 create mode 100644 databuild/test/app/e2e_test_common.py

diff --git a/databuild/test/app/BUILD.bazel b/databuild/test/app/BUILD.bazel
index b3606c7..705dd61 100644
--- a/databuild/test/app/BUILD.bazel
+++ b/databuild/test/app/BUILD.bazel
@@ -1,9 +1,15 @@
 py_library(
     name = "job_src",
-    srcs = glob(["**/*.py"]),
+    srcs = glob(["**/*.py"], exclude=["e2e_test_common.py"]),
     visibility = ["//visibility:public"],
     deps = [
         "//databuild:py_proto",
         "//databuild/dsl/python:dsl",
     ],
 )
+
+py_library(
+    name = "e2e_test_common",
+    srcs = ["e2e_test_common.py"],
+    visibility = ["//visibility:public"],
+)
diff --git a/databuild/test/app/bazel/BUILD.bazel b/databuild/test/app/bazel/BUILD.bazel
index ca87f55..08fa7b0 100644
--- a/databuild/test/app/bazel/BUILD.bazel
+++ b/databuild/test/app/bazel/BUILD.bazel
@@ -70,7 +70,7 @@ py_test(
     srcs = ["test_e2e.py"],
     data = [":bazel_graph.build"],
     main = "test_e2e.py",
-    deps = [":job_src"],
+    deps = ["//databuild/test/app:e2e_test_common"],
 )
 
 # Bazel-defined
diff --git a/databuild/test/app/bazel/test_e2e.py b/databuild/test/app/bazel/test_e2e.py
index b9e2688..189d2b1 100644
--- a/databuild/test/app/bazel/test_e2e.py
+++ b/databuild/test/app/bazel/test_e2e.py
@@ -5,102 +5,33 @@ End-to-end test for the bazel-defined test app.
 Tests the full pipeline: build execution -> output verification -> JSON validation.
 """
 
-import json
 import os
-import shutil
-import subprocess
-import time
-import unittest
-from pathlib import Path
+from databuild.test.app.e2e_test_common import DataBuildE2ETestBase
 
 
-class BazelE2ETest(unittest.TestCase):
-    """End-to-end test for the bazel test app."""
-    
-    def setUp(self):
-        """Set up test environment."""
-        self.output_dir = Path("/tmp/data/color_votes_1w/2025-09-01/red")
-        self.output_file = self.output_dir / "data.json"
-        self.partition_ref = "color_votes_1w/2025-09-01/red"
-        
-        # Clean up any existing test data
-        if self.output_dir.exists():
-            shutil.rmtree(self.output_dir)
-    
-    def tearDown(self):
-        """Clean up test environment."""
-        if self.output_dir.exists():
-            shutil.rmtree(self.output_dir)
+class BazelE2ETest(DataBuildE2ETestBase):
+    """End-to-end test for the bazel-defined test app."""
     
     def test_end_to_end_execution(self):
         """Test full end-to-end execution of the bazel graph."""
-        # Find the graph.build binary (following pattern from graph_test.py)
-        runfiles_dir = os.environ.get("RUNFILES_DIR")
-        test_srcdir = os.environ.get("TEST_SRCDIR")
+        # Build possible paths for the bazel graph build binary
+        possible_paths = self.get_standard_runfiles_paths(
+            'databuild/test/app/bazel/bazel_graph.build'
+        )
         
-        possible_paths = []
-        if runfiles_dir:
-            possible_paths.append(os.path.join(runfiles_dir, '_main', 'databuild', 'test', 'app', 'bazel', 'bazel_graph.build'))
-            possible_paths.append(os.path.join(runfiles_dir, 'databuild', 'test', 'app', 'bazel', 'bazel_graph.build'))
-        
-        if test_srcdir:
-            possible_paths.append(os.path.join(test_srcdir, '_main', 'databuild', 'test', 'app', 'bazel', 'bazel_graph.build'))
-            possible_paths.append(os.path.join(test_srcdir, 'databuild', 'test', 'app', 'bazel', 'bazel_graph.build'))
-        
-        # Fallback for local testing
+        # Add fallback paths for local testing
         possible_paths.extend([
             'bazel-bin/databuild/test/app/bazel/bazel_graph.build',
             './bazel_graph.build'
         ])
         
-        graph_build_path = None
-        for path in possible_paths:
-            if os.path.exists(path):
-                graph_build_path = path
-                break
+        # Find the graph build binary
+        graph_build_path = self.find_graph_build_binary(possible_paths)
         
-        self.assertIsNotNone(graph_build_path, 
-                           f"Graph build binary not found in any of: {possible_paths}")
-        
-        # Record start time for file modification check
-        start_time = time.time()
-        
-        # Execute the graph build (shell script)
-        result = subprocess.run(
-            ["bash", graph_build_path, self.partition_ref],
-            capture_output=True,
-            text=True
-        )
-        
-        # Verify execution succeeded
-        self.assertEqual(result.returncode, 0, 
-                        f"Graph build failed with stderr: {result.stderr}")
-        
-        # Verify output file was created
-        self.assertTrue(self.output_file.exists(), 
-                       f"Output file {self.output_file} was not created")
-        
-        # Verify file was created recently (within 60 seconds)
-        file_mtime = os.path.getmtime(self.output_file)
-        time_diff = file_mtime - start_time
-        self.assertGreaterEqual(time_diff, -1,  # Allow 1 second clock skew
-                               f"File appears to be too old: {time_diff} seconds")
-        self.assertLessEqual(time_diff, 60,
-                            f"File creation took too long: {time_diff} seconds")
-        
-        # Verify file contains valid JSON
-        with open(self.output_file, 'r') as f:
-            content = f.read()
-        
-        try:
-            data = json.loads(content)
-        except json.JSONDecodeError as e:
-            self.fail(f"Output file does not contain valid JSON: {e}")
-        
-        # Basic sanity check on JSON structure
-        self.assertIsInstance(data, (dict, list), 
-                             "JSON should be an object or array")
+        # Execute and verify the graph build
+        self.execute_and_verify_graph_build(graph_build_path)
 
 
 if __name__ == '__main__':
+    import unittest
     unittest.main()
\ No newline at end of file
diff --git a/databuild/test/app/dsl/claude-generated-dsl-test.md b/databuild/test/app/dsl/claude-generated-dsl-test.md
new file mode 100644
index 0000000..8683038
--- /dev/null
+++ b/databuild/test/app/dsl/claude-generated-dsl-test.md
@@ -0,0 +1,9 @@
+
+We can't write a direct `bazel test` for the DSL generated graph, because:
+
+1. Bazel doesn't allow you to `bazel run graph.generate` to generate a BUILD.bazel that will be used in the same build.
+2. We don't want to leak test generation into the graph generation code (since tests here are app-specific).
+
+Instead, we need to use a two-phase process, where we rely on the graph to already be generated here, which will contain a test, such that `bazel test //...` will give us recall over generated source as well. This implies that this generated source is going to be checked into git (gasp, I know), and we need a mechanism to ensure it stays up to date. To achieve this, we'll create a test that asserts that the contents of the `generated` dir are exactly the same as the output of a new run of `graph.generate`.
+
+Our task is to implement the test that asserts equality between the two. For example, the test target could depend on `graph.generate`, run it in the test, and compare the md5 of the results to the md5 of the existing `generated` dir.
diff --git a/databuild/test/app/dsl/generated_test/BUILD.bazel b/databuild/test/app/dsl/generated_test/BUILD.bazel
new file mode 100644
index 0000000..d03fb63
--- /dev/null
+++ b/databuild/test/app/dsl/generated_test/BUILD.bazel
@@ -0,0 +1,7 @@
+py_test(
+    name = "test_e2e",
+    srcs = ["test_e2e.py"],
+    data = ["//databuild/test/app/dsl/generated:dsl_graph.build"],
+    main = "test_e2e.py",
+    deps = ["//databuild/test/app:e2e_test_common"],
+)
\ No newline at end of file
diff --git a/databuild/test/app/dsl/generated_test/test_e2e.py b/databuild/test/app/dsl/generated_test/test_e2e.py
new file mode 100644
index 0000000..ebe9f80
--- /dev/null
+++ b/databuild/test/app/dsl/generated_test/test_e2e.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python3
+"""
+End-to-end test for the DSL-generated test app.
+
+Tests the full pipeline: build execution -> output verification -> JSON validation.
+"""
+
+import os
+from databuild.test.app.e2e_test_common import DataBuildE2ETestBase
+
+
+class DSLGeneratedE2ETest(DataBuildE2ETestBase):
+    """End-to-end test for the DSL-generated test app's graph build (dsl_graph.build)."""
+    
+    def test_end_to_end_execution(self):
+        """Locate dsl_graph.build, run it, and verify the output it produces."""
+        # Candidate runfiles locations of the DSL-generated graph build binary
+        possible_paths = self.get_standard_runfiles_paths(
+            'databuild/test/app/dsl/generated/dsl_graph.build'
+        )
+        
+        # Add relative fallback paths for local testing (tried after the runfiles ones)
+        possible_paths.extend([
+            'bazel-bin/databuild/test/app/dsl/generated/dsl_graph.build',
+            './dsl_graph.build'
+        ])
+        
+        # Resolve the candidates to a single graph build binary path
+        graph_build_path = self.find_graph_build_binary(possible_paths)
+        
+        # Execute the graph build and verify its results
+        self.execute_and_verify_graph_build(graph_build_path)
+
+
+if __name__ == '__main__':
+    import unittest
+    unittest.main()
\ No newline at end of file
diff --git a/databuild/test/app/e2e_test_common.py b/databuild/test/app/e2e_test_common.py
new file mode 100644
index 0000000..00f3ad7
--- /dev/null
+++ b/databuild/test/app/e2e_test_common.py
@@ -0,0 +1,103 @@
+#!/usr/bin/env python3
+"""
+Common end-to-end test logic for DataBuild test apps.
+
+Provides shared functionality for testing both bazel-defined and DSL-generated graphs.
+"""
+
+import json
+import os
+import shutil
+import subprocess
+import time
+import unittest
+from pathlib import Path
+from typing import List, Optional
+
+
+class DataBuildE2ETestBase(unittest.TestCase):
+    """Shared fixture and helpers for DataBuild end-to-end graph build tests."""
+    
+    def setUp(self):
+        """Set the partition ref and expected output paths, then remove stale output."""
+        self.output_dir = Path("/tmp/data/color_votes_1w/2025-09-01/red")
+        self.output_file = self.output_dir / "data.json"
+        self.partition_ref = "color_votes_1w/2025-09-01/red"
+        
+        # Remove output left by an earlier run so each test starts without it
+        if self.output_dir.exists():
+            shutil.rmtree(self.output_dir)
+    
+    def tearDown(self):
+        """Remove the output directory if it exists."""
+        if self.output_dir.exists():
+            shutil.rmtree(self.output_dir)
+    
+    def find_graph_build_binary(self, possible_paths: List[str]) -> str:
+        """Return the first path in possible_paths that exists; fail the test if none does."""
+        graph_build_path = None
+        for path in possible_paths:
+            if os.path.exists(path):
+                graph_build_path = path
+                break
+        
+        self.assertIsNotNone(graph_build_path, 
+                           f"Graph build binary not found in any of: {possible_paths}")
+        return graph_build_path
+    
+    def execute_and_verify_graph_build(self, graph_build_path: str) -> None:
+        """Run the graph build for self.partition_ref and verify it wrote fresh, valid JSON."""
+        # Record start time for file modification check
+        start_time = time.time()
+        
+        # Run the build through bash, passing the partition ref as its only argument
+        result = subprocess.run(
+            ["bash", graph_build_path, self.partition_ref],
+            capture_output=True,
+            text=True
+        )
+        
+        # Verify execution succeeded
+        self.assertEqual(result.returncode, 0, 
+                        f"Graph build failed with stderr: {result.stderr}")
+        
+        # Verify output file was created
+        self.assertTrue(self.output_file.exists(), 
+                       f"Output file {self.output_file} was not created")
+        
+        # Verify the file's mtime is at most 1s before and 60s after start_time
+        file_mtime = os.path.getmtime(self.output_file)
+        time_diff = file_mtime - start_time
+        self.assertGreaterEqual(time_diff, -1,  # Allow 1 second clock skew
+                               f"File appears to be too old: {time_diff} seconds")
+        self.assertLessEqual(time_diff, 60,
+                            f"File creation took too long: {time_diff} seconds")
+        
+        # Verify file contains valid JSON
+        with open(self.output_file, 'r') as f:
+            content = f.read()
+        
+        try:
+            data = json.loads(content)
+        except json.JSONDecodeError as e:
+            self.fail(f"Output file does not contain valid JSON: {e}")
+        
+        # Basic sanity check on JSON structure
+        self.assertIsInstance(data, (dict, list), 
+                             "JSON should be an object or array")
+    
+    def get_standard_runfiles_paths(self, relative_path: str) -> List[str]:
+        """Return candidate paths for relative_path under RUNFILES_DIR and TEST_SRCDIR, if set."""
+        runfiles_dir = os.environ.get("RUNFILES_DIR")
+        test_srcdir = os.environ.get("TEST_SRCDIR")
+        
+        possible_paths = []
+        if runfiles_dir:
+            possible_paths.append(os.path.join(runfiles_dir, '_main', relative_path))
+            possible_paths.append(os.path.join(runfiles_dir, relative_path))
+        
+        if test_srcdir:
+            possible_paths.append(os.path.join(test_srcdir, '_main', relative_path))
+            possible_paths.append(os.path.join(test_srcdir, relative_path))
+        
+        return possible_paths
\ No newline at end of file