Add test app e2e test coverage for generated graph

Stuart Axelbrooke 2025-08-16 15:53:26 -07:00
parent f92cfeb9b5
commit b3298e7213
7 changed files with 177 additions and 84 deletions

@@ -1,9 +1,15 @@
 py_library(
     name = "job_src",
-    srcs = glob(["**/*.py"]),
+    srcs = glob(["**/*.py"], exclude=["e2e_test_common.py"]),
     visibility = ["//visibility:public"],
     deps = [
         "//databuild:py_proto",
         "//databuild/dsl/python:dsl",
     ],
 )
+
+py_library(
+    name = "e2e_test_common",
+    srcs = ["e2e_test_common.py"],
+    visibility = ["//visibility:public"],
+)

@@ -70,7 +70,7 @@ py_test(
     srcs = ["test_e2e.py"],
     data = [":bazel_graph.build"],
     main = "test_e2e.py",
-    deps = [":job_src"],
+    deps = ["//databuild/test/app:e2e_test_common"],
 )
 
 # Bazel-defined

@@ -5,102 +5,33 @@ End-to-end test for the bazel-defined test app.
 Tests the full pipeline: build execution -> output verification -> JSON validation.
 """
 
-import json
 import os
-import shutil
-import subprocess
-import time
-import unittest
-from pathlib import Path
+from databuild.test.app.e2e_test_common import DataBuildE2ETestBase
 
 
-class BazelE2ETest(unittest.TestCase):
-    """End-to-end test for the bazel test app."""
-
-    def setUp(self):
-        """Set up test environment."""
-        self.output_dir = Path("/tmp/data/color_votes_1w/2025-09-01/red")
-        self.output_file = self.output_dir / "data.json"
-        self.partition_ref = "color_votes_1w/2025-09-01/red"
-
-        # Clean up any existing test data
-        if self.output_dir.exists():
-            shutil.rmtree(self.output_dir)
-
-    def tearDown(self):
-        """Clean up test environment."""
-        if self.output_dir.exists():
-            shutil.rmtree(self.output_dir)
+class BazelE2ETest(DataBuildE2ETestBase):
+    """End-to-end test for the bazel-defined test app."""
 
     def test_end_to_end_execution(self):
         """Test full end-to-end execution of the bazel graph."""
-        # Find the graph.build binary (following pattern from graph_test.py)
-        runfiles_dir = os.environ.get("RUNFILES_DIR")
-        test_srcdir = os.environ.get("TEST_SRCDIR")
+        # Build possible paths for the bazel graph build binary
+        possible_paths = self.get_standard_runfiles_paths(
+            'databuild/test/app/bazel/bazel_graph.build'
+        )
 
-        possible_paths = []
-        if runfiles_dir:
-            possible_paths.append(os.path.join(runfiles_dir, '_main', 'databuild', 'test', 'app', 'bazel', 'bazel_graph.build'))
-            possible_paths.append(os.path.join(runfiles_dir, 'databuild', 'test', 'app', 'bazel', 'bazel_graph.build'))
-        if test_srcdir:
-            possible_paths.append(os.path.join(test_srcdir, '_main', 'databuild', 'test', 'app', 'bazel', 'bazel_graph.build'))
-            possible_paths.append(os.path.join(test_srcdir, 'databuild', 'test', 'app', 'bazel', 'bazel_graph.build'))
-
-        # Fallback for local testing
+        # Add fallback paths for local testing
         possible_paths.extend([
            'bazel-bin/databuild/test/app/bazel/bazel_graph.build',
            './bazel_graph.build'
         ])
 
-        graph_build_path = None
-        for path in possible_paths:
-            if os.path.exists(path):
-                graph_build_path = path
-                break
-
-        self.assertIsNotNone(graph_build_path,
-                             f"Graph build binary not found in any of: {possible_paths}")
-
-        # Record start time for file modification check
-        start_time = time.time()
-
-        # Execute the graph build (shell script)
-        result = subprocess.run(
-            ["bash", graph_build_path, self.partition_ref],
-            capture_output=True,
-            text=True
-        )
-
-        # Verify execution succeeded
-        self.assertEqual(result.returncode, 0,
-                         f"Graph build failed with stderr: {result.stderr}")
-
-        # Verify output file was created
-        self.assertTrue(self.output_file.exists(),
-                        f"Output file {self.output_file} was not created")
-
-        # Verify file was created recently (within 60 seconds)
-        file_mtime = os.path.getmtime(self.output_file)
-        time_diff = file_mtime - start_time
-        self.assertGreaterEqual(time_diff, -1,  # Allow 1 second clock skew
-                                f"File appears to be too old: {time_diff} seconds")
-        self.assertLessEqual(time_diff, 60,
-                             f"File creation took too long: {time_diff} seconds")
-
-        # Verify file contains valid JSON
-        with open(self.output_file, 'r') as f:
-            content = f.read()
-
-        try:
-            data = json.loads(content)
-        except json.JSONDecodeError as e:
-            self.fail(f"Output file does not contain valid JSON: {e}")
-
-        # Basic sanity check on JSON structure
-        self.assertIsInstance(data, (dict, list),
-                              "JSON should be an object or array")
+        # Find the graph build binary
+        graph_build_path = self.find_graph_build_binary(possible_paths)
+
+        # Execute and verify the graph build
+        self.execute_and_verify_graph_build(graph_build_path)
 
 
 if __name__ == '__main__':
+    import unittest
     unittest.main()

@@ -0,0 +1,9 @@
We can't write a direct `bazel test` for the DSL-generated graph, because:

1. Bazel doesn't allow you to `bazel run graph.generate` to generate a BUILD.bazel that is then used in the same build.
2. We don't want to leak test generation into the graph generation code (since the tests here are app specific).

Instead, we need a two-phase process where we rely on the graph already having been generated here, and the generated graph contains a test, so that `bazel test //...` gives us recall over the generated source as well. This implies that the generated source is checked in to git (gasp, I know), and we need a mechanism to ensure it stays up to date. To achieve this, we'll create a test that asserts that the contents of the `generated` dir are exactly the same as the output of a fresh run of `graph.generate`.

Our task is to implement this test asserting equality between the two: e.g. the target could depend on `graph.generate`, run it in the test, and md5 the results, comparing that to the md5 of the existing `generated` dir.
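
As a rough sketch, the comparison could hash both trees and compare the digests. This assumes `graph.generate` can be pointed at a scratch directory; the binary name, flag, and checked-in path below are illustrative placeholders, not the actual targets.

import hashlib
import os
import subprocess
import unittest
from pathlib import Path


def tree_digest(root: Path) -> str:
    """Hash every file under root (relative path + contents) into a single md5 digest."""
    digest = hashlib.md5()
    for path in sorted(root.rglob("*")):
        if path.is_file():
            digest.update(str(path.relative_to(root)).encode())
            digest.update(path.read_bytes())
    return digest.hexdigest()


class GeneratedGraphUpToDateTest(unittest.TestCase):
    def test_generated_dir_matches_fresh_generation(self):
        # Checked-in generated graph (path is illustrative).
        checked_in = Path("databuild/test/app/dsl/generated")

        # Scratch dir provided by `bazel test`; regenerate the graph into it.
        regenerated = Path(os.environ["TEST_TMPDIR"]) / "regenerated"
        regenerated.mkdir(parents=True, exist_ok=True)

        # Hypothetical invocation: the real binary name and flag may differ.
        subprocess.run(
            ["./graph.generate", "--output-dir", str(regenerated)],
            check=True,
        )

        self.assertEqual(
            tree_digest(checked_in),
            tree_digest(regenerated),
            "Checked-in generated graph is stale; re-run graph.generate and commit the result.",
        )


if __name__ == '__main__':
    unittest.main()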

@@ -0,0 +1,7 @@
py_test(
    name = "test_e2e",
    srcs = ["test_e2e.py"],
    data = ["//databuild/test/app/dsl/generated:dsl_graph.build"],
    main = "test_e2e.py",
    deps = ["//databuild/test/app:e2e_test_common"],
)

@@ -0,0 +1,37 @@
#!/usr/bin/env python3
"""
End-to-end test for the DSL-generated test app.

Tests the full pipeline: build execution -> output verification -> JSON validation.
"""

import os
from databuild.test.app.e2e_test_common import DataBuildE2ETestBase


class DSLGeneratedE2ETest(DataBuildE2ETestBase):
    """End-to-end test for the DSL-generated test app."""

    def test_end_to_end_execution(self):
        """Test full end-to-end execution of the DSL-generated graph."""
        # Build possible paths for the DSL-generated graph build binary
        possible_paths = self.get_standard_runfiles_paths(
            'databuild/test/app/dsl/generated/dsl_graph.build'
        )

        # Add fallback paths for local testing
        possible_paths.extend([
            'bazel-bin/databuild/test/app/dsl/generated/dsl_graph.build',
            './dsl_graph.build'
        ])

        # Find the graph build binary
        graph_build_path = self.find_graph_build_binary(possible_paths)

        # Execute and verify the graph build
        self.execute_and_verify_graph_build(graph_build_path)


if __name__ == '__main__':
    import unittest
    unittest.main()

@@ -0,0 +1,103 @@
#!/usr/bin/env python3
"""
Common end-to-end test logic for DataBuild test apps.

Provides shared functionality for testing both bazel-defined and DSL-generated graphs.
"""

import json
import os
import shutil
import subprocess
import time
import unittest
from pathlib import Path
from typing import List, Optional


class DataBuildE2ETestBase(unittest.TestCase):
    """Base class for DataBuild end-to-end tests."""

    def setUp(self):
        """Set up test environment."""
        self.output_dir = Path("/tmp/data/color_votes_1w/2025-09-01/red")
        self.output_file = self.output_dir / "data.json"
        self.partition_ref = "color_votes_1w/2025-09-01/red"

        # Clean up any existing test data
        if self.output_dir.exists():
            shutil.rmtree(self.output_dir)

    def tearDown(self):
        """Clean up test environment."""
        if self.output_dir.exists():
            shutil.rmtree(self.output_dir)

    def find_graph_build_binary(self, possible_paths: List[str]) -> str:
        """Find the graph.build binary from a list of possible paths."""
        graph_build_path = None
        for path in possible_paths:
            if os.path.exists(path):
                graph_build_path = path
                break

        self.assertIsNotNone(graph_build_path,
                             f"Graph build binary not found in any of: {possible_paths}")
        return graph_build_path

    def execute_and_verify_graph_build(self, graph_build_path: str) -> None:
        """Execute the graph build and verify the results."""
        # Record start time for file modification check
        start_time = time.time()

        # Execute the graph build (shell script)
        result = subprocess.run(
            ["bash", graph_build_path, self.partition_ref],
            capture_output=True,
            text=True
        )

        # Verify execution succeeded
        self.assertEqual(result.returncode, 0,
                         f"Graph build failed with stderr: {result.stderr}")

        # Verify output file was created
        self.assertTrue(self.output_file.exists(),
                        f"Output file {self.output_file} was not created")

        # Verify file was created recently (within 60 seconds)
        file_mtime = os.path.getmtime(self.output_file)
        time_diff = file_mtime - start_time
        self.assertGreaterEqual(time_diff, -1,  # Allow 1 second clock skew
                                f"File appears to be too old: {time_diff} seconds")
        self.assertLessEqual(time_diff, 60,
                             f"File creation took too long: {time_diff} seconds")

        # Verify file contains valid JSON
        with open(self.output_file, 'r') as f:
            content = f.read()

        try:
            data = json.loads(content)
        except json.JSONDecodeError as e:
            self.fail(f"Output file does not contain valid JSON: {e}")

        # Basic sanity check on JSON structure
        self.assertIsInstance(data, (dict, list),
                              "JSON should be an object or array")

    def get_standard_runfiles_paths(self, relative_path: str) -> List[str]:
        """Get the standard list of possible runfiles paths for a binary."""
        runfiles_dir = os.environ.get("RUNFILES_DIR")
        test_srcdir = os.environ.get("TEST_SRCDIR")

        possible_paths = []
        if runfiles_dir:
            possible_paths.append(os.path.join(runfiles_dir, '_main', relative_path))
            possible_paths.append(os.path.join(runfiles_dir, relative_path))
        if test_srcdir:
            possible_paths.append(os.path.join(test_srcdir, '_main', relative_path))
            possible_paths.append(os.path.join(test_srcdir, relative_path))

        return possible_paths