Detect out of date generated source

This commit is contained in:
Stuart Axelbrooke 2025-08-16 15:37:07 -07:00
parent 952366ab66
commit 07d2a9faec
4 changed files with 157 additions and 1 deletions

View file

@ -120,7 +120,7 @@ class DataBuildGraph:
import os
# Get job classes from the lookup table
job_classes = list(set(self.lookup.values()))
job_classes = sorted(set(self.lookup.values()), key=lambda cls: cls.__name__)
# Format deps for BUILD.bazel
if deps:
@ -172,6 +172,15 @@ databuild_graph(
lookup = ":{name}_job_lookup",
visibility = ["//visibility:public"],
)
# Create tar archive of generated files for testing
genrule(
name = "existing_generated",
srcs = glob(["*.py", "BUILD.bazel"]),
outs = ["existing_generated.tar"],
cmd = "mkdir -p temp && cp $(SRCS) temp/ && find temp -exec touch -t 197001010000 {{}} + && tar -cf $@ -C temp .",
visibility = ["//visibility:public"],
)
'''
with open(os.path.join(output_dir, "BUILD.bazel"), "w") as f:

View file

@ -22,3 +22,33 @@ databuild_dsl_generator(
deps = [":dsl_src"],
visibility = ["//visibility:public"],
)
# Generate fresh DSL output for comparison testing.
# Runs :graph.generate with BUILD_WORKSPACE_DIRECTORY pointed at a scratch
# directory, then packs whatever landed in databuild/test/app/dsl/generated
# into generated_fresh.tar (an empty archive if that directory was not created).
# File timestamps are reset to a fixed value before archiving so that the
# mtimes recorded in the archive are constant across runs.
# NOTE(review): member order and ownership inside the tar are not normalized
# here - confirm they are stable enough for a byte-for-byte comparison.
genrule(
name = "generate_fresh_dsl",
outs = ["generated_fresh.tar"],
cmd_bash = """
# Create temporary directory for generation
mkdir -p temp_workspace/databuild/test/app/dsl
# Set environment to generate to temp directory
export BUILD_WORKSPACE_DIRECTORY="temp_workspace"
# Run the generator
$(location :graph.generate)
# Create tar archive of generated files
if [ -d "temp_workspace/databuild/test/app/dsl/generated" ]; then
find temp_workspace/databuild/test/app/dsl/generated -exec touch -t 197001010000 {} +
tar -cf $@ -C temp_workspace/databuild/test/app/dsl/generated .
else
# Create empty tar if no files generated
tar -cf $@ -T /dev/null
fi
# Clean up
rm -rf temp_workspace
""",
# The generator executable that cmd_bash invokes through $(location ...).
tools = [":graph.generate"],
visibility = ["//visibility:public"],
)

View file

@ -73,3 +73,15 @@ py_test(
"//databuild/test/app/dsl:dsl_src",
],
)
# DSL generation consistency test.
# The two data targets supply the tar archives that the test reads from its
# runfiles tree: the freshly generated output (generated_fresh.tar) and the
# archive built from the checked-in generated package (existing_generated.tar).
py_test(
name = "test_dsl_generation_consistency",
srcs = ["test_dsl_generation_consistency.py"],
main = "test_dsl_generation_consistency.py",
data = [
"//databuild/test/app/dsl:generate_fresh_dsl",
"//databuild/test/app/dsl/generated:existing_generated",
],
# The test script imports only the Python standard library.
deps = [],
)

View file

@ -0,0 +1,105 @@
#!/usr/bin/env python3
"""
Test that verifies the generated DSL code is up-to-date.
This test ensures that the checked-in generated directory contents match
exactly what would be produced by a fresh run of graph.generate.
"""
import hashlib
import os
import subprocess
import tarfile
import tempfile
import unittest
from pathlib import Path
class TestDSLGenerationConsistency(unittest.TestCase):
    """Verify that the checked-in generated DSL code matches a fresh generation.

    Two tar archives are read from the Bazel runfiles tree: one built from the
    checked-in ``generated`` directory and one produced by running the
    generator.  The test passes only when the two archive files are
    byte-for-byte identical; on mismatch it reports which members were added,
    removed, or changed.
    """

    def setUp(self):
        """Locate the runfiles root that holds the tar archives under test."""
        runfiles_dir = os.environ.get("RUNFILES_DIR")
        if not runfiles_dir:
            # Without the runfiles tree the tar data dependencies cannot be
            # located, so there is nothing to compare.
            self.fail("RUNFILES_DIR not set - test must be run via bazel test")
        self.runfiles_root = Path(runfiles_dir) / "_main"

    def _compute_tar_hash(self, tar_path: Path) -> str:
        """Return the MD5 hex digest of the raw bytes of *tar_path*.

        The digest covers the archive file itself (member headers included),
        not only the payload of its members.  Fails the test if the file does
        not exist.
        """
        if not tar_path.exists():
            self.fail(f"Tar file not found: {tar_path}")
        digest = hashlib.md5()
        with open(tar_path, "rb") as f:
            # Read in chunks so the whole archive never has to sit in memory.
            for chunk in iter(lambda: f.read(1 << 20), b""):
                digest.update(chunk)
        return digest.hexdigest()

    def _extract_and_list_tar(self, tar_path: Path) -> set:
        """Return the set of member names stored in *tar_path*.

        Directory entries carry a trailing slash, the way ``tar -tf`` prints
        them.  A missing archive yields an empty set; an unreadable archive
        fails the test.
        """
        if not tar_path.exists():
            return set()
        try:
            with tarfile.open(tar_path) as archive:
                return {
                    f"{member.name}/" if member.isdir() else member.name
                    for member in archive.getmembers()
                }
        except (tarfile.TarError, OSError) as exc:
            self.fail(f"Failed to list tar contents: {exc}")

    def _hash_tar_members(self, tar_path: Path) -> dict:
        """Map each regular-file member of *tar_path* to the MD5 of its content.

        Used only for failure diagnostics, to tell content changes apart from
        differences in tar metadata.  An unreadable archive fails the test.
        """
        try:
            with tarfile.open(tar_path) as archive:
                hashes = {}
                for member in archive.getmembers():
                    if not member.isfile():
                        continue
                    with archive.extractfile(member) as payload:
                        hashes[member.name] = hashlib.md5(payload.read()).hexdigest()
                return hashes
        except (tarfile.TarError, OSError) as exc:
            self.fail(f"Failed to read tar contents: {exc}")

    def test_generated_code_is_up_to_date(self):
        """Test that the existing generated tar matches the fresh generated tar."""
        # Both archives arrive as data dependencies of the test target.
        existing_tar = self.runfiles_root / "databuild/test/app/dsl/generated/existing_generated.tar"
        fresh_tar = self.runfiles_root / "databuild/test/app/dsl/generated_fresh.tar"

        existing_hash = self._compute_tar_hash(existing_tar)
        fresh_hash = self._compute_tar_hash(fresh_tar)
        if existing_hash == fresh_hash:
            return

        # The archives differ: work out how, so the failure is actionable.
        existing_files = self._extract_and_list_tar(existing_tar)
        fresh_files = self._extract_and_list_tar(fresh_tar)
        only_in_existing = existing_files - fresh_files
        only_in_fresh = fresh_files - existing_files
        common_files = existing_files & fresh_files

        existing_contents = self._hash_tar_members(existing_tar)
        fresh_contents = self._hash_tar_members(fresh_tar)
        changed_files = sorted(
            name
            for name in existing_contents.keys() & fresh_contents.keys()
            if existing_contents[name] != fresh_contents[name]
        )

        error_msg = [
            "Generated DSL code is out of date!",
            f"Existing tar hash: {existing_hash}",
            f"Fresh tar hash: {fresh_hash}",
            "",
            "To fix this, run:",
            " bazel run //databuild/test/app/dsl:graph.generate",
            "",
        ]
        if only_in_existing:
            error_msg.extend([
                "Files only in existing generated code:",
                *[f" - {f}" for f in sorted(only_in_existing)],
                "",
            ])
        if only_in_fresh:
            error_msg.extend([
                "Files only in fresh generated code:",
                *[f" + {f}" for f in sorted(only_in_fresh)],
                "",
            ])
        if changed_files:
            error_msg.extend([
                "Files with different contents:",
                *[f" ~ {f}" for f in changed_files],
                "",
            ])
        if common_files:
            error_msg.append(f"Common files: {len(common_files)}")
            if not (only_in_existing or only_in_fresh or changed_files):
                # Same names and same payloads, yet different archive bytes:
                # the difference must live in the tar headers or layout.
                error_msg.append(
                    "All member names and contents match; the archives differ "
                    "only in tar metadata (e.g. member order, ownership, "
                    "permissions or timestamps)."
                )
        self.fail("\n".join(error_msg))
if __name__ == "__main__":
unittest.main()