diff --git a/databuild/dsl/python/dsl.py b/databuild/dsl/python/dsl.py
index 6b155d8..dde2e00 100644
--- a/databuild/dsl/python/dsl.py
+++ b/databuild/dsl/python/dsl.py
@@ -120,7 +120,7 @@ class DataBuildGraph:
         import os
 
         # Get job classes from the lookup table
-        job_classes = list(set(self.lookup.values()))
+        job_classes = sorted(set(self.lookup.values()), key=lambda cls: cls.__name__)
 
         # Format deps for BUILD.bazel
         if deps:
@@ -172,6 +172,15 @@ databuild_graph(
     lookup = ":{name}_job_lookup",
     visibility = ["//visibility:public"],
 )
+
+# Create tar archive of generated files for testing
+genrule(
+    name = "existing_generated",
+    srcs = glob(["*.py", "BUILD.bazel"]),
+    outs = ["existing_generated.tar"],
+    cmd = "mkdir -p temp && cp $(SRCS) temp/ && find temp -exec touch -t 197001010000 {{}} + && tar -cf $@ -C temp .",
+    visibility = ["//visibility:public"],
+)
 '''
 
         with open(os.path.join(output_dir, "BUILD.bazel"), "w") as f:
diff --git a/databuild/test/app/dsl/BUILD.bazel b/databuild/test/app/dsl/BUILD.bazel
index 10ab6b6..cdfb00b 100644
--- a/databuild/test/app/dsl/BUILD.bazel
+++ b/databuild/test/app/dsl/BUILD.bazel
@@ -22,3 +22,33 @@ databuild_dsl_generator(
     deps = [":dsl_src"],
     visibility = ["//visibility:public"],
 )
+
+# Generate fresh DSL output for comparison testing
+genrule(
+    name = "generate_fresh_dsl",
+    outs = ["generated_fresh.tar"],
+    cmd_bash = """
+        # Create temporary directory for generation
+        mkdir -p temp_workspace/databuild/test/app/dsl
+
+        # Set environment to generate to temp directory
+        export BUILD_WORKSPACE_DIRECTORY="temp_workspace"
+
+        # Run the generator
+        $(location :graph.generate)
+
+        # Create tar archive of generated files
+        if [ -d "temp_workspace/databuild/test/app/dsl/generated" ]; then
+            find temp_workspace/databuild/test/app/dsl/generated -exec touch -t 197001010000 {} +
+            tar -cf $@ -C temp_workspace/databuild/test/app/dsl/generated .
+        else
+            # Create empty tar if no files generated
+            tar -cf $@ -T /dev/null
+        fi
+
+        # Clean up
+        rm -rf temp_workspace
+    """,
+    tools = [":graph.generate"],
+    visibility = ["//visibility:public"],
+)
diff --git a/databuild/test/app/dsl/test/BUILD.bazel b/databuild/test/app/dsl/test/BUILD.bazel
index 1c93f48..0e5a19a 100644
--- a/databuild/test/app/dsl/test/BUILD.bazel
+++ b/databuild/test/app/dsl/test/BUILD.bazel
@@ -73,3 +73,15 @@ py_test(
         "//databuild/test/app/dsl:dsl_src",
     ],
 )
+
+# DSL generation consistency test
+py_test(
+    name = "test_dsl_generation_consistency",
+    srcs = ["test_dsl_generation_consistency.py"],
+    main = "test_dsl_generation_consistency.py",
+    data = [
+        "//databuild/test/app/dsl:generate_fresh_dsl",
+        "//databuild/test/app/dsl/generated:existing_generated",
+    ],
+    deps = [],
+)
diff --git a/databuild/test/app/dsl/test/test_dsl_generation_consistency.py b/databuild/test/app/dsl/test/test_dsl_generation_consistency.py
new file mode 100644
index 0000000..97bf536
--- /dev/null
+++ b/databuild/test/app/dsl/test/test_dsl_generation_consistency.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python3
+"""
+Test that verifies the generated DSL code is up-to-date.
+
+This test ensures that the checked-in generated directory contents match
+exactly what would be produced by a fresh run of graph.generate.
+""" + +import hashlib +import os +import subprocess +import tempfile +import unittest +from pathlib import Path + + +class TestDSLGenerationConsistency(unittest.TestCase): + def setUp(self): + # Find the test runfiles directory to locate tar files + runfiles_dir = os.environ.get("RUNFILES_DIR") + if runfiles_dir: + self.runfiles_root = Path(runfiles_dir) / "_main" + else: + # Fallback for development - not expected to work in this case + self.fail("RUNFILES_DIR not set - test must be run via bazel test") + + def _compute_tar_hash(self, tar_path: Path) -> str: + """Compute MD5 hash of a tar file's contents.""" + if not tar_path.exists(): + self.fail(f"Tar file not found: {tar_path}") + + with open(tar_path, "rb") as f: + content = f.read() + return hashlib.md5(content).hexdigest() + + def _extract_and_list_tar(self, tar_path: Path) -> set: + """Extract tar file and return set of file paths and their content hashes.""" + if not tar_path.exists(): + return set() + + result = subprocess.run([ + "tar", "-tf", str(tar_path) + ], capture_output=True, text=True) + + if result.returncode != 0: + self.fail(f"Failed to list tar contents: {result.stderr}") + + return set(result.stdout.strip().split('\n')) if result.stdout.strip() else set() + + def test_generated_code_is_up_to_date(self): + """Test that the existing generated tar matches the fresh generated tar.""" + + # Find the tar files from data dependencies + existing_tar = self.runfiles_root / "databuild/test/app/dsl/generated/existing_generated.tar" + fresh_tar = self.runfiles_root / "databuild/test/app/dsl/generated_fresh.tar" + + # Compute hashes of both tar files + existing_hash = self._compute_tar_hash(existing_tar) + fresh_hash = self._compute_tar_hash(fresh_tar) + + # Compare hashes + if existing_hash != fresh_hash: + # Provide detailed diff information + existing_files = self._extract_and_list_tar(existing_tar) + fresh_files = self._extract_and_list_tar(fresh_tar) + + only_in_existing = existing_files - fresh_files + only_in_fresh = fresh_files - existing_files + + error_msg = [ + "Generated DSL code is out of date!", + f"Existing tar hash: {existing_hash}", + f"Fresh tar hash: {fresh_hash}", + "", + "To fix this, run:", + " bazel run //databuild/test/app/dsl:graph.generate", + "" + ] + + if only_in_existing: + error_msg.extend([ + "Files only in existing generated code:", + *[f" - {f}" for f in sorted(only_in_existing)], + "" + ]) + + if only_in_fresh: + error_msg.extend([ + "Files only in fresh generated code:", + *[f" + {f}" for f in sorted(only_in_fresh)], + "" + ]) + + common_files = existing_files & fresh_files + if common_files: + error_msg.extend([ + f"Common files: {len(common_files)}", + "This suggests files have different contents.", + ]) + + self.fail("\n".join(error_msg)) + + +if __name__ == "__main__": + unittest.main() \ No newline at end of file