Detect out of date generated source

2025-08-16 15:37:07 -07:00 · 2025-08-16 15:37:07 -07:00 · 07d2a9faec
commit 07d2a9faec
parent 952366ab66
4 changed files with 157 additions and 1 deletions
--- a/databuild/dsl/python/dsl.py
+++ b/databuild/dsl/python/dsl.py
@@ -120,7 +120,7 @@ class DataBuildGraph:
        import os
        
        # Get job classes from the lookup table
-        job_classes = list(set(self.lookup.values()))
+        job_classes = sorted(set(self.lookup.values()), key=lambda cls: cls.__name__)
        
        # Format deps for BUILD.bazel
        if deps:
@@ -172,6 +172,15 @@ databuild_graph(
    lookup = ":{name}_job_lookup",
    visibility = ["//visibility:public"],
 )
+
+# Create tar archive of generated files for testing
+genrule(
+    name = "existing_generated",
+    srcs = glob(["*.py", "BUILD.bazel"]),
+    outs = ["existing_generated.tar"],
+    cmd = "mkdir -p temp && cp $(SRCS) temp/ && find temp -exec touch -t 197001010000 {{}} + && tar -cf $@ -C temp .",
+    visibility = ["//visibility:public"],
+)
 '''
        
        with open(os.path.join(output_dir, "BUILD.bazel"), "w") as f:
--- a/databuild/test/app/dsl/BUILD.bazel
+++ b/databuild/test/app/dsl/BUILD.bazel
@@ -22,3 +22,33 @@ databuild_dsl_generator(
    deps = [":dsl_src"],
    visibility = ["//visibility:public"],
 )
+
+# Run :graph.generate into a scratch workspace and tar its output (mtimes pinned to 1970-01-01) for comparison testing
+genrule(
+    name = "generate_fresh_dsl",
+    outs = ["generated_fresh.tar"],
+    cmd_bash = """
+        # Create temporary directory for generation
+        mkdir -p temp_workspace/databuild/test/app/dsl
+        
+        # Set environment to generate to temp directory
+        export BUILD_WORKSPACE_DIRECTORY="temp_workspace"
+        
+        # Run the generator
+        $(location :graph.generate)
+        
+        # Create tar archive of generated files
+        if [ -d "temp_workspace/databuild/test/app/dsl/generated" ]; then
+            find temp_workspace/databuild/test/app/dsl/generated -exec touch -t 197001010000 {} +
+            tar -cf $@ -C temp_workspace/databuild/test/app/dsl/generated .
+        else
+            # Create empty tar if no files generated
+            tar -cf $@ -T /dev/null
+        fi
+        
+        # Clean up
+        rm -rf temp_workspace
+    """,
+    tools = [":graph.generate"],
+    visibility = ["//visibility:public"],
+)
--- a/databuild/test/app/dsl/test/BUILD.bazel
+++ b/databuild/test/app/dsl/test/BUILD.bazel
@@ -73,3 +73,15 @@ py_test(
        "//databuild/test/app/dsl:dsl_src",
    ],
 )
+
+# DSL generation consistency test: compares the checked-in generated tar against a freshly generated one
+py_test(
+    name = "test_dsl_generation_consistency",
+    srcs = ["test_dsl_generation_consistency.py"],
+    main = "test_dsl_generation_consistency.py",
+    data = [
+        "//databuild/test/app/dsl:generate_fresh_dsl",
+        "//databuild/test/app/dsl/generated:existing_generated",
+    ],
+    deps = [],
+)
--- a/databuild/test/app/dsl/test/test_dsl_generation_consistency.py
+++ b/databuild/test/app/dsl/test/test_dsl_generation_consistency.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python3
+"""
+Test that verifies the generated DSL code is up-to-date.
+
+This test ensures that the checked-in generated directory contents match
+exactly what would be produced by a fresh run of graph.generate.
+"""
+
+import hashlib
+import os
+import subprocess
+import tempfile
+import unittest
+from pathlib import Path
+
+
+class TestDSLGenerationConsistency(unittest.TestCase):  # checked-in vs. freshly generated DSL output
+    def setUp(self):
+        # Resolve the runfiles root from RUNFILES_DIR; the tar data deps live under its "_main" subdirectory
+        runfiles_dir = os.environ.get("RUNFILES_DIR")
+        if runfiles_dir:
+            self.runfiles_root = Path(runfiles_dir) / "_main"
+        else:
+            # No fallback outside Bazel: without RUNFILES_DIR the tar inputs cannot be located
+            self.fail("RUNFILES_DIR not set - test must be run via bazel test")
+
+    def _compute_tar_hash(self, tar_path: Path) -> str:
+        """Return the MD5 hex digest of the tar file's raw bytes; fail the test if it is missing."""
+        if not tar_path.exists():
+            self.fail(f"Tar file not found: {tar_path}")
+            
+        with open(tar_path, "rb") as f:
+            content = f.read()
+            return hashlib.md5(content).hexdigest()
+
+    def _extract_and_list_tar(self, tar_path: Path) -> set:
+        """Return the member paths listed by `tar -tf` (names only, no extraction or hashing)."""
+        if not tar_path.exists():
+            return set()  # a missing archive is reported as having no members
+            
+        result = subprocess.run([
+            "tar", "-tf", str(tar_path)
+        ], capture_output=True, text=True)
+        
+        if result.returncode != 0:
+            self.fail(f"Failed to list tar contents: {result.stderr}")
+            
+        return set(result.stdout.strip().split('\n')) if result.stdout.strip() else set()
+
+    def test_generated_code_is_up_to_date(self):
+        """Test that the existing and fresh generated tars are byte-identical (compared by MD5)."""
+        
+        # Find the tar files from data dependencies
+        existing_tar = self.runfiles_root / "databuild/test/app/dsl/generated/existing_generated.tar"
+        fresh_tar = self.runfiles_root / "databuild/test/app/dsl/generated_fresh.tar"
+        
+        # Compute hashes of both tar files
+        existing_hash = self._compute_tar_hash(existing_tar)
+        fresh_hash = self._compute_tar_hash(fresh_tar)
+        
+        # Compare hashes
+        if existing_hash != fresh_hash:
+            # Hashes differ: list member names to show added/removed files (per-file contents are not compared)
+            existing_files = self._extract_and_list_tar(existing_tar)
+            fresh_files = self._extract_and_list_tar(fresh_tar)
+            
+            only_in_existing = existing_files - fresh_files
+            only_in_fresh = fresh_files - existing_files
+            
+            error_msg = [
+                "Generated DSL code is out of date!",
+                f"Existing tar hash: {existing_hash}",
+                f"Fresh tar hash: {fresh_hash}",
+                "",
+                "To fix this, run:",
+                "  bazel run //databuild/test/app/dsl:graph.generate",
+                ""
+            ]
+            
+            if only_in_existing:
+                error_msg.extend([
+                    "Files only in existing generated code:",
+                    *[f"  - {f}" for f in sorted(only_in_existing)],
+                    ""
+                ])
+            
+            if only_in_fresh:
+                error_msg.extend([
+                    "Files only in fresh generated code:",
+                    *[f"  + {f}" for f in sorted(only_in_fresh)],
+                    ""
+                ])
+                
+            common_files = existing_files & fresh_files
+            if common_files:
+                error_msg.extend([
+                    f"Common files: {len(common_files)}",
+                    "This suggests files have different contents.",
+                ])
+            
+            self.fail("\n".join(error_msg))
+
+
+if __name__ == "__main__":
+    unittest.main()