commit ba18734190 (parent f6e6dad32c)
9 changed files with 8562 additions and 18 deletions
@@ -84,12 +84,13 @@ class DataBuildGraph:
         """Generates a complete databuild application, packaging up referenced jobs and this graph via bazel targets"""
         raise NotImplementedError

-    def generate_bazel_package(self, name: str, output_dir: str) -> None:
+    def generate_bazel_package(self, name: str, output_dir: str, deps: list = None) -> None:
         """Generate BUILD.bazel and binaries into a generated/ subdirectory.

         Args:
             name: Base name for the generated graph (without .generate suffix)
             output_dir: Directory to write generated files to (will create generated/ subdir)
+            deps: List of Bazel dependency labels to use in generated BUILD.bazel
         """
         import os
         import shutil
@@ -99,7 +100,7 @@ class DataBuildGraph:
         os.makedirs(generated_dir, exist_ok=True)

         # Generate BUILD.bazel with job and graph targets
-        self._generate_build_bazel(generated_dir, name)
+        self._generate_build_bazel(generated_dir, name, deps or [])

         # Generate individual job scripts (instead of shared wrapper)
         self._generate_job_scripts(generated_dir)
@@ -114,15 +115,20 @@ class DataBuildGraph:
         else:
             print(f"Run 'bazel build generated:{name}_graph.analyze' to use the generated graph")

-    def _generate_build_bazel(self, output_dir: str, name: str) -> None:
+    def _generate_build_bazel(self, output_dir: str, name: str, deps: list) -> None:
         """Generate BUILD.bazel with databuild_job and databuild_graph targets."""
         import os

         # Get job classes from the lookup table
         job_classes = list(set(self.lookup.values()))

-        # Get parent package for dependencies
-        parent_package = self._get_package_name()
+        # Format deps for BUILD.bazel
+        if deps:
+            deps_str = ", ".join([f'"{dep}"' for dep in deps])
+        else:
+            # Fallback to parent package if no deps provided
+            parent_package = self._get_package_name()
+            deps_str = f'"//{parent_package}:dsl_src"'

         # Generate py_binary targets for each job
         job_binaries = []
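The `deps_str` formatting is simple enough to check standalone; a minimal sketch with made-up labels (only the join expression comes from this diff):

```python
# Standalone illustration of the deps_str formatting in _generate_build_bazel;
# the dependency labels here are made-up examples.
deps = ["@databuild//databuild/dsl/python:dsl", "//my/pkg:dsl_src"]
deps_str = ", ".join([f'"{dep}"' for dep in deps])
print(f"deps = [{deps_str}],")
# -> deps = ["@databuild//databuild/dsl/python:dsl", "//my/pkg:dsl_src"],
```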
@@ -138,7 +144,7 @@ class DataBuildGraph:
     name = "{binary_name}",
     srcs = ["{job_script_name}"],
     main = "{job_script_name}",
-    deps = ["//{parent_package}:dsl_src"],
+    deps = [{deps_str}],
 )

 databuild_job(
@@ -157,7 +163,7 @@ databuild_job(
 py_binary(
     name = "{name}_job_lookup",
     srcs = ["{name}_job_lookup.py"],
-    deps = ["//{parent_package}:dsl_src"],
+    deps = [{deps_str}],
 )

 databuild_graph(
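Taken together, callers of the DSL can now steer the generated BUILD.bazel deps; a hedged sketch of direct use (the graph name, output directory, and labels are illustrative, not from this commit):

```python
# Minimal sketch of the new deps parameter on generate_bazel_package.
from databuild.dsl.python.dsl import DataBuildGraph

graph = DataBuildGraph("//:example_graph")  # illustrative label

# Explicit deps: the generated BUILD.bazel references these labels.
graph.generate_bazel_package(
    name="example_graph",
    output_dir="examples/example",
    deps=["@databuild//databuild/dsl/python:dsl"],
)

# No deps: _generate_build_bazel falls back to "//{parent_package}:dsl_src".
graph.generate_bazel_package(name="example_graph", output_dir="examples/example")
```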
@@ -7,7 +7,7 @@ import os
 import importlib


-def generate_dsl_package(module_path: str, graph_attr: str, output_dir: str):
+def generate_dsl_package(module_path: str, graph_attr: str, output_dir: str, deps: list = None):
     """
     Generate DataBuild DSL package from a graph definition.
@@ -15,6 +15,7 @@ def generate_dsl_package(module_path: str, graph_attr: str, output_dir: str):
         module_path: Python module path (e.g., "databuild.test.app.dsl.graph")
         graph_attr: Name of the graph attribute in the module
         output_dir: Directory where to generate the DSL package
+        deps: List of Bazel dependency labels to use in generated BUILD.bazel
     """
     # Extract the base name from the output directory for naming
     name = os.path.basename(output_dir.rstrip('/')) or "graph"
@@ -25,7 +26,7 @@ def generate_dsl_package(module_path: str, graph_attr: str, output_dir: str):
     graph = getattr(module, graph_attr)

     # Generate the bazel package
-    graph.generate_bazel_package(name, output_dir)
+    graph.generate_bazel_package(name, output_dir, deps or [])

     print(f"Generated DataBuild DSL package in {output_dir}")
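This wrapper threads `deps` straight through to `generate_bazel_package`. A hedged sketch of a direct call follows; the generator module's import path is not visible in this diff, so it stays a placeholder:

```python
# Hedged sketch: calling the updated entry point directly.
# The import path is a placeholder; only the signature comes from this diff.
# from <generator_module> import generate_dsl_package

generate_dsl_package(
    module_path="databuild.test.app.dsl.graph",  # example path from the docstring
    graph_attr="graph",
    output_dir="examples/simple_python_dsl",
    deps=["//:simple_dsl_src"],  # forwarded into the generated BUILD.bazel deps
)
```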
@@ -970,7 +970,7 @@ def databuild_dsl_generator(
         visibility = visibility,
     )

-def _generate_custom_generator_script(module_path, graph_attr, package_path):
+def _generate_custom_generator_script(module_path, graph_attr, package_path, deps):
     """Generate the custom generator script content with embedded parameters."""
     return """#!/usr/bin/env python3
 import os
@@ -981,9 +981,8 @@ import sys
 script_path = os.path.abspath(__file__)
 runfiles_dir = script_path + '.runfiles'

-# Debug: Runfiles path setup
-# print(f"DEBUG: Script path: {{script_path}}", file=sys.stderr)
-# print(f"DEBUG: Looking for runfiles at: {{runfiles_dir}}", file=sys.stderr)
+# Debug: Runfiles path setup for cross-workspace usage
+# Setting up runfiles paths for cross-workspace usage

 if os.path.exists(runfiles_dir):
     # Found runfiles directory, add _main to Python path
@@ -991,8 +990,19 @@ if os.path.exists(runfiles_dir):
     if os.path.exists(main_runfiles_path):
         sys.path.insert(0, main_runfiles_path)
         # Successfully added main runfiles path

+        # Check what other directories exist in runfiles for cross-workspace usage
+        # All runfiles directories available
     else:
-        print("DEBUG: _main directory not found in runfiles", file=sys.stderr)
+        # _main directory not found in runfiles
+        pass
+
+    # Add external repository runfiles (like databuild+) for cross-workspace usage
+    for entry in os.listdir(runfiles_dir):
+        if entry.endswith('+') and os.path.isdir(os.path.join(runfiles_dir, entry)):
+            external_path = os.path.join(runfiles_dir, entry)
+            sys.path.insert(0, external_path)
+            # Added external repository path

     # Also add pip package runfiles to Python path
     for entry in os.listdir(runfiles_dir):
@@ -1000,9 +1010,9 @@ if os.path.exists(runfiles_dir):
         pip_site_packages = os.path.join(runfiles_dir, entry, 'site-packages')
         if os.path.exists(pip_site_packages):
             sys.path.insert(0, pip_site_packages)
-            # Successfully added pip package path
+            # Added pip package path
 else:
-    print("DEBUG: Runfiles directory not found, using workspace root", file=sys.stderr)
+    # Runfiles directory not found, falling back to workspace root
     # If runfiles not available, we're probably running in development
     # Add the workspace root to the path
     workspace_root = os.environ.get('BUILD_WORKSPACE_DIRECTORY')
@@ -1025,7 +1035,7 @@ def main():
     print(f"Generating DataBuild DSL code to {{output_dir}}")

     try:
-        generate_dsl_package('{module_path}', '{graph_attr}', output_dir)
+        generate_dsl_package('{module_path}', '{graph_attr}', output_dir, {deps})
     except Exception as e:
         print(f"ERROR: {{e}}", file=sys.stderr)
         import traceback
@@ -1038,6 +1048,7 @@ if __name__ == "__main__":
     module_path=module_path,
     graph_attr=graph_attr,
     package_path=package_path,
+    deps=deps,
 )

 def _databuild_dsl_generator_impl(ctx):
@@ -1055,10 +1066,14 @@ def _databuild_dsl_generator_impl(ctx):
     package_path = ctx.attr.output_package.strip("//").replace(":", "/")

     # Generate script content with embedded parameters
+    # Convert deps to list of strings
+    dep_labels = [str(dep.label) for dep in ctx.attr.deps] if ctx.attr.deps else []
+
     script_content = _generate_custom_generator_script(
         module_path=module_path,
         graph_attr=ctx.attr.graph_attr,
-        package_path=package_path
+        package_path=package_path,
+        deps=dep_labels
     )

     ctx.actions.write(
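One subtlety in `_generate_custom_generator_script`: the returned script is a format template, so `{deps}` is substituted (a Python list lands in the script as its repr, which is a valid list literal) while doubled braces like `{{e}}` survive as literal braces. A rough standalone illustration, assuming `.format()`-style substitution as the doubled braces imply:

```python
# Rough illustration (assumption: .format()-style templating, as the doubled
# braces in the generated script suggest). Values here are examples.
template = """#!/usr/bin/env python3
import sys

try:
    generate_dsl_package('{module_path}', '{graph_attr}', output_dir, {deps})
except Exception as e:
    print(f"ERROR: {{e}}", file=sys.stderr)  # doubled braces become literal braces
"""
script = template.format(
    module_path="databuild.test.app.dsl.graph",
    graph_attr="graph",
    deps=["//:simple_dsl_src"],  # the list's repr is itself valid Python
)
print(script)
```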
examples/simple_python_dsl/BUILD.bazel (new file, 22 lines)
@@ -0,0 +1,22 @@
+load("@databuild//databuild:rules.bzl", "databuild_dsl_generator")
+
+# Python DSL library containing the graph definition
+py_library(
+    name = "simple_dsl_src",
+    srcs = ["simple_graph.py"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "@databuild//databuild:py_proto",
+        "@databuild//databuild/dsl/python:dsl",
+    ],
+)
+
+# DSL generator that creates bazel targets from the Python DSL
+databuild_dsl_generator(
+    name = "simple_graph.generate",
+    graph_attr = "graph",
+    graph_file = "simple_graph.py",
+    output_package = "//",
+    visibility = ["//visibility:public"],
+    deps = [":simple_dsl_src"],
+)
examples/simple_python_dsl/MODULE.bazel (new file, 26 lines)
@@ -0,0 +1,26 @@
+module(
+    name = "simple_python_dsl_example",
+    version = "0.1",
+)
+
+# Databuild dep - overridden so ignore version
+bazel_dep(name = "databuild", version = "0.0")
+local_path_override(
+    module_name = "databuild",
+    path = "../..",
+)
+
+bazel_dep(name = "rules_python", version = "1.3.0")
+
+python = use_extension("@rules_python//python/extensions:python.bzl", "python")
+python.toolchain(
+    python_version = "3.12",
+)
+
+pip = use_extension("@rules_python//python/extensions:pip.bzl", "pip")
+pip.parse(
+    hub_name = "pypi",
+    python_version = "3.12",
+    requirements_lock = "//:requirements_lock.txt",
+)
+use_repo(pip, "pypi")
examples/simple_python_dsl/MODULE.bazel.lock (new file, 8261 lines)
File diff suppressed because one or more lines are too long
examples/simple_python_dsl/README.md (new file, 87 lines)
@@ -0,0 +1,87 @@
+# Simple Python DSL Example
+
+This example demonstrates how to use DataBuild's Python DSL to define a simple data processing pipeline.
+
+## Overview
+
+The example defines a basic 3-stage data processing pipeline:
+
+1. **IngestRawData**: Ingests raw data for a specific date
+2. **ProcessData**: Processes the raw data into a processed format
+3. **CreateSummary**: Creates summary statistics from processed data
+
+## Files
+
+- `simple_graph.py`: Python DSL definition of the data pipeline
+- `BUILD.bazel`: Bazel build configuration
+- `MODULE.bazel`: Bazel module configuration for dependencies
+
+## Usage
+
+### Generate DSL Targets
+
+The DSL generator can create Bazel targets from the Python DSL definition:
+
+```bash
+bazel run //:simple_graph.generate
+```
+
+This will generate Bazel targets in the `generated/` directory.
+
+### Build Individual Jobs
+
+```bash
+# Build a specific job
+bazel build //:ingest_raw_data
+
+# Build all jobs
+bazel build //:simple_graph
+```
+
+### Analyze the Graph
+
+```bash
+# Analyze what jobs would run for specific partitions
+bazel run //:simple_graph.analyze -- "summary/date=2024-01-01"
+```
+
+### Run the Graph
+
+```bash
+# Build specific partitions
+bazel run //:simple_graph.build -- "summary/date=2024-01-01"
+```
+
+## Cross-Workspace Usage
+
+This example can be consumed from external workspaces by adding DataBuild as a dependency in your `MODULE.bazel`:
+
+```starlark
+bazel_dep(name = "databuild", version = "0.0")
+local_path_override(
+    module_name = "databuild",
+    path = "path/to/databuild",
+)
+```
+
+Then you can reference and extend this example:
+
+```python
+from databuild.dsl.python.dsl import DataBuildGraph
+# Import and extend the simple graph
+```
+
+## Testing
+
+To test that the DSL generator works correctly:
+
+```bash
+# Test the DSL generation
+bazel run //:simple_graph.generate
+
+# Verify generated files exist
+ls generated/
+
+# Test job lookup
+bazel run //:job_lookup -- "raw_data/date=2024-01-01"
+```
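The README's cross-workspace snippet ends at a stub comment; a hedged sketch of what extending the graph could look like, reusing only the APIs shown in `simple_graph.py` below (the report partition, job, and import path are hypothetical):

```python
# Hypothetical extension of the example graph from another module.
# ReportPartition/BuildReport are made up; the DSL calls mirror simple_graph.py.
import os

from databuild.dsl.python.dsl import DataBuildJob, JobConfigBuilder, PartitionPattern
from databuild.proto import JobConfig
from simple_graph import graph, SummaryPartition  # import path is an assumption


class ReportPartition(PartitionPattern):
    """Hypothetical report partition derived from the daily summary"""
    _raw_pattern = r"report/date=(?P<date>\d{4}-\d{2}-\d{2})"

    def __init__(self, date: str):
        self.date = date

    def serialize(self) -> str:
        return f"report/date={self.date}"


@graph.job
class BuildReport(DataBuildJob):
    """Hypothetical job consuming the summary partition"""
    output_types = [ReportPartition]

    def config(self, outputs: list[ReportPartition]) -> list[JobConfig]:
        return [
            JobConfigBuilder()
            .add_outputs(output)
            .add_inputs(SummaryPartition(date=output.date))
            .set_env({"DATA_DATE": output.date})
            .add_args("--date", output.date)
            .build()
            for output in outputs
        ]

    def exec(self, *args: str) -> None:
        print(f"Building report for {os.environ['DATA_DATE']}")
```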
examples/simple_python_dsl/requirements_lock.txt (new file, 2 lines)
@@ -0,0 +1,2 @@
+# Simple Python DSL example - minimal requirements
+# This file would normally be generated by pip-compile
examples/simple_python_dsl/simple_graph.py (new file, 124 lines)
@@ -0,0 +1,124 @@
+"""
+Simple Python DSL example - basic data processing pipeline
+"""
+
+from databuild.dsl.python.dsl import DataBuildGraph, DataBuildJob, JobConfigBuilder, PartitionPattern
+from databuild.proto import JobConfig
+from datetime import date
+import os
+
+# Define the graph
+graph = DataBuildGraph("//:simple_graph")
+
+
+class RawDataPartition(PartitionPattern):
+    """Represents raw input data for a specific date"""
+    _raw_pattern = r"raw_data/date=(?P<date>\d{4}-\d{2}-\d{2})"
+
+    def __init__(self, date: str):
+        self.date = date
+
+    def serialize(self) -> str:
+        return f"raw_data/date={self.date}"
+
+
+class ProcessedDataPartition(PartitionPattern):
+    """Represents processed data for a specific date"""
+    _raw_pattern = r"processed_data/date=(?P<date>\d{4}-\d{2}-\d{2})"
+
+    def __init__(self, date: str):
+        self.date = date
+
+    def serialize(self) -> str:
+        return f"processed_data/date={self.date}"
+
+
+class SummaryPartition(PartitionPattern):
+    """Represents summary data for a specific date"""
+    _raw_pattern = r"summary/date=(?P<date>\d{4}-\d{2}-\d{2})"
+
+    def __init__(self, date: str):
+        self.date = date
+
+    def serialize(self) -> str:
+        return f"summary/date={self.date}"
+
+
+@graph.job
+class IngestRawData(DataBuildJob):
+    """Job to ingest raw data for a given date"""
+    output_types = [RawDataPartition]
+
+    def config(self, outputs: list[RawDataPartition]) -> list[JobConfig]:
+        configs = []
+        for output in outputs:
+            env = {"DATA_DATE": output.date}
+            configs.append(
+                JobConfigBuilder()
+                .add_outputs(output)
+                .set_env(env)
+                .add_args("--date", output.date)
+                .build()
+            )
+        return configs
+
+    def exec(self, *args: str) -> None:
+        # Simple implementation - just create a dummy file
+        data_date = os.environ["DATA_DATE"]
+        print(f"Ingesting raw data for {data_date}")
+        # In a real job, this would read from external sources
+        print(f"Raw data ingested successfully for {data_date}")
+
+
+@graph.job
+class ProcessData(DataBuildJob):
+    """Job to process raw data into processed format"""
+    output_types = [ProcessedDataPartition]
+
+    def config(self, outputs: list[ProcessedDataPartition]) -> list[JobConfig]:
+        configs = []
+        for output in outputs:
+            raw_input = RawDataPartition(date=output.date)
+            env = {"DATA_DATE": output.date}
+            configs.append(
+                JobConfigBuilder()
+                .add_outputs(output)
+                .add_inputs(raw_input)
+                .set_env(env)
+                .add_args("--date", output.date)
+                .build()
+            )
+        return configs
+
+    def exec(self, *args: str) -> None:
+        data_date = os.environ["DATA_DATE"]
+        print(f"Processing data for {data_date}")
+        # In a real job, this would transform the raw data
+        print(f"Data processed successfully for {data_date}")
+
+
+@graph.job
+class CreateSummary(DataBuildJob):
+    """Job to create summary from processed data"""
+    output_types = [SummaryPartition]
+
+    def config(self, outputs: list[SummaryPartition]) -> list[JobConfig]:
+        configs = []
+        for output in outputs:
+            processed_input = ProcessedDataPartition(date=output.date)
+            env = {"DATA_DATE": output.date}
+            configs.append(
+                JobConfigBuilder()
+                .add_outputs(output)
+                .add_inputs(processed_input)
+                .set_env(env)
+                .add_args("--date", output.date)
+                .build()
+            )
+        return configs
+
+    def exec(self, *args: str) -> None:
+        data_date = os.environ["DATA_DATE"]
+        print(f"Creating summary for {data_date}")
+        # In a real job, this would generate summary statistics
+        print(f"Summary created successfully for {data_date}")
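For intuition about how refs like `raw_data/date=2024-01-01` are routed, each `_raw_pattern` is an ordinary named-group regex; a small sketch (mapping a match back to a job class via `graph.lookup` is an assumption based on the DSL code above):

```python
# Small sketch: matching a partition ref against the pattern from simple_graph.py.
import re

raw_pattern = r"raw_data/date=(?P<date>\d{4}-\d{2}-\d{2})"
m = re.fullmatch(raw_pattern, "raw_data/date=2024-01-01")
assert m is not None
print(m.group("date"))  # -> 2024-01-01
# The DSL's lookup table (graph.lookup) can map such a match to IngestRawData.
```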