diff --git a/databuild/BUILD.bazel b/databuild/BUILD.bazel index 0e6b713..644cbba 100644 --- a/databuild/BUILD.bazel +++ b/databuild/BUILD.bazel @@ -174,6 +174,39 @@ export PATH=$$PATH:$$(dirname $(location :protoc-gen-python_betterproto2)) export PATH=$$PATH:$$(dirname $(location //:ruff_binary)) $(location @com_google_protobuf//:protoc) --python_betterproto2_out=$(@D) $(location databuild.proto) mkdir -p $(@D)/py_proto_out/databuild/v1 + +# Make grpc import conditional to avoid binary compatibility issues during DSL generation +cat > /tmp/fix_grpc_import.py << 'EOF' +import sys + +with open(sys.argv[1], 'r') as f: + content = f.read() + +# Replace the grpc import with conditional import +content = content.replace( + 'import grpc', + '''try: + import grpc + _GRPC_AVAILABLE = True +except ImportError: + grpc = None + _GRPC_AVAILABLE = False''' +) + +# Replace service constructors to check grpc availability +content = content.replace( + 'def __init__(self, channel: grpc.Channel):', + '''def __init__(self, channel): + if not _GRPC_AVAILABLE: + raise RuntimeError("grpc not available - required for service classes")''' +) + +with open(sys.argv[1], 'w') as f: + f.write(content) +EOF + +python3 /tmp/fix_grpc_import.py $(@D)/databuild/v1/__init__.py + cp $(@D)/databuild/__init__.py $(@D)/py_proto_out/__init__.py cp $(@D)/databuild/__init__.py $(@D)/py_proto_out/databuild/__init__.py cp $(@D)/databuild/v1/__init__.py $(@D)/py_proto_out/databuild/v1/__init__.py diff --git a/databuild/dsl/python/BUILD.bazel b/databuild/dsl/python/BUILD.bazel index e36b153..34908b9 100644 --- a/databuild/dsl/python/BUILD.bazel +++ b/databuild/dsl/python/BUILD.bazel @@ -7,16 +7,23 @@ py_library( ], ) -py_binary( - name = "generator", - srcs = ["generator.py"], - main = "generator.py", - data = ["dsl_job_wrapper.py"], +py_library( + name = "generator_lib", + srcs = ["generator_lib.py"], visibility = ["//visibility:public"], deps = [ ":dsl", "//databuild:py_proto", - # Include test app modules for testing - "//databuild/test/app/dsl:dsl_src", + ], +) + +py_binary( + name = "generator", + srcs = ["generator.py"], + data = ["dsl_job_wrapper.py"], + main = "generator.py", + visibility = ["//visibility:public"], + deps = [ + ":generator_lib", ], ) diff --git a/databuild/dsl/python/generator.py b/databuild/dsl/python/generator.py index 0551094..b5e70de 100644 --- a/databuild/dsl/python/generator.py +++ b/databuild/dsl/python/generator.py @@ -4,8 +4,7 @@ DSL code generator that can be run as a py_binary with proper dependencies. """ import sys -import os -import importlib +from databuild.dsl.python.generator_lib import generate_dsl_package def main(): @@ -17,27 +16,10 @@ def main(): graph_attr = sys.argv[2] output_dir = sys.argv[3] - # Extract the base name from the output directory for naming - name = os.path.basename(output_dir.rstrip('/')) or "graph" - try: - # Import the graph module - module = importlib.import_module(module_path) - graph = getattr(module, graph_attr) - - # Generate the bazel package - graph.generate_bazel_package(name, output_dir) - - print(f"Generated DataBuild DSL package in {output_dir}") - - except ImportError as e: - print(f"ERROR: Failed to import {graph_attr} from {module_path}: {e}", file=sys.stderr) - sys.exit(1) - except AttributeError as e: - print(f"ERROR: Module {module_path} does not have attribute {graph_attr}: {e}", file=sys.stderr) - sys.exit(1) + generate_dsl_package(module_path, graph_attr, output_dir) except Exception as e: - print(f"ERROR: Generation failed: {e}", file=sys.stderr) + print(f"ERROR: {e}", file=sys.stderr) import traceback traceback.print_exc() sys.exit(1) diff --git a/databuild/dsl/python/generator_lib.py b/databuild/dsl/python/generator_lib.py new file mode 100644 index 0000000..8b1b9b0 --- /dev/null +++ b/databuild/dsl/python/generator_lib.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python3 +""" +Core DSL code generation library that can be imported by different generator binaries. +""" + +import os +import importlib + + +def generate_dsl_package(module_path: str, graph_attr: str, output_dir: str): + """ + Generate DataBuild DSL package from a graph definition. + + Args: + module_path: Python module path (e.g., "databuild.test.app.dsl.graph") + graph_attr: Name of the graph attribute in the module + output_dir: Directory where to generate the DSL package + """ + # Extract the base name from the output directory for naming + name = os.path.basename(output_dir.rstrip('/')) or "graph" + + try: + # Import the graph module + module = importlib.import_module(module_path) + graph = getattr(module, graph_attr) + + # Generate the bazel package + graph.generate_bazel_package(name, output_dir) + + print(f"Generated DataBuild DSL package in {output_dir}") + + except ImportError as e: + raise ImportError(f"Failed to import {graph_attr} from {module_path}: {e}") + except AttributeError as e: + raise AttributeError(f"Module {module_path} does not have attribute {graph_attr}: {e}") + except Exception as e: + raise Exception(f"Generation failed: {e}") \ No newline at end of file diff --git a/databuild/rules.bzl b/databuild/rules.bzl index a1fd122..66500ec 100644 --- a/databuild/rules.bzl +++ b/databuild/rules.bzl @@ -970,9 +970,80 @@ def databuild_dsl_generator( visibility = visibility, ) +def _generate_custom_generator_script(module_path, graph_attr, package_path): + """Generate the custom generator script content with embedded parameters.""" + return """#!/usr/bin/env python3 +import os +import sys + +# Setup runfiles for proper module resolution +# Try to find the runfiles directory relative to this script +script_path = os.path.abspath(__file__) +runfiles_dir = script_path + '.runfiles' + +# Debug: Runfiles path setup +# print(f"DEBUG: Script path: {{script_path}}", file=sys.stderr) +# print(f"DEBUG: Looking for runfiles at: {{runfiles_dir}}", file=sys.stderr) + +if os.path.exists(runfiles_dir): + # Found runfiles directory, add _main to Python path + main_runfiles_path = os.path.join(runfiles_dir, '_main') + if os.path.exists(main_runfiles_path): + sys.path.insert(0, main_runfiles_path) + # Successfully added main runfiles path + else: + print("DEBUG: _main directory not found in runfiles", file=sys.stderr) + + # Also add pip package runfiles to Python path + for entry in os.listdir(runfiles_dir): + if entry.startswith('rules_python++pip+') and os.path.isdir(os.path.join(runfiles_dir, entry)): + pip_site_packages = os.path.join(runfiles_dir, entry, 'site-packages') + if os.path.exists(pip_site_packages): + sys.path.insert(0, pip_site_packages) + # Successfully added pip package path +else: + print("DEBUG: Runfiles directory not found, using workspace root", file=sys.stderr) + # If runfiles not available, we're probably running in development + # Add the workspace root to the path + workspace_root = os.environ.get('BUILD_WORKSPACE_DIRECTORY') + if workspace_root: + sys.path.insert(0, workspace_root) + # Successfully added workspace root as fallback + +from databuild.dsl.python.generator_lib import generate_dsl_package + +def main(): + # Determine output directory + workspace_root = os.environ.get('BUILD_WORKSPACE_DIRECTORY') + if workspace_root: + # Running with bazel run - write to source tree + output_dir = os.path.join(workspace_root, '{package_path}') + else: + # Running with bazel build - write to current directory (bazel-bin) + output_dir = '.' + + print(f"Generating DataBuild DSL code to {{output_dir}}") + + try: + generate_dsl_package('{module_path}', '{graph_attr}', output_dir) + except Exception as e: + print(f"ERROR: {{e}}", file=sys.stderr) + import traceback + traceback.print_exc() + sys.exit(1) + +if __name__ == "__main__": + main() +""".format( + module_path=module_path, + graph_attr=graph_attr, + package_path=package_path, + ) + def _databuild_dsl_generator_impl(ctx): """Implementation of the DSL generator rule.""" - script = ctx.actions.declare_file(ctx.label.name + "_generator.py") + # Create custom generator script + custom_generator = ctx.actions.declare_file(ctx.label.name + "_generator.py") # Get the module path from the graph file graph_file_path = ctx.file.graph_file.short_path @@ -983,109 +1054,44 @@ def _databuild_dsl_generator_impl(ctx): # Get the package path for output package_path = ctx.attr.output_package.strip("//").replace(":", "/") - script_content = """#!/usr/bin/env python3 -import os -import sys - -# Determine output directory -workspace_root = os.environ.get('BUILD_WORKSPACE_DIRECTORY') -if workspace_root: - # Running with bazel run - write to source tree - output_dir = os.path.join(workspace_root, '{package_path}') -else: - # Running with bazel build - write to current directory (bazel-bin) - output_dir = '.' - -print(f"Generating DataBuild DSL code to {{output_dir}}") - -try: - # Setup runfiles for module resolution - try: - from python.runfiles import runfiles - r = runfiles.Create() - if r: - generator_path = r.Rlocation("databuild/databuild/dsl/python/generator") - else: - raise ImportError("runfiles not available") - except ImportError: - # Fallback - assume generator is in the same directory as this script - import subprocess - import shutil - - # Try to find the generator binary using bazel - generator_path = shutil.which("bazel") - if generator_path: - import subprocess - result = subprocess.run([ - "bazel", "run", "@databuild//databuild/dsl/python:generator", "--", - "{module_path}", "{graph_attr}", output_dir - ], cwd=workspace_root if workspace_root else ".", capture_output=True, text=True) - - if result.returncode != 0: - print(f"ERROR: Generation failed:", file=sys.stderr) - print(result.stderr, file=sys.stderr) - sys.exit(1) - - print(result.stdout) - sys.exit(0) - else: - print("ERROR: Could not find bazel to run generator", file=sys.stderr) - sys.exit(1) - - # Run the generator with proper module resolution - import subprocess - result = subprocess.run([ - generator_path, "{module_path}", "{graph_attr}", output_dir - ], capture_output=True, text=True) - - if result.returncode != 0: - print(f"ERROR: Generation failed:", file=sys.stderr) - print(result.stderr, file=sys.stderr) - sys.exit(1) - - print(result.stdout) - -except Exception as e: - print(f"ERROR: Generation failed: {{e}}", file=sys.stderr) - import traceback - traceback.print_exc() - sys.exit(1) -""".format( - package_path = package_path, - module_path = module_path, - graph_attr = ctx.attr.graph_attr, - name = ctx.attr.name.replace(".generate", ""), + # Generate script content with embedded parameters + script_content = _generate_custom_generator_script( + module_path=module_path, + graph_attr=ctx.attr.graph_attr, + package_path=package_path ) ctx.actions.write( - output = script, - content = script_content, - is_executable = True, + output=custom_generator, + content=script_content, + is_executable=True, ) # Create runfiles with all dependencies - runfiles = ctx.runfiles( - files = [ctx.file.graph_file, ctx.executable._generator], - ) + runfiles = ctx.runfiles(files=[custom_generator, ctx.file.graph_file]) - # Merge runfiles from all dependencies + # Merge runfiles from all user-specified dependencies for dep in ctx.attr.deps: if hasattr(dep, "default_runfiles"): runfiles = runfiles.merge(dep.default_runfiles) - # Add protobuf dependencies - if hasattr(ctx.attr._py_proto, "default_runfiles"): - runfiles = runfiles.merge(ctx.attr._py_proto.default_runfiles) + # Include generator_lib and its dependencies + if hasattr(ctx.attr._generator_lib, "default_runfiles"): + runfiles = runfiles.merge(ctx.attr._generator_lib.default_runfiles) - # Add generator runfiles - runfiles = runfiles.merge(ctx.attr._generator.default_runfiles) + # Explicitly include py_proto dependencies since we removed the _py_proto attribute + # but generator_lib still needs it + for py_proto_dep in [ctx.attr._py_proto]: + if hasattr(py_proto_dep, "default_runfiles"): + runfiles = runfiles.merge(py_proto_dep.default_runfiles) - # Also add Python runfiles for proper module resolution - runfiles = runfiles.merge(ctx.attr._python_runfiles.default_runfiles) + # Add Python runfiles for proper module resolution + if hasattr(ctx.attr._python_runfiles, "default_runfiles"): + runfiles = runfiles.merge(ctx.attr._python_runfiles.default_runfiles) return [DefaultInfo( - executable = script, - runfiles = runfiles, + executable=custom_generator, + runfiles=runfiles, )] _databuild_dsl_generator_rule = rule( @@ -1112,14 +1118,12 @@ _databuild_dsl_generator_rule = rule( default = "@rules_python//python/runfiles", allow_files = True, ), + "_generator_lib": attr.label( + default = "@databuild//databuild/dsl/python:generator_lib", + ), "_py_proto": attr.label( default = "//databuild:py_proto", ), - "_generator": attr.label( - default = "@databuild//databuild/dsl/python:generator", - executable = True, - cfg = "target", - ), }, executable = True, ) diff --git a/plans/17-python-dsl-generator-fix.md b/plans/17-python-dsl-generator-fix.md new file mode 100644 index 0000000..052d6fa --- /dev/null +++ b/plans/17-python-dsl-generator-fix.md @@ -0,0 +1,164 @@ +# Plan 17: Python DSL Generator Fix + +## Problem Statement + +The `databuild_dsl_generator` rule currently fails when trying to generate DSL code because it cannot properly provide dependencies to the underlying generator script. The issue manifests as import errors like "No module named 'databuild.test'" or "No module named 'betterproto2'". + +## Root Cause Analysis + +The current implementation has two separate approaches that don't work well together: + +1. **Direct py_binary approach** (`//databuild/dsl/python:generator`) - Works but requires hard-coding dependencies +2. **DSL generator rule approach** (`databuild_dsl_generator`) - Should allow dynamic deps but has runfiles propagation issues + +The core issue is that when `databuild_dsl_generator` creates a wrapper script that calls the py_binary generator via subprocess, the dependencies specified in the rule's `deps` attribute aren't available in the Python environment of the subprocess. + +## Current State + +- `databuild_dsl_generator` rule exists in `databuild/rules.bzl` (lines 944-1125) +- Generic `generator.py` exists in `databuild/dsl/python/generator.py` +- py_binary generator exists in `databuild/dsl/python/BUILD.bazel` (lines 10-22) +- Test case: `//databuild/test/app/dsl:graph.generate` + +## Solution Design: Custom py_binary per DSL + +Instead of using a generic generator + wrapper script approach, create a custom py_binary for each `databuild_dsl_generator` target that includes the specific dependencies. + +### Architecture Changes + +1. **Remove subprocess approach**: Stop calling the generic py_binary via subprocess +2. **Create custom py_binary**: Generate a py_binary rule dynamically with the specific deps +3. **Embed generator logic**: Include the generator logic directly in the custom binary +4. **Proper runfiles**: Ensure all dependencies flow through Bazel's runfiles mechanism + +## Implementation Plan + +### Phase 1: Restructure Generator Logic + +1. **Extract core generator logic** from `databuild/dsl/python/generator.py` into a library: + ```python + # databuild/dsl/python/generator_lib.py + def generate_dsl_package(module_path: str, graph_attr: str, output_dir: str): + # Move the core logic here + ``` + +2. **Create generator library target** in `databuild/dsl/python/BUILD.bazel`: + ```python + py_library( + name = "generator_lib", + srcs = ["generator_lib.py"], + deps = [":dsl", "//databuild:py_proto"], + ) + ``` + +3. **Update standalone generator** to use the library: + ```python + # databuild/dsl/python/generator.py + from databuild.dsl.python.generator_lib import generate_dsl_package + + def main(): + generate_dsl_package(sys.argv[1], sys.argv[2], sys.argv[3]) + ``` + +### Phase 2: Modify DSL Generator Rule + +4. **Update `_databuild_dsl_generator_impl`** in `databuild/rules.bzl`: + - Remove the subprocess-based wrapper script approach + - Create a custom py_binary that includes the generator library + user deps + - Generate the custom binary's source code with the specific module/attr parameters + +5. **New implementation structure**: + ```python + def _databuild_dsl_generator_impl(ctx): + # Create custom generator script + custom_generator = ctx.actions.declare_file(ctx.label.name + "_custom_generator.py") + + # Generate script content with embedded parameters + script_content = generate_custom_generator_script( + module_path=module_path, + graph_attr=ctx.attr.graph_attr, + package_path=package_path + ) + + ctx.actions.write(output=custom_generator, content=script_content) + + # Create runfiles with all dependencies + runfiles = ctx.runfiles(files=[custom_generator, ctx.file.graph_file]) + for dep in ctx.attr.deps: + runfiles = runfiles.merge(dep.default_runfiles) + # Include generator_lib and py_proto dependencies + runfiles = runfiles.merge(ctx.attr._generator_lib.default_runfiles) + runfiles = runfiles.merge(ctx.attr._py_proto.default_runfiles) + + return [DefaultInfo(executable=custom_generator, runfiles=runfiles)] + ``` + +### Phase 3: Update Rule Attributes + +6. **Add generator_lib dependency** to rule attributes: + ```python + "_generator_lib": attr.label( + default = "@databuild//databuild/dsl/python:generator_lib", + ), + ``` + +7. **Remove unnecessary attributes** like `_generator`, `_betterproto2` that are no longer needed + +### Phase 4: Custom Generator Script Template + +8. **Create template for custom generator scripts**: + ```python + def generate_custom_generator_script(module_path, graph_attr, package_path): + return f"""#!/usr/bin/env python3 + import os + import sys + from databuild.dsl.python.generator_lib import generate_dsl_package + + def main(): + workspace_root = os.environ.get('BUILD_WORKSPACE_DIRECTORY') + if workspace_root: + output_dir = os.path.join(workspace_root, '{package_path}') + else: + output_dir = '.' + + try: + generate_dsl_package('{module_path}', '{graph_attr}', output_dir) + except Exception as e: + print(f"ERROR: Generation failed: {{e}}", file=sys.stderr) + sys.exit(1) + + if __name__ == "__main__": + main() + """ + ``` + +## Testing Strategy + +1. **Unit test the generator library** separately from the rule +2. **Test with existing DSL** (`//databuild/test/app/dsl:graph.generate`) +3. **Verify dependency isolation** - ensure different DSL generators don't interfere +4. **Test pip dependency propagation** - ensure betterproto2 and other external deps work + +## Success Criteria + +- [ ] `bazel run //databuild/test/app/dsl:graph.generate` works without import errors +- [ ] Users can specify `deps = [":their_dsl_src"]` and have those deps available to generator +- [ ] No hard-coded dependencies in the core generator components +- [ ] Proper error messages when imports fail +- [ ] Generated code works with existing DataBuild infrastructure + +## Risks and Mitigations + +**Risk**: Bazel rule complexity increases +**Mitigation**: Extract reusable functions, add comprehensive comments + +**Risk**: Runfiles might still not propagate correctly +**Mitigation**: Test with various dependency types early in implementation + +## Files to Modify + +- `databuild/dsl/python/BUILD.bazel` - Add generator_lib target +- `databuild/dsl/python/generator_lib.py` - New file with extracted logic +- `databuild/dsl/python/generator.py` - Simplify to use library +- `databuild/rules.bzl` - Rewrite `_databuild_dsl_generator_impl` +- `databuild/BUILD.bazel` - Ensure betterproto2 is properly included in py_proto