Fix DSL generation with workaround for gRPC in generated Python protobuf dataclasses
Some checks are pending
/ setup (push) Waiting to run

This commit is contained in:
Stuart Axelbrooke 2025-08-06 17:49:15 -07:00
parent 401fd5bead
commit f6e6dad32c
6 changed files with 349 additions and 122 deletions

View file

@ -174,6 +174,39 @@ export PATH=$$PATH:$$(dirname $(location :protoc-gen-python_betterproto2))
export PATH=$$PATH:$$(dirname $(location //:ruff_binary)) export PATH=$$PATH:$$(dirname $(location //:ruff_binary))
$(location @com_google_protobuf//:protoc) --python_betterproto2_out=$(@D) $(location databuild.proto) $(location @com_google_protobuf//:protoc) --python_betterproto2_out=$(@D) $(location databuild.proto)
mkdir -p $(@D)/py_proto_out/databuild/v1 mkdir -p $(@D)/py_proto_out/databuild/v1
# Make grpc import conditional to avoid binary compatibility issues during DSL generation
cat > /tmp/fix_grpc_import.py << 'EOF'
import sys
with open(sys.argv[1], 'r') as f:
content = f.read()
# Replace the grpc import with conditional import
content = content.replace(
'import grpc',
'''try:
import grpc
_GRPC_AVAILABLE = True
except ImportError:
grpc = None
_GRPC_AVAILABLE = False'''
)
# Replace service constructors to check grpc availability
content = content.replace(
'def __init__(self, channel: grpc.Channel):',
'''def __init__(self, channel):
if not _GRPC_AVAILABLE:
raise RuntimeError("grpc not available - required for service classes")'''
)
with open(sys.argv[1], 'w') as f:
f.write(content)
EOF
python3 /tmp/fix_grpc_import.py $(@D)/databuild/v1/__init__.py
cp $(@D)/databuild/__init__.py $(@D)/py_proto_out/__init__.py cp $(@D)/databuild/__init__.py $(@D)/py_proto_out/__init__.py
cp $(@D)/databuild/__init__.py $(@D)/py_proto_out/databuild/__init__.py cp $(@D)/databuild/__init__.py $(@D)/py_proto_out/databuild/__init__.py
cp $(@D)/databuild/v1/__init__.py $(@D)/py_proto_out/databuild/v1/__init__.py cp $(@D)/databuild/v1/__init__.py $(@D)/py_proto_out/databuild/v1/__init__.py

View file

@ -7,16 +7,23 @@ py_library(
], ],
) )
py_binary( py_library(
name = "generator", name = "generator_lib",
srcs = ["generator.py"], srcs = ["generator_lib.py"],
main = "generator.py",
data = ["dsl_job_wrapper.py"],
visibility = ["//visibility:public"], visibility = ["//visibility:public"],
deps = [ deps = [
":dsl", ":dsl",
"//databuild:py_proto", "//databuild:py_proto",
# Include test app modules for testing ],
"//databuild/test/app/dsl:dsl_src", )
py_binary(
name = "generator",
srcs = ["generator.py"],
data = ["dsl_job_wrapper.py"],
main = "generator.py",
visibility = ["//visibility:public"],
deps = [
":generator_lib",
], ],
) )

View file

@ -4,8 +4,7 @@ DSL code generator that can be run as a py_binary with proper dependencies.
""" """
import sys import sys
import os from databuild.dsl.python.generator_lib import generate_dsl_package
import importlib
def main(): def main():
@ -17,27 +16,10 @@ def main():
graph_attr = sys.argv[2] graph_attr = sys.argv[2]
output_dir = sys.argv[3] output_dir = sys.argv[3]
# Extract the base name from the output directory for naming
name = os.path.basename(output_dir.rstrip('/')) or "graph"
try: try:
# Import the graph module generate_dsl_package(module_path, graph_attr, output_dir)
module = importlib.import_module(module_path)
graph = getattr(module, graph_attr)
# Generate the bazel package
graph.generate_bazel_package(name, output_dir)
print(f"Generated DataBuild DSL package in {output_dir}")
except ImportError as e:
print(f"ERROR: Failed to import {graph_attr} from {module_path}: {e}", file=sys.stderr)
sys.exit(1)
except AttributeError as e:
print(f"ERROR: Module {module_path} does not have attribute {graph_attr}: {e}", file=sys.stderr)
sys.exit(1)
except Exception as e: except Exception as e:
print(f"ERROR: Generation failed: {e}", file=sys.stderr) print(f"ERROR: {e}", file=sys.stderr)
import traceback import traceback
traceback.print_exc() traceback.print_exc()
sys.exit(1) sys.exit(1)

View file

@ -0,0 +1,37 @@
#!/usr/bin/env python3
"""
Core DSL code generation library that can be imported by different generator binaries.
"""
import os
import importlib
def generate_dsl_package(module_path: str, graph_attr: str, output_dir: str):
"""
Generate DataBuild DSL package from a graph definition.
Args:
module_path: Python module path (e.g., "databuild.test.app.dsl.graph")
graph_attr: Name of the graph attribute in the module
output_dir: Directory where to generate the DSL package
"""
# Extract the base name from the output directory for naming
name = os.path.basename(output_dir.rstrip('/')) or "graph"
try:
# Import the graph module
module = importlib.import_module(module_path)
graph = getattr(module, graph_attr)
# Generate the bazel package
graph.generate_bazel_package(name, output_dir)
print(f"Generated DataBuild DSL package in {output_dir}")
except ImportError as e:
raise ImportError(f"Failed to import {graph_attr} from {module_path}: {e}")
except AttributeError as e:
raise AttributeError(f"Module {module_path} does not have attribute {graph_attr}: {e}")
except Exception as e:
raise Exception(f"Generation failed: {e}")

View file

@ -970,9 +970,80 @@ def databuild_dsl_generator(
visibility = visibility, visibility = visibility,
) )
def _generate_custom_generator_script(module_path, graph_attr, package_path):
"""Generate the custom generator script content with embedded parameters."""
return """#!/usr/bin/env python3
import os
import sys
# Setup runfiles for proper module resolution
# Try to find the runfiles directory relative to this script
script_path = os.path.abspath(__file__)
runfiles_dir = script_path + '.runfiles'
# Debug: Runfiles path setup
# print(f"DEBUG: Script path: {{script_path}}", file=sys.stderr)
# print(f"DEBUG: Looking for runfiles at: {{runfiles_dir}}", file=sys.stderr)
if os.path.exists(runfiles_dir):
# Found runfiles directory, add _main to Python path
main_runfiles_path = os.path.join(runfiles_dir, '_main')
if os.path.exists(main_runfiles_path):
sys.path.insert(0, main_runfiles_path)
# Successfully added main runfiles path
else:
print("DEBUG: _main directory not found in runfiles", file=sys.stderr)
# Also add pip package runfiles to Python path
for entry in os.listdir(runfiles_dir):
if entry.startswith('rules_python++pip+') and os.path.isdir(os.path.join(runfiles_dir, entry)):
pip_site_packages = os.path.join(runfiles_dir, entry, 'site-packages')
if os.path.exists(pip_site_packages):
sys.path.insert(0, pip_site_packages)
# Successfully added pip package path
else:
print("DEBUG: Runfiles directory not found, using workspace root", file=sys.stderr)
# If runfiles not available, we're probably running in development
# Add the workspace root to the path
workspace_root = os.environ.get('BUILD_WORKSPACE_DIRECTORY')
if workspace_root:
sys.path.insert(0, workspace_root)
# Successfully added workspace root as fallback
from databuild.dsl.python.generator_lib import generate_dsl_package
def main():
# Determine output directory
workspace_root = os.environ.get('BUILD_WORKSPACE_DIRECTORY')
if workspace_root:
# Running with bazel run - write to source tree
output_dir = os.path.join(workspace_root, '{package_path}')
else:
# Running with bazel build - write to current directory (bazel-bin)
output_dir = '.'
print(f"Generating DataBuild DSL code to {{output_dir}}")
try:
generate_dsl_package('{module_path}', '{graph_attr}', output_dir)
except Exception as e:
print(f"ERROR: {{e}}", file=sys.stderr)
import traceback
traceback.print_exc()
sys.exit(1)
if __name__ == "__main__":
main()
""".format(
module_path=module_path,
graph_attr=graph_attr,
package_path=package_path,
)
def _databuild_dsl_generator_impl(ctx): def _databuild_dsl_generator_impl(ctx):
"""Implementation of the DSL generator rule.""" """Implementation of the DSL generator rule."""
script = ctx.actions.declare_file(ctx.label.name + "_generator.py") # Create custom generator script
custom_generator = ctx.actions.declare_file(ctx.label.name + "_generator.py")
# Get the module path from the graph file # Get the module path from the graph file
graph_file_path = ctx.file.graph_file.short_path graph_file_path = ctx.file.graph_file.short_path
@ -983,109 +1054,44 @@ def _databuild_dsl_generator_impl(ctx):
# Get the package path for output # Get the package path for output
package_path = ctx.attr.output_package.strip("//").replace(":", "/") package_path = ctx.attr.output_package.strip("//").replace(":", "/")
script_content = """#!/usr/bin/env python3 # Generate script content with embedded parameters
import os script_content = _generate_custom_generator_script(
import sys module_path=module_path,
graph_attr=ctx.attr.graph_attr,
# Determine output directory package_path=package_path
workspace_root = os.environ.get('BUILD_WORKSPACE_DIRECTORY')
if workspace_root:
# Running with bazel run - write to source tree
output_dir = os.path.join(workspace_root, '{package_path}')
else:
# Running with bazel build - write to current directory (bazel-bin)
output_dir = '.'
print(f"Generating DataBuild DSL code to {{output_dir}}")
try:
# Setup runfiles for module resolution
try:
from python.runfiles import runfiles
r = runfiles.Create()
if r:
generator_path = r.Rlocation("databuild/databuild/dsl/python/generator")
else:
raise ImportError("runfiles not available")
except ImportError:
# Fallback - assume generator is in the same directory as this script
import subprocess
import shutil
# Try to find the generator binary using bazel
generator_path = shutil.which("bazel")
if generator_path:
import subprocess
result = subprocess.run([
"bazel", "run", "@databuild//databuild/dsl/python:generator", "--",
"{module_path}", "{graph_attr}", output_dir
], cwd=workspace_root if workspace_root else ".", capture_output=True, text=True)
if result.returncode != 0:
print(f"ERROR: Generation failed:", file=sys.stderr)
print(result.stderr, file=sys.stderr)
sys.exit(1)
print(result.stdout)
sys.exit(0)
else:
print("ERROR: Could not find bazel to run generator", file=sys.stderr)
sys.exit(1)
# Run the generator with proper module resolution
import subprocess
result = subprocess.run([
generator_path, "{module_path}", "{graph_attr}", output_dir
], capture_output=True, text=True)
if result.returncode != 0:
print(f"ERROR: Generation failed:", file=sys.stderr)
print(result.stderr, file=sys.stderr)
sys.exit(1)
print(result.stdout)
except Exception as e:
print(f"ERROR: Generation failed: {{e}}", file=sys.stderr)
import traceback
traceback.print_exc()
sys.exit(1)
""".format(
package_path = package_path,
module_path = module_path,
graph_attr = ctx.attr.graph_attr,
name = ctx.attr.name.replace(".generate", ""),
) )
ctx.actions.write( ctx.actions.write(
output = script, output=custom_generator,
content = script_content, content=script_content,
is_executable = True, is_executable=True,
) )
# Create runfiles with all dependencies # Create runfiles with all dependencies
runfiles = ctx.runfiles( runfiles = ctx.runfiles(files=[custom_generator, ctx.file.graph_file])
files = [ctx.file.graph_file, ctx.executable._generator],
)
# Merge runfiles from all dependencies # Merge runfiles from all user-specified dependencies
for dep in ctx.attr.deps: for dep in ctx.attr.deps:
if hasattr(dep, "default_runfiles"): if hasattr(dep, "default_runfiles"):
runfiles = runfiles.merge(dep.default_runfiles) runfiles = runfiles.merge(dep.default_runfiles)
# Add protobuf dependencies # Include generator_lib and its dependencies
if hasattr(ctx.attr._py_proto, "default_runfiles"): if hasattr(ctx.attr._generator_lib, "default_runfiles"):
runfiles = runfiles.merge(ctx.attr._py_proto.default_runfiles) runfiles = runfiles.merge(ctx.attr._generator_lib.default_runfiles)
# Add generator runfiles # Explicitly include py_proto dependencies since we removed the _py_proto attribute
runfiles = runfiles.merge(ctx.attr._generator.default_runfiles) # but generator_lib still needs it
for py_proto_dep in [ctx.attr._py_proto]:
if hasattr(py_proto_dep, "default_runfiles"):
runfiles = runfiles.merge(py_proto_dep.default_runfiles)
# Also add Python runfiles for proper module resolution # Add Python runfiles for proper module resolution
runfiles = runfiles.merge(ctx.attr._python_runfiles.default_runfiles) if hasattr(ctx.attr._python_runfiles, "default_runfiles"):
runfiles = runfiles.merge(ctx.attr._python_runfiles.default_runfiles)
return [DefaultInfo( return [DefaultInfo(
executable = script, executable=custom_generator,
runfiles = runfiles, runfiles=runfiles,
)] )]
_databuild_dsl_generator_rule = rule( _databuild_dsl_generator_rule = rule(
@ -1112,14 +1118,12 @@ _databuild_dsl_generator_rule = rule(
default = "@rules_python//python/runfiles", default = "@rules_python//python/runfiles",
allow_files = True, allow_files = True,
), ),
"_generator_lib": attr.label(
default = "@databuild//databuild/dsl/python:generator_lib",
),
"_py_proto": attr.label( "_py_proto": attr.label(
default = "//databuild:py_proto", default = "//databuild:py_proto",
), ),
"_generator": attr.label(
default = "@databuild//databuild/dsl/python:generator",
executable = True,
cfg = "target",
),
}, },
executable = True, executable = True,
) )

View file

@ -0,0 +1,164 @@
# Plan 17: Python DSL Generator Fix
## Problem Statement
The `databuild_dsl_generator` rule currently fails when trying to generate DSL code because it cannot properly provide dependencies to the underlying generator script. The issue manifests as import errors like "No module named 'databuild.test'" or "No module named 'betterproto2'".
## Root Cause Analysis
The current implementation has two separate approaches that don't work well together:
1. **Direct py_binary approach** (`//databuild/dsl/python:generator`) - Works but requires hard-coding dependencies
2. **DSL generator rule approach** (`databuild_dsl_generator`) - Should allow dynamic deps but has runfiles propagation issues
The core issue is that when `databuild_dsl_generator` creates a wrapper script that calls the py_binary generator via subprocess, the dependencies specified in the rule's `deps` attribute aren't available in the Python environment of the subprocess.
## Current State
- `databuild_dsl_generator` rule exists in `databuild/rules.bzl` (lines 944-1125)
- Generic `generator.py` exists in `databuild/dsl/python/generator.py`
- py_binary generator exists in `databuild/dsl/python/BUILD.bazel` (lines 10-22)
- Test case: `//databuild/test/app/dsl:graph.generate`
## Solution Design: Custom py_binary per DSL
Instead of using a generic generator + wrapper script approach, create a custom py_binary for each `databuild_dsl_generator` target that includes the specific dependencies.
### Architecture Changes
1. **Remove subprocess approach**: Stop calling the generic py_binary via subprocess
2. **Create custom py_binary**: Generate a py_binary rule dynamically with the specific deps
3. **Embed generator logic**: Include the generator logic directly in the custom binary
4. **Proper runfiles**: Ensure all dependencies flow through Bazel's runfiles mechanism
## Implementation Plan
### Phase 1: Restructure Generator Logic
1. **Extract core generator logic** from `databuild/dsl/python/generator.py` into a library:
```python
# databuild/dsl/python/generator_lib.py
def generate_dsl_package(module_path: str, graph_attr: str, output_dir: str):
# Move the core logic here
```
2. **Create generator library target** in `databuild/dsl/python/BUILD.bazel`:
```python
py_library(
name = "generator_lib",
srcs = ["generator_lib.py"],
deps = [":dsl", "//databuild:py_proto"],
)
```
3. **Update standalone generator** to use the library:
```python
# databuild/dsl/python/generator.py
from databuild.dsl.python.generator_lib import generate_dsl_package
def main():
generate_dsl_package(sys.argv[1], sys.argv[2], sys.argv[3])
```
### Phase 2: Modify DSL Generator Rule
4. **Update `_databuild_dsl_generator_impl`** in `databuild/rules.bzl`:
- Remove the subprocess-based wrapper script approach
- Create a custom py_binary that includes the generator library + user deps
- Generate the custom binary's source code with the specific module/attr parameters
5. **New implementation structure**:
```python
def _databuild_dsl_generator_impl(ctx):
# Create custom generator script
custom_generator = ctx.actions.declare_file(ctx.label.name + "_custom_generator.py")
# Generate script content with embedded parameters
script_content = generate_custom_generator_script(
module_path=module_path,
graph_attr=ctx.attr.graph_attr,
package_path=package_path
)
ctx.actions.write(output=custom_generator, content=script_content)
# Create runfiles with all dependencies
runfiles = ctx.runfiles(files=[custom_generator, ctx.file.graph_file])
for dep in ctx.attr.deps:
runfiles = runfiles.merge(dep.default_runfiles)
# Include generator_lib and py_proto dependencies
runfiles = runfiles.merge(ctx.attr._generator_lib.default_runfiles)
runfiles = runfiles.merge(ctx.attr._py_proto.default_runfiles)
return [DefaultInfo(executable=custom_generator, runfiles=runfiles)]
```
### Phase 3: Update Rule Attributes
6. **Add generator_lib dependency** to rule attributes:
```python
"_generator_lib": attr.label(
default = "@databuild//databuild/dsl/python:generator_lib",
),
```
7. **Remove obsolete attributes** such as `_generator` and `_betterproto2`, which the new implementation no longer uses
### Phase 4: Custom Generator Script Template
8. **Create template for custom generator scripts**:
```python
def generate_custom_generator_script(module_path, graph_attr, package_path):
return f"""#!/usr/bin/env python3
import os
import sys
from databuild.dsl.python.generator_lib import generate_dsl_package
def main():
workspace_root = os.environ.get('BUILD_WORKSPACE_DIRECTORY')
if workspace_root:
output_dir = os.path.join(workspace_root, '{package_path}')
else:
output_dir = '.'
try:
generate_dsl_package('{module_path}', '{graph_attr}', output_dir)
except Exception as e:
print(f"ERROR: Generation failed: {{e}}", file=sys.stderr)
sys.exit(1)
if __name__ == "__main__":
main()
"""
```
## Testing Strategy
1. **Unit test the generator library** separately from the rule
2. **Test with existing DSL** (`//databuild/test/app/dsl:graph.generate`)
3. **Verify dependency isolation** - ensure different DSL generators don't interfere
4. **Test pip dependency propagation** - ensure betterproto2 and other external deps work
## Success Criteria
- [ ] `bazel run //databuild/test/app/dsl:graph.generate` works without import errors
- [ ] Users can specify `deps = [":their_dsl_src"]` and have those deps available to generator
- [ ] No hard-coded dependencies in the core generator components
- [ ] Proper error messages when imports fail
- [ ] Generated code works with existing DataBuild infrastructure
## Risks and Mitigations
**Risk**: Bazel rule complexity increases
**Mitigation**: Extract reusable functions, add comprehensive comments
**Risk**: Runfiles might still not propagate correctly
**Mitigation**: Test with various dependency types early in implementation
## Files to Modify
- `databuild/dsl/python/BUILD.bazel` - Add generator_lib target
- `databuild/dsl/python/generator_lib.py` - New file with extracted logic
- `databuild/dsl/python/generator.py` - Simplify to use library
- `databuild/rules.bzl` - Rewrite `_databuild_dsl_generator_impl`
- `databuild/BUILD.bazel` - Ensure betterproto2 is properly included in py_proto