databuild/databuild/dsl/python/dsl_job_wrapper.py
#!/usr/bin/env python3
"""
Shared DSL job wrapper that can execute any DataBuildJob defined in a DSL graph.
Configured via environment variables:
- DATABUILD_DSL_GRAPH_MODULE: Python module path containing the graph (e.g., 'databuild.test.app.dsl.graph')
- DATABUILD_JOB_CLASS: Job class name to execute (e.g., 'IngestColorVotes')
"""
import sys
import json
import os
import importlib
from typing import List, Any

from databuild.proto import DataDep, JobConfig, PartitionRef


def parse_outputs_from_args(args: List[str], job_class: Any) -> List[Any]:
    """Parse partition output references from command line arguments into partition objects."""
    outputs = []
    for arg in args:
        # Find which output type can deserialize this partition reference
        for output_type in job_class.output_types:
            try:
                partition = output_type.deserialize(arg)
                outputs.append(partition)
                break
            except ValueError:
                continue
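        # for/else: the else clause runs only if no output type matched (no break)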
        else:
            raise ValueError(f"No output type in {job_class.__name__} can deserialize partition ref: {arg}")
    return outputs
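

# For reference, each output type is expected to expose a `deserialize` method
# (called above as `output_type.deserialize(arg)`) that raises ValueError for
# refs it does not recognize. A minimal sketch of that contract, with a
# hypothetical partition type not defined in this module:
#
#   class ColorVotesPartition:
#       @classmethod
#       def deserialize(cls, ref: str) -> "ColorVotesPartition":
#           if not ref.startswith("color_votes/"):
#               raise ValueError(f"unrecognized ref: {ref}")
#           return cls(ref.split("/", 1)[1])

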
def main():
    if len(sys.argv) < 2:
        print("Usage: dsl_job_wrapper.py <config|exec> [args...]", file=sys.stderr)
        sys.exit(1)
    command = sys.argv[1]

    # Read configuration from environment
    graph_module_path = os.environ.get('DATABUILD_DSL_GRAPH_MODULE')
    job_class_name = os.environ.get('DATABUILD_JOB_CLASS')
    if not graph_module_path:
        print("ERROR: DATABUILD_DSL_GRAPH_MODULE environment variable not set", file=sys.stderr)
        sys.exit(1)
    if not job_class_name:
        print("ERROR: DATABUILD_JOB_CLASS environment variable not set", file=sys.stderr)
        sys.exit(1)
    try:
        # Import the graph module; fetching `graph` also validates that the
        # module actually defines one (AttributeError is raised otherwise)
        module = importlib.import_module(graph_module_path)
        graph = getattr(module, 'graph')
        # Get the job class and create an instance
        job_class = getattr(module, job_class_name)
        job_instance = job_class()
    except (ImportError, AttributeError) as e:
        print(f"ERROR: Failed to load job {job_class_name} from {graph_module_path}: {e}", file=sys.stderr)
        sys.exit(1)
    if command == "config":
        try:
            # Parse output partition references from remaining args
            output_refs = sys.argv[2:]
            if not output_refs:
                print("ERROR: No output partition references provided", file=sys.stderr)
                sys.exit(1)
            outputs = parse_outputs_from_args(output_refs, job_class)

            # Call job's config method
            configs = job_instance.config(outputs)

            # Output each config as JSON (one per line for multiple configs)
            for config in configs:
                # Convert JobConfig to dict for JSON serialization
                config_dict = {
                    'outputs': [{'str': ref.str} for ref in config.outputs],
                    'inputs': [
                        {
                            'dep_type_code': dep.dep_type_code,
                            'dep_type_name': dep.dep_type_name,
                            'partition_ref': {'str': dep.partition_ref.str}
                        } for dep in config.inputs
                    ],
                    'args': config.args,
                    'env': config.env,
                }
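                # Example emitted line (shape per the dict above; values illustrative):
                #   {"outputs": [{"str": "..."}], "inputs": [], "args": [...], "env": {...}}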
                print(json.dumps(config_dict))
        except Exception as e:
            print(f"ERROR: Config failed: {e}", file=sys.stderr)
            sys.exit(1)
    elif command == "exec":
        try:
            # Read config from stdin
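            # Expects exactly one JSON config (e.g., a single line of `config`
            # output); multiple concatenated configs would fail json.loads below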
            config_json = sys.stdin.read().strip()
            if not config_json:
                print("ERROR: No config provided on stdin", file=sys.stderr)
                sys.exit(1)
            config_dict = json.loads(config_json)

            # Convert dict back to JobConfig
            config = JobConfig(
                outputs=[PartitionRef(str=ref['str']) for ref in config_dict['outputs']],
                inputs=[
                    DataDep(
                        dep_type_code=dep['dep_type_code'],
                        dep_type_name=dep['dep_type_name'],
                        partition_ref=PartitionRef(str=dep['partition_ref']['str'])
                    ) for dep in config_dict['inputs']
                ],
                args=config_dict['args'],
                env=config_dict['env'],
            )

            # Call job's exec method
            job_instance.exec(config)
        except Exception as e:
            print(f"ERROR: Execution failed: {e}", file=sys.stderr)
            sys.exit(1)
    else:
        print(f"ERROR: Unknown command '{command}'. Use 'config' or 'exec'.", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()