142 lines
No EOL
5 KiB
Python
142 lines
No EOL
5 KiB
Python
#!/usr/bin/env python3
|
|
"""
Shared DSL job wrapper that can execute any DataBuildJob defined in a DSL graph.

Configured via environment variables:
- DATABUILD_DSL_GRAPH_MODULE: Python module path containing the graph (e.g., 'databuild.test.app.dsl.graph')
- DATABUILD_JOB_CLASS: Job class name to execute (e.g., 'IngestColorVotes')
"""
|
|
|
|
import sys
|
|
import json
|
|
import os
|
|
import importlib
|
|
from typing import List, Any
|
|
from databuild.proto import JobConfig
|
|
|
|
|
|
def parse_outputs_from_args(args: List[str], job_class: Any) -> List[Any]:
    """Parse partition output references from command line arguments into partition objects.

    Each reference is offered to every entry of ``job_class.output_types`` in
    order; the first type whose ``deserialize`` call does not raise
    ``ValueError`` produces the partition object for that reference.

    Args:
        args: Partition reference strings, one per requested output.
        job_class: Object exposing ``output_types`` (an iterable of types that
            provide ``deserialize(str)``) and ``__name__`` (used in the error
            message).

    Returns:
        One deserialized partition object per entry of ``args``, in the same
        order.

    Raises:
        ValueError: If none of the output types can deserialize a reference.
    """
    outputs = []
    for arg in args:
        # Find which output type can deserialize this partition reference
        for output_type in job_class.output_types:
            try:
                partition = output_type.deserialize(arg)
            except ValueError:
                # This type does not recognise the reference; try the next one.
                continue
            outputs.append(partition)
            break
        else:
            # The inner loop finished without `break`: nothing matched.
            raise ValueError(f"No output type in {job_class.__name__} can deserialize partition ref: {arg}")

    return outputs
|
|
|
|
|
|
def _fail(message: str) -> None:
    """Print ``ERROR: <message>`` to stderr and terminate with exit status 1."""
    print(f"ERROR: {message}", file=sys.stderr)
    sys.exit(1)


def _load_job(graph_module_path: str, job_class_name: str):
    """Import the graph module and return ``(job_class, job_instance)``.

    Exits with status 1 if the module cannot be imported, does not define
    ``graph``, or does not define ``job_class_name``.
    """
    try:
        # Import the graph module
        module = importlib.import_module(graph_module_path)

        # The module must expose a `graph` attribute; it is only checked for
        # existence here, its value is not used by this wrapper.
        if not hasattr(module, 'graph'):
            raise AttributeError(f"module '{graph_module_path}' has no attribute 'graph'")

        # Get the job class and create the job instance
        job_class = getattr(module, job_class_name)
        job_instance = job_class()
    except (ImportError, AttributeError) as e:
        _fail(f"Failed to load job {job_class_name} from {graph_module_path}: {e}")

    return job_class, job_instance


def _job_config_to_dict(config: Any) -> dict:
    """Convert a JobConfig into a plain dict suitable for ``json.dumps``."""
    return {
        'outputs': [{'str': ref.str} for ref in config.outputs],
        'inputs': [
            {
                'dep_type_code': dep.dep_type_code,
                'dep_type_name': dep.dep_type_name,
                'partition_ref': {'str': dep.partition_ref.str},
            }
            for dep in config.inputs
        ],
        'args': config.args,
        'env': config.env,
    }


def _job_config_from_dict(config_dict: dict) -> Any:
    """Rebuild a JobConfig from the dict layout produced by ``_job_config_to_dict``."""
    # Imported here so every proto type needed for deserialization is resolved
    # in one place, only when the `exec` command actually runs.
    from databuild.proto import DataDep, JobConfig, PartitionRef

    return JobConfig(
        outputs=[PartitionRef(str=ref['str']) for ref in config_dict['outputs']],
        inputs=[
            DataDep(
                dep_type_code=dep['dep_type_code'],
                dep_type_name=dep['dep_type_name'],
                partition_ref=PartitionRef(str=dep['partition_ref']['str']),
            )
            for dep in config_dict['inputs']
        ],
        args=config_dict['args'],
        env=config_dict['env'],
    )


def _run_config(job_class: Any, job_instance: Any, output_refs: List[str]) -> None:
    """Handle ``config``: print one JSON-encoded job config per line on stdout."""
    try:
        if not output_refs:
            _fail("No output partition references provided")

        outputs = parse_outputs_from_args(output_refs, job_class)

        # Call job's config method
        configs = job_instance.config(outputs)

        # Output each config as JSON (one per line for multiple configs)
        for config in configs:
            print(json.dumps(_job_config_to_dict(config)))
    except Exception as e:
        # SystemExit raised by _fail is not an Exception subclass, so it
        # propagates past this handler untouched.
        _fail(f"Config failed: {e}")


def _run_exec(job_instance: Any) -> None:
    """Handle ``exec``: read a JSON job config from stdin and run the job with it."""
    try:
        # Read config from stdin
        config_json = sys.stdin.read().strip()
        if not config_json:
            _fail("No config provided on stdin")

        # Convert dict back to JobConfig, then call job's exec method
        config = _job_config_from_dict(json.loads(config_json))
        job_instance.exec(config)
    except Exception as e:
        _fail(f"Execution failed: {e}")


def main():
    """Command line entry point: ``dsl_job_wrapper.py <config|exec> [args...]``.

    ``config`` takes output partition references as the remaining arguments and
    prints the job's configs as JSON lines. ``exec`` reads one JSON config from
    stdin and executes the job. The job is located through the
    ``DATABUILD_DSL_GRAPH_MODULE`` and ``DATABUILD_JOB_CLASS`` environment
    variables.

    Every failure is reported on stderr and ends the process with exit
    status 1.
    """
    if len(sys.argv) < 2:
        print("Usage: dsl_job_wrapper.py <config|exec> [args...]", file=sys.stderr)
        sys.exit(1)

    command = sys.argv[1]

    # Reject an unknown command up front, before importing the graph module or
    # instantiating the job, so a typo never triggers job side effects.
    if command not in ("config", "exec"):
        _fail(f"Unknown command '{command}'. Use 'config' or 'exec'.")

    # Read configuration from environment
    graph_module_path = os.environ.get('DATABUILD_DSL_GRAPH_MODULE')
    job_class_name = os.environ.get('DATABUILD_JOB_CLASS')

    if not graph_module_path:
        _fail("DATABUILD_DSL_GRAPH_MODULE environment variable not set")

    if not job_class_name:
        _fail("DATABUILD_JOB_CLASS environment variable not set")

    job_class, job_instance = _load_job(graph_module_path, job_class_name)

    if command == "config":
        _run_config(job_class, job_instance, sys.argv[2:])
    else:
        _run_exec(job_instance)
|
|
|
|
|
|
# Run the wrapper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()