commit ba18734190 (parent f6e6dad32c)
9 changed files with 8562 additions and 18 deletions
@@ -84,12 +84,13 @@ class DataBuildGraph:
         """Generates a complete databuild application, packaging up referenced jobs and this graph via bazel targets"""
         raise NotImplementedError

-    def generate_bazel_package(self, name: str, output_dir: str) -> None:
+    def generate_bazel_package(self, name: str, output_dir: str, deps: list = None) -> None:
         """Generate BUILD.bazel and binaries into a generated/ subdirectory.

         Args:
             name: Base name for the generated graph (without .generate suffix)
             output_dir: Directory to write generated files to (will create generated/ subdir)
+            deps: List of Bazel dependency labels to use in generated BUILD.bazel
         """
         import os
         import shutil
@@ -99,7 +100,7 @@ class DataBuildGraph:
         os.makedirs(generated_dir, exist_ok=True)

         # Generate BUILD.bazel with job and graph targets
-        self._generate_build_bazel(generated_dir, name)
+        self._generate_build_bazel(generated_dir, name, deps or [])

         # Generate individual job scripts (instead of shared wrapper)
         self._generate_job_scripts(generated_dir)
@@ -114,15 +115,20 @@ class DataBuildGraph:
         else:
             print(f"Run 'bazel build generated:{name}_graph.analyze' to use the generated graph")

-    def _generate_build_bazel(self, output_dir: str, name: str) -> None:
+    def _generate_build_bazel(self, output_dir: str, name: str, deps: list) -> None:
         """Generate BUILD.bazel with databuild_job and databuild_graph targets."""
         import os

         # Get job classes from the lookup table
         job_classes = list(set(self.lookup.values()))

-        # Get parent package for dependencies
-        parent_package = self._get_package_name()
+        # Format deps for BUILD.bazel
+        if deps:
+            deps_str = ", ".join([f'"{dep}"' for dep in deps])
+        else:
+            # Fallback to parent package if no deps provided
+            parent_package = self._get_package_name()
+            deps_str = f'"//{parent_package}:dsl_src"'

         # Generate py_binary targets for each job
         job_binaries = []
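The `deps_str` formatting is simple enough to check standalone; a minimal sketch with made-up labels (only the join expression comes from this diff):

```python
# Standalone illustration of the deps_str formatting in _generate_build_bazel;
# the dependency labels here are made-up examples.
deps = ["@databuild//databuild/dsl/python:dsl", "//my/pkg:dsl_src"]
deps_str = ", ".join([f'"{dep}"' for dep in deps])
print(f"deps = [{deps_str}],")
# -> deps = ["@databuild//databuild/dsl/python:dsl", "//my/pkg:dsl_src"],
```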
@@ -138,7 +144,7 @@ class DataBuildGraph:
     name = "{binary_name}",
     srcs = ["{job_script_name}"],
     main = "{job_script_name}",
-    deps = ["//{parent_package}:dsl_src"],
+    deps = [{deps_str}],
 )

 databuild_job(
@@ -157,7 +163,7 @@ databuild_job(
 py_binary(
     name = "{name}_job_lookup",
     srcs = ["{name}_job_lookup.py"],
-    deps = ["//{parent_package}:dsl_src"],
+    deps = [{deps_str}],
 )

 databuild_graph(
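Taken together, callers of the DSL can now steer the generated BUILD.bazel deps; a hedged sketch of direct use (the graph name, output directory, and labels are illustrative, not from this commit):

```python
# Minimal sketch of the new deps parameter on generate_bazel_package.
from databuild.dsl.python.dsl import DataBuildGraph

graph = DataBuildGraph("//:example_graph")  # illustrative label

# Explicit deps: the generated BUILD.bazel references these labels.
graph.generate_bazel_package(
    name="example_graph",
    output_dir="examples/example",
    deps=["@databuild//databuild/dsl/python:dsl"],
)

# No deps: _generate_build_bazel falls back to "//{parent_package}:dsl_src".
graph.generate_bazel_package(name="example_graph", output_dir="examples/example")
```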
@@ -7,7 +7,7 @@ import os
 import importlib


-def generate_dsl_package(module_path: str, graph_attr: str, output_dir: str):
+def generate_dsl_package(module_path: str, graph_attr: str, output_dir: str, deps: list = None):
     """
     Generate DataBuild DSL package from a graph definition.
@@ -15,6 +15,7 @@ def generate_dsl_package(module_path: str, graph_attr: str, output_dir: str):
         module_path: Python module path (e.g., "databuild.test.app.dsl.graph")
         graph_attr: Name of the graph attribute in the module
         output_dir: Directory where to generate the DSL package
+        deps: List of Bazel dependency labels to use in generated BUILD.bazel
     """
     # Extract the base name from the output directory for naming
     name = os.path.basename(output_dir.rstrip('/')) or "graph"
@@ -25,7 +26,7 @@ def generate_dsl_package(module_path: str, graph_attr: str, output_dir: str):
     graph = getattr(module, graph_attr)

     # Generate the bazel package
-    graph.generate_bazel_package(name, output_dir)
+    graph.generate_bazel_package(name, output_dir, deps or [])

     print(f"Generated DataBuild DSL package in {output_dir}")
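This wrapper threads `deps` straight through to `generate_bazel_package`. A hedged sketch of a direct call follows; the generator module's import path is not visible in this diff, so it stays a placeholder:

```python
# Hedged sketch: calling the updated entry point directly.
# The import path is a placeholder; only the signature comes from this diff.
# from <generator_module> import generate_dsl_package

generate_dsl_package(
    module_path="databuild.test.app.dsl.graph",  # example path from the docstring
    graph_attr="graph",
    output_dir="examples/simple_python_dsl",
    deps=["//:simple_dsl_src"],  # forwarded into the generated BUILD.bazel deps
)
```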
@@ -970,7 +970,7 @@ def databuild_dsl_generator(
         visibility = visibility,
     )

-def _generate_custom_generator_script(module_path, graph_attr, package_path):
+def _generate_custom_generator_script(module_path, graph_attr, package_path, deps):
     """Generate the custom generator script content with embedded parameters."""
     return """#!/usr/bin/env python3
 import os
@@ -981,9 +981,8 @@ import sys
 script_path = os.path.abspath(__file__)
 runfiles_dir = script_path + '.runfiles'

-# Debug: Runfiles path setup
-# print(f"DEBUG: Script path: {{script_path}}", file=sys.stderr)
-# print(f"DEBUG: Looking for runfiles at: {{runfiles_dir}}", file=sys.stderr)
+# Debug: Runfiles path setup for cross-workspace usage
+# Setting up runfiles paths for cross-workspace usage

 if os.path.exists(runfiles_dir):
     # Found runfiles directory, add _main to Python path
@@ -991,8 +990,19 @@ if os.path.exists(runfiles_dir):
     if os.path.exists(main_runfiles_path):
         sys.path.insert(0, main_runfiles_path)
         # Successfully added main runfiles path

+        # Check what other directories exist in runfiles for cross-workspace usage
+        # All runfiles directories available
     else:
-        print("DEBUG: _main directory not found in runfiles", file=sys.stderr)
+        # _main directory not found in runfiles
+        pass
+
+    # Add external repository runfiles (like databuild+) for cross-workspace usage
+    for entry in os.listdir(runfiles_dir):
+        if entry.endswith('+') and os.path.isdir(os.path.join(runfiles_dir, entry)):
+            external_path = os.path.join(runfiles_dir, entry)
+            sys.path.insert(0, external_path)
+            # Added external repository path

     # Also add pip package runfiles to Python path
     for entry in os.listdir(runfiles_dir):
@@ -1000,9 +1010,9 @@ if os.path.exists(runfiles_dir):
         pip_site_packages = os.path.join(runfiles_dir, entry, 'site-packages')
         if os.path.exists(pip_site_packages):
             sys.path.insert(0, pip_site_packages)
-            # Successfully added pip package path
+            # Added pip package path
 else:
-    print("DEBUG: Runfiles directory not found, using workspace root", file=sys.stderr)
+    # Runfiles directory not found, falling back to workspace root
     # If runfiles not available, we're probably running in development
     # Add the workspace root to the path
     workspace_root = os.environ.get('BUILD_WORKSPACE_DIRECTORY')
@@ -1025,7 +1035,7 @@ def main():
     print(f"Generating DataBuild DSL code to {{output_dir}}")

     try:
-        generate_dsl_package('{module_path}', '{graph_attr}', output_dir)
+        generate_dsl_package('{module_path}', '{graph_attr}', output_dir, {deps})
     except Exception as e:
         print(f"ERROR: {{e}}", file=sys.stderr)
         import traceback
@@ -1038,6 +1048,7 @@ if __name__ == "__main__":
     module_path=module_path,
     graph_attr=graph_attr,
     package_path=package_path,
+    deps=deps,
 )

 def _databuild_dsl_generator_impl(ctx):
@@ -1055,10 +1066,14 @@ def _databuild_dsl_generator_impl(ctx):
     package_path = ctx.attr.output_package.strip("//").replace(":", "/")

     # Generate script content with embedded parameters
+    # Convert deps to list of strings
+    dep_labels = [str(dep.label) for dep in ctx.attr.deps] if ctx.attr.deps else []
+
     script_content = _generate_custom_generator_script(
         module_path=module_path,
         graph_attr=ctx.attr.graph_attr,
-        package_path=package_path
+        package_path=package_path,
+        deps=dep_labels
     )

     ctx.actions.write(
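One subtlety in `_generate_custom_generator_script`: the returned script is a format template, so `{deps}` is substituted (a Python list lands in the script as its repr, which is a valid list literal) while doubled braces like `{{e}}` survive as literal braces. A rough standalone illustration, assuming `.format()`-style substitution as the doubled braces imply:

```python
# Rough illustration (assumption: .format()-style templating, as the doubled
# braces in the generated script suggest). Values here are examples.
template = """#!/usr/bin/env python3
import sys

try:
    generate_dsl_package('{module_path}', '{graph_attr}', output_dir, {deps})
except Exception as e:
    print(f"ERROR: {{e}}", file=sys.stderr)  # doubled braces become literal braces
"""
script = template.format(
    module_path="databuild.test.app.dsl.graph",
    graph_attr="graph",
    deps=["//:simple_dsl_src"],  # the list's repr is itself valid Python
)
print(script)
```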
examples/simple_python_dsl/BUILD.bazel (new file, 22 lines)
@@ -0,0 +1,22 @@
+load("@databuild//databuild:rules.bzl", "databuild_dsl_generator")
+
+# Python DSL library containing the graph definition
+py_library(
+    name = "simple_dsl_src",
+    srcs = ["simple_graph.py"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "@databuild//databuild:py_proto",
+        "@databuild//databuild/dsl/python:dsl",
+    ],
+)
+
+# DSL generator that creates bazel targets from the Python DSL
+databuild_dsl_generator(
+    name = "simple_graph.generate",
+    graph_attr = "graph",
+    graph_file = "simple_graph.py",
+    output_package = "//",
+    visibility = ["//visibility:public"],
+    deps = [":simple_dsl_src"],
+)
examples/simple_python_dsl/MODULE.bazel (new file, 26 lines)
@@ -0,0 +1,26 @@
+module(
+    name = "simple_python_dsl_example",
+    version = "0.1",
+)
+
+# Databuild dep - overridden so ignore version
+bazel_dep(name = "databuild", version = "0.0")
+local_path_override(
+    module_name = "databuild",
+    path = "../..",
+)
+
+bazel_dep(name = "rules_python", version = "1.3.0")
+
+python = use_extension("@rules_python//python/extensions:python.bzl", "python")
+python.toolchain(
+    python_version = "3.12",
+)
+
+pip = use_extension("@rules_python//python/extensions:pip.bzl", "pip")
+pip.parse(
+    hub_name = "pypi",
+    python_version = "3.12",
+    requirements_lock = "//:requirements_lock.txt",
+)
+use_repo(pip, "pypi")
examples/simple_python_dsl/MODULE.bazel.lock (new file, 8261 lines)
File diff suppressed because one or more lines are too long
examples/simple_python_dsl/README.md (new file, 87 lines)
@@ -0,0 +1,87 @@
+# Simple Python DSL Example
+
+This example demonstrates how to use DataBuild's Python DSL to define a simple data processing pipeline.
+
+## Overview
+
+The example defines a basic 3-stage data processing pipeline:
+
+1. **IngestRawData**: Ingests raw data for a specific date
+2. **ProcessData**: Processes the raw data into a processed format
+3. **CreateSummary**: Creates summary statistics from processed data
+
+## Files
+
+- `simple_graph.py`: Python DSL definition of the data pipeline
+- `BUILD.bazel`: Bazel build configuration
+- `MODULE.bazel`: Bazel module configuration for dependencies
+
+## Usage
+
+### Generate DSL Targets
+
+The DSL generator can create Bazel targets from the Python DSL definition:
+
+```bash
+bazel run //:simple_graph.generate
+```
+
+This will generate Bazel targets in the `generated/` directory.
+
+### Build Individual Jobs
+
+```bash
+# Build a specific job
+bazel build //:ingest_raw_data
+
+# Build all jobs
+bazel build //:simple_graph
+```
+
+### Analyze the Graph
+
+```bash
+# Analyze what jobs would run for specific partitions
+bazel run //:simple_graph.analyze -- "summary/date=2024-01-01"
+```
+
+### Run the Graph
+
+```bash
+# Build specific partitions
+bazel run //:simple_graph.build -- "summary/date=2024-01-01"
+```
+
+## Cross-Workspace Usage
+
+This example can be consumed from external workspaces by adding DataBuild as a dependency in your `MODULE.bazel`:
+
+```starlark
+bazel_dep(name = "databuild", version = "0.0")
+local_path_override(
+    module_name = "databuild",
+    path = "path/to/databuild",
+)
+```
+
+Then you can reference and extend this example:
+
+```python
+from databuild.dsl.python.dsl import DataBuildGraph
+# Import and extend the simple graph
+```
+
+## Testing
+
+To test that the DSL generator works correctly:
+
+```bash
+# Test the DSL generation
+bazel run //:simple_graph.generate
+
+# Verify generated files exist
+ls generated/
+
+# Test job lookup
+bazel run //:job_lookup -- "raw_data/date=2024-01-01"
+```
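The README's cross-workspace snippet ends at a stub comment; a hedged sketch of what extending the graph could look like, reusing only the APIs shown in `simple_graph.py` below (the report partition, job, and import path are hypothetical):

```python
# Hypothetical extension of the example graph from another module.
# ReportPartition/BuildReport are made up; the DSL calls mirror simple_graph.py.
import os

from databuild.dsl.python.dsl import DataBuildJob, JobConfigBuilder, PartitionPattern
from databuild.proto import JobConfig
from simple_graph import graph, SummaryPartition  # import path is an assumption


class ReportPartition(PartitionPattern):
    """Hypothetical report partition derived from the daily summary"""
    _raw_pattern = r"report/date=(?P<date>\d{4}-\d{2}-\d{2})"

    def __init__(self, date: str):
        self.date = date

    def serialize(self) -> str:
        return f"report/date={self.date}"


@graph.job
class BuildReport(DataBuildJob):
    """Hypothetical job consuming the summary partition"""
    output_types = [ReportPartition]

    def config(self, outputs: list[ReportPartition]) -> list[JobConfig]:
        return [
            JobConfigBuilder()
            .add_outputs(output)
            .add_inputs(SummaryPartition(date=output.date))
            .set_env({"DATA_DATE": output.date})
            .add_args("--date", output.date)
            .build()
            for output in outputs
        ]

    def exec(self, *args: str) -> None:
        print(f"Building report for {os.environ['DATA_DATE']}")
```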
examples/simple_python_dsl/requirements_lock.txt (new file, 2 lines)
@@ -0,0 +1,2 @@
+# Simple Python DSL example - minimal requirements
+# This file would normally be generated by pip-compile
examples/simple_python_dsl/simple_graph.py (new file, 124 lines)
@@ -0,0 +1,124 @@
+"""
+Simple Python DSL example - basic data processing pipeline
+"""
+
+from databuild.dsl.python.dsl import DataBuildGraph, DataBuildJob, JobConfigBuilder, PartitionPattern
+from databuild.proto import JobConfig
+from datetime import date
+import os
+
+# Define the graph
+graph = DataBuildGraph("//:simple_graph")
+
+
+class RawDataPartition(PartitionPattern):
+    """Represents raw input data for a specific date"""
+    _raw_pattern = r"raw_data/date=(?P<date>\d{4}-\d{2}-\d{2})"
+
+    def __init__(self, date: str):
+        self.date = date
+
+    def serialize(self) -> str:
+        return f"raw_data/date={self.date}"
+
+
+class ProcessedDataPartition(PartitionPattern):
+    """Represents processed data for a specific date"""
+    _raw_pattern = r"processed_data/date=(?P<date>\d{4}-\d{2}-\d{2})"
+
+    def __init__(self, date: str):
+        self.date = date
+
+    def serialize(self) -> str:
+        return f"processed_data/date={self.date}"
+
+
+class SummaryPartition(PartitionPattern):
+    """Represents summary data for a specific date"""
+    _raw_pattern = r"summary/date=(?P<date>\d{4}-\d{2}-\d{2})"
+
+    def __init__(self, date: str):
+        self.date = date
+
+    def serialize(self) -> str:
+        return f"summary/date={self.date}"
+
+
+@graph.job
+class IngestRawData(DataBuildJob):
+    """Job to ingest raw data for a given date"""
+    output_types = [RawDataPartition]
+
+    def config(self, outputs: list[RawDataPartition]) -> list[JobConfig]:
+        configs = []
+        for output in outputs:
+            env = {"DATA_DATE": output.date}
+            configs.append(
+                JobConfigBuilder()
+                .add_outputs(output)
+                .set_env(env)
+                .add_args("--date", output.date)
+                .build()
+            )
+        return configs
+
+    def exec(self, *args: str) -> None:
+        # Simple implementation - just create a dummy file
+        data_date = os.environ["DATA_DATE"]
+        print(f"Ingesting raw data for {data_date}")
+        # In a real job, this would read from external sources
+        print(f"Raw data ingested successfully for {data_date}")
+
+
+@graph.job
+class ProcessData(DataBuildJob):
+    """Job to process raw data into processed format"""
+    output_types = [ProcessedDataPartition]
+
+    def config(self, outputs: list[ProcessedDataPartition]) -> list[JobConfig]:
+        configs = []
+        for output in outputs:
+            raw_input = RawDataPartition(date=output.date)
+            env = {"DATA_DATE": output.date}
+            configs.append(
+                JobConfigBuilder()
+                .add_outputs(output)
+                .add_inputs(raw_input)
+                .set_env(env)
+                .add_args("--date", output.date)
+                .build()
+            )
+        return configs
+
+    def exec(self, *args: str) -> None:
+        data_date = os.environ["DATA_DATE"]
+        print(f"Processing data for {data_date}")
+        # In a real job, this would transform the raw data
+        print(f"Data processed successfully for {data_date}")
+
+
+@graph.job
+class CreateSummary(DataBuildJob):
+    """Job to create summary from processed data"""
+    output_types = [SummaryPartition]
+
+    def config(self, outputs: list[SummaryPartition]) -> list[JobConfig]:
+        configs = []
+        for output in outputs:
+            processed_input = ProcessedDataPartition(date=output.date)
+            env = {"DATA_DATE": output.date}
+            configs.append(
+                JobConfigBuilder()
+                .add_outputs(output)
+                .add_inputs(processed_input)
+                .set_env(env)
+                .add_args("--date", output.date)
+                .build()
+            )
+        return configs
+
+    def exec(self, *args: str) -> None:
+        data_date = os.environ["DATA_DATE"]
+        print(f"Creating summary for {data_date}")
+        # In a real job, this would generate summary statistics
+        print(f"Summary created successfully for {data_date}")
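For intuition about how refs like `raw_data/date=2024-01-01` are routed, each `_raw_pattern` is an ordinary named-group regex; a small sketch (mapping a match back to a job class via `graph.lookup` is an assumption based on the DSL code above):

```python
# Small sketch: matching a partition ref against the pattern from simple_graph.py.
import re

raw_pattern = r"raw_data/date=(?P<date>\d{4}-\d{2}-\d{2})"
m = re.fullmatch(raw_pattern, "raw_data/date=2024-01-01")
assert m is not None
print(m.group("date"))  # -> 2024-01-01
# The DSL's lookup table (graph.lookup) can map such a match to IngestRawData.
```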