databuild/databuild/test/app/dsl/graph.py
Stuart Axelbrooke bef37cd8ab
Some checks are pending
/ setup (push) Waiting to run
Fix service binary and static asset serving
2025-08-04 11:18:28 -07:00

131 lines
5.3 KiB
Python

"""Python DSL implementation of test app"""
from collections import defaultdict
from databuild.dsl.python.dsl import DataBuildGraph, DataBuildJob, JobConfigBuilder
from databuild.proto import JobConfig
from databuild.test.app.colors import COLORS
from databuild.test.app.jobs.ingest_color_votes.execute import execute as ingest_color_votes_exec
from databuild.test.app.jobs.trailing_color_votes.execute import execute as trailing_color_votes_exec
from databuild.test.app.jobs.aggregate_color_votes.execute import execute as aggregate_color_votes_exec
from databuild.test.app.jobs.color_vote_report_calc.execute import execute as color_vote_report_calc_exec
from databuild.test.app.dsl.partitions import (
IngestedColorPartition,
TrailingColorVotes1MPartition,
TrailingColorVotes1WPartition,
DailyVotesPartition,
Votes1WPartition,
Votes1MPartition,
ColorVoteReportPartition
)
import os
from datetime import date, timedelta
# Graph object whose `job` decorator is applied to every job class in this module.
# NOTE(review): the label string looks like a Bazel-style target — confirm.
graph = DataBuildGraph("//databuild/test/app/dsl:dsl_graph")
@graph.job
class IngestColorVotes(DataBuildJob):
    """Job that produces ingested color-vote partitions, one config per partition."""

    output_types = [IngestedColorPartition]

    def config(self, outputs: list[IngestedColorPartition]) -> list[JobConfig]:
        """Build one job config for each requested output partition.

        The partition's date and color are passed through the ``DATA_DATE``
        and ``COLOR`` environment variables, which ``exec`` reads back.
        """
        return [
            JobConfigBuilder()
            .add_outputs(partition)
            .set_env({"DATA_DATE": partition.data_date, "COLOR": partition.color})
            .build()
            for partition in outputs
        ]

    def exec(self, *args: str) -> None:
        """Run the ingest step for the date and color found in the environment."""
        data_date = os.environ["DATA_DATE"]
        color = os.environ["COLOR"]
        ingest_color_votes_exec(data_date=data_date, color=color)
@graph.job
class TrailingColorVotes(DataBuildJob):
    """Job that produces trailing 1-week and/or 1-month color-vote partitions."""

    output_types = [TrailingColorVotes1MPartition, TrailingColorVotes1WPartition]

    def config(self, outputs: list[TrailingColorVotes1MPartition | TrailingColorVotes1WPartition]) -> list[JobConfig]:
        """Build one job config per distinct (data_date, color) among ``outputs``.

        Outputs sharing a date and color are bundled into a single config.
        ``WEEKLY`` / ``MONTHLY`` are set to ``"true"`` when the matching
        partition type is requested, and the config depends on one
        ``IngestedColorPartition`` per day of the widest requested window
        (7 days for weekly, 28 days for monthly), counting back from
        ``data_date`` inclusive.
        """
        by_key = defaultdict(list)
        for partition in outputs:
            by_key[(partition.data_date, partition.color)].append(partition)

        built = []
        for (data_date, color), members in by_key.items():
            wants_weekly = False
            wants_monthly = False
            for member in members:
                if isinstance(member, TrailingColorVotes1WPartition):
                    wants_weekly = True
                elif isinstance(member, TrailingColorVotes1MPartition):
                    wants_monthly = True

            # The monthly window (28 days) contains the weekly one (7 days),
            # so only the widest requested window determines the inputs.
            if wants_monthly:
                window_days = 28
            elif wants_weekly:
                window_days = 7
            else:
                window_days = 0

            env = {
                "DATA_DATE": data_date,
                "COLOR": color,
                "WEEKLY": "true" if wants_weekly else "false",
                "MONTHLY": "true" if wants_monthly else "false",
            }
            builder = JobConfigBuilder(env=env).add_outputs(*members)
            for days_back in range(window_days):
                input_day = date.fromisoformat(data_date) - timedelta(days=days_back)
                builder.add_inputs(
                    IngestedColorPartition(data_date=input_day.isoformat(), color=color)
                )
            built.append(builder.build())
        return built

    def exec(self, *args: str) -> None:
        """Run the trailing-votes step for the date and color in the environment."""
        data_date = os.environ["DATA_DATE"]
        color = os.environ["COLOR"]
        trailing_color_votes_exec(data_date=data_date, color=color)
@graph.job
class AggregateColorVotes(DataBuildJob):
    """Job that aggregates per-color partitions into daily, 1-week or 1-month totals."""

    output_types = [DailyVotesPartition, Votes1WPartition, Votes1MPartition]

    def config(self, outputs: list[DailyVotesPartition | Votes1WPartition | Votes1MPartition]) -> list[JobConfig]:
        """Build one job config per requested aggregate partition.

        The output's class selects the per-color input partition class and the
        ``AGGREGATE_TYPE`` value; each config depends on that input partition
        for every color in ``COLORS`` at the output's ``data_date``.

        Raises:
            ValueError: if an output is none of the supported partition types.
        """
        configs = []
        for output in outputs:
            if isinstance(output, DailyVotesPartition):
                input_cls = IngestedColorPartition
                agg_type = "daily_votes"
            elif isinstance(output, Votes1WPartition):
                input_cls = TrailingColorVotes1WPartition
                agg_type = "votes_1w"
            elif isinstance(output, Votes1MPartition):
                input_cls = TrailingColorVotes1MPartition
                agg_type = "votes_1m"
            else:
                # Report the class name via type() instead of reading a `type`
                # attribute from the object: an unexpected output is not
                # guaranteed to have one, and an AttributeError here would
                # mask the intended ValueError.
                raise ValueError(f"Unknown output type: {type(output).__name__}")

            inputs = [input_cls(data_date=output.data_date, color=color) for color in COLORS]
            env = {"DATA_DATE": output.data_date, "AGGREGATE_TYPE": agg_type}
            configs.append(
                JobConfigBuilder()
                .add_outputs(output)
                .add_inputs(*inputs)
                .set_env(env)
                .build()
            )
        return configs

    def exec(self, *args: str) -> None:
        """Run the aggregation for the date and aggregate type in the environment."""
        aggregate_color_votes_exec(
            data_date=os.environ["DATA_DATE"],
            aggregate_type=os.environ["AGGREGATE_TYPE"],
        )
@graph.job
class ColorVoteReportCalc(DataBuildJob):
    """Job that computes color-vote report partitions in a single invocation."""

    output_types = [ColorVoteReportPartition]

    def config(self, outputs: list[ColorVoteReportPartition]) -> list[JobConfig]:
        """Build a single job config covering all requested report partitions.

        The serialized outputs are passed as job arguments. Inputs are the
        three date-level aggregates for every distinct ``data_date`` plus the
        three per-color partitions for every output's (date, color).
        """
        config = (
            JobConfigBuilder()
            .add_outputs(*outputs)
            .add_args(*[p.serialize() for p in outputs])
        )

        # Iterate dates in sorted order: bare set iteration order can differ
        # between interpreter runs, which would make the order of inputs in
        # the generated config non-reproducible.
        # NOTE(review): assumes data_date values are mutually orderable
        # (e.g. ISO date strings) — confirm.
        for data_date in sorted({p.data_date for p in outputs}):
            config.add_inputs(
                DailyVotesPartition(data_date=data_date),
                Votes1WPartition(data_date=data_date),
                Votes1MPartition(data_date=data_date),
            )

        for output in outputs:
            config.add_inputs(
                IngestedColorPartition(data_date=output.data_date, color=output.color),
                TrailingColorVotes1WPartition(data_date=output.data_date, color=output.color),
                TrailingColorVotes1MPartition(data_date=output.data_date, color=output.color),
            )
        return [config.build()]

    def exec(self, *args: str) -> None:
        """Run the report calculation, forwarding the job arguments as a list."""
        color_vote_report_calc_exec(list(args))