131 lines
5.3 KiB
Python
131 lines
5.3 KiB
Python
"""Python DSL implementation of test app"""
|
|
|
|
from collections import defaultdict
|
|
from databuild.dsl.python.dsl import DataBuildGraph, DataBuildJob, JobConfigBuilder
|
|
from databuild.proto import JobConfig
|
|
from databuild.test.app.colors import COLORS
|
|
from databuild.test.app.jobs.ingest_color_votes.execute import execute as ingest_color_votes_exec
|
|
from databuild.test.app.jobs.trailing_color_votes.execute import execute as trailing_color_votes_exec
|
|
from databuild.test.app.jobs.aggregate_color_votes.execute import execute as aggregate_color_votes_exec
|
|
from databuild.test.app.jobs.color_vote_report_calc.execute import execute as color_vote_report_calc_exec
|
|
from databuild.test.app.dsl.partitions import (
|
|
IngestedColorPartition,
|
|
TrailingColorVotes1MPartition,
|
|
TrailingColorVotes1WPartition,
|
|
DailyVotesPartition,
|
|
Votes1WPartition,
|
|
Votes1MPartition,
|
|
ColorVoteReportPartition
|
|
)
|
|
import os
|
|
from datetime import date, timedelta
|
|
|
|
# Graph object that every job class below registers with through the
# `@graph.job` decorator.
# NOTE(review): the label string looks like a Bazel target label — confirm.
graph = DataBuildGraph("//databuild/test/app/dsl:dsl_graph")
|
|
|
|
|
|
@graph.job
class IngestColorVotes(DataBuildJob):
    """Job that produces `IngestedColorPartition` outputs, one config per partition."""

    output_types = [IngestedColorPartition]

    def config(self, outputs: list[IngestedColorPartition]) -> list[JobConfig]:
        """Return one `JobConfig` for each requested partition.

        Every config lists the partition as its only output and passes the
        partition's date and color through the `DATA_DATE` and `COLOR`
        environment variables.
        """

        def config_for(partition: IngestedColorPartition) -> JobConfig:
            variables = {"DATA_DATE": partition.data_date, "COLOR": partition.color}
            builder = JobConfigBuilder().add_outputs(partition)
            builder = builder.set_env(variables)
            return builder.build()

        return [config_for(partition) for partition in outputs]

    def exec(self, *args: str) -> None:
        """Run the ingest step for the date and color found in the environment.

        Positional `args` are not used; `DATA_DATE` and `COLOR` must be set
        (a missing variable raises `KeyError`).
        """
        data_date = os.environ["DATA_DATE"]
        color = os.environ["COLOR"]
        ingest_color_votes_exec(data_date=data_date, color=color)
|
|
|
|
|
|
@graph.job
class TrailingColorVotes(DataBuildJob):
    """Job that produces the 1W and 1M trailing color-vote partitions."""

    output_types = [TrailingColorVotes1MPartition, TrailingColorVotes1WPartition]

    def config(self, outputs: list[TrailingColorVotes1MPartition | TrailingColorVotes1WPartition]) -> list[JobConfig]:
        """Return one `JobConfig` per distinct (data_date, color) among `outputs`.

        Partitions sharing a date and color are built by the same config. The
        `WEEKLY` / `MONTHLY` environment flags ("true"/"false") record which
        kinds were requested. Inputs are the `IngestedColorPartition`s for the
        date itself and the preceding days: 7 days in total when only the 1W
        partition is requested, 28 when the 1M partition is requested.
        """
        by_date_and_color = defaultdict(list)
        for partition in outputs:
            by_date_and_color[(partition.data_date, partition.color)].append(partition)

        job_configs = []
        for (data_date, color), members in by_date_and_color.items():
            flags = {"WEEKLY": "false", "MONTHLY": "false"}
            lookback_days = 0
            for member in members:
                if isinstance(member, TrailingColorVotes1WPartition):
                    flags["WEEKLY"] = "true"
                    lookback_days = max(lookback_days, 7)
                elif isinstance(member, TrailingColorVotes1MPartition):
                    flags["MONTHLY"] = "true"
                    lookback_days = max(lookback_days, 28)

            env = {"DATA_DATE": data_date, "COLOR": color, **flags}
            builder = JobConfigBuilder(env=env).add_outputs(*members)

            # days_back == 0 is the target date itself, so the window is inclusive.
            for days_back in range(lookback_days):
                day = date.fromisoformat(data_date) - timedelta(days=days_back)
                builder.add_inputs(IngestedColorPartition(data_date=day.isoformat(), color=color))

            job_configs.append(builder.build())

        return job_configs

    def exec(self, *args: str) -> None:
        """Run the trailing-votes step for the date and color in the environment.

        Positional `args` are not used. Only `DATA_DATE` and `COLOR` are read
        here (a missing variable raises `KeyError`).
        NOTE(review): `WEEKLY` / `MONTHLY` are set by `config` but not passed
        on explicitly; presumably the executor reads them itself — confirm.
        """
        data_date = os.environ["DATA_DATE"]
        color = os.environ["COLOR"]
        trailing_color_votes_exec(data_date=data_date, color=color)
|
|
|
|
|
|
@graph.job
class AggregateColorVotes(DataBuildJob):
    """Job that aggregates per-color vote partitions into all-color totals."""

    output_types = [DailyVotesPartition, Votes1WPartition, Votes1MPartition]

    def config(self, outputs: list[DailyVotesPartition | Votes1WPartition | Votes1MPartition]) -> list[JobConfig]:
        """Return one `JobConfig` per requested aggregate partition.

        Each config takes, for every color in `COLORS`, the per-color input
        partition for the same `data_date`, and sets `DATA_DATE` and
        `AGGREGATE_TYPE` in the environment.

        Raises:
            ValueError: if an output is not one of the supported partition types.
        """
        # (output partition class, per-color input class, AGGREGATE_TYPE value).
        # Checked in order with isinstance; the first match wins.
        aggregations = (
            (DailyVotesPartition, IngestedColorPartition, "daily_votes"),
            (Votes1WPartition, TrailingColorVotes1WPartition, "votes_1w"),
            (Votes1MPartition, TrailingColorVotes1MPartition, "votes_1m"),
        )

        configs = []
        for output in outputs:
            matched = next(
                (entry for entry in aggregations if isinstance(output, entry[0])),
                None,
            )
            if matched is None:
                # Report the class name: an unexpected object is not guaranteed
                # to have a `.type` attribute, and reading it could raise
                # AttributeError instead of this ValueError.
                raise ValueError(f"Unknown output type: {type(output).__name__}")
            _, input_cls, agg_type = matched

            inputs = [input_cls(data_date=output.data_date, color=color) for color in COLORS]
            env = {"DATA_DATE": output.data_date, "AGGREGATE_TYPE": agg_type}
            configs.append(JobConfigBuilder().add_outputs(output).add_inputs(*inputs).set_env(env).build())

        return configs

    def exec(self, *args: str) -> None:
        """Run the aggregation for the date and aggregate type in the environment.

        Positional `args` are not used; `DATA_DATE` and `AGGREGATE_TYPE` must
        be set (a missing variable raises `KeyError`).
        """
        aggregate_color_votes_exec(data_date=os.environ["DATA_DATE"], aggregate_type=os.environ["AGGREGATE_TYPE"])
|
|
|
|
|
|
@graph.job
class ColorVoteReportCalc(DataBuildJob):
    """Job that produces `ColorVoteReportPartition` outputs in a single config."""

    output_types = [ColorVoteReportPartition]

    def config(self, outputs: list[ColorVoteReportPartition]) -> list[JobConfig]:
        """Return a single `JobConfig` covering every requested report partition.

        The serialized form of each output is passed as a positional argument.
        Inputs are the three all-color aggregate partitions for each distinct
        `data_date`, followed by the three per-color partitions for each
        output's (data_date, color).
        """
        config = JobConfigBuilder().add_outputs(*outputs).add_args(*[p.serialize() for p in outputs])

        # dict.fromkeys de-duplicates while keeping first-seen order. Iterating
        # a set of strings instead would make the input order depend on hash
        # randomization and differ between processes.
        for data_date in dict.fromkeys(p.data_date for p in outputs):
            config.add_inputs(
                DailyVotesPartition(data_date=data_date),
                Votes1WPartition(data_date=data_date),
                Votes1MPartition(data_date=data_date),
            )

        for output in outputs:
            config.add_inputs(
                IngestedColorPartition(data_date=output.data_date, color=output.color),
                TrailingColorVotes1WPartition(data_date=output.data_date, color=output.color),
                TrailingColorVotes1MPartition(data_date=output.data_date, color=output.color),
            )

        return [config.build()]

    def exec(self, *args: str) -> None:
        """Run the report calculation, passing the positional `args` on as a list."""
        color_vote_report_calc_exec(list(args))
|
|
|