"""Python DSL implementation of test app"""

import os
from collections import defaultdict
from datetime import date, timedelta

from databuild.dsl.python.dsl import DataBuildGraph, DataBuildJob, JobConfigBuilder
from databuild.proto import JobConfig
from databuild.test.app.colors import COLORS
from databuild.test.app.jobs.ingest_color_votes.execute import execute as ingest_color_votes_exec
from databuild.test.app.jobs.trailing_color_votes.execute import execute as trailing_color_votes_exec
from databuild.test.app.jobs.aggregate_color_votes.execute import execute as aggregate_color_votes_exec
from databuild.test.app.jobs.color_vote_report_calc.execute import execute as color_vote_report_calc_exec
from databuild.test.app.dsl.partitions import (
    IngestedColorPartition,
    TrailingColorVotes1MPartition,
    TrailingColorVotes1WPartition,
    DailyVotesPartition,
    Votes1WPartition,
    Votes1MPartition,
    ColorVoteReportPartition,
)

# Graph that every job class below registers itself with via ``@graph.job``.
graph = DataBuildGraph("//databuild/test/app/dsl:dsl_graph")


@graph.job
class IngestColorVotes(DataBuildJob):
    """Job whose outputs are ``IngestedColorPartition`` partitions."""

    output_types = [IngestedColorPartition]

    def config(self, outputs: list[IngestedColorPartition]) -> list[JobConfig]:
        """Return one job config per requested output.

        Each config carries the output's ``data_date`` and ``color`` in the
        ``DATA_DATE`` and ``COLOR`` environment entries.
        """
        return [
            JobConfigBuilder()
            .add_outputs(output)
            .set_env({"DATA_DATE": output.data_date, "COLOR": output.color})
            .build()
            for output in outputs
        ]

    def exec(self, *args: str) -> None:
        """Run the ingest step with ``DATA_DATE`` and ``COLOR`` from the environment.

        Raises:
            KeyError: if either environment variable is not set.
        """
        ingest_color_votes_exec(data_date=os.environ["DATA_DATE"], color=os.environ["COLOR"])


@graph.job
class TrailingColorVotes(DataBuildJob):
    """Job whose outputs are the 1-week and 1-month trailing vote partitions."""

    output_types = [TrailingColorVotes1MPartition, TrailingColorVotes1WPartition]

    def config(
        self,
        outputs: list[TrailingColorVotes1MPartition | TrailingColorVotes1WPartition],
    ) -> list[JobConfig]:
        """Return one job config per distinct ``(data_date, color)`` pair.

        Outputs sharing a date and color are bundled into a single config.
        The config's inputs are the ``IngestedColorPartition`` for each day of
        the widest requested window (7 days for weekly, 28 for monthly),
        counting back from and including ``data_date``.
        """
        groups = defaultdict(list)
        for output in outputs:
            groups[(output.data_date, output.color)].append(output)

        configs = []
        # ``group`` rather than ``outputs``: avoid shadowing the parameter.
        for (data_date, color), group in groups.items():
            weekly = "false"
            monthly = "false"
            max_window = 0
            for output in group:
                if isinstance(output, TrailingColorVotes1WPartition):
                    weekly = "true"
                    max_window = max(max_window, 7)
                elif isinstance(output, TrailingColorVotes1MPartition):
                    monthly = "true"
                    max_window = max(max_window, 28)

            env = {
                "DATA_DATE": data_date,
                "COLOR": color,
                "WEEKLY": weekly,
                "MONTHLY": monthly,
            }
            config = JobConfigBuilder(env=env).add_outputs(*group)
            for days_back in range(max_window):
                in_date = (date.fromisoformat(data_date) - timedelta(days=days_back)).isoformat()
                config.add_inputs(IngestedColorPartition(data_date=in_date, color=color))
            configs.append(config.build())
        return configs

    def exec(self, *args: str) -> None:
        """Run the trailing-votes step with ``DATA_DATE`` and ``COLOR`` from the environment.

        Raises:
            KeyError: if either environment variable is not set.
        """
        # NOTE(review): WEEKLY / MONTHLY are set in the job env by ``config`` but
        # are not forwarded here; presumably the execute implementation reads
        # them from the environment itself -- confirm.
        trailing_color_votes_exec(data_date=os.environ["DATA_DATE"], color=os.environ["COLOR"])


@graph.job
class AggregateColorVotes(DataBuildJob):
    """Job whose outputs are the daily, 1-week and 1-month vote aggregates."""

    output_types = [DailyVotesPartition, Votes1WPartition, Votes1MPartition]

    def config(
        self,
        outputs: list[DailyVotesPartition | Votes1WPartition | Votes1MPartition],
    ) -> list[JobConfig]:
        """Return one job config per requested output.

        Each config takes, for every color in ``COLORS``, the per-color input
        partition matching the output kind on the output's ``data_date``, and
        sets ``DATA_DATE`` and ``AGGREGATE_TYPE`` in the environment.

        Raises:
            ValueError: if an output is not one of the three supported types.
        """
        configs = []
        for output in outputs:
            if isinstance(output, DailyVotesPartition):
                input_cls = IngestedColorPartition
                agg_type = "daily_votes"
            elif isinstance(output, Votes1WPartition):
                input_cls = TrailingColorVotes1WPartition
                agg_type = "votes_1w"
            elif isinstance(output, Votes1MPartition):
                input_cls = TrailingColorVotes1MPartition
                agg_type = "votes_1m"
            else:
                # NOTE(review): relies on the partition object exposing a
                # ``type`` attribute, which is not visible in this file -- confirm.
                raise ValueError(f"Unknown output type: {output.type}")

            inputs = [input_cls(data_date=output.data_date, color=color) for color in COLORS]
            env = {"DATA_DATE": output.data_date, "AGGREGATE_TYPE": agg_type}
            configs.append(
                JobConfigBuilder().add_outputs(output).add_inputs(*inputs).set_env(env).build()
            )
        return configs

    def exec(self, *args: str) -> None:
        """Run the aggregation with ``DATA_DATE`` and ``AGGREGATE_TYPE`` from the environment.

        Raises:
            KeyError: if either environment variable is not set.
        """
        aggregate_color_votes_exec(
            data_date=os.environ["DATA_DATE"],
            aggregate_type=os.environ["AGGREGATE_TYPE"],
        )


@graph.job
class ColorVoteReportCalc(DataBuildJob):
    """Job whose outputs are ``ColorVoteReportPartition`` partitions."""

    output_types = [ColorVoteReportPartition]

    def config(self, outputs: list[ColorVoteReportPartition]) -> list[JobConfig]:
        """Return a single job config covering all requested outputs.

        The serialized outputs are passed as job args. Inputs are the three
        aggregate partitions for each distinct ``data_date``, followed by the
        three per-color partitions for each output.
        """
        config = (
            JobConfigBuilder()
            .add_outputs(*outputs)
            .add_args(*[p.serialize() for p in outputs])
        )

        # De-duplicate dates in first-seen order. Iterating a ``set`` of strings
        # here would give an order that varies between interpreter runs (hash
        # randomization), making the generated config's input order unstable.
        for data_date in dict.fromkeys(p.data_date for p in outputs):
            config.add_inputs(
                DailyVotesPartition(data_date=data_date),
                Votes1WPartition(data_date=data_date),
                Votes1MPartition(data_date=data_date),
            )

        for output in outputs:
            config.add_inputs(
                IngestedColorPartition(data_date=output.data_date, color=output.color),
                TrailingColorVotes1WPartition(data_date=output.data_date, color=output.color),
                TrailingColorVotes1MPartition(data_date=output.data_date, color=output.color),
            )
        return [config.build()]

    def exec(self, *args: str) -> None:
        """Run the report calculation, forwarding the job args as a list."""
        color_vote_report_calc_exec(list(args))