Add graph and trailing job
This commit is contained in:
parent
30f1d9addb
commit
63f9518486
6 changed files with 228 additions and 5 deletions
|
|
@ -9,16 +9,68 @@ py_library(
|
||||||
|
|
||||||
# Tests
|
# Tests
|
||||||
py_test(
|
py_test(
|
||||||
name = "test",
|
name = "test_trailing_color_votes",
|
||||||
srcs = glob(["**/test.py"]),
|
srcs = ["jobs/trailing_color_votes/test.py"],
|
||||||
|
main = "jobs/trailing_color_votes/test.py",
|
||||||
|
deps = [":job_src"],
|
||||||
|
)
|
||||||
|
|
||||||
|
py_test(
|
||||||
|
name = "test_ingest_color_votes",
|
||||||
|
srcs = ["jobs/ingest_color_votes/test.py"],
|
||||||
|
main = "jobs/ingest_color_votes/test.py",
|
||||||
deps = [":job_src"],
|
deps = [":job_src"],
|
||||||
)
|
)
|
||||||
|
|
||||||
# Bazel-defined
|
# Bazel-defined
|
||||||
|
## Graph
|
||||||
|
databuild_graph(
|
||||||
|
name = "bazel_graph",
|
||||||
|
jobs = [
|
||||||
|
":ingest_color_votes",
|
||||||
|
":trailing_color_votes",
|
||||||
|
# TODO
|
||||||
|
],
|
||||||
|
lookup = ":bazel_graph_lookup",
|
||||||
|
)
|
||||||
|
|
||||||
#databuild_job(
|
py_binary(
|
||||||
# name = "ingest_color_votes",
|
name = "bazel_graph_lookup",
|
||||||
#)
|
srcs = ["lookup.py"],
|
||||||
|
main = "lookup.py",
|
||||||
|
)
|
||||||
|
|
||||||
|
## Ingest Color Votes
|
||||||
|
databuild_job(
|
||||||
|
name = "ingest_color_votes",
|
||||||
|
binary = ":ingest_color_votes_binary",
|
||||||
|
)
|
||||||
|
|
||||||
|
py_binary(
|
||||||
|
name = "ingest_color_votes_binary",
|
||||||
|
srcs = ["jobs/ingest_color_votes/main.py"],
|
||||||
|
main = "jobs/ingest_color_votes/main.py",
|
||||||
|
deps = [":job_src"],
|
||||||
|
)
|
||||||
|
|
||||||
|
## Trailing Color Votes
|
||||||
|
databuild_job(
|
||||||
|
name = "trailing_color_votes",
|
||||||
|
binary = ":trailing_color_votes_binary",
|
||||||
|
)
|
||||||
|
|
||||||
|
py_binary(
|
||||||
|
name = "trailing_color_votes_binary",
|
||||||
|
srcs = ["jobs/trailing_color_votes/main.py"],
|
||||||
|
main = "jobs/trailing_color_votes/main.py",
|
||||||
|
deps = [":job_src"],
|
||||||
|
)
|
||||||
|
|
||||||
|
## Aggregate Color Votes
|
||||||
|
# TODO
|
||||||
|
|
||||||
|
## Color Vote Report Calc
|
||||||
|
# TODO
|
||||||
|
|
||||||
# Python-DSL-defined
|
# Python-DSL-defined
|
||||||
|
|
||||||
|
|
|
||||||
49
databuild/test/app/jobs/trailing_color_votes/config.py
Normal file
49
databuild/test/app/jobs/trailing_color_votes/config.py
Normal file
|
|
@ -0,0 +1,49 @@
|
||||||
|
from databuild.proto import PartitionRef, JobConfigureResponse, JobConfig
|
||||||
|
from datetime import date, timedelta
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
def configure(outputs: list[PartitionRef]) -> JobConfigureResponse:
    """Plan the trailing-window jobs needed to build the requested outputs.

    Every requested ref must have the form ``<prefix>/<date>/<color>`` where
    ``<prefix>`` is ``color_votes_1w`` or ``color_votes_1m``. Refs that share
    the same date and color are served by a single ``JobConfig``; its inputs
    are the ``daily_color_votes`` partitions for the longest window requested
    (7 days for weekly, 28 days for monthly), counting back from and including
    the output date.

    Args:
        outputs: Partition refs the caller wants produced.

    Returns:
        A ``JobConfigureResponse`` holding one ``JobConfig`` per distinct
        (date, color) pair, in first-seen order.

    Raises:
        ValueError: If a ref does not have exactly three ``/``-separated
            segments or uses an unknown prefix, or (from
            ``date.fromisoformat``) if the date segment is not an ISO date.
    """
    # Phase 1: validate every ref and bucket it by (date, color).
    by_date_color = defaultdict(list)
    for ref in outputs:
        segments = ref.str.split("/")
        if len(segments) != 3 or segments[0] not in ["color_votes_1w", "color_votes_1m"]:
            raise ValueError(f"Invalid output partition format: {ref.str}")
        _, day, shade = segments
        by_date_color[(day, shade)].append(ref)

    # Phase 2: turn each bucket into one job configuration.
    planned = []
    for (day, shade), refs in by_date_color.items():
        end_date = date.fromisoformat(day)

        wants_weekly = any(ref.str.startswith("color_votes_1w/") for ref in refs)
        wants_monthly = any(ref.str.startswith("color_votes_1m/") for ref in refs)

        # The monthly window is a superset of the weekly one, so the longest
        # requested window determines the inputs.
        if wants_monthly:
            window_days = 28
        elif wants_weekly:
            window_days = 7
        else:
            window_days = 0

        daily_inputs = [
            PartitionRef(
                str=f"daily_color_votes/{(end_date - timedelta(days=offset)).isoformat()}/{shade}"
            )
            for offset in range(window_days)
        ]

        planned.append(JobConfig(
            outputs=refs,
            inputs=daily_inputs,
            args=[],
            # These variables are how the execute step learns what to build.
            env={
                "DATA_DATE": day,
                "COLOR": shade,
                "WEEKLY": "true" if wants_weekly else "false",
                "MONTHLY": "true" if wants_monthly else "false",
            },
        ))

    return JobConfigureResponse(configs=planned)
|
||||||
28
databuild/test/app/jobs/trailing_color_votes/execute.py
Normal file
28
databuild/test/app/jobs/trailing_color_votes/execute.py
Normal file
|
|
@ -0,0 +1,28 @@
|
||||||
|
from databuild.test.app import dal
|
||||||
|
from databuild.proto import PartitionRef
|
||||||
|
from datetime import date, timedelta
|
||||||
|
import os
|
||||||
|
|
||||||
|
def execute(data_date: str, color: str):
    """Compute and write the trailing vote totals for one date and color.

    The ``WEEKLY`` and ``MONTHLY`` environment variables (the string
    ``"true"``, case-insensitive, enables; default is disabled) select which
    outputs are produced. For each enabled window the daily partitions
    ``daily_color_votes/<date>/<color>`` covering the window (ending on and
    including ``data_date``) are read through ``dal.read``, their ``"votes"``
    fields are summed, and a single-record partition is written through
    ``dal.write``.

    Args:
        data_date: ISO-formatted date the trailing windows end on.
        color: Color whose votes are aggregated.

    Raises:
        ValueError: From ``date.fromisoformat`` if ``data_date`` is not an
            ISO date.
    """
    end_date = date.fromisoformat(data_date)

    # Read both flags up front, then process weekly before monthly.
    windows = [
        (7, "color_votes_1w", os.environ.get("WEEKLY", "false").lower() == "true"),
        (28, "color_votes_1m", os.environ.get("MONTHLY", "false").lower() == "true"),
    ]

    for window_days, output_prefix, enabled in windows:
        if not enabled:
            continue

        # One daily input partition per day in the trailing window.
        daily_refs = [
            PartitionRef(
                str=f"daily_color_votes/{(end_date - timedelta(days=offset)).isoformat()}/{color}"
            )
            for offset in range(window_days)
        ]

        records = dal.read(*daily_refs)
        vote_total = sum(row["votes"] for row in records)

        target = PartitionRef(str=f"{output_prefix}/{data_date}/{color}")
        dal.write(target, [{"color": color, "data_date": data_date, "votes": vote_total}])
|
||||||
18
databuild/test/app/jobs/trailing_color_votes/main.py
Normal file
18
databuild/test/app/jobs/trailing_color_votes/main.py
Normal file
|
|
@ -0,0 +1,18 @@
|
||||||
|
"""Main entrypoint for the trailing_color_votes job for use with bazel-defined graph."""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
from databuild.proto import PartitionRef
|
||||||
|
from databuild.test.app.jobs.trailing_color_votes.config import configure
|
||||||
|
from databuild.test.app.jobs.trailing_color_votes.execute import execute
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Command-line dispatch:
    #   config <partition_ref>...  -> plan the job for the given output refs
    #   execute                    -> run the job using DATA_DATE / COLOR from
    #                                 the environment
    if len(sys.argv) < 2:
        # A bare invocation would otherwise die with an opaque IndexError.
        sys.exit(f"Usage: {sys.argv[0]} (config <partition_ref>... | execute)")

    command = sys.argv[1]
    if command == "config":
        # NOTE(review): the JobConfigureResponse returned by configure() is
        # discarded here. If the caller expects the configuration on stdout it
        # still needs to be serialised and printed - confirm the expected
        # format.
        configure([PartitionRef(str=raw_ref) for raw_ref in sys.argv[2:]])
    elif command == "execute":
        # A missing DATA_DATE or COLOR raises KeyError on purpose: the job
        # cannot run without them.
        execute(os.environ["DATA_DATE"], os.environ["COLOR"])
    else:
        raise Exception(f"Invalid command `{sys.argv[1]}`")
|
||||||
53
databuild/test/app/jobs/trailing_color_votes/test.py
Normal file
53
databuild/test/app/jobs/trailing_color_votes/test.py
Normal file
|
|
@ -0,0 +1,53 @@
|
||||||
|
import unittest
|
||||||
|
from databuild.proto import PartitionRef
|
||||||
|
from databuild.test.app.jobs.trailing_color_votes.config import configure
|
||||||
|
|
||||||
|
class TestTrailingColorVotesConfig(unittest.TestCase):
    """Checks how configure() groups outputs and sizes the input window."""

    def test_configure_weekly_only(self):
        """A lone weekly output yields one config with seven daily inputs."""
        result = configure([PartitionRef(str="color_votes_1w/2024-01-07/red")])

        self.assertEqual(len(result.configs), 1)
        job = result.configs[0]
        self.assertEqual(len(job.outputs), 1)
        # Weekly window: 7 daily partitions.
        self.assertEqual(len(job.inputs), 7)
        self.assertEqual(job.env["WEEKLY"], "true")
        self.assertEqual(job.env["MONTHLY"], "false")

    def test_configure_monthly_only(self):
        """A lone monthly output yields one config with 28 daily inputs."""
        result = configure([PartitionRef(str="color_votes_1m/2024-01-28/blue")])

        self.assertEqual(len(result.configs), 1)
        job = result.configs[0]
        self.assertEqual(len(job.outputs), 1)
        # Monthly window: 28 daily partitions.
        self.assertEqual(len(job.inputs), 28)
        self.assertEqual(job.env["WEEKLY"], "false")
        self.assertEqual(job.env["MONTHLY"], "true")

    def test_configure_both_weekly_and_monthly(self):
        """Weekly and monthly for the same date/color share one config."""
        requested = [
            PartitionRef(str="color_votes_1w/2024-01-28/green"),
            PartitionRef(str="color_votes_1m/2024-01-28/green"),
        ]
        result = configure(requested)

        # Same date and color, so a single config carries both outputs.
        self.assertEqual(len(result.configs), 1)
        job = result.configs[0]
        self.assertEqual(len(job.outputs), 2)
        # The larger of the two windows (7 vs 28) determines the inputs.
        self.assertEqual(len(job.inputs), 28)
        self.assertEqual(job.env["WEEKLY"], "true")
        self.assertEqual(job.env["MONTHLY"], "true")

    def test_configure_multiple_colors_dates(self):
        """Each distinct date/color combination gets its own config."""
        requested = [
            PartitionRef(str="color_votes_1w/2024-01-07/red"),
            PartitionRef(str="color_votes_1w/2024-01-07/blue"),
            PartitionRef(str="color_votes_1m/2024-01-14/red"),
        ]
        result = configure(requested)

        # Three unique (date, color) pairs -> three configs.
        self.assertEqual(len(result.configs), 3)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Lets the test module be run directly as a script.
    unittest.main()
|
||||||
23
databuild/test/app/lookup.py
Normal file
23
databuild/test/app/lookup.py
Normal file
|
|
@ -0,0 +1,23 @@
|
||||||
|
from collections import defaultdict
|
||||||
|
import sys
|
||||||
|
|
||||||
|
LABEL_BASE = "//databuild/test/app"

# Ordered routing table: (accepted partition-ref prefixes, job target name).
# The trailing job lists its two window prefixes explicitly, matching the
# prefixes its configure step accepts and the explicit votes_1w / votes_1m
# entries of the aggregate job, rather than the looser "color_votes_1".
_JOB_PREFIXES = (
    (("daily_color_votes",), "ingest_color_votes"),
    (("color_votes_1w", "color_votes_1m"), "trailing_color_votes"),
    (("daily_votes", "votes_1w", "votes_1m"), "aggregate_color_votes"),
    (("color_vote_report",), "color_vote_report_calc"),
)


def lookup(raw_ref: str) -> str:
    """Resolve a partition ref to the Bazel label of the job that builds it.

    Args:
        raw_ref: Partition reference string, e.g.
            ``daily_color_votes/2024-01-01/red``.

    Returns:
        The job label, ``LABEL_BASE`` followed by ``:<job name>``.

    Raises:
        ValueError: If no job is registered for the ref's prefix.
    """
    for prefixes, job_name in _JOB_PREFIXES:
        # str.startswith accepts a tuple and matches any of its members.
        if raw_ref.startswith(prefixes):
            return f"{LABEL_BASE}:{job_name}"
    raise ValueError(f"Unable to resolve job for partition: `{raw_ref}`")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Local import: json is only needed when running as the lookup binary.
    import json

    # Group the partition refs given on the command line by the label of the
    # job responsible for each one.
    results = defaultdict(list)
    for raw_ref in sys.argv[1:]:
        results[lookup(raw_ref)].append(raw_ref)

    # Previously the mapping was built and then dropped, so the binary
    # produced no output at all.
    # NOTE(review): a JSON object of {job label: [partition refs]} on stdout
    # is assumed to be what the graph's lookup contract expects - confirm.
    print(json.dumps(results))
|
||||||
Loading…
Reference in a new issue