Implement test app in python DSL
This commit is contained in:
parent
82e1d0eb26
commit
ae5147cb36
30 changed files with 432 additions and 177 deletions
|
|
@ -3,5 +3,6 @@ py_library(
|
||||||
srcs = ["dsl.py"],
|
srcs = ["dsl.py"],
|
||||||
visibility = ["//visibility:public"],
|
visibility = ["//visibility:public"],
|
||||||
deps = [
|
deps = [
|
||||||
|
"//databuild:py_proto",
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,7 @@
|
||||||
|
|
||||||
|
from databuild.proto import JobConfig, PartitionRef, DataDep, DepType
|
||||||
from typing import Self, Protocol, get_type_hints, get_origin, get_args
|
from typing import Self, Protocol, get_type_hints, get_origin, get_args
|
||||||
from dataclasses import fields, is_dataclass
|
from dataclasses import fields, is_dataclass, dataclass, field
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -58,21 +59,13 @@ class PartitionPattern:
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
class JobConfig:
|
|
||||||
"""TODO need to generate this from databuild.proto"""
|
|
||||||
|
|
||||||
|
|
||||||
class PartitionManifest:
|
|
||||||
"""TODO need to generate this from databuild.proto"""
|
|
||||||
|
|
||||||
|
|
||||||
class DataBuildJob(Protocol):
|
class DataBuildJob(Protocol):
|
||||||
# The types of partitions that this job produces
|
# The types of partitions that this job produces
|
||||||
output_types: list[type[PartitionPattern]]
|
output_types: list[type[PartitionPattern]]
|
||||||
|
|
||||||
def config(self, outputs: list[PartitionPattern]) -> list[JobConfig]: ...
|
def config(self, outputs: list[PartitionPattern]) -> list[JobConfig]: ...
|
||||||
|
|
||||||
def exec(self, config: JobConfig) -> PartitionManifest: ...
|
def exec(self, config: JobConfig) -> None: ...
|
||||||
|
|
||||||
|
|
||||||
class DataBuildGraph:
|
class DataBuildGraph:
|
||||||
|
|
@ -89,3 +82,49 @@ class DataBuildGraph:
|
||||||
def generate_bazel_module(self):
|
def generate_bazel_module(self):
|
||||||
"""Generates a complete databuild application, packaging up referenced jobs and this graph via bazel targets"""
|
"""Generates a complete databuild application, packaging up referenced jobs and this graph via bazel targets"""
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class JobConfigBuilder:
    """Fluent builder that accumulates the fields of a `JobConfig`.

    Every `add_*` / `set_*` method returns the builder itself so calls can be
    chained; `build()` produces the final `JobConfig`.
    """

    # Partition refs this job config will produce.
    outputs: list[PartitionRef] = field(default_factory=list)
    # Data dependencies this job config reads.
    inputs: list[DataDep] = field(default_factory=list)
    # Positional arguments passed to the job.
    args: list[str] = field(default_factory=list)
    # Environment variables passed to the job.
    env: dict[str, str] = field(default_factory=dict)

    def build(self) -> JobConfig:
        """Return a `JobConfig` built from the values collected so far.

        The builder's own lists/dict are handed to `JobConfig` as-is (no copy
        is made here).
        """
        return JobConfig(
            outputs=self.outputs,
            inputs=self.inputs,
            args=self.args,
            env=self.env,
        )

    def add_inputs(self, *partitions: PartitionPattern, dep_type: DepType = DepType.MATERIALIZE) -> Self:
        """Append one `DataDep` per partition, all with the same `dep_type`.

        Each partition is serialized into a `PartitionRef`. The dependency is
        named "materialize" for `DepType.MATERIALIZE` and "query" otherwise.
        """
        # Use the same member spelling as the default argument; the name only
        # depends on `dep_type`, so resolve it once instead of per partition.
        dep_type_name = "materialize" if dep_type == DepType.MATERIALIZE else "query"
        for p in partitions:
            self.inputs.append(
                DataDep(
                    dep_type_code=dep_type,
                    dep_type_name=dep_type_name,
                    partition_ref=PartitionRef(str=p.serialize()),
                )
            )
        return self

    def add_outputs(self, *partitions: PartitionPattern) -> Self:
        """Append one serialized `PartitionRef` per partition to the outputs."""
        self.outputs.extend(PartitionRef(str=p.serialize()) for p in partitions)
        return self

    def add_args(self, *args: str) -> Self:
        """Append the given arguments to the existing argument list."""
        self.args.extend(args)
        return self

    def set_args(self, args: list[str]) -> Self:
        """Replace the argument list with `args` (stored without copying)."""
        self.args = args
        return self

    def set_env(self, env: dict[str, str]) -> Self:
        """Replace the environment mapping with `env` (stored without copying)."""
        self.env = env
        return self

    def add_env(self, **kwargs) -> Self:
        """Add or overwrite environment variables given as keyword arguments.

        Raises:
            AssertionError: if a key or a value is not a string.
        """
        for k, v in kwargs.items():
            assert isinstance(k, str), f"Expected a string key, got `{k}`"
            assert isinstance(v, str), f"Expected a string value, got `{v}`"
            self.env[k] = v
        return self
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,6 @@
|
||||||
|
|
||||||
from databuild.dsl.python.dsl import PartitionPattern, DataBuildGraph, DataBuildJob, JobConfig, PartitionManifest
|
from databuild.dsl.python.dsl import PartitionPattern, DataBuildGraph, DataBuildJob
|
||||||
|
from databuild.proto import JobConfig, PartitionManifest
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
|
@ -45,7 +46,7 @@ def test_basic_graph_definition():
|
||||||
@graph.job
|
@graph.job
|
||||||
class TestJob(DataBuildJob):
|
class TestJob(DataBuildJob):
|
||||||
output_types = [CategoryAnalysisPartition]
|
output_types = [CategoryAnalysisPartition]
|
||||||
def exec(self, config: JobConfig) -> PartitionManifest: ...
|
def exec(self, config: JobConfig) -> None: ...
|
||||||
def config(self, outputs: list[PartitionPattern]) -> list[JobConfig]: ...
|
def config(self, outputs: list[PartitionPattern]) -> list[JobConfig]: ...
|
||||||
|
|
||||||
assert len(graph.lookup) == 1
|
assert len(graph.lookup) == 1
|
||||||
|
|
@ -58,14 +59,15 @@ def test_graph_collision():
|
||||||
@graph.job
|
@graph.job
|
||||||
class TestJob1(DataBuildJob):
|
class TestJob1(DataBuildJob):
|
||||||
output_types = [CategoryAnalysisPartition]
|
output_types = [CategoryAnalysisPartition]
|
||||||
def exec(self, config: JobConfig) -> PartitionManifest: ...
|
def exec(self, config: JobConfig) -> None: ...
|
||||||
def config(self, outputs: list[PartitionPattern]) -> list[JobConfig]: ...
|
def config(self, outputs: list[PartitionPattern]) -> list[JobConfig]: ...
|
||||||
|
|
||||||
with pytest.raises(AssertionError):
|
with pytest.raises(AssertionError):
|
||||||
|
# Outputs the same partition, so should raise
|
||||||
@graph.job
|
@graph.job
|
||||||
class TestJob2(DataBuildJob):
|
class TestJob2(DataBuildJob):
|
||||||
output_types = [CategoryAnalysisPartition]
|
output_types = [CategoryAnalysisPartition]
|
||||||
def exec(self, config: JobConfig) -> PartitionManifest: ...
|
def exec(self, config: JobConfig) -> None: ...
|
||||||
def config(self, outputs: list[PartitionPattern]) -> list[JobConfig]: ...
|
def config(self, outputs: list[PartitionPattern]) -> list[JobConfig]: ...
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,123 +1,9 @@
|
||||||
load("//databuild:rules.bzl", "databuild_graph", "databuild_job")
|
|
||||||
|
|
||||||
py_library(
|
py_library(
|
||||||
name = "job_src",
|
name = "job_src",
|
||||||
srcs = glob(["**/*.py"]),
|
srcs = glob(["**/*.py"]),
|
||||||
visibility = ["//visibility:public"],
|
visibility = ["//visibility:public"],
|
||||||
deps = ["//databuild:py_proto"],
|
deps = [
|
||||||
)
|
"//databuild:py_proto",
|
||||||
|
"//databuild/dsl/python:dsl",
|
||||||
# Tests
|
|
||||||
py_test(
|
|
||||||
name = "test_trailing_color_votes",
|
|
||||||
srcs = ["jobs/trailing_color_votes/test.py"],
|
|
||||||
main = "jobs/trailing_color_votes/test.py",
|
|
||||||
deps = [":job_src"],
|
|
||||||
)
|
|
||||||
|
|
||||||
py_test(
|
|
||||||
name = "test_ingest_color_votes",
|
|
||||||
srcs = ["jobs/ingest_color_votes/test.py"],
|
|
||||||
main = "jobs/ingest_color_votes/test.py",
|
|
||||||
deps = [":job_src"],
|
|
||||||
)
|
|
||||||
|
|
||||||
py_test(
|
|
||||||
name = "test_aggregate_color_votes",
|
|
||||||
srcs = ["jobs/aggregate_color_votes/test.py"],
|
|
||||||
main = "jobs/aggregate_color_votes/test.py",
|
|
||||||
deps = [":job_src"],
|
|
||||||
)
|
|
||||||
|
|
||||||
py_test(
|
|
||||||
name = "test_color_vote_report_calc",
|
|
||||||
srcs = ["jobs/color_vote_report_calc/test.py"],
|
|
||||||
main = "jobs/color_vote_report_calc/test.py",
|
|
||||||
deps = [":job_src"],
|
|
||||||
)
|
|
||||||
|
|
||||||
py_test(
|
|
||||||
name = "test_graph_analysis",
|
|
||||||
srcs = ["graph/graph_test.py"],
|
|
||||||
main = "graph/graph_test.py",
|
|
||||||
data = [
|
|
||||||
":bazel_graph.analyze",
|
|
||||||
":bazel_graph_lookup",
|
|
||||||
],
|
],
|
||||||
deps = [":job_src"],
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# Bazel-defined
|
|
||||||
## Graph
|
|
||||||
databuild_graph(
|
|
||||||
name = "bazel_graph",
|
|
||||||
jobs = [
|
|
||||||
":ingest_color_votes",
|
|
||||||
":trailing_color_votes",
|
|
||||||
":aggregate_color_votes",
|
|
||||||
":color_vote_report_calc",
|
|
||||||
],
|
|
||||||
lookup = ":bazel_graph_lookup",
|
|
||||||
)
|
|
||||||
|
|
||||||
py_binary(
|
|
||||||
name = "bazel_graph_lookup",
|
|
||||||
srcs = ["graph/lookup.py"],
|
|
||||||
main = "graph/lookup.py",
|
|
||||||
)
|
|
||||||
|
|
||||||
## Ingest Color Votes
|
|
||||||
databuild_job(
|
|
||||||
name = "ingest_color_votes",
|
|
||||||
binary = ":ingest_color_votes_binary",
|
|
||||||
)
|
|
||||||
|
|
||||||
py_binary(
|
|
||||||
name = "ingest_color_votes_binary",
|
|
||||||
srcs = ["jobs/ingest_color_votes/main.py"],
|
|
||||||
main = "jobs/ingest_color_votes/main.py",
|
|
||||||
deps = [":job_src"],
|
|
||||||
)
|
|
||||||
|
|
||||||
## Trailing Color Votes
|
|
||||||
databuild_job(
|
|
||||||
name = "trailing_color_votes",
|
|
||||||
binary = ":trailing_color_votes_binary",
|
|
||||||
)
|
|
||||||
|
|
||||||
py_binary(
|
|
||||||
name = "trailing_color_votes_binary",
|
|
||||||
srcs = ["jobs/trailing_color_votes/main.py"],
|
|
||||||
main = "jobs/trailing_color_votes/main.py",
|
|
||||||
deps = [":job_src"],
|
|
||||||
)
|
|
||||||
|
|
||||||
## Aggregate Color Votes
|
|
||||||
databuild_job(
|
|
||||||
name = "aggregate_color_votes",
|
|
||||||
binary = ":aggregate_color_votes_binary",
|
|
||||||
)
|
|
||||||
|
|
||||||
py_binary(
|
|
||||||
name = "aggregate_color_votes_binary",
|
|
||||||
srcs = ["jobs/aggregate_color_votes/main.py"],
|
|
||||||
main = "jobs/aggregate_color_votes/main.py",
|
|
||||||
deps = [":job_src"],
|
|
||||||
)
|
|
||||||
|
|
||||||
## Color Vote Report Calc
|
|
||||||
databuild_job(
|
|
||||||
name = "color_vote_report_calc",
|
|
||||||
binary = ":color_vote_report_calc_binary",
|
|
||||||
)
|
|
||||||
|
|
||||||
py_binary(
|
|
||||||
name = "color_vote_report_calc_binary",
|
|
||||||
srcs = ["jobs/color_vote_report_calc/main.py"],
|
|
||||||
main = "jobs/color_vote_report_calc/main.py",
|
|
||||||
deps = [":job_src"],
|
|
||||||
)
|
|
||||||
|
|
||||||
# Python-DSL-defined
|
|
||||||
|
|
||||||
# TODO
|
|
||||||
|
|
|
||||||
149
databuild/test/app/bazel/BUILD.bazel
Normal file
149
databuild/test/app/bazel/BUILD.bazel
Normal file
|
|
@ -0,0 +1,149 @@
|
||||||
|
load("//databuild:rules.bzl", "databuild_graph", "databuild_job")
|
||||||
|
|
||||||
|
py_library(
|
||||||
|
name = "job_src",
|
||||||
|
srcs = glob(["**/*.py"]),
|
||||||
|
visibility = ["//visibility:public"],
|
||||||
|
deps = [
|
||||||
|
"//databuild:py_proto",
|
||||||
|
"//databuild/dsl/python:dsl",
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
|
# Tests
|
||||||
|
py_test(
|
||||||
|
name = "test_trailing_color_votes",
|
||||||
|
srcs = ["jobs/trailing_color_votes/test.py"],
|
||||||
|
main = "jobs/trailing_color_votes/test.py",
|
||||||
|
deps = [
|
||||||
|
":job_src",
|
||||||
|
"//databuild/test/app:job_src",
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
|
py_test(
|
||||||
|
name = "test_ingest_color_votes",
|
||||||
|
srcs = ["jobs/ingest_color_votes/test.py"],
|
||||||
|
main = "jobs/ingest_color_votes/test.py",
|
||||||
|
deps = [
|
||||||
|
":job_src",
|
||||||
|
"//databuild/test/app:job_src",
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
|
py_test(
|
||||||
|
name = "test_aggregate_color_votes",
|
||||||
|
srcs = ["jobs/aggregate_color_votes/test.py"],
|
||||||
|
main = "jobs/aggregate_color_votes/test.py",
|
||||||
|
deps = [
|
||||||
|
":job_src",
|
||||||
|
"//databuild/test/app:job_src",
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
|
py_test(
|
||||||
|
name = "test_color_vote_report_calc",
|
||||||
|
srcs = ["jobs/color_vote_report_calc/test.py"],
|
||||||
|
main = "jobs/color_vote_report_calc/test.py",
|
||||||
|
deps = [
|
||||||
|
":job_src",
|
||||||
|
"//databuild/test/app:job_src",
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
|
py_test(
|
||||||
|
name = "test_graph_analysis",
|
||||||
|
srcs = ["graph/graph_test.py"],
|
||||||
|
data = [
|
||||||
|
":bazel_graph.analyze",
|
||||||
|
":bazel_graph_lookup",
|
||||||
|
],
|
||||||
|
main = "graph/graph_test.py",
|
||||||
|
deps = [
|
||||||
|
":job_src",
|
||||||
|
"//databuild/test/app:job_src",
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
|
# Bazel-defined
|
||||||
|
## Graph
|
||||||
|
databuild_graph(
|
||||||
|
name = "bazel_graph",
|
||||||
|
jobs = [
|
||||||
|
":ingest_color_votes",
|
||||||
|
":trailing_color_votes",
|
||||||
|
":aggregate_color_votes",
|
||||||
|
":color_vote_report_calc",
|
||||||
|
],
|
||||||
|
lookup = ":bazel_graph_lookup",
|
||||||
|
)
|
||||||
|
|
||||||
|
py_binary(
|
||||||
|
name = "bazel_graph_lookup",
|
||||||
|
srcs = ["graph/lookup.py"],
|
||||||
|
main = "graph/lookup.py",
|
||||||
|
)
|
||||||
|
|
||||||
|
## Ingest Color Votes
|
||||||
|
databuild_job(
|
||||||
|
name = "ingest_color_votes",
|
||||||
|
binary = ":ingest_color_votes_binary",
|
||||||
|
)
|
||||||
|
|
||||||
|
py_binary(
|
||||||
|
name = "ingest_color_votes_binary",
|
||||||
|
srcs = ["jobs/ingest_color_votes/main.py"],
|
||||||
|
main = "jobs/ingest_color_votes/main.py",
|
||||||
|
deps = [
|
||||||
|
":job_src",
|
||||||
|
"//databuild/test/app:job_src",
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
|
## Trailing Color Votes
|
||||||
|
databuild_job(
|
||||||
|
name = "trailing_color_votes",
|
||||||
|
binary = ":trailing_color_votes_binary",
|
||||||
|
)
|
||||||
|
|
||||||
|
py_binary(
|
||||||
|
name = "trailing_color_votes_binary",
|
||||||
|
srcs = ["jobs/trailing_color_votes/main.py"],
|
||||||
|
main = "jobs/trailing_color_votes/main.py",
|
||||||
|
deps = [
|
||||||
|
":job_src",
|
||||||
|
"//databuild/test/app:job_src",
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
|
## Aggregate Color Votes
|
||||||
|
databuild_job(
|
||||||
|
name = "aggregate_color_votes",
|
||||||
|
binary = ":aggregate_color_votes_binary",
|
||||||
|
)
|
||||||
|
|
||||||
|
py_binary(
|
||||||
|
name = "aggregate_color_votes_binary",
|
||||||
|
srcs = ["jobs/aggregate_color_votes/main.py"],
|
||||||
|
main = "jobs/aggregate_color_votes/main.py",
|
||||||
|
deps = [
|
||||||
|
":job_src",
|
||||||
|
"//databuild/test/app:job_src",
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
|
## Color Vote Report Calc
|
||||||
|
databuild_job(
|
||||||
|
name = "color_vote_report_calc",
|
||||||
|
binary = ":color_vote_report_calc_binary",
|
||||||
|
)
|
||||||
|
|
||||||
|
py_binary(
|
||||||
|
name = "color_vote_report_calc_binary",
|
||||||
|
srcs = ["jobs/color_vote_report_calc/main.py"],
|
||||||
|
main = "jobs/color_vote_report_calc/main.py",
|
||||||
|
deps = [
|
||||||
|
":job_src",
|
||||||
|
"//databuild/test/app:job_src",
|
||||||
|
],
|
||||||
|
)
|
||||||
4
databuild/test/app/bazel/README.md
Normal file
4
databuild/test/app/bazel/README.md
Normal file
|
|
@ -0,0 +1,4 @@
|
||||||
|
|
||||||
|
# Bazel-Based Graph Definition
|
||||||
|
|
||||||
|
The bazel-based graph definition relies on declaring `databuild_job` and `databuild_graph` targets which reference binaries.
|
||||||
0
databuild/test/app/bazel/graph/test.py
Normal file
0
databuild/test/app/bazel/graph/test.py
Normal file
1
databuild/test/app/bazel/jobs/aggregate_color_votes/README.md
Symbolic link
1
databuild/test/app/bazel/jobs/aggregate_color_votes/README.md
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
jobs/aggregate_color_votes/README.md
|
||||||
|
|
@ -4,10 +4,9 @@ import sys
|
||||||
import os
|
import os
|
||||||
import json
|
import json
|
||||||
from databuild.proto import PartitionRef, to_dict
|
from databuild.proto import PartitionRef, to_dict
|
||||||
from databuild.test.app.jobs.aggregate_color_votes.config import configure
|
from databuild.test.app.bazel.jobs.aggregate_color_votes.config import configure
|
||||||
from databuild.test.app.jobs.aggregate_color_votes.execute import execute
|
from databuild.test.app.jobs.aggregate_color_votes.execute import execute
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
if sys.argv[1] == "config":
|
if sys.argv[1] == "config":
|
||||||
response = configure([
|
response = configure([
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
import unittest
|
import unittest
|
||||||
from databuild.proto import PartitionRef
|
from databuild.proto import PartitionRef
|
||||||
from databuild.test.app.jobs.aggregate_color_votes.config import configure
|
from databuild.test.app.bazel.jobs.aggregate_color_votes.config import configure
|
||||||
from databuild.test.app.colors import COLORS
|
from databuild.test.app.colors import COLORS
|
||||||
|
|
||||||
class TestAggregateColorVotesConfig(unittest.TestCase):
|
class TestAggregateColorVotesConfig(unittest.TestCase):
|
||||||
1
databuild/test/app/bazel/jobs/color_vote_report_calc/README.md
Symbolic link
1
databuild/test/app/bazel/jobs/color_vote_report_calc/README.md
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
jobs/color_vote_report_calc/README.md
|
||||||
|
|
@ -4,9 +4,8 @@ import sys
|
||||||
import os
|
import os
|
||||||
import json
|
import json
|
||||||
from databuild.proto import PartitionRef, to_dict
|
from databuild.proto import PartitionRef, to_dict
|
||||||
from databuild.test.app.jobs.color_vote_report_calc.config import configure
|
from databuild.test.app.bazel.jobs.color_vote_report_calc.config import configure
|
||||||
from databuild.test.app.jobs.color_vote_report_calc.execute import execute
|
from databuild.test.app.jobs.color_vote_report_calc.execute import execute
|
||||||
from betterproto2 import Casing, OutputFormat
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
if sys.argv[1] == "config":
|
if sys.argv[1] == "config":
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
import unittest
|
import unittest
|
||||||
from databuild.proto import PartitionRef
|
from databuild.proto import PartitionRef
|
||||||
from databuild.test.app.jobs.color_vote_report_calc.config import configure
|
from databuild.test.app.bazel.jobs.color_vote_report_calc.config import configure
|
||||||
|
|
||||||
class TestColorVoteReportCalcConfig(unittest.TestCase):
|
class TestColorVoteReportCalcConfig(unittest.TestCase):
|
||||||
def test_configure_single_output(self):
|
def test_configure_single_output(self):
|
||||||
1
databuild/test/app/bazel/jobs/ingest_color_votes/README.md
Symbolic link
1
databuild/test/app/bazel/jobs/ingest_color_votes/README.md
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
jobs/ingest_color_votes/README.md
|
||||||
|
|
@ -4,9 +4,8 @@ import sys
|
||||||
import os
|
import os
|
||||||
import json
|
import json
|
||||||
from databuild.proto import PartitionRef, to_dict
|
from databuild.proto import PartitionRef, to_dict
|
||||||
from databuild.test.app.jobs.ingest_color_votes.config import configure
|
from databuild.test.app.bazel.jobs.ingest_color_votes.config import configure
|
||||||
from databuild.test.app.jobs.ingest_color_votes.execute import execute
|
from databuild.test.app.jobs.ingest_color_votes.execute import execute
|
||||||
from betterproto2 import Casing
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
if sys.argv[1] == "config":
|
if sys.argv[1] == "config":
|
||||||
32
databuild/test/app/bazel/jobs/ingest_color_votes/test.py
Normal file
32
databuild/test/app/bazel/jobs/ingest_color_votes/test.py
Normal file
|
|
@ -0,0 +1,32 @@
|
||||||
|
from databuild.test.app.bazel.jobs.ingest_color_votes.config import configure
|
||||||
|
from databuild.proto import PartitionRef
|
||||||
|
|
||||||
|
|
||||||
|
def test_ingest_color_votes_configure():
    """Check `configure` for one requested partition and for two partitions on the same date."""
    # Single output: exactly one config carrying the ref and its env values.
    single = configure([PartitionRef(str="daily_color_votes/2025-01-01/red")])
    assert len(single.configs) == 1
    only = single.configs[0]
    assert only.outputs[0].str == "daily_color_votes/2025-01-01/red"
    assert only.env["COLOR"] == "red"
    assert only.env["DATA_DATE"] == "2025-01-01"

    # Two outputs: one config per ref, in request order.
    requested = [
        PartitionRef(str="daily_color_votes/2025-01-02/red"),
        PartitionRef(str="daily_color_votes/2025-01-02/blue"),
    ]
    multiple = configure(requested)
    assert len(multiple.configs) == 2

    expectations = [
        (0, "daily_color_votes/2025-01-02/red", "red"),
        (1, "daily_color_votes/2025-01-02/blue", "blue"),
    ]
    for index, ref, color in expectations:
        cfg = multiple.configs[index]
        assert len(cfg.outputs) == 1
        assert cfg.outputs[0].str == ref
        assert cfg.env["COLOR"] == color
        assert cfg.env["DATA_DATE"] == "2025-01-02"
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
import pytest
|
||||||
|
raise SystemExit(pytest.main([__file__]))
|
||||||
1
databuild/test/app/bazel/jobs/trailing_color_votes/README.md
Symbolic link
1
databuild/test/app/bazel/jobs/trailing_color_votes/README.md
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
jobs/trailing_color_votes/README.md
|
||||||
|
|
@ -9,9 +9,7 @@ def configure(outputs: list[PartitionRef]) -> JobConfigureResponse:
|
||||||
for output in outputs:
|
for output in outputs:
|
||||||
parts = output.str.split("/")
|
parts = output.str.split("/")
|
||||||
if len(parts) == 3 and parts[0] in ["color_votes_1w", "color_votes_1m"]:
|
if len(parts) == 3 and parts[0] in ["color_votes_1w", "color_votes_1m"]:
|
||||||
prefix, data_date, color = parts
|
grouped_outputs[tuple(parts[1:])].append(output)
|
||||||
key = (data_date, color)
|
|
||||||
grouped_outputs[key].append(output)
|
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Invalid output partition format: {output.str}")
|
raise ValueError(f"Invalid output partition format: {output.str}")
|
||||||
|
|
||||||
|
|
@ -29,8 +27,7 @@ def configure(outputs: list[PartitionRef]) -> JobConfigureResponse:
|
||||||
inputs = []
|
inputs = []
|
||||||
for i in range(max_window):
|
for i in range(max_window):
|
||||||
input_date = output_date - timedelta(days=i)
|
input_date = output_date - timedelta(days=i)
|
||||||
input_ref = PartitionRef(str=f"daily_color_votes/{input_date.isoformat()}/{color}")
|
inputs.append(PartitionRef(str=f"daily_color_votes/{input_date.isoformat()}/{color}"))
|
||||||
inputs.append(input_ref)
|
|
||||||
|
|
||||||
env = {
|
env = {
|
||||||
"DATA_DATE": data_date,
|
"DATA_DATE": data_date,
|
||||||
|
|
@ -4,9 +4,8 @@ import sys
|
||||||
import os
|
import os
|
||||||
import json
|
import json
|
||||||
from databuild.proto import PartitionRef, to_dict
|
from databuild.proto import PartitionRef, to_dict
|
||||||
from databuild.test.app.jobs.trailing_color_votes.config import configure
|
from databuild.test.app.bazel.jobs.trailing_color_votes.config import configure
|
||||||
from databuild.test.app.jobs.trailing_color_votes.execute import execute
|
from databuild.test.app.jobs.trailing_color_votes.execute import execute
|
||||||
from betterproto2 import Casing, OutputFormat
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
if sys.argv[1] == "config":
|
if sys.argv[1] == "config":
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
import unittest
|
import unittest
|
||||||
from databuild.proto import PartitionRef
|
from databuild.proto import PartitionRef
|
||||||
from databuild.test.app.jobs.trailing_color_votes.config import configure
|
from databuild.test.app.bazel.jobs.trailing_color_votes.config import configure
|
||||||
|
|
||||||
class TestTrailingColorVotesConfig(unittest.TestCase):
|
class TestTrailingColorVotesConfig(unittest.TestCase):
|
||||||
def test_configure_weekly_only(self):
|
def test_configure_weekly_only(self):
|
||||||
0
databuild/test/app/dsl/BUILD.bazel
Normal file
0
databuild/test/app/dsl/BUILD.bazel
Normal file
130
databuild/test/app/dsl/graph.py
Normal file
130
databuild/test/app/dsl/graph.py
Normal file
|
|
@ -0,0 +1,130 @@
|
||||||
|
"""Python DSL implementation of test app"""
|
||||||
|
|
||||||
|
from collections import defaultdict
|
||||||
|
from databuild.dsl.python.dsl import DataBuildGraph, DataBuildJob, JobConfigBuilder
|
||||||
|
from databuild.proto import JobConfig
|
||||||
|
from databuild.test.app.colors import COLORS
|
||||||
|
from databuild.test.app.jobs.ingest_color_votes.execute import execute as ingest_color_votes_exec
|
||||||
|
from databuild.test.app.jobs.trailing_color_votes.execute import execute as trailing_color_votes_exec
|
||||||
|
from databuild.test.app.jobs.aggregate_color_votes.execute import execute as aggregate_color_votes_exec
|
||||||
|
from databuild.test.app.jobs.color_vote_report_calc.execute import execute as color_vote_report_calc_exec
|
||||||
|
from databuild.test.app.dsl.partitions import (
|
||||||
|
IngestedColorPartition,
|
||||||
|
TrailingColorVotes1MPartition,
|
||||||
|
TrailingColorVotes1WPartition,
|
||||||
|
DailyVotesPartition,
|
||||||
|
Votes1WPartition,
|
||||||
|
Votes1MPartition,
|
||||||
|
ColorVoteReportPartition
|
||||||
|
)
|
||||||
|
from datetime import date, timedelta
|
||||||
|
|
||||||
|
graph = DataBuildGraph("//databuild/test/app:dsl_graph")
|
||||||
|
|
||||||
|
|
||||||
|
@graph.job
class IngestColorVotes(DataBuildJob):
    """Job producing `IngestedColorPartition` outputs, one job config per requested partition."""

    output_types = [IngestedColorPartition]

    def config(self, outputs: list[IngestedColorPartition]) -> list[JobConfig]:
        """Return one `JobConfig` per output, with DATA_DATE and COLOR taken from that output."""
        return [
            JobConfigBuilder()
            .add_outputs(partition)
            .set_env({"DATA_DATE": partition.data_date, "COLOR": partition.color})
            .build()
            for partition in outputs
        ]

    def exec(self, config: JobConfig) -> None:
        """Run the ingest implementation with the date and color stored in the config's env."""
        job_env = config.env
        ingest_color_votes_exec(data_date=job_env["DATA_DATE"], color=job_env["COLOR"])
|
||||||
|
|
||||||
|
|
||||||
|
@graph.job
class TrailingColorVotes(DataBuildJob):
    """Job producing the trailing 1-week and 1-month color vote partitions.

    Requested outputs sharing the same (data_date, color) are served by a
    single job config.
    """

    output_types = [TrailingColorVotes1MPartition, TrailingColorVotes1WPartition]

    def config(self, outputs: list[TrailingColorVotes1MPartition | TrailingColorVotes1WPartition]) -> list[JobConfig]:
        """Return one `JobConfig` per distinct (data_date, color) among `outputs`.

        Each config sets WEEKLY / MONTHLY to "true" or "false" depending on
        which output types were requested for that key. It depends on the
        ingested partitions for the trailing window: 7 days if only the weekly
        output is requested, 28 days if the monthly output is requested.
        """
        groups = defaultdict(list)
        for output in outputs:
            groups[(output.data_date, output.color)].append(output)

        configs = []
        for (data_date, color), group in groups.items():
            wants_weekly = any(isinstance(o, TrailingColorVotes1WPartition) for o in group)
            wants_monthly = any(isinstance(o, TrailingColorVotes1MPartition) for o in group)
            # The longest requested window decides how many daily inputs are needed.
            max_window = 28 if wants_monthly else 7 if wants_weekly else 0

            env = {
                "DATA_DATE": data_date,
                "COLOR": color,
                "WEEKLY": "true" if wants_weekly else "false",
                "MONTHLY": "true" if wants_monthly else "false",
            }
            # Go through add_outputs so the partitions are serialized into
            # PartitionRef objects, the same way inputs are.
            builder = JobConfigBuilder(env=env).add_outputs(*group)

            end_date = date.fromisoformat(data_date)
            for days_back in range(max_window):
                in_date = (end_date - timedelta(days=days_back)).isoformat()
                builder.add_inputs(IngestedColorPartition(data_date=in_date, color=color))

            configs.append(builder.build())
        return configs

    def exec(self, config: JobConfig) -> None:
        """Run the trailing-votes implementation with the config's DATA_DATE and COLOR."""
        # NOTE(review): WEEKLY / MONTHLY from the env are not forwarded here —
        # confirm the execute function obtains them some other way.
        trailing_color_votes_exec(data_date=config.env["DATA_DATE"], color=config.env["COLOR"])
|
||||||
|
|
||||||
|
|
||||||
|
@graph.job
class AggregateColorVotes(DataBuildJob):
    """Job producing the per-date vote aggregates (daily, 1-week, 1-month)."""

    output_types = [DailyVotesPartition, Votes1WPartition, Votes1MPartition]

    def config(self, outputs: list[DailyVotesPartition | Votes1WPartition | Votes1MPartition]) -> list[JobConfig]:
        """Return one `JobConfig` per requested output.

        Each config depends on the matching per-color partition for every
        color in `COLORS` on the output's date, and sets DATA_DATE and
        AGGREGATE_TYPE in its env.

        Raises:
            ValueError: if an output is not one of the supported partition types.
        """
        # (output type, per-color input type, AGGREGATE_TYPE value), checked in order.
        sources = (
            (DailyVotesPartition, IngestedColorPartition, "daily_votes"),
            (Votes1WPartition, TrailingColorVotes1WPartition, "votes_1w"),
            (Votes1MPartition, TrailingColorVotes1MPartition, "votes_1m"),
        )

        configs = []
        for output in outputs:
            for out_type, in_type, agg_type in sources:
                if isinstance(output, out_type):
                    break
            else:
                # Report the class name; the partition classes visible here
                # declare no `type` attribute to read.
                raise ValueError(f"Unknown output type: {type(output).__name__}")

            inputs = [in_type(data_date=output.data_date, color=color) for color in COLORS]
            env = {"DATA_DATE": output.data_date, "AGGREGATE_TYPE": agg_type}
            configs.append(JobConfigBuilder().add_outputs(output).add_inputs(*inputs).set_env(env).build())

        return configs

    def exec(self, config: JobConfig) -> None:
        """Run the aggregation implementation with the config's DATA_DATE and AGGREGATE_TYPE."""
        aggregate_color_votes_exec(data_date=config.env["DATA_DATE"], aggregate_type=config.env["AGGREGATE_TYPE"])
|
||||||
|
|
||||||
|
|
||||||
|
@graph.job
class ColorVoteReportCalc(DataBuildJob):
    """Job producing `ColorVoteReportPartition` outputs; all requested outputs share one job config."""

    output_types = [ColorVoteReportPartition]

    def config(self, outputs: list[ColorVoteReportPartition]) -> list[JobConfig]:
        """Return a single `JobConfig` covering every requested report partition.

        The serialized output refs are passed as the job's args. Inputs are the
        three per-date aggregates for each distinct date, plus the three
        per-color partitions for each requested (date, color).
        """
        # Serialize patterns the same way JobConfigBuilder does for outputs and inputs.
        config = JobConfigBuilder().add_outputs(*outputs).add_args(*[p.serialize() for p in outputs])

        # Sort the distinct dates so the generated input order is the same on every run.
        for data_date in sorted({p.data_date for p in outputs}):
            config.add_inputs(
                DailyVotesPartition(data_date=data_date),
                Votes1WPartition(data_date=data_date),
                Votes1MPartition(data_date=data_date),
            )

        for output in outputs:
            config.add_inputs(
                IngestedColorPartition(data_date=output.data_date, color=output.color),
                TrailingColorVotes1WPartition(data_date=output.data_date, color=output.color),
                TrailingColorVotes1MPartition(data_date=output.data_date, color=output.color),
            )

        return [config.build()]

    def exec(self, config: JobConfig) -> None:
        """Run the report implementation with the config's args."""
        color_vote_report_calc_exec(config.args)
|
||||||
|
|
||||||
40
databuild/test/app/dsl/partitions.py
Normal file
40
databuild/test/app/dsl/partitions.py
Normal file
|
|
@ -0,0 +1,40 @@
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from databuild.dsl.python.dsl import PartitionPattern
|
||||||
|
|
||||||
|
@dataclass
class DatePartitioned:
    """Field mixin for partitions identified by a date only."""

    # Date string; the patterns in this file match it as YYYY-MM-DD.
    data_date: str
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class DateColorPartitioned:
    """Field mixin for partitions identified by a date and a color."""

    # Date string; the patterns in this file match it as YYYY-MM-DD.
    data_date: str
    # Color name; the patterns in this file match it as one path segment without "/".
    color: str
|
||||||
|
|
||||||
|
|
||||||
|
class IngestedColorPartition(DateColorPartitioned, PartitionPattern):
    """Partition whose reference has the form `daily_color_votes/<data_date>/<color>`."""

    # Named groups correspond to the dataclass fields `data_date` and `color`.
    _raw_pattern = r"daily_color_votes/(?P<data_date>\d{4}-\d{2}-\d{2})/(?P<color>[^/]+)"
|
||||||
|
|
||||||
|
|
||||||
|
class TrailingColorVotes1WPartition(DateColorPartitioned, PartitionPattern):
    """Partition whose reference has the form `color_votes_1w/<data_date>/<color>`."""

    # Named groups correspond to the dataclass fields `data_date` and `color`.
    _raw_pattern = r"color_votes_1w/(?P<data_date>\d{4}-\d{2}-\d{2})/(?P<color>[^/]+)"
|
||||||
|
|
||||||
|
|
||||||
|
class TrailingColorVotes1MPartition(DateColorPartitioned, PartitionPattern):
    """Partition whose reference has the form `color_votes_1m/<data_date>/<color>`."""

    # Named groups correspond to the dataclass fields `data_date` and `color`.
    _raw_pattern = r"color_votes_1m/(?P<data_date>\d{4}-\d{2}-\d{2})/(?P<color>[^/]+)"
|
||||||
|
|
||||||
|
|
||||||
|
class DailyVotesPartition(DatePartitioned, PartitionPattern):
    """Partition whose reference has the form `daily_votes/<data_date>`."""

    # The named group corresponds to the dataclass field `data_date`.
    _raw_pattern = r"daily_votes/(?P<data_date>\d{4}-\d{2}-\d{2})"
|
||||||
|
|
||||||
|
|
||||||
|
class Votes1WPartition(DatePartitioned, PartitionPattern):
    """Partition whose reference has the form `votes_1w/<data_date>`."""

    # The named group corresponds to the dataclass field `data_date`.
    _raw_pattern = r"votes_1w/(?P<data_date>\d{4}-\d{2}-\d{2})"
|
||||||
|
|
||||||
|
|
||||||
|
class Votes1MPartition(DatePartitioned, PartitionPattern):
    """Partition whose reference has the form `votes_1m/<data_date>`."""

    # The named group corresponds to the dataclass field `data_date`.
    _raw_pattern = r"votes_1m/(?P<data_date>\d{4}-\d{2}-\d{2})"
|
||||||
|
|
||||||
|
|
||||||
|
# Partition pattern for refs shaped like color_vote_report/<data_date>/<color>.
# Fields come from DateColorPartitioned.
class ColorVoteReportPartition(DateColorPartitioned, PartitionPattern):
|
||||||
|
# The named groups data_date and color match the dataclass field names.
_raw_pattern = r"color_vote_report/(?P<data_date>\d{4}-\d{2}-\d{2})/(?P<color>[^/]+)"
|
||||||
|
|
@ -1,34 +1,8 @@
|
||||||
from databuild.test.app.jobs.ingest_color_votes.config import configure
|
|
||||||
from databuild.test.app.jobs.ingest_color_votes.execute import execute
|
from databuild.test.app.jobs.ingest_color_votes.execute import execute
|
||||||
from databuild.test.app import dal
|
from databuild.test.app import dal
|
||||||
from databuild.proto import PartitionRef
|
from databuild.proto import PartitionRef
|
||||||
|
|
||||||
|
|
||||||
# Checks configure() for the ingest_color_votes job: one requested partition
# ref yields one config, two refs yield two configs in request order, and each
# config carries the ref as its only output plus COLOR / DATA_DATE env values
# taken from the ref string.
def test_ingest_color_votes_configure():
|
|
||||||
# Case 1: a single requested partition.
refs_single = [PartitionRef(str="daily_color_votes/2025-01-01/red")]
|
|
||||||
config_single = configure(refs_single)
|
|
||||||
assert len(config_single.configs) == 1
|
|
||||||
assert config_single.configs[0].outputs[0].str == "daily_color_votes/2025-01-01/red"
|
|
||||||
assert config_single.configs[0].env["COLOR"] == "red"
|
|
||||||
assert config_single.configs[0].env["DATA_DATE"] == "2025-01-01"
|
|
||||||
|
|
||||||
# Case 2: two partitions for the same date but different colors.
refs_multiple = [
|
|
||||||
PartitionRef(str="daily_color_votes/2025-01-02/red"),
|
|
||||||
PartitionRef(str="daily_color_votes/2025-01-02/blue"),
|
|
||||||
]
|
|
||||||
|
|
||||||
config_multiple = configure(refs_multiple)
|
|
||||||
# One config per requested ref, in the same order as the request.
assert len(config_multiple.configs) == 2
|
|
||||||
assert len(config_multiple.configs[0].outputs) == 1
|
|
||||||
assert config_multiple.configs[0].outputs[0].str == "daily_color_votes/2025-01-02/red"
|
|
||||||
assert config_multiple.configs[0].env["COLOR"] == "red"
|
|
||||||
assert config_multiple.configs[0].env["DATA_DATE"] == "2025-01-02"
|
|
||||||
assert len(config_multiple.configs[1].outputs) == 1
|
|
||||||
assert config_multiple.configs[1].outputs[0].str == "daily_color_votes/2025-01-02/blue"
|
|
||||||
assert config_multiple.configs[1].env["COLOR"] == "blue"
|
|
||||||
assert config_multiple.configs[1].env["DATA_DATE"] == "2025-01-02"
|
|
||||||
|
|
||||||
|
|
||||||
def test_ingest_color_votes():
|
def test_ingest_color_votes():
|
||||||
execute("2025-01-01", "red")
|
execute("2025-01-01", "red")
|
||||||
results = dal.read(PartitionRef(str="daily_color_votes/2025-01-01/red"))
|
results = dal.read(PartitionRef(str="daily_color_votes/2025-01-01/red"))
|
||||||
|
|
|
||||||
|
|
@ -1,10 +1,8 @@
|
||||||
|
|
||||||
|
- Implement Python DSL
|
||||||
|
- Achieve fast configuration (betterproto2 imports are suspect)
|
||||||
- Remove manual reference of enum values, e.g. [here](../databuild/repositories/builds/mod.rs:85)
|
- Remove manual reference of enum values, e.g. [here](../databuild/repositories/builds/mod.rs:85)
|
||||||
- Type-safe mithril [claude link](https://claude.ai/share/f33f8605-472a-4db4-9211-5a1e52087316)
|
|
||||||
- Status indicator for page selection
|
|
||||||
- On build request detail page, show aggregated job results
|
- On build request detail page, show aggregated job results
|
||||||
- Use path based navigation instead of hashbang?
|
|
||||||
- Add build request notes
|
|
||||||
- How do we encode job labels in the path? (Build event job links are not encoding job labels properly)
|
- How do we encode job labels in the path? (Build event job links are not encoding job labels properly)
|
||||||
- Resolve double type system with protobuf and openapi
|
- Resolve double type system with protobuf and openapi
|
||||||
- Plan for external worker dispatch (e.g. k8s pod per build, or launch in container service)
|
- Plan for external worker dispatch (e.g. k8s pod per build, or launch in container service)
|
||||||
|
|
@ -12,3 +10,6 @@
|
||||||
- Should we have meaningful exit codes? E.g. "retry-able error", etc?
|
- Should we have meaningful exit codes? E.g. "retry-able error", etc?
|
||||||
- Fully joinable build/job IDs - ensure all execution logs / metrics are joinable to build request ID?
|
- Fully joinable build/job IDs - ensure all execution logs / metrics are joinable to build request ID?
|
||||||
- Triggers?
|
- Triggers?
|
||||||
|
- Add build request notes
|
||||||
|
- Status indicator for page selection
|
||||||
|
- Use path based navigation instead of hashbang?
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue