lay groundwork for python dsl
Some checks are pending
/ setup (push) Waiting to run

This commit is contained in:
Stuart Axelbrooke 2025-07-30 07:01:46 -07:00
parent 1dfa45d94b
commit 6f2408a3ee
10 changed files with 278 additions and 43 deletions

View file

@ -1,3 +1,5 @@
# Python Deps
load("@rules_python//python:pip.bzl", "compile_pip_requirements")
filegroup(
name = "jq",
@ -17,3 +19,10 @@ sh_binary(
],
visibility = ["//visibility:public"],
)
# Regenerate requirements_lock.txt from requirements.in with:
#   bazel run //:requirements.update
compile_pip_requirements(
    name = "requirements",
    src = "requirements.in",
    requirements_txt = "requirements_lock.txt",
)

View file

@ -144,15 +144,12 @@ use_repo(crate, "crates")
# TypeScript and Node.js dependencies for dashboard
bazel_dep(name = "aspect_rules_ts", version = "3.6.3")
bazel_dep(name = "aspect_rules_js", version = "2.0.0")
rules_ts_ext = use_extension("@aspect_rules_ts//ts:extensions.bzl", "ext")
rules_ts_ext.deps(ts_version_from = "//databuild/dashboard:package.json")
use_repo(rules_ts_ext, "npm_typescript")
#bazel_dep(name = "aspect_rules_ts", version = "3.4.0")
#bazel_dep(name = "aspect_rules_js", version = "2.1.3")
bazel_dep(name = "aspect_rules_esbuild", version = "0.21.0")
npm = use_extension("@aspect_rules_js//npm:extensions.bzl", "npm")
@ -180,12 +177,6 @@ npm.npm_import(
version = "2.2.7",
)
use_repo(npm, "mithril", "types_mithril")
#npm.npm_import(
# name = "npm_typescript",
# package = "typescript",
# version = "5.8.3",
#)
#use_repo(npm, "mithril", "npm_typescript", "types_mithril")
# Tailwind
http_file = use_repo_rule("@bazel_tools//tools/build_defs/repo:http.bzl", "http_file")
@ -208,33 +199,18 @@ http_file(
],
)
#http_archive(
# name = "aspect_rules_esbuild",
# sha256 = "550e33ddeb86a564b22b2c5d3f84748c6639b1b2b71fae66bf362c33392cbed8",
# strip_prefix = "rules_esbuild-0.21.0",
# url = "https://github.com/aspect-build/rules_esbuild/releases/download/v0.21.0/rules_esbuild-v0.21.0.tar.gz",
#)
#
#######################
## rules_esbuild setup #
#######################
#
## Fetches the rules_esbuild dependencies.
## If you want to have a different version of some dependency,
## you should fetch it *before* calling this.
## Alternatively, you can skip calling this function, so long as you've
## already fetched all the dependencies.
#load("@aspect_rules_esbuild//esbuild:dependencies.bzl", "rules_esbuild_dependencies")
#
#rules_esbuild_dependencies()
#
#rules_js_register_toolchains(node_version = DEFAULT_NODE_VERSION)
#
## Register a toolchain containing esbuild npm package and native bindings
#load("@aspect_rules_esbuild//esbuild:repositories.bzl", "LATEST_ESBUILD_VERSION", "esbuild_register_toolchains")
#
#esbuild_register_toolchains(
# name = "esbuild",
# esbuild_version = LATEST_ESBUILD_VERSION,
#)
#
# Python: rules_python, the interpreter toolchain, and the PyPI hub repo.
bazel_dep(name = "rules_python", version = "1.5.1")

python = use_extension("@rules_python//python/extensions:python.bzl", "python")
python.toolchain(
    python_version = "3.13",
)

# Third-party packages are resolved from the pinned lock file and exposed
# through the `@pypi` hub repository.
pip = use_extension("@rules_python//python/extensions:pip.bzl", "pip")
pip.parse(
    hub_name = "pypi",
    python_version = "3.13",
    requirements_lock = "//:requirements_lock.txt",
)
use_repo(pip, "pypi")

View file

@ -72,6 +72,7 @@
"https://bcr.bazel.build/modules/protobuf/27.0/MODULE.bazel": "7873b60be88844a0a1d8f80b9d5d20cfbd8495a689b8763e76c6372998d3f64c",
"https://bcr.bazel.build/modules/protobuf/27.1/MODULE.bazel": "703a7b614728bb06647f965264967a8ef1c39e09e8f167b3ca0bb1fd80449c0d",
"https://bcr.bazel.build/modules/protobuf/29.0-rc2/MODULE.bazel": "6241d35983510143049943fc0d57937937122baf1b287862f9dc8590fc4c37df",
"https://bcr.bazel.build/modules/protobuf/29.0-rc3/MODULE.bazel": "33c2dfa286578573afc55a7acaea3cada4122b9631007c594bf0729f41c8de92",
"https://bcr.bazel.build/modules/protobuf/29.0/MODULE.bazel": "319dc8bf4c679ff87e71b1ccfb5a6e90a6dbc4693501d471f48662ac46d04e4e",
"https://bcr.bazel.build/modules/protobuf/29.0/source.json": "b857f93c796750eef95f0d61ee378f3420d00ee1dd38627b27193aa482f4f981",
"https://bcr.bazel.build/modules/protobuf/3.19.0/MODULE.bazel": "6b5fbb433f760a99a22b18b6850ed5784ef0e9928a72668b66e4d7ccd47db9b0",
@ -109,6 +110,8 @@
"https://bcr.bazel.build/modules/rules_java/7.6.1/MODULE.bazel": "2f14b7e8a1aa2f67ae92bc69d1ec0fa8d9f827c4e17ff5e5f02e91caa3b2d0fe",
"https://bcr.bazel.build/modules/rules_java/8.12.0/MODULE.bazel": "8e6590b961f2defdfc2811c089c75716cb2f06c8a4edeb9a8d85eaa64ee2a761",
"https://bcr.bazel.build/modules/rules_java/8.12.0/source.json": "cbd5d55d9d38d4008a7d00bee5b5a5a4b6031fcd4a56515c9accbcd42c7be2ba",
"https://bcr.bazel.build/modules/rules_java/8.3.2/MODULE.bazel": "7336d5511ad5af0b8615fdc7477535a2e4e723a357b6713af439fe8cf0195017",
"https://bcr.bazel.build/modules/rules_java/8.5.1/MODULE.bazel": "d8a9e38cc5228881f7055a6079f6f7821a073df3744d441978e7a43e20226939",
"https://bcr.bazel.build/modules/rules_jvm_external/4.4.2/MODULE.bazel": "a56b85e418c83eb1839819f0b515c431010160383306d13ec21959ac412d2fe7",
"https://bcr.bazel.build/modules/rules_jvm_external/5.1/MODULE.bazel": "33f6f999e03183f7d088c9be518a63467dfd0be94a11d0055fe2d210f89aa909",
"https://bcr.bazel.build/modules/rules_jvm_external/5.2/MODULE.bazel": "d9351ba35217ad0de03816ef3ed63f89d411349353077348a45348b096615036",
@ -143,7 +146,8 @@
"https://bcr.bazel.build/modules/rules_python/0.31.0/MODULE.bazel": "93a43dc47ee570e6ec9f5779b2e64c1476a6ce921c48cc9a1678a91dd5f8fd58",
"https://bcr.bazel.build/modules/rules_python/0.4.0/MODULE.bazel": "9208ee05fd48bf09ac60ed269791cf17fb343db56c8226a720fbb1cdf467166c",
"https://bcr.bazel.build/modules/rules_python/0.40.0/MODULE.bazel": "9d1a3cd88ed7d8e39583d9ffe56ae8a244f67783ae89b60caafc9f5cf318ada7",
"https://bcr.bazel.build/modules/rules_python/0.40.0/source.json": "939d4bd2e3110f27bfb360292986bb79fd8dcefb874358ccd6cdaa7bda029320",
"https://bcr.bazel.build/modules/rules_python/1.5.1/MODULE.bazel": "acfe65880942d44a69129d4c5c3122d57baaf3edf58ae5a6bd4edea114906bf5",
"https://bcr.bazel.build/modules/rules_python/1.5.1/source.json": "aa903e1bcbdfa1580f2b8e2d55100b7c18bc92d779ebb507fec896c75635f7bd",
"https://bcr.bazel.build/modules/rules_rust/0.61.0/MODULE.bazel": "0318a95777b9114c8740f34b60d6d68f9cfef61e2f4b52424ca626213d33787b",
"https://bcr.bazel.build/modules/rules_rust/0.61.0/source.json": "d1bc743b5fa2e2abb35c436df7126a53dab0c3f35890ae6841592b2253786a63",
"https://bcr.bazel.build/modules/rules_shell/0.2.0/MODULE.bazel": "fda8a652ab3c7d8fee214de05e7a9916d8b28082234e8d2c0094505c5268ed3c",
@ -157,7 +161,8 @@
"https://bcr.bazel.build/modules/stardoc/0.6.2/MODULE.bazel": "7060193196395f5dd668eda046ccbeacebfd98efc77fed418dbe2b82ffaa39fd",
"https://bcr.bazel.build/modules/stardoc/0.7.0/MODULE.bazel": "05e3d6d30c099b6770e97da986c53bd31844d7f13d41412480ea265ac9e8079c",
"https://bcr.bazel.build/modules/stardoc/0.7.1/MODULE.bazel": "3548faea4ee5dda5580f9af150e79d0f6aea934fc60c1cc50f4efdd9420759e7",
"https://bcr.bazel.build/modules/stardoc/0.7.1/source.json": "b6500ffcd7b48cd72c29bb67bcac781e12701cc0d6d55d266a652583cfcdab01",
"https://bcr.bazel.build/modules/stardoc/0.7.2/MODULE.bazel": "fc152419aa2ea0f51c29583fab1e8c99ddefd5b3778421845606ee628629e0e5",
"https://bcr.bazel.build/modules/stardoc/0.7.2/source.json": "58b029e5e901d6802967754adf0a9056747e8176f017cfe3607c0851f4d42216",
"https://bcr.bazel.build/modules/upb/0.0.0-20220923-a547704/MODULE.bazel": "7298990c00040a0e2f121f6c32544bab27d4452f80d9ce51349b1a28f3005c43",
"https://bcr.bazel.build/modules/zlib/1.2.11/MODULE.bazel": "07b389abc85fdbca459b69e2ec656ae5622873af3f845e1c9d80fe179f3effa0",
"https://bcr.bazel.build/modules/zlib/1.3.1.bcr.5/MODULE.bazel": "eec517b5bbe5492629466e11dae908d043364302283de25581e3eb944326c4ca",
@ -716,6 +721,47 @@
]
}
},
"@@rules_python+//python/uv:uv.bzl%uv": {
"general": {
"bzlTransitiveDigest": "bGHlxez0Lkvq2VwrlfCLraKHiJIRHSIJb432X2+pky8=",
"usagesDigest": "WYhzIw9khRBy34H1GxV5+fI1yi07O90NmCXosPUdHWQ=",
"recordedFileInputs": {},
"recordedDirentsInputs": {},
"envVariables": {},
"generatedRepoSpecs": {
"uv": {
"repoRuleId": "@@rules_python+//python/uv/private:uv_toolchains_repo.bzl%uv_toolchains_repo",
"attributes": {
"toolchain_type": "'@@rules_python+//python/uv:uv_toolchain_type'",
"toolchain_names": [
"none"
],
"toolchain_implementations": {
"none": "'@@rules_python+//python:none'"
},
"toolchain_compatible_with": {
"none": [
"@platforms//:incompatible"
]
},
"toolchain_target_settings": {}
}
}
},
"recordedRepoMappingEntries": [
[
"rules_python+",
"bazel_tools",
"bazel_tools"
],
[
"rules_python+",
"platforms",
"platforms"
]
]
}
},
"@@rules_rust+//crate_universe:extensions.bzl%crate": {
"general": {
"bzlTransitiveDigest": "7WZJd6ddUExnqTm+/VXRBmv6c4MCs9hJn2U/AZqeQvA=",

View file

@ -0,0 +1,7 @@
# Load the rule explicitly from rules_python instead of relying on Bazel's
# legacy autoloading of the native py_* symbols.
load("@rules_python//python:defs.bzl", "py_library")

# Python DSL library; it currently has no dependencies.
py_library(
    name = "dsl",
    srcs = ["dsl.py"],
    visibility = ["//visibility:public"],
    deps = [],
)

View file

@ -0,0 +1,91 @@
from typing import Self, Protocol, get_type_hints, get_origin, get_args
from dataclasses import fields, is_dataclass
import re
class PartitionPattern:
_raw_pattern: str
@property
def _pattern(self) -> re.Pattern:
return re.compile(self._raw_pattern)
def _validate_pattern(self):
"""Checks that both conditions are met:
1. All fields from the PartitionFields type are present in the pattern
2. All fields from the pattern are present in the PartitionFields type
"""
# TODO how do I get this to be called?
assert is_dataclass(self), "Should be a dataclass also (for partition fields)"
pattern_fields = set(self._pattern.groupindex.keys())
partition_fields = {field.name for field in fields(self)}
if pattern_fields != partition_fields:
raise ValueError(f"Pattern fields {pattern_fields} do not match partition fields {partition_fields}")
@classmethod
def deserialize(cls, raw_value: str) -> Self:
"""Parses a partition from a string based on the defined pattern."""
# Create a temporary instance to access the compiled pattern
# We need to compile the pattern to match against it
pattern = re.compile(cls._raw_pattern)
# Match the raw value against the pattern
match = pattern.match(raw_value)
if not match:
raise ValueError(f"String '{raw_value}' does not match pattern '{cls._pattern}'")
# Extract the field values from the match
field_values = match.groupdict()
# Create and return a new instance with the extracted values
return cls(**field_values)
def serialize(self) -> str:
"""Returns a string representation by filling in the pattern template with field values."""
# Start with the pattern
result = self._raw_pattern
# Replace each named group in the pattern with its corresponding field value
for field in fields(self):
# Find the named group pattern and replace it with the actual value
# We need to replace the regex pattern with the actual value
# Look for the pattern (?P<field_name>...) and replace with the field value
pattern_to_replace = rf'\(\?P<{field.name}>[^)]+\)'
actual_value = getattr(self, field.name)
result = re.sub(pattern_to_replace, actual_value, result)
return result
class JobConfig:
    """Placeholder for the job configuration type.

    TODO: generate this from databuild.proto.
    """
class PartitionManifest:
    """Placeholder for the partition manifest type.

    TODO: generate this from databuild.proto.
    """
class DataBuildJob(Protocol):
    """Structural interface for a job: declared output types plus config/exec hooks."""

    # Partition pattern classes that this job produces.
    output_types: list[type[PartitionPattern]]

    def config(self, outputs: list[PartitionPattern]) -> list[JobConfig]:
        """Return the job configs for the requested output partitions."""

    def exec(self, config: JobConfig) -> PartitionManifest:
        """Run the job for one config and return the resulting manifest."""
class DataBuildGraph:
    """Registry mapping partition pattern classes to the job class producing them.

    Attributes:
        label: The label string this graph was created with.
        lookup: Maps each registered partition class to its job class.
    """

    def __init__(self, label: str):
        self.label = label
        self.lookup: dict[type, type] = {}

    def job(self, cls: "type[DataBuildJob]") -> "type[DataBuildJob]":
        """Register a job with the graph.

        Can be applied as a class decorator; the class is returned unchanged
        so the decorated name stays bound to the class rather than ``None``.

        Args:
            cls: Job class whose ``output_types`` lists the partition classes
                it produces.

        Returns:
            ``cls`` itself.

        Raises:
            AssertionError: If any output type is already registered (or is
                listed twice by ``cls``). Nothing is registered in that case.
        """
        claimed = set()
        # Check every output type before touching the lookup so a collision
        # cannot leave the job half-registered.
        for partition in cls.output_types:
            if partition in self.lookup or partition in claimed:
                # Raised explicitly (not via `assert`) so it survives `python -O`.
                raise AssertionError(f"Partition `{partition}` already registered")
            claimed.add(partition)
        self.lookup.update(dict.fromkeys(cls.output_types, cls))
        return cls

    def generate_bazel_module(self):
        """Generates a complete databuild application, packaging up referenced jobs and this graph via bazel targets"""
        raise NotImplementedError

View file

@ -0,0 +1,8 @@
# Load the rule explicitly from rules_python instead of relying on Bazel's
# legacy autoloading of the native py_* symbols.
load("@rules_python//python:defs.bzl", "py_test")

# NOTE(review): py_test uses `<name>.py` as the entry point unless `main` is
# set; this presumably relies on a `dsl_test.py` being among the globbed
# sources. Confirm against the actual file name.
py_test(
    name = "dsl_test",
    srcs = glob(["*.py"]),
    deps = [
        "//databuild/dsl/python:dsl",
        "@pypi//pytest",
    ],
)

View file

@ -0,0 +1,73 @@
from databuild.dsl.python.dsl import PartitionPattern, DataBuildGraph, DataBuildJob, JobConfig, PartitionManifest
from dataclasses import dataclass
import pytest
@dataclass
class DateCategory:
    """Partition fields shared by the test partitions: a date and a category."""

    data_date: str
    category: str
class CategoryAnalysisPartition(DateCategory, PartitionPattern):
    """Well-formed partition: one named group per DateCategory field."""

    _raw_pattern = r"category_analysis/category=(?P<category>[^/]+)/date=(?P<data_date>\d{4}-\d{2}-\d{2})"
def test_basic_partition_pattern():
    """Serializing fills the pattern; deserializing recovers the fields."""
    built = CategoryAnalysisPartition(data_date="2025-01-01", category="comedy")
    assert built.serialize() == "category_analysis/category=comedy/date=2025-01-01"

    parsed = CategoryAnalysisPartition.deserialize(
        "category_analysis/category=technology/date=2025-01-02"
    )
    assert parsed.data_date == "2025-01-02"
    assert parsed.category == "technology"
class NotEnoughFieldsPartition(DateCategory, PartitionPattern):
    """Invalid on purpose: the pattern has no named groups for the partition fields."""

    _raw_pattern = r"invalid_partition_pattern"
class TooManyFieldsPartition(DateCategory, PartitionPattern):
    """Invalid on purpose: the pattern names an `hour` group that is not a partition field."""

    _raw_pattern = r"category_analysis/category=(?P<category>[^/]+)/date=(?P<data_date>\d{4}-\d{2}-\d{2})/hour=(?P<hour>\d{2})"
def test_invalid_partition_pattern():
    """Validation rejects patterns with missing or extra named groups."""
    for mismatched in (NotEnoughFieldsPartition, TooManyFieldsPartition):
        with pytest.raises(ValueError):
            mismatched(data_date="2025-01-01", category="comedy")._validate_pattern()
def test_basic_graph_definition():
    """Registering one job records exactly its output partition type."""
    graph = DataBuildGraph("//:test_graph")

    @graph.job
    class TestJob(DataBuildJob):
        output_types = [CategoryAnalysisPartition]

        def exec(self, config: JobConfig) -> PartitionManifest: ...

        def config(self, outputs: list[PartitionPattern]) -> list[JobConfig]: ...

    registered = graph.lookup
    assert len(registered) == 1
    assert CategoryAnalysisPartition in registered
def test_graph_collision():
    """A second job claiming an already-registered partition type is rejected."""
    graph = DataBuildGraph("//:test_graph")

    @graph.job
    class TestJob1(DataBuildJob):
        output_types = [CategoryAnalysisPartition]

        def exec(self, config: JobConfig) -> PartitionManifest: ...

        def config(self, outputs: list[PartitionPattern]) -> list[JobConfig]: ...

    # The decorator runs inside the `raises` block, so the collision is caught here.
    with pytest.raises(AssertionError):

        @graph.job
        class TestJob2(DataBuildJob):
            output_types = [CategoryAnalysisPartition]

            def exec(self, config: JobConfig) -> PartitionManifest: ...

            def config(self, outputs: list[PartitionPattern]) -> list[JobConfig]: ...
if __name__ == "__main__":
    # When executed as a script, run pytest on this file and exit with its status.
    exit_status = pytest.main([__file__])
    raise SystemExit(exit_status)

View file

@ -7,10 +7,8 @@
- Add build request notes
- How do we encode job labels in the path? (Build event job links are not encoding job labels properly)
- Resolve double type system with protobuf and openapi
- Prometheus metrics export
- Plan for external worker dispatch (e.g. k8s pod per build, or launch in container service)
- k8s can use [jobs](https://kubernetes.io/docs/concepts/workloads/controllers/job/)
- Should we have meaningful exit codes? E.g. "retry-able error", etc?
- Fully joinable build/job IDs - ensure all execution logs / metrics are joinable to build request ID?
- Triggers?
- How do we handle task logging?

1
requirements.in Normal file
View file

@ -0,0 +1 @@
pytest

26
requirements_lock.txt Normal file
View file

@ -0,0 +1,26 @@
#
# This file is autogenerated by pip-compile with Python 3.13
# by the following command:
#
# bazel run //:requirements.update
#
iniconfig==2.1.0 \
--hash=sha256:3abbd2e30b36733fee78f9c7f7308f2d0050e88f0087fd25c2645f63c773e1c7 \
--hash=sha256:9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760
# via pytest
packaging==25.0 \
--hash=sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484 \
--hash=sha256:d443872c98d677bf60f6a1f2f8c1cb748e8fe762d2bf9d3148b5599295b0fc4f
# via pytest
pluggy==1.6.0 \
--hash=sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3 \
--hash=sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746
# via pytest
pygments==2.19.2 \
--hash=sha256:636cb2477cec7f8952536970bc533bc43743542f70392ae026374600add5b887 \
--hash=sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b
# via pytest
pytest==8.4.1 \
--hash=sha256:539c70ba6fcead8e78eebbf1115e8b589e7565830d7d006a8723f19ac8a0afb7 \
--hash=sha256:7c67fd69174877359ed9371ec3af8a3d2b04741818c51e5e99cc1742251fa93c
# via -r requirements.in