From 6f2408a3ee83e35c254fb49e3dd8af842c372e42 Mon Sep 17 00:00:00 2001 From: Stuart Axelbrooke Date: Wed, 30 Jul 2025 07:01:46 -0700 Subject: [PATCH] lay groundwork for python dsl --- BUILD.bazel | 9 +++ MODULE.bazel | 54 +++++----------- MODULE.bazel.lock | 50 ++++++++++++++- databuild/dsl/python/BUILD.bazel | 7 +++ databuild/dsl/python/dsl.py | 91 +++++++++++++++++++++++++++ databuild/dsl/python/test/BUILD.bazel | 8 +++ databuild/dsl/python/test/dsl_test.py | 73 +++++++++++++++++++++ plans/todo.md | 2 - requirements.in | 1 + requirements_lock.txt | 26 ++++++++ 10 files changed, 278 insertions(+), 43 deletions(-) create mode 100644 databuild/dsl/python/BUILD.bazel create mode 100644 databuild/dsl/python/dsl.py create mode 100644 databuild/dsl/python/test/BUILD.bazel create mode 100644 databuild/dsl/python/test/dsl_test.py create mode 100644 requirements.in create mode 100644 requirements_lock.txt diff --git a/BUILD.bazel b/BUILD.bazel index ae32aa1..ff80e78 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -1,3 +1,5 @@ +# Python Deps +load("@rules_python//python:pip.bzl", "compile_pip_requirements") filegroup( name = "jq", @@ -17,3 +19,10 @@ sh_binary( ], visibility = ["//visibility:public"], ) + +# `bazel run //:requirements.update` will regenerate the requirements_txt file +compile_pip_requirements( + name = "requirements", + src = "requirements.in", + requirements_txt = "requirements_lock.txt", +) diff --git a/MODULE.bazel b/MODULE.bazel index 8df77a7..532f569 100644 --- a/MODULE.bazel +++ b/MODULE.bazel @@ -144,15 +144,12 @@ use_repo(crate, "crates") # TypeScript and Node.js dependencies for dashboard bazel_dep(name = "aspect_rules_ts", version = "3.6.3") - bazel_dep(name = "aspect_rules_js", version = "2.0.0") rules_ts_ext = use_extension("@aspect_rules_ts//ts:extensions.bzl", "ext") rules_ts_ext.deps(ts_version_from = "//databuild/dashboard:package.json") use_repo(rules_ts_ext, "npm_typescript") -#bazel_dep(name = "aspect_rules_ts", version = "3.4.0") 
-#bazel_dep(name = "aspect_rules_js", version = "2.1.3") bazel_dep(name = "aspect_rules_esbuild", version = "0.21.0") npm = use_extension("@aspect_rules_js//npm:extensions.bzl", "npm") @@ -180,12 +177,6 @@ npm.npm_import( version = "2.2.7", ) use_repo(npm, "mithril", "types_mithril") -#npm.npm_import( -# name = "npm_typescript", -# package = "typescript", -# version = "5.8.3", -#) -#use_repo(npm, "mithril", "npm_typescript", "types_mithril") # Tailwind http_file = use_repo_rule("@bazel_tools//tools/build_defs/repo:http.bzl", "http_file") @@ -208,33 +199,18 @@ http_file( ], ) -#http_archive( -# name = "aspect_rules_esbuild", -# sha256 = "550e33ddeb86a564b22b2c5d3f84748c6639b1b2b71fae66bf362c33392cbed8", -# strip_prefix = "rules_esbuild-0.21.0", -# url = "https://github.com/aspect-build/rules_esbuild/releases/download/v0.21.0/rules_esbuild-v0.21.0.tar.gz", -#) -# -####################### -## rules_esbuild setup # -####################### -# -## Fetches the rules_esbuild dependencies. -## If you want to have a different version of some dependency, -## you should fetch it *before* calling this. -## Alternatively, you can skip calling this function, so long as you've -## already fetched all the dependencies. 
-#load("@aspect_rules_esbuild//esbuild:dependencies.bzl", "rules_esbuild_dependencies") -# -#rules_esbuild_dependencies() -# -#rules_js_register_toolchains(node_version = DEFAULT_NODE_VERSION) -# -## Register a toolchain containing esbuild npm package and native bindings -#load("@aspect_rules_esbuild//esbuild:repositories.bzl", "LATEST_ESBUILD_VERSION", "esbuild_register_toolchains") -# -#esbuild_register_toolchains( -# name = "esbuild", -# esbuild_version = LATEST_ESBUILD_VERSION, -#) -# +# Python +bazel_dep(name = "rules_python", version = "1.5.1") + +python = use_extension("@rules_python//python/extensions:python.bzl", "python") +python.toolchain( + python_version = "3.13", +) + +pip = use_extension("@rules_python//python/extensions:pip.bzl", "pip") +pip.parse( + hub_name = "pypi", + python_version = "3.13", + requirements_lock = "//:requirements_lock.txt", +) +use_repo(pip, "pypi") diff --git a/MODULE.bazel.lock b/MODULE.bazel.lock index 0ac7f89..733ba4d 100644 --- a/MODULE.bazel.lock +++ b/MODULE.bazel.lock @@ -72,6 +72,7 @@ "https://bcr.bazel.build/modules/protobuf/27.0/MODULE.bazel": "7873b60be88844a0a1d8f80b9d5d20cfbd8495a689b8763e76c6372998d3f64c", "https://bcr.bazel.build/modules/protobuf/27.1/MODULE.bazel": "703a7b614728bb06647f965264967a8ef1c39e09e8f167b3ca0bb1fd80449c0d", "https://bcr.bazel.build/modules/protobuf/29.0-rc2/MODULE.bazel": "6241d35983510143049943fc0d57937937122baf1b287862f9dc8590fc4c37df", + "https://bcr.bazel.build/modules/protobuf/29.0-rc3/MODULE.bazel": "33c2dfa286578573afc55a7acaea3cada4122b9631007c594bf0729f41c8de92", "https://bcr.bazel.build/modules/protobuf/29.0/MODULE.bazel": "319dc8bf4c679ff87e71b1ccfb5a6e90a6dbc4693501d471f48662ac46d04e4e", "https://bcr.bazel.build/modules/protobuf/29.0/source.json": "b857f93c796750eef95f0d61ee378f3420d00ee1dd38627b27193aa482f4f981", "https://bcr.bazel.build/modules/protobuf/3.19.0/MODULE.bazel": "6b5fbb433f760a99a22b18b6850ed5784ef0e9928a72668b66e4d7ccd47db9b0", @@ -109,6 +110,8 @@ 
"https://bcr.bazel.build/modules/rules_java/7.6.1/MODULE.bazel": "2f14b7e8a1aa2f67ae92bc69d1ec0fa8d9f827c4e17ff5e5f02e91caa3b2d0fe", "https://bcr.bazel.build/modules/rules_java/8.12.0/MODULE.bazel": "8e6590b961f2defdfc2811c089c75716cb2f06c8a4edeb9a8d85eaa64ee2a761", "https://bcr.bazel.build/modules/rules_java/8.12.0/source.json": "cbd5d55d9d38d4008a7d00bee5b5a5a4b6031fcd4a56515c9accbcd42c7be2ba", + "https://bcr.bazel.build/modules/rules_java/8.3.2/MODULE.bazel": "7336d5511ad5af0b8615fdc7477535a2e4e723a357b6713af439fe8cf0195017", + "https://bcr.bazel.build/modules/rules_java/8.5.1/MODULE.bazel": "d8a9e38cc5228881f7055a6079f6f7821a073df3744d441978e7a43e20226939", "https://bcr.bazel.build/modules/rules_jvm_external/4.4.2/MODULE.bazel": "a56b85e418c83eb1839819f0b515c431010160383306d13ec21959ac412d2fe7", "https://bcr.bazel.build/modules/rules_jvm_external/5.1/MODULE.bazel": "33f6f999e03183f7d088c9be518a63467dfd0be94a11d0055fe2d210f89aa909", "https://bcr.bazel.build/modules/rules_jvm_external/5.2/MODULE.bazel": "d9351ba35217ad0de03816ef3ed63f89d411349353077348a45348b096615036", @@ -143,7 +146,8 @@ "https://bcr.bazel.build/modules/rules_python/0.31.0/MODULE.bazel": "93a43dc47ee570e6ec9f5779b2e64c1476a6ce921c48cc9a1678a91dd5f8fd58", "https://bcr.bazel.build/modules/rules_python/0.4.0/MODULE.bazel": "9208ee05fd48bf09ac60ed269791cf17fb343db56c8226a720fbb1cdf467166c", "https://bcr.bazel.build/modules/rules_python/0.40.0/MODULE.bazel": "9d1a3cd88ed7d8e39583d9ffe56ae8a244f67783ae89b60caafc9f5cf318ada7", - "https://bcr.bazel.build/modules/rules_python/0.40.0/source.json": "939d4bd2e3110f27bfb360292986bb79fd8dcefb874358ccd6cdaa7bda029320", + "https://bcr.bazel.build/modules/rules_python/1.5.1/MODULE.bazel": "acfe65880942d44a69129d4c5c3122d57baaf3edf58ae5a6bd4edea114906bf5", + "https://bcr.bazel.build/modules/rules_python/1.5.1/source.json": "aa903e1bcbdfa1580f2b8e2d55100b7c18bc92d779ebb507fec896c75635f7bd", "https://bcr.bazel.build/modules/rules_rust/0.61.0/MODULE.bazel": 
"0318a95777b9114c8740f34b60d6d68f9cfef61e2f4b52424ca626213d33787b", "https://bcr.bazel.build/modules/rules_rust/0.61.0/source.json": "d1bc743b5fa2e2abb35c436df7126a53dab0c3f35890ae6841592b2253786a63", "https://bcr.bazel.build/modules/rules_shell/0.2.0/MODULE.bazel": "fda8a652ab3c7d8fee214de05e7a9916d8b28082234e8d2c0094505c5268ed3c", @@ -157,7 +161,8 @@ "https://bcr.bazel.build/modules/stardoc/0.6.2/MODULE.bazel": "7060193196395f5dd668eda046ccbeacebfd98efc77fed418dbe2b82ffaa39fd", "https://bcr.bazel.build/modules/stardoc/0.7.0/MODULE.bazel": "05e3d6d30c099b6770e97da986c53bd31844d7f13d41412480ea265ac9e8079c", "https://bcr.bazel.build/modules/stardoc/0.7.1/MODULE.bazel": "3548faea4ee5dda5580f9af150e79d0f6aea934fc60c1cc50f4efdd9420759e7", - "https://bcr.bazel.build/modules/stardoc/0.7.1/source.json": "b6500ffcd7b48cd72c29bb67bcac781e12701cc0d6d55d266a652583cfcdab01", + "https://bcr.bazel.build/modules/stardoc/0.7.2/MODULE.bazel": "fc152419aa2ea0f51c29583fab1e8c99ddefd5b3778421845606ee628629e0e5", + "https://bcr.bazel.build/modules/stardoc/0.7.2/source.json": "58b029e5e901d6802967754adf0a9056747e8176f017cfe3607c0851f4d42216", "https://bcr.bazel.build/modules/upb/0.0.0-20220923-a547704/MODULE.bazel": "7298990c00040a0e2f121f6c32544bab27d4452f80d9ce51349b1a28f3005c43", "https://bcr.bazel.build/modules/zlib/1.2.11/MODULE.bazel": "07b389abc85fdbca459b69e2ec656ae5622873af3f845e1c9d80fe179f3effa0", "https://bcr.bazel.build/modules/zlib/1.3.1.bcr.5/MODULE.bazel": "eec517b5bbe5492629466e11dae908d043364302283de25581e3eb944326c4ca", @@ -716,6 +721,47 @@ ] } }, + "@@rules_python+//python/uv:uv.bzl%uv": { + "general": { + "bzlTransitiveDigest": "bGHlxez0Lkvq2VwrlfCLraKHiJIRHSIJb432X2+pky8=", + "usagesDigest": "WYhzIw9khRBy34H1GxV5+fI1yi07O90NmCXosPUdHWQ=", + "recordedFileInputs": {}, + "recordedDirentsInputs": {}, + "envVariables": {}, + "generatedRepoSpecs": { + "uv": { + "repoRuleId": "@@rules_python+//python/uv/private:uv_toolchains_repo.bzl%uv_toolchains_repo", + "attributes": { 
+ "toolchain_type": "'@@rules_python+//python/uv:uv_toolchain_type'", + "toolchain_names": [ + "none" + ], + "toolchain_implementations": { + "none": "'@@rules_python+//python:none'" + }, + "toolchain_compatible_with": { + "none": [ + "@platforms//:incompatible" + ] + }, + "toolchain_target_settings": {} + } + } + }, + "recordedRepoMappingEntries": [ + [ + "rules_python+", + "bazel_tools", + "bazel_tools" + ], + [ + "rules_python+", + "platforms", + "platforms" + ] + ] + } + }, "@@rules_rust+//crate_universe:extensions.bzl%crate": { "general": { "bzlTransitiveDigest": "7WZJd6ddUExnqTm+/VXRBmv6c4MCs9hJn2U/AZqeQvA=", diff --git a/databuild/dsl/python/BUILD.bazel b/databuild/dsl/python/BUILD.bazel new file mode 100644 index 0000000..6d44396 --- /dev/null +++ b/databuild/dsl/python/BUILD.bazel @@ -0,0 +1,7 @@ +py_library( + name = "dsl", + srcs = ["dsl.py"], + visibility = ["//visibility:public"], + deps = [ + ], +) diff --git a/databuild/dsl/python/dsl.py b/databuild/dsl/python/dsl.py new file mode 100644 index 0000000..c564dd9 --- /dev/null +++ b/databuild/dsl/python/dsl.py @@ -0,0 +1,91 @@ + +from typing import Self, Protocol, get_type_hints, get_origin, get_args +from dataclasses import fields, is_dataclass +import re + + +class PartitionPattern: + _raw_pattern: str + + @property + def _pattern(self) -> re.Pattern: + return re.compile(self._raw_pattern) + + def _validate_pattern(self): + """Checks that both conditions are met: + 1. All fields from the PartitionFields type are present in the pattern + 2. All fields from the pattern are present in the PartitionFields type + """ + # TODO how do I get this to be called? 
+ assert is_dataclass(self), "Should be a dataclass also (for partition fields)" + pattern_fields = set(self._pattern.groupindex.keys()) + partition_fields = {field.name for field in fields(self)} + if pattern_fields != partition_fields: + raise ValueError(f"Pattern fields {pattern_fields} do not match partition fields {partition_fields}") + + @classmethod + def deserialize(cls, raw_value: str) -> Self: + """Parses a partition from a string based on the defined pattern.""" + # Create a temporary instance to access the compiled pattern + # We need to compile the pattern to match against it + pattern = re.compile(cls._raw_pattern) + + # Match the raw value against the pattern + match = pattern.match(raw_value) + if not match: + raise ValueError(f"String '{raw_value}' does not match pattern '{cls._pattern}'") + + # Extract the field values from the match + field_values = match.groupdict() + + # Create and return a new instance with the extracted values + return cls(**field_values) + + def serialize(self) -> str: + """Returns a string representation by filling in the pattern template with field values.""" + # Start with the pattern + result = self._raw_pattern + + # Replace each named group in the pattern with its corresponding field value + for field in fields(self): + # Find the named group pattern and replace it with the actual value + # We need to replace the regex pattern with the actual value + # Look for the pattern (?P...) 
and replace with the field value + pattern_to_replace = rf'\(\?P<{field.name}>[^)]+\)' + actual_value = getattr(self, field.name) + result = re.sub(pattern_to_replace, actual_value, result) + + return result + + +class JobConfig: + """TODO need to generate this from databuild.proto""" + + +class PartitionManifest: + """TODO need to generate this from databuild.proto""" + + +class DataBuildJob(Protocol): + # The types of partitions that this job produces + output_types: list[type[PartitionPattern]] + + def config(self, outputs: list[PartitionPattern]) -> list[JobConfig]: ... + + def exec(self, config: JobConfig) -> PartitionManifest: ... + + +class DataBuildGraph: + def __init__(self, label: str): + self.label = label + self.lookup = {} + + def job(self, cls: type[DataBuildJob]) -> None: + """Register a job with the graph.""" + for partition in cls.output_types: + assert partition not in self.lookup, f"Partition `{partition}` already registered" + self.lookup[partition] = cls + + def generate_bazel_module(self): + """Generates a complete databuild application, packaging up referenced jobs and this graph via bazel targets""" + raise NotImplementedError diff --git a/databuild/dsl/python/test/BUILD.bazel b/databuild/dsl/python/test/BUILD.bazel new file mode 100644 index 0000000..238e3e6 --- /dev/null +++ b/databuild/dsl/python/test/BUILD.bazel @@ -0,0 +1,8 @@ +py_test( + name = "dsl_test", + srcs = glob(["*.py"]), + deps = [ + "//databuild/dsl/python:dsl", + "@pypi//pytest", + ], +) diff --git a/databuild/dsl/python/test/dsl_test.py b/databuild/dsl/python/test/dsl_test.py new file mode 100644 index 0000000..ba766eb --- /dev/null +++ b/databuild/dsl/python/test/dsl_test.py @@ -0,0 +1,73 @@ + +from databuild.dsl.python.dsl import PartitionPattern, DataBuildGraph, DataBuildJob, JobConfig, PartitionManifest +from dataclasses import dataclass +import pytest + + +@dataclass +class DateCategory: + data_date: str + category: str + + +class 
CategoryAnalysisPartition(DateCategory, PartitionPattern): + _raw_pattern = r"category_analysis/category=(?P<category>[^/]+)/date=(?P<data_date>\d{4}-\d{2}-\d{2})" + +def test_basic_partition_pattern(): + p1 = CategoryAnalysisPartition(data_date="2025-01-01", category="comedy") + assert p1.serialize() == "category_analysis/category=comedy/date=2025-01-01" + + p2 = CategoryAnalysisPartition.deserialize("category_analysis/category=technology/date=2025-01-02") + assert p2.data_date == "2025-01-02" + assert p2.category == "technology" + + +class NotEnoughFieldsPartition(DateCategory, PartitionPattern): + # Doesn't use the partition fields + _raw_pattern = r"invalid_partition_pattern" + + +class TooManyFieldsPartition(DateCategory, PartitionPattern): + # Doesn't use the partition fields + _raw_pattern = r"category_analysis/category=(?P<category>[^/]+)/date=(?P<data_date>\d{4}-\d{2}-\d{2})/hour=(?P<hour>\d{2})" + + +def test_invalid_partition_pattern(): + with pytest.raises(ValueError): + NotEnoughFieldsPartition(data_date="2025-01-01", category="comedy")._validate_pattern() + with pytest.raises(ValueError): + TooManyFieldsPartition(data_date="2025-01-01", category="comedy")._validate_pattern() + + +def test_basic_graph_definition(): + graph = DataBuildGraph("//:test_graph") + + @graph.job + class TestJob(DataBuildJob): + output_types = [CategoryAnalysisPartition] + def exec(self, config: JobConfig) -> PartitionManifest: ... + def config(self, outputs: list[PartitionPattern]) -> list[JobConfig]: ... + + assert len(graph.lookup) == 1 + assert CategoryAnalysisPartition in graph.lookup + + +def test_graph_collision(): + graph = DataBuildGraph("//:test_graph") + + @graph.job + class TestJob1(DataBuildJob): + output_types = [CategoryAnalysisPartition] + def exec(self, config: JobConfig) -> PartitionManifest: ... + def config(self, outputs: list[PartitionPattern]) -> list[JobConfig]: ... 
+ + with pytest.raises(AssertionError): + @graph.job + class TestJob2(DataBuildJob): + output_types = [CategoryAnalysisPartition] + def exec(self, config: JobConfig) -> PartitionManifest: ... + def config(self, outputs: list[PartitionPattern]) -> list[JobConfig]: ... + + +if __name__ == "__main__": + raise SystemExit(pytest.main([__file__])) diff --git a/plans/todo.md b/plans/todo.md index bbcbe0f..29befb1 100644 --- a/plans/todo.md +++ b/plans/todo.md @@ -7,10 +7,8 @@ - Add build request notes - How do we encode job labels in the path? (Build event job links are not encoding job labels properly) - Resolve double type system with protobuf and openapi -- Prometheus metrics export - Plan for external worker dispatch (e.g. k8s pod per build, or launch in container service) - k8s can use [jobs](https://kubernetes.io/docs/concepts/workloads/controllers/job/) - Should we have meaningful exit codes? E.g. "retry-able error", etc? - Fully joinable build/job IDs - ensure all execution logs / metrics are joinable to build request ID? - Triggers? -- How do we handle task logging? 
diff --git a/requirements.in b/requirements.in new file mode 100644 index 0000000..e079f8a --- /dev/null +++ b/requirements.in @@ -0,0 +1 @@ +pytest diff --git a/requirements_lock.txt b/requirements_lock.txt new file mode 100644 index 0000000..7c48431 --- /dev/null +++ b/requirements_lock.txt @@ -0,0 +1,26 @@ +# +# This file is autogenerated by pip-compile with Python 3.13 +# by the following command: +# +# bazel run //:requirements.update +# +iniconfig==2.1.0 \ + --hash=sha256:3abbd2e30b36733fee78f9c7f7308f2d0050e88f0087fd25c2645f63c773e1c7 \ + --hash=sha256:9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760 + # via pytest +packaging==25.0 \ + --hash=sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484 \ + --hash=sha256:d443872c98d677bf60f6a1f2f8c1cb748e8fe762d2bf9d3148b5599295b0fc4f + # via pytest +pluggy==1.6.0 \ + --hash=sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3 \ + --hash=sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746 + # via pytest +pygments==2.19.2 \ + --hash=sha256:636cb2477cec7f8952536970bc533bc43743542f70392ae026374600add5b887 \ + --hash=sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b + # via pytest +pytest==8.4.1 \ + --hash=sha256:539c70ba6fcead8e78eebbf1115e8b589e7565830d7d006a8723f19ac8a0afb7 \ + --hash=sha256:7c67fd69174877359ed9371ec3af8a3d2b04741818c51e5e99cc1742251fa93c + # via -r requirements.in