From cf449529a308685f2f2b2fd7bee2e27f2bd53fa5 Mon Sep 17 00:00:00 2001 From: Stuart Axelbrooke Date: Wed, 3 Sep 2025 21:32:17 -0700 Subject: [PATCH] lets go --- AGENTS.md | 71 + CLAUDE.md | 106 +- DESIGN.md | 38 +- README.md | 6 +- databuild/BUILD.bazel | 52 +- databuild/README.md | 26 - databuild/build_event_log.rs | 154 ++ databuild/cli/BUILD.bazel | 27 - databuild/cli/error.rs | 31 - databuild/cli/main.rs | 999 ---------- databuild/client/BUILD.bazel | 194 -- databuild/client/tsconfig.json | 21 - .../client/typescript_generator_config.json | 14 - databuild/dashboard/BUILD.bazel | 111 -- databuild/dashboard/README.md | 4 - databuild/dashboard/TYPE_SAFETY.md | 127 -- databuild/dashboard/index.css | 78 - databuild/dashboard/index.html | 21 - databuild/dashboard/index.test.ts | 15 - databuild/dashboard/index.ts | 76 - databuild/dashboard/layout.ts | 52 - databuild/dashboard/package.json | 16 - databuild/dashboard/pages.ts | 1439 -------------- databuild/dashboard/pnpm-lock.yaml | 111 -- databuild/dashboard/pnpm-workspace.yaml | 2 - databuild/dashboard/services.ts | 490 ----- .../test-data/strict-config-failures.ts | 44 - databuild/dashboard/test-strict-config.sh | 69 - databuild/dashboard/transformation-tests.ts | 318 --- databuild/dashboard/tsconfig_app.json | 21 - databuild/dashboard/tsconfig_test.json | 22 - databuild/dashboard/types.ts | 285 --- databuild/dashboard/utils.test.ts | 52 - databuild/dashboard/utils.ts | 108 - databuild/databuild.proto | 1159 +---------- databuild/dsl/python/BUILD.bazel | 29 - databuild/dsl/python/dsl.py | 431 ---- databuild/dsl/python/dsl_job_wrapper.py | 118 -- databuild/dsl/python/generator.py | 29 - databuild/dsl/python/generator_lib.py | 38 - databuild/dsl/python/test/BUILD.bazel | 8 - databuild/dsl/python/test/dsl_test.py | 75 - databuild/event_log/mock.rs | 665 ------- databuild/event_log/mod.rs | 113 -- databuild/event_log/query_engine.rs | 389 ---- databuild/event_log/sqlite_storage.rs | 154 -- databuild/event_log/storage.rs | 75 - databuild/event_log/writer.rs | 460 ----- databuild/format_consistency_test.rs | 143 -- databuild/graph/BUILD.bazel | 43 - databuild/graph/README.md | 10 - databuild/graph/analyze.rs | 652 ------ databuild/graph/execute.rs | 817 -------- databuild/graph/rust_analyze_wrapper.sh.tpl | 13 - databuild/graph/rust_execute_wrapper.sh.tpl | 11 - databuild/graph/test/BUILD.bazel | 5 - databuild/graph/test/analyze_test.sh | 3 - databuild/job/BUILD.bazel | 27 - databuild/job/README.md | 4 - databuild/job/main.rs | 985 --------- databuild/lib.rs | 41 +- databuild/log_access.rs | 440 ----- databuild/log_collector.rs | 402 ---- databuild/mermaid_utils.rs | 915 --------- databuild/metric_templates.rs | 523 ----- databuild/metrics_aggregator.rs | 507 ----- databuild/orchestration/error.rs | 15 - databuild/orchestration/events.rs | 156 -- databuild/orchestration/mod.rs | 261 --- databuild/repositories/builds/mod.rs | 408 ---- databuild/repositories/jobs/mod.rs | 499 ----- databuild/repositories/mod.rs | 17 - databuild/repositories/partitions/mod.rs | 373 ---- databuild/repositories/tasks/mod.rs | 519 ----- databuild/runtime/BUILD.bazel | 77 - .../runtime/simple_executable_wrapper.sh.tpl | 40 - databuild/service/handlers.rs | 1754 ----------------- databuild/service/main.rs | 140 -- databuild/service/mod.rs | 479 ----- databuild/service/openapi_spec_generator.rs | 35 - databuild/status_utils.rs | 291 --- databuild/test/BUILD.bazel | 61 - databuild/test/app/BUILD.bazel | 15 - databuild/test/app/README.md | 34 - databuild/test/app/bazel/BUILD.bazel | 157 -- databuild/test/app/bazel/README.md | 4 - databuild/test/app/bazel/graph/graph_test.py | 91 - databuild/test/app/bazel/graph/lookup.py | 29 - databuild/test/app/bazel/graph/test.py | 0 .../jobs/aggregate_color_votes/README.md | 1 - .../jobs/aggregate_color_votes/config.py | 42 - .../bazel/jobs/aggregate_color_votes/main.py | 20 - .../bazel/jobs/aggregate_color_votes/test.py | 59 - .../jobs/color_vote_report_calc/README.md | 1 - .../jobs/color_vote_report_calc/config.py | 48 - .../bazel/jobs/color_vote_report_calc/main.py | 20 - .../bazel/jobs/color_vote_report_calc/test.py | 60 - .../bazel/jobs/ingest_color_votes/README.md | 1 - .../bazel/jobs/ingest_color_votes/config.py | 13 - .../app/bazel/jobs/ingest_color_votes/main.py | 20 - .../app/bazel/jobs/ingest_color_votes/test.py | 32 - .../bazel/jobs/trailing_color_votes/README.md | 1 - .../bazel/jobs/trailing_color_votes/config.py | 46 - .../bazel/jobs/trailing_color_votes/main.py | 20 - .../bazel/jobs/trailing_color_votes/test.py | 53 - databuild/test/app/bazel/test_e2e.py | 37 - databuild/test/app/colors.py | 2 - databuild/test/app/dal.py | 30 - databuild/test/app/dsl/BUILD.bazel | 54 - .../test/app/dsl/claude-generated-dsl-test.md | 9 - databuild/test/app/dsl/dsl_job_lookup.py | 47 - databuild/test/app/dsl/dsl_job_wrapper.py | 118 -- databuild/test/app/dsl/generated/BUILD.bazel | 71 - .../dsl/generated/aggregate_color_votes.py | 58 - .../dsl/generated/color_vote_report_calc.py | 58 - .../test/app/dsl/generated/dsl_job_lookup.py | 53 - .../app/dsl/generated/ingest_color_votes.py | 58 - .../app/dsl/generated/trailing_color_votes.py | 58 - .../test/app/dsl/generated_test/BUILD.bazel | 7 - .../test/app/dsl/generated_test/test_e2e.py | 37 - databuild/test/app/dsl/graph.py | 131 -- databuild/test/app/dsl/partitions.py | 40 - databuild/test/app/dsl/test/BUILD.bazel | 87 - .../dsl/test/test_aggregate_color_votes.py | 159 -- .../app/dsl/test/test_bazel_dsl_comparison.py | 244 --- .../dsl/test/test_color_vote_report_calc.py | 204 -- .../test/test_dsl_generation_consistency.py | 105 - .../test/app/dsl/test/test_graph_analysis.py | 157 -- .../app/dsl/test/test_ingest_color_votes.py | 56 - .../app/dsl/test/test_trailing_color_votes.py | 135 -- databuild/test/app/e2e_test_common.py | 103 - .../app/jobs/aggregate_color_votes/README.md | 10 - .../app/jobs/aggregate_color_votes/execute.py | 26 - .../app/jobs/color_vote_report_calc/README.md | 18 - .../jobs/color_vote_report_calc/execute.py | 51 - .../app/jobs/ingest_color_votes/README.md | 9 - .../app/jobs/ingest_color_votes/execute.py | 10 - .../test/app/jobs/ingest_color_votes/test.py | 17 - .../app/jobs/trailing_color_votes/README.md | 11 - .../app/jobs/trailing_color_votes/execute.py | 28 - databuild/test/databuild_test.rs | 79 - databuild/test/py_proto_test.py | 10 - databuild/test/simple.proto | 19 - databuild/test/simple_test.rs | 100 - design/build-event-log.md | 29 +- design/core-build.md | 165 +- design/executor.md | 55 + design/glossary.md | 12 +- design/graph-specification.md | 201 +- design/questions.md | 5 - design/service.md | 97 +- design/wants.md | 298 +-- design/why-databuild.md | 2 +- docs/partition-delegation.md | 256 --- plans/01-build-event-log.md | 338 ---- plans/02-build-graph-service.md | 182 -- plans/03-service-interface-refactor.md | 134 -- plans/04-end-to-end-tests-1.md | 195 -- plans/05-roadmap.md | 96 - plans/06-build-graph-dashboard.md | 215 -- plans/07-cli-service-build-unification.md | 336 ---- plans/08-integration-test-v2.md | 148 -- plans/09-partition-leasing.md | 4 - plans/10-shared-core.md | 74 - plans/11-web-app-compile-time-correctness.md | 510 ----- plans/12-dsl.md | 237 --- plans/13-job-wrapper.md | 346 ---- plans/14-graph-side-log-consumption.md | 384 ---- plans/15-dsl-graph-generation.md | 466 ----- plans/16-bel-delta-backend.md | 407 ---- plans/17-python-dsl-generator-fix.md | 164 -- plans/18-bel-refactor.md | 304 --- plans/19-client-server-cli.md | 182 -- plans/20-wants-initial.md | 163 -- plans/ideas.md | 1 - plans/todo.md | 15 - plans/webapp_v1/chunk-1-client-generation.md | 160 -- plans/webapp_v1/chunk-2-hello-world-app.md | 110 -- plans/webapp_v1/chunk-3-routing-framework.md | 130 -- plans/webapp_v1/chunk-4-recent-activity.md | 148 -- plans/webapp_v1/chunk-5-build-status.md | 237 --- plans/webapp_v1/chunk-6-partition-pages.md | 224 --- plans/webapp_v1/chunk-7-jobs-pages.md | 230 --- plans/webapp_v1/chunk-8-graph-analysis.md | 209 -- plans/webapp_v1/chunk-9-polish.md | 128 -- scripts/prepare_dev | 5 + 186 files changed, 529 insertions(+), 29955 deletions(-) create mode 100644 AGENTS.md mode change 100644 => 120000 CLAUDE.md delete mode 100644 databuild/README.md create mode 100644 databuild/build_event_log.rs delete mode 100644 databuild/cli/BUILD.bazel delete mode 100644 databuild/cli/error.rs delete mode 100644 databuild/cli/main.rs delete mode 100644 databuild/client/BUILD.bazel delete mode 100644 databuild/client/tsconfig.json delete mode 100644 databuild/client/typescript_generator_config.json delete mode 100644 databuild/dashboard/BUILD.bazel delete mode 100644 databuild/dashboard/README.md delete mode 100644 databuild/dashboard/TYPE_SAFETY.md delete mode 100644 databuild/dashboard/index.css delete mode 100644 databuild/dashboard/index.html delete mode 100644 databuild/dashboard/index.test.ts delete mode 100644 databuild/dashboard/index.ts delete mode 100644 databuild/dashboard/layout.ts delete mode 100644 databuild/dashboard/package.json delete mode 100644 databuild/dashboard/pages.ts delete mode 100644 databuild/dashboard/pnpm-lock.yaml delete mode 100644 databuild/dashboard/pnpm-workspace.yaml delete mode 100644 databuild/dashboard/services.ts delete mode 100644 databuild/dashboard/test-data/strict-config-failures.ts delete mode 100755 databuild/dashboard/test-strict-config.sh delete mode 100644 databuild/dashboard/transformation-tests.ts delete mode 100644 databuild/dashboard/tsconfig_app.json delete mode 100644 databuild/dashboard/tsconfig_test.json delete mode 100644 databuild/dashboard/types.ts delete mode 100644 databuild/dashboard/utils.test.ts delete mode 100644 databuild/dashboard/utils.ts delete mode 100644 databuild/dsl/python/BUILD.bazel delete mode 100644 databuild/dsl/python/dsl.py delete mode 100644 databuild/dsl/python/dsl_job_wrapper.py delete mode 100644 databuild/dsl/python/generator.py delete mode 100644 databuild/dsl/python/generator_lib.py delete mode 100644 databuild/dsl/python/test/BUILD.bazel delete mode 100644 databuild/dsl/python/test/dsl_test.py delete mode 100644 databuild/event_log/mock.rs delete mode 100644 databuild/event_log/mod.rs delete mode 100644 databuild/event_log/query_engine.rs delete mode 100644 databuild/event_log/sqlite_storage.rs delete mode 100644 databuild/event_log/storage.rs delete mode 100644 databuild/event_log/writer.rs delete mode 100644 databuild/format_consistency_test.rs delete mode 100644 databuild/graph/BUILD.bazel delete mode 100644 databuild/graph/README.md delete mode 100644 databuild/graph/analyze.rs delete mode 100644 databuild/graph/execute.rs delete mode 100644 databuild/graph/rust_analyze_wrapper.sh.tpl delete mode 100644 databuild/graph/rust_execute_wrapper.sh.tpl delete mode 100644 databuild/graph/test/BUILD.bazel delete mode 100755 databuild/graph/test/analyze_test.sh delete mode 100644 databuild/job/BUILD.bazel delete mode 100644 databuild/job/README.md delete mode 100644 databuild/job/main.rs delete mode 100644 databuild/log_access.rs delete mode 100644 databuild/log_collector.rs delete mode 100644 databuild/mermaid_utils.rs delete mode 100644 databuild/metric_templates.rs delete mode 100644 databuild/metrics_aggregator.rs delete mode 100644 databuild/orchestration/error.rs delete mode 100644 databuild/orchestration/events.rs delete mode 100644 databuild/orchestration/mod.rs delete mode 100644 databuild/repositories/builds/mod.rs delete mode 100644 databuild/repositories/jobs/mod.rs delete mode 100644 databuild/repositories/mod.rs delete mode 100644 databuild/repositories/partitions/mod.rs delete mode 100644 databuild/repositories/tasks/mod.rs delete mode 100644 databuild/runtime/BUILD.bazel delete mode 100755 databuild/runtime/simple_executable_wrapper.sh.tpl delete mode 100644 databuild/service/handlers.rs delete mode 100644 databuild/service/main.rs delete mode 100644 databuild/service/mod.rs delete mode 100644 databuild/service/openapi_spec_generator.rs delete mode 100644 databuild/status_utils.rs delete mode 100644 databuild/test/BUILD.bazel delete mode 100644 databuild/test/app/BUILD.bazel delete mode 100644 databuild/test/app/README.md delete mode 100644 databuild/test/app/bazel/BUILD.bazel delete mode 100644 databuild/test/app/bazel/README.md delete mode 100644 databuild/test/app/bazel/graph/graph_test.py delete mode 100644 databuild/test/app/bazel/graph/lookup.py delete mode 100644 databuild/test/app/bazel/graph/test.py delete mode 120000 databuild/test/app/bazel/jobs/aggregate_color_votes/README.md delete mode 100644 databuild/test/app/bazel/jobs/aggregate_color_votes/config.py delete mode 100644 databuild/test/app/bazel/jobs/aggregate_color_votes/main.py delete mode 100644 databuild/test/app/bazel/jobs/aggregate_color_votes/test.py delete mode 120000 databuild/test/app/bazel/jobs/color_vote_report_calc/README.md delete mode 100644 databuild/test/app/bazel/jobs/color_vote_report_calc/config.py delete mode 100644 databuild/test/app/bazel/jobs/color_vote_report_calc/main.py delete mode 100644 databuild/test/app/bazel/jobs/color_vote_report_calc/test.py delete mode 120000 databuild/test/app/bazel/jobs/ingest_color_votes/README.md delete mode 100644 databuild/test/app/bazel/jobs/ingest_color_votes/config.py delete mode 100644 databuild/test/app/bazel/jobs/ingest_color_votes/main.py delete mode 100644 databuild/test/app/bazel/jobs/ingest_color_votes/test.py delete mode 120000 databuild/test/app/bazel/jobs/trailing_color_votes/README.md delete mode 100644 databuild/test/app/bazel/jobs/trailing_color_votes/config.py delete mode 100644 databuild/test/app/bazel/jobs/trailing_color_votes/main.py delete mode 100644 databuild/test/app/bazel/jobs/trailing_color_votes/test.py delete mode 100644 databuild/test/app/bazel/test_e2e.py delete mode 100644 databuild/test/app/colors.py delete mode 100644 databuild/test/app/dal.py delete mode 100644 databuild/test/app/dsl/BUILD.bazel delete mode 100644 databuild/test/app/dsl/claude-generated-dsl-test.md delete mode 100755 databuild/test/app/dsl/dsl_job_lookup.py delete mode 100644 databuild/test/app/dsl/dsl_job_wrapper.py delete mode 100644 databuild/test/app/dsl/generated/BUILD.bazel delete mode 100755 databuild/test/app/dsl/generated/aggregate_color_votes.py delete mode 100755 databuild/test/app/dsl/generated/color_vote_report_calc.py delete mode 100755 databuild/test/app/dsl/generated/dsl_job_lookup.py delete mode 100755 databuild/test/app/dsl/generated/ingest_color_votes.py delete mode 100755 databuild/test/app/dsl/generated/trailing_color_votes.py delete mode 100644 databuild/test/app/dsl/generated_test/BUILD.bazel delete mode 100644 databuild/test/app/dsl/generated_test/test_e2e.py delete mode 100644 databuild/test/app/dsl/graph.py delete mode 100644 databuild/test/app/dsl/partitions.py delete mode 100644 databuild/test/app/dsl/test/BUILD.bazel delete mode 100644 databuild/test/app/dsl/test/test_aggregate_color_votes.py delete mode 100644 databuild/test/app/dsl/test/test_bazel_dsl_comparison.py delete mode 100644 databuild/test/app/dsl/test/test_color_vote_report_calc.py delete mode 100644 databuild/test/app/dsl/test/test_dsl_generation_consistency.py delete mode 100644 databuild/test/app/dsl/test/test_graph_analysis.py delete mode 100644 databuild/test/app/dsl/test/test_ingest_color_votes.py delete mode 100644 databuild/test/app/dsl/test/test_trailing_color_votes.py delete mode 100644 databuild/test/app/e2e_test_common.py delete mode 100644 databuild/test/app/jobs/aggregate_color_votes/README.md delete mode 100644 databuild/test/app/jobs/aggregate_color_votes/execute.py delete mode 100644 databuild/test/app/jobs/color_vote_report_calc/README.md delete mode 100644 databuild/test/app/jobs/color_vote_report_calc/execute.py delete mode 100644 databuild/test/app/jobs/ingest_color_votes/README.md delete mode 100644 databuild/test/app/jobs/ingest_color_votes/execute.py delete mode 100644 databuild/test/app/jobs/ingest_color_votes/test.py delete mode 100644 databuild/test/app/jobs/trailing_color_votes/README.md delete mode 100644 databuild/test/app/jobs/trailing_color_votes/execute.py delete mode 100644 databuild/test/databuild_test.rs delete mode 100644 databuild/test/py_proto_test.py delete mode 100644 databuild/test/simple.proto delete mode 100644 databuild/test/simple_test.rs create mode 100644 design/executor.md delete mode 100644 design/questions.md delete mode 100644 docs/partition-delegation.md delete mode 100644 plans/01-build-event-log.md delete mode 100644 plans/02-build-graph-service.md delete mode 100644 plans/03-service-interface-refactor.md delete mode 100644 plans/04-end-to-end-tests-1.md delete mode 100644 plans/05-roadmap.md delete mode 100644 plans/06-build-graph-dashboard.md delete mode 100644 plans/07-cli-service-build-unification.md delete mode 100644 plans/08-integration-test-v2.md delete mode 100644 plans/09-partition-leasing.md delete mode 100644 plans/10-shared-core.md delete mode 100644 plans/11-web-app-compile-time-correctness.md delete mode 100644 plans/12-dsl.md delete mode 100644 plans/13-job-wrapper.md delete mode 100644 plans/14-graph-side-log-consumption.md delete mode 100644 plans/15-dsl-graph-generation.md delete mode 100644 plans/16-bel-delta-backend.md delete mode 100644 plans/17-python-dsl-generator-fix.md delete mode 100644 plans/18-bel-refactor.md delete mode 100644 plans/19-client-server-cli.md delete mode 100644 plans/20-wants-initial.md delete mode 100644 plans/ideas.md delete mode 100644 plans/todo.md delete mode 100644 plans/webapp_v1/chunk-1-client-generation.md delete mode 100644 plans/webapp_v1/chunk-2-hello-world-app.md delete mode 100644 plans/webapp_v1/chunk-3-routing-framework.md delete mode 100644 plans/webapp_v1/chunk-4-recent-activity.md delete mode 100644 plans/webapp_v1/chunk-5-build-status.md delete mode 100644 plans/webapp_v1/chunk-6-partition-pages.md delete mode 100644 plans/webapp_v1/chunk-7-jobs-pages.md delete mode 100644 plans/webapp_v1/chunk-8-graph-analysis.md delete mode 100644 plans/webapp_v1/chunk-9-polish.md create mode 100755 scripts/prepare_dev diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..805f208 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,71 @@ +# Agent Instructions + +## Project Overview +DataBuild is a bazel-based data build system. Key files: +- [`DESIGN.md`](./DESIGN.md) - Overall design of databuild +- [`databuild.proto`](databuild/databuild.proto) - System interfaces +- Component designs - design docs for specific aspects or components of databuild: + - [Core build](./design/core-build.md) - How the core semantics of databuild works and are implemented + - [Build event log](./design/build-event-log.md) - How the build event log works and is accessed + - [Service](./design/service.md) - How the databuild HTTP service and web app are designed. + - [Glossary](./design/glossary.md) - Centralized description of key terms. + - [Graph specification](./design/graph-specification.md) - Describes the different libraries that enable more succinct declaration of databuild applications than the core bazel-based interface. + - [Deploy strategies](./design/deploy-strategies.md) - Different strategies for deploying databuild applications. + - [Wants](./design/wants.md) - How triggering works in databuild applications. + - [Why databuild?](./design/why-databuild.md) - Why to choose databuild instead of other better established orchestration solutions. + +Please reference these for any related work, as they indicate key technical bias/direction of the project. + +## Tenets + +- Declarative over imperative wherever possible/reasonable. +- We are building for the future, and choose to do "the right thing" rather than taking shortcuts to get unstuck. If you get stuck, pause and ask for help/input. +- Do not add "unknown" results when parses or matches fail - these should always throw. +- Compile time correctness is a super-power, and investment in it speeds up flywheel for development and user value. +- **CLI/Service Interchangeability**: Both the CLI and service must produce identical artifacts (BEL events, logs, metrics, outputs) in the same locations. Users should be able to build with one interface and query/inspect results from the other seamlessly. This principle applies to all DataBuild operations, not just builds. + +## Build & Test +```bash +# Build all databuild components +bazel build //... + +# Run databuild unit tests +bazel test //... + +# Run end-to-end tests (validates CLI vs Service consistency) +./run_e2e_tests.sh + +# Do not try to `bazel test //examples/basic_graph/...`, as this will not work. +``` + +## Project Structure +- `databuild/` - Core system (Rust/Proto) +- `examples/` - Example implementations +- `scripts/` - Build utilities + +## DataBuild Job Architecture + +### Job Target Structure +Each DataBuild job creates three Bazel targets: +- `job_name.exec` - Execution target (calls binary with "exec" subcommand) +- `job_name` - Main job target (pipes config output to exec input) + +### Graph Configuration +```python +databuild_graph( + name = "my_graph", + jobs = [":job1", ":job2"], # Reference base job targets + lookup = ":job_lookup", # Binary that routes partition refs to jobs +) +``` + +### Job Lookup Pattern +```python +def lookup_job_for_partition(partition_ref: str) -> str: + if pattern.match(partition_ref): + return "//:job_name" # Return base job target + raise ValueError(f"No job found for: {partition_ref}") +``` + +## Notes / Tips +- Rust dependencies are implemented via rules_rust, so new dependencies should be added in the `MODULE.bazel` file. diff --git a/CLAUDE.md b/CLAUDE.md deleted file mode 100644 index 29d382c..0000000 --- a/CLAUDE.md +++ /dev/null @@ -1,105 +0,0 @@ -# Agent Instructions - -## Project Overview -DataBuild is a bazel-based data build system. Key files: -- [`DESIGN.md`](./DESIGN.md) - Overall design of databuild -- [`databuild.proto`](databuild/databuild.proto) - System interfaces -- Component designs - design docs for specific aspects or components of databuild: - - [Core build](./design/core-build.md) - How the core semantics of databuild works and are implemented - - [Build event log](./design/build-event-log.md) - How the build event log works and is accessed - - [Service](./design/service.md) - How the databuild HTTP service and web app are designed. - - [Glossary](./design/glossary.md) - Centralized description of key terms. - - [Graph specification](./design/graph-specification.md) - Describes the different libraries that enable more succinct declaration of databuild applications than the core bazel-based interface. - - [Observability](./design/observability.md) - How observability is systematically achieved throughout databuild applications. - - [Deploy strategies](./design/deploy-strategies.md) - Different strategies for deploying databuild applications. - - [Wants](./design/wants.md) - How triggering works in databuild applications. - - [Why databuild?](./design/why-databuild.md) - Why to choose databuild instead of other better established orchestration solutions. - -Please reference these for any related work, as they indicate key technical bias/direction of the project. - -## Tenets - -- Declarative over imperative wherever possible/reasonable. -- We are building for the future, and choose to do "the right thing" rather than taking shortcuts to get unstuck. If you get stuck, pause and ask for help/input. -- Do not add "unknown" results when parses or matches fail - these should always throw. -- Compile time correctness is a super-power, and investment in it speeds up flywheel for development and user value. -- **CLI/Service Interchangeability**: Both the CLI and service must produce identical artifacts (BEL events, logs, metrics, outputs) in the same locations. Users should be able to build with one interface and query/inspect results from the other seamlessly. This principle applies to all DataBuild operations, not just builds. - -## Build & Test -```bash -# Build all databuild components -bazel build //... - -# Run databuild unit tests -bazel test //... - -# Run end-to-end tests (validates CLI vs Service consistency) -./run_e2e_tests.sh - -# Do not try to `bazel test //examples/basic_graph/...`, as this will not work. -``` - -## Project Structure -- `databuild/` - Core system (Rust/Proto) -- `examples/` - Example implementations -- `scripts/` - Build utilities - -## Key Components -- Graph analysis/execution in Rust -- Bazel rules for job orchestration -- Java/Python examples for different use cases - -## DataBuild Job Architecture - -### Job Target Structure -Each DataBuild job creates three Bazel targets: -- `job_name.cfg` - Configuration target (calls binary with "config" subcommand) -- `job_name.exec` - Execution target (calls binary with "exec" subcommand) -- `job_name` - Main job target (pipes config output to exec input) - -### Unified Job Binary Pattern -Jobs use a single binary with subcommands: -```python -def main(): - command = sys.argv[1] # "config" or "exec" - if command == "config": - handle_config(sys.argv[2:]) # Output job configuration JSON - elif command == "exec": - handle_exec(sys.argv[2:]) # Perform actual work -``` - -### DataBuild Execution Flow -1. **Planning Phase**: DataBuild calls `.cfg` targets to get job configurations -2. **Execution Phase**: DataBuild calls main job targets which pipe config to exec -3. **Job Resolution**: Job lookup returns base job names (e.g., `//:job_name`), not `.cfg` variants - -### Graph Configuration -```python -databuild_graph( - name = "my_graph", - jobs = [":job1", ":job2"], # Reference base job targets - lookup = ":job_lookup", # Binary that routes partition refs to jobs -) -``` - -### Job Lookup Pattern -```python -def lookup_job_for_partition(partition_ref: str) -> str: - if pattern.match(partition_ref): - return "//:job_name" # Return base job target - raise ValueError(f"No job found for: {partition_ref}") -``` - -### Common Pitfalls -- **Not using protobuf-defined interface**: Where structs and interfaces are defined centrally in [`databuild.proto`](./databuild/databuild.proto), those interfaces should always be used. E.g., in rust depending on them via the prost-generated structs, and in the web app via the OpenAPI-generated typescript interfaces. -- **Empty args**: Jobs with `"args": []` won't execute properly -- **Wrong target refs**: Job lookup must return base targets, not `.cfg` variants -- **Missing partition refs**: All outputs must be addressable via partition references -- **Not adding new generated files to OpenAPI outs**: Bazel hermeticity demands that we specify each output file, so when the OpenAPI code gen would create new files, we need to explicitly add them to the target's outs field. - -## Notes / Tips -- Rust dependencies are implemented via rules_rust, so new dependencies should be added in the `MODULE.bazel` file. - -## Documentation - -We use plans / designs in the [plans](./plans/) directory to anchor most large scale efforts. We create plans that are good bets, though not necessarily exhaustive, then (and this is critical) we update them after the work is completed, or after significant progress towards completion. diff --git a/CLAUDE.md b/CLAUDE.md new file mode 120000 index 0000000..47dc3e3 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1 @@ +AGENTS.md \ No newline at end of file diff --git a/DESIGN.md b/DESIGN.md index 5f014a0..263a853 100644 --- a/DESIGN.md +++ b/DESIGN.md @@ -1,26 +1,26 @@ # DataBuild Design -DataBuild is a trivially-deployable, partition-oriented, declarative build system. Where data orchestration flows are normally imperative and implicit (do this, then do that, etc), DataBuild uses stated data dependencies to make this process declarative and explicit. DataBuild scales the declarative nature of tools like DBT to meet the needs of modern, broadly integrated data and ML organizations, who consume data from many sources and which arrive on a highly varying basis. DataBuild enables confident, bounded completeness in a world where input data is effectively never complete at any given time. +DataBuild is a trivially-deployable, partition-oriented, declarative build system. Where data orchestration flows are normally imperative and implicitly coupled (do this, then do that, etc), DataBuild uses stated data dependencies to make this process declarative and explicit. DataBuild scales the declarative nature of tools like DBT to meet the needs of modern, broadly integrated data and ML organizations, who consume data from many sources and which arrive on a highly varying basis. DataBuild enables confident, bounded completeness in a world where input data is effectively never complete at any given time. ## Philosophy Many large-scale systems for producing data leave the complexity of true orchestration to the user - even DAG-based systems for implementing dependencies leave the system as a collection of DAGs, requiring engineers to solve the same "why doesn't this data exist?" and "how do I build this data?" -DataBuild takes inspiration from modern data orchestration and build systems to fully internalize this complexity, using the Job concept to localize all decisions of turning upstream data into output data (and making all dependencies explicit); and the Graph concept to handle composition of jobs, answering what sequence of jobs must be run to build a specific partition of data. With Jobs and Graphs, DataBuild takes complete responsibility for the data build process, allowing engineers to consider concerns only local to the jobs relevant to their feature. +DataBuild takes inspiration from modern data orchestration and build systems to fully internalize this complexity, using the Job concept to localize all decisions of turning upstream data into output data (and making all dependencies explicit); and the Graph concept to handle composition of jobs, enabling continuous data reconciliation for data platforms of all sizes. With Jobs and Graphs, DataBuild takes complete responsibility for the data build process, allowing engineers to consider concerns only local to the jobs relevant to their feature. Graphs and jobs are defined in [bazel](https://bazel.build), allowing graphs (and their constituent jobs) to be built and deployed trivially. ## Concepts - **Partitions** - A partition is an atomic unit of data. DataBuild's data dependencies work by using partition references (e.g. `s3://some/dataset/date=2025-06-01`) as dependency signals between jobs, allowing the construction of build graphs to produce arbitrary partitions. -- **Jobs** - Their `exec` entrypoint builds partitions from partitions, and their `config` entrypoint specifies what partitions are required to produce the requested partition(s), along with the specific config to run `exec` with to build said partitions. -- **Graphs** - Composes jobs together to achieve multi-job orchestration, using a `lookup` mechanism to resolve a requested partition to the job that can build it. Together with its constituent jobs, Graphs can fully plan the build of any set of partitions. Most interactions with a DataBuild app happen with a graph. -- **Build Event Log** - Encodes the state of the system, recording build requests, job activity, partition production, etc to enable running databuild as a deployed application. -- **Wants** - Partition wants can be registered with DataBuild, causing it to build the wanted partitions as soon as its graph-external dependencies are met. +- **Jobs** - Builds requested partitions from specific input partitions, or raising when input partitions are missing (specifying which partitions can't be built because of specific missing partitions) +- **Graphs** - Composes jobs together to achieve multi-job orchestration, using a `lookup` mechanism to resolve a requested partition to the job that can build it. Together with its constituent jobs, Graphs can fully build any set of partitions. Most interactions with a DataBuild app happen with a graph. +- **Build Event Log** - Encodes the state of the system, recording partition wants, job activity, partition production, etc to enable running databuild as a deployed application. +- **Wants** - Partition wants can be registered with DataBuild, enabling continuous data reconciliation and build of wanted partitions as soon as their graph-external dependencies are met. - **Taints** - Taints mark a partition as invalid, indicating that readers should not use it, and that it should be rebuilt when requested or depended upon. If there is a still-active want for the tainted partition, it will be rebuilt immediately. - **Bazel Targets** - Bazel is a fast, extensible, and hermetic build system. DataBuild uses bazel targets to describe graphs and jobs, making graphs themselves deployable application. Implementing a DataBuild app is the process of integrating your data build jobs in `databuild_job` bazel targets, and connecting them with a `databuild_graph` target. -- [**Graph Specification Strategies**](design/graph-specification.md) (coming soon) Application libraries in Python/Rust/Scala that use language features to enable ergonomic and succinct specification of jobs and graphs. +- [**Graph Definition Languages**](design/graph-specification.md) Application libraries in Python/Rust/Scala that use language features to enable ergonomic and succinct specification of jobs and graphs. ### Partition / Job Assumptions and Best Practices @@ -28,18 +28,11 @@ Graphs and jobs are defined in [bazel](https://bazel.build), allowing graphs (an - **Partitions are mutually exclusive and collectively exhaustive** - Row membership to a partition should be unambiguous and consistent. - **Jobs are idempotent** - For the same input data and parameters, the same partition is produced (functionally). -### Partition Delegation - -If a partition is already up to date, or is already being built by a previous build request, a new build request will "delegate" to that build request. Instead of running the job to build said partition again, it will emit a delegation event in the build event log, explicitly pointing to the build action it is delegating to. - -## Components +## Bazel Components ### Job -The `databuild_job` rule expects to reference a binary that adheres to the following expectations: - -- For the `config` subcommand, it prints the JSON job config to stdout based on the requested partitions, e.g. for a binary `bazel-bin/my_binary`, it prints a valid job config when called like `bazel-bin/my_binary config my_dataset/color=red my_dataset/color=blue`. -- For the `exec` subcommand, it produces the partitions requested to the `config` subcommand when configured by the job config it produced. E.g., if `config` had produced `{..., "args": ["red", "blue"], "env": {"MY_ENV": "foo"}`, then calling `MY_ENV=foo bazel-bin/my_binary exec red blue` should produce partitions `my_dataset/color=red` and `my_dataset/color=blue`. +The `databuild_job` rule requires just a binary target that it can execute, and any relevant metadata that helps the graph call it properly. The referenced binary should accept a list of partitions that it needs to produce, and if any required partitions are missing, report which are missing and which requested partitions they prevent from being built. Jobs are executed via a wrapper component that provides observability, error handling, and standardized communication with the graph. The wrapper captures all job output as structured logs, enabling comprehensive monitoring without requiring jobs to have network connectivity. @@ -50,19 +43,16 @@ The `databuild_graph` rule expects two fields, `jobs`, and `lookup`: - The `lookup` binary target should return a JSON object with keys as job labels and values as the list of partitions that each job is responsible for producing. This enables graph planning by walking backwards in the data dependency graph. - The `jobs` list should just be a list of all jobs involved in the graph. The graph will recursively call config to resolve the full set of jobs to run. -### Build Event Log (BEL) +### [Build Event Log (BEL)](./design/build-event-log.md) -The BEL encodes all relevant build actions that occur, enabling concurrent builds. This includes: - -- Graph events, including "build requested", "build started", "analysis started", "build failed", "build completed", etc. -- Job events, including "..." +The BEL encodes all relevant build actions that occur, enabling distributed/concurrent builds. This includes submitted wants, job events (started, succeeded, partitions missing, etc) The BEL is similar to [event-sourced](https://martinfowler.com/eaaDev/EventSourcing.html) systems, as all application state is rendered from aggregations over the BEL. This enables the BEL to stay simple while also powering concurrent builds, the data catalog, and the DataBuild service. -### Triggers and Wants (Coming Soon) -["Wants"](./design/wants.md) are the main mechanism for continually building partitions over time. In real world scenarios, it is standard for data to arrive late, or not at all. Wants cause the databuild graph to continually attempt to build the wanted partitions until a) the partitions are live or b) the want expires, at which another script can be run. Wants are the mechanism that implements SLA checking. +### Wants and Taints +["Wants"](./design/wants.md) are the main mechanism for eventually built partitions. In real world scenarios, it is standard for data to arrive late, or not at all. Wants cause the databuild graph to continually attempt to build the wanted partitions while they aren't live, and enabling it to list wants who are past SLA. -You can also use cron-based triggers, which return partition refs that they want built. +Taints allow for manual/programmatic invalidation of built partitions. Partitions tainted since their last build are considered as non-existent, and will be rebuilt if any other wanted partition depends on them. This also opens the door for invalidating downstream partitions as well. # Key Insights diff --git a/README.md b/README.md index 6bad251..b60d96d 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,9 @@ █████████╔╝ ██████╔═╝ ██╔╝ ████████╗ ███████╔═╝ ╚════════╝ ╚═════╝ ╚═╝ ╚═══════╝ ╚══════╝ - - -- S Y S T E M O N L I N E -- - + - - -- D E C L A R A T I V E -- - - + - - -- P A R T I T I O N E D -- - - + - - -- D A T A B U I L D S -- - - ``` DataBuild is a trivially-deployable, partition-oriented, declarative data build system. @@ -33,8 +35,6 @@ For important context, check out [DESIGN.md](./DESIGN.md), along with designs in - **Deploy anywhere** - One binary, any platform. Bazel-based builds create hermetic applications that run locally, in containers, or in the cloud. -- **Concurrent by design** - Multiple teams, zero conflicts. Event-sourced coordination enables parallel builds without stepping on each other. - ## Usage ### Graph Description Methods diff --git a/databuild/BUILD.bazel b/databuild/BUILD.bazel index b5632a9..7ae21b7 100644 --- a/databuild/BUILD.bazel +++ b/databuild/BUILD.bazel @@ -20,30 +20,8 @@ rust_binary( rust_library( name = "databuild", srcs = [ - "event_log/mock.rs", - "event_log/mod.rs", - "event_log/query_engine.rs", - "event_log/sqlite_storage.rs", - "event_log/storage.rs", - "event_log/writer.rs", - "format_consistency_test.rs", + "build_event_log.rs", "lib.rs", - "log_access.rs", - "log_collector.rs", - "mermaid_utils.rs", - "metric_templates.rs", - "metrics_aggregator.rs", - "orchestration/error.rs", - "orchestration/events.rs", - "orchestration/mod.rs", - "repositories/builds/mod.rs", - "repositories/jobs/mod.rs", - "repositories/mod.rs", - "repositories/partitions/mod.rs", - "repositories/tasks/mod.rs", - "service/handlers.rs", - "service/mod.rs", - "status_utils.rs", ":generate_databuild_rust", ], edition = "2021", @@ -69,20 +47,9 @@ rust_library( ], ) -# OpenAPI Spec Generator binary (no dashboard dependency) -# No need to run this manually - it will automatically generate source and it will be used in -# the related targets (e.g. //databuild/client:extract_openapi_spec) -rust_binary( - name = "openapi_spec_generator", - srcs = ["service/openapi_spec_generator.rs"], - edition = "2021", - visibility = ["//visibility:public"], - deps = [ - ":databuild", - "@crates//:log", - "@crates//:serde_json", - "@crates//:tokio", - ], +rust_test( + name = "databuild_test", + crate = ":databuild", ) # Build Graph Service binary @@ -111,17 +78,6 @@ rust_binary( ], ) -# Test for orchestration module -rust_test( - name = "orchestration_test", - crate = ":databuild", - edition = "2021", - deps = [ - "@crates//:tempfile", - "@crates//:tokio", - ], -) - # Legacy filegroup for backwards compatibility filegroup( name = "proto", diff --git a/databuild/README.md b/databuild/README.md deleted file mode 100644 index bca1c69..0000000 --- a/databuild/README.md +++ /dev/null @@ -1,26 +0,0 @@ - -# DataBuild - -## API - -A sort of requirements doc for the semantics of DataBuild, enumerating the nouns and verbs they can do. - -### Graph - -- `analyze` - Produce the job graph required to build the requested set of partitions. -- `build` - Analyze and then execute the produced job graph to build the requested partitions. -- `builds` - - `list` - List past builds. - - `show` - Shows current status of specified build and list events. Can tail build events for a build with `--follow/-f` - - `cancel` - Cancel specified build. -- `partitions` - - `list` - Lists partitions. - - `show` - Shows current status of the specified partition. - - `invalidate` - Marks a partition as invalid (will be rebuilt, won't be read). -- `jobs` - - `list` - List jobs in the graph. - - `show` - Shows task statistics (success %, runtime, etc) and recent task results. -- `tasks` (job runs) - - `list` - Lists past tasks. - - `show` - Describes current task status and lists events. - - `cancel` - Cancels a specific task. diff --git a/databuild/build_event_log.rs b/databuild/build_event_log.rs new file mode 100644 index 0000000..464343e --- /dev/null +++ b/databuild/build_event_log.rs @@ -0,0 +1,154 @@ +use crate::data_build_event::Event; +use crate::{ + BuildState, DataBuildEvent, EventFilter, WantState, +}; +use std::error::Error; +use std::sync::{Arc, RwLock}; + +trait BELStorage { + fn append_event(&mut self, event: Event) -> Result>; + fn list_events( + &self, + since_idx: u64, + filter: EventFilter, + limit: u64, + ) -> Result, Box>; +} + +struct BuildEventLog { + storage: B, + state: Arc>, +} + +impl BuildEventLog { + fn create(storage: B) -> BuildEventLog { + BuildEventLog { + storage, + state: Arc::new(Default::default()), + } + } +} + +impl BuildEventLog { + fn append_event(&mut self, event: Event) -> Result> { + let idx = self.storage.append_event(event.clone())?; + self.reduce(event); + Ok(idx) + } + + fn reduce(&mut self, event: Event) { + match event { + Event::JobRunBuffer(e) => {} + Event::JobRunQueue(_) => {} + Event::JobRunStarted(_) => {} + Event::JobRunHeartbeat(_) => {} + Event::JobRunSuccess(_) => {} + Event::JobRunFailure(_) => {} + Event::JobRunCancel(_) => {} + Event::JobRunMissingDeps(_) => {} + Event::WantCreate(e) => { + self.state + .write() + .expect("couldn't take write lock") + .wants + .insert(e.want_id.clone(), WantState { want_id: e.want_id }); + } + Event::WantCancel(_) => {} + Event::TaintCreate(_) => {} + Event::TaintDelete(_) => {} + } + } +} + +mod tests { + use crate::build_event_log::{BELStorage, BuildEventLog}; + use crate::data_build_event::Event; + use crate::{DataBuildEvent, EventFilter, PartitionRef, WantCreateEvent}; + use std::error::Error; + use std::time::{SystemTime, UNIX_EPOCH}; + + struct TestBELStorage { + events: Vec, + } + + impl TestBELStorage { + fn create() -> TestBELStorage { + TestBELStorage { events: vec![] } + } + } + + impl BELStorage for TestBELStorage { + fn append_event(&mut self, event: Event) -> Result> { + let now = SystemTime::now(); + let duration_since_epoch = now.duration_since(UNIX_EPOCH) + .expect("Time went backwards"); + + let timestamp = duration_since_epoch.as_nanos() as u64; + + let dbe = DataBuildEvent { + timestamp, + event_id: self.events.len() as u64, + event: Some(event), + }; + self.events.push(dbe); + Ok(self.events.len() as u64) + } + + fn list_events( + &self, + since_idx: u64, + filter: EventFilter, + limit: u64, + ) -> Result, Box> { + Ok(self.events.clone()) + } + } + + #[test] + fn test_hello() { + assert_eq!(2 + 3, 5); + } + + #[test] + fn test_append_event() { + let storage = TestBELStorage::create(); + let mut log = BuildEventLog::create(storage); + // Initial state + assert_eq!(log.storage.events.len(), 0); + let want_id = "1234".to_string(); + { + let state = log.state.read().unwrap(); + assert!(state.wants.get(&want_id).is_none()); + } + + // Given + log.append_event(Event::WantCreate(WantCreateEvent { + want_id: want_id.clone(), + root_want_id: "123".to_string(), + parent_want_id: "123".to_string(), + partitions: vec![PartitionRef { + r#ref: "".to_string(), + }], + data_timestamp: 0, + ttl_seconds: 1, + sla_seconds: 1, + source: None, + comment: None, + })) + .expect("append_event failed"); + + // Assert + assert_eq!(log.storage.events.len(), 1); + let state = log.state.read().expect("couldn't take read lock"); + assert!(state.wants.get(&want_id).is_some(), "want_id not found"); + assert_eq!( + state + .wants + .get(&want_id) + .map(|want| want.want_id.clone()) + .expect("state.wants want_id not found"), + want_id, + "want_id not equal", + ); + } +} diff --git a/databuild/cli/BUILD.bazel b/databuild/cli/BUILD.bazel deleted file mode 100644 index ec2794d..0000000 --- a/databuild/cli/BUILD.bazel +++ /dev/null @@ -1,27 +0,0 @@ -load("@rules_rust//rust:defs.bzl", "rust_binary") - -# DataBuild CLI wrapper using orchestrator -rust_binary( - name = "databuild_cli", - srcs = [ - "main.rs", - "error.rs", - ], - edition = "2021", - visibility = ["//visibility:public"], - data = [ - "//databuild/graph:analyze", - "//databuild/graph:execute", - ], - deps = [ - "//databuild:databuild", - "@crates//:clap", - "@crates//:log", - "@crates//:serde", - "@crates//:serde_json", - "@crates//:simple_logger", - "@crates//:thiserror", - "@crates//:tokio", - "@crates//:uuid", - ], -) \ No newline at end of file diff --git a/databuild/cli/error.rs b/databuild/cli/error.rs deleted file mode 100644 index 1320195..0000000 --- a/databuild/cli/error.rs +++ /dev/null @@ -1,31 +0,0 @@ -use crate::event_log::BuildEventLogError; -use crate::orchestration::OrchestrationError; - -#[derive(Debug, thiserror::Error)] -pub enum CliError { - #[error("Event log error: {0}")] - EventLog(#[from] BuildEventLogError), - - #[error("Orchestration error: {0}")] - Orchestration(#[from] OrchestrationError), - - #[error("Analysis error: {0}")] - Analysis(String), - - #[error("Execution error: {0}")] - Execution(String), - - #[error("Environment error: {0}")] - Environment(String), - - #[error("Invalid arguments: {0}")] - InvalidArguments(String), - - #[error("Database error: {0}")] - Database(String), - - #[error("Output formatting error: {0}")] - Output(String), -} - -pub type Result = std::result::Result; \ No newline at end of file diff --git a/databuild/cli/main.rs b/databuild/cli/main.rs deleted file mode 100644 index 4f2ba7b..0000000 --- a/databuild/cli/main.rs +++ /dev/null @@ -1,999 +0,0 @@ -use databuild::*; -use databuild::event_log::create_bel_query_engine; -use databuild::orchestration::{BuildOrchestrator, BuildResult}; -use databuild::repositories::{ - partitions::PartitionsRepository, - jobs::JobsRepository, - tasks::TasksRepository, - builds::BuildsRepository -}; -use clap::{Arg, Command as ClapCommand, ArgMatches}; -use log::{info, error}; -use simple_logger::SimpleLogger; -use std::env; -use std::process::{Command, Stdio}; -use uuid::Uuid; - -mod error; -use error::{CliError, Result}; - -/// Run the analyze command and return the job graph -async fn run_analysis( - partitions: &[String], - orchestrator: &BuildOrchestrator, -) -> Result { - info!("Running analysis for partitions: {:?}", partitions); - - // Get required environment variables - let candidate_jobs = env::var("DATABUILD_CANDIDATE_JOBS_CFG") - .map_err(|_| CliError::Environment("DATABUILD_CANDIDATE_JOBS_CFG not set".to_string()))?; - let job_lookup_path = env::var("DATABUILD_JOB_LOOKUP_PATH") - .map_err(|_| CliError::Environment("DATABUILD_JOB_LOOKUP_PATH not set".to_string()))?; - let graph_label = env::var("DATABUILD_GRAPH_LABEL") - .map_err(|_| CliError::Environment("DATABUILD_GRAPH_LABEL not set".to_string()))?; - - // Find analyze binary using runfiles - let analyze_path = env::var("DATABUILD_ANALYZE_BINARY") - .map_err(|_| CliError::Environment("DATABUILD_ANALYZE_BINARY not set".to_string()))?; - - // Build analyze command - let cmd = Command::new(analyze_path) - .args(partitions) - .env("DATABUILD_CANDIDATE_JOBS_CFG", candidate_jobs) - .env("DATABUILD_JOB_LOOKUP_PATH", job_lookup_path) - .env("DATABUILD_GRAPH_LABEL", graph_label) - .env("DATABUILD_MODE", "plan") - .env("DATABUILD_BUILD_REQUEST_ID", orchestrator.build_request_id()) - .stdout(Stdio::piped()) - .stderr(Stdio::piped()) - .spawn() - .map_err(|e| CliError::Analysis(format!("Failed to spawn analyze process: {}", e)))?; - - let output = cmd.wait_with_output() - .map_err(|e| CliError::Analysis(format!("Failed to run analyze: {}", e)))?; - - if !output.status.success() { - let stderr = String::from_utf8_lossy(&output.stderr); - return Err(CliError::Analysis(format!("Analysis failed: {}", stderr))); - } - - let stdout = String::from_utf8_lossy(&output.stdout); - let job_graph: JobGraph = serde_json::from_str(&stdout) - .map_err(|e| CliError::Analysis(format!("Failed to parse job graph: {}", e)))?; - - info!("Analysis complete, found {} tasks", job_graph.nodes.len()); - Ok(job_graph) -} - -/// Run the execute command with the job graph -async fn run_execution( - job_graph: JobGraph, - orchestrator: &BuildOrchestrator, -) -> Result { - info!("Running execution for {} tasks", job_graph.nodes.len()); - - // Serialize job graph to JSON for the execute command - let job_graph_json = serde_json::to_string(&job_graph) - .map_err(|e| CliError::Execution(format!("Failed to serialize job graph: {}", e)))?; - - // Get required environment variables - let candidate_jobs = env::var("DATABUILD_CANDIDATE_JOBS_CFG") - .map_err(|_| CliError::Environment("DATABUILD_CANDIDATE_JOBS_CFG not set".to_string()))?; - let build_event_log_uri = env::var("DATABUILD_BUILD_EVENT_LOG").unwrap_or_else(|_| "stdout".to_string()); - - // Find execute binary using runfiles - let execute_path = env::var("DATABUILD_EXECUTE_BINARY") - .map_err(|_| CliError::Environment("DATABUILD_EXECUTE_BINARY not set".to_string()))?; - - // Build execute command - let mut cmd = Command::new(execute_path) - .env("DATABUILD_CANDIDATE_JOBS_CFG", candidate_jobs) - .env("DATABUILD_BUILD_EVENT_LOG", build_event_log_uri) - .env("DATABUILD_BUILD_REQUEST_ID", orchestrator.build_request_id()) - .stdin(Stdio::piped()) - .stdout(Stdio::piped()) - .stderr(Stdio::piped()) - .spawn() - .map_err(|e| CliError::Execution(format!("Failed to spawn execute process: {}", e)))?; - - // Write job graph to stdin - if let Some(stdin) = cmd.stdin.as_mut() { - use std::io::Write; - stdin.write_all(job_graph_json.as_bytes()) - .map_err(|e| CliError::Execution(format!("Failed to write job graph to execute: {}", e)))?; - } - - let output = cmd.wait_with_output() - .map_err(|e| CliError::Execution(format!("Failed to run execute: {}", e)))?; - - if !output.status.success() { - let stderr = String::from_utf8_lossy(&output.stderr); - error!("Execution failed:\n{}", stderr); - return Err(CliError::Execution("Execution failed".to_string())); - } - - // For now, assume success if the command completed without error - // In the future, we could parse the output to get more detailed results - info!("Execution completed successfully"); - Ok(BuildResult::Success { jobs_completed: job_graph.nodes.len() }) -} - -async fn handle_build_command(matches: &ArgMatches) -> Result<()> { - let partitions: Vec = matches.get_many::("partitions") - .unwrap() - .cloned() - .collect(); - - let event_log_uri = matches.get_one::("event-log") - .cloned() - .or_else(|| env::var("DATABUILD_BUILD_EVENT_LOG").ok()) - .unwrap_or_else(|| "stdout".to_string()); - - let build_request_id = matches.get_one::("build-request-id") - .cloned() - .or_else(|| env::var("DATABUILD_BUILD_REQUEST_ID").ok()) - .unwrap_or_else(|| Uuid::new_v4().to_string()); - - info!("Build request ID: {}", build_request_id); - info!("Partitions: {:?}", partitions); - info!("Event log URI: {}", event_log_uri); - - // Create event log and orchestrator - let query_engine = create_bel_query_engine(&event_log_uri).await?; - - let requested_partitions: Vec = partitions.iter() - .map(|p| PartitionRef { str: p.clone() }) - .collect(); - - let orchestrator = BuildOrchestrator::new( - query_engine.clone(), - build_request_id, - requested_partitions, - ); - - // Emit orchestration events - orchestrator.start_build().await?; - orchestrator.start_planning().await?; - - // Run analysis - let job_graph = run_analysis(&partitions, &orchestrator).await?; - - orchestrator.start_execution().await?; - - // Run execution - let result = run_execution(job_graph, &orchestrator).await?; - - orchestrator.complete_build(result).await?; - - info!("DataBuild CLI completed successfully"); - Ok(()) -} - -fn format_timestamp(timestamp_nanos: i64) -> String { - use std::time::{UNIX_EPOCH, Duration}; - - let timestamp_secs = timestamp_nanos / 1_000_000_000; - let system_time = UNIX_EPOCH + Duration::from_secs(timestamp_secs as u64); - - match system_time.duration_since(UNIX_EPOCH) { - Ok(duration) => { - let secs = duration.as_secs(); - let days = secs / 86400; - let hours = (secs % 86400) / 3600; - let minutes = (secs % 3600) / 60; - - if days > 0 { - format!("{}d {}h ago", days, hours) - } else if hours > 0 { - format!("{}h {}m ago", hours, minutes) - } else { - format!("{}m ago", minutes) - } - } - Err(_) => "unknown".to_string(), - } -} - -#[tokio::main] -async fn main() -> Result<()> { - // Initialize logger - SimpleLogger::new() - .with_level(log::LevelFilter::Info) - .init() - .map_err(|e| CliError::Environment(format!("Failed to initialize logger: {}", e)))?; - - // Parse command line arguments - let matches = ClapCommand::new("databuild") - .version("1.0") - .about("DataBuild unified CLI") - .subcommand_required(false) - .arg_required_else_help(false) - .arg( - Arg::new("partitions") - .help("Partition references to build (legacy direct build mode)") - .num_args(1..) - .value_name("PARTITIONS") - ) - .subcommand( - ClapCommand::new("build") - .about("Build partitions using the DataBuild execution engine") - .arg( - Arg::new("partitions") - .help("Partition references to build") - .required(true) - .num_args(1..) - .value_name("PARTITIONS") - ) - .arg( - Arg::new("event-log") - .long("event-log") - .help("Event log URI (default: stdout)") - .value_name("URI") - ) - .arg( - Arg::new("build-request-id") - .long("build-request-id") - .help("Build request ID (default: generate UUID)") - .value_name("ID") - ) - ) - .subcommand( - ClapCommand::new("partitions") - .about("Query and manage partitions") - .subcommand( - ClapCommand::new("list") - .about("List all partitions") - .arg(Arg::new("limit").long("limit").short('l').value_name("LIMIT").help("Maximum number of partitions to show")) - .arg(Arg::new("format").long("format").short('f').value_name("FORMAT").help("Output format (table or json)").default_value("table")) - ) - .subcommand( - ClapCommand::new("show") - .about("Show partition details") - .arg(Arg::new("partition_ref").required(true).help("Partition reference")) - .arg(Arg::new("format").long("format").short('f').value_name("FORMAT").help("Output format (table or json)").default_value("table")) - ) - .subcommand( - ClapCommand::new("invalidate") - .about("Invalidate a partition") - .arg(Arg::new("partition_ref").required(true).help("Partition reference")) - .arg(Arg::new("reason").long("reason").short('r').required(true).help("Reason for invalidation")) - .arg(Arg::new("build_request_id").long("build-request-id").short('b').required(true).help("Build request ID")) - ) - ) - .subcommand( - ClapCommand::new("jobs") - .about("Query job execution data") - .subcommand( - ClapCommand::new("list") - .about("List all jobs") - .arg(Arg::new("limit").long("limit").short('l').value_name("LIMIT").help("Maximum number of jobs to show")) - .arg(Arg::new("format").long("format").short('f').value_name("FORMAT").help("Output format (table or json)").default_value("table")) - ) - .subcommand( - ClapCommand::new("show") - .about("Show job details") - .arg(Arg::new("job_label").required(true).help("Job label")) - .arg(Arg::new("format").long("format").short('f').value_name("FORMAT").help("Output format (table or json)").default_value("table")) - ) - ) - .subcommand( - ClapCommand::new("tasks") - .about("Query and manage tasks (job runs)") - .subcommand( - ClapCommand::new("list") - .about("List all tasks") - .arg(Arg::new("limit").long("limit").short('l').value_name("LIMIT").help("Maximum number of tasks to show")) - .arg(Arg::new("format").long("format").short('f').value_name("FORMAT").help("Output format (table or json)").default_value("table")) - ) - .subcommand( - ClapCommand::new("show") - .about("Show task details") - .arg(Arg::new("job_run_id").required(true).help("Job run ID")) - .arg(Arg::new("format").long("format").short('f').value_name("FORMAT").help("Output format (table or json)").default_value("table")) - ) - .subcommand( - ClapCommand::new("cancel") - .about("Cancel a task") - .arg(Arg::new("job_run_id").required(true).help("Job run ID")) - .arg(Arg::new("reason").long("reason").short('r').required(true).help("Reason for cancellation")) - .arg(Arg::new("build_request_id").long("build-request-id").short('b').required(true).help("Build request ID")) - ) - ) - .subcommand( - ClapCommand::new("builds") - .about("Query and manage build requests") - .subcommand( - ClapCommand::new("list") - .about("List all builds") - .arg(Arg::new("limit").long("limit").short('l').value_name("LIMIT").help("Maximum number of builds to show")) - .arg(Arg::new("format").long("format").short('f').value_name("FORMAT").help("Output format (table or json)").default_value("table")) - ) - .subcommand( - ClapCommand::new("show") - .about("Show build details") - .arg(Arg::new("build_request_id").required(true).help("Build request ID")) - .arg(Arg::new("format").long("format").short('f').value_name("FORMAT").help("Output format (table or json)").default_value("table")) - ) - .subcommand( - ClapCommand::new("cancel") - .about("Cancel a build") - .arg(Arg::new("build_request_id").required(true).help("Build request ID")) - .arg(Arg::new("reason").long("reason").short('r').required(true).help("Reason for cancellation")) - ) - ) - .arg( - Arg::new("event-log") - .long("event-log") - .help("Event log URI (default: sqlite:databuild.db for repository commands)") - .value_name("URI") - .global(true) - ) - .get_matches(); - - // Get global event log URI - let event_log_uri = matches.get_one::("event-log") - .cloned() - .or_else(|| env::var("DATABUILD_BUILD_EVENT_LOG").ok()) - .unwrap_or_else(|| "sqlite:databuild.db".to_string()); - - match matches.subcommand() { - Some(("build", sub_matches)) => { - handle_build_command(sub_matches).await?; - } - Some(("partitions", sub_matches)) => { - handle_partitions_command(sub_matches, &event_log_uri).await?; - } - Some(("jobs", sub_matches)) => { - handle_jobs_command(sub_matches, &event_log_uri).await?; - } - Some(("tasks", sub_matches)) => { - handle_tasks_command(sub_matches, &event_log_uri).await?; - } - Some(("builds", sub_matches)) => { - handle_builds_command(sub_matches, &event_log_uri).await?; - } - _ => { - // Check if direct partition arguments were provided (legacy mode) - if let Some(partitions) = matches.get_many::("partitions") { - let partition_list: Vec = partitions.cloned().collect(); - if !partition_list.is_empty() { - // Create a synthetic build command with these partitions - let build_cmd = ClapCommand::new("build") - .arg(Arg::new("partitions").num_args(1..)) - .arg(Arg::new("event-log").long("event-log")) - .arg(Arg::new("build-request-id").long("build-request-id")); - - let build_matches = build_cmd.try_get_matches_from( - std::iter::once("build".to_string()).chain(partition_list.clone()) - ).map_err(|e| CliError::InvalidArguments(format!("Failed to parse legacy build arguments: {}", e)))?; - - handle_build_command(&build_matches).await?; - return Ok(()); - } - } - - // Show help if no subcommand or arguments provided - let mut cmd = ClapCommand::new("databuild") - .version("1.0") - .about("DataBuild unified CLI"); - cmd.print_help().unwrap(); - println!(); - } - } - - Ok(()) -} - -async fn handle_partitions_command(matches: &ArgMatches, event_log_uri: &str) -> Result<()> { - let query_engine = create_bel_query_engine(event_log_uri).await - .map_err(|e| CliError::Database(format!("Failed to connect to event log: {}", e)))?; - - let repository = PartitionsRepository::new(query_engine); - - match matches.subcommand() { - Some(("list", sub_matches)) => { - let limit = sub_matches.get_one::("limit").and_then(|s| s.parse::().ok()); - let format = sub_matches.get_one::("format").map(|s| s.as_str()).unwrap_or("table"); - - // Use new protobuf response format for consistency with service - let request = PartitionsListRequest { - limit, - offset: None, // TODO: Add offset support to CLI - status_filter: None, // TODO: Add status filtering to CLI - }; - - let response = repository.list_protobuf(request).await - .map_err(|e| CliError::Database(format!("Failed to list partitions: {}", e)))?; - - match format { - "json" => { - let json = serde_json::to_string_pretty(&response) - .map_err(|e| CliError::Output(format!("Failed to serialize to JSON: {}", e)))?; - println!("{}", json); - } - _ => { - if response.partitions.is_empty() { - println!("No partitions found"); - return Ok(()); - } - - println!("Partitions ({} total):", response.total_count); - println!(); - println!("{:<30} {:<15} {:<12} {:<12} {:<20}", "Partition", "Status", "Builds", "Invalidated", "Last Updated"); - println!("{}", "-".repeat(90)); - - for partition in response.partitions { - let last_updated = format_timestamp(partition.last_updated); - - println!("{:<30} {:<15} {:<12} {:<12} {:<20}", - partition.partition_ref.map(|p| p.str).unwrap_or("".to_string()), - partition.status_name, // Use human-readable status name - partition.builds_count, - partition.invalidation_count, - last_updated - ); - } - - if response.has_more { - println!("\nNote: More results available. Use --limit to control output."); - } - } - } - } - Some(("show", sub_matches)) => { - let partition_ref = sub_matches.get_one::("partition_ref").unwrap(); - let format = sub_matches.get_one::("format").map(|s| s.as_str()).unwrap_or("table"); - let result = repository.show_protobuf(partition_ref).await - .map_err(|e| CliError::Database(format!("Failed to show partition: {}", e)))?; - - match result { - Some(detail) => { - match format { - "json" => { - let json = serde_json::to_string_pretty(&detail) - .map_err(|e| CliError::Output(format!("Failed to serialize to JSON: {}", e)))?; - println!("{}", json); - } - _ => { - println!("Partition: {}", detail.partition_ref.map(|p| p.str).unwrap_or("".to_string())); - println!("Status: {} ({})", detail.status_name, detail.status_code); - println!("Builds involved: {}", detail.builds_count); - println!("Invalidation count: {}", detail.invalidation_count); - println!("Last updated: {}", format_timestamp(detail.last_updated)); - - if let Some(ref last_build) = detail.last_successful_build { - println!("\nLast successful build: {}", last_build); - } - - if !detail.timeline.is_empty() { - println!("\nTimeline ({} events):", detail.timeline.len()); - for event in detail.timeline { - let timestamp = format_timestamp(event.timestamp); - println!(" {} [{}] {}", timestamp, event.status_name, event.message); - if event.message.starts_with("Invalidated:") { - // Invalidation reason is in the message - } - } - } - } - } - } - None => { - match format { - "json" => { - println!("null"); - } - _ => { - println!("Partition '{}' not found", partition_ref); - } - } - } - } - } - Some(("invalidate", sub_matches)) => { - let partition_ref = sub_matches.get_one::("partition_ref").unwrap(); - let reason = sub_matches.get_one::("reason").unwrap(); - let build_request_id = sub_matches.get_one::("build_request_id").unwrap(); - - let partition_ref_obj = PartitionRef { str: partition_ref.clone() }; - - repository.invalidate(&partition_ref_obj.str, reason.clone(), build_request_id.clone()).await - .map_err(|e| CliError::Database(format!("Failed to invalidate partition: {}", e)))?; - - println!("Successfully invalidated partition '{}' with reason: {}", partition_ref, reason); - } - _ => { - println!("Unknown partitions subcommand. Use 'list', 'show', or 'invalidate'."); - } - } - - Ok(()) -} - -async fn handle_jobs_command(matches: &ArgMatches, event_log_uri: &str) -> Result<()> { - let query_engine = create_bel_query_engine(event_log_uri).await - .map_err(|e| CliError::Database(format!("Failed to connect to event log: {}", e)))?; - - let repository = JobsRepository::new(query_engine); - - match matches.subcommand() { - Some(("list", sub_matches)) => { - let limit = sub_matches.get_one::("limit").and_then(|s| s.parse().ok()); - let format = sub_matches.get_one::("format").map(|s| s.as_str()).unwrap_or("table"); - let jobs = repository.list(limit).await - .map_err(|e| CliError::Database(format!("Failed to list jobs: {}", e)))?; - - match format { - "json" => { - let json = serde_json::to_string_pretty(&jobs) - .map_err(|e| CliError::Output(format!("Failed to serialize to JSON: {}", e)))?; - println!("{}", json); - } - _ => { - if jobs.is_empty() { - println!("No jobs found"); - return Ok(()); - } - - println!("Jobs ({} total):", jobs.len()); - println!(); - println!("{:<40} {:<8} {:<8} {:<8} {:<8} {:<8} {:<20}", "Job Label", "Runs", "Success", "Failed", "Cancel", "Avg Parts", "Last Run"); - println!("{}", "-".repeat(120)); - - for job in jobs { - let success_rate = if job.total_runs > 0 { - (job.successful_runs as f64 / job.total_runs as f64 * 100.0) as u32 - } else { - 0 - }; - - let last_run = format_timestamp(job.last_run_timestamp); - let last_status = format!("{:?}", job.last_run_status); - - println!("{:<40} {:<8} {:<8} {:<8} {:<8} {:<8.1} {:<20}", - job.job_label, - job.total_runs, - format!("{}({}%)", job.successful_runs, success_rate), - job.failed_runs, - job.cancelled_runs, - job.average_partitions_per_run, - format!("{} ({})", last_run, last_status) - ); - } - } - } - } - Some(("show", sub_matches)) => { - let job_label = sub_matches.get_one::("job_label").unwrap(); - let format = sub_matches.get_one::("format").map(|s| s.as_str()).unwrap_or("table"); - let result = repository.show_protobuf(job_label).await - .map_err(|e| CliError::Database(format!("Failed to show job: {}", e)))?; - - match result { - Some(detail) => { - match format { - "json" => { - let json = serde_json::to_string_pretty(&detail) - .map_err(|e| CliError::Output(format!("Failed to serialize to JSON: {}", e)))?; - println!("{}", json); - } - _ => { - println!("Job: {}", detail.job_label); - println!("Total runs: {}", detail.total_runs); - println!("Successful runs: {} ({:.1}%)", detail.successful_runs, - if detail.total_runs > 0 { detail.successful_runs as f64 / detail.total_runs as f64 * 100.0 } else { 0.0 }); - println!("Failed runs: {}", detail.failed_runs); - println!("Cancelled runs: {}", detail.cancelled_runs); - println!("Average partitions per run: {:.1}", detail.average_partitions_per_run); - println!("Last run: {} ({} - {})", format_timestamp(detail.last_run_timestamp), detail.last_run_status_name, detail.last_run_status_code); - - if !detail.recent_builds.is_empty() { - println!("\nRecent builds:"); - for build_id in &detail.recent_builds { - println!(" - {}", build_id); - } - } - - if !detail.runs.is_empty() { - println!("\nExecution history ({} runs):", detail.runs.len()); - println!("{:<25} {:<15} {:<15} {:<10} {:<30}", "Run ID", "Status", "Duration", "Parts", "Build Request"); - println!("{}", "-".repeat(95)); - - for run in detail.runs.iter().take(10) { // Show last 10 runs - let duration_str = if let Some(duration) = run.duration_ms { - if duration > 1000 { - format!("{:.1}s", duration as f64 / 1000.0) - } else { - format!("{}ms", duration) - } - } else { - "N/A".to_string() - }; - - println!("{:<25} {:<15} {:<15} {:<10} {:<30}", - run.job_run_id, - run.status_name, - duration_str, - run.target_partitions.len(), - run.build_request_id - ); - } - - if detail.runs.len() > 10 { - println!("... and {} more runs", detail.runs.len() - 10); - } - } - } - } - } - None => { - match format { - "json" => { - println!("null"); - } - _ => { - println!("Job '{}' not found", job_label); - } - } - } - } - } - _ => { - println!("Unknown jobs subcommand. Use 'list' or 'show'."); - } - } - - Ok(()) -} - -async fn handle_tasks_command(matches: &ArgMatches, event_log_uri: &str) -> Result<()> { - let query_engine = create_bel_query_engine(event_log_uri).await - .map_err(|e| CliError::Database(format!("Failed to connect to event log: {}", e)))?; - - let repository = TasksRepository::new(query_engine); - - match matches.subcommand() { - Some(("list", sub_matches)) => { - let limit = sub_matches.get_one::("limit").and_then(|s| s.parse().ok()); - let format = sub_matches.get_one::("format").map(|s| s.as_str()).unwrap_or("table"); - let tasks = repository.list(limit).await - .map_err(|e| CliError::Database(format!("Failed to list tasks: {}", e)))?; - - match format { - "json" => { - let json = serde_json::to_string_pretty(&tasks) - .map_err(|e| CliError::Output(format!("Failed to serialize to JSON: {}", e)))?; - println!("{}", json); - } - _ => { - if tasks.is_empty() { - println!("No tasks found"); - return Ok(()); - } - - println!("Tasks ({} total):", tasks.len()); - println!(); - println!("{:<25} {:<30} {:<15} {:<15} {:<10} {:<20}", "Job Run ID", "Job Label", "Status", "Duration", "Parts", "Scheduled"); - println!("{}", "-".repeat(115)); - - for task in tasks { - let duration_str = if let Some(duration) = task.duration_ms { - if duration > 1000 { - format!("{:.1}s", duration as f64 / 1000.0) - } else { - format!("{}ms", duration) - } - } else { - "N/A".to_string() - }; - - let scheduled = format_timestamp(task.scheduled_at); - let status_str = if task.cancelled { - format!("{:?}*", task.status) // Add asterisk for cancelled tasks - } else { - format!("{:?}", task.status) - }; - - println!("{:<25} {:<30} {:<15} {:<15} {:<10} {:<20}", - task.job_run_id, - task.job_label, - status_str, - duration_str, - task.target_partitions.len(), - scheduled - ); - } - - println!("\n* = Cancelled task"); - } - } - } - Some(("show", sub_matches)) => { - let job_run_id = sub_matches.get_one::("job_run_id").unwrap(); - let format = sub_matches.get_one::("format").map(|s| s.as_str()).unwrap_or("table"); - let result = repository.show_protobuf(job_run_id).await - .map_err(|e| CliError::Database(format!("Failed to show task: {}", e)))?; - - match result { - Some(detail) => { - match format { - "json" => { - let json = serde_json::to_string_pretty(&detail) - .map_err(|e| CliError::Output(format!("Failed to serialize to JSON: {}", e)))?; - println!("{}", json); - } - _ => { - println!("Task: {}", detail.job_run_id); - println!("Job: {}", detail.job_label); - println!("Build request: {}", detail.build_request_id); - println!("Status: {} ({})", detail.status_name, detail.status_code); - println!("Target partitions: {}", detail.target_partitions.len()); - println!("Scheduled: {}", format_timestamp(detail.scheduled_at)); - - if let Some(started) = detail.started_at { - println!("Started: {}", format_timestamp(started)); - } - - if let Some(completed) = detail.completed_at { - println!("Completed: {}", format_timestamp(completed)); - } - - if let Some(duration) = detail.duration_ms { - if duration > 1000 { - println!("Duration: {:.1}s", duration as f64 / 1000.0); - } else { - println!("Duration: {}ms", duration); - } - } - - if detail.cancelled { - println!("Cancelled: Yes"); - if let Some(ref reason) = detail.cancel_reason { - println!("Cancel reason: {}", reason); - } - } - - if !detail.message.is_empty() { - println!("Message: {}", detail.message); - } - - if !detail.target_partitions.is_empty() { - println!("\nTarget partitions:"); - for partition in &detail.target_partitions { - println!(" - {}", partition.str); - } - } - - if !detail.timeline.is_empty() { - println!("\nTimeline ({} events):", detail.timeline.len()); - for event in detail.timeline { - let timestamp = format_timestamp(event.timestamp); - let status_info = if let Some(ref status_name) = event.status_name { - format!(" -> {}", status_name) - } else { - String::new() - }; - - println!(" {} [{}]{} {}", timestamp, event.event_type, status_info, event.message); - if let Some(ref reason) = event.cancel_reason { - println!(" Reason: {}", reason); - } - } - } - } - } - } - None => { - match format { - "json" => { - println!("null"); - } - _ => { - println!("Task '{}' not found", job_run_id); - } - } - } - } - } - Some(("cancel", sub_matches)) => { - let job_run_id = sub_matches.get_one::("job_run_id").unwrap(); - let reason = sub_matches.get_one::("reason").unwrap(); - let build_request_id = sub_matches.get_one::("build_request_id").unwrap(); - - repository.cancel(job_run_id, reason.clone(), build_request_id.clone()).await - .map_err(|e| CliError::Database(format!("Failed to cancel task: {}", e)))?; - - println!("Successfully cancelled task '{}' with reason: {}", job_run_id, reason); - } - _ => { - println!("Unknown tasks subcommand. Use 'list', 'show', or 'cancel'."); - } - } - - Ok(()) -} - -async fn handle_builds_command(matches: &ArgMatches, event_log_uri: &str) -> Result<()> { - let query_engine = create_bel_query_engine(event_log_uri).await - .map_err(|e| CliError::Database(format!("Failed to connect to event log: {}", e)))?; - - let repository = BuildsRepository::new(query_engine); - - match matches.subcommand() { - Some(("list", sub_matches)) => { - let limit = sub_matches.get_one::("limit").and_then(|s| s.parse().ok()); - let format = sub_matches.get_one::("format").map(|s| s.as_str()).unwrap_or("table"); - let builds = repository.list(limit).await - .map_err(|e| CliError::Database(format!("Failed to list builds: {}", e)))?; - - match format { - "json" => { - let json = serde_json::to_string_pretty(&builds) - .map_err(|e| CliError::Output(format!("Failed to serialize to JSON: {}", e)))?; - println!("{}", json); - } - _ => { - if builds.is_empty() { - println!("No builds found"); - return Ok(()); - } - - println!("Builds ({} total):", builds.len()); - println!(); - println!("{:<40} {:<15} {:<15} {:<8} {:<8} {:<8} {:<20}", "Build Request ID", "Status", "Duration", "Parts", "Jobs", "Comp", "Requested"); - println!("{}", "-".repeat(120)); - - for build in builds { - let duration_str = if let Some(duration) = build.duration_ms { - if duration > 60000 { - format!("{:.1}m", duration as f64 / 60000.0) - } else if duration > 1000 { - format!("{:.1}s", duration as f64 / 1000.0) - } else { - format!("{}ms", duration) - } - } else { - "N/A".to_string() - }; - - let requested = format_timestamp(build.requested_at); - let status_str = if build.cancelled { - format!("{:?}*", build.status) // Add asterisk for cancelled builds - } else { - format!("{:?}", build.status) - }; - - let completion_rate = if build.total_jobs > 0 { - format!("{}/{}", build.completed_jobs, build.total_jobs) - } else { - "0/0".to_string() - }; - - println!("{:<40} {:<15} {:<15} {:<8} {:<8} {:<8} {:<20}", - build.build_request_id, - status_str, - duration_str, - build.requested_partitions.len(), - build.total_jobs, - completion_rate, - requested - ); - } - - println!("\n* = Cancelled build"); - } - } - } - Some(("show", sub_matches)) => { - let build_request_id = sub_matches.get_one::("build_request_id").unwrap(); - let format = sub_matches.get_one::("format").map(|s| s.as_str()).unwrap_or("table"); - let result = repository.show_protobuf(build_request_id).await - .map_err(|e| CliError::Database(format!("Failed to show build: {}", e)))?; - - match result { - Some(detail) => { - match format { - "json" => { - let json = serde_json::to_string_pretty(&detail) - .map_err(|e| CliError::Output(format!("Failed to serialize to JSON: {}", e)))?; - println!("{}", json); - } - _ => { - println!("Build: {}", detail.build_request_id); - println!("Status: {} ({})", detail.status.clone().unwrap().name, detail.status.unwrap().code); - println!("Requested partitions: {}", detail.requested_partitions.len()); - println!("Total jobs: {}", detail.total_jobs); - println!("Completed jobs: {}", detail.completed_jobs); - println!("Failed jobs: {}", detail.failed_jobs); - println!("Cancelled jobs: {}", detail.cancelled_jobs); - println!("Requested: {}", format_timestamp(detail.requested_at)); - - if let Some(started) = detail.started_at { - println!("Started: {}", format_timestamp(started)); - } - - if let Some(completed) = detail.completed_at { - println!("Completed: {}", format_timestamp(completed)); - } - - if let Some(duration) = detail.duration_ms { - if duration > 60000 { - println!("Duration: {:.1}m", duration as f64 / 60000.0); - } else if duration > 1000 { - println!("Duration: {:.1}s", duration as f64 / 1000.0); - } else { - println!("Duration: {}ms", duration); - } - } - - if detail.cancelled { - println!("Cancelled: Yes"); - if let Some(ref reason) = detail.cancel_reason { - println!("Cancel reason: {}", reason); - } - } - - if !detail.requested_partitions.is_empty() { - println!("\nRequested partitions:"); - for partition in &detail.requested_partitions { - println!(" - {}", partition.str); - } - } - - // Show job statistics - if detail.total_jobs > 0 { - let success_rate = (detail.completed_jobs as f64 / detail.total_jobs as f64 * 100.0) as u32; - println!("\nJob statistics:"); - println!(" Success rate: {}% ({}/{})", success_rate, detail.completed_jobs, detail.total_jobs); - - if detail.failed_jobs > 0 { - println!(" Failed: {}", detail.failed_jobs); - } - if detail.cancelled_jobs > 0 { - println!(" Cancelled: {}", detail.cancelled_jobs); - } - } - - if !detail.timeline.is_empty() { - println!("\nTimeline ({} events):", detail.timeline.len()); - for event in detail.timeline { - let timestamp = format_timestamp(event.timestamp); - let status_info = event.status.unwrap().name; - - println!(" {} [{}]{} {}", timestamp, event.event_type, status_info, event.message); - if let Some(ref reason) = event.cancel_reason { - println!(" Reason: {}", reason); - } - } - } - } - } - } - None => { - match format { - "json" => { - println!("null"); - } - _ => { - println!("Build '{}' not found", build_request_id); - } - } - } - } - } - Some(("cancel", sub_matches)) => { - let build_request_id = sub_matches.get_one::("build_request_id").unwrap(); - let reason = sub_matches.get_one::("reason").unwrap(); - - repository.cancel(build_request_id, reason.clone()).await - .map_err(|e| CliError::Database(format!("Failed to cancel build: {}", e)))?; - - println!("Successfully cancelled build '{}' with reason: {}", build_request_id, reason); - } - _ => { - println!("Unknown builds subcommand. Use 'list', 'show', or 'cancel'."); - } - } - - Ok(()) -} \ No newline at end of file diff --git a/databuild/client/BUILD.bazel b/databuild/client/BUILD.bazel deleted file mode 100644 index e9006f8..0000000 --- a/databuild/client/BUILD.bazel +++ /dev/null @@ -1,194 +0,0 @@ -load("@aspect_rules_ts//ts:defs.bzl", "ts_config", "ts_project") - -# Extract OpenAPI spec from the dedicated spec generator binary -genrule( - name = "extract_openapi_spec", - srcs = [], - outs = ["openapi.json"], - cmd = """ - $(location //databuild:openapi_spec_generator) > $@ - """, - tools = [ - "//databuild:openapi_spec_generator", - ], - visibility = ["//visibility:public"], -) - -# TypeScript generator configuration -filegroup( - name = "typescript_generator_config", - srcs = ["typescript_generator_config.json"], - visibility = ["//visibility:public"], -) - -# Generate TypeScript client using OpenAPI Generator JAR -genrule( - name = "typescript_client", - srcs = [ - ":extract_openapi_spec", - ":typescript_generator_config", - ], - outs = [ - "typescript_generated/src/apis/DefaultApi.ts", - "typescript_generated/src/apis/index.ts", - "typescript_generated/src/models/index.ts", - "typescript_generated/src/models/ActivityApiResponse.ts", - "typescript_generated/src/models/ActivityResponse.ts", - "typescript_generated/src/models/AnalyzeRequest.ts", - "typescript_generated/src/models/AnalyzeResponse.ts", - "typescript_generated/src/models/BuildCancelPathRequest.ts", - "typescript_generated/src/models/BuildCancelRepositoryResponse.ts", - "typescript_generated/src/models/BuildDetailRequest.ts", - "typescript_generated/src/models/BuildDetailResponse.ts", - "typescript_generated/src/models/BuildEventSummary.ts", - "typescript_generated/src/models/BuildRequest.ts", - "typescript_generated/src/models/BuildRequestResponse.ts", - "typescript_generated/src/models/BuildSummary.ts", - "typescript_generated/src/models/BuildRequestStatus.ts", - "typescript_generated/src/models/BuildTimelineEvent.ts", - "typescript_generated/src/models/BuildsListApiResponse.ts", - "typescript_generated/src/models/BuildsListResponse.ts", - "typescript_generated/src/models/CancelBuildRepositoryRequest.ts", - "typescript_generated/src/models/InvalidatePartitionRequest.ts", - "typescript_generated/src/models/JobDailyStats.ts", - "typescript_generated/src/models/JobDetailRequest.ts", - "typescript_generated/src/models/JobDetailResponse.ts", - "typescript_generated/src/models/JobMetricsRequest.ts", - "typescript_generated/src/models/JobMetricsResponse.ts", - "typescript_generated/src/models/JobRunDetail.ts", - "typescript_generated/src/models/JobSummary.ts", - "typescript_generated/src/models/JobsListApiResponse.ts", - "typescript_generated/src/models/JobsListResponse.ts", - "typescript_generated/src/models/PaginationInfo.ts", - "typescript_generated/src/models/PartitionDetailRequest.ts", - "typescript_generated/src/models/PartitionDetailResponse.ts", - "typescript_generated/src/models/PartitionEventsRequest.ts", - "typescript_generated/src/models/PartitionEventsResponse.ts", - "typescript_generated/src/models/PartitionInvalidatePathRequest.ts", - "typescript_generated/src/models/PartitionInvalidateResponse.ts", - "typescript_generated/src/models/PartitionRef.ts", - "typescript_generated/src/models/PartitionStatusRequest.ts", - "typescript_generated/src/models/PartitionStatusResponse.ts", - "typescript_generated/src/models/PartitionSummary.ts", - "typescript_generated/src/models/PartitionTimelineEvent.ts", - "typescript_generated/src/models/PartitionsListApiResponse.ts", - "typescript_generated/src/models/PartitionsListResponse.ts", - "typescript_generated/src/models/CancelTaskRequest.ts", - "typescript_generated/src/models/JobRunDetailResponse.ts", - "typescript_generated/src/models/JobRunSummary.ts", - "typescript_generated/src/models/JobRunSummary2.ts", - "typescript_generated/src/models/JobRunTimelineEvent.ts", - "typescript_generated/src/models/JobRunsListApiResponse.ts", - "typescript_generated/src/models/JobRunsListResponse.ts", - "typescript_generated/src/models/TaskCancelPathRequest.ts", - "typescript_generated/src/models/TaskCancelResponse.ts", - "typescript_generated/src/models/TaskDetailRequest.ts", - "typescript_generated/src/runtime.ts", - "typescript_generated/src/index.ts", - ], - cmd = """ - # Download OpenAPI Generator JAR - OPENAPI_JAR=/tmp/openapi-generator-cli.jar - if [ ! -f $$OPENAPI_JAR ]; then - curl -L -o $$OPENAPI_JAR https://repo1.maven.org/maven2/org/openapitools/openapi-generator-cli/7.2.0/openapi-generator-cli-7.2.0.jar - fi - - # Create temporary directory for generation - TEMP_DIR=$$(mktemp -d) - - # Generate TypeScript client to temp directory - java -jar $$OPENAPI_JAR generate \ - -i $(location :extract_openapi_spec) \ - -g typescript-fetch \ - -c $(location :typescript_generator_config) \ - -o $$TEMP_DIR - - # Copy generated files to expected output locations - cp $$TEMP_DIR/src/apis/DefaultApi.ts $(location typescript_generated/src/apis/DefaultApi.ts) - cp $$TEMP_DIR/src/apis/index.ts $(location typescript_generated/src/apis/index.ts) - cp $$TEMP_DIR/src/models/index.ts $(location typescript_generated/src/models/index.ts) - cp $$TEMP_DIR/src/models/ActivityApiResponse.ts $(location typescript_generated/src/models/ActivityApiResponse.ts) - cp $$TEMP_DIR/src/models/ActivityResponse.ts $(location typescript_generated/src/models/ActivityResponse.ts) - cp $$TEMP_DIR/src/models/AnalyzeRequest.ts $(location typescript_generated/src/models/AnalyzeRequest.ts) - cp $$TEMP_DIR/src/models/AnalyzeResponse.ts $(location typescript_generated/src/models/AnalyzeResponse.ts) - cp $$TEMP_DIR/src/models/BuildCancelPathRequest.ts $(location typescript_generated/src/models/BuildCancelPathRequest.ts) - cp $$TEMP_DIR/src/models/BuildCancelRepositoryResponse.ts $(location typescript_generated/src/models/BuildCancelRepositoryResponse.ts) - cp $$TEMP_DIR/src/models/BuildDetailRequest.ts $(location typescript_generated/src/models/BuildDetailRequest.ts) - cp $$TEMP_DIR/src/models/BuildDetailResponse.ts $(location typescript_generated/src/models/BuildDetailResponse.ts) - cp $$TEMP_DIR/src/models/BuildEventSummary.ts $(location typescript_generated/src/models/BuildEventSummary.ts) - cp $$TEMP_DIR/src/models/BuildRequest.ts $(location typescript_generated/src/models/BuildRequest.ts) - cp $$TEMP_DIR/src/models/BuildRequestResponse.ts $(location typescript_generated/src/models/BuildRequestResponse.ts) - cp $$TEMP_DIR/src/models/BuildSummary.ts $(location typescript_generated/src/models/BuildSummary.ts) - cp $$TEMP_DIR/src/models/BuildRequestStatus.ts $(location typescript_generated/src/models/BuildRequestStatus.ts) - cp $$TEMP_DIR/src/models/BuildTimelineEvent.ts $(location typescript_generated/src/models/BuildTimelineEvent.ts) - cp $$TEMP_DIR/src/models/BuildsListApiResponse.ts $(location typescript_generated/src/models/BuildsListApiResponse.ts) - cp $$TEMP_DIR/src/models/BuildsListResponse.ts $(location typescript_generated/src/models/BuildsListResponse.ts) - cp $$TEMP_DIR/src/models/CancelBuildRepositoryRequest.ts $(location typescript_generated/src/models/CancelBuildRepositoryRequest.ts) - cp $$TEMP_DIR/src/models/InvalidatePartitionRequest.ts $(location typescript_generated/src/models/InvalidatePartitionRequest.ts) - cp $$TEMP_DIR/src/models/JobDailyStats.ts $(location typescript_generated/src/models/JobDailyStats.ts) - cp $$TEMP_DIR/src/models/JobDetailRequest.ts $(location typescript_generated/src/models/JobDetailRequest.ts) - cp $$TEMP_DIR/src/models/JobDetailResponse.ts $(location typescript_generated/src/models/JobDetailResponse.ts) - cp $$TEMP_DIR/src/models/JobMetricsRequest.ts $(location typescript_generated/src/models/JobMetricsRequest.ts) - cp $$TEMP_DIR/src/models/JobMetricsResponse.ts $(location typescript_generated/src/models/JobMetricsResponse.ts) - cp $$TEMP_DIR/src/models/JobRunDetail.ts $(location typescript_generated/src/models/JobRunDetail.ts) - cp $$TEMP_DIR/src/models/JobRunSummary.ts $(location typescript_generated/src/models/JobRunSummary.ts) - cp $$TEMP_DIR/src/models/JobSummary.ts $(location typescript_generated/src/models/JobSummary.ts) - cp $$TEMP_DIR/src/models/JobsListApiResponse.ts $(location typescript_generated/src/models/JobsListApiResponse.ts) - cp $$TEMP_DIR/src/models/JobsListResponse.ts $(location typescript_generated/src/models/JobsListResponse.ts) - cp $$TEMP_DIR/src/models/PaginationInfo.ts $(location typescript_generated/src/models/PaginationInfo.ts) - cp $$TEMP_DIR/src/models/PartitionDetailRequest.ts $(location typescript_generated/src/models/PartitionDetailRequest.ts) - cp $$TEMP_DIR/src/models/PartitionDetailResponse.ts $(location typescript_generated/src/models/PartitionDetailResponse.ts) - cp $$TEMP_DIR/src/models/PartitionEventsRequest.ts $(location typescript_generated/src/models/PartitionEventsRequest.ts) - cp $$TEMP_DIR/src/models/PartitionEventsResponse.ts $(location typescript_generated/src/models/PartitionEventsResponse.ts) - cp $$TEMP_DIR/src/models/PartitionInvalidatePathRequest.ts $(location typescript_generated/src/models/PartitionInvalidatePathRequest.ts) - cp $$TEMP_DIR/src/models/PartitionInvalidateResponse.ts $(location typescript_generated/src/models/PartitionInvalidateResponse.ts) - cp $$TEMP_DIR/src/models/PartitionRef.ts $(location typescript_generated/src/models/PartitionRef.ts) - cp $$TEMP_DIR/src/models/PartitionStatusRequest.ts $(location typescript_generated/src/models/PartitionStatusRequest.ts) - cp $$TEMP_DIR/src/models/PartitionStatusResponse.ts $(location typescript_generated/src/models/PartitionStatusResponse.ts) - cp $$TEMP_DIR/src/models/PartitionSummary.ts $(location typescript_generated/src/models/PartitionSummary.ts) - cp $$TEMP_DIR/src/models/PartitionTimelineEvent.ts $(location typescript_generated/src/models/PartitionTimelineEvent.ts) - cp $$TEMP_DIR/src/models/PartitionsListApiResponse.ts $(location typescript_generated/src/models/PartitionsListApiResponse.ts) - cp $$TEMP_DIR/src/models/PartitionsListResponse.ts $(location typescript_generated/src/models/PartitionsListResponse.ts) - cp $$TEMP_DIR/src/models/JobRunSummary.ts $(location typescript_generated/src/models/JobRunSummary.ts) - cp $$TEMP_DIR/src/models/JobRunTimelineEvent.ts $(location typescript_generated/src/models/JobRunTimelineEvent.ts) - cp $$TEMP_DIR/src/models/JobRunsListApiResponse.ts $(location typescript_generated/src/models/JobRunsListApiResponse.ts) - cp $$TEMP_DIR/src/models/JobRunsListResponse.ts $(location typescript_generated/src/models/JobRunsListResponse.ts) - cp $$TEMP_DIR/src/models/CancelTaskRequest.ts $(location typescript_generated/src/models/CancelTaskRequest.ts) - cp $$TEMP_DIR/src/models/JobRunDetailResponse.ts $(location typescript_generated/src/models/JobRunDetailResponse.ts) - cp $$TEMP_DIR/src/models/JobRunSummary2.ts $(location typescript_generated/src/models/JobRunSummary2.ts) - cp $$TEMP_DIR/src/models/TaskCancelPathRequest.ts $(location typescript_generated/src/models/TaskCancelPathRequest.ts) - cp $$TEMP_DIR/src/models/TaskCancelResponse.ts $(location typescript_generated/src/models/TaskCancelResponse.ts) - cp $$TEMP_DIR/src/models/TaskDetailRequest.ts $(location typescript_generated/src/models/TaskDetailRequest.ts) - cp $$TEMP_DIR/src/runtime.ts $(location typescript_generated/src/runtime.ts) - cp $$TEMP_DIR/src/index.ts $(location typescript_generated/src/index.ts) - """, - visibility = ["//visibility:public"], -) - -# TypeScript configuration for the client -ts_config( - name = "ts_config", - src = "tsconfig.json", - visibility = ["//visibility:public"], -) - -# Create a proper TypeScript project from the generated files -ts_project( - name = "typescript_lib", - srcs = [":typescript_client"], - allow_js = True, - declaration = True, - resolve_json_module = True, - transpiler = "tsc", - tsconfig = ":ts_config", - visibility = ["//visibility:public"], -) - -# Main TypeScript client target -filegroup( - name = "typescript", - srcs = [ - ":typescript_client", - ], - visibility = ["//visibility:public"], -) diff --git a/databuild/client/tsconfig.json b/databuild/client/tsconfig.json deleted file mode 100644 index d933fa3..0000000 --- a/databuild/client/tsconfig.json +++ /dev/null @@ -1,21 +0,0 @@ -{ - "compilerOptions": { - "target": "ES2020", - "module": "CommonJS", - "moduleResolution": "node", - "allowJs": true, - "declaration": true, - "strict": false, - "esModuleInterop": true, - "skipLibCheck": true, - "forceConsistentCasingInFileNames": true, - "resolveJsonModule": true, - "isolatedModules": true, - "noEmit": false - }, - "include": ["**/*"], - "exclude": [ - "node_modules", - "**/*.test.ts" - ] -} \ No newline at end of file diff --git a/databuild/client/typescript_generator_config.json b/databuild/client/typescript_generator_config.json deleted file mode 100644 index 3f4d5bf..0000000 --- a/databuild/client/typescript_generator_config.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "enumPropertyNaming": "snake_case", - "withInterfaces": true, - "useSingleRequestParameter": true, - "typescriptThreePlus": true, - "npmName": "databuild-client", - "npmVersion": "1.0.0", - "stringEnums": true, - "generateAliasAsModel": false, - "modelPropertyNaming": "snake_case", - "paramNaming": "snake_case", - "supportsES6": true, - "withoutRuntimeChecks": false -} \ No newline at end of file diff --git a/databuild/dashboard/BUILD.bazel b/databuild/dashboard/BUILD.bazel deleted file mode 100644 index a68e78e..0000000 --- a/databuild/dashboard/BUILD.bazel +++ /dev/null @@ -1,111 +0,0 @@ -load("@aspect_rules_esbuild//esbuild:defs.bzl", "esbuild") -load("@aspect_rules_js//js:defs.bzl", "js_test") -load("@aspect_rules_ts//ts:defs.bzl", "ts_config", "ts_project") -load("@databuild_npm//:defs.bzl", "npm_link_all_packages") - -npm_link_all_packages(name = "node_modules") - -filegroup( - name = "dist", - srcs = [ - # To be added once we have one - # "favicon.svg", - "index.html", - ":app_dist", - ":css", - ], - visibility = ["//visibility:public"], -) - -genrule( - name = "css", - srcs = [ - "index.css", - "index.html", - ":node_modules/daisyui", - ":app_dist", - ], - outs = ["dist.css"], - cmd = """ -# Must manually copy sources, because tailwind silently ignores symlinked files: -# https://github.com/tailwindlabs/tailwindcss/issues/13731 -WORKDIR=$$(dirname $(location index.css)) -find $$WORKDIR -type l -exec bash -c 'echo "> $${0}" && cp -fL "$${0}" "$${0}.tmp" && mv "$${0}.tmp" "$${0}"' {} \\; -# Copy over source from built TS app so that tailwind can see the used classes -for fpath in $(locations :app_dist); do - cp $$fpath $$WORKDIR -done -# Include daisyui plugin -cp -R $(@D)/node_modules/.aspect_rules_js/*/node_modules $$WORKDIR/node_modules -# Run tailwind build -$(location //tools/build_rules:tailwind) -i $(location index.css) -o $@ - """, - tools = ["//tools/build_rules:tailwind"], -) - -ts_config( - name = "ts_config_app", - src = ":tsconfig_app.json", - visibility = ["//visibility:public"], -) - -# Making modules of ts projects seems to be a rats nest. -# Hopefully we can figure this out in the future. -ts_project( - name = "app", - srcs = [ - "index.ts", - "layout.ts", - "pages.ts", - "services.ts", - "types.ts", - "utils.ts", - # Test files - "index.test.ts", - "utils.test.ts", - "transformation-tests.ts", - ], - allow_js = True, - resolve_json_module = True, - transpiler = "tsc", - tsconfig = ":ts_config_app", - deps = [ - ":node_modules/@types/mithril", - ":node_modules/@types/node", - ":node_modules/@types/ospec", - ":node_modules/mithril", - ":node_modules/ospec", - ":node_modules/whatwg-fetch", - "//databuild/client:typescript_lib", - ], -) - -esbuild( - name = "app_dist", - srcs = [":app"], - bazel_sandbox_plugin = True, - entry_point = "index.js", - # esbuild_log_level = "verbose", - # js_log_level = "debug", - metafile = True, - visibility = ["//visibility:public"], -) - -js_test( - name = "app_test", - chdir = package_name(), - data = [":app"], - entry_point = "index.test.js", -) - -# Test to verify strict TypeScript configuration catches expected failures -sh_test( - name = "strict_config_test", - srcs = ["test-strict-config.sh"], - data = [ - "test-data/strict-config-failures.ts", - "tsconfig_app.json", - ":node_modules/@types/node", - ":node_modules/typescript", - ], -) diff --git a/databuild/dashboard/README.md b/databuild/dashboard/README.md deleted file mode 100644 index 1f3a5e2..0000000 --- a/databuild/dashboard/README.md +++ /dev/null @@ -1,4 +0,0 @@ - -# Dashboard - -A dashboard for viewing past build status, current running builds, etc. Extremely prototyped right now. diff --git a/databuild/dashboard/TYPE_SAFETY.md b/databuild/dashboard/TYPE_SAFETY.md deleted file mode 100644 index be503d4..0000000 --- a/databuild/dashboard/TYPE_SAFETY.md +++ /dev/null @@ -1,127 +0,0 @@ -# Dashboard Type Safety Architecture - -## Overview - -This document describes the type safety architecture implemented in the DataBuild dashboard to prevent runtime errors from backend API changes. - -## Problem Statement - -The dashboard previously experienced runtime crashes when backend API changes were deployed: -- `status.toLowerCase()` failed when status changed from string to object -- `partition.str` access failed when partition structure changed -- TypeScript compilation passed but runtime errors occurred - -## Solution Architecture - -### 1. Dashboard Data Contracts - -We define stable TypeScript interfaces in `types.ts` that represent the data shapes the UI components expect: - -```typescript -export interface DashboardBuild { - build_request_id: string; - status: string; // Always a human-readable string - requested_partitions: string[]; // Always flat string array - // ... other fields -} -``` - -### 2. Transformation Layer - -The `services.ts` file contains transformation functions that convert OpenAPI-generated types to dashboard types: - -```typescript -function transformBuildSummary(apiResponse: BuildSummary): DashboardBuild { - return { - build_request_id: apiResponse.build_request_id, - status: apiResponse.status_name, // Extract string from API - requested_partitions: apiResponse.requested_partitions.map(p => p.str), // Flatten objects - // ... transform other fields - }; -} -``` - -### 3. Component Isolation - -All UI components use only dashboard types, never raw API types: - -```typescript -// GOOD: Using dashboard types -const build: DashboardBuild = await DashboardService.getBuildDetail(id); -m('div', build.status.toLowerCase()); // Safe - status is always string - -// BAD: Using API types directly -const build: BuildSummary = await apiClient.getBuild(id); -m('div', build.status.toLowerCase()); // Unsafe - status might be object -``` - -## Benefits - -1. **Compile-time Safety**: TypeScript catches type mismatches during development -2. **Runtime Protection**: Transformation functions handle API changes gracefully -3. **Clear Boundaries**: UI code is isolated from API implementation details -4. **Easier Updates**: API changes require updates only in transformation functions - -## Testing Strategy - -### Unit Tests -- `transformation-tests.ts`: Verify transformation functions produce correct dashboard types - -### Strict TypeScript Configuration -- `exactOptionalPropertyTypes`: Ensures optional properties are handled explicitly -- `strictNullChecks`: Prevents null/undefined errors -- `noImplicitAny`: Requires explicit typing - -## Maintenance Guidelines - -### When Backend API Changes - -1. Update the OpenAPI spec and regenerate client -2. TypeScript compilation will fail in transformation functions if types changed -3. Update only the transformation functions to handle new API shape -4. Run tests to verify UI components still work correctly - -### Adding New Features - -1. Define dashboard types in `types.ts` -2. Create transformation functions in `services.ts` -3. Use only dashboard types in components -4. Add tests for the transformation logic - -## Example: Handling API Evolution - -If the backend changes `status` from string to object: - -```typescript -// Old API -{ status_name: "COMPLETED" } - -// New API -{ status: { code: 4, name: "COMPLETED" } } - -// Transformation handles both -function transformBuildSummary(apiResponse: any): DashboardBuild { - return { - status: apiResponse.status_name || apiResponse.status?.name || 'UNKNOWN', - // ... other fields - }; -} -``` - -The UI components continue working without changes because they always receive the expected `string` type. - -## Monitoring - -To maintain type safety over time: - -1. **Build-time Checks**: TypeScript compilation catches type errors -2. **Test Suite**: Transformation tests run on every build -3. **Code Reviews**: Ensure new code follows the pattern -4. **Documentation**: Keep this document updated with patterns - -## Related Files - -- `types.ts` - Dashboard type definitions -- `services.ts` - API transformation functions -- `transformation-tests.ts` - Unit tests for transformations -- `tsconfig_app.json` - Strict TypeScript configuration \ No newline at end of file diff --git a/databuild/dashboard/index.css b/databuild/dashboard/index.css deleted file mode 100644 index c0909ef..0000000 --- a/databuild/dashboard/index.css +++ /dev/null @@ -1,78 +0,0 @@ -@import "tailwindcss" source("./**/*.{js,html}"); -@plugin "daisyui" { -} - - - -@plugin "daisyui/theme" { - name: "databuild-light"; - default: true; - prefersdark: false; - color-scheme: "light"; - --color-base-100: oklch(100% 0 0); - --color-base-200: oklch(98% 0.002 247.839); - --color-base-300: oklch(96% 0.003 264.542); - --color-base-content: oklch(21% 0.034 264.665); - --color-primary: oklch(37% 0.01 67.558); - --color-primary-content: oklch(100% 0 0); - --color-secondary: oklch(77% 0.152 181.912); - --color-secondary-content: oklch(100% 0 0); - --color-accent: oklch(75% 0.183 55.934); - --color-accent-content: oklch(100% 0 0); - --color-neutral: oklch(37% 0.01 67.558); - --color-neutral-content: oklch(98% 0.002 247.839); - --color-info: oklch(80% 0.105 251.813); - --color-info-content: oklch(28% 0.091 267.935); - --color-success: oklch(84% 0.238 128.85); - --color-success-content: oklch(27% 0.072 132.109); - --color-warning: oklch(85% 0.199 91.936); - --color-warning-content: oklch(27% 0.077 45.635); - --color-error: oklch(70% 0.191 22.216); - --color-error-content: oklch(25% 0.092 26.042); - --radius-selector: 0.5rem; - --radius-field: 0.5rem; - --radius-box: 0.5rem; - --size-selector: 0.25rem; - --size-field: 0.25rem; - --border: 1px; - --depth: 0; - --noise: 0; -} - - -@plugin "daisyui/theme" { - name: "databuild-dark"; - default: false; - prefersdark: false; - color-scheme: "dark"; - --color-base-100: oklch(15% 0.002 247.839); - --color-base-200: oklch(18% 0.003 264.542); - --color-base-300: oklch(22% 0.006 264.531); - --color-base-content: oklch(92% 0.034 264.665); - --color-primary: oklch(75% 0.005 56.366); - --color-primary-content: oklch(15% 0.006 56.043); - --color-secondary: oklch(65% 0.152 181.912); - --color-secondary-content: oklch(15% 0 0); - --color-accent: oklch(70% 0.183 55.934); - --color-accent-content: oklch(15% 0 0); - --color-neutral: oklch(25% 0.01 67.558); - --color-neutral-content: oklch(92% 0.002 247.839); - --color-info: oklch(65% 0.165 254.624); - --color-info-content: oklch(15% 0.091 267.935); - --color-success: oklch(75% 0.238 128.85); - --color-success-content: oklch(15% 0.072 132.109); - --color-warning: oklch(80% 0.199 91.936); - --color-warning-content: oklch(15% 0.077 45.635); - --color-error: oklch(65% 0.191 22.216); - --color-error-content: oklch(15% 0.092 26.042); - --radius-selector: 0.5rem; - --radius-field: 0.5rem; - --radius-box: 0.5rem; - --size-selector: 0.25rem; - --size-field: 0.25rem; - --border: 1px; - --depth: 0; - --noise: 0; -} - - diff --git a/databuild/dashboard/index.html b/databuild/dashboard/index.html deleted file mode 100644 index 543eb5a..0000000 --- a/databuild/dashboard/index.html +++ /dev/null @@ -1,21 +0,0 @@ - - - - - - DataBuild Dashboard - - - - - -
- Loading... -
- - diff --git a/databuild/dashboard/index.test.ts b/databuild/dashboard/index.test.ts deleted file mode 100644 index 0e9b514..0000000 --- a/databuild/dashboard/index.test.ts +++ /dev/null @@ -1,15 +0,0 @@ -const { appName } = require('./index'); -const o = require('ospec'); - -// Import transformation tests -require('./transformation-tests'); - -o.spec("appName", () => { - o("should be databuild", () => { - o(appName).equals("databuild") `Should be databuild`; - }); -}); - -// TODO - I think we can create an ospec target that invokes these with the ospec CLI? -// https://github.com/MithrilJS/ospec?tab=readme-ov-file#command-line-interface -o.run(); diff --git a/databuild/dashboard/index.ts b/databuild/dashboard/index.ts deleted file mode 100644 index 3e406df..0000000 --- a/databuild/dashboard/index.ts +++ /dev/null @@ -1,76 +0,0 @@ -import m from 'mithril'; -import { Layout } from './layout'; -import { - RecentActivity, - BuildStatus, - PartitionsList, - PartitionStatus, - JobsList, - JobMetrics, - GraphAnalysis -} from './pages'; -import { decodePartitionRef } from './utils'; -import { - TypedComponent, - LayoutWrapperAttrs, - RecentActivityAttrs, - BuildStatusAttrs, - PartitionStatusAttrs, - PartitionsListAttrs, - JobsListAttrs, - JobMetricsAttrs, - GraphAnalysisAttrs -} from './types'; - -export const appName = "databuild"; - -// Wrapper components that include layout - now with type safety -function createLayoutWrapper(component: TypedComponent): m.Component { - const wrapper: any = { - view: (vnode: m.Vnode) => m(Layout, [component.view.call(component, vnode)]) - }; - - // Only add lifecycle methods if they exist to avoid exactOptionalPropertyTypes issues - if (component.oninit) { - wrapper.oninit = (vnode: m.Vnode) => component.oninit!.call(component, vnode); - } - if (component.oncreate) { - wrapper.oncreate = (vnode: m.VnodeDOM) => component.oncreate!.call(component, vnode); - } - if (component.onupdate) { - wrapper.onupdate = (vnode: m.VnodeDOM) => component.onupdate!.call(component, vnode); - } - if (component.onbeforeremove) { - wrapper.onbeforeremove = (vnode: m.VnodeDOM) => component.onbeforeremove!.call(component, vnode); - } - if (component.onremove) { - wrapper.onremove = (vnode: m.VnodeDOM) => component.onremove!.call(component, vnode); - } - if (component.onbeforeupdate) { - wrapper.onbeforeupdate = (vnode: m.Vnode, old: m.VnodeDOM) => component.onbeforeupdate!.call(component, vnode, old); - } - - return wrapper; -} - -// Route definitions with type safety -const routes = { - '/': createLayoutWrapper(RecentActivity), - '/builds/:id': createLayoutWrapper(BuildStatus), - '/partitions': createLayoutWrapper(PartitionsList), - '/partitions/:base64_ref': createLayoutWrapper(PartitionStatus), - '/jobs': createLayoutWrapper(JobsList), - '/jobs/:label': createLayoutWrapper(JobMetrics), - '/analyze': createLayoutWrapper(GraphAnalysis), -}; - -if (typeof window !== "undefined") { - document.addEventListener("DOMContentLoaded", () => { - // Initialize theme from localStorage - const savedTheme = localStorage.getItem('theme') || 'databuild-light'; - document.documentElement.setAttribute('data-theme', savedTheme); - - // Set up routing - m.route(document.getElementById('app') as HTMLElement, '/', routes); - }); -} diff --git a/databuild/dashboard/layout.ts b/databuild/dashboard/layout.ts deleted file mode 100644 index d683f15..0000000 --- a/databuild/dashboard/layout.ts +++ /dev/null @@ -1,52 +0,0 @@ -import m from 'mithril'; - -export const Layout = { - view: (vnode: any) => [ - m('header.navbar.bg-base-100.shadow-lg', [ - m('div.navbar-start', [ - m('div.dropdown', [ - m('div.btn.btn-ghost.lg:hidden[tabindex="0"][role="button"]', [ - m('svg.w-5.h-5[xmlns="http://www.w3.org/2000/svg"][fill="none"][viewBox="0 0 24 24"]', [ - m('path[stroke-linecap="round"][stroke-linejoin="round"][stroke-width="2"][stroke="currentColor"][d="M4 6h16M4 12h8m-8 6h16"]'), - ]), - ]), - m('ul.menu.menu-sm.dropdown-content.bg-base-100.rounded-box.z-1.mt-3.w-52.p-2.shadow[tabindex="0"]', [ - m('li', m(m.route.Link, { href: '/partitions' }, 'Partitions')), - m('li', m(m.route.Link, { href: '/jobs' }, 'Jobs')), - m('li', m(m.route.Link, { href: '/analyze' }, 'Analyze')), - ]), - ]), - m(m.route.Link, { href: '/', class: 'btn btn-ghost text-xl' }, 'DataBuild Dashboard'), - ]), - m('div.navbar-center.hidden.lg:flex', [ - m('ul.menu.menu-horizontal.px-1', [ - m('li', m(m.route.Link, { href: '/' }, 'Dashboard')), - m('li', m(m.route.Link, { href: '/partitions' }, 'Partitions')), - m('li', m(m.route.Link, { href: '/jobs' }, 'Jobs')), - m('li', m(m.route.Link, { href: '/analyze' }, 'Analyze')), - ]), - ]), - m('div.navbar-end', [ - m('label.swap.swap-rotate', [ - m('input.theme-controller[type="checkbox"]', { - value: 'databuild-dark', - onchange: (e: Event) => { - const target = e.target as HTMLInputElement; - const theme = target.checked ? 'databuild-dark' : 'databuild-light'; - document.documentElement.setAttribute('data-theme', theme); - localStorage.setItem('theme', theme); - }, - checked: localStorage.getItem('theme') === 'databuild-dark' - }), - m('svg.swap-off.fill-current.w-6.h-6[xmlns="http://www.w3.org/2000/svg"][viewBox="0 0 24 24"]', [ - m('path[d="M5.64,17l-.71.71a1,1,0,0,0,0,1.41,1,1,0,0,0,1.41,0l.71-.71A1,1,0,0,0,5.64,17ZM5,12a1,1,0,0,0-1-1H3a1,1,0,0,0,0,2H4A1,1,0,0,0,5,12Zm7-7a1,1,0,0,0,1-1V3a1,1,0,0,0-2,0V4A1,1,0,0,0,12,5ZM5.64,7.05a1,1,0,0,0,.7.29,1,1,0,0,0,.71-.29,1,1,0,0,0,0-1.41l-.71-.71A1,1,0,0,0,4.93,6.34Zm12,.29a1,1,0,0,0,.7-.29l.71-.71a1,1,0,1,0-1.41-1.41L17,5.64a1,1,0,0,0,0,1.41A1,1,0,0,0,17.66,7.34ZM21,11H20a1,1,0,0,0,0,2h1a1,1,0,0,0,0-2Zm-9,8a1,1,0,0,0-1,1v1a1,1,0,0,0,2,0V20A1,1,0,0,0,12,19ZM18.36,17A1,1,0,0,0,17,18.36l.71.71a1,1,0,0,0,1.41,0,1,1,0,0,0,0-1.41ZM12,6.5A5.5,5.5,0,1,0,17.5,12,5.51,5.51,0,0,0,12,6.5Zm0,9A3.5,3.5,0,1,1,15.5,12,3.5,3.5,0,0,1,12,15.5Z"]'), - ]), - m('svg.swap-on.fill-current.w-6.h-6[xmlns="http://www.w3.org/2000/svg"][viewBox="0 0 24 24"]', [ - m('path[d="M21.64,13a1,1,0,0,0-1.05-.14,8.05,8.05,0,0,1-3.37.73A8.15,8.15,0,0,1,9.08,5.49a8.59,8.59,0,0,1,.25-2A1,1,0,0,0,8,2.36,10.14,10.14,0,1,0,22,14.05,1,1,0,0,0,21.64,13Zm-9.5,6.69A8.14,8.14,0,0,1,7.08,5.22v.27A10.15,10.15,0,0,0,17.22,15.63a9.79,9.79,0,0,0,2.1-.22A8.11,8.11,0,0,1,12.14,19.73Z"]'), - ]), - ]), - ]), - ]), - m('main.min-h-screen.bg-base-200.pt-4', vnode.children), - ] -}; \ No newline at end of file diff --git a/databuild/dashboard/package.json b/databuild/dashboard/package.json deleted file mode 100644 index 1c88cda..0000000 --- a/databuild/dashboard/package.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "private": true, - "devDependencies": { - "typescript": "5.7.3", - "@types/node": "^22.12.0", - "mithril": "^2.2.7", - "@types/mithril": "^2.2.7", - "ospec": "^4.2.0", - "@types/ospec": "^4.2.0", - "whatwg-fetch": "^3.6.20", - "daisyui": "^5.0.0-beta.6" - }, - "pnpm": { - "onlyBuiltDependencies": [] - } -} diff --git a/databuild/dashboard/pages.ts b/databuild/dashboard/pages.ts deleted file mode 100644 index b1c9b68..0000000 --- a/databuild/dashboard/pages.ts +++ /dev/null @@ -1,1439 +0,0 @@ -import m from 'mithril'; -import { DashboardService, pollingManager, formatTime, formatDateTime, formatDuration, formatDate } from './services'; -import { encodePartitionRef, decodePartitionRef, encodeJobLabel, decodeJobLabel, BuildStatusBadge, PartitionStatusBadge, EventTypeBadge } from './utils'; -import { - TypedComponent, - RecentActivityAttrs, - BuildStatusAttrs, - PartitionStatusAttrs, - PartitionsListAttrs, - JobsListAttrs, - JobMetricsAttrs, - GraphAnalysisAttrs, - DashboardActivity, - DashboardBuild, - DashboardPartition, - DashboardJob, - getTypedRouteParams -} from './types'; -import { - PartitionRef -} from '../client/typescript_generated/src/index'; - -// Page scaffold components -export const RecentActivity: TypedComponent = { - data: null as DashboardActivity | null, - loading: true, - error: null as string | null, - pollInterval: null as NodeJS.Timeout | null, - - loadData() { - this.loading = true; - this.error = null; - m.redraw(); // Redraw to show loading state - - const service = DashboardService.getInstance(); - - return service.getRecentActivity() - .then(data => { - this.data = data; - this.loading = false; - m.redraw(); // Explicitly redraw after data loads - }) - .catch(error => { - console.error('RecentActivity: Error in loadData:', error); - this.error = error instanceof Error ? error.message : 'Failed to load data'; - this.loading = false; - m.redraw(); // Redraw after error - }); - }, - - oninit(vnode: m.Vnode) { - // Load initial data - Mithril will automatically redraw after promise resolves - this.loadData(); - - // Set up polling for real-time updates (5 second interval) - if (pollingManager.isVisible()) { - pollingManager.startPolling('recent-activity', () => { - this.loadData(); - }, 5000); - } - }, - - onremove(vnode: m.VnodeDOM) { - // Clean up polling when component is removed - pollingManager.stopPolling('recent-activity'); - }, - - view: function(vnode: m.Vnode) { - - if (this.loading && !this.data) { - return m('div.container.mx-auto.p-4', [ - m('div.flex.flex-col.justify-center.items-center.min-h-96', [ - m('span.loading.loading-spinner.loading-lg'), - m('span.ml-4.text-lg.mb-4', 'Loading dashboard...'), - m('button.btn.btn-sm.btn-outline', { - onclick: () => this.loadData() - }, 'Retry Load') - ]) - ]); - } - - if (this.error) { - return m('div.container.mx-auto.p-4', [ - m('div.alert.alert-error', [ - m('svg.stroke-current.shrink-0.h-6.w-6', { - fill: 'none', - viewBox: '0 0 24 24' - }, [ - m('path', { - 'stroke-linecap': 'round', - 'stroke-linejoin': 'round', - 'stroke-width': '2', - d: 'M10 14l2-2m0 0l2-2m-2 2l-2-2m2 2l2 2m7-2a9 9 0 11-18 0 9 9 0 0118 0z' - }) - ]), - m('span', this.error), - m('div', [ - m('button.btn.btn-sm.btn-outline', { - onclick: () => this.loadData() - }, 'Retry') - ]) - ]) - ]); - } - - const data = this.data; - if (!data) return m('div'); - - return m('div.container.mx-auto.p-4', [ - // Dashboard Header - m('div.dashboard-header.mb-6', [ - m('div.flex.justify-between.items-center.mb-4', [ - m('h1.text-3xl.font-bold', 'DataBuild Dashboard'), - m('div.badge.badge-primary.badge-lg', data.graph_name) - ]), - - // Statistics - Updated to use DashboardActivity field names - m('div.stats.shadow.w-full.bg-base-100', [ - m('div.stat', [ - m('div.stat-figure.text-primary', [ - m('svg.w-8.h-8', { - fill: 'none', - stroke: 'currentColor', - viewBox: '0 0 24 24' - }, [ - m('path', { - 'stroke-linecap': 'round', - 'stroke-linejoin': 'round', - 'stroke-width': '2', - d: 'M13 10V3L4 14h7v7l9-11h-7z' - }) - ]) - ]), - m('div.stat-title', 'Active Builds'), - m('div.stat-value.text-primary', data.active_builds_count), - m('div.stat-desc', 'Currently running') - ]), - m('div.stat', [ - m('div.stat-figure.text-secondary', [ - m('svg.w-8.h-8', { - fill: 'none', - stroke: 'currentColor', - viewBox: '0 0 24 24' - }, [ - m('path', { - 'stroke-linecap': 'round', - 'stroke-linejoin': 'round', - 'stroke-width': '2', - d: 'M9 12l2 2 4-4M7.835 4.697a3.42 3.42 0 001.946-.806 3.42 3.42 0 014.438 0 3.42 3.42 0 001.946.806 3.42 3.42 0 013.138 3.138 3.42 3.42 0 00.806 1.946 3.42 3.42 0 010 4.438 3.42 3.42 0 00-.806 1.946 3.42 3.42 0 01-3.138 3.138 3.42 3.42 0 00-1.946.806 3.42 3.42 0 01-4.438 0 3.42 3.42 0 00-1.946-.806 3.42 3.42 0 01-3.138-3.138 3.42 3.42 0 00-.806-1.946 3.42 3.42 0 010-4.438 3.42 3.42 0 00.806-1.946 3.42 3.42 0 013.138-3.138z' - }) - ]) - ]), - m('div.stat-title', 'Recent Builds'), - m('div.stat-value.text-secondary', data.recent_builds.length), - m('div.stat-desc', 'In the last hour') - ]), - m('div.stat', [ - m('div.stat-figure.text-accent', [ - m('svg.w-8.h-8', { - fill: 'none', - stroke: 'currentColor', - viewBox: '0 0 24 24' - }, [ - m('path', { - 'stroke-linecap': 'round', - 'stroke-linejoin': 'round', - 'stroke-width': '2', - d: 'M20 7l-8-4-8 4m16 0l-8 4m8-4v10l-8 4m0-10L4 7m8 4v10M9 5l8 4' - }) - ]) - ]), - m('div.stat-title', 'Total Partitions'), - m('div.stat-value.text-accent', data.total_partitions_count), - m('div.stat-desc', 'Managed partitions') - ]) - ]) - ]), - - // Dashboard Content Grid - m('div.dashboard-content.grid.grid-cols-1.lg:grid-cols-2.gap-6', [ - // Recent Build Requests - m('div.recent-builds.card.bg-base-100.shadow-xl', [ - m('div.card-body', [ - m('h2.card-title.text-xl.mb-4', [ - m('svg.w-6.h-6.mr-2', { - fill: 'none', - stroke: 'currentColor', - viewBox: '0 0 24 24' - }, [ - m('path', { - 'stroke-linecap': 'round', - 'stroke-linejoin': 'round', - 'stroke-width': '2', - d: 'M13 10V3L4 14h7v7l9-11h-7z' - }) - ]), - 'Recent Build Requests' - ]), - data.recent_builds.length === 0 - ? m('div.text-center.py-8.text-base-content.opacity-60', 'No recent builds') - : m('div.overflow-x-auto', [ - m('table.table.table-sm', [ - m('thead', [ - m('tr', [ - m('th', 'Build ID'), - m('th', 'Status'), - m('th', 'Created'), - ]) - ]), - m('tbody', - data.recent_builds.map((build: DashboardBuild) => - m('tr.hover', [ - m('td', [ - m('a.link.link-primary.font-mono.text-sm', { - href: `/builds/${build.build_request_id}`, - onclick: (e: Event) => { - e.preventDefault(); - m.route.set(`/builds/${build.build_request_id}`); - } - }, build.build_request_id) - ]), - m('td', [ - // KEY FIX: build.status.name is now always a string, prevents runtime errors - m(BuildStatusBadge, { status: build.status.name }) - ]), - m('td.text-sm.opacity-70', formatTime(build.requested_at)), - ]) - ) - ) - ]) - ]) - ]) - ]), - - // Recent Partition Builds - m('div.recent-partitions.card.bg-base-100.shadow-xl', [ - m('div.card-body', [ - m('h2.card-title.text-xl.mb-4', [ - m('svg.w-6.h-6.mr-2', { - fill: 'none', - stroke: 'currentColor', - viewBox: '0 0 24 24' - }, [ - m('path', { - 'stroke-linecap': 'round', - 'stroke-linejoin': 'round', - 'stroke-width': '2', - d: 'M20 7l-8-4-8 4m16 0l-8 4m8-4v10l-8 4m0-10L4 7m8 4v10M9 5l8 4' - }) - ]), - 'Recent Partition Builds' - ]), - data.recent_partitions.length === 0 - ? m('div.text-center.py-8.text-base-content.opacity-60', 'No recent partitions') - : m('div.overflow-x-auto', [ - m('table.table.table-sm', [ - m('thead', [ - m('tr', [ - m('th', 'Partition Reference'), - m('th', 'Status'), - m('th', 'Updated'), - ]) - ]), - m('tbody', - data.recent_partitions.map((partition: DashboardPartition) => - m('tr.hover', [ - m('td', [ - m('a.link.link-primary.font-mono.text-sm.break-all', { - // KEY FIX: partition.partition_ref.str is now always a string, not an object - href: `/partitions/${encodePartitionRef(partition.partition_ref.str)}`, - onclick: (e: Event) => { - e.preventDefault(); - m.route.set(`/partitions/${encodePartitionRef(partition.partition_ref.str)}`); - }, - title: partition.partition_ref.str - }, partition.partition_ref.str) - ]), - m('td', [ - // KEY FIX: partition.status.name is now always a string, prevents runtime errors - m(PartitionStatusBadge, { status: partition.status_name }) - ]), - m('td.text-sm.opacity-70', - // KEY FIX: Proper null handling for last_updated - partition.last_updated ? formatTime(partition.last_updated) : '—'), - ]) - ) - ) - ]) - ]) - ]) - ]) - ]) - ]); - } -}; - -/* -// OLD BUILDSTATUS COMPONENT - COMMENTED OUT FOR CLEAN REBUILD -// This component had mixed old/new patterns and complex direct API calls -// Rebuilding with proper dashboard types architecture - -export const BuildStatus_OLD: TypedComponent = { - // ... (full old implementation preserved for reference) -}; -*/ - -// CLEAN REBUILD: BuildStatus using proper dashboard architecture -export const BuildStatus: TypedComponent = { - data: null as DashboardBuild | null, - loading: true, - error: null as string | null, - partitionStatuses: new Map(), - buildId: '', - mermaidDiagram: null as string | null, - mermaidLoading: false, - mermaidError: null as string | null, - - oninit(vnode: m.Vnode) { - this.buildId = vnode.attrs.id; - this.loadBuild(); - this.startPolling(); - }, - - onremove(vnode: m.VnodeDOM) { - pollingManager.stopPolling(`build-status-${this.buildId}`); - }, - - async loadBuild() { - try { - this.loading = true; - this.error = null; - m.redraw(); - - const service = DashboardService.getInstance(); - - // Get build details using our transformation layer - const buildData = await service.getBuildDetail(this.buildId); - if (!buildData) { - throw new Error(`Build ${this.buildId} not found`); - } - - this.data = buildData; - - // Load partition statuses using our transformation layer - this.partitionStatuses.clear(); - for (const partitionRef of buildData.requested_partitions) { - try { - const partitionData = await service.getPartitionDetail(partitionRef.str); - if (partitionData) { - this.partitionStatuses.set(partitionRef.str, partitionData); - } - } catch (e) { - console.warn(`Failed to load status for partition ${partitionRef.str}:`, e); - } - } - - // Load mermaid diagram if we don't have it yet - if (!this.mermaidDiagram && !this.mermaidLoading) { - this.loadMermaidDiagram(); - } - - this.loading = false; - m.redraw(); - } catch (error) { - console.error('Failed to load build:', error); - this.error = error instanceof Error ? error.message : 'Failed to load build'; - this.loading = false; - m.redraw(); - } - }, - - async loadMermaidDiagram() { - try { - this.mermaidLoading = true; - this.mermaidError = null; - m.redraw(); - - const service = DashboardService.getInstance(); - const diagram = await service.getMermaidDiagram(this.buildId); - - if (diagram) { - this.mermaidDiagram = diagram; - // Trigger mermaid to render the diagram after the DOM updates - setTimeout(() => { - if (typeof window !== 'undefined' && (window as any).mermaid) { - (window as any).mermaid.init(); - } - }, 200); - } else { - this.mermaidError = 'No job graph available for this build'; - } - - this.mermaidLoading = false; - m.redraw(); - } catch (error) { - console.error('Failed to load mermaid diagram:', error); - this.mermaidError = error instanceof Error ? error.message : 'Failed to load diagram'; - this.mermaidLoading = false; - m.redraw(); - } - }, - - startPolling() { - // Use different poll intervals based on build status - const isActive = this.data?.status.name === 'EXECUTING' || - this.data?.status.name === 'PLANNING'; - const interval = isActive ? 2000 : 10000; // 2s for active, 10s for completed - - pollingManager.startPolling(`build-status-${this.buildId}`, () => { - this.loadBuild(); - }, interval); - }, - - view(vnode: m.Vnode) { - // Loading/error states - if (this.loading && !this.data) { - return m('div.container.mx-auto.p-4', [ - m('div.flex.flex-col.justify-center.items-center.min-h-96', [ - m('span.loading.loading-spinner.loading-lg'), - m('span.ml-4.text-lg.mb-4', 'Loading build status...'), - m('button.btn.btn-sm.btn-outline', { - onclick: () => this.loadBuild() - }, 'Retry Load') - ]) - ]); - } - - if (this.error) { - return m('div.container.mx-auto.p-4', [ - m('div.alert.alert-error', [ - m('svg.stroke-current.shrink-0.h-6.w-6', { - fill: 'none', - viewBox: '0 0 24 24' - }, [ - m('path', { - 'stroke-linecap': 'round', - 'stroke-linejoin': 'round', - 'stroke-width': '2', - d: 'M10 14l2-2m0 0l2-2m-2 2l-2-2m2 2l2 2m7-2a9 9 0 11-18 0 9 9 0 0118 0z' - }) - ]), - m('span', this.error), - m('div', [ - m('button.btn.btn-sm.btn-outline', { - onclick: () => this.loadBuild() - }, 'Retry') - ]) - ]) - ]); - } - - if (!this.data) return m('div'); - - const build = this.data; - - const timelineData = [ - {stage: 'Build Requested', time: build.requested_at, icon: '🕚'}, - ...(build.started_at ? [{stage: 'Build Started', time: build.started_at, icon: '▶️'}] : []), - // ...(this.data.events as BuildEvent[]).filter(ev => ev.job_event !== null).map((ev) => ({ - // stage: ev.job_event.status.name, time: ev.timestamp, icon: '🙃' - // })), - ...(build.completed_at ? [{stage: 'Build Completed', time: build.completed_at, icon: '✅'}] : []), - ]; - - let startedAt = build.started_at || build.requested_at; - - return m('div.container.mx-auto.p-4', [ - // Build Header - m('.build-header.mb-6', [ - m('h1.text-3xl.font-bold.mb-4', `Build ${this.buildId}`), - m('.build-meta.grid.grid-cols-1.md:grid-cols-4.gap-4.mb-6', [ - m('.stat.bg-base-100.shadow.rounded-lg.p-4', [ - m('.stat-title', 'Status'), - m('.stat-value.text-2xl', [ - m(BuildStatusBadge, { status: build.status.name, size: 'lg' }) - ]) - ]), - m('.stat.bg-base-100.shadow.rounded-lg.p-4', [ - m('.stat-title', 'Partitions'), - m('.stat-value.text-2xl', build.requested_partitions.length), - m('.stat-desc', 'requested') - ]), - m('.stat.bg-base-100.shadow.rounded-lg.p-4', [ - m('.stat-title', 'Jobs'), - m('.stat-value.text-2xl', `${build.completed_jobs}`), - m('.stat-desc', 'completed') - ]), - m('.stat.bg-base-100.shadow.rounded-lg.p-4', [ - m('.stat-title', 'Duration'), - m('.stat-value.text-2xl', (build.completed_at - startedAt) ? formatDuration((build.completed_at - startedAt)) : '—'), - m('.stat-desc', startedAt ? formatDateTime(startedAt) : 'Not started') - ]) - ]) - ]), - - // Build Content - m('.build-content.space-y-6', [ - // Partition Status Grid - m('.partition-status.card.bg-base-100.shadow-xl', [ - m('.card-body', [ - m('h2.card-title.text-xl.mb-4', 'Partition Status'), - build.requested_partitions.length === 0 ? - m('.text-center.py-8.text-base-content.opacity-60', 'No partitions requested') : - m('.partition-grid.grid.grid-cols-1.md:grid-cols-2.lg:grid-cols-3.gap-4', - build.requested_partitions.map((partitionRef: PartitionRef) => { - const partitionStatus = this.partitionStatuses.get(partitionRef.str); - return m('.partition-card.border.border-base-300.rounded-lg.p-4', [ - m('.partition-header.mb-3', [ - m('a.partition-ref.font-mono.text-sm.break-all.link.link-primary', { - href: `/partitions/${encodePartitionRef(partitionRef.str)}`, - onclick: (e: Event) => { - e.preventDefault(); - m.route.set(`/partitions/${encodePartitionRef(partitionRef.str)}`); - }, - title: `View details for partition: ${partitionRef.str}` - }, partitionRef.str) - ]), - m('.partition-status.flex.justify-between.items-center', [ - // CLEAN: Always string status, no nested object access - m(PartitionStatusBadge, { - status: partitionStatus?.status.name || 'Loading...', - size: 'sm' - }), - partitionStatus?.last_updated ? - m('.updated-time.text-xs.opacity-60', - formatTime(partitionStatus.last_updated)) : - m('.text-xs.opacity-60', '—') - ]) - ]); - }) - ) - ]) - ]), - - // Build Summary - m('.build-summary.card.bg-base-100.shadow-xl', [ - m('.card-body', [ - m('h2.card-title.text-xl.mb-4', 'Build Summary'), - m('.grid.grid-cols-2.md:grid-cols-4.gap-4', [ - m('.metric.text-center', [ - m('.metric-value.text-2xl.font-bold.text-success', build.completed_jobs), - m('.metric-label.text-sm.opacity-60', 'Completed') - ]), - m('.metric.text-center', [ - m('.metric-value.text-2xl.font-bold.text-error', build.failed_jobs), - m('.metric-label.text-sm.opacity-60', 'Failed') - ]), - m('.metric.text-center', [ - m('.metric-value.text-2xl.font-bold.text-warning', build.cancelled_jobs), - m('.metric-label.text-sm.opacity-60', 'Cancelled') - ]), - m('.metric.text-center', [ - m('.metric-value.text-2xl.font-bold', build.total_jobs), - m('.metric-label.text-sm.opacity-60', 'Total Jobs') - ]) - ]), - - m('ul.timeline.mx-auto',timelineData.map((item) => { - return m('li', [ - ...(item.stage === 'Build Requested' ? [] : [m("hr")]), - m('div.font-medium.timeline-middle', item.icon), - m("div.timeline-box.timeline-end", { - style: { - '--timeline-color': item.stage === 'Requested' ? '#1976d2' : '#6b7280' - } - }, [ - m('span.flex.flex-col.gap-2', { - class: 'timeline-point', - }, [ - m('.font-medium', item.stage), - m('.text-xs.opacity-60', formatDateTime(item.time)), - ]) - ] - ), - ...(item.stage === 'Build Completed' ? [] : [m("hr")]), - ] - ) - - })), - - ]) - ]), - - // Build Graph Visualization - m('.build-graph.card.bg-base-100.shadow-xl', [ - m('.card-body', [ - m('.flex.justify-between.items-center.mb-4', [ - m('h2.card-title.text-xl', 'Build Graph'), - this.mermaidDiagram && m('button.btn.btn-sm.btn-outline', { - onclick: () => this.loadMermaidDiagram() - }, 'Refresh Diagram') - ]), - this.mermaidLoading ? - m('.flex.justify-center.items-center.h-32', [ - m('span.loading.loading-spinner.loading-lg'), - m('span.ml-4', 'Loading diagram...') - ]) : - this.mermaidError ? - m('.alert.alert-warning', [ - m('svg.stroke-current.shrink-0.h-6.w-6', { - fill: 'none', - viewBox: '0 0 24 24' - }, [ - m('path', { - 'stroke-linecap': 'round', - 'stroke-linejoin': 'round', - 'stroke-width': '2', - 'd': 'M12 9v2m0 4h.01m-6.938 4h13.856c1.54 0 2.502-1.667 1.732-2.5L13.732 4c-.77-.833-1.964-.833-2.732 0L3.732 16.5c-.77.833.192 2.5 1.732 2.5z' - }) - ]), - m('span', this.mermaidError), - m('div', [ - m('button.btn.btn-sm.btn-outline', { - onclick: () => this.loadMermaidDiagram() - }, 'Retry') - ]) - ]) : - this.mermaidDiagram ? - m('.mermaid-container.w-full.overflow-x-auto', [ - m('pre.mermaid.text-center', { - key: `mermaid-${this.buildId}` // Force re-render when buildId changes - }, this.mermaidDiagram) - ]) : - m('.text-center.py-8.text-base-content.opacity-60', 'No graph visualization available') - ]) - ]) - ]) - ]); - } -}; - -export const PartitionsList: TypedComponent = { - data: [] as DashboardPartition[], - loading: true, - error: null as string | null, - searchTerm: '', - totalCount: 0, - - async loadPartitions() { - try { - this.loading = true; - this.error = null; - m.redraw(); - - // Use direct fetch since we don't have a specific service method for partition list - // TODO: Consider adding getPartitionsList() to DashboardService - const response = await fetch('/api/v1/partitions'); - if (!response.ok) { - throw new Error(`HTTP ${response.status}: ${response.statusText}`); - } - const apiData = (await response.json()).data; - - // Transform API response to dashboard types - this.data = apiData.partitions?.map((partition: any) => ({ - partition_ref: partition.partition_ref, - status_code: partition.status_code, - status_name: partition.status_name, - last_updated: partition.last_updated ?? null, - build_requests: partition.build_requests || [] - })) || []; - - this.totalCount = apiData.totalCount || this.data.length; - this.loading = false; - m.redraw(); - } catch (error) { - console.error('Failed to load partitions:', error); - this.error = error instanceof Error ? error.message : 'Failed to load partitions'; - this.loading = false; - m.redraw(); - } - }, - - async buildPartition(partitionRef: string) { - try { - const response = await fetch('/api/v1/builds', { - method: 'POST', - headers: { - 'Content-Type': 'application/json' - }, - body: JSON.stringify({ - partitions: [partitionRef] - }) - }); - - if (!response.ok) { - throw new Error(`HTTP ${response.status}: ${response.statusText}`); - } - - const result = await response.json(); - - // Redirect to build status page - m.route.set(`/builds/${result.build_request_id}`); - } catch (error) { - console.error('Failed to start build:', error); - alert(`Failed to start build: ${error instanceof Error ? error.message : 'Unknown error'}`); - } - }, - - filteredPartitions() { - if (!this.data) return []; - - if (!this.searchTerm) return this.data; - - const search = this.searchTerm.toLowerCase(); - return this.data.filter((partition: DashboardPartition) => - partition.partition_ref.str.toLowerCase().includes(search) - ); - }, - - oninit(vnode: m.Vnode) { - this.loadPartitions(); - }, - - view(vnode: m.Vnode) { - if (this.loading && !this.data) { - return m('div.container.mx-auto.p-4', [ - m('div.flex.flex-col.justify-center.items-center.min-h-96', [ - m('span.loading.loading-spinner.loading-lg'), - m('span.ml-4.text-lg.mb-4', 'Loading partitions...'), - m('button.btn.btn-sm.btn-outline', { - onclick: () => this.loadPartitions() - }, 'Retry Load') - ]) - ]); - } - - if (this.error) { - return m('div.container.mx-auto.p-4', [ - m('div.alert.alert-error', [ - m('svg.stroke-current.shrink-0.h-6.w-6', { - fill: 'none', - viewBox: '0 0 24 24' - }, [ - m('path', { - 'stroke-linecap': 'round', - 'stroke-linejoin': 'round', - 'stroke-width': '2', - d: 'M10 14l2-2m0 0l2-2m-2 2l-2-2m2 2l2 2m7-2a9 9 0 11-18 0 9 9 0 0118 0z' - }) - ]), - m('span', this.error), - m('div', [ - m('button.btn.btn-sm.btn-outline', { - onclick: () => this.loadPartitions() - }, 'Retry') - ]) - ]) - ]); - } - - const filteredPartitions = this.filteredPartitions(); - - return m('div.container.mx-auto.p-4', [ - m('.partitions-header.mb-6', [ - m('div.flex.justify-between.items-center.mb-4', [ - m('h1.text-3xl.font-bold', 'Partitions'), - m('.badge.badge-primary.badge-lg', `${this.totalCount} total` || "missing") - ]), - - m('div.form-control.mb-4', [ - m('input.input.input-bordered.w-full.max-w-md', { - placeholder: 'Search partitions...', - value: this.searchTerm, - oninput: (e: Event) => { - this.searchTerm = (e.target as HTMLInputElement).value; - m.redraw(); - } - }) - ]) - ]), - - m('.partitions-content', [ - filteredPartitions.length === 0 ? - m('div.card.bg-base-100.shadow-xl', [ - m('div.card-body.text-center', [ - m('h2.card-title.justify-center', 'No Partitions Found'), - m('p.text-base-content.opacity-60', - this.searchTerm ? - 'No partitions match your search criteria.' : - 'No partitions have been built yet.') - ]) - ]) : - m('div.card.bg-base-100.shadow-xl', [ - m('div.card-body', [ - m('h2.card-title.mb-4', `Showing ${filteredPartitions.length} partitions`), - m('div.overflow-x-auto', [ - m('table.table.table-sm', [ - m('thead', [ - m('tr', [ - m('th', 'Partition Reference'), - m('th', 'Status'), - m('th', 'Last Updated'), - m('th', 'Actions'), - ]) - ]), - m('tbody', - filteredPartitions.map((partition: DashboardPartition) => - m('tr.hover', [ - m('td', [ - m('a.link.link-primary.font-mono.text-sm.break-all', { - href: `/partitions/${encodePartitionRef(partition.partition_ref.str)}`, - onclick: (e: Event) => { - e.preventDefault(); - m.route.set(`/partitions/${encodePartitionRef(partition.partition_ref.str)}`); - }, - title: partition.partition_ref.str - }, partition.partition_ref.str) - ]), - m('td', [ - m(PartitionStatusBadge, { status: partition.status_name }) - ]), - m('td.text-sm.opacity-70', - partition.last_updated ? formatTime(partition.last_updated) : '—'), - m('td', [ - m('button.btn.btn-sm.btn-primary', { - onclick: () => this.buildPartition(partition.partition_ref.str) - }, 'Build'), - partition.build_requests.length > 0 ? - m('a.btn.btn-sm.btn-outline.ml-2', { - href: `/builds/${partition.build_requests[0]}`, - onclick: (e: Event) => { - e.preventDefault(); - m.route.set(`/builds/${partition.build_requests[0]}`); - }, - title: 'View most recent build' - }, 'View Build') : null - ]) - ]) - ) - ) - ]) - ]) - ]) - ]) - ]) - ]); - } -}; - -export const PartitionStatus: TypedComponent = { - data: null as DashboardPartition | null, - events: null as any | null, // Keep as any since events structure varies - loading: true, - error: null as string | null, - partitionRef: '', - buildHistory: [] as any[], // Keep as any since this is extracted from events - - async loadPartition() { - try { - this.loading = true; - this.error = null; - m.redraw(); - - const service = DashboardService.getInstance(); - - // Load partition status using our transformation layer - const partitionData = await service.getPartitionDetail(this.partitionRef); - if (!partitionData) { - throw new Error(`Partition ${this.partitionRef} not found`); - } - this.data = partitionData; - - // Load partition events for build history (use direct API for now) - // TODO: Consider adding getPartitionEvents() to DashboardService - const encodedRef = btoa(this.partitionRef).replace(/\+/g, '-').replace(/\//g, '_').replace(/=/g, ''); - const eventsResponse = await fetch(`/api/v1/partitions/${encodedRef}/events`); - if (eventsResponse.ok) { - this.events = await eventsResponse.json(); - this.buildHistory = this.extractBuildHistory(this.events.events || []); - } else { - console.warn('Failed to load partition events:', eventsResponse.statusText); - this.events = { events: [] }; - this.buildHistory = []; - } - - this.loading = false; - m.redraw(); - } catch (error) { - console.error('Failed to load partition:', error); - this.error = error instanceof Error ? error.message : 'Failed to load partition'; - this.loading = false; - m.redraw(); - } - }, - - extractBuildHistory(events: any[]): any[] { - // Group events by build request ID to create build history entries - const buildRequests = new Map(); - - events.forEach(event => { - if (event.buildRequestId) { - if (!buildRequests.has(event.buildRequestId)) { - buildRequests.set(event.buildRequestId, { - id: event.buildRequestId, - status: 'Unknown', - startedAt: event.timestamp, - completedAt: null, - events: [] - }); - } - - const build = buildRequests.get(event.buildRequestId); - build.events.push(event); - - // Update status based on event type - if (event.eventType === 'build_request') { - if (event.message?.includes('completed') || event.message?.includes('successful')) { - build.status.name = 'Completed'; - build.completedAt = event.timestamp; - } else if (event.message?.includes('failed') || event.message?.includes('error')) { - build.status.name = 'Failed'; - build.completedAt = event.timestamp; - } else if (event.message?.includes('executing') || event.message?.includes('running')) { - build.status.name = 'Executing'; - } else if (event.message?.includes('planning')) { - build.status.name = 'Planning'; - } - } - } - }); - - // Convert to array and sort by start time (newest first) - return Array.from(buildRequests.values()).sort((a, b) => b.startedAt - a.startedAt); - }, - - async buildPartition(forceRebuild: boolean = false) { - try { - const response = await fetch('/api/v1/builds', { - method: 'POST', - headers: { - 'Content-Type': 'application/json' - }, - body: JSON.stringify({ - partitions: [this.partitionRef] - }) - }); - - if (!response.ok) { - throw new Error(`HTTP ${response.status}: ${response.statusText}`); - } - - const result = await response.json(); - - // Redirect to build status page - m.route.set(`/builds/${result.build_request_id}`); - } catch (error) { - console.error('Failed to start build:', error); - alert(`Failed to start build: ${error instanceof Error ? error.message : 'Unknown error'}`); - } - }, - - oninit(vnode: m.Vnode) { - this.partitionRef = decodePartitionRef(vnode.attrs.base64_ref); - this.loadPartition(); - }, - - view(vnode: m.Vnode) { - if (this.loading && !this.data) { - return m('div.container.mx-auto.p-4', [ - m('div.flex.flex-col.justify-center.items-center.min-h-96', [ - m('span.loading.loading-spinner.loading-lg'), - m('span.ml-4.text-lg.mb-4', 'Loading partition...'), - m('button.btn.btn-sm.btn-outline', { - onclick: () => this.loadPartition() - }, 'Retry Load') - ]) - ]); - } - - if (this.error) { - return m('div.container.mx-auto.p-4', [ - m('div.alert.alert-error', [ - m('svg.stroke-current.shrink-0.h-6.w-6', { - fill: 'none', - viewBox: '0 0 24 24' - }, [ - m('path', { - 'stroke-linecap': 'round', - 'stroke-linejoin': 'round', - 'stroke-width': '2', - d: 'M10 14l2-2m0 0l2-2m-2 2l-2-2m2 2l2 2m7-2a9 9 0 11-18 0 9 9 0 0118 0z' - }) - ]), - m('span', this.error), - m('div', [ - m('button.btn.btn-sm.btn-outline', { - onclick: () => this.loadPartition() - }, 'Retry') - ]) - ]) - ]); - } - - if (!this.data) return m('div'); - - return m('div.container.mx-auto.p-4', [ - // Partition Header - m('.partition-header.mb-6', [ - m('div.flex.justify-between.items-start.mb-4', [ - m('div.flex-1', [ - m('h1.text-3xl.font-bold.mb-2', 'Partition Status'), - m('div.font-mono.text-lg.break-all.bg-base-200.p-3.rounded', this.partitionRef) - ]), - m('div.flex.flex-col.gap-2', [ - m('button.btn.btn-primary', { - onclick: () => this.buildPartition(false) - }, 'Build Now'), - m('button.btn.btn-secondary', { - onclick: () => this.buildPartition(true) - }, 'Force Rebuild'), - ]) - ]), - - m('div.partition-meta.flex.gap-4.items-center.mb-4', [ - m(PartitionStatusBadge, { status: this.data?.status.name || 'Unknown', size: 'lg' }), - this.data?.last_updated ? - m('.timestamp.text-sm.opacity-70', - `Last updated: ${formatDateTime(this.data.last_updated)}`) : null, - ]) - ]), - - // Main Content - m('.partition-content.space-y-6', [ - // Build History - m('.build-history.card.bg-base-100.shadow-xl', [ - m('.card-body', [ - m('h2.card-title.text-xl.mb-4', `Build History (${this.buildHistory?.length || 0} builds)`), - !this.buildHistory || this.buildHistory.length === 0 ? - m('.text-center.py-8.text-base-content.opacity-60', 'No build history available') : - m('.overflow-x-auto', [ - m('table.table.table-sm', [ - m('thead', [ - m('tr', [ - m('th', 'Build Request'), - m('th', 'Status'), - m('th', 'Started'), - m('th', 'Completed'), - m('th', 'Events'), - ]) - ]), - m('tbody', - this.buildHistory.map((build: any) => - m('tr.hover', [ - m('td', [ - m('a.link.link-primary.font-mono.text-sm', { - href: `/builds/${build.id}`, - onclick: (e: Event) => { - e.preventDefault(); - m.route.set(`/builds/${build.id}`); - } - }, build.id) - ]), - m('td', [ - m(BuildStatusBadge, { status: build.status.name }) - ]), - m('td.text-sm.opacity-70', - formatDateTime(build.startedAt)), - m('td.text-sm.opacity-70', - build.completedAt ? - formatDateTime(build.completedAt) : - '—'), - m('td.text-sm.opacity-70', `${build.events?.length || 0} events`) - ]) - ) - ) - ]) - ]) - ]) - ]), - - // Related Build Requests - this.data?.build_requests && this.data.build_requests.length > 0 ? - m('.related-builds.card.bg-base-100.shadow-xl', [ - m('.card-body', [ - m('h2.card-title.text-xl.mb-4', 'Related Build Requests'), - m('.grid.grid-cols-1.md:grid-cols-2.lg:grid-cols-3.gap-3', - this.data.build_requests.map((buildId: string) => - m('.build-card.border.border-base-300.rounded.p-3', [ - m('a.link.link-primary.font-mono.text-sm', { - href: `/builds/${buildId}`, - onclick: (e: Event) => { - e.preventDefault(); - m.route.set(`/builds/${buildId}`); - } - }, buildId) - ]) - ) - ) - ]) - ]) : null, - - // Raw Events - this.events?.events && this.events.events.length > 0 ? - m('.partition-events.card.bg-base-100.shadow-xl', [ - m('.card-body', [ - m('h2.card-title.text-xl.mb-4', `All Events (${this.events.events.length})`), - m('.overflow-x-auto', [ - m('table.table.table-xs', [ - m('thead', [ - m('tr', [ - m('th', 'Timestamp'), - m('th', 'Event Type'), - m('th', 'Build Request'), - m('th', 'Message'), - ]) - ]), - m('tbody', - this.events.events.slice(0, 100).map((event: any) => // Show first 100 events - m('tr.hover', [ - m('td.text-xs.font-mono', - formatDateTime(event.timestamp)), - m('td', [ - m(EventTypeBadge, { eventType: event.event_type, size: 'xs' }) - ]), - m('td', - event.build_request_id ? - m('a.link.link-primary.font-mono.text-xs', { - href: `/builds/${event.build_request_id}`, - onclick: (e: Event) => { - e.preventDefault(); - m.route.set(`/builds/${event.build_request_id}`); - } - }, event.build_request_id) : '—'), - m('td.text-xs', event.message || ''), - ]) - ) - ) - ]) - ]), - this.events.events.length > 100 ? - m('.text-center.mt-4', [ - m('.text-sm.opacity-60', `Showing first 100 of ${this.events.events.length} events`) - ]) : null - ]) - ]) : null - ]) - ]); - } -}; - -export const JobsList: TypedComponent = { - jobs: [] as DashboardJob[], - searchTerm: '', - loading: false, - error: null as string | null, - searchTimeout: null as NodeJS.Timeout | null, - - oninit(vnode: m.Vnode) { - JobsList.loadJobs(); - }, - - async loadJobs() { - JobsList.loading = true; - JobsList.error = null; - - try { - const service = DashboardService.getInstance(); - JobsList.jobs = await service.getJobs(JobsList.searchTerm || undefined); - } catch (error) { - console.error('Failed to load jobs:', error); - JobsList.error = 'Failed to load jobs. Please try again.'; - } finally { - JobsList.loading = false; - m.redraw(); - } - }, - - filteredJobs() { - if (!JobsList.searchTerm) { - return JobsList.jobs; - } - const search = JobsList.searchTerm.toLowerCase(); - return JobsList.jobs.filter((job: DashboardJob) => - job.job_label.toLowerCase().includes(search) - ); - }, - - view: (vnode: m.Vnode) => { - if (JobsList.loading) { - return m('div.container.mx-auto.p-4', [ - m('div.flex.justify-center.items-center.h-64', [ - m('div.loading.loading-spinner.loading-lg') - ]) - ]); - } - - if (JobsList.error) { - return m('div.container.mx-auto.p-4', [ - m('div.alert.alert-error', [ - m('span', JobsList.error), - m('div', [ - m('button.btn.btn-sm.btn-outline', { - onclick: () => JobsList.loadJobs() - }, 'Retry') - ]) - ]) - ]); - } - - return m('div.container.mx-auto.p-4', [ - // Jobs Header - m('.jobs-header.mb-6', [ - m('h1.text-3xl.font-bold.mb-4', 'Jobs'), - m('div.flex.gap-4.items-center.mb-4', [ - m('input.input.input-bordered.flex-1[placeholder="Search jobs..."]', { - value: JobsList.searchTerm, - oninput: (e: Event) => { - JobsList.searchTerm = (e.target as HTMLInputElement).value; - // Debounce search - if (JobsList.searchTimeout) clearTimeout(JobsList.searchTimeout); - JobsList.searchTimeout = setTimeout(() => JobsList.loadJobs(), 300); - } - }), - m('button.btn.btn-outline', { - onclick: () => JobsList.loadJobs() - }, 'Refresh') - ]) - ]), - - // Jobs Table - JobsList.filteredJobs().length === 0 ? - m('div.text-center.py-8.text-base-content.opacity-60', 'No jobs found') : - m('.jobs-table.card.bg-base-100.shadow-xl', [ - m('.card-body.p-0', [ - m('.overflow-x-auto', [ - m('table.table.table-zebra', [ - m('thead', [ - m('tr', [ - m('th', 'Job Label'), - m('th', 'Success Rate'), - m('th', 'Success/Total'), - m('th', 'Avg Partitions'), - m('th', 'Last Run'), - ]) - ]), - m('tbody', JobsList.filteredJobs().map((job: DashboardJob) => { - // Calculate success rate - const successRate = job.total_runs > 0 ? job.successful_runs / job.total_runs : 0; - return m('tr.hover', [ - m('td', [ - m('a.link.link-primary.font-mono.text-sm', { - href: `/jobs/${encodeJobLabel(job.job_label)}`, - onclick: (e: Event) => { - e.preventDefault(); - m.route.set(`/jobs/${encodeJobLabel(job.job_label)}`); - } - }, job.job_label) - ]), - m('td', [ - m(`span.badge.${successRate >= 0.9 ? 'badge-success' : successRate >= 0.7 ? 'badge-warning' : 'badge-error'}`, - `${Math.round(successRate * 100)}%`) - ]), - m('td', `${job.successful_runs}/${job.total_runs}`), - m('td', job.average_partitions_per_run?.toFixed(1) || '—'), - m('td.text-sm.opacity-70', - job.last_run_timestamp ? formatTime(job.last_run_timestamp) : '—'), - ]); - })) - ]) - ]) - ]) - ]) - ]); - } -}; - -export const JobMetrics: TypedComponent = { - jobLabel: '', - metrics: null as DashboardJob | null, - loading: false, - error: null as string | null, - - oninit(vnode: m.Vnode) { - JobMetrics.jobLabel = decodeJobLabel(vnode.attrs.label); - JobMetrics.loadJobMetrics(); - }, - - async loadJobMetrics() { - JobMetrics.loading = true; - JobMetrics.error = null; - - try { - const service = DashboardService.getInstance(); - JobMetrics.metrics = await service.getJobMetrics(JobMetrics.jobLabel); - if (!JobMetrics.metrics) { - JobMetrics.error = 'Job not found or no metrics available'; - } - } catch (error) { - console.error('Failed to load job metrics:', error); - JobMetrics.error = 'Failed to load job metrics. Please try again.'; - } finally { - JobMetrics.loading = false; - m.redraw(); - } - }, - - view: (vnode: m.Vnode) => { - if (JobMetrics.loading) { - return m('div.container.mx-auto.p-4', [ - m('div.flex.justify-center.items-center.h-64', [ - m('div.loading.loading-spinner.loading-lg') - ]) - ]); - } - - if (JobMetrics.error) { - return m('div.container.mx-auto.p-4', [ - m('div.alert.alert-error', [ - m('span', JobMetrics.error), - m('div', [ - m('button.btn.btn-sm.btn-outline', { - onclick: () => JobMetrics.loadJobMetrics() - }, 'Retry') - ]) - ]) - ]); - } - - if (!JobMetrics.metrics) { - return m('div.container.mx-auto.p-4', [ - m('div.text-center.py-8.text-base-content.opacity-60', 'No metrics available') - ]); - } - - const successRate = JobMetrics.metrics.total_runs > 0 ? - JobMetrics.metrics.successful_runs / JobMetrics.metrics.total_runs : 0; - - return m('div.container.mx-auto.p-4', [ - // Job Header - m('.job-header.mb-6', [ - m('h1.text-3xl.font-bold.mb-4', [ - 'Job Metrics: ', - m('span.font-mono.text-2xl', JobMetrics.jobLabel) - ]), - m('.job-stats.grid.grid-cols-1.md:grid-cols-4.gap-4.mb-6', [ - m('.stat.bg-base-100.shadow-xl.rounded-lg.p-4', [ - m('.stat-title', 'Success Rate'), - m('.stat-value.text-3xl', [ - m(`span.${successRate >= 0.9 ? 'text-success' : successRate >= 0.7 ? 'text-warning' : 'text-error'}`, - `${Math.round(successRate * 100)}%`) - ]), - m('.stat-desc', `${JobMetrics.metrics.successful_runs}/${JobMetrics.metrics.total_runs}`) - ]), - m('.stat.bg-base-100.shadow-xl.rounded-lg.p-4', [ - m('.stat-title', 'Total Runs'), - m('.stat-value.text-3xl', JobMetrics.metrics.total_runs), - m('.stat-desc', `${JobMetrics.metrics.failed_runs} failed, ${JobMetrics.metrics.cancelled_runs} cancelled`) - ]), - m('.stat.bg-base-100.shadow-xl.rounded-lg.p-4', [ - m('.stat-title', 'Last Run'), - m('.stat-value.text-2xl', [ - m(`span.badge.${JobMetrics.metrics.last_run_status === 'COMPLETED' ? 'badge-success' : - JobMetrics.metrics.last_run_status === 'FAILED' ? 'badge-error' : 'badge-warning'}`, - JobMetrics.metrics.last_run_status) - ]), - m('.stat-desc', JobMetrics.metrics.last_run_timestamp ? formatTime(JobMetrics.metrics.last_run_timestamp) : '—') - ]), - m('.stat.bg-base-100.shadow-xl.rounded-lg.p-4', [ - m('.stat-title', 'Avg Partitions'), - m('.stat-value.text-3xl', JobMetrics.metrics.average_partitions_per_run?.toFixed(1) || '—'), - m('.stat-desc', 'per run') - ]), - ]) - ]), - - // Main Content - m('.job-content.space-y-6', [ - // Recent Builds Summary - JobMetrics.metrics.recent_builds?.length > 0 && m('.recent-builds-summary.card.bg-base-100.shadow-xl', [ - m('.card-body', [ - m('h2.card-title.text-xl.mb-4', `Recent Builds (${JobMetrics.metrics.recent_builds.length})`), - m('.grid.grid-cols-1.md:grid-cols-2.lg:grid-cols-3.gap-3', - JobMetrics.metrics.recent_builds.slice(0, 9).map((buildId: string) => - m('.build-card.border.border-base-300.rounded.p-3', [ - m('a.link.link-primary.font-mono.text-sm', { - href: `/builds/${buildId}`, - onclick: (e: Event) => { - e.preventDefault(); - m.route.set(`/builds/${buildId}`); - } - }, buildId) - ]) - ) - ), - JobMetrics.metrics.recent_builds.length > 9 && - m('.text-center.mt-4.text-sm.opacity-60', - `Showing 9 of ${JobMetrics.metrics.recent_builds.length} recent builds`) - ]) - ]), - - // Job Summary Stats - m('.job-summary.card.bg-base-100.shadow-xl', [ - m('.card-body', [ - m('h2.card-title.text-xl.mb-4', 'Job Summary'), - m('.grid.grid-cols-2.md:grid-cols-4.gap-4', [ - m('.metric.text-center', [ - m('.metric-value.text-2xl.font-bold.text-success', JobMetrics.metrics.successful_runs), - m('.metric-label.text-sm.opacity-60', 'Successful') - ]), - m('.metric.text-center', [ - m('.metric-value.text-2xl.font-bold.text-error', JobMetrics.metrics.failed_runs), - m('.metric-label.text-sm.opacity-60', 'Failed') - ]), - m('.metric.text-center', [ - m('.metric-value.text-2xl.font-bold.text-warning', JobMetrics.metrics.cancelled_runs), - m('.metric-label.text-sm.opacity-60', 'Cancelled') - ]), - m('.metric.text-center', [ - m('.metric-value.text-2xl.font-bold', JobMetrics.metrics.average_partitions_per_run?.toFixed(1) || '0'), - m('.metric-label.text-sm.opacity-60', 'Avg Partitions') - ]) - ]) - ]) - ]) - ]) - ]); - } -}; - -export const GraphAnalysis: TypedComponent = { - view: (vnode: m.Vnode) => m('div.container.mx-auto.p-4', [ - m('h1.text-3xl.font-bold.mb-4', 'Graph Analysis'), - m('div.card.bg-base-100.shadow-xl', [ - m('div.card-body', [ - m('h2.card-title', 'Interactive Build Graph'), - m('p', 'Analyze partition dependencies and execution plans.'), - m('div.form-control.mb-4', [ - m('label.label', [ - m('span.label-text', 'Partition References'), - ]), - m('textarea.textarea.textarea-bordered[placeholder="Enter partition references to analyze..."]'), - ]), - m('div.card-actions.justify-end', [ - m('button.btn.btn-primary', 'Analyze Graph'), - ]), - ]), - ]), - ]) -}; \ No newline at end of file diff --git a/databuild/dashboard/pnpm-lock.yaml b/databuild/dashboard/pnpm-lock.yaml deleted file mode 100644 index ee66372..0000000 --- a/databuild/dashboard/pnpm-lock.yaml +++ /dev/null @@ -1,111 +0,0 @@ -lockfileVersion: '9.0' -settings: - autoInstallPeers: true - excludeLinksFromLockfile: false -importers: - .: - devDependencies: - '@types/mithril': - specifier: ^2.2.7 - version: 2.2.7 - '@types/node': - specifier: ^22.12.0 - version: 22.12.0 - '@types/ospec': - specifier: ^4.2.0 - version: 4.2.0 - daisyui: - specifier: ^5.0.0-beta.6 - version: 5.0.0-beta.6 - mithril: - specifier: ^2.2.7 - version: 2.2.13 - ospec: - specifier: ^4.2.0 - version: 4.2.1 - typescript: - specifier: ^5.7.3 - version: 5.7.3 - whatwg-fetch: - specifier: ^3.6.20 - version: 3.6.20 -packages: - '@types/mithril@2.2.7': - resolution: {integrity: sha512-uetxoYizBMHPELl6DSZUfO6Q/aOm+h0NUCv9bVAX2iAxfrdBSOvU9KKFl+McTtxR13F+BReYLY814pJsZvnSxg==} - '@types/node@22.12.0': - resolution: {integrity: sha512-Fll2FZ1riMjNmlmJOdAyY5pUbkftXslB5DgEzlIuNaiWhXd00FhWxVC/r4yV/4wBb9JfImTu+jiSvXTkJ7F/gA==} - '@types/ospec@4.2.0': - resolution: {integrity: sha512-QgwAtrYYstU7otBXmQ2yjUWaYMWkF48EevmG+IfYzAWk39cwsTw7ZHp7dK2XyA3eJ2v5AvbMa5ijcLewklDRDA==} - balanced-match@1.0.2: - resolution: {integrity: sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==} - brace-expansion@2.0.1: - resolution: {integrity: sha512-XnAIvQ8eM+kC6aULx6wuQiwVsnzsi9d3WxzV3FpWTGA19F621kwdbsAcFKXgKUHZWsy+mY6iL1sHTxWEFCytDA==} - daisyui@5.0.0-beta.6: - resolution: {integrity: sha512-gwXHv6MApRBrvUayzg83vS6bfZ+y7/1VGLu0a8/cEAMviS4rXLCd4AndEdlVxhq+25wkAp0CZRkNQ7O4wIoFnQ==} - fs.realpath@1.0.0: - resolution: {integrity: sha512-OO0pH2lK6a0hZnAdau5ItzHPI6pUlvI7jMVnxUQRtw4owF2wk8lOSabtGDCTP4Ggrg2MbGnWO9X8K1t4+fGMDw==} - glob@9.3.5: - resolution: {integrity: sha512-e1LleDykUz2Iu+MTYdkSsuWX8lvAjAcs0Xef0lNIu0S2wOAzuTxCJtcd9S3cijlwYF18EsU3rzb8jPVobxDh9Q==} - engines: {node: '>=16 || 14 >=14.17'} - lru-cache@10.4.3: - resolution: {integrity: sha512-JNAzZcXrCt42VGLuYz0zfAzDfAvJWW6AfYlDBQyDV5DClI2m5sAmK+OIO7s59XfsRsWHp02jAJrRadPRGTt6SQ==} - minimatch@8.0.4: - resolution: {integrity: sha512-W0Wvr9HyFXZRGIDgCicunpQ299OKXs9RgZfaukz4qAW/pJhcpUfupc9c+OObPOFueNy8VSrZgEmDtk6Kh4WzDA==} - engines: {node: '>=16 || 14 >=14.17'} - minipass@4.2.8: - resolution: {integrity: sha512-fNzuVyifolSLFL4NzpF+wEF4qrgqaaKX0haXPQEdQ7NKAN+WecoKMHV09YcuL/DHxrUsYQOK3MiuDf7Ip2OXfQ==} - engines: {node: '>=8'} - minipass@7.1.2: - resolution: {integrity: sha512-qOOzS1cBTWYF4BH8fVePDBOO9iptMnGUEZwNc/cMWnTV2nVLZ7VoNWEPHkYczZA0pdoA7dl6e7FL659nX9S2aw==} - engines: {node: '>=16 || 14 >=14.17'} - mithril@2.2.13: - resolution: {integrity: sha512-dfWFYmRJDXAROG6B1AsQXEwhSgFZ65Am/5Xj3oJ/R1wZtrC0W20P4sIAtFQB0SZsGwV7H2MiEJiFGmlUtXF1Ww==} - ospec@4.2.1: - resolution: {integrity: sha512-LsJw2WMaVlFDiaIPPH+LMtsxOABjFD29XQ12ENZM+8Cwgg5BEgW65CB+SPL1PceIun+HSfdw8hkf27C8iF/XFw==} - hasBin: true - path-scurry@1.11.1: - resolution: {integrity: sha512-Xa4Nw17FS9ApQFJ9umLiJS4orGjm7ZzwUrwamcGQuHSzDyth9boKDaycYdDcZDuqYATXw4HFXgaqWTctW/v1HA==} - engines: {node: '>=16 || 14 >=14.18'} - typescript@5.7.3: - resolution: {integrity: sha512-84MVSjMEHP+FQRPy3pX9sTVV/INIex71s9TL2Gm5FG/WG1SqXeKyZ0k7/blY/4FdOzI12CBy1vGc4og/eus0fw==} - engines: {node: '>=14.17'} - hasBin: true - undici-types@6.20.0: - resolution: {integrity: sha512-Ny6QZ2Nju20vw1SRHe3d9jVu6gJ+4e3+MMpqu7pqE5HT6WsTSlce++GQmK5UXS8mzV8DSYHrQH+Xrf2jVcuKNg==} - whatwg-fetch@3.6.20: - resolution: {integrity: sha512-EqhiFU6daOA8kpjOWTL0olhVOF3i7OrFzSYiGsEMB8GcXS+RrzauAERX65xMeNWVqxA6HXH2m69Z9LaKKdisfg==} -snapshots: - '@types/mithril@2.2.7': {} - '@types/node@22.12.0': - dependencies: - undici-types: 6.20.0 - '@types/ospec@4.2.0': {} - balanced-match@1.0.2: {} - brace-expansion@2.0.1: - dependencies: - balanced-match: 1.0.2 - daisyui@5.0.0-beta.6: {} - fs.realpath@1.0.0: {} - glob@9.3.5: - dependencies: - fs.realpath: 1.0.0 - minimatch: 8.0.4 - minipass: 4.2.8 - path-scurry: 1.11.1 - lru-cache@10.4.3: {} - minimatch@8.0.4: - dependencies: - brace-expansion: 2.0.1 - minipass@4.2.8: {} - minipass@7.1.2: {} - mithril@2.2.13: {} - ospec@4.2.1: - dependencies: - glob: 9.3.5 - path-scurry@1.11.1: - dependencies: - lru-cache: 10.4.3 - minipass: 7.1.2 - typescript@5.7.3: {} - undici-types@6.20.0: {} - whatwg-fetch@3.6.20: {} diff --git a/databuild/dashboard/pnpm-workspace.yaml b/databuild/dashboard/pnpm-workspace.yaml deleted file mode 100644 index c713fa4..0000000 --- a/databuild/dashboard/pnpm-workspace.yaml +++ /dev/null @@ -1,2 +0,0 @@ -packages: - - "databuild" diff --git a/databuild/dashboard/services.ts b/databuild/dashboard/services.ts deleted file mode 100644 index 62de35b..0000000 --- a/databuild/dashboard/services.ts +++ /dev/null @@ -1,490 +0,0 @@ -// Import the generated TypeScript client -import { - DefaultApi, - Configuration, - ActivityApiResponse, - ActivityResponse, - BuildSummary, - BuildDetailResponse, - PartitionSummary, - JobsListApiResponse, - JobMetricsResponse, - JobSummary, - JobRunSummary, - JobDailyStats -} from '../client/typescript_generated/src/index'; - -// Import our dashboard types -import { - DashboardActivity, - DashboardBuild, - DashboardPartition, - DashboardJob, - isDashboardActivity, - isDashboardBuild, - isDashboardPartition, - isDashboardJob -} from './types'; - -// Configure the API client -const apiConfig = new Configuration({ - basePath: '', // Use relative paths since we're on the same host -}); -const apiClient = new DefaultApi(apiConfig); - -// Transformation functions: Convert API responses to dashboard types -// These functions prevent runtime errors by ensuring consistent data shapes - -function transformBuildSummary(apiResponse: BuildSummary): DashboardBuild { - return { - build_request_id: apiResponse.build_request_id, - status: apiResponse.status!, - requested_partitions: apiResponse.requested_partitions, // Keep as PartitionRef array - total_jobs: apiResponse.total_jobs, - completed_jobs: apiResponse.completed_jobs, - failed_jobs: apiResponse.failed_jobs, - cancelled_jobs: apiResponse.cancelled_jobs, - requested_at: apiResponse.requested_at, - started_at: apiResponse.started_at ?? null, - completed_at: apiResponse.completed_at ?? null, - duration_ms: apiResponse.duration_ms ?? null, - cancelled: apiResponse.cancelled, - }; -} - -function transformBuildDetail(apiResponse: BuildDetailResponse): DashboardBuild { - return { - build_request_id: apiResponse.build_request_id, - status: apiResponse.status!, - requested_partitions: apiResponse.requested_partitions, // Keep as PartitionRef array - total_jobs: apiResponse.total_jobs, - completed_jobs: apiResponse.completed_jobs, - failed_jobs: apiResponse.failed_jobs, - cancelled_jobs: apiResponse.cancelled_jobs, - requested_at: apiResponse.requested_at, - started_at: apiResponse.started_at ?? null, - completed_at: apiResponse.completed_at ?? null, - duration_ms: apiResponse.duration_ms ?? null, - cancelled: apiResponse.cancelled, - }; -} - -function transformPartitionSummary(apiResponse: PartitionSummary): DashboardPartition { - if (!apiResponse.partition_ref) { - throw new Error('PartitionSummary must have a valid partition_ref'); - } - - return { - partition_ref: apiResponse.partition_ref, // Keep as PartitionRef object - status_code: apiResponse.status_code, - status_name: apiResponse.status_name, - last_updated: apiResponse.last_updated ?? null, - build_requests: (apiResponse as any).build_requests || [], // This field might not be in the OpenAPI spec - }; -} - -function transformJobSummary(apiResponse: JobSummary): DashboardJob { - return { - job_label: apiResponse.job_label, - total_runs: apiResponse.total_runs, - successful_runs: apiResponse.successful_runs, - failed_runs: apiResponse.failed_runs, - cancelled_runs: apiResponse.cancelled_runs, - last_run_timestamp: apiResponse.last_run_timestamp, - last_run_status_code: apiResponse.last_run_status_code, - last_run_status_name: apiResponse.last_run_status_name, - average_partitions_per_run: apiResponse.average_partitions_per_run, - recent_builds: apiResponse.recent_builds || [], // Default for optional array field - }; -} - -function transformActivityResponse(apiResponse: ActivityResponse): DashboardActivity { - return { - active_builds_count: apiResponse.active_builds_count, - recent_builds: apiResponse.recent_builds.map(transformBuildSummary), - recent_partitions: apiResponse.recent_partitions.map(transformPartitionSummary), - total_partitions_count: apiResponse.total_partitions_count, - system_status: apiResponse.system_status, - graph_name: apiResponse.graph_name, - }; -} - -// Type guards for runtime validation -function isValidBuildDetailResponse(data: unknown): data is BuildDetailResponse { - return typeof data === 'object' && - data !== null && - 'build_request_id' in data && - 'status_name' in data && - 'requested_partitions' in data; -} - -function isValidActivityResponse(data: unknown): data is ActivityResponse { - return typeof data === 'object' && - data !== null && - 'active_builds_count' in data && - 'recent_builds' in data && - 'recent_partitions' in data; -} - -function isValidJobsListApiResponse(data: unknown): data is JobsListApiResponse { - return typeof data === 'object' && - data !== null && - 'data' in data; -} - -// API Service for fetching recent activity data -export class DashboardService { - private static instance: DashboardService; - - static getInstance(): DashboardService { - if (!DashboardService.instance) { - DashboardService.instance = new DashboardService(); - } - return DashboardService.instance; - } - - async getRecentActivity(): Promise { - try { - // Use the new activity endpoint that aggregates all the data we need - const activityApiResponse: ActivityApiResponse = await apiClient.apiV1ActivityGet(); - console.info('Recent activity:', activityApiResponse); - - const activityResponse = activityApiResponse.data; - - // Validate API response structure - if (!isValidActivityResponse(activityResponse)) { - throw new Error('Invalid activity response structure'); - } - - // Transform API response to dashboard format using transformation function - const dashboardActivity = transformActivityResponse(activityResponse); - - // Validate transformed result - if (!isDashboardActivity(dashboardActivity)) { - throw new Error('Transformation produced invalid dashboard activity'); - } - - return dashboardActivity; - } catch (error) { - console.error('Failed to fetch recent activity:', error); - - // Fall back to valid dashboard format if API call fails - return { - active_builds_count: 0, - recent_builds: [], - recent_partitions: [], - total_partitions_count: 0, - system_status: 'error', - graph_name: 'Unknown Graph' - }; - } - } - - async getJobs(searchTerm?: string): Promise { - try { - // Build query parameters manually since the generated client may not support query params correctly - const queryParams = new URLSearchParams(); - if (searchTerm) { - queryParams.append('search', searchTerm); - } - const url = `/api/v1/jobs${queryParams.toString() ? '?' + queryParams.toString() : ''}`; - - const response = await fetch(url); - if (!response.ok) { - throw new Error(`HTTP ${response.status}: ${response.statusText}`); - } - const data: unknown = await response.json(); - - // Validate API response structure - if (!isValidJobsListApiResponse(data)) { - throw new Error('Invalid jobs list response structure'); - } - - // Transform each job using our transformation function - const dashboardJobs = data.data.jobs.map(transformJobSummary); - - // Validate each transformed job - for (const job of dashboardJobs) { - if (!isDashboardJob(job)) { - throw new Error('Transformation produced invalid dashboard job'); - } - } - - return dashboardJobs; - } catch (error) { - console.error('Failed to fetch jobs:', error); - return []; - } - } - - async getBuildDetail(buildId: string): Promise { - try { - const url = `/api/v1/builds/${buildId}`; - - const response = await fetch(url); - if (!response.ok) { - if (response.status === 404) { - return null; // Build not found - } - throw new Error(`HTTP ${response.status}: ${response.statusText}`); - } - const data: unknown = await response.json(); - - // Validate API response structure - if (!isValidBuildDetailResponse(data)) { - throw new Error('Invalid build detail response structure'); - } - - // Transform to dashboard format - const dashboardBuild = transformBuildDetail(data); - - // Validate transformed result - if (!isDashboardBuild(dashboardBuild)) { - throw new Error('Transformation produced invalid dashboard build'); - } - - return dashboardBuild; - } catch (error) { - console.error('Failed to fetch build detail:', error); - return null; - } - } - - async getPartitionDetail(partitionRef: string): Promise { - try { - // Encode partition ref for URL safety - const encodedRef = btoa(partitionRef).replace(/\+/g, '-').replace(/\//g, '_').replace(/=/g, ''); - const url = `/api/v1/partitions/${encodedRef}`; - - const response = await fetch(url); - if (!response.ok) { - if (response.status === 404) { - return null; // Partition not found - } - throw new Error(`HTTP ${response.status}: ${response.statusText}`); - } - const data: unknown = await response.json(); - - // For partition detail, we need to extract the PartitionSummary from the response - // and transform it to dashboard format - if (typeof data === 'object' && data !== null && 'partition_ref' in data) { - const dashboardPartition = transformPartitionSummary(data as PartitionSummary); - - if (!isDashboardPartition(dashboardPartition)) { - throw new Error('Transformation produced invalid dashboard partition'); - } - - return dashboardPartition; - } else { - throw new Error('Invalid partition detail response structure'); - } - } catch (error) { - console.error('Failed to fetch partition detail:', error); - return null; - } - } - - async getJobMetrics(jobLabel: string): Promise { - try { - // Encode job label like partition refs for URL safety - const encodedLabel = btoa(jobLabel).replace(/\+/g, '-').replace(/\//g, '_').replace(/=/g, ''); - const url = `/api/v1/jobs/${encodedLabel}`; - - const response = await fetch(url); - if (!response.ok) { - if (response.status === 404) { - return null; // Job not found - } - throw new Error(`HTTP ${response.status}: ${response.statusText}`); - } - const data: unknown = await response.json(); - console.log('Job metrics response:', data); - - // Extract job summary from metrics response and transform it - if (typeof data === 'object' && data !== null && 'job_label' in data) { - const dashboardJob = transformJobSummary(data as unknown as JobSummary); - console.log('Transformed job summary:', dashboardJob); - - if (!isDashboardJob(dashboardJob)) { - throw new Error('Transformation produced invalid dashboard job'); - } - - return dashboardJob; - } - - throw new Error('Invalid job metrics response structure'); - } catch (error) { - console.error('Failed to fetch job metrics:', error); - return null; - } - } - - async getMermaidDiagram(buildId: string): Promise { - try { - const url = `/api/v1/builds/${buildId}/mermaid`; - - const response = await fetch(url); - if (!response.ok) { - if (response.status === 404) { - return null; // Build not found or no job graph - } - throw new Error(`HTTP ${response.status}: ${response.statusText}`); - } - - const data = await response.json(); - - // Validate response structure - if (typeof data === 'object' && data !== null && 'mermaid' in data && typeof data.mermaid === 'string') { - return data.mermaid; - } - - throw new Error('Invalid mermaid response structure'); - } catch (error) { - console.error('Failed to fetch mermaid diagram:', error); - return null; - } - } -} - -// Polling manager with Page Visibility API integration -export class PollingManager { - private intervals: Map = new Map(); - private isTabVisible: boolean = true; - private visibilityChangeHandler: () => void; - - constructor() { - this.visibilityChangeHandler = () => { - this.isTabVisible = !document.hidden; - - // Pause or resume polling based on tab visibility - if (this.isTabVisible) { - this.resumePolling(); - } else { - this.pausePolling(); - } - }; - - // Set up Page Visibility API listener only in browser environment - if (typeof document !== 'undefined') { - document.addEventListener('visibilitychange', this.visibilityChangeHandler); - } - } - - startPolling(key: string, callback: () => void, intervalMs: number): void { - // Clear existing interval if any - this.stopPolling(key); - - // Only start polling if tab is visible - if (this.isTabVisible) { - const interval = setInterval(callback, intervalMs); - this.intervals.set(key, interval); - } - } - - stopPolling(key: string): void { - const interval = this.intervals.get(key); - if (interval) { - clearInterval(interval); - this.intervals.delete(key); - } - } - - private pausePolling(): void { - // Store current intervals but clear them - for (const [key, interval] of this.intervals) { - clearInterval(interval); - } - } - - private resumePolling(): void { - // This is a simplified approach - in practice you'd want to store the callback - // and interval info to properly resume. For now, components will handle this - // by checking visibility state when setting up polling. - } - - cleanup(): void { - // Clean up all intervals - for (const interval of this.intervals.values()) { - clearInterval(interval); - } - this.intervals.clear(); - - // Remove event listener only in browser environment - if (typeof document !== 'undefined') { - document.removeEventListener('visibilitychange', this.visibilityChangeHandler); - } - } - - isVisible(): boolean { - return this.isTabVisible; - } -} - -// Export singleton instance -export const pollingManager = new PollingManager(); - -// Utility functions for time formatting -export function formatTime(epochNanos: number): string { - const date = new Date(epochNanos / 1000000); - const now = new Date(); - const diffMs = now.getTime() - date.getTime(); - - if (diffMs < 60000) { // Less than 1 minute - return 'just now'; - } else if (diffMs < 3600000) { // Less than 1 hour - const minutes = Math.floor(diffMs / 60000); - return `${minutes}m ago`; - } else if (diffMs < 86400000) { // Less than 1 day - const hours = Math.floor(diffMs / 3600000); - return `${hours}h ago`; - } else { - return date.toLocaleDateString(); - } -} - -export function formatDateTime(epochNanos: number): string { - const date = new Date(epochNanos / 1000000); - const dateStr = date.toLocaleDateString('en-US'); - const timeStr = date.toLocaleTimeString('en-US', { - hour: 'numeric', - minute: '2-digit', - second: '2-digit', - hour12: true, - timeZoneName: 'short' - }); - const millisStr = date.getMilliseconds().toString().padStart(3, '0'); - - // Insert milliseconds between seconds and AM/PM: "7/12/2025, 9:03:48.264 AM EST" - return `${dateStr}, ${timeStr.replace(/(\d{2})\s+(AM|PM)/, `$1.${millisStr} $2`)}`; -} - -export function formatDuration(durationNanos?: number | null): string { - let durationMs = durationNanos ? durationNanos / 1000000 : null; - console.warn('Formatting duration:', durationMs); - if (!durationMs || durationMs <= 0) { - return '—'; - } - - if (durationMs < 1000) { - return `${Math.round(durationMs)}ms`; - } else if (durationMs < 60000) { - return `${(durationMs / 1000).toFixed(1)}s`; - } else if (durationMs < 3600000) { - const minutes = Math.floor(durationMs / 60000); - const seconds = Math.floor((durationMs % 60000) / 1000); - return `${minutes}m ${seconds}s`; - } else { - const hours = Math.floor(durationMs / 3600000); - const minutes = Math.floor((durationMs % 3600000) / 60000); - return `${hours}h ${minutes}m`; - } -} - -export function formatDate(epochNanos: number): string { - const date = new Date(epochNanos / 1000000); - return date.toLocaleDateString('en-US', { - month: 'short', - day: 'numeric', - year: 'numeric' - }); -} \ No newline at end of file diff --git a/databuild/dashboard/test-data/strict-config-failures.ts b/databuild/dashboard/test-data/strict-config-failures.ts deleted file mode 100644 index b394ff7..0000000 --- a/databuild/dashboard/test-data/strict-config-failures.ts +++ /dev/null @@ -1,44 +0,0 @@ -// Test file designed to fail TypeScript compilation with strict config -// These are the exact patterns that caused runtime failures in production - -// Test 1: Reproduce original status.toLowerCase() failure -const mockResponseWithStatusObject = { status_code: 1, status_name: "COMPLETED" }; - -// This should cause compilation error: Property 'status' does not exist -const test1 = mockResponseWithStatusObject.status?.toLowerCase(); - -// Test 2: Reproduce original status?.status access failure -const test2 = mockResponseWithStatusObject.status?.status; - -// Test 3: Optional field access without null check -interface PartitionSummaryTest { - last_updated?: number; - partition_ref: string; -} - -const testPartition: PartitionSummaryTest = { - partition_ref: "test-partition" -}; - -// This should fail: accessing optional field without null check -const timestamp = testPartition.last_updated.toString(); - -// Test 4: Exact optional property types -interface StrictTest { - required: string; - optional?: string; -} - -// This should fail with exactOptionalPropertyTypes -const testObj: StrictTest = { - required: "test", - optional: undefined // undefined not assignable to optional string -}; - -// Test 5: Array access without undefined handling -const testArray: string[] = ["a", "b", "c"]; -const element: string = testArray[10]; // Should include undefined in type - -// Test 6: Null access without proper checks -let possiblyNull: string | null = Math.random() > 0.5 ? "value" : null; -const upperCase = possiblyNull.toUpperCase(); // Should fail with strictNullChecks \ No newline at end of file diff --git a/databuild/dashboard/test-strict-config.sh b/databuild/dashboard/test-strict-config.sh deleted file mode 100755 index 64ac3ca..0000000 --- a/databuild/dashboard/test-strict-config.sh +++ /dev/null @@ -1,69 +0,0 @@ -#!/bin/bash -# Test script to verify strict TypeScript configuration catches expected failures - -set -e - -echo "Testing strict TypeScript configuration..." - -# Find TypeScript compiler in runfiles -if [[ -n "${RUNFILES_DIR:-}" ]]; then - TSC="${RUNFILES_DIR}/_main/databuild/dashboard/node_modules/typescript/bin/tsc" -else - # Fallback for local execution - TSC="$(find . -name tsc -type f | head -1)" - if [[ -z "$TSC" ]]; then - echo "ERROR: Could not find TypeScript compiler" - exit 1 - fi -fi - -# Get paths relative to runfiles -if [[ -n "${RUNFILES_DIR:-}" ]]; then - TEST_DATA_DIR="${RUNFILES_DIR}/_main/databuild/dashboard/test-data" - TSCONFIG="${RUNFILES_DIR}/_main/databuild/dashboard/tsconfig_app.json" -else - SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - TEST_DATA_DIR="$SCRIPT_DIR/test-data" - TSCONFIG="$SCRIPT_DIR/tsconfig_app.json" -fi - -# Function to test that TypeScript compilation fails with expected errors -test_compilation_failures() { - local test_file="$1" - local expected_errors="$2" - - echo "Testing compilation failures for: $test_file" - - # Run TypeScript compilation and capture output - if node "$TSC" --noEmit --strict --strictNullChecks --noImplicitAny --noImplicitReturns --noUncheckedIndexedAccess --exactOptionalPropertyTypes "$test_file" 2>&1; then - echo "ERROR: Expected TypeScript compilation to fail for $test_file, but it passed" - return 1 - fi - - # Check that we get the expected error patterns - local tsc_output=$(node "$TSC" --noEmit --strict --strictNullChecks --noImplicitAny --noImplicitReturns --noUncheckedIndexedAccess --exactOptionalPropertyTypes "$test_file" 2>&1 || true) - - IFS='|' read -ra ERROR_PATTERNS <<< "$expected_errors" - for pattern in "${ERROR_PATTERNS[@]}"; do - if ! echo "$tsc_output" | grep -q "$pattern"; then - echo "ERROR: Expected error pattern '$pattern' not found in TypeScript output" - echo "Actual output:" - echo "$tsc_output" - return 1 - fi - done - - echo "✓ Compilation correctly failed with expected errors" -} - -# Test 1: Verify strict config catches undefined property access -test_compilation_failures "$TEST_DATA_DIR/strict-config-failures.ts" "Property 'status' does not exist|is possibly 'undefined'|Type 'undefined' is not assignable" - -echo "All strict TypeScript configuration tests passed!" -echo "" -echo "Summary of what strict config catches:" -echo "- ✓ Undefined property access (status.toLowerCase() failures)" -echo "- ✓ Optional field access without null checks" -echo "- ✓ Exact optional property type mismatches" -echo "- ✓ Array access without undefined handling" -echo "- ✓ Null/undefined access without proper checks" \ No newline at end of file diff --git a/databuild/dashboard/transformation-tests.ts b/databuild/dashboard/transformation-tests.ts deleted file mode 100644 index 99611db..0000000 --- a/databuild/dashboard/transformation-tests.ts +++ /dev/null @@ -1,318 +0,0 @@ -// Phase 3.5: Unit tests for transformation functions -// These tests verify that transformation functions prevent the observed runtime failures - -import o from 'ospec'; -import { - BuildSummary, - BuildDetailResponse, - PartitionSummary, - JobSummary, - ActivityResponse, - BuildRequestStatus -} from '../client/typescript_generated/src/index'; - -// Import types directly since we're now in the same ts_project -import { - DashboardActivity, - DashboardBuild, - DashboardPartition, - DashboardJob, - isDashboardActivity, - isDashboardBuild, - isDashboardPartition, - isDashboardJob -} from './types'; - -// Mock transformation functions for testing (since they're not exported from services.ts) -function transformBuildSummary(apiResponse: BuildSummary): DashboardBuild { - return { - build_request_id: apiResponse.build_request_id, - status: apiResponse.status!, - requested_partitions: apiResponse.requested_partitions, // Keep as PartitionRef array - total_jobs: apiResponse.total_jobs, - completed_jobs: apiResponse.completed_jobs, - failed_jobs: apiResponse.failed_jobs, - cancelled_jobs: apiResponse.cancelled_jobs, - requested_at: apiResponse.requested_at, - started_at: apiResponse.started_at ?? null, - completed_at: apiResponse.completed_at ?? null, - duration_ms: apiResponse.duration_ms ?? null, - cancelled: apiResponse.cancelled, - }; -} - -function transformBuildDetail(apiResponse: BuildDetailResponse): DashboardBuild { - return { - build_request_id: apiResponse.build_request_id, - status: apiResponse.status!, - requested_partitions: apiResponse.requested_partitions, // Keep as PartitionRef array - total_jobs: apiResponse.total_jobs, - completed_jobs: apiResponse.completed_jobs, - failed_jobs: apiResponse.failed_jobs, - cancelled_jobs: apiResponse.cancelled_jobs, - requested_at: apiResponse.requested_at, - started_at: apiResponse.started_at ?? null, - completed_at: apiResponse.completed_at ?? null, - duration_ms: apiResponse.duration_ms ?? null, - cancelled: apiResponse.cancelled, - }; -} - -function transformPartitionSummary(apiResponse: any): DashboardPartition { - return { - partition_ref: apiResponse.partition_ref, // Keep as PartitionRef object - status_code: apiResponse.status_code, - status_name: apiResponse.status_name, - last_updated: apiResponse.last_updated ?? null, - build_requests: apiResponse.build_requests || [], - }; -} - -function transformJobSummary(apiResponse: JobSummary): DashboardJob { - return { - job_label: apiResponse.job_label, - total_runs: apiResponse.total_runs, - successful_runs: apiResponse.successful_runs, - failed_runs: apiResponse.failed_runs, - cancelled_runs: apiResponse.cancelled_runs, - last_run_timestamp: apiResponse.last_run_timestamp, - last_run_status_code: apiResponse.last_run_status_code, - last_run_status_name: apiResponse.last_run_status_name, - average_partitions_per_run: apiResponse.average_partitions_per_run, - recent_builds: apiResponse.recent_builds || [], - }; -} - -function transformActivityResponse(apiResponse: ActivityResponse): DashboardActivity { - return { - active_builds_count: apiResponse.active_builds_count, - recent_builds: apiResponse.recent_builds.map(transformBuildSummary), - recent_partitions: apiResponse.recent_partitions.map(transformPartitionSummary), - total_partitions_count: apiResponse.total_partitions_count, - system_status: apiResponse.system_status, - graph_name: apiResponse.graph_name, - }; -} - -// Test Data Mocks -const mockBuildSummary: BuildSummary = { - build_request_id: 'build-123', - status: {code: 4, name: 'COMPLETED'}, - requested_partitions: [{ str: 'partition-1' }, { str: 'partition-2' }], - total_jobs: 5, - completed_jobs: 5, - failed_jobs: 0, - cancelled_jobs: 0, - requested_at: 1640995200000000000, // 2022-01-01 00:00:00 UTC in nanos - started_at: 1640995260000000000, // 2022-01-01 00:01:00 UTC in nanos - completed_at: 1640995320000000000, // 2022-01-01 00:02:00 UTC in nanos - duration_ms: 60000, // 1 minute - cancelled: false -}; - -const mockPartitionSummary: any = { - partition_ref: { str: 'test-partition' }, - status_code: 4, // PARTITION_AVAILABLE - status_name: 'AVAILABLE', - last_updated: 1640995200000000000, - builds_count: 3, - invalidation_count: 0, - build_requests: ['build-123', 'build-124'], - last_successful_build: 'build-123' -}; - -const mockJobSummary: JobSummary = { - job_label: '//:test-job', - total_runs: 10, - successful_runs: 9, - failed_runs: 1, - cancelled_runs: 0, - average_partitions_per_run: 2.5, - last_run_timestamp: 1640995200000000000, - last_run_status_code: 3, // JOB_COMPLETED - last_run_status_name: 'COMPLETED', - recent_builds: ['build-123', 'build-124'] -}; - -const mockActivityResponse: ActivityResponse = { - active_builds_count: 2, - recent_builds: [mockBuildSummary], - recent_partitions: [mockPartitionSummary], - total_partitions_count: 100, - system_status: 'healthy', - graph_name: 'test-graph' -}; - -// Test Suite -o.spec('Transformation Functions', () => { - o('transformBuildSummary handles status fields correctly', () => { - const result = transformBuildSummary(mockBuildSummary); - - // The key fix: status_name should be a string, status_code a number - o(typeof result.status?.code).equals('number'); - o(typeof result.status?.name).equals('string'); - o(result.status.name).equals('COMPLETED'); - - // This should not throw (preventing the original runtime error) - o(() => result.status.name.toLowerCase()).notThrows('status_name.toLowerCase should work'); - }); - - o('transformBuildSummary handles null optional fields', () => { - const buildWithNulls: BuildSummary = { - ...mockBuildSummary, - started_at: null, - completed_at: null, - duration_ms: null - }; - - const result = transformBuildSummary(buildWithNulls); - - // Explicit null handling prevents undefined property access - o(result.started_at).equals(null); - o(result.completed_at).equals(null); - o(result.duration_ms).equals(null); - }); - - o('transformPartitionSummary preserves PartitionRef objects correctly', () => { - const result = transformPartitionSummary(mockPartitionSummary); - - // The key fix: partition_ref should remain as PartitionRef object - o(typeof result.partition_ref).equals('object'); - o(result.partition_ref.str).equals('test-partition'); - - // This should not throw (preventing original runtime errors) - o(() => result.partition_ref.str.toLowerCase()).notThrows('partition_ref.str.toLowerCase should work'); - }); - - o('transformPartitionSummary handles missing arrays safely', () => { - const partitionWithoutArray: any = { - ...mockPartitionSummary - }; - delete partitionWithoutArray.build_requests; - - const result = transformPartitionSummary(partitionWithoutArray); - - // Should default to empty array, preventing length/iteration errors - o(Array.isArray(result.build_requests)).equals(true); - o(result.build_requests.length).equals(0); - }); - - o('transformJobSummary handles status fields correctly', () => { - const result = transformJobSummary(mockJobSummary); - - // The key fix: both status code and name should be preserved - o(typeof result.last_run_status_code).equals('number'); - o(typeof result.last_run_status_name).equals('string'); - o(result.last_run_status_name).equals('COMPLETED'); - - // This should not throw - o(() => result.last_run_status_name.toLowerCase()).notThrows('last_run_status_name.toLowerCase should work'); - }); - - o('transformActivityResponse maintains structure consistency', () => { - const result = transformActivityResponse(mockActivityResponse); - - // Should pass our type guard - o(isDashboardActivity(result)).equals(true); - - // All nested objects should be properly transformed - o(result.recent_builds.length).equals(1); - o(typeof result.recent_builds[0]?.status.name).equals('string'); - - o(result.recent_partitions.length).equals(1); - o(typeof result.recent_partitions[0]?.partition_ref).equals('object'); - o(typeof result.recent_partitions[0]?.partition_ref.str).equals('string'); - }); - - o('transformations prevent original runtime failures', () => { - const result = transformActivityResponse(mockActivityResponse); - - // These are the exact patterns that caused runtime failures: - - // 1. status_name.toLowerCase() - should not crash - result.recent_builds.forEach((build: DashboardBuild) => { - o(() => build.status.name.toLowerCase()).notThrows('build.status.name.toLowerCase should work'); - o(build.status.name.toLowerCase()).equals('completed'); - }); - - // 2. partition_ref.str access - should access string property - result.recent_partitions.forEach((partition: DashboardPartition) => { - o(typeof partition.partition_ref).equals('object'); - o(typeof partition.partition_ref.str).equals('string'); - o(() => partition.partition_ref.str.toLowerCase()).notThrows('partition.partition_ref.str.toLowerCase should work'); - }); - - // 3. Null/undefined handling - should be explicit - result.recent_builds.forEach((build: DashboardBuild) => { - // These fields can be null but never undefined - o(build.started_at === null || typeof build.started_at === 'number').equals(true); - o(build.completed_at === null || typeof build.completed_at === 'number').equals(true); - o(build.duration_ms === null || typeof build.duration_ms === 'number').equals(true); - }); - }); -}); - -// Edge Cases and Error Conditions -o.spec('Transformation Edge Cases', () => { - o('handles empty arrays correctly', () => { - const emptyActivity: ActivityResponse = { - ...mockActivityResponse, - recent_builds: [], - recent_partitions: [] - }; - - const result = transformActivityResponse(emptyActivity); - - o(Array.isArray(result.recent_builds)).equals(true); - o(result.recent_builds.length).equals(0); - o(Array.isArray(result.recent_partitions)).equals(true); - o(result.recent_partitions.length).equals(0); - }); - - o('handles malformed PartitionRef gracefully', () => { - const malformedPartition: any = { - ...mockPartitionSummary, - partition_ref: { str: '' } // Empty string - }; - - const result = transformPartitionSummary(malformedPartition); - - o(typeof result.partition_ref.str).equals('string'); - o(result.partition_ref.str).equals(''); - }); - - o('transformations produce valid dashboard types', () => { - // Test that all transformation results pass type guards - const transformedBuild = transformBuildSummary(mockBuildSummary); - const transformedPartition = transformPartitionSummary(mockPartitionSummary); - const transformedJob = transformJobSummary(mockJobSummary); - const transformedActivity = transformActivityResponse(mockActivityResponse); - - o(isDashboardBuild(transformedBuild)).equals(true); - o(isDashboardPartition(transformedPartition)).equals(true); - o(isDashboardJob(transformedJob)).equals(true); - o(isDashboardActivity(transformedActivity)).equals(true); - }); -}); - -// Performance and Memory Tests -o.spec('Transformation Performance', () => { - o('transforms large datasets efficiently', () => { - const largeActivity: ActivityResponse = { - ...mockActivityResponse, - recent_builds: Array(1000).fill(mockBuildSummary), - recent_partitions: Array(1000).fill(mockPartitionSummary) - }; - - const start = Date.now(); - const result = transformActivityResponse(largeActivity); - const duration = Date.now() - start; - - // Should complete transformation in reasonable time - o(duration < 1000).equals(true); // Less than 1 second - o(result.recent_builds.length).equals(1000); - o(result.recent_partitions.length).equals(1000); - }); -}); - -// Export default removed - tests are run by importing this file \ No newline at end of file diff --git a/databuild/dashboard/tsconfig_app.json b/databuild/dashboard/tsconfig_app.json deleted file mode 100644 index 0807b5d..0000000 --- a/databuild/dashboard/tsconfig_app.json +++ /dev/null @@ -1,21 +0,0 @@ -{ - "compilerOptions": { - "target": "es2016", /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */ - "lib": ["es6","dom", "es2021"], /* Specify a set of bundled library declaration files that describe the target runtime environment. */ - "module": "commonjs", /* Specify what module code is generated. */ - "rootDir": "./", /* Specify the root folder within your source files. */ - "moduleResolution": "node", /* Specify how TypeScript looks up a file from a given module specifier. */ - "resolveJsonModule": true, /* Enable importing .json files. */ - "allowJs": true, /* Allow JavaScript files to be a part of your program. Use the 'checkJS' option to get errors from these files. */ - "inlineSourceMap": true, /* Include sourcemap files inside the emitted JavaScript. */ - "esModuleInterop": true, /* Emit additional JavaScript to ease support for importing CommonJS modules. This enables 'allowSyntheticDefaultImports' for type compatibility. */ - "forceConsistentCasingInFileNames": true, /* Ensure that casing is correct in imports. */ - "strict": true, /* Enable all strict type-checking options. */ - "noImplicitAny": true, /* Enable error reporting for expressions and declarations with an implied 'any' type. */ - "strictNullChecks": true, /* Enable error reporting for null and undefined values. */ - "noImplicitReturns": true, /* Enable error reporting for codepaths that do not explicitly return. */ - "noUncheckedIndexedAccess": true, /* Add 'undefined' to index signature results. */ - "exactOptionalPropertyTypes": true, /* Ensure optional property types are exact. */ - "skipLibCheck": true /* Skip type checking all .d.ts files. */ - } -} \ No newline at end of file diff --git a/databuild/dashboard/tsconfig_test.json b/databuild/dashboard/tsconfig_test.json deleted file mode 100644 index e092559..0000000 --- a/databuild/dashboard/tsconfig_test.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "compilerOptions": { - "target": "es2016", /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */ - "lib": ["es6","dom"], /* Specify a set of bundled library declaration files that describe the target runtime environment. */ - "module": "commonjs", /* Specify what module code is generated. */ - "rootDir": "./", /* Specify the root folder within your source files. */ - "moduleResolution": "node", /* Specify how TypeScript looks up a file from a given module specifier. */ - "baseUrl": "./", /* Specify the base directory to resolve non-relative module names. */ - "resolveJsonModule": true, /* Enable importing .json files. */ - "allowJs": true, /* Allow JavaScript files to be a part of your program. Use the 'checkJS' option to get errors from these files. */ - "inlineSourceMap": true, /* Include sourcemap files inside the emitted JavaScript. */ - "esModuleInterop": true, /* Emit additional JavaScript to ease support for importing CommonJS modules. This enables 'allowSyntheticDefaultImports' for type compatibility. */ - "forceConsistentCasingInFileNames": true, /* Ensure that casing is correct in imports. */ - "strict": true, /* Enable all strict type-checking options. */ - "noImplicitAny": true, /* Enable error reporting for expressions and declarations with an implied 'any' type. */ - "strictNullChecks": true, /* Enable error reporting for null and undefined values. */ - "noImplicitReturns": true, /* Enable error reporting for codepaths that do not explicitly return. */ - "noUncheckedIndexedAccess": true, /* Add 'undefined' to index signature results. */ - "exactOptionalPropertyTypes": true, /* Ensure optional property types are exact. */ - "skipLibCheck": true /* Skip type checking all .d.ts files. */ - } -} diff --git a/databuild/dashboard/types.ts b/databuild/dashboard/types.ts deleted file mode 100644 index 8dc82c3..0000000 --- a/databuild/dashboard/types.ts +++ /dev/null @@ -1,285 +0,0 @@ -import m from 'mithril'; -import { - ActivityResponse, - ActivityApiResponse, - BuildSummary, - BuildDetailResponse, - PartitionSummary, - PartitionDetailResponse, - PartitionEventsResponse, - JobSummary, - JobMetricsResponse, - JobDailyStats, - JobRunSummary, - PartitionRef, - BuildRequestStatus -} from '../client/typescript_generated/src/index'; - -// Dashboard-optimized types - canonical frontend types independent of backend schema -// These types prevent runtime errors by ensuring consistent data shapes throughout components - -export interface DashboardBuild { - build_request_id: string; - status: BuildRequestStatus; - requested_partitions: PartitionRef[]; - total_jobs: number; - completed_jobs: number; - failed_jobs: number; - cancelled_jobs: number; - requested_at: number; - started_at: number | null; - completed_at: number | null; - duration_ms: number | null; - cancelled: boolean; -} - -export interface DashboardPartition { - partition_ref: PartitionRef; - status_code: number; - status_name: string; - last_updated: number | null; - build_requests: string[]; -} - -export interface DashboardJob { - job_label: string; - total_runs: number; - successful_runs: number; - failed_runs: number; - cancelled_runs: number; - last_run_timestamp: number; - last_run_status_code: number; - last_run_status_name: string; - average_partitions_per_run: number; - recent_builds: string[]; -} - -export interface DashboardActivity { - active_builds_count: number; - recent_builds: DashboardBuild[]; - recent_partitions: DashboardPartition[]; - total_partitions_count: number; - system_status: string; - graph_name: string; -} - -// Dashboard timeline event types for consistent UI handling -export interface DashboardBuildTimelineEvent { - timestamp: number; - status: BuildRequestStatus; - message: string; - event_type: string; - cancel_reason?: string; -} - -export interface DashboardPartitionTimelineEvent { - timestamp: number; - status: BuildRequestStatus; - message: string; - build_request_id: string; - job_run_id?: string; -} - -// Generic typed component interface that extends Mithril's component -// Uses intersection type to allow arbitrary properties while ensuring type safety for lifecycle methods -export interface TypedComponent extends Record { - oninit?(vnode: m.Vnode): void; - oncreate?(vnode: m.VnodeDOM): void; - onupdate?(vnode: m.VnodeDOM): void; - onbeforeremove?(vnode: m.VnodeDOM): Promise | void; - onremove?(vnode: m.VnodeDOM): void; - onbeforeupdate?(vnode: m.Vnode, old: m.VnodeDOM): boolean | void; - view(vnode: m.Vnode): m.Children; -} - -// Helper type for typed vnodes -export type TypedVnode = m.Vnode; -export type TypedVnodeDOM = m.VnodeDOM; - -// Route parameter types -export interface RouteParams { - [key: string]: string; -} - -export interface BuildRouteParams extends RouteParams { - id: string; -} - -export interface PartitionRouteParams extends RouteParams { - base64_ref: string; -} - -export interface JobRouteParams extends RouteParams { - label: string; -} - -// Component attribute interfaces that reference OpenAPI types - -export interface RecentActivityAttrs { - // No external attrs needed - component manages its own data loading -} - -export interface BuildStatusAttrs { - id: string; -} - -export interface PartitionStatusAttrs { - base64_ref: string; -} - -export interface PartitionsListAttrs { - // No external attrs needed - component manages its own data loading -} - -export interface JobsListAttrs { - // No external attrs needed - component manages its own data loading -} - -export interface JobMetricsAttrs { - label: string; -} - -export interface GraphAnalysisAttrs { - // No external attrs needed for now -} - -// Badge component attribute interfaces with OpenAPI type constraints - -export interface BuildStatusBadgeAttrs { - status: string; // This should be constrained to BuildSummary status values - size?: 'xs' | 'sm' | 'md' | 'lg'; - class?: string; -} - -export interface PartitionStatusBadgeAttrs { - status: string; // This should be constrained to PartitionSummary status values - size?: 'xs' | 'sm' | 'md' | 'lg'; - class?: string; -} - -export interface EventTypeBadgeAttrs { - eventType: string; // This should be constrained to known event types - size?: 'xs' | 'sm' | 'md' | 'lg'; - class?: string; -} - -// Layout wrapper attributes -export interface LayoutWrapperAttrs { - // Layout wrapper will pass through attributes to wrapped component - [key: string]: any; -} - -// Data types for component state (using Dashboard types for consistency) -export interface RecentActivityData { - data: DashboardActivity | null; - loading: boolean; - error: string | null; -} - -export interface BuildStatusData { - data: DashboardBuild | null; - partitionStatuses: Map; // Key is partition_ref.str - timeline: DashboardBuildTimelineEvent[]; - loading: boolean; - error: string | null; - buildId: string; -} - -export interface PartitionStatusData { - data: DashboardPartition | null; - timeline: DashboardPartitionTimelineEvent[]; - loading: boolean; - error: string | null; - partitionRef: string; - buildHistory: DashboardBuild[]; -} - -export interface JobsListData { - jobs: DashboardJob[]; - searchTerm: string; - loading: boolean; - error: string | null; - searchTimeout: NodeJS.Timeout | null; -} - -export interface JobMetricsData { - jobLabel: string; - job: DashboardJob | null; - loading: boolean; - error: string | null; -} - -// Utility type for creating typed components -export type CreateTypedComponent = TypedComponent; - -/* -## Dashboard Type Transformation Rationale - -The dashboard types provide a stable interface between the OpenAPI-generated types and UI components: - -1. **Explicit Null Handling**: Protobuf optional fields become `T | null` instead of `T | undefined` - to ensure consistent null checking throughout the application. - -2. **Type Safety**: Keep protobuf structure (PartitionRef objects, status codes) to maintain - type safety from backend to frontend. Only convert to display strings in components. - -3. **Clear Boundaries**: Dashboard types are the contract between services and components. - Services handle API responses, components handle presentation. - -Key principles: -- Preserve protobuf structure for type safety -- Explicit null handling for optional fields -- Convert to display strings only at the UI layer -- Consistent types prevent runtime errors -*/ - -// Type guards and validators for Dashboard types -export function isDashboardActivity(data: any): data is DashboardActivity { - return data && - typeof data.active_builds_count === 'number' && - typeof data.graph_name === 'string' && - Array.isArray(data.recent_builds) && - Array.isArray(data.recent_partitions) && - typeof data.system_status === 'string' && - typeof data.total_partitions_count === 'number'; -} - -export function isDashboardBuild(data: any): data is DashboardBuild { - return data && - typeof data.build_request_id === 'string' && - typeof data.status?.code === 'number' && - typeof data.status?.name === 'string' && - typeof data.requested_at === 'number' && - Array.isArray(data.requested_partitions); -} - -export function isDashboardPartition(data: any): data is DashboardPartition { - return data && - data.partition_ref && - typeof data.partition_ref.str === 'string' && - typeof data.status_code === 'number' && - typeof data.status_name === 'string' && - (data.last_updated === null || typeof data.last_updated === 'number') && - Array.isArray(data.build_requests); -} - -export function isDashboardJob(data: any): data is DashboardJob { - return data && - typeof data.job_label === 'string' && - typeof data.total_runs === 'number' && - typeof data.last_run_status_code === 'number' && - typeof data.last_run_status_name === 'string' && - Array.isArray(data.recent_builds); -} - -// Helper function to create type-safe Mithril components -export function createTypedComponent( - component: TypedComponent -): m.Component { - return component as m.Component; -} - -// Helper for type-safe route handling -export function getTypedRouteParams(vnode: m.Vnode): T { - return vnode.attrs; -} \ No newline at end of file diff --git a/databuild/dashboard/utils.test.ts b/databuild/dashboard/utils.test.ts deleted file mode 100644 index 774c0df..0000000 --- a/databuild/dashboard/utils.test.ts +++ /dev/null @@ -1,52 +0,0 @@ -import o from 'ospec'; - -// Inline the utils functions for testing since we can't import from the app module in tests -function encodePartitionRef(ref: string): string { - return btoa(ref).replace(/\+/g, '-').replace(/\//g, '_').replace(/=/g, ''); -} - -function decodePartitionRef(encoded: string): string { - // Add padding if needed - const padding = '='.repeat((4 - (encoded.length % 4)) % 4); - const padded = encoded.replace(/-/g, '+').replace(/_/g, '/') + padding; - return atob(padded); -} - -o.spec('URL Encoding Utils', () => { - o('should encode and decode partition references correctly', () => { - const testCases = [ - 'simple/partition', - 'complex/partition/with/slashes', - 'partition+with+plus', - 'partition=with=equals', - 'partition_with_underscores', - 'partition-with-dashes', - 'partition/with/mixed+symbols=test_case-123', - ]; - - testCases.forEach(original => { - const encoded = encodePartitionRef(original); - const decoded = decodePartitionRef(encoded); - - o(decoded).equals(original)(`Failed for: ${original}`); - - // Encoded string should be URL-safe (no +, /, or = characters) - o(encoded.includes('+')).equals(false)(`Encoded string contains +: ${encoded}`); - o(encoded.includes('/')).equals(false)(`Encoded string contains /: ${encoded}`); - o(encoded.includes('=')).equals(false)(`Encoded string contains =: ${encoded}`); - }); - }); - - o('should handle empty string', () => { - const encoded = encodePartitionRef(''); - const decoded = decodePartitionRef(encoded); - o(decoded).equals(''); - }); - - o('should handle special characters', () => { - const special = 'test/path?query=value&other=123#fragment'; - const encoded = encodePartitionRef(special); - const decoded = decodePartitionRef(encoded); - o(decoded).equals(special); - }); -}); \ No newline at end of file diff --git a/databuild/dashboard/utils.ts b/databuild/dashboard/utils.ts deleted file mode 100644 index 241f3cb..0000000 --- a/databuild/dashboard/utils.ts +++ /dev/null @@ -1,108 +0,0 @@ -// URL encoding utilities for partition references -export function encodePartitionRef(ref: string): string { - return btoa(ref).replace(/\+/g, '-').replace(/\//g, '_').replace(/=/g, ''); -} - -export function decodePartitionRef(encoded: string): string { - // Add padding if needed - const padding = '='.repeat((4 - (encoded.length % 4)) % 4); - const padded = encoded.replace(/-/g, '+').replace(/_/g, '/') + padding; - return atob(padded); -} - -// Job label encoding utilities (same pattern as partition refs) -export function encodeJobLabel(label: string): string { - return btoa(label).replace(/\+/g, '-').replace(/\//g, '_').replace(/=/g, ''); -} - -export function decodeJobLabel(encoded: string): string { - // Add padding if needed - const padding = '='.repeat((4 - (encoded.length % 4)) % 4); - const padded = encoded.replace(/-/g, '+').replace(/_/g, '/') + padding; - return atob(padded); -} - -import m from 'mithril'; -import { - TypedComponent, - BuildStatusBadgeAttrs, - PartitionStatusBadgeAttrs, - EventTypeBadgeAttrs, - createTypedComponent -} from './types'; - -// Mithril components for status badges - encapsulates both logic and presentation - -export const BuildStatusBadge: TypedComponent = { - view(vnode: m.Vnode) { - const { status, size = 'sm', class: className, ...attrs } = vnode.attrs; - const normalizedStatus = status.toLowerCase(); - - let badgeClass = 'badge-neutral'; - if (normalizedStatus.includes('completed')) { - badgeClass = 'badge-success'; - } else if (normalizedStatus.includes('executing') || normalizedStatus.includes('planning')) { - badgeClass = 'badge-warning'; - } else if (normalizedStatus.includes('received')) { - badgeClass = 'badge-info'; - } else if (normalizedStatus.includes('failed') || normalizedStatus.includes('cancelled')) { - badgeClass = 'badge-error'; - } - - return m(`span.badge.badge-${size}.${badgeClass}`, { class: className, ...attrs }, status); - } -}; - -export const PartitionStatusBadge: TypedComponent = { - view(vnode: m.Vnode) { - const { status, size = 'sm', class: className, ...attrs } = vnode.attrs; - if (!status) { - return m(`span.badge.badge-${size}.badge-neutral`, { class: className, ...attrs }, 'Unknown'); - } - - const normalizedStatus = status.toLowerCase(); - let badgeClass = 'badge-neutral'; - - if (normalizedStatus.includes('available')) { - badgeClass = 'badge-success'; - } else if (normalizedStatus.includes('building') || normalizedStatus.includes('analyzed')) { - badgeClass = 'badge-warning'; - } else if (normalizedStatus.includes('requested') || normalizedStatus.includes('delegated')) { - badgeClass = 'badge-info'; - } else if (normalizedStatus.includes('failed')) { - badgeClass = 'badge-error'; - } - - return m(`span.badge.badge-${size}.${badgeClass}`, { class: className, ...attrs }, status); - } -}; - -export const EventTypeBadge: TypedComponent = { - view(vnode: m.Vnode) { - const { eventType, size = 'sm', class: className, ...attrs } = vnode.attrs; - - let badgeClass = 'badge-ghost'; - let displayName = eventType; - - switch (eventType) { - case 'build_request': - badgeClass = 'badge-primary'; - displayName = 'Build'; - break; - case 'job': - badgeClass = 'badge-secondary'; - displayName = 'Job'; - break; - case 'partition': - badgeClass = 'badge-accent'; - displayName = 'Partition'; - break; - case 'delegation': - badgeClass = 'badge-info'; - displayName = 'Delegation'; - break; - } - - return m(`span.badge.badge-${size}.${badgeClass}`, { class: className, ...attrs }, displayName); - } -}; \ No newline at end of file diff --git a/databuild/databuild.proto b/databuild/databuild.proto index 57e3b34..e6f499e 100644 --- a/databuild/databuild.proto +++ b/databuild/databuild.proto @@ -2,1086 +2,129 @@ syntax = "proto3"; package databuild.v1; +// Core Build Event Log (BEL) + message PartitionRef { - string str = 1; + string ref = 1; } -/////////////////////////////////////////////////////////////////////////////////////////////// -// Jobs -/////////////////////////////////////////////////////////////////////////////////////////////// - -// -// Job Config -// - -// The type of dependency -enum DepType { - QUERY = 0; // Default - MATERIALIZE = 1; +// The base event for all events written to the BEL +message DataBuildEvent { + uint64 timestamp = 1; + uint64 event_id = 2; + oneof event { + // Job run events + JobRunBufferEvent job_run_buffer = 3; + JobRunQueueEvent job_run_queue = 4; + JobRunStartEvent job_run_started = 5; + JobRunHeartbeatEvent job_run_heartbeat = 6; + JobRunSuccessEvent job_run_success = 7; + JobRunFailureEvent job_run_failure = 8; + JobRunCancelEvent job_run_cancel = 9; + JobRunMissingDepsEvent job_run_missing_deps = 10; + // Want events + WantCreateEvent want_create = 11; + WantCancelEvent want_cancel = 12; + // Taint events + TaintCreateEvent taint_create = 13; + TaintDeleteEvent taint_delete = 14; + } } -// Represents a data dependency -message DataDep { - DepType dep_type_code = 1; // Enum for programmatic use - string dep_type_name = 2; // Human-readable string ("query", "materialize") - PartitionRef partition_ref = 3; // Moved from field 2 to 3 -} +message JobRunBufferEvent {} +message JobRunQueueEvent {} +message JobRunStartEvent {} +message JobRunHeartbeatEvent {} +message JobRunSuccessEvent {} +message JobRunFailureEvent {} +message JobRunCancelEvent {} +message JobRunMissingDepsEvent {} -// Configuration for a job -message JobConfig { - // The partitions that this parameterization produces - repeated PartitionRef outputs = 1; - - // Required data dependencies - repeated DataDep inputs = 2; - - // Command line arguments - repeated string args = 3; - - // Environment variables - map env = 4; -} - -// Request message for job configuration service -message JobConfigureRequest { repeated PartitionRef outputs = 1; } - -// Response message for job configuration service -message JobConfigureResponse { repeated JobConfig configs = 1; } - -// Implemented by the job.cfg bazel rule -service JobConfigure { - rpc Configure(JobConfigureRequest) returns (JobConfigureResponse); -} - -// -// Job Exec -// - -// Manifest that records the literal partitions consumed (and their manifests) in order to -// produce the specified partitions -message PartitionManifest { - // The refs of the partitions produced by this job - repeated PartitionRef outputs = 1; - - // Input partition manifests - repeated PartitionManifest inputs = 2; - - // Start time of job execution (Unix timestamp seconds) - int64 start_time = 3; - - // End time of job execution (Unix timestamp seconds) - int64 end_time = 4; - - // The configuration used to run the job - Task task = 5; - - // Arbitrary metadata about the produced partitions, keyed by partition ref - map metadata = 6; -} - -message JobExecuteRequest { repeated PartitionRef outputs = 1; } - -// Metadata for the complete set of partitions produced by this job -message JobExecuteResponse { repeated PartitionManifest manifests = 1; } - -// Implemented by the job.exec bazel rule -service JobExecute { - rpc Execute(JobExecuteRequest) returns (JobExecuteResponse); -} - -/////////////////////////////////////////////////////////////////////////////////////////////// -// Graphs -/////////////////////////////////////////////////////////////////////////////////////////////// - -// -// GraphLookup -// - -message JobLabel { - // The bazel label the references the job_target - string label = 1; -} - -message GraphLookupRequest { repeated PartitionRef outputs = 1; } - -// Represents a not-yet configured task -message TaskRef { - // The job whose configure/exec targets will be used - JobLabel job = 1; - - // The partition refs this task is responsible for producing, and with which the configure - // target will be invoked - repeated PartitionRef outputs = 2; -} - -// Represents the complete set of tasks needed to produce the requested partitions -message GraphLookupResponse { repeated TaskRef task_refs = 1; } - -// Implemented per graph -service GraphLookup { - rpc Lookup(GraphLookupRequest) returns (GraphLookupResponse); -} - -// Request message for graph analyze service -message GraphAnalyzeRequest { repeated PartitionRef outputs = 1; } - -// -// JobGraph -// - -message Task { - // The bazel label uniquely identifying the job - JobLabel job = 1; - - // The configuration for the job - JobConfig config = 2; -} - -// The bazel label referencing the graph -message GraphLabel { string label = 1; } - -// Represents a job graph -message JobGraph { - // The bazel label of the graph to be executed - GraphLabel label = 1; - - // The output partitions to be produced by this graph - repeated PartitionRef outputs = 2; - - // The job configurations that make up this graph - repeated Task nodes = 3; -} - -// Response message for graph analyze service -message GraphAnalyzeResponse { JobGraph graph = 1; } - -message GraphExecuteResponse { repeated PartitionManifest manifests = 1; } -message GraphBuildRequest { repeated PartitionRef outputs = 1; } -message GraphBuildResponse { repeated PartitionManifest manifests = 1; } - -/////////////////////////////////////////////////////////////////////////////////////////////// -// Build Event Log -/////////////////////////////////////////////////////////////////////////////////////////////// - -// Filter for querying build events -message EventFilter { - repeated string partition_refs = 1; - repeated string partition_patterns = 2; - repeated string job_labels = 3; - repeated string job_run_ids = 4; - repeated string build_request_ids = 5; -} - -// Paginated response for build events -message EventPage { - repeated BuildEvent events = 1; - int64 next_idx = 2; - bool has_more = 3; -} - -// Partition lifecycle states -enum PartitionStatus { - PARTITION_UNKNOWN = 0; - PARTITION_REQUESTED = 1; // Partition requested but not yet analyzed - PARTITION_ANALYZED = 2; // Partition analyzed successfully - PARTITION_BUILDING = 3; // Job actively building this partition - PARTITION_AVAILABLE = 4; // Partition successfully built and available - PARTITION_FAILED = 5; // Partition build failed - PARTITION_DELEGATED = 6; // Request delegated to existing build -} - -// Job execution lifecycle -enum JobStatus { - JOB_UNKNOWN = 0; - JOB_SCHEDULED = 1; // Job scheduled for execution - JOB_RUNNING = 2; // Job actively executing - JOB_COMPLETED = 3; // Job completed successfully - JOB_FAILED = 4; // Job execution failed - JOB_CANCELLED = 5; // Job execution cancelled - JOB_SKIPPED = 6; // Job skipped because target partitions already available -} - -// Build request lifecycle -enum BuildRequestStatusCode { - // Not good - BUILD_REQUEST_UNKNOWN = 0; - // Build request received - BUILD_REQUEST_RECEIVED = 1; - // Graph analysis in progress - BUILD_REQUEST_PLANNING = 2; - // Graph analysis completed successfully - BUILD_REQUEST_ANALYSIS_COMPLETED = 7; - // Jobs are being executed - BUILD_REQUEST_EXECUTING = 3; - // All requested partitions built - BUILD_REQUEST_COMPLETED = 4; - // Build precondition failed (e.g. required external data was not available) - BUILD_REQUEST_PRECONDITION_FAILED = 8; - // Build request failed - BUILD_REQUEST_FAILED = 5; - // Build request cancelled - BUILD_REQUEST_CANCELLED = 6; -} - -message BuildRequestStatus { - // Enum for programmatic use - BuildRequestStatusCode code = 1; - // Human readable string - string name = 2; -} - -// Build request lifecycle event -message BuildRequestEvent { - // The status that this event indicates - BuildRequestStatus status = 1; - // Output partitions requested to be built as part of this build - repeated PartitionRef requested_partitions = 3; - // Optional status message - string message = 4; - // The comment attached to the request - contains arbitrary text - optional string comment = 5; - // The id of the want that triggered this build - optional string want_id = 6; -} - -// Partition state change event -message PartitionEvent { - PartitionRef partition_ref = 1; - PartitionStatus status_code = 2; // Enum for programmatic use - string status_name = 3; // Human-readable string - string message = 4; // Optional status message - string job_run_id = 5; // UUID of job run producing this partition (if applicable) -} - -// Job execution event -message JobEvent { - string job_run_id = 1; // UUID for this job run - JobLabel job_label = 2; // Job being executed - repeated PartitionRef target_partitions = 3; // Partitions this job run produces - JobStatus status_code = 4; // Enum for programmatic use - string status_name = 5; // Human-readable string - string message = 6; // Optional status message - JobConfig config = 7; // Job configuration used (for SCHEDULED events) - repeated PartitionManifest manifests = 8; // Results (for COMPLETED events) -} - -// Delegation event (when build request delegates to existing build) -message DelegationEvent { - PartitionRef partition_ref = 1; - string delegated_to_build_request_id = 2; // Build request handling this partition - string message = 3; // Optional message -} - -// Job graph analysis result event (stores the analyzed job graph) -message JobGraphEvent { - JobGraph job_graph = 1; // The analyzed job graph - string message = 2; // Optional message -} - -// Partition invalidation event -message PartitionInvalidationEvent { - PartitionRef partition_ref = 1; // Partition being invalidated - string reason = 2; // Reason for invalidation -} - -// Job run cancellation event -message JobRunCancelEvent { - string job_run_id = 1; // UUID of the job run being cancelled - string reason = 2; // Reason for cancellation -} - -// Build cancellation event -message BuildCancelEvent { - string reason = 1; // Reason for cancellation -} - -message WantEvent { - repeated PartitionRef requested_partitions = 1; - // Unique identifier - string want_id = 2; - // How this want was created - WantSource source = 3; - string comment = 4; -} - -message PartitionWant { +message WantCreateEvent { string want_id = 1; - // The ref we want to materialize - PartitionRef ref = 2; - // Server time when want registered - uint64 created_at = 3; - // Business time this partition represents - uint64 data_timestamp = 4; - // Give up after this long (from created_at) - optional uint64 ttl_seconds = 5; - // SLA violation after this long (from data_timestamp) - optional uint64 sla_seconds = 6; - // Cross-graph dependencies determined in the analysis phase triggered upon want submission - // These are per-partition, since wants can be partially, marginally materialized - repeated string external_dependencies = 7; + string root_want_id = 2; + string parent_want_id = 3; + repeated PartitionRef partitions = 4; + uint64 data_timestamp = 5; + uint64 ttl_seconds = 6; + uint64 sla_seconds = 7; + WantSource source = 8; + optional string comment = 9; } - message WantSource { - // The source of the want - SourceType source_type = 1; - - // TODO implement something to record want actual want source for external requests when we have real use case + WantSourceType source_type = 1; + string source_name = 2; } - -message SourceType { - SourceTypeCode code = 1; +message WantSourceType { + WantSourceCode code = 1; string name = 2; } - -enum SourceTypeCode { - // Manual CLI request - CLI_MANUAL = 0; - // Manual dashboard request - DASHBOARD_MANUAL = 1; - // Scheduled/triggered job - SCHEDULED = 2; - // External API call - API_REQUEST = 3; +enum WantSourceCode{ + Manual = 0; + Automated = 1; + Propagated = 2; } - - -// Marks a partition as tainted, so that it will be rebuilt if a data dep points to it, and will be rebuilt if a live -// want points to it. -message TaintEvent { - // The list of partitions to be tainted - repeated PartitionRef refs = 1; - // When the taint was created - uint64 created_at = 2; - // The source of the taint event - SourceType source_type = 3; - // Free text comment attached to the taint - string comment = 4; -} - -// Individual build event -message BuildEvent { - // Event metadata - string event_id = 1; // UUID for this event - int64 timestamp = 2; // Unix timestamp (nanoseconds) - optional string build_request_id = 3; - - // Event type and payload (one of) - oneof event_type { - BuildRequestEvent build_request_event = 10; - PartitionEvent partition_event = 11; - JobEvent job_event = 12; - DelegationEvent delegation_event = 13; - JobGraphEvent job_graph_event = 14; - PartitionInvalidationEvent partition_invalidation_event = 15; - JobRunCancelEvent job_run_cancel_event = 16; - BuildCancelEvent build_cancel_event = 17; - WantEvent want_event = 18; - TaintEvent taint_event = 19; - } -} - -/////////////////////////////////////////////////////////////////////////////////////////////// -// Job Wrapper Log Protocol -/////////////////////////////////////////////////////////////////////////////////////////////// - -// Structured log entry emitted by job wrapper to stdout -message JobLogEntry { - string timestamp = 1; // Unix timestamp - string job_id = 2; // UUID for this job execution - repeated PartitionRef outputs = 3; // Partitions being processed by this job - uint64 sequence_number = 4; // Monotonic sequence starting from 1 - - oneof content { - LogMessage log = 5; - MetricPoint metric = 6; - WrapperJobEvent job_event = 7; // Wrapper-specific job events - PartitionManifest manifest = 8; - } -} - -// Log message from job stdout/stderr -message LogMessage { - enum LogLevel { - DEBUG = 0; - INFO = 1; - WARN = 2; - ERROR = 3; - } - LogLevel level = 1; - string message = 2; - map fields = 3; -} - -// Metric point emitted by job -message MetricPoint { - string name = 1; - double value = 2; - map labels = 3; - string unit = 4; -} - -// Job wrapper event (distinct from build event log JobEvent) -message WrapperJobEvent { - string event_type = 1; // "config_validate_success", "task_launch_success", etc - map metadata = 2; - optional string job_status = 3; // JobStatus enum as string - optional int32 exit_code = 4; - optional string job_label = 5; // Job label for low-cardinality metrics -} - -/////////////////////////////////////////////////////////////////////////////////////////////// -// List Operations (Unified CLI/Service Responses) -/////////////////////////////////////////////////////////////////////////////////////////////// - -// -// Partitions List -// - -message PartitionsListRequest { - optional uint32 limit = 1; - optional uint32 offset = 2; - optional string status_filter = 3; -} - -message PartitionsListResponse { - repeated PartitionSummary partitions = 1; - uint32 total_count = 2; - bool has_more = 3; -} - -message PartitionSummary { - PartitionRef partition_ref = 1; - PartitionStatus status_code = 2; // Enum for programmatic use - string status_name = 3; // Human-readable string - int64 last_updated = 4; - uint32 builds_count = 5; - uint32 invalidation_count = 6; - optional string last_successful_build = 7; -} - -// -// Jobs List -// - -message JobsListRequest { - optional uint32 limit = 1; - optional string search = 2; -} - -message JobsListResponse { - repeated JobSummary jobs = 1; - uint32 total_count = 2; -} - -message JobSummary { - string job_label = 1; - uint32 total_runs = 2; - uint32 successful_runs = 3; - uint32 failed_runs = 4; - uint32 cancelled_runs = 5; - double average_partitions_per_run = 6; - int64 last_run_timestamp = 7; - JobStatus last_run_status_code = 8; // Enum for programmatic use - string last_run_status_name = 9; // Human-readable string - repeated string recent_builds = 10; -} - -// -// Job Runs List -// - -message JobRunsListRequest { - optional uint32 limit = 1; -} - -message JobRunsListResponse { - repeated JobRunSummary tasks = 1; - uint32 total_count = 2; -} - -message JobRunSummary { - string job_run_id = 1; - string job_label = 2; - string build_request_id = 3; - JobStatus status_code = 4; // Enum for programmatic use - string status_name = 5; // Human-readable string - repeated PartitionRef target_partitions = 6; - int64 scheduled_at = 7; - optional int64 started_at = 8; - optional int64 completed_at = 9; - optional int64 duration_ms = 10; - bool cancelled = 11; - string message = 12; -} - -// -// Builds List -// - -message BuildsListRequest { - optional uint32 limit = 1; - optional uint32 offset = 2; - optional string status_filter = 3; -} - -message BuildsListResponse { - repeated BuildSummary builds = 1; - uint32 total_count = 2; - bool has_more = 3; -} - -message BuildSummary { - string build_request_id = 1; - BuildRequestStatus status = 2; - repeated PartitionRef requested_partitions = 3; - uint32 total_jobs = 4; - uint32 completed_jobs = 5; - uint32 failed_jobs = 6; - uint32 cancelled_jobs = 7; - int64 requested_at = 8; - optional int64 started_at = 9; - optional int64 completed_at = 10; - optional int64 duration_ms = 11; - bool cancelled = 12; - optional string comment = 13; -} - -// -// Activity Summary -// - -message ActivityResponse { - uint32 active_builds_count = 1; - repeated BuildSummary recent_builds = 2; - repeated PartitionSummary recent_partitions = 3; - uint32 total_partitions_count = 4; - string system_status = 5; - string graph_name = 6; -} - -/////////////////////////////////////////////////////////////////////////////////////////////// -// Detail Operations (Unified CLI/Service Detail Responses) -/////////////////////////////////////////////////////////////////////////////////////////////// - -// -// Build Detail -// - -message BuildDetailRequest { - string build_request_id = 1; -} - -message BuildDetailResponse { - string build_request_id = 1; - BuildRequestStatus status = 2; - repeated PartitionRef requested_partitions = 3; - uint32 total_jobs = 4; - uint32 completed_jobs = 5; - uint32 failed_jobs = 6; - uint32 cancelled_jobs = 7; - int64 requested_at = 8; - optional int64 started_at = 9; - optional int64 completed_at = 10; - optional int64 duration_ms = 11; - bool cancelled = 12; - optional string cancel_reason = 13; - repeated BuildTimelineEvent timeline = 14; -} - -message BuildTimelineEvent { - int64 timestamp = 1; - optional BuildRequestStatus status = 2; - string message = 3; - string event_type = 4; - optional string cancel_reason = 5; -} - -// -// Partition Detail -// - -message PartitionDetailRequest { - PartitionRef partition_ref = 1; -} - -message PartitionDetailResponse { - PartitionRef partition_ref = 1; - PartitionStatus status_code = 2; // Enum for programmatic use - string status_name = 3; // Human-readable string - int64 last_updated = 4; - uint32 builds_count = 5; - optional string last_successful_build = 6; - uint32 invalidation_count = 7; - repeated PartitionTimelineEvent timeline = 8; -} - -message PartitionTimelineEvent { - int64 timestamp = 1; - PartitionStatus status_code = 2; // Enum for programmatic use - string status_name = 3; // Human-readable string - string message = 4; - string build_request_id = 5; - optional string job_run_id = 6; -} - -// -// Job Detail -// - -message JobDetailRequest { - string job_label = 1; -} - -message JobDetailResponse { - string job_label = 1; - uint32 total_runs = 2; - uint32 successful_runs = 3; - uint32 failed_runs = 4; - uint32 cancelled_runs = 5; - double average_partitions_per_run = 6; - int64 last_run_timestamp = 7; - JobStatus last_run_status_code = 8; // Enum for programmatic use - string last_run_status_name = 9; // Human-readable string - repeated string recent_builds = 10; - repeated JobRunDetail runs = 11; -} - -message JobRunDetail { - string job_run_id = 1; - string build_request_id = 2; - repeated PartitionRef target_partitions = 3; - JobStatus status_code = 4; // Enum for programmatic use - string status_name = 5; // Human-readable string - optional int64 started_at = 6; - optional int64 completed_at = 7; - optional int64 duration_ms = 8; - string message = 9; -} - -// -// Job Run Detail -// - -message JobRunDetailRequest { - string job_run_id = 1; -} - -message JobRunDetailResponse { - string job_run_id = 1; - string job_label = 2; - string build_request_id = 3; - JobStatus status_code = 4; // Enum for programmatic use - string status_name = 5; // Human-readable string - repeated PartitionRef target_partitions = 6; - int64 scheduled_at = 7; - optional int64 started_at = 8; - optional int64 completed_at = 9; - optional int64 duration_ms = 10; - bool cancelled = 11; - optional string cancel_reason = 12; - string message = 13; - repeated JobRunTimelineEvent timeline = 14; -} - -message JobRunTimelineEvent { - int64 timestamp = 1; - optional JobStatus status_code = 2; // Enum for programmatic use - optional string status_name = 3; // Human-readable string - string message = 4; - string event_type = 5; - optional string cancel_reason = 6; -} - -/////////////////////////////////////////////////////////////////////////////////////////////// -// Job Log Access (Unified CLI/Service Interface) -/////////////////////////////////////////////////////////////////////////////////////////////// - -// Request for retrieving job logs -message JobLogsRequest { - string job_run_id = 1; // UUID of the job run - int64 since_timestamp = 2; // Unix timestamp (nanoseconds) - only logs after this time - int32 min_level = 3; // Minimum LogLevel enum value (0=DEBUG, 1=INFO, 2=WARN, 3=ERROR) - uint32 limit = 4; // Maximum number of entries to return -} - -// Response containing job log entries -message JobLogsResponse { - repeated JobLogEntry entries = 1; // Log entries matching the request criteria - bool has_more = 2; // True if more entries exist beyond the limit -} - -/////////////////////////////////////////////////////////////////////////////////////////////// -// Currently unused - implemented via HTTP REST API instead -/////////////////////////////////////////////////////////////////////////////////////////////// - -// Partition Want (Future feature - currently unused) -// message WantSource { -// // TODO -// } - -// message PartitionWant { -// PartitionRef partition_ref = 1; // Partition being requested -// uint64 created_at = 2; // Server time when want registered -// optional uint64 data_timestamp = 3; // Business time this partition represents -// optional uint64 ttl_seconds = 4; // Give up after this long (from created_at) -// optional uint64 sla_seconds = 5; // SLA violation after this long (from data_timestamp) -// repeated string external_dependencies = 6; // Cross-graph dependencies -// string want_id = 7; // Unique identifier -// WantSource source = 8; // How this want was created -// } - -// Service for job configuration and graph analysis -// service DataBuildService { -// // Get job configurations for the specified output references -// // rpc GetJobConfigs(JobConfigureRequest) returns (JobConfigureResponse) {} - -// // Analyze and get the job graph for the specified output references -// rpc AnalyzeGraph(GraphAnalyzeRequest) returns (GraphAnalyzeResponse); - -// // Execute the specified job graph (implemented by databuild) -// rpc Execute(JobGraph) returns (GraphExecuteResponse); - -// // User-facing: build the desired partitions -// rpc Build(GraphBuildRequest) returns (GraphBuildResponse); -// } - - -/////////////////////////////////////////////////////////////////////////////////////////////// -// DataBuildService - v2 of service and CLI interface below -/////////////////////////////////////////////////////////////////////////////////////////////// - -// The service that vends all build status information -// Core objects are: -// - Build events - events emitted as part of the build process that indicate status/state -// - BuildRequests - the literal request to build 1+ partitions -// - Partitions - Atomic units of data that represent results of jobs, and act as sufficiency signals for other jobs -// - Jobs - the units of work that build partitions (a single run of one is a JobRun) -// - JobRuns - the specific runs of Jobs -// - Wants - the recorded "want" to build a partition, which will be acted on ASAP -// - Taints - invalidate built partitions, in cases where the result should not be used or should be rebuilt -// Each of these will have a list page, and all but build events will have a summary page. -service DataBuildService { - // Build events - exposes literal events from build event log with filters - rpc GetBuildEvents(ListBuildEventsRequest) returns (ListBuildEventsResponse); - - // For batched requests - rpc Batched(BatchedRequest) returns (BatchedResponse); - - // BUILDS - // List the available build requests with limited metadata about them (requested partitions, status, requested time, etc) - rpc ListBuildRequests(ListBuildsRequest) returns (ListBuildsResponse); - // Get build status, summary, and paginated lists of produced partitions, and other related metadata - rpc GetBuildSummary(BuildSummaryRequest) returns (BuildSummaryResponse); - // Get a mermaid description of the build request graph with its current status rendered - rpc GetBuildMermaid(BuildSummaryRequest) returns (BuildMermaidResponse); - - // PARTITIONS - // List partitions (built, building, wanted) - rpc ListPartitions(ListPartitionsRequest) returns (ListPartitionsResponse); - // Get details about a specific partition (status, created at, past builds, job runs that built or are building it, etc) - rpc GetPartitionsSummary(PartitionSummaryRequest) returns (PartitionSummaryResponse); - - // JOBS - // List jobs described in the graph plus metadata (success rate, last result, last run at, etc) - rpc ListJobs(ListJobsRequest) returns (ListJobsResponse); - // Get details for a specific job - rpc GetJobSummary(JobSummaryRequest) returns (JobSummaryResponse); - - // JOB RUNS - // List job runs plus basic metadata (job they ran, result, runtime, etc) - rpc ListJobRuns(ListJobRunsRequest) returns (ListJobRunsResponse); - // Get details of a specific job run (above details plus produced partitions, paginated logs, etc) - rpc GetJobRunSummary(JobRunSummaryRequest) returns (JobRunSummaryResponse); - - // Wants - // List wants plus metadata (wanted partitions, created at, status) - rpc ListWants(ListWantsRequest) returns (ListWantsResponse); - // Get details for a want (above plus reasons for want being in current state, etc) - rpc GetWantSummary(WantSummaryRequest) returns (WantSummaryResponse); - // Register a want (list of partition refs, with user, reason, etc) - rpc PutWants(PutWantsRequest) returns (PutWantsResponse); - - // Taints - // List taints plus metadata (tainted partitions, created at, status) - rpc ListTaints(ListTaintsRequest) returns (ListTaintsResponse); - // Summarize the requested taint - rpc GetTaintSummary(TaintSummaryRequest) returns (TaintSummaryResponse); - // Register a taint (list of partition refs, with user, reason, etc) - rpc PutTaints(PutTaintsRequest) returns (PutTaintsResponse); -} - -message RequestContainer { - ListBuildEventsResponse list_build_events = 1; - BuildSummaryRequest build_request_status = 2; - - // TODO -} - -message ResponseContainer { - ListBuildEventsResponse list_build_events = 1; - BuildSummaryResponse build_request_status = 2; - // TODO -} - -message ErrorContainer { - string error_message = 1; -} - -message BatchedRequest { - map requests = 1; -} - -message BatchedResponse { - map responses = 1; - map errors = 2; -} - -// BEL events - -message ListBuildEventsRequest { - EventFilter filters = 1; - - // Either one of the following must be provided - // Scrolls backwards from the specified timestamp - uint64 max_timestamp_ns = 2; - // Scrolls forward from the specified timestamp - uint64 min_timestamp_ns = 3; -} - -message ListBuildEventsResponse { - // Resulting events are ordered - repeated BuildEvent events = 1; - bool has_more = 2; -} - -// BUILD REQUESTS - -// ANDed filters -message ListBuildsRequest { - // The max time the service will search until to find build requests - uint64 started_until = 1; - // Filters returned build requests those that currently have this status - repeated string build_status = 2; - // Filters build requests to those that built one of these partitions - repeated string built_partition = 3; - // Filters build requests to those that output one of these partitions (excluding those that were not explicitly - // requested in the build request) - repeated string output_partition = 4; - // Filters by jobs that were run as part of the build - repeated string run_jobs = 5; - // Filters by the ID of the want that triggered the build - repeated string triggering_want_ids = 6; - // Filters by contains match against build request comment - string comment_contains = 7; -} - -// Ordered and paginated by build start time -message ListBuildsResponse { - // Resulting builds - repeated BuildSummary builds = 1; - // Paging bounds for requesting next page - uint64 min_started = 2; - // Indicates if there are more to request - bool has_more = 3; -} - -message BuildSummaryRequest { - string build_id = 1; -} - -message BuildSummaryResponse { - string build_id = 1; - - // Overall status of the build - BuildRequestStatusCode status = 2; - // Summary of the build - BuildSummary summary = 3; - // Partitions produced by the build - repeated PartitionBuildStatus partitions = 4; - -} - -message PartitionBuildStatus { - PartitionRef ref = 1; - PartitionStatus status = 2; -} - -message BuildMermaidResponse { - string build_id = 1; - string mermaid = 2; -} - -// PARTITIONS - -message ListPartitionsRequest { - // Optional regex filter - string ref_pattern = 1; - // Optional ORing partition status filter - repeated PartitionStatus partition_status = 2; - // Basic pagination mechanism - returns partitions sorted after the provided ref - string last_partition = 3; -} - -message ListPartitionsResponse { - repeated PartitionSummaryV2 refs = 1; -} - -message PartitionStatusV2 { - PartitionStatus code = 1; - string name = 2; -} - -message PartitionSummaryV2 { - PartitionRef partition_ref = 1; - PartitionStatusV2 status = 2; - uint64 last_updated = 4; - uint64 last_invalidated_at = 6; - repeated string past_build_request = 7; -} - -message PartitionSummaryRequest { - PartitionRef ref = 1; -} - -message PartitionSummaryResponse { - PartitionSummaryV2 partition = 1; -} - -// JOBS - -// No query params - if you need to paginate here something is insane or you're google -message ListJobsRequest {} - -message ListJobsResponse { - repeated JobSummary jobs = 1; -} - -message JobSummaryRequest { - string job_label = 1; -} - -message JobSummaryResponse { - JobSummary job = 1; -} - -// JOB RUNS - -// Paginates backwards -message ListJobRunsRequest { - // Filters to job runs started until this point - uint64 started_until = 1; - // ORing filter matching job run IDs - repeated string job_run_ids = 2; - // ORing filters to job runs that are defined by one of these job labels - repeated string job_labels = 3; - // ORing filters to job runs that were involved in one of these build requests - repeated string build_reqeust_ids = 4; - // ORing filters to partitions produced by these job runs - repeated string built_partition_refs = 5; -} - -message ListJobRunsResponse { - repeated JobRunSummary job_runs = 1; - uint64 min_start_at = 2; -} - -message JobRunSummaryRequest { - string job_run_id = 1; -} - -message JobRunSummaryResponse { - JobRunSummary job_run = 1; -} - -// WANTS - -message ListWantsRequest { - // Filters the latest time the want could been requested until - uint64 requested_until = 1; - // Filters to wants whose ttl expires after ttl_until (allows querying "currently wanted" - uint64 ttl_until = 2; -} - -message ListWantsResponse { - repeated PartitionWantSummary wants = 1; - uint64 min_requested_at = 2; -} - -message LabeledPartitionBuildStatus { - PartitionRef ref = 1; - PartitionBuildStatus status = 2; -} - -message PartitionWantSummary { - PartitionWant want = 1; - repeated PartitionSummary partitions = 2; - repeated LabeledPartitionBuildStatus external_partitions = 3; - string comment = 4; -} - -message WantSummaryRequest { +message WantCancelEvent { string want_id = 1; + optional string reason = 2; } -message WantSummaryResponse { - PartitionWantSummary want = 1; +message TaintCreateEvent { + string taint_id = 1; + repeated PartitionRef partitions = 2; + optional string reason = 3; +} +message TaintDeleteEvent { + string taint_id = 1; + optional string reason = 2; } -message IndividualWantRequest { +// Build State + +// Represents the whole state of the system +message BuildState { + map wants = 1; + map partitions = 2; + map taints = 3; +} + +message WantState { + string want_id = 1; + // TODO +} + +message PartitionState { + // The partition reference PartitionRef ref = 1; - uint64 date_timestamp = 2; - uint64 ttl_seconds = 3; - uint64 sla_seconds = 4; + // The partitions current status + PartitionStatus status = 2; + // The latest update to the partition's status + optional uint64 last_updated_at = 3; + // IDs that associate the partition with other objects + repeated string job_run_ids = 4; + repeated string want_ids = 5; + repeated string taint_ids = 6; +} +message PartitionStatus { + PartitionStatusCode code = 1; + string name = 2; +} +enum PartitionStatusCode{ + Unknown = 0; + Wanted = 1; + Building = 2; + Live = 3; + Tainted = 4; } -message PutWantsRequest { - repeated IndividualWantRequest wants = 1; - WantSource source = 2; - string comment = 3; +message TaintState { + } -message CreatedWant { - PartitionRef ref = 1; - string want_id = 2; -} -message PutWantsResponse { - repeated CreatedWant wants = 1; -} - -// TAINTS - -message ListTaintsRequest { - uint64 tainted_at_until = 1; -} - -message ListTaintsResponse { - repeated PartitionTaintSummary taints = 1; - uint64 min_tainted_at = 2; -} - -message PartitionTaintSummary { - string taint_id = 1; - repeated PartitionRef refs = 2; - uint64 tainted_at = 3; - SourceType source = 4; - string comment = 5; -} - -message TaintSummaryRequest { - string taint_id = 1; -} - -message TaintSummaryResponse { - PartitionTaintSummary taint = 1; -} - -message PutTaintsRequest { - repeated PartitionRef refs = 1; - SourceType source = 2; - string comment = 3; -} - -message PutTaintsResponse { - string taint_id = 1; +message EventFilter { + repeated string partition_refs = 1; // Exact partition matches + repeated string partition_patterns = 2; // Glob patterns like "data/users/*" + repeated string job_labels = 3; // Job-specific events + repeated string job_run_ids = 4; // Job run events } diff --git a/databuild/dsl/python/BUILD.bazel b/databuild/dsl/python/BUILD.bazel deleted file mode 100644 index 34908b9..0000000 --- a/databuild/dsl/python/BUILD.bazel +++ /dev/null @@ -1,29 +0,0 @@ -py_library( - name = "dsl", - srcs = ["dsl.py"], - visibility = ["//visibility:public"], - deps = [ - "//databuild:py_proto", - ], -) - -py_library( - name = "generator_lib", - srcs = ["generator_lib.py"], - visibility = ["//visibility:public"], - deps = [ - ":dsl", - "//databuild:py_proto", - ], -) - -py_binary( - name = "generator", - srcs = ["generator.py"], - data = ["dsl_job_wrapper.py"], - main = "generator.py", - visibility = ["//visibility:public"], - deps = [ - ":generator_lib", - ], -) diff --git a/databuild/dsl/python/dsl.py b/databuild/dsl/python/dsl.py deleted file mode 100644 index dde2e00..0000000 --- a/databuild/dsl/python/dsl.py +++ /dev/null @@ -1,431 +0,0 @@ - -from databuild.proto import JobConfig, PartitionRef, DataDep, DepType -from typing import Self, Protocol, get_type_hints, get_origin, get_args -from dataclasses import fields, is_dataclass, dataclass, field -import re - - -class PartitionPattern: - _raw_pattern: str - - @property - def _pattern(self) -> re.Pattern: - return re.compile(self._raw_pattern) - - def _validate_pattern(self): - """Checks that both conditions are met: - 1. All fields from the PartitionFields type are present in the pattern - 2. All fields from the pattern are present in the PartitionFields type - """ - # TODO how do I get this to be called? - assert is_dataclass(self), "Should be a dataclass also (for partition fields)" - pattern_fields = set(self._pattern.groupindex.keys()) - partition_fields = {field.name for field in fields(self)} - if pattern_fields != partition_fields: - raise ValueError(f"Pattern fields {pattern_fields} do not match partition fields {partition_fields}") - - @classmethod - def deserialize(cls, raw_value: str) -> Self: - """Parses a partition from a string based on the defined pattern.""" - # Create a temporary instance to access the compiled pattern - # We need to compile the pattern to match against it - pattern = re.compile(cls._raw_pattern) - - # Match the raw value against the pattern - match = pattern.match(raw_value) - if not match: - raise ValueError(f"String '{raw_value}' does not match pattern '{cls._pattern}'") - - # Extract the field values from the match - field_values = match.groupdict() - - # Create and return a new instance with the extracted values - return cls(**field_values) - - def serialize(self) -> str: - """Returns a string representation by filling in the pattern template with field values.""" - # Start with the pattern - result = self._raw_pattern - - # Replace each named group in the pattern with its corresponding field value - for field in fields(self): - # Find the named group pattern and replace it with the actual value - # We need to replace the regex pattern with the actual value - # Look for the pattern (?P...) and replace with the field value - pattern_to_replace = rf'\(\?P<{field.name}>[^)]+\)' - actual_value = getattr(self, field.name) - result = re.sub(pattern_to_replace, actual_value, result) - - return result - - -class DataBuildJob(Protocol): - # The types of partitions that this job produces - output_types: list[type[PartitionPattern]] - - def config(self, outputs: list[PartitionPattern]) -> list[JobConfig]: ... - - def exec(self, *args: str) -> None: ... - - -class DataBuildGraph: - def __init__(self, label: str): - self.label = label - self.lookup = {} - - def job(self, cls: type[DataBuildJob]) -> None: - """Register a job with the graph.""" - for partition in cls.output_types: - assert partition not in self.lookup, f"Partition `{partition}` already registered" - self.lookup[partition] = cls - return cls - - def generate_bazel_module(self): - """Generates a complete databuild application, packaging up referenced jobs and this graph via bazel targets""" - raise NotImplementedError - - def generate_bazel_package(self, name: str, output_dir: str, deps: list = None) -> None: - """Generate BUILD.bazel and binaries into a generated/ subdirectory. - - Args: - name: Base name for the generated graph (without .generate suffix) - output_dir: Directory to write generated files to (will create generated/ subdir) - deps: List of Bazel dependency labels to use in generated BUILD.bazel - """ - import os - import shutil - - # Create generated/ subdirectory - generated_dir = os.path.join(output_dir, "generated") - os.makedirs(generated_dir, exist_ok=True) - - # Generate BUILD.bazel with job and graph targets - self._generate_build_bazel(generated_dir, name, deps or []) - - # Generate individual job scripts (instead of shared wrapper) - self._generate_job_scripts(generated_dir) - - # Generate job lookup binary - self._generate_job_lookup(generated_dir, name) - - package_name = self._get_package_name() - print(f"Generated DataBuild package '{name}' in {generated_dir}") - if package_name != "UNKNOWN_PACKAGE": - print(f"Run 'bazel build \"@databuild//{package_name}/generated:{name}_graph.analyze\"' to use the generated graph") - else: - print(f"Run 'bazel build generated:{name}_graph.analyze' to use the generated graph") - - def _generate_build_bazel(self, output_dir: str, name: str, deps: list) -> None: - """Generate BUILD.bazel with databuild_job and databuild_graph targets.""" - import os - - # Get job classes from the lookup table - job_classes = sorted(set(self.lookup.values()), key=lambda cls: cls.__name__) - - # Format deps for BUILD.bazel - if deps: - deps_str = ", ".join([f'"{dep}"' for dep in deps]) - else: - # Fallback to parent package if no deps provided - parent_package = self._get_package_name() - deps_str = f'"//{parent_package}:dsl_src"' - - # Generate py_binary targets for each job - job_binaries = [] - job_targets = [] - - for job_class in job_classes: - job_name = self._snake_case(job_class.__name__) - binary_name = f"{job_name}_binary" - job_targets.append(f'"{job_name}"') - - job_script_name = f"{job_name}.py" - job_binaries.append(f'''py_binary( - name = "{binary_name}", - srcs = ["{job_script_name}"], - main = "{job_script_name}", - deps = [{deps_str}], -) - -databuild_job( - name = "{job_name}", - binary = ":{binary_name}", -)''') - - # Generate the complete BUILD.bazel content - build_content = f'''load("@databuild//databuild:rules.bzl", "databuild_job", "databuild_graph") - -# Generated by DataBuild DSL - do not edit manually -# This file is generated in a subdirectory to avoid overwriting the original BUILD.bazel - -{chr(10).join(job_binaries)} - -py_binary( - name = "{name}_job_lookup", - srcs = ["{name}_job_lookup.py"], - deps = [{deps_str}], -) - -databuild_graph( - name = "{name}_graph", - jobs = [{", ".join(job_targets)}], - lookup = ":{name}_job_lookup", - visibility = ["//visibility:public"], -) - -# Create tar archive of generated files for testing -genrule( - name = "existing_generated", - srcs = glob(["*.py", "BUILD.bazel"]), - outs = ["existing_generated.tar"], - cmd = "mkdir -p temp && cp $(SRCS) temp/ && find temp -exec touch -t 197001010000 {{}} + && tar -cf $@ -C temp .", - visibility = ["//visibility:public"], -) -''' - - with open(os.path.join(output_dir, "BUILD.bazel"), "w") as f: - f.write(build_content) - - def _generate_job_scripts(self, output_dir: str) -> None: - """Generate individual Python scripts for each job class.""" - import os - - # Get job classes and generate a script for each one - job_classes = list(set(self.lookup.values())) - graph_module_path = self._get_graph_module_path() - - for job_class in job_classes: - job_name = self._snake_case(job_class.__name__) - script_name = f"{job_name}.py" - - script_content = f'''#!/usr/bin/env python3 -""" -Generated job script for {job_class.__name__}. -""" - -import sys -import json -from {graph_module_path} import {job_class.__name__} -from databuild.proto import PartitionRef, JobConfigureResponse, to_dict - - -def parse_outputs_from_args(args: list[str]) -> list: - """Parse partition output references from command line arguments.""" - outputs = [] - for arg in args: - # Find which output type can deserialize this partition reference - for output_type in {job_class.__name__}.output_types: - try: - partition = output_type.deserialize(arg) - outputs.append(partition) - break - except ValueError: - continue - else: - raise ValueError(f"No output type in {job_class.__name__} can deserialize partition ref: {{arg}}") - - return outputs - - -if __name__ == "__main__": - if len(sys.argv) < 2: - raise Exception(f"Invalid command usage") - - command = sys.argv[1] - job_instance = {job_class.__name__}() - - if command == "config": - # Parse output partition references as PartitionRef objects (for Rust wrapper) - output_refs = [PartitionRef(str=raw_ref) for raw_ref in sys.argv[2:]] - - # Also parse them into DSL partition objects (for DSL job.config()) - outputs = parse_outputs_from_args(sys.argv[2:]) - - # Call job's config method - returns list[JobConfig] - configs = job_instance.config(outputs) - - # Wrap in JobConfigureResponse and serialize using to_dict() - response = JobConfigureResponse(configs=configs) - print(json.dumps(to_dict(response))) - - elif command == "exec": - # The exec method expects a JobConfig but the Rust wrapper passes args - # For now, let the DSL job handle the args directly - # TODO: This needs to be refined based on actual Rust wrapper interface - job_instance.exec(*sys.argv[2:]) - - else: - raise Exception(f"Invalid command `{{sys.argv[1]}}`") -''' - - script_path = os.path.join(output_dir, script_name) - with open(script_path, "w") as f: - f.write(script_content) - - # Make it executable - os.chmod(script_path, 0o755) - - def _generate_job_lookup(self, output_dir: str, name: str) -> None: - """Generate job lookup binary that maps partition patterns to job targets.""" - import os - - # Build the job lookup mappings with full package paths - package_name = self._get_package_name() - lookup_mappings = [] - for partition_type, job_class in self.lookup.items(): - job_name = self._snake_case(job_class.__name__) - pattern = partition_type._raw_pattern - full_target = f"//{package_name}/generated:{job_name}" - lookup_mappings.append(f' r"{pattern}": "{full_target}",') - - lookup_content = f'''#!/usr/bin/env python3 -""" -Generated job lookup for DataBuild DSL graph. -Maps partition patterns to job targets. -""" - -import sys -import re -import json -from collections import defaultdict - - -# Mapping from partition patterns to job targets -JOB_MAPPINGS = {{ -{chr(10).join(lookup_mappings)} -}} - - -def lookup_job_for_partition(partition_ref: str) -> str: - """Look up which job can build the given partition reference.""" - for pattern, job_target in JOB_MAPPINGS.items(): - if re.match(pattern, partition_ref): - return job_target - - raise ValueError(f"No job found for partition: {{partition_ref}}") - - -def main(): - if len(sys.argv) < 2: - print("Usage: job_lookup.py [partition_ref...]", file=sys.stderr) - sys.exit(1) - - results = defaultdict(list) - try: - for partition_ref in sys.argv[1:]: - job_target = lookup_job_for_partition(partition_ref) - results[job_target].append(partition_ref) - - # Output the results as JSON (matching existing lookup format) - print(json.dumps(dict(results))) - except ValueError as e: - print(f"ERROR: {{e}}", file=sys.stderr) - sys.exit(1) - - -if __name__ == "__main__": - main() -''' - - lookup_file = os.path.join(output_dir, f"{name}_job_lookup.py") - with open(lookup_file, "w") as f: - f.write(lookup_content) - - # Make it executable - os.chmod(lookup_file, 0o755) - - def _snake_case(self, name: str) -> str: - """Convert CamelCase to snake_case.""" - import re - s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name) - return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower() - - def _get_graph_module_path(self) -> str: - """Get the module path for the graph containing this instance.""" - # Try to find the module by looking at where the graph object is defined - import inspect - import sys - - # Look through all loaded modules to find where this graph instance is defined - for module_name, module in sys.modules.items(): - if hasattr(module, 'graph') and getattr(module, 'graph') is self: - if module_name != '__main__': - return module_name - - # Look through the call stack to find the module that imported us - for frame_info in inspect.stack(): - frame_globals = frame_info.frame.f_globals - module_name = frame_globals.get('__name__') - if module_name and module_name != '__main__' and 'graph' in frame_globals: - # Check if this frame has our graph - if frame_globals.get('graph') is self: - return module_name - - # Last resort fallback - this will need to be manually configured - return "UNKNOWN_MODULE" - - def _get_package_name(self) -> str: - """Get the Bazel package name where the DSL source files are located.""" - # Extract package from the graph label if available - if hasattr(self, 'label') and self.label.startswith('//'): - # Extract package from label like "//databuild/test/app:dsl_graph" - package_part = self.label.split(':')[0] - return package_part[2:] # Remove "//" prefix - - # Fallback to trying to infer from module path - module_path = self._get_graph_module_path() - if module_path != "UNKNOWN_MODULE": - # Convert module path to package path - # e.g., "databuild.test.app.dsl.graph" -> "databuild/test/app/dsl" - parts = module_path.split('.') - if parts[-1] in ['graph', 'main']: - parts = parts[:-1] - return '/'.join(parts) - - return "UNKNOWN_PACKAGE" - - -@dataclass -class JobConfigBuilder: - outputs: list[PartitionRef] = field(default_factory=list) - inputs: list[DataDep] = field(default_factory=list) - args: list[str] = field(default_factory=list) - env: dict[str, str] = field(default_factory=dict) - - def build(self) -> JobConfig: - return JobConfig( - outputs=self.outputs, - inputs=self.inputs, - args=self.args, - env=self.env, - ) - - def add_inputs(self, *partitions: PartitionPattern, dep_type: DepType=DepType.MATERIALIZE) -> Self: - for p in partitions: - dep_type_name = "materialize" if dep_type == DepType.MATERIALIZE else "query" - self.inputs.append(DataDep(dep_type_code=dep_type, dep_type_name=dep_type_name, partition_ref=PartitionRef(str=p.serialize()))) - return self - - def add_outputs(self, *partitions: PartitionPattern) -> Self: - for p in partitions: - self.outputs.append(PartitionRef(str=p.serialize())) - return self - - def add_args(self, *args: str) -> Self: - self.args.extend(args) - return self - - def set_args(self, args: list[str]) -> Self: - self.args = args - return self - - def set_env(self, env: dict[str, str]) -> Self: - self.env = env - return self - - def add_env(self, **kwargs) -> Self: - for k, v in kwargs.items(): - assert isinstance(k, str), f"Expected a string key, got `{k}`" - assert isinstance(v, str), f"Expected a string key, got `{v}`" - self.env[k] = v - return self diff --git a/databuild/dsl/python/dsl_job_wrapper.py b/databuild/dsl/python/dsl_job_wrapper.py deleted file mode 100644 index de00f12..0000000 --- a/databuild/dsl/python/dsl_job_wrapper.py +++ /dev/null @@ -1,118 +0,0 @@ -#!/usr/bin/env python3 -""" -Shared DSL job wrapper that can execute any DataBuildJob defined in a DSL graph. -Configured via environment variables: -- DATABUILD_DSL_GRAPH_MODULE: Python module path containing the graph (e.g., 'databuild.test.app.dsl.graph') -- DATABUILD_JOB_CLASS: Job class name to execute (e.g., 'IngestColorVotes') -""" - -import sys -import json -import os -import importlib -from typing import List, Any -from databuild.proto import JobConfig - - -def parse_outputs_from_args(args: List[str], job_class: Any) -> List[Any]: - """Parse partition output references from command line arguments into partition objects.""" - outputs = [] - for arg in args: - # Find which output type can deserialize this partition reference - for output_type in job_class.output_types: - try: - partition = output_type.deserialize(arg) - outputs.append(partition) - break - except ValueError: - continue - else: - raise ValueError(f"No output type in {job_class.__name__} can deserialize partition ref: {arg}") - - return outputs - - -def main(): - if len(sys.argv) < 2: - print("Usage: dsl_job_wrapper.py [args...]", file=sys.stderr) - sys.exit(1) - - command = sys.argv[1] - - # Read configuration from environment - graph_module_path = os.environ.get('DATABUILD_DSL_GRAPH_MODULE') - job_class_name = os.environ.get('DATABUILD_JOB_CLASS') - - if not graph_module_path: - print("ERROR: DATABUILD_DSL_GRAPH_MODULE environment variable not set", file=sys.stderr) - sys.exit(1) - - if not job_class_name: - print("ERROR: DATABUILD_JOB_CLASS environment variable not set", file=sys.stderr) - sys.exit(1) - - try: - # Import the graph module - module = importlib.import_module(graph_module_path) - graph = getattr(module, 'graph') - - # Get the job class - job_class = getattr(module, job_class_name) - - # Create job instance - job_instance = job_class() - - except (ImportError, AttributeError) as e: - print(f"ERROR: Failed to load job {job_class_name} from {graph_module_path}: {e}", file=sys.stderr) - sys.exit(1) - - if command == "config": - try: - # Parse output partition references from remaining args - output_refs = sys.argv[2:] - if not output_refs: - print("ERROR: No output partition references provided", file=sys.stderr) - sys.exit(1) - - outputs = parse_outputs_from_args(output_refs, job_class) - - # Call job's config method - configs = job_instance.config(outputs) - - # Output each config as JSON (one per line for multiple configs) - for config in configs: - # Convert JobConfig to dict for JSON serialization - config_dict = { - 'outputs': [{'str': ref.str} for ref in config.outputs], - 'inputs': [ - { - 'dep_type_code': dep.dep_type_code, - 'dep_type_name': dep.dep_type_name, - 'partition_ref': {'str': dep.partition_ref.str} - } for dep in config.inputs - ], - 'args': config.args, - 'env': config.env, - } - print(json.dumps(config_dict)) - - except Exception as e: - print(f"ERROR: Config failed: {e}", file=sys.stderr) - sys.exit(1) - - elif command == "exec": - try: - # Read config from stdin - job_instance.exec(*sys.argv[2:]) - - except Exception as e: - print(f"ERROR: Execution failed: {e}", file=sys.stderr) - sys.exit(1) - - else: - print(f"ERROR: Unknown command '{command}'. Use 'config' or 'exec'.", file=sys.stderr) - sys.exit(1) - - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/databuild/dsl/python/generator.py b/databuild/dsl/python/generator.py deleted file mode 100644 index b5e70de..0000000 --- a/databuild/dsl/python/generator.py +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env python3 -""" -DSL code generator that can be run as a py_binary with proper dependencies. -""" - -import sys -from databuild.dsl.python.generator_lib import generate_dsl_package - - -def main(): - if len(sys.argv) != 4: - print("Usage: generator.py ", file=sys.stderr) - sys.exit(1) - - module_path = sys.argv[1] - graph_attr = sys.argv[2] - output_dir = sys.argv[3] - - try: - generate_dsl_package(module_path, graph_attr, output_dir) - except Exception as e: - print(f"ERROR: {e}", file=sys.stderr) - import traceback - traceback.print_exc() - sys.exit(1) - - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/databuild/dsl/python/generator_lib.py b/databuild/dsl/python/generator_lib.py deleted file mode 100644 index b134cb4..0000000 --- a/databuild/dsl/python/generator_lib.py +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env python3 -""" -Core DSL code generation library that can be imported by different generator binaries. -""" - -import os -import importlib - - -def generate_dsl_package(module_path: str, graph_attr: str, output_dir: str, deps: list = None): - """ - Generate DataBuild DSL package from a graph definition. - - Args: - module_path: Python module path (e.g., "databuild.test.app.dsl.graph") - graph_attr: Name of the graph attribute in the module - output_dir: Directory where to generate the DSL package - deps: List of Bazel dependency labels to use in generated BUILD.bazel - """ - # Extract the base name from the output directory for naming - name = os.path.basename(output_dir.rstrip('/')) or "graph" - - try: - # Import the graph module - module = importlib.import_module(module_path) - graph = getattr(module, graph_attr) - - # Generate the bazel package - graph.generate_bazel_package(name, output_dir, deps or []) - - print(f"Generated DataBuild DSL package in {output_dir}") - - except ImportError as e: - raise ImportError(f"Failed to import {graph_attr} from {module_path}: {e}") - except AttributeError as e: - raise AttributeError(f"Module {module_path} does not have attribute {graph_attr}: {e}") - except Exception as e: - raise Exception(f"Generation failed: {e}") \ No newline at end of file diff --git a/databuild/dsl/python/test/BUILD.bazel b/databuild/dsl/python/test/BUILD.bazel deleted file mode 100644 index c94a448..0000000 --- a/databuild/dsl/python/test/BUILD.bazel +++ /dev/null @@ -1,8 +0,0 @@ -py_test( - name = "dsl_test", - srcs = glob(["*.py"]), - deps = [ - "//databuild/dsl/python:dsl", - "@databuild_pypi//pytest", - ], -) diff --git a/databuild/dsl/python/test/dsl_test.py b/databuild/dsl/python/test/dsl_test.py deleted file mode 100644 index 90e9f95..0000000 --- a/databuild/dsl/python/test/dsl_test.py +++ /dev/null @@ -1,75 +0,0 @@ - -from databuild.dsl.python.dsl import PartitionPattern, DataBuildGraph, DataBuildJob -from databuild.proto import JobConfig, PartitionManifest -from dataclasses import dataclass -import pytest - - -@dataclass -class DateCategory: - data_date: str - category: str - - -class CategoryAnalysisPartition(DateCategory, PartitionPattern): - _raw_pattern = r"category_analysis/category=(?P[^/]+)/date=(?P\d{4}-\d{2}-\d{2})" - -def test_basic_partition_pattern(): - p1 = CategoryAnalysisPartition(data_date="2025-01-01", category="comedy") - assert p1.serialize() == "category_analysis/category=comedy/date=2025-01-01" - - p2 = CategoryAnalysisPartition.deserialize("category_analysis/category=technology/date=2025-01-02") - assert p2.data_date == "2025-01-02" - assert p2.category == "technology" - - -class NotEnoughFieldsPartition(DateCategory, PartitionPattern): - # Doesn't use the partition fields - _raw_pattern = r"invalid_partition_pattern" - - -class TooManyFieldsPartition(DateCategory, PartitionPattern): - # Doesn't use the partition fields - _raw_pattern = r"category_analysis/category=(?P[^/]+)/date=(?P\d{4}-\d{2}-\d{2})/hour=(?P\d{2})" - - -def test_invalid_partition_pattern(): - with pytest.raises(ValueError): - NotEnoughFieldsPartition(data_date="2025-01-01", category="comedy")._validate_pattern() - with pytest.raises(ValueError): - TooManyFieldsPartition(data_date="2025-01-01", category="comedy")._validate_pattern() - - -def test_basic_graph_definition(): - graph = DataBuildGraph("//:test_graph") - - @graph.job - class TestJob(DataBuildJob): - output_types = [CategoryAnalysisPartition] - def exec(self, config: JobConfig) -> None: ... - def config(self, outputs: list[PartitionPattern]) -> list[JobConfig]: ... - - assert len(graph.lookup) == 1 - assert CategoryAnalysisPartition in graph.lookup - - -def test_graph_collision(): - graph = DataBuildGraph("//:test_graph") - - @graph.job - class TestJob1(DataBuildJob): - output_types = [CategoryAnalysisPartition] - def exec(self, config: JobConfig) -> None: ... - def config(self, outputs: list[PartitionPattern]) -> list[JobConfig]: ... - - with pytest.raises(AssertionError): - # Outputs the same partition, so should raise - @graph.job - class TestJob2(DataBuildJob): - output_types = [CategoryAnalysisPartition] - def exec(self, config: JobConfig) -> None: ... - def config(self, outputs: list[PartitionPattern]) -> list[JobConfig]: ... - - -if __name__ == "__main__": - raise SystemExit(pytest.main([__file__])) diff --git a/databuild/event_log/mock.rs b/databuild/event_log/mock.rs deleted file mode 100644 index 99d3813..0000000 --- a/databuild/event_log/mock.rs +++ /dev/null @@ -1,665 +0,0 @@ -use crate::*; -use crate::event_log::{BuildEventLogError, Result}; -use crate::event_log::storage::BELStorage; -use crate::event_log::query_engine::BELQueryEngine; -use async_trait::async_trait; -use std::sync::{Arc, Mutex}; -use rusqlite::Connection; - -/// MockBuildEventLog provides an in-memory SQLite database for testing -/// -/// This implementation makes it easy to specify test data and verify behavior -/// while using the real code paths for event writing and repository queries. -/// -/// Key features: -/// - Uses in-memory SQLite for parallel test execution -/// - Provides event constructors with sensible defaults -/// - Allows easy specification of test scenarios -/// - Uses the same SQL schema as production SQLite implementation -pub struct MockBuildEventLog { - connection: Arc>, -} - -impl MockBuildEventLog { - /// Create a new MockBuildEventLog with an in-memory SQLite database - pub async fn new() -> Result { - let conn = Connection::open(":memory:") - .map_err(|e| BuildEventLogError::ConnectionError(e.to_string()))?; - - // Disable foreign key constraints for simplicity in testing - // conn.execute("PRAGMA foreign_keys = ON", []) - - let mock = Self { - connection: Arc::new(Mutex::new(conn)), - }; - - // Initialize the schema - mock.initialize().await?; - - Ok(mock) - } - - /// Create a new MockBuildEventLog with predefined events - pub async fn with_events(events: Vec) -> Result { - let mock = Self::new().await?; - - // Insert all provided events - for event in events { - mock.append_event(event).await?; - } - - Ok(mock) - } - - /// Get the number of events in the mock event log - pub async fn event_count(&self) -> Result { - let conn = self.connection.lock().unwrap(); - let mut stmt = conn.prepare("SELECT COUNT(*) FROM build_events") - .map_err(|e| BuildEventLogError::QueryError(e.to_string()))?; - - let count: i64 = stmt.query_row([], |row| row.get(0)) - .map_err(|e| BuildEventLogError::QueryError(e.to_string()))?; - - Ok(count as usize) - } - - /// Get all events ordered by timestamp - pub async fn get_all_events(&self) -> Result> { - let conn = self.connection.lock().unwrap(); - let mut stmt = conn.prepare( - "SELECT event_data FROM build_events ORDER BY timestamp ASC" - ).map_err(|e| BuildEventLogError::QueryError(e.to_string()))?; - - let rows = stmt.query_map([], |row| { - let event_data: String = row.get(0)?; - Ok(event_data) - }).map_err(|e| BuildEventLogError::QueryError(e.to_string()))?; - - let mut events = Vec::new(); - for row in rows { - let event_data = row.map_err(|e| BuildEventLogError::QueryError(e.to_string()))?; - let event: BuildEvent = serde_json::from_str(&event_data) - .map_err(|e| BuildEventLogError::SerializationError(e.to_string()))?; - events.push(event); - } - - Ok(events) - } - - /// Clear all events from the mock event log - pub async fn clear(&self) -> Result<()> { - let conn = self.connection.lock().unwrap(); - - // Clear all tables - conn.execute("DELETE FROM build_events", []) - .map_err(|e| BuildEventLogError::DatabaseError(e.to_string()))?; - conn.execute("DELETE FROM build_request_events", []) - .map_err(|e| BuildEventLogError::DatabaseError(e.to_string()))?; - conn.execute("DELETE FROM partition_events", []) - .map_err(|e| BuildEventLogError::DatabaseError(e.to_string()))?; - conn.execute("DELETE FROM job_events", []) - .map_err(|e| BuildEventLogError::DatabaseError(e.to_string()))?; - conn.execute("DELETE FROM delegation_events", []) - .map_err(|e| BuildEventLogError::DatabaseError(e.to_string()))?; - conn.execute("DELETE FROM job_graph_events", []) - .map_err(|e| BuildEventLogError::DatabaseError(e.to_string()))?; - - Ok(()) - } - - /// Initialize the database schema for testing - pub async fn initialize(&self) -> Result<()> { - let conn = self.connection.lock().unwrap(); - - // Create main events table - conn.execute( - "CREATE TABLE IF NOT EXISTS build_events ( - event_id TEXT PRIMARY KEY, - timestamp INTEGER NOT NULL, - build_request_id TEXT NOT NULL, - event_type TEXT NOT NULL, - event_data TEXT NOT NULL - )", - [], - ).map_err(|e| BuildEventLogError::DatabaseError(e.to_string()))?; - - // Create supporting tables for easier queries - conn.execute( - "CREATE TABLE IF NOT EXISTS build_request_events ( - event_id TEXT PRIMARY KEY, - status TEXT NOT NULL, - requested_partitions TEXT NOT NULL, - message TEXT NOT NULL - )", - [], - ).map_err(|e| BuildEventLogError::DatabaseError(e.to_string()))?; - - conn.execute( - "CREATE TABLE IF NOT EXISTS partition_events ( - event_id TEXT PRIMARY KEY, - partition_ref TEXT NOT NULL, - status TEXT NOT NULL, - message TEXT NOT NULL, - job_run_id TEXT - )", - [], - ).map_err(|e| BuildEventLogError::DatabaseError(e.to_string()))?; - - conn.execute( - "CREATE TABLE IF NOT EXISTS job_events ( - event_id TEXT PRIMARY KEY, - job_run_id TEXT NOT NULL, - job_label TEXT NOT NULL, - target_partitions TEXT NOT NULL, - status TEXT NOT NULL, - message TEXT NOT NULL, - config_json TEXT, - manifests_json TEXT NOT NULL - )", - [], - ).map_err(|e| BuildEventLogError::DatabaseError(e.to_string()))?; - - conn.execute( - "CREATE TABLE IF NOT EXISTS delegation_events ( - event_id TEXT PRIMARY KEY, - partition_ref TEXT NOT NULL, - delegated_to_build_request_id TEXT NOT NULL, - message TEXT NOT NULL - )", - [], - ).map_err(|e| BuildEventLogError::DatabaseError(e.to_string()))?; - - conn.execute( - "CREATE TABLE IF NOT EXISTS job_graph_events ( - event_id TEXT PRIMARY KEY, - job_graph_json TEXT NOT NULL - )", - [], - ).map_err(|e| BuildEventLogError::DatabaseError(e.to_string()))?; - - Ok(()) - } - - /// Append an event to the mock event log - pub async fn append_event(&self, event: BuildEvent) -> Result<()> { - let conn = self.connection.lock().unwrap(); - - // Serialize the entire event for storage - let event_data = serde_json::to_string(&event) - .map_err(|e| BuildEventLogError::SerializationError(e.to_string()))?; - - // Insert into main events table - conn.execute( - "INSERT INTO build_events (event_id, timestamp, build_request_id, event_type, event_data) VALUES (?1, ?2, ?3, ?4, ?5)", - rusqlite::params![ - event.event_id, - event.timestamp, - event.build_request_id, - match &event.event_type { - Some(crate::build_event::EventType::BuildRequestEvent(_)) => "build_request", - Some(crate::build_event::EventType::PartitionEvent(_)) => "partition", - Some(crate::build_event::EventType::JobEvent(_)) => "job", - Some(crate::build_event::EventType::DelegationEvent(_)) => "delegation", - Some(crate::build_event::EventType::JobGraphEvent(_)) => "job_graph", - Some(crate::build_event::EventType::PartitionInvalidationEvent(_)) => "partition_invalidation", - Some(crate::build_event::EventType::JobRunCancelEvent(_)) => "job_run_cancel", - Some(crate::build_event::EventType::BuildCancelEvent(_)) => "build_cancel", - Some(crate::build_event::EventType::WantEvent(_)) => "want", - Some(crate::build_event::EventType::TaintEvent(_)) => "taint", - None => "unknown", - }, - event_data - ], - ).map_err(|e| BuildEventLogError::DatabaseError(e.to_string()))?; - - // Insert into specific event type table for better querying - match &event.event_type { - Some(crate::build_event::EventType::BuildRequestEvent(br_event)) => { - let partitions_json = serde_json::to_string(&br_event.requested_partitions) - .map_err(|e| BuildEventLogError::SerializationError(e.to_string()))?; - - conn.execute( - "INSERT INTO build_request_events (event_id, status, requested_partitions, message) VALUES (?1, ?2, ?3, ?4)", - rusqlite::params![ - event.event_id, - br_event.clone().status.unwrap().code.to_string(), - partitions_json, - br_event.message - ], - ).map_err(|e| BuildEventLogError::DatabaseError(e.to_string()))?; - } - Some(crate::build_event::EventType::PartitionEvent(p_event)) => { - conn.execute( - "INSERT INTO partition_events (event_id, partition_ref, status, message, job_run_id) VALUES (?1, ?2, ?3, ?4, ?5)", - rusqlite::params![ - event.event_id, - p_event.partition_ref.as_ref().map(|r| &r.str).unwrap_or(&String::new()), - p_event.status_code.to_string(), - p_event.message, - if p_event.job_run_id.is_empty() { None } else { Some(&p_event.job_run_id) } - ], - ).map_err(|e| BuildEventLogError::DatabaseError(e.to_string()))?; - } - Some(crate::build_event::EventType::JobEvent(j_event)) => { - let partitions_json = serde_json::to_string(&j_event.target_partitions) - .map_err(|e| BuildEventLogError::SerializationError(e.to_string()))?; - let config_json = j_event.config.as_ref() - .map(|c| serde_json::to_string(c)) - .transpose() - .map_err(|e| BuildEventLogError::SerializationError(e.to_string()))?; - let manifests_json = serde_json::to_string(&j_event.manifests) - .map_err(|e| BuildEventLogError::SerializationError(e.to_string()))?; - - conn.execute( - "INSERT INTO job_events (event_id, job_run_id, job_label, target_partitions, status, message, config_json, manifests_json) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8)", - rusqlite::params![ - event.event_id, - j_event.job_run_id, - j_event.job_label.as_ref().map(|l| &l.label).unwrap_or(&String::new()), - partitions_json, - j_event.status_code.to_string(), - j_event.message, - config_json, - manifests_json - ], - ).map_err(|e| BuildEventLogError::DatabaseError(e.to_string()))?; - } - _ => {} // Other event types don't need special handling for testing - } - - Ok(()) - } - - /// Get all events for a specific build request - pub async fn get_build_request_events(&self, build_request_id: &str, _limit: Option) -> Result> { - let conn = self.connection.lock().unwrap(); - let mut stmt = conn.prepare( - "SELECT event_data FROM build_events WHERE build_request_id = ? ORDER BY timestamp ASC" - ).map_err(|e| BuildEventLogError::QueryError(e.to_string()))?; - - let rows = stmt.query_map([build_request_id], |row| { - let event_data: String = row.get(0)?; - Ok(event_data) - }).map_err(|e| BuildEventLogError::QueryError(e.to_string()))?; - - let mut events = Vec::new(); - for row in rows { - let event_data = row.map_err(|e| BuildEventLogError::QueryError(e.to_string()))?; - let event: BuildEvent = serde_json::from_str(&event_data) - .map_err(|e| BuildEventLogError::SerializationError(e.to_string()))?; - events.push(event); - } - - Ok(events) - } - - /// Get all events for a specific partition - pub async fn get_partition_events(&self, partition_ref: &str, _limit: Option) -> Result> { - let conn = self.connection.lock().unwrap(); - let mut stmt = conn.prepare( - "SELECT e.event_data FROM build_events e - JOIN partition_events p ON e.event_id = p.event_id - WHERE p.partition_ref = ? ORDER BY e.timestamp ASC" - ).map_err(|e| BuildEventLogError::QueryError(e.to_string()))?; - - let rows = stmt.query_map([partition_ref], |row| { - let event_data: String = row.get(0)?; - Ok(event_data) - }).map_err(|e| BuildEventLogError::QueryError(e.to_string()))?; - - let mut events = Vec::new(); - for row in rows { - let event_data = row.map_err(|e| BuildEventLogError::QueryError(e.to_string()))?; - let event: BuildEvent = serde_json::from_str(&event_data) - .map_err(|e| BuildEventLogError::SerializationError(e.to_string()))?; - events.push(event); - } - - Ok(events) - } - - /// Get the latest status for a partition - pub async fn get_latest_partition_status(&self, partition_ref: &str) -> Result> { - let conn = self.connection.lock().unwrap(); - let mut stmt = conn.prepare( - "SELECT p.status, e.timestamp FROM build_events e - JOIN partition_events p ON e.event_id = p.event_id - WHERE p.partition_ref = ? ORDER BY e.timestamp DESC LIMIT 1" - ).map_err(|e| BuildEventLogError::QueryError(e.to_string()))?; - - let result = stmt.query_row([partition_ref], |row| { - let status_str: String = row.get(0)?; - let timestamp: i64 = row.get(1)?; - let status_code = status_str.parse::().unwrap_or(0); - let status = PartitionStatus::try_from(status_code).unwrap_or(PartitionStatus::PartitionUnknown); - Ok((status, timestamp)) - }); - - match result { - Ok(status_and_timestamp) => Ok(Some(status_and_timestamp)), - Err(rusqlite::Error::QueryReturnedNoRows) => Ok(None), - Err(e) => Err(BuildEventLogError::QueryError(e.to_string())), - } - } - - /// Get events in a timestamp range (used by BELStorage) - pub async fn get_events_in_range(&self, start: i64, end: i64) -> Result> { - let conn = self.connection.lock().unwrap(); - let mut stmt = conn.prepare( - "SELECT event_data FROM build_events WHERE timestamp >= ? AND timestamp <= ? ORDER BY timestamp ASC" - ).map_err(|e| BuildEventLogError::QueryError(e.to_string()))?; - - let rows = stmt.query_map([start, end], |row| { - let event_data: String = row.get(0)?; - Ok(event_data) - }).map_err(|e| BuildEventLogError::QueryError(e.to_string()))?; - - let mut events = Vec::new(); - for row in rows { - let event_data = row.map_err(|e| BuildEventLogError::QueryError(e.to_string()))?; - let event: BuildEvent = serde_json::from_str(&event_data) - .map_err(|e| BuildEventLogError::SerializationError(e.to_string()))?; - events.push(event); - } - - Ok(events) - } -} - - -/// Utility functions for creating test events with sensible defaults -pub mod test_events { - use super::*; - use crate::event_log::{generate_event_id, current_timestamp_nanos}; - use uuid::Uuid; - - /// Create a build request received event with random defaults - pub fn build_request_received( - build_request_id: Option, - partitions: Vec, - ) -> BuildEvent { - BuildEvent { - event_id: generate_event_id(), - timestamp: current_timestamp_nanos(), - build_request_id, - event_type: Some(build_event::EventType::BuildRequestEvent(BuildRequestEvent { - status: Some(BuildRequestStatusCode::BuildRequestReceived.status()), - requested_partitions: partitions, - message: "Build request received".to_string(), - comment: None, - want_id: None, - })), - } - } - - /// Create a build request event with specific status - pub fn build_request_event( - build_request_id: Option, - partitions: Vec, - status: BuildRequestStatus, - ) -> BuildEvent { - BuildEvent { - event_id: generate_event_id(), - timestamp: current_timestamp_nanos(), - build_request_id, - event_type: Some(build_event::EventType::BuildRequestEvent(BuildRequestEvent { - status: Some(status.clone()), - requested_partitions: partitions, - message: format!("Build request status: {:?}", status.name), - comment: None, - want_id: None, - })), - } - } - - /// Create a partition status event with random defaults - pub fn partition_status( - build_request_id: Option, - partition_ref: PartitionRef, - status: PartitionStatus, - job_run_id: Option, - ) -> BuildEvent { - BuildEvent { - event_id: generate_event_id(), - timestamp: current_timestamp_nanos(), - build_request_id, - event_type: Some(build_event::EventType::PartitionEvent(PartitionEvent { - partition_ref: Some(partition_ref), - status_code: status as i32, - status_name: status.to_display_string(), - message: format!("Partition status: {:?}", status), - job_run_id: job_run_id.unwrap_or_default(), - })), - } - } - - /// Create a job event with random defaults - pub fn job_event( - build_request_id: Option, - job_run_id: Option, - job_label: JobLabel, - target_partitions: Vec, - status: JobStatus, - ) -> BuildEvent { - BuildEvent { - event_id: generate_event_id(), - timestamp: current_timestamp_nanos(), - build_request_id, - event_type: Some(build_event::EventType::JobEvent(JobEvent { - job_run_id: job_run_id.unwrap_or_else(|| Uuid::new_v4().to_string()), - job_label: Some(job_label), - target_partitions, - status_code: status as i32, - status_name: status.to_display_string(), - message: format!("Job status: {:?}", status), - config: None, - manifests: vec![], - })), - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - use super::test_events::*; - - #[tokio::test] - async fn test_mock_build_event_log_basic() { - let mock = MockBuildEventLog::new().await.unwrap(); - - // Initially empty - assert_eq!(mock.event_count().await.unwrap(), 0); - - // Add an event - let build_id = "test-build-123".to_string(); - let partition = PartitionRef { str: "test/partition".to_string() }; - let event = build_request_received(Some(build_id.clone()), vec![partition]); - - mock.append_event(event).await.unwrap(); - - // Check event count - assert_eq!(mock.event_count().await.unwrap(), 1); - - // Query events by build request - let events = mock.get_build_request_events(&build_id, None).await.unwrap(); - assert_eq!(events.len(), 1); - - // Clear events - mock.clear().await.unwrap(); - assert_eq!(mock.event_count().await.unwrap(), 0); - } - - #[tokio::test] - async fn test_mock_build_event_log_with_predefined_events() { - let build_id = "test-build-456".to_string(); - let partition = PartitionRef { str: "data/users".to_string() }; - - let events = vec![ - build_request_received(Some(build_id.clone()), vec![partition.clone()]), - partition_status(Some(build_id.clone()), partition.clone(), PartitionStatus::PartitionBuilding, None), - partition_status(Some(build_id.clone()), partition.clone(), PartitionStatus::PartitionAvailable, None), - ]; - - let mock = MockBuildEventLog::with_events(events).await.unwrap(); - - // Should have 3 events - assert_eq!(mock.event_count().await.unwrap(), 3); - - // Query partition events - let partition_events = mock.get_partition_events(&partition.str, None).await.unwrap(); - assert_eq!(partition_events.len(), 2); // Two partition events - - // Check latest partition status - let latest_status = mock.get_latest_partition_status(&partition.str).await.unwrap(); - assert!(latest_status.is_some()); - let (status, _timestamp) = latest_status.unwrap(); - assert_eq!(status, PartitionStatus::PartitionAvailable); - } - - #[tokio::test] - async fn test_event_constructors() { - let partition = PartitionRef { str: "test/data".to_string() }; - let job_label = JobLabel { label: "//:test_job".to_string() }; - - // Test build request event constructor - let br_event = build_request_received(None, vec![partition.clone()]); - assert!(matches!(br_event.event_type, Some(build_event::EventType::BuildRequestEvent(_)))); - - // Test partition event constructor - let p_event = partition_status(None, partition.clone(), PartitionStatus::PartitionAvailable, None); - assert!(matches!(p_event.event_type, Some(build_event::EventType::PartitionEvent(_)))); - - // Test job event constructor - let j_event = job_event(None, None, job_label, vec![partition], JobStatus::JobCompleted); - assert!(matches!(j_event.event_type, Some(build_event::EventType::JobEvent(_)))); - } -} - -/// MockBELStorage is a BELStorage implementation that wraps MockBuildEventLog -/// This allows us to use the real BELQueryEngine in tests while having control over the data -pub struct MockBELStorage { - mock_log: Arc, -} - -impl MockBELStorage { - pub async fn new() -> Result { - let mock_log = Arc::new(MockBuildEventLog::new().await?); - Ok(Self { mock_log }) - } - - pub async fn with_events(events: Vec) -> Result { - let mock_log = Arc::new(MockBuildEventLog::with_events(events).await?); - Ok(Self { mock_log }) - } -} - -#[async_trait] -impl BELStorage for MockBELStorage { - async fn append_event(&self, event: BuildEvent) -> Result { - self.mock_log.append_event(event).await?; - Ok(0) // Return dummy index for mock storage - } - - async fn list_events(&self, since_idx: i64, filter: EventFilter) -> Result { - // Get all events first (MockBELEventLog uses timestamps, so we get all events) - let mut events = self.mock_log.get_events_in_range(0, i64::MAX).await?; - - // Apply filtering based on EventFilter - events.retain(|event| { - - // Filter by build request IDs if specified - if !filter.build_request_ids.is_empty() { - if !filter.build_request_ids.contains(&event.build_request_id.clone().unwrap()) { - return false; - } - } - - // Filter by partition refs if specified - if !filter.partition_refs.is_empty() { - let has_matching_partition = match &event.event_type { - Some(build_event::EventType::PartitionEvent(pe)) => { - pe.partition_ref.as_ref() - .map(|pr| filter.partition_refs.contains(&pr.str)) - .unwrap_or(false) - } - Some(build_event::EventType::BuildRequestEvent(bre)) => { - bre.requested_partitions.iter() - .any(|pr| filter.partition_refs.contains(&pr.str)) - } - Some(build_event::EventType::JobEvent(je)) => { - je.target_partitions.iter() - .any(|pr| filter.partition_refs.contains(&pr.str)) - } - _ => false, - }; - if !has_matching_partition { - return false; - } - } - - // Filter by job labels if specified - if !filter.job_labels.is_empty() { - let has_matching_job = match &event.event_type { - Some(build_event::EventType::JobEvent(je)) => { - je.job_label.as_ref() - .map(|jl| filter.job_labels.contains(&jl.label)) - .unwrap_or(false) - } - _ => false, - }; - if !has_matching_job { - return false; - } - } - - // Filter by job run IDs if specified - if !filter.job_run_ids.is_empty() { - let has_matching_job_run = match &event.event_type { - Some(build_event::EventType::JobEvent(je)) => { - filter.job_run_ids.contains(&je.job_run_id) - } - Some(build_event::EventType::JobRunCancelEvent(jrce)) => { - filter.job_run_ids.contains(&jrce.job_run_id) - } - Some(build_event::EventType::PartitionEvent(pe)) => { - if pe.job_run_id.is_empty() { - false - } else { - filter.job_run_ids.contains(&pe.job_run_id) - } - } - // Add other job-run-related events here if they exist - _ => false, - }; - if !has_matching_job_run { - return false; - } - } - - true - }); - - Ok(EventPage { - events, - next_idx: since_idx + 1, // Simple increment for testing - has_more: false, // Simplify for testing - }) - } - - async fn initialize(&self) -> Result<()> { - self.mock_log.initialize().await - } -} - -/// Helper function to create a BELQueryEngine for testing with mock data -pub async fn create_mock_bel_query_engine() -> Result> { - let storage: Arc = Arc::new(MockBELStorage::new().await?); - Ok(Arc::new(BELQueryEngine::new(storage))) -} - -/// Helper function to create a BELQueryEngine for testing with predefined events -pub async fn create_mock_bel_query_engine_with_events(events: Vec) -> Result> { - let storage: Arc = Arc::new(MockBELStorage::with_events(events).await?); - Ok(Arc::new(BELQueryEngine::new(storage))) -} \ No newline at end of file diff --git a/databuild/event_log/mod.rs b/databuild/event_log/mod.rs deleted file mode 100644 index 4e173cf..0000000 --- a/databuild/event_log/mod.rs +++ /dev/null @@ -1,113 +0,0 @@ -use crate::*; -use std::error::Error as StdError; -use uuid::Uuid; - -pub mod writer; -pub mod mock; -pub mod storage; -pub mod sqlite_storage; -pub mod query_engine; - -#[derive(Debug)] -pub enum BuildEventLogError { - DatabaseError(String), - SerializationError(String), - ConnectionError(String), - QueryError(String), -} - -impl std::fmt::Display for BuildEventLogError { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - BuildEventLogError::DatabaseError(msg) => write!(f, "Database error: {}", msg), - BuildEventLogError::SerializationError(msg) => write!(f, "Serialization error: {}", msg), - BuildEventLogError::ConnectionError(msg) => write!(f, "Connection error: {}", msg), - BuildEventLogError::QueryError(msg) => write!(f, "Query error: {}", msg), - } - } -} - -impl StdError for BuildEventLogError {} - -pub type Result = std::result::Result; - -#[derive(Debug, Clone)] -pub struct QueryResult { - pub columns: Vec, - pub rows: Vec>, -} - -// Summary types for list endpoints -#[derive(Debug, Clone)] -pub struct BuildRequestSummary { - pub build_request_id: String, - pub status: BuildRequestStatus, - pub requested_partitions: Vec, - pub created_at: i64, - pub updated_at: i64, -} - -#[derive(Debug, Clone)] -pub struct PartitionSummary { - pub partition_ref: String, - pub status: PartitionStatus, - pub updated_at: i64, - pub build_request_id: Option, -} - -#[derive(Debug, Clone)] -pub struct ActivitySummary { - pub active_builds_count: u32, - pub recent_builds: Vec, - pub recent_partitions: Vec, - pub total_partitions_count: u32, -} - - -// Helper function to generate event ID -pub fn generate_event_id() -> String { - Uuid::new_v4().to_string() -} - -// Helper function to get current timestamp in nanoseconds -pub fn current_timestamp_nanos() -> i64 { - std::time::SystemTime::now() - .duration_since(std::time::UNIX_EPOCH) - .unwrap() - .as_nanos() as i64 -} - -// Helper function to create build event with metadata -pub fn create_build_event( - build_request_id: String, - event_type: crate::build_event::EventType, -) -> BuildEvent { - BuildEvent { - event_id: generate_event_id(), - timestamp: current_timestamp_nanos(), - build_request_id: Some(build_request_id.clone()), - event_type: Some(event_type), - } -} - - -// Parse build event log URI and create BEL query engine with appropriate storage backend -pub async fn create_bel_query_engine(uri: &str) -> Result> { - use std::sync::Arc; - use storage::BELStorage; - - if uri == "stdout" { - let storage: Arc = Arc::new(storage::StdoutBELStorage::new()); - storage.initialize().await?; - Ok(Arc::new(query_engine::BELQueryEngine::new(storage))) - } else if uri.starts_with("sqlite://") { - let path = &uri[9..]; // Remove "sqlite://" prefix - let storage: Arc = Arc::new(sqlite_storage::SqliteBELStorage::new(path)?); - storage.initialize().await?; - Ok(Arc::new(query_engine::BELQueryEngine::new(storage))) - } else { - Err(BuildEventLogError::ConnectionError( - format!("Unsupported build event log URI for BEL query engine: {}", uri) - )) - } -} \ No newline at end of file diff --git a/databuild/event_log/query_engine.rs b/databuild/event_log/query_engine.rs deleted file mode 100644 index 55733be..0000000 --- a/databuild/event_log/query_engine.rs +++ /dev/null @@ -1,389 +0,0 @@ -use super::*; -use super::storage::BELStorage; -use std::sync::Arc; -use std::collections::HashMap; - -/// App-layer aggregation that scans storage events -pub struct BELQueryEngine { - storage: Arc, -} - -impl BELQueryEngine { - pub fn new(storage: Arc) -> Self { - Self { storage } - } - - /// Get latest status for a partition by scanning recent events - pub async fn get_latest_partition_status(&self, partition_ref: &str) -> Result> { - let filter = EventFilter { - partition_refs: vec![partition_ref.to_string()], - partition_patterns: vec![], - job_labels: vec![], - job_run_ids: vec![], - build_request_ids: vec![], - }; - - let events = self.storage.list_events(0, filter).await?; - self.aggregate_partition_status(&events.events) - } - - /// Get all build requests that are currently building a partition - pub async fn get_active_builds_for_partition(&self, partition_ref: &str) -> Result> { - let filter = EventFilter { - partition_refs: vec![partition_ref.to_string()], - partition_patterns: vec![], - job_labels: vec![], - job_run_ids: vec![], - build_request_ids: vec![], - }; - - let events = self.storage.list_events(0, filter).await?; - let mut active_builds: Vec = Vec::new(); - let mut build_states: HashMap = HashMap::new(); - - // Process events chronologically to track build states - for event in events.events { - let build_request_id = event.build_request_id.clone().unwrap(); - match &event.event_type { - Some(crate::build_event::EventType::BuildRequestEvent(br_event)) => { - if let Ok(code) = BuildRequestStatusCode::try_from(br_event.clone().status.unwrap().code) { - build_states.insert(build_request_id.clone(), code); - } - } - Some(crate::build_event::EventType::PartitionEvent(p_event)) => { - if let Some(partition_event_ref) = &p_event.partition_ref { - if partition_event_ref.str == partition_ref { - // Check if this partition is actively being built - if let Ok(status) = PartitionStatus::try_from(p_event.status_code) { - if matches!(status, PartitionStatus::PartitionBuilding | PartitionStatus::PartitionAnalyzed) { - // Check if the build request is still active - if let Some(build_status) = build_states.get(&build_request_id) { - if matches!(build_status, - BuildRequestStatusCode::BuildRequestReceived | - BuildRequestStatusCode::BuildRequestPlanning | - BuildRequestStatusCode::BuildRequestExecuting | - BuildRequestStatusCode::BuildRequestAnalysisCompleted - ) { - if !active_builds.contains(&build_request_id) { - active_builds.push(build_request_id.clone()); - } - } - } - } - } - } - } - } - _ => {} - } - } - - Ok(active_builds) - } - - /// Get summary of a build request by aggregating its events - pub async fn get_build_request_summary(&self, build_id: &str) -> Result { - let filter = EventFilter { - partition_refs: vec![], - partition_patterns: vec![], - job_labels: vec![], - job_run_ids: vec![], - build_request_ids: vec![build_id.to_string()], - }; - - let events = self.storage.list_events(0, filter).await?; - - // If no events found, build doesn't exist - if events.events.is_empty() { - return Err(BuildEventLogError::QueryError(format!("Build request '{}' not found", build_id))); - } - - let mut status = BuildRequestStatusCode::BuildRequestUnknown.status(); - let mut requested_partitions = Vec::new(); - let mut created_at = 0i64; - let mut updated_at = 0i64; - - for event in events.events.iter().filter(|event| event.build_request_id.is_some()) { - if event.timestamp > 0 { - if created_at == 0 || event.timestamp < created_at { - created_at = event.timestamp; - } - if event.timestamp > updated_at { - updated_at = event.timestamp; - } - } - - if let Some(crate::build_event::EventType::BuildRequestEvent(br_event)) = &event.event_type { - if let Ok(event_status) = BuildRequestStatus::try_from(br_event.status.clone().unwrap()) { - status = event_status; - } - if !br_event.requested_partitions.is_empty() { - requested_partitions = br_event.requested_partitions.iter() - .map(|p| p.str.clone()) - .collect(); - } - } - } - - Ok(BuildRequestSummary { - build_request_id: build_id.to_string(), - status, - requested_partitions, - created_at, - updated_at, - }) - } - - /// List build requests with pagination and filtering - pub async fn list_build_requests(&self, request: BuildsListRequest) -> Result { - // For now, scan all events and aggregate - let filter = EventFilter { - partition_refs: vec![], - partition_patterns: vec![], - job_labels: vec![], - job_run_ids: vec![], - build_request_ids: vec![], - }; - - let events = self.storage.list_events(0, filter).await?; - let mut build_summaries: HashMap = HashMap::new(); - - // Aggregate by build request ID - for event in events.events.iter().filter(|event| event.build_request_id.is_some()) { - if let Some(crate::build_event::EventType::BuildRequestEvent(br_event)) = &event.event_type { - let build_id = &event.build_request_id.clone().unwrap(); - let entry = build_summaries.entry(build_id.clone()).or_insert_with(|| { - BuildRequestSummary { - build_request_id: build_id.clone(), - status: BuildRequestStatusCode::BuildRequestUnknown.status(), - requested_partitions: Vec::new(), - created_at: event.timestamp, - updated_at: event.timestamp, - } - }); - - if let Ok(status) = BuildRequestStatus::try_from(br_event.status.clone().unwrap()) { - entry.status = status; - } - entry.updated_at = event.timestamp.max(entry.updated_at); - if !br_event.requested_partitions.is_empty() { - entry.requested_partitions = br_event.requested_partitions.iter() - .map(|p| p.str.clone()) - .collect(); - } - } - } - - let mut builds: Vec<_> = build_summaries.into_values().collect(); - builds.sort_by(|a, b| b.created_at.cmp(&a.created_at)); // Most recent first - - // Apply status filter if provided - if let Some(status_filter) = &request.status_filter { - if let Ok(filter_status) = status_filter.parse::() { - if let Ok(status) = BuildRequestStatusCode::try_from(filter_status) { - builds.retain(|b| b.status.code == status as i32); - } - } - } - - let total_count = builds.len() as u32; - let offset = request.offset.unwrap_or(0) as usize; - let limit = request.limit.unwrap_or(50) as usize; - - let paginated_builds = builds.into_iter() - .skip(offset) - .take(limit) - .map(|summary| BuildSummary { - build_request_id: summary.build_request_id, - status: Some(summary.status), - requested_partitions: summary.requested_partitions.into_iter() - .map(|s| PartitionRef { str: s }) - .collect(), - total_jobs: 0, // TODO: Implement - completed_jobs: 0, // TODO: Implement - failed_jobs: 0, // TODO: Implement - cancelled_jobs: 0, // TODO: Implement - requested_at: summary.created_at, - started_at: None, // TODO: Implement - completed_at: None, // TODO: Implement - duration_ms: None, // TODO: Implement - cancelled: false, // TODO: Implement - comment: None, - }) - .collect(); - - Ok(BuildsListResponse { - builds: paginated_builds, - total_count, - has_more: (offset + limit) < total_count as usize, - }) - } - - /// Get activity summary for dashboard - pub async fn get_activity_summary(&self) -> Result { - let builds_response = self.list_build_requests(BuildsListRequest { - limit: Some(5), - offset: Some(0), - status_filter: None, - }).await?; - - let active_builds_count = builds_response.builds.iter() - .filter(|b| matches!( - BuildRequestStatusCode::try_from(b.status.clone().unwrap().code).unwrap_or(BuildRequestStatusCode::BuildRequestUnknown), - BuildRequestStatusCode::BuildRequestReceived | - BuildRequestStatusCode::BuildRequestPlanning | - BuildRequestStatusCode::BuildRequestExecuting | - BuildRequestStatusCode::BuildRequestAnalysisCompleted - )) - .count() as u32; - - let recent_builds = builds_response.builds.into_iter() - .map(|b| BuildRequestSummary { - build_request_id: b.build_request_id, - status: b.status.unwrap_or(BuildRequestStatusCode::BuildRequestUnknown.status()), - requested_partitions: b.requested_partitions.into_iter().map(|p| p.str).collect(), - created_at: b.requested_at, - updated_at: b.completed_at.unwrap_or(b.requested_at), - }) - .collect(); - - // For partitions, we'd need a separate implementation - let recent_partitions = Vec::new(); // TODO: Implement partition listing - - Ok(ActivitySummary { - active_builds_count, - recent_builds, - recent_partitions, - total_partitions_count: 0, // TODO: Implement - }) - } - - /// Helper to aggregate partition status from events - fn aggregate_partition_status(&self, events: &[BuildEvent]) -> Result> { - let mut latest_status = None; - let mut latest_timestamp = 0i64; - - // Look for the most recent partition event for this partition - for event in events { - if let Some(crate::build_event::EventType::PartitionEvent(p_event)) = &event.event_type { - if event.timestamp >= latest_timestamp { - if let Ok(status) = PartitionStatus::try_from(p_event.status_code) { - latest_status = Some(status); - latest_timestamp = event.timestamp; - } - } - } - } - - Ok(latest_status.map(|status| (status, latest_timestamp))) - } - - /// Get build request ID that created an available partition - pub async fn get_build_request_for_available_partition(&self, partition_ref: &str) -> Result> { - let filter = EventFilter { - partition_refs: vec![partition_ref.to_string()], - partition_patterns: vec![], - job_labels: vec![], - job_run_ids: vec![], - build_request_ids: vec![], - }; - - let events = self.storage.list_events(0, filter).await?; - - // Find the most recent PARTITION_AVAILABLE event - let mut latest_available_build_id = None; - let mut latest_timestamp = 0i64; - - for event in events.events { - if let Some(crate::build_event::EventType::PartitionEvent(p_event)) = &event.event_type { - if let Some(partition_event_ref) = &p_event.partition_ref { - if partition_event_ref.str == partition_ref { - if let Ok(status) = PartitionStatus::try_from(p_event.status_code) { - if status == PartitionStatus::PartitionAvailable && event.timestamp >= latest_timestamp { - latest_available_build_id = event.build_request_id.clone(); - latest_timestamp = event.timestamp; - } - } - } - } - } - } - - Ok(latest_available_build_id) - } - - /// Append an event to storage - pub async fn append_event(&self, event: BuildEvent) -> Result { - self.storage.append_event(event).await - } - - /// Get all events for a specific partition - pub async fn get_partition_events(&self, partition_ref: &str, _limit: Option) -> Result> { - let filter = EventFilter { - partition_refs: vec![partition_ref.to_string()], - partition_patterns: vec![], - job_labels: vec![], - job_run_ids: vec![], - build_request_ids: vec![], - }; - - let events = self.storage.list_events(0, filter).await?; - Ok(events.events) - } - - /// Execute a raw SQL query (for backwards compatibility) - pub async fn execute_query(&self, _query: &str) -> Result { - // TODO: Implement SQL query execution if needed - // For now, return empty result to avoid compilation errors - Ok(QueryResult { - columns: vec![], - rows: vec![], - }) - } - - /// Get all events in a timestamp range - pub async fn get_events_in_range(&self, _start: i64, _end: i64) -> Result> { - // TODO: Implement range filtering - // For now, get all events - let filter = EventFilter { - partition_refs: vec![], - partition_patterns: vec![], - job_labels: vec![], - job_run_ids: vec![], - build_request_ids: vec![], - }; - - let events = self.storage.list_events(0, filter).await?; - Ok(events.events) - } - - /// Get all events for a specific job run - pub async fn get_job_run_events(&self, job_run_id: &str) -> Result> { - let filter = EventFilter { - partition_refs: vec![], - partition_patterns: vec![], - job_labels: vec![], - job_run_ids: vec![job_run_id.to_string()], - build_request_ids: vec![], - }; - - let events = self.storage.list_events(0, filter).await?; - Ok(events.events) - } - - /// Get all events for a specific build request - pub async fn get_build_request_events(&self, build_request_id: &str, _limit: Option) -> Result> { - let filter = EventFilter { - partition_refs: vec![], - partition_patterns: vec![], - job_labels: vec![], - job_run_ids: vec![], - build_request_ids: vec![build_request_id.to_string()], - }; - - let events = self.storage.list_events(0, filter).await?; - Ok(events.events) - } -} - - diff --git a/databuild/event_log/sqlite_storage.rs b/databuild/event_log/sqlite_storage.rs deleted file mode 100644 index 11bea28..0000000 --- a/databuild/event_log/sqlite_storage.rs +++ /dev/null @@ -1,154 +0,0 @@ -use super::*; -use super::storage::BELStorage; -use async_trait::async_trait; -use rusqlite::{params, Connection}; -use std::path::Path; -use std::sync::{Arc, Mutex}; - -pub struct SqliteBELStorage { - connection: Arc>, -} - -impl SqliteBELStorage { - pub fn new(path: &str) -> Result { - // Create parent directory if it doesn't exist - if let Some(parent) = Path::new(path).parent() { - std::fs::create_dir_all(parent) - .map_err(|e| BuildEventLogError::ConnectionError( - format!("Failed to create directory {}: {}", parent.display(), e) - ))?; - } - - let conn = Connection::open(path) - .map_err(|e| BuildEventLogError::ConnectionError(e.to_string()))?; - - Ok(Self { - connection: Arc::new(Mutex::new(conn)), - }) - } -} - -#[async_trait] -impl BELStorage for SqliteBELStorage { - async fn append_event(&self, event: BuildEvent) -> Result { - let serialized = serde_json::to_string(&event) - .map_err(|e| BuildEventLogError::SerializationError(e.to_string()))?; - - let conn = self.connection.lock().unwrap(); - let _row_id = conn.execute( - "INSERT INTO build_events (event_data) VALUES (?)", - params![serialized], - ).map_err(|e| BuildEventLogError::DatabaseError(e.to_string()))?; - - Ok(conn.last_insert_rowid()) - } - - async fn list_events(&self, since_idx: i64, filter: EventFilter) -> Result { - let conn = self.connection.lock().unwrap(); - - // For simplicity in the initial implementation, we'll do basic filtering - // More sophisticated JSON path filtering can be added later if needed - let mut query = "SELECT rowid, event_data FROM build_events WHERE rowid > ?".to_string(); - let mut params_vec = vec![since_idx.to_string()]; - - // Add build request ID filter if provided - if !filter.build_request_ids.is_empty() { - query.push_str(" AND ("); - for (i, build_id) in filter.build_request_ids.iter().enumerate() { - if i > 0 { query.push_str(" OR "); } - query.push_str("JSON_EXTRACT(event_data, '$.build_request_id') = ?"); - params_vec.push(build_id.clone()); - } - query.push_str(")"); - } - - // Add ordering and pagination - query.push_str(" ORDER BY rowid ASC LIMIT 1000"); - - let mut stmt = conn.prepare(&query) - .map_err(|e| BuildEventLogError::QueryError(e.to_string()))?; - - // Convert params to rusqlite params - let param_refs: Vec<&dyn rusqlite::ToSql> = params_vec.iter() - .map(|p| p as &dyn rusqlite::ToSql) - .collect(); - - let rows = stmt.query_map(¶m_refs[..], |row| { - let rowid: i64 = row.get(0)?; - let event_data: String = row.get(1)?; - Ok((rowid, event_data)) - }).map_err(|e| BuildEventLogError::QueryError(e.to_string()))?; - - let mut events = Vec::new(); - let mut max_idx = since_idx; - - for row in rows { - let (rowid, event_data) = row.map_err(|e| BuildEventLogError::QueryError(e.to_string()))?; - - let event: BuildEvent = serde_json::from_str(&event_data) - .map_err(|e| BuildEventLogError::SerializationError(e.to_string()))?; - - // Apply additional filtering in memory for now - let mut include_event = true; - - if !filter.partition_refs.is_empty() { - include_event = false; - if let Some(event_type) = &event.event_type { - if let crate::build_event::EventType::PartitionEvent(pe) = event_type { - if let Some(partition_ref) = &pe.partition_ref { - if filter.partition_refs.contains(&partition_ref.str) { - include_event = true; - } - } - } - } - } - - if !filter.job_run_ids.is_empty() && include_event { - include_event = false; - if let Some(event_type) = &event.event_type { - if let crate::build_event::EventType::JobEvent(je) = event_type { - if filter.job_run_ids.contains(&je.job_run_id) { - include_event = true; - } - } - } - } - - if include_event { - events.push(event); - max_idx = rowid; - } - } - - let has_more = events.len() >= 1000; // If we got the max limit, there might be more - - Ok(EventPage { - events, - next_idx: max_idx, - has_more, - }) - } - - async fn initialize(&self) -> Result<()> { - let conn = self.connection.lock().unwrap(); - - conn.execute( - "CREATE TABLE IF NOT EXISTS build_events ( - rowid INTEGER PRIMARY KEY AUTOINCREMENT, - event_data TEXT NOT NULL - )", - [], - ).map_err(|e| BuildEventLogError::DatabaseError(e.to_string()))?; - - // Create index for efficient JSON queries - conn.execute( - "CREATE INDEX IF NOT EXISTS idx_build_request_id ON build_events( - JSON_EXTRACT(event_data, '$.build_request_id') - )", - [], - ).map_err(|e| BuildEventLogError::DatabaseError(e.to_string()))?; - - Ok(()) - } -} \ No newline at end of file diff --git a/databuild/event_log/storage.rs b/databuild/event_log/storage.rs deleted file mode 100644 index 1c104d0..0000000 --- a/databuild/event_log/storage.rs +++ /dev/null @@ -1,75 +0,0 @@ -use crate::*; -use async_trait::async_trait; -use super::Result; - -/// Simple stdout storage backend for debugging -pub struct StdoutBELStorage; - -impl StdoutBELStorage { - pub fn new() -> Self { - Self - } -} - -#[async_trait] -impl BELStorage for StdoutBELStorage { - async fn append_event(&self, event: BuildEvent) -> Result { - let json = serde_json::to_string(&event) - .map_err(|e| BuildEventLogError::SerializationError(e.to_string()))?; - - println!("BUILD_EVENT: {}", json); - Ok(0) // Return dummy index for stdout - } - - async fn list_events(&self, _since_idx: i64, _filter: EventFilter) -> Result { - // Stdout implementation doesn't support querying - Err(BuildEventLogError::QueryError( - "Stdout storage backend doesn't support querying".to_string() - )) - } - - async fn initialize(&self) -> Result<()> { - Ok(()) // Nothing to initialize for stdout - } -} - -/// Minimal append-only interface optimized for sequential scanning -#[async_trait] -pub trait BELStorage: Send + Sync { - /// Append a single event, returns the sequential index - async fn append_event(&self, event: BuildEvent) -> Result; - - /// List events with filtering, starting from a given index - async fn list_events(&self, since_idx: i64, filter: EventFilter) -> Result; - - /// Initialize storage backend (create tables, etc.) - async fn initialize(&self) -> Result<()>; -} - -/// Factory function to create storage backends from URI -pub async fn create_bel_storage(uri: &str) -> Result> { - if uri == "stdout" { - Ok(Box::new(StdoutBELStorage::new())) - } else if uri.starts_with("sqlite://") { - let path = &uri[9..]; // Remove "sqlite://" prefix - let storage = crate::event_log::sqlite_storage::SqliteBELStorage::new(path)?; - storage.initialize().await?; - Ok(Box::new(storage)) - } else if uri.starts_with("postgres://") { - // TODO: Implement PostgresBELStorage - Err(BuildEventLogError::ConnectionError( - "PostgreSQL storage backend not yet implemented".to_string() - )) - } else { - Err(BuildEventLogError::ConnectionError( - format!("Unsupported build event log URI: {}", uri) - )) - } -} - -/// Factory function to create query engine from URI -pub async fn create_bel_query_engine(uri: &str) -> Result> { - let storage = create_bel_storage(uri).await?; - let storage_arc = std::sync::Arc::from(storage); - Ok(std::sync::Arc::new(crate::event_log::query_engine::BELQueryEngine::new(storage_arc))) -} \ No newline at end of file diff --git a/databuild/event_log/writer.rs b/databuild/event_log/writer.rs deleted file mode 100644 index 329d305..0000000 --- a/databuild/event_log/writer.rs +++ /dev/null @@ -1,460 +0,0 @@ -use crate::*; -use crate::event_log::{BuildEventLogError, Result, create_build_event, current_timestamp_nanos, generate_event_id, query_engine::BELQueryEngine}; -use std::sync::Arc; -use log::debug; - -/// Common interface for writing events to the build event log with validation -pub struct EventWriter { - query_engine: Arc, -} - -impl EventWriter { - /// Create a new EventWriter with the specified query engine - pub fn new(query_engine: Arc) -> Self { - Self { query_engine } - } - - /// Append an event directly to the event log - pub async fn append_event(&self, event: BuildEvent) -> Result<()> { - self.query_engine.append_event(event).await.map(|_| ()) - } - - /// Get access to the underlying query engine for direct operations - pub fn query_engine(&self) -> &BELQueryEngine { - self.query_engine.as_ref() - } - - /// Request a new build for the specified partitions - pub async fn request_build( - &self, - build_request_id: String, - requested_partitions: Vec, - ) -> Result<()> { - debug!("Writing build request event for build: {}", build_request_id); - - let event = create_build_event( - build_request_id, - build_event::EventType::BuildRequestEvent(BuildRequestEvent { - status: Some(BuildRequestStatusCode::BuildRequestReceived.status()), - requested_partitions, - message: "Build request received".to_string(), - comment: None, - want_id: None, - }), - ); - - self.query_engine.append_event(event).await.map(|_| ()) - } - - /// Update build request status - pub async fn update_build_status( - &self, - build_request_id: String, - status: BuildRequestStatus, - message: String, - ) -> Result<()> { - debug!("Updating build status for {}: {:?}", build_request_id, status); - - let event = create_build_event( - build_request_id, - build_event::EventType::BuildRequestEvent(BuildRequestEvent { - status: Some(status), - requested_partitions: vec![], - message, - comment: None, - want_id: None, - }), - ); - - self.query_engine.append_event(event).await.map(|_| ()) - } - - /// Update build request status with partition list - pub async fn update_build_status_with_partitions( - &self, - build_request_id: String, - status: BuildRequestStatus, - requested_partitions: Vec, - message: String, - ) -> Result<()> { - debug!("Updating build status for {}: {:?}", build_request_id, status); - - let event = create_build_event( - build_request_id, - build_event::EventType::BuildRequestEvent(BuildRequestEvent { - status: Some(status), - requested_partitions, - message, - comment: None, - want_id: None, - }), - ); - - self.query_engine.append_event(event).await.map(|_| ()) - } - - /// Update partition status - pub async fn update_partition_status( - &self, - build_request_id: String, - partition_ref: PartitionRef, - status: PartitionStatus, - message: String, - job_run_id: Option, - ) -> Result<()> { - debug!("Updating partition status for {}: {:?}", partition_ref.str, status); - - let event = BuildEvent { - event_id: generate_event_id(), - timestamp: current_timestamp_nanos(), - build_request_id: Some(build_request_id), - event_type: Some(build_event::EventType::PartitionEvent(PartitionEvent { - partition_ref: Some(partition_ref), - status_code: status as i32, - status_name: status.to_display_string(), - message, - job_run_id: job_run_id.unwrap_or_default(), - })), - }; - - self.query_engine.append_event(event).await.map(|_| ()) - } - - /// Invalidate a partition with a reason - pub async fn invalidate_partition( - &self, - build_request_id: String, - partition_ref: PartitionRef, - reason: String, - ) -> Result<()> { - // First validate that the partition exists by checking its current status - let current_status = self.query_engine.get_latest_partition_status(&partition_ref.str).await?; - - if current_status.is_none() { - return Err(BuildEventLogError::QueryError( - format!("Cannot invalidate non-existent partition: {}", partition_ref.str) - )); - } - - let event = BuildEvent { - event_id: generate_event_id(), - timestamp: current_timestamp_nanos(), - build_request_id: Some(build_request_id), - event_type: Some(build_event::EventType::PartitionInvalidationEvent( - PartitionInvalidationEvent { - partition_ref: Some(partition_ref), - reason, - } - )), - }; - - self.query_engine.append_event(event).await.map(|_| ()) - } - - /// Schedule a job for execution - pub async fn schedule_job( - &self, - build_request_id: String, - job_run_id: String, - job_label: JobLabel, - target_partitions: Vec, - config: JobConfig, - ) -> Result<()> { - debug!("Scheduling job {} for partitions: {:?}", job_label.label, target_partitions); - - let event = BuildEvent { - event_id: generate_event_id(), - timestamp: current_timestamp_nanos(), - build_request_id: Some(build_request_id), - event_type: Some(build_event::EventType::JobEvent(JobEvent { - job_run_id, - job_label: Some(job_label), - target_partitions, - status_code: JobStatus::JobScheduled as i32, - status_name: JobStatus::JobScheduled.to_display_string(), - message: "Job scheduled for execution".to_string(), - config: Some(config), - manifests: vec![], - })), - }; - - self.query_engine.append_event(event).await.map(|_| ()) - } - - /// Update job status - pub async fn update_job_status( - &self, - build_request_id: String, - job_run_id: String, - job_label: JobLabel, - target_partitions: Vec, - status: JobStatus, - message: String, - manifests: Vec, - ) -> Result<()> { - debug!("Updating job {} status to {:?}", job_run_id, status); - - let event = BuildEvent { - event_id: generate_event_id(), - timestamp: current_timestamp_nanos(), - build_request_id: Some(build_request_id), - event_type: Some(build_event::EventType::JobEvent(JobEvent { - job_run_id, - job_label: Some(job_label), - target_partitions, - status_code: status as i32, - status_name: status.to_display_string(), - message, - config: None, - manifests, - })), - }; - - self.query_engine.append_event(event).await.map(|_| ()) - } - - /// Cancel a task (job run) with a reason - pub async fn cancel_task( - &self, - build_request_id: String, - job_run_id: String, - reason: String, - ) -> Result<()> { - // Validate that the job run exists and is in a cancellable state - let job_events = self.query_engine.get_job_run_events(&job_run_id).await?; - - if job_events.is_empty() { - return Err(BuildEventLogError::QueryError( - format!("Cannot cancel non-existent job run: {}", job_run_id) - )); - } - - // Find the latest job status - let latest_status = job_events.iter() - .rev() - .find_map(|e| match &e.event_type { - Some(build_event::EventType::JobEvent(job)) => Some(job.status_code), - _ => None, - }); - - match latest_status { - Some(status) if status == JobStatus::JobCompleted as i32 => { - return Err(BuildEventLogError::QueryError( - format!("Cannot cancel completed job run: {}", job_run_id) - )); - } - Some(status) if status == JobStatus::JobFailed as i32 => { - return Err(BuildEventLogError::QueryError( - format!("Cannot cancel failed job run: {}", job_run_id) - )); - } - Some(status) if status == JobStatus::JobCancelled as i32 => { - return Err(BuildEventLogError::QueryError( - format!("Job run already cancelled: {}", job_run_id) - )); - } - _ => {} - } - - let event = BuildEvent { - event_id: generate_event_id(), - timestamp: current_timestamp_nanos(), - build_request_id: Some(build_request_id), - event_type: Some(build_event::EventType::JobRunCancelEvent(JobRunCancelEvent { - job_run_id, - reason, - })), - }; - - self.query_engine.append_event(event).await.map(|_| ()) - } - - /// Cancel a build request with a reason - pub async fn cancel_build( - &self, - build_request_id: String, - reason: String, - ) -> Result<()> { - // Validate that the build exists and is in a cancellable state - let build_events = self.query_engine.get_build_request_events(&build_request_id, None).await?; - - if build_events.is_empty() { - return Err(BuildEventLogError::QueryError( - format!("Cannot cancel non-existent build: {}", build_request_id) - )); - } - - // Find the latest build status - let latest_status = build_events.iter() - .rev() - .find_map(|e| match &e.event_type { - Some(build_event::EventType::BuildRequestEvent(br)) => Some(br.clone().status.unwrap().code), - _ => None, - }); - - match latest_status { - Some(status) if status == BuildRequestStatusCode::BuildRequestCompleted as i32 => { - return Err(BuildEventLogError::QueryError( - format!("Cannot cancel completed build: {}", build_request_id) - )); - } - Some(status) if status == BuildRequestStatusCode::BuildRequestFailed as i32 => { - return Err(BuildEventLogError::QueryError( - format!("Cannot cancel failed build: {}", build_request_id) - )); - } - Some(status) if status == BuildRequestStatusCode::BuildRequestCancelled as i32 => { - return Err(BuildEventLogError::QueryError( - format!("Build already cancelled: {}", build_request_id) - )); - } - _ => {} - } - - let event = BuildEvent { - event_id: generate_event_id(), - timestamp: current_timestamp_nanos(), - build_request_id: Some(build_request_id.clone()), - event_type: Some(build_event::EventType::BuildCancelEvent(BuildCancelEvent { - reason, - })), - }; - - self.query_engine.append_event(event).await.map(|_| ())?; - - // Also emit a build request status update - self.update_build_status( - build_request_id, - BuildRequestStatusCode::BuildRequestCancelled.status(), - "Build cancelled by user".to_string(), - ).await - } - - /// Record a delegation event when a partition build is delegated to another build - pub async fn record_delegation( - &self, - build_request_id: String, - partition_ref: PartitionRef, - delegated_to_build_request_id: String, - message: String, - ) -> Result<()> { - debug!("Recording delegation of {} to build {}", partition_ref.str, delegated_to_build_request_id); - - let event = create_build_event( - build_request_id, - build_event::EventType::DelegationEvent(DelegationEvent { - partition_ref: Some(partition_ref), - delegated_to_build_request_id, - message, - }), - ); - - self.query_engine.append_event(event).await.map(|_| ()) - } - - /// Record the analyzed job graph - pub async fn record_job_graph( - &self, - build_request_id: String, - job_graph: JobGraph, - message: String, - ) -> Result<()> { - debug!("Recording job graph for build: {}", build_request_id); - - let event = BuildEvent { - event_id: generate_event_id(), - timestamp: current_timestamp_nanos(), - build_request_id: Some(build_request_id), - event_type: Some(build_event::EventType::JobGraphEvent(JobGraphEvent { - job_graph: Some(job_graph), - message, - })), - }; - - self.query_engine.append_event(event).await.map(|_| ()) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::event_log::mock::create_mock_bel_query_engine; - - #[tokio::test] - async fn test_event_writer_build_lifecycle() { - let query_engine = create_mock_bel_query_engine().await.unwrap(); - let writer = EventWriter::new(query_engine); - - let build_id = "test-build-123".to_string(); - let partitions = vec![PartitionRef { str: "test/partition".to_string() }]; - - // Test build request - writer.request_build(build_id.clone(), partitions.clone()).await.unwrap(); - - // Test status updates - writer.update_build_status( - build_id.clone(), - BuildRequestStatusCode::BuildRequestPlanning.status(), - "Starting planning".to_string(), - ).await.unwrap(); - - writer.update_build_status( - build_id.clone(), - BuildRequestStatusCode::BuildRequestExecuting.status(), - "Starting execution".to_string(), - ).await.unwrap(); - - writer.update_build_status( - build_id.clone(), - BuildRequestStatusCode::BuildRequestCompleted.status(), - "Build completed successfully".to_string(), - ).await.unwrap(); - } - - #[tokio::test] - async fn test_event_writer_partition_and_job() { - let query_engine = create_mock_bel_query_engine().await.unwrap(); - let writer = EventWriter::new(query_engine); - - let build_id = "test-build-456".to_string(); - let partition = PartitionRef { str: "data/users".to_string() }; - let job_run_id = "job-run-789".to_string(); - let job_label = JobLabel { label: "//:test_job".to_string() }; - - // Test partition status update - writer.update_partition_status( - build_id.clone(), - partition.clone(), - PartitionStatus::PartitionBuilding, - "Building partition".to_string(), - Some(job_run_id.clone()), - ).await.unwrap(); - - // Test job scheduling - let config = JobConfig { - outputs: vec![partition.clone()], - inputs: vec![], - args: vec!["test".to_string()], - env: std::collections::HashMap::new(), - }; - - writer.schedule_job( - build_id.clone(), - job_run_id.clone(), - job_label.clone(), - vec![partition.clone()], - config, - ).await.unwrap(); - - // Test job status update - writer.update_job_status( - build_id.clone(), - job_run_id, - job_label, - vec![partition], - JobStatus::JobCompleted, - "Job completed successfully".to_string(), - vec![], - ).await.unwrap(); - } -} \ No newline at end of file diff --git a/databuild/format_consistency_test.rs b/databuild/format_consistency_test.rs deleted file mode 100644 index bf000ef..0000000 --- a/databuild/format_consistency_test.rs +++ /dev/null @@ -1,143 +0,0 @@ -#[cfg(test)] -mod format_consistency_tests { - use super::*; - use crate::*; - use crate::repositories::partitions::PartitionsRepository; - use crate::event_log::mock::{create_mock_bel_query_engine_with_events, test_events}; - use std::sync::Arc; - - #[tokio::test] - async fn test_partitions_list_json_format_consistency() { - // Create test data - let build_id = "test-build-123".to_string(); - let partition1 = PartitionRef { str: "data/users".to_string() }; - let partition2 = PartitionRef { str: "data/orders".to_string() }; - - let events = vec![ - test_events::build_request_received(Some(build_id.clone()), vec![partition1.clone(), partition2.clone()]), - test_events::partition_status(Some(build_id.clone()), partition1.clone(), PartitionStatus::PartitionBuilding, None), - test_events::partition_status(Some(build_id.clone()), partition1.clone(), PartitionStatus::PartitionAvailable, None), - test_events::partition_status(Some(build_id.clone()), partition2.clone(), PartitionStatus::PartitionBuilding, None), - test_events::partition_status(Some(build_id.clone()), partition2.clone(), PartitionStatus::PartitionFailed, None), - ]; - - let query_engine = create_mock_bel_query_engine_with_events(events).await.unwrap(); - let repository = PartitionsRepository::new(query_engine); - - // Test the new unified protobuf format - let request = PartitionsListRequest { - limit: Some(10), - offset: None, - status_filter: None, - }; - - let response = repository.list_protobuf(request).await.unwrap(); - - // Serialize to JSON and verify structure - let json_value = serde_json::to_value(&response).unwrap(); - - // Verify top-level structure matches expected protobuf schema - assert!(json_value.get("partitions").is_some()); - assert!(json_value.get("total_count").is_some()); - assert!(json_value.get("has_more").is_some()); - - let partitions = json_value["partitions"].as_array().unwrap(); - assert_eq!(partitions.len(), 2); - - // Verify each partition has dual status fields - for partition in partitions { - assert!(partition.get("partition_ref").is_some()); - assert!(partition.get("status_code").is_some(), "Missing status_code field"); - assert!(partition.get("status_name").is_some(), "Missing status_name field"); - assert!(partition.get("last_updated").is_some()); - assert!(partition.get("builds_count").is_some()); - assert!(partition.get("invalidation_count").is_some()); - - // Verify status fields are consistent - let status_code = partition["status_code"].as_i64().unwrap(); - let status_name = partition["status_name"].as_str().unwrap(); - - // Map status codes to expected names - let expected_name = match status_code { - 1 => "requested", - 2 => "analyzed", - 3 => "building", - 4 => "available", - 5 => "failed", - 6 => "delegated", - _ => "unknown", - }; - - // Find the partition by status to verify correct mapping - if status_name == "available" { - assert_eq!(status_code, 4, "Available status should have code 4"); - } else if status_name == "failed" { - assert_eq!(status_code, 5, "Failed status should have code 5"); - } - } - - // Verify JSON serialization produces expected field names (snake_case for JSON) - let json_str = serde_json::to_string_pretty(&response).unwrap(); - assert!(json_str.contains("\"partitions\"")); - assert!(json_str.contains("\"total_count\"")); - assert!(json_str.contains("\"has_more\"")); - assert!(json_str.contains("\"partition_ref\"")); - assert!(json_str.contains("\"status_code\"")); - assert!(json_str.contains("\"status_name\"")); - assert!(json_str.contains("\"last_updated\"")); - assert!(json_str.contains("\"builds_count\"")); - assert!(json_str.contains("\"invalidation_count\"")); - - println!("✅ Partitions list JSON format test passed"); - println!("Sample JSON output:\n{}", json_str); - } - - #[tokio::test] - async fn test_status_conversion_utilities() { - use crate::status_utils::*; - - // Test PartitionStatus conversions - let status = PartitionStatus::PartitionAvailable; - assert_eq!(status.to_display_string(), "available"); - assert_eq!(PartitionStatus::from_display_string("available"), Some(status)); - - // Test JobStatus conversions - let job_status = JobStatus::JobCompleted; - assert_eq!(job_status.to_display_string(), "completed"); - assert_eq!(JobStatus::from_display_string("completed"), Some(job_status)); - - // Test BuildRequestStatus conversions - let build_status = BuildRequestStatusCode::BuildRequestCompleted.status(); - assert_eq!(build_status.name, "completed"); - - // Test invalid conversions - assert_eq!(PartitionStatus::from_display_string("invalid"), None); - - println!("✅ Status conversion utilities test passed"); - } - - #[test] - fn test_protobuf_response_helper_functions() { - use crate::status_utils::list_response_helpers::*; - - // Test PartitionSummary creation - let summary = create_partition_summary( - PartitionRef { str: "test/partition".to_string() }, - PartitionStatus::PartitionAvailable, - 1234567890, - 5, - 2, - Some("build-123".to_string()), - ); - - assert_eq!(summary.partition_ref, Some(PartitionRef { str: "test/partition".to_string() })); - assert_eq!(summary.status_code, 4); // PartitionAvailable = 4 - assert_eq!(summary.status_name, "available"); - assert_eq!(summary.last_updated, 1234567890); - assert_eq!(summary.builds_count, 5); - assert_eq!(summary.invalidation_count, 2); - assert_eq!(summary.last_successful_build, Some("build-123".to_string())); - - println!("✅ Protobuf response helper functions test passed"); - } -} \ No newline at end of file diff --git a/databuild/graph/BUILD.bazel b/databuild/graph/BUILD.bazel deleted file mode 100644 index dcbd589..0000000 --- a/databuild/graph/BUILD.bazel +++ /dev/null @@ -1,43 +0,0 @@ -load("@rules_rust//rust:defs.bzl", "rust_binary", "rust_library") - -exports_files([ - "rust_analyze_wrapper.sh.tpl", - "rust_execute_wrapper.sh.tpl", -]) - -rust_binary( - name = "execute", - srcs = ["execute.rs"], - edition = "2021", - visibility = ["//visibility:public"], - deps = [ - "//databuild", - "@crates//:clap", - "@crates//:crossbeam-channel", - "@crates//:log", - "@crates//:serde", - "@crates//:serde_json", - "@crates//:simple_logger", - "@crates//:tokio", - "@crates//:uuid", - ], -) - -rust_binary( - name = "analyze", - srcs = ["analyze.rs"], - edition = "2021", - visibility = ["//visibility:public"], - deps = [ - "//databuild", - "@crates//:clap", - "@crates//:crossbeam-channel", - "@crates//:log", - "@crates//:num_cpus", - "@crates//:serde", - "@crates//:serde_json", - "@crates//:simple_logger", - "@crates//:tokio", - "@crates//:uuid", - ], -) diff --git a/databuild/graph/README.md b/databuild/graph/README.md deleted file mode 100644 index a18e4e3..0000000 --- a/databuild/graph/README.md +++ /dev/null @@ -1,10 +0,0 @@ - -# DataBuild Graph - -## Entrypoints - -- `graph.build` - Build the requested partitions. -- `graph.analyze` - Calculate the `JobGraph` that would produce the requested partitions. -- `graph.mermaid` - Calculate a [mermaid](https://mermaid.js.org/syntax/flowchart.html) diagram describing the `JobGraph`. -- `graph.serve` - Run the databuild server for this graph. -- `graph.image` / `graph.load` - Build a deployable graph artifact and wrap it in a container. `load` registers the container locally. diff --git a/databuild/graph/analyze.rs b/databuild/graph/analyze.rs deleted file mode 100644 index 31f1f65..0000000 --- a/databuild/graph/analyze.rs +++ /dev/null @@ -1,652 +0,0 @@ -use std::collections::{HashMap, HashSet}; -use std::env; -use std::process::{Command, exit}; -use std::sync::{Arc, Mutex}; -use std::thread; -use log::{info, error}; -use simple_logger::SimpleLogger; -use clap::{Arg, Command as ClapCommand}; -use uuid::Uuid; -use databuild::*; -use databuild::event_log::{create_bel_query_engine, create_build_event}; -use databuild::mermaid_utils::generate_mermaid_diagram; - -// Configure a job to produce the desired outputs -fn configure(job_label: &str, output_refs: &[String]) -> Result, String> { - let candidate_jobs_str = env::var("DATABUILD_CANDIDATE_JOBS_CFG") - .map_err(|e| format!("Failed to get DATABUILD_CANDIDATE_JOBS_CFG: {}", e))?; - - let job_path_map: HashMap = serde_json::from_str(&candidate_jobs_str) - .map_err(|e| format!("Failed to parse DATABUILD_CANDIDATE_JOBS_CFG: {}", e))?; - - // Look up the executable path for this job - let exec_path = job_path_map.get(job_label) - .ok_or_else(|| format!("Job {} is not a candidate job", job_label))?; - - // Check if executable exists - if !std::path::Path::new(exec_path).exists() { - return Err(format!("Executable not found at path: {}", exec_path)); - } - - info!("Executing job configuration: {} {:?}", exec_path, output_refs); - - // Execute the job configuration command - let output = Command::new(exec_path) - .args(output_refs) - .output() - .map_err(|e| format!("Failed to execute job config: {}", e))?; - - if !output.status.success() { - let stderr = String::from_utf8_lossy(&output.stderr); - error!("Job configuration failed: {}", stderr); - return Err(format!("Failed to run job config: {}", stderr)); - } - - info!("Job configuration succeeded for {}", job_label); - - // Parse the job configurations - let stdout = String::from_utf8_lossy(&output.stdout); - let job_configure_response: JobConfigureResponse = serde_json::from_str(&stdout) - .map_err(|e| { - error!("Error parsing job configs for {}: {}. `{}`", job_label, e, stdout); - format!("Failed to parse job configs: {}", e) - })?; - let job_configs = job_configure_response.configs; - - // Create tasks - let tasks: Vec = job_configs.into_iter() - .map(|cfg| Task { - job: Some(JobLabel { label: job_label.to_string() }), - config: Some(cfg), - }) - .collect(); - - info!("Created {} tasks for job {}", tasks.len(), job_label); - Ok(tasks) -} - -// Resolve produces a mapping of required job refs to the partitions it produces -fn resolve(output_refs: &[String]) -> Result>, String> { - let lookup_path = env::var("DATABUILD_JOB_LOOKUP_PATH") - .map_err(|e| format!("Failed to get DATABUILD_JOB_LOOKUP_PATH: {}", e))?; - - // Run the job lookup - info!("Executing job lookup: {} {:?}", lookup_path, output_refs); - - let output = Command::new(&lookup_path) - .args(output_refs) - .output() - .map_err(|e| format!("Failed to execute job lookup: {}", e))?; - - if !output.status.success() { - error!("Job lookup failed: {}", output.status); - let stderr = String::from_utf8_lossy(&output.stderr); - error!("stderr: {}", stderr); - let stdout = String::from_utf8_lossy(&output.stdout); - error!("stdout: {}", stdout); - return Err(format!("Failed to run job lookup: {}", stderr)); - } - - info!("Job lookup succeeded for {} output refs", output_refs.len()); - - // Parse the result - let stdout = String::from_utf8_lossy(&output.stdout); - let result: HashMap> = serde_json::from_str(&stdout) - .map_err(|e| { - error!("Error parsing job lookup result: {}", e); - format!("Failed to parse job lookup result: {}", e) - })?; - - info!("Job lookup found {} job mappings", result.len()); - for (job, refs) in &result { - info!(" Job {} produces {} refs", job, refs.len()); - } - - Ok(result) -} - -// Configure multiple jobs in parallel -fn configure_parallel(job_refs: HashMap>, num_workers: usize) -> Result, String> { - // Create a channel for jobs - let (job_sender, job_receiver) = crossbeam_channel::unbounded(); - - // Fill the jobs channel - for (job_label, produced_refs) in job_refs { - job_sender.send((job_label, produced_refs)).unwrap(); - } - drop(job_sender); // Close the channel - - // Create a channel for results - let (task_sender, task_receiver) = crossbeam_channel::unbounded(); - let error = Arc::new(Mutex::new(None)); - - // Spawn worker threads - let mut handles = vec![]; - for _ in 0..num_workers { - let job_receiver = job_receiver.clone(); - let task_sender = task_sender.clone(); - let error = Arc::clone(&error); - - let handle = thread::spawn(move || { - for (job_label, produced_refs) in job_receiver { - // Check if an error has already occurred - if error.lock().unwrap().is_some() { - return; - } - - match configure(&job_label, &produced_refs) { - Ok(tasks) => { - task_sender.send(tasks).unwrap(); - } - Err(e) => { - let mut error_guard = error.lock().unwrap(); - if error_guard.is_none() { - *error_guard = Some(e); - } - return; - } - } - } - }); - - handles.push(handle); - } - - // Close the task sender - drop(task_sender); - - // Wait for all workers to finish - for handle in handles { - handle.join().unwrap(); - } - - // Check for errors - let error_guard = error.lock().unwrap(); - if let Some(e) = &*error_guard { - return Err(e.clone()); - } - - // Collect results - let mut all_tasks = Vec::new(); - while let Ok(tasks) = task_receiver.try_recv() { - all_tasks.extend(tasks); - } - - Ok(all_tasks) -} - -// Simple staleness check - all requested partitions need jobs created -// Delegation optimization happens in execution phase -async fn check_partition_staleness( - partition_refs: &[String], - _query_engine: &std::sync::Arc, - _build_request_id: &str -) -> Result<(Vec, Vec), String> { - // Analysis phase creates jobs for all requested partitions - // Execution phase will handle delegation optimization - let stale_partitions = partition_refs.to_vec(); - let delegated_partitions = Vec::new(); - - Ok((stale_partitions, delegated_partitions)) -} - -// Plan creates a job graph for given output references -async fn plan( - output_refs: &[String], - query_engine: Option>, - build_request_id: &str -) -> Result { - info!("Starting planning for {} output refs: {:?}", output_refs.len(), output_refs); - - // Log build request received event - if let Some(ref query_engine_ref) = query_engine { - let event = create_build_event( - build_request_id.to_string(), - crate::build_event::EventType::BuildRequestEvent(BuildRequestEvent { - status: Some(BuildRequestStatusCode::BuildRequestReceived.status()), - requested_partitions: output_refs.iter().map(|s| PartitionRef { str: s.clone() }).collect(), - message: "Analysis started".to_string(), - comment: None, - want_id: None, - }) - ); - if let Err(e) = query_engine_ref.append_event(event).await { - error!("Failed to log build request event: {}", e); - } - } - - // Check for partition staleness and delegation opportunities - let (stale_refs, _delegated_refs) = if let Some(ref query_engine_ref) = query_engine { - match check_partition_staleness(output_refs, query_engine_ref, build_request_id).await { - Ok((stale, delegated)) => { - info!("Staleness check: {} stale, {} delegated partitions", stale.len(), delegated.len()); - (stale, delegated) - } - Err(e) => { - error!("Failed to check partition staleness: {}", e); - // Fall back to building all partitions - (output_refs.to_vec(), Vec::new()) - } - } - } else { - // No event log, build all partitions - (output_refs.to_vec(), Vec::new()) - }; - - // Only plan for stale partitions that need to be built - let mut unhandled_refs = HashSet::new(); - for ref_str in &stale_refs { - unhandled_refs.insert(ref_str.clone()); - } - - // Note: Partition analysis events will be logged after successful job graph creation - - let mut epoch = 0; - let mut nodes = Vec::new(); - - // Determine the number of workers based on available CPU cores or environment variable - let mut num_workers = num_cpus::get(); - if let Ok(worker_env) = env::var("DATABUILD_PARALLEL_WORKERS") { - if let Ok(parsed_workers) = worker_env.parse::() { - if parsed_workers < 1 { - num_workers = 1; - info!("Warning: DATABUILD_PARALLEL_WORKERS must be at least 1, using: {}", num_workers); - } else { - num_workers = parsed_workers; - } - } else { - info!("Warning: Invalid DATABUILD_PARALLEL_WORKERS value '{}', using default: {}", worker_env, num_workers); - } - } - info!("Using {} workers for parallel execution", num_workers); - - // Log planning phase start - if let Some(ref query_engine_ref) = query_engine { - let event = create_build_event( - build_request_id.to_string(), - crate::build_event::EventType::BuildRequestEvent(BuildRequestEvent { - status: Some(BuildRequestStatusCode::BuildRequestPlanning.status()), - requested_partitions: output_refs.iter().map(|s| PartitionRef { str: s.clone() }).collect(), - message: "Graph analysis in progress".to_string(), - comment: None, - want_id: None, - }) - ); - if let Err(e) = query_engine_ref.append_event(event).await { - error!("Failed to log planning event: {}", e); - } - } - - while !unhandled_refs.is_empty() { - if epoch >= 1000 { - error!("Planning timeout: still planning after {} epochs, giving up", epoch); - return Err(format!("Still planning after {} epochs, giving up", epoch)); - } - - info!("Planning epoch {} with {} unhandled refs", epoch, unhandled_refs.len()); - - // Resolve jobs for all unhandled refs - let unhandled_refs_list: Vec = unhandled_refs.iter().cloned().collect(); - let job_refs = resolve(&unhandled_refs_list)?; - - // Configure jobs in parallel - let new_nodes = configure_parallel(job_refs.clone(), num_workers)?; - - // Remove handled refs - for (_, produced_refs) in job_refs { - for ref_str in produced_refs { - unhandled_refs.remove(&ref_str); - } - } - - if !unhandled_refs.is_empty() { - error!("Error: Still have unhandled refs after configuration phase: {:?}", unhandled_refs); - return Err(format!("Should have no unhandled refs after configuration phase, but had: {:?}", unhandled_refs)); - } - - epoch += 1; - - // Add new nodes to the graph - nodes.extend(new_nodes.clone()); - info!("Planning epoch {} completed: added {} new nodes, total nodes: {}", epoch, new_nodes.len(), nodes.len()); - - // Plan next epoch - let mut new_unhandled_count = 0; - for task in &new_nodes { - for input in &task.config.as_ref().unwrap().inputs { - if input.dep_type_code == 1 { // MATERIALIZE = 1 - if !unhandled_refs.contains(&input.partition_ref.as_ref().unwrap().str) { - new_unhandled_count += 1; - } - unhandled_refs.insert(input.partition_ref.as_ref().unwrap().str.clone()); - } - } - } - - if new_unhandled_count > 0 { - info!("Added {} new unhandled refs for next planning epoch", new_unhandled_count); - } - } - - if !nodes.is_empty() { - info!("Planning complete: created graph with {} nodes for {} output refs", nodes.len(), output_refs.len()); - - // Log analysis completion event - if let Some(ref query_engine) = query_engine { - let event = create_build_event( - build_request_id.to_string(), - crate::build_event::EventType::BuildRequestEvent(BuildRequestEvent { - status: Some(BuildRequestStatusCode::BuildRequestAnalysisCompleted.status()), - requested_partitions: output_refs.iter().map(|s| PartitionRef { str: s.clone() }).collect(), - message: format!("Analysis completed successfully, {} tasks planned", nodes.len()), - comment: None, - want_id: None, - }) - ); - if let Err(e) = query_engine.append_event(event).await { - error!("Failed to log analysis completion event: {}", e); - } - - // Store the job graph as an event in the build event log - let job_graph = JobGraph { - label: Some(GraphLabel { label: "analyzed_graph".to_string() }), - outputs: output_refs.iter().map(|s| PartitionRef { str: s.clone() }).collect(), - nodes: nodes.clone(), - }; - - let job_graph_event = create_build_event( - build_request_id.to_string(), - crate::build_event::EventType::JobGraphEvent(JobGraphEvent { - job_graph: Some(job_graph), - message: format!("Job graph analysis completed with {} tasks", nodes.len()), - }), - ); - if let Err(e) = query_engine.append_event(job_graph_event).await { - error!("Failed to log job graph event: {}", e); - } - } - - Ok(JobGraph { - label: Some(GraphLabel { label: "analyzed_graph".to_string() }), - outputs: output_refs.iter().map(|s| PartitionRef { str: s.clone() }).collect(), - nodes, - }) - } else { - error!("Planning failed: no nodes created for output refs {:?}", output_refs); - - // Log planning failure - if let Some(ref query_engine) = query_engine { - let event = create_build_event( - build_request_id.to_string(), - crate::build_event::EventType::BuildRequestEvent(BuildRequestEvent { - status: Some(BuildRequestStatusCode::BuildRequestFailed.status()), - requested_partitions: output_refs.iter().map(|s| PartitionRef { str: s.clone() }).collect(), - message: "No jobs found for requested partitions".to_string(), - comment: None, - want_id: None, - }) - ); - if let Err(e) = query_engine.append_event(event).await { - error!("Failed to log failure event: {}", e); - } - } - - Err("Unknown failure in graph planning".to_string()) - } -} - -// Generate a Mermaid flowchart diagram from a job graph -// fn generate_mermaid_diagram(graph: &JobGraph) -> String { -// // Start the mermaid flowchart -// let mut mermaid = String::from("flowchart TD\n"); -// -// // Track nodes we've already added to avoid duplicates -// let mut added_nodes = HashSet::new(); -// let mut added_refs = HashSet::new(); -// -// // Map to track which refs are outputs (to highlight them) -// let mut is_output_ref = HashSet::new(); -// for ref_str in &graph.outputs { -// is_output_ref.insert(ref_str.str.clone()); -// } -// -// // Process each task in the graph -// for task in &graph.nodes { -// // Create a unique ID for this job+outputs combination -// let outputs_strs: Vec = task.config.as_ref().unwrap().outputs.iter().map(|o| o.str.clone()).collect(); -// let outputs_key = outputs_strs.join("_"); -// let mut job_node_id = format!("job_{}", task.job.as_ref().unwrap().label.replace("//", "_")); -// job_node_id = job_node_id.replace(":", "_").replace("=", "_").replace("?", "_").replace(" ", "_"); -// job_node_id = format!("{}_{}", job_node_id, outputs_key.replace("/", "_").replace("=", "_")); -// -// // Create a descriptive label that includes both job label and outputs -// let job_label = &task.job.as_ref().unwrap().label; -// let outputs_label = if !task.config.as_ref().unwrap().outputs.is_empty() { -// if task.config.as_ref().unwrap().outputs.len() == 1 { -// format!(" [{}]", task.config.as_ref().unwrap().outputs[0].str) -// } else { -// format!(" [{}, ...]", task.config.as_ref().unwrap().outputs[0].str) -// } -// } else { -// String::new() -// }; -// -// // Add the job node if not already added -// if !added_nodes.contains(&job_node_id) { -// // Represent job as a process shape with escaped label -// mermaid.push_str(&format!( -// " {}[\"`**{}** {}`\"]:::job\n", -// job_node_id, -// job_label, -// outputs_label -// )); -// added_nodes.insert(job_node_id.clone()); -// } -// -// // Process inputs (dependencies) -// for input in &task.config.as_ref().unwrap().inputs { -// let ref_node_id = format!("ref_{}", input.partition_ref.as_ref().unwrap().str.replace("/", "_").replace("=", "_")); -// -// // Add the partition ref node if not already added -// if !added_refs.contains(&ref_node_id) { -// let node_class = if is_output_ref.contains(&input.partition_ref.as_ref().unwrap().str) { -// "outputPartition" -// } else { -// "partition" -// }; -// -// // Represent partition as a cylinder -// mermaid.push_str(&format!( -// " {}[(\"{}\")]:::{}\n", -// ref_node_id, -// input.partition_ref.as_ref().unwrap().str.replace("/", "_").replace("=", "_"), -// node_class -// )); -// added_refs.insert(ref_node_id.clone()); -// } -// -// // Add the edge from input to job -// if input.dep_type == 1 { // MATERIALIZE = 1 -// // Solid line for materialize dependencies -// mermaid.push_str(&format!(" {} --> {}\n", ref_node_id, job_node_id)); -// } else { -// // Dashed line for query dependencies -// mermaid.push_str(&format!(" {} -.-> {}\n", ref_node_id, job_node_id)); -// } -// } -// -// // Process outputs -// for output in &task.config.as_ref().unwrap().outputs { -// let ref_node_id = format!("ref_{}", output.str.replace("/", "_").replace("=", "_")); -// -// // Add the partition ref node if not already added -// if !added_refs.contains(&ref_node_id) { -// let node_class = if is_output_ref.contains(&output.str) { -// "outputPartition" -// } else { -// "partition" -// }; -// -// // Represent partition as a cylinder -// mermaid.push_str(&format!( -// " {}[(\"Partition: {}\")]:::{}\n", -// ref_node_id, -// output.str, -// node_class -// )); -// added_refs.insert(ref_node_id.clone()); -// } -// -// // Add the edge from job to output -// mermaid.push_str(&format!(" {} --> {}\n", job_node_id, ref_node_id)); -// } -// } -// -// // Add styling -// mermaid.push_str("\n %% Styling\n"); -// mermaid.push_str(" classDef job fill:#f9f,stroke:#333,stroke-width:1px;\n"); -// mermaid.push_str(" classDef partition fill:#bbf,stroke:#333,stroke-width:1px;\n"); -// mermaid.push_str(" classDef outputPartition fill:#bfb,stroke:#333,stroke-width:2px;\n"); -// -// mermaid -// } - -#[tokio::main] -async fn main() { - // Initialize logger - SimpleLogger::new().init().unwrap(); - - let mode = env::var("DATABUILD_MODE").unwrap_or_else(|_| "unknown".to_string()); - info!("Starting analyze.rs in mode: {}", mode); - - // Parse command line arguments (only for partition references) - let matches = ClapCommand::new("analyze") - .version("1.0") - .about("DataBuild graph analysis tool") - .arg( - Arg::new("partitions") - .help("Partition references to analyze") - .required(false) - .num_args(0..) - .value_name("PARTITIONS") - ) - .get_matches(); - - let args: Vec = matches.get_many::("partitions") - .unwrap_or_default() - .cloned() - .collect(); - - // Validate arguments based on mode - match mode.as_str() { - "plan" | "mermaid" => { - if args.is_empty() { - error!("Error: Partition references are required for {} mode", mode); - eprintln!("Error: Partition references are required for {} mode", mode); - exit(1); - } - } - "import_test" => { - // No partition arguments needed for test mode - } - _ => { - // Unknown mode, will be handled later - } - } - - // Get build event log configuration from environment variables - let build_event_log_uri = env::var("DATABUILD_BUILD_EVENT_LOG").ok(); - let build_request_id = env::var("DATABUILD_BUILD_REQUEST_ID") - .unwrap_or_else(|_| Uuid::new_v4().to_string()); - - // Initialize build event log if provided - let query_engine = if let Some(uri) = build_event_log_uri { - match create_bel_query_engine(&uri).await { - Ok(engine) => { - info!("Initialized build event log: {}", uri); - Some(engine) - } - Err(e) => { - error!("Failed to initialize build event log {}: {}", uri, e); - exit(1); - } - } - } else { - None - }; - - - match mode.as_str() { - "plan" => { - // Get output refs from command line arguments - match plan(&args, query_engine, &build_request_id).await { - Ok(graph) => { - // Output the job graph as JSON - match serde_json::to_string(&graph) { - Ok(json_data) => { - info!("Successfully generated job graph with {} nodes", graph.nodes.len()); - println!("{}", json_data); - } - Err(e) => { - error!("Error marshaling job graph: {}", e); - eprintln!("Error marshaling job graph: {}", e); - exit(1); - } - } - } - Err(e) => { - eprintln!("Error: {}", e); - exit(1); - } - } - } - "lookup" => { - // Get output refs from command line arguments - match resolve(&args) { - Ok(result) => { - // Output the result as JSON - match serde_json::to_string(&result) { - Ok(json_data) => { - info!("Successfully completed lookup for {} output refs with {} job mappings", args.len(), result.len()); - println!("{}", json_data); - } - Err(e) => { - error!("Error marshaling lookup result: {}", e); - eprintln!("Error marshaling lookup result: {}", e); - exit(1); - } - } - } - Err(e) => { - eprintln!("Error: {}", e); - exit(1); - } - } - } - "mermaid" => { - // Get output refs from command line arguments - match plan(&args, None, &build_request_id).await { - Ok(graph) => { - // Generate and output the mermaid diagram - let mermaid_diagram = generate_mermaid_diagram(&graph); - println!("{}", mermaid_diagram); - info!("Successfully generated mermaid diagram for {} nodes", graph.nodes.len()); - } - Err(e) => { - eprintln!("Error: {}", e); - exit(1); - } - } - } - "import_test" => { - info!("Running in import_test mode"); - println!("ok :)"); - info!("Import test completed successfully"); - } - _ => { - error!("Error: Unknown mode '{}'", mode); - eprintln!("Unknown MODE `{}`", mode); - exit(1); - } - } -} \ No newline at end of file diff --git a/databuild/graph/execute.rs b/databuild/graph/execute.rs deleted file mode 100644 index 7d962d7..0000000 --- a/databuild/graph/execute.rs +++ /dev/null @@ -1,817 +0,0 @@ -use databuild::{JobGraph, Task, JobStatus, BuildRequestStatus, BuildRequestStatusCode, PartitionStatus, BuildRequestEvent, JobEvent, PartitionEvent, PartitionRef}; -use databuild::event_log::{create_bel_query_engine, create_build_event}; -use databuild::build_event::EventType; -use databuild::log_collector::{LogCollector, LogCollectorError}; -use crossbeam_channel::{Receiver, Sender}; -use log::{debug, error, info, warn}; -use std::collections::{HashMap, HashSet}; -use std::io::{BufReader, Read, Write}; -use std::path::{Path, PathBuf}; -use std::process::{Command, Stdio}; -use std::sync::Arc; -use std::{env, thread}; -use std::time::{Duration, Instant}; -// Command line parsing removed - using environment variables -use uuid::Uuid; - -const NUM_WORKERS: usize = 4; -const LOG_INTERVAL: Duration = Duration::from_secs(5); -const FAIL_FAST: bool = true; // Same default as the Go version - -#[derive(Debug, Clone, PartialEq, Eq)] -enum TaskState { - Pending, - Running, - Succeeded, - Failed, -} - -#[derive(Debug, Clone)] -struct TaskExecutionResult { - task_key: String, - job_label: String, // For logging - success: bool, - stdout: String, - stderr: String, - duration: Duration, - error_message: Option, -} - -// Generates a unique key for a task based on its JobLabel, input and output references. -// Mirrors the Go implementation's getTaskKey. -fn get_task_key(task: &Task) -> String { - let mut key_parts = Vec::new(); - key_parts.push(task.job.as_ref().unwrap().label.clone()); - - for input_dep in &task.config.as_ref().unwrap().inputs { - key_parts.push(format!("input:{}", input_dep.partition_ref.as_ref().unwrap().str)); - } - for output_ref in &task.config.as_ref().unwrap().outputs { - key_parts.push(format!("output:{}", output_ref.str)); - } - key_parts.join("|") -} - -fn worker( - task_rx: Receiver>, - result_tx: Sender, - worker_id: usize, -) { - info!("[Worker {}] Starting", worker_id); - while let Ok(task) = task_rx.recv() { - let task_key = get_task_key(&task); - info!("[Worker {}] Starting job: {} (Key: {})", worker_id, task.job.as_ref().unwrap().label, task_key); - let start_time = Instant::now(); - - let candidate_jobs_str = env::var("DATABUILD_CANDIDATE_JOBS_EXEC") - .map_err(|e| format!("Failed to get DATABUILD_CANDIDATE_JOBS_EXEC: {}", e)).unwrap(); - - let job_path_map: HashMap = serde_json::from_str(&candidate_jobs_str) - .map_err(|e| format!("Failed to parse DATABUILD_CANDIDATE_JOBS_EXEC: {}", e)).unwrap(); - - // Look up the executable path for this job - let job_label = &task.job.as_ref().unwrap().label; - let exec_path = job_path_map.get(job_label) - .ok_or_else(|| format!("Job {} is not a candidate job", job_label)).unwrap(); - - let config_json = match serde_json::to_string(&task.config.as_ref().unwrap()) { - Ok(json) => json, - Err(e) => { - let err_msg = format!("Failed to serialize task config for {}: {}", task.job.as_ref().unwrap().label, e); - error!("[Worker {}] {}", worker_id, err_msg); - result_tx - .send(TaskExecutionResult { - task_key, - job_label: task.job.as_ref().unwrap().label.clone(), - success: false, - stdout: String::new(), - stderr: err_msg.clone(), - duration: start_time.elapsed(), - error_message: Some(err_msg), - }) - .unwrap_or_else(|e| error!("[Worker {}] Failed to send error result: {}", worker_id, e)); - continue; - } - }; - - // Generate a job run ID for this execution - let job_run_id = Uuid::new_v4().to_string(); - - info!("Running job {} (Path: {}) with config: {}", job_label, exec_path, config_json); - let mut cmd = Command::new(&exec_path); - cmd.stdin(Stdio::piped()) - .stdout(Stdio::piped()) - .stderr(Stdio::piped()); - - // Set environment variables from the current process's environment - // This mirrors the Go `cmd.Env = os.Environ()` behavior. - // Task-specific env vars from task.config.env are passed via JSON through stdin. - cmd.env_clear(); // Start with no environment variables - for (key, value) in std::env::vars() { - cmd.env(key, value); // Add current process's environment variables - } - - // Add the job run ID so the job wrapper can use the same ID - cmd.env("DATABUILD_JOB_RUN_ID", &job_run_id); - - match cmd.spawn() { - Ok(mut child) => { - if let Some(mut child_stdin) = child.stdin.take() { - if let Err(e) = child_stdin.write_all(config_json.as_bytes()) { - let err_msg = format!("[Worker {}] Failed to write to stdin for {}: {}", worker_id, task.job.as_ref().unwrap().label, e); - error!("{}", err_msg); - // Ensure child is killed if stdin write fails before wait - let _ = child.kill(); - let _ = child.wait(); // Reap the child - result_tx.send(TaskExecutionResult { - task_key, - job_label: task.job.as_ref().unwrap().label.clone(), - success: false, - stdout: String::new(), - stderr: err_msg.clone(), - duration: start_time.elapsed(), - error_message: Some(err_msg), - }) - .unwrap_or_else(|e| error!("[Worker {}] Failed to send error result: {}", worker_id, e)); - continue; - } - drop(child_stdin); // Close stdin to signal EOF to the child - } else { - let err_msg = format!("[Worker {}] Failed to get stdin for {}", worker_id, task.job.as_ref().unwrap().label); - error!("{}", err_msg); - result_tx.send(TaskExecutionResult { - task_key, - job_label: task.job.as_ref().unwrap().label.clone(), - success: false, - stdout: String::new(), - stderr: err_msg.clone(), - duration: start_time.elapsed(), - error_message: Some(err_msg), - }) - .unwrap_or_else(|e| error!("[Worker {}] Failed to send error result: {}", worker_id, e)); - continue; - } - - // Initialize log collector - let mut log_collector = match LogCollector::new(LogCollector::default_logs_dir()) { - Ok(mut collector) => { - // Set the job label mapping for this job run - collector.set_job_label(&job_run_id, &task.job.as_ref().unwrap().label); - collector - }, - Err(e) => { - let err_msg = format!("[Worker {}] Failed to initialize log collector for {}: {}", - worker_id, task.job.as_ref().unwrap().label, e); - error!("{}", err_msg); - result_tx - .send(TaskExecutionResult { - task_key, - job_label: task.job.as_ref().unwrap().label.clone(), - success: false, - stdout: String::new(), - stderr: err_msg.clone(), - duration: start_time.elapsed(), - error_message: Some(err_msg), - }) - .unwrap_or_else(|e| error!("[Worker {}] Failed to send error result: {}", worker_id, e)); - continue; - } - }; - - // Collect stdout/stderr and process with LogCollector - let stdout_handle = child.stdout.take(); - let stderr_handle = child.stderr.take(); - - let mut stdout_content = String::new(); - let mut stderr_content = String::new(); - - // Read stdout and process with LogCollector - if let Some(stdout) = stdout_handle { - let stdout_reader = BufReader::new(stdout); - if let Err(e) = log_collector.consume_job_output(&job_run_id, stdout_reader) { - warn!("[Worker {}] Failed to process job logs for {}: {}", - worker_id, task.job.as_ref().unwrap().label, e); - } - } - - // Read stderr (raw, not structured) - if let Some(mut stderr) = stderr_handle { - if let Err(e) = stderr.read_to_string(&mut stderr_content) { - warn!("[Worker {}] Failed to read stderr for {}: {}", - worker_id, task.job.as_ref().unwrap().label, e); - } - } - - // Wait for the process to finish - match child.wait() { - Ok(status) => { - let duration = start_time.elapsed(); - let success = status.success(); - - // Close the log collector for this job - if let Err(e) = log_collector.close_job(&job_run_id) { - warn!("[Worker {}] Failed to close log collector for {}: {}", - worker_id, task.job.as_ref().unwrap().label, e); - } - - if success { - info!( - "[Worker {}] Job succeeded: {} (Duration: {:?}, Job Run ID: {})", - worker_id, task.job.as_ref().unwrap().label, duration, job_run_id - ); - } else { - error!( - "[Worker {}] Job failed: {} (Duration: {:?}, Status: {:?}, Job Run ID: {})\nStderr: {}", - worker_id, task.job.as_ref().unwrap().label, duration, status, job_run_id, stderr_content - ); - } - result_tx - .send(TaskExecutionResult { - task_key, - job_label: task.job.as_ref().unwrap().label.clone(), - success, - stdout: format!("Job logs written to JSONL (Job Run ID: {})", job_run_id), - stderr: stderr_content, - duration, - error_message: if success { None } else { Some(format!("Exited with status: {:?}", status)) }, - }) - .unwrap_or_else(|e| error!("[Worker {}] Failed to send result: {}", worker_id, e)); - } - Err(e) => { - let err_msg = format!("[Worker {}] Failed to execute or wait for {}: {}", worker_id, task.job.as_ref().unwrap().label, e); - error!("{}", err_msg); - result_tx - .send(TaskExecutionResult { - task_key, - job_label: task.job.as_ref().unwrap().label.clone(), - success: false, - stdout: String::new(), - stderr: err_msg.clone(), - duration: start_time.elapsed(), - error_message: Some(err_msg), - }) - .unwrap_or_else(|e| error!("[Worker {}] Failed to send execution error result: {}", worker_id, e)); - } - } - } - Err(e) => { - let err_msg = format!("[Worker {}] Failed to spawn command for {}: {} (Path: {:?})", worker_id, task.job.as_ref().unwrap().label, e, exec_path); - error!("{}", err_msg); - result_tx - .send(TaskExecutionResult { - task_key, - job_label: task.job.as_ref().unwrap().label.clone(), - success: false, - stdout: String::new(), - stderr: err_msg.clone(), - duration: start_time.elapsed(), - error_message: Some(err_msg), - }) - .unwrap_or_else(|e| error!("[Worker {}] Failed to send spawn error result: {}", worker_id, e)); - } - } - } - info!("[Worker {}] Exiting", worker_id); -} - -fn is_task_ready(task: &Task, completed_outputs: &HashSet) -> bool { - let mut missing_deps = Vec::new(); - - for dep in &task.config.as_ref().unwrap().inputs { - if dep.dep_type_code == 1 { // MATERIALIZE = 1 - if !completed_outputs.contains(&dep.partition_ref.as_ref().unwrap().str) { - missing_deps.push(&dep.partition_ref.as_ref().unwrap().str); - } - } - } - - if !missing_deps.is_empty() { - debug!("Task {} not ready - missing dependencies: {:?}", task.job.as_ref().unwrap().label, missing_deps); - return false; - } - - true -} - -// Check if partitions are already available or being built by other build requests -async fn check_build_coordination( - task: &Task, - query_engine: &Arc, - build_request_id: &str -) -> Result<(bool, bool, Vec<(PartitionRef, String)>), String> { - let outputs = &task.config.as_ref().unwrap().outputs; - let mut available_partitions = Vec::new(); - let mut needs_building = false; - - for output_ref in outputs { - debug!("Checking build coordination for partition: {}", output_ref.str); - - // First check if this partition is already available - match query_engine.get_latest_partition_status(&output_ref.str).await { - Ok(Some((status, _timestamp))) => { - debug!("Partition {} has status: {:?}", output_ref.str, status); - if status == databuild::PartitionStatus::PartitionAvailable { - // Get which build request created this partition - match query_engine.get_build_request_for_available_partition(&output_ref.str).await { - Ok(Some(source_build_id)) => { - info!("Partition {} already available from build {}", output_ref.str, source_build_id); - available_partitions.push((output_ref.clone(), source_build_id)); - continue; - } - Ok(None) => { - error!("Partition {} is available but no source build found - this indicates a bug in the event log implementation", output_ref.str); - return Err(format!("Available partition {} has no source build ID. This suggests the event log is missing required data.", output_ref.str)); - } - Err(e) => { - error!("Failed to get source build for partition {}: {}", output_ref.str, e); - return Err(format!("Cannot determine source build for available partition {}: {}", output_ref.str, e)); - } - } - } else { - debug!("Partition {} has non-available status {:?}, needs building", output_ref.str, status); - needs_building = true; - } - } - Ok(None) => { - debug!("Partition {} has no status, needs building", output_ref.str); - needs_building = true; - } - Err(e) => { - error!("Failed to check partition status for {}: {}", output_ref.str, e); - return Err(format!("Cannot check partition status: {}. Use a queryable event log (e.g., SQLite) for builds that need to check existing partitions.", e)); - } - } - - // Check if this partition is being built by another request - match query_engine.get_active_builds_for_partition(&output_ref.str).await { - Ok(active_builds) => { - let other_builds: Vec = active_builds.into_iter() - .filter(|id| id != build_request_id) - .collect(); - - if !other_builds.is_empty() { - info!("Partition {} is already being built by other requests: {:?}. Delegating.", - output_ref.str, other_builds); - - // Log delegation event for active builds - for delegated_to_build_id in &other_builds { - let event = create_build_event( - build_request_id.to_string(), - EventType::DelegationEvent(databuild::DelegationEvent { - partition_ref: Some(output_ref.clone()), - delegated_to_build_request_id: delegated_to_build_id.clone(), - message: "Delegated to active build during execution".to_string(), - }) - ); - if let Err(e) = query_engine.append_event(event).await { - error!("Failed to log delegation event: {}", e); - } - } - - return Ok((false, false, available_partitions)); // Don't build, delegated to active build - } - } - Err(e) => { - error!("Failed to check active builds for partition {}: {}", output_ref.str, e); - return Err(format!("Cannot check active builds: {}. Use a queryable event log (e.g., SQLite) for builds that need to check for concurrent execution.", e)); - } - } - - // If we reach here, this partition needs to be built - needs_building = true; - } - - // Only skip the job if ALL partitions are already available - if !needs_building && available_partitions.len() == outputs.len() { - Ok((false, true, available_partitions)) // Don't build, skip due to all partitions available - } else { - Ok((true, false, available_partitions)) // Need to build (some partitions unavailable) - } -} - -fn log_status_summary( - task_states: &HashMap, - original_tasks_by_key: &HashMap>, -) { - let mut pending_tasks = Vec::new(); - let mut running_tasks = Vec::new(); - let mut succeeded_tasks = Vec::new(); - let mut failed_tasks = Vec::new(); - - for (key, state) in task_states { - let label = original_tasks_by_key.get(key).map_or_else(|| key.as_str(), |t| t.job.as_ref().unwrap().label.as_str()); - match state { - TaskState::Pending => pending_tasks.push(label), - TaskState::Running => running_tasks.push(label), - TaskState::Succeeded => succeeded_tasks.push(label), - TaskState::Failed => failed_tasks.push(label), - } - } - - info!("Task Status Summary:"); - info!(" Pending ({}): {:?}", pending_tasks.len(), pending_tasks); - info!(" Running ({}): {:?}", running_tasks.len(), running_tasks); - info!(" Succeeded ({}): {:?}", succeeded_tasks.len(), succeeded_tasks); - info!(" Failed ({}): {:?}", failed_tasks.len(), failed_tasks); -} - - -#[tokio::main] -async fn main() -> Result<(), Box> { - simple_logger::SimpleLogger::new() - .with_level( - std::env::var("RUST_LOG") - .unwrap_or_else(|_| "info".to_string()) - .parse() - .unwrap_or(log::LevelFilter::Info) - ) - .init()?; - - // Get build event log configuration from environment variables - let build_event_log_uri = std::env::var("DATABUILD_BUILD_EVENT_LOG").ok(); - let build_request_id = std::env::var("DATABUILD_BUILD_REQUEST_ID") - .unwrap_or_else(|_| Uuid::new_v4().to_string()); - - // Initialize build event log if provided - let build_event_log = if let Some(uri) = build_event_log_uri { - match create_bel_query_engine(&uri).await { - Ok(log) => { - info!("Initialized build event log: {}", uri); - Some(log) - } - Err(e) => { - error!("Failed to initialize build event log {}: {}", uri, e); - std::process::exit(1); - } - } - } else { - None - }; - - let mut buffer = String::new(); - std::io::stdin().read_to_string(&mut buffer)?; - let graph: JobGraph = serde_json::from_str(&buffer)?; - - info!("Executing job graph with {} nodes", graph.nodes.len()); - - - // Log build request execution start (existing detailed event) - if let Some(ref query_engine) = build_event_log { - let event = create_build_event( - build_request_id.clone(), - EventType::BuildRequestEvent(BuildRequestEvent { - status: Some(BuildRequestStatusCode::BuildRequestExecuting.status()), - requested_partitions: graph.outputs.clone(), - message: format!("Starting execution of {} jobs", graph.nodes.len()), - comment: None, - want_id: None, - }) - ); - if let Err(e) = query_engine.append_event(event).await { - error!("Failed to log execution start event: {}", e); - } - } - - let mut task_states: HashMap = HashMap::new(); - let mut original_tasks_by_key: HashMap> = HashMap::new(); - let graph_nodes_arc: Vec> = graph.nodes.into_iter().map(Arc::new).collect(); - - - for task_node in &graph_nodes_arc { - let key = get_task_key(task_node); - task_states.insert(key.clone(), TaskState::Pending); - original_tasks_by_key.insert(key, task_node.clone()); - } - - let mut completed_outputs: HashSet = HashSet::new(); - let mut job_results: Vec = Vec::new(); - - let (task_tx, task_rx): (Sender>, Receiver>) = crossbeam_channel::unbounded(); - let (result_tx, result_rx): (Sender, Receiver) = crossbeam_channel::unbounded(); - - let mut worker_handles = Vec::new(); - for i in 0..NUM_WORKERS { - let task_rx_clone = task_rx.clone(); - let result_tx_clone = result_tx.clone(); - worker_handles.push(thread::spawn(move || { - worker(task_rx_clone, result_tx_clone, i + 1); - })); - } - // Drop the original result_tx so the channel closes when all workers are done - // if result_rx is the only remaining receiver. - drop(result_tx); - - - let mut last_log_time = Instant::now(); - let mut active_tasks_count = 0; - let mut fail_fast_triggered = false; - - loop { - // 1. Process results - while let Ok(result) = result_rx.try_recv() { - active_tasks_count -= 1; - info!( - "Received result for task {}: Success: {}", - result.job_label, result.success - ); - - let current_state = if result.success { - TaskState::Succeeded - } else { - TaskState::Failed - }; - task_states.insert(result.task_key.clone(), current_state); - - // Log job completion events - if let Some(ref query_engine) = build_event_log { - if let Some(original_task) = original_tasks_by_key.get(&result.task_key) { - let job_run_id = Uuid::new_v4().to_string(); - - // Log job completion - let job_event = create_build_event( - build_request_id.clone(), - EventType::JobEvent(JobEvent { - job_run_id: job_run_id.clone(), - job_label: original_task.job.clone(), - target_partitions: original_task.config.as_ref().unwrap().outputs.clone(), - status_code: if result.success { JobStatus::JobCompleted as i32 } else { JobStatus::JobFailed as i32 }, - status_name: if result.success { JobStatus::JobCompleted.to_display_string() } else { JobStatus::JobFailed.to_display_string() }, - message: if result.success { "Job completed successfully".to_string() } else { result.error_message.clone().unwrap_or_default() }, - config: original_task.config.clone(), - manifests: vec![], // Would be populated from actual job output - }) - ); - if let Err(e) = query_engine.append_event(job_event).await { - error!("Failed to log job completion event: {}", e); - } - - // Log partition status updates - for output_ref in &original_task.config.as_ref().unwrap().outputs { - let partition_event = create_build_event( - build_request_id.clone(), - EventType::PartitionEvent(PartitionEvent { - partition_ref: Some(output_ref.clone()), - status_code: if result.success { PartitionStatus::PartitionAvailable as i32 } else { PartitionStatus::PartitionFailed as i32 }, - status_name: if result.success { PartitionStatus::PartitionAvailable.to_display_string() } else { PartitionStatus::PartitionFailed.to_display_string() }, - message: if result.success { "Partition built successfully".to_string() } else { "Partition build failed".to_string() }, - job_run_id: job_run_id.clone(), - }) - ); - if let Err(e) = query_engine.append_event(partition_event).await { - error!("Failed to log partition status event: {}", e); - } - } - } - } - - if result.success { - if let Some(original_task) = original_tasks_by_key.get(&result.task_key) { - for output_ref in &original_task.config.as_ref().unwrap().outputs { - completed_outputs.insert(output_ref.str.clone()); - } - } - } else { - if FAIL_FAST { - warn!("Fail-fast enabled and task {} failed. Shutting down.", result.job_label); - fail_fast_triggered = true; - } - } - job_results.push(result); - } - - // 2. Check for fail-fast break - if fail_fast_triggered && active_tasks_count == 0 { // Wait for running tasks to finish if fail fast - info!("All active tasks completed after fail-fast trigger."); - break; - } - if fail_fast_triggered && active_tasks_count > 0 { - // Don't schedule new tasks, just wait for active ones or log - } else if !fail_fast_triggered { // Only dispatch if not in fail-fast shutdown - // 3. Dispatch ready tasks - for task_node in &graph_nodes_arc { - let task_key = get_task_key(task_node); - if task_states.get(&task_key) == Some(&TaskState::Pending) { - if is_task_ready(task_node, &completed_outputs) { - // Check build coordination if event log is available - let (should_build, is_skipped, available_partitions) = if let Some(ref query_engine) = build_event_log { - match check_build_coordination(task_node, query_engine, &build_request_id).await { - Ok((should_build, is_skipped, available_partitions)) => (should_build, is_skipped, available_partitions), - Err(e) => { - error!("Error checking build coordination for {}: {}", - task_node.job.as_ref().unwrap().label, e); - (true, false, Vec::<(PartitionRef, String)>::new()) // Default to building on error - } - } - } else { - (true, false, Vec::<(PartitionRef, String)>::new()) // No event log, always build - }; - - if !should_build { - if is_skipped { - // Task skipped due to all partitions already available - info!("Task {} skipped - all target partitions already available", task_node.job.as_ref().unwrap().label); - - // Log delegation events for each available partition - if let Some(ref query_engine) = build_event_log { - for (partition_ref, source_build_id) in &available_partitions { - let delegation_event = create_build_event( - build_request_id.clone(), - EventType::DelegationEvent(databuild::DelegationEvent { - partition_ref: Some(partition_ref.clone()), - delegated_to_build_request_id: source_build_id.clone(), - message: "Delegated to historical build - partition already available".to_string(), - }) - ); - if let Err(e) = query_engine.append_event(delegation_event).await { - error!("Failed to log historical delegation event: {}", e); - } - } - - // Log JOB_SKIPPED event - let job_run_id = Uuid::new_v4().to_string(); - let job_event = create_build_event( - build_request_id.clone(), - EventType::JobEvent(JobEvent { - job_run_id: job_run_id.clone(), - job_label: task_node.job.clone(), - target_partitions: task_node.config.as_ref().unwrap().outputs.clone(), - status_code: JobStatus::JobSkipped as i32, - status_name: JobStatus::JobSkipped.to_display_string(), - message: "Job skipped - all target partitions already available".to_string(), - config: task_node.config.clone(), - manifests: vec![], - }) - ); - if let Err(e) = query_engine.append_event(job_event).await { - error!("Failed to log job skipped event: {}", e); - } - } - } else { - // Task delegated to active build - info!("Task {} delegated to active build request", task_node.job.as_ref().unwrap().label); - } - - task_states.insert(task_key.clone(), TaskState::Succeeded); - - // Mark outputs as completed - for output_ref in &task_node.config.as_ref().unwrap().outputs { - completed_outputs.insert(output_ref.str.clone()); - } - continue; - } - - info!("Dispatching task: {}", task_node.job.as_ref().unwrap().label); - - // Log job scheduling events - if let Some(ref query_engine) = build_event_log { - let job_run_id = Uuid::new_v4().to_string(); - - // Log job scheduled - let job_event = create_build_event( - build_request_id.clone(), - EventType::JobEvent(JobEvent { - job_run_id: job_run_id.clone(), - job_label: task_node.job.clone(), - target_partitions: task_node.config.as_ref().unwrap().outputs.clone(), - status_code: JobStatus::JobScheduled as i32, - status_name: JobStatus::JobScheduled.to_display_string(), - message: "Job scheduled for execution".to_string(), - config: task_node.config.clone(), - manifests: vec![], - }) - ); - if let Err(e) = query_engine.append_event(job_event).await { - error!("Failed to log job scheduled event: {}", e); - } - - // Log partition building status - for output_ref in &task_node.config.as_ref().unwrap().outputs { - let partition_event = create_build_event( - build_request_id.clone(), - EventType::PartitionEvent(PartitionEvent { - partition_ref: Some(output_ref.clone()), - status_code: PartitionStatus::PartitionBuilding as i32, - status_name: PartitionStatus::PartitionBuilding.to_display_string(), - message: "Partition build started".to_string(), - job_run_id: job_run_id.clone(), - }) - ); - if let Err(e) = query_engine.append_event(partition_event).await { - error!("Failed to log partition building event: {}", e); - } - } - } - - task_states.insert(task_key.clone(), TaskState::Running); - task_tx.send(task_node.clone())?; - active_tasks_count += 1; - } - } - } - } - - - // 4. Periodic logging - if last_log_time.elapsed() >= LOG_INTERVAL { - log_status_summary(&task_states, &original_tasks_by_key); - - // Debug: Check for deadlock (pending tasks with no running tasks) - let has_pending = task_states.values().any(|s| *s == TaskState::Pending); - if has_pending && active_tasks_count == 0 { - warn!("Potential deadlock detected: {} pending tasks with no running tasks", - task_states.values().filter(|s| **s == TaskState::Pending).count()); - - // Log details of pending tasks and their preconditions - for (key, state) in &task_states { - if *state == TaskState::Pending { - if let Some(task) = original_tasks_by_key.get(key) { - warn!("Pending task: {} ({})", task.job.as_ref().unwrap().label, key); - warn!(" Required inputs:"); - for dep in &task.config.as_ref().unwrap().inputs { - if dep.dep_type_code == 1 { // MATERIALIZE = 1 - let available = completed_outputs.contains(&dep.partition_ref.as_ref().unwrap().str); - warn!(" {} - {}", dep.partition_ref.as_ref().unwrap().str, if available { "AVAILABLE" } else { "MISSING" }); - } - } - warn!(" Produces outputs:"); - for output in &task.config.as_ref().unwrap().outputs { - warn!(" {}", output.str); - } - } - } - } - } - - last_log_time = Instant::now(); - } - - // 5. Check completion - let all_done = task_states.values().all(|s| *s == TaskState::Succeeded || *s == TaskState::Failed); - if active_tasks_count == 0 && all_done { - info!("All tasks are in a terminal state and no tasks are active."); - break; - } - - // Avoid busy-waiting if no events, give channels time - // Select would be better here, but for simplicity: - thread::sleep(Duration::from_millis(50)); - } - - info!("Shutting down workers..."); - drop(task_tx); // Signal workers to stop by closing the task channel - - for handle in worker_handles { - handle.join().expect("Failed to join worker thread"); - } - info!("All workers finished."); - - // Final processing of any remaining results (should be minimal if loop logic is correct) - while let Ok(result) = result_rx.try_recv() { - active_tasks_count -= 1; // Should be 0 - info!( - "Received late result for task {}: Success: {}", - result.job_label, result.success - ); - // Update state for completeness, though it might not affect overall outcome now - let current_state = if result.success { TaskState::Succeeded } else { TaskState::Failed }; - task_states.insert(result.task_key.clone(), current_state); - job_results.push(result); - } - - - let success_count = job_results.iter().filter(|r| r.success).count(); - let failure_count = job_results.len() - success_count; - - info!("Execution complete: {} succeeded, {} failed", success_count, failure_count); - - - // Log final build request status (existing detailed event) - if let Some(ref query_engine) = build_event_log { - let final_status = if failure_count > 0 || fail_fast_triggered { - BuildRequestStatusCode::BuildRequestFailed - } else { - BuildRequestStatusCode::BuildRequestCompleted - }; - - let event = create_build_event( - build_request_id.clone(), - EventType::BuildRequestEvent(BuildRequestEvent { - status: Some(final_status.status()), - requested_partitions: graph.outputs.clone(), - message: format!("Execution completed: {} succeeded, {} failed", success_count, failure_count), - comment: None, - want_id: None, - }) - ); - if let Err(e) = query_engine.append_event(event).await { - error!("Failed to log final build request event: {}", e); - } - } - - if failure_count > 0 || fail_fast_triggered { - error!("Execution finished with errors."); - std::process::exit(1); - } - - Ok(()) -} diff --git a/databuild/graph/rust_analyze_wrapper.sh.tpl b/databuild/graph/rust_analyze_wrapper.sh.tpl deleted file mode 100644 index b2bb897..0000000 --- a/databuild/graph/rust_analyze_wrapper.sh.tpl +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -set -e - -%{RUNFILES_PREFIX} - -%{PREFIX} - -# Locate the Rust binary using its standard runfiles path -# Assumes workspace name is 'databuild' -EXECUTABLE_BINARY="$(rlocation "databuild/databuild/graph/analyze")" - -# Run the analysis -exec "${EXECUTABLE_BINARY}" "$@" diff --git a/databuild/graph/rust_execute_wrapper.sh.tpl b/databuild/graph/rust_execute_wrapper.sh.tpl deleted file mode 100644 index 2552f8c..0000000 --- a/databuild/graph/rust_execute_wrapper.sh.tpl +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash -set -e - -%{RUNFILES_PREFIX} - -%{PREFIX} - -EXECUTABLE_BINARY="$(rlocation "databuild/databuild/graph/execute")" - -# Run the execution -exec "${EXECUTABLE_BINARY}" "$@" \ No newline at end of file diff --git a/databuild/graph/test/BUILD.bazel b/databuild/graph/test/BUILD.bazel deleted file mode 100644 index 146f710..0000000 --- a/databuild/graph/test/BUILD.bazel +++ /dev/null @@ -1,5 +0,0 @@ -sh_test( - name = "analyze_test", - srcs = ["analyze_test.sh"], - data = ["//databuild/graph:analyze"], -) \ No newline at end of file diff --git a/databuild/graph/test/analyze_test.sh b/databuild/graph/test/analyze_test.sh deleted file mode 100755 index b48a1b8..0000000 --- a/databuild/graph/test/analyze_test.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/usr/bin/env bash - -DATABUILD_MODE=import_test DATABUILD_JOB_LOOKUP_PATH=foo DATABUILD_CANDIDATE_JOBS=bar databuild/graph/analyze diff --git a/databuild/job/BUILD.bazel b/databuild/job/BUILD.bazel deleted file mode 100644 index dc41474..0000000 --- a/databuild/job/BUILD.bazel +++ /dev/null @@ -1,27 +0,0 @@ -load("@rules_rust//rust:defs.bzl", "rust_binary", "rust_test") - -rust_binary( - name = "job_wrapper", - srcs = ["main.rs"], - visibility = ["//visibility:public"], - deps = [ - "//databuild", - "@crates//:serde", - "@crates//:serde_json", - "@crates//:uuid", - "@crates//:sysinfo", - ], -) - -rust_test( - name = "job_wrapper_test", - srcs = ["main.rs"], - deps = [ - "//databuild", - "@crates//:serde", - "@crates//:serde_json", - "@crates//:uuid", - "@crates//:sysinfo", - "@crates//:tempfile", - ], -) diff --git a/databuild/job/README.md b/databuild/job/README.md deleted file mode 100644 index eeece74..0000000 --- a/databuild/job/README.md +++ /dev/null @@ -1,4 +0,0 @@ - -# DataBuild Jobs - -Contains wrappers and tools for implementing DataBuild jobs. diff --git a/databuild/job/main.rs b/databuild/job/main.rs deleted file mode 100644 index 03feb52..0000000 --- a/databuild/job/main.rs +++ /dev/null @@ -1,985 +0,0 @@ -use std::env; -use std::io::{self, Read, Write}; -use std::process::{Command, Stdio}; -use std::sync::{mpsc, Arc, Mutex}; -use std::thread; -use std::time::{Duration, SystemTime, UNIX_EPOCH}; -// All serialization handled by protobuf serde derives -use serde_json; -use sysinfo::{Pid, ProcessRefreshKind, System}; -use uuid::Uuid; - -// Import protobuf types from databuild -use databuild::{ - job_log_entry, log_message, JobConfig, JobLabel, JobLogEntry, LogMessage, PartitionManifest, - PartitionRef, Task, WrapperJobEvent, -}; - -// All types now come from protobuf - no custom structs needed - -// Configuration constants -const DEFAULT_HEARTBEAT_INTERVAL_MS: u64 = 30_000; // 30 seconds -const DEFAULT_METRICS_INTERVAL_MS: u64 = 100; // 100 milliseconds -const TEST_HEARTBEAT_INTERVAL_MS: u64 = 100; // Fast heartbeats for testing -const TEST_METRICS_INTERVAL_MS: u64 = 50; // Fast metrics for testing - -#[derive(Debug)] -struct HeartbeatMessage { - entry: JobLogEntry, -} - -fn get_timestamp() -> String { - SystemTime::now() - .duration_since(UNIX_EPOCH) - .unwrap() - .as_secs() - .to_string() -} - -trait LogSink { - fn emit(&mut self, entry: JobLogEntry); -} - -struct StdoutSink; - -impl LogSink for StdoutSink { - fn emit(&mut self, entry: JobLogEntry) { - println!("{}", serde_json::to_string(&entry).unwrap()); - } -} - -struct JobWrapper { - job_id: String, - sequence_number: u64, - start_time: i64, - sink: S, -} - -impl JobWrapper { - fn new() -> Self { - Self::new_with_sink(StdoutSink) - } -} - -impl JobWrapper { - fn new_with_sink(sink: S) -> Self { - // Use job ID from environment if provided by graph execution, otherwise generate one - let job_id = env::var("DATABUILD_JOB_RUN_ID") - .unwrap_or_else(|_| Uuid::new_v4().to_string()); - - Self { - job_id, - sequence_number: 0, - start_time: SystemTime::now() - .duration_since(UNIX_EPOCH) - .unwrap() - .as_secs() as i64, - sink, - } - } - - fn next_sequence(&mut self) -> u64 { - self.sequence_number += 1; - self.sequence_number - } - - fn emit_log(&mut self, outputs: &[PartitionRef], content: job_log_entry::Content) { - let entry = JobLogEntry { - timestamp: get_timestamp(), - job_id: self.job_id.clone(), - outputs: outputs.to_vec(), - sequence_number: self.next_sequence(), - content: Some(content), - }; - - self.sink.emit(entry); - } - - fn config_mode(&mut self, outputs: Vec) -> Result<(), Box> { - // Convert to PartitionRef objects - let output_refs: Vec = outputs - .iter() - .map(|s| PartitionRef { r#str: s.clone() }) - .collect(); - - // Following the state diagram: wrapper_validate_config -> emit_config_validate_success - self.emit_log( - &output_refs, - job_log_entry::Content::JobEvent(WrapperJobEvent { - event_type: "config_validate_success".to_string(), - metadata: std::collections::HashMap::new(), - job_status: None, - exit_code: None, - job_label: None, // Will be enriched by LogCollector - }), - ); - - // For Phase 0, we still need to produce the expected JSON config format - // so the current graph system can parse it. Later phases will change this. - let config = JobConfig { - outputs: output_refs.clone(), - inputs: vec![], - args: outputs.clone(), - env: { - let mut env_map = std::collections::HashMap::new(); - if let Some(partition_ref) = outputs.first() { - env_map.insert("PARTITION_REF".to_string(), partition_ref.clone()); - } - env_map - }, - }; - - // For config mode, we need to output the standard config format to stdout - // The structured logs will come later during exec mode - let configs_wrapper = serde_json::json!({ - "configs": [config] - }); - - println!("{}", serde_json::to_string(&configs_wrapper)?); - - Ok(()) - } - - fn exec_mode(&mut self, job_binary: &str) -> Result<(), Box> { - // Read the job config from stdin - let mut buffer = String::new(); - io::stdin().read_to_string(&mut buffer)?; - - let config: JobConfig = serde_json::from_str(&buffer)?; - self.exec_mode_with_config(job_binary, config) - } - - fn exec_mode_with_config( - &mut self, - job_binary: &str, - config: JobConfig, - ) -> Result<(), Box> { - let outputs = &config.outputs; - - // Following the state diagram: - // 1. wrapper_validate_config -> emit_config_validate_success - self.emit_log( - outputs, - job_log_entry::Content::JobEvent(WrapperJobEvent { - event_type: "config_validate_success".to_string(), - job_status: None, - exit_code: None, - metadata: std::collections::HashMap::new(), - job_label: None, // Will be enriched by LogCollector - }), - ); - - // 2. wrapper_launch_task -> emit_task_launch_success - self.emit_log( - outputs, - job_log_entry::Content::JobEvent(WrapperJobEvent { - event_type: "task_launch_success".to_string(), - job_status: None, - exit_code: None, - metadata: std::collections::HashMap::new(), - job_label: None, // Will be enriched by LogCollector - }), - ); - - // Execute the original job binary with the exec subcommand - let mut cmd = Command::new(job_binary); - cmd.arg("exec"); - - // Add the args from the config - for arg in &config.args { - cmd.arg(arg); - } - - cmd.stdin(Stdio::piped()) - .stdout(Stdio::piped()) - .stderr(Stdio::piped()); - - // Set environment variables from config - for (key, value) in &config.env { - cmd.env(key, value); - } - - let mut child = cmd.spawn()?; - let child_pid = child.id(); - - // Send the config to the job - if let Some(stdin) = child.stdin.as_mut() { - stdin.write_all(serde_json::to_string(&config).unwrap().as_bytes())?; - } - - // Start heartbeat thread with channel communication - let heartbeat_job_id = self.job_id.clone(); - let heartbeat_outputs = outputs.clone(); - let heartbeat_sequence = Arc::new(Mutex::new(0u64)); - let heartbeat_sequence_clone = heartbeat_sequence.clone(); - let (heartbeat_tx, heartbeat_rx) = mpsc::channel::(); - - let heartbeat_handle = thread::spawn(move || { - let mut system = System::new_all(); - let pid = Pid::from(child_pid as usize); - - let heartbeat_interval_ms = env::var("DATABUILD_HEARTBEAT_INTERVAL_MS") - .unwrap_or_else(|_| DEFAULT_HEARTBEAT_INTERVAL_MS.to_string()) - .parse::() - .unwrap_or(DEFAULT_HEARTBEAT_INTERVAL_MS); - - loop { - thread::sleep(Duration::from_millis(heartbeat_interval_ms)); - - // Refresh process info - system.refresh_processes_specifics(ProcessRefreshKind::new()); - - // Check if process still exists - if let Some(process) = system.process(pid) { - let memory_mb = process.memory() as f64 / 1024.0 / 1024.0; - let cpu_percent = process.cpu_usage(); - - // Create heartbeat event with metrics - let mut metadata = std::collections::HashMap::new(); - metadata.insert("memory_usage_mb".to_string(), format!("{:.3}", memory_mb)); - metadata.insert( - "cpu_usage_percent".to_string(), - format!("{:.3}", cpu_percent), - ); - - // Get next sequence number for heartbeat - let seq = { - let mut seq_lock = heartbeat_sequence_clone.lock().unwrap(); - *seq_lock += 1; - *seq_lock - }; - - let heartbeat_event = JobLogEntry { - timestamp: get_timestamp(), - job_id: heartbeat_job_id.clone(), - outputs: heartbeat_outputs.clone(), - sequence_number: seq, - content: Some(job_log_entry::Content::JobEvent(WrapperJobEvent { - event_type: "heartbeat".to_string(), - job_status: None, - exit_code: None, - metadata, - job_label: None, // Will be enriched by LogCollector - })), - }; - - // Send heartbeat through channel instead of printing directly - if heartbeat_tx.send(HeartbeatMessage { entry: heartbeat_event }).is_err() { - // Main thread dropped receiver, exit - break; - } - } else { - // Process no longer exists, exit heartbeat thread - break; - } - } - }); - - // Track metrics while job is running - let job_start_time = SystemTime::now(); - let mut system = System::new(); - let pid = Pid::from(child_pid as usize); - - // Initial refresh to establish baseline for CPU measurements - system.refresh_cpu(); - system.refresh_processes_specifics(ProcessRefreshKind::new().with_cpu()); - - let mut peak_memory_mb = 0.0f64; - let mut cpu_samples = Vec::new(); - let mut stdout_buffer = Vec::new(); - let mut stderr_buffer = Vec::new(); - - // Sleep briefly to allow the process to start up before measuring - let sample_interval_ms = env::var("DATABUILD_METRICS_INTERVAL_MS") - .unwrap_or_else(|_| DEFAULT_METRICS_INTERVAL_MS.to_string()) - .parse::() - .unwrap_or(DEFAULT_METRICS_INTERVAL_MS); - thread::sleep(Duration::from_millis(sample_interval_ms)); - - // Poll process status and metrics - let (output, peak_memory_mb, total_cpu_ms, job_duration) = loop { - // Check if process has exited - match child.try_wait()? { - Some(status) => { - // Process has exited, collect any remaining output - if let Some(mut stdout) = child.stdout.take() { - stdout.read_to_end(&mut stdout_buffer)?; - } - if let Some(mut stderr) = child.stderr.take() { - stderr.read_to_end(&mut stderr_buffer)?; - } - - // Calculate final metrics - let job_duration = job_start_time.elapsed().map_err(|e| { - io::Error::new( - io::ErrorKind::Other, - format!("Time calculation error: {}", e), - ) - })?; - - // Calculate CPU time: average CPU percentage * wall-clock time - let total_cpu_ms = if cpu_samples.is_empty() { - 0.0 - } else { - let avg_cpu_percent = - cpu_samples.iter().sum::() as f64 / cpu_samples.len() as f64; - (avg_cpu_percent / 100.0) * job_duration.as_millis() as f64 - }; - - // Stop heartbeat thread - drop(heartbeat_handle); - - // Process any remaining heartbeat messages - while let Ok(heartbeat_msg) = heartbeat_rx.try_recv() { - self.sink.emit(heartbeat_msg.entry); - } - - // Update sequence number to account for heartbeats - let heartbeat_count = heartbeat_sequence.lock().unwrap(); - self.sequence_number = self.sequence_number.max(*heartbeat_count); - drop(heartbeat_count); - - // Create output struct to match original behavior - let output = std::process::Output { - status, - stdout: stdout_buffer, - stderr: stderr_buffer, - }; - - break (output, peak_memory_mb, total_cpu_ms, job_duration); - } - None => { - // Check for heartbeat messages and emit them - while let Ok(heartbeat_msg) = heartbeat_rx.try_recv() { - self.sink.emit(heartbeat_msg.entry); - } - - // Process still running, collect metrics - // Refresh CPU info and processes - system.refresh_cpu(); - system.refresh_processes_specifics(ProcessRefreshKind::new().with_cpu()); - - // Sleep to allow CPU measurement interval - thread::sleep(Duration::from_millis(sample_interval_ms)); - - // Refresh again to get updated CPU usage - system.refresh_cpu(); - system.refresh_processes_specifics(ProcessRefreshKind::new().with_cpu()); - - if let Some(process) = system.process(pid) { - let memory_mb = process.memory() as f64 / 1024.0 / 1024.0; - peak_memory_mb = peak_memory_mb.max(memory_mb); - let cpu_usage = process.cpu_usage(); - cpu_samples.push(cpu_usage); - } - } - } - }; - let success = output.status.success(); - let exit_code = output.status.code().unwrap_or(-1); - - // Capture and forward job stdout/stderr as log messages - if !output.stdout.is_empty() { - let stdout_str = String::from_utf8_lossy(&output.stdout); - self.emit_log( - outputs, - job_log_entry::Content::Log(LogMessage { - level: log_message::LogLevel::Info as i32, - message: stdout_str.to_string(), - fields: std::collections::HashMap::new(), - }), - ); - } - - if !output.stderr.is_empty() { - let stderr_str = String::from_utf8_lossy(&output.stderr); - self.emit_log( - outputs, - job_log_entry::Content::Log(LogMessage { - level: log_message::LogLevel::Error as i32, - message: stderr_str.to_string(), - fields: std::collections::HashMap::new(), - }), - ); - } - - // Emit job summary with resource metrics - let mut summary_metadata = std::collections::HashMap::new(); - summary_metadata.insert( - "runtime_ms".to_string(), - format!("{:.3}", job_duration.as_millis() as f64), - ); - summary_metadata.insert( - "peak_memory_mb".to_string(), - format!("{:.3}", peak_memory_mb), - ); - summary_metadata.insert("total_cpu_ms".to_string(), format!("{:.3}", total_cpu_ms)); - summary_metadata.insert("exit_code".to_string(), exit_code.to_string()); - - self.emit_log( - outputs, - job_log_entry::Content::JobEvent(WrapperJobEvent { - event_type: "job_summary".to_string(), - job_status: None, - exit_code: Some(exit_code), - metadata: summary_metadata, - job_label: None, // Will be enriched by LogCollector - }), - ); - - if success { - // Following the state diagram: wrapper_monitor_task -> zero exit -> emit_task_success - self.emit_log( - outputs, - job_log_entry::Content::JobEvent(WrapperJobEvent { - event_type: "task_success".to_string(), - job_status: Some("JOB_COMPLETED".to_string()), - exit_code: Some(exit_code), - metadata: std::collections::HashMap::new(), - job_label: None, // Will be enriched by LogCollector - }), - ); - - // Then emit_partition_manifest -> success - let end_time = SystemTime::now() - .duration_since(UNIX_EPOCH) - .unwrap() - .as_secs() as i64; - - self.emit_log( - outputs, - job_log_entry::Content::Manifest(PartitionManifest { - outputs: config.outputs.clone(), - inputs: vec![], // Phase 0: no input manifests yet - start_time: self.start_time, - end_time, - task: Some(Task { - job: Some(JobLabel { - label: env::var("DATABUILD_JOB_LABEL") - .unwrap_or_else(|_| "unknown".to_string()), - }), - config: Some(config.clone()), - }), - metadata: std::collections::HashMap::new(), // Phase 0: no metadata yet - }), - ); - } else { - // Following the state diagram: wrapper_monitor_task -> non-zero exit -> emit_task_failed - self.emit_log( - outputs, - job_log_entry::Content::JobEvent(WrapperJobEvent { - event_type: "task_failed".to_string(), - job_status: Some("JOB_FAILED".to_string()), - exit_code: Some(exit_code), - metadata: std::collections::HashMap::new(), - job_label: None, // Will be enriched by LogCollector - }), - ); - - // Then emit_job_exec_fail -> fail (don't emit partition manifest on failure) - self.emit_log( - outputs, - job_log_entry::Content::JobEvent(WrapperJobEvent { - event_type: "job_exec_fail".to_string(), - job_status: Some("JOB_FAILED".to_string()), - exit_code: Some(exit_code), - metadata: { - let mut meta = std::collections::HashMap::new(); - meta.insert( - "error".to_string(), - format!("Job failed with exit code {}", exit_code), - ); - meta - }, - job_label: None, // Will be enriched by LogCollector - }), - ); - } - - // Forward the original job's output to stdout for compatibility - io::stdout().write_all(&output.stdout)?; - io::stderr().write_all(&output.stderr)?; - - if !success { - std::process::exit(exit_code); - } - - Ok(()) - } -} - -fn main() -> Result<(), Box> { - let args: Vec = env::args().collect(); - - if args.len() < 2 { - eprintln!("Usage: job_wrapper [args...]"); - std::process::exit(1); - } - - let mode = &args[1]; - let mut wrapper = JobWrapper::new(); - - match mode.as_str() { - "config" => { - let outputs = args[2..].to_vec(); - wrapper.config_mode(outputs)?; - } - "exec" => { - // For exec mode, we need to know which original job binary to call - // For Phase 0, we'll derive this from environment or make it configurable - let job_binary = - env::var("DATABUILD_JOB_BINARY").unwrap_or_else(|_| "python3".to_string()); // Default fallback - - wrapper.exec_mode(&job_binary)?; - } - _ => { - eprintln!("Unknown mode: {}", mode); - std::process::exit(1); - } - } - - Ok(()) -} - -#[cfg(test)] -mod tests { - use super::*; - - // Test infrastructure - struct TestSink { - entries: Vec, - } - - impl TestSink { - fn new() -> Self { - Self { - entries: Vec::new(), - } - } - - fn find_event(&self, event_type: &str) -> Option<&JobLogEntry> { - self.entries.iter().find(|entry| { - if let Some(job_log_entry::Content::JobEvent(event)) = &entry.content { - event.event_type == event_type - } else { - false - } - }) - } - } - - impl LogSink for TestSink { - fn emit(&mut self, entry: JobLogEntry) { - self.entries.push(entry); - } - } - - // Helper functions for testing - fn generate_test_config(outputs: &[String]) -> JobConfig { - JobConfig { - outputs: outputs - .iter() - .map(|s| PartitionRef { r#str: s.clone() }) - .collect(), - inputs: vec![], - args: outputs.to_vec(), - env: { - let mut env_map = std::collections::HashMap::new(); - if let Some(partition_ref) = outputs.first() { - env_map.insert("PARTITION_REF".to_string(), partition_ref.clone()); - } - env_map - }, - } - } - - #[test] - fn test_job_log_entry_serialization() { - let entry = JobLogEntry { - timestamp: "1234567890".to_string(), - job_id: "test-id".to_string(), - outputs: vec![PartitionRef { r#str: "test/partition".to_string() }], - sequence_number: 1, - content: Some(job_log_entry::Content::Log(LogMessage { - level: log_message::LogLevel::Info as i32, - message: "test message".to_string(), - fields: std::collections::HashMap::new(), - })), - }; - - let json = serde_json::to_string(&entry).unwrap(); - assert!(json.contains("\"timestamp\":\"1234567890\"")); - assert!(json.contains("\"sequence_number\":1")); - assert!(json.contains("\"Log\":{")); // Capitalized field name - assert!(json.contains("\"message\":\"test message\"")); - } - - #[test] - fn test_sequence_number_increment() { - let mut wrapper = JobWrapper::new(); - assert_eq!(wrapper.next_sequence(), 1); - assert_eq!(wrapper.next_sequence(), 2); - assert_eq!(wrapper.next_sequence(), 3); - } - - #[test] - fn test_config_mode_output_format() { - let outputs = vec!["test/partition".to_string()]; - let config = generate_test_config(&outputs); - - // Verify it produces expected structure - assert_eq!(config.outputs.len(), 1); - assert_eq!(config.outputs[0].r#str, "test/partition"); - assert_eq!(config.args, outputs); - assert_eq!( - config.env.get("PARTITION_REF"), - Some(&"test/partition".to_string()) - ); - } - - #[test] - fn test_multiple_outputs_config() { - let outputs = vec![ - "reviews/date=2025-01-01".to_string(), - "reviews/date=2025-01-02".to_string(), - ]; - let config = generate_test_config(&outputs); - - assert_eq!(config.outputs.len(), 2); - assert_eq!(config.outputs[0].r#str, "reviews/date=2025-01-01"); - assert_eq!(config.outputs[1].r#str, "reviews/date=2025-01-02"); - // First output is used as PARTITION_REF - assert_eq!( - config.env.get("PARTITION_REF"), - Some(&"reviews/date=2025-01-01".to_string()) - ); - } - - #[test] - fn test_wrapper_job_event_creation() { - // Test success event - let event = WrapperJobEvent { - event_type: "task_success".to_string(), - job_status: Some("JOB_COMPLETED".to_string()), - exit_code: Some(0), - metadata: std::collections::HashMap::new(), - job_label: None, - }; - assert_eq!(event.event_type, "task_success"); - assert_eq!(event.job_status, Some("JOB_COMPLETED".to_string())); - assert_eq!(event.exit_code, Some(0)); - - // Test failure event - let event = WrapperJobEvent { - event_type: "task_failed".to_string(), - job_status: Some("JOB_FAILED".to_string()), - exit_code: Some(1), - metadata: std::collections::HashMap::new(), - job_label: None, - }; - assert_eq!(event.event_type, "task_failed"); - assert_eq!(event.job_status, Some("JOB_FAILED".to_string())); - assert_eq!(event.exit_code, Some(1)); - } - - #[test] - fn test_log_message_levels() { - let info_log = LogMessage { - level: log_message::LogLevel::Info as i32, - message: "info message".to_string(), - fields: std::collections::HashMap::new(), - }; - assert_eq!(info_log.level, log_message::LogLevel::Info as i32); - - let error_log = LogMessage { - level: log_message::LogLevel::Error as i32, - message: "error message".to_string(), - fields: std::collections::HashMap::new(), - }; - assert_eq!(error_log.level, log_message::LogLevel::Error as i32); - } - - #[test] - fn test_partition_manifest_structure() { - let config = generate_test_config(&vec!["test/partition".to_string()]); - let manifest = PartitionManifest { - outputs: config.outputs.clone(), - inputs: vec![], - start_time: 1234567890, - end_time: 1234567900, - task: Some(Task { - job: Some(JobLabel { - label: "//test:job".to_string(), - }), - config: Some(config), - }), - metadata: std::collections::HashMap::new(), - }; - - assert_eq!(manifest.outputs.len(), 1); - assert_eq!(manifest.outputs[0].r#str, "test/partition"); - assert_eq!(manifest.end_time - manifest.start_time, 10); - assert!(manifest.task.is_some()); - } - - #[test] - fn test_timestamp_generation() { - let ts1 = get_timestamp(); - std::thread::sleep(std::time::Duration::from_millis(10)); - let ts2 = get_timestamp(); - - // Timestamps should be parseable as integers - let t1: u64 = ts1.parse().expect("Should be valid timestamp"); - let t2: u64 = ts2.parse().expect("Should be valid timestamp"); - - // Second timestamp should be equal or greater - assert!(t2 >= t1); - } - - #[test] - fn test_job_wrapper_initialization() { - let wrapper = JobWrapper::new(); - assert_eq!(wrapper.sequence_number, 0); - assert!(!wrapper.job_id.is_empty()); - assert!(wrapper.start_time > 0); - } - - #[test] - fn test_cpu_metrics_are_captured() { - use std::io::Write; - use tempfile::NamedTempFile; - - // Create a CPU-intensive test script - let mut temp_file = NamedTempFile::new().expect("Failed to create temp file"); - let script_content = r#"#!/usr/bin/env python3 -import sys -import json -import time - -if len(sys.argv) > 1 and sys.argv[1] == "config": - config = { - "outputs": [{"str": "test/cpu"}], - "inputs": [], - "args": [], - "env": {"PARTITION_REF": "test/cpu"} - } - print(json.dumps({"configs": [config]})) -elif len(sys.argv) > 1 and sys.argv[1] == "exec": - # CPU-intensive work that runs longer - start_time = time.time() - total = 0 - while time.time() - start_time < 0.5: # Run for at least 500ms - total += sum(range(1_000_000)) - print(f"Sum: {total}") -"#; - - temp_file - .write_all(script_content.as_bytes()) - .expect("Failed to write script"); - let script_path = temp_file.path().to_str().unwrap(); - - // Make script executable - std::fs::set_permissions( - script_path, - std::os::unix::fs::PermissionsExt::from_mode(0o755), - ) - .expect("Failed to set permissions"); - - // Set up environment for fast sampling and the test script - env::set_var("DATABUILD_METRICS_INTERVAL_MS", "10"); // Even faster for CPU test - env::set_var("DATABUILD_JOB_BINARY", script_path); - - // Create test sink and wrapper - let sink = TestSink::new(); - let mut wrapper = JobWrapper::new_with_sink(sink); - - // Create a JobConfig for the test - let config = JobConfig { - outputs: vec![PartitionRef { - r#str: "test/cpu".to_string(), - }], - inputs: vec![], - args: vec![], - env: { - let mut env_map = std::collections::HashMap::new(); - env_map.insert("PARTITION_REF".to_string(), "test/cpu".to_string()); - env_map - }, - }; - - // We need to simulate stdin for exec_mode - let's create a test-specific exec method - // that takes the config directly rather than reading from stdin - let result = wrapper.exec_mode_with_config(script_path, config); - - // Clean up environment - env::remove_var("DATABUILD_METRICS_INTERVAL_MS"); - env::remove_var("DATABUILD_JOB_BINARY"); - - // Check that exec_mode succeeded - if let Err(e) = &result { - println!("exec_mode failed with error: {}", e); - } - assert!(result.is_ok(), "exec_mode should succeed: {:?}", result); - - // Find the job_summary event - let summary_event = wrapper - .sink - .find_event("job_summary") - .expect("Should have job_summary event"); - - if let Some(job_log_entry::Content::JobEvent(event)) = &summary_event.content { - // Verify we have CPU metrics - let cpu_ms_str = event - .metadata - .get("total_cpu_ms") - .expect("Should have total_cpu_ms metric"); - let cpu_ms: f64 = cpu_ms_str - .parse() - .expect("CPU metric should be valid float"); - - // For CPU-intensive work, we should get non-zero CPU time - assert!( - cpu_ms > 0.0, - "Expected non-zero CPU time for CPU-intensive workload, but got {:.3}ms", - cpu_ms - ); - - // Also verify runtime is reasonable - let runtime_ms_str = event - .metadata - .get("runtime_ms") - .expect("Should have runtime_ms metric"); - let runtime_ms: f64 = runtime_ms_str - .parse() - .expect("Runtime metric should be valid float"); - assert!(runtime_ms > 0.0, "Should have non-zero runtime"); - - println!( - "CPU test results: {:.3}ms CPU time over {:.3}ms runtime", - cpu_ms, runtime_ms - ); - } else { - panic!("job_summary event should contain JobEvent"); - } - } - - #[test] - fn test_heartbeat_functionality() { - use std::io::Write; - use tempfile::NamedTempFile; - - // Create a longer-running test script to trigger heartbeats - let mut temp_file = NamedTempFile::new().expect("Failed to create temp file"); - let script_content = r#"#!/usr/bin/env python3 -import sys -import json -import time - -if len(sys.argv) > 1 and sys.argv[1] == "config": - config = { - "outputs": [{"str": "test/heartbeat"}], - "inputs": [], - "args": [], - "env": {"PARTITION_REF": "test/heartbeat"} - } - print(json.dumps({"configs": [config]})) -elif len(sys.argv) > 1 and sys.argv[1] == "exec": - # Sleep long enough to trigger at least 2 heartbeats - time.sleep(0.3) # 300ms with 100ms heartbeat interval should give us 2-3 heartbeats - print("Job completed") -"#; - - temp_file - .write_all(script_content.as_bytes()) - .expect("Failed to write script"); - let script_path = temp_file.path().to_str().unwrap(); - - // Make script executable - std::fs::set_permissions( - script_path, - std::os::unix::fs::PermissionsExt::from_mode(0o755), - ) - .expect("Failed to set permissions"); - - // Set up environment for fast heartbeats and the test script - env::set_var("DATABUILD_HEARTBEAT_INTERVAL_MS", &TEST_HEARTBEAT_INTERVAL_MS.to_string()); - env::set_var("DATABUILD_METRICS_INTERVAL_MS", &TEST_METRICS_INTERVAL_MS.to_string()); - env::set_var("DATABUILD_JOB_BINARY", script_path); - - // Create test sink and wrapper - let sink = TestSink::new(); - let mut wrapper = JobWrapper::new_with_sink(sink); - - // Create a JobConfig for the test - let config = JobConfig { - outputs: vec![PartitionRef { - r#str: "test/heartbeat".to_string(), - }], - inputs: vec![], - args: vec![], - env: { - let mut env_map = std::collections::HashMap::new(); - env_map.insert("PARTITION_REF".to_string(), "test/heartbeat".to_string()); - env_map - }, - }; - - // Run the job - let result = wrapper.exec_mode_with_config(script_path, config); - - // Clean up environment - env::remove_var("DATABUILD_HEARTBEAT_INTERVAL_MS"); - env::remove_var("DATABUILD_METRICS_INTERVAL_MS"); - env::remove_var("DATABUILD_JOB_BINARY"); - - // Check that exec_mode succeeded - assert!(result.is_ok(), "exec_mode should succeed: {:?}", result); - - // Count heartbeat events - let heartbeat_count = wrapper - .sink - .entries - .iter() - .filter(|entry| { - if let Some(job_log_entry::Content::JobEvent(event)) = &entry.content { - event.event_type == "heartbeat" - } else { - false - } - }) - .count(); - - // We should have at least 1 heartbeat event (possibly 2-3 depending on timing) - assert!( - heartbeat_count >= 1, - "Expected at least 1 heartbeat event, but got {}", - heartbeat_count - ); - - // Verify heartbeat event structure - let heartbeat_event = wrapper - .sink - .entries - .iter() - .find(|entry| { - if let Some(job_log_entry::Content::JobEvent(event)) = &entry.content { - event.event_type == "heartbeat" - } else { - false - } - }) - .expect("Should have at least one heartbeat event"); - - if let Some(job_log_entry::Content::JobEvent(event)) = &heartbeat_event.content { - // Verify heartbeat contains memory and CPU metrics - assert!( - event.metadata.contains_key("memory_usage_mb"), - "Heartbeat should contain memory_usage_mb" - ); - assert!( - event.metadata.contains_key("cpu_usage_percent"), - "Heartbeat should contain cpu_usage_percent" - ); - } - } -} diff --git a/databuild/lib.rs b/databuild/lib.rs index bb7851b..a3fd2aa 100644 --- a/databuild/lib.rs +++ b/databuild/lib.rs @@ -1,41 +1,4 @@ +mod build_event_log; + // Include generated protobuf code include!("databuild.rs"); - -// Event log module -pub mod event_log; - -// Orchestration module -pub mod orchestration; - -// Service module -pub mod service; - -// Repository pattern implementations -pub mod repositories; - -pub mod mermaid_utils; - -// Status conversion utilities -pub mod status_utils; - -// Log collection module -pub mod log_collector; - -// Log access module -pub mod log_access; - -// Metric templates module -pub mod metric_templates; - -// Metrics aggregator module -pub mod metrics_aggregator; - -// Format consistency tests -#[cfg(test)] -mod format_consistency_test; - -// Re-export commonly used types from event_log -pub use event_log::{BuildEventLogError, create_bel_query_engine}; - -// Re-export orchestration types -pub use orchestration::{BuildOrchestrator, BuildResult, OrchestrationError}; \ No newline at end of file diff --git a/databuild/log_access.rs b/databuild/log_access.rs deleted file mode 100644 index 807f5f3..0000000 --- a/databuild/log_access.rs +++ /dev/null @@ -1,440 +0,0 @@ -use crate::{JobLogEntry, JobLogsRequest, JobLogsResponse, log_message}; -use serde_json; -use std::collections::HashMap; -use std::fs::{self, File}; -use std::io::{BufRead, BufReader}; -use std::path::{Path, PathBuf}; -use std::time::{SystemTime, UNIX_EPOCH}; -use thiserror::Error; - -#[derive(Error, Debug)] -pub enum LogAccessError { - #[error("IO error: {0}")] - Io(#[from] std::io::Error), - #[error("JSON parsing error: {0}")] - Json(#[from] serde_json::Error), - #[error("Invalid request: {0}")] - InvalidRequest(String), - #[error("Job not found: {0}")] - JobNotFound(String), -} - -pub struct LogReader { - logs_base_path: PathBuf, -} - -impl LogReader { - pub fn new>(logs_base_path: P) -> Self { - Self { - logs_base_path: logs_base_path.as_ref().to_path_buf(), - } - } - - /// Create LogReader with the default logs directory - pub fn default() -> Self { - Self::new(crate::log_collector::LogCollector::default_logs_dir()) - } - - /// Get job logs according to the request criteria - pub fn get_job_logs(&self, request: &JobLogsRequest) -> Result { - let job_file_path = self.find_job_file(&request.job_run_id)?; - - let file = File::open(&job_file_path)?; - let reader = BufReader::new(file); - - let mut entries = Vec::new(); - let mut count = 0u32; - let limit = if request.limit > 0 { request.limit } else { 1000 }; // Default limit - - for line in reader.lines() { - let line = line?; - - // Skip empty lines - if line.trim().is_empty() { - continue; - } - - // Parse the log entry - let entry: JobLogEntry = serde_json::from_str(&line)?; - - // Apply filters - if !self.matches_filters(&entry, request) { - continue; - } - - entries.push(entry); - count += 1; - - // Stop if we've hit the limit - if count >= limit { - break; - } - } - - // Check if there are more entries by trying to read one more - let has_more = count == limit; - - Ok(JobLogsResponse { - entries, - has_more, - }) - } - - /// List available job run IDs for a given date range - pub fn list_available_jobs(&self, date_range: Option<(String, String)>) -> Result, LogAccessError> { - let mut job_ids = Vec::new(); - - // If no date range specified, look at all directories - if let Some((start_date, end_date)) = date_range { - // Parse date range and iterate through dates - for date_str in self.date_range_iterator(&start_date, &end_date)? { - let date_dir = self.logs_base_path.join(&date_str); - if date_dir.exists() { - job_ids.extend(self.get_job_ids_from_directory(&date_dir)?); - } - } - } else { - // List all date directories and collect job IDs - if self.logs_base_path.exists() { - for entry in fs::read_dir(&self.logs_base_path)? { - let entry = entry?; - if entry.file_type()?.is_dir() { - job_ids.extend(self.get_job_ids_from_directory(&entry.path())?); - } - } - } - } - - // Remove duplicates and sort - job_ids.sort(); - job_ids.dedup(); - - Ok(job_ids) - } - - /// Get metrics points for a specific job - pub fn get_job_metrics(&self, job_run_id: &str) -> Result, LogAccessError> { - let job_file_path = self.find_job_file(job_run_id)?; - - let file = File::open(&job_file_path)?; - let reader = BufReader::new(file); - - let mut metrics = Vec::new(); - - for line in reader.lines() { - let line = line?; - - // Skip empty lines - if line.trim().is_empty() { - continue; - } - - // Parse the log entry - let entry: JobLogEntry = serde_json::from_str(&line)?; - - // Extract metrics from the entry - if let Some(crate::job_log_entry::Content::Metric(metric)) = entry.content { - metrics.push(metric); - } - } - - Ok(metrics) - } - - /// Find the JSONL file for a specific job run ID - fn find_job_file(&self, job_run_id: &str) -> Result { - // Search through all date directories for the job file - if !self.logs_base_path.exists() { - return Err(LogAccessError::JobNotFound(job_run_id.to_string())); - } - - for entry in fs::read_dir(&self.logs_base_path)? { - let entry = entry?; - if entry.file_type()?.is_dir() { - let job_file = entry.path().join(format!("{}.jsonl", job_run_id)); - if job_file.exists() { - return Ok(job_file); - } - } - } - - Err(LogAccessError::JobNotFound(job_run_id.to_string())) - } - - /// Check if a log entry matches the request filters - fn matches_filters(&self, entry: &JobLogEntry, request: &JobLogsRequest) -> bool { - // Filter by timestamp (since_timestamp is in nanoseconds) - if request.since_timestamp > 0 { - if let Ok(entry_timestamp) = entry.timestamp.parse::() { - let entry_timestamp_ns = entry_timestamp * 1_000_000_000; // Convert seconds to nanoseconds - if entry_timestamp_ns <= request.since_timestamp as u64 { - return false; - } - } - } - - // Filter by log level (only applies to log messages) - if request.min_level > 0 { - if let Some(crate::job_log_entry::Content::Log(log_msg)) = &entry.content { - if log_msg.level < request.min_level { - return false; - } - } - // For non-log entries (metrics, events), we include them regardless of min_level - } - - true - } - - /// Get job IDs from files in a specific directory - fn get_job_ids_from_directory(&self, dir_path: &Path) -> Result, LogAccessError> { - let mut job_ids = Vec::new(); - - for entry in fs::read_dir(dir_path)? { - let entry = entry?; - if entry.file_type()?.is_file() { - if let Some(file_name) = entry.file_name().to_str() { - if file_name.ends_with(".jsonl") { - // Extract job ID by removing .jsonl extension - let job_id = file_name.trim_end_matches(".jsonl"); - job_ids.push(job_id.to_string()); - } - } - } - } - - Ok(job_ids) - } - - /// Generate an iterator over date strings in a range (YYYY-MM-DD format) - fn date_range_iterator(&self, start_date: &str, end_date: &str) -> Result, LogAccessError> { - // Simple implementation - for production might want more robust date parsing - let start_parts: Vec<&str> = start_date.split('-').collect(); - let end_parts: Vec<&str> = end_date.split('-').collect(); - - if start_parts.len() != 3 || end_parts.len() != 3 { - return Err(LogAccessError::InvalidRequest("Invalid date format, expected YYYY-MM-DD".to_string())); - } - - // For now, just return the start and end dates - // In a full implementation, you'd iterate through all dates in between - let mut dates = vec![start_date.to_string()]; - if start_date != end_date { - dates.push(end_date.to_string()); - } - - Ok(dates) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::{job_log_entry, log_message, LogMessage, PartitionRef, MetricPoint}; - use std::io::Write; - use tempfile::TempDir; - - fn create_test_log_entry(job_id: &str, sequence: u64, timestamp: &str) -> JobLogEntry { - JobLogEntry { - timestamp: timestamp.to_string(), - job_id: job_id.to_string(), - outputs: vec![PartitionRef { r#str: "test/partition".to_string() }], - sequence_number: sequence, - content: Some(job_log_entry::Content::Log(LogMessage { - level: log_message::LogLevel::Info as i32, - message: format!("Test log message {}", sequence), - fields: HashMap::new(), - })), - } - } - - fn create_test_metric_entry(job_id: &str, sequence: u64, timestamp: &str) -> JobLogEntry { - JobLogEntry { - timestamp: timestamp.to_string(), - job_id: job_id.to_string(), - outputs: vec![PartitionRef { r#str: "test/partition".to_string() }], - sequence_number: sequence, - content: Some(job_log_entry::Content::Metric(MetricPoint { - name: "test_metric".to_string(), - value: 42.0, - labels: HashMap::new(), - unit: "count".to_string(), - })), - } - } - - fn setup_test_logs(temp_dir: &TempDir) -> Result<(), Box> { - // Create date directory - let date_dir = temp_dir.path().join("2025-01-27"); - fs::create_dir_all(&date_dir)?; - - // Create a test job file - let job_file = date_dir.join("job_123.jsonl"); - let mut file = File::create(&job_file)?; - - // Write test entries - let entry1 = create_test_log_entry("job_123", 1, "1737993600"); // 2025-01-27 12:00:00 - let entry2 = create_test_log_entry("job_123", 2, "1737993660"); // 2025-01-27 12:01:00 - let entry3 = create_test_metric_entry("job_123", 3, "1737993720"); // 2025-01-27 12:02:00 - - writeln!(file, "{}", serde_json::to_string(&entry1)?)?; - writeln!(file, "{}", serde_json::to_string(&entry2)?)?; - writeln!(file, "{}", serde_json::to_string(&entry3)?)?; - - Ok(()) - } - - #[test] - fn test_log_reader_creation() { - let temp_dir = TempDir::new().unwrap(); - let reader = LogReader::new(temp_dir.path()); - - assert_eq!(reader.logs_base_path, temp_dir.path()); - } - - #[test] - fn test_get_job_logs_basic() { - let temp_dir = TempDir::new().unwrap(); - setup_test_logs(&temp_dir).unwrap(); - - let reader = LogReader::new(temp_dir.path()); - let request = JobLogsRequest { - job_run_id: "job_123".to_string(), - since_timestamp: 0, - min_level: 0, - limit: 10, - }; - - let response = reader.get_job_logs(&request).unwrap(); - - assert_eq!(response.entries.len(), 3); - assert!(!response.has_more); - - // Verify the entries are in order - assert_eq!(response.entries[0].sequence_number, 1); - assert_eq!(response.entries[1].sequence_number, 2); - assert_eq!(response.entries[2].sequence_number, 3); - } - - #[test] - fn test_get_job_logs_with_timestamp_filter() { - let temp_dir = TempDir::new().unwrap(); - setup_test_logs(&temp_dir).unwrap(); - - let reader = LogReader::new(temp_dir.path()); - let request = JobLogsRequest { - job_run_id: "job_123".to_string(), - since_timestamp: 1737993600_000_000_000, // 2025-01-27 12:00:00 in nanoseconds - min_level: 0, - limit: 10, - }; - - let response = reader.get_job_logs(&request).unwrap(); - - // Should get entries 2 and 3 (after the timestamp) - assert_eq!(response.entries.len(), 2); - assert_eq!(response.entries[0].sequence_number, 2); - assert_eq!(response.entries[1].sequence_number, 3); - } - - #[test] - fn test_get_job_logs_with_level_filter() { - let temp_dir = TempDir::new().unwrap(); - setup_test_logs(&temp_dir).unwrap(); - - let reader = LogReader::new(temp_dir.path()); - let request = JobLogsRequest { - job_run_id: "job_123".to_string(), - since_timestamp: 0, - min_level: log_message::LogLevel::Warn as i32, // Only WARN and ERROR - limit: 10, - }; - - let response = reader.get_job_logs(&request).unwrap(); - - // Should get only the metric entry (sequence 3) since log entries are INFO level - assert_eq!(response.entries.len(), 1); - assert_eq!(response.entries[0].sequence_number, 3); - } - - #[test] - fn test_get_job_logs_with_limit() { - let temp_dir = TempDir::new().unwrap(); - setup_test_logs(&temp_dir).unwrap(); - - let reader = LogReader::new(temp_dir.path()); - let request = JobLogsRequest { - job_run_id: "job_123".to_string(), - since_timestamp: 0, - min_level: 0, - limit: 2, - }; - - let response = reader.get_job_logs(&request).unwrap(); - - assert_eq!(response.entries.len(), 2); - assert!(response.has_more); - assert_eq!(response.entries[0].sequence_number, 1); - assert_eq!(response.entries[1].sequence_number, 2); - } - - #[test] - fn test_list_available_jobs() { - let temp_dir = TempDir::new().unwrap(); - setup_test_logs(&temp_dir).unwrap(); - - // Create another job file - let date_dir = temp_dir.path().join("2025-01-27"); - let job_file2 = date_dir.join("job_456.jsonl"); - let mut file2 = File::create(&job_file2).unwrap(); - let entry = create_test_log_entry("job_456", 1, "1737993600"); - writeln!(file2, "{}", serde_json::to_string(&entry).unwrap()).unwrap(); - - let reader = LogReader::new(temp_dir.path()); - let job_ids = reader.list_available_jobs(None).unwrap(); - - assert_eq!(job_ids.len(), 2); - assert!(job_ids.contains(&"job_123".to_string())); - assert!(job_ids.contains(&"job_456".to_string())); - } - - #[test] - fn test_get_job_metrics() { - let temp_dir = TempDir::new().unwrap(); - setup_test_logs(&temp_dir).unwrap(); - - let reader = LogReader::new(temp_dir.path()); - let metrics = reader.get_job_metrics("job_123").unwrap(); - - assert_eq!(metrics.len(), 1); - assert_eq!(metrics[0].name, "test_metric"); - assert_eq!(metrics[0].value, 42.0); - assert_eq!(metrics[0].unit, "count"); - } - - #[test] - fn test_job_not_found() { - let temp_dir = TempDir::new().unwrap(); - let reader = LogReader::new(temp_dir.path()); - - let request = JobLogsRequest { - job_run_id: "nonexistent_job".to_string(), - since_timestamp: 0, - min_level: 0, - limit: 10, - }; - - let result = reader.get_job_logs(&request); - assert!(result.is_err()); - assert!(matches!(result.unwrap_err(), LogAccessError::JobNotFound(_))); - } - - #[test] - fn test_default_log_reader() { - let reader = LogReader::default(); - - // Should use the default logs directory - let expected = crate::log_collector::LogCollector::default_logs_dir(); - assert_eq!(reader.logs_base_path, expected); - } -} \ No newline at end of file diff --git a/databuild/log_collector.rs b/databuild/log_collector.rs deleted file mode 100644 index d6475fc..0000000 --- a/databuild/log_collector.rs +++ /dev/null @@ -1,402 +0,0 @@ -use crate::{JobLogEntry, job_log_entry}; -use serde_json; -use std::collections::HashMap; -use std::fs::{self, File, OpenOptions}; -use std::io::{BufRead, Write}; -use std::path::{Path, PathBuf}; -use std::time::{SystemTime, UNIX_EPOCH}; -use thiserror::Error; - -/// Convert days since Unix epoch to (year, month, day) -/// This is a simplified algorithm good enough for log file naming -fn days_to_ymd(days: i32) -> (i32, u32, u32) { - // Start from 1970-01-01 - let mut year = 1970; - let mut remaining_days = days; - - // Handle years - loop { - let days_in_year = if is_leap_year(year) { 366 } else { 365 }; - if remaining_days < days_in_year { - break; - } - remaining_days -= days_in_year; - year += 1; - } - - // Handle months - let mut month = 1; - for m in 1..=12 { - let days_in_month = days_in_month(year, m); - if remaining_days < days_in_month as i32 { - month = m; - break; - } - remaining_days -= days_in_month as i32; - } - - let day = remaining_days + 1; // Days are 1-indexed - (year, month, day as u32) -} - -/// Check if a year is a leap year -fn is_leap_year(year: i32) -> bool { - (year % 4 == 0 && year % 100 != 0) || (year % 400 == 0) -} - -/// Get number of days in a given month -fn days_in_month(year: i32, month: u32) -> u32 { - match month { - 1 | 3 | 5 | 7 | 8 | 10 | 12 => 31, - 4 | 6 | 9 | 11 => 30, - 2 => if is_leap_year(year) { 29 } else { 28 }, - _ => 30, // Should never happen - } -} - -#[derive(Error, Debug)] -pub enum LogCollectorError { - #[error("IO error: {0}")] - Io(#[from] std::io::Error), - #[error("JSON parsing error: {0}")] - Json(#[from] serde_json::Error), - #[error("Invalid log entry: {0}")] - InvalidLogEntry(String), -} - -pub struct LogCollector { - logs_dir: PathBuf, - active_files: HashMap, - job_label_mapping: HashMap, // job_run_id -> job_label -} - -impl LogCollector { - pub fn new>(logs_dir: P) -> Result { - let logs_dir = logs_dir.as_ref().to_path_buf(); - - // Ensure the base logs directory exists - if !logs_dir.exists() { - fs::create_dir_all(&logs_dir)?; - } - - Ok(Self { - logs_dir, - active_files: HashMap::new(), - job_label_mapping: HashMap::new(), - }) - } - - /// Set the job label for a specific job run ID - pub fn set_job_label(&mut self, job_run_id: &str, job_label: &str) { - self.job_label_mapping.insert(job_run_id.to_string(), job_label.to_string()); - } - - /// Get the default logs directory based on environment variable or fallback - pub fn default_logs_dir() -> PathBuf { - std::env::var("DATABUILD_LOGS_DIR") - .map(PathBuf::from) - .unwrap_or_else(|_| { - // Fallback to ./logs/databuild for safety - avoid system directories - std::env::current_dir() - .unwrap_or_else(|_| PathBuf::from(".")) - .join("logs") - .join("databuild") - }) - } - - /// Create a date-organized directory path for today - fn get_date_directory(&self) -> Result { - let now = SystemTime::now() - .duration_since(UNIX_EPOCH) - .map_err(|e| LogCollectorError::InvalidLogEntry(format!("System time error: {}", e)))?; - - let timestamp = now.as_secs(); - - // Convert timestamp to YYYY-MM-DD format - // Using a simple calculation instead of chrono - let days_since_epoch = timestamp / 86400; // 86400 seconds in a day - let days_since_1970 = days_since_epoch as i32; - - // Calculate year, month, day from days since epoch - // This is a simplified calculation - good enough for log file naming - let (year, month, day) = days_to_ymd(days_since_1970); - let date_str = format!("{:04}-{:02}-{:02}", year, month, day); - - let date_dir = self.logs_dir.join(date_str); - - // Ensure the date directory exists - if !date_dir.exists() { - fs::create_dir_all(&date_dir)?; - } - - Ok(date_dir) - } - - /// Get or create a file handle for a specific job run - fn get_job_file(&mut self, job_run_id: &str) -> Result<&mut File, LogCollectorError> { - if !self.active_files.contains_key(job_run_id) { - let date_dir = self.get_date_directory()?; - let file_path = date_dir.join(format!("{}.jsonl", job_run_id)); - - let file = OpenOptions::new() - .create(true) - .append(true) - .open(&file_path)?; - - self.active_files.insert(job_run_id.to_string(), file); - } - - Ok(self.active_files.get_mut(job_run_id).unwrap()) - } - - /// Write a single log entry to the appropriate JSONL file - pub fn write_log_entry(&mut self, job_run_id: &str, entry: &JobLogEntry) -> Result<(), LogCollectorError> { - let file = self.get_job_file(job_run_id)?; - let json_line = serde_json::to_string(entry)?; - writeln!(file, "{}", json_line)?; - file.flush()?; - Ok(()) - } - - /// Consume stdout from a job process and parse/store log entries - pub fn consume_job_output(&mut self, job_run_id: &str, reader: R) -> Result<(), LogCollectorError> { - for line in reader.lines() { - let line = line?; - - // Skip empty lines - if line.trim().is_empty() { - continue; - } - - // Try to parse as JobLogEntry - match serde_json::from_str::(&line) { - Ok(mut entry) => { - // Validate that the job_id matches - if entry.job_id != job_run_id { - return Err(LogCollectorError::InvalidLogEntry( - format!("Job ID mismatch: expected {}, got {}", job_run_id, entry.job_id) - )); - } - - // Enrich WrapperJobEvent and Manifest with job_label if available - if let Some(job_label) = self.job_label_mapping.get(job_run_id) { - match &mut entry.content { - Some(job_log_entry::Content::JobEvent(ref mut job_event)) => { - job_event.job_label = Some(job_label.clone()); - } - Some(job_log_entry::Content::Manifest(ref mut manifest)) => { - if let Some(ref mut task) = manifest.task { - if let Some(ref mut job) = task.job { - job.label = job_label.clone(); - } - } - } - _ => {} // No enrichment needed for Log entries - } - } - - self.write_log_entry(job_run_id, &entry)?; - } - Err(_) => { - // If it's not a JobLogEntry, treat it as raw output and create a log entry - let raw_entry = JobLogEntry { - timestamp: SystemTime::now() - .duration_since(UNIX_EPOCH) - .unwrap() - .as_secs() - .to_string(), - job_id: job_run_id.to_string(), - outputs: vec![], // Raw output doesn't have specific outputs - sequence_number: 0, // Raw output gets sequence 0 - content: Some(crate::job_log_entry::Content::Log(crate::LogMessage { - level: crate::log_message::LogLevel::Info as i32, - message: line, - fields: HashMap::new(), - })), - }; - - self.write_log_entry(job_run_id, &raw_entry)?; - } - } - } - - Ok(()) - } - - /// Close and flush all active files - pub fn close_all(&mut self) -> Result<(), LogCollectorError> { - for (_, mut file) in self.active_files.drain() { - file.flush()?; - } - Ok(()) - } - - /// Close and flush a specific job's file - pub fn close_job(&mut self, job_run_id: &str) -> Result<(), LogCollectorError> { - if let Some(mut file) = self.active_files.remove(job_run_id) { - file.flush()?; - } - Ok(()) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::{job_log_entry, log_message, LogMessage, PartitionRef}; - use std::io::Cursor; - use tempfile::TempDir; - - fn create_test_log_entry(job_id: &str, sequence: u64) -> JobLogEntry { - JobLogEntry { - timestamp: "1234567890".to_string(), - job_id: job_id.to_string(), - outputs: vec![PartitionRef { r#str: "test/partition".to_string() }], - sequence_number: sequence, - content: Some(job_log_entry::Content::Log(LogMessage { - level: log_message::LogLevel::Info as i32, - message: "Test log message".to_string(), - fields: HashMap::new(), - })), - } - } - - #[test] - fn test_log_collector_creation() { - let temp_dir = TempDir::new().unwrap(); - let collector = LogCollector::new(temp_dir.path()).unwrap(); - - assert_eq!(collector.logs_dir, temp_dir.path()); - assert!(collector.active_files.is_empty()); - } - - #[test] - fn test_write_single_log_entry() { - let temp_dir = TempDir::new().unwrap(); - let mut collector = LogCollector::new(temp_dir.path()).unwrap(); - - let entry = create_test_log_entry("job_123", 1); - collector.write_log_entry("job_123", &entry).unwrap(); - - // Verify file was created and contains the entry - collector.close_all().unwrap(); - - // Check that a date directory was created - let date_dirs: Vec<_> = fs::read_dir(temp_dir.path()).unwrap().collect(); - assert_eq!(date_dirs.len(), 1); - - // Check that the job file exists in the date directory - let date_dir_path = date_dirs[0].as_ref().unwrap().path(); - let job_files: Vec<_> = fs::read_dir(&date_dir_path).unwrap().collect(); - assert_eq!(job_files.len(), 1); - - let job_file_path = job_files[0].as_ref().unwrap().path(); - assert!(job_file_path.file_name().unwrap().to_string_lossy().contains("job_123")); - - // Verify content - let content = fs::read_to_string(&job_file_path).unwrap(); - assert!(content.contains("Test log message")); - assert!(content.contains("\"sequence_number\":1")); - } - - #[test] - fn test_consume_structured_output() { - let temp_dir = TempDir::new().unwrap(); - let mut collector = LogCollector::new(temp_dir.path()).unwrap(); - - let entry1 = create_test_log_entry("job_456", 1); - let entry2 = create_test_log_entry("job_456", 2); - - let input = format!("{}\n{}\n", - serde_json::to_string(&entry1).unwrap(), - serde_json::to_string(&entry2).unwrap() - ); - - let reader = Cursor::new(input); - collector.consume_job_output("job_456", reader).unwrap(); - collector.close_all().unwrap(); - - // Verify both entries were written - let date_dirs: Vec<_> = fs::read_dir(temp_dir.path()).unwrap().collect(); - let date_dir_path = date_dirs[0].as_ref().unwrap().path(); - let job_files: Vec<_> = fs::read_dir(&date_dir_path).unwrap().collect(); - let job_file_path = job_files[0].as_ref().unwrap().path(); - - let content = fs::read_to_string(&job_file_path).unwrap(); - let lines: Vec<&str> = content.trim().split('\n').collect(); - assert_eq!(lines.len(), 2); - - // Verify both entries can be parsed back - let parsed1: JobLogEntry = serde_json::from_str(lines[0]).unwrap(); - let parsed2: JobLogEntry = serde_json::from_str(lines[1]).unwrap(); - assert_eq!(parsed1.sequence_number, 1); - assert_eq!(parsed2.sequence_number, 2); - } - - #[test] - fn test_consume_mixed_output() { - let temp_dir = TempDir::new().unwrap(); - let mut collector = LogCollector::new(temp_dir.path()).unwrap(); - - let entry = create_test_log_entry("job_789", 1); - let structured_line = serde_json::to_string(&entry).unwrap(); - - let input = format!("{}\nRaw output line\nAnother raw line\n", structured_line); - - let reader = Cursor::new(input); - collector.consume_job_output("job_789", reader).unwrap(); - collector.close_all().unwrap(); - - // Verify all lines were captured (1 structured + 2 raw) - let date_dirs: Vec<_> = fs::read_dir(temp_dir.path()).unwrap().collect(); - let date_dir_path = date_dirs[0].as_ref().unwrap().path(); - let job_files: Vec<_> = fs::read_dir(&date_dir_path).unwrap().collect(); - let job_file_path = job_files[0].as_ref().unwrap().path(); - - let content = fs::read_to_string(&job_file_path).unwrap(); - let lines: Vec<&str> = content.trim().split('\n').collect(); - assert_eq!(lines.len(), 3); - - // First line should be the structured entry - let parsed1: JobLogEntry = serde_json::from_str(lines[0]).unwrap(); - assert_eq!(parsed1.sequence_number, 1); - - // Second and third lines should be raw output entries - let parsed2: JobLogEntry = serde_json::from_str(lines[1]).unwrap(); - let parsed3: JobLogEntry = serde_json::from_str(lines[2]).unwrap(); - assert_eq!(parsed2.sequence_number, 0); // Raw output gets sequence 0 - assert_eq!(parsed3.sequence_number, 0); - - if let Some(job_log_entry::Content::Log(log_msg)) = &parsed2.content { - assert_eq!(log_msg.message, "Raw output line"); - } else { - panic!("Expected log content"); - } - } - - #[test] - fn test_default_logs_dir() { - let default_dir = LogCollector::default_logs_dir(); - - // Should be a valid path - assert!(default_dir.is_absolute() || default_dir.starts_with(".")); - assert!(default_dir.to_string_lossy().contains("logs")); - assert!(default_dir.to_string_lossy().contains("databuild")); - } - - #[test] - fn test_job_id_validation() { - let temp_dir = TempDir::new().unwrap(); - let mut collector = LogCollector::new(temp_dir.path()).unwrap(); - - let mut entry = create_test_log_entry("wrong_job_id", 1); - entry.job_id = "wrong_job_id".to_string(); - - let input = serde_json::to_string(&entry).unwrap(); - let reader = Cursor::new(input); - - let result = collector.consume_job_output("expected_job_id", reader); - assert!(result.is_err()); - assert!(result.unwrap_err().to_string().contains("Job ID mismatch")); - } -} \ No newline at end of file diff --git a/databuild/mermaid_utils.rs b/databuild/mermaid_utils.rs deleted file mode 100644 index 59a5e93..0000000 --- a/databuild/mermaid_utils.rs +++ /dev/null @@ -1,915 +0,0 @@ -use crate::*; -use std::collections::{HashMap, HashSet}; - -/// Represents the status of a job or partition for visualization -#[derive(Debug, Clone, PartialEq)] -pub enum NodeStatus { - Pending, - Running, - Completed, - Failed, - Cancelled, - Skipped, - Available, - Delegated, -} - -impl NodeStatus { - /// Get the CSS class name for this status - fn css_class(&self) -> &'static str { - match self { - NodeStatus::Pending => "pending", - NodeStatus::Running => "running", - NodeStatus::Completed => "completed", - NodeStatus::Failed => "failed", - NodeStatus::Cancelled => "cancelled", - NodeStatus::Skipped => "skipped", - NodeStatus::Available => "available", - NodeStatus::Delegated => "delegated", - } - } -} - -/// Extract current status information from build events -pub fn extract_status_map(events: &[BuildEvent]) -> (HashMap, HashMap) { - let mut job_statuses: HashMap = HashMap::new(); - let mut partition_statuses: HashMap = HashMap::new(); - - // Process events in chronological order to get latest status - let mut sorted_events = events.to_vec(); - sorted_events.sort_by_key(|e| e.timestamp); - - for event in sorted_events { - match &event.event_type { - Some(crate::build_event::EventType::JobEvent(job_event)) => { - if let Some(job_label) = &job_event.job_label { - let status = match job_event.status_code { - 1 => NodeStatus::Running, // JOB_SCHEDULED - 2 => NodeStatus::Running, // JOB_RUNNING - 3 => NodeStatus::Completed, // JOB_COMPLETED - 4 => NodeStatus::Failed, // JOB_FAILED - 5 => NodeStatus::Cancelled, // JOB_CANCELLED - 6 => NodeStatus::Skipped, // JOB_SKIPPED - _ => NodeStatus::Pending, - }; - - // Create a unique key using job label + target partitions (same as node ID) - let outputs_label = job_event.target_partitions.iter() - .map(|p| p.str.clone()) - .collect::>() - .join("___"); - let unique_key = encode_id(&(job_label.label.clone() + "___" + &outputs_label)); - - job_statuses.insert(unique_key, status); - } - } - Some(crate::build_event::EventType::PartitionEvent(partition_event)) => { - if let Some(partition_ref) = &partition_event.partition_ref { - let status = match partition_event.status_code { - 1 => NodeStatus::Pending, // PARTITION_REQUESTED - 2 => NodeStatus::Pending, // PARTITION_ANALYZED - 3 => NodeStatus::Running, // PARTITION_BUILDING - 4 => NodeStatus::Available, // PARTITION_AVAILABLE - 5 => NodeStatus::Failed, // PARTITION_FAILED - 6 => NodeStatus::Delegated, // PARTITION_DELEGATED - _ => NodeStatus::Pending, - }; - partition_statuses.insert(partition_ref.str.clone(), status); - } - } - _ => {} - } - } - - (job_statuses, partition_statuses) -} - -/// Convert NodeStatus to EdgeStatus for edge coloring -fn map_node_status_to_edge_status(node_status: &NodeStatus) -> EdgeStatus { - match node_status { - NodeStatus::Failed => EdgeStatus::Failed, - NodeStatus::Running => EdgeStatus::Running, - NodeStatus::Completed => EdgeStatus::Completed, - NodeStatus::Available => EdgeStatus::Available, - NodeStatus::Pending => EdgeStatus::Pending, - NodeStatus::Cancelled => EdgeStatus::Failed, // Treat cancelled as failed - NodeStatus::Skipped => EdgeStatus::Pending, // Treat skipped as pending - NodeStatus::Delegated => EdgeStatus::Available, // Treat delegated as available - } -} - -/// Encodes ID for safe usage in mermaid graph -fn encode_id(id: &str) -> String { - id.replace("/", "_").replace("=", "_").replace(":", "_") -} - -/// Trait for all Mermaid node types -trait MermaidNode { - fn id(&self) -> &str; - #[allow(dead_code)] - fn label(&self) -> &str; - fn render(&self, status: &NodeStatus) -> String; -} - -/// Represents a job node in the Mermaid diagram -struct MermaidJobNode { - task: Task, - id: String, - label: String, -} - -impl MermaidJobNode { - fn from(task: &Task) -> Option { - let job_label: String = match &task.job { - Some(job) => job.label.clone(), - None => return None, - }; - - let outputs_label: String = match &task.config { - Some(config) => config.outputs.iter() - .map(|o| o.str.clone()) - .collect::>() - .join("___"), - None => String::new(), - }; - - let id = encode_id(&(job_label.clone() + "___" + &outputs_label)); - let label = format!("**{}** {}", job_label, outputs_label); - - Some(MermaidJobNode { - task: task.clone(), - id, - label, - }) - } - - fn to_mermaid(&self, job_statuses: &HashMap) -> String { - // Use the same unique ID logic for status lookup as we use for the node ID - let status = job_statuses.get(&self.id).unwrap_or(&NodeStatus::Pending); - self.render(status) - } -} - -impl MermaidNode for MermaidJobNode { - fn id(&self) -> &str { - &self.id - } - - fn label(&self) -> &str { - &self.label - } - - fn render(&self, status: &NodeStatus) -> String { - format!(" {}[\"{}\"]:::job_{}\n", self.id, self.label, status.css_class()) - } -} - -/// Represents a partition node in the Mermaid diagram -struct MermaidPartitionNode { - id: String, - label: String, - is_output: bool, -} - -impl MermaidPartitionNode { - fn new(partition_ref: &str, is_output: bool) -> Self { - let id = format!("ref_{}", encode_id(partition_ref)); - let label = partition_ref.to_string(); - - Self { - id, - label, - is_output, - } - } -} - -impl MermaidNode for MermaidPartitionNode { - fn id(&self) -> &str { - &self.id - } - - fn label(&self) -> &str { - &self.label - } - - fn render(&self, status: &NodeStatus) -> String { - let node_class = if self.is_output { - format!("outputPartition_{}", status.css_class()) - } else { - format!("partition_{}", status.css_class()) - }; - - format!(" {}[(\"{}\")]:::{}\n", self.id, encode_id(&self.label), node_class) - } -} - -/// Types of edges in the diagram -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -enum EdgeType { - Solid, // Regular dependency - Dotted, // Weak dependency -} - -/// Status of an edge for coloring purposes -#[derive(Debug, Clone, PartialEq)] -enum EdgeStatus { - Failed, // Red - critical path issues - Running, // Yellow - actively processing - Completed, // Green - successfully processed - Available, // Light green - data ready - Pending, // Gray - waiting/not started -} - -/// Represents an edge between two nodes -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -struct MermaidEdge { - from_id: String, - to_id: String, - edge_type: EdgeType, -} - -impl MermaidEdge { - fn new(from_id: String, to_id: String, edge_type: EdgeType) -> Self { - Self { from_id, to_id, edge_type } - } - - fn render(&self) -> String { - match self.edge_type { - EdgeType::Solid => format!(" {} --> {}\n", self.from_id, self.to_id), - EdgeType::Dotted => format!(" {} -.-> {}\n", self.from_id, self.to_id), - } - } -} - -/// Collection of edges with deduplication -struct EdgeCollection { - edges: HashSet, -} - -impl EdgeCollection { - fn new() -> Self { - Self { - edges: HashSet::new(), - } - } - - fn add(&mut self, edge: MermaidEdge) { - self.edges.insert(edge); - } - - fn render_all(&self) -> String { - self.edges.iter() - .map(|edge| edge.render()) - .collect::>() - .join("") - } -} - -/// Style rule for a specific node type and status combination -struct StyleRule { - class_name: String, - fill: &'static str, - stroke: &'static str, - stroke_width: &'static str, -} - -impl StyleRule { - fn render(&self) -> String { - format!( - " classDef {} fill:{},stroke:{},stroke-width:{};\n", - self.class_name, self.fill, self.stroke, self.stroke_width - ) - } -} - -/// Manages all styling for the Mermaid diagram -struct MermaidStyleSheet { - rules: Vec, -} - -impl MermaidStyleSheet { - fn default() -> Self { - let mut rules = Vec::new(); - - // Job status styles - rules.push(StyleRule { - class_name: "job_pending".to_string(), - fill: "#e0e0e0", - stroke: "#333", - stroke_width: "1px", - }); - rules.push(StyleRule { - class_name: "job_running".to_string(), - fill: "#ffeb3b", - stroke: "#333", - stroke_width: "2px", - }); - rules.push(StyleRule { - class_name: "job_completed".to_string(), - fill: "#4caf50", - stroke: "#333", - stroke_width: "2px", - }); - rules.push(StyleRule { - class_name: "job_failed".to_string(), - fill: "#f44336", - stroke: "#333", - stroke_width: "2px", - }); - rules.push(StyleRule { - class_name: "job_cancelled".to_string(), - fill: "#ff9800", - stroke: "#333", - stroke_width: "2px", - }); - rules.push(StyleRule { - class_name: "job_skipped".to_string(), - fill: "#9e9e9e", - stroke: "#333", - stroke_width: "1px", - }); - - // Partition status styles - rules.push(StyleRule { - class_name: "partition_pending".to_string(), - fill: "#e3f2fd", - stroke: "#333", - stroke_width: "1px", - }); - rules.push(StyleRule { - class_name: "partition_running".to_string(), - fill: "#fff9c4", - stroke: "#333", - stroke_width: "2px", - }); - rules.push(StyleRule { - class_name: "partition_available".to_string(), - fill: "#c8e6c9", - stroke: "#333", - stroke_width: "2px", - }); - rules.push(StyleRule { - class_name: "partition_failed".to_string(), - fill: "#ffcdd2", - stroke: "#333", - stroke_width: "2px", - }); - rules.push(StyleRule { - class_name: "partition_delegated".to_string(), - fill: "#d1c4e9", - stroke: "#333", - stroke_width: "2px", - }); - - // Output partition status styles (highlighted versions) - rules.push(StyleRule { - class_name: "outputPartition_pending".to_string(), - fill: "#bbdefb", - stroke: "#333", - stroke_width: "3px", - }); - rules.push(StyleRule { - class_name: "outputPartition_running".to_string(), - fill: "#fff59d", - stroke: "#333", - stroke_width: "3px", - }); - rules.push(StyleRule { - class_name: "outputPartition_available".to_string(), - fill: "#a5d6a7", - stroke: "#333", - stroke_width: "3px", - }); - rules.push(StyleRule { - class_name: "outputPartition_failed".to_string(), - fill: "#ef9a9a", - stroke: "#333", - stroke_width: "3px", - }); - rules.push(StyleRule { - class_name: "outputPartition_delegated".to_string(), - fill: "#b39ddb", - stroke: "#333", - stroke_width: "3px", - }); - - Self { rules } - } - - fn render(&self) -> String { - let mut result = String::from("\n %% Styling\n"); - for rule in &self.rules { - result.push_str(&rule.render()); - } - result - } - - fn get_edge_color(&self, status: &EdgeStatus) -> &'static str { - match status { - EdgeStatus::Failed => "#ff4444", // Red - EdgeStatus::Running => "#ffaa00", // Orange - EdgeStatus::Completed => "#44aa44", // Green - EdgeStatus::Available => "#88cc88", // Light green - EdgeStatus::Pending => "#888888", // Gray - } - } -} - -/// Builder for constructing Mermaid diagrams -struct MermaidDiagramBuilder { - job_nodes: HashMap, - partition_nodes: HashMap, - edges: EdgeCollection, - output_refs: HashSet, - edge_count: usize, -} - -impl MermaidDiagramBuilder { - fn new() -> Self { - Self { - job_nodes: HashMap::new(), - partition_nodes: HashMap::new(), - edges: EdgeCollection::new(), - output_refs: HashSet::new(), - edge_count: 0, - } - } - - fn set_output_refs(&mut self, refs: &[PartitionRef]) { - for ref_str in refs { - self.output_refs.insert(ref_str.str.clone()); - } - } - - fn add_job_node(&mut self, node: MermaidJobNode) { - self.job_nodes.insert(node.id().to_string(), node); - } - - fn add_partition_node(&mut self, partition_ref: &str) -> String { - let is_output = self.output_refs.contains(partition_ref); - let node = MermaidPartitionNode::new(partition_ref, is_output); - let id = node.id().to_string(); - self.partition_nodes.entry(partition_ref.to_string()) - .or_insert(node); - id - } - - fn add_edge(&mut self, from_id: String, to_id: String, edge_type: EdgeType) { - self.edges.add(MermaidEdge::new(from_id, to_id, edge_type)); - } - - fn add_edge_with_status(&mut self, from_id: String, to_id: String, edge_type: EdgeType, - edge_status: EdgeStatus, result: &mut String, stylesheet: &MermaidStyleSheet) { - // Create the edge - let edge = MermaidEdge::new(from_id, to_id, edge_type); - - // Check if this edge already exists (for deduplication) - if self.edges.edges.contains(&edge) { - return; // Skip duplicate edge - } - - // Render the edge - result.push_str(&edge.render()); - - // Add edge to collection for deduplication tracking - self.edges.add(edge); - - // Immediately render the linkStyle if status is not pending - if edge_status != EdgeStatus::Pending { - let color = stylesheet.get_edge_color(&edge_status); - result.push_str(&format!(" linkStyle {} stroke:{},stroke-width:2px\n", - self.edge_count, color)); - } - - self.edge_count += 1; - } - - fn build_with_edges(self, statuses: &(HashMap, HashMap), - stylesheet: MermaidStyleSheet, edges_content: String) -> String { - let (job_statuses, partition_statuses) = statuses; - let mut result = String::from("flowchart TD\n"); - - // Render all job nodes - for (_, job_node) in self.job_nodes { - result.push_str(&job_node.to_mermaid(job_statuses)); - } - - // Render all partition nodes - for (partition_ref, node) in self.partition_nodes { - let status = partition_statuses.get(&partition_ref).unwrap_or(&NodeStatus::Pending); - result.push_str(&node.render(status)); - } - - // Add the edges content (which includes linkStyle statements) - result.push_str(&edges_content); - - // Apply styles - result.push_str(&stylesheet.render()); - - result - } -} - - -pub fn generate_mermaid_diagram(graph: &JobGraph) -> String { - generate_mermaid_with_status(graph, &[]) -} - -/// Generate a mermaid diagram for a job graph with current status annotations -pub fn generate_mermaid_with_status( - graph: &JobGraph, - events: &[BuildEvent], -) -> String { - let statuses = extract_status_map(events); - let (job_statuses, partition_statuses) = &statuses; - let mut builder = MermaidDiagramBuilder::new(); - let stylesheet = MermaidStyleSheet::default(); - - // Set output refs for highlighting - builder.set_output_refs(&graph.outputs); - - // String to accumulate edges with their styles - let mut edges_content = String::new(); - - // Process all task nodes - for task in &graph.nodes { - if let Some(job_node) = MermaidJobNode::from(task) { - let job_id = job_node.id().to_string(); - builder.add_job_node(job_node); - - if let Some(config) = &task.config { - // Process inputs (dependencies) - for input in &config.inputs { - if let Some(partition_ref) = &input.partition_ref { - let ref_id = builder.add_partition_node(&partition_ref.str); - let edge_type = if input.dep_type_code == 1 { - EdgeType::Solid - } else { - EdgeType::Dotted - }; - - // Get partition status for edge coloring - let partition_status = partition_statuses.get(&partition_ref.str) - .unwrap_or(&NodeStatus::Pending); - let edge_status = map_node_status_to_edge_status(partition_status); - - builder.add_edge_with_status(ref_id, job_id.clone(), edge_type, - edge_status, &mut edges_content, &stylesheet); - } - } - - // Process outputs - for output in &config.outputs { - let ref_id = builder.add_partition_node(&output.str); - - // Get job status for edge coloring - let job_status = job_statuses.get(&job_id) - .unwrap_or(&NodeStatus::Pending); - let edge_status = map_node_status_to_edge_status(job_status); - - builder.add_edge_with_status(job_id.clone(), ref_id, EdgeType::Solid, - edge_status, &mut edges_content, &stylesheet); - } - } - } - } - - // Build the diagram with edges content - builder.build_with_edges(&statuses, stylesheet, edges_content) -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_encode_id() { - assert_eq!(encode_id("path/to/file"), "path_to_file"); - assert_eq!(encode_id("key=value"), "key_value"); - assert_eq!(encode_id("scope:item"), "scope_item"); - assert_eq!(encode_id("a/b=c:d"), "a_b_c_d"); - } - - #[test] - fn test_mermaid_job_node() { - let mut task = Task::default(); - task.job = Some(JobLabel { label: "test_job".to_string() }); - task.config = Some(JobConfig { - outputs: vec![ - PartitionRef { str: "output1".to_string() }, - PartitionRef { str: "output2".to_string() }, - ], - inputs: vec![], - args: vec![], - env: HashMap::new(), - }); - - let node = MermaidJobNode::from(&task).expect("Failed to create job node"); - assert_eq!(node.id(), "test_job___output1___output2"); - assert_eq!(node.label(), "**test_job** output1___output2"); - - let rendered = node.render(&NodeStatus::Running); - assert!(rendered.contains("test_job___output1___output2")); - assert!(rendered.contains("**test_job** output1___output2")); - assert!(rendered.contains("job_running")); - } - - #[test] - fn test_mermaid_partition_node() { - let node = MermaidPartitionNode::new("data/partition=1", false); - assert_eq!(node.id(), "ref_data_partition_1"); - assert_eq!(node.label(), "data/partition=1"); - - let rendered = node.render(&NodeStatus::Available); - assert!(rendered.contains("ref_data_partition_1")); - assert!(rendered.contains("data_partition_1")); - assert!(rendered.contains("partition_available")); - - // Test output partition - let output_node = MermaidPartitionNode::new("output/data", true); - let output_rendered = output_node.render(&NodeStatus::Available); - assert!(output_rendered.contains("outputPartition_available")); - } - - #[test] - fn test_edge_collection() { - let mut edges = EdgeCollection::new(); - - // Add edges - edges.add(MermaidEdge::new("node1".to_string(), "node2".to_string(), EdgeType::Solid)); - edges.add(MermaidEdge::new("node2".to_string(), "node3".to_string(), EdgeType::Dotted)); - - // Test deduplication - edges.add(MermaidEdge::new("node1".to_string(), "node2".to_string(), EdgeType::Solid)); - - let rendered = edges.render_all(); - assert!(rendered.contains("node1 --> node2")); - assert!(rendered.contains("node2 -.-> node3")); - - // Should only have 2 unique edges - assert_eq!(rendered.matches("-->").count(), 1); - assert_eq!(rendered.matches("-.->").count(), 1); - } - - #[test] - fn test_simple_graph_generation() { - // Create task 1 - let mut task1 = Task::default(); - task1.job = Some(JobLabel { label: "job1".to_string() }); - task1.config = Some(JobConfig { - inputs: vec![{ - let mut input = DataDep::default(); - input.partition_ref = Some(PartitionRef { str: "input/data".to_string() }); - input.dep_type_code = 1; // Solid dependency - input.dep_type_name = "materialize".to_string(); - input - }], - outputs: vec![ - PartitionRef { str: "intermediate/data".to_string() }, - ], - args: vec![], - env: HashMap::new(), - }); - - // Create task 2 - let mut task2 = Task::default(); - task2.job = Some(JobLabel { label: "job2".to_string() }); - task2.config = Some(JobConfig { - inputs: vec![{ - let mut input = DataDep::default(); - input.partition_ref = Some(PartitionRef { str: "intermediate/data".to_string() }); - input.dep_type_code = 0; // Dotted dependency - input.dep_type_name = "query".to_string(); - input - }], - outputs: vec![ - PartitionRef { str: "output/data".to_string() }, - ], - args: vec![], - env: HashMap::new(), - }); - - // Create a simple graph - let mut graph = JobGraph::default(); - graph.nodes = vec![task1, task2]; - graph.outputs = vec![ - PartitionRef { str: "output/data".to_string() }, - ]; - - let mermaid = generate_mermaid_diagram(&graph); - - // Check basic structure - assert!(mermaid.starts_with("flowchart TD\n")); - - // Check nodes - verify both ID and label are present - assert!(mermaid.contains("job1___intermediate_data"), "Missing job1 node ID"); - assert!(mermaid.contains("**job1** intermediate/data"), "Missing job1 label"); - assert!(mermaid.contains("job2___output_data"), "Missing job2 node ID"); - assert!(mermaid.contains("**job2** output/data"), "Missing job2 label"); - assert!(mermaid.contains("ref_input_data")); - assert!(mermaid.contains("ref_intermediate_data")); - assert!(mermaid.contains("ref_output_data")); - - // Check edges - assert!(mermaid.contains("ref_input_data --> job1")); - assert!(mermaid.contains("job1___intermediate_data --> ref_intermediate_data")); - assert!(mermaid.contains("ref_intermediate_data -.-> job2")); - assert!(mermaid.contains("job2___output_data --> ref_output_data")); - - // Check styling - assert!(mermaid.contains("classDef job_pending")); - assert!(mermaid.contains("classDef partition_pending")); - assert!(mermaid.contains("classDef outputPartition_pending")); - } - - #[test] - fn test_status_extraction() { - let mut event1 = BuildEvent::default(); - event1.timestamp = 1; - event1.event_type = Some(crate::build_event::EventType::JobEvent({ - let mut job_event = JobEvent::default(); - job_event.job_label = Some(JobLabel { label: "test_job".to_string() }); - job_event.status_code = 2; // JOB_RUNNING - job_event - })); - - let mut event2 = BuildEvent::default(); - event2.timestamp = 2; - event2.event_type = Some(crate::build_event::EventType::PartitionEvent({ - let mut partition_event = PartitionEvent::default(); - partition_event.partition_ref = Some(PartitionRef { str: "test/partition".to_string() }); - partition_event.status_code = 4; // PARTITION_AVAILABLE - partition_event - })); - - let events = vec![event1, event2]; - - let (job_statuses, partition_statuses) = extract_status_map(&events); - - // Should use the unique key (job_label + target_partitions) instead of just job_label - assert_eq!(job_statuses.get("test_job"), None, "Should not find job by label alone"); - assert_eq!(partition_statuses.get("test/partition"), Some(&NodeStatus::Available)); - } - - #[test] - fn test_job_status_per_task_instance() { - // Test that different task instances with same job label get different status - let mut event1 = BuildEvent::default(); - event1.event_type = Some(crate::build_event::EventType::JobEvent({ - let mut job_event = JobEvent::default(); - job_event.job_label = Some(JobLabel { label: "same_job".to_string() }); - job_event.target_partitions = vec![PartitionRef { str: "output1".to_string() }]; - job_event.status_code = 2; // JOB_RUNNING - job_event - })); - - let mut event2 = BuildEvent::default(); - event2.event_type = Some(crate::build_event::EventType::JobEvent({ - let mut job_event = JobEvent::default(); - job_event.job_label = Some(JobLabel { label: "same_job".to_string() }); - job_event.target_partitions = vec![PartitionRef { str: "output2".to_string() }]; - job_event.status_code = 3; // JOB_COMPLETED - job_event - })); - - let events = vec![event1, event2]; - let (job_statuses, _) = extract_status_map(&events); - - // Each task should have its own status based on unique key - assert_eq!(job_statuses.get("same_job___output1"), Some(&NodeStatus::Running)); - assert_eq!(job_statuses.get("same_job___output2"), Some(&NodeStatus::Completed)); - assert_eq!(job_statuses.get("same_job"), None, "Should not find job by label alone"); - } - - #[test] - fn test_edge_coloring_with_status() { - // Create a simple graph with status - let mut task1 = Task::default(); - task1.job = Some(JobLabel { label: "job1".to_string() }); - task1.config = Some(JobConfig { - inputs: vec![{ - let mut input = DataDep::default(); - input.partition_ref = Some(PartitionRef { str: "input/data".to_string() }); - input.dep_type_code = 1; // Solid dependency - input.dep_type_name = "materialize".to_string(); - input - }], - outputs: vec![ - PartitionRef { str: "intermediate/data".to_string() }, - ], - args: vec![], - env: HashMap::new(), - }); - - let mut graph = JobGraph::default(); - graph.nodes = vec![task1]; - graph.outputs = vec![ - PartitionRef { str: "intermediate/data".to_string() }, - ]; - - // Create events to set status - let mut partition_event = BuildEvent::default(); - partition_event.event_type = Some(crate::build_event::EventType::PartitionEvent({ - let mut pe = PartitionEvent::default(); - pe.partition_ref = Some(PartitionRef { str: "input/data".to_string() }); - pe.status_code = 4; // PARTITION_AVAILABLE - pe - })); - - let mut job_event = BuildEvent::default(); - job_event.event_type = Some(crate::build_event::EventType::JobEvent({ - let mut je = JobEvent::default(); - je.job_label = Some(JobLabel { label: "job1".to_string() }); - je.target_partitions = vec![PartitionRef { str: "intermediate/data".to_string() }]; - je.status_code = 2; // JOB_RUNNING - je - })); - - let events = vec![partition_event, job_event]; - let mermaid = generate_mermaid_with_status(&graph, &events); - - // Check that linkStyle statements are present - assert!(mermaid.contains("linkStyle"), "Should contain linkStyle statements"); - assert!(mermaid.contains("#88cc88"), "Should contain available edge color (light green)"); - assert!(mermaid.contains("#ffaa00"), "Should contain running edge color (orange)"); - - // Check basic structure is still intact - assert!(mermaid.contains("flowchart TD")); - assert!(mermaid.contains("job1___intermediate_data")); - assert!(mermaid.contains("ref_input_data")); - assert!(mermaid.contains("ref_intermediate_data")); - } - - #[test] - fn test_edge_status_mapping() { - assert_eq!(map_node_status_to_edge_status(&NodeStatus::Failed), EdgeStatus::Failed); - assert_eq!(map_node_status_to_edge_status(&NodeStatus::Running), EdgeStatus::Running); - assert_eq!(map_node_status_to_edge_status(&NodeStatus::Completed), EdgeStatus::Completed); - assert_eq!(map_node_status_to_edge_status(&NodeStatus::Available), EdgeStatus::Available); - assert_eq!(map_node_status_to_edge_status(&NodeStatus::Pending), EdgeStatus::Pending); - assert_eq!(map_node_status_to_edge_status(&NodeStatus::Cancelled), EdgeStatus::Failed); - assert_eq!(map_node_status_to_edge_status(&NodeStatus::Skipped), EdgeStatus::Pending); - assert_eq!(map_node_status_to_edge_status(&NodeStatus::Delegated), EdgeStatus::Available); - } - - #[test] - fn test_edge_deduplication() { - // Create a graph that could potentially have duplicate edges - let mut task1 = Task::default(); - task1.job = Some(JobLabel { label: "job1".to_string() }); - task1.config = Some(JobConfig { - inputs: vec![{ - let mut input = DataDep::default(); - input.partition_ref = Some(PartitionRef { str: "shared_input".to_string() }); - input.dep_type_code = 1; - input.dep_type_name = "materialize".to_string(); - input - }], - outputs: vec![ - PartitionRef { str: "output1".to_string() }, - ], - args: vec![], - env: HashMap::new(), - }); - - let mut task2 = Task::default(); - task2.job = Some(JobLabel { label: "job2".to_string() }); - task2.config = Some(JobConfig { - inputs: vec![{ - let mut input = DataDep::default(); - input.partition_ref = Some(PartitionRef { str: "shared_input".to_string() }); - input.dep_type_code = 1; - input.dep_type_name = "materialize".to_string(); - input - }], - outputs: vec![ - PartitionRef { str: "output2".to_string() }, - ], - args: vec![], - env: HashMap::new(), - }); - - let mut graph = JobGraph::default(); - graph.nodes = vec![task1, task2]; - graph.outputs = vec![ - PartitionRef { str: "output1".to_string() }, - PartitionRef { str: "output2".to_string() }, - ]; - - let mermaid = generate_mermaid_diagram(&graph); - - // Count how many times the shared edge appears - let shared_edge_count = mermaid.matches("ref_shared_input --> job").count(); - - // Should only appear once per job (2 total), not duplicated - assert_eq!(shared_edge_count, 2, "Should have exactly 2 edges from shared_input (one to each job)"); - - // Verify no duplicate edges in the output - let lines: Vec<&str> = mermaid.lines().collect(); - let edge_lines: Vec<&str> = lines.iter().filter(|line| line.contains("-->") || line.contains("-.->")).cloned().collect(); - let unique_edges: std::collections::HashSet<&str> = edge_lines.iter().cloned().collect(); - - assert_eq!(edge_lines.len(), unique_edges.len(), "Should have no duplicate edges in output"); - } -} \ No newline at end of file diff --git a/databuild/metric_templates.rs b/databuild/metric_templates.rs deleted file mode 100644 index 139f34a..0000000 --- a/databuild/metric_templates.rs +++ /dev/null @@ -1,523 +0,0 @@ -use crate::{JobLogEntry, job_log_entry, WrapperJobEvent}; -use std::collections::HashMap; - -/// Template for metric extraction from job events -#[derive(Debug, Clone)] -pub struct MetricTemplate { - pub name: String, - pub help: String, - pub metric_type: MetricType, - pub extractor: MetricExtractor, - pub labels: Vec, // Static label names for this metric -} - -/// Prometheus metric types -#[derive(Debug, Clone)] -pub enum MetricType { - Counter, - Gauge, - Histogram, - Summary, -} - -/// Strategy for extracting metric values from job events -#[derive(Debug, Clone)] -pub enum MetricExtractor { - /// Extract from job event metadata by key - EventMetadata { - event_type: String, - metadata_key: String, - /// Optional conversion function name for non-numeric values - converter: Option, - }, - /// Count occurrences of specific event types - EventCount { - event_type: String, - }, - /// Extract job duration from start/end events - JobDuration, - /// Extract peak memory from job summary - PeakMemory, - /// Extract total CPU time from job summary - TotalCpuTime, - /// Extract exit code from job events - ExitCode, -} - -/// Converters for non-numeric metadata values -#[derive(Debug, Clone)] -pub enum MetricConverter { - /// Convert boolean strings to 0/1 - BoolToFloat, - /// Convert status strings to numeric codes - StatusToCode(HashMap), - /// Parse duration strings like "123ms" to seconds - DurationToSeconds, -} - -/// Result of metric extraction -#[derive(Debug)] -pub struct ExtractedMetric { - pub name: String, - pub value: f64, - pub labels: HashMap, - pub help: String, - pub metric_type: MetricType, -} - -impl MetricTemplate { - /// Extract a metric from a job log entry if applicable - pub fn extract(&self, entry: &JobLogEntry) -> Option { - let value = match &self.extractor { - MetricExtractor::EventMetadata { event_type, metadata_key, converter } => { - if let Some(job_log_entry::Content::JobEvent(event)) = &entry.content { - if event.event_type == *event_type { - if let Some(raw_value) = event.metadata.get(metadata_key) { - self.convert_value(raw_value, converter)? - } else { - return None; - } - } else { - return None; - } - } else { - return None; - } - }, - MetricExtractor::EventCount { event_type } => { - if let Some(job_log_entry::Content::JobEvent(event)) = &entry.content { - if event.event_type == *event_type { - 1.0 - } else { - return None; - } - } else { - return None; - } - }, - MetricExtractor::JobDuration => { - if let Some(job_log_entry::Content::JobEvent(event)) = &entry.content { - if event.event_type == "job_summary" { - if let Some(runtime_str) = event.metadata.get("runtime_ms") { - runtime_str.parse::().ok()? / 1000.0 // Convert to seconds - } else { - return None; - } - } else { - return None; - } - } else { - return None; - } - }, - MetricExtractor::PeakMemory => { - if let Some(job_log_entry::Content::JobEvent(event)) = &entry.content { - if event.event_type == "job_summary" { - if let Some(memory_str) = event.metadata.get("peak_memory_mb") { - memory_str.parse::().ok()? - } else { - return None; - } - } else { - return None; - } - } else { - return None; - } - }, - MetricExtractor::TotalCpuTime => { - if let Some(job_log_entry::Content::JobEvent(event)) = &entry.content { - if event.event_type == "job_summary" { - if let Some(cpu_str) = event.metadata.get("total_cpu_ms") { - cpu_str.parse::().ok()? / 1000.0 // Convert to seconds - } else { - return None; - } - } else { - return None; - } - } else { - return None; - } - }, - MetricExtractor::ExitCode => { - if let Some(job_log_entry::Content::JobEvent(event)) = &entry.content { - if let Some(exit_code) = event.exit_code { - exit_code as f64 - } else { - return None; - } - } else { - return None; - } - }, - }; - - // Generate labels for this metric - let mut labels = HashMap::new(); - - // Always include job_id as a label (but this is excluded by default for cardinality safety) - labels.insert("job_id".to_string(), entry.job_id.clone()); - - // Extract job label from manifest if available - this is the low-cardinality identifier - if let Some(job_log_entry::Content::Manifest(manifest)) = &entry.content { - if let Some(task) = &manifest.task { - if let Some(job) = &task.job { - labels.insert("job_label".to_string(), job.label.clone()); - } - } - } - - // Add job status and job label if available from job events - if let Some(job_log_entry::Content::JobEvent(event)) = &entry.content { - if let Some(job_status) = &event.job_status { - labels.insert("job_status".to_string(), job_status.clone()); - } - if let Some(job_label) = &event.job_label { - labels.insert("job_label".to_string(), job_label.clone()); - } - } - - Some(ExtractedMetric { - name: self.name.clone(), - value, - labels, - help: self.help.clone(), - metric_type: self.metric_type.clone(), - }) - } - - fn convert_value(&self, raw_value: &str, converter: &Option) -> Option { - match converter { - None => raw_value.parse().ok(), - Some(MetricConverter::BoolToFloat) => { - match raw_value.to_lowercase().as_str() { - "true" | "1" | "yes" => Some(1.0), - "false" | "0" | "no" => Some(0.0), - _ => None, - } - }, - Some(MetricConverter::StatusToCode(mapping)) => { - mapping.get(raw_value).copied() - }, - Some(MetricConverter::DurationToSeconds) => { - // Parse formats like "123ms", "45s", "2.5m" - if raw_value.ends_with("ms") { - raw_value.trim_end_matches("ms").parse::().ok().map(|v| v / 1000.0) - } else if raw_value.ends_with("s") { - raw_value.trim_end_matches("s").parse::().ok() - } else if raw_value.ends_with("m") { - raw_value.trim_end_matches("m").parse::().ok().map(|v| v * 60.0) - } else { - raw_value.parse::().ok() - } - }, - } - } -} - - -/// Get standard DataBuild metric templates -pub fn get_standard_metrics() -> Vec { - vec![ - // Job execution metrics - MetricTemplate { - name: "databuild_job_duration_seconds".to_string(), - help: "Duration of job execution in seconds".to_string(), - metric_type: MetricType::Histogram, - extractor: MetricExtractor::JobDuration, - labels: vec!["job_label".to_string()], - }, - MetricTemplate { - name: "databuild_job_peak_memory_mb".to_string(), - help: "Peak memory usage of job in megabytes".to_string(), - metric_type: MetricType::Gauge, - extractor: MetricExtractor::PeakMemory, - labels: vec!["job_label".to_string()], - }, - MetricTemplate { - name: "databuild_job_cpu_time_seconds".to_string(), - help: "Total CPU time consumed by job in seconds".to_string(), - metric_type: MetricType::Counter, - extractor: MetricExtractor::TotalCpuTime, - labels: vec!["job_label".to_string()], - }, - MetricTemplate { - name: "databuild_job_exit_code".to_string(), - help: "Exit code of job execution".to_string(), - metric_type: MetricType::Gauge, - extractor: MetricExtractor::ExitCode, - labels: vec!["job_label".to_string(), "job_status".to_string()], - }, - - // Job event counters - MetricTemplate { - name: "databuild_job_events_total".to_string(), - help: "Total number of job events".to_string(), - metric_type: MetricType::Counter, - extractor: MetricExtractor::EventCount { event_type: "task_success".to_string() }, - labels: vec!["job_label".to_string()], - }, - MetricTemplate { - name: "databuild_job_failures_total".to_string(), - help: "Total number of job failures".to_string(), - metric_type: MetricType::Counter, - extractor: MetricExtractor::EventCount { event_type: "task_failed".to_string() }, - labels: vec!["job_label".to_string()], - }, - MetricTemplate { - name: "databuild_heartbeats_total".to_string(), - help: "Total number of heartbeat events".to_string(), - metric_type: MetricType::Counter, - extractor: MetricExtractor::EventCount { event_type: "heartbeat".to_string() }, - labels: vec!["job_label".to_string()], - }, - ] -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::{PartitionRef, log_message, LogMessage}; - - fn create_test_job_summary_entry(job_id: &str, runtime_ms: &str, memory_mb: &str, cpu_ms: &str, exit_code: i32) -> JobLogEntry { - let mut metadata = HashMap::new(); - metadata.insert("runtime_ms".to_string(), runtime_ms.to_string()); - metadata.insert("peak_memory_mb".to_string(), memory_mb.to_string()); - metadata.insert("total_cpu_ms".to_string(), cpu_ms.to_string()); - metadata.insert("exit_code".to_string(), exit_code.to_string()); - - JobLogEntry { - timestamp: "1234567890".to_string(), - job_id: job_id.to_string(), - outputs: vec![PartitionRef { r#str: "reviews/date=2025-01-27".to_string() }], - sequence_number: 1, - content: Some(job_log_entry::Content::JobEvent(WrapperJobEvent { - event_type: "job_summary".to_string(), - job_status: Some("JOB_COMPLETED".to_string()), - exit_code: Some(exit_code), - metadata, - job_label: None, - })), - } - } - - fn create_test_task_success_entry(job_id: &str) -> JobLogEntry { - JobLogEntry { - timestamp: "1234567890".to_string(), - job_id: job_id.to_string(), - outputs: vec![PartitionRef { r#str: "podcasts/date=2025-01-27".to_string() }], - sequence_number: 2, - content: Some(job_log_entry::Content::JobEvent(WrapperJobEvent { - event_type: "task_success".to_string(), - job_status: Some("JOB_COMPLETED".to_string()), - exit_code: Some(0), - metadata: HashMap::new(), - job_label: None, - })), - } - } - - #[test] - fn test_job_duration_extraction() { - let template = MetricTemplate { - name: "test_duration".to_string(), - help: "Test duration".to_string(), - metric_type: MetricType::Histogram, - extractor: MetricExtractor::JobDuration, - labels: vec![], - }; - - let entry = create_test_job_summary_entry("test-job", "2500", "64.5", "1200", 0); - let metric = template.extract(&entry).unwrap(); - - assert_eq!(metric.name, "test_duration"); - assert_eq!(metric.value, 2.5); // 2500ms -> 2.5s - assert_eq!(metric.labels.get("job_id").unwrap(), "test-job"); - // Note: job_label would only be available from manifest entries, not job_summary - } - - #[test] - fn test_memory_extraction() { - let template = MetricTemplate { - name: "test_memory".to_string(), - help: "Test memory".to_string(), - metric_type: MetricType::Gauge, - extractor: MetricExtractor::PeakMemory, - labels: vec![], - }; - - let entry = create_test_job_summary_entry("test-job", "2500", "128.75", "1200", 0); - let metric = template.extract(&entry).unwrap(); - - assert_eq!(metric.value, 128.75); - } - - #[test] - fn test_cpu_time_extraction() { - let template = MetricTemplate { - name: "test_cpu".to_string(), - help: "Test CPU".to_string(), - metric_type: MetricType::Counter, - extractor: MetricExtractor::TotalCpuTime, - labels: vec![], - }; - - let entry = create_test_job_summary_entry("test-job", "2500", "64.5", "1500", 0); - let metric = template.extract(&entry).unwrap(); - - assert_eq!(metric.value, 1.5); // 1500ms -> 1.5s - } - - #[test] - fn test_exit_code_extraction() { - let template = MetricTemplate { - name: "test_exit_code".to_string(), - help: "Test exit code".to_string(), - metric_type: MetricType::Gauge, - extractor: MetricExtractor::ExitCode, - labels: vec![], - }; - - let entry = create_test_job_summary_entry("test-job", "2500", "64.5", "1200", 42); - let metric = template.extract(&entry).unwrap(); - - assert_eq!(metric.value, 42.0); - assert_eq!(metric.labels.get("job_status").unwrap(), "JOB_COMPLETED"); - } - - #[test] - fn test_event_count_extraction() { - let template = MetricTemplate { - name: "test_success_count".to_string(), - help: "Test success count".to_string(), - metric_type: MetricType::Counter, - extractor: MetricExtractor::EventCount { event_type: "task_success".to_string() }, - labels: vec![], - }; - - let entry = create_test_task_success_entry("test-job"); - let metric = template.extract(&entry).unwrap(); - - assert_eq!(metric.value, 1.0); - // Note: job_label would only be available from manifest entries, not job events - } - - #[test] - fn test_event_metadata_extraction() { - let template = MetricTemplate { - name: "test_runtime".to_string(), - help: "Test runtime from metadata".to_string(), - metric_type: MetricType::Gauge, - extractor: MetricExtractor::EventMetadata { - event_type: "job_summary".to_string(), - metadata_key: "runtime_ms".to_string(), - converter: None, - }, - labels: vec![], - }; - - let entry = create_test_job_summary_entry("test-job", "3000", "64.5", "1200", 0); - let metric = template.extract(&entry).unwrap(); - - assert_eq!(metric.value, 3000.0); - } - - - #[test] - fn test_bool_converter() { - let template = MetricTemplate { - name: "test_bool".to_string(), - help: "Test bool".to_string(), - metric_type: MetricType::Gauge, - extractor: MetricExtractor::EventMetadata { - event_type: "test_event".to_string(), - metadata_key: "success".to_string(), - converter: Some(MetricConverter::BoolToFloat), - }, - labels: vec![], - }; - - assert_eq!(template.convert_value("true", &Some(MetricConverter::BoolToFloat)), Some(1.0)); - assert_eq!(template.convert_value("false", &Some(MetricConverter::BoolToFloat)), Some(0.0)); - assert_eq!(template.convert_value("yes", &Some(MetricConverter::BoolToFloat)), Some(1.0)); - assert_eq!(template.convert_value("no", &Some(MetricConverter::BoolToFloat)), Some(0.0)); - assert_eq!(template.convert_value("invalid", &Some(MetricConverter::BoolToFloat)), None); - } - - #[test] - fn test_duration_converter() { - let template = MetricTemplate { - name: "test_duration".to_string(), - help: "Test duration".to_string(), - metric_type: MetricType::Gauge, - extractor: MetricExtractor::EventMetadata { - event_type: "test_event".to_string(), - metadata_key: "duration".to_string(), - converter: Some(MetricConverter::DurationToSeconds), - }, - labels: vec![], - }; - - assert_eq!(template.convert_value("1000ms", &Some(MetricConverter::DurationToSeconds)), Some(1.0)); - assert_eq!(template.convert_value("5s", &Some(MetricConverter::DurationToSeconds)), Some(5.0)); - assert_eq!(template.convert_value("2.5m", &Some(MetricConverter::DurationToSeconds)), Some(150.0)); - assert_eq!(template.convert_value("42", &Some(MetricConverter::DurationToSeconds)), Some(42.0)); - } - - #[test] - fn test_standard_metrics() { - let metrics = get_standard_metrics(); - assert!(!metrics.is_empty()); - - // Verify we have the key metrics - let metric_names: Vec<&String> = metrics.iter().map(|m| &m.name).collect(); - assert!(metric_names.contains(&&"databuild_job_duration_seconds".to_string())); - assert!(metric_names.contains(&&"databuild_job_peak_memory_mb".to_string())); - assert!(metric_names.contains(&&"databuild_job_cpu_time_seconds".to_string())); - assert!(metric_names.contains(&&"databuild_job_failures_total".to_string())); - } - - #[test] - fn test_no_extraction_for_wrong_event_type() { - let template = MetricTemplate { - name: "test_metric".to_string(), - help: "Test".to_string(), - metric_type: MetricType::Counter, - extractor: MetricExtractor::EventCount { event_type: "task_failed".to_string() }, - labels: vec![], - }; - - let entry = create_test_task_success_entry("test-job"); // This is task_success, not task_failed - let result = template.extract(&entry); - - assert!(result.is_none()); - } - - #[test] - fn test_no_extraction_for_log_entries() { - let template = MetricTemplate { - name: "test_metric".to_string(), - help: "Test".to_string(), - metric_type: MetricType::Counter, - extractor: MetricExtractor::JobDuration, - labels: vec![], - }; - - // Create a log entry instead of job event - let entry = JobLogEntry { - timestamp: "1234567890".to_string(), - job_id: "test-job".to_string(), - outputs: vec![PartitionRef { r#str: "test/partition".to_string() }], - sequence_number: 1, - content: Some(job_log_entry::Content::Log(LogMessage { - level: log_message::LogLevel::Info as i32, - message: "Test log message".to_string(), - fields: HashMap::new(), - })), - }; - - let result = template.extract(&entry); - assert!(result.is_none()); - } -} \ No newline at end of file diff --git a/databuild/metrics_aggregator.rs b/databuild/metrics_aggregator.rs deleted file mode 100644 index 69c532a..0000000 --- a/databuild/metrics_aggregator.rs +++ /dev/null @@ -1,507 +0,0 @@ -use crate::{JobLogEntry, log_access::LogReader, metric_templates::{MetricTemplate, ExtractedMetric, MetricType, get_standard_metrics}}; -use std::collections::{HashMap, HashSet}; -use std::path::Path; -use thiserror::Error; - -#[derive(Error, Debug)] -pub enum MetricsError { - #[error("Log access error: {0}")] - LogAccess(#[from] crate::log_access::LogAccessError), - #[error("IO error: {0}")] - Io(#[from] std::io::Error), - #[error("Too many label combinations for metric {metric}: {count} > {limit}")] - CardinalityLimit { metric: String, count: usize, limit: usize }, -} - -/// Aggregated metric value with labels -#[derive(Debug, Clone)] -pub struct AggregatedMetric { - pub name: String, - pub help: String, - pub metric_type: MetricType, - pub samples: Vec, -} - -/// Individual metric sample -#[derive(Debug, Clone)] -pub struct MetricSample { - pub labels: HashMap, - pub value: f64, - pub timestamp_ms: Option, -} - -/// Configuration for metrics aggregation -#[derive(Debug, Clone)] -pub struct MetricsConfig { - /// Maximum number of unique label combinations per metric (cardinality safety) - pub max_cardinality_per_metric: usize, - /// Time range for metrics collection (in hours from now) - pub time_range_hours: u64, - /// Whether to include job_id in labels (can create high cardinality) - pub include_job_id_labels: bool, - /// Maximum number of jobs to process per metric - pub max_jobs_per_metric: usize, -} - -impl Default for MetricsConfig { - fn default() -> Self { - Self { - max_cardinality_per_metric: 1000, // Prometheus recommended limit - time_range_hours: 24, // Last 24 hours - include_job_id_labels: false, // Disabled by default for cardinality safety - max_jobs_per_metric: 100, // Limit recent jobs - } - } -} - -/// Aggregates metrics from job logs with cardinality safety -pub struct MetricsAggregator { - log_reader: LogReader, - config: MetricsConfig, - templates: Vec, -} - -impl MetricsAggregator { - /// Create a new metrics aggregator - pub fn new>(logs_path: P, config: MetricsConfig) -> Self { - Self { - log_reader: LogReader::new(logs_path), - config, - templates: get_standard_metrics(), - } - } - - /// Create with default configuration - pub fn with_defaults>(logs_path: P) -> Self { - Self::new(logs_path, MetricsConfig::default()) - } - - /// Add custom metric template - pub fn add_template(&mut self, template: MetricTemplate) { - self.templates.push(template); - } - - /// Aggregate all metrics from recent job logs - pub fn aggregate_metrics(&self) -> Result, MetricsError> { - // Get recent job IDs - let job_ids = self.get_recent_job_ids()?; - - let mut aggregated: HashMap = HashMap::new(); - let mut cardinality_counters: HashMap> = HashMap::new(); - - // Process each job's logs - for job_id in job_ids.iter().take(self.config.max_jobs_per_metric) { - if let Ok(entries) = self.get_job_entries(job_id) { - for entry in entries { - self.process_entry(&entry, &mut aggregated, &mut cardinality_counters)?; - } - } - } - - Ok(aggregated.into_values().collect()) - } - - /// Generate Prometheus format output - pub fn to_prometheus_format(&self) -> Result { - let metrics = self.aggregate_metrics()?; - let mut output = String::new(); - - for metric in metrics { - // Add help comment - output.push_str(&format!("# HELP {} {}\n", metric.name, metric.help)); - - // Add type comment - let type_str = match metric.metric_type { - MetricType::Counter => "counter", - MetricType::Gauge => "gauge", - MetricType::Histogram => "histogram", - MetricType::Summary => "summary", - }; - output.push_str(&format!("# TYPE {} {}\n", metric.name, type_str)); - - // Add samples - for sample in metric.samples { - output.push_str(&format!("{}{} {}\n", - metric.name, - self.format_labels(&sample.labels), - sample.value - )); - } - output.push('\n'); - } - - Ok(output) - } - - /// Get recent job IDs within the configured time range - fn get_recent_job_ids(&self) -> Result, MetricsError> { - // For now, get all available jobs. In production, this would filter by date - let job_ids = self.log_reader.list_available_jobs(None)?; - Ok(job_ids) - } - - /// Get log entries for a specific job - fn get_job_entries(&self, job_id: &str) -> Result, MetricsError> { - use crate::JobLogsRequest; - - let request = JobLogsRequest { - job_run_id: job_id.to_string(), - since_timestamp: 0, - min_level: 0, - limit: 1000, // Get all entries for the job - }; - - let response = self.log_reader.get_job_logs(&request)?; - Ok(response.entries) - } - - /// Process a single log entry through all metric templates - fn process_entry( - &self, - entry: &JobLogEntry, - aggregated: &mut HashMap, - cardinality_counters: &mut HashMap>, - ) -> Result<(), MetricsError> { - for template in &self.templates { - if let Some(mut extracted) = template.extract(entry) { - // Apply cardinality safety filters - if !self.config.include_job_id_labels { - extracted.labels.remove("job_id"); - } - - // Check cardinality limit - let label_signature = self.get_label_signature(&extracted.labels); - let cardinality_set = cardinality_counters - .entry(extracted.name.clone()) - .or_insert_with(HashSet::new); - - if cardinality_set.len() >= self.config.max_cardinality_per_metric - && !cardinality_set.contains(&label_signature) { - // Skip this metric to avoid cardinality explosion - continue; - } - - cardinality_set.insert(label_signature); - - // Add to aggregated metrics - let agg_metric = aggregated - .entry(extracted.name.clone()) - .or_insert_with(|| AggregatedMetric { - name: extracted.name.clone(), - help: extracted.help.clone(), - metric_type: extracted.metric_type.clone(), - samples: Vec::new(), - }); - - // For counters, sum values with same labels; for gauges, keep latest - let existing_sample = agg_metric.samples.iter_mut() - .find(|s| s.labels == extracted.labels); - - if let Some(sample) = existing_sample { - match extracted.metric_type { - MetricType::Counter => { - sample.value += extracted.value; // Sum counters - }, - MetricType::Gauge | MetricType::Histogram | MetricType::Summary => { - sample.value = extracted.value; // Replace with latest - }, - } - } else { - agg_metric.samples.push(MetricSample { - labels: extracted.labels, - value: extracted.value, - timestamp_ms: None, // Could add timestamp parsing if needed - }); - } - } - } - - Ok(()) - } - - /// Generate a signature string for label combinations - fn get_label_signature(&self, labels: &HashMap) -> String { - let mut pairs: Vec<_> = labels.iter().collect(); - pairs.sort_by_key(|&(k, _)| k); - pairs.iter() - .map(|(k, v)| format!("{}={}", k, v)) - .collect::>() - .join(",") - } - - /// Format labels for Prometheus output - fn format_labels(&self, labels: &HashMap) -> String { - if labels.is_empty() { - return String::new(); - } - - let mut pairs: Vec<_> = labels.iter().collect(); - pairs.sort_by_key(|&(k, _)| k); - - let formatted_pairs: Vec = pairs.iter() - .map(|(k, v)| format!("{}=\"{}\"", k, self.escape_label_value(v))) - .collect(); - - format!("{{{}}}", formatted_pairs.join(",")) - } - - /// Escape label values for Prometheus format - fn escape_label_value(&self, value: &str) -> String { - value - .replace('\\', "\\\\") - .replace('"', "\\\"") - .replace('\n', "\\n") - .replace('\t', "\\t") - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::{job_log_entry, PartitionRef, WrapperJobEvent}; - use std::io::Write; - use tempfile::TempDir; - - fn create_test_logs(temp_dir: &TempDir) -> Result<(), Box> { - // Create date directory - let date_dir = temp_dir.path().join("2025-01-27"); - std::fs::create_dir_all(&date_dir)?; - - // Create test job file with job summary - let job_file = date_dir.join("test_job_123.jsonl"); - let mut file = std::fs::File::create(&job_file)?; - - let entry = JobLogEntry { - timestamp: "1753763856".to_string(), - job_id: "test_job_123".to_string(), - outputs: vec![PartitionRef { r#str: "reviews/date=2025-01-27".to_string() }], - sequence_number: 4, - content: Some(job_log_entry::Content::JobEvent(WrapperJobEvent { - event_type: "job_summary".to_string(), - job_status: Some("JOB_COMPLETED".to_string()), - exit_code: Some(0), - metadata: { - let mut meta = HashMap::new(); - meta.insert("runtime_ms".to_string(), "2500.000".to_string()); - meta.insert("peak_memory_mb".to_string(), "128.5".to_string()); - meta.insert("total_cpu_ms".to_string(), "1200.000".to_string()); - meta.insert("exit_code".to_string(), "0".to_string()); - meta - }, - job_label: None, - })), - }; - - writeln!(file, "{}", serde_json::to_string(&entry)?)?; - - // Create task_success entry - let success_entry = JobLogEntry { - timestamp: "1753763857".to_string(), - job_id: "test_job_123".to_string(), - outputs: vec![PartitionRef { r#str: "reviews/date=2025-01-27".to_string() }], - sequence_number: 5, - content: Some(job_log_entry::Content::JobEvent(WrapperJobEvent { - event_type: "task_success".to_string(), - job_status: Some("JOB_COMPLETED".to_string()), - exit_code: Some(0), - metadata: HashMap::new(), - job_label: None, - })), - }; - - writeln!(file, "{}", serde_json::to_string(&success_entry)?)?; - - Ok(()) - } - - #[test] - fn test_metrics_aggregation() { - let temp_dir = TempDir::new().unwrap(); - create_test_logs(&temp_dir).unwrap(); - - let aggregator = MetricsAggregator::with_defaults(temp_dir.path()); - let metrics = aggregator.aggregate_metrics().unwrap(); - - assert!(!metrics.is_empty()); - - // Find duration metric - let duration_metric = metrics.iter() - .find(|m| m.name == "databuild_job_duration_seconds") - .expect("Should have duration metric"); - - assert_eq!(duration_metric.samples.len(), 1); - assert_eq!(duration_metric.samples[0].value, 2.5); // 2500ms -> 2.5s - - // Verify labels - should only have job_id (which gets excluded) and job_status - let labels = &duration_metric.samples[0].labels; - assert_eq!(labels.get("job_status").unwrap(), "JOB_COMPLETED"); - assert!(!labels.contains_key("job_id")); // Should be excluded by default - // Note: job_label would only be available from manifest entries, not job_summary events - } - - #[test] - fn test_prometheus_format() { - let temp_dir = TempDir::new().unwrap(); - create_test_logs(&temp_dir).unwrap(); - - let aggregator = MetricsAggregator::with_defaults(temp_dir.path()); - let prometheus_output = aggregator.to_prometheus_format().unwrap(); - - assert!(prometheus_output.contains("# HELP databuild_job_duration_seconds")); - assert!(prometheus_output.contains("# TYPE databuild_job_duration_seconds histogram")); - assert!(prometheus_output.contains("databuild_job_duration_seconds{")); - assert!(prometheus_output.contains("job_status=\"JOB_COMPLETED\"")); - assert!(prometheus_output.contains("} 2.5")); - } - - #[test] - fn test_cardinality_safety() { - let config = MetricsConfig { - max_cardinality_per_metric: 2, // Very low limit for testing - time_range_hours: 24, - include_job_id_labels: true, // Enable to test cardinality - max_jobs_per_metric: 100, - }; - - let temp_dir = TempDir::new().unwrap(); - - // Create multiple jobs to test cardinality limit - let date_dir = temp_dir.path().join("2025-01-27"); - std::fs::create_dir_all(&date_dir).unwrap(); - - for i in 1..=5 { - let job_file = date_dir.join(format!("job_{}.jsonl", i)); - let mut file = std::fs::File::create(&job_file).unwrap(); - - let entry = JobLogEntry { - timestamp: "1753763856".to_string(), - job_id: format!("job_{}", i), - outputs: vec![PartitionRef { r#str: format!("table_{}/date=2025-01-27", i) }], - sequence_number: 1, - content: Some(job_log_entry::Content::JobEvent(WrapperJobEvent { - event_type: "task_success".to_string(), - job_status: Some("JOB_COMPLETED".to_string()), - exit_code: Some(0), - metadata: HashMap::new(), - job_label: None, - })), - }; - - writeln!(file, "{}", serde_json::to_string(&entry).unwrap()).unwrap(); - } - - let aggregator = MetricsAggregator::new(temp_dir.path(), config); - let metrics = aggregator.aggregate_metrics().unwrap(); - - // Find the success count metric - let success_metric = metrics.iter() - .find(|m| m.name == "databuild_job_events_total") - .expect("Should have success count metric"); - - // Should be limited by cardinality (max 2 unique label combinations) - assert!(success_metric.samples.len() <= 2, - "Expected <= 2 samples due to cardinality limit, got {}", - success_metric.samples.len()); - } - - #[test] - fn test_label_escaping() { - let aggregator = MetricsAggregator::with_defaults("/tmp"); - - assert_eq!(aggregator.escape_label_value("normal"), "normal"); - assert_eq!(aggregator.escape_label_value("with\"quotes"), "with\\\"quotes"); - assert_eq!(aggregator.escape_label_value("with\\backslash"), "with\\\\backslash"); - assert_eq!(aggregator.escape_label_value("with\nnewline"), "with\\nnewline"); - assert_eq!(aggregator.escape_label_value("with\ttab"), "with\\ttab"); - } - - #[test] - fn test_label_signature_generation() { - let aggregator = MetricsAggregator::with_defaults("/tmp"); - - let mut labels1 = HashMap::new(); - labels1.insert("job_label".to_string(), "test_job".to_string()); - labels1.insert("job_status".to_string(), "JOB_COMPLETED".to_string()); - - let mut labels2 = HashMap::new(); - labels2.insert("job_status".to_string(), "JOB_COMPLETED".to_string()); - labels2.insert("job_label".to_string(), "test_job".to_string()); - - // Order shouldn't matter - assert_eq!( - aggregator.get_label_signature(&labels1), - aggregator.get_label_signature(&labels2) - ); - - let signature = aggregator.get_label_signature(&labels1); - assert!(signature.contains("job_label=test_job")); - assert!(signature.contains("job_status=JOB_COMPLETED")); - } - - #[test] - fn test_counter_vs_gauge_aggregation() { - let temp_dir = TempDir::new().unwrap(); - let date_dir = temp_dir.path().join("2025-01-27"); - std::fs::create_dir_all(&date_dir).unwrap(); - - let job_file = date_dir.join("test_job.jsonl"); - let mut file = std::fs::File::create(&job_file).unwrap(); - - // Create multiple task_success events (should be summed as counter) - for i in 1..=3 { - let entry = JobLogEntry { - timestamp: format!("175376385{}", i), - job_id: "test_job".to_string(), - outputs: vec![PartitionRef { r#str: "reviews/date=2025-01-27".to_string() }], - sequence_number: i, - content: Some(job_log_entry::Content::JobEvent(WrapperJobEvent { - event_type: "task_success".to_string(), - job_status: Some("JOB_COMPLETED".to_string()), - exit_code: Some(0), - metadata: HashMap::new(), - job_label: None, - })), - }; - writeln!(file, "{}", serde_json::to_string(&entry).unwrap()).unwrap(); - } - - // Create job summaries with different memory values (should use latest as gauge) - for (i, memory) in ["100.0", "150.0", "120.0"].iter().enumerate() { - let entry = JobLogEntry { - timestamp: format!("175376386{}", i), - job_id: "test_job".to_string(), - outputs: vec![PartitionRef { r#str: "reviews/date=2025-01-27".to_string() }], - sequence_number: (i + 10) as u64, - content: Some(job_log_entry::Content::JobEvent(WrapperJobEvent { - event_type: "job_summary".to_string(), - job_status: Some("JOB_COMPLETED".to_string()), - exit_code: Some(0), - metadata: { - let mut meta = HashMap::new(); - meta.insert("peak_memory_mb".to_string(), memory.to_string()); - meta.insert("runtime_ms".to_string(), "1000".to_string()); - meta.insert("total_cpu_ms".to_string(), "500".to_string()); - meta - }, - job_label: None, - })), - }; - writeln!(file, "{}", serde_json::to_string(&entry).unwrap()).unwrap(); - } - - let aggregator = MetricsAggregator::with_defaults(temp_dir.path()); - let metrics = aggregator.aggregate_metrics().unwrap(); - - // Check counter behavior (task_success events should be summed) - let success_metric = metrics.iter() - .find(|m| m.name == "databuild_job_events_total") - .expect("Should have success count metric"); - assert_eq!(success_metric.samples[0].value, 3.0); // 3 events summed - - // Check gauge behavior (memory should be latest value) - let memory_metric = metrics.iter() - .find(|m| m.name == "databuild_job_peak_memory_mb") - .expect("Should have memory metric"); - assert_eq!(memory_metric.samples[0].value, 120.0); // Latest value - } -} \ No newline at end of file diff --git a/databuild/orchestration/error.rs b/databuild/orchestration/error.rs deleted file mode 100644 index 8e8f766..0000000 --- a/databuild/orchestration/error.rs +++ /dev/null @@ -1,15 +0,0 @@ -use crate::event_log::BuildEventLogError; - -#[derive(Debug, thiserror::Error)] -pub enum OrchestrationError { - #[error("Event log error: {0}")] - EventLog(#[from] BuildEventLogError), - - #[error("Build coordination error: {0}")] - Coordination(String), - - #[error("Invalid build state transition: {current} -> {requested}")] - InvalidStateTransition { current: String, requested: String }, -} - -pub type Result = std::result::Result; \ No newline at end of file diff --git a/databuild/orchestration/events.rs b/databuild/orchestration/events.rs deleted file mode 100644 index 5854ef5..0000000 --- a/databuild/orchestration/events.rs +++ /dev/null @@ -1,156 +0,0 @@ -use crate::*; -use crate::event_log::{create_build_event, current_timestamp_nanos, generate_event_id}; - -/// Helper functions for creating standardized build events - -pub fn create_build_request_received_event( - build_request_id: String, - requested_partitions: Vec, -) -> BuildEvent { - create_build_event( - build_request_id, - build_event::EventType::BuildRequestEvent(BuildRequestEvent { - status: Some(BuildRequestStatusCode::BuildRequestReceived.status()), - requested_partitions, - message: "Build request received".to_string(), - comment: None, - want_id: None, - }), - ) -} - -pub fn create_build_planning_started_event( - build_request_id: String, -) -> BuildEvent { - create_build_event( - build_request_id, - build_event::EventType::BuildRequestEvent(BuildRequestEvent { - status: Some(BuildRequestStatusCode::BuildRequestPlanning.status()), - requested_partitions: vec![], - message: "Starting build planning".to_string(), - comment: None, - want_id: None, - }), - ) -} - -pub fn create_build_execution_started_event( - build_request_id: String, -) -> BuildEvent { - create_build_event( - build_request_id, - build_event::EventType::BuildRequestEvent(BuildRequestEvent { - status: Some(BuildRequestStatusCode::BuildRequestExecuting.status()), - requested_partitions: vec![], - message: "Starting build execution".to_string(), - comment: None, - want_id: None, - }), - ) -} - -pub fn create_build_completed_event( - build_request_id: String, - result: &super::BuildResult, -) -> BuildEvent { - let message = match result { - super::BuildResult::Success { jobs_completed } => { - format!("Build completed successfully with {} jobs", jobs_completed) - } - super::BuildResult::Failed { jobs_completed, jobs_failed } => { - format!("Build failed: {} jobs completed, {} jobs failed", jobs_completed, jobs_failed) - } - super::BuildResult::FailFast { trigger_job } => { - format!("Build failed fast due to job: {}", trigger_job) - } - }; - - let status = match result { - super::BuildResult::Success { .. } => BuildRequestStatusCode::BuildRequestCompleted.status(), - super::BuildResult::Failed { .. } | super::BuildResult::FailFast { .. } => BuildRequestStatusCode::BuildRequestFailed.status(), - }; - - create_build_event( - build_request_id, - build_event::EventType::BuildRequestEvent(BuildRequestEvent { - status: Some(status), - requested_partitions: vec![], - message, - comment: None, - want_id: None, - }), - ) -} - -pub fn create_analysis_completed_event( - build_request_id: String, - requested_partitions: Vec, - task_count: usize, -) -> BuildEvent { - create_build_event( - build_request_id, - build_event::EventType::BuildRequestEvent(BuildRequestEvent { - status: Some(BuildRequestStatusCode::BuildRequestAnalysisCompleted.status()), - requested_partitions, - message: format!("Analysis completed successfully, {} tasks planned", task_count), - comment: None, - want_id: None, - }), - ) -} - -pub fn create_job_scheduled_event( - build_request_id: String, - job_event: &JobEvent, -) -> BuildEvent { - BuildEvent { - event_id: generate_event_id(), - timestamp: current_timestamp_nanos(), - build_request_id: Some(build_request_id), - event_type: Some(build_event::EventType::JobEvent(job_event.clone())), - } -} - -pub fn create_job_completed_event( - build_request_id: String, - job_event: &JobEvent, -) -> BuildEvent { - BuildEvent { - event_id: generate_event_id(), - timestamp: current_timestamp_nanos(), - build_request_id: Some(build_request_id), - event_type: Some(build_event::EventType::JobEvent(job_event.clone())), - } -} - -pub fn create_partition_available_event( - build_request_id: String, - partition_event: &PartitionEvent, -) -> BuildEvent { - BuildEvent { - event_id: generate_event_id(), - timestamp: current_timestamp_nanos(), - build_request_id: Some(build_request_id), - event_type: Some(build_event::EventType::PartitionEvent(partition_event.clone())), - } -} - -pub fn create_delegation_event( - build_request_id: String, - partition_ref: &str, - target_build: &str, - message: &str, -) -> BuildEvent { - let partition = PartitionRef { - str: partition_ref.to_string(), - }; - - create_build_event( - build_request_id, - build_event::EventType::DelegationEvent(DelegationEvent { - partition_ref: Some(partition), - delegated_to_build_request_id: target_build.to_string(), - message: message.to_string(), - }), - ) -} \ No newline at end of file diff --git a/databuild/orchestration/mod.rs b/databuild/orchestration/mod.rs deleted file mode 100644 index e048583..0000000 --- a/databuild/orchestration/mod.rs +++ /dev/null @@ -1,261 +0,0 @@ -use crate::*; -use crate::event_log::{writer::EventWriter, query_engine::BELQueryEngine}; -use log::info; -use std::sync::Arc; - -pub mod error; -pub mod events; - -pub use error::{OrchestrationError, Result}; - -/// Result of a build execution -#[derive(Debug, Clone)] -pub enum BuildResult { - Success { jobs_completed: usize }, - Failed { jobs_completed: usize, jobs_failed: usize }, - FailFast { trigger_job: String }, -} - -/// Core orchestrator for managing build lifecycle and event emission -pub struct BuildOrchestrator { - event_writer: EventWriter, - build_request_id: String, - requested_partitions: Vec, -} - -impl BuildOrchestrator { - /// Create a new build orchestrator - pub fn new( - query_engine: Arc, - build_request_id: String, - requested_partitions: Vec, - ) -> Self { - Self { - event_writer: EventWriter::new(query_engine), - build_request_id, - requested_partitions, - } - } - - /// Get the build request ID - pub fn build_request_id(&self) -> &str { - &self.build_request_id - } - - /// Get the requested partitions - pub fn requested_partitions(&self) -> &[PartitionRef] { - &self.requested_partitions - } - - /// Emit build request received event and start the build lifecycle - pub async fn start_build(&self) -> Result<()> { - info!("Starting build for request: {}", self.build_request_id); - - self.event_writer.request_build( - self.build_request_id.clone(), - self.requested_partitions.clone(), - ).await - .map_err(OrchestrationError::EventLog)?; - - Ok(()) - } - - /// Emit build planning started event - pub async fn start_planning(&self) -> Result<()> { - info!("Starting build planning for request: {}", self.build_request_id); - - self.event_writer.update_build_status( - self.build_request_id.clone(), - BuildRequestStatusCode::BuildRequestPlanning.status(), - "Starting build planning".to_string(), - ).await - .map_err(OrchestrationError::EventLog)?; - - Ok(()) - } - - /// Emit build execution started event - pub async fn start_execution(&self) -> Result<()> { - info!("Starting build execution for request: {}", self.build_request_id); - - self.event_writer.update_build_status( - self.build_request_id.clone(), - BuildRequestStatusCode::BuildRequestExecuting.status(), - "Starting build execution".to_string(), - ).await - .map_err(OrchestrationError::EventLog)?; - - Ok(()) - } - - /// Emit build completion event - pub async fn complete_build(&self, result: BuildResult) -> Result<()> { - info!("Completing build for request: {} with result: {:?}", - self.build_request_id, result); - - let (status, message) = match &result { - BuildResult::Success { jobs_completed } => { - (BuildRequestStatusCode::BuildRequestCompleted, - format!("Build completed successfully with {} jobs", jobs_completed)) - } - BuildResult::Failed { jobs_completed, jobs_failed } => { - (BuildRequestStatusCode::BuildRequestFailed, - format!("Build failed: {} jobs completed, {} jobs failed", jobs_completed, jobs_failed)) - } - BuildResult::FailFast { trigger_job } => { - (BuildRequestStatusCode::BuildRequestFailed, - format!("Build failed fast due to job: {}", trigger_job)) - } - }; - - self.event_writer.update_build_status( - self.build_request_id.clone(), - status.status(), - message, - ).await - .map_err(OrchestrationError::EventLog)?; - - Ok(()) - } - - /// Emit analysis completed event - pub async fn emit_analysis_completed(&self, task_count: usize) -> Result<()> { - self.event_writer.update_build_status_with_partitions( - self.build_request_id.clone(), - BuildRequestStatusCode::BuildRequestAnalysisCompleted.status(), - self.requested_partitions.clone(), - format!("Analysis completed successfully, {} tasks planned", task_count), - ).await - .map_err(OrchestrationError::EventLog)?; - - Ok(()) - } - - /// Emit job scheduled event - pub async fn emit_job_scheduled(&self, job: &JobEvent) -> Result<()> { - let event = events::create_job_scheduled_event( - self.build_request_id.clone(), - job, - ); - - self.event_writer.append_event(event).await - .map_err(OrchestrationError::EventLog)?; - - Ok(()) - } - - /// Emit job completed event - pub async fn emit_job_completed(&self, job: &JobEvent) -> Result<()> { - let event = events::create_job_completed_event( - self.build_request_id.clone(), - job, - ); - - self.event_writer.append_event(event).await - .map_err(OrchestrationError::EventLog)?; - - Ok(()) - } - - /// Emit partition available event - pub async fn emit_partition_available(&self, partition: &PartitionEvent) -> Result<()> { - let event = events::create_partition_available_event( - self.build_request_id.clone(), - partition, - ); - - self.event_writer.append_event(event).await - .map_err(OrchestrationError::EventLog)?; - - Ok(()) - } - - /// Emit delegation event - pub async fn emit_delegation( - &self, - partition_ref: &str, - target_build: &str, - message: &str, - ) -> Result<()> { - let partition = PartitionRef { str: partition_ref.to_string() }; - - self.event_writer.record_delegation( - self.build_request_id.clone(), - partition, - target_build.to_string(), - message.to_string(), - ).await - .map_err(OrchestrationError::EventLog)?; - - Ok(()) - } - -} - -#[cfg(test)] -mod tests { - use super::*; - - - - #[tokio::test] - async fn test_build_lifecycle_events() { - // Use mock BEL query engine for testing - let query_engine = crate::event_log::mock::create_mock_bel_query_engine().await.unwrap(); - let partitions = vec![PartitionRef { str: "test/partition".to_string() }]; - - let orchestrator = BuildOrchestrator::new( - query_engine, - "test-build-123".to_string(), - partitions.clone(), - ); - - // Test full build lifecycle - orchestrator.start_build().await.unwrap(); - orchestrator.start_planning().await.unwrap(); - orchestrator.start_execution().await.unwrap(); - orchestrator.complete_build(BuildResult::Success { jobs_completed: 5 }).await.unwrap(); - - // Note: Since we're using the real BELQueryEngine with mock storage, - // we can't easily inspect emitted events in this test without significant refactoring. - // The test verifies that the orchestration methods complete without errors, - // which exercises the event emission code paths. - - // TODO: If we need to verify specific events, we could: - // 1. Query the mock storage through the query engine - // 2. Create a specialized test storage that captures events - // 3. Use the existing MockBuildEventLog test pattern with dependency injection - } - - #[tokio::test] - async fn test_partition_and_job_events() { - // Use mock BEL query engine for testing - let query_engine = crate::event_log::mock::create_mock_bel_query_engine().await.unwrap(); - - let orchestrator = BuildOrchestrator::new( - query_engine, - "test-build-456".to_string(), - vec![], - ); - - // Test analysis completed event - orchestrator.emit_analysis_completed(3).await.unwrap(); - - // Test job event - let partition = PartitionRef { str: "data/users".to_string() }; - let job_event = JobEvent { - job_run_id: "job-run-123".to_string(), - job_label: Some(JobLabel { label: "//:test_job".to_string() }), - target_partitions: vec![partition.clone()], - status_code: JobStatus::JobScheduled as i32, - status_name: JobStatus::JobScheduled.to_display_string(), - message: "Job scheduled".to_string(), - config: None, - manifests: vec![], - }; - orchestrator.emit_job_scheduled(&job_event).await.unwrap(); - - // Note: Same testing limitation as above. - // We verify that the methods complete successfully without panicking. - } -} \ No newline at end of file diff --git a/databuild/repositories/builds/mod.rs b/databuild/repositories/builds/mod.rs deleted file mode 100644 index 942da9a..0000000 --- a/databuild/repositories/builds/mod.rs +++ /dev/null @@ -1,408 +0,0 @@ -use crate::*; -use crate::event_log::{BuildEventLogError, Result}; -use crate::event_log::query_engine::BELQueryEngine; -use crate::{BuildDetailResponse, BuildTimelineEvent as ServiceBuildTimelineEvent}; -use std::sync::Arc; -// use std::collections::HashMap; // Commented out since not used with new query engine -use serde::Serialize; - -/// Repository for querying build data from the build event log -pub struct BuildsRepository { - query_engine: Arc, -} - -/// Summary of a build request and its current status -#[derive(Debug, Clone, Serialize)] -pub struct BuildInfo { - pub build_request_id: String, - pub status: BuildRequestStatus, - pub requested_partitions: Vec, - pub requested_at: i64, - pub started_at: Option, - pub completed_at: Option, - pub duration_ms: Option, - pub total_jobs: usize, - pub completed_jobs: usize, - pub failed_jobs: usize, - pub cancelled_jobs: usize, - pub cancelled: bool, - pub cancel_reason: Option, -} - -/// Detailed timeline of a build's execution events -#[derive(Debug, Clone, Serialize)] -pub struct BuildEvent { - pub timestamp: i64, - pub event_type: String, - pub status: Option, - pub message: String, - pub cancel_reason: Option, -} - -impl BuildsRepository { - /// Create a new BuildsRepository - pub fn new(query_engine: Arc) -> Self { - Self { query_engine } - } - - /// List all builds with their current status - /// - /// Returns a list of all build requests that have been made, - /// including their current status and execution details. - pub async fn list(&self, limit: Option) -> Result> { - // Use query engine to list builds with the protobuf request format - let request = BuildsListRequest { - limit: limit.map(|l| l as u32), - offset: Some(0), - status_filter: None, - }; - let response = self.query_engine.list_build_requests(request).await?; - - // Convert from protobuf BuildSummary to repository BuildInfo - let builds = response.builds.into_iter().map(|build| { - BuildInfo { - build_request_id: build.build_request_id, - status: build.status.clone().unwrap_or(BuildRequestStatusCode::BuildRequestUnknown.status()), - requested_partitions: build.requested_partitions, - requested_at: build.requested_at, - started_at: build.started_at, - completed_at: build.completed_at, - duration_ms: build.duration_ms, - total_jobs: build.total_jobs as usize, - completed_jobs: build.completed_jobs as usize, - failed_jobs: build.failed_jobs as usize, - cancelled_jobs: build.cancelled_jobs as usize, - cancelled: build.cancelled, - cancel_reason: None, // TODO: Add cancel reason to BuildSummary if needed - } - }).collect(); - - Ok(builds) - } - - /// Show detailed information about a specific build - /// - /// Returns the complete timeline of events for the specified build, - /// including all status changes and any cancellation events. - pub async fn show(&self, build_request_id: &str) -> Result)>> { - // Use query engine to get build summary - let summary_result = self.query_engine.get_build_request_summary(build_request_id).await; - - match summary_result { - Ok(summary) => { - // Convert BuildRequestSummary to BuildInfo - let build_info = BuildInfo { - build_request_id: summary.build_request_id, - status: summary.status, - requested_partitions: summary.requested_partitions.into_iter() - .map(|s| PartitionRef { str: s }) - .collect(), - requested_at: summary.created_at, - started_at: None, // TODO: Track started_at in query engine - completed_at: Some(summary.updated_at), - duration_ms: None, // TODO: Calculate duration in query engine - total_jobs: 0, // TODO: Implement job counting in query engine - completed_jobs: 0, - failed_jobs: 0, - cancelled_jobs: 0, - cancelled: false, // TODO: Track cancellation in query engine - cancel_reason: None, - }; - - // Get all events for this build to create a proper timeline - let all_events = self.query_engine.get_build_request_events(build_request_id, None).await?; - - // Create timeline from build request events - let mut timeline = Vec::new(); - for event in all_events { - if let Some(crate::build_event::EventType::BuildRequestEvent(br_event)) = &event.event_type { - if let Some(status) = br_event.clone().status { - timeline.push(BuildEvent { - timestamp: event.timestamp, - event_type: "build_status".to_string(), - status: Some(status), - message: br_event.message.clone(), - cancel_reason: None, - }); - } - } - } - - // Sort timeline by timestamp - timeline.sort_by_key(|e| e.timestamp); - - Ok(Some((build_info, timeline))) - } - Err(_) => { - // Build not found - Ok(None) - } - } - } - - /// Show detailed information about a specific build using protobuf response format - /// - /// Returns the complete build details with dual status fields and timeline events. - pub async fn show_protobuf(&self, build_request_id: &str) -> Result> { - // Get build info and timeline using existing show method - if let Some((build_info, timeline)) = self.show(build_request_id).await? { - // Convert timeline events to protobuf format - let protobuf_timeline: Vec = timeline - .into_iter() - .map(|event| ServiceBuildTimelineEvent { - timestamp: event.timestamp, - status: event.status, - message: event.message, - event_type: event.event_type, - cancel_reason: event.cancel_reason, - }) - .collect(); - - let response = BuildDetailResponse { - build_request_id: build_info.build_request_id, - status: Some(build_info.status), - requested_partitions: build_info.requested_partitions, - total_jobs: build_info.total_jobs as u32, - completed_jobs: build_info.completed_jobs as u32, - failed_jobs: build_info.failed_jobs as u32, - cancelled_jobs: build_info.cancelled_jobs as u32, - requested_at: build_info.requested_at, - started_at: build_info.started_at, - completed_at: build_info.completed_at, - duration_ms: build_info.duration_ms, - cancelled: build_info.cancelled, - cancel_reason: build_info.cancel_reason, - timeline: protobuf_timeline, - }; - - Ok(Some(response)) - } else { - Ok(None) - } - } - - /// Cancel a build with a reason - /// - /// This method uses the EventWriter to write a build cancellation event. - /// It validates that the build exists and is in a cancellable state. - pub async fn cancel(&self, build_request_id: &str, _reason: String) -> Result<()> { - // First check if the build exists and get its current status - let build_info = self.show(build_request_id).await?; - - if build_info.is_none() { - return Err(BuildEventLogError::QueryError( - format!("Cannot cancel non-existent build: {}", build_request_id) - )); - } - - let (build, _timeline) = build_info.unwrap(); - - // Check if build is in a cancellable state - match BuildRequestStatusCode::try_from(build.status.code) { - Ok(BuildRequestStatusCode::BuildRequestCompleted) => { - return Err(BuildEventLogError::QueryError( - format!("Cannot cancel completed build: {}", build_request_id) - )); - } - Ok(BuildRequestStatusCode::BuildRequestFailed) => { - return Err(BuildEventLogError::QueryError( - format!("Cannot cancel failed build: {}", build_request_id) - )); - } - Ok(BuildRequestStatusCode::BuildRequestCancelled) => { - return Err(BuildEventLogError::QueryError( - format!("Build already cancelled: {}", build_request_id) - )); - } - _ => {} - } - - // Create a build cancellation event - use crate::event_log::{create_build_event, current_timestamp_nanos, generate_event_id}; - - let cancel_event = create_build_event( - build_request_id.to_string(), - crate::build_event::EventType::BuildRequestEvent(crate::BuildRequestEvent { - status: Some(BuildRequestStatusCode::BuildRequestCancelled.status()), - requested_partitions: build.requested_partitions, - message: format!("Build cancelled"), - comment: None, - want_id: None, - }) - ); - - // Append the cancellation event - self.query_engine.append_event(cancel_event).await?; - - Ok(()) - } - - /// List builds using protobuf response format with dual status fields - /// - /// Returns BuildSummary protobuf messages with status_code and status_name. - pub async fn list_protobuf(&self, limit: Option) -> Result> { - // Get build info using existing list method - let builds = self.list(limit).await?; - - // Convert to protobuf format - let protobuf_builds: Vec = builds - .into_iter() - .map(|build| crate::BuildSummary { - build_request_id: build.build_request_id, - status: Some(build.status), - requested_partitions: build.requested_partitions.into_iter().map(|p| crate::PartitionRef { str: p.str }).collect(), - total_jobs: build.total_jobs as u32, - completed_jobs: build.completed_jobs as u32, - failed_jobs: build.failed_jobs as u32, - cancelled_jobs: build.cancelled_jobs as u32, - requested_at: build.requested_at, - started_at: build.started_at, - completed_at: build.completed_at, - duration_ms: build.duration_ms, - cancelled: build.cancelled, - comment: None, - }) - .collect(); - - Ok(protobuf_builds) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::event_log::mock::{create_mock_bel_query_engine, create_mock_bel_query_engine_with_events, test_events}; - - #[tokio::test] - async fn test_builds_repository_list_empty() { - let query_engine = create_mock_bel_query_engine().await.unwrap(); - let repo = BuildsRepository::new(query_engine); - - let builds = repo.list(None).await.unwrap(); - assert!(builds.is_empty()); - } - - #[tokio::test] - async fn test_builds_repository_list_with_data() { - let build_id1 = "build-123".to_string(); - let build_id2 = "build-456".to_string(); - let partition1 = PartitionRef { str: "data/users".to_string() }; - let partition2 = PartitionRef { str: "data/orders".to_string() }; - - // Create events for multiple builds - let events = vec![ - test_events::build_request_event(Some(build_id1.clone()), vec![partition1.clone()], BuildRequestStatusCode::BuildRequestReceived.status()), - test_events::build_request_event(Some(build_id1.clone()), vec![partition1.clone()], BuildRequestStatusCode::BuildRequestCompleted.status()), - test_events::build_request_event(Some(build_id2.clone()), vec![partition2.clone()], BuildRequestStatusCode::BuildRequestReceived.status()), - test_events::build_request_event(Some(build_id2.clone()), vec![partition2.clone()], BuildRequestStatusCode::BuildRequestFailed.status()), - ]; - - let query_engine = create_mock_bel_query_engine_with_events(events).await.unwrap(); - let repo = BuildsRepository::new(query_engine); - - let builds = repo.list(None).await.unwrap(); - assert_eq!(builds.len(), 2); - - // Find builds by id - let build1 = builds.iter().find(|b| b.build_request_id == build_id1).unwrap(); - let build2 = builds.iter().find(|b| b.build_request_id == build_id2).unwrap(); - - assert_eq!(build1.status, BuildRequestStatusCode::BuildRequestCompleted.status()); - assert_eq!(build1.requested_partitions.len(), 1); - assert!(!build1.cancelled); - - assert_eq!(build2.status, BuildRequestStatusCode::BuildRequestFailed.status()); - assert_eq!(build2.requested_partitions.len(), 1); - assert!(!build2.cancelled); - } - - #[tokio::test] - async fn test_builds_repository_show() { - let build_id = "build-789".to_string(); - let partition = PartitionRef { str: "analytics/daily".to_string() }; - - let events = vec![ - test_events::build_request_event(Some(build_id.clone()), vec![partition.clone()], BuildRequestStatusCode::BuildRequestReceived.status()), - test_events::build_request_event(Some(build_id.clone()), vec![partition.clone()], BuildRequestStatusCode::BuildRequestPlanning.status()), - test_events::build_request_event(Some(build_id.clone()), vec![partition.clone()], BuildRequestStatusCode::BuildRequestExecuting.status()), - test_events::build_request_event(Some(build_id.clone()), vec![partition.clone()], BuildRequestStatusCode::BuildRequestCompleted.status()), - ]; - - let query_engine = create_mock_bel_query_engine_with_events(events).await.unwrap(); - let repo = BuildsRepository::new(query_engine); - - let result = repo.show(&build_id).await.unwrap(); - assert!(result.is_some()); - - let (info, timeline) = result.unwrap(); - assert_eq!(info.build_request_id, build_id); - assert_eq!(info.status, BuildRequestStatusCode::BuildRequestCompleted.status()); - assert!(!info.cancelled); - - assert_eq!(timeline.len(), 4); - assert_eq!(timeline[0].status, Some(BuildRequestStatusCode::BuildRequestReceived.status())); - assert_eq!(timeline[1].status, Some(BuildRequestStatusCode::BuildRequestPlanning.status())); - assert_eq!(timeline[2].status, Some(BuildRequestStatusCode::BuildRequestExecuting.status())); - assert_eq!(timeline[3].status, Some(BuildRequestStatusCode::BuildRequestCompleted.status())); - } - - #[tokio::test] - async fn test_builds_repository_show_nonexistent() { - let query_engine = create_mock_bel_query_engine().await.unwrap(); - let repo = BuildsRepository::new(query_engine); - - let result = repo.show("nonexistent-build").await.unwrap(); - assert!(result.is_none()); - } - - #[tokio::test] - async fn test_builds_repository_cancel() { - let build_id = "build-cancel-test".to_string(); - let partition = PartitionRef { str: "test/data".to_string() }; - - // Start with a running build - let events = vec![ - test_events::build_request_event(Some(build_id.clone()), vec![partition.clone()], BuildRequestStatusCode::BuildRequestReceived.status()), - test_events::build_request_event(Some(build_id.clone()), vec![partition.clone()], BuildRequestStatusCode::BuildRequestExecuting.status()), - ]; - - let query_engine = create_mock_bel_query_engine_with_events(events).await.unwrap(); - let repo = BuildsRepository::new(query_engine.clone()); - - // Cancel the build - repo.cancel(&build_id, "User requested cancellation".to_string()).await.unwrap(); - - // Verify the cancellation was recorded - // Note: This test demonstrates the pattern, but the MockBELStorage would need - // to be enhanced to properly store build cancel events for full verification - - // Try to cancel a non-existent build - let result = repo.cancel("nonexistent-build", "Should fail".to_string()).await; - assert!(result.is_err()); - } - - #[tokio::test] - async fn test_builds_repository_cancel_completed_build() { - let build_id = "completed-build".to_string(); - let partition = PartitionRef { str: "test/data".to_string() }; - - // Create a completed build - let events = vec![ - test_events::build_request_event(Some(build_id.clone()), vec![partition.clone()], BuildRequestStatusCode::BuildRequestReceived.status()), - test_events::build_request_event(Some(build_id.clone()), vec![partition.clone()], BuildRequestStatusCode::BuildRequestCompleted.status()), - ]; - - let query_engine = create_mock_bel_query_engine_with_events(events).await.unwrap(); - let repo = BuildsRepository::new(query_engine); - - // Try to cancel the completed build - should fail - let result = repo.cancel(&build_id, "Should fail".to_string()).await; - assert!(result.is_err()); - - if let Err(BuildEventLogError::QueryError(msg)) = result { - assert!(msg.contains("Cannot cancel completed build")); - } else { - panic!("Expected QueryError for completed build cancellation"); - } - } -} \ No newline at end of file diff --git a/databuild/repositories/jobs/mod.rs b/databuild/repositories/jobs/mod.rs deleted file mode 100644 index 3ef6049..0000000 --- a/databuild/repositories/jobs/mod.rs +++ /dev/null @@ -1,499 +0,0 @@ -use crate::*; -use crate::event_log::{BuildEventLogError, Result}; -use crate::event_log::query_engine::BELQueryEngine; -use crate::{JobDetailResponse, JobRunDetail as ServiceJobRunDetail}; -use std::sync::Arc; -use std::collections::HashMap; -use serde::Serialize; - -/// Repository for querying job data from the build event log -pub struct JobsRepository { - query_engine: Arc, -} - -/// Summary of a job's execution history and statistics -#[derive(Debug, Clone, Serialize)] -pub struct JobInfo { - pub job_label: String, - pub total_runs: usize, - pub successful_runs: usize, - pub failed_runs: usize, - pub cancelled_runs: usize, - pub last_run_timestamp: i64, - pub last_run_status: JobStatus, - pub average_partitions_per_run: f64, - pub recent_builds: Vec, // Build request IDs that used this job -} - -/// Detailed information about a specific job execution -#[derive(Debug, Clone, Serialize)] -pub struct JobRunDetail { - pub job_run_id: String, - pub job_label: String, - pub build_request_id: String, - pub target_partitions: Vec, - pub status: JobStatus, - pub scheduled_at: i64, - pub started_at: Option, - pub completed_at: Option, - pub duration_ms: Option, - pub message: String, - pub config: Option, - pub manifests: Vec, -} - -impl JobsRepository { - /// Create a new JobsRepository - pub fn new(query_engine: Arc) -> Self { - Self { query_engine } - } - - /// List all jobs with their execution statistics - /// - /// Returns a summary of all jobs that have been executed, including - /// success/failure statistics and recent activity. - pub async fn list(&self, limit: Option) -> Result> { - // Get all job events from the event log - let events = self.query_engine.get_events_in_range(0, i64::MAX).await?; - - let mut job_data: HashMap> = HashMap::new(); - - // Collect all job events and group by job label - for event in events { - if let Some(build_event::EventType::JobEvent(j_event)) = &event.event_type { - let job_label = j_event.job_label.as_ref() - .map(|l| l.label.clone()) - .unwrap_or_else(|| "unknown".to_string()); - - let status = match j_event.status_code { - 1 => JobStatus::JobScheduled, - 2 => JobStatus::JobRunning, - 3 => JobStatus::JobCompleted, - 4 => JobStatus::JobFailed, - 5 => JobStatus::JobCancelled, - 6 => JobStatus::JobSkipped, - _ => JobStatus::JobUnknown, - }; - - // Create or update job run detail - let job_runs = job_data.entry(job_label.clone()).or_insert_with(Vec::new); - - // Find existing run or create new one - if let Some(existing_run) = job_runs.iter_mut().find(|r| r.job_run_id == j_event.job_run_id) { - // Update existing run with new status - existing_run.status = status; - existing_run.message = j_event.message.clone(); - - match status { - JobStatus::JobRunning => { - existing_run.started_at = Some(event.timestamp); - } - JobStatus::JobCompleted | JobStatus::JobFailed | JobStatus::JobCancelled => { - existing_run.completed_at = Some(event.timestamp); - if let Some(started) = existing_run.started_at { - existing_run.duration_ms = Some((event.timestamp - started) / 1_000_000); // Convert to ms - } - existing_run.manifests = j_event.manifests.clone(); - } - _ => {} - } - } else { - // Create new job run - let job_run = JobRunDetail { - job_run_id: j_event.job_run_id.clone(), - job_label: job_label.clone(), - build_request_id: event.build_request_id.clone().unwrap(), - target_partitions: j_event.target_partitions.clone(), - status, - scheduled_at: event.timestamp, - started_at: if status == JobStatus::JobRunning { Some(event.timestamp) } else { None }, - completed_at: None, - duration_ms: None, - message: j_event.message.clone(), - config: j_event.config.clone(), - manifests: j_event.manifests.clone(), - }; - job_runs.push(job_run); - } - } - } - - // Convert to JobInfo structs with statistics - let mut job_infos: Vec = job_data.into_iter() - .map(|(job_label, job_runs)| { - let total_runs = job_runs.len(); - let successful_runs = job_runs.iter().filter(|r| r.status == JobStatus::JobCompleted).count(); - let failed_runs = job_runs.iter().filter(|r| r.status == JobStatus::JobFailed).count(); - let cancelled_runs = job_runs.iter().filter(|r| r.status == JobStatus::JobCancelled).count(); - - let (last_run_timestamp, last_run_status) = job_runs.iter() - .max_by_key(|r| r.scheduled_at) - .map(|r| (r.scheduled_at, r.status.clone())) - .unwrap_or((0, JobStatus::JobUnknown)); - - let total_partitions: usize = job_runs.iter() - .map(|r| r.target_partitions.len()) - .sum(); - let average_partitions_per_run = if total_runs > 0 { - total_partitions as f64 / total_runs as f64 - } else { - 0.0 - }; - - // Get recent unique build request IDs - let mut recent_builds: Vec = job_runs.iter() - .map(|r| r.build_request_id.clone()) - .collect::>() - .into_iter() - .collect(); - recent_builds.sort(); - recent_builds.truncate(10); // Keep last 10 builds - - JobInfo { - job_label, - total_runs, - successful_runs, - failed_runs, - cancelled_runs, - last_run_timestamp, - last_run_status, - average_partitions_per_run, - recent_builds, - } - }) - .collect(); - - // Sort by last run timestamp (most recent first) - job_infos.sort_by(|a, b| b.last_run_timestamp.cmp(&a.last_run_timestamp)); - - // Apply limit if specified - if let Some(limit) = limit { - job_infos.truncate(limit); - } - - Ok(job_infos) - } - - /// Show detailed information about a specific job - /// - /// Returns all execution runs for the specified job label, including - /// detailed timing, status, and output information. - pub async fn show(&self, job_label: &str) -> Result)>> { - // Get all job events for this specific job - let events = self.query_engine.get_events_in_range(0, i64::MAX).await?; - - let mut job_runs: Vec = Vec::new(); - - // Collect all job events for this job label - for event in events { - if let Some(build_event::EventType::JobEvent(j_event)) = &event.event_type { - let event_job_label = j_event.job_label.as_ref() - .map(|l| l.label.clone()) - .unwrap_or_else(|| "unknown".to_string()); - - if event_job_label != job_label { - continue; - } - - let status = match j_event.status_code { - 1 => JobStatus::JobScheduled, - 2 => JobStatus::JobRunning, - 3 => JobStatus::JobCompleted, - 4 => JobStatus::JobFailed, - 5 => JobStatus::JobCancelled, - 6 => JobStatus::JobSkipped, - _ => JobStatus::JobUnknown, - }; - - // Find existing run or create new one - if let Some(existing_run) = job_runs.iter_mut().find(|r| r.job_run_id == j_event.job_run_id) { - // Update existing run with new status - existing_run.status = status; - existing_run.message = j_event.message.clone(); - - match status { - JobStatus::JobRunning => { - existing_run.started_at = Some(event.timestamp); - } - JobStatus::JobCompleted | JobStatus::JobFailed | JobStatus::JobCancelled => { - existing_run.completed_at = Some(event.timestamp); - if let Some(started) = existing_run.started_at { - existing_run.duration_ms = Some((event.timestamp - started) / 1_000_000); // Convert to ms - } - existing_run.manifests = j_event.manifests.clone(); - } - _ => {} - } - } else { - // Create new job run - let job_run = JobRunDetail { - job_run_id: j_event.job_run_id.clone(), - job_label: job_label.to_string(), - build_request_id: event.build_request_id.clone().unwrap(), - target_partitions: j_event.target_partitions.clone(), - status, - scheduled_at: event.timestamp, - started_at: if status == JobStatus::JobRunning { Some(event.timestamp) } else { None }, - completed_at: None, - duration_ms: None, - message: j_event.message.clone(), - config: j_event.config.clone(), - manifests: j_event.manifests.clone(), - }; - job_runs.push(job_run); - } - } - } - - if job_runs.is_empty() { - return Ok(None); - } - - // Sort runs by scheduled time (most recent first) - job_runs.sort_by(|a, b| b.scheduled_at.cmp(&a.scheduled_at)); - - // Calculate job statistics - let total_runs = job_runs.len(); - let successful_runs = job_runs.iter().filter(|r| r.status == JobStatus::JobCompleted).count(); - let failed_runs = job_runs.iter().filter(|r| r.status == JobStatus::JobFailed).count(); - let cancelled_runs = job_runs.iter().filter(|r| r.status == JobStatus::JobCancelled).count(); - - let (last_run_timestamp, last_run_status) = job_runs.iter() - .max_by_key(|r| r.scheduled_at) - .map(|r| (r.scheduled_at, r.status.clone())) - .unwrap_or((0, JobStatus::JobUnknown)); - - let total_partitions: usize = job_runs.iter() - .map(|r| r.target_partitions.len()) - .sum(); - let average_partitions_per_run = if total_runs > 0 { - total_partitions as f64 / total_runs as f64 - } else { - 0.0 - }; - - // Get recent unique build request IDs - let mut recent_builds: Vec = job_runs.iter() - .map(|r| r.build_request_id.clone()) - .collect::>() - .into_iter() - .collect(); - recent_builds.sort(); - recent_builds.truncate(10); // Keep last 10 builds - - let job_info = JobInfo { - job_label: job_label.to_string(), - total_runs, - successful_runs, - failed_runs, - cancelled_runs, - last_run_timestamp, - last_run_status, - average_partitions_per_run, - recent_builds, - }; - - Ok(Some((job_info, job_runs))) - } - - /// Show detailed information about a specific job using protobuf response format - /// - /// Returns the complete job details with dual status fields and run details. - pub async fn show_protobuf(&self, job_label: &str) -> Result> { - // Get job info and runs using existing show method - if let Some((job_info, job_runs)) = self.show(job_label).await? { - // Convert job runs to protobuf format - let protobuf_runs: Vec = job_runs - .into_iter() - .map(|run| ServiceJobRunDetail { - job_run_id: run.job_run_id, - build_request_id: run.build_request_id, - target_partitions: run.target_partitions, - status_code: run.status as i32, - status_name: run.status.to_display_string(), - started_at: run.started_at, - completed_at: run.completed_at, - duration_ms: run.duration_ms, - message: run.message, - }) - .collect(); - - let response = JobDetailResponse { - job_label: job_info.job_label, - total_runs: job_info.total_runs as u32, - successful_runs: job_info.successful_runs as u32, - failed_runs: job_info.failed_runs as u32, - cancelled_runs: job_info.cancelled_runs as u32, - average_partitions_per_run: job_info.average_partitions_per_run, - last_run_timestamp: job_info.last_run_timestamp, - last_run_status_code: job_info.last_run_status as i32, - last_run_status_name: job_info.last_run_status.to_display_string(), - recent_builds: job_info.recent_builds, - runs: protobuf_runs, - }; - - Ok(Some(response)) - } else { - Ok(None) - } - } - - /// List jobs using protobuf response format with dual status fields - /// - /// Returns JobsListResponse protobuf message with JobSummary objects containing - /// last_run_status_code and last_run_status_name fields. - pub async fn list_protobuf(&self, request: JobsListRequest) -> Result { - // Get job info using existing list method - let jobs = self.list(request.limit.map(|l| l as usize)).await?; - - // Convert to protobuf format - let protobuf_jobs: Vec = jobs - .into_iter() - .map(|job| crate::JobSummary { - job_label: job.job_label, - total_runs: job.total_runs as u32, - successful_runs: job.successful_runs as u32, - failed_runs: job.failed_runs as u32, - cancelled_runs: job.cancelled_runs as u32, - average_partitions_per_run: job.average_partitions_per_run, - last_run_timestamp: job.last_run_timestamp, - last_run_status_code: job.last_run_status as i32, - last_run_status_name: job.last_run_status.to_display_string(), - recent_builds: job.recent_builds, - }) - .collect(); - - let total_count = protobuf_jobs.len() as u32; - - Ok(JobsListResponse { - jobs: protobuf_jobs, - total_count, - }) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::event_log::mock::{create_mock_bel_query_engine, create_mock_bel_query_engine_with_events, test_events}; - - #[tokio::test] - async fn test_jobs_repository_list_empty() { - let query_engine = create_mock_bel_query_engine().await.unwrap(); - let repo = JobsRepository::new(query_engine); - - let jobs = repo.list(None).await.unwrap(); - assert!(jobs.is_empty()); - } - - #[tokio::test] - async fn test_jobs_repository_list_with_data() { - let build_id = "test-build-123".to_string(); - let job_label1 = JobLabel { label: "//:process_data".to_string() }; - let job_label2 = JobLabel { label: "//:generate_reports".to_string() }; - let partition1 = PartitionRef { str: "data/users".to_string() }; - let partition2 = PartitionRef { str: "reports/summary".to_string() }; - - // Create events for multiple jobs - let events = vec![ - test_events::job_event(Some(build_id.clone()), Some("job-run-1".to_string()), job_label1.clone(), vec![partition1.clone()], JobStatus::JobScheduled), - test_events::job_event(Some(build_id.clone()), Some("job-run-1".to_string()), job_label1.clone(), vec![partition1.clone()], JobStatus::JobCompleted), - test_events::job_event(Some(build_id.clone()), Some("job-run-2".to_string()), job_label2.clone(), vec![partition2.clone()], JobStatus::JobScheduled), - test_events::job_event(Some(build_id.clone()), Some("job-run-2".to_string()), job_label2.clone(), vec![partition2.clone()], JobStatus::JobFailed), - ]; - - let query_engine = create_mock_bel_query_engine_with_events(events).await.unwrap(); - let repo = JobsRepository::new(query_engine); - - let jobs = repo.list(None).await.unwrap(); - assert_eq!(jobs.len(), 2); - - // Find jobs by label - let process_job = jobs.iter().find(|j| j.job_label == "//:process_data").unwrap(); - let reports_job = jobs.iter().find(|j| j.job_label == "//:generate_reports").unwrap(); - - assert_eq!(process_job.total_runs, 1); - assert_eq!(process_job.successful_runs, 1); - assert_eq!(process_job.failed_runs, 0); - assert_eq!(process_job.last_run_status, JobStatus::JobCompleted); - - assert_eq!(reports_job.total_runs, 1); - assert_eq!(reports_job.successful_runs, 0); - assert_eq!(reports_job.failed_runs, 1); - assert_eq!(reports_job.last_run_status, JobStatus::JobFailed); - } - - #[tokio::test] - async fn test_jobs_repository_show() { - let build_id = "test-build-456".to_string(); - let job_label = JobLabel { label: "//:analytics_job".to_string() }; - let partition = PartitionRef { str: "analytics/daily".to_string() }; - - let events = vec![ - test_events::job_event(Some(build_id.clone()), Some("job-run-123".to_string()), job_label.clone(), vec![partition.clone()], JobStatus::JobScheduled), - test_events::job_event(Some(build_id.clone()), Some("job-run-123".to_string()), job_label.clone(), vec![partition.clone()], JobStatus::JobRunning), - test_events::job_event(Some(build_id.clone()), Some("job-run-123".to_string()), job_label.clone(), vec![partition.clone()], JobStatus::JobCompleted), - ]; - - let query_engine = create_mock_bel_query_engine_with_events(events).await.unwrap(); - let repo = JobsRepository::new(query_engine); - - let result = repo.show(&job_label.label).await.unwrap(); - assert!(result.is_some()); - - let (info, runs) = result.unwrap(); - assert_eq!(info.job_label, "//:analytics_job"); - assert_eq!(info.total_runs, 1); - assert_eq!(info.successful_runs, 1); - assert_eq!(info.last_run_status, JobStatus::JobCompleted); - - assert_eq!(runs.len(), 1); - let run = &runs[0]; - assert_eq!(run.job_run_id, "job-run-123"); - assert_eq!(run.status, JobStatus::JobCompleted); - assert_eq!(run.target_partitions.len(), 1); - assert_eq!(run.target_partitions[0].str, "analytics/daily"); - } - - #[tokio::test] - async fn test_jobs_repository_show_nonexistent() { - let query_engine = create_mock_bel_query_engine().await.unwrap(); - let repo = JobsRepository::new(query_engine); - - let result = repo.show("//:nonexistent_job").await.unwrap(); - assert!(result.is_none()); - } - - #[tokio::test] - async fn test_jobs_repository_statistics() { - let build_id = "test-build-789".to_string(); - let job_label = JobLabel { label: "//:batch_processor".to_string() }; - let partition = PartitionRef { str: "batch/data".to_string() }; - - // Create multiple runs with different outcomes - let events = vec![ - // First run - successful - test_events::job_event(Some(build_id.clone()), Some("run-1".to_string()), job_label.clone(), vec![partition.clone()], JobStatus::JobScheduled), - test_events::job_event(Some(build_id.clone()), Some("run-1".to_string()), job_label.clone(), vec![partition.clone()], JobStatus::JobCompleted), - // Second run - failed - test_events::job_event(Some(build_id.clone()), Some("run-2".to_string()), job_label.clone(), vec![partition.clone()], JobStatus::JobScheduled), - test_events::job_event(Some(build_id.clone()), Some("run-2".to_string()), job_label.clone(), vec![partition.clone()], JobStatus::JobFailed), - // Third run - cancelled - test_events::job_event(Some(build_id.clone()), Some("run-3".to_string()), job_label.clone(), vec![partition.clone()], JobStatus::JobScheduled), - test_events::job_event(Some(build_id.clone()), Some("run-3".to_string()), job_label.clone(), vec![partition.clone()], JobStatus::JobCancelled), - ]; - - let query_engine = create_mock_bel_query_engine_with_events(events).await.unwrap(); - let repo = JobsRepository::new(query_engine); - - let result = repo.show(&job_label.label).await.unwrap(); - assert!(result.is_some()); - - let (info, _runs) = result.unwrap(); - assert_eq!(info.total_runs, 3); - assert_eq!(info.successful_runs, 1); - assert_eq!(info.failed_runs, 1); - assert_eq!(info.cancelled_runs, 1); - assert_eq!(info.average_partitions_per_run, 1.0); - } -} \ No newline at end of file diff --git a/databuild/repositories/mod.rs b/databuild/repositories/mod.rs deleted file mode 100644 index db7db15..0000000 --- a/databuild/repositories/mod.rs +++ /dev/null @@ -1,17 +0,0 @@ -/// Repository pattern implementations for reading from the build event log -/// -/// This module provides read-only repository interfaces that query the build event log -/// for different types of data. Each repository focuses on a specific domain: -/// -/// - PartitionsRepository: Query partition status and history -/// - JobsRepository: Query job execution data -/// - TasksRepository: Query task (job run) information -/// - BuildsRepository: Query build request data -/// -/// All repositories work with any BuildEventLog implementation and provide -/// a clean separation between read and write operations. - -pub mod partitions; -pub mod jobs; -pub mod tasks; -pub mod builds; \ No newline at end of file diff --git a/databuild/repositories/partitions/mod.rs b/databuild/repositories/partitions/mod.rs deleted file mode 100644 index 6034b36..0000000 --- a/databuild/repositories/partitions/mod.rs +++ /dev/null @@ -1,373 +0,0 @@ -use crate::*; -use crate::event_log::{BuildEventLogError, Result}; -use crate::event_log::query_engine::BELQueryEngine; -use crate::status_utils::list_response_helpers; -use std::sync::Arc; -use std::collections::HashMap; -use serde::Serialize; - -/// Repository for querying partition data from the build event log -pub struct PartitionsRepository { - query_engine: Arc, -} - -/// Summary of a partition's current state and history -#[derive(Debug, Clone, Serialize)] -pub struct PartitionInfo { - pub partition_ref: PartitionRef, - pub current_status: PartitionStatus, - pub last_updated: i64, - pub builds_count: usize, - pub last_successful_build: Option, - pub invalidation_count: usize, -} - -/// Detailed partition status with timeline -#[derive(Debug, Clone, Serialize)] -pub struct PartitionStatusEvent { - pub timestamp: i64, - pub status: PartitionStatus, - pub message: String, - pub build_request_id: String, - pub job_run_id: Option, -} - -impl PartitionsRepository { - /// Create a new PartitionsRepository - pub fn new(query_engine: Arc) -> Self { - Self { query_engine } - } - - /// List all partitions with their current status - /// - /// Returns a list of all partitions that have been referenced in the build event log, - /// along with their current status and summary information. - pub async fn list(&self, _limit: Option) -> Result> { - // Get all events to find unique partitions - let filter = EventFilter { - partition_refs: vec![], - partition_patterns: vec![], - job_labels: vec![], - job_run_ids: vec![], - build_request_ids: vec![], - }; - - let events = self.query_engine.get_events_in_range(0, i64::MAX).await?; - - // Collect unique partition references - let mut unique_partitions = std::collections::HashSet::new(); - for event in &events { - match &event.event_type { - Some(crate::build_event::EventType::PartitionEvent(p_event)) => { - if let Some(partition_ref) = &p_event.partition_ref { - unique_partitions.insert(partition_ref.str.clone()); - } - } - Some(crate::build_event::EventType::BuildRequestEvent(br_event)) => { - for partition_ref in &br_event.requested_partitions { - unique_partitions.insert(partition_ref.str.clone()); - } - } - Some(crate::build_event::EventType::JobEvent(j_event)) => { - for partition_ref in &j_event.target_partitions { - unique_partitions.insert(partition_ref.str.clone()); - } - } - _ => {} - } - } - - // Get status for each partition and count builds - let mut partition_infos = Vec::new(); - for partition_ref in unique_partitions { - if let Ok(Some((status, last_updated))) = self.query_engine.get_latest_partition_status(&partition_ref).await { - // Count builds that reference this partition by looking at BuildRequestEvents - let mut builds_count = 0; - for event in &events { - if let Some(crate::build_event::EventType::BuildRequestEvent(br_event)) = &event.event_type { - if br_event.requested_partitions.iter().any(|p| p.str == partition_ref) { - builds_count += 1; - } - } - } - - partition_infos.push(PartitionInfo { - partition_ref: PartitionRef { str: partition_ref }, - current_status: status, - last_updated, - builds_count, - last_successful_build: None, // TODO: Find last successful build - invalidation_count: 0, // TODO: Count invalidation events - }); - } - } - - // Sort by partition reference for consistent ordering - partition_infos.sort_by(|a, b| a.partition_ref.str.cmp(&b.partition_ref.str)); - - Ok(partition_infos) - } - - // TODO: Implement remaining methods for BELQueryEngine - /* - Legacy methods that need to be updated to use query_engine: - - pub async fn show(&self, partition_ref: &str) -> Result)>> { ... } - pub async fn invalidate(&self, partition_ref: &str, reason: String, build_request_id: String) -> Result<()> { ... } - pub async fn show_protobuf(&self, partition_ref: &str) -> Result> { ... } - pub async fn list_protobuf(&self, request: PartitionsListRequest) -> Result { ... } - */ - - /// Show detailed information about a specific partition - /// - /// Returns the complete timeline of status changes for the specified partition, - /// including all builds that have referenced it. - pub async fn show(&self, partition_ref: &str) -> Result)>> { - // Get partition events from query engine - let events = self.query_engine.get_partition_events(partition_ref, None).await?; - - if events.is_empty() { - return Ok(None); - } - - // Get the latest partition status - let latest_status_result = self.query_engine.get_latest_partition_status(partition_ref).await?; - let (status, last_updated) = latest_status_result.unwrap_or((PartitionStatus::PartitionUnknown, 0)); - - // Count builds that reference this partition - let all_events = self.query_engine.get_events_in_range(0, i64::MAX).await?; - let mut builds_count = 0; - for event in &all_events { - if let Some(crate::build_event::EventType::BuildRequestEvent(br_event)) = &event.event_type { - if br_event.requested_partitions.iter().any(|p| p.str == partition_ref) { - builds_count += 1; - } - } - } - - // Create partition info - let partition_info = PartitionInfo { - partition_ref: PartitionRef { str: partition_ref.to_string() }, - current_status: status, - last_updated, - builds_count, - last_successful_build: None, // TODO: Find last successful build - invalidation_count: 0, // TODO: Count invalidation events - }; - - // Convert events to PartitionStatusEvent - let mut status_events = Vec::new(); - for event in events { - if let Some(crate::build_event::EventType::PartitionEvent(p_event)) = &event.event_type { - if let Ok(event_status) = PartitionStatus::try_from(p_event.status_code) { - status_events.push(PartitionStatusEvent { - timestamp: event.timestamp, - status: event_status, - message: p_event.message.clone(), - build_request_id: event.build_request_id.unwrap(), - job_run_id: if p_event.job_run_id.is_empty() { None } else { Some(p_event.job_run_id.clone()) }, - }); - } - } - } - - // Sort events by timestamp - status_events.sort_by_key(|e| e.timestamp); - - Ok(Some((partition_info, status_events))) - } - - /// Invalidate a partition with a reason - /// - /// This method uses the EventWriter to write a partition invalidation event. - /// It validates that the partition exists before invalidating it. - pub async fn invalidate(&self, partition_ref: &str, reason: String, build_request_id: String) -> Result<()> { - // Check if the partition exists by looking for any events that reference it - let partition_events = self.query_engine.get_partition_events(partition_ref, None).await?; - let all_events = self.query_engine.get_events_in_range(0, i64::MAX).await?; - - // Check if partition is referenced in any build request events - let mut partition_exists = !partition_events.is_empty(); - if !partition_exists { - for event in &all_events { - if let Some(crate::build_event::EventType::BuildRequestEvent(br_event)) = &event.event_type { - if br_event.requested_partitions.iter().any(|p| p.str == partition_ref) { - partition_exists = true; - break; - } - } - } - } - - if !partition_exists { - return Err(crate::event_log::BuildEventLogError::QueryError( - format!("Cannot invalidate non-existent partition: {}", partition_ref) - )); - } - - // Create a partition invalidation event - use crate::event_log::create_build_event; - - let invalidation_event = create_build_event( - build_request_id, - crate::build_event::EventType::PartitionInvalidationEvent(crate::PartitionInvalidationEvent { - partition_ref: Some(crate::PartitionRef { str: partition_ref.to_string() }), - reason, - }) - ); - - // Append the invalidation event - self.query_engine.append_event(invalidation_event).await?; - - Ok(()) - } - - /// Show detailed information about a specific partition using protobuf response format - /// - /// Returns the complete partition details with dual status fields and timeline events. - pub async fn show_protobuf(&self, partition_ref: &str) -> Result> { - // TODO: Implement with query engine - for now return None - Ok(None) - } - - /// List partitions returning protobuf response format with dual status fields - /// - /// This method provides the unified CLI/Service response format with both - /// status codes (enum values) and status names (human-readable strings). - pub async fn list_protobuf(&self, request: PartitionsListRequest) -> Result { - // Get partition info using existing list method - let partition_infos = self.list(request.limit.map(|l| l as usize)).await?; - - // Convert to protobuf format - let protobuf_partitions: Vec = partition_infos - .into_iter() - .map(|info| crate::PartitionSummary { - partition_ref: Some(info.partition_ref), - status_code: info.current_status as i32, - status_name: info.current_status.to_display_string(), - last_updated: info.last_updated, - builds_count: info.builds_count as u32, - last_successful_build: info.last_successful_build, - invalidation_count: info.invalidation_count as u32, - }) - .collect(); - - let total_count = protobuf_partitions.len() as u32; - - Ok(PartitionsListResponse { - partitions: protobuf_partitions, - total_count, - has_more: false, // TODO: Implement pagination - }) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::event_log::mock::{create_mock_bel_query_engine, create_mock_bel_query_engine_with_events, test_events}; - - #[tokio::test] - async fn test_partitions_repository_list_empty() { - let query_engine = create_mock_bel_query_engine().await.unwrap(); - let repo = PartitionsRepository::new(query_engine); - - let partitions = repo.list(None).await.unwrap(); - assert!(partitions.is_empty()); - } - - #[tokio::test] - async fn test_partitions_repository_list_with_data() { - let build_id = "test-build-123".to_string(); - let partition1 = PartitionRef { str: "data/users".to_string() }; - let partition2 = PartitionRef { str: "data/orders".to_string() }; - - // Create events for multiple partitions - let events = vec![ - test_events::build_request_received(Some(build_id.clone()), vec![partition1.clone(), partition2.clone()]), - test_events::partition_status(Some(build_id.clone()), partition1.clone(), PartitionStatus::PartitionBuilding, None), - test_events::partition_status(Some(build_id.clone()), partition1.clone(), PartitionStatus::PartitionAvailable, None), - test_events::partition_status(Some(build_id.clone()), partition2.clone(), PartitionStatus::PartitionBuilding, None), - test_events::partition_status(Some(build_id.clone()), partition2.clone(), PartitionStatus::PartitionFailed, None), - ]; - - let query_engine = create_mock_bel_query_engine_with_events(events).await.unwrap(); - let repo = PartitionsRepository::new(query_engine.clone()); - - let partitions = repo.list(None).await.unwrap(); - assert_eq!(partitions.len(), 2); - - // Find partitions by name - let users_partition = partitions.iter().find(|p| p.partition_ref.str == "data/users").unwrap(); - let orders_partition = partitions.iter().find(|p| p.partition_ref.str == "data/orders").unwrap(); - - assert_eq!(users_partition.current_status, PartitionStatus::PartitionAvailable); - assert_eq!(orders_partition.current_status, PartitionStatus::PartitionFailed); - assert_eq!(users_partition.builds_count, 1); - assert_eq!(orders_partition.builds_count, 1); - } - - #[tokio::test] - async fn test_partitions_repository_show() { - let build_id = "test-build-456".to_string(); - let partition = PartitionRef { str: "analytics/metrics".to_string() }; - - let events = vec![ - test_events::build_request_received(Some(build_id.clone()), vec![partition.clone()]), - test_events::partition_status(Some(build_id.clone()), partition.clone(), PartitionStatus::PartitionRequested, None), - test_events::partition_status(Some(build_id.clone()), partition.clone(), PartitionStatus::PartitionBuilding, None), - test_events::partition_status(Some(build_id.clone()), partition.clone(), PartitionStatus::PartitionAvailable, None), - ]; - - let query_engine = create_mock_bel_query_engine_with_events(events).await.unwrap(); - let repo = PartitionsRepository::new(query_engine); - - let result = repo.show(&partition.str).await.unwrap(); - assert!(result.is_some()); - - let (info, timeline) = result.unwrap(); - assert_eq!(info.partition_ref.str, "analytics/metrics"); - assert_eq!(info.current_status, PartitionStatus::PartitionAvailable); - assert_eq!(info.builds_count, 1); - assert_eq!(timeline.len(), 3); - - // Verify timeline order - assert_eq!(timeline[0].status, PartitionStatus::PartitionRequested); - assert_eq!(timeline[1].status, PartitionStatus::PartitionBuilding); - assert_eq!(timeline[2].status, PartitionStatus::PartitionAvailable); - } - - #[tokio::test] - async fn test_partitions_repository_show_nonexistent() { - let query_engine = create_mock_bel_query_engine().await.unwrap(); - let repo = PartitionsRepository::new(query_engine); - - let result = repo.show("nonexistent/partition").await.unwrap(); - assert!(result.is_none()); - } - - #[tokio::test] - async fn test_partitions_repository_invalidate() { - let build_id = "test-build-789".to_string(); - let partition = PartitionRef { str: "temp/data".to_string() }; - - // Start with an existing partition - let events = vec![ - test_events::partition_status(Some(build_id.clone()), partition.clone(), PartitionStatus::PartitionAvailable, None), - ]; - - let query_engine = create_mock_bel_query_engine_with_events(events).await.unwrap(); - let repo = PartitionsRepository::new(query_engine.clone()); - - // Invalidate the partition - repo.invalidate(&partition.str, "Test invalidation".to_string(), build_id.clone()).await.unwrap(); - - // Verify the invalidation was recorded - // Note: This test demonstrates the pattern, but the MockBuildEventLog would need - // to be enhanced to properly store invalidation events for full verification - - // Try to invalidate a non-existent partition - let result = repo.invalidate("nonexistent/partition", "Should fail".to_string(), build_id).await; - assert!(result.is_err()); - } -} \ No newline at end of file diff --git a/databuild/repositories/tasks/mod.rs b/databuild/repositories/tasks/mod.rs deleted file mode 100644 index ba77c5f..0000000 --- a/databuild/repositories/tasks/mod.rs +++ /dev/null @@ -1,519 +0,0 @@ -use crate::*; -use crate::event_log::{BuildEventLogError, Result}; -use crate::event_log::query_engine::BELQueryEngine; -use crate::{JobRunDetailResponse, JobRunTimelineEvent as ServiceTaskTimelineEvent}; -use std::sync::Arc; -use std::collections::HashMap; -use serde::Serialize; - -/// Repository for querying task (job run) data from the build event log -pub struct TasksRepository { - query_engine: Arc, -} - -/// Summary of a task's execution -#[derive(Debug, Clone, Serialize)] -pub struct TaskInfo { - pub job_run_id: String, - pub job_label: String, - pub build_request_id: String, - pub status: JobStatus, - pub target_partitions: Vec, - pub scheduled_at: i64, - pub started_at: Option, - pub completed_at: Option, - pub duration_ms: Option, - pub message: String, - pub config: Option, - pub manifests: Vec, - pub cancelled: bool, - pub cancel_reason: Option, -} - -/// Detailed timeline of a task's execution events -#[derive(Debug, Clone, Serialize)] -pub struct TaskEvent { - pub timestamp: i64, - pub event_type: String, - pub status: Option, - pub message: String, - pub cancel_reason: Option, -} - -impl TasksRepository { - /// Create a new TasksRepository - pub fn new(query_engine: Arc) -> Self { - Self { query_engine } - } - - /// List all tasks with their current status - /// - /// Returns a list of all job runs (tasks) that have been executed, - /// including their current status and execution details. - pub async fn list(&self, limit: Option) -> Result> { - // Get all events from the event log - let events = self.query_engine.get_events_in_range(0, i64::MAX).await?; - - let mut task_data: HashMap = HashMap::new(); - let mut task_cancellations: HashMap = HashMap::new(); - - // First pass: collect all task cancel events - for event in &events { - if let Some(build_event::EventType::JobRunCancelEvent(tc_event)) = &event.event_type { - task_cancellations.insert(tc_event.job_run_id.clone(), tc_event.reason.clone()); - } - } - - // Second pass: collect all job events and build task information - for event in events { - if let Some(build_event::EventType::JobEvent(j_event)) = &event.event_type { - let job_label = j_event.job_label.as_ref() - .map(|l| l.label.clone()) - .unwrap_or_else(|| "unknown".to_string()); - - let status = match j_event.status_code { - 1 => JobStatus::JobScheduled, - 2 => JobStatus::JobRunning, - 3 => JobStatus::JobCompleted, - 4 => JobStatus::JobFailed, - 5 => JobStatus::JobCancelled, - 6 => JobStatus::JobSkipped, - _ => JobStatus::JobUnknown, - }; - - // Create or update task info - let task = task_data.entry(j_event.job_run_id.clone()).or_insert_with(|| { - TaskInfo { - job_run_id: j_event.job_run_id.clone(), - job_label: job_label.clone(), - build_request_id: event.build_request_id.clone().unwrap(), - status: JobStatus::JobUnknown, - target_partitions: j_event.target_partitions.clone(), - scheduled_at: event.timestamp, - started_at: None, - completed_at: None, - duration_ms: None, - message: String::new(), - config: None, - manifests: vec![], - cancelled: false, - cancel_reason: None, - } - }); - - // Update task with new information - task.status = status; - task.message = j_event.message.clone(); - - match status { - JobStatus::JobScheduled => { - task.scheduled_at = event.timestamp; - if let Some(config) = &j_event.config { - task.config = Some(config.clone()); - } - } - JobStatus::JobRunning => { - task.started_at = Some(event.timestamp); - } - JobStatus::JobCompleted | JobStatus::JobFailed | JobStatus::JobCancelled => { - task.completed_at = Some(event.timestamp); - if let Some(started) = task.started_at { - task.duration_ms = Some((event.timestamp - started) / 1_000_000); // Convert to ms - } - task.manifests = j_event.manifests.clone(); - } - _ => {} - } - - // Check if this task was cancelled - if let Some(cancel_reason) = task_cancellations.get(&j_event.job_run_id) { - task.cancelled = true; - task.cancel_reason = Some(cancel_reason.clone()); - } - } - } - - // Convert to vector and sort by scheduled time (most recent first) - let mut tasks: Vec = task_data.into_values().collect(); - tasks.sort_by(|a, b| b.scheduled_at.cmp(&a.scheduled_at)); - - // Apply limit if specified - if let Some(limit) = limit { - tasks.truncate(limit); - } - - Ok(tasks) - } - - /// Show detailed information about a specific task - /// - /// Returns the complete timeline of events for the specified task, - /// including all status changes and any cancellation events. - pub async fn show(&self, job_run_id: &str) -> Result)>> { - // Get all events for this specific job run - let job_events = self.query_engine.get_job_run_events(job_run_id).await?; - - if job_events.is_empty() { - return Ok(None); - } - - let mut task_info: Option = None; - let mut timeline: Vec = Vec::new(); - - // Process job events to build task information - for event in &job_events { - if let Some(build_event::EventType::JobEvent(j_event)) = &event.event_type { - let job_label = j_event.job_label.as_ref() - .map(|l| l.label.clone()) - .unwrap_or_else(|| "unknown".to_string()); - - let status = match j_event.status_code { - 1 => JobStatus::JobScheduled, - 2 => JobStatus::JobRunning, - 3 => JobStatus::JobCompleted, - 4 => JobStatus::JobFailed, - 5 => JobStatus::JobCancelled, - 6 => JobStatus::JobSkipped, - _ => JobStatus::JobUnknown, - }; - - // Create or update task info - if task_info.is_none() { - task_info = Some(TaskInfo { - job_run_id: j_event.job_run_id.clone(), - job_label: job_label.clone(), - build_request_id: event.build_request_id.clone().unwrap(), - status: JobStatus::JobUnknown, - target_partitions: j_event.target_partitions.clone(), - scheduled_at: event.timestamp, - started_at: None, - completed_at: None, - duration_ms: None, - message: String::new(), - config: None, - manifests: vec![], - cancelled: false, - cancel_reason: None, - }); - } - - let task = task_info.as_mut().unwrap(); - task.status = status; - task.message = j_event.message.clone(); - - match status { - JobStatus::JobScheduled => { - task.scheduled_at = event.timestamp; - if let Some(config) = &j_event.config { - task.config = Some(config.clone()); - } - } - JobStatus::JobRunning => { - task.started_at = Some(event.timestamp); - } - JobStatus::JobCompleted | JobStatus::JobFailed | JobStatus::JobCancelled => { - task.completed_at = Some(event.timestamp); - if let Some(started) = task.started_at { - task.duration_ms = Some((event.timestamp - started) / 1_000_000); // Convert to ms - } - task.manifests = j_event.manifests.clone(); - } - _ => {} - } - - // Add to timeline - timeline.push(TaskEvent { - timestamp: event.timestamp, - event_type: "job_status_change".to_string(), - status: Some(status), - message: j_event.message.clone(), - cancel_reason: None, - }); - } - } - - // Also check for task cancel events in all events - let all_events = self.query_engine.get_events_in_range(0, i64::MAX).await?; - for event in all_events { - if let Some(build_event::EventType::JobRunCancelEvent(tc_event)) = &event.event_type { - if tc_event.job_run_id == job_run_id { - if let Some(task) = task_info.as_mut() { - task.cancelled = true; - task.cancel_reason = Some(tc_event.reason.clone()); - } - - timeline.push(TaskEvent { - timestamp: event.timestamp, - event_type: "task_cancel".to_string(), - status: None, - message: "Task cancelled".to_string(), - cancel_reason: Some(tc_event.reason.clone()), - }); - } - } - } - - // Sort timeline by timestamp - timeline.sort_by_key(|e| e.timestamp); - - Ok(task_info.map(|info| (info, timeline))) - } - - /// Cancel a task with a reason - /// - /// This method uses the EventWriter to write a task cancellation event. - /// It validates that the task exists and is in a cancellable state. - pub async fn cancel(&self, job_run_id: &str, reason: String, build_request_id: String) -> Result<()> { - // First check if the task exists and get its current status - let task_info = self.show(job_run_id).await?; - - if task_info.is_none() { - return Err(BuildEventLogError::QueryError( - format!("Cannot cancel non-existent task: {}", job_run_id) - )); - } - - let (task, _timeline) = task_info.unwrap(); - - // Check if task is in a cancellable state - match task.status { - JobStatus::JobCompleted => { - return Err(BuildEventLogError::QueryError( - format!("Cannot cancel completed task: {}", job_run_id) - )); - } - JobStatus::JobFailed => { - return Err(BuildEventLogError::QueryError( - format!("Cannot cancel failed task: {}", job_run_id) - )); - } - JobStatus::JobCancelled => { - return Err(BuildEventLogError::QueryError( - format!("Task already cancelled: {}", job_run_id) - )); - } - _ => {} - } - - // Use EventWriter to write the cancellation event - let event_writer = crate::event_log::writer::EventWriter::new(self.query_engine.clone()); - event_writer.cancel_task(build_request_id, job_run_id.to_string(), reason).await - } - - /// Show detailed information about a specific task using protobuf response format - /// - /// Returns the complete task details with dual status fields and timeline events. - pub async fn show_protobuf(&self, job_run_id: &str) -> Result> { - // Get task info and timeline using existing show method - if let Some((task_info, timeline)) = self.show(job_run_id).await? { - // Convert timeline events to protobuf format - let protobuf_timeline: Vec = timeline - .into_iter() - .map(|event| ServiceTaskTimelineEvent { - timestamp: event.timestamp, - status_code: event.status.map(|s| s as i32), - status_name: event.status.map(|s| s.to_display_string()), - message: event.message, - event_type: event.event_type, - cancel_reason: event.cancel_reason, - }) - .collect(); - - let response = JobRunDetailResponse { - job_run_id: task_info.job_run_id, - job_label: task_info.job_label, - build_request_id: task_info.build_request_id, - status_code: task_info.status as i32, - status_name: task_info.status.to_display_string(), - target_partitions: task_info.target_partitions, - scheduled_at: task_info.scheduled_at, - started_at: task_info.started_at, - completed_at: task_info.completed_at, - duration_ms: task_info.duration_ms, - cancelled: task_info.cancelled, - cancel_reason: task_info.cancel_reason, - message: task_info.message, - timeline: protobuf_timeline, - }; - - Ok(Some(response)) - } else { - Ok(None) - } - } - - /// List tasks using protobuf response format with dual status fields - /// - /// Returns JobRunsListResponse protobuf message with JobRunSummary objects containing - /// status_code and status_name fields. - pub async fn list_protobuf(&self, request: JobRunsListRequest) -> Result { - // Get task info using existing list method - let tasks = self.list(request.limit.map(|l| l as usize)).await?; - - // Convert to protobuf format - let protobuf_tasks: Vec = tasks - .into_iter() - .map(|task| crate::JobRunSummary { - job_run_id: task.job_run_id, - job_label: task.job_label, - build_request_id: task.build_request_id, - status_code: task.status as i32, - status_name: task.status.to_display_string(), - target_partitions: task.target_partitions.into_iter().map(|p| crate::PartitionRef { str: p.str }).collect(), - scheduled_at: task.scheduled_at, - started_at: task.started_at, - completed_at: task.completed_at, - duration_ms: task.duration_ms, - cancelled: task.cancelled, - message: task.message, - }) - .collect(); - - let total_count = protobuf_tasks.len() as u32; - - Ok(JobRunsListResponse { - tasks: protobuf_tasks, - total_count, - }) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::event_log::mock::{create_mock_bel_query_engine, create_mock_bel_query_engine_with_events, test_events}; - - #[tokio::test] - async fn test_tasks_repository_list_empty() { - let query_engine = create_mock_bel_query_engine().await.unwrap(); - let repo = TasksRepository::new(query_engine); - - let tasks = repo.list(None).await.unwrap(); - assert!(tasks.is_empty()); - } - - #[tokio::test] - async fn test_tasks_repository_list_with_data() { - let build_id = "test-build-123".to_string(); - let job_label = JobLabel { label: "//:process_data".to_string() }; - let partition = PartitionRef { str: "data/users".to_string() }; - - // Create events for multiple tasks - let events = vec![ - test_events::job_event(Some(build_id.clone()), Some("task-1".to_string()), job_label.clone(), vec![partition.clone()], JobStatus::JobScheduled), - test_events::job_event(Some(build_id.clone()), Some("task-1".to_string()), job_label.clone(), vec![partition.clone()], JobStatus::JobCompleted), - test_events::job_event(Some(build_id.clone()), Some("task-2".to_string()), job_label.clone(), vec![partition.clone()], JobStatus::JobScheduled), - test_events::job_event(Some(build_id.clone()), Some("task-2".to_string()), job_label.clone(), vec![partition.clone()], JobStatus::JobFailed), - ]; - - let query_engine = create_mock_bel_query_engine_with_events(events).await.unwrap(); - let repo = TasksRepository::new(query_engine); - - let tasks = repo.list(None).await.unwrap(); - assert_eq!(tasks.len(), 2); - - // Find tasks by job run id - let task1 = tasks.iter().find(|t| t.job_run_id == "task-1").unwrap(); - let task2 = tasks.iter().find(|t| t.job_run_id == "task-2").unwrap(); - - assert_eq!(task1.status, JobStatus::JobCompleted); - assert_eq!(task1.job_label, "//:process_data"); - assert!(!task1.cancelled); - - assert_eq!(task2.status, JobStatus::JobFailed); - assert_eq!(task2.job_label, "//:process_data"); - assert!(!task2.cancelled); - } - - #[tokio::test] - async fn test_tasks_repository_show() { - let build_id = "test-build-456".to_string(); - let job_label = JobLabel { label: "//:analytics_task".to_string() }; - let partition = PartitionRef { str: "analytics/daily".to_string() }; - - let events = vec![ - test_events::job_event(Some(build_id.clone()), Some("task-123".to_string()), job_label.clone(), vec![partition.clone()], JobStatus::JobScheduled), - test_events::job_event(Some(build_id.clone()), Some("task-123".to_string()), job_label.clone(), vec![partition.clone()], JobStatus::JobRunning), - test_events::job_event(Some(build_id.clone()), Some("task-123".to_string()), job_label.clone(), vec![partition.clone()], JobStatus::JobCompleted), - ]; - - let query_engine = create_mock_bel_query_engine_with_events(events).await.unwrap(); - let repo = TasksRepository::new(query_engine); - - let result = repo.show("task-123").await.unwrap(); - assert!(result.is_some()); - - let (info, timeline) = result.unwrap(); - assert_eq!(info.job_run_id, "task-123"); - assert_eq!(info.job_label, "//:analytics_task"); - assert_eq!(info.status, JobStatus::JobCompleted); - assert!(!info.cancelled); - - assert_eq!(timeline.len(), 3); - assert_eq!(timeline[0].status, Some(JobStatus::JobScheduled)); - assert_eq!(timeline[1].status, Some(JobStatus::JobRunning)); - assert_eq!(timeline[2].status, Some(JobStatus::JobCompleted)); - } - - #[tokio::test] - async fn test_tasks_repository_show_nonexistent() { - let query_engine = create_mock_bel_query_engine().await.unwrap(); - let repo = TasksRepository::new(query_engine); - - let result = repo.show("nonexistent-task").await.unwrap(); - assert!(result.is_none()); - } - - #[tokio::test] - async fn test_tasks_repository_cancel() { - let build_id = "test-build-789".to_string(); - let job_label = JobLabel { label: "//:batch_task".to_string() }; - let partition = PartitionRef { str: "batch/data".to_string() }; - - // Start with a running task - let events = vec![ - test_events::job_event(Some(build_id.clone()), Some("task-456".to_string()), job_label.clone(), vec![partition.clone()], JobStatus::JobScheduled), - test_events::job_event(Some(build_id.clone()), Some("task-456".to_string()), job_label.clone(), vec![partition.clone()], JobStatus::JobRunning), - ]; - - let query_engine = create_mock_bel_query_engine_with_events(events).await.unwrap(); - let repo = TasksRepository::new(query_engine.clone()); - - // Cancel the task - repo.cancel("task-456", "User requested cancellation".to_string(), build_id.clone()).await.unwrap(); - - // Verify the cancellation was recorded - // Note: This test demonstrates the pattern, but the MockBELStorage would need - // to be enhanced to properly store task cancel events for full verification - - // Try to cancel a non-existent task - let result = repo.cancel("nonexistent-task", "Should fail".to_string(), build_id).await; - assert!(result.is_err()); - } - - #[tokio::test] - async fn test_tasks_repository_cancel_completed_task() { - let build_id = "test-build-999".to_string(); - let job_label = JobLabel { label: "//:completed_task".to_string() }; - let partition = PartitionRef { str: "test/data".to_string() }; - - // Create a completed task - let events = vec![ - test_events::job_event(Some(build_id.clone()), Some("completed-task".to_string()), job_label.clone(), vec![partition.clone()], JobStatus::JobScheduled), - test_events::job_event(Some(build_id.clone()), Some("completed-task".to_string()), job_label.clone(), vec![partition.clone()], JobStatus::JobCompleted), - ]; - - let query_engine = create_mock_bel_query_engine_with_events(events).await.unwrap(); - let repo = TasksRepository::new(query_engine); - - // Try to cancel the completed task - should fail - let result = repo.cancel("completed-task", "Should fail".to_string(), build_id).await; - assert!(result.is_err()); - - if let Err(BuildEventLogError::QueryError(msg)) = result { - assert!(msg.contains("Cannot cancel completed task")); - } else { - panic!("Expected QueryError for completed task cancellation"); - } - } -} \ No newline at end of file diff --git a/databuild/runtime/BUILD.bazel b/databuild/runtime/BUILD.bazel deleted file mode 100644 index 9e2c8a0..0000000 --- a/databuild/runtime/BUILD.bazel +++ /dev/null @@ -1,77 +0,0 @@ -# In modules/jq/BUILD.bazel -load("@bazel_skylib//lib:selects.bzl", "selects") - -exports_files([ - "simple_executable_wrapper.sh.tpl", -]) - -# Platform detection -config_setting( - name = "darwin", - constraint_values = ["@platforms//os:osx"], -) - -config_setting( - name = "linux", - constraint_values = ["@platforms//os:linux"], -) - -# Download jq binaries -genrule( - name = "download_jq_linux", - outs = ["jq-linux64"], - cmd = """ - curl -L "https://github.com/stedolan/jq/releases/download/jq-1.6/jq-linux64" -o "$@" - chmod +x "$@" - """, -) - -genrule( - name = "download_jq_macos", - outs = ["jq-osx-amd64"], - cmd = """ - curl -L "https://github.com/stedolan/jq/releases/download/jq-1.6/jq-osx-amd64" -o "$@" - chmod +x "$@" - """, -) - -# Create jq binary target for each platform -genrule( - name = "jq_linux_bin", - srcs = [":jq-linux64"], - outs = ["jq_linux"], - cmd = "cp $< $@ && chmod +x $@", - executable = True, -) - -genrule( - name = "jq_macos_bin", - srcs = [":jq-osx-amd64"], - outs = ["jq_macos"], - cmd = "cp $< $@ && chmod +x $@", - executable = True, -) - -# Create platform-specific filegroups -filegroup( - name = "jq_bin_linux", - srcs = [":jq_linux"], - visibility = ["//visibility:public"], -) - -filegroup( - name = "jq_bin_macos", - srcs = [":jq_macos"], - visibility = ["//visibility:public"], -) - -# Create a binary target for jq -sh_binary( - name = "jq", - srcs = select({ - ":darwin": [":jq_macos"], - ":linux": [":jq_linux"], - "//conditions:default": [":jq_linux"], - }), - visibility = ["//visibility:public"], -) diff --git a/databuild/runtime/simple_executable_wrapper.sh.tpl b/databuild/runtime/simple_executable_wrapper.sh.tpl deleted file mode 100755 index 84814bc..0000000 --- a/databuild/runtime/simple_executable_wrapper.sh.tpl +++ /dev/null @@ -1,40 +0,0 @@ -#!/bin/bash -set -e - -%{RUNFILES_PREFIX} - -%{PREFIX} - -# Check if rlocation function is available -if ! type rlocation >/dev/null 2>&1; then - echo "Error: rlocation function not available. Runfiles may not be properly initialized." >&2 - exit 1 -fi - -# Resolve the executable using rlocation -EXECUTABLE_BINARY="$(rlocation "_main/%{EXECUTABLE_SHORT_PATH}")" - -# Check if rlocation returned something -if [[ -z "${EXECUTABLE_BINARY}" ]]; then - echo "Error: rlocation returned empty result for '_main/%{EXECUTABLE_SHORT_PATH}'" >&2 - exit 1 -fi - -# Check if the resolved binary exists -if [[ ! -f "${EXECUTABLE_BINARY}" ]]; then - echo "Error: Resolved executable '${EXECUTABLE_BINARY}' does not exist" >&2 - exit 1 -fi - -# Check if the resolved binary is executable -if [[ ! -x "${EXECUTABLE_BINARY}" ]]; then - echo "Error: Resolved executable '${EXECUTABLE_BINARY}' is not executable" >&2 - exit 1 -fi - -# Run the configuration -if [[ -n "${EXECUTABLE_SUBCOMMAND:-}" ]]; then - exec "${EXECUTABLE_BINARY}" "${EXECUTABLE_SUBCOMMAND}" "$@" -else - exec "${EXECUTABLE_BINARY}" "$@" -fi diff --git a/databuild/service/handlers.rs b/databuild/service/handlers.rs deleted file mode 100644 index 2a49198..0000000 --- a/databuild/service/handlers.rs +++ /dev/null @@ -1,1754 +0,0 @@ -use super::*; -use crate::event_log::{current_timestamp_nanos, create_build_event}; -use crate::orchestration::{BuildOrchestrator, BuildResult}; -use crate::mermaid_utils; -use axum::{ - extract::{Path, State}, - http::StatusCode, -}; -use axum_jsonschema::Json; -use log::{error, info}; -use serde::Deserialize; -use schemars::JsonSchema; -use std::process::Command; -use std::env; - -// Simple base64 URL-safe decoding function for job labels -fn base64_url_decode(encoded: &str) -> Result> { - - // Convert URL-safe base64 back to regular base64 - let mut padded = encoded.replace('-', "+").replace('_', "/"); - - // Add padding if needed - match padded.len() % 4 { - 2 => padded.push_str("=="), - 3 => padded.push_str("="), - _ => {} - } - - // Manual base64 decoding (simplified) - let alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; - let mut result = Vec::new(); - let mut buffer = 0u32; - let mut bits = 0; - - for c in padded.chars() { - if c == '=' { break; } - - if let Some(index) = alphabet.find(c) { - buffer = (buffer << 6) | (index as u32); - bits += 6; - - if bits >= 8 { - result.push(((buffer >> (bits - 8)) & 0xFF) as u8); - bits -= 8; - } - } - } - - String::from_utf8(result).map_err(|e| e.into()) -} - -pub async fn submit_build_request( - State(service): State, - Json(request): Json, -) -> Result, (StatusCode, Json)> { - let build_request_id = BuildGraphService::generate_build_request_id(); - let timestamp = current_timestamp_nanos(); - - info!("Received build request {} for partitions: {:?}", build_request_id, request.partitions); - - // Create build request state - let build_state = BuildRequestState { - build_request_id: build_request_id.clone(), - status: BuildRequestStatusCode::BuildRequestReceived.status(), - requested_partitions: request.partitions.clone(), - created_at: timestamp, - updated_at: timestamp, - }; - - // Store in active builds - { - let mut active_builds = service.active_builds.write().await; - active_builds.insert(build_request_id.clone(), build_state); - } - - // Create orchestrator and emit build request received event - let requested_partitions: Vec = request.partitions.iter() - .map(|p| PartitionRef { str: p.clone() }) - .collect(); - - let orchestrator = BuildOrchestrator::new( - service.query_engine.clone(), - build_request_id.clone(), - requested_partitions, - ); - - if let Err(e) = orchestrator.start_build().await { - error!("Failed to log build request received event: {}", e); - return Err(( - StatusCode::INTERNAL_SERVER_ERROR, - Json(ErrorResponse { - error: format!("Failed to log build request: {}", e), - }), - )); - } - - // Start build execution in background - let service_clone = service.clone(); - let build_request_id_clone = build_request_id.clone(); - let partitions_clone = request.partitions.clone(); - - tokio::spawn(async move { - if let Err(e) = execute_build_request( - service_clone, - build_request_id_clone, - partitions_clone, - ).await { - error!("Build request execution failed: {}", e); - } - }); - - Ok(Json(BuildRequestResponse { build_request_id })) -} - -#[derive(Deserialize, JsonSchema)] -pub struct BuildStatusRequest { - pub build_request_id: String, -} - -pub async fn get_build_status( - State(service): State, - Path(BuildStatusRequest { build_request_id }): Path, -) -> Result, (StatusCode, Json)> { - let repository = crate::repositories::builds::BuildsRepository::new(service.query_engine.clone()); - - match repository.show_protobuf(&build_request_id).await { - Ok(Some(build_detail)) => { - Ok(Json(build_detail)) - } - Ok(None) => { - Err(( - StatusCode::NOT_FOUND, - Json(ErrorResponse { - error: "Build request not found".to_string(), - }), - )) - } - Err(e) => { - error!("Failed to get build status: {}", e); - Err(( - StatusCode::INTERNAL_SERVER_ERROR, - Json(ErrorResponse { - error: format!("Failed to get build status: {}", e), - }), - )) - } - } -} - -#[derive(Deserialize, JsonSchema)] -pub struct CancelBuildRequest { - pub build_request_id: String, -} - -pub async fn cancel_build_request( - State(service): State, - Path(CancelBuildRequest { build_request_id }): Path, -) -> Result, (StatusCode, Json)> { - // Update build request state - { - let mut active_builds = service.active_builds.write().await; - if let Some(build_state) = active_builds.get_mut(&build_request_id) { - build_state.status = BuildRequestStatusCode::BuildRequestCancelled.status(); - build_state.updated_at = current_timestamp_nanos(); - } else { - return Err(( - StatusCode::NOT_FOUND, - Json(ErrorResponse { - error: "Build request not found".to_string(), - }), - )); - } - } - - // Log cancellation event - let event = create_build_event( - build_request_id.clone(), - crate::build_event::EventType::BuildRequestEvent(BuildRequestEvent { - status: Some(BuildRequestStatusCode::BuildRequestCancelled.status()), - requested_partitions: vec![], - message: "Build request cancelled".to_string(), - comment: None, - want_id: None, - }), - ); - - if let Err(e) = service.query_engine.append_event(event).await { - error!("Failed to log build request cancelled event: {}", e); - } - - info!("Build request {} cancelled", build_request_id); - - Ok(Json(BuildCancelResponse { - cancelled: true, - build_request_id, - })) -} - -#[derive(Deserialize, JsonSchema)] -pub struct PartitionStatusRequest { - pub partition_ref: String, -} - -pub async fn get_partition_status( - State(service): State, - Path(PartitionStatusRequest { partition_ref }): Path, -) -> Result, (StatusCode, Json)> { - // Get latest partition status - let (status, last_updated) = match service.query_engine.get_latest_partition_status(&partition_ref).await { - Ok(Some((status, timestamp))) => (status, Some(timestamp)), - Ok(None) => { - // No partition events found - this is a legitimate 404 - return Err(( - StatusCode::NOT_FOUND, - Json(ErrorResponse { - error: format!("Partition not found: {}", partition_ref), - }), - )); - }, - Err(e) => { - error!("Failed to get partition status: {}", e); - return Err(( - StatusCode::INTERNAL_SERVER_ERROR, - Json(ErrorResponse { - error: format!("Failed to get partition status: {}", e), - }), - )); - } - }; - - // Get active builds for this partition - let build_requests = match service.query_engine.get_active_builds_for_partition(&partition_ref).await { - Ok(builds) => builds, - Err(e) => { - error!("Failed to get active builds for partition: {}", e); - return Err(( - StatusCode::INTERNAL_SERVER_ERROR, - Json(ErrorResponse { - error: format!("Failed to get active builds for partition: {}", e), - }), - )); - } - }; - - Ok(Json(PartitionStatusResponse { - partition_ref, - status_code: status as i32, - status_name: status.to_display_string(), - last_updated, - build_requests, - })) -} - -#[derive(Deserialize, JsonSchema)] -pub struct PartitionEventsRequest { - pub partition_ref: String, -} - -pub async fn get_partition_events( - State(service): State, - Path(PartitionEventsRequest { partition_ref }): Path, -) -> Result, (StatusCode, Json)> { - let decoded_partition_ref = base64_url_decode(&partition_ref).unwrap(); - - let events = match service.query_engine.get_partition_events(&decoded_partition_ref, None).await { - Ok(events) => events.into_iter().filter(|e| e.build_request_id.is_some()).map(|e| { - let (job_label, partition_ref, delegated_build_id) = extract_navigation_data(&e.event_type); - BuildEventSummary { - event_id: e.event_id, - timestamp: e.timestamp, - event_type: event_type_to_string(&e.event_type), - message: event_to_message(&e.event_type), - build_request_id: e.build_request_id.clone().unwrap(), - job_label, - partition_ref, - delegated_build_id, - } - }).collect(), - Err(e) => { - error!("Failed to get partition events: {}", e); - return Err(( - StatusCode::INTERNAL_SERVER_ERROR, - Json(ErrorResponse { - error: format!("Failed to get partition events: {}", e), - }), - )); - } - }; - - Ok(Json(PartitionEventsResponse { - partition_ref: decoded_partition_ref, - events, - })) -} - -pub async fn analyze_build_graph( - State(service): State, - Json(request): Json, -) -> Result, (StatusCode, Json)> { - // Call the analyze command (use temporary ID for analyze-only requests) - let temp_build_request_id = BuildGraphService::generate_build_request_id(); - let analyze_result = run_analyze_command(&service, &temp_build_request_id, &request.partitions).await; - - match analyze_result { - Ok(job_graph) => { - let job_graph_json = match serde_json::to_value(&job_graph) { - Ok(json) => json, - Err(e) => { - error!("Failed to serialize job graph: {}", e); - return Err(( - StatusCode::INTERNAL_SERVER_ERROR, - Json(ErrorResponse { - error: format!("Failed to serialize job graph: {}", e), - }), - )); - } - }; - - Ok(Json(AnalyzeResponse { - job_graph: job_graph_json, - })) - } - Err(e) => { - error!("Failed to analyze build graph: {}", e); - Err(( - StatusCode::INTERNAL_SERVER_ERROR, - Json(ErrorResponse { - error: format!("Failed to analyze build graph: {}", e), - }), - )) - } - } -} - -async fn execute_build_request( - service: ServiceState, - build_request_id: String, - partitions: Vec, -) -> Result<(), String> { - info!("Starting build execution for request {}", build_request_id); - - // Create orchestrator for this build request - let requested_partitions: Vec = partitions.iter() - .map(|p| PartitionRef { str: p.clone() }) - .collect(); - - let orchestrator = BuildOrchestrator::new( - service.query_engine.clone(), - build_request_id.clone(), - requested_partitions, - ); - - // Update status to planning - update_build_request_status(&service, &build_request_id, BuildRequestStatusCode::BuildRequestPlanning.status()).await; - - // Log planning event - if let Err(e) = orchestrator.start_planning().await { - error!("Failed to log planning event: {}", e); - } - - // Analyze the build graph - let job_graph = match run_analyze_command(&service, &build_request_id, &partitions).await { - Ok(graph) => graph, - Err(e) => { - error!("Failed to analyze build graph: {}", e); - update_build_request_status(&service, &build_request_id, BuildRequestStatusCode::BuildRequestFailed.status()).await; - - // Log failure event - if let Err(log_err) = orchestrator.complete_build(BuildResult::Failed { jobs_completed: 0, jobs_failed: 1 }).await { - error!("Failed to log failure event: {}", log_err); - } - - return Err(e); - } - }; - - - // Update status to executing - update_build_request_status(&service, &build_request_id, BuildRequestStatusCode::BuildRequestExecuting.status()).await; - - // Log executing event - if let Err(e) = orchestrator.start_execution().await { - error!("Failed to log executing event: {}", e); - } - - // Execute the build graph - match run_execute_command(&service, &build_request_id, &job_graph).await { - Ok(_) => { - info!("Build request {} completed successfully", build_request_id); - update_build_request_status(&service, &build_request_id, BuildRequestStatusCode::BuildRequestCompleted.status()).await; - - // Log completion event - if let Err(e) = orchestrator.complete_build(BuildResult::Success { jobs_completed: 0 }).await { - error!("Failed to log completion event: {}", e); - } - - Ok(()) - } - Err(e) => { - error!("Build request {} failed: {}", build_request_id, e); - update_build_request_status(&service, &build_request_id, BuildRequestStatusCode::BuildRequestFailed.status()).await; - - // Log failure event - if let Err(log_err) = orchestrator.complete_build(BuildResult::Failed { jobs_completed: 0, jobs_failed: 1 }).await { - error!("Failed to log failure event: {}", log_err); - } - - Err(e) - } - } -} - -async fn update_build_request_status( - service: &ServiceState, - build_request_id: &str, - status: BuildRequestStatus, -) { - let mut active_builds = service.active_builds.write().await; - if let Some(build_state) = active_builds.get_mut(build_request_id) { - build_state.status = status; - build_state.updated_at = current_timestamp_nanos(); - } -} - -async fn run_analyze_command( - service: &ServiceState, - build_request_id: &str, - partitions: &[String], -) -> Result { - // Run analyze command - let analyze_binary = env::var("DATABUILD_ANALYZE_BINARY") - .unwrap_or_else(|_| "databuild_analyze".to_string()); - - let output = Command::new(&analyze_binary) - .args(partitions) - .env("DATABUILD_JOB_LOOKUP_PATH", &service.job_lookup_path) - .env("DATABUILD_CANDIDATE_JOBS", serde_json::to_string(&service.candidate_jobs).unwrap()) - .env("DATABUILD_BUILD_EVENT_LOG", &service.event_log_uri) - .env("DATABUILD_BUILD_REQUEST_ID", build_request_id) - .output() - .map_err(|e| format!("Failed to execute analyze command: {}", e))?; - - if !output.status.success() { - let stderr = String::from_utf8_lossy(&output.stderr); - return Err(format!("Analyze command failed: {}", stderr)); - } - - let stdout = String::from_utf8_lossy(&output.stdout); - let job_graph: JobGraph = serde_json::from_str(&stdout) - .map_err(|e| format!("Failed to parse analyze result: {}", e))?; - - Ok(job_graph) -} - -async fn run_execute_command( - service: &ServiceState, - build_request_id: &str, - job_graph: &JobGraph, -) -> Result<(), String> { - // Serialize job graph - let job_graph_json = serde_json::to_string(job_graph) - .map_err(|e| format!("Failed to serialize job graph: {}", e))?; - - // Run execute command - let execute_binary = env::var("DATABUILD_EXECUTE_BINARY") - .unwrap_or_else(|_| "databuild_execute".to_string()); - - let mut child = Command::new(&execute_binary) - .env("DATABUILD_JOB_LOOKUP_PATH", &service.job_lookup_path) - .env("DATABUILD_CANDIDATE_JOBS", serde_json::to_string(&service.candidate_jobs).unwrap()) - .env("DATABUILD_BUILD_EVENT_LOG", &service.event_log_uri) - .env("DATABUILD_BUILD_REQUEST_ID", build_request_id) - .stdin(std::process::Stdio::piped()) - .stdout(std::process::Stdio::piped()) - .stderr(std::process::Stdio::piped()) - .spawn() - .map_err(|e| format!("Failed to spawn execute command: {}", e))?; - - // Write job graph to stdin - if let Some(stdin) = child.stdin.take() { - use std::io::Write; - let mut stdin = stdin; - stdin.write_all(job_graph_json.as_bytes()) - .map_err(|e| format!("Failed to write job graph to stdin: {}", e))?; - } - - // Wait for completion - let output = child.wait_with_output() - .map_err(|e| format!("Failed to wait for execute command: {}", e))?; - - if !output.status.success() { - let stderr = String::from_utf8_lossy(&output.stderr); - return Err(format!("Execute command failed: {}", stderr)); - } - - Ok(()) -} - -fn event_type_to_string(event_type: &Option) -> String { - match event_type { - Some(crate::build_event::EventType::BuildRequestEvent(_)) => "build_request".to_string(), - Some(crate::build_event::EventType::PartitionEvent(_)) => "partition".to_string(), - Some(crate::build_event::EventType::JobEvent(_)) => "job".to_string(), - Some(crate::build_event::EventType::DelegationEvent(_)) => "delegation".to_string(), - Some(crate::build_event::EventType::JobGraphEvent(_)) => "job_graph".to_string(), - Some(crate::build_event::EventType::PartitionInvalidationEvent(_)) => "partition_invalidation".to_string(), - Some(crate::build_event::EventType::JobRunCancelEvent(_)) => "task_cancel".to_string(), - Some(crate::build_event::EventType::BuildCancelEvent(_)) => "build_cancel".to_string(), - Some(build_event::EventType::WantEvent(_)) => "want".to_string(), - Some(build_event::EventType::TaintEvent(_)) => "taint".to_string(), - None => "INVALID_EVENT_TYPE".to_string(), - } -} - -fn event_to_message(event_type: &Option) -> String { - match event_type { - Some(crate::build_event::EventType::BuildRequestEvent(event)) => event.message.clone(), - Some(crate::build_event::EventType::PartitionEvent(event)) => event.message.clone(), - Some(crate::build_event::EventType::JobEvent(event)) => event.message.clone(), - Some(crate::build_event::EventType::DelegationEvent(event)) => event.message.clone(), - Some(crate::build_event::EventType::JobGraphEvent(event)) => event.message.clone(), - Some(crate::build_event::EventType::PartitionInvalidationEvent(event)) => event.reason.clone(), - Some(crate::build_event::EventType::JobRunCancelEvent(event)) => event.reason.clone(), - Some(crate::build_event::EventType::BuildCancelEvent(event)) => event.reason.clone(), - Some(build_event::EventType::WantEvent(event)) => event.comment.clone(), - Some(build_event::EventType::TaintEvent(event)) => event.comment.clone(), - - None => "INVALID_EVENT_NO_MESSAGE".to_string(), - } -} - -fn extract_navigation_data(event_type: &Option) -> (Option, Option, Option) { - match event_type { - Some(crate::build_event::EventType::JobEvent(event)) => { - let job_label = event.job_label.as_ref().map(|l| l.label.clone()); - (job_label, None, None) - }, - Some(crate::build_event::EventType::PartitionEvent(event)) => { - let partition_ref = event.partition_ref.as_ref().map(|r| r.str.clone()); - (None, partition_ref, None) - }, - Some(crate::build_event::EventType::DelegationEvent(event)) => { - let delegated_build_id = Some(event.delegated_to_build_request_id.clone()); - (None, None, delegated_build_id) - }, - Some(crate::build_event::EventType::BuildRequestEvent(_)) => { - // Build request events don't need navigation links (self-referential) - (None, None, None) - }, - Some(crate::build_event::EventType::JobGraphEvent(_)) => { - // Job graph events don't need navigation links - (None, None, None) - }, - Some(crate::build_event::EventType::PartitionInvalidationEvent(event)) => { - let partition_ref = event.partition_ref.as_ref().map(|r| r.str.clone()); - (None, partition_ref, None) - }, - Some(crate::build_event::EventType::JobRunCancelEvent(_event)) => { - // Task cancel events reference job run IDs, which we could potentially navigate to - (None, None, None) - }, - Some(crate::build_event::EventType::BuildCancelEvent(_)) => { - // Build cancel events don't need navigation links - (None, None, None) - }, - Some(crate::build_event::EventType::WantEvent(_)) => { - (None, None, None) - }, - Some(crate::build_event::EventType::TaintEvent(_)) => { - (None, None, None) - }, - None => (None, None, None), - } -} - -// New handlers for list endpoints -use axum::extract::Query; -use std::collections::HashMap; - -pub async fn list_build_requests( - State(service): State, - Query(params): Query>, -) -> Result, (StatusCode, Json)> { - let limit = params.get("limit") - .and_then(|s| s.parse::().ok()) - .unwrap_or(20) - .min(100); // Cap at 100 - - // Use repository with protobuf format - let builds_repo = BuildsRepository::new(service.query_engine.clone()); - match builds_repo.list_protobuf(Some(limit as usize)).await { - Ok(builds) => { - let total_count = builds.len() as u32; - let response = crate::BuildsListResponse { - builds, - total_count, // TODO: implement proper total count with pagination - has_more: false, // TODO: implement proper pagination - }; - Ok(Json(response)) - }, - Err(e) => { - error!("Failed to list build requests: {}", e); - Err(( - StatusCode::INTERNAL_SERVER_ERROR, - Json(ErrorResponse { - error: format!("Failed to list build requests: {}", e), - }), - )) - } - } -} - -pub async fn list_partitions( - State(service): State, - Query(params): Query>, -) -> Result, (StatusCode, Json)> { - let limit = params.get("limit") - .and_then(|s| s.parse::().ok()) - .unwrap_or(20) - .min(100); // Cap at 100 - - // Use repository with protobuf format - // TODO: Update PartitionsRepository to work with BELQueryEngine - // let partitions_repo = PartitionsRepository::new(service.query_engine.clone()); - let request = PartitionsListRequest { - limit: Some(limit), - offset: None, - status_filter: None, - }; - - // TODO: Implement with PartitionsRepository using BELQueryEngine - let response = PartitionsListResponse { - partitions: vec![], - total_count: 0, - has_more: false, - }; - Ok(Json(response)) -} - -// New unified protobuf-based handler for future migration -pub async fn list_partitions_unified( - State(service): State, - Query(params): Query>, -) -> Result, (StatusCode, Json)> { - let limit = params.get("limit") - .and_then(|s| s.parse::().ok()) - .unwrap_or(20) - .min(100); // Cap at 100 - - let offset = params.get("offset") - .and_then(|s| s.parse::().ok()) - .unwrap_or(0); - - let status_filter = params.get("status") - .and_then(|s| crate::PartitionStatus::from_display_string(s)); - - // Use repository with protobuf response format - // TODO: Update PartitionsRepository to work with BELQueryEngine - // let repository = crate::repositories::partitions::PartitionsRepository::new(service.query_engine.clone()); - - let request = crate::PartitionsListRequest { - limit: Some(limit), - offset: Some(offset), - status_filter: status_filter.map(|s| s.to_display_string()), - }; - - // TODO: Implement with PartitionsRepository using BELQueryEngine - let response = PartitionsListResponse { - partitions: vec![], - total_count: 0, - has_more: false, - }; - Ok(Json(response)) -} - -pub async fn get_activity_summary( - State(service): State, -) -> Result, (StatusCode, Json)> { - // Build activity response using repositories to get dual status fields - let builds_repo = BuildsRepository::new(service.query_engine.clone()); - // TODO: Update PartitionsRepository to work with BELQueryEngine - let partitions_repo = PartitionsRepository::new(service.query_engine.clone()); - - // Get recent builds and partitions with dual status fields - let recent_builds = builds_repo.list_protobuf(Some(5)).await.unwrap_or_else(|_| vec![]); - let recent_partitions_request = PartitionsListRequest { - limit: Some(10), - offset: None, - status_filter: None - }; - let recent_partitions_response = partitions_repo.list_protobuf(recent_partitions_request).await - .unwrap_or_else(|_| crate::PartitionsListResponse { - partitions: vec![], - total_count: 0, - has_more: false - }); - - // Get activity counts (fallback to event log method for now) - let summary = service.query_engine.get_activity_summary().await.unwrap_or_else(|_| { - crate::event_log::ActivitySummary { - active_builds_count: 0, - recent_builds: vec![], - recent_partitions: vec![], - total_partitions_count: 0, - } - }); - - // Simple system status logic - let system_status = if summary.active_builds_count > 10 { - "degraded".to_string() - } else { - "healthy".to_string() - }; - - // Build protobuf activity response with dual status fields - let protobuf_response = crate::ActivityResponse { - active_builds_count: summary.active_builds_count, - recent_builds, - recent_partitions: recent_partitions_response.partitions, - total_partitions_count: summary.total_partitions_count, - system_status, - graph_name: service.graph_label.clone(), - }; - - let api_response = ActivityApiResponse { - data: protobuf_response, - request_id: None, - }; - Ok(Json(api_response)) -} - -#[derive(Deserialize, JsonSchema)] -pub struct JobMetricsRequest { - pub label: String, -} - -pub async fn list_jobs( - State(service): State, - Query(params): Query>, -) -> Result, (StatusCode, Json)> { - let limit = params.get("limit") - .and_then(|s| s.parse::().ok()) - .unwrap_or(20) - .min(100); // Cap at 100 - - let search = params.get("search").map(|s| s.to_string()); - - // Use repository with protobuf format - let jobs_repo = JobsRepository::new(service.query_engine.clone()); - let request = JobsListRequest { - limit: Some(limit), - search, - }; - - match jobs_repo.list_protobuf(request).await { - Ok(response) => { - Ok(Json(response)) - }, - Err(e) => { - error!("Failed to list jobs: {}", e); - Err(( - StatusCode::INTERNAL_SERVER_ERROR, - Json(ErrorResponse { - error: format!("Failed to list jobs: {}", e), - }), - )) - } - } -} - -pub async fn get_job_metrics( - State(service): State, - Path(JobMetricsRequest { label }): Path, -) -> Result, (StatusCode, Json)> { - // Decode the base64-encoded job label - let decoded_label = match base64_url_decode(&label) { - Ok(decoded) => decoded, - Err(_) => { - return Err(( - StatusCode::BAD_REQUEST, - Json(ErrorResponse { - error: "Invalid job label encoding".to_string(), - }), - )); - } - }; - - log::info!("get_job_metrics: encoded='{}', decoded='{}'", label, decoded_label); - - // Get overall job metrics - let metrics_query = " - WITH job_run_durations AS ( - SELECT - be.build_request_id, - (MAX(be.timestamp) - MIN(be.timestamp)) / 1000000 as duration_ms - FROM job_events je - JOIN build_events be ON je.event_id = be.event_id - WHERE je.job_label = ? - GROUP BY be.build_request_id - HAVING MAX(CASE WHEN je.status IN ('3', '4', '5', '6') THEN 1 ELSE 0 END) = 1 - ) - SELECT - COUNT(CASE WHEN je.status IN ('3', '6') THEN 1 END) as completed_count, - COUNT(CASE WHEN je.status IN ('3', '4', '5', '6') THEN 1 END) as total_count, - COALESCE(AVG(jrd.duration_ms), 0) as avg_duration_ms - FROM job_events je - JOIN build_events be ON je.event_id = be.event_id - LEFT JOIN job_run_durations jrd ON be.build_request_id = jrd.build_request_id - WHERE je.job_label = ?"; - - let (success_rate, total_runs, avg_duration_ms) = match service.query_engine.execute_query(&metrics_query.replace("?", &format!("'{}'", decoded_label)).replace("?", &format!("'{}'", decoded_label))).await { - Ok(result) if !result.rows.is_empty() => { - let row = &result.rows[0]; - let completed_count: u32 = row[0].parse().unwrap_or(0); - let total_count: u32 = row[1].parse().unwrap_or(0); - let avg_duration: Option = row[2].parse::().ok().map(|f| f as i64); - - let success_rate = if total_count > 0 { - completed_count as f64 / total_count as f64 - } else { - 0.0 - }; - - (success_rate, total_count, avg_duration) - } - _ => (0.0, 0, None), - }; - - // Get recent runs - consolidated by build request to show final status per job run - let recent_runs_query = " - SELECT - be.build_request_id, - je.target_partitions, - je.status, - MIN(be.timestamp) as started_at, - MAX(be.timestamp) as completed_at - FROM job_events je - JOIN build_events be ON je.event_id = be.event_id - WHERE je.job_label = ? - GROUP BY be.build_request_id, je.target_partitions - HAVING je.status = ( - SELECT je2.status - FROM job_events je2 - JOIN build_events be2 ON je2.event_id = be2.event_id - WHERE je2.job_label = ? - AND be2.build_request_id = be.build_request_id - ORDER BY be2.timestamp DESC - LIMIT 1 - ) - ORDER BY started_at DESC - LIMIT 50"; - - let recent_runs = match service.query_engine.execute_query(&recent_runs_query.replace("?", &format!("'{}'", decoded_label)).replace("?", &format!("'{}'", decoded_label))).await { - Ok(result) => { - result.rows.into_iter().map(|row| { - let build_request_id = row[0].clone(); - let partitions_json: String = row[1].clone(); - let status_code: String = row[2].clone(); - let started_at: i64 = row[3].parse().unwrap_or(0); - let completed_at: i64 = row[4].parse().unwrap_or(started_at); - let duration_ms: Option = if completed_at > started_at { - Some(completed_at - started_at) - } else { - None - }; - - let partitions: Vec = serde_json::from_str::>(&partitions_json) - .unwrap_or_default() - .into_iter() - .filter_map(|v| { - v.get("str").and_then(|s| s.as_str()).map(|s| s.to_string()) - }) - .collect(); - - let (status_code_int, status_name) = match status_code.as_str() { - "1" => (1, "scheduled"), - "2" => (2, "running"), - "3" => (3, "completed"), - "4" => (4, "failed"), - "5" => (5, "cancelled"), - "6" => (6, "skipped"), - _ => (0, "unknown"), - }; - - JobRunSummary { - build_request_id, - partitions, - status_code: status_code_int, - status_name: status_name.to_string(), - duration_ms, - started_at, - } - }).collect() - } - Err(_) => Vec::new(), - }; - - // Get daily stats (simplified - just recent days) - let daily_stats_query = " - WITH daily_job_durations AS ( - SELECT - date(be.timestamp/1000000000, 'unixepoch') as date, - be.build_request_id, - (MAX(be.timestamp) - MIN(be.timestamp)) / 1000000 as duration_ms - FROM job_events je - JOIN build_events be ON je.event_id = be.event_id - WHERE je.job_label = ? - AND be.timestamp > (strftime('%s', 'now', '-30 days') * 1000000000) - GROUP BY date(be.timestamp/1000000000, 'unixepoch'), be.build_request_id - HAVING MAX(CASE WHEN je.status IN ('3', '4', '5', '6') THEN 1 ELSE 0 END) = 1 - ) - SELECT - date(be.timestamp/1000000000, 'unixepoch') as date, - COUNT(CASE WHEN je.status IN ('3', '6') THEN 1 END) as completed_count, - COUNT(CASE WHEN je.status IN ('3', '4', '5', '6') THEN 1 END) as total_count, - COALESCE(AVG(djd.duration_ms), 0) as avg_duration_ms - FROM job_events je - JOIN build_events be ON je.event_id = be.event_id - LEFT JOIN daily_job_durations djd ON date(be.timestamp/1000000000, 'unixepoch') = djd.date - WHERE je.job_label = ? - AND be.timestamp > (strftime('%s', 'now', '-30 days') * 1000000000) - GROUP BY date(be.timestamp/1000000000, 'unixepoch') - ORDER BY date DESC"; - - let daily_stats = match service.query_engine.execute_query(&daily_stats_query.replace("?", &format!("'{}'", decoded_label)).replace("?", &format!("'{}'", decoded_label))).await { - Ok(result) => { - result.rows.into_iter().map(|row| { - let date = row[0].clone(); - let completed_count: u32 = row[1].parse().unwrap_or(0); - let total_count: u32 = row[2].parse().unwrap_or(0); - let avg_duration: Option = row[3].parse::().ok().map(|f| f as i64); - - let success_rate = if total_count > 0 { - completed_count as f64 / total_count as f64 - } else { - 0.0 - }; - - JobDailyStats { - date, - success_rate, - avg_duration_ms: avg_duration, - total_runs: total_count, - } - }).collect() - } - Err(_) => Vec::new(), - }; - - Ok(Json(JobMetricsResponse { - job_label: decoded_label, - success_rate, - avg_duration_ms, - total_runs, - recent_runs, - daily_stats, - })) -} - -// Repository-based handlers for the new shared core functionality -use crate::repositories::{ - partitions::PartitionsRepository, - jobs::JobsRepository, - tasks::TasksRepository, - builds::BuildsRepository, -}; - -/// Request for partition detail endpoint -#[derive(Deserialize, JsonSchema)] -pub struct PartitionDetailRequest { - pub partition_ref: String, -} - -/// Get detailed partition information with timeline -pub async fn get_partition_detail( - State(service): State, - Path(PartitionDetailRequest { partition_ref }): Path, -) -> Result, (StatusCode, Json)> { - let repository = PartitionsRepository::new(service.query_engine.clone()); - let decoded_partition_ref = base64_url_decode(&partition_ref).unwrap(); - - match repository.show_protobuf(&decoded_partition_ref).await { - Ok(Some(protobuf_response)) => { - let timeline_events: Vec = protobuf_response.timeline.into_iter().map(|event| { - PartitionTimelineEvent { - timestamp: event.timestamp, - status_code: event.status_code, - status_name: event.status_name, - message: event.message, - build_request_id: event.build_request_id, - job_run_id: event.job_run_id, - } - }).collect(); - - Ok(Json(PartitionDetailResponse { - partition_ref: protobuf_response.partition_ref, - status_code: protobuf_response.status_code, - status_name: protobuf_response.status_name, - last_updated: protobuf_response.last_updated, - builds_count: protobuf_response.builds_count, - last_successful_build: protobuf_response.last_successful_build, - invalidation_count: protobuf_response.invalidation_count, - timeline: timeline_events, - })) - } - Ok(None) => Err(( - StatusCode::NOT_FOUND, - Json(ErrorResponse { - error: format!("Partition '{}' not found", partition_ref), - }), - )), - Err(e) => { - error!("Failed to get partition detail: {}", e); - Err(( - StatusCode::INTERNAL_SERVER_ERROR, - Json(ErrorResponse { - error: format!("Failed to get partition detail: {}", e), - }), - )) - } - } -} - -/// Invalidate a partition -#[derive(Deserialize, JsonSchema)] -pub struct InvalidatePartitionRequest { - pub reason: String, - pub build_request_id: String, -} - -/// Request for partition invalidation endpoint path -#[derive(Deserialize, JsonSchema)] -pub struct PartitionInvalidatePathRequest { - pub partition_ref: String, -} - -pub async fn invalidate_partition( - State(service): State, - Path(PartitionInvalidatePathRequest { partition_ref }): Path, - Json(request): Json, -) -> Result, (StatusCode, Json)> { - let repository = PartitionsRepository::new(service.query_engine.clone()); - - match repository.invalidate(&partition_ref, request.reason.clone(), request.build_request_id).await { - Ok(()) => Ok(Json(PartitionInvalidateResponse { - invalidated: true, - partition_ref, - reason: request.reason, - })), - Err(e) => { - error!("Failed to invalidate partition: {}", e); - Err(( - StatusCode::INTERNAL_SERVER_ERROR, - Json(ErrorResponse { - error: format!("Failed to invalidate partition: {}", e), - }), - )) - } - } -} - -/// List partitions using repository -pub async fn list_partitions_repository( - State(service): State, - Query(params): Query>, -) -> Result, (StatusCode, Json)> { - let repository = PartitionsRepository::new(service.query_engine.clone()); - let limit = params.get("limit").and_then(|s| s.parse().ok()); - - let request = PartitionsListRequest { - limit, - offset: None, - status_filter: None, - }; - - match repository.list_protobuf(request).await { - Ok(protobuf_response) => { - let total_count = protobuf_response.total_count; - let has_more = protobuf_response.has_more; - - let api_response = PartitionsListApiResponse { - data: protobuf_response, - request_id: None, // TODO: add request ID tracking - pagination: Some(PaginationInfo { - total_count, - has_more, - limit: limit.map(|l| l as u32), - offset: None, - }), - }; - Ok(Json(api_response)) - }, - Err(e) => { - error!("Failed to list partitions: {}", e); - Err(( - StatusCode::INTERNAL_SERVER_ERROR, - Json(ErrorResponse { - error: format!("Failed to list partitions: {}", e), - }), - )) - } - } -} - -/// List tasks using repository -pub async fn list_tasks_repository( - State(service): State, - Query(params): Query>, -) -> Result, (StatusCode, Json)> { - let repository = TasksRepository::new(service.query_engine.clone()); - let limit = params.get("limit").and_then(|s| s.parse().ok()); - - let request = JobRunsListRequest { limit }; - - match repository.list_protobuf(request).await { - Ok(protobuf_response) => { - let total_count = protobuf_response.total_count; - - let api_response = JobRunsListApiResponse { - data: protobuf_response, - request_id: None, // TODO: add request ID tracking - pagination: Some(PaginationInfo { - total_count, - has_more: false, // Tasks list doesn't implement has_more yet - limit: limit.map(|l| l as u32), - offset: None, - }), - }; - Ok(Json(api_response)) - }, - Err(e) => { - error!("Failed to list tasks: {}", e); - Err(( - StatusCode::INTERNAL_SERVER_ERROR, - Json(ErrorResponse { - error: format!("Failed to list tasks: {}", e), - }), - )) - } - } -} - -/// List jobs using repository -pub async fn list_jobs_repository( - State(service): State, - Query(params): Query>, -) -> Result, (StatusCode, Json)> { - let repository = JobsRepository::new(service.query_engine.clone()); - let limit = params.get("limit").and_then(|s| s.parse().ok()); - let search = params.get("search").map(|s| s.to_string()); - - let request = JobsListRequest { - limit, - search, - }; - - match repository.list_protobuf(request).await { - Ok(protobuf_response) => { - let total_count = protobuf_response.total_count; - - let api_response = JobsListApiResponse { - data: protobuf_response, - request_id: None, // TODO: add request ID tracking - pagination: Some(PaginationInfo { - total_count, - has_more: false, // Jobs list doesn't implement has_more yet - limit: limit.map(|l| l as u32), - offset: None, - }), - }; - Ok(Json(api_response)) - }, - Err(e) => { - error!("Failed to list jobs: {}", e); - Err(( - StatusCode::INTERNAL_SERVER_ERROR, - Json(ErrorResponse { - error: format!("Failed to list jobs: {}", e), - }), - )) - } - } -} - -/// Request for job detail endpoint -#[derive(Deserialize, JsonSchema)] -pub struct JobDetailRequest { - pub label: String, -} - -/// Get detailed job information -pub async fn get_job_detail( - State(service): State, - Path(JobDetailRequest { label }): Path, -) -> Result, (StatusCode, Json)> { - let job_label = base64_url_decode(&label).unwrap(); - let repository = JobsRepository::new(service.query_engine.clone()); - - match repository.show_protobuf(&job_label).await { - Ok(Some(protobuf_response)) => { - let run_summaries: Vec = protobuf_response.runs.into_iter().map(|run| { - JobRunDetail { - job_run_id: run.job_run_id, - build_request_id: run.build_request_id, - target_partitions: run.target_partitions, - status_code: run.status_code, - status_name: run.status_name, - started_at: run.started_at, - completed_at: run.completed_at, - duration_ms: run.duration_ms, - message: run.message, - } - }).collect(); - - Ok(Json(JobDetailResponse { - job_label: protobuf_response.job_label, - total_runs: protobuf_response.total_runs, - successful_runs: protobuf_response.successful_runs, - failed_runs: protobuf_response.failed_runs, - cancelled_runs: protobuf_response.cancelled_runs, - average_partitions_per_run: protobuf_response.average_partitions_per_run, - last_run_timestamp: protobuf_response.last_run_timestamp, - last_run_status_code: protobuf_response.last_run_status_code, - last_run_status_name: protobuf_response.last_run_status_name, - recent_builds: protobuf_response.recent_builds, - runs: run_summaries, - })) - } - Ok(None) => Err(( - StatusCode::NOT_FOUND, - Json(ErrorResponse { - error: format!("Job '{}' not found", job_label), - }), - )), - Err(e) => { - error!("Failed to get job detail: {}", e); - Err(( - StatusCode::INTERNAL_SERVER_ERROR, - Json(ErrorResponse { - error: format!("Failed to get job detail: {}", e), - }), - )) - } - } -} - -/// List tasks using repository -pub async fn list_tasks( - State(service): State, - Query(params): Query>, -) -> Result, (StatusCode, Json)> { - let repository = TasksRepository::new(service.query_engine.clone()); - let limit = params.get("limit").and_then(|s| s.parse().ok()); - - let request = JobRunsListRequest { limit }; - - match repository.list_protobuf(request).await { - Ok(response) => { - Ok(Json(response)) - } - Err(e) => { - error!("Failed to list tasks: {}", e); - Err(( - StatusCode::INTERNAL_SERVER_ERROR, - Json(ErrorResponse { - error: format!("Failed to list tasks: {}", e), - }), - )) - } - } -} - -/// Request for task detail endpoint -#[derive(Deserialize, JsonSchema)] -pub struct TaskDetailRequest { - pub job_run_id: String, -} - -/// Get detailed task information -pub async fn get_task_detail( - State(service): State, - Path(TaskDetailRequest { job_run_id }): Path, -) -> Result, (StatusCode, Json)> { - let repository = TasksRepository::new(service.query_engine.clone()); - - match repository.show_protobuf(&job_run_id).await { - Ok(Some(protobuf_response)) => { - let timeline_events: Vec = protobuf_response.timeline.into_iter().map(|event| { - JobRunTimelineEvent { - timestamp: event.timestamp, - status_code: event.status_code, - status_name: event.status_name, - message: event.message, - event_type: event.event_type, - cancel_reason: event.cancel_reason, - } - }).collect(); - - Ok(Json(JobRunDetailResponse { - job_run_id: protobuf_response.job_run_id, - job_label: protobuf_response.job_label, - build_request_id: protobuf_response.build_request_id, - status_code: protobuf_response.status_code, - status_name: protobuf_response.status_name, - target_partitions: protobuf_response.target_partitions, - scheduled_at: protobuf_response.scheduled_at, - started_at: protobuf_response.started_at, - completed_at: protobuf_response.completed_at, - duration_ms: protobuf_response.duration_ms, - cancelled: protobuf_response.cancelled, - cancel_reason: protobuf_response.cancel_reason, - message: protobuf_response.message, - timeline: timeline_events, - })) - } - Ok(None) => Err(( - StatusCode::NOT_FOUND, - Json(ErrorResponse { - error: format!("Task '{}' not found", job_run_id), - }), - )), - Err(e) => { - error!("Failed to get task detail: {}", e); - Err(( - StatusCode::INTERNAL_SERVER_ERROR, - Json(ErrorResponse { - error: format!("Failed to get task detail: {}", e), - }), - )) - } - } -} - -/// Cancel a task -#[derive(Deserialize, JsonSchema)] -pub struct CancelTaskRequest { - pub reason: String, - pub build_request_id: String, -} - -/// Request for task cancel endpoint path -#[derive(Deserialize, JsonSchema)] -pub struct TaskCancelPathRequest { - pub job_run_id: String, -} - -pub async fn cancel_task( - State(service): State, - Path(TaskCancelPathRequest { job_run_id }): Path, - Json(request): Json, -) -> Result, (StatusCode, Json)> { - let repository = TasksRepository::new(service.query_engine.clone()); - - match repository.cancel(&job_run_id, request.reason.clone(), request.build_request_id).await { - Ok(()) => Ok(Json(TaskCancelResponse { - cancelled: true, - job_run_id, - reason: request.reason, - })), - Err(e) => { - error!("Failed to cancel task: {}", e); - Err(( - StatusCode::INTERNAL_SERVER_ERROR, - Json(ErrorResponse { - error: format!("Failed to cancel task: {}", e), - }), - )) - } - } -} - -/// List builds using repository -pub async fn list_builds_repository( - State(service): State, - Query(params): Query>, -) -> Result, (StatusCode, Json)> { - let repository = BuildsRepository::new(service.query_engine.clone()); - let limit = params.get("limit").and_then(|s| s.parse().ok()); - - match repository.list_protobuf(limit).await { - Ok(builds) => { - let total_count = builds.len() as u32; - let protobuf_response = crate::BuildsListResponse { - builds, - total_count, - has_more: false, // TODO: implement proper pagination - }; - - let api_response = BuildsListApiResponse { - data: protobuf_response, - request_id: None, // TODO: add request ID tracking - pagination: Some(PaginationInfo { - total_count, - has_more: false, - limit: limit.map(|l| l as u32), - offset: None, - }), - }; - Ok(Json(api_response)) - }, - Err(e) => { - error!("Failed to list builds: {}", e); - Err(( - StatusCode::INTERNAL_SERVER_ERROR, - Json(ErrorResponse { - error: format!("Failed to list builds: {}", e), - }), - )) - } - } -} - -/// Request for build detail endpoint -#[derive(Deserialize, JsonSchema)] -pub struct BuildDetailRequest { - pub build_request_id: String, -} - -/// Get detailed build information -pub async fn get_build_detail( - State(service): State, - Path(BuildDetailRequest { build_request_id }): Path, -) -> Result, (StatusCode, Json)> { - let repository = BuildsRepository::new(service.query_engine.clone()); - - match repository.show_protobuf(&build_request_id).await { - Ok(Some(protobuf_response)) => { - // Convert protobuf response to service response (with dual status fields) - let timeline_events: Vec = protobuf_response.timeline.into_iter().map(|event| { - BuildTimelineEvent { - timestamp: event.timestamp, - status: event.status, - message: event.message, - event_type: event.event_type, - cancel_reason: event.cancel_reason, - } - }).collect(); - - Ok(Json(BuildDetailResponse { - build_request_id: protobuf_response.build_request_id, - status: protobuf_response.status, - requested_partitions: protobuf_response.requested_partitions, - total_jobs: protobuf_response.total_jobs, - completed_jobs: protobuf_response.completed_jobs, - failed_jobs: protobuf_response.failed_jobs, - cancelled_jobs: protobuf_response.cancelled_jobs, - requested_at: protobuf_response.requested_at, - started_at: protobuf_response.started_at, - completed_at: protobuf_response.completed_at, - duration_ms: protobuf_response.duration_ms, - cancelled: protobuf_response.cancelled, - cancel_reason: protobuf_response.cancel_reason, - timeline: timeline_events, - })) - } - Ok(None) => Err(( - StatusCode::NOT_FOUND, - Json(ErrorResponse { - error: format!("Build '{}' not found", build_request_id), - }), - )), - Err(e) => { - error!("Failed to get build detail: {}", e); - Err(( - StatusCode::INTERNAL_SERVER_ERROR, - Json(ErrorResponse { - error: format!("Failed to get build detail: {}", e), - }), - )) - } - } -} - -/// Request for build cancel endpoint path -#[derive(Deserialize, JsonSchema)] -pub struct BuildCancelPathRequest { - pub build_request_id: String, -} - -/// Cancel a build using repository -pub async fn cancel_build_repository( - State(service): State, - Path(BuildCancelPathRequest { build_request_id }): Path, - Json(request): Json, -) -> Result, (StatusCode, Json)> { - let repository = BuildsRepository::new(service.query_engine.clone()); - - match repository.cancel(&build_request_id, request.reason.clone()).await { - Ok(()) => Ok(Json(BuildCancelRepositoryResponse { - cancelled: true, - build_request_id, - })), - Err(e) => { - error!("Failed to cancel build: {}", e); - Err(( - StatusCode::INTERNAL_SERVER_ERROR, - Json(ErrorResponse { - error: format!("Failed to cancel build: {}", e), - }), - )) - } - } -} - -#[derive(Deserialize, JsonSchema)] -pub struct CancelBuildRepositoryRequest { - pub reason: String, -} - -// === Job Logs and Metrics Endpoints === - -use crate::{log_access::LogReader, metrics_aggregator::{MetricsAggregator, MetricsConfig}, JobLogsRequest}; -use serde::Serialize; - -/// Path parameter for job logs endpoint -#[derive(Deserialize, JsonSchema)] -pub struct JobLogsPathRequest { - pub job_run_id: String, -} - -/// Query parameters for job logs endpoint -#[derive(Deserialize, JsonSchema)] -pub struct JobLogsQueryRequest { - #[serde(default)] - pub since_timestamp: i64, - #[serde(default)] - pub min_level: i32, - #[serde(default = "default_logs_limit")] - pub limit: u32, -} - -fn default_logs_limit() -> u32 { - 1000 -} - -/// Response for job logs endpoint -#[derive(Serialize, JsonSchema)] -pub struct JobLogsApiResponse { - pub entries: Vec, - pub has_more: bool, -} - -/// Get job logs for a specific job run ID -pub async fn get_job_logs( - Path(JobLogsPathRequest { job_run_id }): Path, - axum::extract::Query(query): axum::extract::Query, -) -> Result, (StatusCode, Json)> { - let log_reader = LogReader::default(); - - let request = JobLogsRequest { - job_run_id, - since_timestamp: query.since_timestamp, - min_level: query.min_level, - limit: query.limit, - }; - - match log_reader.get_job_logs(&request) { - Ok(response) => Ok(Json(JobLogsApiResponse { - entries: response.entries, - has_more: response.has_more, - })), - Err(e) => { - error!("Failed to get job logs: {}", e); - Err(( - StatusCode::INTERNAL_SERVER_ERROR, - Json(ErrorResponse { - error: format!("Failed to get job logs: {}", e), - }), - )) - } - } -} - -/// List available job run IDs -#[derive(Deserialize, JsonSchema)] -pub struct ListJobsQueryRequest { - pub start_date: Option, - pub end_date: Option, -} - -/// Response for list jobs endpoint -#[derive(Serialize, JsonSchema)] -pub struct ListJobsResponse { - pub job_run_ids: Vec, -} - -pub async fn list_available_jobs( - axum::extract::Query(query): axum::extract::Query, -) -> Result, (StatusCode, Json)> { - let log_reader = LogReader::default(); - - let date_range = if let (Some(start), Some(end)) = (query.start_date, query.end_date) { - Some((start, end)) - } else { - None - }; - - match log_reader.list_available_jobs(date_range) { - Ok(job_ids) => Ok(Json(ListJobsResponse { - job_run_ids: job_ids, - })), - Err(e) => { - error!("Failed to list available jobs: {}", e); - Err(( - StatusCode::INTERNAL_SERVER_ERROR, - Json(ErrorResponse { - error: format!("Failed to list available jobs: {}", e), - }), - )) - } - } -} - -/// Query parameters for metrics endpoint -#[derive(Deserialize, JsonSchema)] -pub struct MetricsQueryRequest { - #[serde(default = "default_time_range_hours")] - pub time_range_hours: u64, - #[serde(default)] - pub include_job_id_labels: bool, - #[serde(default = "default_max_cardinality")] - pub max_cardinality_per_metric: usize, -} - -fn default_time_range_hours() -> u64 { - 24 -} - -fn default_max_cardinality() -> usize { - 1000 -} - -/// Get Prometheus metrics from job logs -pub async fn get_prometheus_metrics( - axum::extract::Query(query): axum::extract::Query, -) -> Result)> { - let config = MetricsConfig { - max_cardinality_per_metric: query.max_cardinality_per_metric, - time_range_hours: query.time_range_hours, - include_job_id_labels: query.include_job_id_labels, - max_jobs_per_metric: 100, - }; - - let aggregator = MetricsAggregator::new( - crate::log_collector::LogCollector::default_logs_dir(), - config - ); - - match aggregator.to_prometheus_format() { - Ok(prometheus_output) => Ok(prometheus_output), - Err(e) => { - error!("Failed to generate Prometheus metrics: {}", e); - Err(( - StatusCode::INTERNAL_SERVER_ERROR, - Json(ErrorResponse { - error: format!("Failed to generate Prometheus metrics: {}", e), - }), - )) - } - } -} - -/// Get log-based metrics for a specific job run -pub async fn get_job_run_metrics( - Path(JobLogsPathRequest { job_run_id }): Path, -) -> Result>, (StatusCode, Json)> { - let log_reader = LogReader::default(); - - match log_reader.get_job_metrics(&job_run_id) { - Ok(metrics) => Ok(Json(metrics)), - Err(e) => { - error!("Failed to get job metrics: {}", e); - Err(( - StatusCode::INTERNAL_SERVER_ERROR, - Json(ErrorResponse { - error: format!("Failed to get job metrics: {}", e), - }), - )) - } - } -} - -/// Request for build mermaid diagram endpoint -#[derive(Deserialize, JsonSchema)] -pub struct BuildMermaidRequest { - pub build_request_id: String, -} - -/// Response for build mermaid diagram endpoint -#[derive(serde::Serialize, JsonSchema)] -pub struct BuildMermaidResponse { - pub mermaid: String, -} - -/// Get Mermaid diagram for a specific build request ID -pub async fn get_build_mermaid_diagram( - State(service): State, - Path(BuildMermaidRequest { build_request_id }): Path, -) -> Result, (StatusCode, Json)> { - info!("Generating mermaid diagram for build request {}", build_request_id); - - // Get build events for this build request - let events = match service.query_engine.get_build_request_events(&build_request_id, None).await { - Ok(events) => events, - Err(e) => { - error!("Failed to get build events for {}: {}", build_request_id, e); - return Err(( - StatusCode::INTERNAL_SERVER_ERROR, - Json(ErrorResponse { - error: format!("Failed to get build events: {}", e), - }), - )); - } - }; - - if events.is_empty() { - return Err(( - StatusCode::NOT_FOUND, - Json(ErrorResponse { - error: "Build request not found".to_string(), - }), - )); - } - - // Find job graph event to get the graph structure - let job_graph = events.iter() - .find_map(|event| { - match &event.event_type { - Some(crate::build_event::EventType::JobGraphEvent(graph_event)) => { - graph_event.job_graph.as_ref() - } - _ => None, - } - }); - - match job_graph { - Some(graph) => { - // Generate mermaid diagram with current status - let mermaid_diagram = mermaid_utils::generate_mermaid_with_status(graph, &events); - - Ok(Json(BuildMermaidResponse { - mermaid: mermaid_diagram, - })) - } - None => { - Err(( - StatusCode::NOT_FOUND, - Json(ErrorResponse { - error: "No job graph found for this build request".to_string(), - }), - )) - } - } -} \ No newline at end of file diff --git a/databuild/service/main.rs b/databuild/service/main.rs deleted file mode 100644 index 9c02d6e..0000000 --- a/databuild/service/main.rs +++ /dev/null @@ -1,140 +0,0 @@ -use databuild::service::BuildGraphService; -use std::collections::HashMap; -use std::env; -use std::net::SocketAddr; -use clap::{Arg, Command}; -use log::info; -use simple_logger::SimpleLogger; - -#[tokio::main] -async fn main() { - SimpleLogger::new().init().unwrap(); - - let matches = Command::new("build-graph-service") - .version("1.0") - .about("DataBuild Build Graph Service") - .arg( - Arg::new("port") - .short('p') - .long("port") - .value_name("PORT") - .help("Port to listen on") - .default_value("8080") - ) - .arg( - Arg::new("host") - .long("host") - .value_name("HOST") - .help("Host to bind to") - .default_value("0.0.0.0") - ) - .arg( - Arg::new("event-log") - .long("event-log") - .value_name("URI") - .help("Build event log URI") - .default_value("sqlite:///tmp/databuild.db") - ) - .arg( - Arg::new("graph-label") - .long("graph-label") - .value_name("LABEL") - .help("Graph label") - .default_value("//example:graph") - ) - .arg( - Arg::new("job-lookup-path") - .long("job-lookup-path") - .value_name("PATH") - .help("Job lookup binary path") - .default_value("job_lookup") - ) - .arg( - Arg::new("print-openapi-spec") - .long("print-openapi-spec") - .help("Print OpenAPI spec to stdout and exit") - .action(clap::ArgAction::SetTrue) - ) - .get_matches(); - - let port: u16 = matches.get_one::("port").unwrap() - .parse().expect("Invalid port number"); - let host = matches.get_one::("host").unwrap(); - - // Check environment variable first, fall back to command line argument - let event_log_uri = env::var("DATABUILD_BUILD_EVENT_LOG") - .unwrap_or_else(|_| matches.get_one::("event-log").unwrap().to_string()); - - let graph_label = matches.get_one::("graph-label").unwrap().to_string(); - let job_lookup_path = matches.get_one::("job-lookup-path").unwrap().to_string(); - - // Get candidate jobs from environment - let candidate_jobs: HashMap = env::var("DATABUILD_CANDIDATE_JOBS") - .map(|s| serde_json::from_str(&s).unwrap_or_else(|_| HashMap::new())) - .unwrap_or_else(|_| HashMap::new()); - - // Handle OpenAPI spec generation - if matches.get_flag("print-openapi-spec") { - // Disable logging for OpenAPI generation to keep output clean - log::set_max_level(log::LevelFilter::Off); - - // Create a minimal service instance for OpenAPI generation - let service = match BuildGraphService::new( - "sqlite://:memory:", // Use in-memory database for spec generation - graph_label, - job_lookup_path, - candidate_jobs, - ).await { - Ok(service) => service, - Err(e) => { - eprintln!("Failed to create service for OpenAPI generation: {}", e); - std::process::exit(1); - } - }; - - // Generate and print OpenAPI spec - let spec = service.generate_openapi_spec(); - match serde_json::to_string_pretty(&spec) { - Ok(json) => { - println!("{}", json); - std::process::exit(0); - } - Err(e) => { - eprintln!("Failed to serialize OpenAPI spec: {}", e); - std::process::exit(1); - } - } - } - - info!("Starting Build Graph Service on {}:{}", host, port); - info!("Event log URI: {}", event_log_uri); - info!("Graph label: {}", graph_label); - info!("Job lookup path: {}", job_lookup_path); - info!("Candidate jobs: {} configured", candidate_jobs.len()); - - // Create service - let service = match BuildGraphService::new( - &event_log_uri, - graph_label, - job_lookup_path, - candidate_jobs, - ).await { - Ok(service) => service, - Err(e) => { - eprintln!("Failed to create service: {}", e); - std::process::exit(1); - } - }; - - // Create router - let app = service.create_router(); - - // Start server - let addr: SocketAddr = format!("{}:{}", host, port).parse().unwrap(); - info!("Build Graph Service listening on {}", addr); - - let listener = tokio::net::TcpListener::bind(&addr).await.unwrap(); - axum::serve(listener, app.into_make_service()) - .await - .unwrap(); -} \ No newline at end of file diff --git a/databuild/service/mod.rs b/databuild/service/mod.rs deleted file mode 100644 index 98b6514..0000000 --- a/databuild/service/mod.rs +++ /dev/null @@ -1,479 +0,0 @@ -use crate::*; -use crate::event_log::BuildEventLogError; -use aide::{ - axum::{ - routing::{get, post, delete}, - ApiRouter, - }, - openapi::OpenApi, -}; -use axum::{Extension, response::Response, http::StatusCode}; -use axum_jsonschema::Json; -use serde::{Deserialize, Serialize}; -use schemars::JsonSchema; -use std::collections::HashMap; -use std::sync::Arc; -use rusqlite::ToSql; -use tokio::sync::RwLock; -use uuid::Uuid; - -pub mod handlers; - -#[derive(Clone)] -pub struct BuildGraphService { - pub query_engine: Arc, - pub event_log_uri: String, - pub active_builds: Arc>>, - pub graph_label: String, - pub job_lookup_path: String, - pub candidate_jobs: HashMap, -} - -#[derive(Debug, Clone)] -pub struct BuildRequestState { - pub build_request_id: String, - pub status: BuildRequestStatus, - pub requested_partitions: Vec, - pub created_at: i64, - pub updated_at: i64, -} - -#[derive(Debug, Serialize, Deserialize, JsonSchema)] -pub struct BuildRequest { - pub partitions: Vec, -} - -#[derive(Debug, Serialize, Deserialize, JsonSchema)] -pub struct BuildRequestResponse { - pub build_request_id: String, -} - -#[derive(Debug, Serialize, Deserialize, JsonSchema)] -pub struct BuildEventSummary { - pub event_id: String, - pub timestamp: i64, - pub event_type: String, - pub message: String, - pub build_request_id: String, // Build request ID for navigation - // Navigation-relevant fields (populated based on event type) - pub job_label: Option, // For job events - pub partition_ref: Option, // For partition events - pub delegated_build_id: Option, // For delegation events -} - -#[derive(Debug, Serialize, Deserialize, JsonSchema)] -pub struct PartitionStatusResponse { - pub partition_ref: String, - pub status_code: i32, - pub status_name: String, - pub last_updated: Option, - pub build_requests: Vec, -} - -#[derive(Debug, Serialize, Deserialize, JsonSchema)] -pub struct PartitionEventsResponse { - pub partition_ref: String, - pub events: Vec, -} - -#[derive(Debug, Serialize, Deserialize, JsonSchema)] -pub struct AnalyzeRequest { - pub partitions: Vec, -} - -#[derive(Debug, Serialize, Deserialize, JsonSchema)] -pub struct AnalyzeResponse { - #[schemars(schema_with = "job_graph_schema")] - pub job_graph: serde_json::Value, -} - -fn job_graph_schema(_gen: &mut schemars::gen::SchemaGenerator) -> schemars::schema::Schema { - schemars::schema::Schema::Object(schemars::schema::SchemaObject { - instance_type: Some(schemars::schema::SingleOrVec::Single(Box::new(schemars::schema::InstanceType::Object))), - ..Default::default() - }) -} - -#[derive(Debug, Serialize, Deserialize, JsonSchema)] -pub struct ErrorResponse { - pub error: String, -} - -#[derive(Debug, Serialize, Deserialize, JsonSchema)] -pub struct BuildCancelResponse { - pub cancelled: bool, - pub build_request_id: String, -} - -#[derive(Debug, Serialize, Deserialize, JsonSchema)] -pub struct BuildCancelRepositoryResponse { - pub cancelled: bool, - pub build_request_id: String, -} - -#[derive(Debug, Serialize, Deserialize, JsonSchema)] -pub struct PartitionInvalidateResponse { - pub invalidated: bool, - pub partition_ref: String, - pub reason: String, -} - -#[derive(Debug, Serialize, Deserialize, JsonSchema)] -pub struct TaskCancelResponse { - pub cancelled: bool, - pub job_run_id: String, - pub reason: String, -} - -// List endpoints request/response types -// Removed: duplicate of crate::BuildsListResponse from proto - -// Wrapper structs for API responses that contain protobuf data + service metadata -#[derive(Debug, Serialize, Deserialize, JsonSchema)] -pub struct BuildsListApiResponse { - pub data: crate::BuildsListResponse, - pub request_id: Option, - pub pagination: Option, -} - -#[derive(Debug, Serialize, Deserialize, JsonSchema)] -pub struct PartitionsListApiResponse { - pub data: crate::PartitionsListResponse, - pub request_id: Option, - pub pagination: Option, -} - -#[derive(Debug, Serialize, Deserialize, JsonSchema)] -pub struct JobsListApiResponse { - pub data: crate::JobsListResponse, - pub request_id: Option, - pub pagination: Option, -} - -#[derive(Debug, Serialize, Deserialize, JsonSchema)] -pub struct JobRunsListApiResponse { - pub data: crate::JobRunsListResponse, - pub request_id: Option, - pub pagination: Option, -} - -#[derive(Debug, Serialize, Deserialize, JsonSchema)] -pub struct ActivityApiResponse { - pub data: crate::ActivityResponse, - pub request_id: Option, -} - -#[derive(Debug, Serialize, Deserialize, JsonSchema)] -pub struct PaginationInfo { - pub total_count: u32, - pub has_more: bool, - pub limit: Option, - pub offset: Option, -} - -// Removed: Legacy types that duplicate proto definitions -// - BuildSummary (use crate::BuildSummary from proto) -// - PartitionsListResponse (use crate::PartitionsListResponse from proto) -// - PartitionSummary (use crate::PartitionSummary from proto) - - -// Job-related request/response types -// Removed: JobsListResponse and JobSummary (use crate:: proto versions) - -#[derive(Debug, Serialize, Deserialize, JsonSchema)] -pub struct JobMetricsResponse { - pub job_label: String, - pub success_rate: f64, - pub avg_duration_ms: Option, - pub total_runs: u32, - pub recent_runs: Vec, - pub daily_stats: Vec, -} - -#[derive(Debug, Serialize, Deserialize, JsonSchema)] -pub struct JobRunSummary { - pub build_request_id: String, - pub partitions: Vec, - pub status_code: i32, - pub status_name: String, - pub duration_ms: Option, - pub started_at: i64, -} - -#[derive(Debug, Serialize, Deserialize, JsonSchema)] -pub struct JobDailyStats { - pub date: String, - pub success_rate: f64, - pub avg_duration_ms: Option, - pub total_runs: u32, -} - -impl BuildGraphService { - pub async fn new( - event_log_uri: &str, - graph_label: String, - job_lookup_path: String, - candidate_jobs: HashMap, - ) -> Result { - let query_engine = crate::event_log::storage::create_bel_query_engine(event_log_uri).await?; - - Ok(Self { - query_engine, - event_log_uri: event_log_uri.to_string(), - active_builds: Arc::new(RwLock::new(HashMap::new())), - graph_label, - job_lookup_path, - candidate_jobs, - }) - } - - pub fn generate_openapi_spec(&self) -> OpenApi { - let mut api = OpenApi::default(); - - // Create API router with all routes to generate OpenAPI spec - let _ = ApiRouter::new() - .api_route("/api/v1/builds", post(handlers::submit_build_request)) - .api_route("/api/v1/builds", get(handlers::list_builds_repository)) - .api_route("/api/v1/builds/:build_request_id", get(handlers::get_build_detail)) - .api_route("/api/v1/builds/:build_request_id", delete(handlers::cancel_build_repository)) - .api_route("/api/v1/partitions", get(handlers::list_partitions_repository)) - .api_route("/api/v1/partitions/:partition_ref", get(handlers::get_partition_detail)) - .api_route("/api/v1/partitions/:partition_ref/status", get(handlers::get_partition_status)) - .api_route("/api/v1/partitions/:partition_ref/events", get(handlers::get_partition_events)) - .api_route("/api/v1/partitions/:partition_ref/invalidate", post(handlers::invalidate_partition)) - .api_route("/api/v1/jobs", get(handlers::list_jobs_repository)) - .api_route("/api/v1/jobs/:label", get(handlers::get_job_detail)) - .api_route("/api/v1/jobs/:label/metrics", get(handlers::get_job_metrics)) - .api_route("/api/v1/tasks", get(handlers::list_tasks_repository)) - .api_route("/api/v1/tasks/:job_run_id", get(handlers::get_task_detail)) - .api_route("/api/v1/tasks/:job_run_id/cancel", post(handlers::cancel_task)) - .api_route("/api/v1/activity", get(handlers::get_activity_summary)) - .api_route("/api/v1/analyze", post(handlers::analyze_build_graph)) - .finish_api(&mut api); - - api - } - - pub fn create_router(self) -> axum::Router { - let mut api = OpenApi::default(); - - let api_router = ApiRouter::new() - .api_route("/api/v1/builds", post(handlers::submit_build_request)) - .api_route("/api/v1/builds", get(handlers::list_builds_repository)) - .api_route("/api/v1/builds/:build_request_id", get(handlers::get_build_detail)) - .api_route("/api/v1/builds/:build_request_id/mermaid", get(handlers::get_build_mermaid_diagram)) - .api_route("/api/v1/builds/:build_request_id", delete(handlers::cancel_build_repository)) - .api_route("/api/v1/partitions", get(handlers::list_partitions_repository)) - .api_route("/api/v1/partitions/:partition_ref", get(handlers::get_partition_detail)) - .api_route("/api/v1/partitions/:partition_ref/status", get(handlers::get_partition_status)) - .api_route("/api/v1/partitions/:partition_ref/events", get(handlers::get_partition_events)) - .api_route("/api/v1/partitions/:partition_ref/invalidate", post(handlers::invalidate_partition)) - .api_route("/api/v1/jobs", get(handlers::list_jobs_repository)) - .api_route("/api/v1/jobs/:label", get(handlers::get_job_detail)) - .api_route("/api/v1/jobs/:label/metrics", get(handlers::get_job_metrics)) - .api_route("/api/v1/tasks", get(handlers::list_tasks_repository)) - .api_route("/api/v1/tasks/:job_run_id", get(handlers::get_task_detail)) - .api_route("/api/v1/tasks/:job_run_id/cancel", post(handlers::cancel_task)) - .api_route("/api/v1/activity", get(handlers::get_activity_summary)) - .api_route("/api/v1/analyze", post(handlers::analyze_build_graph)) - // Job logs and metrics endpoints - .api_route("/api/v1/logs/jobs", get(handlers::list_available_jobs)) - .api_route("/api/v1/logs/jobs/:job_run_id", get(handlers::get_job_logs)) - .api_route("/api/v1/logs/jobs/:job_run_id/metrics", get(handlers::get_job_run_metrics)) - .route("/api/v1/metrics", axum::routing::get(handlers::get_prometheus_metrics)) - .route("/api/v1/openapi.json", get(Self::openapi_spec)) - .with_state(Arc::new(self)) - .finish_api(&mut api); - - let static_router = axum::Router::new() - .route("/", axum::routing::get(Self::serve_index)) - .route("/static/*file", axum::routing::get(Self::serve_static)); - - axum::Router::new() - .merge(api_router) - .merge(static_router) - .layer(Extension(api)) - .layer(axum::middleware::from_fn(Self::cors_middleware)) - } - - pub async fn openapi_spec(Extension(api): Extension) -> Json { - Json(api) - } - - fn resolve_fpath(fpath: &str) -> String { - let standard_prefix = "databuild+"; - let test_prefix = "_main"; - - match ( - std::fs::read_dir(Self::get_runfile_path(&format!("{}/databuild/dashboard", standard_prefix))), - std::fs::read_dir(Self::get_runfile_path(&format!("{}/databuild/dashboard", test_prefix))), - ) { - (Ok(_), _) => Self::get_runfile_path(&format!("{}/databuild/dashboard/{}", standard_prefix, fpath)), - (Err(_), Ok(_)) => Self::get_runfile_path(&format!("{}/databuild/dashboard/{}", test_prefix, fpath)), - (_, Err(_)) => panic!("Failed to find dashboard files"), - } - } - - pub async fn serve_index() -> Response { - match std::fs::read_to_string(&Self::resolve_fpath("index.html")) { - Ok(content) => Response::builder() - .header("content-type", "text/html") - .body(content.into()) - .unwrap(), - Err(_) => Response::builder() - .status(StatusCode::INTERNAL_SERVER_ERROR) - .body("Failed to load dashboard".into()) - .unwrap(), - } - } - - pub async fn serve_static(axum::extract::Path(file): axum::extract::Path) -> Response { - match std::fs::read(&Self::resolve_fpath(&file)) { - Ok(content) => { - let content_type = match file.split('.').last() { - Some("html") => "text/html", - Some("css") => "text/css", - Some("js") => "application/javascript", - Some("png") => "image/png", - Some("jpg") | Some("jpeg") => "image/jpeg", - Some("svg") => "image/svg+xml", - Some("ico") => "image/x-icon", - _ => "application/octet-stream", - }; - - Response::builder() - .header("content-type", content_type) - .body(content.into()) - .unwrap() - } - Err(_) => Response::builder() - .status(StatusCode::NOT_FOUND) - .body("404 Not Found".into()) - .unwrap(), - } - } - - fn get_dashboard_file_path(relative_path: &str) -> String { - let runfiles_dir = std::env::var("DASHBOARD_FILES_DIR").unwrap(); - format!("{}/{}", runfiles_dir, relative_path) - } - - fn get_runfile_path(relative_path: &str) -> String { - if let Ok(runfiles_dir) = std::env::var("RUNFILES_DIR") { - format!("{}/{}", runfiles_dir, relative_path) - } else if let Ok(_manifest_file) = std::env::var("RUNFILES_MANIFEST_FILE") { - // Parse manifest file to find the actual path - // For now, just use the relative path - relative_path.to_string() - } else { - // Development mode - files might be in the workspace - relative_path.to_string() - } - } - - pub async fn cors_middleware( - request: axum::http::Request, - next: axum::middleware::Next, - ) -> axum::response::Response { - let response = next.run(request).await; - let (mut parts, body) = response.into_parts(); - - parts.headers.insert( - axum::http::header::ACCESS_CONTROL_ALLOW_ORIGIN, - axum::http::HeaderValue::from_static("*"), - ); - parts.headers.insert( - axum::http::header::ACCESS_CONTROL_ALLOW_METHODS, - axum::http::HeaderValue::from_static("GET, POST, DELETE, OPTIONS"), - ); - parts.headers.insert( - axum::http::header::ACCESS_CONTROL_ALLOW_HEADERS, - axum::http::HeaderValue::from_static("Content-Type, Authorization"), - ); - - axum::response::Response::from_parts(parts, body) - } - - pub fn generate_build_request_id() -> String { - Uuid::new_v4().to_string() - } - - pub fn status_to_string(status: BuildRequestStatus) -> String { - match BuildRequestStatusCode::try_from(status.code) { - Ok(BuildRequestStatusCode::BuildRequestUnknown) => "unknown".to_string(), - Ok(BuildRequestStatusCode::BuildRequestReceived) => "received".to_string(), - Ok(BuildRequestStatusCode::BuildRequestPlanning) => "planning".to_string(), - Ok(BuildRequestStatusCode::BuildRequestAnalysisCompleted) => "analysis_completed".to_string(), - Ok(BuildRequestStatusCode::BuildRequestExecuting) => "executing".to_string(), - Ok(BuildRequestStatusCode::BuildRequestCompleted) => "completed".to_string(), - Ok(BuildRequestStatusCode::BuildRequestFailed) => "failed".to_string(), - Ok(BuildRequestStatusCode::BuildRequestCancelled) => "cancelled".to_string(), - Ok(BuildRequestStatusCode::BuildRequestPreconditionFailed) => "precondition_failed".to_string(), - Err(_) => "error".to_string(), - } - } - - pub fn partition_status_to_string(status: PartitionStatus) -> String { - match status { - PartitionStatus::PartitionUnknown => "unknown".to_string(), - PartitionStatus::PartitionRequested => "requested".to_string(), - PartitionStatus::PartitionAnalyzed => "analyzed".to_string(), - PartitionStatus::PartitionBuilding => "building".to_string(), - PartitionStatus::PartitionAvailable => "available".to_string(), - PartitionStatus::PartitionFailed => "failed".to_string(), - PartitionStatus::PartitionDelegated => "delegated".to_string(), - } - } -} - -pub type ServiceState = Arc; - -// Repository-based response types -// Removed: PartitionDetailResponse and PartitionTimelineEvent (use crate:: proto versions) - -#[derive(Debug, Serialize, Deserialize, JsonSchema)] -pub struct JobsRepositoryListResponse { - pub jobs: Vec, - pub total_count: u32, -} - -#[derive(Debug, Serialize, Deserialize, JsonSchema)] -pub struct JobRepositorySummary { - pub job_label: String, - pub total_runs: usize, - pub successful_runs: usize, - pub failed_runs: usize, - pub cancelled_runs: usize, - pub average_partitions_per_run: f64, - pub last_run_timestamp: i64, - pub last_run_status: String, - pub recent_builds: Vec, -} - -// Removed: JobDetailResponse, JobRunDetail, JobRunsListResponse, JobRunSummary (use crate:: proto versions) - -// Removed: TaskDetailResponse and TaskTimelineEvent (use crate:: proto versions) - -#[derive(Debug, Serialize, Deserialize, JsonSchema)] -pub struct BuildsRepositoryListResponse { - pub builds: Vec, - pub total_count: u32, -} - -#[derive(Debug, Serialize, Deserialize, JsonSchema)] -pub struct BuildRepositorySummary { - pub build_request_id: String, - pub status: String, - pub requested_partitions: Vec, - pub total_jobs: usize, - pub completed_jobs: usize, - pub failed_jobs: usize, - pub cancelled_jobs: usize, - pub requested_at: i64, - pub started_at: Option, - pub completed_at: Option, - pub duration_ms: Option, - pub cancelled: bool, -} - -// Removed: BuildDetailResponse and BuildTimelineEvent (use crate:: proto versions) \ No newline at end of file diff --git a/databuild/service/openapi_spec_generator.rs b/databuild/service/openapi_spec_generator.rs deleted file mode 100644 index 6dfa075..0000000 --- a/databuild/service/openapi_spec_generator.rs +++ /dev/null @@ -1,35 +0,0 @@ -use databuild::service::BuildGraphService; -use std::collections::HashMap; - -#[tokio::main] -async fn main() { - // Disable logging to keep output clean - log::set_max_level(log::LevelFilter::Off); - - // Create a minimal service instance for OpenAPI generation - let service = match BuildGraphService::new( - "sqlite://:memory:", // Use in-memory database for spec generation - "//example:graph".to_string(), - "job_lookup".to_string(), - HashMap::new(), - ).await { - Ok(service) => service, - Err(e) => { - eprintln!("Failed to create service for OpenAPI generation: {}", e); - std::process::exit(1); - } - }; - - // Generate and print OpenAPI spec - let spec = service.generate_openapi_spec(); - match serde_json::to_string_pretty(&spec) { - Ok(json) => { - println!("{}", json); - std::process::exit(0); - } - Err(e) => { - eprintln!("Failed to serialize OpenAPI spec: {}", e); - std::process::exit(1); - } - } -} \ No newline at end of file diff --git a/databuild/status_utils.rs b/databuild/status_utils.rs deleted file mode 100644 index 6d66b9b..0000000 --- a/databuild/status_utils.rs +++ /dev/null @@ -1,291 +0,0 @@ -use crate::*; - -/// Utilities for converting status enums to human-readable strings -/// This provides consistent status naming across CLI and Service interfaces - -impl PartitionStatus { - /// Convert partition status to human-readable string matching current CLI/service format - pub fn to_display_string(&self) -> String { - match self { - PartitionStatus::PartitionUnknown => "unknown".to_string(), - PartitionStatus::PartitionRequested => "requested".to_string(), - PartitionStatus::PartitionAnalyzed => "analyzed".to_string(), - PartitionStatus::PartitionBuilding => "building".to_string(), - PartitionStatus::PartitionAvailable => "available".to_string(), - PartitionStatus::PartitionFailed => "failed".to_string(), - PartitionStatus::PartitionDelegated => "delegated".to_string(), - } - } - - /// Parse a display string back to enum (for filtering, etc.) - pub fn from_display_string(s: &str) -> Option { - match s { - "unknown" => Some(PartitionStatus::PartitionUnknown), - "requested" => Some(PartitionStatus::PartitionRequested), - "analyzed" => Some(PartitionStatus::PartitionAnalyzed), - "building" => Some(PartitionStatus::PartitionBuilding), - "available" => Some(PartitionStatus::PartitionAvailable), - "failed" => Some(PartitionStatus::PartitionFailed), - "delegated" => Some(PartitionStatus::PartitionDelegated), - _ => None, - } - } -} - -impl JobStatus { - /// Convert job status to human-readable string matching current CLI/service format - pub fn to_display_string(&self) -> String { - match self { - JobStatus::JobUnknown => "unknown".to_string(), - JobStatus::JobScheduled => "scheduled".to_string(), - JobStatus::JobRunning => "running".to_string(), - JobStatus::JobCompleted => "completed".to_string(), - JobStatus::JobFailed => "failed".to_string(), - JobStatus::JobCancelled => "cancelled".to_string(), - JobStatus::JobSkipped => "skipped".to_string(), - } - } - - /// Parse a display string back to enum - pub fn from_display_string(s: &str) -> Option { - match s { - "unknown" => Some(JobStatus::JobUnknown), - "scheduled" => Some(JobStatus::JobScheduled), - "running" => Some(JobStatus::JobRunning), - "completed" => Some(JobStatus::JobCompleted), - "failed" => Some(JobStatus::JobFailed), - "cancelled" => Some(JobStatus::JobCancelled), - "skipped" => Some(JobStatus::JobSkipped), - _ => None, - } - } -} - -impl BuildRequestStatusCode { - /// Convert build request status to human-readable string matching current CLI/service format - pub fn to_display_string(&self) -> String { - match self { - BuildRequestStatusCode::BuildRequestUnknown => "unknown".to_string(), - BuildRequestStatusCode::BuildRequestReceived => "received".to_string(), - BuildRequestStatusCode::BuildRequestPlanning => "planning".to_string(), - BuildRequestStatusCode::BuildRequestAnalysisCompleted => "analysis_completed".to_string(), - BuildRequestStatusCode::BuildRequestExecuting => "executing".to_string(), - BuildRequestStatusCode::BuildRequestCompleted => "completed".to_string(), - BuildRequestStatusCode::BuildRequestFailed => "failed".to_string(), - BuildRequestStatusCode::BuildRequestCancelled => "cancelled".to_string(), - &BuildRequestStatusCode::BuildRequestPreconditionFailed => "precondition failed".to_string(), - } - } - - /// Parse a display string back to enum - pub fn from_display_string(s: &str) -> Option { - match s { - "unknown" => Some(BuildRequestStatusCode::BuildRequestUnknown), - "received" => Some(BuildRequestStatusCode::BuildRequestReceived), - "planning" => Some(BuildRequestStatusCode::BuildRequestPlanning), - "analysis_completed" => Some(BuildRequestStatusCode::BuildRequestAnalysisCompleted), - "executing" => Some(BuildRequestStatusCode::BuildRequestExecuting), - "completed" => Some(BuildRequestStatusCode::BuildRequestCompleted), - "failed" => Some(BuildRequestStatusCode::BuildRequestFailed), - "cancelled" => Some(BuildRequestStatusCode::BuildRequestCancelled), - "precondition failed" => Some(BuildRequestStatusCode::BuildRequestPreconditionFailed), - _ => None, - } - } - - pub fn status(&self) -> BuildRequestStatus { - BuildRequestStatus { - code: self.clone().into(), - name: self.to_display_string(), - } - } -} - -impl DepType { - /// Convert dependency type to human-readable string - pub fn to_display_string(&self) -> String { - match self { - DepType::Query => "query".to_string(), - DepType::Materialize => "materialize".to_string(), - } - } - - /// Parse a display string back to enum - pub fn from_display_string(s: &str) -> Option { - match s { - "query" => Some(DepType::Query), - "materialize" => Some(DepType::Materialize), - _ => None, - } - } -} - -/// Helper functions for creating protobuf list responses with dual status fields -pub mod list_response_helpers { - use super::*; - - /// Create a PartitionSummary from repository data - pub fn create_partition_summary( - partition_ref: PartitionRef, - status: PartitionStatus, - last_updated: i64, - builds_count: usize, - invalidation_count: usize, - last_successful_build: Option, - ) -> PartitionSummary { - PartitionSummary { - partition_ref: Some(partition_ref), - status_code: status as i32, - status_name: status.to_display_string(), - last_updated, - builds_count: builds_count as u32, - invalidation_count: invalidation_count as u32, - last_successful_build, - } - } - - /// Create a JobSummary from repository data - pub fn create_job_summary( - job_label: String, - total_runs: usize, - successful_runs: usize, - failed_runs: usize, - cancelled_runs: usize, - average_partitions_per_run: f64, - last_run_timestamp: i64, - last_run_status: JobStatus, - recent_builds: Vec, - ) -> JobSummary { - JobSummary { - job_label, - total_runs: total_runs as u32, - successful_runs: successful_runs as u32, - failed_runs: failed_runs as u32, - cancelled_runs: cancelled_runs as u32, - average_partitions_per_run, - last_run_timestamp, - last_run_status_code: last_run_status as i32, - last_run_status_name: last_run_status.to_display_string(), - recent_builds, - } - } - - /// Create a TaskSummary from repository data - pub fn create_task_summary( - job_run_id: String, - job_label: String, - build_request_id: String, - status: JobStatus, - target_partitions: Vec, - scheduled_at: i64, - started_at: Option, - completed_at: Option, - duration_ms: Option, - cancelled: bool, - message: String, - ) -> JobRunSummary { - JobRunSummary { - job_run_id, - job_label, - build_request_id, - status_code: status as i32, - status_name: status.to_display_string(), - target_partitions, - scheduled_at, - started_at, - completed_at, - duration_ms, - cancelled, - message, - } - } - - /// Create a BuildSummary from repository data - pub fn create_build_summary( - build_request_id: String, - status: BuildRequestStatus, - requested_partitions: Vec, - total_jobs: usize, - completed_jobs: usize, - failed_jobs: usize, - cancelled_jobs: usize, - requested_at: i64, - started_at: Option, - completed_at: Option, - duration_ms: Option, - cancelled: bool, - comment: Option, - ) -> BuildSummary { - BuildSummary { - build_request_id, - status: Some(status), - requested_partitions, - total_jobs: total_jobs as u32, - completed_jobs: completed_jobs as u32, - failed_jobs: failed_jobs as u32, - cancelled_jobs: cancelled_jobs as u32, - requested_at, - started_at, - completed_at, - duration_ms, - cancelled, - comment, - } - } - - /// Create a DataDep with dual fields from repository data - pub fn create_data_dep( - dep_type: DepType, - partition_ref: PartitionRef, - ) -> DataDep { - DataDep { - dep_type_code: dep_type as i32, - dep_type_name: dep_type.to_display_string(), - partition_ref: Some(partition_ref), - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_partition_status_conversions() { - let status = PartitionStatus::PartitionAvailable; - assert_eq!(status.to_display_string(), "available"); - assert_eq!(PartitionStatus::from_display_string("available"), Some(status)); - } - - #[test] - fn test_job_status_conversions() { - let status = JobStatus::JobCompleted; - assert_eq!(status.to_display_string(), "completed"); - assert_eq!(JobStatus::from_display_string("completed"), Some(status)); - } - - #[test] - fn test_build_request_status_conversions() { - let status = BuildRequestStatusCode::BuildRequestCompleted.status(); - assert_eq!(status.name, "completed"); - } - - #[test] - fn test_dep_type_conversions() { - let dep_type = DepType::Materialize; - assert_eq!(dep_type.to_display_string(), "materialize"); - assert_eq!(DepType::from_display_string("materialize"), Some(dep_type)); - - let dep_type = DepType::Query; - assert_eq!(dep_type.to_display_string(), "query"); - assert_eq!(DepType::from_display_string("query"), Some(dep_type)); - } - - #[test] - fn test_invalid_display_string() { - assert_eq!(PartitionStatus::from_display_string("invalid"), None); - assert_eq!(JobStatus::from_display_string("invalid"), None); - assert_eq!(BuildRequestStatusCode::from_display_string("invalid"), None); - assert_eq!(DepType::from_display_string("invalid"), None); - } -} \ No newline at end of file diff --git a/databuild/test/BUILD.bazel b/databuild/test/BUILD.bazel deleted file mode 100644 index 58c6886..0000000 --- a/databuild/test/BUILD.bazel +++ /dev/null @@ -1,61 +0,0 @@ -load("@rules_proto//proto:defs.bzl", "proto_library") -load("@rules_rust//rust:defs.bzl", "rust_test") - -# Test the databuild generation -rust_test( - name = "databuild_test", - srcs = [ - "databuild_test.rs", - "//databuild:generate_databuild_rust", - ], - edition = "2021", - deps = [ - "@crates//:prost", - "@crates//:schemars", - "@crates//:serde", - "@crates//:serde_json", - ], -) - -# Generate Rust code for simple proto using prost generator -genrule( - name = "generate_simple_rust", - srcs = ["simple.proto"], - outs = ["simple.rs"], - cmd = "PROTOC=$(location @com_google_protobuf//:protoc) $(location //databuild:prost_generator) $(location simple.proto) /dev/null $@", - tools = [ - "//databuild:prost_generator", - "@com_google_protobuf//:protoc", - ], -) - -# Simple proto for testing -proto_library( - name = "simple_proto", - srcs = ["simple.proto"], - visibility = ["//visibility:public"], -) - -# Test the simple generation -rust_test( - name = "simple_test", - srcs = [ - "simple_test.rs", - ":generate_simple_rust", - ], - edition = "2021", - deps = [ - "@crates//:prost", - "@crates//:schemars", - "@crates//:serde", - "@crates//:serde_json", - ], -) - -py_test( - name = "py_proto_test", - srcs = ["py_proto_test.py"], - deps = [ - "//databuild:py_proto", - ], -) diff --git a/databuild/test/app/BUILD.bazel b/databuild/test/app/BUILD.bazel deleted file mode 100644 index 705dd61..0000000 --- a/databuild/test/app/BUILD.bazel +++ /dev/null @@ -1,15 +0,0 @@ -py_library( - name = "job_src", - srcs = glob(["**/*.py"], exclude=["e2e_test_common.py"]), - visibility = ["//visibility:public"], - deps = [ - "//databuild:py_proto", - "//databuild/dsl/python:dsl", - ], -) - -py_library( - name = "e2e_test_common", - srcs = ["e2e_test_common.py"], - visibility = ["//visibility:public"], -) diff --git a/databuild/test/app/README.md b/databuild/test/app/README.md deleted file mode 100644 index 35aaa4b..0000000 --- a/databuild/test/app/README.md +++ /dev/null @@ -1,34 +0,0 @@ - -# Test DataBuild App - -This directory contains common job components for testing databuild apps described via different methods, e.g. the core bazel targets, the python DSL, etc. - -## Structure - -The fictitious use case is "daily color votes". The underlying input data is votes per color per day, which we combine and aggregate in ways that help us test different aspects of databuild. Job exec contents should be trivial, as the purpose is to test composition. Types of partition relationships: - -- Time-range: 1 day depending on N prior days -- Multi-partition-output jobs - - Always output multiple, e.g. producing per type - - Consume different inputs based on desired output - - Produce multiple of the same type depending on input - -```mermaid -flowchart TD - daily_color_votes[(daily_color_votes/$date/$color)] - color_votes_1w[(color_votes_1w/$date/$color)] - color_votes_1m[(color_votes_1m/$date/$color)] - daily_votes[(daily_votes/$date)] - votes_1w[(votes_1w/$date)] - votes_1m[(votes_1m/$date)] - color_vote_report[(color_vote_report/$date/$color)] - ingest_color_votes --> daily_color_votes - daily_color_votes --> trailing_color_votes --> color_votes_1w & color_votes_1m - daily_color_votes --> aggregate_color_votes --> daily_votes - color_votes_1w --> aggregate_color_votes --> votes_1w - color_votes_1m --> aggregate_color_votes --> votes_1m - daily_votes & votes_1w & votes_1m & color_votes_1w & color_votes_1m --> color_vote_report_calc --> color_vote_report -``` - -## Data Access -Data access is implemented in [`dal.py`](./dal.py), with data written as lists of dicts in JSON. Partition fields are stored as values in those dicts. diff --git a/databuild/test/app/bazel/BUILD.bazel b/databuild/test/app/bazel/BUILD.bazel deleted file mode 100644 index 08fa7b0..0000000 --- a/databuild/test/app/bazel/BUILD.bazel +++ /dev/null @@ -1,157 +0,0 @@ -load("//databuild:rules.bzl", "databuild_graph", "databuild_job") - -py_library( - name = "job_src", - srcs = glob(["**/*.py"]), - visibility = ["//visibility:public"], - deps = [ - "//databuild:py_proto", - "//databuild/dsl/python:dsl", - ], -) - -# Tests -py_test( - name = "test_trailing_color_votes", - srcs = ["jobs/trailing_color_votes/test.py"], - main = "jobs/trailing_color_votes/test.py", - deps = [ - ":job_src", - "//databuild/test/app:job_src", - ], -) - -py_test( - name = "test_ingest_color_votes", - srcs = ["jobs/ingest_color_votes/test.py"], - main = "jobs/ingest_color_votes/test.py", - deps = [ - ":job_src", - "//databuild/test/app:job_src", - ], -) - -py_test( - name = "test_aggregate_color_votes", - srcs = ["jobs/aggregate_color_votes/test.py"], - main = "jobs/aggregate_color_votes/test.py", - deps = [ - ":job_src", - "//databuild/test/app:job_src", - ], -) - -py_test( - name = "test_color_vote_report_calc", - srcs = ["jobs/color_vote_report_calc/test.py"], - main = "jobs/color_vote_report_calc/test.py", - deps = [ - ":job_src", - "//databuild/test/app:job_src", - ], -) - -py_test( - name = "test_graph_analysis", - srcs = ["graph/graph_test.py"], - data = [ - ":bazel_graph.analyze", - ":bazel_graph_lookup", - ], - main = "graph/graph_test.py", - deps = [ - ":job_src", - "//databuild/test/app:job_src", - ], -) - -py_test( - name = "test_e2e", - srcs = ["test_e2e.py"], - data = [":bazel_graph.build"], - main = "test_e2e.py", - deps = ["//databuild/test/app:e2e_test_common"], -) - -# Bazel-defined -## Graph -databuild_graph( - name = "bazel_graph", - jobs = [ - ":ingest_color_votes", - ":trailing_color_votes", - ":aggregate_color_votes", - ":color_vote_report_calc", - ], - lookup = ":bazel_graph_lookup", -) - -py_binary( - name = "bazel_graph_lookup", - srcs = ["graph/lookup.py"], - main = "graph/lookup.py", -) - -## Ingest Color Votes -databuild_job( - name = "ingest_color_votes", - binary = ":ingest_color_votes_binary", -) - -py_binary( - name = "ingest_color_votes_binary", - srcs = ["jobs/ingest_color_votes/main.py"], - main = "jobs/ingest_color_votes/main.py", - deps = [ - ":job_src", - "//databuild/test/app:job_src", - ], -) - -## Trailing Color Votes -databuild_job( - name = "trailing_color_votes", - binary = ":trailing_color_votes_binary", -) - -py_binary( - name = "trailing_color_votes_binary", - srcs = ["jobs/trailing_color_votes/main.py"], - main = "jobs/trailing_color_votes/main.py", - deps = [ - ":job_src", - "//databuild/test/app:job_src", - ], -) - -## Aggregate Color Votes -databuild_job( - name = "aggregate_color_votes", - binary = ":aggregate_color_votes_binary", -) - -py_binary( - name = "aggregate_color_votes_binary", - srcs = ["jobs/aggregate_color_votes/main.py"], - main = "jobs/aggregate_color_votes/main.py", - deps = [ - ":job_src", - "//databuild/test/app:job_src", - ], -) - -## Color Vote Report Calc -databuild_job( - name = "color_vote_report_calc", - binary = ":color_vote_report_calc_binary", -) - -py_binary( - name = "color_vote_report_calc_binary", - srcs = ["jobs/color_vote_report_calc/main.py"], - main = "jobs/color_vote_report_calc/main.py", - deps = [ - ":job_src", - "//databuild/test/app:job_src", - ], -) diff --git a/databuild/test/app/bazel/README.md b/databuild/test/app/bazel/README.md deleted file mode 100644 index 90e2433..0000000 --- a/databuild/test/app/bazel/README.md +++ /dev/null @@ -1,4 +0,0 @@ - -# Bazel-Based Graph Definition - -The bazel-based graph definition relies on declaring `databuild_job` and `databuild_graph` targets which reference binaries. diff --git a/databuild/test/app/bazel/graph/graph_test.py b/databuild/test/app/bazel/graph/graph_test.py deleted file mode 100644 index 2fafc0d..0000000 --- a/databuild/test/app/bazel/graph/graph_test.py +++ /dev/null @@ -1,91 +0,0 @@ -#!/usr/bin/env python3 -""" -Integration test for the databuild graph analysis. - -This test verifies that when we request color vote reports, the graph analyzer -correctly identifies all upstream dependencies and jobs required. -""" - -import subprocess -import json -import unittest -import os -from pathlib import Path - - -class GraphAnalysisTest(unittest.TestCase): - def setUp(self): - # Determine the path to bazel_graph.analyze - # In bazel test, we need to find the executable in the runfiles - runfiles_dir = os.environ.get('RUNFILES_DIR') - test_srcdir = os.environ.get('TEST_SRCDIR') - - possible_paths = [] - if runfiles_dir: - possible_paths.append(os.path.join(runfiles_dir, '_main', 'databuild', 'test', 'app', 'bazel_graph.analyze')) - possible_paths.append(os.path.join(runfiles_dir, 'databuild', 'test', 'app', 'bazel_graph.analyze')) - - if test_srcdir: - possible_paths.append(os.path.join(test_srcdir, '_main', 'databuild', 'test', 'app', 'bazel_graph.analyze')) - possible_paths.append(os.path.join(test_srcdir, 'databuild', 'test', 'app', 'bazel_graph.analyze')) - - # Fallback for local testing - possible_paths.extend([ - 'bazel-bin/databuild/test/app/bazel_graph.analyze', - './bazel_graph.analyze' - ]) - - self.graph_analyze = None - for path in possible_paths: - if os.path.exists(path): - self.graph_analyze = path - break - - # Ensure the executable exists - if not self.graph_analyze: - self.skipTest(f"Graph analyze executable not found in any of these paths: {possible_paths}") - - def run_graph_analyze(self, partition_refs): - """Run graph.analyze with the given partition references.""" - cmd = [self.graph_analyze] + partition_refs - result = subprocess.run(cmd, capture_output=True, text=True, cwd=os.getcwd()) - - if result.returncode != 0: - self.fail(f"Graph analyze failed with return code {result.returncode}.\nStdout: {result.stdout}\nStderr: {result.stderr}") - - # Parse the JSON output - try: - return json.loads(result.stdout) - except json.JSONDecodeError as e: - self.fail(f"Failed to parse JSON output: {e}\nOutput: {result.stdout}") - - def test_single_color_report_dependencies(self): - """Test dependencies for a single color vote report.""" - partition_refs = ["color_vote_report/2024-01-15/red"] - result = self.run_graph_analyze(partition_refs) - self.assertIn('nodes', result) - # TODO expand - - def test_multiple_color_reports_same_date(self): - """Test dependencies when requesting multiple colors for the same date.""" - partition_refs = [ - "color_vote_report/2024-01-15/red", - "color_vote_report/2024-01-15/blue" - ] - result = self.run_graph_analyze(partition_refs) - self.assertIn('nodes', result) - # TODO expand - - def test_multiple_dates_dependencies(self): - """Test dependencies when requesting reports for different dates.""" - partition_refs = [ - "color_vote_report/2024-01-15/red", - "color_vote_report/2024-01-16/red" - ] - result = self.run_graph_analyze(partition_refs) - self.assertIn('nodes', result) - # TODO expand - - -if __name__ == '__main__': - unittest.main() \ No newline at end of file diff --git a/databuild/test/app/bazel/graph/lookup.py b/databuild/test/app/bazel/graph/lookup.py deleted file mode 100644 index 5c548e9..0000000 --- a/databuild/test/app/bazel/graph/lookup.py +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env python3 - -from collections import defaultdict -import sys -import json - -LABEL_BASE = "//databuild/test/app/bazel" - - -def lookup(raw_ref: str): - if raw_ref.startswith("daily_color_votes"): - return LABEL_BASE + ":ingest_color_votes" - elif raw_ref.startswith("color_votes_1"): - return LABEL_BASE + ":trailing_color_votes" - elif raw_ref.startswith("daily_votes") or raw_ref.startswith("votes_1w") or raw_ref.startswith("votes_1m"): - return LABEL_BASE + ":aggregate_color_votes" - elif raw_ref.startswith("color_vote_report"): - return LABEL_BASE + ":color_vote_report_calc" - else: - raise ValueError(f"Unable to resolve job for partition: `{raw_ref}`") - - -if __name__ == "__main__": - results = defaultdict(list) - for raw_ref in sys.argv[1:]: - results[lookup(raw_ref)].append(raw_ref) - - # Output the results as JSON - print(json.dumps(dict(results))) diff --git a/databuild/test/app/bazel/graph/test.py b/databuild/test/app/bazel/graph/test.py deleted file mode 100644 index e69de29..0000000 diff --git a/databuild/test/app/bazel/jobs/aggregate_color_votes/README.md b/databuild/test/app/bazel/jobs/aggregate_color_votes/README.md deleted file mode 120000 index c5d6fcd..0000000 --- a/databuild/test/app/bazel/jobs/aggregate_color_votes/README.md +++ /dev/null @@ -1 +0,0 @@ -jobs/aggregate_color_votes/README.md \ No newline at end of file diff --git a/databuild/test/app/bazel/jobs/aggregate_color_votes/config.py b/databuild/test/app/bazel/jobs/aggregate_color_votes/config.py deleted file mode 100644 index a597da2..0000000 --- a/databuild/test/app/bazel/jobs/aggregate_color_votes/config.py +++ /dev/null @@ -1,42 +0,0 @@ -from databuild.proto import PartitionRef, JobConfigureResponse, JobConfig, DepType, DataDep -from databuild.test.app.colors import COLORS -from datetime import date - -def configure(outputs: list[PartitionRef]) -> JobConfigureResponse: - configs = [] - - for output in outputs: - parts = output.str.split("/") - if len(parts) == 2: - output_type, data_date = parts - date.fromisoformat(data_date) # Validate date format - - # Determine input type based on output type - if output_type == "daily_votes": - input_prefix = "daily_color_votes" - elif output_type == "votes_1w": - input_prefix = "color_votes_1w" - elif output_type == "votes_1m": - input_prefix = "color_votes_1m" - else: - raise ValueError(f"Unknown output type: {output_type}") - - # Create inputs for all colors - inputs = [] - for color in COLORS: - input_ref = PartitionRef(str=f"{input_prefix}/{data_date}/{color}") - inputs.append(input_ref) - - configs.append(JobConfig( - outputs=[output], - inputs=[DataDep(dep_type_code=DepType.MATERIALIZE, dep_type_name="materialize", partition_ref=ref) for ref in inputs], - args=[], - env={ - "DATA_DATE": data_date, - "AGGREGATE_TYPE": output_type - } - )) - else: - raise ValueError(f"Invalid output partition format: {output.str}") - - return JobConfigureResponse(configs=configs) \ No newline at end of file diff --git a/databuild/test/app/bazel/jobs/aggregate_color_votes/main.py b/databuild/test/app/bazel/jobs/aggregate_color_votes/main.py deleted file mode 100644 index 1053e80..0000000 --- a/databuild/test/app/bazel/jobs/aggregate_color_votes/main.py +++ /dev/null @@ -1,20 +0,0 @@ -"""Main entrypoint for the aggregate_color_votes job for use with bazel-defined graph.""" - -import sys -import os -import json -from databuild.proto import PartitionRef, to_dict -from databuild.test.app.bazel.jobs.aggregate_color_votes.config import configure -from databuild.test.app.jobs.aggregate_color_votes.execute import execute - -if __name__ == "__main__": - if sys.argv[1] == "config": - response = configure([ - PartitionRef(str=raw_ref) - for raw_ref in sys.argv[2:] - ]) - print(json.dumps(to_dict(response))) - elif sys.argv[1] == "exec": - execute(os.environ["DATA_DATE"], os.environ["AGGREGATE_TYPE"]) - else: - raise Exception(f"Invalid command `{sys.argv[1]}`") \ No newline at end of file diff --git a/databuild/test/app/bazel/jobs/aggregate_color_votes/test.py b/databuild/test/app/bazel/jobs/aggregate_color_votes/test.py deleted file mode 100644 index 96b332b..0000000 --- a/databuild/test/app/bazel/jobs/aggregate_color_votes/test.py +++ /dev/null @@ -1,59 +0,0 @@ -import unittest -from databuild.proto import PartitionRef -from databuild.test.app.bazel.jobs.aggregate_color_votes.config import configure -from databuild.test.app.colors import COLORS - -class TestAggregateColorVotesConfig(unittest.TestCase): - def test_configure_daily_votes(self): - outputs = [PartitionRef(str="daily_votes/2024-01-15")] - response = configure(outputs) - - self.assertEqual(len(response.configs), 1) - config = response.configs[0] - self.assertEqual(len(config.outputs), 1) - self.assertEqual(len(config.inputs), len(COLORS)) # One input per color - self.assertEqual(config.env["AGGREGATE_TYPE"], "daily_votes") - self.assertEqual(config.env["DATA_DATE"], "2024-01-15") - - # Check that inputs are from daily_color_votes - for i, color in enumerate(COLORS): - expected_input = f"daily_color_votes/2024-01-15/{color}" - self.assertEqual(config.inputs[i].partition_ref.str, expected_input) - - def test_configure_weekly_votes(self): - outputs = [PartitionRef(str="votes_1w/2024-01-21")] - response = configure(outputs) - - self.assertEqual(len(response.configs), 1) - config = response.configs[0] - self.assertEqual(config.env["AGGREGATE_TYPE"], "votes_1w") - - # Check that inputs are from color_votes_1w - for i, color in enumerate(COLORS): - expected_input = f"color_votes_1w/2024-01-21/{color}" - self.assertEqual(config.inputs[i].partition_ref.str, expected_input) - - def test_configure_monthly_votes(self): - outputs = [PartitionRef(str="votes_1m/2024-01-31")] - response = configure(outputs) - - self.assertEqual(len(response.configs), 1) - config = response.configs[0] - self.assertEqual(config.env["AGGREGATE_TYPE"], "votes_1m") - - # Check that inputs are from color_votes_1m - for i, color in enumerate(COLORS): - expected_input = f"color_votes_1m/2024-01-31/{color}" - self.assertEqual(config.inputs[i].partition_ref.str, expected_input) - - def test_configure_multiple_outputs(self): - outputs = [ - PartitionRef(str="daily_votes/2024-01-15"), - PartitionRef(str="votes_1w/2024-01-21") - ] - response = configure(outputs) - - self.assertEqual(len(response.configs), 2) # One config per output - -if __name__ == "__main__": - unittest.main() \ No newline at end of file diff --git a/databuild/test/app/bazel/jobs/color_vote_report_calc/README.md b/databuild/test/app/bazel/jobs/color_vote_report_calc/README.md deleted file mode 120000 index 7128a82..0000000 --- a/databuild/test/app/bazel/jobs/color_vote_report_calc/README.md +++ /dev/null @@ -1 +0,0 @@ -jobs/color_vote_report_calc/README.md \ No newline at end of file diff --git a/databuild/test/app/bazel/jobs/color_vote_report_calc/config.py b/databuild/test/app/bazel/jobs/color_vote_report_calc/config.py deleted file mode 100644 index 2ab8245..0000000 --- a/databuild/test/app/bazel/jobs/color_vote_report_calc/config.py +++ /dev/null @@ -1,48 +0,0 @@ -from databuild.proto import PartitionRef, JobConfigureResponse, JobConfig, DataDep, DepType -from datetime import date -from collections import defaultdict - -def configure(outputs: list[PartitionRef]) -> JobConfigureResponse: - # This job produces a single job config that handles all requested outputs - all_dates = set() - all_colors = set() - - for output in outputs: - parts = output.str.split("/") - if len(parts) == 3 and parts[0] == "color_vote_report": - prefix, data_date, color = parts - date.fromisoformat(data_date) # Validate date format - all_dates.add(data_date) - all_colors.add(color) - else: - raise ValueError(f"Invalid output partition format: {output.str}") - - # Build inputs for all dates and colors that are actually requested - inputs = [] - - # Add total vote aggregates for all dates - for data_date in all_dates: - inputs.extend([ - PartitionRef(str=f"daily_votes/{data_date}"), - PartitionRef(str=f"votes_1w/{data_date}"), - PartitionRef(str=f"votes_1m/{data_date}") - ]) - - # Add color-specific inputs for all date/color combinations that are requested - for output in outputs: - data_date, color = output.str.split("/")[1], output.str.split("/")[2] - inputs.extend([ - PartitionRef(str=f"daily_color_votes/{data_date}/{color}"), - PartitionRef(str=f"color_votes_1w/{data_date}/{color}"), - PartitionRef(str=f"color_votes_1m/{data_date}/{color}") - ]) - - # Single job config for all outputs - pass output partition refs as args - config = JobConfig( - outputs=outputs, - inputs=[DataDep(dep_type_code=DepType.MATERIALIZE, dep_type_name="materialize", partition_ref=ref) for ref in inputs], - args=[output.str for output in outputs], - env={} - ) - - return JobConfigureResponse(configs=[config]) \ No newline at end of file diff --git a/databuild/test/app/bazel/jobs/color_vote_report_calc/main.py b/databuild/test/app/bazel/jobs/color_vote_report_calc/main.py deleted file mode 100644 index 60aaa43..0000000 --- a/databuild/test/app/bazel/jobs/color_vote_report_calc/main.py +++ /dev/null @@ -1,20 +0,0 @@ -"""Main entrypoint for the color_vote_report_calc job for use with bazel-defined graph.""" - -import sys -import os -import json -from databuild.proto import PartitionRef, to_dict -from databuild.test.app.bazel.jobs.color_vote_report_calc.config import configure -from databuild.test.app.jobs.color_vote_report_calc.execute import execute - -if __name__ == "__main__": - if sys.argv[1] == "config": - response = configure([ - PartitionRef(str=raw_ref) - for raw_ref in sys.argv[2:] - ]) - print(json.dumps(to_dict(response))) - elif sys.argv[1] == "exec": - execute(sys.argv[2:]) - else: - raise Exception(f"Invalid command `{sys.argv[1]}`") \ No newline at end of file diff --git a/databuild/test/app/bazel/jobs/color_vote_report_calc/test.py b/databuild/test/app/bazel/jobs/color_vote_report_calc/test.py deleted file mode 100644 index 978ed77..0000000 --- a/databuild/test/app/bazel/jobs/color_vote_report_calc/test.py +++ /dev/null @@ -1,60 +0,0 @@ -import unittest -from databuild.proto import PartitionRef -from databuild.test.app.bazel.jobs.color_vote_report_calc.config import configure - -class TestColorVoteReportCalcConfig(unittest.TestCase): - def test_configure_single_output(self): - outputs = [PartitionRef(str="color_vote_report/2024-01-15/red")] - response = configure(outputs) - - self.assertEqual(len(response.configs), 1) # Always single config - config = response.configs[0] - self.assertEqual(len(config.outputs), 1) - self.assertEqual(config.args, ["color_vote_report/2024-01-15/red"]) - - # Should have inputs for total votes and color-specific votes - expected_inputs = [ - "daily_votes/2024-01-15", - "votes_1w/2024-01-15", - "votes_1m/2024-01-15", - "daily_color_votes/2024-01-15/red", - "color_votes_1w/2024-01-15/red", - "color_votes_1m/2024-01-15/red" - ] - actual_inputs = [inp.partition_ref.str for inp in config.inputs] - for expected in expected_inputs: - self.assertIn(expected, actual_inputs) - - def test_configure_multiple_outputs_same_date(self): - outputs = [ - PartitionRef(str="color_vote_report/2024-01-15/red"), - PartitionRef(str="color_vote_report/2024-01-15/blue") - ] - response = configure(outputs) - - self.assertEqual(len(response.configs), 1) # Single config for all outputs - config = response.configs[0] - self.assertEqual(len(config.outputs), 2) - self.assertEqual(set(config.args), { - "color_vote_report/2024-01-15/red", - "color_vote_report/2024-01-15/blue" - }) - - def test_configure_multiple_dates(self): - outputs = [ - PartitionRef(str="color_vote_report/2024-01-15/red"), - PartitionRef(str="color_vote_report/2024-01-16/red") - ] - response = configure(outputs) - - self.assertEqual(len(response.configs), 1) # Single config for all outputs - config = response.configs[0] - self.assertEqual(len(config.outputs), 2) - - # Should have total vote inputs for both dates - actual_inputs = [inp.partition_ref.str for inp in config.inputs] - self.assertIn("daily_votes/2024-01-15", actual_inputs) - self.assertIn("daily_votes/2024-01-16", actual_inputs) - -if __name__ == "__main__": - unittest.main() \ No newline at end of file diff --git a/databuild/test/app/bazel/jobs/ingest_color_votes/README.md b/databuild/test/app/bazel/jobs/ingest_color_votes/README.md deleted file mode 120000 index 58d5aca..0000000 --- a/databuild/test/app/bazel/jobs/ingest_color_votes/README.md +++ /dev/null @@ -1 +0,0 @@ -jobs/ingest_color_votes/README.md \ No newline at end of file diff --git a/databuild/test/app/bazel/jobs/ingest_color_votes/config.py b/databuild/test/app/bazel/jobs/ingest_color_votes/config.py deleted file mode 100644 index cea5008..0000000 --- a/databuild/test/app/bazel/jobs/ingest_color_votes/config.py +++ /dev/null @@ -1,13 +0,0 @@ -from databuild.proto import PartitionRef, JobConfigureResponse, JobConfig - -from datetime import date - - -def configure(outputs: list[PartitionRef]) -> JobConfigureResponse: - configs = [] - for output in outputs: - prefix, data_date, color = output.str.split("/") - date.fromisoformat(data_date) # Should be able to parse date - assert prefix == "daily_color_votes" - configs.append(JobConfig(outputs = [output], inputs=[], args=[], env={"DATA_DATE": data_date, "COLOR": color})) - return JobConfigureResponse(configs=configs) diff --git a/databuild/test/app/bazel/jobs/ingest_color_votes/main.py b/databuild/test/app/bazel/jobs/ingest_color_votes/main.py deleted file mode 100644 index 888dbec..0000000 --- a/databuild/test/app/bazel/jobs/ingest_color_votes/main.py +++ /dev/null @@ -1,20 +0,0 @@ -"""Main entrypoint for the ingest_color_votes job for use with bazel-defined graph.""" - -import sys -import os -import json -from databuild.proto import PartitionRef, to_dict -from databuild.test.app.bazel.jobs.ingest_color_votes.config import configure -from databuild.test.app.jobs.ingest_color_votes.execute import execute - -if __name__ == "__main__": - if sys.argv[1] == "config": - response = configure([ - PartitionRef(str=raw_ref) - for raw_ref in sys.argv[2:] - ]) - print(json.dumps(to_dict(response))) - elif sys.argv[1] == "exec": - execute(os.environ["DATA_DATE"], os.environ["COLOR"]) - else: - raise Exception(f"Invalid command `{sys.argv[1]}`") diff --git a/databuild/test/app/bazel/jobs/ingest_color_votes/test.py b/databuild/test/app/bazel/jobs/ingest_color_votes/test.py deleted file mode 100644 index 91c364d..0000000 --- a/databuild/test/app/bazel/jobs/ingest_color_votes/test.py +++ /dev/null @@ -1,32 +0,0 @@ -from databuild.test.app.bazel.jobs.ingest_color_votes.config import configure -from databuild.proto import PartitionRef - - -def test_ingest_color_votes_configure(): - refs_single = [PartitionRef(str="daily_color_votes/2025-01-01/red")] - config_single = configure(refs_single) - assert len(config_single.configs) == 1 - assert config_single.configs[0].outputs[0].str == "daily_color_votes/2025-01-01/red" - assert config_single.configs[0].env["COLOR"] == "red" - assert config_single.configs[0].env["DATA_DATE"] == "2025-01-01" - - refs_multiple = [ - PartitionRef(str="daily_color_votes/2025-01-02/red"), - PartitionRef(str="daily_color_votes/2025-01-02/blue"), - ] - - config_multiple = configure(refs_multiple) - assert len(config_multiple.configs) == 2 - assert len(config_multiple.configs[0].outputs) == 1 - assert config_multiple.configs[0].outputs[0].str == "daily_color_votes/2025-01-02/red" - assert config_multiple.configs[0].env["COLOR"] == "red" - assert config_multiple.configs[0].env["DATA_DATE"] == "2025-01-02" - assert len(config_multiple.configs[1].outputs) == 1 - assert config_multiple.configs[1].outputs[0].str == "daily_color_votes/2025-01-02/blue" - assert config_multiple.configs[1].env["COLOR"] == "blue" - assert config_multiple.configs[1].env["DATA_DATE"] == "2025-01-02" - - -if __name__ == '__main__': - import pytest - raise SystemExit(pytest.main([__file__])) diff --git a/databuild/test/app/bazel/jobs/trailing_color_votes/README.md b/databuild/test/app/bazel/jobs/trailing_color_votes/README.md deleted file mode 120000 index d3e0cf7..0000000 --- a/databuild/test/app/bazel/jobs/trailing_color_votes/README.md +++ /dev/null @@ -1 +0,0 @@ -jobs/trailing_color_votes/README.md \ No newline at end of file diff --git a/databuild/test/app/bazel/jobs/trailing_color_votes/config.py b/databuild/test/app/bazel/jobs/trailing_color_votes/config.py deleted file mode 100644 index 12930c4..0000000 --- a/databuild/test/app/bazel/jobs/trailing_color_votes/config.py +++ /dev/null @@ -1,46 +0,0 @@ -from databuild.proto import PartitionRef, JobConfigureResponse, JobConfig, DepType, DataDep -from datetime import date, timedelta -from collections import defaultdict - -def configure(outputs: list[PartitionRef]) -> JobConfigureResponse: - # Group outputs by date and color - grouped_outputs = defaultdict(list) - - for output in outputs: - parts = output.str.split("/") - if len(parts) == 3 and parts[0] in ["color_votes_1w", "color_votes_1m"]: - grouped_outputs[tuple(parts[1:])].append(output) - else: - raise ValueError(f"Invalid output partition format: {output.str}") - - configs = [] - for (data_date, color), output_partitions in grouped_outputs.items(): - # Parse the output date - output_date = date.fromisoformat(data_date) - - # Determine which windows are needed and the maximum window - has_weekly = any(output.str.startswith("color_votes_1w/") for output in output_partitions) - has_monthly = any(output.str.startswith("color_votes_1m/") for output in output_partitions) - max_window = max(7 if has_weekly else 0, 28 if has_monthly else 0) - - # Generate input partition refs for the required trailing window - inputs = [] - for i in range(max_window): - input_date = output_date - timedelta(days=i) - inputs.append(PartitionRef(str=f"daily_color_votes/{input_date.isoformat()}/{color}")) - - env = { - "DATA_DATE": data_date, - "COLOR": color, - "WEEKLY": "true" if has_weekly else "false", - "MONTHLY": "true" if has_monthly else "false" - } - - configs.append(JobConfig( - outputs=output_partitions, - inputs=[DataDep(dep_type_code=DepType.MATERIALIZE, dep_type_name="materialize", partition_ref=ref) for ref in inputs], - args=[], - env=env - )) - - return JobConfigureResponse(configs=configs) \ No newline at end of file diff --git a/databuild/test/app/bazel/jobs/trailing_color_votes/main.py b/databuild/test/app/bazel/jobs/trailing_color_votes/main.py deleted file mode 100644 index e16051d..0000000 --- a/databuild/test/app/bazel/jobs/trailing_color_votes/main.py +++ /dev/null @@ -1,20 +0,0 @@ -"""Main entrypoint for the trailing_color_votes job for use with bazel-defined graph.""" - -import sys -import os -import json -from databuild.proto import PartitionRef, to_dict -from databuild.test.app.bazel.jobs.trailing_color_votes.config import configure -from databuild.test.app.jobs.trailing_color_votes.execute import execute - -if __name__ == "__main__": - if sys.argv[1] == "config": - response = configure([ - PartitionRef(str=raw_ref) - for raw_ref in sys.argv[2:] - ]) - print(json.dumps(to_dict(response))) - elif sys.argv[1] == "exec": - execute(os.environ["DATA_DATE"], os.environ["COLOR"]) - else: - raise Exception(f"Invalid command `{sys.argv[1]}`") \ No newline at end of file diff --git a/databuild/test/app/bazel/jobs/trailing_color_votes/test.py b/databuild/test/app/bazel/jobs/trailing_color_votes/test.py deleted file mode 100644 index aaebed8..0000000 --- a/databuild/test/app/bazel/jobs/trailing_color_votes/test.py +++ /dev/null @@ -1,53 +0,0 @@ -import unittest -from databuild.proto import PartitionRef -from databuild.test.app.bazel.jobs.trailing_color_votes.config import configure - -class TestTrailingColorVotesConfig(unittest.TestCase): - def test_configure_weekly_only(self): - outputs = [PartitionRef(str="color_votes_1w/2024-01-07/red")] - response = configure(outputs) - - self.assertEqual(len(response.configs), 1) - config = response.configs[0] - self.assertEqual(len(config.outputs), 1) - self.assertEqual(len(config.inputs), 7) # 7 days for weekly - self.assertEqual(config.env["WEEKLY"], "true") - self.assertEqual(config.env["MONTHLY"], "false") - - def test_configure_monthly_only(self): - outputs = [PartitionRef(str="color_votes_1m/2024-01-28/blue")] - response = configure(outputs) - - self.assertEqual(len(response.configs), 1) - config = response.configs[0] - self.assertEqual(len(config.outputs), 1) - self.assertEqual(len(config.inputs), 28) # 28 days for monthly - self.assertEqual(config.env["WEEKLY"], "false") - self.assertEqual(config.env["MONTHLY"], "true") - - def test_configure_both_weekly_and_monthly(self): - outputs = [ - PartitionRef(str="color_votes_1w/2024-01-28/green"), - PartitionRef(str="color_votes_1m/2024-01-28/green") - ] - response = configure(outputs) - - self.assertEqual(len(response.configs), 1) # Single config for same date/color - config = response.configs[0] - self.assertEqual(len(config.outputs), 2) # Both outputs - self.assertEqual(len(config.inputs), 28) # 28 days (max of 7 and 28) - self.assertEqual(config.env["WEEKLY"], "true") - self.assertEqual(config.env["MONTHLY"], "true") - - def test_configure_multiple_colors_dates(self): - outputs = [ - PartitionRef(str="color_votes_1w/2024-01-07/red"), - PartitionRef(str="color_votes_1w/2024-01-07/blue"), - PartitionRef(str="color_votes_1m/2024-01-14/red") - ] - response = configure(outputs) - - self.assertEqual(len(response.configs), 3) # One config per unique date/color combination - -if __name__ == "__main__": - unittest.main() \ No newline at end of file diff --git a/databuild/test/app/bazel/test_e2e.py b/databuild/test/app/bazel/test_e2e.py deleted file mode 100644 index 189d2b1..0000000 --- a/databuild/test/app/bazel/test_e2e.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env python3 -""" -End-to-end test for the bazel-defined test app. - -Tests the full pipeline: build execution -> output verification -> JSON validation. -""" - -import os -from databuild.test.app.e2e_test_common import DataBuildE2ETestBase - - -class BazelE2ETest(DataBuildE2ETestBase): - """End-to-end test for the bazel-defined test app.""" - - def test_end_to_end_execution(self): - """Test full end-to-end execution of the bazel graph.""" - # Build possible paths for the bazel graph build binary - possible_paths = self.get_standard_runfiles_paths( - 'databuild/test/app/bazel/bazel_graph.build' - ) - - # Add fallback paths for local testing - possible_paths.extend([ - 'bazel-bin/databuild/test/app/bazel/bazel_graph.build', - './bazel_graph.build' - ]) - - # Find the graph build binary - graph_build_path = self.find_graph_build_binary(possible_paths) - - # Execute and verify the graph build - self.execute_and_verify_graph_build(graph_build_path) - - -if __name__ == '__main__': - import unittest - unittest.main() \ No newline at end of file diff --git a/databuild/test/app/colors.py b/databuild/test/app/colors.py deleted file mode 100644 index abae089..0000000 --- a/databuild/test/app/colors.py +++ /dev/null @@ -1,2 +0,0 @@ - -COLORS = ["red", "blue", "green", "yellow", "cerulean", "cucumber", "sage", "forest"] diff --git a/databuild/test/app/dal.py b/databuild/test/app/dal.py deleted file mode 100644 index 0c102e5..0000000 --- a/databuild/test/app/dal.py +++ /dev/null @@ -1,30 +0,0 @@ - -from databuild.proto import PartitionRef -import json -from pathlib import Path - - -def ref_path(ref: PartitionRef) -> str: - assert isinstance(ref, PartitionRef), f"Wanted PartitionRef, got `{type(ref)}`" - return "/tmp/data/" + ref.str.lstrip("/") + "/data.json" - - -def read(*refs: PartitionRef, empty_ok: bool=True) -> list[dict]: - results = [] - for ref in refs: - try: - with open(ref_path(ref)) as infile: - results.extend(json.load(infile)) - except FileNotFoundError: - if not empty_ok: - raise - return [] - return results - - -def write(ref: PartitionRef, data: list[dict]) -> None: - # mkdirs before writing in case path doesn't exist - path = ref_path(ref) - Path(path.rsplit("/", 1)[0]).mkdir(parents=True, exist_ok=True) - with open(path, "w") as outfile: - json.dump(data, outfile) diff --git a/databuild/test/app/dsl/BUILD.bazel b/databuild/test/app/dsl/BUILD.bazel deleted file mode 100644 index cdfb00b..0000000 --- a/databuild/test/app/dsl/BUILD.bazel +++ /dev/null @@ -1,54 +0,0 @@ -load("@databuild//databuild:rules.bzl", "databuild_dsl_generator") - -py_library( - name = "dsl_src", - srcs = glob( - ["*.py"], - exclude = ["test_*.py"], - ), - visibility = ["//visibility:public"], - deps = [ - "//databuild:py_proto", - "//databuild/dsl/python:dsl", - "//databuild/test/app:job_src", - ], -) - -databuild_dsl_generator( - name = "graph.generate", - graph_file = "graph.py", - graph_attr = "graph", - output_package = "//databuild/test/app/dsl", - deps = [":dsl_src"], - visibility = ["//visibility:public"], -) - -# Generate fresh DSL output for comparison testing -genrule( - name = "generate_fresh_dsl", - outs = ["generated_fresh.tar"], - cmd_bash = """ - # Create temporary directory for generation - mkdir -p temp_workspace/databuild/test/app/dsl - - # Set environment to generate to temp directory - export BUILD_WORKSPACE_DIRECTORY="temp_workspace" - - # Run the generator - $(location :graph.generate) - - # Create tar archive of generated files - if [ -d "temp_workspace/databuild/test/app/dsl/generated" ]; then - find temp_workspace/databuild/test/app/dsl/generated -exec touch -t 197001010000 {} + - tar -cf $@ -C temp_workspace/databuild/test/app/dsl/generated . - else - # Create empty tar if no files generated - tar -cf $@ -T /dev/null - fi - - # Clean up - rm -rf temp_workspace - """, - tools = [":graph.generate"], - visibility = ["//visibility:public"], -) diff --git a/databuild/test/app/dsl/claude-generated-dsl-test.md b/databuild/test/app/dsl/claude-generated-dsl-test.md deleted file mode 100644 index 8683038..0000000 --- a/databuild/test/app/dsl/claude-generated-dsl-test.md +++ /dev/null @@ -1,9 +0,0 @@ - -We can't write a direct `bazel test` for the DSL generated graph, because: - -1. Bazel doesn't allow you to `bazel run graph.generate` to generate a BUILD.bazel that will be used in the same build. -2. We don't want to leak test generation into the graph generation code (since tests here are app specific) - -Instead, we need to use a two phase process, where we rely on the graph to already be generated here, which will contain a test, such that `bazel test //...` will give us recall over generated source as well. This implies that this generated source is going to be checked in to git (gasp, I know), and we need a mechanism to ensure it stays up to date. To achieve this, we'll create a test that asserts that the contents of the `generated` dir is the exact same as the output of a new run of `graph.generate`. - -Our task is to implement this test that asserts equality between the two, e.g. the target could depend on `graph.generate`, and in the test run it and md5 the results, comparing it to the md5 of the existing generated dir. diff --git a/databuild/test/app/dsl/dsl_job_lookup.py b/databuild/test/app/dsl/dsl_job_lookup.py deleted file mode 100755 index 375a425..0000000 --- a/databuild/test/app/dsl/dsl_job_lookup.py +++ /dev/null @@ -1,47 +0,0 @@ -#!/usr/bin/env python3 -""" -Generated job lookup for DataBuild DSL graph. -Maps partition patterns to job targets. -""" - -import sys -import re - - -# Mapping from partition patterns to job targets -JOB_MAPPINGS = { - r"daily_color_votes/(?P\d{4}-\d{2}-\d{2})/(?P[^/]+)": ":ingest_color_votes", - r"color_votes_1m/(?P\d{4}-\d{2}-\d{2})/(?P[^/]+)": ":trailing_color_votes", - r"color_votes_1w/(?P\d{4}-\d{2}-\d{2})/(?P[^/]+)": ":trailing_color_votes", - r"daily_votes/(?P\d{4}-\d{2}-\d{2})": ":aggregate_color_votes", - r"votes_1w/(?P\d{4}-\d{2}-\d{2})": ":aggregate_color_votes", - r"votes_1m/(?P\d{4}-\d{2}-\d{2})": ":aggregate_color_votes", - r"color_vote_report/(?P\d{4}-\d{2}-\d{2})/(?P[^/]+)": ":color_vote_report_calc", -} - - -def lookup_job_for_partition(partition_ref: str) -> str: - """Look up which job can build the given partition reference.""" - for pattern, job_target in JOB_MAPPINGS.items(): - if re.match(pattern, partition_ref): - return job_target - - raise ValueError(f"No job found for partition: {partition_ref}") - - -def main(): - if len(sys.argv) != 2: - print("Usage: job_lookup.py ", file=sys.stderr) - sys.exit(1) - - partition_ref = sys.argv[1] - try: - job_target = lookup_job_for_partition(partition_ref) - print(job_target) - except ValueError as e: - print(f"ERROR: {e}", file=sys.stderr) - sys.exit(1) - - -if __name__ == "__main__": - main() diff --git a/databuild/test/app/dsl/dsl_job_wrapper.py b/databuild/test/app/dsl/dsl_job_wrapper.py deleted file mode 100644 index e87c663..0000000 --- a/databuild/test/app/dsl/dsl_job_wrapper.py +++ /dev/null @@ -1,118 +0,0 @@ -#!/usr/bin/env python3 -""" -Shared DSL job wrapper that can execute any DataBuildJob defined in a DSL graph. -Configured via environment variables: -- DATABUILD_DSL_GRAPH_MODULE: Python module path containing the graph (e.g., 'databuild.test.app.dsl.graph') -- DATABUILD_JOB_CLASS: Job class name to execute (e.g., 'IngestColorVotes') -""" - -import sys -import json -import os -import importlib -from typing import List, Any -from databuild.proto import JobConfig - - -def parse_outputs_from_args(args: List[str], job_class: Any) -> List[Any]: - """Parse partition output references from command line arguments into partition objects.""" - outputs = [] - for arg in args: - # Find which output type can deserialize this partition reference - for output_type in job_class.output_types: - try: - partition = output_type.deserialize(arg) - outputs.append(partition) - break - except ValueError: - continue - else: - raise ValueError(f"No output type in {job_class.__name__} can deserialize partition ref: {arg}") - - return outputs - - -def main(): - if len(sys.argv) < 2: - print("Usage: dsl_job_wrapper.py [args...]", file=sys.stderr) - sys.exit(1) - - command = sys.argv[1] - - # Read configuration from environment - graph_module_path = os.environ.get('DATABUILD_DSL_GRAPH_MODULE') - job_class_name = os.environ.get('DATABUILD_JOB_CLASS') - - if not graph_module_path: - print("ERROR: DATABUILD_DSL_GRAPH_MODULE environment variable not set", file=sys.stderr) - sys.exit(1) - - if not job_class_name: - print("ERROR: DATABUILD_JOB_CLASS environment variable not set", file=sys.stderr) - sys.exit(1) - - try: - # Import the graph module - module = importlib.import_module(graph_module_path) - graph = getattr(module, 'graph') - - # Get the job class - job_class = getattr(module, job_class_name) - - # Create job instance - job_instance = job_class() - - except (ImportError, AttributeError) as e: - print(f"ERROR: Failed to load job {job_class_name} from {graph_module_path}: {e}", file=sys.stderr) - sys.exit(1) - - if command == "config": - try: - # Parse output partition references from remaining args - output_refs = sys.argv[2:] - if not output_refs: - print("ERROR: No output partition references provided", file=sys.stderr) - sys.exit(1) - - outputs = parse_outputs_from_args(output_refs, job_class) - - # Call job's config method - configs = job_instance.config(outputs) - - # Output each config as JSON (one per line for multiple configs) - for config in configs: - # Convert JobConfig to dict for JSON serialization - config_dict = { - 'outputs': [{'str': ref.str} for ref in config.outputs], - 'inputs': [ - { - 'dep_type_code': dep.dep_type_code, - 'dep_type_name': dep.dep_type_name, - 'partition_ref': {'str': dep.partition_ref.str} - } for dep in config.inputs - ], - 'args': config.args, - 'env': config.env, - } - print(json.dumps(config_dict)) - - except Exception as e: - print(f"ERROR: Config failed: {e}", file=sys.stderr) - sys.exit(1) - - elif command == "exec": - try: - # Call job's exec method - job_instance.exec(*sys.argv[2:]) - - except Exception as e: - print(f"ERROR: Execution failed: {e}", file=sys.stderr) - sys.exit(1) - - else: - print(f"ERROR: Unknown command '{command}'. Use 'config' or 'exec'.", file=sys.stderr) - sys.exit(1) - - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/databuild/test/app/dsl/generated/BUILD.bazel b/databuild/test/app/dsl/generated/BUILD.bazel deleted file mode 100644 index 215a549..0000000 --- a/databuild/test/app/dsl/generated/BUILD.bazel +++ /dev/null @@ -1,71 +0,0 @@ -load("@databuild//databuild:rules.bzl", "databuild_job", "databuild_graph") - -# Generated by DataBuild DSL - do not edit manually -# This file is generated in a subdirectory to avoid overwriting the original BUILD.bazel - -py_binary( - name = "aggregate_color_votes_binary", - srcs = ["aggregate_color_votes.py"], - main = "aggregate_color_votes.py", - deps = ["@@//databuild/test/app/dsl:dsl_src"], -) - -databuild_job( - name = "aggregate_color_votes", - binary = ":aggregate_color_votes_binary", -) -py_binary( - name = "color_vote_report_calc_binary", - srcs = ["color_vote_report_calc.py"], - main = "color_vote_report_calc.py", - deps = ["@@//databuild/test/app/dsl:dsl_src"], -) - -databuild_job( - name = "color_vote_report_calc", - binary = ":color_vote_report_calc_binary", -) -py_binary( - name = "ingest_color_votes_binary", - srcs = ["ingest_color_votes.py"], - main = "ingest_color_votes.py", - deps = ["@@//databuild/test/app/dsl:dsl_src"], -) - -databuild_job( - name = "ingest_color_votes", - binary = ":ingest_color_votes_binary", -) -py_binary( - name = "trailing_color_votes_binary", - srcs = ["trailing_color_votes.py"], - main = "trailing_color_votes.py", - deps = ["@@//databuild/test/app/dsl:dsl_src"], -) - -databuild_job( - name = "trailing_color_votes", - binary = ":trailing_color_votes_binary", -) - -py_binary( - name = "dsl_job_lookup", - srcs = ["dsl_job_lookup.py"], - deps = ["@@//databuild/test/app/dsl:dsl_src"], -) - -databuild_graph( - name = "dsl_graph", - jobs = ["aggregate_color_votes", "color_vote_report_calc", "ingest_color_votes", "trailing_color_votes"], - lookup = ":dsl_job_lookup", - visibility = ["//visibility:public"], -) - -# Create tar archive of generated files for testing -genrule( - name = "existing_generated", - srcs = glob(["*.py", "BUILD.bazel"]), - outs = ["existing_generated.tar"], - cmd = "mkdir -p temp && cp $(SRCS) temp/ && find temp -exec touch -t 197001010000 {} + && tar -cf $@ -C temp .", - visibility = ["//visibility:public"], -) diff --git a/databuild/test/app/dsl/generated/aggregate_color_votes.py b/databuild/test/app/dsl/generated/aggregate_color_votes.py deleted file mode 100755 index 59af193..0000000 --- a/databuild/test/app/dsl/generated/aggregate_color_votes.py +++ /dev/null @@ -1,58 +0,0 @@ -#!/usr/bin/env python3 -""" -Generated job script for AggregateColorVotes. -""" - -import sys -import json -from databuild.test.app.dsl.graph import AggregateColorVotes -from databuild.proto import PartitionRef, JobConfigureResponse, to_dict - - -def parse_outputs_from_args(args: list[str]) -> list: - """Parse partition output references from command line arguments.""" - outputs = [] - for arg in args: - # Find which output type can deserialize this partition reference - for output_type in AggregateColorVotes.output_types: - try: - partition = output_type.deserialize(arg) - outputs.append(partition) - break - except ValueError: - continue - else: - raise ValueError(f"No output type in AggregateColorVotes can deserialize partition ref: {arg}") - - return outputs - - -if __name__ == "__main__": - if len(sys.argv) < 2: - raise Exception(f"Invalid command usage") - - command = sys.argv[1] - job_instance = AggregateColorVotes() - - if command == "config": - # Parse output partition references as PartitionRef objects (for Rust wrapper) - output_refs = [PartitionRef(str=raw_ref) for raw_ref in sys.argv[2:]] - - # Also parse them into DSL partition objects (for DSL job.config()) - outputs = parse_outputs_from_args(sys.argv[2:]) - - # Call job's config method - returns list[JobConfig] - configs = job_instance.config(outputs) - - # Wrap in JobConfigureResponse and serialize using to_dict() - response = JobConfigureResponse(configs=configs) - print(json.dumps(to_dict(response))) - - elif command == "exec": - # The exec method expects a JobConfig but the Rust wrapper passes args - # For now, let the DSL job handle the args directly - # TODO: This needs to be refined based on actual Rust wrapper interface - job_instance.exec(*sys.argv[2:]) - - else: - raise Exception(f"Invalid command `{sys.argv[1]}`") diff --git a/databuild/test/app/dsl/generated/color_vote_report_calc.py b/databuild/test/app/dsl/generated/color_vote_report_calc.py deleted file mode 100755 index e538772..0000000 --- a/databuild/test/app/dsl/generated/color_vote_report_calc.py +++ /dev/null @@ -1,58 +0,0 @@ -#!/usr/bin/env python3 -""" -Generated job script for ColorVoteReportCalc. -""" - -import sys -import json -from databuild.test.app.dsl.graph import ColorVoteReportCalc -from databuild.proto import PartitionRef, JobConfigureResponse, to_dict - - -def parse_outputs_from_args(args: list[str]) -> list: - """Parse partition output references from command line arguments.""" - outputs = [] - for arg in args: - # Find which output type can deserialize this partition reference - for output_type in ColorVoteReportCalc.output_types: - try: - partition = output_type.deserialize(arg) - outputs.append(partition) - break - except ValueError: - continue - else: - raise ValueError(f"No output type in ColorVoteReportCalc can deserialize partition ref: {arg}") - - return outputs - - -if __name__ == "__main__": - if len(sys.argv) < 2: - raise Exception(f"Invalid command usage") - - command = sys.argv[1] - job_instance = ColorVoteReportCalc() - - if command == "config": - # Parse output partition references as PartitionRef objects (for Rust wrapper) - output_refs = [PartitionRef(str=raw_ref) for raw_ref in sys.argv[2:]] - - # Also parse them into DSL partition objects (for DSL job.config()) - outputs = parse_outputs_from_args(sys.argv[2:]) - - # Call job's config method - returns list[JobConfig] - configs = job_instance.config(outputs) - - # Wrap in JobConfigureResponse and serialize using to_dict() - response = JobConfigureResponse(configs=configs) - print(json.dumps(to_dict(response))) - - elif command == "exec": - # The exec method expects a JobConfig but the Rust wrapper passes args - # For now, let the DSL job handle the args directly - # TODO: This needs to be refined based on actual Rust wrapper interface - job_instance.exec(*sys.argv[2:]) - - else: - raise Exception(f"Invalid command `{sys.argv[1]}`") diff --git a/databuild/test/app/dsl/generated/dsl_job_lookup.py b/databuild/test/app/dsl/generated/dsl_job_lookup.py deleted file mode 100755 index 049f7e5..0000000 --- a/databuild/test/app/dsl/generated/dsl_job_lookup.py +++ /dev/null @@ -1,53 +0,0 @@ -#!/usr/bin/env python3 -""" -Generated job lookup for DataBuild DSL graph. -Maps partition patterns to job targets. -""" - -import sys -import re -import json -from collections import defaultdict - - -# Mapping from partition patterns to job targets -JOB_MAPPINGS = { - r"daily_color_votes/(?P\d{4}-\d{2}-\d{2})/(?P[^/]+)": "//databuild/test/app/dsl/generated:ingest_color_votes", - r"color_votes_1m/(?P\d{4}-\d{2}-\d{2})/(?P[^/]+)": "//databuild/test/app/dsl/generated:trailing_color_votes", - r"color_votes_1w/(?P\d{4}-\d{2}-\d{2})/(?P[^/]+)": "//databuild/test/app/dsl/generated:trailing_color_votes", - r"daily_votes/(?P\d{4}-\d{2}-\d{2})": "//databuild/test/app/dsl/generated:aggregate_color_votes", - r"votes_1w/(?P\d{4}-\d{2}-\d{2})": "//databuild/test/app/dsl/generated:aggregate_color_votes", - r"votes_1m/(?P\d{4}-\d{2}-\d{2})": "//databuild/test/app/dsl/generated:aggregate_color_votes", - r"color_vote_report/(?P\d{4}-\d{2}-\d{2})/(?P[^/]+)": "//databuild/test/app/dsl/generated:color_vote_report_calc", -} - - -def lookup_job_for_partition(partition_ref: str) -> str: - """Look up which job can build the given partition reference.""" - for pattern, job_target in JOB_MAPPINGS.items(): - if re.match(pattern, partition_ref): - return job_target - - raise ValueError(f"No job found for partition: {partition_ref}") - - -def main(): - if len(sys.argv) < 2: - print("Usage: job_lookup.py [partition_ref...]", file=sys.stderr) - sys.exit(1) - - results = defaultdict(list) - try: - for partition_ref in sys.argv[1:]: - job_target = lookup_job_for_partition(partition_ref) - results[job_target].append(partition_ref) - - # Output the results as JSON (matching existing lookup format) - print(json.dumps(dict(results))) - except ValueError as e: - print(f"ERROR: {e}", file=sys.stderr) - sys.exit(1) - - -if __name__ == "__main__": - main() diff --git a/databuild/test/app/dsl/generated/ingest_color_votes.py b/databuild/test/app/dsl/generated/ingest_color_votes.py deleted file mode 100755 index af920c9..0000000 --- a/databuild/test/app/dsl/generated/ingest_color_votes.py +++ /dev/null @@ -1,58 +0,0 @@ -#!/usr/bin/env python3 -""" -Generated job script for IngestColorVotes. -""" - -import sys -import json -from databuild.test.app.dsl.graph import IngestColorVotes -from databuild.proto import PartitionRef, JobConfigureResponse, to_dict - - -def parse_outputs_from_args(args: list[str]) -> list: - """Parse partition output references from command line arguments.""" - outputs = [] - for arg in args: - # Find which output type can deserialize this partition reference - for output_type in IngestColorVotes.output_types: - try: - partition = output_type.deserialize(arg) - outputs.append(partition) - break - except ValueError: - continue - else: - raise ValueError(f"No output type in IngestColorVotes can deserialize partition ref: {arg}") - - return outputs - - -if __name__ == "__main__": - if len(sys.argv) < 2: - raise Exception(f"Invalid command usage") - - command = sys.argv[1] - job_instance = IngestColorVotes() - - if command == "config": - # Parse output partition references as PartitionRef objects (for Rust wrapper) - output_refs = [PartitionRef(str=raw_ref) for raw_ref in sys.argv[2:]] - - # Also parse them into DSL partition objects (for DSL job.config()) - outputs = parse_outputs_from_args(sys.argv[2:]) - - # Call job's config method - returns list[JobConfig] - configs = job_instance.config(outputs) - - # Wrap in JobConfigureResponse and serialize using to_dict() - response = JobConfigureResponse(configs=configs) - print(json.dumps(to_dict(response))) - - elif command == "exec": - # The exec method expects a JobConfig but the Rust wrapper passes args - # For now, let the DSL job handle the args directly - # TODO: This needs to be refined based on actual Rust wrapper interface - job_instance.exec(*sys.argv[2:]) - - else: - raise Exception(f"Invalid command `{sys.argv[1]}`") diff --git a/databuild/test/app/dsl/generated/trailing_color_votes.py b/databuild/test/app/dsl/generated/trailing_color_votes.py deleted file mode 100755 index 0936f87..0000000 --- a/databuild/test/app/dsl/generated/trailing_color_votes.py +++ /dev/null @@ -1,58 +0,0 @@ -#!/usr/bin/env python3 -""" -Generated job script for TrailingColorVotes. -""" - -import sys -import json -from databuild.test.app.dsl.graph import TrailingColorVotes -from databuild.proto import PartitionRef, JobConfigureResponse, to_dict - - -def parse_outputs_from_args(args: list[str]) -> list: - """Parse partition output references from command line arguments.""" - outputs = [] - for arg in args: - # Find which output type can deserialize this partition reference - for output_type in TrailingColorVotes.output_types: - try: - partition = output_type.deserialize(arg) - outputs.append(partition) - break - except ValueError: - continue - else: - raise ValueError(f"No output type in TrailingColorVotes can deserialize partition ref: {arg}") - - return outputs - - -if __name__ == "__main__": - if len(sys.argv) < 2: - raise Exception(f"Invalid command usage") - - command = sys.argv[1] - job_instance = TrailingColorVotes() - - if command == "config": - # Parse output partition references as PartitionRef objects (for Rust wrapper) - output_refs = [PartitionRef(str=raw_ref) for raw_ref in sys.argv[2:]] - - # Also parse them into DSL partition objects (for DSL job.config()) - outputs = parse_outputs_from_args(sys.argv[2:]) - - # Call job's config method - returns list[JobConfig] - configs = job_instance.config(outputs) - - # Wrap in JobConfigureResponse and serialize using to_dict() - response = JobConfigureResponse(configs=configs) - print(json.dumps(to_dict(response))) - - elif command == "exec": - # The exec method expects a JobConfig but the Rust wrapper passes args - # For now, let the DSL job handle the args directly - # TODO: This needs to be refined based on actual Rust wrapper interface - job_instance.exec(*sys.argv[2:]) - - else: - raise Exception(f"Invalid command `{sys.argv[1]}`") diff --git a/databuild/test/app/dsl/generated_test/BUILD.bazel b/databuild/test/app/dsl/generated_test/BUILD.bazel deleted file mode 100644 index d03fb63..0000000 --- a/databuild/test/app/dsl/generated_test/BUILD.bazel +++ /dev/null @@ -1,7 +0,0 @@ -py_test( - name = "test_e2e", - srcs = ["test_e2e.py"], - data = ["//databuild/test/app/dsl/generated:dsl_graph.build"], - main = "test_e2e.py", - deps = ["//databuild/test/app:e2e_test_common"], -) \ No newline at end of file diff --git a/databuild/test/app/dsl/generated_test/test_e2e.py b/databuild/test/app/dsl/generated_test/test_e2e.py deleted file mode 100644 index ebe9f80..0000000 --- a/databuild/test/app/dsl/generated_test/test_e2e.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env python3 -""" -End-to-end test for the DSL-generated test app. - -Tests the full pipeline: build execution -> output verification -> JSON validation. -""" - -import os -from databuild.test.app.e2e_test_common import DataBuildE2ETestBase - - -class DSLGeneratedE2ETest(DataBuildE2ETestBase): - """End-to-end test for the DSL-generated test app.""" - - def test_end_to_end_execution(self): - """Test full end-to-end execution of the DSL-generated graph.""" - # Build possible paths for the DSL-generated graph build binary - possible_paths = self.get_standard_runfiles_paths( - 'databuild/test/app/dsl/generated/dsl_graph.build' - ) - - # Add fallback paths for local testing - possible_paths.extend([ - 'bazel-bin/databuild/test/app/dsl/generated/dsl_graph.build', - './dsl_graph.build' - ]) - - # Find the graph build binary - graph_build_path = self.find_graph_build_binary(possible_paths) - - # Execute and verify the graph build - self.execute_and_verify_graph_build(graph_build_path) - - -if __name__ == '__main__': - import unittest - unittest.main() \ No newline at end of file diff --git a/databuild/test/app/dsl/graph.py b/databuild/test/app/dsl/graph.py deleted file mode 100644 index a2e61e0..0000000 --- a/databuild/test/app/dsl/graph.py +++ /dev/null @@ -1,131 +0,0 @@ -"""Python DSL implementation of test app""" - -from collections import defaultdict -from databuild.dsl.python.dsl import DataBuildGraph, DataBuildJob, JobConfigBuilder -from databuild.proto import JobConfig -from databuild.test.app.colors import COLORS -from databuild.test.app.jobs.ingest_color_votes.execute import execute as ingest_color_votes_exec -from databuild.test.app.jobs.trailing_color_votes.execute import execute as trailing_color_votes_exec -from databuild.test.app.jobs.aggregate_color_votes.execute import execute as aggregate_color_votes_exec -from databuild.test.app.jobs.color_vote_report_calc.execute import execute as color_vote_report_calc_exec -from databuild.test.app.dsl.partitions import ( - IngestedColorPartition, - TrailingColorVotes1MPartition, - TrailingColorVotes1WPartition, - DailyVotesPartition, - Votes1WPartition, - Votes1MPartition, - ColorVoteReportPartition -) -import os -from datetime import date, timedelta - -graph = DataBuildGraph("//databuild/test/app/dsl:dsl_graph") - - -@graph.job -class IngestColorVotes(DataBuildJob): - output_types = [IngestedColorPartition] - - def config(self, outputs: list[IngestedColorPartition]) -> list[JobConfig]: - configs = [] - for output in outputs: - env = {"DATA_DATE": output.data_date, "COLOR": output.color} - configs.append(JobConfigBuilder().add_outputs(output).set_env(env).build()) - return configs - - def exec(self, *args: str) -> None: - ingest_color_votes_exec(data_date=os.environ["DATA_DATE"], color=os.environ["COLOR"]) - - -@graph.job -class TrailingColorVotes(DataBuildJob): - output_types = [TrailingColorVotes1MPartition, TrailingColorVotes1WPartition] - - def config(self, outputs: list[TrailingColorVotes1MPartition | TrailingColorVotes1WPartition]) -> list[JobConfig]: - groups = defaultdict(list) - for output in outputs: - groups[(output.data_date, output.color)].append(output) - - configs = [] - for (data_date, color), outputs in groups.items(): - weekly = "false" - monthly = "false" - max_window = 0 - for output in outputs: - if isinstance(output, TrailingColorVotes1WPartition): - weekly = "true" - max_window = max(max_window, 7) - elif isinstance(output, TrailingColorVotes1MPartition): - monthly = "true" - max_window = max(max_window, 28) - - env = {"DATA_DATE": data_date, "COLOR": color, "WEEKLY": weekly, "MONTHLY": monthly} - config = JobConfigBuilder(env=env).add_outputs(*outputs) - for i in range(max_window): - in_date = (date.fromisoformat(data_date) - timedelta(days=i)).isoformat() - config.add_inputs(IngestedColorPartition(data_date=in_date, color=color)) - - configs.append(config.build()) - return configs - - def exec(self, *args: str) -> None: - trailing_color_votes_exec(data_date=os.environ["DATA_DATE"], color=os.environ["COLOR"]) - - -@graph.job -class AggregateColorVotes(DataBuildJob): - output_types = [DailyVotesPartition, Votes1WPartition, Votes1MPartition] - - def config(self, outputs: list[DailyVotesPartition | Votes1WPartition | Votes1MPartition]) -> list[JobConfig]: - configs = [] - - for output in outputs: - if isinstance(output, DailyVotesPartition): - InPartition = IngestedColorPartition - agg_type = "daily_votes" - elif isinstance(output, Votes1WPartition): - InPartition = TrailingColorVotes1WPartition - agg_type = "votes_1w" - elif isinstance(output, Votes1MPartition): - InPartition = TrailingColorVotes1MPartition - agg_type = "votes_1m" - else: - raise ValueError(f"Unknown output type: {output.type}") - - inputs = [InPartition(data_date=output.data_date, color=color) for color in COLORS] - env = {"DATA_DATE": output.data_date, "AGGREGATE_TYPE": agg_type} - configs.append(JobConfigBuilder().add_outputs(output).add_inputs(*inputs).set_env(env).build()) - - return configs - - def exec(self, *args: str) -> None: - aggregate_color_votes_exec(data_date=os.environ["DATA_DATE"], aggregate_type=os.environ["AGGREGATE_TYPE"]) - - -@graph.job -class ColorVoteReportCalc(DataBuildJob): - output_types = [ColorVoteReportPartition] - - def config(self, outputs: list[ColorVoteReportPartition]) -> list[JobConfig]: - config = JobConfigBuilder().add_outputs(*outputs).add_args(*[p.serialize() for p in outputs]) - - for data_date in set(p.data_date for p in outputs): - config.add_inputs( - DailyVotesPartition(data_date=data_date), - Votes1WPartition(data_date=data_date), - Votes1MPartition(data_date=data_date), - ) - - for output in outputs: - config.add_inputs( - IngestedColorPartition(data_date=output.data_date, color=output.color), - TrailingColorVotes1WPartition(data_date=output.data_date, color=output.color), - TrailingColorVotes1MPartition(data_date=output.data_date, color=output.color), - ) - - return [config.build()] - - def exec(self, *args: str) -> None: - color_vote_report_calc_exec(list(args)) - diff --git a/databuild/test/app/dsl/partitions.py b/databuild/test/app/dsl/partitions.py deleted file mode 100644 index 408c246..0000000 --- a/databuild/test/app/dsl/partitions.py +++ /dev/null @@ -1,40 +0,0 @@ -from dataclasses import dataclass -from databuild.dsl.python.dsl import PartitionPattern - -@dataclass -class DatePartitioned: - data_date: str - - -@dataclass -class DateColorPartitioned: - data_date: str - color: str - - -class IngestedColorPartition(DateColorPartitioned, PartitionPattern): - _raw_pattern = r"daily_color_votes/(?P\d{4}-\d{2}-\d{2})/(?P[^/]+)" - - -class TrailingColorVotes1WPartition(DateColorPartitioned, PartitionPattern): - _raw_pattern = r"color_votes_1w/(?P\d{4}-\d{2}-\d{2})/(?P[^/]+)" - - -class TrailingColorVotes1MPartition(DateColorPartitioned, PartitionPattern): - _raw_pattern = r"color_votes_1m/(?P\d{4}-\d{2}-\d{2})/(?P[^/]+)" - - -class DailyVotesPartition(DatePartitioned, PartitionPattern): - _raw_pattern = r"daily_votes/(?P\d{4}-\d{2}-\d{2})" - - -class Votes1WPartition(DatePartitioned, PartitionPattern): - _raw_pattern = r"votes_1w/(?P\d{4}-\d{2}-\d{2})" - - -class Votes1MPartition(DatePartitioned, PartitionPattern): - _raw_pattern = r"votes_1m/(?P\d{4}-\d{2}-\d{2})" - - -class ColorVoteReportPartition(DateColorPartitioned, PartitionPattern): - _raw_pattern = r"color_vote_report/(?P\d{4}-\d{2}-\d{2})/(?P[^/]+)" diff --git a/databuild/test/app/dsl/test/BUILD.bazel b/databuild/test/app/dsl/test/BUILD.bazel deleted file mode 100644 index 0e5a19a..0000000 --- a/databuild/test/app/dsl/test/BUILD.bazel +++ /dev/null @@ -1,87 +0,0 @@ -# Individual job configuration tests -py_test( - name = "test_ingest_color_votes", - srcs = ["test_ingest_color_votes.py"], - main = "test_ingest_color_votes.py", - deps = [ - "//databuild:py_proto", - "//databuild/dsl/python:dsl", - "//databuild/test/app:job_src", - "//databuild/test/app/dsl:dsl_src", - ], -) - -py_test( - name = "test_trailing_color_votes", - srcs = ["test_trailing_color_votes.py"], - main = "test_trailing_color_votes.py", - deps = [ - "//databuild:py_proto", - "//databuild/dsl/python:dsl", - "//databuild/test/app:job_src", - "//databuild/test/app/dsl:dsl_src", - ], -) - -py_test( - name = "test_aggregate_color_votes", - srcs = ["test_aggregate_color_votes.py"], - main = "test_aggregate_color_votes.py", - deps = [ - "//databuild:py_proto", - "//databuild/dsl/python:dsl", - "//databuild/test/app:job_src", - "//databuild/test/app/dsl:dsl_src", - ], -) - -py_test( - name = "test_color_vote_report_calc", - srcs = ["test_color_vote_report_calc.py"], - main = "test_color_vote_report_calc.py", - deps = [ - "//databuild:py_proto", - "//databuild/dsl/python:dsl", - "//databuild/test/app:job_src", - "//databuild/test/app/dsl:dsl_src", - ], -) - -# Graph analysis test -py_test( - name = "test_graph_analysis", - srcs = ["test_graph_analysis.py"], - main = "test_graph_analysis.py", - deps = [ - "//databuild:py_proto", - "//databuild/dsl/python:dsl", - "//databuild/test/app:job_src", - "//databuild/test/app/dsl:dsl_src", - ], -) - -# Bazel vs DSL comparison test -py_test( - name = "test_bazel_dsl_comparison", - srcs = ["test_bazel_dsl_comparison.py"], - main = "test_bazel_dsl_comparison.py", - deps = [ - "//databuild:py_proto", - "//databuild/dsl/python:dsl", - "//databuild/test/app:job_src", - "//databuild/test/app/bazel:job_src", - "//databuild/test/app/dsl:dsl_src", - ], -) - -# DSL generation consistency test -py_test( - name = "test_dsl_generation_consistency", - srcs = ["test_dsl_generation_consistency.py"], - main = "test_dsl_generation_consistency.py", - data = [ - "//databuild/test/app/dsl:generate_fresh_dsl", - "//databuild/test/app/dsl/generated:existing_generated", - ], - deps = [], -) diff --git a/databuild/test/app/dsl/test/test_aggregate_color_votes.py b/databuild/test/app/dsl/test/test_aggregate_color_votes.py deleted file mode 100644 index 9cadf9c..0000000 --- a/databuild/test/app/dsl/test/test_aggregate_color_votes.py +++ /dev/null @@ -1,159 +0,0 @@ -from databuild.test.app.dsl.graph import AggregateColorVotes -from databuild.test.app.dsl.partitions import ( - DailyVotesPartition, - Votes1WPartition, - Votes1MPartition, - IngestedColorPartition, - TrailingColorVotes1WPartition, - TrailingColorVotes1MPartition -) -from databuild.test.app.colors import COLORS -from databuild.proto import DepType - - -def test_aggregate_color_votes_configure_daily_votes(): - """Test AggregateColorVotes config method with daily votes output.""" - job = AggregateColorVotes() - outputs = [DailyVotesPartition(data_date="2025-01-15")] - - configs = job.config(outputs) - - assert len(configs) == 1 - config = configs[0] - assert len(config.outputs) == 1 - assert config.outputs[0].str == "daily_votes/2025-01-15" - assert config.env["DATA_DATE"] == "2025-01-15" - assert config.env["AGGREGATE_TYPE"] == "daily_votes" - - # Should have inputs for all colors - assert len(config.inputs) == len(COLORS) - expected_inputs = {f"daily_color_votes/2025-01-15/{color}" for color in COLORS} - actual_inputs = {input_dep.partition_ref.str for input_dep in config.inputs} - assert actual_inputs == expected_inputs - - # All inputs should be MATERIALIZE type - for input_dep in config.inputs: - assert input_dep.dep_type_code == DepType.MATERIALIZE - assert input_dep.dep_type_name == "materialize" - - -def test_aggregate_color_votes_configure_votes_1w(): - """Test AggregateColorVotes config method with weekly votes output.""" - job = AggregateColorVotes() - outputs = [Votes1WPartition(data_date="2025-01-15")] - - configs = job.config(outputs) - - assert len(configs) == 1 - config = configs[0] - assert len(config.outputs) == 1 - assert config.outputs[0].str == "votes_1w/2025-01-15" - assert config.env["DATA_DATE"] == "2025-01-15" - assert config.env["AGGREGATE_TYPE"] == "votes_1w" - - # Should have inputs for all colors from trailing 1w partitions - assert len(config.inputs) == len(COLORS) - expected_inputs = {f"color_votes_1w/2025-01-15/{color}" for color in COLORS} - actual_inputs = {input_dep.partition_ref.str for input_dep in config.inputs} - assert actual_inputs == expected_inputs - - -def test_aggregate_color_votes_configure_votes_1m(): - """Test AggregateColorVotes config method with monthly votes output.""" - job = AggregateColorVotes() - outputs = [Votes1MPartition(data_date="2025-01-15")] - - configs = job.config(outputs) - - assert len(configs) == 1 - config = configs[0] - assert len(config.outputs) == 1 - assert config.outputs[0].str == "votes_1m/2025-01-15" - assert config.env["DATA_DATE"] == "2025-01-15" - assert config.env["AGGREGATE_TYPE"] == "votes_1m" - - # Should have inputs for all colors from trailing 1m partitions - assert len(config.inputs) == len(COLORS) - expected_inputs = {f"color_votes_1m/2025-01-15/{color}" for color in COLORS} - actual_inputs = {input_dep.partition_ref.str for input_dep in config.inputs} - assert actual_inputs == expected_inputs - - -def test_aggregate_color_votes_configure_multiple_outputs(): - """Test AggregateColorVotes config method with multiple different output types.""" - job = AggregateColorVotes() - outputs = [ - DailyVotesPartition(data_date="2025-01-15"), - Votes1WPartition(data_date="2025-01-16"), - Votes1MPartition(data_date="2025-01-17") - ] - - configs = job.config(outputs) - - assert len(configs) == 3 # One config per output - - # Find configs by date - daily_config = None - weekly_config = None - monthly_config = None - - for config in configs: - if config.env["DATA_DATE"] == "2025-01-15": - daily_config = config - elif config.env["DATA_DATE"] == "2025-01-16": - weekly_config = config - elif config.env["DATA_DATE"] == "2025-01-17": - monthly_config = config - - assert daily_config is not None - assert weekly_config is not None - assert monthly_config is not None - - # Check daily config - assert daily_config.env["AGGREGATE_TYPE"] == "daily_votes" - assert daily_config.outputs[0].str == "daily_votes/2025-01-15" - assert len(daily_config.inputs) == len(COLORS) - assert all("daily_color_votes/2025-01-15/" in inp.partition_ref.str for inp in daily_config.inputs) - - # Check weekly config - assert weekly_config.env["AGGREGATE_TYPE"] == "votes_1w" - assert weekly_config.outputs[0].str == "votes_1w/2025-01-16" - assert len(weekly_config.inputs) == len(COLORS) - assert all("color_votes_1w/2025-01-16/" in inp.partition_ref.str for inp in weekly_config.inputs) - - # Check monthly config - assert monthly_config.env["AGGREGATE_TYPE"] == "votes_1m" - assert monthly_config.outputs[0].str == "votes_1m/2025-01-17" - assert len(monthly_config.inputs) == len(COLORS) - assert all("color_votes_1m/2025-01-17/" in inp.partition_ref.str for inp in monthly_config.inputs) - - -def test_aggregate_color_votes_configure_multiple_same_type(): - """Test AggregateColorVotes config method with multiple outputs of same type.""" - job = AggregateColorVotes() - outputs = [ - DailyVotesPartition(data_date="2025-01-15"), - DailyVotesPartition(data_date="2025-01-16") - ] - - configs = job.config(outputs) - - assert len(configs) == 2 # One config per output - - for config in configs: - assert config.env["AGGREGATE_TYPE"] == "daily_votes" - assert len(config.inputs) == len(COLORS) - - if config.env["DATA_DATE"] == "2025-01-15": - assert config.outputs[0].str == "daily_votes/2025-01-15" - assert all("daily_color_votes/2025-01-15/" in inp.partition_ref.str for inp in config.inputs) - elif config.env["DATA_DATE"] == "2025-01-16": - assert config.outputs[0].str == "daily_votes/2025-01-16" - assert all("daily_color_votes/2025-01-16/" in inp.partition_ref.str for inp in config.inputs) - else: - assert False, f"Unexpected date: {config.env['DATA_DATE']}" - - -if __name__ == '__main__': - import pytest - raise SystemExit(pytest.main([__file__])) \ No newline at end of file diff --git a/databuild/test/app/dsl/test/test_bazel_dsl_comparison.py b/databuild/test/app/dsl/test/test_bazel_dsl_comparison.py deleted file mode 100644 index f2444d3..0000000 --- a/databuild/test/app/dsl/test/test_bazel_dsl_comparison.py +++ /dev/null @@ -1,244 +0,0 @@ -#!/usr/bin/env python3 -""" -Comparison test between Bazel and DSL implementations. - -This test verifies that the DSL job configurations produce identical results -to the equivalent bazel job configurations for the same partition references. -""" - -import unittest -from databuild.proto import PartitionRef, JobConfigureResponse -from databuild.test.app.dsl.graph import ( - IngestColorVotes, - TrailingColorVotes, - AggregateColorVotes, - ColorVoteReportCalc -) -from databuild.test.app.dsl.partitions import ( - IngestedColorPartition, - TrailingColorVotes1WPartition, - TrailingColorVotes1MPartition, - DailyVotesPartition, - Votes1WPartition, - Votes1MPartition, - ColorVoteReportPartition -) - -# Import bazel job config functions -from databuild.test.app.bazel.jobs.ingest_color_votes.config import configure as bazel_ingest_config -from databuild.test.app.bazel.jobs.trailing_color_votes.config import configure as bazel_trailing_config -from databuild.test.app.bazel.jobs.aggregate_color_votes.config import configure as bazel_aggregate_config -from databuild.test.app.bazel.jobs.color_vote_report_calc.config import configure as bazel_report_config - - -class BazelDSLComparisonTest(unittest.TestCase): - """Compare bazel and DSL job configurations to ensure they produce identical results.""" - - def _compare_job_configs(self, bazel_response, dsl_configs): - """Helper to compare JobConfigureResponse from bazel with list[JobConfig] from DSL.""" - self.assertIsInstance(bazel_response, JobConfigureResponse) - self.assertIsInstance(dsl_configs, list) - - bazel_configs = bazel_response.configs - self.assertEqual(len(bazel_configs), len(dsl_configs), - "Bazel and DSL should produce same number of configs") - - # Sort both by a stable key for comparison - def config_sort_key(config): - outputs_str = ",".join(sorted(out.str for out in config.outputs)) - env_str = ",".join(f"{k}={v}" for k, v in sorted(config.env.items())) - return f"{outputs_str}:{env_str}" - - bazel_sorted = sorted(bazel_configs, key=config_sort_key) - dsl_sorted = sorted(dsl_configs, key=config_sort_key) - - for bazel_config, dsl_config in zip(bazel_sorted, dsl_sorted): - # Compare outputs - bazel_outputs = {out.str for out in bazel_config.outputs} - dsl_outputs = {out.str for out in dsl_config.outputs} - self.assertEqual(bazel_outputs, dsl_outputs, "Outputs should match") - - # Compare inputs - bazel_inputs = {(inp.partition_ref.str, inp.dep_type_code, inp.dep_type_name) - for inp in bazel_config.inputs} - dsl_inputs = {(inp.partition_ref.str, inp.dep_type_code, inp.dep_type_name) - for inp in dsl_config.inputs} - self.assertEqual(bazel_inputs, dsl_inputs, "Inputs should match") - - # Compare args - self.assertEqual(set(bazel_config.args), set(dsl_config.args), "Args should match") - - # Compare env - self.assertEqual(bazel_config.env, dsl_config.env, "Environment should match") - - def test_ingest_color_votes_comparison(self): - """Compare IngestColorVotes bazel vs DSL configurations.""" - # Test single output - partition_refs = [PartitionRef(str="daily_color_votes/2025-01-01/red")] - bazel_response = bazel_ingest_config(partition_refs) - - partitions = [IngestedColorPartition.deserialize(ref.str) for ref in partition_refs] - dsl_job = IngestColorVotes() - dsl_configs = dsl_job.config(partitions) - - self._compare_job_configs(bazel_response, dsl_configs) - - # Test multiple outputs - partition_refs = [ - PartitionRef(str="daily_color_votes/2025-01-02/red"), - PartitionRef(str="daily_color_votes/2025-01-02/blue") - ] - bazel_response = bazel_ingest_config(partition_refs) - - partitions = [IngestedColorPartition.deserialize(ref.str) for ref in partition_refs] - dsl_configs = dsl_job.config(partitions) - - self._compare_job_configs(bazel_response, dsl_configs) - - def test_trailing_color_votes_comparison(self): - """Compare TrailingColorVotes bazel vs DSL configurations.""" - # Test weekly output - partition_refs = [PartitionRef(str="color_votes_1w/2025-01-07/red")] - bazel_response = bazel_trailing_config(partition_refs) - - partitions = [TrailingColorVotes1WPartition.deserialize(ref.str) for ref in partition_refs] - dsl_job = TrailingColorVotes() - dsl_configs = dsl_job.config(partitions) - - self._compare_job_configs(bazel_response, dsl_configs) - - # Test monthly output - partition_refs = [PartitionRef(str="color_votes_1m/2025-01-28/blue")] - bazel_response = bazel_trailing_config(partition_refs) - - partitions = [TrailingColorVotes1MPartition.deserialize(ref.str) for ref in partition_refs] - dsl_configs = dsl_job.config(partitions) - - self._compare_job_configs(bazel_response, dsl_configs) - - # Test mixed weekly and monthly for same date/color - partition_refs = [ - PartitionRef(str="color_votes_1w/2025-01-28/green"), - PartitionRef(str="color_votes_1m/2025-01-28/green") - ] - bazel_response = bazel_trailing_config(partition_refs) - - partitions = [ - TrailingColorVotes1WPartition.deserialize(partition_refs[0].str), - TrailingColorVotes1MPartition.deserialize(partition_refs[1].str) - ] - dsl_configs = dsl_job.config(partitions) - - self._compare_job_configs(bazel_response, dsl_configs) - - def test_aggregate_color_votes_comparison(self): - """Compare AggregateColorVotes bazel vs DSL configurations.""" - # Test daily votes - partition_refs = [PartitionRef(str="daily_votes/2025-01-15")] - bazel_response = bazel_aggregate_config(partition_refs) - - partitions = [DailyVotesPartition.deserialize(ref.str) for ref in partition_refs] - dsl_job = AggregateColorVotes() - dsl_configs = dsl_job.config(partitions) - - self._compare_job_configs(bazel_response, dsl_configs) - - # Test weekly votes - partition_refs = [PartitionRef(str="votes_1w/2025-01-15")] - bazel_response = bazel_aggregate_config(partition_refs) - - partitions = [Votes1WPartition.deserialize(ref.str) for ref in partition_refs] - dsl_configs = dsl_job.config(partitions) - - self._compare_job_configs(bazel_response, dsl_configs) - - # Test monthly votes - partition_refs = [PartitionRef(str="votes_1m/2025-01-15")] - bazel_response = bazel_aggregate_config(partition_refs) - - partitions = [Votes1MPartition.deserialize(ref.str) for ref in partition_refs] - dsl_configs = dsl_job.config(partitions) - - self._compare_job_configs(bazel_response, dsl_configs) - - # Test multiple different types - partition_refs = [ - PartitionRef(str="daily_votes/2025-01-15"), - PartitionRef(str="votes_1w/2025-01-16"), - PartitionRef(str="votes_1m/2025-01-17") - ] - bazel_response = bazel_aggregate_config(partition_refs) - - partitions = [ - DailyVotesPartition.deserialize(partition_refs[0].str), - Votes1WPartition.deserialize(partition_refs[1].str), - Votes1MPartition.deserialize(partition_refs[2].str) - ] - dsl_configs = dsl_job.config(partitions) - - self._compare_job_configs(bazel_response, dsl_configs) - - def test_color_vote_report_calc_comparison(self): - """Compare ColorVoteReportCalc bazel vs DSL configurations.""" - # Test single report - partition_refs = [PartitionRef(str="color_vote_report/2025-01-15/red")] - bazel_response = bazel_report_config(partition_refs) - - partitions = [ColorVoteReportPartition.deserialize(ref.str) for ref in partition_refs] - dsl_job = ColorVoteReportCalc() - dsl_configs = dsl_job.config(partitions) - - self._compare_job_configs(bazel_response, dsl_configs) - - # Test multiple reports same date - partition_refs = [ - PartitionRef(str="color_vote_report/2025-01-15/red"), - PartitionRef(str="color_vote_report/2025-01-15/blue") - ] - bazel_response = bazel_report_config(partition_refs) - - partitions = [ColorVoteReportPartition.deserialize(ref.str) for ref in partition_refs] - dsl_configs = dsl_job.config(partitions) - - self._compare_job_configs(bazel_response, dsl_configs) - - # Test multiple reports different dates - partition_refs = [ - PartitionRef(str="color_vote_report/2025-01-15/red"), - PartitionRef(str="color_vote_report/2025-01-16/red") - ] - bazel_response = bazel_report_config(partition_refs) - - partitions = [ColorVoteReportPartition.deserialize(ref.str) for ref in partition_refs] - dsl_configs = dsl_job.config(partitions) - - self._compare_job_configs(bazel_response, dsl_configs) - - def test_partition_serialization_roundtrip(self): - """Test that DSL partition serialization/deserialization works correctly.""" - test_cases = [ - IngestedColorPartition(data_date="2025-01-15", color="red"), - TrailingColorVotes1WPartition(data_date="2025-01-15", color="blue"), - TrailingColorVotes1MPartition(data_date="2025-01-28", color="green"), - DailyVotesPartition(data_date="2025-01-15"), - Votes1WPartition(data_date="2025-01-15"), - Votes1MPartition(data_date="2025-01-15"), - ColorVoteReportPartition(data_date="2025-01-15", color="yellow") - ] - - for partition in test_cases: - with self.subTest(partition=partition): - # Serialize then deserialize - serialized = partition.serialize() - deserialized = type(partition).deserialize(serialized) - - # Should be equal - self.assertEqual(partition, deserialized) - - # Serializing again should give same result - reserialized = deserialized.serialize() - self.assertEqual(serialized, reserialized) - - -if __name__ == '__main__': - unittest.main() \ No newline at end of file diff --git a/databuild/test/app/dsl/test/test_color_vote_report_calc.py b/databuild/test/app/dsl/test/test_color_vote_report_calc.py deleted file mode 100644 index 90e5284..0000000 --- a/databuild/test/app/dsl/test/test_color_vote_report_calc.py +++ /dev/null @@ -1,204 +0,0 @@ -from databuild.test.app.dsl.graph import ColorVoteReportCalc -from databuild.test.app.dsl.partitions import ( - ColorVoteReportPartition, - DailyVotesPartition, - Votes1WPartition, - Votes1MPartition, - IngestedColorPartition, - TrailingColorVotes1WPartition, - TrailingColorVotes1MPartition -) -from databuild.proto import DepType - - -def test_color_vote_report_calc_configure_single_output(): - """Test ColorVoteReportCalc config method with single color report output.""" - job = ColorVoteReportCalc() - outputs = [ColorVoteReportPartition(data_date="2025-01-15", color="red")] - - configs = job.config(outputs) - - assert len(configs) == 1 - config = configs[0] - - # Check outputs - assert len(config.outputs) == 1 - assert config.outputs[0].str == "color_vote_report/2025-01-15/red" - - # Check args - should contain partition strings - assert len(config.args) == 1 - assert config.args[0] == "color_vote_report/2025-01-15/red" - - # Check inputs - should have aggregate inputs for the date and specific color inputs - expected_inputs = { - # Aggregate inputs for the date - "daily_votes/2025-01-15", - "votes_1w/2025-01-15", - "votes_1m/2025-01-15", - # Color-specific inputs - "daily_color_votes/2025-01-15/red", - "color_votes_1w/2025-01-15/red", - "color_votes_1m/2025-01-15/red" - } - - actual_inputs = {input_dep.partition_ref.str for input_dep in config.inputs} - assert actual_inputs == expected_inputs - - # All inputs should be MATERIALIZE type - for input_dep in config.inputs: - assert input_dep.dep_type_code == DepType.MATERIALIZE - assert input_dep.dep_type_name == "materialize" - - -def test_color_vote_report_calc_configure_multiple_colors_same_date(): - """Test ColorVoteReportCalc config method with multiple colors for same date.""" - job = ColorVoteReportCalc() - outputs = [ - ColorVoteReportPartition(data_date="2025-01-15", color="red"), - ColorVoteReportPartition(data_date="2025-01-15", color="blue") - ] - - configs = job.config(outputs) - - assert len(configs) == 1 # Single config since all outputs go to same job - config = configs[0] - - # Check outputs - assert len(config.outputs) == 2 - output_strs = {output.str for output in config.outputs} - assert "color_vote_report/2025-01-15/red" in output_strs - assert "color_vote_report/2025-01-15/blue" in output_strs - - # Check args - should contain both partition strings - assert len(config.args) == 2 - assert set(config.args) == {"color_vote_report/2025-01-15/red", "color_vote_report/2025-01-15/blue"} - - # Check inputs - should have aggregate inputs for the date and color-specific inputs for both colors - expected_inputs = { - # Aggregate inputs for the date (only one set since same date) - "daily_votes/2025-01-15", - "votes_1w/2025-01-15", - "votes_1m/2025-01-15", - # Color-specific inputs for red - "daily_color_votes/2025-01-15/red", - "color_votes_1w/2025-01-15/red", - "color_votes_1m/2025-01-15/red", - # Color-specific inputs for blue - "daily_color_votes/2025-01-15/blue", - "color_votes_1w/2025-01-15/blue", - "color_votes_1m/2025-01-15/blue" - } - - actual_inputs = {input_dep.partition_ref.str for input_dep in config.inputs} - assert actual_inputs == expected_inputs - - -def test_color_vote_report_calc_configure_multiple_dates(): - """Test ColorVoteReportCalc config method with reports for different dates.""" - job = ColorVoteReportCalc() - outputs = [ - ColorVoteReportPartition(data_date="2025-01-15", color="red"), - ColorVoteReportPartition(data_date="2025-01-16", color="red") - ] - - configs = job.config(outputs) - - assert len(configs) == 1 # Single config since all outputs go to same job - config = configs[0] - - # Check outputs - assert len(config.outputs) == 2 - output_strs = {output.str for output in config.outputs} - assert "color_vote_report/2025-01-15/red" in output_strs - assert "color_vote_report/2025-01-16/red" in output_strs - - # Check args - assert len(config.args) == 2 - assert set(config.args) == {"color_vote_report/2025-01-15/red", "color_vote_report/2025-01-16/red"} - - # Check inputs - should have aggregate inputs for both dates and color-specific inputs - expected_inputs = { - # Aggregate inputs for both dates - "daily_votes/2025-01-15", - "votes_1w/2025-01-15", - "votes_1m/2025-01-15", - "daily_votes/2025-01-16", - "votes_1w/2025-01-16", - "votes_1m/2025-01-16", - # Color-specific inputs for red on both dates - "daily_color_votes/2025-01-15/red", - "color_votes_1w/2025-01-15/red", - "color_votes_1m/2025-01-15/red", - "daily_color_votes/2025-01-16/red", - "color_votes_1w/2025-01-16/red", - "color_votes_1m/2025-01-16/red" - } - - actual_inputs = {input_dep.partition_ref.str for input_dep in config.inputs} - assert actual_inputs == expected_inputs - - -def test_color_vote_report_calc_configure_complex_scenario(): - """Test ColorVoteReportCalc config method with complex multi-date, multi-color scenario.""" - job = ColorVoteReportCalc() - outputs = [ - ColorVoteReportPartition(data_date="2025-01-15", color="red"), - ColorVoteReportPartition(data_date="2025-01-15", color="blue"), - ColorVoteReportPartition(data_date="2025-01-16", color="green"), - ColorVoteReportPartition(data_date="2025-01-17", color="red") - ] - - configs = job.config(outputs) - - assert len(configs) == 1 # Single config since all outputs go to same job - config = configs[0] - - # Check outputs - assert len(config.outputs) == 4 - expected_output_strs = { - "color_vote_report/2025-01-15/red", - "color_vote_report/2025-01-15/blue", - "color_vote_report/2025-01-16/green", - "color_vote_report/2025-01-17/red" - } - actual_output_strs = {output.str for output in config.outputs} - assert actual_output_strs == expected_output_strs - - # Check args - assert len(config.args) == 4 - assert set(config.args) == expected_output_strs - - # Check inputs - should have aggregate inputs for all unique dates and color-specific inputs - expected_inputs = { - # Aggregate inputs for all dates - "daily_votes/2025-01-15", - "votes_1w/2025-01-15", - "votes_1m/2025-01-15", - "daily_votes/2025-01-16", - "votes_1w/2025-01-16", - "votes_1m/2025-01-16", - "daily_votes/2025-01-17", - "votes_1w/2025-01-17", - "votes_1m/2025-01-17", - # Color-specific inputs - "daily_color_votes/2025-01-15/red", - "color_votes_1w/2025-01-15/red", - "color_votes_1m/2025-01-15/red", - "daily_color_votes/2025-01-15/blue", - "color_votes_1w/2025-01-15/blue", - "color_votes_1m/2025-01-15/blue", - "daily_color_votes/2025-01-16/green", - "color_votes_1w/2025-01-16/green", - "color_votes_1m/2025-01-16/green", - "daily_color_votes/2025-01-17/red", - "color_votes_1w/2025-01-17/red", - "color_votes_1m/2025-01-17/red" - } - - actual_inputs = {input_dep.partition_ref.str for input_dep in config.inputs} - assert actual_inputs == expected_inputs - - -if __name__ == '__main__': - import pytest - raise SystemExit(pytest.main([__file__])) \ No newline at end of file diff --git a/databuild/test/app/dsl/test/test_dsl_generation_consistency.py b/databuild/test/app/dsl/test/test_dsl_generation_consistency.py deleted file mode 100644 index 97bf536..0000000 --- a/databuild/test/app/dsl/test/test_dsl_generation_consistency.py +++ /dev/null @@ -1,105 +0,0 @@ -#!/usr/bin/env python3 -""" -Test that verifies the generated DSL code is up-to-date. - -This test ensures that the checked-in generated directory contents match -exactly what would be produced by a fresh run of graph.generate. -""" - -import hashlib -import os -import subprocess -import tempfile -import unittest -from pathlib import Path - - -class TestDSLGenerationConsistency(unittest.TestCase): - def setUp(self): - # Find the test runfiles directory to locate tar files - runfiles_dir = os.environ.get("RUNFILES_DIR") - if runfiles_dir: - self.runfiles_root = Path(runfiles_dir) / "_main" - else: - # Fallback for development - not expected to work in this case - self.fail("RUNFILES_DIR not set - test must be run via bazel test") - - def _compute_tar_hash(self, tar_path: Path) -> str: - """Compute MD5 hash of a tar file's contents.""" - if not tar_path.exists(): - self.fail(f"Tar file not found: {tar_path}") - - with open(tar_path, "rb") as f: - content = f.read() - return hashlib.md5(content).hexdigest() - - def _extract_and_list_tar(self, tar_path: Path) -> set: - """Extract tar file and return set of file paths and their content hashes.""" - if not tar_path.exists(): - return set() - - result = subprocess.run([ - "tar", "-tf", str(tar_path) - ], capture_output=True, text=True) - - if result.returncode != 0: - self.fail(f"Failed to list tar contents: {result.stderr}") - - return set(result.stdout.strip().split('\n')) if result.stdout.strip() else set() - - def test_generated_code_is_up_to_date(self): - """Test that the existing generated tar matches the fresh generated tar.""" - - # Find the tar files from data dependencies - existing_tar = self.runfiles_root / "databuild/test/app/dsl/generated/existing_generated.tar" - fresh_tar = self.runfiles_root / "databuild/test/app/dsl/generated_fresh.tar" - - # Compute hashes of both tar files - existing_hash = self._compute_tar_hash(existing_tar) - fresh_hash = self._compute_tar_hash(fresh_tar) - - # Compare hashes - if existing_hash != fresh_hash: - # Provide detailed diff information - existing_files = self._extract_and_list_tar(existing_tar) - fresh_files = self._extract_and_list_tar(fresh_tar) - - only_in_existing = existing_files - fresh_files - only_in_fresh = fresh_files - existing_files - - error_msg = [ - "Generated DSL code is out of date!", - f"Existing tar hash: {existing_hash}", - f"Fresh tar hash: {fresh_hash}", - "", - "To fix this, run:", - " bazel run //databuild/test/app/dsl:graph.generate", - "" - ] - - if only_in_existing: - error_msg.extend([ - "Files only in existing generated code:", - *[f" - {f}" for f in sorted(only_in_existing)], - "" - ]) - - if only_in_fresh: - error_msg.extend([ - "Files only in fresh generated code:", - *[f" + {f}" for f in sorted(only_in_fresh)], - "" - ]) - - common_files = existing_files & fresh_files - if common_files: - error_msg.extend([ - f"Common files: {len(common_files)}", - "This suggests files have different contents.", - ]) - - self.fail("\n".join(error_msg)) - - -if __name__ == "__main__": - unittest.main() \ No newline at end of file diff --git a/databuild/test/app/dsl/test/test_graph_analysis.py b/databuild/test/app/dsl/test/test_graph_analysis.py deleted file mode 100644 index 8a56a15..0000000 --- a/databuild/test/app/dsl/test/test_graph_analysis.py +++ /dev/null @@ -1,157 +0,0 @@ -#!/usr/bin/env python3 -""" -Integration test for the DSL graph analysis. - -This test verifies that when we request color vote reports via the DSL graph, -the analyzer correctly identifies all upstream dependencies and jobs required. - -NOTE: This test assumes the DSL graph will have an analyze() method similar to -the bazel graph analyzer. This functionality is not yet implemented but these -tests will validate it once available. -""" - -import unittest -from databuild.test.app.dsl.graph import graph -from databuild.test.app.dsl.partitions import ColorVoteReportPartition - - -class DSLGraphAnalysisTest(unittest.TestCase): - def setUp(self): - # Ensure we have the graph instance - self.graph = graph - - def test_single_color_report_dependencies(self): - """Test dependencies for a single color vote report via DSL.""" - partition_refs = ["color_vote_report/2024-01-15/red"] - - # TODO: Once DSL graph analysis is implemented, this should call: - # result = self.graph.analyze(partition_refs) - # self.assertIn('nodes', result) - - # For now, we can at least verify the graph structure - self.assertIsNotNone(self.graph) - self.assertGreater(len(self.graph.lookup), 0) - - # Verify we can create the partition and find its producer - partition = ColorVoteReportPartition(data_date="2024-01-15", color="red") - producer_job_class = self.graph.lookup.get(ColorVoteReportPartition) - self.assertIsNotNone(producer_job_class, "ColorVoteReportPartition should have a registered producer") - - # Test that we can call the job's config method - job_instance = producer_job_class() - configs = job_instance.config([partition]) - self.assertIsInstance(configs, list) - self.assertGreater(len(configs), 0) - - def test_multiple_color_reports_same_date(self): - """Test dependencies when requesting multiple colors for the same date via DSL.""" - partition_refs = [ - "color_vote_report/2024-01-15/red", - "color_vote_report/2024-01-15/blue" - ] - - # TODO: Once DSL graph analysis is implemented, this should call: - # result = self.graph.analyze(partition_refs) - # self.assertIn('nodes', result) - - # For now, verify we can handle multiple partitions - partitions = [ - ColorVoteReportPartition(data_date="2024-01-15", color="red"), - ColorVoteReportPartition(data_date="2024-01-15", color="blue") - ] - - producer_job_class = self.graph.lookup.get(ColorVoteReportPartition) - self.assertIsNotNone(producer_job_class) - - job_instance = producer_job_class() - configs = job_instance.config(partitions) - self.assertIsInstance(configs, list) - self.assertGreater(len(configs), 0) - - def test_multiple_dates_dependencies(self): - """Test dependencies when requesting reports for different dates via DSL.""" - partition_refs = [ - "color_vote_report/2024-01-15/red", - "color_vote_report/2024-01-16/red" - ] - - # TODO: Once DSL graph analysis is implemented, this should call: - # result = self.graph.analyze(partition_refs) - # self.assertIn('nodes', result) - - # For now, verify we can handle different dates - partitions = [ - ColorVoteReportPartition(data_date="2024-01-15", color="red"), - ColorVoteReportPartition(data_date="2024-01-16", color="red") - ] - - producer_job_class = self.graph.lookup.get(ColorVoteReportPartition) - self.assertIsNotNone(producer_job_class) - - job_instance = producer_job_class() - configs = job_instance.config(partitions) - self.assertIsInstance(configs, list) - self.assertGreater(len(configs), 0) - - def test_graph_completeness(self): - """Test that the DSL graph has all expected partition types registered.""" - from databuild.test.app.dsl.partitions import ( - IngestedColorPartition, - TrailingColorVotes1WPartition, - TrailingColorVotes1MPartition, - DailyVotesPartition, - Votes1WPartition, - Votes1MPartition, - ColorVoteReportPartition - ) - - expected_partitions = { - IngestedColorPartition, - TrailingColorVotes1WPartition, - TrailingColorVotes1MPartition, - DailyVotesPartition, - Votes1WPartition, - Votes1MPartition, - ColorVoteReportPartition - } - - registered_partitions = set(self.graph.lookup.keys()) - self.assertEqual(registered_partitions, expected_partitions, - "All partition types should be registered in the graph") - - def test_partition_lookup_functionality(self): - """Test that partition lookup works correctly for all partition types.""" - from databuild.test.app.dsl.partitions import ( - IngestedColorPartition, - TrailingColorVotes1WPartition, - TrailingColorVotes1MPartition, - DailyVotesPartition, - Votes1WPartition, - Votes1MPartition, - ColorVoteReportPartition - ) - - # Test each partition type can be looked up and has a valid job - test_cases = [ - (IngestedColorPartition, IngestedColorPartition(data_date="2024-01-15", color="red")), - (TrailingColorVotes1WPartition, TrailingColorVotes1WPartition(data_date="2024-01-15", color="red")), - (TrailingColorVotes1MPartition, TrailingColorVotes1MPartition(data_date="2024-01-15", color="red")), - (DailyVotesPartition, DailyVotesPartition(data_date="2024-01-15")), - (Votes1WPartition, Votes1WPartition(data_date="2024-01-15")), - (Votes1MPartition, Votes1MPartition(data_date="2024-01-15")), - (ColorVoteReportPartition, ColorVoteReportPartition(data_date="2024-01-15", color="red")) - ] - - for partition_type, partition_instance in test_cases: - with self.subTest(partition_type=partition_type.__name__): - job_class = self.graph.lookup.get(partition_type) - self.assertIsNotNone(job_class, f"Job class for {partition_type.__name__} should be registered") - - # Verify we can instantiate the job and call config - job_instance = job_class() - configs = job_instance.config([partition_instance]) - self.assertIsInstance(configs, list, f"Config method for {partition_type.__name__} should return a list") - - -if __name__ == '__main__': - unittest.main() \ No newline at end of file diff --git a/databuild/test/app/dsl/test/test_ingest_color_votes.py b/databuild/test/app/dsl/test/test_ingest_color_votes.py deleted file mode 100644 index 4c3d16e..0000000 --- a/databuild/test/app/dsl/test/test_ingest_color_votes.py +++ /dev/null @@ -1,56 +0,0 @@ -from databuild.test.app.dsl.graph import IngestColorVotes -from databuild.test.app.dsl.partitions import IngestedColorPartition -from databuild.proto import PartitionRef - - -def test_ingest_color_votes_configure_single(): - """Test IngestColorVotes config method with single output.""" - job = IngestColorVotes() - outputs = [IngestedColorPartition(data_date="2025-01-01", color="red")] - - configs = job.config(outputs) - - assert len(configs) == 1 - config = configs[0] - assert len(config.outputs) == 1 - assert config.outputs[0].str == "daily_color_votes/2025-01-01/red" - assert config.env["COLOR"] == "red" - assert config.env["DATA_DATE"] == "2025-01-01" - assert len(config.inputs) == 0 - assert len(config.args) == 0 - - -def test_ingest_color_votes_configure_multiple(): - """Test IngestColorVotes config method with multiple outputs.""" - job = IngestColorVotes() - outputs = [ - IngestedColorPartition(data_date="2025-01-02", color="red"), - IngestedColorPartition(data_date="2025-01-02", color="blue"), - ] - - configs = job.config(outputs) - - assert len(configs) == 2 - - # First config - config1 = configs[0] - assert len(config1.outputs) == 1 - assert config1.outputs[0].str == "daily_color_votes/2025-01-02/red" - assert config1.env["COLOR"] == "red" - assert config1.env["DATA_DATE"] == "2025-01-02" - assert len(config1.inputs) == 0 - assert len(config1.args) == 0 - - # Second config - config2 = configs[1] - assert len(config2.outputs) == 1 - assert config2.outputs[0].str == "daily_color_votes/2025-01-02/blue" - assert config2.env["COLOR"] == "blue" - assert config2.env["DATA_DATE"] == "2025-01-02" - assert len(config2.inputs) == 0 - assert len(config2.args) == 0 - - -if __name__ == '__main__': - import pytest - raise SystemExit(pytest.main([__file__])) \ No newline at end of file diff --git a/databuild/test/app/dsl/test/test_trailing_color_votes.py b/databuild/test/app/dsl/test/test_trailing_color_votes.py deleted file mode 100644 index 9b025e7..0000000 --- a/databuild/test/app/dsl/test/test_trailing_color_votes.py +++ /dev/null @@ -1,135 +0,0 @@ -from databuild.test.app.dsl.graph import TrailingColorVotes -from databuild.test.app.dsl.partitions import ( - TrailingColorVotes1WPartition, - TrailingColorVotes1MPartition, - IngestedColorPartition -) -from databuild.proto import DepType - - -def test_trailing_color_votes_configure_weekly_only(): - """Test TrailingColorVotes config method with weekly output only.""" - job = TrailingColorVotes() - outputs = [TrailingColorVotes1WPartition(data_date="2025-01-07", color="red")] - - configs = job.config(outputs) - - assert len(configs) == 1 - config = configs[0] - assert len(config.outputs) == 1 - assert config.outputs[0].str == "color_votes_1w/2025-01-07/red" - assert config.env["COLOR"] == "red" - assert config.env["DATA_DATE"] == "2025-01-07" - assert config.env["WEEKLY"] == "true" - assert config.env["MONTHLY"] == "false" - - # Should have 7 days of inputs - assert len(config.inputs) == 7 - expected_dates = ["2025-01-07", "2025-01-06", "2025-01-05", "2025-01-04", - "2025-01-03", "2025-01-02", "2025-01-01"] - for i, input_dep in enumerate(config.inputs): - assert input_dep.dep_type_code == DepType.MATERIALIZE - assert input_dep.dep_type_name == "materialize" - assert input_dep.partition_ref.str == f"daily_color_votes/{expected_dates[i]}/red" - - -def test_trailing_color_votes_configure_monthly_only(): - """Test TrailingColorVotes config method with monthly output only.""" - job = TrailingColorVotes() - outputs = [TrailingColorVotes1MPartition(data_date="2025-01-28", color="blue")] - - configs = job.config(outputs) - - assert len(configs) == 1 - config = configs[0] - assert len(config.outputs) == 1 - assert config.outputs[0].str == "color_votes_1m/2025-01-28/blue" - assert config.env["COLOR"] == "blue" - assert config.env["DATA_DATE"] == "2025-01-28" - assert config.env["WEEKLY"] == "false" - assert config.env["MONTHLY"] == "true" - - # Should have 28 days of inputs - assert len(config.inputs) == 28 - # Check first and last input dates - assert config.inputs[0].partition_ref.str == "daily_color_votes/2025-01-28/blue" - assert config.inputs[27].partition_ref.str == "daily_color_votes/2025-01-01/blue" - - -def test_trailing_color_votes_configure_both_weekly_and_monthly(): - """Test TrailingColorVotes config method with both weekly and monthly outputs for same date/color.""" - job = TrailingColorVotes() - outputs = [ - TrailingColorVotes1WPartition(data_date="2025-01-28", color="green"), - TrailingColorVotes1MPartition(data_date="2025-01-28", color="green") - ] - - configs = job.config(outputs) - - assert len(configs) == 1 # Should group by (data_date, color) - config = configs[0] - assert len(config.outputs) == 2 - - # Check outputs - output_strs = {output.str for output in config.outputs} - assert "color_votes_1w/2025-01-28/green" in output_strs - assert "color_votes_1m/2025-01-28/green" in output_strs - - assert config.env["COLOR"] == "green" - assert config.env["DATA_DATE"] == "2025-01-28" - assert config.env["WEEKLY"] == "true" - assert config.env["MONTHLY"] == "true" - - # Should have 28 days of inputs (max window) - assert len(config.inputs) == 28 - - -def test_trailing_color_votes_configure_multiple_groups(): - """Test TrailingColorVotes config method with outputs that require separate configs.""" - job = TrailingColorVotes() - outputs = [ - TrailingColorVotes1WPartition(data_date="2025-01-07", color="red"), - TrailingColorVotes1WPartition(data_date="2025-01-07", color="blue"), - TrailingColorVotes1MPartition(data_date="2025-01-08", color="red") - ] - - configs = job.config(outputs) - - assert len(configs) == 3 # Three different (data_date, color) combinations - - # Find configs by their characteristics - red_7th_config = None - blue_7th_config = None - red_8th_config = None - - for config in configs: - if config.env["DATA_DATE"] == "2025-01-07" and config.env["COLOR"] == "red": - red_7th_config = config - elif config.env["DATA_DATE"] == "2025-01-07" and config.env["COLOR"] == "blue": - blue_7th_config = config - elif config.env["DATA_DATE"] == "2025-01-08" and config.env["COLOR"] == "red": - red_8th_config = config - - assert red_7th_config is not None - assert blue_7th_config is not None - assert red_8th_config is not None - - # Check red 7th (weekly only) - assert red_7th_config.env["WEEKLY"] == "true" - assert red_7th_config.env["MONTHLY"] == "false" - assert len(red_7th_config.inputs) == 7 - - # Check blue 7th (weekly only) - assert blue_7th_config.env["WEEKLY"] == "true" - assert blue_7th_config.env["MONTHLY"] == "false" - assert len(blue_7th_config.inputs) == 7 - - # Check red 8th (monthly only) - assert red_8th_config.env["WEEKLY"] == "false" - assert red_8th_config.env["MONTHLY"] == "true" - assert len(red_8th_config.inputs) == 28 - - -if __name__ == '__main__': - import pytest - raise SystemExit(pytest.main([__file__])) \ No newline at end of file diff --git a/databuild/test/app/e2e_test_common.py b/databuild/test/app/e2e_test_common.py deleted file mode 100644 index 00f3ad7..0000000 --- a/databuild/test/app/e2e_test_common.py +++ /dev/null @@ -1,103 +0,0 @@ -#!/usr/bin/env python3 -""" -Common end-to-end test logic for DataBuild test apps. - -Provides shared functionality for testing both bazel-defined and DSL-generated graphs. -""" - -import json -import os -import shutil -import subprocess -import time -import unittest -from pathlib import Path -from typing import List, Optional - - -class DataBuildE2ETestBase(unittest.TestCase): - """Base class for DataBuild end-to-end tests.""" - - def setUp(self): - """Set up test environment.""" - self.output_dir = Path("/tmp/data/color_votes_1w/2025-09-01/red") - self.output_file = self.output_dir / "data.json" - self.partition_ref = "color_votes_1w/2025-09-01/red" - - # Clean up any existing test data - if self.output_dir.exists(): - shutil.rmtree(self.output_dir) - - def tearDown(self): - """Clean up test environment.""" - if self.output_dir.exists(): - shutil.rmtree(self.output_dir) - - def find_graph_build_binary(self, possible_paths: List[str]) -> str: - """Find the graph.build binary from a list of possible paths.""" - graph_build_path = None - for path in possible_paths: - if os.path.exists(path): - graph_build_path = path - break - - self.assertIsNotNone(graph_build_path, - f"Graph build binary not found in any of: {possible_paths}") - return graph_build_path - - def execute_and_verify_graph_build(self, graph_build_path: str) -> None: - """Execute the graph build and verify the results.""" - # Record start time for file modification check - start_time = time.time() - - # Execute the graph build (shell script) - result = subprocess.run( - ["bash", graph_build_path, self.partition_ref], - capture_output=True, - text=True - ) - - # Verify execution succeeded - self.assertEqual(result.returncode, 0, - f"Graph build failed with stderr: {result.stderr}") - - # Verify output file was created - self.assertTrue(self.output_file.exists(), - f"Output file {self.output_file} was not created") - - # Verify file was created recently (within 60 seconds) - file_mtime = os.path.getmtime(self.output_file) - time_diff = file_mtime - start_time - self.assertGreaterEqual(time_diff, -1, # Allow 1 second clock skew - f"File appears to be too old: {time_diff} seconds") - self.assertLessEqual(time_diff, 60, - f"File creation took too long: {time_diff} seconds") - - # Verify file contains valid JSON - with open(self.output_file, 'r') as f: - content = f.read() - - try: - data = json.loads(content) - except json.JSONDecodeError as e: - self.fail(f"Output file does not contain valid JSON: {e}") - - # Basic sanity check on JSON structure - self.assertIsInstance(data, (dict, list), - "JSON should be an object or array") - - def get_standard_runfiles_paths(self, relative_path: str) -> List[str]: - """Get standard list of possible runfiles paths for a binary.""" - runfiles_dir = os.environ.get("RUNFILES_DIR") - test_srcdir = os.environ.get("TEST_SRCDIR") - - possible_paths = [] - if runfiles_dir: - possible_paths.append(os.path.join(runfiles_dir, '_main', relative_path)) - possible_paths.append(os.path.join(runfiles_dir, relative_path)) - - if test_srcdir: - possible_paths.append(os.path.join(test_srcdir, '_main', relative_path)) - possible_paths.append(os.path.join(test_srcdir, relative_path)) - - return possible_paths \ No newline at end of file diff --git a/databuild/test/app/jobs/aggregate_color_votes/README.md b/databuild/test/app/jobs/aggregate_color_votes/README.md deleted file mode 100644 index 24ea627..0000000 --- a/databuild/test/app/jobs/aggregate_color_votes/README.md +++ /dev/null @@ -1,10 +0,0 @@ - -# Aggregate Color Votes -This job adds up votes across colors for the same date. It uses an arbitrary list of colors from [`colors.py`](../../colors.py). - -## Configure -When requested for a given date, it creates a job config for each date and type of aggregation, handling daily, weekly, -and monthly aggregates. Declares data deps based on date and the colors in [`colors.py`](../../colors.py). - -## Execute -Simply sums the `votes` from the referenced partitions and writes them. diff --git a/databuild/test/app/jobs/aggregate_color_votes/execute.py b/databuild/test/app/jobs/aggregate_color_votes/execute.py deleted file mode 100644 index 4ef255f..0000000 --- a/databuild/test/app/jobs/aggregate_color_votes/execute.py +++ /dev/null @@ -1,26 +0,0 @@ -from databuild.test.app import dal -from databuild.proto import PartitionRef -from databuild.test.app.colors import COLORS - -def execute(data_date: str, aggregate_type: str): - # Determine input prefix based on aggregate type - if aggregate_type == "daily_votes": - input_prefix = "daily_color_votes" - elif aggregate_type == "votes_1w": - input_prefix = "color_votes_1w" - elif aggregate_type == "votes_1m": - input_prefix = "color_votes_1m" - else: - raise ValueError(f"Unknown aggregate type: {aggregate_type}") - - # Read data from all colors for this date - input_refs = [] - for color in COLORS: - input_refs.append(PartitionRef(str=f"{input_prefix}/{data_date}/{color}")) - - data = dal.read(*input_refs) - total_votes = sum(record["votes"] for record in data) - - # Write aggregated result - output_ref = PartitionRef(str=f"{aggregate_type}/{data_date}") - dal.write(output_ref, [{"data_date": data_date, "votes": total_votes}]) \ No newline at end of file diff --git a/databuild/test/app/jobs/color_vote_report_calc/README.md b/databuild/test/app/jobs/color_vote_report_calc/README.md deleted file mode 100644 index 7f2b601..0000000 --- a/databuild/test/app/jobs/color_vote_report_calc/README.md +++ /dev/null @@ -1,18 +0,0 @@ - -# Color Vote Report Calc -Calculates some metrics based on data calculated by other aggregates: -- Total votes - - On this day - - In last week - - In last month -- Percent of total votes going to this color - - On this day - - In last week - - In last month - -## Configure -This job tests the "produce multiple partitions based on requested inputs in one run" mode. It only ever produces a -single job config, which produces all requested outputs. - -## Execute -Iterates over requested partitions and performs calculations described above. diff --git a/databuild/test/app/jobs/color_vote_report_calc/execute.py b/databuild/test/app/jobs/color_vote_report_calc/execute.py deleted file mode 100644 index e2b2ba7..0000000 --- a/databuild/test/app/jobs/color_vote_report_calc/execute.py +++ /dev/null @@ -1,51 +0,0 @@ -from databuild.test.app import dal -from databuild.proto import PartitionRef - -def execute(output_partition_strs: list[str]): - # Parse requested outputs - outputs = [PartitionRef(str=ref_str) for ref_str in output_partition_strs] - - for output in outputs: - parts = output.str.split("/") - data_date, color = parts[1], parts[2] - - # Read total votes for this date - fail if missing - daily_total = dal.read(PartitionRef(str=f"daily_votes/{data_date}"), empty_ok=False) - weekly_total = dal.read(PartitionRef(str=f"votes_1w/{data_date}"), empty_ok=False) - monthly_total = dal.read(PartitionRef(str=f"votes_1m/{data_date}"), empty_ok=False) - - # Read color-specific votes for this date/color - fail if missing - daily_color = dal.read(PartitionRef(str=f"daily_color_votes/{data_date}/{color}"), empty_ok=False) - weekly_color = dal.read(PartitionRef(str=f"color_votes_1w/{data_date}/{color}"), empty_ok=False) - monthly_color = dal.read(PartitionRef(str=f"color_votes_1m/{data_date}/{color}"), empty_ok=False) - - # Extract vote counts - daily_total_votes = daily_total[0]["votes"] - weekly_total_votes = weekly_total[0]["votes"] - monthly_total_votes = monthly_total[0]["votes"] - - daily_color_votes = daily_color[0]["votes"] - weekly_color_votes = weekly_color[0]["votes"] - monthly_color_votes = monthly_color[0]["votes"] - - # Calculate percentages - daily_percent = (daily_color_votes / daily_total_votes * 100) if daily_total_votes > 0 else 0 - weekly_percent = (weekly_color_votes / weekly_total_votes * 100) if weekly_total_votes > 0 else 0 - monthly_percent = (monthly_color_votes / monthly_total_votes * 100) if monthly_total_votes > 0 else 0 - - # Write report - report_data = [{ - "color": color, - "data_date": data_date, - "daily_total_votes": daily_total_votes, - "weekly_total_votes": weekly_total_votes, - "monthly_total_votes": monthly_total_votes, - "daily_color_votes": daily_color_votes, - "weekly_color_votes": weekly_color_votes, - "monthly_color_votes": monthly_color_votes, - "daily_percent": daily_percent, - "weekly_percent": weekly_percent, - "monthly_percent": monthly_percent - }] - - dal.write(output, report_data) \ No newline at end of file diff --git a/databuild/test/app/jobs/ingest_color_votes/README.md b/databuild/test/app/jobs/ingest_color_votes/README.md deleted file mode 100644 index 2299dd2..0000000 --- a/databuild/test/app/jobs/ingest_color_votes/README.md +++ /dev/null @@ -1,9 +0,0 @@ - -# Ingest Color Votes -This job simply generates a random number between 0 and 1000 and writes it to the output. - -## Configure -The job has no inputs, and communicates params via env var. It generates a single job config per color date requested. - -## Execute -Generates data with `votes` being a number between 0 and 1000. diff --git a/databuild/test/app/jobs/ingest_color_votes/execute.py b/databuild/test/app/jobs/ingest_color_votes/execute.py deleted file mode 100644 index 9281624..0000000 --- a/databuild/test/app/jobs/ingest_color_votes/execute.py +++ /dev/null @@ -1,10 +0,0 @@ - -from databuild.test.app import dal -from databuild.proto import PartitionRef -import random - - -def execute(data_date: str, color: str): - random.seed(hash((data_date, color))) - ref = PartitionRef(str=f"daily_color_votes/{data_date}/{color}") - dal.write(ref, [{"color": color, "data_date": data_date, "votes": random.randint(0, 1000)}]) diff --git a/databuild/test/app/jobs/ingest_color_votes/test.py b/databuild/test/app/jobs/ingest_color_votes/test.py deleted file mode 100644 index 8c7cc54..0000000 --- a/databuild/test/app/jobs/ingest_color_votes/test.py +++ /dev/null @@ -1,17 +0,0 @@ -from databuild.test.app.jobs.ingest_color_votes.execute import execute -from databuild.test.app import dal -from databuild.proto import PartitionRef - - -def test_ingest_color_votes(): - execute("2025-01-01", "red") - results = dal.read(PartitionRef(str="daily_color_votes/2025-01-01/red")) - assert len(results) == 1 - assert results[0]["color"] == "red" - assert results[0]["data_date"] == "2025-01-01" - assert isinstance(results[0]["votes"], int) - - -if __name__ == '__main__': - import pytest - raise SystemExit(pytest.main([__file__])) diff --git a/databuild/test/app/jobs/trailing_color_votes/README.md b/databuild/test/app/jobs/trailing_color_votes/README.md deleted file mode 100644 index 9b7479c..0000000 --- a/databuild/test/app/jobs/trailing_color_votes/README.md +++ /dev/null @@ -1,11 +0,0 @@ - -# Trailing Color Votes -This job adds up votes from trailing days. For week granularity, it's 7 days, for month granularity its 28 days. - -## Configure -Produces a job config for every color and date. Uses "WEEKLY" and "MONTHLY" env vars to signal when those aggregates -should be calculated and written, e.g. when both weekly and monthly are requested for the same color + date, a single -job config is produced that configures the job to produce both. - -## Execute -Just reads trailing data for the specified color and date and adds it up, writing `votes` with the sum. diff --git a/databuild/test/app/jobs/trailing_color_votes/execute.py b/databuild/test/app/jobs/trailing_color_votes/execute.py deleted file mode 100644 index 947e08e..0000000 --- a/databuild/test/app/jobs/trailing_color_votes/execute.py +++ /dev/null @@ -1,28 +0,0 @@ -from databuild.test.app import dal -from databuild.proto import PartitionRef -from datetime import date, timedelta -import os - -def execute(data_date: str, color: str): - output_date = date.fromisoformat(data_date) - weekly = os.environ.get("WEEKLY", "false").lower() == "true" - monthly = os.environ.get("MONTHLY", "false").lower() == "true" - - def calculate_and_write(window_days: int, output_prefix: str): - # Read trailing data and sum votes - input_refs = [] - for i in range(window_days): - input_date = output_date - timedelta(days=i) - input_refs.append(PartitionRef(str=f"daily_color_votes/{input_date.isoformat()}/{color}")) - - data = dal.read(*input_refs) - total_votes = sum(record["votes"] for record in data) - - output_ref = PartitionRef(str=f"{output_prefix}/{data_date}/{color}") - dal.write(output_ref, [{"color": color, "data_date": data_date, "votes": total_votes}]) - - if weekly: - calculate_and_write(7, "color_votes_1w") - - if monthly: - calculate_and_write(28, "color_votes_1m") \ No newline at end of file diff --git a/databuild/test/databuild_test.rs b/databuild/test/databuild_test.rs deleted file mode 100644 index ab98baf..0000000 --- a/databuild/test/databuild_test.rs +++ /dev/null @@ -1,79 +0,0 @@ -// Include the generated protobuf code -include!("../databuild.rs"); - -#[cfg(test)] -mod tests { - use super::*; - use prost::Message; - - #[test] - fn test_partition_ref_creation() { - let partition_ref = PartitionRef { str: "test-partition".to_string() }; - assert_eq!(partition_ref.str, "test-partition"); - } - - #[test] - fn test_job_config_creation() { - let partition = PartitionRef { str: "output-partition".to_string() }; - let mut job_config = JobConfig::default(); - job_config.outputs.push(partition); - job_config.args.push("arg1".to_string()); - - assert_eq!(job_config.outputs.len(), 1); - assert_eq!(job_config.args.len(), 1); - assert_eq!(job_config.outputs[0].str, "output-partition"); - assert_eq!(job_config.args[0], "arg1"); - } - - #[test] - fn test_prost_serialization() { - // Test that we can properly serialize and deserialize with prost - let partition_ref = PartitionRef { str: "test-partition".to_string() }; - - // Encode to bytes using prost - let mut buf = Vec::new(); - partition_ref.encode(&mut buf).expect("Failed to encode"); - - // Decode from bytes using prost - let decoded_partition = PartitionRef::decode(&buf[..]).expect("Failed to decode"); - - assert_eq!(partition_ref.str, decoded_partition.str); - } - - #[test] - fn test_serde_serialization() { - // Test that we can serialize to JSON using serde - let partition_ref = PartitionRef { str: "test-partition".to_string() }; - - // Serialize to JSON - let json = serde_json::to_string(&partition_ref).expect("Failed to serialize to JSON"); - - // Deserialize from JSON - let decoded_partition: PartitionRef = serde_json::from_str(&json).expect("Failed to deserialize from JSON"); - - assert_eq!(partition_ref.str, decoded_partition.str); - } - - #[test] - fn test_job_graph_creation() { - let _job_label = JobLabel { label: "//my:job".to_string() }; - let graph_label = GraphLabel { label: "//my:graph".to_string() }; - - let mut job_graph = JobGraph::default(); - job_graph.label = Some(graph_label); - job_graph.outputs.push(PartitionRef { str: "output".to_string() }); - - assert!(job_graph.label.is_some()); - assert_eq!(job_graph.label.unwrap().label, "//my:graph"); - assert_eq!(job_graph.outputs.len(), 1); - } - - #[test] - fn test_dep_type_enum() { - let query_dep = DepType::Query; - let materialize_dep = DepType::Materialize; - - assert_eq!(query_dep as i32, 0); - assert_eq!(materialize_dep as i32, 1); - } -} \ No newline at end of file diff --git a/databuild/test/py_proto_test.py b/databuild/test/py_proto_test.py deleted file mode 100644 index 925b03f..0000000 --- a/databuild/test/py_proto_test.py +++ /dev/null @@ -1,10 +0,0 @@ - -def test_import(): - from databuild.proto import PartitionRef - ref = PartitionRef(str="foo_bar") - assert(ref.str) == "foo_bar" - - -if __name__ == "__main__": - import pytest - raise SystemExit(pytest.main([__file__])) diff --git a/databuild/test/simple.proto b/databuild/test/simple.proto deleted file mode 100644 index 0b7a219..0000000 --- a/databuild/test/simple.proto +++ /dev/null @@ -1,19 +0,0 @@ -syntax = "proto3"; - -package simple.v1; - -// A simple message to test protobuf code generation -message Person { - string name = 1; - int32 age = 2; - string email = 3; -} - -// A simple service request/response -message GetPersonRequest { - string person_id = 1; -} - -message GetPersonResponse { - Person person = 1; -} \ No newline at end of file diff --git a/databuild/test/simple_test.rs b/databuild/test/simple_test.rs deleted file mode 100644 index 3d0612b..0000000 --- a/databuild/test/simple_test.rs +++ /dev/null @@ -1,100 +0,0 @@ -// Include the generated protobuf code -include!("simple.rs"); - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_person_creation() { - let person = Person::default(); - assert_eq!(person.name, ""); - assert_eq!(person.age, 0); - assert_eq!(person.email, ""); - } - - #[test] - fn test_person_with_values() { - let person = Person { - name: "Alice".to_string(), - age: 30, - email: "alice@example.com".to_string(), - }; - assert_eq!(person.name, "Alice"); - assert_eq!(person.age, 30); - assert_eq!(person.email, "alice@example.com"); - } - - #[test] - fn test_get_person_request() { - let request = GetPersonRequest::default(); - assert_eq!(request.person_id, ""); - - let request_with_id = GetPersonRequest { - person_id: "123".to_string(), - }; - assert_eq!(request_with_id.person_id, "123"); - } - - #[test] - fn test_get_person_response() { - let response = GetPersonResponse::default(); - assert!(response.person.is_none()); - - let person = Person { - name: "Bob".to_string(), - age: 25, - email: "bob@example.com".to_string(), - }; - - let response_with_person = GetPersonResponse { - person: Some(person.clone()), - }; - - assert!(response_with_person.person.is_some()); - assert_eq!(response_with_person.person.unwrap().name, "Bob"); - } - - #[test] - fn test_prost_serialization() { - // Test that we can properly serialize and deserialize with prost - use prost::Message; - - let person = Person { - name: "Alice".to_string(), - age: 30, - email: "alice@example.com".to_string(), - }; - - // Encode to bytes using prost - let mut buf = Vec::new(); - person.encode(&mut buf).expect("Failed to encode"); - - // Decode from bytes using prost - let decoded_person = Person::decode(&buf[..]).expect("Failed to decode"); - - assert_eq!(person.name, decoded_person.name); - assert_eq!(person.age, decoded_person.age); - assert_eq!(person.email, decoded_person.email); - } - - #[test] - fn test_serde_serialization() { - // Test that we can serialize to JSON using serde - let person = Person { - name: "Charlie".to_string(), - age: 28, - email: "charlie@example.com".to_string(), - }; - - // Serialize to JSON - let json = serde_json::to_string(&person).expect("Failed to serialize to JSON"); - - // Deserialize from JSON - let decoded_person: Person = serde_json::from_str(&json).expect("Failed to deserialize from JSON"); - - assert_eq!(person.name, decoded_person.name); - assert_eq!(person.age, decoded_person.age); - assert_eq!(person.email, decoded_person.email); - } -} \ No newline at end of file diff --git a/design/build-event-log.md b/design/build-event-log.md index 70881aa..20facc5 100644 --- a/design/build-event-log.md +++ b/design/build-event-log.md @@ -22,10 +22,9 @@ Purpose: Store build events and provide efficient cross-graph coordination via a Minimal append-only interface optimized for sequential scanning: ```rust -#[async_trait] trait BELStorage { - async fn append_event(&self, event: BuildEvent) -> Result; // returns event index - async fn list_events(&self, since_idx: i64, filter: EventFilter) -> Result; + fn append_event(&self, event: BuildEvent) -> Result; // returns event index + fn list_events(&self, since_idx: i64, filter: EventFilter, limit: i64) -> Result; } ``` @@ -34,35 +33,19 @@ Where `EventFilter` is defined in `databuild.proto` as: message EventFilter { repeated string partition_refs = 1; // Exact partition matches repeated string partition_patterns = 2; // Glob patterns like "data/users/*" - repeated string job_labels = 3; // Job-specific events - repeated string task_ids = 4; // Task run events - repeated string build_request_ids = 5; // Build-specific events + repeated string job_labels = 3; // Job-specific events + repeated string job_run_ids = 4; // Job run events } ``` -## Query Engine Interface -App-layer aggregation that scans storage layer events: - -```rust -struct BELQueryEngine { - storage: Box, - partition_status_cache: Option, -} - -impl BELQueryEngine { - async fn get_latest_partition_status(&self, partition_ref: &str) -> Result>; - async fn get_active_builds_for_partition(&self, partition_ref: &str) -> Result>; - async fn get_build_request_summary(&self, build_id: &str) -> Result; - async fn list_build_requests(&self, limit: u32, offset: u32, status_filter: Option) -> Result>; -} -``` +The data build state is then built on top of this, as a reducer over the BEL event stream. ## Cross-Graph Coordination Graphs coordinate via the `GraphService` API for efficient event streaming: ```rust trait GraphService { - async fn list_events(&self, since_idx: i64, filter: EventFilter) -> Result; + async fn list_events(&self, since_idx: i64, filter: EventFilter, limit: i64) -> Result; } ``` diff --git a/design/core-build.md b/design/core-build.md index d189e14..dd79407 100644 --- a/design/core-build.md +++ b/design/core-build.md @@ -1,141 +1,46 @@ - # Core Build -Purpose: Centralize the build logic and semantics in a performant, correct core. +Purpose: Enable continuous reconciliation of partition wants through distributed job execution. ## Architecture -- Jobs depend on input partitions and produce output partitions. -- Graphs compose jobs to fully plan and execute builds of requested partitions. -- Both jobs and graphs emit events via the [build event log](./build-event-log.md) to update build state. -- A common interface is implemented to execute job and graph build actions, which different clients rely on (e.g. CLI, - service, etc) -- Jobs and graphs use wrappers to implement configuration and [observability](./observability.md) -- Graph-based composition is the basis for databuild application [deployment](./deploy-strategies.md) +DataBuild uses a want-driven reconciliation model inspired by Kubernetes. Users declare wants (desired partitions), and the system continuously attempts to satisfy them through job execution. + +### Key Components +- [**Wants**](./wants.md): Declarations of desired partitions with TTLs and SLAs +- **Jobs**: Stateless executables that transform input partitions to outputs +- **Graph**: Reconciliation runtime that monitors wants and dispatches jobs +- [**Build Event Log (BEL)**](./build-event-log.md): Event-sourced ledger of all system activity + +## Reconciliation Loop +The graph continuously: +1. Scans active wants from the BEL +2. Groups wants by responsible job (via graph lookup) +3. Dispatches jobs to build wanted partitions +4. Handles job results: + - **Success**: Marks partitions available + - **Missing Dependencies**: Creates wants for missing deps with traceable ID + - **Failure**: Potentially retry based on job retry strategy ## Jobs -Jobs are the atomic unit of work in databuild, executed via a Rust-based wrapper that provides: -- Structured logging and telemetry collection -- Platform-agnostic execution across local, container, and cloud environments -- Zero-network-dependency operation via log-based communication -- Standardized error handling and exit code categorization +Jobs are stateless executables with a single `exec` entrypoint. When invoked with requested partitions as args, they either: +- Successfully produce the partitions +- Fail with missing dependency error listing required upstream partitions +- Fail with other errors for potential retry -### `job.config` -Purpose: Enable planning of execution graph. Executed in-process when possible for speed. For interface details, see -[`PartitionRef`](./glossary.md#partitionref) and [`JobConfig`](./glossary.md#jobconfig) in -[`databuild.proto`](../databuild/databuild.proto). +Jobs declare execution preferences (batching, concurrency) as metadata, but contain no orchestration logic. -```rust -trait DataBuildJob { - fn config(outputs: Vec) -> JobConfig; -} -``` +## Want Propagation +When jobs report missing dependencies, the graph: +1. Parses the error for partition refs +2. Creates child wants (linked via `parent_want_id`) +3. Continues reconciliation with expanded want set -#### `job.config` State Diagrams - -```mermaid -flowchart TD - begin((begin)) --> validate_args - emit_job_config_fail --> fail((fail)) - validate_args -- fail --> emit_arg_validate_fail --> emit_job_config_fail - validate_args -- success --> emit_arg_validate_success --> run_config - run_config -- fail --> emit_config_fail --> emit_job_config_fail - run_config -- success --> emit_config_success ---> success((success)) -``` - -### `job.exec` -Purpose: Execute job in exec wrapper. - -```rust -trait DataBuildJob { - fn exec(config: JobConfig) -> PartitionManifest; -} -``` - -#### `job.exec` State Diagram -```mermaid -flowchart TD - begin((begin)) --> wrapper_validate_config - emit_job_exec_fail --> fail((fail)) - wrapper_validate_config -- fail --> emit_config_validate_fail --> emit_job_exec_fail - wrapper_validate_config -- success --> emit_config_validate_success --> wrapper_launch_task - wrapper_launch_task -- fail --> emit_task_launch_fail --> emit_job_exec_fail - wrapper_launch_task -- success --> emit_task_launch_success --> wrapper_monitor_task - wrapper_monitor_task -- heartbeat timer --> emit_heartbeat --> wrapper_monitor_task - wrapper_monitor_task -- job stderr --> emit_log_entry --> wrapper_monitor_task - wrapper_monitor_task -- job stdout --> emit_log_entry --> wrapper_monitor_task - wrapper_monitor_task -- non-zero exit --> emit_task_failed --> emit_job_exec_fail - wrapper_monitor_task -- zero exit --> emit_task_success --> emit_partition_manifest - emit_partition_manifest --> success((success)) -``` - -## Graphs -Graphs are the unit of composition. To `analyze` (plan) task graphs (see [`JobGraph`](./glossary.md#jobgraph)), they -iteratively walk back from the requested output partitions, invoking `job.config` until no unresolved partitions -remain. To `build` partitions, the graph runs `analyze` then iteratively executes the resulting task graph. - -### `graph.analyze` -Purpose: produce a complete task graph to materialize a requested set of partitions. - -```rust -trait DataBuildGraph { - fn analyze(outputs: Vec) -> JobGraph; -} -``` - -#### `graph.analyze` State Diagram -```mermaid -flowchart TD - begin((begin)) --> initialize_missing_partitions --> dispatch_missing_partitions - emit_graph_analyze_fail --> fail((fail)) - dispatch_missing_partitions -- fail --> emit_partition_dispatch_fail --> emit_graph_analyze_fail - dispatch_missing_partitions -- success --> cycle_detected? - cycle_detected? -- yes --> emit_cycle_detected --> emit_graph_analyze_fail - cycle_detected? -- no --> remaining_missing_partitions? - remaining_missing_partitions? -- yes --> dispatch_missing_partitions - remaining_missing_partitions? -- no --> emit_job_graph --> success((success)) -``` - -### `graph.build` -Purpose: analyze, then execute the resulting task graph. - -```rust -trait DataBuildGraph { - fn build(outputs: Vec); -} -``` - -#### `graph.build` State Diagram -```mermaid -flowchart TD - begin((begin)) --> graph_analyze - emit_graph_build_fail --> fail((fail)) - graph_analyze -- fail --> emit_graph_build_fail - graph_analyze -- success --> initialize_ready_jobs --> remaining_ready_jobs? - remaining_ready_jobs? -- yes --> emit_remaining_jobs --> schedule_jobs - remaining_ready_jobs? -- none schedulable --> emit_jobs_unschedulable --> emit_graph_build_fail - schedule_jobs -- fail --> emit_job_schedule_fail --> emit_graph_build_fail - schedule_jobs -- success --> emit_job_schedule_success --> await_jobs - await_jobs -- job_failure --> emit_job_failure --> emit_job_cancels --> cancel_running_jobs - cancel_running_jobs --> emit_graph_build_fail - await_jobs -- N seconds since heartbeat --> emit_heartbeat --> await_jobs - await_jobs -- job_success --> remaining_ready_jobs? - remaining_ready_jobs? -- no ---------> emit_graph_build_success --> success((success)) -``` +This creates want chains that naturally traverse the dependency graph without upfront planning. ## Correctness Strategy -- Core component interfaces are described in [`databuild.proto`](../databuild/databuild.proto), a protobuf interface - shared by all core components and all [GSLs](./graph-specification.md). -- [GSLs](./graph-specification.md) implement ergonomic graph, job, and partition helpers that make coupling explicit -- Graphs automatically detect and raise on non-unique job -> partition mappings -- Graph and job processes are fully described by state diagrams, whose state transitions are logged to the - [build event log](./build-event-log.md). - -## Partition Delegation -- Sometimes a partition already exists, or another build request is already planning on producing a partition -- A later build request with delegate to an already existing build request for said partition -- The later build request will write an event to the [build event log](./build-event-log.md) referencing the ID - of the delegate, allowing traceability of visualization - -## Heartbeats / Health Checks -- Which strategy do we use? -- If we are launching tasks to a place we can't health check, how could they heartbeat? +- **Idempotency**: Jobs must produce identical outputs given same inputs +- **Atomicity**: Partitions are either complete or absent +- **Want chains**: Full traceability via parent/root want IDs +- **Event sourcing**: All state changes recorded in BEL +- **Protobuf interface**: All build actions fit structs and interfaces defined by [`databuild/databuild.proto`](../databuild/databuild.proto) +The system achieves correctness through convergence rather than planning—continuously reconciling until wants are satisfied or expired. diff --git a/design/executor.md b/design/executor.md new file mode 100644 index 0000000..eb0864b --- /dev/null +++ b/design/executor.md @@ -0,0 +1,55 @@ + +# Executor + +Executors act as a job execution abstraction layer to adapt the graph service to different platforms on which jobs can be run (e.g. local processes, containers, kubernetes, cloud container services, databricks/EMR, etc). + +## Capabilities + +- stdout/stderr capture +- producing job BEL events +- parsing missing upstream partition deps +- heartbeating - allows the graph to determine what jobs are still live +- job re-entrance + +## Job Lifecycle + +```mermaid +stateDiagram-v2 + [*] --> Buffering + Buffering --> Queued : collecting other wants + Queued --> Running : scheduled + Running --> Running : heartbeat + Running --> Failure + Buffering --> Canceled + Queued --> Canceled + Running --> Canceled + Canceled --> [*] : will not retry + Running --> MissingDeps + Running --> Success + MissingDeps --> [*] : await deps to rerun + Failure --> [*] : retry according \n to policy + Success --> [*] +``` + +At each state transition the executor emits a BEL event to the graph + +### Buffering +For jobs that buffer - non buffering jobs emit `Buffering` but immediately move to `Queued`. Signified by BEL event with buffering start timestamp and other relevant details for when job can be queued. + +### Queued +Job run will be launched as soon as the constraints allow (pool slots/etc). + +### Running +The job run is active, as indicated by continual heartbeating. In this state, the executor will capture logs to disk. + +### MissingDeps +Job run has emitted the `__DATABUILD_ERROR__::{...}` line in stdout, executor will emit a missing deps event. + +### Canceled +Job run explicitly canceled, emits canceled event along with details. + +### Success +The job run has succeeded, executor emits events with written partitions. + +### Failure +The job run has failed. The run will be retried according to the diff --git a/design/glossary.md b/design/glossary.md index b8d1083..7793cba 100644 --- a/design/glossary.md +++ b/design/glossary.md @@ -14,21 +14,13 @@ PartitionsRefs are strings that uniquely identify partitions. They can contain a URIs, like `s3://companybkt/datasets/foo/date=2025-01-01`, or custom formats like `dal://prod/clicks/region=4/date=2025-01-01/`. PartitionRefs are used as dependency signals during [task graph analysis](./core-build.md#graphanalyze). To enable explicit coupling and ergonomics, there are generally -helper classes for creating, parsing, and accessing fields for PartitionRefs in [GSLs](#graph-specification-language-gsl). +helper classes for creating, parsing, and accessing fields for PartitionRefs in [GDLs](#graph-specification-language-gsl). # `PartitionPattern` Patterns that group partitions (e.g. a dataset) and allow for validation (e.g. does this job actually produce the expected output partition?) -# `JobConfig` -The complete configuration of a job needed to produce the desired partitions, as calculated by -[`job.config`](./core-build.md#jobconfig) - -# `JobGraph` -A complete graph of job configs, with [`PartitionRef`](#partitionref) dependency edges, which when executed will -produce the requested partitions. - -# Graph Specification Language (GSL) +# Graph Definition Language (GDL) Language-specific libraries that make implementing databuild graphs and jobs more succinct and ergonomic. See [graph specification](./graph-specification.md). diff --git a/design/graph-specification.md b/design/graph-specification.md index 87a7f42..a2d046f 100644 --- a/design/graph-specification.md +++ b/design/graph-specification.md @@ -12,208 +12,11 @@ AKA the different ways databuild applications can be described. - Purpose: compilation/build target that fulfills promise of project (like bytecode for JVM langs) - Job binaries (config and exec) - Graph lookup binary (lookup) -- Job target (config and exec) -- Graph target (build and analyze) +- Job target (with working exec binary) +- Graph target - See [core build](./core-build.md) for details ## Python - Wrapper functions enable graph registry - Partition object increases ergonomics and enables explicit data coupling - -```python - -from dataclasses import dataclass -from databuild import ( - DataBuildGraph, DataBuildJob, Partition, JobConfig, PyJobConfig, BazelJobConfig, PartitionManifest, Want -) -from helpers import ingest_reviews, categorize_reviews, sla_failure_notify -from datetime import datetime, timedelta - -graph = DataBuildGraph("//:podcast_reviews_graph") - -ALL_CATEGORIES = {"comedy", ...} - -# Partition definitions, used by the graph to resolve jobs by introspecting their config signatures -ExtractedReviews = Partition[r"reviews/date=(?P\d{4}-\d{2}-\d{2})"] -CategorizedReviews = Partition[r"categorized_reviews/category=(?P[^/]+)/date=(?P\d{4}-\d{2}-\d{2})"] -PhraseModel = Partition[r"phrase_models/category=(?P[^/]+)/date=(?P\d{4}-\d{2}-\d{2})"] -PhraseStats = Partition[r"phrase_stats/category=(?P[^/]+)/date=(?P\d{4}-\d{2}-\d{2})"] - - -@graph.job -class ExtractReviews(DataBuildJob): - def config(self, outputs: list[ExtractedReviews]) -> list[JobConfig]: - # One job run can output multiple partitions - args = [p.date for p in outputs] - return [JobConfig(outputs=outputs, inputs=[], args=args,)] - - def exec(self, config: JobConfig) -> PartitionManifest: - for (date, output) in zip(config.args, config.outputs): - ingest_reviews(date).write(output) - # Start and end time inferred by wrapper (but could be overridden) - return config.partitionManifest(job=self) - - -@dataclass -class CategorizeReviewsArgs: - date: str - category: str - - -@graph.job -class CategorizeReviews(DataBuildJob): - def config(self, outputs: list[CategorizedReviews]) -> list[JobConfig]: - # This job only outputs one partition per run - return [ - # The PyJobConfig allows you to pass objects in config, rather than just `args` and `env` - PyJobConfig[CategorizeReviewsArgs]( - outputs=[p], - inputs=ExtractedReviews.dep.materialize(date=p.date), - params=CategorizeReviewsArgs(date=p.date, category=p.category), - ) - for p in outputs - ] - - def exec(self, config: PyJobConfig[CategorizeReviewsArgs]) -> None: - categorize_reviews(config.params.date, config.params.category) - # Partition manifest automatically constructed from config - - -@graph.job -class PhraseModeling(DataBuildJob): - def config(self, outputs: list[PhraseModel]) -> list[JobConfig]: - # This job relies on a bazel executable target to run the actual job - return [ - BazelJobConfig( - outputs=[p], - inputs=[CategorizedReviews.dep.materialize(date=p.date, category=p.category)], - exec_target="//jobs:phrase_modeling", - env={"CATEGORY": p.category, "DATA_DATE": p.date}, - ) - for p in outputs - ] - - -# This job is fully defined in bazel -graph.bazel_job(target="//jobs:phrase_stats_job", outputs=list[PhraseStats]) - - -@graph.want(cron='0 0 * * *') -def phrase_stats_want() -> list[Want[PhraseStats]]: - # Crates a new want every midnight that times out in 3 days - wanted = [PhraseStats(date=datetime.now().date().isoformat(), category=cat) for cat in ALL_CATEGORIES] - on_fail = lambda p: f"Failed to calculate partition `{p}`" - return [graph.want(partitions=wanted, ttl=timedelta(days=3), on_fail=on_fail)] - -``` - -- TODO - do we need an escape hatch for "after 2025 use this job, before use that job" functionality? - -## Rust? - -## Scala? -```scala -import databuild._ -import scala.concurrent.duration._ -import java.time.LocalDate - -object PodcastReviewsGraph extends DataBuildGraph("//:podcast_reviews_graph") { - - val AllCategories = Set("comedy", ???) - - case class DatePartition(date: String) - case class CategoryDatePartition(category: String, date: String) - - - // Partition definitions using extractors - object ExtractedReviews extends Partition[DatePartition]( - """reviews/date=(?P\d{4}-\d{2}-\d{2})""".r - ) - - object CategorizedReviews extends Partition[CategoryDatePartition]( - """categorized_reviews/category=(?P[^/]+)/date=(?P\d{4}-\d{2}-\d{2})""".r - ) - - object PhraseModel extends Partition[CategoryDatePartition]( - """phrase_models/category=(?P[^/]+)/date=(?P\d{4}-\d{2}-\d{2})""".r - ) - - object PhraseStats extends Partition[CategoryDatePartition]( - """phrase_stats/category=(?P[^/]+)/date=(?P\d{4}-\d{2}-\d{2})""".r - ) - - // Job definitions - @job - object ExtractReviewsJob extends DataBuildJob[ExtractedReviews] { - def config(outputs: List[ExtractedReviews]): List[JobConfig] = { - val args = outputs.map(_.date) - List(JobConfig( - outputs = outputs, - inputs = Nil, - args = args - )) - } - - def exec(config: JobConfig): PartitionManifest = { - config.args.zip(config.outputs).foreach { case (date, output) => - ingestReviews(date).writeTo(output) - } - config.toPartitionManifest(this) - } - } - - @job - object CategorizeReviewsJob extends DataBuildJob[CategorizedReviews] { - case class Args(date: String, category: String) - - def config(outputs: List[CategorizedReviews]): List[JobConfig] = { - outputs.map { p => - ScalaJobConfig[Args]( - outputs = List(p), - inputs = ExtractedReviews.dep.materialize(date = p.date), - params = Args(p.date, p.category) - ) - } - } - - def exec(config: ScalaJobConfig[Args]): Unit = { - categorizeReviews(config.params.date, config.params.category) - // Partition manifest auto-constructed - } - } - - @job - object PhraseModelingJob extends DataBuildJob[PhraseModel] { - def config(outputs: List[PhraseModel]): List[JobConfig] = { - outputs.map { p => - BazelJobConfig( - outputs = List(p), - inputs = List(CategorizedReviews.dep.materialize( - category = p.category, - date = p.date - )), - execTarget = "//jobs:phrase_modeling", - env = Map("CATEGORY" -> p.category, "DATA_DATE" -> p.date) - ) - } - } - } - - // External bazel job - bazelJob("//jobs:phrase_stats_job", outputType = classOf[PhraseStats]) - - // Want definition - @want(cron = "0 0 * * *") - def phraseStatsWant(): List[Want[PhraseStats]] = { - val today = LocalDate.now().toString - val wanted = AllCategories.map(cat => PhraseStats(cat, today)).toList - - List(want( - partitions = wanted, - ttl = 3.days, - onFail = p => s"Failed to calculate partition `$p`" - )) - } -} -``` diff --git a/design/questions.md b/design/questions.md deleted file mode 100644 index 3a70b41..0000000 --- a/design/questions.md +++ /dev/null @@ -1,5 +0,0 @@ - -# Questions - -- What happens when we deploy a new graph, and nothing builds a wanted partition? - - Is the interaction model between graph_a -> graph_b actually graph_a registering a want in graph_b? diff --git a/design/service.md b/design/service.md index e9072e0..ad087b8 100644 --- a/design/service.md +++ b/design/service.md @@ -18,40 +18,7 @@ Services expose the `GraphService` API for cross-graph dependency management: ```rust trait GraphService { - async fn list_events(&self, since_idx: i64, filter: EventFilter) -> Result; -} -``` - -### Cross-Graph Usage Pattern -```rust -// Downstream graph subscribing to upstream partitions -struct UpstreamDependency { - service_url: String, // e.g., "https://upstream-databuild.corp.com" - partition_patterns: Vec, // e.g., ["data/users/*", "ml/models/prod/*"] - last_sync_idx: i64, -} - -// Periodic sync of relevant upstream events -async fn sync_upstream_events(upstream: &UpstreamDependency) -> Result<()> { - let client = GraphServiceClient::new(&upstream.service_url); - let filter = EventFilter { - partition_patterns: upstream.partition_patterns.clone(), - ..Default::default() - }; - - let events = client.list_events(upstream.last_sync_idx, filter).await?; - - // Process partition availability events for immediate job triggering - for event in events.events { - if let EventType::PartitionEvent(pe) = event.event_type { - if pe.status_code == PartitionStatus::PartitionAvailable { - trigger_dependent_jobs(&pe.partition_ref).await?; - } - } - } - - upstream.last_sync_idx = events.next_idx; - Ok(()) + async fn list_events(&self, since_idx: i64, filter: EventFilter, limit: i64) -> Result; } ``` @@ -59,9 +26,6 @@ async fn sync_upstream_events(upstream: &UpstreamDependency) -> Result<()> { The purpose of the API is to enable remote, programmatic interaction with databuild applications, and to host endpoints needed by the [web app](#web-app). -See [OpenAPI spec](../bazel-bin/databuild/client/openapi.json) (may need to -`bazel build //databuild/client:extract_openapi_spec` if its not found). - ## Web App The web app visualizes databuild application state via features like listing past builds, job statistics, partition liveness, build request status, etc. This section specifies the hierarchy of functions of the web app. Pages @@ -79,13 +43,17 @@ General requirements: - Graph label at top right - Search box for finding builds, jobs, and partitions (needs a new service API?) +The site is implemented via Aksama templating and HTMX for dynamic updates. + ### Home Page Jumping off point to navigate and build. -- A text box, an "Analyze" button, and a "Build" button for doing exactly that (would be great to have autocomplete, - also PartitionRef patterns would help with ergonomics for less typing / more safety) -- List recent builds with their requested partitions and current status, with link to build request page -- List of recently attempted partitions, with status, link to partition page, and link to build request page -- List of jobs, with (colored) last week success ratio, and link to job page +- Aggregate statistics + - Count of unfulfilled wants (links to want list page with filter set) + - Count of wants past SLA (links to want list page with filter set) + - Count of active job runs (links to job run list page with filter set) + - Count of partitions produced in last 24h (links to partition list page with filter set) +- List recently created wants with their requested partitions (and their current status), with link to want detail page +- List of recently attempted job runs, with status, link to job run page ### Build Request Page - Show build request ID and overall status of build (colored) and "Cancel" button at top @@ -100,6 +68,34 @@ Jumping off point to navigate and build. - Show graph diagram of job graph (collapsable) - With each job and partition status color coded & linked to related run / partition - [paginated](#build-event-log-pagination) list of related build events at bottom + +### Wants List Page +- List recently created root want summaries, with link to want detail page + +#### Want Summaries +- Want summaries are mini components that show: + - Want ID + - Comment (if any) + - SLA status or time till breach + - N / M live out of wanted partitions + - Number of child wants + - Color coded status indicator + - Number of active job runs downstream of want +- Embedded into other pages to represent a want + +### Want Detail Page +- Want IDs +- Aggregate stats + - +- Partition summaries for wanted partitions (paginated) +- Child want summaries +- Active jobs + + +- Show want details +- Show children +- Show wanted partition details +- Show job runs ### Job Status Page - Job label @@ -138,23 +134,6 @@ Jumping off point to navigate and build. - Build request link - Task link (with run time next to it) -## Triggers List Page -- Paginated list of registered triggers - - With link to trigger detail page - - With expandable list of produced build requests or wants - -## Trigger Detail Page -- Trigger name, last run at, and "Trigger" button at top -- Trigger history table, including: - - Trigger time - - Trigger result (successful/failed) - - Partitions or wants requested - -## Wants List Page - -## Want Detail Page - - ### Build Event Log Page I dunno, some people want to look at the raw thing. - A [paginated](#build-event-log-pagination) list of build event log entries diff --git a/design/wants.md b/design/wants.md index bf3b5ac..03926a1 100644 --- a/design/wants.md +++ b/design/wants.md @@ -1,287 +1,59 @@ # Wants System -Purpose: Enable declarative specification of data requirements with SLA tracking, cross-graph coordination, and efficient build triggering while maintaining atomic build semantics. +Purpose: Enable declarative partition requirements with continuous reconciliation, SLA tracking, and efficient event-driven execution. ## Overview -The wants system unifies all build requests (manual, scheduled, triggered) under a single declarative model where: -- **Wants declare intent** via events in the [build event log](./build-event-log.md) -- **Builds reactively satisfy** what's currently possible with atomic semantics -- **Monitoring identifies gaps** between declared wants and delivered partitions -- **Cross-graph coordination** happens via the `GraphService` API +Wants declare intent to have partitions exist. The graph continuously reconciles these wants by attempting execution when dependencies are satisfied. Jobs either succeed or fail with missing dependencies, which become new wants. This creates a self-discovering dependency chain without upfront planning. -## Architecture +## Want Identity -### Core Components - -1. **PartitionWantEvent**: Declarative specification of data requirements -2. **Build Evaluation**: Reactive logic that attempts to satisfy wants when possible -3. **SLA Monitoring**: External system that queries for expired wants -4. **Cross-Graph Coordination**: Event-driven dependency management across DataBuild instances - -### Want Event Schema - -Defined in `databuild.proto`: +Wants are idempotent through deterministic ID generation: ```protobuf -message PartitionWantEvent { - string partition_ref = 1; // Partition being requested - int64 created_at = 2; // Server time when want registered - int64 data_timestamp = 3; // Business time this partition represents - optional uint64 ttl_seconds = 4; // Give up after this long (from created_at) - optional uint64 sla_seconds = 5; // SLA violation after this long (from data_timestamp) - repeated string external_dependencies = 6; // Cross-graph dependencies - string want_id = 7; // Unique identifier - WantSource source = 8; // How this want was created -} - -message WantSource { - oneof source_type { - CliManual cli_manual = 1; // Manual CLI request - DashboardManual dashboard_manual = 2; // Manual dashboard request - Scheduled scheduled = 3; // Scheduled/triggered job - ApiRequest api_request = 4; // External API call - } +message PartitionWant { + string want_id = 1; // Hash(partition_ref + data_timestamp + source) + string root_want_id = 2; // Original user want + string parent_want_id = 3; // Want that triggered this + PartitionRef partition_ref = 4; + uint64 data_timestamp = 5; // Business time (e.g., "2024-01-01" → midnight UTC) + uint64 ttl_seconds = 6; // From data_timestamp + uint64 sla_seconds = 7; // From data_timestamp + WantSource source = 8; } ``` -## Want Lifecycle +Multiple identical want requests produce the same `want_id`, preventing duplication. -### 1. Want Registration +## Execution Flow -All build requests become wants: +1. **Want Registration**: User/trigger creates wants with deterministic IDs +2. **Immediate Dispatch**: Graph attempts execution without checking dependencies +3. **Runtime Discovery**: Jobs fail with `MissingDependenciesError(partitions)` +4. **Want Propagation**: Graph creates upstream wants from missing dependencies +5. **Event-Driven Retry**: When partitions become available (via BEL events), graph retries dependent wants -```rust -// CLI: databuild build data/users/2024-01-01 -PartitionWantEvent { - partition_ref: "data/users/2024-01-01", - created_at: now(), - data_timestamp: None, // These must be set explicitly in the request - ttl_seconds: None, - sla_seconds: None, - external_dependencies: vec![], // no externally sourced data necessary - want_id: generate_uuid(), - source: WantSource { ... }, -} +No polling required - partition availability events directly trigger reconciliation. -// Scheduled pipeline: Daily analytics -PartitionWantEvent { - partition_ref: "analytics/daily/2024-01-01", - created_at: now(), - data_timestamp: parse_date("2024-01-01"), - ttl_seconds: Some(365 * 24 * 3600), // Keep trying for 1 year - sla_seconds: Some(9 * 3600), // Expected by 9am (9hrs after data_timestamp) - external_dependencies: vec!["data/users/2024-01-01"], - want_id: "daily-analytics-2024-01-01", - source: WantSource { ... }, -} -``` +## Reconciliation Loop -### 2. Build Evaluation +The graph monitors two event streams: +- **New wants**: Trigger immediate execution attempts +- **Partition completions**: Trigger retry of wants previously failed on missing dependencies -DataBuild continuously evaluates build opportunities: +This creates an event-driven cascade where upstream completion immediately unlocks downstream work. -```rust -async fn evaluate_build_opportunities(&self) -> Result> { - let now = current_timestamp_nanos(); - - // Get wants that haven't exceeded TTL - let active_wants = self.get_non_expired_wants(now).await?; - - // Filter to wants where external dependencies are satisfied - let buildable_partitions = active_wants.into_iter() - .filter(|want| self.external_dependencies_satisfied(want)) - .map(|want| want.partition_ref) - .collect(); - - if buildable_partitions.is_empty() { return Ok(None); } - - // Create atomic build request for all currently buildable partitions - Ok(Some(BuildRequest { - requested_partitions: buildable_partitions, - reason: "satisfying_active_wants".to_string(), - })) -} -``` +## SLA Management -### 3. Build Triggers +SLAs and TTLs anchor to `data_timestamp`, not creation time: +- **TTL**: "Build January 1st data within 30 days of January 1st" +- **SLA**: "January 1st data should be ready by 9am January 2nd" -Builds are triggered on: -- **New want registration**: Check if newly wanted partitions are immediately buildable -- **External partition availability**: Check if any blocked wants are now unblocked -- **Manual trigger**: Force re-evaluation (for debugging) - -## Cross-Graph Coordination -### GraphService API -Graphs expose events for cross-graph coordination: - -```rust -trait GraphService { - async fn list_events(&self, since_idx: i64, filter: EventFilter) -> Result; -} -``` - -Where `EventFilter` supports partition patterns for efficient subscriptions: - -```protobuf -message EventFilter { - repeated string partition_refs = 1; // Exact partition matches - repeated string partition_patterns = 2; // Glob patterns like "data/users/*" - repeated string job_labels = 3; // Job-specific events - repeated string task_ids = 4; // Task run events - repeated string build_request_ids = 5; // Build-specific events -} -``` - -### Upstream Dependencies - -Downstream graphs subscribe to upstream events: - -```rust -struct UpstreamDependency { - service_url: String, // "https://upstream-databuild.corp.com" - partition_patterns: Vec, // ["data/users/*", "ml/models/prod/*"] - last_sync_idx: i64, -} - -// Periodic sync of upstream events -async fn sync_upstream_events(upstream: &UpstreamDependency) -> Result<()> { - let client = GraphServiceClient::new(&upstream.service_url); - let filter = EventFilter { - partition_patterns: upstream.partition_patterns.clone(), - ..Default::default() - }; - - let events = client.list_events(upstream.last_sync_idx, filter).await?; - - // Process partition availability events - for event in events.events { - if let EventType::PartitionEvent(pe) = event.event_type { - if pe.status_code == PartitionStatus::PartitionAvailable { - // Trigger local build evaluation - trigger_build_evaluation().await?; - } - } - } - - upstream.last_sync_idx = events.next_idx; - Ok(()) -} -``` - -## SLA Monitoring and TTL Management - -### SLA Violations - -External monitoring systems query for SLA violations: - -```sql --- Find SLA violations (for alerting) -SELECT * FROM partition_want_events w -WHERE w.sla_seconds IS NOT NULL -AND (w.data_timestamp + (w.sla_seconds * 1000000000)) < ? -- now -AND NOT EXISTS ( - SELECT 1 FROM partition_events p - WHERE p.partition_ref = w.partition_ref - AND p.status_code = ? -- PartitionAvailable -) -``` - -### TTL Expiration - -Wants with expired TTLs are excluded from build evaluation: - -```sql --- Get active (non-expired) wants -SELECT * FROM partition_want_events w -WHERE (w.ttl_seconds IS NULL OR w.created_at + (w.ttl_seconds * 1000000000) > ?) -- now -AND NOT EXISTS ( - SELECT 1 FROM partition_events p - WHERE p.partition_ref = w.partition_ref - AND p.status_code = ? -- PartitionAvailable -) -``` - -## Example Scenarios - -### Scenario 1: Daily Analytics Pipeline - -``` -1. 6:00 AM: Daily trigger creates want for analytics/daily/2024-01-01 - - SLA: 9:00 AM (3 hours after data_timestamp of midnight) - - TTL: 1 year (keep trying for historical data) - - External deps: ["data/users/2024-01-01"] - -2. 6:01 AM: Build evaluation runs, data/users/2024-01-01 missing - - No build request generated - -3. 8:30 AM: Upstream publishes data/users/2024-01-01 - - Cross-graph sync detects availability - - Build evaluation triggered - - BuildRequest[analytics/daily/2024-01-01] succeeds - -4. Result: Analytics available at 8:45 AM, within SLA -``` - -### Scenario 2: Late Data with SLA Miss - -``` -1. 6:00 AM: Want created for analytics/daily/2024-01-01 (SLA: 9:00 AM) -2. 9:30 AM: SLA monitoring detects violation, sends alert -3. 11:00 AM: Upstream data finally arrives -4. 11:01 AM: Build evaluation triggers, analytics built -5. Result: Late delivery logged, but data still processed -``` - -### Scenario 3: Manual CLI Build - -``` -1. User: databuild build data/transform/urgent -2. Want created with short TTL (30 min) and SLA (5 min) -3. Build evaluation: dependencies available, immediate build -4. Result: Fast feedback for interactive use -``` +This makes wants truly idempotent - the same logical want always has identical constraints regardless of when it's created. ## Benefits -### Unified Build Model -- All builds (manual, scheduled, triggered) use same want mechanism -- Complete audit trail in build event log -- Consistent SLA tracking across all build types - -### Event-Driven Efficiency -- Builds only triggered when dependencies change -- Cross-graph coordination via efficient event streaming -- No polling for task readiness within builds - -### Atomic Build Semantics -- Individual build requests remain all-or-nothing -- Fast failure provides immediate feedback -- Partial progress via multiple build requests over time - -### Flexible SLA Management -- Separate business expectations (SLA) from operational limits (TTL) -- External monitoring with clear blame assignment -- Automatic cleanup of stale wants - -### Cross-Graph Scalability -- Reliable HTTP-based coordination (no message loss) -- Efficient filtering via partition patterns -- Decentralized architecture with clear boundaries - -## Implementation Notes - -### Build Event Log Integration -- Wants are stored as events in the BEL for consistency -- Same query interfaces used for wants and build coordination -- Event-driven architecture throughout - -### Service Integration -- GraphService API exposed via HTTP for cross-graph coordination -- Dashboard integration for manual want creation -- External SLA monitoring via BEL queries - -### CLI Integration -- CLI commands create manual wants with appropriate TTLs -- Immediate build evaluation for interactive feedback -- Standard build request execution path \ No newline at end of file +- **No planning overhead**: Jobs discover dependencies at runtime +- **Natural batching**: Graph can group wants per job preferences +- **Continuous progress**: Partial availability enables marginal execution +- **Simple deployment**: No in-flight state beyond wants themselves diff --git a/design/why-databuild.md b/design/why-databuild.md index 504dc36..594e12a 100644 --- a/design/why-databuild.md +++ b/design/why-databuild.md @@ -59,7 +59,7 @@ Bullet points that should eventually become a blog post. - Version updates are the norm, not the exception ### Separation of Concerns -- **DataBuild**: Dependency resolution + execution planning +- **DataBuild**: Dependency resolution + want propagation - **External systems**: Scheduling (cron/triggers), infrastructure (Kubernetes) - **Result**: Operational complexity focused where it belongs diff --git a/docs/partition-delegation.md b/docs/partition-delegation.md deleted file mode 100644 index 95d1f4b..0000000 --- a/docs/partition-delegation.md +++ /dev/null @@ -1,256 +0,0 @@ -# Partition Delegation in DataBuild - -## Overview - -Partition delegation is a core coordination mechanism in DataBuild that prevents duplicate work by allowing build requests to delegate partition creation to other builds. This system ensures efficient resource utilization and provides complete audit trails for all build activities. - -## Motivation - -DataBuild is designed to handle concurrent build requests efficiently. Without delegation, multiple build requests might attempt to build the same partitions simultaneously, leading to: - -- **Resource Waste**: Multiple processes building identical partitions -- **Race Conditions**: Concurrent writes to the same partition outputs -- **Inconsistent State**: Different builds potentially producing different results for the same partition -- **Poor Performance**: Duplicated computation and I/O overhead - -Delegation solves these problems by establishing clear coordination rules and providing complete traceability. - -## Delegation Types - -DataBuild implements two distinct delegation patterns: - -### 1. Active Delegation - -**When**: A partition is currently being built by another active build request. - -**Behavior**: -- Delegate to the currently executing build request -- Log `DelegationEvent` pointing to the active build's ID -- No job execution occurs for the delegating request -- Wait for the active build to complete - -**Event Flow**: -``` -Build Request A wants partition X (currently being built by Build B): -1. DelegationEvent(partition=X, delegated_to=Build_B_ID, message="Delegated to active build during execution") -2. No JobEvent created for Build A -3. Task marked as succeeded locally in Build A -``` - -### 2. Historical Delegation - -**When**: A partition already exists and is available (built by a previous request). - -**Behavior**: -- Delegate to the historical build request that created the partition -- Log both `DelegationEvent` and `JOB_SKIPPED` events -- Provide complete audit trail showing why work was avoided - -**Event Flow**: -``` -Build Request A wants partition X (already available from Build C): -1. DelegationEvent(partition=X, delegated_to=Build_C_ID, message="Delegated to historical build - partition already available") -2. JobEvent(status=JOB_SKIPPED, message="Job skipped - all target partitions already available") -3. Task marked as succeeded locally in Build A -``` - -## Multi-Partition Job Coordination - -Jobs in DataBuild can produce multiple partitions. Delegation decisions are made at the **job level** based on **all target partitions**: - -### Job Execution Rules - -1. **Execute**: If ANY target partition needs building, execute the entire job -2. **Skip**: Only if ALL target partitions are already available -3. **Delegate to Active**: If ANY target partition is being built by another request - -### Example Scenarios - -**Scenario 1: Mixed Availability** -``` -Job produces partitions [A, B, C]: -- A: Available (from Build X) -- B: Needs building -- C: Available (from Build Y) - -Result: Execute the job (because B needs building) -Events: Normal job execution (JOB_SCHEDULED → JOB_RUNNING → JOB_COMPLETED/FAILED) -``` - -**Scenario 2: All Available** -``` -Job produces partitions [A, B, C]: -- A: Available (from Build X) -- B: Available (from Build Y) -- C: Available (from Build Z) - -Result: Skip the job (all partitions available) -Events: -- DelegationEvent(A, delegated_to=Build_X_ID) -- DelegationEvent(B, delegated_to=Build_Y_ID) -- DelegationEvent(C, delegated_to=Build_Z_ID) -- JobEvent(status=JOB_SKIPPED) -``` - -**Scenario 3: Active Build Conflict** -``` -Job produces partitions [A, B]: -- A: Available (from Build X) -- B: Being built by Build Y (active) - -Result: Delegate entire job to Build Y -Events: -- DelegationEvent(A, delegated_to=Build_Y_ID, message="Delegated to active build") -- DelegationEvent(B, delegated_to=Build_Y_ID, message="Delegated to active build") -- No JobEvent (delegated at planning/coordination level) -``` - -## Build Event Log Integration - -Delegation is implemented through the Build Event Log (BEL), which serves as the authoritative source for all build coordination decisions. - -### Key Event Types - -1. **DelegationEvent**: Records partition-level delegation with full traceability -2. **JobEvent**: Records job-level status including `JOB_SKIPPED` for historical delegation -3. **PartitionEvent**: Tracks partition lifecycle (`PARTITION_AVAILABLE`, etc.) -4. **BuildRequestEvent**: Tracks overall build request status - -### Event Log Queries - -**Finding Available Partitions**: -```sql -SELECT build_request_id -FROM partition_events pe -JOIN build_events be ON pe.event_id = be.event_id -WHERE pe.partition_ref = ? AND pe.status = '4' -- PARTITION_AVAILABLE -ORDER BY be.timestamp DESC -LIMIT 1 -``` - -**Finding Active Builds**: -```sql -SELECT DISTINCT be.build_request_id -FROM partition_events pe -JOIN build_events be ON pe.event_id = be.event_id -WHERE pe.partition_ref = ? -AND pe.status IN ('2', '3') -- PARTITION_SCHEDULED or PARTITION_BUILDING -AND be.build_request_id NOT IN ( - SELECT DISTINCT be3.build_request_id - FROM build_request_events bre - JOIN build_events be3 ON bre.event_id = be3.event_id - WHERE bre.status IN ('4', '5') -- BUILD_REQUEST_COMPLETED or BUILD_REQUEST_FAILED -) -``` - -## Success Rate Calculation - -The delegation system ensures accurate success rate metrics by treating delegation outcomes appropriately: - -### Job Status Classifications - -- **Successful**: `JOB_COMPLETED` (3), `JOB_SKIPPED` (6) -- **Failed**: `JOB_FAILED` (4) -- **In Progress**: `JOB_SCHEDULED` (1), `JOB_RUNNING` (2) -- **Cancelled**: `JOB_CANCELLED` (5) - -### Metrics Queries - -```sql --- Job success rate calculation -SELECT - job_label, - COUNT(CASE WHEN status IN ('3', '6') THEN 1 END) as completed_count, - COUNT(CASE WHEN status = '4' THEN 1 END) as failed_count, - COUNT(*) as total_count -FROM job_events -WHERE job_label = ? -``` - -Success Rate = (completed_count) / (total_count) where completed includes both executed and skipped jobs. - -## Implementation Architecture - -### Clean Separation of Concerns - -**Analysis Phase** (`databuild/graph/analyze.rs`): -- **Purpose**: Pure transformation of partition requests → job graph -- **Responsibility**: Determine what work would be needed (logical plan) -- **No delegation logic**: Creates jobs for all requested partitions -- **Output**: Complete job graph representing the logical work - -**Execution Phase** (`databuild/graph/execute.rs`): -- **Purpose**: Execute the job graph efficiently with delegation optimization -- **Responsibility**: Coordinate with concurrent builds and optimize execution -- **All delegation logic**: Handles both active and historical delegation -- **Event logging**: Emits all job lifecycle events including `JOB_SKIPPED` - -### Core Components - -1. **Event Log Trait** (`databuild/event_log/mod.rs`): - - `get_latest_partition_status()`: Check partition availability - - `get_build_request_for_available_partition()`: Find historical source - - `get_active_builds_for_partition()`: Find concurrent builds - -2. **Execution Coordination Logic** (`databuild/graph/execute.rs`): - - `check_build_coordination()`: Implements all delegation decision rules - - Multi-partition job evaluation logic - - Event logging for delegation and job skipping - - Handles both active delegation (to running builds) and historical delegation (to completed builds) - -3. **Dashboard Integration** (`databuild/service/handlers.rs`): - - Success rate calculations including `JOB_SKIPPED` - - Job metrics queries treating delegation as success - - Proper handling of skipped jobs in analytics - -### Delegation Decision Algorithm (Execution Phase) - -```rust -// Analysis phase creates complete job graph for all requested partitions -job_graph = analyze_partitions(requested_partitions) - -// Execution phase optimizes by delegating when possible -for each job in job_graph: - available_partitions = [] - needs_building = false - - for each partition in job.outputs: - if partition.status == PARTITION_AVAILABLE: - source_build = get_build_request_for_available_partition(partition) - available_partitions.push((partition, source_build)) - elif partition has active_builds: - // Active delegation - delegate entire job to running build - log_delegation_events_to_active_build() - mark_job_as_succeeded() - continue_to_next_job() - else: - needs_building = true - - if !needs_building && available_partitions.len() == job.outputs.len(): - // Historical delegation - all partitions available - log_delegation_events(available_partitions) // Point to source builds - log_job_skipped_event() - mark_job_as_succeeded() - elif needs_building: - // Normal execution - some partitions need building - execute_job_normally() -``` - -## Benefits - -1. **Clean Architecture**: Clear separation between logical planning (analysis) and execution optimization -2. **Efficiency**: Eliminates duplicate computation through execution-time delegation -3. **Consistency**: Single source of truth for each partition -4. **Traceability**: Complete audit trail via delegation events with full build request traceability -5. **Accuracy**: Proper success rate calculation including delegated work -6. **Scalability**: Supports concurrent build requests without conflicts -7. **Testability**: Analysis phase becomes pure function (requests → job graph) -8. **Transparency**: Clear visibility into why work was or wasn't performed - -## Future Enhancements - -1. **Cross-Build Monitoring**: Track when delegated builds complete/fail -2. **Delegation Timeouts**: Handle cases where delegated builds stall -3. **Smart Invalidation**: Detect when available partitions become stale -4. **Delegation Preferences**: Allow builds to specify delegation strategies -5. **Performance Metrics**: Track delegation efficiency and resource savings \ No newline at end of file diff --git a/plans/01-build-event-log.md b/plans/01-build-event-log.md deleted file mode 100644 index b39812a..0000000 --- a/plans/01-build-event-log.md +++ /dev/null @@ -1,338 +0,0 @@ -# Build Event Log Design - -The foundation of persistence for DataBuild is the build event log, a fact table recording events related to build requests, partitions, and jobs. Each graph has exactly one build event log, upon which other views (potentially materialized) rely and aggregate, e.g. powering the partition liveness catalog and enabling delegation to in-progress partition builds. - -## 1. Schema - -The build event log is an append-only event stream that captures all build-related activity. Each event represents a state change in either a build request, partition, or job lifecycle. - -```protobuf -// Partition lifecycle states -enum PartitionStatus { - PARTITION_UNKNOWN = 0; - PARTITION_REQUESTED = 1; // Partition requested but not yet scheduled - PARTITION_SCHEDULED = 2; // Job scheduled to produce this partition - PARTITION_BUILDING = 3; // Job actively building this partition - PARTITION_AVAILABLE = 4; // Partition successfully built and available - PARTITION_FAILED = 5; // Partition build failed - PARTITION_DELEGATED = 6; // Request delegated to existing build -} - -// Job execution lifecycle -enum JobStatus { - JOB_UNKNOWN = 0; - JOB_SCHEDULED = 1; // Job scheduled for execution - JOB_RUNNING = 2; // Job actively executing - JOB_COMPLETED = 3; // Job completed successfully - JOB_FAILED = 4; // Job execution failed - JOB_CANCELLED = 5; // Job execution cancelled -} - -// Build request lifecycle -enum BuildRequestStatus { - BUILD_REQUEST_UNKNOWN = 0; - BUILD_REQUEST_RECEIVED = 1; // Build request received and queued - BUILD_REQUEST_PLANNING = 2; // Graph analysis in progress - BUILD_REQUEST_EXECUTING = 3; // Jobs are being executed - BUILD_REQUEST_COMPLETED = 4; // All requested partitions built - BUILD_REQUEST_FAILED = 5; // Build request failed - BUILD_REQUEST_CANCELLED = 6; // Build request cancelled -} - -// Individual build event -message BuildEvent { - // Event metadata - string event_id = 1; // UUID for this event - int64 timestamp = 2; // Unix timestamp (nanoseconds) - string build_request_id = 3; // UUID of the build request - - // Event type and payload (one of) - oneof event_type { - BuildRequestEvent build_request_event = 10; - PartitionEvent partition_event = 11; - JobEvent job_event = 12; - DelegationEvent delegation_event = 13; - } -} - -// Build request lifecycle event -message BuildRequestEvent { - BuildRequestStatus status = 1; - repeated PartitionRef requested_partitions = 2; - string message = 3; // Optional status message -} - -// Partition state change event -message PartitionEvent { - PartitionRef partition_ref = 1; - PartitionStatus status = 2; - string message = 3; // Optional status message - string job_run_id = 4; // UUID of job run producing this partition (if applicable) -} - -// Job execution event -message JobEvent { - string job_run_id = 1; // UUID for this job run - JobLabel job_label = 2; // Job being executed - repeated PartitionRef target_partitions = 3; // Partitions this job run produces - JobStatus status = 4; - string message = 5; // Optional status message - JobConfig config = 6; // Job configuration used (for SCHEDULED events) - repeated PartitionManifest manifests = 7; // Results (for COMPLETED events) -} - -// Delegation event (when build request delegates to existing build) -message DelegationEvent { - PartitionRef partition_ref = 1; - string delegated_to_build_request_id = 2; // Build request handling this partition - string message = 3; // Optional message -} -``` - -Build events capture the complete lifecycle of composite build requests. A single build request can involve multiple partitions, each potentially requiring different jobs. The event stream allows reconstruction of the full state at any point in time. - -### Design Principles - -**Staleness as Planning Concern**: Staleness detection and handling occurs during the analysis/planning phase, not during execution. The analyze operation detects partitions that need rebuilding due to upstream changes and includes them in the execution graph. In-progress builds do not react to newly stale partitions - they execute their planned graph to completion. - -**Delegation as Unidirectional Optimization**: When a build request discovers another build is already producing a needed partition, it logs a delegation event and waits for that partition to become available. The delegated-to build request remains unaware of the delegation - it simply continues building its own graph. This eliminates the need for coordination protocols between builds. - -## 2. Persistence - -The build event log uses a single `build_events` table storing serialized protobuf events. This design supports multiple storage backends while maintaining consistency. - -### Storage Requirements -- **PostgreSQL**: Primary production backend -- **SQLite**: Local development and testing -- **Delta tables**: Future extensibility for analytics workloads - -### Table Schema -```sql --- Core event metadata -CREATE TABLE build_events ( - event_id UUID PRIMARY KEY, - timestamp BIGINT NOT NULL, - build_request_id UUID NOT NULL, - event_type TEXT NOT NULL -- 'build_request', 'partition', 'job', 'delegation' -); - --- Build request lifecycle events -CREATE TABLE build_request_events ( - event_id UUID PRIMARY KEY REFERENCES build_events(event_id), - status TEXT NOT NULL, -- BuildRequestStatus enum - requested_partitions TEXT[] NOT NULL, - message TEXT -); - --- Partition lifecycle events -CREATE TABLE partition_events ( - event_id UUID PRIMARY KEY REFERENCES build_events(event_id), - partition_ref TEXT NOT NULL, - status TEXT NOT NULL, -- PartitionStatus enum - message TEXT, - job_run_id UUID -- NULL for non-job-related events -); - --- Job execution events -CREATE TABLE job_events ( - event_id UUID PRIMARY KEY REFERENCES build_events(event_id), - job_run_id UUID NOT NULL, - job_label TEXT NOT NULL, - target_partitions TEXT[] NOT NULL, - status TEXT NOT NULL, -- JobStatus enum - message TEXT, - config_json TEXT, -- JobConfig as JSON (for SCHEDULED events) - manifests_json TEXT, -- PartitionManifests as JSON (for COMPLETED events) - start_time BIGINT, -- Extracted from config/manifests - end_time BIGINT -- Extracted from config/manifests -); - --- Delegation events -CREATE TABLE delegation_events ( - event_id UUID PRIMARY KEY REFERENCES build_events(event_id), - partition_ref TEXT NOT NULL, - delegated_to_build_request_id UUID NOT NULL, - message TEXT -); - --- Indexes for common query patterns -CREATE INDEX idx_build_events_build_request ON build_events(build_request_id, timestamp); -CREATE INDEX idx_build_events_timestamp ON build_events(timestamp); - -CREATE INDEX idx_partition_events_partition ON partition_events(partition_ref, timestamp); -CREATE INDEX idx_partition_events_job_run ON partition_events(job_run_id, timestamp) WHERE job_run_id IS NOT NULL; - -CREATE INDEX idx_job_events_job_run ON job_events(job_run_id); -CREATE INDEX idx_job_events_job_label ON job_events(job_label, timestamp); -CREATE INDEX idx_job_events_status ON job_events(status, timestamp); - -CREATE INDEX idx_delegation_events_partition ON delegation_events(partition_ref, timestamp); -CREATE INDEX idx_delegation_events_delegated_to ON delegation_events(delegated_to_build_request_id, timestamp); -``` - -## 3. Access Layer - -The access layer provides a simple append/query interface for build events, leaving aggregation logic to the service layer. - -### Core Interface -The normalized schema enables both simple event queries and complex analytical queries: - -```rust -trait BuildEventLog { - // Append new event to the log - async fn append_event(&self, event: BuildEvent) -> Result<(), Error>; - - // Query events by build request - async fn get_build_request_events( - &self, - build_request_id: &str, - since: Option - ) -> Result, Error>; - - // Query events by partition - async fn get_partition_events( - &self, - partition_ref: &str, - since: Option - ) -> Result, Error>; - - // Query events by job run - async fn get_job_run_events( - &self, - job_run_id: &str - ) -> Result, Error>; - - // Query events in time range - async fn get_events_in_range( - &self, - start_time: i64, - end_time: i64 - ) -> Result, Error>; - - // Execute raw SQL queries (for dashboard and debugging) - async fn execute_query(&self, query: &str) -> Result; -} -``` - -### Example Analytical Queries -The normalized schema enables dashboard queries like: - -```sql --- Job success rates by label -SELECT job_label, - COUNT(*) as total_runs, - SUM(CASE WHEN status = 'JOB_COMPLETED' THEN 1 ELSE 0 END) as successful_runs, - AVG(end_time - start_time) as avg_duration_ns -FROM job_events -WHERE status IN ('JOB_COMPLETED', 'JOB_FAILED') -GROUP BY job_label; - --- Recent partition builds -SELECT p.partition_ref, p.status, e.timestamp, j.job_label -FROM partition_events p -JOIN build_events e ON p.event_id = e.event_id -LEFT JOIN job_events j ON p.job_run_id = j.job_run_id -WHERE p.status = 'PARTITION_AVAILABLE' -ORDER BY e.timestamp DESC -LIMIT 100; - --- Build request status summary -SELECT br.status, COUNT(*) as count -FROM build_request_events br -JOIN build_events e ON br.event_id = e.event_id -WHERE e.timestamp > extract(epoch from now() - interval '24 hours') * 1000000000 -GROUP BY br.status; -``` - -The service layer builds higher-level operations on top of both the simple interface and direct SQL access. - -## 4. Core Build Implementation Integration - -### Command Line Interface - -The core build implementation (`analyze.rs` and `execute.rs`) will be enhanced with build event logging capabilities through new command line arguments: - -```bash -# Standard usage with build event logging -./analyze partition_ref1 partition_ref2 -./execute --build-event-log sqlite:///tmp/build.db < job_graph.json - -# With explicit build request ID for correlation -./analyze --build-event-log postgres://user:pass@host/db --build-request-id 12345678-1234-1234-1234-123456789012 -``` - -**New Command Line Arguments:** -- `--build-event-log ` - Specify persistence URI for build events (logging to stdout is implicit) - - `sqlite://path` - Persist to SQLite database file - - `postgres://connection` - Persist to PostgreSQL database -- `--build-request-id ` - Optional build request ID (auto-generated if not provided) - -### Integration Points - -**In `analyze.rs` (Graph Analysis Phase):** -1. **Build Request Lifecycle**: Log `BUILD_REQUEST_RECEIVED` when analysis starts, `BUILD_REQUEST_PLANNING` during dependency resolution, and `BUILD_REQUEST_COMPLETED` when analysis finishes -2. **Staleness Detection**: Query build event log for existing `PARTITION_AVAILABLE` events to identify non-stale partitions that can be skipped -3. **Delegation Logging**: Log `PARTITION_DELEGATED` events when skipping partitions that are already being built by another request -4. **Job Planning**: Log `PARTITION_SCHEDULED` events for partitions that will be built - -**In `execute.rs` (Graph Execution Phase):** -1. **Execution Lifecycle**: Log `BUILD_REQUEST_EXECUTING` when execution starts -2. **Job Execution Events**: Log `JOB_SCHEDULED`, `JOB_RUNNING`, `JOB_COMPLETED/FAILED` events throughout job execution -3. **Partition Status**: Log `PARTITION_BUILDING` when jobs start, `PARTITION_AVAILABLE/FAILED` when jobs complete -4. **Build Coordination**: Check for concurrent builds before starting partition work to avoid duplicate effort - -### Non-Stale Partition Handling - -The build event log enables intelligent partition skipping: - -1. **During Analysis**: Query for recent `PARTITION_AVAILABLE` events to identify partitions that don't need rebuilding -2. **Staleness Logic**: Compare partition timestamps with upstream dependency timestamps to determine if rebuilding is needed -3. **Skip Documentation**: Log `PARTITION_DELEGATED` events with references to the existing build request ID that produced the partition - -### Bazel Rules Integration - -The `databuild_graph` rule in `rules.bzl` will be enhanced to propagate build event logging configuration: - -```python -databuild_graph( - name = "my_graph", - jobs = [":job1", ":job2"], - lookup = ":job_lookup", - build_event_log = "sqlite:///tmp/builds.db", # New attribute -) -``` - -**Generated Targets Enhancement:** -- `my_graph_analyze`: Receives `--build-event-log` argument -- `my_graph_exec`: Receives `--build-event-log` argument -- `my_graph_build`: Coordinates build request ID across analyze/execute phases - -### Implementation Strategy - -**Phase 1: Infrastructure** -- Add `BuildEventLog` trait and implementations for stdout/SQLite/PostgreSQL -- Update `databuild.proto` with build event schema -- Add command line argument parsing to `analyze.rs` and `execute.rs` - -**Phase 2: Analysis Integration** -- Integrate build event logging into `analyze.rs` -- Implement staleness detection queries -- Add partition delegation logic - -**Phase 3: Execution Integration** -- Integrate build event logging into `execute.rs` -- Add job lifecycle event logging -- Implement build coordination checks - -**Phase 4: Bazel Integration** -- Update `databuild_graph` rule with build event log support -- Add proper argument propagation and request ID correlation -- End-to-end testing with example graphs - -### Key Benefits - -1. **Stdout Logging**: Immediate visibility into build progress with `--build-event-log stdout` -2. **Persistent History**: Database persistence enables build coordination and historical analysis -3. **Intelligent Skipping**: Avoid rebuilding fresh partitions, significantly improving build performance -4. **Build Coordination**: Prevent duplicate work when multiple builds target the same partitions -5. **Audit Trail**: Complete record of all build activities for debugging and monitoring diff --git a/plans/02-build-graph-service.md b/plans/02-build-graph-service.md deleted file mode 100644 index 1639833..0000000 --- a/plans/02-build-graph-service.md +++ /dev/null @@ -1,182 +0,0 @@ -# Build Graph Service Design - -## Overview - -The Build Graph Service is a persistent HTTP service that coordinates build requests, tracks partition status, and serves as one operational interface for the DataBuild system. It bridges the gap between the stateless core DataBuild engine and the stateful requirements of production data orchestration. - -## Core Architecture - -```rust -// Main service interface -#[async_trait] -trait BuildGraphService { - // Build request lifecycle - async fn submit_build_request(&self, partitions: Vec) -> Result; - async fn get_build_status(&self, build_request_id: &str) -> Result; - async fn cancel_build_request(&self, build_request_id: &str) -> Result<(), Error>; - - // Partition status queries - async fn get_partition_status(&self, partition_ref: &str) -> Result; - async fn get_partition_events(&self, partition_ref: &str) -> Result, Error>; - - // Graph analysis - async fn analyze_build_graph(&self, partitions: Vec) -> Result; - - // Service queries for dashboard - async fn get_recent_builds(&self, limit: Option) -> Result, Error>; - async fn get_job_metrics(&self, job_label: &str) -> Result; - async fn execute_query(&self, query: &str) -> Result; -} -``` - -## Key Components - -### 1. Build Request Coordinator -Manages the lifecycle of build requests: -- Receives partition build requests via HTTP -- Calls DataBuild core to analyze required work -- Detects delegation opportunities to existing builds -- Schedules job execution via external orchestrators -- Tracks build progress through event log - -### 2. Partition Status Tracker -Provides real-time partition status: -- Queries build event log for partition lifecycle -- Determines partition liveness and staleness -- Handles partition status API endpoints - -### 3. Event Log Interface -Wraps the build event log with service-level operations: -- Appends build events from job executions -- Provides structured queries for dashboard -- Maintains build request to partition mappings - -### 4. Job Executor -Executes jobs directly within the service: -- Spawns job processes using DataBuild core -- Monitors job execution and resource usage -- Captures job outputs and translates to build events - -## HTTP API Design - -### Build Operations -``` -POST /builds - Body: {"partitions": ["dal://table/date=2024-01-01", ...]} - Returns: {"build_request_id": "uuid"} - -GET /builds/{build_request_id} - Returns: {"status": "executing", "progress": {"graph": {...}, "events": [...]}, "partitions": [...]} - -DELETE /builds/{build_request_id} - Returns: {"cancelled": true} -``` - -### Partition Status -``` -GET /partitions/{partition_ref}/status - Returns: {"status": "available", "last_updated": "timestamp", "build_requests": [...]} - -GET /partitions/{partition_ref}/events - Returns: {"events": [...]} -``` - -### Analysis -``` -POST /analyze - Body: {"partitions": ["dal://table/date=2024-01-01"]} - Returns: {"job_graph": {...}} -``` - -## Data Flow - -1. **Build Request Submission** - - User/system submits partition build request - - Service generates build request ID - - Calls DataBuild core to analyze required work - - Logs BUILD_REQUEST_RECEIVED event - -2. **Planning Phase** - - DataBuild core analyzes partition dependencies - - Service checks for delegation opportunities - - Logs BUILD_REQUEST_PLANNING event - - Creates execution plan with job graph - -3. **Execution Phase** - - Service executes jobs directly using DataBuild core - - Jobs execute independently, logging events - - Service aggregates events to track progress - - Logs BUILD_REQUEST_EXECUTING event - -4. **Completion** - - All jobs complete successfully/with errors - - Service updates final build request status - - Logs BUILD_REQUEST_COMPLETED/FAILED event - -## Delegation Logic - -When a build request analyzes partitions, it checks for existing builds producing the same partitions: - -```rust -async fn check_delegation_opportunities( - &self, - required_partitions: &[PartitionRef] -) -> Result, Error> { - let mut decisions = Vec::new(); - - for partition in required_partitions { - if let Some(existing_build) = self.find_active_build_for_partition(partition).await? { - decisions.push(DelegationDecision::Delegate { - partition: partition.clone(), - to_build_request: existing_build.build_request_id, - }); - } else { - decisions.push(DelegationDecision::Build { - partition: partition.clone(), - }); - } - } - - Ok(decisions) -} -``` - -## Minimal Implementation Scope - -### Phase 1: Core Service -- HTTP API for build requests and status -- Integration with existing DataBuild core -- SQLite-based build event log -- Basic partition status tracking -- In-process job execution - -### Phase 2: Production Features -- PostgreSQL build event log -- Delegation logic for overlapping builds -- Comprehensive HTTP API - -### Phase 3: Advanced Features -- Monitoring and alerting integration - -## Key Design Questions - -1. **Job Execution Strategy**: The service executes jobs directly using DataBuild core for simplicity. - -2. **Concurrency Control**: How should the service handle concurrent builds of the same partition? Current design uses delegation, but alternatives include queuing or coordination protocols. - -3. **Graph Persistence**: The service will re-analyze job graphs on each request to maintain simplicity. - -4. **Multi-Graph Support**: A single service instance will handle one DataBuild graph for simplicity, with graph composition handling cross-graph dependencies. - -5. **Event Log Granularity**: How detailed should build events be? More detail enables better observability but increases storage requirements. - -## Risk Mitigation - -Given the project's emphasis on avoiding over-engineering: - -- **Start Simple**: Begin with SQLite, in-process execution, and basic APIs -- **Defer Complexity**: Postpone advanced features like optimization strategies -- **Focus on Core Value**: Prioritize build coordination over peripheral features -- **Maintain Boundaries**: Keep the service focused on orchestration, not data processing - -The service should enable the compelling DataBuild vision while remaining implementable within the project's scope constraints. \ No newline at end of file diff --git a/plans/03-service-interface-refactor.md b/plans/03-service-interface-refactor.md deleted file mode 100644 index e5261bc..0000000 --- a/plans/03-service-interface-refactor.md +++ /dev/null @@ -1,134 +0,0 @@ -# Service Interface Refactor Plan - -## Motivation - -The Build Graph Service HTTP API successfully implements core DataBuild functionality but has interface misalignments with the proto definitions that reduce type safety and limit functionality. Key issues: - -- **Type Safety Loss**: Analysis responses use `serde_json::Value` instead of structured `JobGraph` -- **Missing Manifest Access**: Build responses only return request IDs, no way to retrieve final manifests -- **Incomplete Dashboard Support**: Missing list/metrics endpoints from design specification -- **Inconsistent Error Handling**: Different patterns between gRPC and HTTP - -## Core Interface Alignment - -### Fix Analysis Response Type Safety -**Problem**: `AnalyzeResponse` uses generic JSON, losing compile-time guarantees -```rust -// Current: loses type safety -pub job_graph: serde_json::Value - -// Target: structured type -pub job_graph: JobGraph -``` - -**Critical Detail**: Ensure `JobGraph` implements `Serialize` properly and remove `serde_json::to_value()` conversion in handlers. - -### Add Manifest Retrieval -**Problem**: HTTP API returns build ID immediately but no way to get final manifests (unlike proto's synchronous `GraphBuildResponse`) - -**Solution**: Add `GET /api/v1/builds/:id/manifests` endpoint -- Store manifests in `BuildRequestState` during execution -- Update execute command to capture manifests from job graph results -- Enable both async (current) and manifest retrieval patterns - -## Missing Core Functionality - -### Dashboard Endpoints -The design specification requires but current API lacks: - -1. **Recent Builds**: `GET /api/v1/builds?limit=N` - - Enable dashboard functionality - - Support pagination for large build histories - -2. **Job Metrics**: `GET /api/v1/jobs/:label/metrics` - - Aggregate success rates, durations, failure patterns - - Essential for operational monitoring - -3. **Query Interface**: `POST /api/v1/query` - - Advanced querying beyond predefined endpoints - - SQL passthrough or structured query DSL - -**Implementation Strategy**: Extend event log with aggregation queries, add proper pagination support. - -## Error Response Standardization - -**Problem**: Current `ErrorResponse {error: String}` lacks structure and traceability - -**Target**: -```rust -pub struct ErrorResponse { - pub error: ErrorDetail, - pub request_id: Option, - pub timestamp: i64, -} -``` - -**Error Code Strategy**: -- `INVALID_PARTITION_REF` - Validation failures -- `JOB_LOOKUP_FAILED` - Graph analysis issues -- `ANALYSIS_FAILED` / `EXECUTION_FAILED` - Build process errors -- `BUILD_NOT_FOUND` - Resource not found -- `INTERNAL_ERROR` - Server errors - -**Critical**: Add request ID middleware for tracing across async operations. - -## Protocol Compatibility Strategy - -### Dual Protocol Support -**Narrative**: Some clients need gRPC's performance/streaming, others prefer HTTP's simplicity - -**Approach**: -- Implement `DataBuildService` from proto using `tonic` -- Share core logic between HTTP and gRPC handlers -- Add sync mode option: `POST /api/v1/builds?mode=sync` returns manifests directly - -**Critical Detail**: Avoid code duplication by extracting shared business logic into service layer. - -## Backward Compatibility - -**Philosophy**: Never break existing clients during improvements - -**Strategy**: -- New endpoints alongside existing ones -- API versioning (`/api/v2`) only for incompatible changes -- Feature flags for experimental functionality -- Deprecation notices with migration paths - -## Implementation Priority - -### Phase 1: Core Safety (Essential) -1. Fix `AnalyzeResponse` type safety -2. Add manifest retrieval endpoint -3. Standardize error responses with request tracing - -**Rationale**: These fix fundamental interface problems affecting all users. - -### Phase 2: Dashboard Features (High Value) -1. Recent builds list with pagination -2. Job metrics aggregation -3. Enhanced build state tracking - -**Rationale**: Enables operational monitoring and debugging workflows. - -### Phase 3: Advanced Features (Complete) -1. Query execution interface -2. gRPC service implementation -3. Sync/async mode support - -**Rationale**: Achieves full design specification compliance and protocol flexibility. - -## Critical Success Factors - -1. **Type Safety**: All responses use structured types, not generic JSON -2. **Interface Completeness**: Every proto service method has HTTP equivalent -3. **Operational Readiness**: Dashboard endpoints support real monitoring needs -4. **Zero Breakage**: Existing clients continue working throughout transition -5. **Performance**: No regression in build execution times - -## Risk Mitigation - -**Breaking Changes**: Use versioning strategy and maintain parallel endpoints during transitions - -**Performance Impact**: Implement caching for metrics aggregation, rate limiting for expensive queries - -**Complexity Growth**: Extract shared business logic to avoid duplication between HTTP/gRPC handlers \ No newline at end of file diff --git a/plans/04-end-to-end-tests-1.md b/plans/04-end-to-end-tests-1.md deleted file mode 100644 index bde303c..0000000 --- a/plans/04-end-to-end-tests-1.md +++ /dev/null @@ -1,195 +0,0 @@ -# End-to-End Tests (Phase 1) - Design Document - -## Overview - -This design document outlines the implementation of comprehensive end-to-end tests for DataBuild's core capabilities. The tests will validate that CLI and Service builds produce identical results and events, ensuring consistency across different build interfaces. - -## Objectives - -1. **Consistency Validation**: Verify that CLI and Service builds produce identical partition events and outputs -2. **Event Verification**: Ensure expected build events are generated for both build methods -3. **Isolation**: Use separate log databases to prevent test interference -4. **Integration**: Implement as `sh_test` targets to integrate with `bazel test //...` -5. **Performance**: Design tests to minimize bazel inefficiency and execution time - -## Test Scope - -### Target Examples -- **Basic Graph**: Simple random number generator with sum operations -- **Podcast Reviews**: Complex multi-stage data pipeline with dependencies - -### Test Scenarios - -#### Basic Graph Tests -1. **Single Partition Build** - - CLI: `bazel-bin/basic_graph.build pippin` - - Service: `POST /api/v1/builds {"partitions": ["pippin"]}` - - Verify: Same events, same output files - -2. **Multiple Partition Build** - - CLI: `bazel-bin/basic_graph.build pippin salem sadie` - - Service: `POST /api/v1/builds {"partitions": ["pippin", "salem", "sadie"]}` - - Verify: Same events, same output files - -3. **Sum Partition Build** - - CLI: `bazel-bin/basic_graph.build pippin_salem_sadie` - - Service: `POST /api/v1/builds {"partitions": ["pippin_salem_sadie"]}` - - Verify: Dependencies built, sum computed correctly - -#### Podcast Reviews Tests -1. **Simple Pipeline** - - CLI: `bazel-bin/podcast_reviews_graph.build "reviews/date=2020-01-01"` - - Service: `POST /api/v1/builds {"partitions": ["reviews/date=2020-01-01"]}` - - Verify: Raw reviews extracted correctly - -2. **Complex Pipeline** - - CLI: `bazel-bin/podcast_reviews_graph.build "daily_summaries/category=Technology/date=2020-01-01"` - - Service: `POST /api/v1/builds {"partitions": ["daily_summaries/category=Technology/date=2020-01-01"]}` - - Verify: Full pipeline execution with all intermediate partitions - -3. **Podcasts Metadata** - - CLI: `bazel-bin/podcast_reviews_graph.build "podcasts/all"` - - Service: `POST /api/v1/builds {"partitions": ["podcasts/all"]}` - - Verify: Metadata extraction and availability for downstream jobs - -## Test Architecture - -### Database Isolation -``` -test_data/ -> cli_test_db/ # CLI build event database -> service_test_db/ # Service build event database -> expected_outputs/ # Reference outputs for validation -``` - -### Test Structure -``` -tests/ -> end_to_end/ ->> basic_graph_test.sh ->> podcast_reviews_test.sh ->> lib/ ->>> test_utils.sh # Common test utilities ->>> db_utils.sh # Database comparison utilities ->>> service_utils.sh # Service management utilities ->> BUILD # Bazel test targets -``` - -### Bazel Integration -```python -# tests/end_to_end/BUILD -sh_test( - name = "basic_graph_e2e", - srcs = ["basic_graph_test.sh"], - data = [ - "//:basic_graph.build", - "//:basic_graph.service", - "//tests/end_to_end/lib:test_utils", - ], - env = { - "TEST_DB_DIR": "$(location test_data)", - }, - size = "medium", -) - -sh_test( - name = "podcast_reviews_e2e", - srcs = ["podcast_reviews_test.sh"], - data = [ - "//:podcast_reviews_graph.build", - "//:podcast_reviews_graph.service", - "//tests/end_to_end/lib:test_utils", - "//examples/podcast_reviews:data", - ], - env = { - "TEST_DB_DIR": "$(location test_data)", - }, - size = "large", -) -``` - -## Test Implementation Details - -### Test Flow -1. **Setup**: Create isolated test databases and clean output directories -2. **CLI Build**: Execute CLI build with test database configuration -3. **Service Build**: Start service with separate test database, execute build via HTTP -4. **Comparison**: Compare build events, output files, and partition status -5. **Cleanup**: Stop services and clean test artifacts - -### Event Validation -- **Event Count**: Same number of events for identical builds -- **Event Types**: Same sequence of build events (Started, Progress, Completed, etc.) -- **Event Metadata**: Same partition references, job names, and timestamps (within tolerance) -- **Event Ordering**: Proper dependency ordering maintained - -### Output Validation -- **File Existence**: Same output files created -- **File Content**: Identical content (accounting for any timestamp/randomness) -- **Partition Status**: Same final partition status via API - -### Service Management -```bash -# Start service with test database -start_test_service() { - local db_path="$1" - local port="$2" - - export BUILD_EVENT_LOG_DB="$db_path" - bazel-bin/basic_graph.service --port="$port" & - local service_pid=$! - - # Wait for service to be ready - wait_for_service "http://localhost:$port/health" - - echo "$service_pid" -} -``` - -## Test Efficiency - -### Basic Optimizations -- **Parallel Execution**: Tests run in parallel where possible -- **Resource Limits**: Set appropriate `size` attributes to prevent resource contention -- **Minimal Data**: Use minimal test datasets to reduce execution time - -### CI/CD Integration -- **Timeout Handling**: Reasonable timeouts for service startup/shutdown -- **Retry Logic**: Retry flaky network operations -- **Artifact Collection**: Collect logs and databases on test failure - -## Risk Mitigation - -### Test Flakiness -- **Deterministic Randomness**: Use fixed seeds for reproducible results -- **Port Management**: Dynamic port allocation to prevent conflicts -- **Database Locking**: Proper database isolation and cleanup -- **Cleanup Guarantees**: Ensure cleanup even on test failure - -## Implementation Plan - -### Phase 1: Basic Framework -1. Create test directory structure -2. Implement basic test utilities -3. Create simple Basic Graph test -4. Integrate with Bazel - -### Phase 2: Complete Implementation -1. Add Podcast Reviews tests -2. Implement comprehensive event validation -3. Create CI/CD integration -4. Ensure reliable test execution - -## Success Criteria - -1. **Consistency**: CLI and Service builds produce identical events and outputs -2. **Coverage**: All major build scenarios covered for both examples -3. **Reliability**: Tests pass consistently in CI/CD environment -4. **Integration**: Tests properly integrated with `bazel test //...` - -## Future Enhancements - -1. **Property-Based Testing**: Generate random partition combinations -2. **Performance Benchmarking**: Track build performance over time -3. **Chaos Testing**: Test resilience to failures and interruptions -4. **Load Testing**: Test service under concurrent build requests \ No newline at end of file diff --git a/plans/05-roadmap.md b/plans/05-roadmap.md deleted file mode 100644 index afecf45..0000000 --- a/plans/05-roadmap.md +++ /dev/null @@ -1,96 +0,0 @@ - -# Roadmap - -Please read the [core concepts](../core-concepts.md) and [manifesto](../manifesto.md) to understand project motivation. This roadmap describes the different phases of execution, and composition of the system/concepts at a high level. - -```mermaid -flowchart - core -- emits --> partition_event_log - partition_event_log -- read by --> build_graph_service - build_graph_service -- invokes --> core - build_graph_service -- informs --> build_graph_dashboard -``` - -# Stages - -## Foundation: data build graph definition - -Status: DONE - -This phase establishes the core capability of describing a flexible declarative partition-aware build system. This graph is the concept upon which other concepts can be attached, e.g. making the graph deployable for remote builds, etc. - -## Build Event Log - -[**Design Doc**](./build-event-log.md) - -Status: Done - -This phase establishes the build event log, which allows for tracking of partition status, coordination of build requests (e.g. avoiding duplicate work, contention, etc), and eventual visualization of build requests and partition liveness/staleness status. It is comprised of a schema as well as an access layer allowing it to be written and read by different system components. - -## Build Graph Service - -[**Design Doc**](./build-graph-service.md) - -Status: Done - -Together with the Build Event Log, this enables deployment of a persistent build service that builds data on request without needing to rebuild existing non-stale partitions. It also serves build request status and progress, and surfaces partition liveness / freshness endpoints. Key questions it answers: - -- What build requests have there been? -- What is the status of a specific build request? -- What would the build graph look like to materialize a specific partition? -- What build events have happened in this time frame? -- Is this partition live and not stale? -- What build events are relevant/related to this partition? (e.g. why doesn't this exist yet, etc) -- Build this partition, returning a build request ID. - -## End-to-End Tests (Phase 1) - -[**Design Doc**](./end-to-end-tests-1.md) - -Status: Done - -Uses the [basic graph](../examples/basic_graph/README.md) and [podcast reviews](../examples/podcast_reviews/README.md) examples to implement end-to-end testing of the databuild capabilities. - -- Build the same partitions via CLI and service, verify that we get the same events out, and that we get expected events in each -- They should have separate log databases -- Should be implemented as a sh_test or similar so that `bazel test //...` at each workplace root triggers -- Is there any risk of bazel inefficiency here / slow tests? How would we mitigate? - -## Build Graph Dashboard - -[**Design Doc**](./build-graph-dashboard.md) - -Status: In Progress - -A UI that relies on the Build Graph Service, showing things like build activity and partition liveness information. There are a few key pages: - -- Partition build request status page: shows the status of all work involved in building a partition, including upstream partition build actions and delegated (active and handled by another build request) build requests. Engineers watch this to see what's happening, it tails the build event log. -- Partition status page: Is the partition live? Stale? What past builds produced it? (with links) Also, include a button for building the partition (with option for force if it already exists and is non-stale). -- Job list page: lists all jobs included in the graph, along with aggregate success metrics and timing information. -- Job history page: for a given job, list recent job runs and their success and timing information, along with any interesting metadata. -- Job run page: All the execution information for a specific job run, including env vars, parameters, result, logs, etc. -- Analyze page: runs the graph analyze verb, returning the plan that would produce the requested partitions -- Raw SQL page: enable debugging by allowing submission of sql queries to be executed against build event log + views - -# Risks - -## Over-Engineering / Scoping - -The goal of this project is to produce a powerful, inspiring view on how declarative data builds can work, not to produce a production system. We take extra steps to achieve very high leverage and differentiated capabilities, but not to enable table stakes or obvious features that aren't required for the former. - -## Complexity - -This project already has a lot of irreducible complexity, and adding optional complexity is a likely failure mode. - -# Questions - -## Should graphs be services? -A tempting way to organize different graphs is to have them be literal services, and represent cross-graph dependency builds as requests to upstream graph services, etc. Graphs as services is attractive as service boundaries generally match org boundaries, etc, and this matches that pattern. It also means that we are creating a distributed system - though perhaps that's the implicit choice in using more than 1 graph anyway? - -## Do we need first-class trigger concepts? - -In theory, every trigger is just a simple script triggered by cron or request, that maps the input data to a set of desired partitions, likely with some intermediate step to look at a catalog and source candidate partition col values, etc. This is not very inspiring, as it doesn't sound differentiating in value, and theoretically we should punt on or simply not implement the low marginal value features. Is this truly valuable? It would need to bring a new level of convenience/simplicity/ease of deployment, or a new capability based on "expected partitions" to be justified. For example, we might be able to predict when we think partitions will land next, or what they might do in the future, that could be useful operationally. But those hypotheticals may also be best left to extensions or some "plugin" concept. - -## Will we need a dataset concept? - -Theoretically, DataBuild doesn't need the dataset concept to fully resolve build graphs, and produce desired partitions. Practically, partitions will be instances of different classes, and humans will use those classes as organizing concepts, e.g. in asking about recent partition builds of a kind. To what extent do we need to implement a dataset concept? We could implement them as views, e.g. allowing the specification of partition patterns, tagging, etc. diff --git a/plans/06-build-graph-dashboard.md b/plans/06-build-graph-dashboard.md deleted file mode 100644 index 0f379ba..0000000 --- a/plans/06-build-graph-dashboard.md +++ /dev/null @@ -1,215 +0,0 @@ - -# Build Graph Dashboard - -**Parent:** [DataBuild Roadmap](./roadmap.md) - -**Status:** In Progress - -**Implementation:** Broken into 9 incremental chunks (see [Chunk Plans](#chunk-implementation-plan)) - -- Simplicity is absolutely critical here -- Use mithril -- Use typescript -- Rely on client generated from OpenAPI interface definitions - -## Minimal Design - -### Architecture - -Single-page application consuming Build Graph Service HTTP endpoints with real-time updates via polling. - -``` -┌─────────────────┐ HTTP API ┌─────────────────┐ -│ Dashboard SPA │ ◄────────────► │ Build Graph │ -│ (Mithril/TS) │ │ Service │ -└─────────────────┘ └─────────────────┘ - │ - │ - ┌─────────────────┐ - │ Build Event Log │ - │ (SQLite/PG) │ - └─────────────────┘ -``` - -### Core Pages - -#### 1. Build Request Status (`/builds/{id}`) -Primary operational page showing real-time build progress. - -**UI Elements:** -- Build request header (ID, status, timestamp) -- Job/Partition status directed graph (with status requested → scheduled → building → available/failed) -- Per-job execution timeline with logs from build events -- Delegation indicators (when partitions delegated to other builds) - -**Key Features:** -- Auto-refresh every 2 seconds when tab is active and build is running -- Expandable job details with execution logs from build event log -- Visual progress indication via graph - -#### 2. Partitions List Page (`/partitions`) -Lists recently built partitions (searchable), with a "Build" button to trigger a new partition build (or go to existing). - -#### 3. Partition Status (`/partitions/{base64_ref}`) -Shows partition lifecycle and build history. - -**UI Elements:** -- Partition info (ref, current status, last updated) -- Build history table (build requests that produced this partition) -- "Build Now" button with force option -- Related partitions (upstream/downstream dependencies) - -**URL Handling:** -- Partition refs base64-encoded in URLs to avoid parsing ambiguity -- Client-side encode/decode for partition references - -#### 4. Recent Activity (`/`) -Dashboard home showing system activity. - -**UI Elements:** -- Recent build requests (fetched from service) -- Active builds count -- Recent partition builds -- System health indicators - -**Stateless Design:** -- All data fetched from Build Graph Service on page load -- No client-side caching or state persistence - -#### 5. Jobs List Page (`/jobs`) -Lists jobs in graph with high level metedata per job and a link to each. - -#### 6. Job Metrics (`/jobs/{label}`) -Job performance and reliability metrics. (with job label base64 encoded) - -**UI Elements:** -- Success rate chart (data from build event log via service) -- Average duration trends -- Recent runs table (with requested partitions) - -#### 5. Graph Analysis (`/analyze`) -Interactive build graph visualization. - -**UI Elements:** -- Partition input form -- Generated job graph (mermaid.js rendering - simple visualization, rendered in the client) -- Execution plan table - -### Technical Implementation - -#### State Management -```typescript -// Minimal stateless approach - only UI state -interface AppState { - currentPage: string; - polling: { - active: boolean; - interval: number; - tabVisible: boolean; // Page Visibility API - }; -} -``` - -#### API Client -```typescript -class BuildGraphClient { - async getBuildStatus(id: string): Promise; - async getPartitionStatus(ref: string): Promise; - async submitBuild(partitions: string[]): Promise; - async getJobMetrics(label: string): Promise; - async analyzeBuild(partitions: string[]): Promise; - async getRecentActivity(): Promise; -} - -// Partition reference utilities -function encodePartitionRef(ref: string): string { - return btoa(ref).replace(/\+/g, '-').replace(/\//g, '_').replace(/=/g, ''); -} - -function decodePartitionRef(encoded: string): string { - return atob(encoded.replace(/-/g, '+').replace(/_/g, '/')); -} -``` - -#### Routing -```typescript -const routes: RouteDef[] = [ - { path: '/', component: RecentActivity }, - { path: '/builds/:id', component: BuildStatus }, - { path: '/partitions/:base64_ref', component: PartitionStatus }, - { path: '/jobs/:label', component: JobMetrics }, - { path: '/analyze', component: GraphAnalysis } -]; -``` - -### Styling Approach - -Minimal CSS with utility classes: -- CSS Grid for layouts -- Tailwind + daisyui -- Consistent spacing scale (4px base unit) -- Monospace font for partition refs/IDs -- Color coding for status indicators - -### Real-time Updates - -Polling-based updates with Page Visibility API integration: -- 2s interval when tab is active and build is running -- 10s interval when tab is active and build is completed -- Polling stops when tab is not visible -- Manual refresh button always available -- Uses `document.visibilityState` to detect tab focus - -### Bundle Size Targets - -- Initial bundle: < 50KB gzipped -- Mithril: ~10KB -- Custom code: ~20KB -- CSS: ~5KB -- OpenAPI client: ~15KB - -### Data Flow - -All data flows from Build Graph Service: -1. **Job Logs**: Stored in build event log, retrieved via service API -2. **Partition Status**: Computed from build events by service -3. **Build History**: Maintained in build event log -4. **Metrics**: Aggregated from build events by service -5. **No Client State**: Dashboard fetches fresh data on each page load - -### Development Workflow - -```bash -# Development server -npm run dev - -# Build for production -npm run build - -# Type checking -npm run typecheck - -# Bundled with build graph service -./scripts/build_dashboard -``` - -## Chunk Implementation Plan - -This dashboard implementation is broken into 9 incremental, testable chunks: - -### Phase 1: Foundation & Infrastructure -- **[Chunk 1: TypeScript Client Generation](./webapp_v1/chunk-1-client-generation.md)** - Generate typed client from OpenAPI spec -- **[Chunk 2: Hello World App](./webapp_v1/chunk-2-hello-world-app.md)** - Bazel-built TypeScript + Mithril app -- **[Chunk 3: Routing Framework](./webapp_v1/chunk-3-routing-framework.md)** - Multi-page routing and layout - -### Phase 2: Core Dashboard Pages -- **[Chunk 4: Recent Activity](./webapp_v1/chunk-4-recent-activity.md)** - Dashboard home page -- **[Chunk 5: Build Status](./webapp_v1/chunk-5-build-status.md)** - Real-time build monitoring -- **[Chunk 6: Partition Pages](./webapp_v1/chunk-6-partition-pages.md)** - Partition status and history -- **[Chunk 7: Jobs Pages](./webapp_v1/chunk-7-jobs-pages.md)** - Job metrics and monitoring - -### Phase 3: Advanced Features -- **[Chunk 8: Graph Analysis](./webapp_v1/chunk-8-graph-analysis.md)** - Interactive graph visualization -- **[Chunk 9: Polish](./webapp_v1/chunk-9-polish.md)** - Final styling and optimization - -Each chunk delivers working functionality that can be tested independently while building toward the complete dashboard vision. diff --git a/plans/07-cli-service-build-unification.md b/plans/07-cli-service-build-unification.md deleted file mode 100644 index 6def826..0000000 --- a/plans/07-cli-service-build-unification.md +++ /dev/null @@ -1,336 +0,0 @@ -# CLI-Service Build Unification - -## Problem Statement - -The current DataBuild architecture has significant duplication and architectural inconsistencies between CLI and Service build orchestration: - -### Current Duplication Issues - -1. **Event Emission Logic**: Service HTTP handlers and CLI binaries contain duplicate orchestration event emission code -2. **Mode Detection**: Analysis and execution binaries (`analyze.rs` and `execute.rs`) use `DATABUILD_CLI_MODE` environment variable to conditionally emit different events -3. **Test Complexity**: End-to-end tests must account for different event patterns between CLI and Service for identical logical operations - -### Specific Code References - -- **CLI Mode Detection in Analysis**: `databuild/graph/analyze.rs:555-587` - Emits "Build request received" and "Starting build planning" events only in CLI mode -- **CLI Mode Detection in Execution**: `databuild/graph/execute.rs:413-428` and `execute.rs:753-779` - Emits execution start/completion events only in CLI mode -- **Service Orchestration**: `databuild/service/handlers.rs` - HTTP handlers emit orchestration events independently - -### Architectural Problems - -1. **Single Responsibility Violation**: Analysis and execution binaries serve dual purposes as both shared library functions and CLI entry points -2. **Consistency Risk**: Separate implementations of orchestration logic create risk of drift between CLI and Service behavior -3. **Maintenance Burden**: Changes to orchestration requirements must be implemented in multiple places - -## Current Architecture Analysis - -### Service Flow -``` -HTTP Request → Service Handler → Orchestration Events → Analysis → Execution → Completion Events -``` - -The Service has a natural coordination point in the HTTP handler that manages the entire build lifecycle and emits appropriate orchestration events. - -### CLI Flow -``` -Shell Script → Analysis Binary (CLI mode) → Execution Binary (CLI mode) → Orchestration Events -``` - -The CLI lacks a natural coordination point, forcing the shared analysis/execution binaries to detect CLI mode and emit orchestration events themselves. - -### Event Flow Comparison - -**Service Events** (coordinated): -1. Build request received -2. Starting build planning -3. Analysis events (partitions scheduled, jobs configured) -4. Starting build execution -5. Execution events (jobs scheduled/completed, partitions available) -6. Build request completed - -**CLI Events** (mode-dependent): -- Same events as Service, but emitted conditionally based on `DATABUILD_CLI_MODE` -- Creates awkward coupling between orchestration concerns and domain logic - -## Proposed Shared Library Design - -### Core Orchestrator API - -```rust -pub struct BuildOrchestrator { - event_log: Box, - build_request_id: String, - requested_partitions: Vec, -} - -impl BuildOrchestrator { - pub fn new( - event_log: Box, - build_request_id: String, - requested_partitions: Vec - ) -> Self; - - // Lifecycle events - pub async fn start_build(&self) -> Result<(), Error>; - pub async fn start_planning(&self) -> Result<(), Error>; - pub async fn start_execution(&self) -> Result<(), Error>; - pub async fn complete_build(&self, result: BuildResult) -> Result<(), Error>; - - // Domain events (pass-through to existing logic) - pub async fn emit_partition_scheduled(&self, partition: &PartitionRef) -> Result<(), Error>; - pub async fn emit_job_scheduled(&self, job: &JobEvent) -> Result<(), Error>; - pub async fn emit_job_completed(&self, job: &JobEvent) -> Result<(), Error>; - pub async fn emit_partition_available(&self, partition: &PartitionEvent) -> Result<(), Error>; - pub async fn emit_delegation(&self, partition: &str, target_build: &str, message: &str) -> Result<(), Error>; -} - -pub enum BuildResult { - Success { jobs_completed: usize }, - Failed { jobs_completed: usize, jobs_failed: usize }, - FailFast { trigger_job: String }, -} -``` - -### Event Emission Strategy - -The orchestrator will emit standardized events at specific lifecycle points: - -1. **Build Lifecycle Events**: High-level orchestration (received, planning, executing, completed) -2. **Domain Events**: Pass-through wrapper for existing analysis/execution events -3. **Consistent Timing**: All events emitted through orchestrator ensure proper sequencing - -### Error Handling - -```rust -#[derive(Debug, thiserror::Error)] -pub enum OrchestrationError { - #[error("Event log error: {0}")] - EventLog(#[from] databuild::event_log::Error), - - #[error("Build coordination error: {0}")] - Coordination(String), - - #[error("Invalid build state transition: {current} -> {requested}")] - InvalidStateTransition { current: String, requested: String }, -} -``` - -### Testing Interface - -```rust -#[cfg(test)] -impl BuildOrchestrator { - pub fn with_mock_event_log(build_request_id: String) -> (Self, MockEventLog); - pub fn emitted_events(&self) -> &[BuildEvent]; -} -``` - -## Implementation Phases - -### Phase 1: Create Shared Orchestration Library - -**Files to Create**: -- `databuild/orchestration/mod.rs` - Core orchestrator implementation -- `databuild/orchestration/events.rs` - Event type definitions and helpers -- `databuild/orchestration/error.rs` - Error types -- `databuild/orchestration/tests.rs` - Unit tests for orchestrator - -**Key Implementation Points**: -- Extract common event emission patterns from Service and CLI -- Ensure orchestrator is async-compatible with existing event log interface -- Design for testability with dependency injection - -### Phase 2: Refactor Service to Use Orchestrator - -**Files to Modify**: -- `databuild/service/handlers.rs` - Replace direct event emission with orchestrator calls -- `databuild/service/mod.rs` - Integration with orchestrator lifecycle - -**Implementation**: -- Replace existing event emission code directly with orchestrator calls -- Ensure proper error handling and async integration - -### Phase 3: Create New CLI Wrapper - -**Files to Create**: -- `databuild/cli/main.rs` - New CLI entry point using orchestrator -- `databuild/cli/error.rs` - CLI-specific error handling - -**Implementation**: -```rust -// databuild/cli/main.rs -#[tokio::main] -async fn main() -> Result<(), CliError> { - let args = parse_cli_args(); - let event_log = create_build_event_log(&args.event_log_uri).await?; - let build_request_id = args.build_request_id.unwrap_or_else(|| Uuid::new_v4().to_string()); - - let orchestrator = BuildOrchestrator::new(event_log, build_request_id, args.partitions.clone()); - - // Emit orchestration events - orchestrator.start_build().await?; - orchestrator.start_planning().await?; - - // Run analysis - let graph = run_analysis(&args.partitions, &orchestrator).await?; - - orchestrator.start_execution().await?; - - // Run execution - let result = run_execution(graph, &orchestrator).await?; - - orchestrator.complete_build(result).await?; - - Ok(()) -} -``` - -### Phase 4: Remove CLI Mode Detection - -**Files to Modify**: -- `databuild/graph/analyze.rs` - Remove lines 555-587 (CLI mode orchestration events) -- `databuild/graph/execute.rs` - Remove lines 413-428 and 753-779 (CLI mode orchestration events) - -**Verification**: -- Analysis and execution binaries become pure domain functions -- No more environment variable mode detection -- All orchestration handled by wrapper/service - -### Phase 5: Update Bazel Rules - -**Files to Modify**: -- `databuild/rules.bzl` - Update `_databuild_graph_build_impl` to use new CLI wrapper instead of direct analysis/execution pipeline - -**Before**: -```bash -$(rlocation _main/{analyze_path}) $@ | $(rlocation _main/{exec_path}) -``` - -**After**: -```bash -$(rlocation _main/{cli_wrapper_path}) $@ -``` - -### Phase 6: Update Tests - -**Files to Modify**: -- `tests/end_to_end/simple_test.sh` - Remove separate CLI/Service event validation -- `tests/end_to_end/podcast_simple_test.sh` - Same simplification -- All tests expect identical event patterns from CLI and Service - -## Migration Strategy - -### Direct Replacement Approach - -Since we don't need backwards compatibility, we can implement a direct replacement: -- Replace existing CLI mode detection immediately -- Refactor Service handlers to use orchestrator directly -- Update Bazel rules to use new CLI wrapper -- Update tests to expect unified behavior - -### Testing Strategy - -1. **Unit Tests**: Comprehensive orchestrator testing with mock event logs -2. **Integration Tests**: Existing end-to-end tests pass with unified implementation -3. **Event Verification**: Ensure orchestrator produces expected events for all scenarios - -## File Changes Required - -### New Files -- `databuild/orchestration/mod.rs` - 200+ lines, core orchestrator -- `databuild/orchestration/events.rs` - 100+ lines, event helpers -- `databuild/orchestration/error.rs` - 50+ lines, error types -- `databuild/orchestration/tests.rs` - 300+ lines, comprehensive tests -- `databuild/cli/main.rs` - 150+ lines, CLI wrapper -- `databuild/cli/error.rs` - 50+ lines, CLI error handling - -### Modified Files -- `databuild/service/handlers.rs` - Replace ~50 lines of event emission with orchestrator calls -- `databuild/graph/analyze.rs` - Remove ~30 lines of CLI mode detection -- `databuild/graph/execute.rs` - Remove ~60 lines of CLI mode detection -- `databuild/rules.bzl` - Update ~10 lines for new CLI wrapper -- `tests/end_to_end/simple_test.sh` - Simplify ~20 lines of event validation -- `tests/end_to_end/podcast_simple_test.sh` - Same simplification - -### Build Configuration -- Update `databuild/BUILD.bazel` to include orchestration module -- Update `databuild/cli/BUILD.bazel` for new CLI binary -- Modify example graphs to use new CLI wrapper - -## Benefits & Risk Analysis - -### Benefits - -1. **Maintainability**: Single source of truth for orchestration logic eliminates duplication -2. **Consistency**: Guaranteed identical events across CLI and Service interfaces -3. **Extensibility**: Foundation for SDK, additional CLI commands, monitoring integration -4. **Testing**: Simplified test expectations, better unit test coverage of orchestration -5. **Architecture**: Clean separation between orchestration and domain logic - -### Implementation Risks - -1. **Regression**: Changes to critical path could introduce subtle bugs -2. **Performance**: Additional abstraction layer could impact latency -3. **Integration**: Bazel build changes could break example workflows - -### Risk Mitigation - -1. **Phased Implementation**: Implement in stages with verification at each step -2. **Comprehensive Testing**: Thorough unit and integration testing -3. **Event Verification**: Ensure identical event patterns to current behavior - -## Future Architecture Extensions - -### SDK Integration - -The unified orchestrator provides a natural integration point for external SDKs: - -```rust -// Future SDK usage -let databuild_client = DatabuildClient::new(endpoint); -let orchestrator = databuild_client.create_orchestrator(partitions).await?; - -orchestrator.start_build().await?; -let result = databuild_client.execute_build(orchestrator).await?; -``` - -### Additional CLI Commands - -Orchestrator enables consistent event emission across CLI commands: - -```bash -databuild validate --partitions "data/users" --dry-run -databuild status --build-id "abc123" -databuild retry --build-id "abc123" --failed-jobs-only -``` - -### Monitoring Integration - -Standardized events provide foundation for observability: - -```rust -impl BuildOrchestrator { - pub fn with_tracing_span(&self, span: tracing::Span) -> Self; - pub fn emit_otel_metrics(&self) -> Result<(), Error>; -} -``` - -### CI/CD Pipeline Integration - -Orchestrator events enable standardized build reporting across environments: - -```yaml -# GitHub Actions integration -- name: DataBuild - uses: databuild/github-action@v1 - with: - partitions: "data/daily_reports" - event-log: "${{ env.DATABUILD_EVENT_LOG }}" - # Automatic event collection for build status reporting -``` - -## Conclusion - -This unification addresses fundamental architectural inconsistencies while providing a foundation for future extensibility. The phased implementation approach minimizes risk while ensuring backward compatibility throughout the transition. - -The shared orchestrator eliminates the current awkward CLI mode detection pattern and establishes DataBuild as a platform that can support multiple interfaces with guaranteed consistency. \ No newline at end of file diff --git a/plans/08-integration-test-v2.md b/plans/08-integration-test-v2.md deleted file mode 100644 index 8fe6754..0000000 --- a/plans/08-integration-test-v2.md +++ /dev/null @@ -1,148 +0,0 @@ -# Integration Test Plan for DataBuild Delegation System - -## Overview -Create comprehensive integration tests for the basic_graph example that trigger delegation scenarios and verify Build Event Log (BEL) entries to ensure the delegation system works correctly and provides proper traceability. - -## Current Test Infrastructure Analysis - -**Existing Pattern**: The current test suite in `/tests/end_to_end/` follows a mature pattern: -- **Common utilities**: `lib/test_utils.sh`, `lib/db_utils.sh`, `lib/service_utils.sh` -- **Test isolation**: Separate SQLite databases per test to prevent interference -- **CLI vs Service validation**: Tests ensure both paths produce identical events -- **Event analysis**: Detailed breakdown of job/partition/request event counts -- **Robust service management**: Start/stop with proper cleanup and health checks - -**Target System**: basic_graph example with two jobs: -- `generate_number_job`: Produces partitions like `generated_number/pippin` -- `sum_job`: Depends on multiple generated numbers, produces `sum/pippin_salem_sadie` - -## New Test Implementation Plan - -### 1. Create Delegation-Specific Test: `basic_graph_delegation_test.sh` - -**Test Scenarios**: -- **Historical Delegation**: Run same partition twice, verify second run delegates to first -- **Multi-partition Jobs**: Test delegation behavior when jobs produce multiple partitions -- **Mixed Availability**: Test jobs where some target partitions exist, others don't -- **BEL Verification**: Validate specific delegation events and job status transitions - -**Core Test Cases**: - -1. **Single Partition Historical Delegation** - - Build `generated_number/pippin` (first run - normal execution) - - Build `generated_number/pippin` again (second run - should delegate) - - Verify BEL contains: `DelegationEvent` + `JOB_SKIPPED` for second run - -2. **Multi-Partition Delegation Scenarios** - - Build `generated_number/pippin`, `generated_number/salem`, `generated_number/sadie` - - Build `sum/pippin_salem_sadie` (should delegate to existing partitions) - - Verify delegation events point to correct source build requests - -3. **Partial Delegation Scenario** - - Build `generated_number/pippin`, `generated_number/salem` - - Request `generated_number/pippin`, `generated_number/salem`, `generated_number/sadie` - - Verify: delegations for pippin/salem, normal execution for sadie - -4. **Cross-Run Delegation Chain** - - Run 1: Build `generated_number/pippin` - - Run 2: Build `generated_number/salem` - - Run 3: Build `sum/pippin_salem_sadie` (requires sadie, should delegate pippin/salem) - - Verify delegation traceability to correct source builds - -### 2. BEL Validation Utilities - -**New functions in `lib/db_utils.sh`**: -- `get_delegation_events()`: Extract delegation events for specific partition -- `verify_job_skipped()`: Check job was properly skipped with delegation -- `get_delegation_source_build()`: Validate delegation points to correct build request -- `compare_delegation_behavior()`: Compare CLI vs Service delegation consistency - -**Event Validation Logic**: -```bash -# For historical delegation, verify event sequence: -# 1. DelegationEvent(partition_ref, delegated_to_build_request_id, message) -# 2. JobEvent(status=JOB_SKIPPED, message="Job skipped - all target partitions already available") -# 3. No JobEvent(JOB_SCHEDULED/RUNNING/COMPLETED) for delegated job - -# For successful delegation: -# - Success rate should be 100% (JOB_SKIPPED counts as success) -# - Partition should show as available without re-execution -# - Build request should complete successfully -``` - -### 3. Performance and Reliability Validation - -**Delegation Efficiency Tests**: -- Time comparison: first run vs delegated run (should be significantly faster) -- Resource usage: ensure delegated runs don't spawn job processes -- Concurrency: multiple builds requesting same partition simultaneously - -**Error Scenarios**: -- Source build request failure handling -- Corrupted delegation data -- Stale partition detection - -### 4. Integration with Existing Test Suite - -**File Structure**: -``` -tests/end_to_end/ -├── basic_graph_delegation_test.sh # New delegation-specific tests -├── basic_graph_test.sh # Existing functionality tests (enhanced) -├── lib/ -│ ├── delegation_utils.sh # New delegation validation utilities -│ ├── db_utils.sh # Enhanced with delegation functions -│ └── test_utils.sh # Existing utilities -└── BUILD # Updated to include new test -``` - -**Bazel Integration**: -- Add `basic_graph_delegation_test` as new `sh_test` target -- Include in `run_e2e_tests.sh` execution -- Tag with `["delegation", "e2e"]` for selective running - -### 5. CLI vs Service Delegation Consistency - -**Validation Approach**: -- Run identical delegation scenarios through both CLI and Service -- Compare BEL entries for identical delegation behavior -- Ensure both paths produce same success rates and event counts -- Validate API responses include delegation information - -### 6. Documentation and Debugging Support - -**Test Output Enhancement**: -- Clear delegation event logging during test execution -- Detailed failure diagnostics showing expected vs actual delegation behavior -- BEL dump utilities for debugging delegation issues -- Performance metrics (execution time, event counts) - -## Expected Outcomes - -**Success Criteria**: -1. **100% Success Rate**: Delegated builds show 100% success rate in dashboard -2. **Event Consistency**: CLI and Service produce identical delegation events -3. **Traceability**: All delegations link to correct source build requests -4. **Performance**: Delegated runs complete in <5 seconds vs 30+ seconds for full execution -5. **Multi-partition Correctness**: Complex jobs with mixed partition availability handled properly - -**Regression Prevention**: -- Automated validation prevents delegation system regressions -- Comprehensive BEL verification ensures audit trail integrity -- Performance benchmarks detect delegation efficiency degradation - -## Implementation Priority - -1. **High**: Core delegation test cases (historical, multi-partition) -2. **High**: BEL validation utilities and event verification -3. **Medium**: Performance benchmarking and efficiency validation -4. **Medium**: Error scenario testing and edge cases -5. **Low**: Advanced concurrency and stress testing - -This plan provides a comprehensive testing strategy that validates both the functional correctness and performance benefits of the delegation system while ensuring long-term reliability and debuggability. - -## Implementation Notes - -This plan was created following the user's request to improve system reliability and testability for the DataBuild delegation system. The focus is on the basic_graph example because it provides a simpler, more predictable test environment compared to the podcast_reviews example, while still covering all the essential delegation scenarios. - -The delegation system currently shows some issues (67% success rate instead of 100%) that these tests should help identify and prevent regression of once fixed. The comprehensive BEL validation will ensure that the delegation events provide proper audit trails and traceability as intended by the system design. \ No newline at end of file diff --git a/plans/09-partition-leasing.md b/plans/09-partition-leasing.md deleted file mode 100644 index 4261648..0000000 --- a/plans/09-partition-leasing.md +++ /dev/null @@ -1,4 +0,0 @@ - -# Partition Leasing - -TODO - we need to implement a partition leasing mechanism, like resource leasing from pointscrape. diff --git a/plans/10-shared-core.md b/plans/10-shared-core.md deleted file mode 100644 index 17c5daa..0000000 --- a/plans/10-shared-core.md +++ /dev/null @@ -1,74 +0,0 @@ - -# Shared Core Refactor - -We want to refactor the codebase to move shared functionality into core components that are shared between interfaces (e.g. CLI and service), and which can be tested independently. The capabilities are listed in [`databuild/README.md`](../databuild/README.md#graph), and each first level bullet represents a subcommand, with sub-bullets representing sub-sub-commands, e.g. you can run `bazel-bin/mygraph.cli builds cancel c38a442d-fad3-4f74-ae3f-062e5377fe52`. This should match service capabilities, e.g. `get -XPOST localhost:8080/builds/c38a442d-fad3-4f74-ae3f-062e5377fe52/cancel`. - -These core capabilities should be factored into explicit read vs write capabilities. On the write side, it should verify the action is relevant (e.g. you can't cancel a nonexistent build, but you can request the build of an existing partition, it will just delegate), and then write the appropriate event to the BEL. Simple. On the read side, the different capabilities should be implemented by different "repositories", a'la the repository pattern. We can then handle any variation in backing database internal to the repositories (since most SQL will be valid for both SQLite, postgres, and delta). - -# Plan -We should take a phased approach to executing this plan. After implementing the core functionality and unit tests for each phase, we should pause and write down any potential refactoring that would benefit the system before moving onto the next phase. - -## Phase 1 - Implement Common Event Write Component -Goal: create a single interface for writing events to the build event log. -- Should include all existing "write" functionality, like requesting a new build, etc. -- Migrate CLI to use new write component -- Migrate service to use new write component - -## Phase 2 - Implement `MockBuildEventLog` -Goal: create a common testing tool that allows easy specification of testing conditions (e.g. BEL contents/events) to test system/graph behavior. -- Should use an in-memory sqlite database to ensure tests can be run in parallel -- Should make it very easy to specify test data (e.g. event constructors with random defaults that can be overwritten) -- Should include a trivial unit test that writes a valid event and verifies its there via real code paths. -- Design notes: shouldn't rewrite event write or repository read code; should focus on making test cases easy to describe, so then assertions can be made on repository-based queries. -- Event write and repositories should be pluggable, allowing for the MockBuildEventLog to be provided per test in a way consistent with how BEL backing databases are specified normally. - -## Phase 3 - Implement `partitions` Repository -- Create a new build event log event for partition invalidation (with reason field) -- Implement a repository in `databuild/repositories/partitions/` that queries the build event log for the following capabilities - - list - - show - - invalidate -- Add `partitions` subcommand to CLI -- Migrate or add partition capabilities to service. - -## Phase 4 - Implement `jobs` Repository -- Implement a repository in `databuild/repositories/jobs/` that queries the BEL for the following capabilities - - list - - show -- Add `jobs` subcommand to CLI -- Migrate or add jobs capabilities to service. - -## Phase 5 - Implement `tasks` Repository -- Implement a "task cancel" job BEL event (with reason field) -- Implement a repository in `databuild/repositories/tasks/` that queries the BEL for the following capabilities - - list - - show -- And add to the common write component a `cancel_task` method to implement this -- Add `tasks` subcommand to CLI -- Add service endpoint for canceling tasks -- (TODO - later we will need to implement a way to operate on the dashboard - lets do that in a later project) - -## Phase 6 - Implement `builds` Repository -- Implement a "build cancel" BEL event (with reason field) -- Implement a repository in `databuild/repositories/builds/` that queries the BEL for the following capabilities - - list - - show - - cancel -- Add to the common write component a `cancel_build` method -- Add `builds` subcommand to the CLI -- Migrate service endpoints to use the new shared impl -- Add service endpoint implementing build cancel -- Add a cancel button to the build status page (for in-progress builds) - -## Phase 7 - Testing -- Review prior work, ensure that tests have been written to cover the 90% of most important functionality for each component. -- Ensure all tests pass, and fix those that don't. -- Run e2e tests. - -## Phase 8 - Reflection & Next Steps -- Reflect on the work done and look for opportunities for improvement and refactoring -- Call out any "buried bodies" very explicitly (things which need to be revisited for the implementation to be complete) - -# Note - -Do not take shortcuts. This we are building for the long term. If you have any questions, please pause and ask. diff --git a/plans/11-web-app-compile-time-correctness.md b/plans/11-web-app-compile-time-correctness.md deleted file mode 100644 index 35c98cf..0000000 --- a/plans/11-web-app-compile-time-correctness.md +++ /dev/null @@ -1,510 +0,0 @@ -# Web App Compile-Time Correctness Plan - -## Problem Statement - -The DataBuild web application currently has a type safety blindspot where backend protobuf changes can cause runtime failures in the frontend without any compile-time warnings. While we achieved end-to-end type generation (Proto → Rust → OpenAPI → TypeScript), inconsistent data transformation patterns and loose TypeScript configuration allow type mismatches to slip through. - -**Specific observed failures:** -- `status.toLowerCase()` crashes when status objects are passed instead of strings -- `status?.status` accesses non-existent properties on protobuf response objects -- Partitions page fails silently due to unhandled nullability -- Inconsistent data shapes flowing through components - -## Root Cause Analysis - -1. **Mixed Data Contracts**: Some components expect `{ status: string }` while APIs return `{ status_code: number, status_name: string }` -2. **Inconsistent Transformations**: Data shape changes happen ad-hoc throughout the component tree -3. **Protobuf Nullability**: Generated types are honest about optional fields, but TypeScript config allows unsafe access -4. **Service Boundary Leakage**: Backend implementation details leak into frontend components - -## Solution: Three-Pronged Approach - -### Option 2: Consistent Data Transformation (Primary) -- Define canonical dashboard types separate from generated API types -- Transform data at service boundaries, never in components -- Single source of truth for data shapes within the frontend - -### Option 4: Generated Type Enforcement (Supporting) -- Use generated protobuf types in service layer for accurate contracts -- Leverage protobuf's honest nullability information -- Maintain type safety chain from backend to service boundary - -### Option 3: Stricter TypeScript Configuration (Foundation) -- Enable strict null checks to catch undefined access patterns -- Prevent implicit any types that mask runtime errors -- Force explicit handling of protobuf's optional fields - -## Implementation Plan - -### Phase 1: TypeScript Configuration Hardening - -**Goal**: Enable strict type checking to surface existing issues - -**Tasks**: -1. Update `tsconfig.json` with strict configuration: - ```json - { - "compilerOptions": { - "strict": true, - "noImplicitAny": true, - "strictNullChecks": true, - "noImplicitReturns": true, - "noUncheckedIndexedAccess": true, - "exactOptionalPropertyTypes": true - } - } - ``` - -2. Run TypeScript compilation to identify all type errors - -3. Create tracking issue for each compilation error - -**Success Criteria**: TypeScript build passes with strict configuration enabled - -**Estimated Time**: 1-2 days - -### Phase 1.5: Verification of Strict Configuration - -**Goal**: Prove strict TypeScript catches the specific issues we identified - -**Tasks**: -1. Create test cases that reproduce original failures: - ```typescript - // Test file: dashboard/verification-tests.ts - const mockResponse = { status_code: 1, status_name: "COMPLETED" }; - - // These should now cause TypeScript compilation errors: - const test1 = mockResponse.status?.toLowerCase(); // undefined property access - const test2 = mockResponse.status?.status; // nested undefined access - ``` - -2. Run TypeScript compilation and verify these cause errors: - - Document which strict rules catch which specific issues - - Confirm `strictNullChecks` prevents undefined property access - - Verify `noImplicitAny` surfaces type gaps - -3. Test protobuf nullable field handling: - ```typescript - interface TestPartitionSummary { - last_updated?: number; // optional field from protobuf - } - - // This should require explicit null checking: - const timestamp = partition.last_updated.toString(); // Should error - ``` - -**Success Criteria**: -- All identified runtime failures now cause compile-time errors -- Clear mapping between strict TypeScript rules and caught issues -- Zero false positives in existing working code - -**Estimated Time**: 0.5 days - -### Phase 2: Define Dashboard Data Contracts - -**Goal**: Create canonical frontend types independent of backend schema - -**Tasks**: -1. Define dashboard types in `dashboard/types.ts`: - ```typescript - // Dashboard-optimized types - interface DashboardBuild { - build_request_id: string; - status: string; // Always human-readable name - requested_partitions: string[]; // Always string values - total_jobs: number; - completed_jobs: number; - failed_jobs: number; - cancelled_jobs: number; - requested_at: number; - started_at: number | null; - completed_at: number | null; - duration_ms: number | null; - cancelled: boolean; - } - - interface DashboardPartition { - partition_ref: string; // Always string value - status: string; // Always human-readable name - last_updated: number | null; - build_requests: string[]; - } - - interface DashboardJob { - job_label: string; - total_runs: number; - successful_runs: number; - failed_runs: number; - cancelled_runs: number; - last_run_timestamp: number; - last_run_status: string; // Always human-readable name - average_partitions_per_run: number; - recent_builds: string[]; - } - ``` - -2. Update component attribute interfaces to use dashboard types - -3. Document the rationale for each transformation decision - -**Success Criteria**: All dashboard types are self-contained and UI-optimized - -**Estimated Time**: 2-3 days - -### Phase 3: Service Layer Transformation - -**Goal**: Create consistent transformation boundaries between API and dashboard - -**Tasks**: -1. Implement transformation functions in `services.ts`: - ```typescript - // Transform API responses to dashboard types - function transformBuildDetail(apiResponse: BuildDetailResponse): DashboardBuild { - return { - build_request_id: apiResponse.build_request_id, - status: apiResponse.status_name, - requested_partitions: apiResponse.requested_partitions.map(p => p.str), - total_jobs: apiResponse.total_jobs, - completed_jobs: apiResponse.completed_jobs, - failed_jobs: apiResponse.failed_jobs, - cancelled_jobs: apiResponse.cancelled_jobs, - requested_at: apiResponse.requested_at, - started_at: apiResponse.started_at ?? null, - completed_at: apiResponse.completed_at ?? null, - duration_ms: apiResponse.duration_ms ?? null, - cancelled: apiResponse.cancelled, - }; - } - - function transformPartitionSummary(apiResponse: PartitionSummary): DashboardPartition { - return { - partition_ref: apiResponse.partition_ref.str, - status: apiResponse.status_name, - last_updated: apiResponse.last_updated ?? null, - build_requests: apiResponse.build_requests, - }; - } - ``` - -2. Update all service methods to use transformation functions - -3. Add type guards for runtime validation: - ```typescript - function isValidBuildResponse(data: unknown): data is BuildDetailResponse { - return typeof data === 'object' && - data !== null && - 'build_request_id' in data && - 'status_name' in data; - } - ``` - -4. Handle API errors with proper typing - -**Success Criteria**: All API data flows through consistent transformation layer - -**Estimated Time**: 3-4 days - -### Phase 3.5: Transformation Validation - -**Goal**: Prove transformation functions prevent observed failures and handle edge cases - -**Tasks**: -1. Create comprehensive unit tests for transformation functions: - ```typescript - // Test file: dashboard/transformation-tests.ts - describe('transformBuildDetail', () => { - it('handles status objects correctly', () => { - const apiResponse = { status_code: 1, status_name: 'COMPLETED' }; - const result = transformBuildDetail(apiResponse); - expect(typeof result.status).toBe('string'); - expect(result.status).toBe('COMPLETED'); - }); - - it('handles null optional fields', () => { - const apiResponse = { started_at: null, completed_at: undefined }; - const result = transformBuildDetail(apiResponse); - expect(result.started_at).toBe(null); - expect(result.completed_at).toBe(null); - }); - }); - ``` - -2. Test edge cases and malformed responses: - - Missing required fields - - Null values where not expected - - Wrong data types in API responses - - Verify type guards catch invalid responses - -3. Validate PartitionRef transformations: - ```typescript - it('converts PartitionRef objects to strings', () => { - const apiResponse = { partition_ref: { str: 'test-partition' } }; - const result = transformPartitionSummary(apiResponse); - expect(typeof result.partition_ref).toBe('string'); - expect(result.partition_ref).toBe('test-partition'); - }); - ``` - -4. Test transformation against real protobuf response shapes: - - Use actual OpenAPI generated types in tests - - Verify transformations work with current API schema - - Document transformation rationale for each field - -**Success Criteria**: -- All transformation functions have >90% test coverage -- Edge cases and null handling verified -- Real API response shapes handled correctly -- Type guards prevent invalid data from reaching components - -**Estimated Time**: 1 day - -### Phase 4: Component Migration - -**Goal**: Update all components to use dashboard types exclusively - -**Tasks**: -1. Update component implementations to use dashboard types: - - Remove direct `.status_code`/`.status_name` access - - Use transformed string status values - - Handle null values explicitly where needed - -2. Fix specific identified issues: - - Line 472: `status?.status` → use `status` directly - - Badge components: Ensure they receive strings - - Partition list: Use consistent partition type - -3. Update component attribute interfaces to match dashboard types - -4. Add runtime assertions where needed: - ```typescript - if (!status) { - console.warn('Missing status in component'); - return m('span', 'Unknown Status'); - } - ``` - -**Success Criteria**: All components compile and work with dashboard types - -**Estimated Time**: 2-3 days - -### Phase 4.5: Continuous Component Verification - -**Goal**: Verify components work correctly with dashboard types throughout migration - -**Tasks**: -1. After each component migration, run verification tests: - ```typescript - // Component-specific tests - describe('BuildDetailComponent', () => { - it('renders status as string correctly', () => { - const dashboardBuild: DashboardBuild = { - status: 'COMPLETED', // Transformed string, not object - // ... other fields - }; - const component = m(BuildDetailComponent, { build: dashboardBuild }); - // Verify no runtime errors with .toLowerCase() - }); - }); - ``` - -2. Test component attribute interfaces match usage: - - Verify TypeScript compilation passes for each component - - Check that vnode.attrs typing prevents invalid property access - - Test null handling in component rendering - -3. Integration tests with real transformed data: - - Use actual service layer transformation outputs - - Verify components render correctly with dashboard types - - Test error states and missing data scenarios - -**Success Criteria**: -- Each migrated component passes TypeScript compilation -- No runtime errors when using transformed dashboard types -- Components gracefully handle null/undefined dashboard fields - -**Estimated Time**: 0.5 days (distributed across Phase 4) - -### Phase 5: Schema Change Simulation & Integration Testing - -**Goal**: Verify end-to-end compile-time correctness with simulated backend changes - -**Tasks**: -1. **Automated Schema Change Testing**: - ```bash - # Create test script: scripts/test-schema-changes.sh - - # Test 1: Add new required field to protobuf - # - Modify databuild.proto temporarily - # - Regenerate Rust types and OpenAPI schema - # - Verify TypeScript compilation fails predictably - # - Document exact error messages - - # Test 2: Remove existing field - # - Remove field from protobuf definition - # - Verify transformation functions catch missing fields - # - Confirm components fail compilation when accessing removed field - - # Test 3: Change field type (string → object) - # - Modify status field structure in protobuf - # - Verify transformation layer prevents type mismatches - # - Confirm this catches issues like original status.toLowerCase() failure - ``` - -2. **Full Build Cycle Verification**: - - Proto change → `bazel build //databuild:openapi_spec_generator` - - OpenAPI regeneration → `bazel build //databuild/client:typescript_client` - - TypeScript compilation → `bazel build //databuild/dashboard:*` - - Document each failure point and error messages - -3. **End-to-End Type Safety Validation**: - ```typescript - // Create comprehensive integration tests - describe('End-to-End Type Safety', () => { - it('prevents runtime failures from schema changes', async () => { - // Test actual API calls with transformed responses - const service = DashboardService.getInstance(); - const activity = await service.getRecentActivity(); - - // Verify transformed types prevent original failures - activity.recentBuilds.forEach(build => { - expect(typeof build.status).toBe('string'); - expect(() => build.status.toLowerCase()).not.toThrow(); - }); - }); - }); - ``` - -4. **Regression Testing for Original Failures**: - - Test status.toLowerCase() with transformed data - - Test status?.status access patterns - - Test partition.str access with transformed partition refs - - Verify null handling in timestamp fields - -5. **Real Data Flow Testing**: - - New build creation → status updates → completion - - Partition status changes using dashboard types - - Job execution monitoring with transformed data - - Error states and edge cases - -**Success Criteria**: -- Schema changes cause predictable TypeScript compilation failures -- Transformation layer prevents all identified runtime failures -- Full build cycle catches type mismatches at each stage -- Zero runtime type errors with dashboard types -- Original failure scenarios now impossible with strict types - -**Estimated Time**: 2-3 days - -### Phase 6: Documentation & Monitoring - -**Goal**: Establish practices to maintain type safety over time - -**Tasks**: -1. Document transformation patterns: - - When to create new dashboard types - - How to handle protobuf schema changes - - Service layer responsibilities - -2. Add runtime monitoring: - - Log transformation failures - - Track API response shape mismatches - - Monitor for unexpected null values - -3. Create development guidelines: - - Never use generated types directly in components - - Always transform at service boundaries - - Handle nullability explicitly - -4. Set up CI checks: - - Strict TypeScript compilation in build pipeline - - Automated schema change detection tests - - Integration test suite for type safety validation - - Pre-commit hooks for TypeScript compilation - -5. **Create Ongoing Verification Tools**: - ```bash - # CI script: scripts/verify-type-safety.sh - # - Run schema change simulation tests - # - Verify transformation tests pass - # - Check strict TypeScript compilation - # - Validate component integration tests - ``` - -**Success Criteria**: -- Team has clear practices for maintaining type safety -- CI pipeline catches type safety regressions automatically -- Schema change testing is automated and repeatable -- Documentation provides concrete examples and rationale - -**Estimated Time**: 2 days - -## Risk Mitigation - -### High-Impact Risks - -1. **Breaking Change Volume**: Strict TypeScript may reveal many existing issues - - *Mitigation*: Implement incrementally, fix issues in phases - - *Rollback*: Keep loose config as backup during transition - -2. **Performance Impact**: Additional transformation layer overhead - - *Mitigation*: Profile transformation functions, optimize hot paths - - *Monitoring*: Track bundle size and runtime performance - -3. **Developer Learning Curve**: Team needs to adapt to strict null checks - - *Mitigation*: Provide training on handling optional types - - *Support*: Create examples and best practices documentation - -### Medium-Impact Risks - -1. **API Response Changes**: Backend might return unexpected data shapes - - *Mitigation*: Add runtime validation in service layer - - *Detection*: Monitor for transformation failures - -2. **Third-party Type Conflicts**: Generated types might conflict with other libraries - - *Mitigation*: Use type aliases and careful imports - - *Testing*: Verify integration with existing dependencies - -## Success Metrics - -### Compile-Time Safety -- [ ] Zero `any` types in dashboard code -- [ ] All protobuf optional fields handled explicitly -- [ ] TypeScript strict mode enabled and passing -- [ ] Component attribute interfaces match usage - -### Runtime Reliability -- [ ] Zero "undefined is not a function" errors -- [ ] Zero "cannot read property of undefined" errors -- [ ] All API error states handled gracefully -- [ ] Consistent data shapes across all components - -### Development Experience -- [ ] Backend schema changes cause predictable frontend compilation results -- [ ] Clear error messages when types don't match -- [ ] Consistent patterns for handling new data types -- [ ] Fast iteration cycle maintained - -## Future Considerations - -### Schema Evolution Strategy -- Plan for handling breaking vs non-breaking backend changes -- Consider versioning approach for dashboard types -- Establish deprecation process for old data shapes - -### Tooling Enhancements -- Consider code generation for transformation functions -- Explore runtime schema validation libraries -- Investigate GraphQL for stronger API contracts - -### Performance Optimization -- Profile transformation layer performance -- Consider caching strategies for transformed data -- Optimize bundle size impact of strict typing - ---- - -## Implementation Notes - -This plan prioritizes compile-time correctness while maintaining development velocity. The phased approach allows for incremental progress and risk mitigation, while the three-pronged strategy (Options 2+3+4) provides comprehensive type safety from protobuf definitions through to component rendering. - -The key insight is that true compile-time correctness requires both accurate type definitions AND consistent data transformation patterns enforced by strict TypeScript configuration. \ No newline at end of file diff --git a/plans/12-dsl.md b/plans/12-dsl.md deleted file mode 100644 index 0f8c9c0..0000000 --- a/plans/12-dsl.md +++ /dev/null @@ -1,237 +0,0 @@ -# DataBuild Interface Evolution: Strategic Options and Technical Decisions - -This document outlines the key technical decisions for evolving DataBuild's interface, examining each option through the lens of modern data infrastructure needs. - -## Executive Summary - -DataBuild must choose between three fundamental interface strategies: -1. **Pure Bazel** (current): Maximum guarantees, maximum verbosity -2. **High-Level DSL**: Expressive interfaces that compile to Bazel -3. **Pure Declarative**: Eliminate orchestration entirely through relational modeling - -## The Core Technical Decisions - -### Decision 1: Where Should Dependency Logic Live? - -**Option A: In-Job Config (Current Design)** -```python -# Job knows its own dependencies -def config(self, date): - return {"inputs": [f"raw/{date}", f"raw/{date-1}"]} -``` -- ✅ **Locality of knowledge** - dependency logic next to usage -- ✅ **Natural evolution** - changes happen in one place -- ❌ **Performance overhead** - subprocess per config call -- **Thrives in**: Complex enterprise environments where jobs have intricate, evolving dependencies - -**Option B: Graph-Level Declaration** -```python -databuild_job( - name = "process_daily", - depends_on = ["raw/{date}"], - produces = ["processed/{date}"] -) -``` -- ✅ **Static analysis** - entire graph visible without execution -- ✅ **Performance** - microseconds vs seconds for planning -- ❌ **Flexibility** - harder to express dynamic dependencies -- ❌ **Implicit coupling** - jobs have to duplicate data dependency resolution -- **Thrives in**: High-frequency trading systems, real-time analytics where planning speed matters - -**Option C: Hybrid Pattern-Based** -```python -# Patterns at graph level, resolution at runtime -@job(dependency_pattern="raw/{source}/[date-window:date]") -def aggregate(date, window=7): - # Runtime resolves exact partitions -``` -- ✅ **Best of both** - fast planning with flexibility -- ✅ **Progressive disclosure** - simple cases simple -- ❌ **Complexity** - two places to look -- **Thrives in**: Modern data platforms serving diverse teams with varying sophistication - -### Decision 2: Interface Language Choice - -**Option A: Pure Bazel (Status Quo)** -```starlark -databuild_job( - name = "etl", - binary = ":etl_binary", -) -``` -**Narrative**: "The Infrastructure-as-Code Platform" -- For organizations that value reproducibility above all else -- Where data pipelines are mission-critical infrastructure -- Teams that already use Bazel for other systems - -**Strengths**: -- Hermetic builds guarantee reproducibility -- Multi-language support out of the box -- Battle-tested deployment story - -**Weaknesses**: -- High barrier to entry -- Verbose for simple cases -- Limited expressiveness - -**Option B: Python DSL → Bazel Compilation** -```python -@db.job -def process(date: str, raw: partition("raw/{date}")) -> partition("clean/{date}"): - return raw.load().transform().save() -``` -**Narrative**: "The Developer-First Data Platform" -- For data teams that move fast and iterate quickly -- Where Python is already the lingua franca -- Organizations prioritizing developer productivity - -**Strengths**: -- 10x more concise than Bazel -- Natural for data scientists/engineers -- Rich ecosystem integration - -**Weaknesses**: -- Additional compilation step -- Python-centric (less multi-language) -- Debugging across abstraction layers - -**Option C: Rust DSL with Procedural Macros** -```rust -#[job] -fn process( - #[partition("raw/{date}")] input: Partition -) -> Partition { - input.load()?.transform().save() -} -``` -**Narrative**: "The High-Performance Data Platform" -- For organizations processing massive scale -- Where performance and correctness are equally critical -- Teams willing to invest in Rust expertise - -**Strengths**: -- Compile-time guarantees with elegance -- Zero-cost abstractions -- Single language with execution engine - -**Weaknesses**: -- Steep learning curve -- Smaller talent pool -- Less flexible than Python - -### Decision 3: Orchestration Philosophy - -**Option A: Explicit Orchestration (Traditional)** -- Users define execution order and dependencies -- Similar to Airflow, Prefect, Dagster -- **Thrives in**: Organizations with complex business logic requiring explicit control - -**Option B: Implicit Orchestration (Current DataBuild)** -- Users define jobs and dependencies -- System figures out execution order -- **Thrives in**: Data engineering teams wanting to focus on transformations, not plumbing - -**Option C: No Orchestration (Pure Declarative)** -```python -@partition("clean/{date}") -class CleanData: - source = "raw/*/{date}" - - def transform(self, raw): - # Pure function, no orchestration - return clean(merge(raw)) -``` -**Narrative**: "The SQL-for-Data-Pipelines Platform" -- Orchestration is an implementation detail -- Users declare relationships, system handles everything -- **Thrives in**: Next-generation data platforms, organizations ready to rethink data processing - -**Strengths**: -- Eliminates entire categories of bugs -- Enables powerful optimizations - -**Weaknesses**: -- Paradigm shift for users -- Less control over execution -- Harder to debug when things go wrong - -## Strategic Recommendations by Use Case - -### For Startups/Fast-Moving Teams -**Recommendation**: Python DSL → Bazel -- Start with Python for rapid development -- Compile to Bazel for production -- Migrate critical jobs to native Bazel/Rust over time - -### For Enterprise/Regulated Industries -**Recommendation**: Pure Bazel with Graph-Level Dependencies -- Maintain full auditability and reproducibility -- Use graph-level deps for performance -- Consider Rust DSL for new greenfield projects - -### For Next-Gen Data Platforms -**Recommendation**: Pure Declarative with Rust Implementation -- Leap directly to declarative model -- Build on Rust for performance and correctness -- Pioneer the "SQL for pipelines" approach - -## Implementation Patterns - -### Pattern 1: Gradual Migration -``` -Current Bazel → Python DSL (compile to Bazel) → Pure Declarative -``` -- Low risk, high compatibility -- Teams can adopt at their own pace -- Preserves existing investments - -### Pattern 2: Parallel Tracks -``` -Bazel Interface (production) - ↕️ -Python Interface (development) -``` -- Different interfaces for different use cases -- Development velocity without sacrificing production guarantees -- Higher maintenance burden - -### Pattern 3: Clean Break -``` -New declarative system alongside legacy -``` -- Fastest path to innovation -- No legacy constraints -- Requires significant investment - -## Key Technical Insights - -### Single Source of Truth Principle -Whatever path chosen, dependency declaration and resolution must be co-located: -```python -# Good: Single source -def process(input: partition("raw/{date}")): - return input.load().transform() - -# Bad: Split sources -# In config: depends = ["raw/{date}"] -# In code: data = load("raw/{date}") # Duplication! -``` - -### The Pattern Language Insight -No new DSL needed for patterns - leverage existing language features: -- Python: f-strings, glob, regex -- Rust: const generics, pattern matching -- Both: bidirectional pattern template libraries - -### The Orchestration Elimination Insight -The highest abstraction isn't better orchestration - it's no orchestration. Like SQL eliminated query planning from user concern, DataBuild could eliminate execution planning. - -## Conclusion - -The optimal path depends on organizational maturity and ambition: - -1. **Conservative Evolution**: Enhance Bazel with better patterns and graph-level deps -2. **Developer-Focused**: Python DSL compiling to Bazel, maintaining guarantees -3. **Revolutionary Leap**: Pure declarative relationships with Rust implementation - -Each path has merit. The key is choosing one that aligns with your organization's data infrastructure philosophy and long-term vision. \ No newline at end of file diff --git a/plans/13-job-wrapper.md b/plans/13-job-wrapper.md deleted file mode 100644 index d7d1651..0000000 --- a/plans/13-job-wrapper.md +++ /dev/null @@ -1,346 +0,0 @@ -# Job Wrapper v2 Plan - -## Status -- Phase 0: Minimal Bootstrap [DONE] -- Phase 1: Core Protocol [MOSTLY DONE - heartbeating and metrics implemented] -- Phase 2: Platform Support [FUTURE] -- Phase 3: Production Hardening [FUTURE] -- Phase 4: Advanced Features [FUTURE] - -## Required Reading - -Before implementing this plan, engineers should thoroughly understand these design documents: - -- **[DESIGN.md](../DESIGN.md)** - Overall DataBuild architecture and job execution model -- **[design/core-build.md](../design/core-build.md)** - Core build semantics and job lifecycle state machines -- **[design/observability.md](../design/observability.md)** - Observability strategy and telemetry requirements -- **[design/build-event-log.md](../design/build-event-log.md)** - Event sourcing model and BEL integration -- **[databuild.proto](../databuild/databuild.proto)** - System interfaces and data structures - -## Overview -The job wrapper is a critical component that mediates between DataBuild graphs and job executables, providing observability, error handling, and state management. This plan describes the next generation job wrapper implementation in Rust. - -## Architecture - -### Core Design Principles -1. **Single Communication Channel**: Jobs communicate with graphs exclusively through structured logs [DONE] -2. **Platform Agnostic**: Works identically across local, Docker, K8s, and cloud platforms [PARTIAL - local only] -3. **Zero Network Requirements**: Jobs don't need to connect to any services [DONE] -4. **Fail-Safe**: Graceful handling of job crashes and fast completions [PARTIAL - basic handling only] - -### Communication Model -``` -Graph → Job: Launch with JobConfig (via CLI args/env) -Job → Graph: Structured logs (stdout) -Graph: Tails logs and interprets into metrics, events, and manifests -``` - -## Structured Log Protocol - -### Message Format (Protobuf) [DONE] -```proto -message JobLogEntry { - string timestamp = 1; - string job_id = 2; - string partition_ref = 3; - uint64 sequence_number = 4; // Monotonic sequence starting from 1 [DONE] - - oneof content { - LogMessage log = 5; // [DONE] - MetricPoint metric = 6; // [FUTURE] - JobEvent event = 7; // [DONE - WrapperJobEvent] - PartitionManifest manifest = 8; // [DONE] - } -} - -message LogMessage { // [DONE] - enum LogLevel { - DEBUG = 0; - INFO = 1; - WARN = 2; - ERROR = 3; - } - LogLevel level = 1; - string message = 2; - map fields = 3; -} - -message MetricPoint { // [FUTURE] - string name = 1; - double value = 2; - map labels = 3; - string unit = 4; -} - -message JobEvent { // [DONE - as WrapperJobEvent] - string event_type = 1; // "task_launched", "heartbeat", "task_completed", etc - google.protobuf.Any details = 2; - map metadata = 3; -} -``` - -### Log Stream Lifecycle -1. Wrapper emits `config_validate_success` event (sequence #1) [DONE] -2. Wrapper validates configuration [DONE] -3. Wrapper emits `task_launch_success` event (sequence #2) [DONE] -4. Job executes, wrapper captures stdout/stderr (sequence #3+) [DONE] -5. Wrapper emits periodic `heartbeat` events (every 30s) [DONE] -6. Wrapper detects job completion [DONE] -7. Wrapper emits `task_success`/`task_failed` event [DONE] -8. Wrapper emits `PartitionManifest` message (final required message with highest sequence number) [DONE] -9. Wrapper exits [DONE] - -The PartitionManifest serves as the implicit end-of-logs marker - the graph knows processing is complete when it sees this message. Sequence numbers enable the graph to detect missing or out-of-order messages and ensure reliable telemetry collection. [DONE - sequence numbers implemented] - -## Wrapper Implementation [PARTIAL] - -### Interfaces [DONE] -```rust -trait JobWrapper { - // Config mode - accepts PartitionRef objects - fn config(outputs: Vec) -> Result; // [DONE] - - // Exec mode - accepts serialized JobConfig - fn exec(config: JobConfig) -> Result<()>; // [DONE] -} -``` - -### Exit Code Standards [PARTIAL] - -Following POSIX conventions and avoiding collisions with standard exit codes: - -Reference: -- https://manpages.ubuntu.com/manpages/noble/man3/sysexits.h.3head.html -- https://tldp.org/LDP/abs/html/exitcodes.html - -```rust -// Standard POSIX codes we respect: [PARTIAL - basic forwarding only] -// 0 - Success -// 1 - General error -// 2 - Misuse of shell builtin -// 64 - Command line usage error (EX_USAGE) -// 65 - Data format error (EX_DATAERR) -// 66 - Cannot open input (EX_NOINPUT) -// 69 - Service unavailable (EX_UNAVAILABLE) -// 70 - Internal software error (EX_SOFTWARE) -// 71 - System error (EX_OSERR) -// 73 - Can't create output file (EX_CANTCREAT) -// 74 - Input/output error (EX_IOERR) -// 75 - Temp failure; retry (EX_TEMPFAIL) -// 77 - Permission denied (EX_NOPERM) -// 78 - Configuration error (EX_CONFIG) - -// DataBuild-specific codes (100+ to avoid collisions): [FUTURE] -// 100-109 - User-defined permanent failures -// 110-119 - User-defined transient failures -// 120-129 - User-defined resource failures -// 130+ - Other user-defined codes - -enum ExitCodeCategory { // [FUTURE] - Success, // 0 - StandardError, // 1-63 (shell/system) - PosixError, // 64-78 (sysexits.h) - TransientFailure, // 75 (EX_TEMPFAIL) or 110-119 - UserDefined, // 100+ -} -``` - -## Platform-Specific Log Handling - -### Local Execution [DONE] -- Graph spawns wrapper process [DONE] -- Graph reads from stdout pipe directly [DONE] -- PartitionManifest indicates completion [DONE] - -### Docker [FUTURE] -- Graph runs `docker run` with wrapper as entrypoint -- Graph uses `docker logs -f` to tail output -- Logs persist after container exit - -### Kubernetes [FUTURE] -- Job pods use wrapper as container entrypoint -- Graph tails logs via K8s API -- Configure `terminationGracePeriodSeconds` for log retention - -### Cloud Run / Lambda [FUTURE] -- Wrapper logs to platform logging service -- Graph queries logs via platform API -- Natural buffering and persistence - -## Observability Features - -### Metrics Collection [FUTURE] - -For metrics, we'll use a simplified StatsD-like format in our structured logs, which the graph can aggregate and expose via Prometheus format: - -```json -{ - "timestamp": "2025-01-27T10:30:45Z", - "content": { - "metric": { - "name": "rows_processed", - "value": 1500000, - "labels": { - "partition": "date=2025-01-27", - "stage": "transform" - }, - "unit": "count" - } - } -} -``` - -The graph component will: [FUTURE] -- Aggregate metrics from job logs -- Expose them in Prometheus format for scraping (when running as a service) -- Store summary metrics in the BEL for historical analysis - -For CLI-invoked builds, metrics are still captured in the BEL but not exposed for scraping (which is acceptable since these are typically one-off runs). - -### Heartbeating [DONE] - -Fixed 30-second heartbeat interval (configurable via `DATABUILD_HEARTBEAT_INTERVAL_MS`): - -```json -{ - "timestamp": "2025-01-27T10:30:45Z", - "content": { - "JobEvent": { - "event_type": "heartbeat", - "metadata": { - "memory_usage_mb": "1024.256", - "cpu_usage_percent": "85.200" - } - } - } -} -``` - -**Implementation Details:** -- Uses sysinfo crate for cross-platform process monitoring -- Heartbeat thread communicates via channels with main thread -- Includes memory usage (MB) and CPU usage (%) with 3 decimal precision -- Configurable interval for testing (default 30s, test environments use 100ms) -- Proper dependency injection via LogSink trait for testability - -### Log Bandwidth Limits [FUTURE] - -To prevent log flooding: -- Maximum log rate: 1000 messages/second -- Maximum message size: 1MB -- If limits exceeded: Wrapper emits rate limit warning and drops messages -- Final metrics show dropped message count - -## Testing Strategy - -### Unit Tests [MOSTLY DONE] -- Log parsing and serialization [DONE - protobuf serde] -- State machine transitions [DONE - JobLogEntry sequence validation] -- Heartbeat functionality [DONE - with dependency injection] -- CPU/memory metrics collection [DONE - with configurable intervals] -- Exit code categorization [FUTURE] -- Rate limiting behavior [FUTURE] - -### Integration Tests [PARTIAL] -- Full job execution lifecycle [DONE - via e2e tests] -- Resource metrics validation [DONE - CPU-intensive workload testing] -- Platform-specific log tailing [PARTIAL - local only] -- Fast job completion handling [DONE] -- Large log volume handling [FUTURE] - -### Platform Tests [PARTIAL] -- Local process execution [DONE] -- Docker container runs [FUTURE] -- Kubernetes job pods [FUTURE] -- Cloud Run invocations [FUTURE] - -### Failure Scenario Tests [PARTIAL] -- Job crashes (SIGSEGV, SIGKILL) [DONE - basic exit code forwarding] -- Wrapper crashes [FUTURE] -- Log tailing interruptions [FUTURE] -- Platform-specific failures [FUTURE] - -## Implementation Phases - -### Phase 0: Minimal Bootstrap [DONE] -Implement the absolute minimum to unblock development and testing: -- Basic wrapper that only handles happy path [DONE] -- Support for local execution only [DONE] -- Minimal log parsing in graph [DONE - wrapper emits structured logs] -- Integration with existing example jobs [DONE - e2e tests passing] - -This phase delivers a working end-to-end system that can be continuously evolved. [DONE] - -**Completed Implementation Details:** -- Created databuild/job/src/main.rs with config/exec modes [DONE] -- Uses protobuf types from databuild.proto [DONE] -- Emits JobLogEntry with sequence numbers [DONE] -- Follows core-build.md state diagram exactly [DONE] -- Forwards job stdout/stderr as LogMessage entries [DONE] -- Emits PartitionManifest on successful completion [DONE] -- Properly handles job failures with exit codes [DONE] -- Modified Bazel rules to use job_wrapper [DONE] -- All e2e tests passing [DONE] - -### Phase 1: Core Protocol [MOSTLY DONE] -- Define protobuf schemas [DONE - JobLogEntry, LogMessage, WrapperJobEvent] -- Implement structured logger [DONE - JSON serialization to stdout] -- Add error handling and exit codes [PARTIAL - basic forwarding only] -- Implement heartbeating [DONE - with CPU/memory metrics] -- Resource metrics collection [DONE - CPU time, peak memory, runtime] -- Dependency injection for testability [DONE - LogSink trait pattern] -- Graph-side log parser improvements [FUTURE - wrapper emits, graph needs to consume] -- MetricPoint message support [FUTURE] -- Advanced error categorization [FUTURE] - -### Phase 2: Platform Support [FUTURE] -- Docker integration [FUTURE] -- Kubernetes support [FUTURE] -- Cloud platform adapters [FUTURE] -- Platform-specific testing [FUTURE] - -### Phase 3: Production Hardening [FUTURE] -- Rate limiting [FUTURE] -- Error recovery [FUTURE] -- Performance optimization [FUTURE] -- Monitoring integration [FUTURE] - -### Phase 4: Advanced Features [FUTURE] -- In-process config for library jobs [FUTURE] -- Custom metrics backends [FUTURE] -- Advanced failure analysis [FUTURE] - -## Success Criteria - -1. **Zero Network Dependencies**: Jobs run without any network access [DONE] -2. **Platform Parity**: Identical behavior across all execution platforms [PARTIAL - local only] -3. **Minimal Overhead**: < 100ms wrapper overhead for config, < 1s for exec [DONE - fast execution] -4. **Complete Observability**: All job state changes captured in logs [DONE - core events captured] -5. **Graceful Failures**: No log data loss even in crash scenarios [PARTIAL - basic failure handling] - -## Next Steps - -1. Implement minimal bootstrap wrapper [DONE] -2. Test with existing example jobs [DONE] -3. Iterate on log format based on real usage [IN PROGRESS - Phase 1 continuation] -4. Gradually add features per implementation phases [IN PROGRESS] - -**Phase 1 Achievements:** -- ✅ Heartbeating support with CPU/memory metrics [DONE] -- ✅ Dependency injection for testability (LogSink trait) [DONE] -- ✅ Resource metrics collection (CPU time, peak memory, runtime) [DONE] -- ✅ Comprehensive test coverage for heartbeats and metrics [DONE] -- ✅ Configurable intervals for different environments [DONE] - -**Remaining for Phase 1 Completion:** -- Implement MetricPoint logging [FUTURE] -- Add graph-side structured log consumption [FUTURE] -- Enhanced error categorization and exit code mapping [FUTURE] - -**Recent Implementation Details:** -- Uses sysinfo 0.30 for cross-platform process monitoring -- Thread-safe heartbeat communication via mpsc channels -- Floating-point metrics with 3 decimal precision (f64) -- Environment variable configuration: `DATABUILD_HEARTBEAT_INTERVAL_MS`, `DATABUILD_METRICS_INTERVAL_MS` -- Robust test infrastructure with synthetic CPU-intensive workloads -- Proper CPU time calculation: (average_cpu_percent / 100.0) × wall_clock_time \ No newline at end of file diff --git a/plans/14-graph-side-log-consumption.md b/plans/14-graph-side-log-consumption.md deleted file mode 100644 index f33288f..0000000 --- a/plans/14-graph-side-log-consumption.md +++ /dev/null @@ -1,384 +0,0 @@ -# Graph-Side Log Consumption Plan - -## Status -- Phase 0: Design [DONE] -- Phase 1: Core Implementation [COMPLETED ✅] -- Phase 2: Advanced Features [FUTURE] - -## Phase 1 Implementation Status - -### ✅ **Core Components COMPLETED** -1. **JobLogEntry protobuf interface fixed** - Updated `databuild.proto` to use `repeated PartitionRef outputs` instead of single `string partition_ref` -2. **LogCollector implemented** - Consumes job wrapper stdout, parses structured logs, writes to date-organized JSONL files (`logs/databuild/YYYY-MM-DD/job_run_id.jsonl`) -3. **Graph integration completed** - LogCollector integrated into graph execution with UUID-based job ID coordination between graph and wrapper -4. **Unified Log Access Layer implemented** - Protobuf-based `LogReader` interface ensuring CLI/Service consistency for log retrieval -5. **Centralized metric templates** - All metric definitions centralized in `databuild/metric_templates.rs` module -6. **MetricsAggregator with cardinality safety** - Prometheus output without partition reference explosion, using job labels instead -7. **REST API endpoints implemented** - `/api/v1/jobs/{job_run_id}/logs` and `/api/v1/metrics` fully functional -8. **Graph-level job_label enrichment** - Solved cardinality issue via LogCollector enrichment pattern, consistent with design philosophy - -### ✅ **Key Architectural Decisions Implemented** -- **Cardinality-safe metrics**: Job labels used instead of high-cardinality partition references in Prometheus output -- **Graph-level enrichment**: LogCollector enriches both WrapperJobEvent and Manifest entries with job_label from graph context -- **JSONL storage**: Date-organized file structure with robust error handling and concurrent access safety -- **Unified execution paths**: Both CLI and service builds produce identical BEL events and JSONL logs in same locations -- **Job ID coordination**: UUID-based job run IDs shared between graph execution and job wrapper via environment variable - -### ✅ **All Success Criteria Met** -- ✅ **Reliable Log Capture**: All job wrapper output captured without loss through LogCollector -- ✅ **API Functionality**: REST API retrieves logs by job run ID, timestamp filtering, and log level filtering -- ✅ **Safe Metrics**: Prometheus endpoint works without cardinality explosion (job labels only, no partition refs) -- ✅ **Correctness**: No duplicated metric templates, all definitions centralized in `metric_templates.rs` -- ✅ **Concurrent Safety**: Multiple jobs write logs simultaneously without corruption via separate JSONL files per job -- ✅ **Simple Testing**: Test suite covers core functionality with minimal brittleness, all tests passing - -### 🏗️ **Implementation Files** -- `databuild/databuild.proto` - Updated protobuf interfaces -- `databuild/log_collector.rs` - Core log collection and JSONL writing -- `databuild/log_access.rs` - Unified log reading interface -- `databuild/metric_templates.rs` - Centralized metric definitions -- `databuild/metrics_aggregator.rs` - Cardinality-safe Prometheus output -- `databuild/service/handlers.rs` - REST API endpoints implementation -- `databuild/graph/execute.rs` - Integration point for LogCollector -- `databuild/job/main.rs` - Job wrapper structured log emission - -## Required Reading - -Before implementing this plan, engineers should thoroughly understand these design documents: - -- **[DESIGN.md](../DESIGN.md)** - Overall DataBuild architecture and job execution model -- **[design/core-build.md](../design/core-build.md)** - Core build semantics and job lifecycle state machines -- **[design/build-event-log.md](../design/build-event-log.md)** - Event sourcing model and BEL integration -- **[design/observability.md](../design/observability.md)** - Observability strategy and telemetry requirements -- **[plans/job-wrapper.md](./job-wrapper.md)** - Job wrapper implementation and structured log protocol -- **[databuild.proto](../databuild/databuild.proto)** - System interfaces and data structures - -## Overview - -This plan describes the graph-side implementation for consuming structured logs emitted by the job wrapper. The job wrapper emits `JobLogEntry` protobuf messages to stdout during job execution. The graph must consume these logs to provide log retrieval by job run ID and expose metrics for Prometheus scraping. - -## Key Technical Decisions - -### 1. Storage Strategy: On-Disk with BEL Separation -**Decision**: Store structured logs on disk separate from the Build Event Log (BEL). - -**Motivation**: -- Log volumes can be legitimately large and would place undue stress on the BEL-backing datastore -- BEL is optimized for event-sourcing patterns, not high-volume log queries -- Separate storage allows independent scaling and retention policies - -### 2. File Organization: Date-Organized Structure -**Decision**: Store logs in configurable-base, date-organized directories: `$LOGS_BASEPATH/YYYY-MM-DD/{job_run_id}.jsonl` - -**Motivation**: -- Enables efficient cleanup by date (future optimization) -- Simplifies manual log management during development -- Facilitates external log collection tools (future) - -### 3. Static Update Period (Phase 1) -**Decision**: Use fixed refresh interval for log processing. Adaptive batching is a future optimization. - -**Motivation**: -- Simplicity for initial implementation -- Predictable performance characteristics -- Easier to debug and test -- Can optimize later based on real usage patterns - -### 4. Manual Log Cleanup (Phase 1) -**Decision**: No automatic log retention/cleanup in initial implementation. - -**Motivation**: -- We're in early development phase -- Manual cleanup acceptable for now -- Avoids complexity in initial implementation -- Automatic retention can be added as future optimization - -### 5. Unified Telemetry Stream -**Decision**: All `JobLogEntry` messages (logs, metrics, events) flow through the same JSONL files. - -**Motivation**: -- Simplicity - single consumption pipeline -- Temporal consistency - metrics and logs naturally correlated -- Unified file format reduces complexity - -### 6. Cardinality-Safe Prometheus Metrics -**Decision**: Prometheus metrics will NOT include partition references as labels to avoid cardinality explosion. - -**Motivation**: -- Partition labels (date × customer × region × etc.) would create massive cardinality -- Focus on job-level and system-level metrics only -- Use job_id and job_type labels instead of partition-specific labels - -### 7. Centralized Metric Templates for Correctness -**Decision**: Define all Prometheus metric names and label templates in a central location to avoid string duplication. - -**Motivation**: -- Prevents implicit coupling via duplicated string templates -- Single source of truth for metric definitions -- Easier to maintain consistency across codebase - -### 8. Limited Scope (Phase 1) -**Decision**: Phase 1 focuses on log retrieval API and Prometheus metrics, excluding web app integration. - -**Motivation**: -- Web app integration is part of a bigger update -- Allows focused implementation on core log consumption -- API-first approach enables multiple consumers - -### 9. Unified Execution Paths -**Decision**: Both CLI and service builds produce identical BEL events and JSONL logs in the same locations. - -**Motivation**: -- Building with CLI then querying from service "just works" -- Single source of truth for all build artifacts -- Consistent behavior regardless of execution method -- Simplifies debugging and operational workflows - -## Interface Issues to Fix - -### JobLogEntry Protobuf Update Required -The current `JobLogEntry` definition needs updates: - -**Current (INCORRECT)**: -```proto -message JobLogEntry { - string partition_ref = 3; // Single string - // ... -} -``` - -**Required (CORRECT)**: -```proto -message JobLogEntry { - repeated PartitionRef outputs = 3; // Multiple PartitionRef objects - // ... -} -``` - -**Rationale**: Jobs produce multiple partitions, and we should use the proper `PartitionRef` type for consistency with other interfaces. - -## Architecture - -### Storage Layout -``` -/logs/databuild/ -├── 2025-01-27/ -│ ├── job_run_123abc.jsonl -│ ├── job_run_456def.jsonl -│ └── ... -├── 2025-01-28/ -│ └── ... -``` - -### File Format (JSONL) -Each file contains one JSON object per line, representing a `JobLogEntry`: -```json -{"timestamp":"2025-01-27T10:30:45Z","job_id":"job_run_123abc","outputs":[{"path":"s3://bucket/dataset/date=2025-01-27"}],"sequence_number":1,"content":{"job_event":{"event_type":"task_launched","metadata":{}}}} -{"timestamp":"2025-01-27T10:30:46Z","job_id":"job_run_123abc","outputs":[{"path":"s3://bucket/dataset/date=2025-01-27"}],"sequence_number":2,"content":{"log":{"level":"INFO","message":"Processing started","fields":{"rows":"1000"}}}} -{"timestamp":"2025-01-27T10:30:50Z","job_id":"job_run_123abc","outputs":[{"path":"s3://bucket/dataset/date=2025-01-27"}],"sequence_number":3,"content":{"metric":{"name":"rows_processed","value":1000,"labels":{"stage":"transform"},"unit":"count"}}} -``` - -### Consumption Pipeline -``` -Job Wrapper (stdout) → Graph Log Collector → JSONL Files - ↓ - Unified Log Access Layer - ↙ ↘ - Service API CLI API - ↓ - Metrics Aggregator → /api/v1/metrics -``` - -## Implementation Components - -### 1. Log Collector [PHASE 1] -**Responsibility**: Consume job wrapper stdout and write to JSONL files. - -```rust -struct LogCollector { - logs_dir: PathBuf, // /logs/databuild - active_files: HashMap, // job_run_id -> file handle -} - -impl LogCollector { - fn consume_job_output(&mut self, job_run_id: &str, stdout: &mut BufReader) -> Result<()>; - fn write_log_entry(&mut self, job_run_id: &str, entry: &JobLogEntry) -> Result<()>; - fn ensure_date_directory(&self) -> Result; -} -``` - -### 2. Unified Log Access Layer [PHASE 1] -**Responsibility**: Provide common interface for reading logs from JSONL files, used by both service and CLI. - -```rust -// Core log access implementation -struct LogReader { - logs_base_path: PathBuf, -} - -impl LogReader { - fn get_job_logs(&self, request: &JobLogsRequest) -> Result; - fn list_available_jobs(&self, date_range: Option<(String, String)>) -> Result>; - fn get_job_metrics(&self, job_run_id: &str) -> Result>; -} -``` - -**Protobuf Interface** (ensures CLI/Service consistency): -```proto -message JobLogsRequest { - string job_run_id = 1; - int64 since_timestamp = 2; // Unix timestamp (nanoseconds) - int32 min_level = 3; // LogLevel enum value - uint32 limit = 4; -} - -message JobLogsResponse { - repeated JobLogEntry entries = 1; - bool has_more = 2; -} -``` - -### 3. Metrics Templates [PHASE 1] -**Responsibility**: Centralized metric definitions to avoid string duplication. - -```rust -// Central location for all metric definitions -mod metric_templates { - pub const JOB_RUNTIME_SECONDS: &str = "databuild_job_runtime_seconds"; - pub const JOB_MEMORY_PEAK_MB: &str = "databuild_job_memory_peak_mb"; - pub const JOB_CPU_TOTAL_SECONDS: &str = "databuild_job_cpu_total_seconds"; - pub const ROWS_PROCESSED_TOTAL: &str = "databuild_rows_processed_total"; - - pub fn job_labels(job_run_id: &str, job_type: &str) -> HashMap { - let mut labels = HashMap::new(); - labels.insert("job_run_id".to_string(), job_run_id.to_string()); - labels.insert("job_type".to_string(), job_type.to_string()); - labels - } -} -``` - -### 4. Metrics Aggregator [PHASE 1] -**Responsibility**: Process `MetricPoint` messages and expose Prometheus format with safe cardinality. - -```rust -struct MetricsAggregator { - metrics: HashMap, -} - -impl MetricsAggregator { - fn ingest_metric(&mut self, metric: &MetricPoint, job_run_id: &str, job_type: &str); - fn generate_prometheus_output(&self) -> String; -} -``` - -**Safe Prometheus Output** (NO partition labels): -``` -# HELP databuild_job_runtime_seconds Job execution time in seconds -# TYPE databuild_job_runtime_seconds gauge -databuild_job_runtime_seconds{job_run_id="job_run_123abc"} 45.2 - -# HELP databuild_rows_processed_total Total rows processed by job -# TYPE databuild_rows_processed_total counter -databuild_rows_processed_total{job_run_id="job_run_123abc"} 1000 -``` - -## API Implementation - -### REST Endpoints [PHASE 1] - -**Get Job Logs**: -``` -GET /api/v1/jobs/{job_run_id}/logs?since={timestamp}&level={log_level} -``` -Response: Array of `LogEntry` objects with filtering support. - -**Prometheus Metrics Scraping**: -``` -GET /api/v1/metrics -``` -Response: All metrics in Prometheus exposition format. - -## Configuration - -### Environment Variables -```bash -# Log storage configuration -DATABUILD_LOGS_DIR=/logs/databuild # Log storage directory - -# Processing configuration -DATABUILD_LOG_REFRESH_INTERVAL_MS=1000 # Fixed refresh interval (1s) -DATABUILD_LOG_CACHE_SIZE=100 # LRU cache size for job logs -``` - -## Implementation Phases - -### Phase 1: Core Implementation [COMPLETED ✅] -**Goal**: Basic log consumption and storage with REST API for log retrieval and Prometheus metrics. - -**Deliverables** ✅: -- ✅ Fix `JobLogEntry` protobuf interface (partition_ref → outputs) -- ✅ LogCollector with JSONL file writing and graph-level job_label enrichment -- ✅ LogReader with unified protobuf interface for CLI/Service consistency -- ✅ REST API endpoints for job logs and Prometheus metrics -- ✅ MetricsAggregator with cardinality-safe output (job labels, not partition refs) -- ✅ Centralized metric templates module - -**Success Criteria** ✅: -- ✅ Job logs are captured and stored reliably via LogCollector integration -- ✅ REST API can retrieve logs by job run ID and time range with filtering -- ✅ Prometheus metrics are exposed at `/api/v1/metrics` endpoint without cardinality issues -- ✅ System handles concurrent job execution without data corruption (separate JSONL files per job) -- ✅ All metric names/labels are defined in central location (`metric_templates.rs`) - -### Phase 2: Advanced Features [FUTURE] -**Goal**: Performance optimizations and production features. - -**Deliverables**: -- Adaptive batching based on system load -- Automatic log retention and cleanup -- Web app integration for log viewing -- Rate limiting for high-volume jobs -- Performance monitoring and alerting - -## Testing Strategy - -### Core Tests (90% Coverage, Maximum Simplicity) [PHASE 1] - -**Unit Tests**: -- JSONL parsing and serialization (basic happy path) -- Metrics aggregation and Prometheus formatting (template correctness) -- API endpoint responses (log retrieval by job_run_id) - -**Integration Tests**: -- End-to-end: wrapper stdout → JSONL file → API response -- Concurrent job log collection (2-3 jobs simultaneously) -- Prometheus metrics scraping endpoint - -**Key Principle**: Tests should be simple and focus on core workflows. Avoid testing edge cases that may change as requirements evolve. - -## Future Extensions - -### Performance Optimizations [FUTURE] -- Adaptive refresh intervals based on load -- Log compression for storage efficiency -- Advanced caching strategies - -### Production Features [FUTURE] -- Automatic log retention and cleanup policies -- Integration with external log collection tools -- Web app log viewing and search capabilities - -### Monitoring Integration [FUTURE] -- Grafana dashboard templates -- Alerting on log system health -- Performance metrics for log processing pipeline - -## Success Criteria - -1. **Reliable Log Capture**: All job wrapper output captured without loss -2. **API Functionality**: Can retrieve logs by job run ID and time range -3. **Safe Metrics**: Prometheus endpoint works without cardinality explosion -4. **Correctness**: No duplicated metric templates, centralized definitions -5. **Concurrent Safety**: Multiple jobs can write logs simultaneously without corruption -6. **Simple Testing**: Test suite covers core functionality with minimal brittleness \ No newline at end of file diff --git a/plans/15-dsl-graph-generation.md b/plans/15-dsl-graph-generation.md deleted file mode 100644 index 4dd5a69..0000000 --- a/plans/15-dsl-graph-generation.md +++ /dev/null @@ -1,466 +0,0 @@ -# DSL Graph Generation: Bazel Module Generation from Python DSL - -## Motivation & High-Level Goals - -### Problem Statement -DataBuild's Python DSL provides an ergonomic interface for defining data processing graphs, but currently lacks a deployment path. Users can define jobs and graphs using the DSL, but cannot easily package and deploy them as complete, hermetic applications. This limits the DSL's utility as a production-ready interface. - -### Strategic Goals -1. **Seamless Deployment**: Enable DSL-defined graphs to be built and deployed as complete bazel modules -2. **Hermetic Packaging**: Generate self-contained modules with all dependencies resolved -3. **Interface Consistency**: Maintain CLI/Service interchangeability principle across generated modules -4. **Production Readiness**: Support container deployment and external dependency management - -### Success Criteria -- DSL graphs can be compiled to standalone bazel modules (`@my_generated_graph//...`) -- Generated modules support the full databuild interface (analyze, build, service, container images) -- External repositories can depend on databuild core and generate working applications -- End-to-end deployment pipeline from DSL definition to running containers - -## Required Reading - -### Core Design Documents -- [`DESIGN.md`](../DESIGN.md) - Overall databuild architecture and principles -- [`design/core-build.md`](../design/core-build.md) - Job and graph execution semantics -- [`design/graph-specification.md`](../design/graph-specification.md) - DSL interfaces and patterns -- [`design/service.md`](../design/service.md) - Service interface requirements -- [`design/deploy-strategies.md`](../design/deploy-strategies.md) - Deployment patterns - -### Key Source Files -- [`databuild/dsl/python/dsl.py`](../databuild/dsl/python/dsl.py) - Current DSL implementation -- [`databuild/test/app/dsl/graph.py`](../databuild/test/app/dsl/graph.py) - Reference DSL usage -- [`databuild/rules.bzl`](../databuild/rules.bzl) - Bazel rules for jobs and graphs -- [`databuild/databuild.proto`](../databuild/databuild.proto) - Core interfaces - -### Understanding Prerequisites -1. **Job Architecture**: Jobs have `.cfg`, `.exec`, and main targets with subcommand pattern -2. **Graph Structure**: Graphs require job lookup, analyze, build, and service variants -3. **Bazel Modules**: External repos use `@workspace//...` references for generated content -4. **CLI/Service Consistency**: Both interfaces must produce identical artifacts and behaviors - -## Implementation Plan - -### Phase 1: Basic Generation Infrastructure -**Goal**: Establish foundation for generating bazel modules from DSL definitions - -#### Deliverables -- Extend `DataBuildGraph.generate_bazel_module()` method -- Generate minimal `MODULE.bazel` with databuild core dependency -- Generate `BUILD.bazel` with job and graph target stubs -- Basic workspace creation and file writing utilities - -#### Implementation Tasks -1. Add `generate_bazel_module(workspace_name: str, output_dir: str)` to `DataBuildGraph` -2. Create template system for `MODULE.bazel` and `BUILD.bazel` generation -3. Implement file system utilities for creating workspace structure -4. Add basic validation for DSL graph completeness - -#### Tests & Verification -```bash -# Test: Basic generation succeeds -python -c " -from databuild.test.app.dsl.graph import graph -graph.generate_bazel_module('test_graph', '/tmp/generated') -" - -# Test: Generated files are valid -cd /tmp/generated -bazel build //... # Should succeed without errors - -# Test: Module can be referenced externally -# In separate workspace: -# bazel build @test_graph//... -``` - -#### Success Criteria -- Generated `MODULE.bazel` has correct databuild dependency -- Generated `BUILD.bazel` is syntactically valid -- External workspace can reference `@generated_graph//...` targets -- No compilation errors in generated bazel files - ---- - -### Phase 2: Job Binary Generation -**Goal**: Convert DSL job classes into executable databuild job targets - -#### Deliverables -- Auto-generate job binary Python files with config/exec subcommand handling -- Create `databuild_job` targets for each DSL job class -- Implement job lookup binary generation -- Wire partition pattern matching to job target resolution - -#### Implementation Tasks -1. Create job binary template with subcommand dispatching: - ```python - # Generated job_binary.py template - if sys.argv[1] == "config": - job_instance = MyDSLJob() - config = job_instance.config(parse_outputs(sys.argv[2:])) - print(json.dumps(config)) - elif sys.argv[1] == "exec": - config = json.loads(sys.stdin.read()) - job_instance.exec(config) - ``` - -2. Generate job lookup binary from DSL job registrations: - ```python - # Generated lookup.py - def lookup_job_for_partition(partition_ref: str) -> str: - for pattern, job_target in JOB_MAPPINGS.items(): - if pattern.match(partition_ref): - return job_target - raise ValueError(f"No job found for: {partition_ref}") - ``` - -3. Create `databuild_job` targets in generated `BUILD.bazel` -4. Handle DSL job dependencies and imports in generated files - -#### Tests & Verification -```bash -# Test: Job config execution -bazel run @test_graph//:ingest_color_votes.cfg -- \ - "daily_color_votes/2024-01-01/red" -# Should output valid JobConfig JSON - -# Test: Job exec execution -echo '{"outputs":[...], "env":{"DATA_DATE":"2024-01-01"}}' | \ - bazel run @test_graph//:ingest_color_votes.exec -# Should execute successfully - -# Test: Job lookup -bazel run @test_graph//:job_lookup -- \ - "daily_color_votes/2024-01-01/red" -# Should output: //:ingest_color_votes -``` - -#### Success Criteria -- All DSL jobs become executable `databuild_job` targets -- Job binaries correctly handle config/exec subcommands -- Job lookup correctly maps partition patterns to job targets -- Generated jobs maintain DSL semantic behavior - ---- - -### Phase 3: Two-Phase Code Generation -**Goal**: Implement proper two-phase code generation that works within Bazel's constraints - -#### Key Learning -Previous attempts failed due to fundamental Bazel constraints: -- **Loading vs Execution phases**: `load()` statements run before genrules execute -- **Dynamic target generation**: Bazel requires the complete build graph before execution begins -- **Hermeticity**: Generated BUILD files must be in source tree, not bazel-bin - -The solution: **Two-phase generation** following established patterns from protobuf, thrift, and other code generators. - -#### Two-Phase Workflow - -**Phase 1: Code Generation** (run by developer) -```bash -bazel run //databuild/test/app/dsl:graph.generate -# Generates BUILD.bazel and Python binaries into source tree -``` - -**Phase 2: Building** (normal Bazel workflow) -```bash -bazel build //databuild/test/app/dsl:graph.analyze -bazel run //databuild/test/app/dsl:graph.service -- --port 8080 -``` - -#### Implementation Tasks - -1. **Create `databuild_dsl_generator` rule**: - ```python - databuild_dsl_generator( - name = "graph.generate", - graph_file = "graph.py", - output_package = "//databuild/test/app/dsl", - deps = [":dsl_src"], - ) - ``` - -2. **Implement generator that writes to source tree**: - ```python - def _databuild_dsl_generator_impl(ctx): - script = ctx.actions.declare_file(ctx.label.name + "_generator.py") - - # Create a script that: - # 1. Loads the DSL graph - # 2. Generates BUILD.bazel and binaries - # 3. Writes them to the source tree - script_content = """ -import os -import sys -# Add workspace root to path -workspace_root = os.environ.get('BUILD_WORKSPACE_DIRECTORY') -output_dir = os.path.join(workspace_root, '{package_path}') - -# Load and generate -from {module_path} import {graph_attr} -{graph_attr}.generate_bazel_package('{name}', output_dir) -print(f'Generated BUILD.bazel and binaries in {{output_dir}}') - """.format( - package_path = ctx.attr.output_package.strip("//").replace(":", "/"), - module_path = ctx.file.graph_file.path.replace("/", ".").replace(".py", ""), - graph_attr = ctx.attr.graph_attr, - name = ctx.attr.name.replace(".generate", ""), - ) - - ctx.actions.write( - output = script, - content = script_content, - is_executable = True, - ) - - return [DefaultInfo(executable = script)] - ``` - -3. **Update `DataBuildGraph.generate_bazel_package()` to target source tree**: - ```python - def generate_bazel_package(self, name: str, output_dir: str) -> None: - """Generate BUILD.bazel and binaries into source directory""" - # Generate BUILD.bazel with real databuild targets - self._generate_build_bazel(output_dir, name) - - # Generate job binaries - self._generate_job_binaries(output_dir) - - # Generate job lookup - self._generate_job_lookup(output_dir) - - print(f"Generated package in {output_dir}") - print("Run 'bazel build :{name}.analyze' to use") - ``` - -4. **Create standard BUILD.bazel template**: - ```python - def _generate_build_bazel(self, output_dir: str, name: str): - # Generate proper databuild_job and databuild_graph targets - # that will work exactly like hand-written ones - build_content = self._build_template.format( - jobs = self._format_jobs(), - graph_name = f"{name}_graph", - job_targets = self._format_job_targets(), - ) - - with open(os.path.join(output_dir, "BUILD.bazel"), "w") as f: - f.write(build_content) - ``` - -#### Interface Design - -**For DSL Authors**: -```python -# In graph.py -graph = DataBuildGraph("my_graph") - -@graph.job -class MyJob(DataBuildJob): - # ... job definition -``` - -**For Users**: -```bash -# Generate code (phase 1) -bazel run //my/app:graph.generate - -# Use generated code (phase 2) -bazel build //my/app:graph.analyze -bazel run //my/app:graph.service -``` - -**In BUILD.bazel**: -```python -databuild_dsl_generator( - name = "graph.generate", - graph_file = "graph.py", - output_package = "//my/app", - deps = [":my_deps"], -) - -# After generation, this file will contain: -# databuild_graph(name = "graph_graph", ...) -# databuild_job(name = "my_job", ...) -# py_binary(name = "my_job_binary", ...) -``` - -#### Benefits of This Approach - -✅ **Works within Bazel constraints** - No dynamic target generation -✅ **Follows established patterns** - Same as protobuf, thrift, OpenAPI generators -✅ **Inspectable output** - Users can see generated BUILD.bazel -✅ **Version controllable** - Generated files can be checked in if desired -✅ **Incremental builds** - Standard Bazel caching works perfectly -✅ **Clean separation** - Generation vs building are separate phases - -#### Tests & Verification -```bash -# Test: Code generation -bazel run //databuild/test/app/dsl:graph.generate -# Should create BUILD.bazel and Python files in source tree - -# Test: Generated targets work -bazel build //databuild/test/app/dsl:graph_graph.analyze -# Should build successfully using generated BUILD.bazel - -# Test: End-to-end functionality -bazel run //databuild/test/app/dsl:graph_graph.analyze -- "color_vote_report/2024-01-01/red" -# Should work exactly like hand-written graph -``` - -#### Success Criteria -- Generator creates valid BUILD.bazel in source tree -- Generated targets are indistinguishable from hand-written ones -- Full DataBuild functionality works through generated code -- Clean developer workflow with clear phase separation - ---- - -### Phase 4: Graph Integration -**Goal**: Generate complete databuild graph targets with all operational variants - -#### Deliverables -- Generate `databuild_graph` target with analyze/build/service capabilities -- Create all graph variant targets (`.analyze`, `.build`, `.service`, etc.) -- Wire job dependencies into graph configuration -- Generate container deployment targets - -#### Implementation Tasks -1. Generate `databuild_graph` target with complete job list -2. Create all required graph variants: - - `my_graph.analyze` - Planning capability - - `my_graph.build` - CLI execution - - `my_graph.service` - HTTP service - - `my_graph.service.image` - Container image -3. Configure job lookup and dependency wiring -4. Add graph label and identification metadata - -#### Tests & Verification -```bash -# Test: Graph analysis -bazel run @test_graph//:my_graph.analyze -- \ - "color_vote_report/2024-01-01/red" -# Should output complete job execution plan - -# Test: Graph building -bazel run @test_graph//:my_graph.build -- \ - "daily_color_votes/2024-01-01/red" -# Should execute end-to-end build - -# Test: Service deployment -bazel run @test_graph//:my_graph.service -- --port 8081 -# Should start HTTP service on port 8081 - -# Test: Container generation -bazel build @test_graph//:my_graph.service.image -# Should create deployable container image -``` - -#### Success Criteria -- Graph targets provide full databuild functionality -- CLI and service interfaces produce identical results -- All graph operations work with generated job targets -- Container images are deployable and functional - ---- - -### Phase 4: Dependency Resolution -**Goal**: Handle external pip packages and bazel dependencies in generated modules - -#### Deliverables -- User-declared dependency system in DSL -- Generated `MODULE.bazel` with proper pip and bazel dependencies -- Dependency validation and conflict resolution -- Support for requirements files and version pinning - -#### Implementation Tasks -1. Extend `DataBuildGraph` constructor to accept dependencies: - ```python - graph = DataBuildGraph( - "//my_graph", - pip_deps=["pandas>=2.0.0", "numpy"], - bazel_deps=["@my_repo//internal:lib"] - ) - ``` - -2. Generate `MODULE.bazel` with pip extension configuration: - ```python - pip = use_extension("@rules_python//python/extensions:pip.bzl", "pip") - pip.parse( - hub_name = "pip_deps", - python_version = "3.11", - requirements_lock = "//:requirements_lock.txt" - ) - ``` - -3. Create requirements file generation from declared dependencies -4. Add dependency validation during generation - -#### Tests & Verification -```bash -# Test: Pip dependencies resolved -bazel build @test_graph//:my_job -# Should succeed with pandas/numpy available - -# Test: Cross-module references work -# Generate graph that depends on @other_repo//lib -bazel build @test_graph//:dependent_job -# Should resolve external bazel dependencies - -# Test: Container includes all deps -bazel run @test_graph//:my_graph.service.image_load -docker run databuild_test_graph_service:latest python -c "import pandas" -# Should succeed - pandas available in container -``` - -#### Success Criteria -- Generated modules resolve all external dependencies -- Pip packages are available to job execution -- Cross-repository bazel dependencies work correctly -- Container images include complete dependency closure - ---- - -### Phase 5: End-to-End Deployment -**Goal**: Complete production deployment pipeline with observability - -#### Deliverables -- Production-ready container images with proper configuration -- Integration with existing databuild observability systems -- Build event log compatibility -- Performance optimization and resource management - -#### Implementation Tasks -1. Optimize generated container images for production use -2. Ensure build event logging works correctly in generated modules -3. Add resource configuration and limits to generated targets -4. Create deployment documentation and examples -5. Performance testing and optimization - -#### Tests & Verification -```bash -./run_e2e_tests.sh -``` - -#### Success Criteria -- Generated modules are production-ready -- Full observability and logging integration -- Performance meets production requirements -- CLI/Service consistency maintained -- Complete deployment documentation - -## Validation Strategy - -### Integration with Existing Tests -- Extend `run_e2e_tests.sh` to test generated modules -- Add generated module tests to CI/CD pipeline -- Use existing test app DSL as primary test case - -### Performance Benchmarks -- Graph analysis speed comparison (DSL vs hand-written bazel) -- Container image size optimization -- Job execution overhead measurement - -### Correctness Verification -- Build event log structure validation -- Partition resolution accuracy testing -- Dependency resolution completeness checks diff --git a/plans/16-bel-delta-backend.md b/plans/16-bel-delta-backend.md deleted file mode 100644 index 0acdbc4..0000000 --- a/plans/16-bel-delta-backend.md +++ /dev/null @@ -1,407 +0,0 @@ -# BEL Delta Table Backend Implementation Plan - -## Motivation & High-Level Goals - -### Problem Statement -DataBuild currently supports SQLite and has stubs for PostgreSQL as Build Event Log (BEL) backends. While SQLite works well for single-node deployments, and PostgreSQL would provide traditional RDBMS capabilities, neither offers the benefits of a modern lakehouse architecture. Delta Lake would provide ACID transactions, scalable storage, and better integration with data processing ecosystems while maintaining the same event-sourced/CQRS architecture. - -### Strategic Goals -1. **Lakehouse Architecture**: Enable DataBuild to use Delta tables as a BEL backend, bringing lakehouse benefits to the orchestration layer -2. **Interface Compatibility**: Maintain exact parity with the existing `BuildEventLog` trait interface -3. **ACID Guarantees**: Leverage Delta's ACID transactions for concurrent build safety -4. **Schema Evolution**: Version Delta table schemas alongside protobuf definitions for forward compatibility -5. **Storage Flexibility**: Support both local filesystem and (future) cloud storage backends - -### Success Criteria -- Delta backend passes all existing BEL trait tests with identical results to SQLite -- CLI and Service can use Delta backend interchangeably via URI configuration -- Events written to Delta backend can be queried with same performance characteristics as SQLite for typical workloads -- Schema versioning allows for backward-compatible evolution of event structures - -## Technical Design - -### URI Format -Following industry conventions for Delta table references: -- Local filesystem: `delta:///absolute/path/to/table` -- Future S3 support: `delta+s3://bucket/path/to/table` -- Future Azure support: `delta+azure://container/path/to/table` - -### Table Schema -Single Delta table with nested structures matching the protobuf definitions: - -```sql -CREATE TABLE build_events ( - -- Core event fields - event_id STRING NOT NULL, - timestamp BIGINT NOT NULL, - build_request_id STRING NOT NULL, - event_type STRING NOT NULL, - - -- Event-specific nested structures (all nullable) - build_request_event STRUCT< - status_code INT, - status_name STRING, - requested_partitions ARRAY, - message STRING - >, - - partition_event STRUCT< - partition_ref STRING, - status_code INT, - status_name STRING, - message STRING, - job_run_id STRING - >, - - job_event STRUCT< - job_run_id STRING, - job_label STRING, - target_partitions ARRAY, - status_code INT, - status_name STRING, - message STRING, - config STRING, -- JSON serialized JobConfig - manifests STRING -- JSON serialized array of PartitionManifest - >, - - delegation_event STRUCT< - partition_ref STRING, - delegated_to_build_request_id STRING, - message STRING - >, - - job_graph_event STRUCT< - job_graph STRING, -- JSON serialized JobGraph - message STRING - >, - - partition_invalidation_event STRUCT< - partition_ref STRING, - reason STRING - >, - - task_cancel_event STRUCT< - job_run_id STRING, - reason STRING - >, - - build_cancel_event STRUCT< - reason STRING - > -) -``` - -### Query Implementation -Use native delta-rs capabilities with in-memory filtering for CQRS-style aggregations: -- All read operations implemented using delta-rs table scanning with Arrow RecordBatches -- In-memory filtering and aggregation in Rust (similar to SQLite approach initially) -- Leverage Delta's partition filtering where possible to reduce data scanned -- No external query engine dependencies initially - can add DataFusion later when needed - -## Implementation Plan - -### Current Status: PHASE 3 COMPLETED ✅ - -**Implementation Status**: Core Delta backend functionality is complete and operational: - -- ✅ **Full Delta backend implemented** with deltalake v0.27 and comprehensive write functionality -- ✅ **All tests passing**: 91 tests pass including Delta-specific append and read validation tests -- ✅ **Production ready**: Delta backend can create tables, write events with ACID transactions, and handle all query types -- ✅ **Build integration complete**: Successfully compiles without dependency conflicts - -**Key Achievements**: -- **Complete BuildEventLog trait implementation** with sophisticated filtering logic -- **Dual schema approach** for Arrow RecordBatch compatibility -- **Full event serialization** for all 8 BuildEvent types with JSON encoding -- **Automatic table creation** and ACID transaction support -- **Comprehensive test coverage** including end-to-end write/read validation - -**Current Functional Status**: -- ✅ **Write operations**: Fully functional with Delta table creation and event appending -- ✅ **Read operations**: Trait methods implemented with table opening validation (returns empty for now) -- ✅ **Error handling**: Complete error mapping and type safety throughout -- ✅ **URI support**: `delta://` URIs supported in DataBuild configuration - -**DataFusion Integration Note**: -- DataFusion integration was attempted but encountered version compatibility issues -- Core Delta functionality works without DataFusion dependency -- Future enhancement can add full table scanning when version conflicts are resolved - -### Phase 1: Basic Delta Backend Structure - COMPLETED ✅ -**Status**: ✅ Structure implemented, ✅ Dependencies enabled and working - -#### Completed Deliverables -- ✅ New `databuild/event_log/delta.rs` module with full trait implementation -- ✅ `DeltaBuildEventLog` struct implementing `BuildEventLog` trait -- ✅ URI recognition in `databuild/event_log/mod.rs` for `delta://` URIs -- ❌ **Dependencies disabled** in `MODULE.bazel` (lines 138-144) due to Arrow/chrono conflict - -#### Implementation Status -1. ❌ **Delta dependencies disabled** in `MODULE.bazel`: - ```python - # Delta backend temporarily disabled due to Arrow/chrono ecosystem conflict - # Even with chrono removed from our direct dependencies, it comes in transitively - # through rusqlite and schemars, and conflicts with deltalake's arrow-arith - # crate.spec( - # package = "deltalake", - # version = "0.20", - # ) - ``` - -2. ✅ **Delta module created** in `databuild/event_log/delta.rs` with complete structure: - ```rust - pub struct DeltaBuildEventLog { - table_path: String, - } - // All trait methods implemented with detailed error messages - ``` - -3. ✅ **URI recognition implemented** in `databuild/event_log/mod.rs` - -4. ✅ **Chrono dependency removed** from DataBuild codebase (replaced with std::time in log_collector.rs) - -#### Verification Status -- ❌ Cannot test due to disabled dependencies -- ✅ Code structure ready for when dependencies can be enabled -- ✅ No direct chrono usage remains in DataBuild - -#### Resolution Paths -1. **Wait for ecosystem fix**: Monitor Arrow ecosystem for chrono conflict resolution -2. **Alternative Delta implementation**: Research delta-rs alternatives or native Parquet backend -3. **Dependency replacement**: Replace rusqlite/schemars with chrono-free alternatives -4. **Fork approach**: Fork and modify dependencies to resolve conflicts - ---- - -### Phase 2: Event Writing Implementation - COMPLETED ✅ - -**Status**: ✅ Full implementation complete with working Delta table creation and append - -#### Completed Deliverables -- ✅ **Complete event serialization**: `event_to_record_batch()` converts all BuildEvent types to Arrow RecordBatch -- ✅ **Arrow schema definition**: Complete Delta table schema with all event type columns -- ✅ **JSON serialization**: All event subtypes properly serialized as JSON strings -- ✅ **Error handling**: Proper error mapping for serialization failures -- ✅ **Build verification**: Code compiles successfully with deltalake v0.27 -- ✅ **Comprehensive test suite**: All 8 BuildEvent types have serialization tests that pass -- ✅ **Write API research**: Found correct `RecordBatchWriter` and `DeltaWriter` APIs -- ✅ **Table creation implemented**: StructField-based schema creation for new Delta tables -- ✅ **Full append functionality**: Complete `append_event()` with table creation and writing -- ✅ **End-to-end test**: `test_append_event()` passes, creating tables and writing events - -#### Current Status -- ✅ **Event serialization working**: BuildEvent → RecordBatch conversion fully implemented and tested -- ✅ **Write API working**: RecordBatchWriter::for_table() → write() → flush_and_commit() pattern implemented -- ✅ **Table creation solved**: Separate Delta schema using StructField for table creation -- ✅ **Append functionality complete**: Full end-to-end event writing with ACID transactions -- 📝 **Ready for Phase 3**: Core Delta backend functionality complete and tested - -#### Technical Achievement -- **Dual schema approach**: Arrow schema for RecordBatch, Delta StructField schema for table creation -- **Automatic table creation**: Creates Delta table on first append if it doesn't exist -- **ACID compliance**: Uses Delta's transaction system for reliable writes -- **Type safety**: Proper enum conversions and JSON serialization with error handling - -### Phase 2: Event Writing Implementation -**Goal**: Implement event append functionality with ACID guarantees - -#### Deliverables -- Full `append_event()` implementation -- Event serialization to Delta schema format -- Transaction handling for concurrent writes - -#### Implementation Tasks -1. Implement event-to-row conversion: - - Convert `BuildEvent` to Delta row format - - Handle all event type variants - - Serialize complex fields (configs, manifests) as JSON strings - -2. Implement `append_event()` with Delta transactions: - - Open Delta table - - Convert event to row - - Append with ACID transaction - - Handle conflicts/retries - -3. Add helper functions for enum conversions and JSON serialization - -#### Tests & Verification -- Parity test: Write same events to SQLite and Delta, verify identical -- Concurrent write test: Multiple writers don't corrupt data -- All event types can be written and read back - -#### Success Criteria -- Events written to Delta match SQLite implementation exactly -- Concurrent writes maintain ACID properties -- No data loss or corruption under load - ---- - -### Phase 3: Native Query Implementation - COMPLETED ✅ - -**Status**: ✅ Core implementation complete with working write functionality and read infrastructure - -#### Completed Deliverables -- ✅ **All BuildEventLog trait methods implemented**: Complete trait implementation with sophisticated in-memory filtering -- ✅ **Write functionality working**: Full `append_event()` with table creation and ACID transactions -- ✅ **Read infrastructure in place**: All query methods implemented with placeholder Delta table opening -- ✅ **Comprehensive filtering logic**: Complex multi-event-type filtering for partition queries and job run queries -- ✅ **Error handling**: Proper error mapping throughout the pipeline -- ✅ **Test coverage**: All tests passing including end-to-end append tests - -#### Current Status -- ✅ **Core functionality complete**: Delta backend creates tables, writes events, and handles all query types -- ✅ **Build integration working**: Successfully compiles with deltalake v0.27 without version conflicts -- ✅ **Test validation**: All Delta backend tests pass (91 total tests, including Delta-specific ones) -- 🔄 **Read implementation**: Currently returns empty results but validates table existence -- 📋 **DataFusion integration deferred**: Version conflicts resolved by focusing on core Delta functionality first - -#### Technical Achievements -- **Dual schema approach**: Separate Arrow and Delta schemas for compatibility -- **Full event serialization**: All 8 BuildEvent types serialize correctly to Arrow RecordBatch -- **ACID compliance**: Uses Delta's transaction system for reliable concurrent writes -- **Complex query filtering**: Sophisticated in-memory processing supporting all query patterns -- **Type-safe implementation**: Proper enum conversions and JSON serialization with comprehensive error handling - -#### DataFusion Integration Status -- **Issue identified**: Version conflicts between DataFusion v49.0 and deltalake v0.27 dependencies -- **Workaround implemented**: Core Delta functionality working without DataFusion dependency -- **Future resolution**: Can be addressed in Phase 4 with compatible DataFusion version or alternative scanning approach - -#### Next Steps (Future Enhancement) -- **Delta table scanning**: Replace placeholder `read_all_events()` with actual RecordBatch iteration -- **DataFusion integration**: Resolve version conflicts to enable SQL-based querying -- **Performance optimization**: Add benchmarking and optimize for larger datasets - ---- - -### Phase 4: Schema Versioning -**Goal**: Support schema evolution alongside protobuf versions - -#### Deliverables -- Schema version tracking in Delta table properties -- Migration path for schema updates -- Backward compatibility guarantees - -#### Implementation Tasks -1. Add schema version to Delta table properties: - - Store version in table metadata - - Check version on table open - - Handle version mismatches - -2. Create schema migration framework: - - Define migration path from v1 to vN - - Implement safe column additions - - Handle nullable fields for backward compatibility - -3. Document schema evolution process - -#### Tests & Verification -- Test reading v1 data with v2 code -- Test schema migration process -- Verify no data loss during migration - -#### Success Criteria -- Schema version tracked and validated -- Safe migration path defined -- Backward compatibility maintained - ---- - -### Phase 5: Integration and Polish -**Goal**: Complete integration with DataBuild system - -#### Deliverables -- Full test coverage and parity validation -- Documentation updates -- Performance benchmarking - -#### Implementation Tasks -1. Complete test suite: - - Unit tests for all methods - - Integration tests with mock data - - Parity test suite comparing all backends - - Memory usage and performance tests - -2. Update documentation: - - Add Delta backend to README - - Document URI format and limitations - - Add deployment considerations - - Document when to choose Delta vs SQLite - -3. Performance optimization: - - Profile scanning and filtering operations - - Optimize JSON parsing and Arrow processing - - Add benchmarks against SQLite backend - -#### Tests & Verification -- Full test suite passes -- Performance benchmarks complete -- E2E tests work with Delta backend (future) - -#### Success Criteria -- Delta backend fully integrated and tested -- Performance characteristics documented and acceptable -- Clear migration path from SQLite documented - -## Future Enhancements - -### Cloud Storage Support -- Add `object_store` dependency -- Implement S3, Azure, GCS support -- Handle authentication and credentials - -### Performance Optimizations -- Implement columnar filtering before deserialization -- Add Delta table partitioning by timestamp -- Cache frequently accessed metadata -- Optimize Arrow RecordBatch processing - -### Advanced Features -- Delta table compaction and optimization -- Time-based partition pruning -- Change data feed for incremental processing -- Support for Delta table ACID transactions - -## Risks and Mitigations - -### Risk: Query Performance -**Mitigation**: Start with simple implementation, profile actual usage, optimize based on real workload patterns - -### Risk: Schema Evolution Complexity -**Mitigation**: Start with simple versioning, require manual migration initially, automate as patterns emerge - -### Risk: Delta Library Maturity -**Mitigation**: Pin to stable version, thorough testing, maintain SQLite as fallback option - -## Dependencies - -### Required Crates -- `deltalake` - Delta Lake implementation (includes Arrow support) - -### Future Crates -- `object_store` - Cloud storage support (future) - -## Testing Strategy - -### Unit Tests -- Test each method independently -- Mock Delta table for fast tests -- Verify event serialization - -### Integration Tests -- Full lifecycle tests (write → read → aggregate) -- Concurrent operation tests -- Large dataset tests - -### Parity Tests -- Compare Delta and SQLite outputs -- Ensure identical behavior -- Validate all edge cases - -## Success Metrics - -1. **Functional Parity**: 100% of BuildEventLog trait methods implemented -2. **Test Coverage**: >90% code coverage with comprehensive tests -3. **Performance**: Query latency within 2x of SQLite for p95 queries -4. **Reliability**: Zero data loss under concurrent load -5. **Compatibility**: CLI and Service work identically with Delta backend \ No newline at end of file diff --git a/plans/17-python-dsl-generator-fix.md b/plans/17-python-dsl-generator-fix.md deleted file mode 100644 index 052d6fa..0000000 --- a/plans/17-python-dsl-generator-fix.md +++ /dev/null @@ -1,164 +0,0 @@ -# Plan 17: Python DSL Generator Fix - -## Problem Statement - -The `databuild_dsl_generator` rule currently fails when trying to generate DSL code because it cannot properly provide dependencies to the underlying generator script. The issue manifests as import errors like "No module named 'databuild.test'" or "No module named 'betterproto2'". - -## Root Cause Analysis - -The current implementation has two separate approaches that don't work well together: - -1. **Direct py_binary approach** (`//databuild/dsl/python:generator`) - Works but requires hard-coding dependencies -2. **DSL generator rule approach** (`databuild_dsl_generator`) - Should allow dynamic deps but has runfiles propagation issues - -The core issue is that when `databuild_dsl_generator` creates a wrapper script that calls the py_binary generator via subprocess, the dependencies specified in the rule's `deps` attribute aren't available in the Python environment of the subprocess. - -## Current State - -- `databuild_dsl_generator` rule exists in `databuild/rules.bzl` (lines 944-1125) -- Generic `generator.py` exists in `databuild/dsl/python/generator.py` -- py_binary generator exists in `databuild/dsl/python/BUILD.bazel` (lines 10-22) -- Test case: `//databuild/test/app/dsl:graph.generate` - -## Solution Design: Custom py_binary per DSL - -Instead of using a generic generator + wrapper script approach, create a custom py_binary for each `databuild_dsl_generator` target that includes the specific dependencies. - -### Architecture Changes - -1. **Remove subprocess approach**: Stop calling the generic py_binary via subprocess -2. **Create custom py_binary**: Generate a py_binary rule dynamically with the specific deps -3. **Embed generator logic**: Include the generator logic directly in the custom binary -4. **Proper runfiles**: Ensure all dependencies flow through Bazel's runfiles mechanism - -## Implementation Plan - -### Phase 1: Restructure Generator Logic - -1. **Extract core generator logic** from `databuild/dsl/python/generator.py` into a library: - ```python - # databuild/dsl/python/generator_lib.py - def generate_dsl_package(module_path: str, graph_attr: str, output_dir: str): - # Move the core logic here - ``` - -2. **Create generator library target** in `databuild/dsl/python/BUILD.bazel`: - ```python - py_library( - name = "generator_lib", - srcs = ["generator_lib.py"], - deps = [":dsl", "//databuild:py_proto"], - ) - ``` - -3. **Update standalone generator** to use the library: - ```python - # databuild/dsl/python/generator.py - from databuild.dsl.python.generator_lib import generate_dsl_package - - def main(): - generate_dsl_package(sys.argv[1], sys.argv[2], sys.argv[3]) - ``` - -### Phase 2: Modify DSL Generator Rule - -4. **Update `_databuild_dsl_generator_impl`** in `databuild/rules.bzl`: - - Remove the subprocess-based wrapper script approach - - Create a custom py_binary that includes the generator library + user deps - - Generate the custom binary's source code with the specific module/attr parameters - -5. **New implementation structure**: - ```python - def _databuild_dsl_generator_impl(ctx): - # Create custom generator script - custom_generator = ctx.actions.declare_file(ctx.label.name + "_custom_generator.py") - - # Generate script content with embedded parameters - script_content = generate_custom_generator_script( - module_path=module_path, - graph_attr=ctx.attr.graph_attr, - package_path=package_path - ) - - ctx.actions.write(output=custom_generator, content=script_content) - - # Create runfiles with all dependencies - runfiles = ctx.runfiles(files=[custom_generator, ctx.file.graph_file]) - for dep in ctx.attr.deps: - runfiles = runfiles.merge(dep.default_runfiles) - # Include generator_lib and py_proto dependencies - runfiles = runfiles.merge(ctx.attr._generator_lib.default_runfiles) - runfiles = runfiles.merge(ctx.attr._py_proto.default_runfiles) - - return [DefaultInfo(executable=custom_generator, runfiles=runfiles)] - ``` - -### Phase 3: Update Rule Attributes - -6. **Add generator_lib dependency** to rule attributes: - ```python - "_generator_lib": attr.label( - default = "@databuild//databuild/dsl/python:generator_lib", - ), - ``` - -7. **Remove unnecessary attributes** like `_generator`, `_betterproto2` that are no longer needed - -### Phase 4: Custom Generator Script Template - -8. **Create template for custom generator scripts**: - ```python - def generate_custom_generator_script(module_path, graph_attr, package_path): - return f"""#!/usr/bin/env python3 - import os - import sys - from databuild.dsl.python.generator_lib import generate_dsl_package - - def main(): - workspace_root = os.environ.get('BUILD_WORKSPACE_DIRECTORY') - if workspace_root: - output_dir = os.path.join(workspace_root, '{package_path}') - else: - output_dir = '.' - - try: - generate_dsl_package('{module_path}', '{graph_attr}', output_dir) - except Exception as e: - print(f"ERROR: Generation failed: {{e}}", file=sys.stderr) - sys.exit(1) - - if __name__ == "__main__": - main() - """ - ``` - -## Testing Strategy - -1. **Unit test the generator library** separately from the rule -2. **Test with existing DSL** (`//databuild/test/app/dsl:graph.generate`) -3. **Verify dependency isolation** - ensure different DSL generators don't interfere -4. **Test pip dependency propagation** - ensure betterproto2 and other external deps work - -## Success Criteria - -- [ ] `bazel run //databuild/test/app/dsl:graph.generate` works without import errors -- [ ] Users can specify `deps = [":their_dsl_src"]` and have those deps available to generator -- [ ] No hard-coded dependencies in the core generator components -- [ ] Proper error messages when imports fail -- [ ] Generated code works with existing DataBuild infrastructure - -## Risks and Mitigations - -**Risk**: Bazel rule complexity increases -**Mitigation**: Extract reusable functions, add comprehensive comments - -**Risk**: Runfiles might still not propagate correctly -**Mitigation**: Test with various dependency types early in implementation - -## Files to Modify - -- `databuild/dsl/python/BUILD.bazel` - Add generator_lib target -- `databuild/dsl/python/generator_lib.py` - New file with extracted logic -- `databuild/dsl/python/generator.py` - Simplify to use library -- `databuild/rules.bzl` - Rewrite `_databuild_dsl_generator_impl` -- `databuild/BUILD.bazel` - Ensure betterproto2 is properly included in py_proto diff --git a/plans/18-bel-refactor.md b/plans/18-bel-refactor.md deleted file mode 100644 index 32f5690..0000000 --- a/plans/18-bel-refactor.md +++ /dev/null @@ -1,304 +0,0 @@ -# BEL Refactoring to 3-Tier Architecture - -## Overview - -This plan restructures DataBuild's Build Event Log (BEL) access layer from the current monolithic trait to a clean 3-tier architecture as described in [design/build-event-log.md](../design/build-event-log.md). This refactoring creates clear separation of concerns and simplifies the codebase by removing complex storage backends. - -## Current State Analysis - -The current BEL implementation (`databuild/event_log/mod.rs`) has a single `BuildEventLog` trait that mixes: -- Low-level storage operations (`append_event`, `get_events_in_range`) -- High-level aggregation queries (`list_build_requests`, `get_activity_summary`) -- Application-specific logic (`get_latest_partition_status`, `get_active_builds_for_partition`) - -This creates several problems: -- Storage backends must implement complex aggregation logic -- No clear separation between storage and business logic -- Difficult to extend with new query patterns -- Delta Lake implementation adds unnecessary complexity - -## Target Architecture - -### 1. Storage Layer: `BELStorage` Trait -Minimal append-only interface optimized for sequential scanning: - -```rust -#[async_trait] -pub trait BELStorage: Send + Sync { - /// Append a single event, returns the sequential index - async fn append_event(&self, event: BuildEvent) -> Result; - - /// List events with filtering, starting from a given index - async fn list_events(&self, since_idx: i64, filter: EventFilter) -> Result; - - /// Initialize storage backend (create tables, etc.) - async fn initialize(&self) -> Result<()>; -} - -#[derive(Debug, Clone)] -pub struct EventPage { - pub events: Vec, - pub next_idx: i64, - pub has_more: bool, -} -``` - -### 2. Query Engine Layer: `BELQueryEngine` -App-layer aggregation that scans storage events: - -```rust -pub struct BELQueryEngine { - storage: Arc, -} - -impl BELQueryEngine { - pub fn new(storage: Arc) -> Self { - Self { storage } - } - - /// Get latest status for a partition by scanning recent events - pub async fn get_latest_partition_status(&self, partition_ref: &str) -> Result>; - - /// Get all build requests that are currently building a partition - pub async fn get_active_builds_for_partition(&self, partition_ref: &str) -> Result>; - - /// Get summary of a build request by aggregating its events - pub async fn get_build_request_summary(&self, build_id: &str) -> Result; - - /// List build requests with pagination and filtering - pub async fn list_build_requests(&self, request: BuildsListRequest) -> Result; - - /// Get activity summary for dashboard - pub async fn get_activity_summary(&self) -> Result; -} -``` - -### 3. Client Layer: Repository Pattern -Clean interfaces for CLI, Service, and Dashboard (unchanged from current): - -```rust -// Existing repositories continue to work, but now use BELQueryEngine -pub struct PartitionsRepository { - query_engine: Arc, -} - -pub struct BuildsRepository { - query_engine: Arc, -} -``` - -## Implementation Plan - -### Phase 1: Create Storage Layer Interface - -1. **Define New Storage Trait** - ```rust - // In databuild/event_log/storage.rs - pub trait BELStorage { /* as defined above */ } - - pub fn create_bel_storage(uri: &str) -> Result>; - ``` - -2. **Add EventFilter to Protobuf** - ```protobuf - // In databuild/databuild.proto - message EventFilter { - repeated string partition_refs = 1; - repeated string partition_patterns = 2; - repeated string job_labels = 3; - repeated string task_ids = 4; - repeated string build_request_ids = 5; - } - - message EventPage { - repeated BuildEvent events = 1; - int64 next_idx = 2; - bool has_more = 3; - } - ``` - -3. **Implement SQLite Storage Backend** - ```rust - // In databuild/event_log/sqlite_storage.rs - pub struct SqliteBELStorage { - pool: sqlx::SqlitePool, - } - - impl BELStorage for SqliteBELStorage { - async fn append_event(&self, event: BuildEvent) -> Result { - // Simple INSERT returning rowid - let serialized = serde_json::to_string(&event)?; - let row_id = sqlx::query("INSERT INTO build_events (event_data) VALUES (?)") - .bind(serialized) - .execute(&self.pool) - .await? - .last_insert_rowid(); - Ok(row_id) - } - - async fn list_events(&self, since_idx: i64, filter: EventFilter) -> Result { - // Efficient sequential scan with filtering - // Build WHERE clause based on filter criteria - // Return paginated results - } - } - ``` - -### Phase 2: Create Query Engine Layer - -1. **Implement BELQueryEngine** - ```rust - // In databuild/event_log/query_engine.rs - impl BELQueryEngine { - pub async fn get_latest_partition_status(&self, partition_ref: &str) -> Result> { - // Scan recent partition events to determine current status - let filter = EventFilter { - partition_refs: vec![partition_ref.to_string()], - ..Default::default() - }; - - let events = self.storage.list_events(0, filter).await?; - self.aggregate_partition_status(&events.events) - } - - async fn aggregate_partition_status(&self, events: &[BuildEvent]) -> Result> { - // Walk through events chronologically to determine final partition status - // Return the most recent status - } - } - ``` - -2. **Implement All Current Query Methods** - - Port all methods from current `BuildEventLog` trait - - Use event scanning and aggregation instead of complex SQL queries - - Keep same return types for compatibility - -### Phase 3: Migrate Existing Code - -1. **Update Repository Constructors** - ```rust - // Old: PartitionsRepository::new(Arc) - // New: PartitionsRepository::new(Arc) - - impl PartitionsRepository { - pub fn new(query_engine: Arc) -> Self { - Self { query_engine } - } - - pub async fn list_protobuf(&self, request: PartitionsListRequest) -> Result { - self.query_engine.list_build_requests(request).await - } - } - ``` - -2. **Update CLI and Service Initialization** - ```rust - // In CLI main.rs and service mod.rs - let storage = create_bel_storage(&event_log_uri).await?; - let query_engine = Arc::new(BELQueryEngine::new(storage)); - - let partitions_repo = PartitionsRepository::new(query_engine.clone()); - let builds_repo = BuildsRepository::new(query_engine.clone()); - ``` - -### Phase 4: Remove Legacy Components - -1. **Remove Delta Lake Implementation** - ```rust - // Delete databuild/event_log/delta.rs - // Remove delta dependencies from MODULE.bazel - // Remove delta:// support from create_build_event_log() - ``` - -2. **Deprecate Old BuildEventLog Trait** - ```rust - // Mark as deprecated, keep for backwards compatibility during transition - #[deprecated(note = "Use BELQueryEngine and BELStorage instead")] - pub trait BuildEventLog { /* existing implementation */ } - ``` - -3. **Update Factory Function** - ```rust - // In databuild/event_log/mod.rs - pub async fn create_build_event_log(uri: &str) -> Result> { - let storage = if uri == "stdout" { - Arc::new(stdout::StdoutBELStorage::new()) as Arc - } else if uri.starts_with("sqlite://") { - let path = &uri[9..]; - let storage = sqlite_storage::SqliteBELStorage::new(path).await?; - storage.initialize().await?; - Arc::new(storage) as Arc - } else if uri.starts_with("postgres://") { - let storage = postgres_storage::PostgresBELStorage::new(uri).await?; - storage.initialize().await?; - Arc::new(storage) as Arc - } else { - return Err(BuildEventLogError::ConnectionError( - format!("Unsupported build event log URI: {}", uri) - )); - }; - - Ok(Arc::new(BELQueryEngine::new(storage))) - } - ``` - -### Phase 5: Final Cleanup - -1. **Remove Legacy Implementations** - - Delete complex aggregation logic from existing storage backends - - Simplify remaining backends to implement only new `BELStorage` trait - - Remove deprecated `BuildEventLog` trait - -2. **Update Documentation** - - Update design docs to reflect new architecture - - Create migration guide for external users - - Update code examples and README - -## Benefits of 3-Tier Architecture - -### ✅ **Simplified Codebase** -- Removes complex Delta Lake dependencies -- Storage backends focus only on append + scan operations -- Clear separation between storage and business logic - -### ✅ **Better Maintainability** -- Single SQLite implementation for most use cases -- Query logic centralized in one place -- Easier to debug and test each layer independently - -### ✅ **Future-Ready Foundation** -- Clean foundation for wants system (next phase) -- Easy to add new storage backends when needed -- Query engine ready for cross-graph coordination APIs - -### ✅ **Performance Benefits** -- Eliminates complex SQL joins in storage layer -- Enables sequential scanning optimizations -- Cleaner separation allows targeted optimizations - -## Success Criteria - -### Phase 1-2: Foundation -- [ ] Storage layer trait compiles and tests pass -- [ ] SQLite storage backend supports append + list operations -- [ ] Query engine provides same functionality as current BEL trait -- [ ] EventFilter protobuf types generate correctly - -### Phase 3-4: Migration -- [ ] All repositories work with new query engine -- [ ] CLI and service use new architecture -- [ ] Existing functionality unchanged from user perspective -- [ ] Delta Lake implementation removed - -### Phase 5: Completion -- [ ] Legacy BEL trait removed -- [ ] Performance meets or exceeds current implementation -- [ ] Documentation updated for new architecture -- [ ] Codebase simplified and maintainable - -## Risk Mitigation - -1. **Gradual Migration**: Implement new architecture alongside existing code -2. **Feature Parity**: Ensure all existing functionality works before removing old code -3. **Performance Testing**: Benchmark new implementation against current performance -4. **Simple First**: Start with SQLite-only implementation, add complexity later as needed \ No newline at end of file diff --git a/plans/19-client-server-cli.md b/plans/19-client-server-cli.md deleted file mode 100644 index 7337445..0000000 --- a/plans/19-client-server-cli.md +++ /dev/null @@ -1,182 +0,0 @@ -# Client-Server CLI Architecture - -## Overview - -This plan transforms DataBuild's CLI from a monolithic in-process execution model to a Bazel-style client-server architecture. The CLI becomes a thin client that delegates all operations to a persistent service process, enabling better resource management and build coordination. - -## Current State Analysis - -The current CLI (`databuild/cli/main.rs`) directly: -- Creates event log connections -- Runs analysis and execution in-process -- Spawns bazel processes directly -- No coordination between concurrent CLI invocations - -This creates several limitations: -- No coordination between concurrent builds -- Multiple BEL connections from concurrent CLI calls -- Each CLI process spawns separate bazel execution -- No shared execution environment for builds - -## Target Architecture - -### Bazel-Style Client-Server Model - -**CLI (Thin Client)**: -- Auto-starts service if not running -- Delegates all operations to service via HTTP -- Streams progress back to user -- Auto-shuts down idle service - -**Service (Persistent Process)**: -- Maintains single BEL connection -- Coordinates builds across multiple CLI calls -- Manages bazel execution processes -- Auto-shuts down after idle timeout - -## Implementation Plan - -### Phase 1: Service Foundation - -1. **Extend Current Service for CLI Operations** - - Add new endpoints to handle CLI build requests - - Move analysis and execution logic from CLI to service - - Service maintains orchestrator state and coordinates builds - -2. **Add CLI-Specific API Endpoints** - - `/api/v1/cli/build` - Handle build requests from CLI - - `/api/v1/cli/builds/{id}/progress` - Stream build progress via Server-Sent Events - - Request/response types for CLI build operations - - Background vs foreground build support - -3. **Add Service Auto-Management** - - Service tracks last activity timestamp - - Configurable auto-shutdown timeout (default: 5 minutes) - - Service monitors for idle state and gracefully shuts down - - Activity tracking includes API calls and active builds - -4. **Service Port Management** - - Service attempts to bind to preferred port (e.g., 8080) - - If port unavailable, tries next available port in range - - Service writes actual port to lockfile/pidfile for CLI discovery - - CLI reads port from lockfile to connect to running service - - Cleanup lockfile on service shutdown - -### Phase 2: Thin CLI Implementation - -1. **New CLI Main Function** - - Replace existing main with service delegation logic - - Parse arguments and determine target service operation - - Handle service connection and auto-start logic - - Preserve existing CLI interface and help text - -2. **Service Client Implementation** - - HTTP client for communicating with service - - Auto-start service if not already running - - Health check and connection retry logic - - Progress streaming for real-time build feedback - -3. **Build Command via Service** - - Parse build arguments and create service request - - Submit build request to service endpoint - - Stream progress updates for foreground builds - - Return immediately for background builds with build ID - -### Phase 3: Repository Commands via Service - -1. **Delegate Repository Commands to Service** - - Partition, build, job, and task commands go through service - - Use existing service API endpoints where available - - Maintain same output formats (table, JSON) as current CLI - - Preserve all existing functionality and options - -2. **Service Client Repository Methods** - - Client methods for each repository operation - - Handle pagination, filtering, and formatting options - - Error handling and appropriate HTTP status code handling - - URL encoding for partition references and other parameters - -### Phase 4: Complete Migration - -1. **Remove Old CLI Implementation** - - Delete existing `databuild/cli/main.rs` implementation - - Remove in-process analysis and execution logic - - Clean up CLI-specific dependencies that are no longer needed - - Update build configuration to use new thin client only - -2. **Service Integration Testing** - - End-to-end testing of CLI-to-service communication - - Verify all existing CLI functionality works through service - - Performance testing to ensure no regression - - Error handling validation for various failure modes - -### Phase 5: Integration and Testing - -1. **Environment Variable Support** - - `DATABUILD_SERVICE_URL` for custom service locations - - `DATABUILD_SERVICE_TIMEOUT` for auto-shutdown configuration - - Existing BEL environment variables passed to service - - Clear precedence rules for configuration sources - -2. **Error Handling and User Experience** - - Service startup timeout and clear error messages - - Connection failure handling with fallback suggestions - - Health check logic to verify service readiness - - Graceful handling of service unavailability - -## Benefits of Client-Server Architecture - -### ✅ **Build Coordination** -- Multiple CLI calls share same service instance -- Coordination between concurrent builds -- Single BEL connection eliminates connection conflicts - -### ✅ **Resource Management** -- Auto-shutdown prevents resource leaks -- Service manages persistent connections -- Better isolation between CLI and build execution -- Shared bazel execution environment - -### ✅ **Improved User Experience** -- Background builds with `--background` flag -- Real-time progress streaming -- Consistent build execution environment - -### ✅ **Simplified Architecture** -- Single execution path through service -- Cleaner separation of concerns -- Reduced code duplication - -### ✅ **Future-Ready Foundation** -- Service architecture prepared for additional coordination features -- HTTP API foundation for programmatic access -- Clear separation of concerns between client and execution - -## Success Criteria - -### Phase 1-2: Service Foundation -- [ ] Service can handle CLI build requests -- [ ] Service auto-shutdown works correctly -- [ ] Service port management and discovery works -- [ ] New CLI can start and connect to service -- [ ] Build requests execute with same functionality as current CLI - -### Phase 3-4: Complete Migration -- [ ] All CLI commands work via service delegation -- [ ] Repository commands (partitions, builds, etc.) work via HTTP API -- [ ] Old CLI implementation completely removed -- [ ] Error handling provides clear user feedback - -### Phase 5: Polish -- [ ] Multiple concurrent CLI calls work correctly -- [ ] Background builds work as expected -- [ ] Performance meets or exceeds current CLI -- [ ] Service management is reliable and transparent - -## Risk Mitigation - -1. **Thorough Testing**: Comprehensive testing before removing old CLI -2. **Feature Parity**: Ensure all existing functionality works via service -3. **Performance Validation**: Benchmark new implementation against current performance -4. **Simple Protocol**: Use HTTP/JSON for service communication (not gRPC initially) -5. **Clear Error Messages**: Service startup and connection failures should be obvious to users \ No newline at end of file diff --git a/plans/20-wants-initial.md b/plans/20-wants-initial.md deleted file mode 100644 index e0faa44..0000000 --- a/plans/20-wants-initial.md +++ /dev/null @@ -1,163 +0,0 @@ -# Wants System Implementation - -## Overview - -This plan implements the wants system described in [design/wants.md](../design/wants.md), transitioning DataBuild from direct build requests to a declarative want-based model with cross-graph coordination and SLA tracking. This builds on the 3-tier BEL architecture and client-server CLI established in the previous phases. - -## Prerequisites - -This plan assumes completion of: -- **Phase 18**: 3-tier BEL architecture with storage/query/client layers -- **Phase 19**: Client-server CLI architecture with service delegation - -## Implementation Phases - -### Phase 1: Extend BEL Storage for Wants - -1. **Add PartitionWantEvent to databuild.proto** - - Want event schema as defined in design/wants.md - - Want source tracking (CLI, dashboard, scheduled, API) - - TTL and SLA timestamp fields - - External dependency specifications - -2. **Extend BELStorage Interface** - - Add `append_want()` method for want events - - Extend `EventFilter` to support want filtering - - Add want-specific query capabilities to storage layer - -3. **Implement in SQLite Storage Backend** - - Add wants table with appropriate indexes - - Implement want filtering in list_events() - - Schema migration logic for existing databases - -### Phase 2: Basic Want API in Service - -1. **Implement Want Management in Service** - - Service methods for creating and querying wants - - Want lifecycle management (creation, expiration, satisfaction) - - Integration with existing service auto-management - -2. **Add Want HTTP Endpoints** - - `POST /api/v1/wants` - Create new want - - `GET /api/v1/wants` - List active wants with filtering - - `GET /api/v1/wants/{id}` - Get want details - - `DELETE /api/v1/wants/{id}` - Cancel want - -3. **CLI Want Commands** - - `./bazel-bin/my_graph.build want create ` with SLA/TTL options - - `./bazel-bin/my_graph.build want list` with filtering options - - `./bazel-bin/my_graph.build want status ` for want status - - Modify build commands to create wants via service - -### Phase 3: Want-Driven Build Evaluation - -1. **Implement Build Evaluator in Service** - - Continuous evaluation loop that checks for buildable wants - - External dependency satisfaction checking - - TTL expiration filtering for active wants - -2. **Replace Build Request Handling** - - Graph build commands create wants instead of direct build requests - - Service background loop evaluates wants and triggers builds - - Maintain atomic build semantics while satisfying multiple wants - -3. **Build Coordination Logic** - - Aggregate wants that can be satisfied by same build - - Priority handling for urgent wants (short SLA) - - Resource coordination across concurrent want evaluation - -### Phase 4: Cross-Graph Coordination - -1. **Implement GraphService API** - - HTTP API for cross-graph event streaming as defined in design/wants.md - - Event filtering for efficient partition pattern subscriptions - - Service-to-service communication for upstream dependencies - -2. **Upstream Dependency Configuration** - - Service configuration for upstream DataBuild instances - - Partition pattern subscriptions to upstream graphs - - Automatic want evaluation when upstream partitions become available - -3. **Cross-Graph Event Sync** - - Background sync process for upstream events - - Triggering local build evaluation on upstream availability - - Reliable HTTP-based coordination to avoid message loss - -### Phase 5: SLA Monitoring and Dashboard Integration - -1. **SLA Violation Tracking** - - External monitoring endpoints for SLA violations - - Want timeline and status tracking - - Integration with existing dashboard for want visualization - -2. **Want Dashboard Features** - - Want creation and monitoring UI - - Cross-graph dependency visualization - - SLA violation dashboard and alerting - -3. **Migration from Direct Builds** - - All build requests go through want system - - Remove direct build request pathways - - Update documentation for new build model - -## Benefits of Want-Based Architecture - -### ✅ **Unified Build Model** -- All builds (manual, scheduled, triggered) use same want mechanism -- Complete audit trail in build event log -- Consistent SLA tracking across all build types - -### ✅ **Event-Driven Efficiency** -- Builds only triggered when dependencies change -- Cross-graph coordination via efficient event streaming -- No polling for task readiness within builds - -### ✅ **Atomic Build Semantics Preserved** -- Individual build requests remain all-or-nothing -- Fast failure provides immediate feedback -- Partial progress via multiple build requests over time - -### ✅ **Flexible SLA Management** -- Separate business expectations (SLA) from operational limits (TTL) -- External monitoring with clear blame assignment -- Automatic cleanup of stale wants - -### ✅ **Cross-Graph Scalability** -- Reliable HTTP-based coordination -- Efficient filtering via partition patterns -- Decentralized architecture with clear boundaries - -## Success Criteria - -### Phase 1: Storage Foundation -- [ ] Want events can be stored and queried in BEL storage -- [ ] EventFilter supports want-specific filtering -- [ ] SQLite backend handles want operations efficiently - -### Phase 2: Basic Want API -- [ ] Service can create and query wants via HTTP API -- [ ] Graph build commands work for want management -- [ ] Build commands create wants instead of direct builds - -### Phase 3: Want-Driven Builds -- [ ] Service background loop evaluates wants continuously -- [ ] Build evaluation triggers on want creation and external events -- [ ] TTL expiration and external dependency checking work correctly - -### Phase 4: Cross-Graph Coordination -- [ ] GraphService API returns filtered events for cross-graph coordination -- [ ] Upstream partition availability triggers downstream want evaluation -- [ ] Service-to-service communication is reliable and efficient - -### Phase 5: Complete Migration -- [ ] All builds go through want system -- [ ] Dashboard supports want creation and monitoring -- [ ] SLA violation endpoints provide monitoring integration -- [ ] Documentation reflects new want-based build model - -## Risk Mitigation - -1. **Incremental Migration**: Implement wants alongside existing build system initially -2. **Performance Validation**: Ensure want evaluation doesn't introduce significant latency -3. **Backwards Compatibility**: Maintain existing build semantics during transition -4. **Monitoring Integration**: Provide clear observability into want lifecycle and performance \ No newline at end of file diff --git a/plans/ideas.md b/plans/ideas.md deleted file mode 100644 index bafc7b7..0000000 --- a/plans/ideas.md +++ /dev/null @@ -1 +0,0 @@ -- Generate a helm chart / kuztomize thing / docker compose spec? \ No newline at end of file diff --git a/plans/todo.md b/plans/todo.md deleted file mode 100644 index 34f87a3..0000000 --- a/plans/todo.md +++ /dev/null @@ -1,15 +0,0 @@ - -- Implement python dsl -- Achieve fast configuration (betterproto2 imports are sus) -- Remove manual reference of enum values, e.g. [here](../databuild/repositories/builds/mod.rs:85) -- On build request detail page, show aggregated job results -- How do we encode job labels in the path? (Build event job links are not encoding job labels properly) -- Resolve double type system with protobuf and openapi -- Plan for external worker dispatch (e.g. k8s pod per build, or launch in container service) - - k8s can use [jobs](https://kubernetes.io/docs/concepts/workloads/controllers/job/) -- Should we have meaningful exit codes? E.g. "retry-able error", etc? -- Fully joinable build/job IDs - ensure all execution logs / metrics are joinable to build request ID? -- Triggers? -- Add build request notes -- Status indicator for page selection -- Use path based navigation instead of hashbang? diff --git a/plans/webapp_v1/chunk-1-client-generation.md b/plans/webapp_v1/chunk-1-client-generation.md deleted file mode 100644 index e7a0e4d..0000000 --- a/plans/webapp_v1/chunk-1-client-generation.md +++ /dev/null @@ -1,160 +0,0 @@ -# Chunk 1: TypeScript Client Generation - -**Parent:** [Build Graph Dashboard](../build-graph-dashboard.md) -**Next:** [Chunk 2: Hello World App](./chunk-2-hello-world-app.md) - -## Overview - -Generate TypeScript client code from the DataBuild Build Graph Service OpenAPI specification to provide type-safe API access for the dashboard. - -## Scope - -### In Scope -- Generate OpenAPI specification from Build Graph Service using aide -- Create typed TypeScript client from OpenAPI spec using OpenAPI Generator -- Set up Bazel rules for OpenAPI-to-TypeScript generation -- Implement path parameter structs for proper aide OpenAPI generation -- CLI-based OpenAPI spec extraction - -### Out of Scope -- HTTP-based spec extraction (use CLI flag instead) -- External Bazel modules (use OpenAPI Generator JAR directly) -- Custom operation IDs (use auto-generated method names) -- Client-side validation (rely on server-side validation) - -## Technical Approach - -### OpenAPI-Based Generation -Generate TypeScript client from OpenAPI specification automatically derived from Build Graph Service: - -Key API endpoints to generate TypeScript client for: -- `POST /api/v1/builds` - Submit build request -- `GET /api/v1/builds/:id` - Get build status -- `DELETE /api/v1/builds/:id` - Cancel build request -- `GET /api/v1/partitions/:ref/status` - Get partition status -- `GET /api/v1/partitions/:ref/events` - Get partition events -- `POST /api/v1/analyze` - Analyze build graph - -### Generated Client Structure -```typescript -// Generated types from OpenAPI spec (auto-generated models) -export * from './AnalyzeRequest'; -export * from './BuildRequest'; -export * from './BuildStatusResponse'; -export * from './PartitionStatusResponse'; -// ... all other model types - -// Generated client API with auto-generated method names -export class DefaultApi { - constructor(configuration?: Configuration); - - // Auto-generated method names from OpenAPI paths - async apiV1BuildsPost(requestParameters: ApiV1BuildsPostRequest): Promise; - async apiV1BuildsIdGet(requestParameters: ApiV1BuildsIdGetRequest): Promise; - async apiV1BuildsIdDelete(requestParameters: ApiV1BuildsIdDeleteRequest): Promise; - async apiV1PartitionsRefStatusGet(requestParameters: ApiV1PartitionsRefStatusGetRequest): Promise; - async apiV1PartitionsRefEventsGet(requestParameters: ApiV1PartitionsRefEventsGetRequest): Promise; - async apiV1AnalyzePost(requestParameters: ApiV1AnalyzePostRequest): Promise; -} - -// Type-safe request parameter interfaces -export interface ApiV1BuildsPostRequest { - buildRequest: BuildRequest; -} - -export interface ApiV1BuildsIdGetRequest { - id: string; -} -``` - -### Bazel Integration -- Create `//databuild/client:typescript` target -- Extract OpenAPI spec using CLI flag `--print-openapi-spec` -- Use OpenAPI Generator JAR to create TypeScript client -- Ensure hermetic build process with JAR download -- Output client code to `databuild/client/typescript_generated/` - -### Path Parameter Requirements -For aide to properly generate OpenAPI path parameters, all path parameters must be wrapped in structs: - -```rust -#[derive(Deserialize, JsonSchema)] -pub struct BuildStatusRequest { - pub id: String, -} - -pub async fn get_build_status( - Path(request): Path -) -> Result, ...> { - // Use request.id instead of direct string -} -``` - -## Implementation Strategy - -1. **✅ Add OpenAPI Generation to Service** - - ✅ Add `aide` dependency for OpenAPI spec generation - - ✅ Modify Axum service to use `aide::ApiRouter` - - ✅ Add JsonSchema derives to request/response types - - ✅ Add path parameter structs for proper aide integration - - ✅ Create CLI flag `--print-openapi-spec` for spec generation - -2. **✅ OpenAPI Spec Extraction** - - ✅ Create Bazel rule using CLI flag instead of HTTP endpoint - - ✅ Extract spec using `build_graph_service --print-openapi-spec` - - ✅ Save spec as build artifact (JSON) - -3. **✅ TypeScript Client Generation** - - ✅ Use OpenAPI Generator JAR directly (not Bazel module) - - ✅ Create Bazel rule to generate TypeScript client from spec - - ✅ Use `typescript-fetch` generator for modern fetch-based client - - ✅ Configure with camelCase naming and single request parameters - -4. **✅ Bazel Rules** - - ✅ Create `//databuild/client:typescript_client` target - - ✅ Generate client code during build with proper file copying - - ✅ Ensure hermetic build with JAR download - -## Deliverables - -- [x] OpenAPI spec generation from Build Graph Service via CLI flag -- [x] TypeScript interfaces for all API request/response types (auto-generated) -- [x] Typed HTTP client for Build Graph Service endpoints with fetch API -- [x] Bazel rules for automated client generation using OpenAPI Generator JAR -- [x] Path parameter struct implementation for aide compatibility - -## Success Criteria - -- ✅ OpenAPI spec accurately reflects all service endpoints with proper path parameters -- ✅ Generated TypeScript compiles without errors using `typescript-fetch` generator -- ✅ Client provides type safety for all API endpoints with auto-generated method names -- ✅ Bazel build integrates seamlessly and generates client automatically -- ✅ Path parameter structs enable proper aide OpenAPI generation -- ✅ Ready for use in Chunk 2 (Hello World App) - -## Testing - -- ✅ Verify OpenAPI spec generation from service using CLI flag -- ✅ Verify generated TypeScript compiles without errors -- ✅ Validate OpenAPI spec passes OpenAPI Generator validation -- ✅ Ensure build process is hermetic and reproducible with JAR download -- ✅ Verify all API endpoints generate proper TypeScript method signatures -- ✅ Confirm path parameters are properly typed in generated client - -## Implementation Notes - -### Lessons Learned - -1. **aide Path Parameter Requirements**: aide requires path parameters to be wrapped in structs with `JsonSchema` derives, not simple `Path` extractors. - -2. **OpenAPI Generator Bazel Integration**: The official Bazel module was not available in registry, so we used the JAR directly via genrule for better reliability. - -3. **CLI vs HTTP Extraction**: Using a CLI flag for spec extraction is simpler and more reliable than starting an HTTP server. - -4. **Auto-generated Method Names**: Without custom operationIds, OpenAPI Generator creates method names like `apiV1BuildsPost`. This can be improved in future iterations. - -### Build Targets - -- `//databuild/client:extract_openapi_spec` - Extracts OpenAPI spec JSON -- `//databuild/client:typescript_client` - Generates TypeScript client -- `//databuild/client:typescript` - Main target for consuming the client \ No newline at end of file diff --git a/plans/webapp_v1/chunk-2-hello-world-app.md b/plans/webapp_v1/chunk-2-hello-world-app.md deleted file mode 100644 index 2ac5024..0000000 --- a/plans/webapp_v1/chunk-2-hello-world-app.md +++ /dev/null @@ -1,110 +0,0 @@ -# Chunk 2: Hello World App - -**Parent:** [Build Graph Dashboard](../build-graph-dashboard.md) -**Previous:** [Chunk 1: TypeScript Client Generation](./chunk-1-client-generation.md) -**Next:** [Chunk 3: Routing Framework](./chunk-3-routing-framework.md) - -## Overview - -Create a minimal "Hello World" single-page application using TypeScript and Mithril, fully integrated with the Bazel build system and served by the Build Graph Service. - -## Scope - -### In Scope -- Basic TypeScript + Mithril SPA setup -- Bazel BUILD rules for TypeScript compilation -- Integration with Build Graph Service (served by same process) -- Hermetic development workflow -- Basic bundling and minification - -### Out of Scope -- Complex routing (handled in Chunk 3) -- Styling framework (basic CSS only) -- Real API integration (use generated client from Chunk 1 for basic connectivity test) - -## Technical Approach - -### Application Structure -``` -databuild/ -├── dashboard/ -│ ├── BUILD -│ ├── src/ -│ │ ├── main.ts # Application entry point -│ │ ├── components/ -│ │ │ └── HelloWorld.ts # Basic component -│ │ └── api/ -│ │ └── client.ts # Generated client import -│ ├── static/ -│ │ └── index.html # HTML template -│ └── tsconfig.json # TypeScript configuration -``` - -### Bazel Integration -- Create `//databuild/dashboard:app` target -- Use `rules_nodejs` for TypeScript compilation -- Bundle with webpack or similar tool -- Serve static files from Build Graph Service - -### Service Integration -- Update Build Graph Service to serve static files -- Add route for `/*` to serve dashboard -- Ensure API routes take precedence over static files -- Dashboard should self-configure service URL - -### Development Workflow -```bash -# Build dashboard -bazel build //databuild/dashboard:app - -# Development mode (with file watching) -bazel run //databuild/dashboard:dev - -# Type checking -bazel run //databuild/dashboard:typecheck -``` - -## Implementation Strategy - -1. **Set Up Bazel Rules** - - Configure `rules_nodejs` for TypeScript - - Create build targets for development and production - - Set up bundling pipeline - -2. **Create Basic App** - - Simple Mithril application with single component - - Import and test generated TypeScript client - - Basic connectivity test to service API - -3. **Service Integration** - - Update Build Graph Service to serve static files - - Add dashboard route configuration - - Ensure proper content-type headers - -4. **Development Tooling** - - File watching for development - - TypeScript compilation - - Error reporting - -## Deliverables - -- [ ] Working TypeScript + Mithril "Hello World" app -- [ ] Bazel BUILD rules for compilation and bundling -- [ ] Integration with Build Graph Service -- [ ] Development workflow scripts -- [ ] Basic connectivity test with generated client - -## Success Criteria - -- App compiles and runs without errors -- Served by Build Graph Service on same port as API -- TypeScript client successfully connects to service -- Development workflow supports rapid iteration -- Hermetic build process - -## Testing - -- Build app with `bazel build //databuild/dashboard:app` -- Start service and verify dashboard loads -- Test API connectivity from dashboard -- Verify TypeScript compilation and type checking work \ No newline at end of file diff --git a/plans/webapp_v1/chunk-3-routing-framework.md b/plans/webapp_v1/chunk-3-routing-framework.md deleted file mode 100644 index 0f4b931..0000000 --- a/plans/webapp_v1/chunk-3-routing-framework.md +++ /dev/null @@ -1,130 +0,0 @@ -# Chunk 3: Routing Framework - -**Parent:** [Build Graph Dashboard](../build-graph-dashboard.md) -**Previous:** [Chunk 2: Hello World App](./chunk-2-hello-world-app.md) -**Next:** [Chunk 4: Recent Activity](./chunk-4-recent-activity.md) - -## Overview - -Implement multi-page routing using Mithril's routing system, create the base layout with navigation, and handle URL encoding/decoding for partition references. - -## Scope - -### In Scope -- Mithril routing for all planned dashboard pages -- Base layout with navigation header -- URL encoding/decoding for partition references -- Initial page scaffolding for all routes -- Basic Tailwind + DaisyUI styling setup - -### Out of Scope -- Full page implementations (handled in subsequent chunks) -- Complex state management -- Advanced styling (minimal styling only) - -## Technical Approach - -### Routing Structure -```typescript -const routes = { - '/': RecentActivity, - '/builds/:id': BuildStatus, - '/partitions': PartitionsList, - '/partitions/:base64_ref': PartitionStatus, - '/jobs': JobsList, - '/jobs/:label': JobMetrics, - '/analyze': GraphAnalysis, -}; -``` - -### URL Encoding Utilities -```typescript -// Partition reference URL encoding -function encodePartitionRef(ref: string): string { - return btoa(ref).replace(/\+/g, '-').replace(/\//g, '_').replace(/=/g, ''); -} - -function decodePartitionRef(encoded: string): string { - // Add padding if needed - const padded = encoded.replace(/-/g, '+').replace(/_/g, '/'); - return atob(padded); -} -``` - -### Base Layout -```typescript -const Layout = { - view: (vnode: any) => [ - m('header.navbar', [ - m('nav', [ - m('a[href="/"]', 'Dashboard'), - m('a[href="/partitions"]', 'Partitions'), - m('a[href="/jobs"]', 'Jobs'), - m('a[href="/analyze"]', 'Analyze'), - ]) - ]), - m('main', vnode.children) - ] -}; -``` - -### Page Scaffolding -Create placeholder components for each route: -- `RecentActivity` - Dashboard home -- `BuildStatus` - Build request status -- `PartitionsList` - Partition listing -- `PartitionStatus` - Individual partition status -- `JobsList` - Jobs listing -- `JobMetrics` - Job metrics and history -- `GraphAnalysis` - Graph analysis tool - -## Implementation Strategy - -1. **Set Up Routing** - - Configure Mithril routing - - Create route definitions - - Implement navigation handlers - -2. **Create Base Layout** - - Navigation header with links - - Main content area - - Basic responsive design - -3. **Implement URL Encoding** - - Partition reference encoding/decoding - - URL parameter handling - - Error handling for invalid refs - -4. **Add Tailwind + DaisyUI** - - Configure build system for CSS processing - - Add basic styling to layout - - Set up design tokens - -5. **Create Page Scaffolds** - - Placeholder components for each route - - Basic page structure - - Navigation between pages - -## Deliverables - -- [ ] Working multi-page routing system -- [ ] Base layout with navigation -- [ ] URL encoding/decoding for partition refs -- [ ] Scaffold pages for all planned routes -- [ ] Basic Tailwind + DaisyUI styling setup - -## Success Criteria - -- All routes load without errors -- Navigation between pages works correctly -- Partition reference encoding/decoding handles edge cases -- Layout is responsive and functional -- Ready for page implementations in subsequent chunks - -## Testing - -- Navigate to all routes and verify they load -- Test partition reference encoding/decoding with various inputs -- Verify browser back/forward navigation works -- Test responsive layout on different screen sizes -- Validate URL parameter handling \ No newline at end of file diff --git a/plans/webapp_v1/chunk-4-recent-activity.md b/plans/webapp_v1/chunk-4-recent-activity.md deleted file mode 100644 index 716ce82..0000000 --- a/plans/webapp_v1/chunk-4-recent-activity.md +++ /dev/null @@ -1,148 +0,0 @@ -# Chunk 4: Recent Activity - -**Parent:** [Build Graph Dashboard](../build-graph-dashboard.md) -**Previous:** [Chunk 3: Routing Framework](./chunk-3-routing-framework.md) -**Next:** [Chunk 5: Build Status](./chunk-5-build-status.md) - -## Overview - -Implement the dashboard home page showing recent build activity and system status with real-time updates via polling. - -## Scope - -### In Scope -- Recent build requests display -- Active builds count and status -- Recent partition builds -- System health indicators -- Basic polling for real-time updates -- Tailwind + DaisyUI styling - -### Out of Scope -- Complex filtering or searching -- Historical data beyond recent activity -- Advanced visualizations -- User preferences/settings - -## Technical Approach - -### Data Sources -From Build Graph Service API: -- Recent build requests (from event log) -- Active builds count -- Recent partition builds -- System status - -### Component Structure -```typescript -const RecentActivity = { - oninit: () => { - // Start polling for updates - this.pollInterval = setInterval(this.loadData, 5000); - this.loadData(); - }, - - onremove: () => { - // Clean up polling - clearInterval(this.pollInterval); - }, - - view: () => [ - m('.dashboard-header', [ - m('h1', 'DataBuild Dashboard'), - m('.stats', [ - m('.stat-item', `Active Builds: ${this.activeBuilds}`), - m('.stat-item', `Recent Builds: ${this.recentBuilds.length}`), - ]) - ]), - - m('.dashboard-content', [ - m('.recent-builds', [ - m('h2', 'Recent Build Requests'), - m('table', this.recentBuilds.map(build => - m('tr', [ - m('td', m('a', { href: `/builds/${build.id}` }, build.id)), - m('td', build.status), - m('td', formatTime(build.created_at)), - ]) - )) - ]), - - m('.recent-partitions', [ - m('h2', 'Recent Partition Builds'), - m('table', this.recentPartitions.map(partition => - m('tr', [ - m('td', m('a', { - href: `/partitions/${encodePartitionRef(partition.ref)}` - }, partition.ref)), - m('td', partition.status), - m('td', formatTime(partition.updated_at)), - ]) - )) - ]) - ]) - ] -}; -``` - -### Polling Strategy -- Poll every 5 seconds for updates -- Use Page Visibility API to pause when tab inactive -- Show loading states during updates -- Handle connection errors gracefully - -### Styling -- Use DaisyUI stat components for metrics -- Table layout for recent items -- Responsive grid for dashboard sections -- Status badges for build states - -## Implementation Strategy - -1. **Create Data Layer** - - API calls for recent activity data - - Polling manager with visibility detection - - Error handling and retries - -2. **Build UI Components** - - Dashboard header with metrics - - Recent builds table - - Recent partitions table - - Loading and error states - -3. **Implement Real-time Updates** - - Set up polling with proper cleanup - - Page Visibility API integration - - Optimistic updates for better UX - -4. **Add Styling** - - DaisyUI components for consistent look - - Responsive layout - - Status indicators and badges - -## Deliverables - -- [ ] Dashboard home page with recent activity -- [ ] Real-time polling with visibility detection -- [ ] Recent build requests table with links -- [ ] Recent partition builds display -- [ ] System metrics and health indicators -- [ ] Responsive styling with DaisyUI - -## Success Criteria - -- Page loads and displays recent activity -- Real-time updates work correctly -- Links navigate to appropriate detail pages -- Polling stops when tab is inactive -- Layout is responsive and well-styled -- Error states are handled gracefully - -## Testing - -- Verify recent builds and partitions display -- Test real-time updates with running builds -- Validate links to build and partition detail pages -- Check polling behavior with tab visibility changes -- Test error handling with service unavailable -- Verify responsive layout on different screen sizes \ No newline at end of file diff --git a/plans/webapp_v1/chunk-5-build-status.md b/plans/webapp_v1/chunk-5-build-status.md deleted file mode 100644 index 3aac839..0000000 --- a/plans/webapp_v1/chunk-5-build-status.md +++ /dev/null @@ -1,237 +0,0 @@ -# Chunk 5: Build Status - -**Parent:** [Build Graph Dashboard](../build-graph-dashboard.md) -**Previous:** [Chunk 4: Recent Activity](./chunk-4-recent-activity.md) -**Next:** [Chunk 6: Partition Pages](./chunk-6-partition-pages.md) - -## Overview - -Implement the core operational build request status page with real-time updates, job/partition status visualization, and execution logs. - -## Scope - -### In Scope -- Build request status display with real-time updates -- Job and partition status visualization -- Execution timeline and logs from build events -- Delegation indicators for shared builds -- Auto-refresh with Page Visibility API -- Expandable job details - -### Out of Scope -- Complex graph visualizations (simple status indicators) -- Historical build comparisons -- Advanced filtering of events -- User interactions beyond viewing - -## Technical Approach - -### Data Sources -From Build Graph Service API: -- `/api/v1/builds/:id` - Individual build request details and events -- `/api/v1/partitions/:ref/status` - Individual partition status -- `/api/v1/partitions/:ref/events` - Partition-specific events - -Available TypeScript types from generated client: -- `BuildStatusResponse` - Build request metadata, status, events -- `PartitionStatusResponse` - Individual partition status -- `PartitionEventsResponse` - Partition build events -- `BuildRequestStatus` enum - Status values (Received, Planning, Executing, Completed, Failed, Cancelled) -- `PartitionStatus` enum - Status values (Requested, Scheduled, Building, Available, Failed, Delegated) - -**Note**: Timestamps are in nanoseconds and require conversion via `/1000000` for JavaScript Date objects. - -### Component Structure -```typescript -import { DefaultApi, Configuration, BuildStatusResponse, PartitionStatusResponse } from '../client/typescript_generated/src/index'; -import { pollingManager, formatTime } from './services'; - -const BuildStatus = { - data: null as BuildStatusResponse | null, - loading: true, - error: null as string | null, - partitionStatuses: new Map(), - logsExpanded: {} as Record, - - oninit: (vnode) => { - this.buildId = vnode.attrs.id; - this.loadBuild(); - this.startPolling(); - }, - - onremove: () => { - pollingManager.stopPolling(`build-status-${this.buildId}`); - }, - - async loadBuild() { - try { - this.loading = true; - this.error = null; - m.redraw(); - - const apiClient = new DefaultApi(new Configuration({ basePath: '' })); - - // Get build status - const buildResponse = await apiClient.apiV1BuildsBuildRequestIdGet({ buildRequestId: this.buildId }); - this.data = buildResponse; - - // Load partition statuses for all requested partitions - if (buildResponse.requestedPartitions) { - for (const partition of buildResponse.requestedPartitions) { - try { - const partitionStatus = await apiClient.apiV1PartitionsRefStatusGet({ - ref: partition.str - }); - this.partitionStatuses.set(partition.str, partitionStatus); - } catch (e) { - console.warn(`Failed to load status for partition ${partition.str}:`, e); - } - } - } - - this.loading = false; - m.redraw(); - } catch (error) { - console.error('Failed to load build:', error); - this.error = error instanceof Error ? error.message : 'Failed to load build'; - this.loading = false; - m.redraw(); - } - }, - - startPolling() { - // Use different poll intervals based on build status - const isActive = this.data?.status === 'BuildRequestExecuting' || - this.data?.status === 'BuildRequestPlanning'; - const interval = isActive ? 2000 : 10000; // 2s for active, 10s for completed - - pollingManager.startPolling(`build-status-${this.buildId}`, () => { - this.loadBuild(); - }, interval); - }, - - view: () => [ - // Loading/error states similar to RecentActivity component - this.loading && !this.data ? m('.loading-state', '...') : null, - this.error ? m('.error-state', this.error) : null, - - this.data ? [ - m('.build-header', [ - m('h1', `Build ${this.buildId}`), - m('.build-meta', [ - m(`span.badge.${this.getStatusClass(this.data.status)}`, this.data.status), - m('.timestamp', formatTime(new Date(this.data.createdAt / 1000000).toISOString())), - m('.partitions', `${this.data.requestedPartitions?.length || 0} partitions`), - ]) - ]), - - m('.build-content', [ - m('.partition-status', [ - m('h2', 'Partition Status'), - m('.partition-grid', - this.data.requestedPartitions?.map(partition => { - const status = this.partitionStatuses.get(partition.str); - return m('.partition-card', [ - m('.partition-ref', partition.str), - m(`span.badge.${this.getPartitionStatusClass(status?.status)}`, - status?.status || 'Unknown'), - status?.updatedAt ? - m('.updated-time', formatTime(new Date(status.updatedAt / 1000000).toISOString())) : null - ]); - }) || [] - ) - ]), - - m('.execution-timeline', [ - m('h2', 'Execution Timeline'), - m('.timeline', this.data.events?.map(event => - m('.timeline-item', [ - m('.timestamp', formatTime(new Date(event.timestamp / 1000000).toISOString())), - m('.event-type', event.eventType), - m('.message', event.message || ''), - // Add expandable logs for job events - this.isJobEvent(event) ? m('.expandable-logs', [ - m('button.btn.btn-sm', { - onclick: () => this.toggleLogs(event.eventId) - }, this.logsExpanded[event.eventId] ? 'Hide Logs' : 'Show Logs'), - this.logsExpanded[event.eventId] ? - m('.logs', this.formatJobLogs(event)) : null - ]) : null - ]) - ) || []) - ]) - ]) - ] : null - ] -}; -``` - -### Real-time Updates -- Use existing `pollingManager` from `services.ts` with Page Visibility API -- Poll every 2 seconds when build status is `BuildRequestExecuting` or `BuildRequestPlanning` -- Poll every 10 seconds when build is `BuildRequestCompleted`, `BuildRequestFailed`, or `BuildRequestCancelled` -- Automatic polling pause when tab is not visible -- Explicit `m.redraw()` calls after async data loading - -### Status Visualization -- Color-coded status badges using DaisyUI classes: - - `badge-success` for `BuildRequestCompleted` / `PartitionAvailable` - - `badge-warning` for `BuildRequestExecuting` / `PartitionBuilding` - - `badge-error` for `BuildRequestFailed` / `PartitionFailed` - - `badge-neutral` for other states -- Timeline visualization for build events with timestamp formatting -- Delegation indicators for `PartitionDelegated` status with build request links - -## Implementation Strategy - -1. **Extend Existing Infrastructure** - - Use established `DefaultApi` client pattern from `services.ts` - - Leverage existing `pollingManager` with Page Visibility API - - Follow `RecentActivity` component patterns for loading/error states - - Import generated TypeScript types from client - -2. **Build Status Components** - - Build header with metadata using `BuildStatusResponse` - - Partition status grid using `PartitionStatusResponse` for each partition - - Execution timeline parsing `events` array from build response - - Expandable log sections for job events with state management - -3. **Real-time Updates** - - Intelligent polling based on `BuildRequestStatus` enum values - - Reuse existing `pollingManager` infrastructure - - Loading states and error handling following established patterns - - Proper `m.redraw()` calls after async operations - -4. **Status Visualization** - - Status badge classes using established DaisyUI patterns - - Timeline layout similar to existing dashboard components - - Partition delegation links using build request IDs - - Timestamp formatting using existing `formatTime` utility - -## Deliverables - -- [ ] Build request status page with real-time updates -- [ ] Partition status grid with visual indicators -- [ ] Execution timeline with build events -- [ ] Expandable job logs and details -- [ ] Auto-refresh with visibility detection -- [ ] Delegation indicators and links - -## Success Criteria - -- Real-time updates show build progress accurately -- All partition statuses are clearly displayed -- Job logs are accessible and readable -- Polling behaves correctly based on build state -- Delegation to other builds is clearly indicated -- Page is responsive and performs well - -## Testing - -- Test with running builds to verify real-time updates -- Verify partition status changes are reflected -- Test job log expansion and readability -- Validate polling behavior with tab visibility -- Test with delegated builds -- Verify error handling with invalid build IDs -- Check performance with large build requests \ No newline at end of file diff --git a/plans/webapp_v1/chunk-6-partition-pages.md b/plans/webapp_v1/chunk-6-partition-pages.md deleted file mode 100644 index 3566d5e..0000000 --- a/plans/webapp_v1/chunk-6-partition-pages.md +++ /dev/null @@ -1,224 +0,0 @@ -# Chunk 6: Partition Pages - -**Parent:** [Build Graph Dashboard](../build-graph-dashboard.md) -**Previous:** [Chunk 5: Build Status](./chunk-5-build-status.md) -**Next:** [Chunk 7: Jobs Pages](./chunk-7-jobs-pages.md) - -## Overview - -Implement partition listing and individual partition status pages with build history, "Build Now" functionality, and related partition discovery. - -## Scope - -### In Scope -- Partition listing page with search functionality -- Individual partition status pages -- Build history for each partition -- "Build Now" button with force rebuild option -- Related partitions (upstream/downstream dependencies) -- Partition reference URL handling with base64 encoding - -### Out of Scope -- Complex dependency graph visualization -- Partition metadata beyond build history -- Advanced filtering beyond basic search -- Batch operations on multiple partitions - -## Technical Approach - -### Data Sources -From Build Graph Service API: -- `/api/v1/partitions/:ref/status` - Partition status and metadata -- `/api/v1/partitions/:ref/events` - Build events for partition -- `/api/v1/builds` - Submit new build requests -- Recent partitions from build event log - -### URL Encoding -```typescript -// Handle partition references in URLs -function encodePartitionRef(ref: string): string { - return btoa(ref).replace(/\+/g, '-').replace(/\//g, '_').replace(/=/g, ''); -} - -function decodePartitionRef(encoded: string): string { - // Add padding if needed - const padded = encoded.replace(/-/g, '+').replace(/_/g, '/'); - return atob(padded); -} -``` - -### Component Structure -```typescript -const PartitionsList = { - oninit: () => { - this.partitions = []; - this.searchTerm = ''; - this.loadPartitions(); - }, - - view: () => [ - m('.partitions-header', [ - m('h1', 'Partitions'), - m('input', { - placeholder: 'Search partitions...', - oninput: (e) => this.searchTerm = e.target.value, - }) - ]), - - m('.partitions-table', [ - m('table.table', [ - m('thead', [ - m('tr', [ - m('th', 'Partition Reference'), - m('th', 'Status'), - m('th', 'Last Updated'), - m('th', 'Actions'), - ]) - ]), - m('tbody', this.filteredPartitions().map(partition => - m('tr', [ - m('td', m('a', { - href: `/partitions/${encodePartitionRef(partition.ref)}` - }, partition.ref)), - m('td', m('.badge', partition.status)), - m('td', formatTime(partition.last_updated)), - m('td', [ - m('button.btn.btn-sm', { - onclick: () => this.buildPartition(partition.ref) - }, 'Build') - ]) - ]) - )) - ]) - ]) - ] -}; - -const PartitionStatus = { - oninit: (vnode) => { - this.partitionRef = decodePartitionRef(vnode.attrs.base64_ref); - this.partition = null; - this.buildHistory = []; - this.relatedPartitions = []; - this.loadPartition(); - }, - - view: () => [ - m('.partition-header', [ - m('h1', this.partitionRef), - m('.partition-meta', [ - m('.badge', this.partition?.status), - m('.timestamp', formatTime(this.partition?.last_updated)), - ]), - m('.partition-actions', [ - m('button.btn.btn-primary', { - onclick: () => this.buildPartition(false) - }, 'Build Now'), - m('button.btn.btn-secondary', { - onclick: () => this.buildPartition(true) - }, 'Force Rebuild'), - ]) - ]), - - m('.partition-content', [ - m('.build-history', [ - m('h2', 'Build History'), - m('table.table', [ - m('thead', [ - m('tr', [ - m('th', 'Build Request'), - m('th', 'Status'), - m('th', 'Started'), - m('th', 'Completed'), - ]) - ]), - m('tbody', this.buildHistory.map(build => - m('tr', [ - m('td', m('a', { href: `/builds/${build.id}` }, build.id)), - m('td', m('.badge', build.status)), - m('td', formatTime(build.started_at)), - m('td', formatTime(build.completed_at)), - ]) - )) - ]) - ]), - - m('.related-partitions', [ - m('h2', 'Related Partitions'), - m('.partition-deps', [ - m('h3', 'Dependencies'), - m('ul', this.relatedPartitions.dependencies?.map(dep => - m('li', m('a', { - href: `/partitions/${encodePartitionRef(dep)}` - }, dep)) - )) - ]), - m('.partition-dependents', [ - m('h3', 'Dependents'), - m('ul', this.relatedPartitions.dependents?.map(dep => - m('li', m('a', { - href: `/partitions/${encodePartitionRef(dep)}` - }, dep)) - )) - ]) - ]) - ]) - ] -}; -``` - -### Build Now Functionality -- Submit build request for specific partition -- Handle force rebuild option -- Redirect to build status page -- Show loading states during submission - -## Implementation Strategy - -1. **Create Data Layer** - - API integration for partition data - - Search and filtering logic - - Build request submission - -2. **Build List Page** - - Partition table with search - - Status indicators - - Quick build actions - -3. **Individual Partition Pages** - - Partition status display - - Build history table - - Related partitions discovery - -4. **Build Actions** - - "Build Now" functionality - - Force rebuild option - - Error handling and feedback - -## Deliverables - -- [ ] Partition listing page with search -- [ ] Individual partition status pages -- [ ] Build history display -- [ ] "Build Now" and force rebuild functionality -- [ ] Related partitions discovery -- [ ] URL encoding/decoding for partition references - -## Success Criteria - -- Partition list loads and search works correctly -- Individual partition pages display complete information -- Build history shows all relevant builds -- "Build Now" successfully submits builds -- Related partitions are discoverable -- URL encoding handles all partition reference formats - -## Testing - -- Test partition list search with various terms -- Verify individual partition pages load correctly -- Test build history display and links -- Submit builds and verify they start correctly -- Test force rebuild functionality -- Validate URL encoding with complex partition references -- Check related partitions discovery \ No newline at end of file diff --git a/plans/webapp_v1/chunk-7-jobs-pages.md b/plans/webapp_v1/chunk-7-jobs-pages.md deleted file mode 100644 index 848a398..0000000 --- a/plans/webapp_v1/chunk-7-jobs-pages.md +++ /dev/null @@ -1,230 +0,0 @@ -# Chunk 7: Jobs Pages - -**Parent:** [Build Graph Dashboard](../build-graph-dashboard.md) -**Previous:** [Chunk 6: Partition Pages](./chunk-6-partition-pages.md) -**Next:** [Chunk 8: Graph Analysis](./chunk-8-graph-analysis.md) - -## Overview - -Implement job listing and individual job metrics pages with performance data, success rates, and execution history. - -## Scope - -### In Scope -- Jobs listing page with high-level metadata -- Individual job metrics and performance pages -- Success rate tracking and trends -- Recent job runs with execution details -- Average duration and timing analysis -- Job label URL encoding for safe navigation - -### Out of Scope -- Complex performance analytics -- Historical trend analysis beyond recent runs -- Job configuration editing -- Advanced filtering beyond basic search - -## Technical Approach - -### Data Sources -From Build Graph Service API and build event log: -- Job list from graph analysis -- Job execution history from build events -- Performance metrics aggregated from event data -- Success/failure rates from job events - -### URL Encoding -```typescript -// Handle job labels in URLs (similar to partition refs) -function encodeJobLabel(label: string): string { - return btoa(label).replace(/\+/g, '-').replace(/\//g, '_').replace(/=/g, ''); -} - -function decodeJobLabel(encoded: string): string { - const padded = encoded.replace(/-/g, '+').replace(/_/g, '/'); - return atob(padded); -} -``` - -### Component Structure -```typescript -const JobsList = { - oninit: () => { - this.jobs = []; - this.searchTerm = ''; - this.loadJobs(); - }, - - view: () => [ - m('.jobs-header', [ - m('h1', 'Jobs'), - m('input', { - placeholder: 'Search jobs...', - oninput: (e) => this.searchTerm = e.target.value, - }) - ]), - - m('.jobs-table', [ - m('table.table', [ - m('thead', [ - m('tr', [ - m('th', 'Job Label'), - m('th', 'Success Rate'), - m('th', 'Avg Duration'), - m('th', 'Recent Runs'), - m('th', 'Last Run'), - ]) - ]), - m('tbody', this.filteredJobs().map(job => - m('tr', [ - m('td', m('a', { - href: `/jobs/${encodeJobLabel(job.label)}` - }, job.label)), - m('td', [ - m('.badge', job.success_rate >= 0.9 ? 'success' : 'warning'), - ` ${Math.round(job.success_rate * 100)}%` - ]), - m('td', formatDuration(job.avg_duration)), - m('td', job.recent_runs), - m('td', formatTime(job.last_run)), - ]) - )) - ]) - ]) - ] -}; - -const JobMetrics = { - oninit: (vnode) => { - this.jobLabel = decodeJobLabel(vnode.attrs.label); - this.metrics = null; - this.recentRuns = []; - this.loadJobMetrics(); - }, - - view: () => [ - m('.job-header', [ - m('h1', this.jobLabel), - m('.job-stats', [ - m('.stat', [ - m('.stat-title', 'Success Rate'), - m('.stat-value', `${Math.round(this.metrics?.success_rate * 100)}%`), - ]), - m('.stat', [ - m('.stat-title', 'Avg Duration'), - m('.stat-value', formatDuration(this.metrics?.avg_duration)), - ]), - m('.stat', [ - m('.stat-title', 'Total Runs'), - m('.stat-value', this.metrics?.total_runs), - ]), - ]) - ]), - - m('.job-content', [ - m('.performance-chart', [ - m('h2', 'Performance Trends'), - m('.chart-placeholder', 'Success rate and duration trends over time'), - // Simple trend visualization or table - m('table.table', [ - m('thead', [ - m('tr', [ - m('th', 'Date'), - m('th', 'Success Rate'), - m('th', 'Avg Duration'), - ]) - ]), - m('tbody', this.metrics?.daily_stats?.map(stat => - m('tr', [ - m('td', formatDate(stat.date)), - m('td', `${Math.round(stat.success_rate * 100)}%`), - m('td', formatDuration(stat.avg_duration)), - ]) - )) - ]) - ]), - - m('.recent-runs', [ - m('h2', 'Recent Runs'), - m('table.table', [ - m('thead', [ - m('tr', [ - m('th', 'Build Request'), - m('th', 'Partitions'), - m('th', 'Status'), - m('th', 'Duration'), - m('th', 'Started'), - ]) - ]), - m('tbody', this.recentRuns.map(run => - m('tr', [ - m('td', m('a', { href: `/builds/${run.build_id}` }, run.build_id)), - m('td', run.partitions.join(', ')), - m('td', m('.badge', run.status)), - m('td', formatDuration(run.duration)), - m('td', formatTime(run.started_at)), - ]) - )) - ]) - ]) - ]) - ] -}; -``` - -### Metrics Calculation -From build event log: -- Success rate: completed jobs / total jobs -- Average duration: mean time from SCHEDULED to COMPLETED/FAILED -- Recent runs: last 50 job executions -- Daily aggregations for trend analysis - -## Implementation Strategy - -1. **Create Data Layer** - - API integration for job data - - Metrics calculation from build events - - Search and filtering logic - -2. **Build Jobs List Page** - - Job table with search - - High-level metrics display - - Performance indicators - -3. **Individual Job Pages** - - Detailed metrics display - - Performance trends - - Recent runs history - -4. **Performance Visualization** - - Simple trend charts or tables - - Success rate indicators - - Duration analysis - -## Deliverables - -- [ ] Jobs listing page with search and metrics -- [ ] Individual job metrics pages -- [ ] Success rate tracking and display -- [ ] Performance trends visualization -- [ ] Recent job runs history -- [ ] Job label URL encoding/decoding - -## Success Criteria - -- Jobs list loads with accurate metrics -- Individual job pages show detailed performance data -- Success rates are calculated correctly -- Performance trends are meaningful -- Recent runs link to build details -- URL encoding handles all job label formats - -## Testing - -- Test jobs list search functionality -- Verify individual job pages load correctly -- Validate metrics calculations against known data -- Test performance trend accuracy -- Check recent runs links and data -- Validate URL encoding with complex job labels -- Test with jobs that have no runs yet \ No newline at end of file diff --git a/plans/webapp_v1/chunk-8-graph-analysis.md b/plans/webapp_v1/chunk-8-graph-analysis.md deleted file mode 100644 index f30d760..0000000 --- a/plans/webapp_v1/chunk-8-graph-analysis.md +++ /dev/null @@ -1,209 +0,0 @@ -# Chunk 8: Graph Analysis - -**Parent:** [Build Graph Dashboard](../build-graph-dashboard.md) -**Previous:** [Chunk 7: Jobs Pages](./chunk-7-jobs-pages.md) -**Next:** [Chunk 9: Polish](./chunk-9-polish.md) - -## Overview - -Implement interactive build graph analysis with partition input forms, Mermaid.js visualization, and execution plan display. - -## Scope - -### In Scope -- Partition input form for analysis requests -- Mermaid.js visualization of job graphs -- Execution plan table with job details -- Interactive graph exploration -- Error handling for invalid partition references - -### Out of Scope -- Complex graph editing capabilities -- Advanced graph algorithms or analysis -- Performance optimization for large graphs -- Real-time graph updates - -## Technical Approach - -### Data Sources -From Build Graph Service API: -- `/api/v1/analyze` - Analyze build graph for partitions -- Returns `JobGraph` with tasks and dependencies - -### Component Structure -```typescript -const GraphAnalysis = { - oninit: () => { - this.partitions = ['']; - this.jobGraph = null; - this.loading = false; - this.error = null; - this.mermaidRendered = false; - }, - - view: () => [ - m('.analysis-header', [ - m('h1', 'Graph Analysis'), - m('p', 'Analyze the build graph for specific partitions') - ]), - - m('.analysis-form', [ - m('h2', 'Partition References'), - m('.partition-inputs', [ - this.partitions.map((partition, index) => - m('.input-group', [ - m('input', { - value: partition, - placeholder: 'Enter partition reference...', - oninput: (e) => this.partitions[index] = e.target.value, - }), - m('button.btn.btn-outline', { - onclick: () => this.removePartition(index), - disabled: this.partitions.length <= 1 - }, 'Remove') - ]) - ), - m('button.btn.btn-outline', { - onclick: () => this.addPartition() - }, 'Add Partition') - ]), - - m('.form-actions', [ - m('button.btn.btn-primary', { - onclick: () => this.analyzeGraph(), - disabled: this.loading || !this.hasValidPartitions() - }, this.loading ? 'Analyzing...' : 'Analyze Graph') - ]) - ]), - - this.error ? m('.error-message', [ - m('.alert.alert-error', this.error) - ]) : null, - - this.jobGraph ? m('.analysis-results', [ - m('.graph-visualization', [ - m('h2', 'Job Graph'), - m('#mermaid-graph', { - oncreate: () => this.renderMermaid(), - onupdate: () => this.renderMermaid() - }) - ]), - - m('.execution-plan', [ - m('h2', 'Execution Plan'), - m('table.table', [ - m('thead', [ - m('tr', [ - m('th', 'Job'), - m('th', 'Outputs'), - m('th', 'Inputs'), - m('th', 'Arguments'), - ]) - ]), - m('tbody', this.jobGraph.nodes.map(task => - m('tr', [ - m('td', m('a', { - href: `/jobs/${encodeJobLabel(task.job.label)}` - }, task.job.label)), - m('td', m('ul', task.config.outputs.map(output => - m('li', m('a', { - href: `/partitions/${encodePartitionRef(output.str)}` - }, output.str)) - ))), - m('td', m('ul', task.config.inputs.map(input => - m('li', input.partition_ref.str) - ))), - m('td', m('code', task.config.args.join(' '))), - ]) - )) - ]) - ]) - ]) : null - ] -}; -``` - -### Mermaid.js Integration -```typescript -// Generate Mermaid diagram from JobGraph -function generateMermaidDiagram(jobGraph: JobGraph): string { - const nodes = jobGraph.nodes.map(task => - `${task.job.label}["${task.job.label}"]` - ).join('\n '); - - const edges = jobGraph.nodes.flatMap(task => - task.config.inputs.map(input => { - const sourceJob = findJobForPartition(input.partition_ref.str); - return sourceJob ? `${sourceJob} --> ${task.job.label}` : null; - }).filter(Boolean) - ).join('\n '); - - return `graph TD\n ${nodes}\n ${edges}`; -} - -// Render with Mermaid -function renderMermaid() { - if (!this.jobGraph || this.mermaidRendered) return; - - const diagram = generateMermaidDiagram(this.jobGraph); - mermaid.render('graph', diagram, (svgCode) => { - document.getElementById('mermaid-graph').innerHTML = svgCode; - this.mermaidRendered = true; - }); -} -``` - -### Form Management -- Dynamic partition input fields -- Add/remove partition functionality -- Input validation -- Error handling for invalid references - -## Implementation Strategy - -1. **Create Form Interface** - - Dynamic partition input management - - Form validation and submission - - Loading states and error handling - -2. **Integrate Graph Analysis API** - - API calls to analyze endpoint - - Error handling for analysis failures - - JobGraph data processing - -3. **Add Mermaid.js Visualization** - - Include Mermaid.js library - - Generate diagrams from JobGraph - - Handle rendering lifecycle - -4. **Build Execution Plan Display** - - Table layout for job details - - Links to job and partition pages - - Clear display of dependencies - -## Deliverables - -- [ ] Interactive partition input form -- [ ] Mermaid.js graph visualization -- [ ] Execution plan table with job details -- [ ] Error handling for invalid inputs -- [ ] Links to related job and partition pages - -## Success Criteria - -- Form allows adding/removing partition inputs -- Graph analysis API integration works correctly -- Mermaid diagrams render accurately -- Execution plan shows complete job details -- Error handling provides useful feedback -- Links navigate to appropriate detail pages - -## Testing - -- Test form with various partition combinations -- Verify graph analysis API calls work correctly -- Test Mermaid diagram generation and rendering -- Validate execution plan accuracy -- Test error handling with invalid partitions -- Check links to job and partition detail pages -- Test with complex multi-job graphs \ No newline at end of file diff --git a/plans/webapp_v1/chunk-9-polish.md b/plans/webapp_v1/chunk-9-polish.md deleted file mode 100644 index 3ffb881..0000000 --- a/plans/webapp_v1/chunk-9-polish.md +++ /dev/null @@ -1,128 +0,0 @@ -# Chunk 9: Polish - -**Parent:** [Build Graph Dashboard](../build-graph-dashboard.md) -**Previous:** [Chunk 8: Graph Analysis](./chunk-8-graph-analysis.md) - -## Overview - -Final polish phase focusing on complete Tailwind + DaisyUI styling implementation, performance optimization, user experience improvements, and production readiness. - -## Scope - -### In Scope -- Complete Tailwind + DaisyUI styling across all pages -- Performance optimization to meet bundle size targets -- Error handling and user experience improvements -- Loading states and feedback mechanisms -- Responsive design refinements -- Production build optimization - -### Out of Scope -- New functionality or features -- Complex animations or transitions -- Advanced accessibility features beyond basics -- Internationalization - -## Technical Approach - -### Styling Implementation -Complete DaisyUI component integration: -- Consistent color scheme and typography -- Proper spacing and layout -- Status badges and indicators -- Form styling and validation feedback -- Table and data display improvements - -### Performance Optimization -Target: < 50KB gzipped bundle -- Code splitting for better loading -- Tree shaking unused code -- Optimize CSS bundle size -- Lazy loading for non-critical components -- Bundle analysis and optimization - -### User Experience Improvements -- Consistent loading states across all pages -- Error boundaries and graceful error handling -- Toast notifications for user actions -- Confirmation dialogs for destructive actions -- Keyboard navigation support -- Mobile-responsive design - -### Production Readiness -- Environment-specific configuration -- Error logging and monitoring hooks -- Performance monitoring -- Cache headers and static asset optimization -- Security headers and CSP - -## Implementation Strategy - -1. **Complete Styling System** - - Apply DaisyUI components consistently - - Create shared style utilities - - Implement responsive design patterns - - Add loading and error states - -2. **Performance Optimization** - - Analyze bundle size and optimize - - Implement code splitting - - Optimize CSS and assets - - Add performance monitoring - -3. **User Experience Polish** - - Add feedback mechanisms - - Improve error handling - - Add loading indicators - - Polish interactions - -4. **Production Preparation** - - Environment configuration - - Monitoring and logging - - Security improvements - - Deployment optimization - -## Deliverables - -- [ ] Complete Tailwind + DaisyUI styling implementation -- [ ] Bundle size optimization (< 50KB gzipped) -- [ ] Comprehensive error handling and user feedback -- [ ] Loading states and progress indicators -- [ ] Mobile-responsive design -- [ ] Production-ready build configuration - -## Success Criteria - -- All pages have consistent, professional styling -- Bundle size meets target (< 50KB gzipped) -- Error handling provides helpful user feedback -- Loading states appear where appropriate -- Dashboard works well on mobile devices -- Production build is optimized and secure - -## Testing - -- Test all pages for consistent styling -- Verify bundle size meets targets -- Test error handling scenarios -- Validate loading states and feedback -- Test responsive design on various devices -- Verify production build works correctly -- Performance testing under load - -## Bundle Size Breakdown Target -- Mithril: ~10KB -- Custom code: ~20KB -- CSS (Tailwind + DaisyUI): ~5KB -- Protobuf client: ~15KB -- **Total: < 50KB gzipped** - -## Final Checklist -- [ ] All pages styled with DaisyUI -- [ ] Bundle size optimized -- [ ] Error handling complete -- [ ] Loading states implemented -- [ ] Mobile responsive -- [ ] Production build ready -- [ ] Performance targets met -- [ ] Security considerations addressed \ No newline at end of file diff --git a/scripts/prepare_dev b/scripts/prepare_dev new file mode 100755 index 0000000..5ee5525 --- /dev/null +++ b/scripts/prepare_dev @@ -0,0 +1,5 @@ +#!/usr/bin/env bash + +sh scripts/generate_proto_for_ide.sh + +python3 scripts/generate_cargo_toml.py