Add plan for partition activity log
This commit is contained in:
parent
b767a74350
commit
13e80f3c88
4 changed files with 275 additions and 11 deletions
|
|
@ -3,4 +3,4 @@
|
|||
|
||||
A bazel-based data build system.
|
||||
|
||||
For important context, check out [the manifesto](./manifesto.md), and [core concepts](./core-concepts.md). Also, check out [`databuild.proto`](./databuild.proto) for key system interfaces.
|
||||
For important context, check out [the manifesto](./manifesto.md), and [core concepts](./core-concepts.md). Also, check out [`databuild.proto`](./databuild/databuild.proto) for key system interfaces.
|
||||
|
|
|
|||
File diff suppressed because one or more lines are too long
|
|
@ -257,11 +257,54 @@
|
|||
"@@rules_oci+//oci:extensions.bzl%oci": {
|
||||
"general": {
|
||||
"bzlTransitiveDigest": "KHcdN2ovRQGX1MKsH0nGoGPFd/84U43tssN2jImCeJU=",
|
||||
"usagesDigest": "/O1PwnnkqSBmI9Oe08ZYYqjM4IS8JR+/9rjgzVTNDaQ=",
|
||||
"usagesDigest": "Apc1C60UoQEX5juQx/xCHcVJYryUnymx2oAbLy1lTaY=",
|
||||
"recordedFileInputs": {},
|
||||
"recordedDirentsInputs": {},
|
||||
"envVariables": {},
|
||||
"generatedRepoSpecs": {
|
||||
"debian_linux_arm64_v8": {
|
||||
"repoRuleId": "@@rules_oci+//oci/private:pull.bzl%oci_pull",
|
||||
"attributes": {
|
||||
"www_authenticate_challenges": {},
|
||||
"scheme": "https",
|
||||
"registry": "index.docker.io",
|
||||
"repository": "library/python",
|
||||
"identifier": "3.12-bookworm",
|
||||
"platform": "linux/arm64/v8",
|
||||
"target_name": "debian_linux_arm64_v8",
|
||||
"bazel_tags": []
|
||||
}
|
||||
},
|
||||
"debian_linux_amd64": {
|
||||
"repoRuleId": "@@rules_oci+//oci/private:pull.bzl%oci_pull",
|
||||
"attributes": {
|
||||
"www_authenticate_challenges": {},
|
||||
"scheme": "https",
|
||||
"registry": "index.docker.io",
|
||||
"repository": "library/python",
|
||||
"identifier": "3.12-bookworm",
|
||||
"platform": "linux/amd64",
|
||||
"target_name": "debian_linux_amd64",
|
||||
"bazel_tags": []
|
||||
}
|
||||
},
|
||||
"debian": {
|
||||
"repoRuleId": "@@rules_oci+//oci/private:pull.bzl%oci_alias",
|
||||
"attributes": {
|
||||
"target_name": "debian",
|
||||
"www_authenticate_challenges": {},
|
||||
"scheme": "https",
|
||||
"registry": "index.docker.io",
|
||||
"repository": "library/python",
|
||||
"identifier": "3.12-bookworm",
|
||||
"platforms": {
|
||||
"@@platforms//cpu:arm64": "@debian_linux_arm64_v8",
|
||||
"@@platforms//cpu:x86_64": "@debian_linux_amd64"
|
||||
},
|
||||
"bzlmod_repository": "debian",
|
||||
"reproducible": true
|
||||
}
|
||||
},
|
||||
"oci_crane_darwin_amd64": {
|
||||
"repoRuleId": "@@rules_oci+//oci:repositories.bzl%crane_repositories",
|
||||
"attributes": {
|
||||
|
|
@ -377,7 +420,11 @@
|
|||
}
|
||||
},
|
||||
"moduleExtensionMetadata": {
|
||||
"explicitRootModuleDirectDeps": [],
|
||||
"explicitRootModuleDirectDeps": [
|
||||
"debian",
|
||||
"debian_linux_arm64_v8",
|
||||
"debian_linux_amd64"
|
||||
],
|
||||
"explicitRootModuleDirectDevDeps": [],
|
||||
"useAllRepos": "NO",
|
||||
"reproducible": false
|
||||
|
|
|
|||
169
plans/partition-activity-log.md
Normal file
169
plans/partition-activity-log.md
Normal file
|
|
@ -0,0 +1,169 @@
|
|||
# Partition Activity Log Design
|
||||
|
||||
## Overview
|
||||
|
||||
The partition activity log extends the existing `Catalog` service with a comprehensive event-driven system that tracks partition lifecycle, coordinates parallel builds, and provides rich visualization capabilities. This design is inspired by Bazel's build action logging architecture.
|
||||
|
||||
## Core Architecture
|
||||
|
||||
The design has three parts: an append-only log of `PartitionEvent` records (Section 1), an `EnhancedCatalog` service that deduplicates and coordinates partition requests on top of that log (Sections 2–3), and status, lineage, and metrics views derived from it (Section 4).
|
||||
|
||||
## 1. Partition Event Schema
|
||||
|
||||
```protobuf
|
||||
// Partition lifecycle states
|
||||
enum PartitionStatus {
|
||||
PARTITION_UNKNOWN = 0;
|
||||
PARTITION_REQUESTED = 1; // Partition requested but not yet scheduled
|
||||
PARTITION_SCHEDULED = 2; // Job scheduled to produce this partition
|
||||
PARTITION_BUILDING = 3; // Job actively building this partition
|
||||
PARTITION_AVAILABLE = 4; // Partition successfully built and available
|
||||
PARTITION_FAILED = 5; // Partition build failed
|
||||
PARTITION_STALE = 6; // Partition exists but upstream dependencies changed
|
||||
PARTITION_DELEGATED = 7; // Request delegated to existing build
|
||||
}
|
||||
|
||||
// Individual partition activity event
|
||||
message PartitionEvent {
|
||||
// Event identity
|
||||
string partition_event_id = 1;
|
||||
google.protobuf.Timestamp timestamp = 2;
|
||||
|
||||
// Partition identification
|
||||
PartitionRef partition_ref = 3;
|
||||
PartitionStatus status = 4;
|
||||
|
||||
// Build context
|
||||
string job_graph_run_id = 5; // Links to graph execution
|
||||
string job_run_id = 6; // Links to specific job run
|
||||
JobLabel producing_job = 7; // Which job produces this partition
|
||||
|
||||
// Coordination metadata
|
||||
repeated string requesting_clients = 8; // Who requested this partition
|
||||
string delegated_to_run_id = 9; // If delegated, which run
|
||||
|
||||
// Dependencies
|
||||
repeated PartitionRef upstream_deps = 10;
|
||||
repeated PartitionRef downstream_deps = 11;
|
||||
|
||||
// Data about the partition
|
||||
PartitionManifest manifest = 12; // Present when status = AVAILABLE
|
||||
string failure_reason = 13; // Present when status = FAILED
|
||||
}
|
||||
```
|
||||
|
||||
## 2. Coordination Mechanisms
|
||||
|
||||
The system handles parallel build coordination through:
|
||||
|
||||
### A. Request Deduplication
|
||||
- When multiple clients request the same partition, the system:
|
||||
1. Checks if partition is already being built
|
||||
2. If yes, emits a new `DELEGATED` event whose `requesting_clients` list includes the new client, and returns a delegation token
|
||||
3. If no, initiates new build and tracks all requesters
|
||||
|
||||
### B. Dependency Coordination
|
||||
- Tracks upstream/downstream relationships in partition events
|
||||
- Enables cascade invalidation when upstream partitions change
|
||||
- Supports "optimistic building" where downstream jobs can start before all deps are ready
|
||||
|
||||
### C. Build State Management
|
||||
```protobuf
|
||||
// Enhanced catalog service for coordination
|
||||
service EnhancedCatalog {
|
||||
// Request partition build (may delegate to existing build)
|
||||
rpc RequestPartition(RequestPartitionRequest) returns (RequestPartitionResponse);
|
||||
|
||||
// Query partition status and activity
|
||||
rpc QueryPartitionActivity(PartitionActivityQuery) returns (PartitionActivityResponse);
|
||||
|
||||
// Stream partition events for real-time monitoring
|
||||
rpc StreamPartitionEvents(PartitionEventStream) returns (stream PartitionEvent);
|
||||
|
||||
// Get dependency graph for visualization
|
||||
rpc GetDependencyGraph(DependencyGraphRequest) returns (DependencyGraphResponse);
|
||||
}
|
||||
```
|
||||
|
||||
## 3. Parallel Build Coordination Algorithm
|
||||
|
||||
```
|
||||
function RequestPartition(partition_ref, client_id):
|
||||
existing_event = GetLatestPartitionEvent(partition_ref)
|
||||
|
||||
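if existing_event.status IN [SCHEDULED, BUILDING, DELEGATED]:
|
||||
// A build is already in flight (a DELEGATED event is only ever emitted on top of one): delegate to it
|
||||
CreateEvent(partition_ref, DELEGATED, {
|
||||
job_graph_run_id: existing_event.job_graph_run_id,
|
||||
requesting_clients: existing_event.requesting_clients + [client_id],
|
||||
delegated_to_run_id: existing_event.job_graph_run_id
|
||||
})
|
||||
return DelegationToken(existing_event.job_graph_run_id)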
|
||||
|
||||
elif existing_event.status == AVAILABLE AND not IsStale(partition_ref):
|
||||
// Partition already exists and is fresh
|
||||
return PartitionLocation(partition_ref)
|
||||
|
||||
else:
|
||||
// Start new build
|
||||
run_id = StartJobGraphRun(partition_ref)
|
||||
CreateEvent(partition_ref, SCHEDULED, {
|
||||
job_graph_run_id: run_id,
|
||||
requesting_clients: [client_id]
|
||||
})
|
||||
return BuildToken(run_id)
|
||||
```
|
||||
|
||||
## 4. Visualization & Status Tracking
|
||||
|
||||
The system provides multiple visualization interfaces:
|
||||
|
||||
### A. Real-time Build Dashboard
|
||||
- Live partition status across all active builds
|
||||
- Dependency graph visualization with status colors
|
||||
- Build queue depth and estimated completion times
|
||||
- Resource utilization metrics
|
||||
|
||||
### B. Partition Lineage Tracking
|
||||
- Complete upstream/downstream dependency chains
|
||||
- Historical build timeline for each partition
|
||||
- Impact analysis for code/data changes
|
||||
|
||||
### C. Build Coordination Metrics
|
||||
- Delegation efficiency (avoided duplicate builds)
|
||||
- Parallel build coordination success rates
|
||||
- Build time distributions and bottlenecks
|
||||
|
||||
## 5. Integration with Existing Architecture
|
||||
|
||||
The partition activity log integrates seamlessly with DataBuild's existing components:
|
||||
|
||||
- **Job Events**: Links to existing `JobEvent` and `GraphEvent` structures
|
||||
- **Partition Manifests**: Extends existing `PartitionManifest` with activity metadata
|
||||
- **Catalog Service**: Enhances existing catalog with partition-centric views
|
||||
- **Build Coordination**: Works with existing graph analysis and execution
|
||||
|
||||
## 6. Storage & Performance Considerations
|
||||
|
||||
- **Event Store**: Append-only partition event log with efficient querying
|
||||
- **Materialized Views**: Pre-computed partition status and dependency graphs
|
||||
- **Retention Policy**: Configurable retention for historical events
|
||||
- **Indexing**: Efficient lookups by partition_ref, job_graph_run_id, and timestamp
|
||||
|
||||
## 7. Event Emission Pattern
|
||||
|
||||
The system follows an **append-only event log pattern**:
|
||||
|
||||
- **New Event Per State Change**: Every state transition or action emits a completely new `PartitionEvent`
|
||||
- **No Event Mutation**: Existing events are never modified - the log is append-only
|
||||
- **View-Time Aggregation**: Current partition status is derived by reading the latest event for each partition
|
||||
- **Client Updates**: When a new client requests an already-building partition, a new DELEGATED event is emitted with the updated `requesting_clients` list
|
||||
|
||||
This approach provides:
|
||||
- **Complete Audit Trail**: Full history of all partition state changes
|
||||
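- **Concurrency Safety**: Appending new events instead of mutating existing ones avoids lost updates; the read-latest-then-append step in `RequestPartition` must still be atomic per partition so that two simultaneous requests cannot both start a build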
|
||||
- **Replay Capability**: System state can be reconstructed from event log
|
||||
- **Scalability**: Events can be partitioned and distributed efficiently
|
||||
|
||||
## Implementation Notes
|
||||
|
||||
This design provides a comprehensive partition activity log that enables efficient coordination of parallel builds while maintaining full visibility into partition lifecycle and dependencies, directly supporting DataBuild's goal of taking complete responsibility for the data build process.
|
||||
Loading…
Reference in a new issue