Add plan for partition activity log access layer

This commit is contained in:
Stuart Axelbrooke 2025-07-03 14:21:01 -07:00
parent 13e80f3c88
commit dd6870b980
2 changed files with 744 additions and 0 deletions

View file

@@ -173,6 +173,199 @@ service DataBuildService {
rpc Build(GraphBuildRequest) returns (GraphBuildResponse);
}
///////////////////////////////////////////////////////////////////////////////////////////////
// Partition Activity Log
///////////////////////////////////////////////////////////////////////////////////////////////
// Partition lifecycle states.
// A partition's current state is the status of its latest PartitionEvent.
// The zero value is the proto3 default and must never carry business meaning.
enum PartitionStatus {
  PARTITION_UNKNOWN = 0;    // Default / unset state
  PARTITION_REQUESTED = 1;  // Partition requested but not yet scheduled
  PARTITION_SCHEDULED = 2;  // Job scheduled to produce this partition
  PARTITION_BUILDING = 3;   // Job actively building this partition
  PARTITION_AVAILABLE = 4;  // Partition successfully built and available
  PARTITION_FAILED = 5;     // Partition build failed
  PARTITION_STALE = 6;      // Partition exists but upstream dependencies changed
  PARTITION_DELEGATED = 7;  // Request delegated to existing build
}
// Individual partition activity event.
// Append-only record of one transition in a partition's lifecycle; the latest
// event for a given partition_ref determines the partition's current status.
message PartitionEvent {
  // Event identity
  string partition_event_id = 1;            // Unique id for this event
  google.protobuf.Timestamp timestamp = 2;  // When the event occurred

  // Partition identification
  PartitionRef partition_ref = 3;  // Partition this event concerns
  PartitionStatus status = 4;      // Lifecycle state recorded by this event

  // Build context
  string job_graph_run_id = 5; // Links to graph execution
  string job_run_id = 6;       // Links to specific job run
  JobLabel producing_job = 7;  // Which job produces this partition

  // Coordination metadata
  repeated string requesting_clients = 8; // Who requested this partition
  string delegated_to_run_id = 9;         // If delegated, which run

  // Dependencies
  repeated PartitionRef upstream_deps = 10;
  repeated PartitionRef downstream_deps = 11;

  // Data about the partition
  PartitionManifest manifest = 12; // Present when status = PARTITION_AVAILABLE
  string failure_reason = 13;      // Present when status = PARTITION_FAILED

  // Storage metadata
  string storage_backend = 14;               // Backend that stored this event
  map<string, string> storage_metadata = 15; // Backend-specific metadata
}
// Query capabilities

// Filter and paging parameters for querying the activity log.
// NOTE(review): presumably an empty repeated field means "no filter on that
// dimension" — confirm against the store implementations.
message PartitionEventQuery {
  repeated PartitionRef partition_refs = 1; // Restrict to these partitions
  repeated PartitionStatus statuses = 2;    // Restrict to these lifecycle states
  repeated string job_graph_run_ids = 3;    // Restrict to these graph runs
  TimeRange time_range = 4;                 // Restrict to this time window
  int32 limit = 5;                          // Max events to return (page size)
  int32 offset = 6;                         // Events to skip (page start)
  OrderBy order_by = 7;                     // Sort order of results
}

// Time window for queries; inclusivity of the bounds is not specified here —
// confirm with the store implementations.
message TimeRange {
  google.protobuf.Timestamp start = 1;
  google.protobuf.Timestamp end = 2;
}

// Sort specification for query results.
message OrderBy {
  string field = 1;   // Field name to sort by
  bool ascending = 2; // true = ascending, false = descending
}
// Stream filtering

// Server-side filter for StreamPartitionEvents subscriptions.
// NOTE(review): presumably empty repeated fields mean "match all" — confirm.
message EventStreamFilter {
  repeated PartitionRef partition_refs = 1; // Only these partitions
  repeated PartitionStatus statuses = 2;    // Only these lifecycle states
  google.protobuf.Timestamp since = 3;      // Only events after this instant
}

// Coordination support

// Snapshot of an in-flight build, used to coordinate duplicate requests.
message ActiveBuild {
  PartitionRef partition_ref = 1;           // Partition being built
  string job_graph_run_id = 2;              // Run performing the build
  PartitionStatus status = 3;               // Current lifecycle state
  repeated string requesting_clients = 4;   // Clients waiting on this build
  google.protobuf.Timestamp started_at = 5; // When the build began
}
// Dependency graph expanded around a single root partition.
message DependencyGraph {
  PartitionRef root = 1;             // Partition the graph was expanded from
  repeated DependencyNode nodes = 2; // Partitions reachable from the root
  repeated DependencyEdge edges = 3; // Relationships between nodes
}

// One partition in a DependencyGraph.
message DependencyNode {
  PartitionRef partition_ref = 1;
  PartitionStatus status = 2;                 // Latest known lifecycle state
  google.protobuf.Timestamp last_updated = 3; // Timestamp of the latest event
}

// Directed edge between two partitions in a DependencyGraph.
message DependencyEdge {
  PartitionRef source = 1;
  PartitionRef target = 2;
  // NOTE(review): "type" is a reserved word in several target languages;
  // generated accessors may be mangled — consider "dependency_type".
  DependencyType type = 3;
}
// Kind of relationship a DependencyEdge expresses.
// Fixed relative to the draft: values are prefixed with the enum name (enum
// values share their parent's scope and unprefixed names collide), and the
// zero value is reserved for "unspecified" so a default-initialized edge is
// distinguishable from a real relationship. Safe to change because this enum
// is newly introduced in this commit.
enum DependencyType {
  DEPENDENCY_TYPE_UNSPECIFIED = 0; // Default; never a real relationship
  DEPENDENCY_TYPE_DEPENDS_ON = 1;  // source depends on target
  DEPENDENCY_TYPE_PRODUCES = 2;    // source produces target
}
// Request/Response message types

// Appends a single event to the activity log.
message AppendPartitionEventRequest {
  PartitionEvent event = 1;
}
message AppendPartitionEventResponse {
  string event_id = 1; // Id assigned to the stored event
}

// Fetches the most recent event for one partition.
message GetLatestPartitionEventRequest {
  PartitionRef partition_ref = 1;
}
message GetLatestPartitionEventResponse {
  // NOTE(review): presumably unset when the partition has no events — confirm.
  PartitionEvent event = 1;
}

// Runs a filtered, paged query over the activity log.
message QueryPartitionEventsRequest {
  PartitionEventQuery query = 1;
}
message QueryPartitionEventsResponse {
  repeated PartitionEvent events = 1; // One page of matching events
  // NOTE(review): presumably the total match count ignoring limit/offset —
  // confirm against the store implementations.
  int64 total_count = 2;
}

// Opens a server-side stream of events matching the filter.
message StreamPartitionEventsRequest {
  EventStreamFilter filter = 1;
}
// Asks for a partition; may resolve immediately, join an in-flight build, or
// start a new one (see RequestResult).
message RequestPartitionRequest {
  PartitionRef partition_ref = 1;
  string client_id = 2;      // Identifies the requester for coordination
  RequestOptions options = 3;
}

// Per-request coordination options.
message RequestOptions {
  bool allow_delegation = 1; // Permit joining an existing in-flight build
  int32 timeout_seconds = 2; // How long the caller is willing to wait
}

message RequestPartitionResponse {
  RequestResult result = 1;
}

// Outcome of RequestPartition; exactly one variant is set.
message RequestResult {
  oneof result {
    PartitionLocation available = 1; // Partition already built and available
    DelegationToken delegated = 2;   // Request joined an existing build
    BuildToken building = 3;         // A new build was started
  }
}

// Where to find an already-available partition.
message PartitionLocation {
  PartitionRef partition_ref = 1;
  PartitionManifest manifest = 2;
}

// Handle for the in-flight build this request was delegated to.
message DelegationToken {
  string job_graph_run_id = 1;
  google.protobuf.Timestamp estimated_completion = 2;
}

// Handle for a build started on behalf of this request.
message BuildToken {
  string job_graph_run_id = 1;
  google.protobuf.Timestamp started_at = 2;
}
// Looks up in-flight builds for the given partitions.
message GetActiveBuildStatusRequest {
  repeated PartitionRef partition_refs = 1;
}
message GetActiveBuildStatusResponse {
  repeated ActiveBuild active_builds = 1;
}

// Expands the dependency graph around one partition.
message GetDependencyGraphRequest {
  PartitionRef partition_ref = 1;
  // Traversal depth limit. NOTE(review): semantics of 0 (unlimited vs. root
  // only) are not specified here — confirm.
  int32 depth = 2;
}
message GetDependencyGraphResponse {
  DependencyGraph graph = 1;
}
///////////////////////////////////////////////////////////////////////////////////////////////
// Catalog
///////////////////////////////////////////////////////////////////////////////////////////////
@@ -305,4 +498,15 @@ service Catalog {
// Enables lookup of partition manifests produced as part of prior job runs
rpc ListPartitions(ListPartitionManifestsRequest) returns (ListPartitionManifestsResponse);
// Partition activity log methods
// Append-only write path for partition lifecycle events.
rpc AppendPartitionEvent(AppendPartitionEventRequest) returns (AppendPartitionEventResponse);
// Point lookup of a partition's most recent event (its current status).
rpc GetLatestPartitionEvent(GetLatestPartitionEventRequest) returns (GetLatestPartitionEventResponse);
// Filtered, paged query over historical events.
rpc QueryPartitionEvents(QueryPartitionEventsRequest) returns (QueryPartitionEventsResponse);
// Server-streamed live tail of events matching a filter.
rpc StreamPartitionEvents(StreamPartitionEventsRequest) returns (stream PartitionEvent);
// Coordination methods
// Request a partition; may return its location, delegate to an in-flight
// build, or start a new build (see RequestResult).
rpc RequestPartition(RequestPartitionRequest) returns (RequestPartitionResponse);
// Snapshot of currently in-flight builds for the given partitions.
rpc GetActiveBuildStatus(GetActiveBuildStatusRequest) returns (GetActiveBuildStatusResponse);
// Expand the dependency graph around a partition to a given depth.
rpc GetDependencyGraph(GetDependencyGraphRequest) returns (GetDependencyGraphResponse);
}

View file

@@ -0,0 +1,540 @@
# Partition Activity Log Access Layer Design
## Overview
This document details the implementation of a swappable datastore access layer for the Partition Activity Log, enabling support for SQLite, PostgreSQL, and Delta Lake backends while maintaining consistent semantics and performance characteristics.
## Architecture Overview
```
┌─────────────────────────────────────────────────────────────┐
│ DataBuild Core │
├─────────────────────────────────────────────────────────────┤
│ Activity Log Service │
├─────────────────────────────────────────────────────────────┤
│ Abstract Storage Interface │
├─────────────────────────────────────────────────────────────┤
│ SQLite Impl │ PostgreSQL Impl │ Delta Lake Impl │
└─────────────────────────────────────────────────────────────┘
```
## 1. Abstract Storage Interface
### Core Trait Definition
```rust
// NOTE(review): `async fn` in a trait is not object-safe, yet the factory in
// section 3 boxes this trait as `dyn PartitionActivityStore`. In practice this
// trait will need `#[async_trait]` (or explicit `-> impl Future` desugaring)
// and the associated `Error` type fixed at the trait-object boundary.
pub trait PartitionActivityStore: Send + Sync {
type Error: std::error::Error + Send + Sync + 'static;
// Event operations
async fn append_event(&self, event: &PartitionEvent) -> Result<(), Self::Error>;
async fn get_latest_event(&self, partition_ref: &PartitionRef) -> Result<Option<PartitionEvent>, Self::Error>;
async fn get_events_since(&self, partition_ref: &PartitionRef, since: Timestamp) -> Result<Vec<PartitionEvent>, Self::Error>;
// Batch operations for performance
async fn append_events(&self, events: &[PartitionEvent]) -> Result<(), Self::Error>;
async fn get_latest_events(&self, partition_refs: &[PartitionRef]) -> Result<Vec<Option<PartitionEvent>>, Self::Error>;
// Query operations
async fn query_events(&self, query: &PartitionEventQuery) -> Result<Vec<PartitionEvent>, Self::Error>;
async fn count_events(&self, query: &PartitionEventQuery) -> Result<u64, Self::Error>;
// Coordination queries
async fn get_active_builds(&self) -> Result<Vec<ActiveBuild>, Self::Error>;
async fn get_dependency_graph(&self, partition_ref: &PartitionRef, depth: u32) -> Result<DependencyGraph, Self::Error>;
// Streaming for real-time updates
// `Pin` is required: a boxed `dyn Stream` cannot be polled un-pinned.
async fn stream_events(&self, filter: &EventStreamFilter) -> Result<Pin<Box<dyn Stream<Item = PartitionEvent> + Send>>, Self::Error>;
// Maintenance operations
async fn vacuum(&self, before: Timestamp) -> Result<u64, Self::Error>;
async fn get_statistics(&self) -> Result<StoreStatistics, Self::Error>;
}
```
### Query Types
```rust
#[derive(Debug, Clone)]
pub struct PartitionEventQuery {
pub partition_refs: Option<Vec<PartitionRef>>,
pub statuses: Option<Vec<PartitionStatus>>,
pub job_graph_run_ids: Option<Vec<String>>,
pub time_range: Option<(Timestamp, Timestamp)>,
pub limit: Option<u32>,
pub offset: Option<u32>,
pub order_by: Option<OrderBy>,
}
#[derive(Debug, Clone)]
pub struct EventStreamFilter {
pub partition_refs: Option<Vec<PartitionRef>>,
pub statuses: Option<Vec<PartitionStatus>>,
pub since: Option<Timestamp>,
}
#[derive(Debug, Clone)]
pub struct ActiveBuild {
pub partition_ref: PartitionRef,
pub job_graph_run_id: String,
pub status: PartitionStatus,
pub requesting_clients: Vec<String>,
pub started_at: Timestamp,
}
#[derive(Debug, Clone)]
pub struct DependencyGraph {
pub root: PartitionRef,
pub nodes: Vec<DependencyNode>,
pub edges: Vec<DependencyEdge>,
}
```
## 2. Storage Implementation Strategies
### A. SQLite Implementation
**Use Case**: Single-node development, embedded deployments, testing
**Schema**:
```sql
CREATE TABLE partition_events (
id INTEGER PRIMARY KEY AUTOINCREMENT,
partition_event_id TEXT NOT NULL UNIQUE,
partition_ref TEXT NOT NULL,
status INTEGER NOT NULL,
timestamp INTEGER NOT NULL,
job_graph_run_id TEXT,
job_run_id TEXT,
producing_job_label TEXT,
requesting_clients TEXT, -- JSON array
delegated_to_run_id TEXT,
upstream_deps TEXT, -- JSON array
downstream_deps TEXT, -- JSON array
manifest_data BLOB, -- Protobuf serialized
failure_reason TEXT
);
-- Indexes for performance. SQLite does not support inline INDEX clauses in
-- CREATE TABLE; indexes are created with separate CREATE INDEX statements.
CREATE INDEX idx_partition_ref ON partition_events (partition_ref);
CREATE INDEX idx_timestamp ON partition_events (timestamp);
CREATE INDEX idx_job_graph_run_id ON partition_events (job_graph_run_id);
CREATE INDEX idx_status ON partition_events (status);
CREATE INDEX idx_partition_timestamp ON partition_events (partition_ref, timestamp DESC);
-- Manually maintained "latest event" table (SQLite has no materialized views)
CREATE TABLE partition_latest_events (
partition_ref TEXT PRIMARY KEY,
latest_event_id INTEGER NOT NULL,
status INTEGER NOT NULL,
timestamp INTEGER NOT NULL,
FOREIGN KEY (latest_event_id) REFERENCES partition_events(id)
);
```
**Implementation Highlights**:
- WAL mode for better concurrency
- Prepared statements for performance
- Periodic VACUUM for maintenance
- JSON functions for array queries
### B. PostgreSQL Implementation
**Use Case**: Production deployments, high concurrency, complex queries
**Schema**:
```sql
CREATE TABLE partition_events (
    id BIGSERIAL,
    partition_event_id UUID NOT NULL,
    partition_ref TEXT NOT NULL,
    status INTEGER NOT NULL,
    timestamp TIMESTAMPTZ NOT NULL,
    job_graph_run_id TEXT,
    job_run_id TEXT,
    producing_job_label TEXT,
    requesting_clients JSONB,
    delegated_to_run_id TEXT,
    upstream_deps JSONB,
    downstream_deps JSONB,
    manifest_data BYTEA,
    failure_reason TEXT,
    -- On a partitioned table, primary-key and unique constraints must
    -- include the partition key (timestamp).
    PRIMARY KEY (id, timestamp),
    UNIQUE (partition_event_id, timestamp)
) PARTITION BY RANGE (timestamp);
-- Optimized indexes. PostgreSQL does not support inline INDEX clauses in
-- CREATE TABLE; indexes are created with separate CREATE INDEX statements.
CREATE INDEX idx_partition_ref_timestamp ON partition_events USING BTREE (partition_ref, timestamp DESC);
CREATE INDEX idx_job_graph_run_id ON partition_events USING HASH (job_graph_run_id);
CREATE INDEX idx_status_timestamp ON partition_events USING BTREE (status, timestamp DESC);
CREATE INDEX idx_requesting_clients ON partition_events USING GIN (requesting_clients);
-- Partitioning by time for large datasets (parent is declared PARTITION BY
-- RANGE above, which CREATE TABLE ... PARTITION OF requires)
CREATE TABLE partition_events_y2024m01 PARTITION OF partition_events
    FOR VALUES FROM ('2024-01-01') TO ('2024-02-01');
-- Materialized view with automatic refresh
CREATE MATERIALIZED VIEW partition_latest_events AS
SELECT DISTINCT ON (partition_ref)
partition_ref, id, status, timestamp
FROM partition_events
ORDER BY partition_ref, timestamp DESC;
CREATE UNIQUE INDEX ON partition_latest_events (partition_ref);
```
**Implementation Highlights**:
- Connection pooling with deadpool-postgres
- JSONB for flexible array queries
- Partitioning by time for scalability
- Materialized views with refresh strategies
- Listen/Notify for real-time streaming
### C. Delta Lake Implementation
**Use Case**: Data lake environments, analytics workloads, historical analysis
**Schema**:
```
partition_events/
├── _delta_log/
│ ├── 00000000000000000000.json
│ └── 00000000000000000001.json
└── part-00000-*.parquet
```
**Table Schema**:
```rust
Schema::new(vec![
Field::new("partition_event_id", DataType::Utf8, false),
Field::new("partition_ref", DataType::Utf8, false),
Field::new("status", DataType::Int32, false),
Field::new("timestamp", DataType::Timestamp(TimeUnit::Microsecond, None), false),
Field::new("job_graph_run_id", DataType::Utf8, true),
Field::new("job_run_id", DataType::Utf8, true),
Field::new("producing_job_label", DataType::Utf8, true),
Field::new("requesting_clients", DataType::List(Box::new(Field::new("item", DataType::Utf8, true))), true),
Field::new("delegated_to_run_id", DataType::Utf8, true),
Field::new("upstream_deps", DataType::List(Box::new(Field::new("item", DataType::Utf8, true))), true),
Field::new("downstream_deps", DataType::List(Box::new(Field::new("item", DataType::Utf8, true))), true),
Field::new("manifest_data", DataType::Binary, true),
Field::new("failure_reason", DataType::Utf8, true),
])
```
**Implementation Highlights**:
- Partitioned by date for query performance
- Z-ordering on partition_ref for clustering
- Optimize operations for compaction
- DataFusion for complex analytical queries
- Streaming through Delta Lake's log tail
## 3. Configuration and Factory Pattern
### Configuration Schema
```rust
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum StoreConfig {
SQLite {
path: PathBuf,
pragma: HashMap<String, String>,
},
PostgreSQL {
connection_string: String,
pool_size: Option<u32>,
partition_strategy: Option<PartitionStrategy>,
},
DeltaLake {
table_uri: String,
storage_options: HashMap<String, String>,
partition_columns: Vec<String>,
},
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum PartitionStrategy {
ByMonth,
ByWeek,
ByDay,
}
```
### Factory Implementation
```rust
pub struct PartitionActivityStoreFactory;
impl PartitionActivityStoreFactory {
// NOTE(review): for `Box<dyn PartitionActivityStore>` to compile, the trait's
// associated `Error` type must be erased or fixed here (e.g. `Error = BoxError`)
// and the async methods made object-safe via `#[async_trait]`.
pub async fn create(config: &StoreConfig) -> Result<Box<dyn PartitionActivityStore>, Box<dyn std::error::Error>> {
match config {
StoreConfig::SQLite { path, pragma } => {
let store = SQLitePartitionActivityStore::new(path, pragma).await?;
Ok(Box::new(store))
}
StoreConfig::PostgreSQL { connection_string, pool_size, partition_strategy } => {
let store = PostgreSQLPartitionActivityStore::new(
connection_string,
pool_size.unwrap_or(10),
partition_strategy.clone()
).await?;
Ok(Box::new(store))
}
StoreConfig::DeltaLake { table_uri, storage_options, partition_columns } => {
let store = DeltaLakePartitionActivityStore::new(
table_uri,
storage_options.clone(),
partition_columns.clone()
).await?;
Ok(Box::new(store))
}
}
}
}
```
## 4. Protocol Buffer Extensions
### New Message Types
```protobuf
// Enhanced partition event with storage metadata
message PartitionEvent {
// Event identity
string partition_event_id = 1;
google.protobuf.Timestamp timestamp = 2;
// Partition identification
PartitionRef partition_ref = 3;
PartitionStatus status = 4;
// Build context
string job_graph_run_id = 5;
string job_run_id = 6;
JobLabel producing_job = 7;
// Coordination metadata
repeated string requesting_clients = 8;
string delegated_to_run_id = 9;
// Dependencies
repeated PartitionRef upstream_deps = 10;
repeated PartitionRef downstream_deps = 11;
// Data about the partition
PartitionManifest manifest = 12;
string failure_reason = 13;
// Storage metadata (added)
string storage_backend = 14;
map<string, string> storage_metadata = 15;
}
// Query capabilities
message PartitionEventQuery {
repeated PartitionRef partition_refs = 1;
repeated PartitionStatus statuses = 2;
repeated string job_graph_run_ids = 3;
TimeRange time_range = 4;
int32 limit = 5;
int32 offset = 6;
OrderBy order_by = 7;
}
message TimeRange {
google.protobuf.Timestamp start = 1;
google.protobuf.Timestamp end = 2;
}
message OrderBy {
string field = 1;
bool ascending = 2;
}
// Stream filtering
message EventStreamFilter {
repeated PartitionRef partition_refs = 1;
repeated PartitionStatus statuses = 2;
google.protobuf.Timestamp since = 3;
}
// Coordination support
message ActiveBuild {
PartitionRef partition_ref = 1;
string job_graph_run_id = 2;
PartitionStatus status = 3;
repeated string requesting_clients = 4;
google.protobuf.Timestamp started_at = 5;
}
message DependencyGraph {
PartitionRef root = 1;
repeated DependencyNode nodes = 2;
repeated DependencyEdge edges = 3;
}
message DependencyNode {
PartitionRef partition_ref = 1;
PartitionStatus status = 2;
google.protobuf.Timestamp last_updated = 3;
}
message DependencyEdge {
PartitionRef source = 1;
PartitionRef target = 2;
DependencyType type = 3;
}
enum DependencyType {
  DEPENDENCY_TYPE_UNSPECIFIED = 0;  // Zero value must not carry business meaning
  DEPENDENCY_TYPE_DEPENDS_ON = 1;
  DEPENDENCY_TYPE_PRODUCES = 2;
}
```
### Enhanced Catalog Service
```protobuf
service EnhancedCatalog {
// Existing catalog methods
rpc ListJobGraphRuns(ListJobGraphRunsRequest) returns (ListJobGraphRunsResponse);
rpc ListPartitions(ListPartitionManifestsRequest) returns (ListPartitionManifestsResponse);
// New partition activity methods
rpc AppendPartitionEvent(AppendPartitionEventRequest) returns (AppendPartitionEventResponse);
rpc GetLatestPartitionEvent(GetLatestPartitionEventRequest) returns (GetLatestPartitionEventResponse);
rpc QueryPartitionEvents(QueryPartitionEventsRequest) returns (QueryPartitionEventsResponse);
rpc StreamPartitionEvents(StreamPartitionEventsRequest) returns (stream PartitionEvent);
// Coordination methods
rpc RequestPartition(RequestPartitionRequest) returns (RequestPartitionResponse);
rpc GetActiveBuildStatus(GetActiveBuildStatusRequest) returns (GetActiveBuildStatusResponse);
rpc GetDependencyGraph(GetDependencyGraphRequest) returns (GetDependencyGraphResponse);
}
// Request/Response message types
message AppendPartitionEventRequest {
PartitionEvent event = 1;
}
message AppendPartitionEventResponse {
string event_id = 1;
}
message GetLatestPartitionEventRequest {
PartitionRef partition_ref = 1;
}
message GetLatestPartitionEventResponse {
PartitionEvent event = 1;
}
message QueryPartitionEventsRequest {
PartitionEventQuery query = 1;
}
message QueryPartitionEventsResponse {
repeated PartitionEvent events = 1;
int64 total_count = 2;
}
message StreamPartitionEventsRequest {
EventStreamFilter filter = 1;
}
message RequestPartitionRequest {
PartitionRef partition_ref = 1;
string client_id = 2;
RequestOptions options = 3;
}
message RequestOptions {
bool allow_delegation = 1;
int32 timeout_seconds = 2;
}
message RequestPartitionResponse {
RequestResult result = 1;
}
message RequestResult {
oneof result {
PartitionLocation available = 1;
DelegationToken delegated = 2;
BuildToken building = 3;
}
}
message PartitionLocation {
PartitionRef partition_ref = 1;
PartitionManifest manifest = 2;
}
message DelegationToken {
string job_graph_run_id = 1;
google.protobuf.Timestamp estimated_completion = 2;
}
message BuildToken {
string job_graph_run_id = 1;
google.protobuf.Timestamp started_at = 2;
}
```
## 5. Implementation Phases
### Phase 1: Core Infrastructure
- Abstract trait definition
- SQLite implementation (simplest)
- Basic factory pattern
- Unit tests for storage operations
### Phase 2: PostgreSQL Implementation
- Connection pooling setup
- Advanced indexing strategies
- Materialized view management
- Performance benchmarking
### Phase 3: Delta Lake Implementation
- Delta Lake table setup
- Parquet serialization
- Query optimization
- Analytics integration
### Phase 4: Integration & Testing
- End-to-end integration tests
- Performance comparison benchmarks
- Migration utilities between backends
- Production deployment guides
## 6. Performance Considerations
### Write Performance
- **SQLite**: Single-writer limitation, but excellent for development
- **PostgreSQL**: Excellent concurrent write performance with proper indexing
- **Delta Lake**: Batch writes preferred, automatic compaction
### Read Performance
- **SQLite**: Fast for simple queries, limited by single-process nature
- **PostgreSQL**: Excellent with proper indexing and materialized views
- **Delta Lake**: Optimized for analytical queries, Z-ordering for point lookups
### Storage Efficiency
- **SQLite**: Compact storage, limited by 281TB max size
- **PostgreSQL**: Efficient with proper partitioning, unlimited growth
- **Delta Lake**: Columnar storage, excellent compression, versioning overhead
## 7. Migration Strategy
### Cross-Backend Migration
```rust
pub async fn migrate_events(
source: &dyn PartitionActivityStore,
target: &dyn PartitionActivityStore,
batch_size: usize,
time_range: Option<(Timestamp, Timestamp)>,
) -> Result<u64, Box<dyn std::error::Error>> {
// Implementation for moving events between stores
}
```
### Schema Evolution
- Forward-compatible protobuf changes
- Storage-specific schema migration scripts
- Version tracking in metadata
This design provides a robust, scalable foundation for the partition activity log while maintaining flexibility for different deployment scenarios and performance requirements.