Big change
Some checks are pending
/ setup (push) Waiting to run

This commit is contained in:
soaxelbrooke 2025-07-13 21:18:15 -07:00
parent aa561a8281
commit bfec05e065
38 changed files with 2745 additions and 441 deletions

View file

@ -122,6 +122,10 @@ crate.spec(
package = "axum-jsonschema", package = "axum-jsonschema",
version = "0.8.0", version = "0.8.0",
) )
crate.spec(
package = "thiserror",
version = "1.0",
)
crate.spec( crate.spec(
features = ["debug-embed"], features = ["debug-embed"],
package = "rust-embed", package = "rust-embed",

File diff suppressed because one or more lines are too long

View file

@ -39,6 +39,9 @@ rust_library(
"event_log/sqlite.rs", "event_log/sqlite.rs",
"event_log/stdout.rs", "event_log/stdout.rs",
"lib.rs", "lib.rs",
"orchestration/mod.rs",
"orchestration/error.rs",
"orchestration/events.rs",
"service/handlers.rs", "service/handlers.rs",
"service/mod.rs", "service/mod.rs",
":generate_databuild_rust", ":generate_databuild_rust",
@ -59,6 +62,7 @@ rust_library(
"@crates//:schemars", "@crates//:schemars",
"@crates//:serde", "@crates//:serde",
"@crates//:serde_json", "@crates//:serde_json",
"@crates//:thiserror",
"@crates//:tokio", "@crates//:tokio",
"@crates//:uuid", "@crates//:uuid",
], ],
@ -104,6 +108,16 @@ rust_binary(
], ],
) )
# Test for orchestration module
rust_test(
name = "orchestration_test",
crate = ":databuild",
edition = "2021",
deps = [
"@crates//:tokio",
],
)
# Legacy filegroup for backwards compatibility # Legacy filegroup for backwards compatibility
filegroup( filegroup(
name = "proto", name = "proto",

26
databuild/cli/BUILD.bazel Normal file
View file

@ -0,0 +1,26 @@
load("@rules_rust//rust:defs.bzl", "rust_binary")

# DataBuild CLI wrapper using orchestrator.
# Thin binary that shells out to the analyze/execute graph binaries while
# emitting orchestration events through //databuild.
rust_binary(
    name = "databuild_cli",
    srcs = [
        "main.rs",
        "error.rs",
    ],
    edition = "2021",
    visibility = ["//visibility:public"],
    # Runtime data deps: the subprocess binaries the CLI spawns via runfiles.
    data = [
        "//databuild/graph:analyze",
        "//databuild/graph:execute",
    ],
    deps = [
        "//databuild:databuild",
        "@crates//:clap",
        "@crates//:log",
        "@crates//:serde_json",
        "@crates//:simple_logger",
        "@crates//:thiserror",
        "@crates//:tokio",
        "@crates//:uuid",
    ],
)

25
databuild/cli/error.rs Normal file
View file

@ -0,0 +1,25 @@
use crate::event_log::BuildEventLogError;
use crate::orchestration::OrchestrationError;
/// Top-level error type for the DataBuild CLI wrapper.
///
/// Wraps errors from the event log and orchestration layers (the `#[from]`
/// conversions let `?` propagate those results directly) and adds
/// CLI-specific variants for the analyze/execute subprocess phases,
/// environment lookup, and argument validation.
#[derive(Debug, thiserror::Error)]
pub enum CliError {
    /// Failure reading from or writing to the build event log.
    #[error("Event log error: {0}")]
    EventLog(#[from] BuildEventLogError),
    /// Failure surfaced by the build orchestrator.
    #[error("Orchestration error: {0}")]
    Orchestration(#[from] OrchestrationError),
    /// Analyze subprocess failed to spawn, run, or produce a parseable job graph.
    #[error("Analysis error: {0}")]
    Analysis(String),
    /// Execute subprocess failed to spawn, run, or complete successfully.
    #[error("Execution error: {0}")]
    Execution(String),
    /// A required environment variable was missing, or process setup failed.
    #[error("Environment error: {0}")]
    Environment(String),
    /// Command-line arguments did not validate.
    #[error("Invalid arguments: {0}")]
    InvalidArguments(String),
}

/// Convenience result alias used throughout the CLI.
pub type Result<T> = std::result::Result<T, CliError>;

207
databuild/cli/main.rs Normal file
View file

@ -0,0 +1,207 @@
use databuild::*;
use databuild::event_log::create_build_event_log;
use databuild::orchestration::{BuildOrchestrator, BuildResult};
use clap::{Arg, Command as ClapCommand};
use log::info;
use simple_logger::SimpleLogger;
use std::env;
use std::process::{Command, Stdio};
use uuid::Uuid;
mod error;
use error::{CliError, Result};
/// Run the analyze subprocess for the requested partitions and parse its
/// stdout into a `JobGraph`.
///
/// Reads `DATABUILD_CANDIDATE_JOBS`, `DATABUILD_JOB_LOOKUP_PATH`, and
/// `DATABUILD_GRAPH_LABEL` from the environment and forwards them to the
/// analyze binary along with the orchestrator's build request id.
///
/// # Errors
/// `CliError::Environment` when a required variable is missing;
/// `CliError::Analysis` when the subprocess cannot be spawned, exits
/// non-zero, or emits output that does not parse as a job graph.
async fn run_analysis(
    partitions: &[String],
    orchestrator: &BuildOrchestrator,
) -> Result<JobGraph> {
    info!("Running analysis for partitions: {:?}", partitions);

    // Required environment variables — fail fast with a clear message.
    let candidate_jobs = env::var("DATABUILD_CANDIDATE_JOBS")
        .map_err(|_| CliError::Environment("DATABUILD_CANDIDATE_JOBS not set".to_string()))?;
    let job_lookup_path = env::var("DATABUILD_JOB_LOOKUP_PATH")
        .map_err(|_| CliError::Environment("DATABUILD_JOB_LOOKUP_PATH not set".to_string()))?;
    let graph_label = env::var("DATABUILD_GRAPH_LABEL")
        .map_err(|_| CliError::Environment("DATABUILD_GRAPH_LABEL not set".to_string()))?;

    // Locate the analyze binary: prefer Bazel runfiles, otherwise fall back
    // to a relative path for direct execution. (The previous
    // `.or_else(|_| Ok(...)).map_err(...)` chain had an unreachable error
    // arm; `unwrap_or_else` expresses the infallible fallback directly.)
    let analyze_path = env::var("RUNFILES_DIR")
        .map(|runfiles_dir| format!("{}/databuild+/databuild/graph/analyze", runfiles_dir))
        .unwrap_or_else(|_| "./databuild/graph/analyze".to_string());

    // Spawn with piped output so we can capture the job graph JSON from
    // stdout and error detail from stderr.
    let child = Command::new(analyze_path)
        .args(partitions)
        .env("DATABUILD_CANDIDATE_JOBS", candidate_jobs)
        .env("DATABUILD_JOB_LOOKUP_PATH", job_lookup_path)
        .env("DATABUILD_GRAPH_LABEL", graph_label)
        .env("DATABUILD_MODE", "plan")
        .env("DATABUILD_BUILD_REQUEST_ID", orchestrator.build_request_id())
        .stdout(Stdio::piped())
        .stderr(Stdio::piped())
        .spawn()
        .map_err(|e| CliError::Analysis(format!("Failed to spawn analyze process: {}", e)))?;

    let output = child.wait_with_output()
        .map_err(|e| CliError::Analysis(format!("Failed to run analyze: {}", e)))?;

    if !output.status.success() {
        let stderr = String::from_utf8_lossy(&output.stderr);
        return Err(CliError::Analysis(format!("Analysis failed: {}", stderr)));
    }

    let stdout = String::from_utf8_lossy(&output.stdout);
    let job_graph: JobGraph = serde_json::from_str(&stdout)
        .map_err(|e| CliError::Analysis(format!("Failed to parse job graph: {}", e)))?;

    info!("Analysis complete, found {} tasks", job_graph.nodes.len());
    Ok(job_graph)
}
/// Run the execute subprocess, feeding it the serialized job graph on stdin.
///
/// # Errors
/// `CliError::Environment` when `DATABUILD_CANDIDATE_JOBS` is unset;
/// `CliError::Execution` when serialization, spawning, the stdin write, or
/// the subprocess itself fails.
async fn run_execution(
    job_graph: JobGraph,
    orchestrator: &BuildOrchestrator,
) -> Result<BuildResult> {
    info!("Running execution for {} tasks", job_graph.nodes.len());

    // Serialize up front so a serialization failure is reported before any
    // subprocess is spawned.
    let job_graph_json = serde_json::to_string(&job_graph)
        .map_err(|e| CliError::Execution(format!("Failed to serialize job graph: {}", e)))?;

    let candidate_jobs = env::var("DATABUILD_CANDIDATE_JOBS")
        .map_err(|_| CliError::Environment("DATABUILD_CANDIDATE_JOBS not set".to_string()))?;
    // The event log URI is optional; default to stdout logging.
    let build_event_log_uri = env::var("DATABUILD_BUILD_EVENT_LOG").unwrap_or_else(|_| "stdout".to_string());

    // Locate the execute binary: prefer Bazel runfiles, otherwise fall back
    // to a relative path. (Replaces a `.or_else(|_| Ok(...)).map_err(...)`
    // chain whose error arm was unreachable.)
    let execute_path = env::var("RUNFILES_DIR")
        .map(|runfiles_dir| format!("{}/databuild+/databuild/graph/execute", runfiles_dir))
        .unwrap_or_else(|_| "./databuild/graph/execute".to_string());

    let mut child = Command::new(execute_path)
        .env("DATABUILD_CANDIDATE_JOBS", candidate_jobs)
        .env("DATABUILD_BUILD_EVENT_LOG", build_event_log_uri)
        .env("DATABUILD_BUILD_REQUEST_ID", orchestrator.build_request_id())
        .stdin(Stdio::piped())
        .stdout(Stdio::piped())
        .stderr(Stdio::piped())
        .spawn()
        .map_err(|e| CliError::Execution(format!("Failed to spawn execute process: {}", e)))?;

    // Write the job graph to the child's stdin. Taking the handle (instead
    // of borrowing with `as_mut`) lets it drop at the end of this scope,
    // closing the pipe so the child sees EOF before we wait on it.
    // Previously a missing stdin handle was silently ignored — the child
    // would never receive its input; now it is an explicit error.
    {
        use std::io::Write;
        let mut stdin = child.stdin.take()
            .ok_or_else(|| CliError::Execution("Failed to open execute stdin".to_string()))?;
        stdin.write_all(job_graph_json.as_bytes())
            .map_err(|e| CliError::Execution(format!("Failed to write job graph to execute: {}", e)))?;
    }

    let output = child.wait_with_output()
        .map_err(|e| CliError::Execution(format!("Failed to run execute: {}", e)))?;

    if !output.status.success() {
        let stderr = String::from_utf8_lossy(&output.stderr);
        return Err(CliError::Execution(format!("Execution failed: {}", stderr)));
    }

    // For now, assume success if the command completed without error.
    // In the future, we could parse the output to get more detailed results.
    info!("Execution completed successfully");
    Ok(BuildResult::Success { jobs_completed: job_graph.nodes.len() })
}
/// Entry point for the DataBuild CLI wrapper: parse arguments, wire up the
/// event log and orchestrator, then drive the analyze and execute phases
/// while emitting orchestration lifecycle events.
#[tokio::main]
async fn main() -> Result<()> {
    // Bring logging up first so every later step is visible.
    SimpleLogger::new()
        .with_level(log::LevelFilter::Info)
        .init()
        .map_err(|e| CliError::Environment(format!("Failed to initialize logger: {}", e)))?;
    info!("Starting DataBuild CLI wrapper");

    // Command-line definition: required positional partitions plus optional
    // event-log URI and build-request-id overrides.
    let args = ClapCommand::new("databuild")
        .version("1.0")
        .about("DataBuild unified CLI")
        .arg(
            Arg::new("partitions")
                .help("Partition references to build")
                .required(true)
                .num_args(1..)
                .value_name("PARTITIONS")
        )
        .arg(
            Arg::new("event-log")
                .long("event-log")
                .help("Event log URI (default: stdout)")
                .value_name("URI")
        )
        .arg(
            Arg::new("build-request-id")
                .long("build-request-id")
                .help("Build request ID (default: generate UUID)")
                .value_name("ID")
        )
        .get_matches();

    let partitions: Vec<String> = args
        .get_many::<String>("partitions")
        .unwrap()
        .cloned()
        .collect();

    // Resolution order for both settings: CLI flag, then environment
    // variable, then built-in default.
    let event_log_uri = args
        .get_one::<String>("event-log")
        .cloned()
        .or_else(|| env::var("DATABUILD_BUILD_EVENT_LOG").ok())
        .unwrap_or_else(|| "stdout".to_string());
    let build_request_id = args
        .get_one::<String>("build-request-id")
        .cloned()
        .or_else(|| env::var("DATABUILD_BUILD_REQUEST_ID").ok())
        .unwrap_or_else(|| Uuid::new_v4().to_string());

    info!("Build request ID: {}", build_request_id);
    info!("Partitions: {:?}", partitions);
    info!("Event log URI: {}", event_log_uri);

    // Wire up the event log and orchestrator for this build request.
    let event_log = create_build_event_log(&event_log_uri).await?;
    let requested_partitions: Vec<PartitionRef> = partitions
        .iter()
        .map(|p| PartitionRef { str: p.clone() })
        .collect();
    let orchestrator = BuildOrchestrator::new(
        std::sync::Arc::from(event_log),
        build_request_id,
        requested_partitions,
    );

    // Lifecycle: start -> planning -> analysis -> execution -> completion.
    orchestrator.start_build().await?;
    orchestrator.start_planning().await?;
    let job_graph = run_analysis(&partitions, &orchestrator).await?;
    orchestrator.start_execution().await?;
    let build_result = run_execution(job_graph, &orchestrator).await?;
    orchestrator.complete_build(build_result).await?;

    info!("DataBuild CLI completed successfully");
    Ok(())
}

View file

@ -43,6 +43,12 @@ genrule(
"typescript_generated/src/models/BuildSummary.ts", "typescript_generated/src/models/BuildSummary.ts",
"typescript_generated/src/models/BuildsListResponse.ts", "typescript_generated/src/models/BuildsListResponse.ts",
"typescript_generated/src/models/CancelBuildRequest.ts", "typescript_generated/src/models/CancelBuildRequest.ts",
"typescript_generated/src/models/JobDailyStats.ts",
"typescript_generated/src/models/JobMetricsRequest.ts",
"typescript_generated/src/models/JobMetricsResponse.ts",
"typescript_generated/src/models/JobRunSummary.ts",
"typescript_generated/src/models/JobSummary.ts",
"typescript_generated/src/models/JobsListResponse.ts",
"typescript_generated/src/models/PartitionEventsRequest.ts", "typescript_generated/src/models/PartitionEventsRequest.ts",
"typescript_generated/src/models/PartitionEventsResponse.ts", "typescript_generated/src/models/PartitionEventsResponse.ts",
"typescript_generated/src/models/PartitionStatusRequest.ts", "typescript_generated/src/models/PartitionStatusRequest.ts",
@ -84,6 +90,12 @@ genrule(
cp $$TEMP_DIR/src/models/BuildSummary.ts $(location typescript_generated/src/models/BuildSummary.ts) cp $$TEMP_DIR/src/models/BuildSummary.ts $(location typescript_generated/src/models/BuildSummary.ts)
cp $$TEMP_DIR/src/models/BuildsListResponse.ts $(location typescript_generated/src/models/BuildsListResponse.ts) cp $$TEMP_DIR/src/models/BuildsListResponse.ts $(location typescript_generated/src/models/BuildsListResponse.ts)
cp $$TEMP_DIR/src/models/CancelBuildRequest.ts $(location typescript_generated/src/models/CancelBuildRequest.ts) cp $$TEMP_DIR/src/models/CancelBuildRequest.ts $(location typescript_generated/src/models/CancelBuildRequest.ts)
cp $$TEMP_DIR/src/models/JobDailyStats.ts $(location typescript_generated/src/models/JobDailyStats.ts)
cp $$TEMP_DIR/src/models/JobMetricsRequest.ts $(location typescript_generated/src/models/JobMetricsRequest.ts)
cp $$TEMP_DIR/src/models/JobMetricsResponse.ts $(location typescript_generated/src/models/JobMetricsResponse.ts)
cp $$TEMP_DIR/src/models/JobRunSummary.ts $(location typescript_generated/src/models/JobRunSummary.ts)
cp $$TEMP_DIR/src/models/JobSummary.ts $(location typescript_generated/src/models/JobSummary.ts)
cp $$TEMP_DIR/src/models/JobsListResponse.ts $(location typescript_generated/src/models/JobsListResponse.ts)
cp $$TEMP_DIR/src/models/PartitionEventsRequest.ts $(location typescript_generated/src/models/PartitionEventsRequest.ts) cp $$TEMP_DIR/src/models/PartitionEventsRequest.ts $(location typescript_generated/src/models/PartitionEventsRequest.ts)
cp $$TEMP_DIR/src/models/PartitionEventsResponse.ts $(location typescript_generated/src/models/PartitionEventsResponse.ts) cp $$TEMP_DIR/src/models/PartitionEventsResponse.ts $(location typescript_generated/src/models/PartitionEventsResponse.ts)
cp $$TEMP_DIR/src/models/PartitionStatusRequest.ts $(location typescript_generated/src/models/PartitionStatusRequest.ts) cp $$TEMP_DIR/src/models/PartitionStatusRequest.ts $(location typescript_generated/src/models/PartitionStatusRequest.ts)

View file

@ -1,7 +1,7 @@
{ {
"compilerOptions": { "compilerOptions": {
"target": "ES2020", "target": "ES2020",
"module": "ESNext", "module": "CommonJS",
"moduleResolution": "node", "moduleResolution": "node",
"allowJs": true, "allowJs": true,
"declaration": true, "declaration": true,

View file

@ -1,6 +1,6 @@
import m from 'mithril'; import m from 'mithril';
import { DashboardService, pollingManager, formatTime, formatDateTime, RecentActivitySummary } from './services'; import { DashboardService, pollingManager, formatTime, formatDateTime, formatDuration, formatDate, RecentActivitySummary } from './services';
import { encodePartitionRef, decodePartitionRef, BuildStatusBadge, PartitionStatusBadge, EventTypeBadge } from './utils'; import { encodePartitionRef, decodePartitionRef, encodeJobLabel, decodeJobLabel, BuildStatusBadge, PartitionStatusBadge, EventTypeBadge } from './utils';
// Page scaffold components // Page scaffold components
export const RecentActivity = { export const RecentActivity = {
@ -975,41 +975,293 @@ export const PartitionStatus = {
}; };
export const JobsList = { export const JobsList = {
view: () => m('div.container.mx-auto.p-4', [ jobs: [] as any[],
m('h1.text-3xl.font-bold.mb-4', 'Jobs'), searchTerm: '',
m('div.card.bg-base-100.shadow-xl', [ loading: false,
m('div.card-body', [ error: null as string | null,
m('h2.card-title', 'Job Listing'), searchTimeout: null as NodeJS.Timeout | null,
m('p', 'Jobs in the graph with high-level metadata and performance metrics.'),
m('div.alert.alert-info', [ oninit(vnode: any) {
m('span', 'Job list will be populated from the graph configuration.'), JobsList.loadJobs();
]), },
]),
]), async loadJobs() {
JobsList.loading = true;
JobsList.error = null;
try {
const service = DashboardService.getInstance();
JobsList.jobs = await service.getJobs(JobsList.searchTerm || undefined);
} catch (error) {
console.error('Failed to load jobs:', error);
JobsList.error = 'Failed to load jobs. Please try again.';
} finally {
JobsList.loading = false;
m.redraw();
}
},
filteredJobs() {
if (!JobsList.searchTerm) {
return JobsList.jobs;
}
const search = JobsList.searchTerm.toLowerCase();
return JobsList.jobs.filter((job: any) =>
job.job_label.toLowerCase().includes(search)
);
},
view: () => {
if (JobsList.loading) {
return m('div.container.mx-auto.p-4', [
m('div.flex.justify-center.items-center.h-64', [
m('div.loading.loading-spinner.loading-lg')
]) ])
]);
}
if (JobsList.error) {
return m('div.container.mx-auto.p-4', [
m('div.alert.alert-error', [
m('span', JobsList.error),
m('div', [
m('button.btn.btn-sm.btn-outline', {
onclick: () => JobsList.loadJobs()
}, 'Retry')
])
])
]);
}
return m('div.container.mx-auto.p-4', [
// Jobs Header
m('.jobs-header.mb-6', [
m('h1.text-3xl.font-bold.mb-4', 'Jobs'),
m('div.flex.gap-4.items-center.mb-4', [
m('input.input.input-bordered.flex-1[placeholder="Search jobs..."]', {
value: JobsList.searchTerm,
oninput: (e: Event) => {
JobsList.searchTerm = (e.target as HTMLInputElement).value;
// Debounce search
if (JobsList.searchTimeout) clearTimeout(JobsList.searchTimeout);
JobsList.searchTimeout = setTimeout(() => JobsList.loadJobs(), 300);
}
}),
m('button.btn.btn-outline', {
onclick: () => JobsList.loadJobs()
}, 'Refresh')
])
]),
// Jobs Table
JobsList.filteredJobs().length === 0 ?
m('div.text-center.py-8.text-base-content.opacity-60', 'No jobs found') :
m('.jobs-table.card.bg-base-100.shadow-xl', [
m('.card-body.p-0', [
m('.overflow-x-auto', [
m('table.table.table-zebra', [
m('thead', [
m('tr', [
m('th', 'Job Label'),
m('th', 'Success Rate'),
m('th', 'Avg Duration'),
m('th', 'Recent Runs'),
m('th', 'Last Run'),
])
]),
m('tbody', JobsList.filteredJobs().map((job: any) =>
m('tr.hover', [
m('td', [
m('a.link.link-primary.font-mono.text-sm', {
href: `/jobs/${encodeJobLabel(job.job_label)}`,
onclick: (e: Event) => {
e.preventDefault();
m.route.set(`/jobs/${encodeJobLabel(job.job_label)}`);
}
}, job.job_label)
]),
m('td', [
m(`span.badge.${job.success_rate >= 0.9 ? 'badge-success' : job.success_rate >= 0.7 ? 'badge-warning' : 'badge-error'}`,
`${Math.round(job.success_rate * 100)}%`)
]),
m('td', formatDuration(job.avg_duration_ms)),
m('td', (job.recent_runs || 0).toString()),
m('td.text-sm.opacity-70',
job.last_run ? formatTime(new Date(job.last_run / 1000000).toISOString()) : '—'),
])
))
])
])
])
])
]);
}
}; };
export const JobMetrics = { export const JobMetrics = {
view: (vnode: any) => { jobLabel: '',
const jobLabel = vnode.attrs.label; metrics: null as any,
loading: false,
error: null as string | null,
oninit(vnode: any) {
JobMetrics.jobLabel = decodeJobLabel(vnode.attrs.label);
JobMetrics.loadJobMetrics();
},
async loadJobMetrics() {
JobMetrics.loading = true;
JobMetrics.error = null;
try {
const service = DashboardService.getInstance();
JobMetrics.metrics = await service.getJobMetrics(JobMetrics.jobLabel);
if (!JobMetrics.metrics) {
JobMetrics.error = 'Job not found or no metrics available';
}
} catch (error) {
console.error('Failed to load job metrics:', error);
JobMetrics.error = 'Failed to load job metrics. Please try again.';
} finally {
JobMetrics.loading = false;
m.redraw();
}
},
view: () => {
if (JobMetrics.loading) {
return m('div.container.mx-auto.p-4', [ return m('div.container.mx-auto.p-4', [
m('h1.text-3xl.font-bold.mb-4', `Job Metrics: ${jobLabel}`), m('div.flex.justify-center.items-center.h-64', [
m('div.card.bg-base-100.shadow-xl', [ m('div.loading.loading-spinner.loading-lg')
m('div.card-body', [ ])
m('h2.card-title', 'Performance and Reliability'), ]);
m('p', 'Success rate charts, duration trends, and recent runs.'), }
m('div.stats.shadow', [
m('div.stat', [ if (JobMetrics.error) {
m('div.stat-title', 'Success Rate'), return m('div.container.mx-auto.p-4', [
m('div.stat-value', '95%'), m('div.alert.alert-error', [
m('span', JobMetrics.error),
m('div', [
m('button.btn.btn-sm.btn-outline', {
onclick: () => JobMetrics.loadJobMetrics()
}, 'Retry')
])
])
]);
}
if (!JobMetrics.metrics) {
return m('div.container.mx-auto.p-4', [
m('div.text-center.py-8.text-base-content.opacity-60', 'No metrics available')
]);
}
return m('div.container.mx-auto.p-4', [
// Job Header
m('.job-header.mb-6', [
m('h1.text-3xl.font-bold.mb-4', [
'Job Metrics: ',
m('span.font-mono.text-2xl', JobMetrics.jobLabel)
]), ]),
m('div.stat', [ m('.job-stats.grid.grid-cols-1.md:grid-cols-3.gap-4.mb-6', [
m('div.stat-title', 'Avg Duration'), m('.stat.bg-base-100.shadow-xl.rounded-lg.p-4', [
m('div.stat-value', '2.5s'), m('.stat-title', 'Success Rate'),
m('.stat-value.text-3xl', [
m(`span.${JobMetrics.metrics.success_rate >= 0.9 ? 'text-success' : JobMetrics.metrics.success_rate >= 0.7 ? 'text-warning' : 'text-error'}`,
`${Math.round(JobMetrics.metrics.success_rate * 100)}%`)
]), ]),
]), ]),
m('.stat.bg-base-100.shadow-xl.rounded-lg.p-4', [
m('.stat-title', 'Avg Duration'),
m('.stat-value.text-3xl', formatDuration(JobMetrics.metrics.avg_duration_ms)),
]), ]),
m('.stat.bg-base-100.shadow-xl.rounded-lg.p-4', [
m('.stat-title', 'Total Runs'),
m('.stat-value.text-3xl', JobMetrics.metrics.total_runs),
]), ]),
])
]),
// Main Content
m('.job-content.space-y-6', [
// Performance Trends
JobMetrics.metrics.daily_stats?.length > 0 && m('.performance-trends.card.bg-base-100.shadow-xl', [
m('.card-body', [
m('h2.card-title.text-xl.mb-4', 'Performance Trends (Last 30 Days)'),
m('.overflow-x-auto', [
m('table.table.table-sm', [
m('thead', [
m('tr', [
m('th', 'Date'),
m('th', 'Success Rate'),
m('th', 'Avg Duration'),
m('th', 'Total Runs'),
])
]),
m('tbody', JobMetrics.metrics.daily_stats.map((stat: any) =>
m('tr.hover', [
m('td', formatDate(stat.date)),
m('td', [
m(`span.badge.${stat.success_rate >= 0.9 ? 'badge-success' : stat.success_rate >= 0.7 ? 'badge-warning' : 'badge-error'}`,
`${Math.round(stat.success_rate * 100)}%`)
]),
m('td', formatDuration(stat.avg_duration_ms)),
m('td', stat.total_runs),
])
))
])
])
])
]),
// Recent Runs
m('.recent-runs.card.bg-base-100.shadow-xl', [
m('.card-body', [
m('h2.card-title.text-xl.mb-4', `Recent Runs (${JobMetrics.metrics.recent_runs?.length || 0})`),
!JobMetrics.metrics.recent_runs || JobMetrics.metrics.recent_runs.length === 0 ?
m('.text-center.py-8.text-base-content.opacity-60', 'No recent runs available') :
m('.overflow-x-auto', [
m('table.table.table-sm', [
m('thead', [
m('tr', [
m('th', 'Build Request'),
m('th', 'Partitions'),
m('th', 'Status'),
m('th', 'Duration'),
m('th', 'Started'),
])
]),
m('tbody', JobMetrics.metrics.recent_runs.map((run: any) =>
m('tr.hover', [
m('td', [
m('a.link.link-primary.font-mono.text-sm', {
href: `/builds/${run.build_request_id}`,
onclick: (e: Event) => {
e.preventDefault();
m.route.set(`/builds/${run.build_request_id}`);
}
}, run.build_request_id)
]),
m('td.text-sm', [
m('span.font-mono', run.partitions.slice(0, 3).join(', ')),
run.partitions.length > 3 && m('span.opacity-60', ` +${run.partitions.length - 3} more`)
]),
m('td', [
m(`span.badge.${run.status === 'completed' ? 'badge-success' :
run.status === 'failed' ? 'badge-error' :
run.status === 'running' ? 'badge-warning' : 'badge-info'}`,
run.status)
]),
m('td', formatDuration(run.duration_ms)),
m('td.text-sm.opacity-70',
formatTime(new Date(run.started_at / 1000000).toISOString())),
])
))
])
])
])
])
])
]); ]);
} }
}; };

View file

@ -1,5 +1,5 @@
// Import the generated TypeScript client // Import the generated TypeScript client
import { DefaultApi, Configuration, ActivityResponse, BuildSummary, PartitionSummary } from '../client/typescript_generated/src/index'; import { DefaultApi, Configuration, ActivityResponse, BuildSummary, PartitionSummary, JobsListResponse, JobMetricsResponse, JobSummary, JobRunSummary, JobDailyStats } from '../client/typescript_generated/src/index';
// Base API configuration // Base API configuration
const API_BASE = '/api/v1'; const API_BASE = '/api/v1';
@ -87,6 +87,48 @@ export class DashboardService {
}; };
} }
} }
/**
 * Fetch the job list from the dashboard API, optionally filtered by a
 * search term. Returns an empty array on any failure (network error or
 * non-2xx status), logging the error instead of throwing.
 */
async getJobs(searchTerm?: string): Promise<JobSummary[]> {
    try {
        // Build query parameters manually since the generated client may not
        // support query params correctly.
        const params = new URLSearchParams();
        if (searchTerm) {
            params.append('search', searchTerm);
        }
        const query = params.toString();
        const response = await fetch(query ? `/api/v1/jobs?${query}` : '/api/v1/jobs');
        if (!response.ok) {
            throw new Error(`HTTP ${response.status}: ${response.statusText}`);
        }
        const data: JobsListResponse = await response.json();
        return data.jobs;
    } catch (error) {
        console.error('Failed to fetch jobs:', error);
        return [];
    }
}
/**
 * Fetch metrics for a single job. The job label is base64url-encoded (the
 * same scheme used for partition refs) to make it URL-safe. Returns null
 * when the job is unknown (404) or on any other failure, logging the error.
 */
async getJobMetrics(jobLabel: string): Promise<JobMetricsResponse | null> {
    try {
        // Encode job label like partition refs for URL safety:
        // standard base64, then '+' -> '-', '/' -> '_', padding stripped.
        const encodedLabel = btoa(jobLabel)
            .replace(/\+/g, '-')
            .replace(/\//g, '_')
            .replace(/=/g, '');
        const response = await fetch(`/api/v1/jobs/${encodedLabel}`);
        if (response.status === 404) {
            return null; // Job not found
        }
        if (!response.ok) {
            throw new Error(`HTTP ${response.status}: ${response.statusText}`);
        }
        return await response.json() as JobMetricsResponse;
    } catch (error) {
        console.error('Failed to fetch job metrics:', error);
        return null;
    }
}
} }
// Polling manager with Page Visibility API integration // Polling manager with Page Visibility API integration
@ -200,3 +242,32 @@ export function formatDateTime(isoString: string): string {
// Insert milliseconds between seconds and AM/PM: "7/12/2025, 9:03:48.264 AM EST" // Insert milliseconds between seconds and AM/PM: "7/12/2025, 9:03:48.264 AM EST"
return `${dateStr}, ${timeStr.replace(/(\d{2})\s+(AM|PM)/, `$1.${millisStr} $2`)}`; return `${dateStr}, ${timeStr.replace(/(\d{2})\s+(AM|PM)/, `$1.${millisStr} $2`)}`;
} }
/**
 * Render a millisecond duration as a compact human-readable string:
 * '—' for missing/non-positive values, then ms, seconds (one decimal),
 * minutes+seconds, or hours+minutes depending on magnitude.
 */
export function formatDuration(durationMs?: number | null): string {
    if (!durationMs || durationMs <= 0) {
        return '—';
    }
    const SECOND = 1000;
    const MINUTE = 60 * SECOND;
    const HOUR = 60 * MINUTE;
    if (durationMs < SECOND) {
        return `${Math.round(durationMs)}ms`;
    }
    if (durationMs < MINUTE) {
        return `${(durationMs / SECOND).toFixed(1)}s`;
    }
    if (durationMs < HOUR) {
        const minutes = Math.floor(durationMs / MINUTE);
        const seconds = Math.floor((durationMs % MINUTE) / SECOND);
        return `${minutes}m ${seconds}s`;
    }
    const hours = Math.floor(durationMs / HOUR);
    const minutes = Math.floor((durationMs % HOUR) / MINUTE);
    return `${hours}h ${minutes}m`;
}
/**
 * Format a date string as e.g. "Mar 15, 2024" (US locale).
 * NOTE(review): date-only ISO strings ("2024-03-15") parse as UTC midnight,
 * so the rendered day can shift in non-UTC locales — confirm callers pass
 * timestamps with a time component.
 */
export function formatDate(dateString: string): string {
    const options: Intl.DateTimeFormatOptions = {
        month: 'short',
        day: 'numeric',
        year: 'numeric',
    };
    return new Date(dateString).toLocaleDateString('en-US', options);
}

View file

@ -10,6 +10,18 @@ export function decodePartitionRef(encoded: string): string {
return atob(padded); return atob(padded);
} }
// Job label encoding utilities (same base64url-without-padding scheme as
// partition refs): standard base64 with '+'->'-', '/'->'_', '=' stripped.
export function encodeJobLabel(label: string): string {
    return btoa(label)
        .replace(/\+/g, '-')
        .replace(/\//g, '_')
        .replace(/=/g, '');
}
// Inverse of encodeJobLabel: restore '+'/'/' characters and the stripped
// '=' padding, then base64-decode.
export function decodeJobLabel(encoded: string): string {
    const restored = encoded.replace(/-/g, '+').replace(/_/g, '/');
    // Re-add padding so atob accepts the input length.
    const pad = (4 - (restored.length % 4)) % 4;
    return atob(restored + '='.repeat(pad));
}
import m from 'mithril'; import m from 'mithril';
// Mithril components for status badges - encapsulates both logic and presentation // Mithril components for status badges - encapsulates both logic and presentation
@ -46,7 +58,7 @@ export const PartitionStatusBadge = {
if (normalizedStatus.includes('available')) { if (normalizedStatus.includes('available')) {
badgeClass = 'badge-success'; badgeClass = 'badge-success';
} else if (normalizedStatus.includes('building') || normalizedStatus.includes('scheduled')) { } else if (normalizedStatus.includes('building') || normalizedStatus.includes('analyzed')) {
badgeClass = 'badge-warning'; badgeClass = 'badge-warning';
} else if (normalizedStatus.includes('requested') || normalizedStatus.includes('delegated')) { } else if (normalizedStatus.includes('requested') || normalizedStatus.includes('delegated')) {
badgeClass = 'badge-info'; badgeClass = 'badge-info';

View file

@ -162,8 +162,8 @@ message GraphBuildResponse { repeated PartitionManifest manifests = 1; }
// Partition lifecycle states // Partition lifecycle states
enum PartitionStatus { enum PartitionStatus {
PARTITION_UNKNOWN = 0; PARTITION_UNKNOWN = 0;
PARTITION_REQUESTED = 1; // Partition requested but not yet scheduled PARTITION_REQUESTED = 1; // Partition requested but not yet analyzed
PARTITION_SCHEDULED = 2; // Job scheduled to produce this partition PARTITION_ANALYZED = 2; // Partition analyzed successfully
PARTITION_BUILDING = 3; // Job actively building this partition PARTITION_BUILDING = 3; // Job actively building this partition
PARTITION_AVAILABLE = 4; // Partition successfully built and available PARTITION_AVAILABLE = 4; // Partition successfully built and available
PARTITION_FAILED = 5; // Partition build failed PARTITION_FAILED = 5; // Partition build failed
@ -178,6 +178,7 @@ enum JobStatus {
JOB_COMPLETED = 3; // Job completed successfully JOB_COMPLETED = 3; // Job completed successfully
JOB_FAILED = 4; // Job execution failed JOB_FAILED = 4; // Job execution failed
JOB_CANCELLED = 5; // Job execution cancelled JOB_CANCELLED = 5; // Job execution cancelled
JOB_SKIPPED = 6; // Job skipped because target partitions already available
} }
// Build request lifecycle // Build request lifecycle
@ -185,6 +186,7 @@ enum BuildRequestStatus {
BUILD_REQUEST_UNKNOWN = 0; BUILD_REQUEST_UNKNOWN = 0;
BUILD_REQUEST_RECEIVED = 1; // Build request received and queued BUILD_REQUEST_RECEIVED = 1; // Build request received and queued
BUILD_REQUEST_PLANNING = 2; // Graph analysis in progress BUILD_REQUEST_PLANNING = 2; // Graph analysis in progress
BUILD_REQUEST_ANALYSIS_COMPLETED = 7; // Graph analysis completed successfully
BUILD_REQUEST_EXECUTING = 3; // Jobs are being executed BUILD_REQUEST_EXECUTING = 3; // Jobs are being executed
BUILD_REQUEST_COMPLETED = 4; // All requested partitions built BUILD_REQUEST_COMPLETED = 4; // All requested partitions built
BUILD_REQUEST_FAILED = 5; // Build request failed BUILD_REQUEST_FAILED = 5; // Build request failed

View file

@ -130,6 +130,12 @@ pub trait BuildEventLog: Send + Sync {
// Get aggregated activity summary for dashboard // Get aggregated activity summary for dashboard
async fn get_activity_summary(&self) -> Result<ActivitySummary>; async fn get_activity_summary(&self) -> Result<ActivitySummary>;
// Get the build request ID that created an available partition
async fn get_build_request_for_available_partition(
&self,
partition_ref: &str
) -> Result<Option<String>>; // build request ID that made partition available
} }
// Helper function to generate event ID // Helper function to generate event ID

View file

@ -120,4 +120,13 @@ impl BuildEventLog for PostgresBuildEventLog {
"PostgreSQL implementation not yet available".to_string() "PostgreSQL implementation not yet available".to_string()
)) ))
} }
/// Stub: the PostgreSQL backend does not yet implement the
/// partition -> originating-build-request lookup; always returns a
/// `DatabaseError` (matching the other unimplemented methods of this impl).
async fn get_build_request_for_available_partition(
    &self,
    _partition_ref: &str
) -> Result<Option<String>> {
    Err(BuildEventLogError::DatabaseError(
        "PostgreSQL implementation not yet available".to_string()
    ))
}
} }

View file

@ -22,7 +22,7 @@ fn int_to_partition_status(i: i32) -> PartitionStatus {
match i { match i {
0 => PartitionStatus::PartitionUnknown, 0 => PartitionStatus::PartitionUnknown,
1 => PartitionStatus::PartitionRequested, 1 => PartitionStatus::PartitionRequested,
2 => PartitionStatus::PartitionScheduled, 2 => PartitionStatus::PartitionAnalyzed,
3 => PartitionStatus::PartitionBuilding, 3 => PartitionStatus::PartitionBuilding,
4 => PartitionStatus::PartitionAvailable, 4 => PartitionStatus::PartitionAvailable,
5 => PartitionStatus::PartitionFailed, 5 => PartitionStatus::PartitionFailed,
@ -381,7 +381,18 @@ impl BuildEventLog for SqliteBuildEventLog {
let rows = stmt.query_map([], |row| { let rows = stmt.query_map([], |row| {
let mut row_data = Vec::new(); let mut row_data = Vec::new();
for i in 0..column_count { for i in 0..column_count {
let value: String = row.get(i).unwrap_or_default(); // Try to get as different types and convert to string
let value: String = if let Ok(int_val) = row.get::<_, i64>(i) {
int_val.to_string()
} else if let Ok(float_val) = row.get::<_, f64>(i) {
float_val.to_string()
} else if let Ok(str_val) = row.get::<_, String>(i) {
str_val
} else if let Ok(str_val) = row.get::<_, Option<String>>(i) {
str_val.unwrap_or_default()
} else {
String::new()
};
row_data.push(value); row_data.push(value);
} }
Ok(row_data) Ok(row_data)
@ -397,37 +408,13 @@ impl BuildEventLog for SqliteBuildEventLog {
rows: result_rows, rows: result_rows,
}) })
} }
async fn get_latest_partition_status( async fn get_latest_partition_status(
&self, &self,
partition_ref: &str partition_ref: &str
) -> Result<Option<(PartitionStatus, i64)>> { ) -> Result<Option<(PartitionStatus, i64)>> {
let conn = self.connection.lock().unwrap(); match self.get_meaningful_partition_status(partition_ref).await? {
Some((status, timestamp, _build_request_id)) => Ok(Some((status, timestamp))),
let query = "SELECT pe.status, be.timestamp None => Ok(None),
FROM partition_events pe
JOIN build_events be ON pe.event_id = be.event_id
WHERE pe.partition_ref = ?1
ORDER BY be.timestamp DESC
LIMIT 1";
let mut stmt = conn.prepare(query)
.map_err(|e| BuildEventLogError::QueryError(e.to_string()))?;
let result = stmt.query_row([partition_ref], |row| {
let status_str: String = row.get(0)?;
let timestamp: i64 = row.get(1)?;
let status = status_str.parse::<i32>().unwrap_or(0);
Ok((status, timestamp))
});
match result {
Ok((status, timestamp)) => {
let partition_status = PartitionStatus::try_from(status).unwrap_or(PartitionStatus::PartitionUnknown);
Ok(Some((partition_status, timestamp)))
}
Err(rusqlite::Error::QueryReturnedNoRows) => Ok(None),
Err(e) => Err(BuildEventLogError::QueryError(e.to_string())),
} }
} }
@ -445,7 +432,7 @@ impl BuildEventLog for SqliteBuildEventLog {
FROM partition_events pe FROM partition_events pe
JOIN build_events be ON pe.event_id = be.event_id JOIN build_events be ON pe.event_id = be.event_id
WHERE pe.partition_ref = ?1 WHERE pe.partition_ref = ?1
AND pe.status IN ('2', '3') -- PARTITION_SCHEDULED or PARTITION_BUILDING AND pe.status IN ('2', '3') -- PARTITION_ANALYZED or PARTITION_BUILDING
AND be.build_request_id NOT IN ( AND be.build_request_id NOT IN (
SELECT DISTINCT be3.build_request_id SELECT DISTINCT be3.build_request_id
FROM build_request_events bre FROM build_request_events bre
@ -551,70 +538,60 @@ impl BuildEventLog for SqliteBuildEventLog {
offset: u32, offset: u32,
status_filter: Option<PartitionStatus>, status_filter: Option<PartitionStatus>,
) -> Result<(Vec<PartitionSummary>, u32)> { ) -> Result<(Vec<PartitionSummary>, u32)> {
// Get all unique partition refs first, ordered by most recent activity
let (total_count, partition_refs) = {
let conn = self.connection.lock().unwrap(); let conn = self.connection.lock().unwrap();
// Build query based on status filter let count_query = "SELECT COUNT(DISTINCT pe.partition_ref)
let (where_clause, count_where_clause) = match status_filter { FROM partition_events pe";
Some(_) => (" WHERE pe.status = ?1", " WHERE pe.status = ?1"), let total_count: u32 = conn.query_row(count_query, [], |row| row.get(0))
None => ("", ""),
};
let query = format!(
"SELECT pe.partition_ref, pe.status, MAX(be.timestamp) as updated_at, be.build_request_id
FROM build_events be
JOIN partition_events pe ON be.event_id = pe.event_id{}
GROUP BY pe.partition_ref
ORDER BY updated_at DESC
LIMIT {} OFFSET {}",
where_clause, limit, offset
);
let count_query = format!(
"SELECT COUNT(DISTINCT pe.partition_ref)
FROM build_events be
JOIN partition_events pe ON be.event_id = pe.event_id{}",
count_where_clause
);
// Execute count query first
let total_count: u32 = if let Some(status) = status_filter {
let status_str = format!("{:?}", status);
conn.query_row(&count_query, params![status_str], |row| row.get(0))
.map_err(|e| BuildEventLogError::QueryError(e.to_string()))?
} else {
conn.query_row(&count_query, [], |row| row.get(0))
.map_err(|e| BuildEventLogError::QueryError(e.to_string()))?
};
// Execute main query
let mut stmt = conn.prepare(&query)
.map_err(|e| BuildEventLogError::QueryError(e.to_string()))?; .map_err(|e| BuildEventLogError::QueryError(e.to_string()))?;
let row_mapper = |row: &Row| -> rusqlite::Result<PartitionSummary> { let refs_query = "SELECT DISTINCT pe.partition_ref
let status_str: String = row.get(1)?; FROM partition_events pe
let status = status_str.parse::<i32>() JOIN build_events be ON pe.event_id = be.event_id
.map(int_to_partition_status) GROUP BY pe.partition_ref
.unwrap_or(PartitionStatus::PartitionUnknown); ORDER BY MAX(be.timestamp) DESC
LIMIT ? OFFSET ?";
Ok(PartitionSummary { let mut stmt = conn.prepare(refs_query)
partition_ref: row.get(0)?, .map_err(|e| BuildEventLogError::QueryError(e.to_string()))?;
status,
updated_at: row.get(2)?, let rows = stmt.query_map([limit, offset], |row| {
build_request_id: Some(row.get(3)?), let partition_ref: String = row.get(0)?;
}) Ok(partition_ref)
}).map_err(|e| BuildEventLogError::QueryError(e.to_string()))?;
let mut partition_refs = Vec::new();
for row in rows {
partition_refs.push(row.map_err(|e| BuildEventLogError::QueryError(e.to_string()))?);
}
(total_count, partition_refs)
}; };
let rows = if let Some(status) = status_filter { // Get meaningful status for each partition using shared helper
let status_str = format!("{:?}", status);
stmt.query_map(params![status_str], row_mapper)
} else {
stmt.query_map([], row_mapper)
}.map_err(|e| BuildEventLogError::QueryError(e.to_string()))?;
let mut summaries = Vec::new(); let mut summaries = Vec::new();
for row in rows { for partition_ref in partition_refs {
summaries.push(row.map_err(|e| BuildEventLogError::QueryError(e.to_string()))?); if let Some((status, updated_at, build_request_id)) = self.get_meaningful_partition_status(&partition_ref).await? {
// Apply status filter if specified
if let Some(filter_status) = status_filter {
if status != filter_status {
continue;
} }
}
summaries.push(PartitionSummary {
partition_ref,
status,
updated_at,
build_request_id: Some(build_request_id),
});
}
}
// Sort by updated_at descending (most recent first)
summaries.sort_by(|a, b| b.updated_at.cmp(&a.updated_at));
Ok((summaries, total_count)) Ok((summaries, total_count))
} }
@ -742,4 +719,103 @@ impl BuildEventLog for SqliteBuildEventLog {
Ok(()) Ok(())
} }
async fn get_build_request_for_available_partition(
&self,
partition_ref: &str
) -> Result<Option<String>> {
let conn = self.connection.lock().unwrap();
// Find the most recent PARTITION_AVAILABLE event for this partition
let query = "SELECT be.build_request_id
FROM partition_events pe
JOIN build_events be ON pe.event_id = be.event_id
WHERE pe.partition_ref = ?1 AND pe.status = '4'
ORDER BY be.timestamp DESC
LIMIT 1";
let result = conn.query_row(query, [partition_ref], |row| {
let build_request_id: String = row.get(0)?;
Ok(build_request_id)
});
match result {
Ok(build_request_id) => Ok(Some(build_request_id)),
Err(rusqlite::Error::QueryReturnedNoRows) => Ok(None),
Err(e) => Err(BuildEventLogError::QueryError(e.to_string())),
}
}
}
impl SqliteBuildEventLog {
// Shared helper method to get the meaningful partition status for build coordination and display
// This implements the "delegation-friendly" logic: if a partition was ever available, it remains available
async fn get_meaningful_partition_status(
&self,
partition_ref: &str
) -> Result<Option<(PartitionStatus, i64, String)>> { // (status, timestamp, build_request_id)
let conn = self.connection.lock().unwrap();
// Check for ANY historical completion first - this is resilient to later events being added
let available_query = "SELECT pe.status, be.timestamp, be.build_request_id
FROM partition_events pe
JOIN build_events be ON pe.event_id = be.event_id
WHERE pe.partition_ref = ?1 AND pe.status = '4'
ORDER BY be.timestamp DESC
LIMIT 1";
let mut available_stmt = conn.prepare(available_query)
.map_err(|e| BuildEventLogError::QueryError(e.to_string()))?;
let available_result = available_stmt.query_row([partition_ref], |row| {
let status_str: String = row.get(0)?;
let timestamp: i64 = row.get(1)?;
let build_request_id: String = row.get(2)?;
let status = status_str.parse::<i32>()
.map_err(|_e| rusqlite::Error::InvalidColumnType(0, status_str.clone(), rusqlite::types::Type::Integer))?;
Ok((status, timestamp, build_request_id))
});
match available_result {
Ok((status, timestamp, build_request_id)) => {
let partition_status = PartitionStatus::try_from(status)
.map_err(|_| BuildEventLogError::QueryError(format!("Invalid partition status: {}", status)))?;
return Ok(Some((partition_status, timestamp, build_request_id)));
}
Err(rusqlite::Error::QueryReturnedNoRows) => {
// No available partition found, fall back to latest status
}
Err(e) => return Err(BuildEventLogError::QueryError(e.to_string())),
}
// Fall back to latest status if no available partition found
let latest_query = "SELECT pe.status, be.timestamp, be.build_request_id
FROM partition_events pe
JOIN build_events be ON pe.event_id = be.event_id
WHERE pe.partition_ref = ?1
ORDER BY be.timestamp DESC
LIMIT 1";
let mut latest_stmt = conn.prepare(latest_query)
.map_err(|e| BuildEventLogError::QueryError(e.to_string()))?;
let result = latest_stmt.query_row([partition_ref], |row| {
let status_str: String = row.get(0)?;
let timestamp: i64 = row.get(1)?;
let build_request_id: String = row.get(2)?;
let status = status_str.parse::<i32>()
.map_err(|_e| rusqlite::Error::InvalidColumnType(0, status_str.clone(), rusqlite::types::Type::Integer))?;
Ok((status, timestamp, build_request_id))
});
match result {
Ok((status, timestamp, build_request_id)) => {
let partition_status = PartitionStatus::try_from(status)
.map_err(|_| BuildEventLogError::QueryError(format!("Invalid partition status: {}", status)))?;
Ok(Some((partition_status, timestamp, build_request_id)))
}
Err(rusqlite::Error::QueryReturnedNoRows) => Ok(None),
Err(e) => Err(BuildEventLogError::QueryError(e.to_string())),
}
}
} }

View file

@ -126,4 +126,14 @@ impl BuildEventLog for StdoutBuildEventLog {
"Stdout build event log does not support querying".to_string() "Stdout build event log does not support querying".to_string()
)) ))
} }
async fn get_build_request_for_available_partition(
&self,
_partition_ref: &str
) -> Result<Option<String>> {
// Stdout implementation doesn't support querying
Err(BuildEventLogError::QueryError(
"Stdout build event log does not support querying".to_string()
))
}
} }

View file

@ -171,60 +171,17 @@ fn configure_parallel(job_refs: HashMap<String, Vec<String>>, num_workers: usize
Ok(all_tasks) Ok(all_tasks)
} }
// Check for non-stale partitions and delegation opportunities // Simple staleness check - all requested partitions need jobs created
// Delegation optimization happens in execution phase
async fn check_partition_staleness( async fn check_partition_staleness(
partition_refs: &[String], partition_refs: &[String],
event_log: &Box<dyn BuildEventLog>, _event_log: &Box<dyn BuildEventLog>,
build_request_id: &str _build_request_id: &str
) -> Result<(Vec<String>, Vec<String>), String> { ) -> Result<(Vec<String>, Vec<String>), String> {
let mut stale_partitions = Vec::new(); // Analysis phase creates jobs for all requested partitions
let mut delegated_partitions = Vec::new(); // Execution phase will handle delegation optimization
let stale_partitions = partition_refs.to_vec();
for partition_ref in partition_refs { let delegated_partitions = Vec::new();
// Check latest partition status
match event_log.get_latest_partition_status(partition_ref).await {
Ok(Some((PartitionStatus::PartitionAvailable, _timestamp))) => {
info!("Partition {} is already available, skipping", partition_ref);
// Could add more sophisticated staleness checking here based on timestamp
// For now, assume available partitions are fresh
delegated_partitions.push(partition_ref.clone());
}
Ok(Some((PartitionStatus::PartitionBuilding, _timestamp))) |
Ok(Some((PartitionStatus::PartitionScheduled, _timestamp))) => {
// Check if another build is actively working on this partition
match event_log.get_active_builds_for_partition(partition_ref).await {
Ok(active_builds) if !active_builds.is_empty() => {
info!("Partition {} is being built by another request: {:?}", partition_ref, active_builds);
// Log delegation event
for delegated_to_build_id in &active_builds {
let event = create_build_event(
build_request_id.to_string(),
crate::build_event::EventType::DelegationEvent(DelegationEvent {
partition_ref: Some(PartitionRef { str: partition_ref.clone() }),
delegated_to_build_request_id: delegated_to_build_id.clone(),
message: format!("Delegated to existing build"),
})
);
if let Err(e) = event_log.append_event(event).await {
error!("Failed to log delegation event: {}", e);
}
}
delegated_partitions.push(partition_ref.clone());
}
_ => {
// No active builds, consider it stale and needs rebuilding
stale_partitions.push(partition_ref.clone());
}
}
}
_ => {
// Partition not found or failed, needs to be built
stale_partitions.push(partition_ref.clone());
}
}
}
Ok((stale_partitions, delegated_partitions)) Ok((stale_partitions, delegated_partitions))
} }
@ -276,23 +233,7 @@ async fn plan(
unhandled_refs.insert(ref_str.clone()); unhandled_refs.insert(ref_str.clone());
} }
// Log partition scheduling events for stale partitions // Note: Partition analysis events will be logged after successful job graph creation
if let Some(ref event_log) = build_event_log {
for partition_ref in &stale_refs {
let event = create_build_event(
build_request_id.to_string(),
crate::build_event::EventType::PartitionEvent(PartitionEvent {
partition_ref: Some(PartitionRef { str: partition_ref.clone() }),
status: PartitionStatus::PartitionScheduled as i32,
message: "Partition scheduled for building".to_string(),
job_run_id: String::new(),
})
);
if let Err(e) = event_log.append_event(event).await {
error!("Failed to log partition scheduled event: {}", e);
}
}
}
let mut epoch = 0; let mut epoch = 0;
let mut nodes = Vec::new(); let mut nodes = Vec::new();
@ -379,35 +320,21 @@ async fn plan(
} }
} }
// Check if we have delegated partitions that explain why we have no nodes if !nodes.is_empty() {
let all_partitions_delegated = nodes.is_empty() &&
output_refs.iter().all(|ref_str| _delegated_refs.contains(ref_str));
if !nodes.is_empty() || all_partitions_delegated {
if all_partitions_delegated {
info!("Planning complete: all {} partitions delegated to other builds", output_refs.len());
} else {
info!("Planning complete: created graph with {} nodes for {} output refs", nodes.len(), output_refs.len()); info!("Planning complete: created graph with {} nodes for {} output refs", nodes.len(), output_refs.len());
}
// Log planning completion // Log analysis completion event
if let Some(ref event_log) = build_event_log { if let Some(ref event_log) = build_event_log {
let message = if all_partitions_delegated {
format!("Analysis completed - all {} partitions delegated to active builds", output_refs.len())
} else {
format!("Analysis completed with {} jobs", nodes.len())
};
let event = create_build_event( let event = create_build_event(
build_request_id.to_string(), build_request_id.to_string(),
crate::build_event::EventType::BuildRequestEvent(BuildRequestEvent { crate::build_event::EventType::BuildRequestEvent(BuildRequestEvent {
status: BuildRequestStatus::BuildRequestCompleted as i32, status: BuildRequestStatus::BuildRequestAnalysisCompleted as i32,
requested_partitions: output_refs.iter().map(|s| PartitionRef { str: s.clone() }).collect(), requested_partitions: output_refs.iter().map(|s| PartitionRef { str: s.clone() }).collect(),
message, message: format!("Analysis completed successfully, {} tasks planned", nodes.len()),
}) })
); );
if let Err(e) = event_log.append_event(event).await { if let Err(e) = event_log.append_event(event).await {
error!("Failed to log completion event: {}", e); error!("Failed to log analysis completion event: {}", e);
} }
} }
@ -618,43 +545,6 @@ async fn main() {
None None
}; };
// Check if this is a CLI build (not service-initiated)
let is_cli_mode = env::var("DATABUILD_CLI_MODE").is_ok();
// Emit orchestration events for CLI mode to match Service behavior
if is_cli_mode {
if let Some(ref event_log) = build_event_log {
// Emit "Build request received" event
let event = create_build_event(
build_request_id.clone(),
crate::build_event::EventType::BuildRequestEvent(BuildRequestEvent {
status: BuildRequestStatus::BuildRequestReceived as i32,
requested_partitions: args.iter()
.map(|p| PartitionRef { str: p.clone() })
.collect(),
message: "Build request received".to_string(),
})
);
if let Err(e) = event_log.append_event(event).await {
error!("Failed to log build request received event: {}", e);
}
// Emit "Starting build planning" event
let event = create_build_event(
build_request_id.clone(),
crate::build_event::EventType::BuildRequestEvent(BuildRequestEvent {
status: BuildRequestStatus::BuildRequestPlanning as i32,
requested_partitions: args.iter()
.map(|p| PartitionRef { str: p.clone() })
.collect(),
message: "Starting build planning".to_string(),
})
);
if let Err(e) = event_log.append_event(event).await {
error!("Failed to log build planning event: {}", e);
}
}
}
match mode.as_str() { match mode.as_str() {
"plan" => { "plan" => {

View file

@ -1,4 +1,4 @@
use databuild::{JobGraph, Task, JobStatus, BuildRequestStatus, PartitionStatus, BuildRequestEvent, JobEvent, PartitionEvent}; use databuild::{JobGraph, Task, JobStatus, BuildRequestStatus, PartitionStatus, BuildRequestEvent, JobEvent, PartitionEvent, PartitionRef};
use databuild::event_log::{create_build_event_log, create_build_event}; use databuild::event_log::{create_build_event_log, create_build_event};
use databuild::build_event::EventType; use databuild::build_event::EventType;
use crossbeam_channel::{Receiver, Sender}; use crossbeam_channel::{Receiver, Sender};
@ -263,14 +263,56 @@ fn is_task_ready(task: &Task, completed_outputs: &HashSet<String>) -> bool {
true true
} }
// Check if partitions are already being built by other build requests // Check if partitions are already available or being built by other build requests
async fn check_build_coordination( async fn check_build_coordination(
task: &Task, task: &Task,
event_log: &Box<dyn databuild::event_log::BuildEventLog>, event_log: &Box<dyn databuild::event_log::BuildEventLog>,
build_request_id: &str build_request_id: &str
) -> Result<bool, String> { ) -> Result<(bool, bool, Vec<(PartitionRef, String)>), String> {
for output_ref in &task.config.as_ref().unwrap().outputs { let outputs = &task.config.as_ref().unwrap().outputs;
// Check if this partition is already being built by another request let mut available_partitions = Vec::new();
let mut needs_building = false;
for output_ref in outputs {
debug!("Checking build coordination for partition: {}", output_ref.str);
// First check if this partition is already available
match event_log.get_latest_partition_status(&output_ref.str).await {
Ok(Some((status, _timestamp))) => {
debug!("Partition {} has status: {:?}", output_ref.str, status);
if status == databuild::PartitionStatus::PartitionAvailable {
// Get which build request created this partition
match event_log.get_build_request_for_available_partition(&output_ref.str).await {
Ok(Some(source_build_id)) => {
info!("Partition {} already available from build {}", output_ref.str, source_build_id);
available_partitions.push((output_ref.clone(), source_build_id));
continue;
}
Ok(None) => {
error!("Partition {} is available but no source build found - this indicates a bug in the event log implementation", output_ref.str);
return Err(format!("Available partition {} has no source build ID. This suggests the event log is missing required data.", output_ref.str));
}
Err(e) => {
error!("Failed to get source build for partition {}: {}", output_ref.str, e);
return Err(format!("Cannot determine source build for available partition {}: {}", output_ref.str, e));
}
}
} else {
debug!("Partition {} has non-available status {:?}, needs building", output_ref.str, status);
needs_building = true;
}
}
Ok(None) => {
debug!("Partition {} has no status, needs building", output_ref.str);
needs_building = true;
}
Err(e) => {
error!("Failed to check partition status for {}: {}", output_ref.str, e);
return Err(format!("Cannot check partition status: {}. Use a queryable event log (e.g., SQLite) for builds that need to check existing partitions.", e));
}
}
// Check if this partition is being built by another request
match event_log.get_active_builds_for_partition(&output_ref.str).await { match event_log.get_active_builds_for_partition(&output_ref.str).await {
Ok(active_builds) => { Ok(active_builds) => {
let other_builds: Vec<String> = active_builds.into_iter() let other_builds: Vec<String> = active_builds.into_iter()
@ -281,7 +323,7 @@ async fn check_build_coordination(
info!("Partition {} is already being built by other requests: {:?}. Delegating.", info!("Partition {} is already being built by other requests: {:?}. Delegating.",
output_ref.str, other_builds); output_ref.str, other_builds);
// Log delegation event // Log delegation event for active builds
for delegated_to_build_id in &other_builds { for delegated_to_build_id in &other_builds {
let event = create_build_event( let event = create_build_event(
build_request_id.to_string(), build_request_id.to_string(),
@ -296,17 +338,25 @@ async fn check_build_coordination(
} }
} }
return Ok(false); // Don't build this task, it's delegated return Ok((false, false, available_partitions)); // Don't build, delegated to active build
} }
} }
Err(e) => { Err(e) => {
warn!("Failed to check active builds for partition {}: {}", output_ref.str, e); error!("Failed to check active builds for partition {}: {}", output_ref.str, e);
// Continue with build on error to avoid blocking return Err(format!("Cannot check active builds: {}. Use a queryable event log (e.g., SQLite) for builds that need to check for concurrent execution.", e));
}
} }
} }
Ok(true) // Safe to build // If we reach here, this partition needs to be built
needs_building = true;
}
// Only skip the job if ALL partitions are already available
if !needs_building && available_partitions.len() == outputs.len() {
Ok((false, true, available_partitions)) // Don't build, skip due to all partitions available
} else {
Ok((true, false, available_partitions)) // Need to build (some partitions unavailable)
}
} }
fn log_status_summary( fn log_status_summary(
@ -338,7 +388,14 @@ fn log_status_summary(
#[tokio::main] #[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> { async fn main() -> Result<(), Box<dyn std::error::Error>> {
simple_logger::SimpleLogger::new().with_level(log::LevelFilter::Info).init()?; simple_logger::SimpleLogger::new()
.with_level(
std::env::var("RUST_LOG")
.unwrap_or_else(|_| "info".to_string())
.parse()
.unwrap_or(log::LevelFilter::Info)
)
.init()?;
// Get build event log configuration from environment variables // Get build event log configuration from environment variables
let build_event_log_uri = std::env::var("DATABUILD_BUILD_EVENT_LOG").ok(); let build_event_log_uri = std::env::var("DATABUILD_BUILD_EVENT_LOG").ok();
@ -367,26 +424,6 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
info!("Executing job graph with {} nodes", graph.nodes.len()); info!("Executing job graph with {} nodes", graph.nodes.len());
// Check if this is a CLI build (not service-initiated)
let is_cli_mode = std::env::var("DATABUILD_CLI_MODE").is_ok();
// Emit orchestration events for CLI mode to match Service behavior
if is_cli_mode {
if let Some(ref event_log) = build_event_log {
// Emit "Starting build execution" event (matches Service pattern)
let event = create_build_event(
build_request_id.clone(),
EventType::BuildRequestEvent(BuildRequestEvent {
status: BuildRequestStatus::BuildRequestExecuting as i32,
requested_partitions: graph.outputs.clone(),
message: "Starting build execution".to_string(),
})
);
if let Err(e) = event_log.append_event(event).await {
error!("Failed to log build execution event: {}", e);
}
}
}
// Log build request execution start (existing detailed event) // Log build request execution start (existing detailed event)
if let Some(ref event_log) = build_event_log { if let Some(ref event_log) = build_event_log {
@ -522,22 +559,63 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
if task_states.get(&task_key) == Some(&TaskState::Pending) { if task_states.get(&task_key) == Some(&TaskState::Pending) {
if is_task_ready(task_node, &completed_outputs) { if is_task_ready(task_node, &completed_outputs) {
// Check build coordination if event log is available // Check build coordination if event log is available
let should_build = if let Some(ref event_log) = build_event_log { let (should_build, is_skipped, available_partitions) = if let Some(ref event_log) = build_event_log {
match check_build_coordination(task_node, event_log, &build_request_id).await { match check_build_coordination(task_node, event_log, &build_request_id).await {
Ok(should_build) => should_build, Ok((should_build, is_skipped, available_partitions)) => (should_build, is_skipped, available_partitions),
Err(e) => { Err(e) => {
error!("Error checking build coordination for {}: {}", error!("Error checking build coordination for {}: {}",
task_node.job.as_ref().unwrap().label, e); task_node.job.as_ref().unwrap().label, e);
true // Default to building on error (true, false, Vec::<(PartitionRef, String)>::new()) // Default to building on error
} }
} }
} else { } else {
true // No event log, always build (true, false, Vec::<(PartitionRef, String)>::new()) // No event log, always build
}; };
if !should_build { if !should_build {
// Task delegated to another build, mark as succeeded if is_skipped {
info!("Task {} delegated to another build request", task_node.job.as_ref().unwrap().label); // Task skipped due to all partitions already available
info!("Task {} skipped - all target partitions already available", task_node.job.as_ref().unwrap().label);
// Log delegation events for each available partition
if let Some(ref event_log) = build_event_log {
for (partition_ref, source_build_id) in &available_partitions {
let delegation_event = create_build_event(
build_request_id.clone(),
EventType::DelegationEvent(databuild::DelegationEvent {
partition_ref: Some(partition_ref.clone()),
delegated_to_build_request_id: source_build_id.clone(),
message: "Delegated to historical build - partition already available".to_string(),
})
);
if let Err(e) = event_log.append_event(delegation_event).await {
error!("Failed to log historical delegation event: {}", e);
}
}
// Log JOB_SKIPPED event
let job_run_id = Uuid::new_v4().to_string();
let job_event = create_build_event(
build_request_id.clone(),
EventType::JobEvent(JobEvent {
job_run_id: job_run_id.clone(),
job_label: task_node.job.clone(),
target_partitions: task_node.config.as_ref().unwrap().outputs.clone(),
status: JobStatus::JobSkipped as i32,
message: "Job skipped - all target partitions already available".to_string(),
config: task_node.config.clone(),
manifests: vec![],
})
);
if let Err(e) = event_log.append_event(job_event).await {
error!("Failed to log job skipped event: {}", e);
}
}
} else {
// Task delegated to active build
info!("Task {} delegated to active build request", task_node.job.as_ref().unwrap().label);
}
task_states.insert(task_key.clone(), TaskState::Succeeded); task_states.insert(task_key.clone(), TaskState::Succeeded);
// Mark outputs as completed // Mark outputs as completed
@ -669,34 +747,6 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
info!("Execution complete: {} succeeded, {} failed", success_count, failure_count); info!("Execution complete: {} succeeded, {} failed", success_count, failure_count);
// Emit orchestration completion events for CLI mode to match Service behavior
if is_cli_mode {
if let Some(ref event_log) = build_event_log {
let orchestration_message = if failure_count > 0 || fail_fast_triggered {
"Build request failed"
} else {
"Build request completed successfully"
};
let orchestration_status = if failure_count > 0 || fail_fast_triggered {
BuildRequestStatus::BuildRequestFailed
} else {
BuildRequestStatus::BuildRequestCompleted
};
let event = create_build_event(
build_request_id.clone(),
EventType::BuildRequestEvent(BuildRequestEvent {
status: orchestration_status as i32,
requested_partitions: graph.outputs.clone(),
message: orchestration_message.to_string(),
})
);
if let Err(e) = event_log.append_event(event).await {
error!("Failed to log build request completion event: {}", e);
}
}
}
// Log final build request status (existing detailed event) // Log final build request status (existing detailed event)
if let Some(ref event_log) = build_event_log { if let Some(ref event_log) = build_event_log {

View file

@ -4,8 +4,14 @@ include!("databuild.rs");
// Event log module // Event log module
pub mod event_log; pub mod event_log;
// Orchestration module
pub mod orchestration;
// Service module // Service module
pub mod service; pub mod service;
// Re-export commonly used types from event_log // Re-export commonly used types from event_log
pub use event_log::{BuildEventLog, BuildEventLogError, create_build_event_log}; pub use event_log::{BuildEventLog, BuildEventLogError, create_build_event_log};
// Re-export orchestration types
pub use orchestration::{BuildOrchestrator, BuildResult, OrchestrationError};

View file

@ -0,0 +1,15 @@
use crate::event_log::BuildEventLogError;
#[derive(Debug, thiserror::Error)]
pub enum OrchestrationError {
#[error("Event log error: {0}")]
EventLog(#[from] BuildEventLogError),
#[error("Build coordination error: {0}")]
Coordination(String),
#[error("Invalid build state transition: {current} -> {requested}")]
InvalidStateTransition { current: String, requested: String },
}
pub type Result<T> = std::result::Result<T, OrchestrationError>;

View file

@ -0,0 +1,146 @@
use crate::*;
use crate::event_log::{create_build_event, current_timestamp_nanos, generate_event_id};
/// Helper functions for creating standardized build events
pub fn create_build_request_received_event(
build_request_id: String,
requested_partitions: Vec<PartitionRef>,
) -> BuildEvent {
create_build_event(
build_request_id,
build_event::EventType::BuildRequestEvent(BuildRequestEvent {
status: BuildRequestStatus::BuildRequestReceived as i32,
requested_partitions,
message: "Build request received".to_string(),
}),
)
}
pub fn create_build_planning_started_event(
build_request_id: String,
) -> BuildEvent {
create_build_event(
build_request_id,
build_event::EventType::BuildRequestEvent(BuildRequestEvent {
status: BuildRequestStatus::BuildRequestPlanning as i32,
requested_partitions: vec![],
message: "Starting build planning".to_string(),
}),
)
}
pub fn create_build_execution_started_event(
build_request_id: String,
) -> BuildEvent {
create_build_event(
build_request_id,
build_event::EventType::BuildRequestEvent(BuildRequestEvent {
status: BuildRequestStatus::BuildRequestExecuting as i32,
requested_partitions: vec![],
message: "Starting build execution".to_string(),
}),
)
}
pub fn create_build_completed_event(
build_request_id: String,
result: &super::BuildResult,
) -> BuildEvent {
let message = match result {
super::BuildResult::Success { jobs_completed } => {
format!("Build completed successfully with {} jobs", jobs_completed)
}
super::BuildResult::Failed { jobs_completed, jobs_failed } => {
format!("Build failed: {} jobs completed, {} jobs failed", jobs_completed, jobs_failed)
}
super::BuildResult::FailFast { trigger_job } => {
format!("Build failed fast due to job: {}", trigger_job)
}
};
let status = match result {
super::BuildResult::Success { .. } => BuildRequestStatus::BuildRequestCompleted,
super::BuildResult::Failed { .. } | super::BuildResult::FailFast { .. } => BuildRequestStatus::BuildRequestFailed,
};
create_build_event(
build_request_id,
build_event::EventType::BuildRequestEvent(BuildRequestEvent {
status: status as i32,
requested_partitions: vec![],
message,
}),
)
}
pub fn create_analysis_completed_event(
build_request_id: String,
requested_partitions: Vec<PartitionRef>,
task_count: usize,
) -> BuildEvent {
create_build_event(
build_request_id,
build_event::EventType::BuildRequestEvent(BuildRequestEvent {
status: BuildRequestStatus::BuildRequestAnalysisCompleted as i32,
requested_partitions,
message: format!("Analysis completed successfully, {} tasks planned", task_count),
}),
)
}
pub fn create_job_scheduled_event(
build_request_id: String,
job_event: &JobEvent,
) -> BuildEvent {
BuildEvent {
event_id: generate_event_id(),
timestamp: current_timestamp_nanos(),
build_request_id,
event_type: Some(build_event::EventType::JobEvent(job_event.clone())),
}
}
pub fn create_job_completed_event(
build_request_id: String,
job_event: &JobEvent,
) -> BuildEvent {
BuildEvent {
event_id: generate_event_id(),
timestamp: current_timestamp_nanos(),
build_request_id,
event_type: Some(build_event::EventType::JobEvent(job_event.clone())),
}
}
pub fn create_partition_available_event(
build_request_id: String,
partition_event: &PartitionEvent,
) -> BuildEvent {
BuildEvent {
event_id: generate_event_id(),
timestamp: current_timestamp_nanos(),
build_request_id,
event_type: Some(build_event::EventType::PartitionEvent(partition_event.clone())),
}
}
/// Build a delegation `BuildEvent` recording that `partition_ref` was handed
/// off to the build identified by `target_build`.
pub fn create_delegation_event(
    build_request_id: String,
    partition_ref: &str,
    target_build: &str,
    message: &str,
) -> BuildEvent {
    let delegation = DelegationEvent {
        partition_ref: Some(PartitionRef {
            str: partition_ref.to_string(),
        }),
        delegated_to_build_request_id: target_build.to_string(),
        message: message.to_string(),
    };
    create_build_event(
        build_request_id,
        build_event::EventType::DelegationEvent(delegation),
    )
}

View file

@ -0,0 +1,375 @@
use crate::*;
use crate::event_log::BuildEventLog;
use log::info;
use std::sync::Arc;
pub mod error;
pub mod events;
pub use error::{OrchestrationError, Result};
/// Result of a build execution
#[derive(Debug, Clone)]
pub enum BuildResult {
    /// Every job finished successfully; `jobs_completed` is the success count.
    Success { jobs_completed: usize },
    /// The build ran to the end but some jobs failed.
    Failed { jobs_completed: usize, jobs_failed: usize },
    /// The build aborted early; `trigger_job` names the job that triggered the abort.
    FailFast { trigger_job: String },
}
/// Core orchestrator for managing build lifecycle and event emission
pub struct BuildOrchestrator {
    // Sink that every lifecycle event is appended to.
    event_log: Arc<dyn BuildEventLog>,
    // Identifier stamped on every event this orchestrator emits.
    build_request_id: String,
    // Partitions the caller asked this build to produce.
    requested_partitions: Vec<PartitionRef>,
}
impl BuildOrchestrator {
/// Create a new build orchestrator
pub fn new(
event_log: Arc<dyn BuildEventLog>,
build_request_id: String,
requested_partitions: Vec<PartitionRef>,
) -> Self {
Self {
event_log,
build_request_id,
requested_partitions,
}
}
/// Get the build request ID
pub fn build_request_id(&self) -> &str {
&self.build_request_id
}
/// Get the requested partitions
pub fn requested_partitions(&self) -> &[PartitionRef] {
&self.requested_partitions
}
/// Emit build request received event and start the build lifecycle
pub async fn start_build(&self) -> Result<()> {
info!("Starting build for request: {}", self.build_request_id);
let event = events::create_build_request_received_event(
self.build_request_id.clone(),
self.requested_partitions.clone(),
);
self.event_log.append_event(event).await
.map_err(OrchestrationError::EventLog)?;
Ok(())
}
/// Emit build planning started event
pub async fn start_planning(&self) -> Result<()> {
info!("Starting build planning for request: {}", self.build_request_id);
let event = events::create_build_planning_started_event(
self.build_request_id.clone(),
);
self.event_log.append_event(event).await
.map_err(OrchestrationError::EventLog)?;
Ok(())
}
/// Emit build execution started event
pub async fn start_execution(&self) -> Result<()> {
info!("Starting build execution for request: {}", self.build_request_id);
let event = events::create_build_execution_started_event(
self.build_request_id.clone(),
);
self.event_log.append_event(event).await
.map_err(OrchestrationError::EventLog)?;
Ok(())
}
/// Emit build completion event
pub async fn complete_build(&self, result: BuildResult) -> Result<()> {
info!("Completing build for request: {} with result: {:?}",
self.build_request_id, result);
let event = events::create_build_completed_event(
self.build_request_id.clone(),
&result,
);
self.event_log.append_event(event).await
.map_err(OrchestrationError::EventLog)?;
Ok(())
}
/// Emit analysis completed event
pub async fn emit_analysis_completed(&self, task_count: usize) -> Result<()> {
let event = events::create_analysis_completed_event(
self.build_request_id.clone(),
self.requested_partitions.clone(),
task_count,
);
self.event_log.append_event(event).await
.map_err(OrchestrationError::EventLog)?;
Ok(())
}
/// Emit job scheduled event
pub async fn emit_job_scheduled(&self, job: &JobEvent) -> Result<()> {
let event = events::create_job_scheduled_event(
self.build_request_id.clone(),
job,
);
self.event_log.append_event(event).await
.map_err(OrchestrationError::EventLog)?;
Ok(())
}
/// Emit job completed event
pub async fn emit_job_completed(&self, job: &JobEvent) -> Result<()> {
let event = events::create_job_completed_event(
self.build_request_id.clone(),
job,
);
self.event_log.append_event(event).await
.map_err(OrchestrationError::EventLog)?;
Ok(())
}
/// Emit partition available event
pub async fn emit_partition_available(&self, partition: &PartitionEvent) -> Result<()> {
let event = events::create_partition_available_event(
self.build_request_id.clone(),
partition,
);
self.event_log.append_event(event).await
.map_err(OrchestrationError::EventLog)?;
Ok(())
}
/// Emit delegation event
pub async fn emit_delegation(
&self,
partition_ref: &str,
target_build: &str,
message: &str,
) -> Result<()> {
let event = events::create_delegation_event(
self.build_request_id.clone(),
partition_ref,
target_build,
message,
);
self.event_log.append_event(event).await
.map_err(OrchestrationError::EventLog)?;
Ok(())
}
/// Get reference to the event log for direct access if needed
pub fn event_log(&self) -> &dyn BuildEventLog {
self.event_log.as_ref()
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use async_trait::async_trait;
    use std::sync::{Arc, Mutex};

    /// Mock event log for testing that captures events
    struct MockEventLog {
        // Shared with the handle returned by `new()` so tests can inspect
        // captured events after the log has been moved into the orchestrator.
        events: Arc<Mutex<Vec<BuildEvent>>>,
    }

    impl MockEventLog {
        // Returns the mock plus a second handle to its captured-event buffer.
        fn new() -> (Self, Arc<Mutex<Vec<BuildEvent>>>) {
            let events = Arc::new(Mutex::new(Vec::new()));
            let log = Self {
                events: events.clone(),
            };
            (log, events)
        }
    }

    // Only `append_event` records anything; the remaining trait methods are
    // inert stubs returning empty/None values, since these tests only
    // exercise event emission.
    #[async_trait]
    impl BuildEventLog for MockEventLog {
        async fn append_event(&self, event: BuildEvent) -> crate::event_log::Result<()> {
            self.events.lock().unwrap().push(event);
            Ok(())
        }

        // Returns every captured event regardless of the requested build id.
        async fn get_build_request_events(
            &self,
            _build_request_id: &str,
            _since: Option<i64>,
        ) -> crate::event_log::Result<Vec<BuildEvent>> {
            Ok(self.events.lock().unwrap().clone())
        }

        async fn get_partition_events(
            &self,
            _partition_ref: &str,
            _since: Option<i64>,
        ) -> crate::event_log::Result<Vec<BuildEvent>> {
            Ok(vec![])
        }

        async fn get_job_run_events(
            &self,
            _job_run_id: &str,
        ) -> crate::event_log::Result<Vec<BuildEvent>> {
            Ok(vec![])
        }

        async fn get_events_in_range(
            &self,
            _start_time: i64,
            _end_time: i64,
        ) -> crate::event_log::Result<Vec<BuildEvent>> {
            Ok(vec![])
        }

        // Stub: returns an empty result set for any query.
        async fn execute_query(&self, _query: &str) -> crate::event_log::Result<crate::event_log::QueryResult> {
            Ok(crate::event_log::QueryResult {
                columns: vec![],
                rows: vec![],
            })
        }

        async fn get_latest_partition_status(
            &self,
            _partition_ref: &str,
        ) -> crate::event_log::Result<Option<(PartitionStatus, i64)>> {
            Ok(None)
        }

        async fn get_active_builds_for_partition(
            &self,
            _partition_ref: &str,
        ) -> crate::event_log::Result<Vec<String>> {
            Ok(vec![])
        }

        async fn initialize(&self) -> crate::event_log::Result<()> {
            Ok(())
        }

        async fn list_build_requests(
            &self,
            _limit: u32,
            _offset: u32,
            _status_filter: Option<BuildRequestStatus>,
        ) -> crate::event_log::Result<(Vec<crate::event_log::BuildRequestSummary>, u32)> {
            Ok((vec![], 0))
        }

        async fn list_recent_partitions(
            &self,
            _limit: u32,
            _offset: u32,
            _status_filter: Option<PartitionStatus>,
        ) -> crate::event_log::Result<(Vec<crate::event_log::PartitionSummary>, u32)> {
            Ok((vec![], 0))
        }

        async fn get_activity_summary(&self) -> crate::event_log::Result<crate::event_log::ActivitySummary> {
            Ok(crate::event_log::ActivitySummary {
                active_builds_count: 0,
                recent_builds: vec![],
                recent_partitions: vec![],
                total_partitions_count: 0,
            })
        }

        async fn get_build_request_for_available_partition(
            &self,
            _partition_ref: &str,
        ) -> crate::event_log::Result<Option<String>> {
            Ok(None)
        }
    }

    // Drives the full start -> plan -> execute -> complete lifecycle and
    // verifies one event is emitted per phase, all tagged with the build id.
    #[tokio::test]
    async fn test_build_lifecycle_events() {
        let (mock_log, events) = MockEventLog::new();
        let partitions = vec![PartitionRef { str: "test/partition".to_string() }];

        let orchestrator = BuildOrchestrator::new(
            Arc::new(mock_log),
            "test-build-123".to_string(),
            partitions.clone(),
        );

        // Test full build lifecycle
        orchestrator.start_build().await.unwrap();
        orchestrator.start_planning().await.unwrap();
        orchestrator.start_execution().await.unwrap();
        orchestrator.complete_build(BuildResult::Success { jobs_completed: 5 }).await.unwrap();

        let emitted_events = events.lock().unwrap();
        assert_eq!(emitted_events.len(), 4);

        // Verify event types and build request IDs
        for event in emitted_events.iter() {
            assert_eq!(event.build_request_id, "test-build-123");
        }

        // Verify first event is build request received
        if let Some(build_event::EventType::BuildRequestEvent(br_event)) = &emitted_events[0].event_type {
            assert_eq!(br_event.status, BuildRequestStatus::BuildRequestReceived as i32);
            assert_eq!(br_event.requested_partitions, partitions);
        } else {
            panic!("First event should be BuildRequestEvent");
        }
    }

    // Verifies analysis-completed and job-scheduled emission paths stamp the
    // correct build request id on every event.
    #[tokio::test]
    async fn test_partition_and_job_events() {
        let (mock_log, events) = MockEventLog::new();
        let orchestrator = BuildOrchestrator::new(
            Arc::new(mock_log),
            "test-build-456".to_string(),
            vec![],
        );

        // Test analysis completed event
        orchestrator.emit_analysis_completed(3).await.unwrap();

        // Test job event
        let partition = PartitionRef { str: "data/users".to_string() };
        let job_event = JobEvent {
            job_run_id: "job-run-123".to_string(),
            job_label: Some(JobLabel { label: "//:test_job".to_string() }),
            target_partitions: vec![partition.clone()],
            status: JobStatus::JobScheduled as i32,
            message: "Job scheduled".to_string(),
            config: None,
            manifests: vec![],
        };
        orchestrator.emit_job_scheduled(&job_event).await.unwrap();

        let emitted_events = events.lock().unwrap();
        assert_eq!(emitted_events.len(), 2);

        // All events should have the correct build request ID
        for event in emitted_events.iter() {
            assert_eq!(event.build_request_id, "test-build-456");
        }
    }
}

View file

@ -266,9 +266,10 @@ def databuild_graph(name, jobs, lookup, visibility = None):
) )
_databuild_graph_build( _databuild_graph_build(
name = "%s.build" % name, name = "%s.build" % name,
analyze = "%s.analyze" % name, cli_wrapper = "@databuild//databuild/cli:databuild_cli",
exec = "%s.exec" % name,
jobs = jobs, jobs = jobs,
lookup = "%s.lookup" % name,
graph_label = "//%s:%s" % (native.package_name(), name),
visibility = visibility, visibility = visibility,
) )
# Build deployment targets (renamed for hierarchical namespacing) # Build deployment targets (renamed for hierarchical namespacing)
@ -629,26 +630,87 @@ DataBuildGraphInfo = provider(
) )
def _databuild_graph_build_impl(ctx): def _databuild_graph_build_impl(ctx):
"""Wraps the analyze and execute targets in a shell script.""" """Wraps the DataBuild CLI wrapper in a shell script."""
script = ctx.actions.declare_file(ctx.label.name) script = ctx.actions.declare_file(ctx.label.name)
# Build DATABUILD_CANDIDATE_JOBS JSON string with runtime rlocation resolution
candidate_jobs_script_lines = ["CANDIDATE_JOBS_JSON=\"{\""]
for i, job in enumerate(ctx.attr.jobs):
job_label = "//" + job.label.package + ":" + job.label.name
configure_path = job[DataBuildJobInfo].configure.files_to_run.executable.short_path
separator = "," if i < len(ctx.attr.jobs) - 1 else ""
candidate_jobs_script_lines.append(
'CANDIDATE_JOBS_JSON="${CANDIDATE_JOBS_JSON}\\"%s\\":\\"$(rlocation _main/%s)\\"%s"' % (
job_label, configure_path, separator
)
)
candidate_jobs_script_lines.append('CANDIDATE_JOBS_JSON="${CANDIDATE_JOBS_JSON}}"')
candidate_jobs_script = "\n".join(candidate_jobs_script_lines)
script_content = RUNFILES_PREFIX + """
# Build DATABUILD_CANDIDATE_JOBS dynamically with proper rlocation resolution
%s
export DATABUILD_CANDIDATE_JOBS="$CANDIDATE_JOBS_JSON"
export DATABUILD_JOB_LOOKUP_PATH="$(rlocation _main/%s)"
export DATABUILD_GRAPH_LABEL="%s"
# Generate a single build request ID for the entire CLI operation
export DATABUILD_BUILD_REQUEST_ID=$(python3 -c "import uuid; print(uuid.uuid4())")
# Run unified DataBuild CLI wrapper
"$(rlocation databuild+/databuild/cli/databuild_cli)" "$@"
""" % (
candidate_jobs_script,
ctx.attr.lookup.files_to_run.executable.short_path,
ctx.attr.graph_label
)
ctx.actions.write( ctx.actions.write(
output = script, output = script,
is_executable = True, is_executable = True,
content = "#!/bin/bash\n\n" + RUNFILES_PREFIX + """ content = script_content,
# Set CLI mode to indicate this is a direct CLI build (not service-initiated)
export DATABUILD_CLI_MODE=true
# Run analysis and execution phases
$(rlocation _main/{analyze_path}) $@ | $(rlocation _main/{exec_path})
""".format(
analyze_path = ctx.attr.analyze.files_to_run.executable.short_path,
exec_path = ctx.attr.exec.files_to_run.executable.short_path,
),
) )
# Gather the configure and execute executables
configure_executables = [
job[DataBuildJobInfo].configure.files_to_run.executable
for job in ctx.attr.jobs
]
# Get the execute targets - these are the .exec files that need to be in runfiles
execute_executables = []
for job in ctx.attr.jobs:
# The job target itself contains references to both configure and execute
# We need to find the .exec target for each job
job_name = job.label.name
exec_target_name = job_name + ".exec"
# Find the .exec target in the same package
for attr_name in dir(job):
if attr_name.endswith("_exec") or exec_target_name in attr_name:
exec_target = getattr(job, attr_name, None)
if exec_target and hasattr(exec_target, "files_to_run"):
execute_executables.append(exec_target.files_to_run.executable)
break
# Also check if we can access exec targets directly from job dependencies
all_job_files = []
for job in ctx.attr.jobs:
if hasattr(job, "default_runfiles") and job.default_runfiles:
all_job_files.extend(job.default_runfiles.files.to_list())
runfiles = ctx.runfiles( runfiles = ctx.runfiles(
files = [ctx.executable.analyze, ctx.executable.exec], files = [ctx.executable.cli_wrapper, ctx.executable.lookup] + configure_executables + execute_executables + all_job_files,
).merge(ctx.attr.analyze.default_runfiles).merge(ctx.attr.exec.default_runfiles) ).merge(ctx.attr.cli_wrapper.default_runfiles).merge(ctx.attr.lookup.default_runfiles).merge(
ctx.attr._bash_runfiles.default_runfiles
)
# Merge runfiles from all configure targets and job targets
for job in ctx.attr.jobs:
configure_target = job[DataBuildJobInfo].configure
runfiles = runfiles.merge(configure_target.default_runfiles)
# Also merge the job's own runfiles which should include the .exec target
runfiles = runfiles.merge(job.default_runfiles)
return [ return [
DefaultInfo( DefaultInfo(
@ -656,8 +718,8 @@ $(rlocation _main/{analyze_path}) $@ | $(rlocation _main/{exec_path})
runfiles = runfiles, runfiles = runfiles,
), ),
DataBuildGraphInfo( DataBuildGraphInfo(
analyze = ctx.attr.analyze, analyze = ctx.attr.cli_wrapper,
exec = ctx.attr.exec, exec = ctx.attr.cli_wrapper,
jobs = ctx.attr.jobs, jobs = ctx.attr.jobs,
), ),
] ]
@ -665,14 +727,8 @@ $(rlocation _main/{analyze_path}) $@ | $(rlocation _main/{exec_path})
_databuild_graph_build = rule( _databuild_graph_build = rule(
implementation = _databuild_graph_build_impl, implementation = _databuild_graph_build_impl,
attrs = { attrs = {
"analyze": attr.label( "cli_wrapper": attr.label(
doc = "Target that implements the graph analysis logic", doc = "Target that implements the unified DataBuild CLI",
mandatory = True,
executable = True,
cfg = "target",
),
"exec": attr.label(
doc = "Target that implements the graph execution logic",
mandatory = True, mandatory = True,
executable = True, executable = True,
cfg = "target", cfg = "target",
@ -681,6 +737,20 @@ _databuild_graph_build = rule(
doc = "The list of jobs that are candidates for building partitions in this databuild graph", doc = "The list of jobs that are candidates for building partitions in this databuild graph",
allow_empty = False, allow_empty = False,
), ),
"lookup": attr.label(
doc = "Target that implements job lookup for desired partition refs",
mandatory = True,
executable = True,
cfg = "target",
),
"graph_label": attr.string(
doc = "The label of this graph for identification",
mandatory = True,
),
"_bash_runfiles": attr.label(
default = Label("@bazel_tools//tools/bash/runfiles"),
allow_files = True,
),
}, },
executable = True, executable = True,
) )

View file

@ -1,5 +1,6 @@
use super::*; use super::*;
use crate::event_log::{current_timestamp_nanos, create_build_event}; use crate::event_log::{current_timestamp_nanos, create_build_event};
use crate::orchestration::{BuildOrchestrator, BuildResult};
use axum::{ use axum::{
extract::{Path, State}, extract::{Path, State},
http::StatusCode, http::StatusCode,
@ -11,6 +12,42 @@ use schemars::JsonSchema;
use std::process::Command; use std::process::Command;
use std::env; use std::env;
// Simple base64 URL-safe decoding function for job labels.
// Accepts unpadded input (padding is restored before decoding). Characters
// outside the base64 alphabet are skipped rather than rejected, so malformed
// input may decode to a partial string; only invalid UTF-8 yields an error.
fn base64_url_decode(encoded: &str) -> Result<String, Box<dyn std::error::Error>> {
    const ALPHABET: &str = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

    // Map the URL-safe symbols onto the standard alphabet, then restore the
    // '=' padding implied by the input length.
    let mut normalized = encoded.replace('-', "+").replace('_', "/");
    let padding = match normalized.len() % 4 {
        2 => "==",
        3 => "=",
        _ => "",
    };
    normalized.push_str(padding);

    // Accumulate 6-bit symbol values; emit one byte whenever at least 8 bits
    // are buffered.
    let mut bytes = Vec::new();
    let mut acc = 0u32;
    let mut acc_bits = 0;
    for ch in normalized.chars() {
        if ch == '=' {
            break;
        }
        if let Some(value) = ALPHABET.find(ch) {
            acc = (acc << 6) | (value as u32);
            acc_bits += 6;
            if acc_bits >= 8 {
                acc_bits -= 8;
                bytes.push(((acc >> acc_bits) & 0xFF) as u8);
            }
        }
    }

    String::from_utf8(bytes).map_err(Into::into)
}
pub async fn submit_build_request( pub async fn submit_build_request(
State(service): State<ServiceState>, State(service): State<ServiceState>,
Json(request): Json<BuildRequest>, Json(request): Json<BuildRequest>,
@ -35,19 +72,18 @@ pub async fn submit_build_request(
active_builds.insert(build_request_id.clone(), build_state); active_builds.insert(build_request_id.clone(), build_state);
} }
// Log build request received event // Create orchestrator and emit build request received event
let event = create_build_event( let requested_partitions: Vec<PartitionRef> = request.partitions.iter()
build_request_id.clone(),
crate::build_event::EventType::BuildRequestEvent(BuildRequestEvent {
status: BuildRequestStatus::BuildRequestReceived as i32,
requested_partitions: request.partitions.iter()
.map(|p| PartitionRef { str: p.clone() }) .map(|p| PartitionRef { str: p.clone() })
.collect(), .collect();
message: "Build request received".to_string(),
}), let orchestrator = BuildOrchestrator::new(
service.event_log.clone(),
build_request_id.clone(),
requested_partitions,
); );
if let Err(e) = service.event_log.append_event(event).await { if let Err(e) = orchestrator.start_build().await {
error!("Failed to log build request received event: {}", e); error!("Failed to log build request received event: {}", e);
return Err(( return Err((
StatusCode::INTERNAL_SERVER_ERROR, StatusCode::INTERNAL_SERVER_ERROR,
@ -138,6 +174,7 @@ pub async fn get_build_status(
0 => BuildRequestStatus::BuildRequestUnknown, // Default protobuf value - should not happen in production 0 => BuildRequestStatus::BuildRequestUnknown, // Default protobuf value - should not happen in production
1 => BuildRequestStatus::BuildRequestReceived, 1 => BuildRequestStatus::BuildRequestReceived,
2 => BuildRequestStatus::BuildRequestPlanning, 2 => BuildRequestStatus::BuildRequestPlanning,
7 => BuildRequestStatus::BuildRequestAnalysisCompleted,
3 => BuildRequestStatus::BuildRequestExecuting, 3 => BuildRequestStatus::BuildRequestExecuting,
4 => BuildRequestStatus::BuildRequestCompleted, 4 => BuildRequestStatus::BuildRequestCompleted,
5 => BuildRequestStatus::BuildRequestFailed, 5 => BuildRequestStatus::BuildRequestFailed,
@ -394,22 +431,22 @@ async fn execute_build_request(
) -> Result<(), String> { ) -> Result<(), String> {
info!("Starting build execution for request {}", build_request_id); info!("Starting build execution for request {}", build_request_id);
// Create orchestrator for this build request
let requested_partitions: Vec<PartitionRef> = partitions.iter()
.map(|p| PartitionRef { str: p.clone() })
.collect();
let orchestrator = BuildOrchestrator::new(
service.event_log.clone(),
build_request_id.clone(),
requested_partitions,
);
// Update status to planning // Update status to planning
update_build_request_status(&service, &build_request_id, BuildRequestStatus::BuildRequestPlanning).await; update_build_request_status(&service, &build_request_id, BuildRequestStatus::BuildRequestPlanning).await;
// Log planning event // Log planning event
let event = create_build_event( if let Err(e) = orchestrator.start_planning().await {
build_request_id.clone(),
crate::build_event::EventType::BuildRequestEvent(BuildRequestEvent {
status: BuildRequestStatus::BuildRequestPlanning as i32,
requested_partitions: partitions.iter()
.map(|p| PartitionRef { str: p.clone() })
.collect(),
message: "Starting build planning".to_string(),
}),
);
if let Err(e) = service.event_log.append_event(event).await {
error!("Failed to log planning event: {}", e); error!("Failed to log planning event: {}", e);
} }
@ -419,6 +456,12 @@ async fn execute_build_request(
Err(e) => { Err(e) => {
error!("Failed to analyze build graph: {}", e); error!("Failed to analyze build graph: {}", e);
update_build_request_status(&service, &build_request_id, BuildRequestStatus::BuildRequestFailed).await; update_build_request_status(&service, &build_request_id, BuildRequestStatus::BuildRequestFailed).await;
// Log failure event
if let Err(log_err) = orchestrator.complete_build(BuildResult::Failed { jobs_completed: 0, jobs_failed: 1 }).await {
error!("Failed to log failure event: {}", log_err);
}
return Err(e); return Err(e);
} }
}; };
@ -427,18 +470,7 @@ async fn execute_build_request(
update_build_request_status(&service, &build_request_id, BuildRequestStatus::BuildRequestExecuting).await; update_build_request_status(&service, &build_request_id, BuildRequestStatus::BuildRequestExecuting).await;
// Log executing event // Log executing event
let event = create_build_event( if let Err(e) = orchestrator.start_execution().await {
build_request_id.clone(),
crate::build_event::EventType::BuildRequestEvent(BuildRequestEvent {
status: BuildRequestStatus::BuildRequestExecuting as i32,
requested_partitions: partitions.iter()
.map(|p| PartitionRef { str: p.clone() })
.collect(),
message: "Starting build execution".to_string(),
}),
);
if let Err(e) = service.event_log.append_event(event).await {
error!("Failed to log executing event: {}", e); error!("Failed to log executing event: {}", e);
} }
@ -449,18 +481,7 @@ async fn execute_build_request(
update_build_request_status(&service, &build_request_id, BuildRequestStatus::BuildRequestCompleted).await; update_build_request_status(&service, &build_request_id, BuildRequestStatus::BuildRequestCompleted).await;
// Log completion event // Log completion event
let event = create_build_event( if let Err(e) = orchestrator.complete_build(BuildResult::Success { jobs_completed: 0 }).await {
build_request_id.clone(),
crate::build_event::EventType::BuildRequestEvent(BuildRequestEvent {
status: BuildRequestStatus::BuildRequestCompleted as i32,
requested_partitions: partitions.iter()
.map(|p| PartitionRef { str: p.clone() })
.collect(),
message: "Build request completed successfully".to_string(),
}),
);
if let Err(e) = service.event_log.append_event(event).await {
error!("Failed to log completion event: {}", e); error!("Failed to log completion event: {}", e);
} }
@ -471,19 +492,8 @@ async fn execute_build_request(
update_build_request_status(&service, &build_request_id, BuildRequestStatus::BuildRequestFailed).await; update_build_request_status(&service, &build_request_id, BuildRequestStatus::BuildRequestFailed).await;
// Log failure event // Log failure event
let event = create_build_event( if let Err(log_err) = orchestrator.complete_build(BuildResult::Failed { jobs_completed: 0, jobs_failed: 1 }).await {
build_request_id.clone(), error!("Failed to log failure event: {}", log_err);
crate::build_event::EventType::BuildRequestEvent(BuildRequestEvent {
status: BuildRequestStatus::BuildRequestFailed as i32,
requested_partitions: partitions.iter()
.map(|p| PartitionRef { str: p.clone() })
.collect(),
message: format!("Build request failed: {}", e),
}),
);
if let Err(e) = service.event_log.append_event(event).await {
error!("Failed to log failure event: {}", e);
} }
Err(e) Err(e)
@ -693,7 +703,7 @@ pub async fn list_partitions(
let status_filter = params.get("status") let status_filter = params.get("status")
.and_then(|s| match s.as_str() { .and_then(|s| match s.as_str() {
"requested" => Some(PartitionStatus::PartitionRequested), "requested" => Some(PartitionStatus::PartitionRequested),
"scheduled" => Some(PartitionStatus::PartitionScheduled), "analyzed" => Some(PartitionStatus::PartitionAnalyzed),
"building" => Some(PartitionStatus::PartitionBuilding), "building" => Some(PartitionStatus::PartitionBuilding),
"available" => Some(PartitionStatus::PartitionAvailable), "available" => Some(PartitionStatus::PartitionAvailable),
"failed" => Some(PartitionStatus::PartitionFailed), "failed" => Some(PartitionStatus::PartitionFailed),
@ -777,3 +787,263 @@ pub async fn get_activity_summary(
} }
} }
} }
// Path parameters for the per-job metrics endpoint.
#[derive(Deserialize, JsonSchema)]
pub struct JobMetricsRequest {
    // Base64url-encoded job label taken from the URL path; decoded by the
    // handler before use (job labels contain '/' and ':').
    pub label: String,
}
/// GET /api/v1/jobs — aggregate per-job statistics (success rate, run counts,
/// last-run timestamp) from the job_events/build_events tables, optionally
/// filtered by a case-insensitive `?search=` substring on the job label.
pub async fn list_jobs(
    State(service): State<ServiceState>,
    Query(params): Query<HashMap<String, String>>,
) -> Result<Json<JobsListResponse>, (StatusCode, Json<ErrorResponse>)> {
    let search_term = params.get("search").map(|s| s.to_lowercase());

    // Debug: Let's see what's actually in the database
    // NOTE(review): this extra query plus the info-level logging below looks
    // like leftover debug scaffolding — consider removing or demoting to
    // debug! once the status-code mapping is settled.
    let debug_query = "
        SELECT
            je.job_label,
            je.status,
            COUNT(*) as count_for_this_status
        FROM job_events je
        JOIN build_events be ON je.event_id = be.event_id
        WHERE je.job_label != ''
        GROUP BY je.job_label, je.status
        ORDER BY je.job_label, je.status";

    // Log the debug results first
    if let Ok(debug_result) = service.event_log.execute_query(debug_query).await {
        for row in &debug_result.rows {
            if row.len() >= 3 {
                log::info!("Debug: job_label={}, status={}, count={}", row[0], row[1], row[2]);
            }
        }
    }

    // Original query but let's see all statuses
    // Statuses '3' and '6' are counted as completed, '4' as failed —
    // presumably JobStatus enum values; TODO confirm against the proto.
    let query = "
        SELECT
            je.job_label,
            COUNT(CASE WHEN je.status IN ('3', '6') THEN 1 END) as completed_count,
            COUNT(CASE WHEN je.status = '4' THEN 1 END) as failed_count,
            COUNT(*) as total_count,
            -- For now, skip duration calculation since we need start/end times
            NULL as avg_duration_ms,
            MAX(be.timestamp) as last_run,
            GROUP_CONCAT(DISTINCT je.status) as all_statuses
        FROM job_events je
        JOIN build_events be ON je.event_id = be.event_id
        WHERE je.job_label != ''
        GROUP BY je.job_label
        ORDER BY last_run DESC";

    match service.event_log.execute_query(query).await {
        Ok(result) => {
            let mut jobs = Vec::new();

            for row in result.rows {
                // Expect 7 columns per the SELECT above; skip malformed rows.
                if row.len() >= 7 {
                    let job_label = &row[0];

                    // Apply search filter if provided
                    if let Some(ref search) = search_term {
                        if !job_label.to_lowercase().contains(search) {
                            continue;
                        }
                    }

                    let completed_count: u32 = row[1].parse().unwrap_or(0);
                    let failed_count: u32 = row[2].parse().unwrap_or(0);
                    let total_count: u32 = row[3].parse().unwrap_or(0);
                    // Always None today: the query selects NULL for duration.
                    let avg_duration_ms: Option<i64> = row[4].parse().ok();
                    let last_run: Option<i64> = row[5].parse().ok();
                    let all_statuses = &row[6];

                    // Log additional debug info
                    log::info!("Job: {}, completed: {}, failed: {}, total: {}, statuses: {}",
                        job_label, completed_count, failed_count, total_count, all_statuses);

                    let success_rate = if total_count > 0 {
                        completed_count as f64 / total_count as f64
                    } else {
                        0.0
                    };

                    jobs.push(JobSummary {
                        job_label: job_label.clone(),
                        success_rate,
                        avg_duration_ms,
                        recent_runs: total_count.min(50), // Limit to recent runs
                        last_run,
                    });
                }
            }

            // Count after search filtering, before any pagination.
            let total_count = jobs.len() as u32;

            Ok(Json(JobsListResponse {
                jobs,
                total_count,
            }))
        }
        Err(e) => {
            error!("Failed to list jobs: {}", e);
            Err((
                StatusCode::INTERNAL_SERVER_ERROR,
                Json(ErrorResponse {
                    error: format!("Failed to list jobs: {}", e),
                }),
            ))
        }
    }
}
/// GET /api/v1/jobs/:label — per-job metrics: overall success rate, up to 50
/// recent runs, and per-day stats for the last 30 days.
///
/// `label` is a base64url-encoded job label (labels contain '/' and ':',
/// which are awkward in URL paths); a decode failure yields 400.
pub async fn get_job_metrics(
    State(service): State<ServiceState>,
    Path(JobMetricsRequest { label }): Path<JobMetricsRequest>,
) -> Result<Json<JobMetricsResponse>, (StatusCode, Json<ErrorResponse>)> {
    // Decode the base64-encoded job label
    let decoded_label = match base64_url_decode(&label) {
        Ok(decoded) => decoded,
        Err(_) => {
            return Err((
                StatusCode::BAD_REQUEST,
                Json(ErrorResponse {
                    error: "Invalid job label encoding".to_string(),
                }),
            ));
        }
    };

    log::info!("get_job_metrics: encoded='{}', decoded='{}'", label, decoded_label);

    // The event-log query interface only accepts a raw SQL string, so the
    // label must be spliced in textually. Escape embedded single quotes by
    // SQL-standard doubling so a crafted label cannot break out of the
    // string literal (SQL injection via the URL path).
    let quoted_label = format!("'{}'", decoded_label.replace('\'', "''"));

    // Get overall job metrics
    // Statuses '3'/'6' are treated as completed — presumably JobStatus enum
    // values; TODO confirm against the proto definition.
    let metrics_query = "
        SELECT
            COUNT(CASE WHEN je.status IN ('3', '6') THEN 1 END) as completed_count,
            COUNT(*) as total_count,
            -- Skip duration calculation for now
            NULL as avg_duration_ms
        FROM job_events je
        JOIN build_events be ON je.event_id = be.event_id
        WHERE je.job_label = ?";

    let (success_rate, total_runs, avg_duration_ms) = match service.event_log.execute_query(&metrics_query.replace("?", &quoted_label)).await {
        Ok(result) if !result.rows.is_empty() => {
            let row = &result.rows[0];
            let completed_count: u32 = row[0].parse().unwrap_or(0);
            let total_count: u32 = row[1].parse().unwrap_or(0);
            // Always None today: the query selects NULL for duration.
            let avg_duration: Option<i64> = row[2].parse().ok();

            let success_rate = if total_count > 0 {
                completed_count as f64 / total_count as f64
            } else {
                0.0
            };

            (success_rate, total_count, avg_duration)
        }
        _ => (0.0, 0, None),
    };

    // Get recent runs
    let recent_runs_query = "
        SELECT DISTINCT
            be.build_request_id,
            je.target_partitions,
            je.status,
            be.timestamp,
            (julianday('now') - julianday(be.timestamp/1000000000, 'unixepoch')) * 24 * 60 * 60 * 1000 as duration_ms
        FROM job_events je
        JOIN build_events be ON je.event_id = be.event_id
        WHERE je.job_label = ?
        ORDER BY be.timestamp DESC
        LIMIT 50";

    let recent_runs = match service.event_log.execute_query(&recent_runs_query.replace("?", &quoted_label)).await {
        Ok(result) => {
            result.rows.into_iter().map(|row| {
                let build_request_id = row[0].clone();
                // JSON array of PartitionRef objects, each with a "str" field.
                let partitions_json: String = row[1].clone();
                let status_code: String = row[2].clone();
                let started_at: i64 = row[3].parse().unwrap_or(0);
                // duration_ms is a fractional SQL expression; parsing as i64
                // typically fails and yields None — TODO compute real durations.
                let duration_ms: Option<i64> = row[4].parse().ok();

                let partitions: Vec<String> = serde_json::from_str::<Vec<serde_json::Value>>(&partitions_json)
                    .unwrap_or_default()
                    .into_iter()
                    .filter_map(|v| {
                        v.get("str").and_then(|s| s.as_str()).map(|s| s.to_string())
                    })
                    .collect();

                // NOTE(review): status '6' is counted as completed by the
                // aggregate queries but maps to "unknown" here — confirm.
                let status = match status_code.as_str() {
                    "1" => "scheduled",
                    "2" => "running",
                    "3" => "completed",
                    "4" => "failed",
                    "5" => "cancelled",
                    _ => "unknown",
                };

                JobRunSummary {
                    build_request_id,
                    partitions,
                    status: status.to_string(),
                    duration_ms,
                    started_at,
                }
            }).collect()
        }
        Err(_) => Vec::new(),
    };

    // Get daily stats (simplified - just recent days)
    let daily_stats_query = "
        SELECT
            date(be.timestamp/1000000000, 'unixepoch') as date,
            COUNT(CASE WHEN je.status IN ('3', '6') THEN 1 END) as completed_count,
            COUNT(*) as total_count,
            -- Skip duration calculation for now
            NULL as avg_duration_ms
        FROM job_events je
        JOIN build_events be ON je.event_id = be.event_id
        WHERE je.job_label = ?
        AND be.timestamp > (strftime('%s', 'now', '-30 days') * 1000000000)
        GROUP BY date(be.timestamp/1000000000, 'unixepoch')
        ORDER BY date DESC";

    let daily_stats = match service.event_log.execute_query(&daily_stats_query.replace("?", &quoted_label)).await {
        Ok(result) => {
            result.rows.into_iter().map(|row| {
                let date = row[0].clone();
                let completed_count: u32 = row[1].parse().unwrap_or(0);
                let total_count: u32 = row[2].parse().unwrap_or(0);
                let avg_duration: Option<i64> = row[3].parse().ok();

                let success_rate = if total_count > 0 {
                    completed_count as f64 / total_count as f64
                } else {
                    0.0
                };

                JobDailyStats {
                    date,
                    success_rate,
                    avg_duration_ms: avg_duration,
                    total_runs: total_count,
                }
            }).collect()
        }
        Err(_) => Vec::new(),
    };

    Ok(Json(JobMetricsResponse {
        job_label: decoded_label,
        success_rate,
        avg_duration_ms,
        total_runs,
        recent_runs,
        daily_stats,
    }))
}

View file

@ -141,6 +141,49 @@ pub struct ActivityResponse {
pub graph_name: String, pub graph_name: String,
} }
// Job-related request/response types

// Response body for GET /api/v1/jobs.
#[derive(Debug, Serialize, Deserialize, JsonSchema)]
pub struct JobsListResponse {
    // One summary per distinct job label, after any search filtering.
    pub jobs: Vec<JobSummary>,
    // Number of entries in `jobs` (post-filter count, not a grand total).
    pub total_count: u32,
}
// Aggregate statistics for a single job label.
#[derive(Debug, Serialize, Deserialize, JsonSchema)]
pub struct JobSummary {
    pub job_label: String,
    // Completed runs divided by total runs; 0.0 when there are no runs.
    pub success_rate: f64,
    // Currently always None — duration tracking is not implemented yet.
    pub avg_duration_ms: Option<i64>,
    // Total run count, capped at 50 by the handler.
    pub recent_runs: u32,
    // Timestamp of the most recent run, if any (event-log timestamp units).
    pub last_run: Option<i64>,
}
// Response body for GET /api/v1/jobs/:label.
#[derive(Debug, Serialize, Deserialize, JsonSchema)]
pub struct JobMetricsResponse {
    // Decoded (human-readable) job label.
    pub job_label: String,
    // Completed runs divided by total runs; 0.0 when there are no runs.
    pub success_rate: f64,
    // Currently always None — duration tracking is not implemented yet.
    pub avg_duration_ms: Option<i64>,
    pub total_runs: u32,
    // Up to 50 most recent runs, newest first.
    pub recent_runs: Vec<JobRunSummary>,
    // Per-day aggregates over the last 30 days, newest first.
    pub daily_stats: Vec<JobDailyStats>,
}
// A single run of a job within some build request.
#[derive(Debug, Serialize, Deserialize, JsonSchema)]
pub struct JobRunSummary {
    pub build_request_id: String,
    // Partition refs (string form) this run targeted.
    pub partitions: Vec<String>,
    // One of: "scheduled", "running", "completed", "failed", "cancelled",
    // "unknown" — mapped from the numeric event status by the handler.
    pub status: String,
    pub duration_ms: Option<i64>,
    // Event timestamp of the run's record (event-log timestamp units).
    pub started_at: i64,
}
// Per-calendar-day aggregate statistics for one job.
#[derive(Debug, Serialize, Deserialize, JsonSchema)]
pub struct JobDailyStats {
    // Day in SQLite date() format (YYYY-MM-DD).
    pub date: String,
    // Completed runs divided by total runs for that day; 0.0 when no runs.
    pub success_rate: f64,
    // Currently always None — duration tracking is not implemented yet.
    pub avg_duration_ms: Option<i64>,
    pub total_runs: u32,
}
impl BuildGraphService { impl BuildGraphService {
pub async fn new( pub async fn new(
event_log_uri: &str, event_log_uri: &str,
@ -172,6 +215,8 @@ impl BuildGraphService {
.api_route("/api/v1/partitions", get(handlers::list_partitions)) .api_route("/api/v1/partitions", get(handlers::list_partitions))
.api_route("/api/v1/partitions/:ref/status", get(handlers::get_partition_status)) .api_route("/api/v1/partitions/:ref/status", get(handlers::get_partition_status))
.api_route("/api/v1/partitions/:ref/events", get(handlers::get_partition_events)) .api_route("/api/v1/partitions/:ref/events", get(handlers::get_partition_events))
.api_route("/api/v1/jobs", get(handlers::list_jobs))
.api_route("/api/v1/jobs/:label", get(handlers::get_job_metrics))
.api_route("/api/v1/activity", get(handlers::get_activity_summary)) .api_route("/api/v1/activity", get(handlers::get_activity_summary))
.api_route("/api/v1/analyze", post(handlers::analyze_build_graph)) .api_route("/api/v1/analyze", post(handlers::analyze_build_graph))
.finish_api(&mut api); .finish_api(&mut api);
@ -190,6 +235,8 @@ impl BuildGraphService {
.api_route("/api/v1/partitions", get(handlers::list_partitions)) .api_route("/api/v1/partitions", get(handlers::list_partitions))
.api_route("/api/v1/partitions/:ref/status", get(handlers::get_partition_status)) .api_route("/api/v1/partitions/:ref/status", get(handlers::get_partition_status))
.api_route("/api/v1/partitions/:ref/events", get(handlers::get_partition_events)) .api_route("/api/v1/partitions/:ref/events", get(handlers::get_partition_events))
.api_route("/api/v1/jobs", get(handlers::list_jobs))
.api_route("/api/v1/jobs/:label", get(handlers::get_job_metrics))
.api_route("/api/v1/activity", get(handlers::get_activity_summary)) .api_route("/api/v1/activity", get(handlers::get_activity_summary))
.api_route("/api/v1/analyze", post(handlers::analyze_build_graph)) .api_route("/api/v1/analyze", post(handlers::analyze_build_graph))
.route("/api/v1/openapi.json", get(Self::openapi_spec)) .route("/api/v1/openapi.json", get(Self::openapi_spec))
@ -299,6 +346,7 @@ impl BuildGraphService {
BuildRequestStatus::BuildRequestUnknown => "unknown".to_string(), BuildRequestStatus::BuildRequestUnknown => "unknown".to_string(),
BuildRequestStatus::BuildRequestReceived => "received".to_string(), BuildRequestStatus::BuildRequestReceived => "received".to_string(),
BuildRequestStatus::BuildRequestPlanning => "planning".to_string(), BuildRequestStatus::BuildRequestPlanning => "planning".to_string(),
BuildRequestStatus::BuildRequestAnalysisCompleted => "analysis_completed".to_string(),
BuildRequestStatus::BuildRequestExecuting => "executing".to_string(), BuildRequestStatus::BuildRequestExecuting => "executing".to_string(),
BuildRequestStatus::BuildRequestCompleted => "completed".to_string(), BuildRequestStatus::BuildRequestCompleted => "completed".to_string(),
BuildRequestStatus::BuildRequestFailed => "failed".to_string(), BuildRequestStatus::BuildRequestFailed => "failed".to_string(),
@ -310,7 +358,7 @@ impl BuildGraphService {
match status { match status {
PartitionStatus::PartitionUnknown => "unknown".to_string(), PartitionStatus::PartitionUnknown => "unknown".to_string(),
PartitionStatus::PartitionRequested => "requested".to_string(), PartitionStatus::PartitionRequested => "requested".to_string(),
PartitionStatus::PartitionScheduled => "scheduled".to_string(), PartitionStatus::PartitionAnalyzed => "analyzed".to_string(),
PartitionStatus::PartitionBuilding => "building".to_string(), PartitionStatus::PartitionBuilding => "building".to_string(),
PartitionStatus::PartitionAvailable => "available".to_string(), PartitionStatus::PartitionAvailable => "available".to_string(),
PartitionStatus::PartitionFailed => "failed".to_string(), PartitionStatus::PartitionFailed => "failed".to_string(),

View file

@ -171,6 +171,20 @@ Success Rate = (completed_count) / (total_count) where completed includes both e
## Implementation Architecture ## Implementation Architecture
### Clean Separation of Concerns
**Analysis Phase** (`databuild/graph/analyze.rs`):
- **Purpose**: Pure transformation of partition requests → job graph
- **Responsibility**: Determine what work would be needed (logical plan)
- **No delegation logic**: Creates jobs for all requested partitions
- **Output**: Complete job graph representing the logical work
**Execution Phase** (`databuild/graph/execute.rs`):
- **Purpose**: Execute the job graph efficiently with delegation optimization
- **Responsibility**: Coordinate with concurrent builds and optimize execution
- **All delegation logic**: Handles both active and historical delegation
- **Event logging**: Emits all job lifecycle events including `JOB_SKIPPED`
### Core Components ### Core Components
1. **Event Log Trait** (`databuild/event_log/mod.rs`): 1. **Event Log Trait** (`databuild/event_log/mod.rs`):
@ -178,20 +192,25 @@ Success Rate = (completed_count) / (total_count) where completed includes both e
- `get_build_request_for_available_partition()`: Find historical source - `get_build_request_for_available_partition()`: Find historical source
- `get_active_builds_for_partition()`: Find concurrent builds - `get_active_builds_for_partition()`: Find concurrent builds
2. **Coordination Logic** (`databuild/graph/execute.rs`): 2. **Execution Coordination Logic** (`databuild/graph/execute.rs`):
- `check_build_coordination()`: Implements delegation decision rules - `check_build_coordination()`: Implements all delegation decision rules
- Multi-partition job evaluation logic - Multi-partition job evaluation logic
- Event logging for delegation and job skipping - Event logging for delegation and job skipping
- Handles both active delegation (to running builds) and historical delegation (to completed builds)
3. **Dashboard Integration** (`databuild/service/handlers.rs`): 3. **Dashboard Integration** (`databuild/service/handlers.rs`):
- Success rate calculations including `JOB_SKIPPED` - Success rate calculations including `JOB_SKIPPED`
- Job metrics queries treating delegation as success - Job metrics queries treating delegation as success
- Proper handling of skipped jobs in analytics - Proper handling of skipped jobs in analytics
### Delegation Decision Algorithm ### Delegation Decision Algorithm (Execution Phase)
```rust ```rust
for each job in execution_plan: // Analysis phase creates complete job graph for all requested partitions
job_graph = analyze_partitions(requested_partitions)
// Execution phase optimizes by delegating when possible
for each job in job_graph:
available_partitions = [] available_partitions = []
needs_building = false needs_building = false
@ -200,14 +219,16 @@ for each job in execution_plan:
source_build = get_build_request_for_available_partition(partition) source_build = get_build_request_for_available_partition(partition)
available_partitions.push((partition, source_build)) available_partitions.push((partition, source_build))
elif partition has active_builds: elif partition has active_builds:
delegate_entire_job_to_active_build() // Active delegation - delegate entire job to running build
return log_delegation_events_to_active_build()
mark_job_as_succeeded()
continue_to_next_job()
else: else:
needs_building = true needs_building = true
if !needs_building && available_partitions.len() == job.outputs.len(): if !needs_building && available_partitions.len() == job.outputs.len():
// Historical delegation - all partitions available // Historical delegation - all partitions available
log_delegation_events(available_partitions) log_delegation_events(available_partitions) // Point to source builds
log_job_skipped_event() log_job_skipped_event()
mark_job_as_succeeded() mark_job_as_succeeded()
elif needs_building: elif needs_building:
@ -217,12 +238,14 @@ for each job in execution_plan:
## Benefits ## Benefits
1. **Efficiency**: Eliminates duplicate computation 1. **Clean Architecture**: Clear separation between logical planning (analysis) and execution optimization
2. **Consistency**: Single source of truth for each partition 2. **Efficiency**: Eliminates duplicate computation through execution-time delegation
3. **Traceability**: Complete audit trail via delegation events 3. **Consistency**: Single source of truth for each partition
4. **Accuracy**: Proper success rate calculation including delegated work 4. **Traceability**: Complete audit trail via delegation events with full build request traceability
5. **Scalability**: Supports concurrent build requests without conflicts 5. **Accuracy**: Proper success rate calculation including delegated work
6. **Transparency**: Clear visibility into why work was or wasn't performed 6. **Scalability**: Supports concurrent build requests without conflicts
7. **Testability**: Analysis phase becomes pure function (requests → job graph)
8. **Transparency**: Clear visibility into why work was or wasn't performed
## Future Enhancements ## Future Enhancements

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1 @@
34

File diff suppressed because one or more lines are too long

1
generated_number/pippin Normal file
View file

@ -0,0 +1 @@
34

View file

@ -0,0 +1,336 @@
# CLI-Service Build Unification
## Problem Statement
The current DataBuild architecture has significant duplication and architectural inconsistencies between CLI and Service build orchestration:
### Current Duplication Issues
1. **Event Emission Logic**: Service HTTP handlers and CLI binaries contain duplicate orchestration event emission code
2. **Mode Detection**: Analysis and execution binaries (`analyze.rs` and `execute.rs`) use `DATABUILD_CLI_MODE` environment variable to conditionally emit different events
3. **Test Complexity**: End-to-end tests must account for different event patterns between CLI and Service for identical logical operations
### Specific Code References
- **CLI Mode Detection in Analysis**: `databuild/graph/analyze.rs:555-587` - Emits "Build request received" and "Starting build planning" events only in CLI mode
- **CLI Mode Detection in Execution**: `databuild/graph/execute.rs:413-428` and `execute.rs:753-779` - Emits execution start/completion events only in CLI mode
- **Service Orchestration**: `databuild/service/handlers.rs` - HTTP handlers emit orchestration events independently
### Architectural Problems
1. **Single Responsibility Violation**: Analysis and execution binaries serve dual purposes as both shared library functions and CLI entry points
2. **Consistency Risk**: Separate implementations of orchestration logic create risk of drift between CLI and Service behavior
3. **Maintenance Burden**: Changes to orchestration requirements must be implemented in multiple places
## Current Architecture Analysis
### Service Flow
```
HTTP Request → Service Handler → Orchestration Events → Analysis → Execution → Completion Events
```
The Service has a natural coordination point in the HTTP handler that manages the entire build lifecycle and emits appropriate orchestration events.
### CLI Flow
```
Shell Script → Analysis Binary (CLI mode) → Execution Binary (CLI mode) → Orchestration Events
```
The CLI lacks a natural coordination point, forcing the shared analysis/execution binaries to detect CLI mode and emit orchestration events themselves.
### Event Flow Comparison
**Service Events** (coordinated):
1. Build request received
2. Starting build planning
3. Analysis events (partitions scheduled, jobs configured)
4. Starting build execution
5. Execution events (jobs scheduled/completed, partitions available)
6. Build request completed
**CLI Events** (mode-dependent):
- Same events as Service, but emitted conditionally based on `DATABUILD_CLI_MODE`
- Creates awkward coupling between orchestration concerns and domain logic
## Proposed Shared Library Design
### Core Orchestrator API
```rust
pub struct BuildOrchestrator {
event_log: Box<dyn BuildEventLog>,
build_request_id: String,
requested_partitions: Vec<PartitionRef>,
}
impl BuildOrchestrator {
pub fn new(
event_log: Box<dyn BuildEventLog>,
build_request_id: String,
requested_partitions: Vec<PartitionRef>
) -> Self;
// Lifecycle events
pub async fn start_build(&self) -> Result<(), Error>;
pub async fn start_planning(&self) -> Result<(), Error>;
pub async fn start_execution(&self) -> Result<(), Error>;
pub async fn complete_build(&self, result: BuildResult) -> Result<(), Error>;
// Domain events (pass-through to existing logic)
pub async fn emit_partition_scheduled(&self, partition: &PartitionRef) -> Result<(), Error>;
pub async fn emit_job_scheduled(&self, job: &JobEvent) -> Result<(), Error>;
pub async fn emit_job_completed(&self, job: &JobEvent) -> Result<(), Error>;
pub async fn emit_partition_available(&self, partition: &PartitionEvent) -> Result<(), Error>;
pub async fn emit_delegation(&self, partition: &str, target_build: &str, message: &str) -> Result<(), Error>;
}
pub enum BuildResult {
Success { jobs_completed: usize },
Failed { jobs_completed: usize, jobs_failed: usize },
FailFast { trigger_job: String },
}
```
### Event Emission Strategy
The orchestrator will emit standardized events at specific lifecycle points:
1. **Build Lifecycle Events**: High-level orchestration (received, planning, executing, completed)
2. **Domain Events**: Pass-through wrapper for existing analysis/execution events
3. **Consistent Timing**: All events emitted through orchestrator ensure proper sequencing
### Error Handling
```rust
#[derive(Debug, thiserror::Error)]
pub enum OrchestrationError {
#[error("Event log error: {0}")]
EventLog(#[from] databuild::event_log::Error),
#[error("Build coordination error: {0}")]
Coordination(String),
#[error("Invalid build state transition: {current} -> {requested}")]
InvalidStateTransition { current: String, requested: String },
}
```
### Testing Interface
```rust
#[cfg(test)]
impl BuildOrchestrator {
pub fn with_mock_event_log(build_request_id: String) -> (Self, MockEventLog);
pub fn emitted_events(&self) -> &[BuildEvent];
}
```
## Implementation Phases
### Phase 1: Create Shared Orchestration Library
**Files to Create**:
- `databuild/orchestration/mod.rs` - Core orchestrator implementation
- `databuild/orchestration/events.rs` - Event type definitions and helpers
- `databuild/orchestration/error.rs` - Error types
- `databuild/orchestration/tests.rs` - Unit tests for orchestrator
**Key Implementation Points**:
- Extract common event emission patterns from Service and CLI
- Ensure orchestrator is async-compatible with existing event log interface
- Design for testability with dependency injection
### Phase 2: Refactor Service to Use Orchestrator
**Files to Modify**:
- `databuild/service/handlers.rs` - Replace direct event emission with orchestrator calls
- `databuild/service/mod.rs` - Integration with orchestrator lifecycle
**Implementation**:
- Replace existing event emission code directly with orchestrator calls
- Ensure proper error handling and async integration
### Phase 3: Create New CLI Wrapper
**Files to Create**:
- `databuild/cli/main.rs` - New CLI entry point using orchestrator
- `databuild/cli/error.rs` - CLI-specific error handling
**Implementation**:
```rust
// databuild/cli/main.rs
#[tokio::main]
async fn main() -> Result<(), CliError> {
let args = parse_cli_args();
let event_log = create_build_event_log(&args.event_log_uri).await?;
let build_request_id = args.build_request_id.unwrap_or_else(|| Uuid::new_v4().to_string());
let orchestrator = BuildOrchestrator::new(event_log, build_request_id, args.partitions.clone());
// Emit orchestration events
orchestrator.start_build().await?;
orchestrator.start_planning().await?;
// Run analysis
let graph = run_analysis(&args.partitions, &orchestrator).await?;
orchestrator.start_execution().await?;
// Run execution
let result = run_execution(graph, &orchestrator).await?;
orchestrator.complete_build(result).await?;
Ok(())
}
```
### Phase 4: Remove CLI Mode Detection
**Files to Modify**:
- `databuild/graph/analyze.rs` - Remove lines 555-587 (CLI mode orchestration events)
- `databuild/graph/execute.rs` - Remove lines 413-428 and 753-779 (CLI mode orchestration events)
**Verification**:
- Analysis and execution binaries become pure domain functions
- No more environment variable mode detection
- All orchestration handled by wrapper/service
### Phase 5: Update Bazel Rules
**Files to Modify**:
- `databuild/rules.bzl` - Update `_databuild_graph_build_impl` to use new CLI wrapper instead of direct analysis/execution pipeline
**Before**:
```bash
$(rlocation _main/{analyze_path}) $@ | $(rlocation _main/{exec_path})
```
**After**:
```bash
$(rlocation _main/{cli_wrapper_path}) $@
```
### Phase 6: Update Tests
**Files to Modify**:
- `tests/end_to_end/simple_test.sh` - Remove separate CLI/Service event validation
- `tests/end_to_end/podcast_simple_test.sh` - Same simplification
- All tests expect identical event patterns from CLI and Service
## Migration Strategy
### Direct Replacement Approach
Since we don't need backwards compatibility, we can implement a direct replacement:
- Replace existing CLI mode detection immediately
- Refactor Service handlers to use orchestrator directly
- Update Bazel rules to use new CLI wrapper
- Update tests to expect unified behavior
### Testing Strategy
1. **Unit Tests**: Comprehensive orchestrator testing with mock event logs
2. **Integration Tests**: Existing end-to-end tests pass with unified implementation
3. **Event Verification**: Ensure orchestrator produces expected events for all scenarios
## File Changes Required
### New Files
- `databuild/orchestration/mod.rs` - 200+ lines, core orchestrator
- `databuild/orchestration/events.rs` - 100+ lines, event helpers
- `databuild/orchestration/error.rs` - 50+ lines, error types
- `databuild/orchestration/tests.rs` - 300+ lines, comprehensive tests
- `databuild/cli/main.rs` - 150+ lines, CLI wrapper
- `databuild/cli/error.rs` - 50+ lines, CLI error handling
### Modified Files
- `databuild/service/handlers.rs` - Replace ~50 lines of event emission with orchestrator calls
- `databuild/graph/analyze.rs` - Remove ~30 lines of CLI mode detection
- `databuild/graph/execute.rs` - Remove ~60 lines of CLI mode detection
- `databuild/rules.bzl` - Update ~10 lines for new CLI wrapper
- `tests/end_to_end/simple_test.sh` - Simplify ~20 lines of event validation
- `tests/end_to_end/podcast_simple_test.sh` - Same simplification
### Build Configuration
- Update `databuild/BUILD.bazel` to include orchestration module
- Update `databuild/cli/BUILD.bazel` for new CLI binary
- Modify example graphs to use new CLI wrapper
## Benefits & Risk Analysis
### Benefits
1. **Maintainability**: Single source of truth for orchestration logic eliminates duplication
2. **Consistency**: Guaranteed identical events across CLI and Service interfaces
3. **Extensibility**: Foundation for SDK, additional CLI commands, monitoring integration
4. **Testing**: Simplified test expectations, better unit test coverage of orchestration
5. **Architecture**: Clean separation between orchestration and domain logic
### Implementation Risks
1. **Regression**: Changes to critical path could introduce subtle bugs
2. **Performance**: Additional abstraction layer could impact latency
3. **Integration**: Bazel build changes could break example workflows
### Risk Mitigation
1. **Phased Implementation**: Implement in stages with verification at each step
2. **Comprehensive Testing**: Thorough unit and integration testing
3. **Event Verification**: Ensure identical event patterns to current behavior
## Future Architecture Extensions
### SDK Integration
The unified orchestrator provides a natural integration point for external SDKs:
```rust
// Future SDK usage
let databuild_client = DatabuildClient::new(endpoint);
let orchestrator = databuild_client.create_orchestrator(partitions).await?;
orchestrator.start_build().await?;
let result = databuild_client.execute_build(orchestrator).await?;
```
### Additional CLI Commands
Orchestrator enables consistent event emission across CLI commands:
```bash
databuild validate --partitions "data/users" --dry-run
databuild status --build-id "abc123"
databuild retry --build-id "abc123" --failed-jobs-only
```
### Monitoring Integration
Standardized events provide foundation for observability:
```rust
impl BuildOrchestrator {
pub fn with_tracing_span(&self, span: tracing::Span) -> Self;
pub fn emit_otel_metrics(&self) -> Result<(), Error>;
}
```
### CI/CD Pipeline Integration
Orchestrator events enable standardized build reporting across environments:
```yaml
# GitHub Actions integration
- name: DataBuild
uses: databuild/github-action@v1
with:
partitions: "data/daily_reports"
event-log: "${{ env.DATABUILD_EVENT_LOG }}"
# Automatic event collection for build status reporting
```
## Conclusion
This unification addresses fundamental architectural inconsistencies while providing a foundation for future extensibility. Because backwards compatibility is not required, the phased implementation approach minimizes regression risk while allowing the legacy CLI mode detection to be replaced directly.
The shared orchestrator eliminates the current awkward CLI mode detection pattern and establishes DataBuild as a platform that can support multiple interfaces with guaranteed consistency.

View file

@ -0,0 +1,148 @@
# Integration Test Plan for DataBuild Delegation System
## Overview
Create comprehensive integration tests for the basic_graph example that trigger delegation scenarios and verify Build Event Log (BEL) entries to ensure the delegation system works correctly and provides proper traceability.
## Current Test Infrastructure Analysis
**Existing Pattern**: The current test suite in `/tests/end_to_end/` follows a mature pattern:
- **Common utilities**: `lib/test_utils.sh`, `lib/db_utils.sh`, `lib/service_utils.sh`
- **Test isolation**: Separate SQLite databases per test to prevent interference
- **CLI vs Service validation**: Tests ensure both paths produce identical events
- **Event analysis**: Detailed breakdown of job/partition/request event counts
- **Robust service management**: Start/stop with proper cleanup and health checks
**Target System**: basic_graph example with two jobs:
- `generate_number_job`: Produces partitions like `generated_number/pippin`
- `sum_job`: Depends on multiple generated numbers, produces `sum/pippin_salem_sadie`
## New Test Implementation Plan
### 1. Create Delegation-Specific Test: `basic_graph_delegation_test.sh`
**Test Scenarios**:
- **Historical Delegation**: Run same partition twice, verify second run delegates to first
- **Multi-partition Jobs**: Test delegation behavior when jobs produce multiple partitions
- **Mixed Availability**: Test jobs where some target partitions exist, others don't
- **BEL Verification**: Validate specific delegation events and job status transitions
**Core Test Cases**:
1. **Single Partition Historical Delegation**
- Build `generated_number/pippin` (first run - normal execution)
- Build `generated_number/pippin` again (second run - should delegate)
- Verify BEL contains: `DelegationEvent` + `JOB_SKIPPED` for second run
2. **Multi-Partition Delegation Scenarios**
- Build `generated_number/pippin`, `generated_number/salem`, `generated_number/sadie`
- Build `sum/pippin_salem_sadie` (should delegate to existing partitions)
- Verify delegation events point to correct source build requests
3. **Partial Delegation Scenario**
- Build `generated_number/pippin`, `generated_number/salem`
- Request `generated_number/pippin`, `generated_number/salem`, `generated_number/sadie`
- Verify: delegations for pippin/salem, normal execution for sadie
4. **Cross-Run Delegation Chain**
- Run 1: Build `generated_number/pippin`
- Run 2: Build `generated_number/salem`
- Run 3: Build `sum/pippin_salem_sadie` (requires sadie, should delegate pippin/salem)
- Verify delegation traceability to correct source builds
### 2. BEL Validation Utilities
**New functions in `lib/db_utils.sh`**:
- `get_delegation_events()`: Extract delegation events for specific partition
- `verify_job_skipped()`: Check job was properly skipped with delegation
- `get_delegation_source_build()`: Validate delegation points to correct build request
- `compare_delegation_behavior()`: Compare CLI vs Service delegation consistency
**Event Validation Logic**:
```bash
# For historical delegation, verify event sequence:
# 1. DelegationEvent(partition_ref, delegated_to_build_request_id, message)
# 2. JobEvent(status=JOB_SKIPPED, message="Job skipped - all target partitions already available")
# 3. No JobEvent(JOB_SCHEDULED/RUNNING/COMPLETED) for delegated job
# For successful delegation:
# - Success rate should be 100% (JOB_SKIPPED counts as success)
# - Partition should show as available without re-execution
# - Build request should complete successfully
```
### 3. Performance and Reliability Validation
**Delegation Efficiency Tests**:
- Time comparison: first run vs delegated run (should be significantly faster)
- Resource usage: ensure delegated runs don't spawn job processes
- Concurrency: multiple builds requesting same partition simultaneously
**Error Scenarios**:
- Source build request failure handling
- Corrupted delegation data
- Stale partition detection
### 4. Integration with Existing Test Suite
**File Structure**:
```
tests/end_to_end/
├── basic_graph_delegation_test.sh # New delegation-specific tests
├── basic_graph_test.sh # Existing functionality tests (enhanced)
├── lib/
│ ├── delegation_utils.sh # New delegation validation utilities
│ ├── db_utils.sh # Enhanced with delegation functions
│ └── test_utils.sh # Existing utilities
└── BUILD # Updated to include new test
```
**Bazel Integration**:
- Add `basic_graph_delegation_test` as new `sh_test` target
- Include in `run_e2e_tests.sh` execution
- Tag with `["delegation", "e2e"]` for selective running
### 5. CLI vs Service Delegation Consistency
**Validation Approach**:
- Run identical delegation scenarios through both CLI and Service
- Compare BEL entries for identical delegation behavior
- Ensure both paths produce same success rates and event counts
- Validate API responses include delegation information
### 6. Documentation and Debugging Support
**Test Output Enhancement**:
- Clear delegation event logging during test execution
- Detailed failure diagnostics showing expected vs actual delegation behavior
- BEL dump utilities for debugging delegation issues
- Performance metrics (execution time, event counts)
## Expected Outcomes
**Success Criteria**:
1. **100% Success Rate**: Delegated builds show 100% success rate in dashboard
2. **Event Consistency**: CLI and Service produce identical delegation events
3. **Traceability**: All delegations link to correct source build requests
4. **Performance**: Delegated runs complete in <5 seconds vs 30+ seconds for full execution
5. **Multi-partition Correctness**: Complex jobs with mixed partition availability handled properly
**Regression Prevention**:
- Automated validation prevents delegation system regressions
- Comprehensive BEL verification ensures audit trail integrity
- Performance benchmarks detect delegation efficiency degradation
## Implementation Priority
1. **High**: Core delegation test cases (historical, multi-partition)
2. **High**: BEL validation utilities and event verification
3. **Medium**: Performance benchmarking and efficiency validation
4. **Medium**: Error scenario testing and edge cases
5. **Low**: Advanced concurrency and stress testing
This plan provides a comprehensive testing strategy that validates both the functional correctness and performance benefits of the delegation system while ensuring long-term reliability and debuggability.
## Implementation Notes
This plan was created following the user's request to improve system reliability and testability for the DataBuild delegation system. The focus is on the basic_graph example because it provides a simpler, more predictable test environment compared to the podcast_reviews example, while still covering all the essential delegation scenarios.
The delegation system currently exhibits some issues (a 67% success rate instead of the expected 100%). These tests should help identify the root cause and, once it is fixed, prevent regressions. The comprehensive BEL validation will ensure that the delegation events provide proper audit trails and traceability as intended by the system design.

View file

@ -98,6 +98,15 @@ main() {
"$SCRIPT_DIR/examples/basic_graph/bazel-bin/basic_graph.build" \ "$SCRIPT_DIR/examples/basic_graph/bazel-bin/basic_graph.build" \
"$SCRIPT_DIR/examples/basic_graph/bazel-bin/basic_graph.service" "$SCRIPT_DIR/examples/basic_graph/bazel-bin/basic_graph.service"
# Run delegation test for basic graph
log_info "Running test: Basic Graph Delegation Test"
if ! (cd "$SCRIPT_DIR/examples/basic_graph" && \
"$SCRIPT_DIR/tests/end_to_end/delegation_test.sh" \
"bazel-bin/basic_graph.build"); then
test_fail "Test failed: Basic Graph Delegation Test"
fi
test_pass "Test passed: Basic Graph Delegation Test"
# Test 2: Podcast Reviews # Test 2: Podcast Reviews
log_info "=== Podcast Reviews End-to-End Tests ===" log_info "=== Podcast Reviews End-to-End Tests ==="
@ -117,6 +126,15 @@ main() {
fi fi
test_pass "Test passed: Podcast Reviews Simple Test" test_pass "Test passed: Podcast Reviews Simple Test"
# Run delegation test for podcast reviews
log_info "Running test: Podcast Reviews Delegation Test"
if ! (cd "$SCRIPT_DIR/examples/podcast_reviews" && \
"$SCRIPT_DIR/tests/end_to_end/delegation_test.sh" \
"bazel-bin/podcast_reviews_graph.build"); then
test_fail "Test failed: Podcast Reviews Delegation Test"
fi
test_pass "Test passed: Podcast Reviews Delegation Test"
# Test 3: Core DataBuild Tests (if any exist) # Test 3: Core DataBuild Tests (if any exist)
log_info "=== Core DataBuild Tests ===" log_info "=== Core DataBuild Tests ==="
@ -130,7 +148,9 @@ main() {
# Summary # Summary
log_info "=== Test Summary ===" log_info "=== Test Summary ==="
test_pass "Basic Graph CLI and Service builds work correctly" test_pass "Basic Graph CLI and Service builds work correctly"
test_pass "Basic Graph delegation prevents duplicate builds"
test_pass "Podcast Reviews CLI build works correctly" test_pass "Podcast Reviews CLI build works correctly"
test_pass "Podcast Reviews delegation prevents duplicate builds"
test_pass "Build event logging functions properly" test_pass "Build event logging functions properly"
test_pass "Service APIs respond correctly" test_pass "Service APIs respond correctly"
@ -142,6 +162,8 @@ main() {
log_info " ✅ Both CLI and Service approaches work consistently" log_info " ✅ Both CLI and Service approaches work consistently"
log_info " ✅ Complex pipeline jobs (podcast reviews) execute successfully" log_info " ✅ Complex pipeline jobs (podcast reviews) execute successfully"
log_info " ✅ Event logging to SQLite databases works" log_info " ✅ Event logging to SQLite databases works"
log_info " ✅ Build delegation prevents duplicate execution of completed partitions"
log_info " ✅ Second builds delegate to existing partitions and run much faster"
} }
# Handle cleanup on exit # Handle cleanup on exit

View file

@ -9,6 +9,23 @@ filegroup(
visibility = ["//visibility:public"], visibility = ["//visibility:public"],
) )
# Delegation test that verifies second builds properly delegate to existing partitions
sh_test(
name = "delegation_test",
srcs = ["delegation_test.sh"],
data = [
"//examples/basic_graph:basic_graph.build",
":test_utils",
],
size = "medium",
timeout = "moderate",
env = {
"PATH": "/usr/bin:/bin:/usr/local/bin",
},
tags = ["e2e", "delegation"],
args = ["$(location //examples/basic_graph:basic_graph.build)"],
)
# Simple shell script test that validates the test runner # Simple shell script test that validates the test runner
sh_test( sh_test(
name = "e2e_runner_test", name = "e2e_runner_test",
@ -22,6 +39,7 @@ sh_test(
"simple_test.sh", "simple_test.sh",
"basic_graph_test.sh", "basic_graph_test.sh",
"podcast_reviews_test.sh", "podcast_reviews_test.sh",
"delegation_test.sh",
], ],
size = "small", size = "small",
timeout = "short", timeout = "short",

View file

@ -0,0 +1,135 @@
#!/bin/bash
# End-to-end test for build delegation functionality.
#
# Runs the same build twice against a fresh SQLite event log and verifies the
# second run delegates to the partitions produced by the first run instead of
# rebuilding them:
#   - the second build skips jobs (job_events status '6')
#   - no partition reaches "available" (partition_events status '4') twice
#   - the second build is measurably faster than the first
#
# Usage: delegation_test.sh <cli_build_binary> [partition_ref]
# Exits non-zero on any failure. Logs under /tmp/delegation_*.log are kept on
# failure for debugging; cleanup only runs on the success path.
set -euo pipefail

CLI_BUILD="${1:-}"
PARTITION_REF="${2:-}"

if [[ -z "$CLI_BUILD" ]]; then
    echo "Usage: $0 <cli_build_binary> [partition_ref]"
    exit 1
fi

# The event-log assertions below all go through the sqlite3 CLI; without it
# every count silently falls back to "0", so fail fast with a clear message.
if ! command -v sqlite3 >/dev/null 2>&1; then
    echo "[ERROR] sqlite3 is required on PATH but was not found"
    exit 1
fi

# Auto-detect the target partition from the binary name if not provided.
if [[ -z "$PARTITION_REF" ]]; then
    if [[ "$CLI_BUILD" == *"basic_graph"* ]]; then
        PARTITION_REF="generated_number/pippin"
    elif [[ "$CLI_BUILD" == *"podcast"* ]]; then
        PARTITION_REF="daily_summaries/category=comedy/date=2020-01-01"
    else
        echo "[ERROR] Could not auto-detect partition reference. Please provide one as second argument."
        exit 1
    fi
fi

TEST_DB="/tmp/delegation_test.db"

echo "[INFO] Testing delegation functionality..."

# Clean up any existing state so the FIRST build cannot itself delegate.
rm -f "$TEST_DB" /tmp/*generated_number* 2>/dev/null || true

export DATABUILD_BUILD_EVENT_LOG="sqlite://$TEST_DB"

echo "[INFO] Running first build (should execute all jobs)..."
START_TIME=$(date +%s%N)
if ! "$CLI_BUILD" "$PARTITION_REF" > /tmp/delegation_first_build.log 2>&1; then
    echo "[ERROR] First build failed"
    cat /tmp/delegation_first_build.log
    exit 1
fi
FIRST_BUILD_TIME=$(($(date +%s%N) - START_TIME))
echo "[INFO] First build completed in $((FIRST_BUILD_TIME / 1000000))ms"

# Verify the first build created the event log and available partitions.
if [[ ! -f "$TEST_DB" ]]; then
    echo "[ERROR] Database not created after first build"
    exit 1
fi

FIRST_BUILD_EVENTS=$(sqlite3 "$TEST_DB" "SELECT COUNT(*) FROM build_events;" 2>/dev/null || echo "0")
FIRST_JOB_EVENTS=$(sqlite3 "$TEST_DB" "SELECT COUNT(*) FROM job_events;" 2>/dev/null || echo "0")
# Status '4' == partition available (assumed from the event schema — the other
# assertions in this suite use the same value; confirm against proto defs).
PARTITION_AVAILABLE_COUNT=$(sqlite3 "$TEST_DB" "SELECT COUNT(*) FROM partition_events WHERE status = '4';" 2>/dev/null || echo "0")

echo "[INFO] First build: $FIRST_BUILD_EVENTS total events, $FIRST_JOB_EVENTS job events, $PARTITION_AVAILABLE_COUNT partitions available"

if [[ "$PARTITION_AVAILABLE_COUNT" -eq 0 ]]; then
    echo "[ERROR] No partitions marked as available after first build"
    sqlite3 "$TEST_DB" "SELECT partition_ref, status FROM partition_events ORDER BY rowid;"
    exit 1
fi

echo "[INFO] Running second build (should delegate to existing partitions)..."
START_TIME=$(date +%s%N)
if ! "$CLI_BUILD" "$PARTITION_REF" > /tmp/delegation_second_build.log 2>&1; then
    echo "[ERROR] Second build failed"
    cat /tmp/delegation_second_build.log
    exit 1
fi
SECOND_BUILD_TIME=$(($(date +%s%N) - START_TIME))
echo "[INFO] Second build completed in $((SECOND_BUILD_TIME / 1000000))ms"

# Verify the second build was much faster (delegation working). Clamp the
# denominator to 1 so a 0ns timer reading cannot abort the script with a
# division-by-zero arithmetic error under `set -e`.
if [[ "$SECOND_BUILD_TIME" -le 0 ]]; then
    SECOND_BUILD_TIME=1
fi
SPEED_RATIO=$((FIRST_BUILD_TIME / SECOND_BUILD_TIME))
if [[ "$SPEED_RATIO" -lt 3 ]]; then
    echo "[WARNING] Second build not significantly faster than first (ratio: $SPEED_RATIO). May indicate delegation not working optimally."
fi

# Count events added by the second build (deltas against the first build).
SECOND_BUILD_EVENTS=$(sqlite3 "$TEST_DB" "SELECT COUNT(*) FROM build_events;" 2>/dev/null || echo "0")
SECOND_JOB_EVENTS=$(sqlite3 "$TEST_DB" "SELECT COUNT(*) FROM job_events;" 2>/dev/null || echo "0")
TOTAL_JOB_EVENTS=$((SECOND_JOB_EVENTS - FIRST_JOB_EVENTS))

echo "[INFO] Second build added: $((SECOND_BUILD_EVENTS - FIRST_BUILD_EVENTS)) total events, $TOTAL_JOB_EVENTS job events"

# Verify delegation occurred: at least one job skipped (status '6').
SKIPPED_JOBS=$(sqlite3 "$TEST_DB" "SELECT COUNT(*) FROM job_events WHERE status = '6';" 2>/dev/null || echo "0")
DELEGATION_EVENTS=$(sqlite3 "$TEST_DB" "SELECT COUNT(*) FROM delegation_events;" 2>/dev/null || echo "0")

echo "[INFO] Delegation results: $SKIPPED_JOBS jobs skipped, $DELEGATION_EVENTS delegation events"

if [[ "$SKIPPED_JOBS" -eq 0 ]]; then
    echo "[ERROR] No jobs were skipped in second build - delegation not working"
    echo "[DEBUG] Job event status breakdown:"
    sqlite3 "$TEST_DB" "SELECT status, COUNT(*) FROM job_events GROUP BY status ORDER BY status;" 2>/dev/null || echo "Failed to query job events"
    exit 1
fi

# Verify no partition reached "available" more than once (no rebuilds).
DUPLICATE_PARTITIONS=$(sqlite3 "$TEST_DB" "
    SELECT partition_ref, COUNT(*) as build_count
    FROM partition_events
    WHERE status = '4'
    GROUP BY partition_ref
    HAVING build_count > 1
" 2>/dev/null || echo "")

if [[ -n "$DUPLICATE_PARTITIONS" ]]; then
    echo "[ERROR] Partitions were built multiple times (delegation failed):"
    echo "$DUPLICATE_PARTITIONS"
    exit 1
fi

# Verify the requested target partition itself was built exactly once.
TARGET_PARTITION_BUILDS=$(sqlite3 "$TEST_DB" "
    SELECT COUNT(*)
    FROM partition_events
    WHERE partition_ref = '$PARTITION_REF' AND status = '4'
" 2>/dev/null || echo "0")

if [[ "$TARGET_PARTITION_BUILDS" -ne 1 ]]; then
    echo "[ERROR] Target partition '$PARTITION_REF' was built $TARGET_PARTITION_BUILDS times instead of 1"
    sqlite3 "$TEST_DB" "SELECT be.build_request_id, pe.partition_ref, pe.status, be.timestamp FROM partition_events pe JOIN build_events be ON pe.event_id = be.event_id WHERE pe.partition_ref = '$PARTITION_REF' ORDER BY be.timestamp;"
    exit 1
fi

echo "[INFO] ✅ Delegation test passed!"
echo "[INFO] Summary:"
echo "[INFO] - First build: $((FIRST_BUILD_TIME / 1000000))ms, created $PARTITION_AVAILABLE_COUNT available partitions"
echo "[INFO] - Second build: $((SECOND_BUILD_TIME / 1000000))ms, delegated $SKIPPED_JOBS jobs"
echo "[INFO] - Speed improvement: ${SPEED_RATIO}x faster"
echo "[INFO] - No duplicate partition building detected"

# Clean up (success path only; failure paths exit above and keep the logs).
rm -f "$TEST_DB" /tmp/delegation_*.log /tmp/*generated_number* 2>/dev/null || true

View file

@ -38,7 +38,8 @@ echo "[INFO] Testing Service build for podcast reviews..."
SERVICE_DB_PATH="/tmp/podcast_reviews_graph_service.db" SERVICE_DB_PATH="/tmp/podcast_reviews_graph_service.db"
rm -f "$SERVICE_DB_PATH" rm -f "$SERVICE_DB_PATH"
# Start service # Start service with its own database
export DATABUILD_BUILD_EVENT_LOG="sqlite://$SERVICE_DB_PATH"
SERVICE_PORT=58082 SERVICE_PORT=58082
"$SERVICE_BINARY" --port="$SERVICE_PORT" --host="127.0.0.1" > /tmp/podcast_service.log 2>&1 & "$SERVICE_BINARY" --port="$SERVICE_PORT" --host="127.0.0.1" > /tmp/podcast_service.log 2>&1 &
SERVICE_PID=$! SERVICE_PID=$!
@ -90,7 +91,7 @@ for i in {1..60}; do
echo "[ERROR] Service build failed: $STATUS_RESPONSE" echo "[ERROR] Service build failed: $STATUS_RESPONSE"
exit 1 exit 1
;; ;;
"running"|"RUNNING"|"pending"|"PENDING"|"planning"|"PLANNING") "running"|"RUNNING"|"pending"|"PENDING"|"planning"|"PLANNING"|"executing"|"EXECUTING")
echo "[INFO] Build status: $STATUS" echo "[INFO] Build status: $STATUS"
sleep 2 sleep 2
;; ;;
@ -136,11 +137,12 @@ if [[ "$CLI_EVENTS" -gt 0 ]] && [[ "$SERVICE_EVENTS" -gt 0 ]]; then
echo "[INFO] Partition events: CLI=$CLI_PARTITION_EVENTS, Service=$SERVICE_PARTITION_EVENTS" echo "[INFO] Partition events: CLI=$CLI_PARTITION_EVENTS, Service=$SERVICE_PARTITION_EVENTS"
echo "[INFO] Request events: CLI=$CLI_REQUEST_EVENTS, Service=$SERVICE_REQUEST_EVENTS" echo "[INFO] Request events: CLI=$CLI_REQUEST_EVENTS, Service=$SERVICE_REQUEST_EVENTS"
# Validate core events are identical (job, partition, and request events should all match now) # Validate that CLI and Service produce identical event patterns
if [[ "$CLI_JOB_EVENTS" -eq "$SERVICE_JOB_EVENTS" ]] && [[ "$CLI_PARTITION_EVENTS" -eq "$SERVICE_PARTITION_EVENTS" ]] && [[ "$CLI_REQUEST_EVENTS" -eq "$SERVICE_REQUEST_EVENTS" ]]; then if [[ "$CLI_JOB_EVENTS" -eq "$SERVICE_JOB_EVENTS" ]] && [[ "$CLI_PARTITION_EVENTS" -eq "$SERVICE_PARTITION_EVENTS" ]] && [[ "$CLI_REQUEST_EVENTS" -eq "$SERVICE_REQUEST_EVENTS" ]]; then
echo "[INFO] ✅ All build events (job, partition, and request) are identical" echo "[INFO] ✅ Core build events (job, partition, and request) are identical"
else else
echo "[ERROR] ❌ Build events differ between CLI and Service - this indicates a problem" echo "[ERROR] ❌ Core build events differ - CLI and Service should produce identical events"
echo "[ERROR] This indicates the CLI is not properly coordinating analysis and execution phases"
exit 1 exit 1
fi fi
@ -149,6 +151,7 @@ if [[ "$CLI_EVENTS" -gt 0 ]] && [[ "$SERVICE_EVENTS" -gt 0 ]]; then
echo "[INFO] ✅ Total event counts are identical: $CLI_EVENTS events each" echo "[INFO] ✅ Total event counts are identical: $CLI_EVENTS events each"
else else
echo "[ERROR] ❌ Total event counts differ: CLI=$CLI_EVENTS, Service=$SERVICE_EVENTS" echo "[ERROR] ❌ Total event counts differ: CLI=$CLI_EVENTS, Service=$SERVICE_EVENTS"
echo "[ERROR] CLI and Service should produce identical event patterns for the same operations"
exit 1 exit 1
fi fi

View file

@ -12,8 +12,14 @@ if [[ -z "$CLI_BUILD" ]] || [[ -z "$SERVICE_BINARY" ]]; then
fi fi
echo "[INFO] Testing CLI build..." echo "[INFO] Testing CLI build..."
# Clean up any existing partition state and databases to prevent delegation
rm -f /tmp/simple_test_cli.db /tmp/basic_graph_service.db
rm -f /tmp/*generated_number* 2>/dev/null || true
# Kill any existing service processes that might hold database locks
killall basic_graph.service 2>/dev/null || true
killall build_graph_service 2>/dev/null || true
sleep 1
export DATABUILD_BUILD_EVENT_LOG="sqlite:///tmp/simple_test_cli.db" export DATABUILD_BUILD_EVENT_LOG="sqlite:///tmp/simple_test_cli.db"
rm -f /tmp/simple_test_cli.db
# Test CLI build # Test CLI build
if ! "$CLI_BUILD" "generated_number/pippin" > /tmp/cli_output.log 2>&1; then if ! "$CLI_BUILD" "generated_number/pippin" > /tmp/cli_output.log 2>&1; then
@ -36,9 +42,11 @@ fi
echo "[INFO] Testing Service build..." echo "[INFO] Testing Service build..."
# The service uses a hardcoded database path # The service uses a hardcoded database path
SERVICE_DB_PATH="/tmp/basic_graph_service.db" SERVICE_DB_PATH="/tmp/basic_graph_service.db"
# Clean up service database (already cleaned above, but being explicit)
rm -f "$SERVICE_DB_PATH" rm -f "$SERVICE_DB_PATH"
# Start service # Start service with its own database
export DATABUILD_BUILD_EVENT_LOG="sqlite://$SERVICE_DB_PATH"
SERVICE_PORT=58080 SERVICE_PORT=58080
"$SERVICE_BINARY" --port="$SERVICE_PORT" --host="127.0.0.1" > /tmp/service.log 2>&1 & "$SERVICE_BINARY" --port="$SERVICE_PORT" --host="127.0.0.1" > /tmp/service.log 2>&1 &
SERVICE_PID=$! SERVICE_PID=$!
@ -92,7 +100,7 @@ for i in {1..30}; do
echo "[ERROR] Service build failed: $STATUS_RESPONSE" echo "[ERROR] Service build failed: $STATUS_RESPONSE"
exit 1 exit 1
;; ;;
"running"|"RUNNING"|"pending"|"PENDING"|"planning"|"PLANNING") "running"|"RUNNING"|"pending"|"PENDING"|"planning"|"PLANNING"|"executing"|"EXECUTING")
echo "[INFO] Build status: $STATUS" echo "[INFO] Build status: $STATUS"
sleep 2 sleep 2
;; ;;
@ -138,11 +146,21 @@ if [[ "$CLI_EVENTS" -gt 0 ]] && [[ "$SERVICE_EVENTS" -gt 0 ]]; then
echo "[INFO] Partition events: CLI=$CLI_PARTITION_EVENTS, Service=$SERVICE_PARTITION_EVENTS" echo "[INFO] Partition events: CLI=$CLI_PARTITION_EVENTS, Service=$SERVICE_PARTITION_EVENTS"
echo "[INFO] Request events: CLI=$CLI_REQUEST_EVENTS, Service=$SERVICE_REQUEST_EVENTS" echo "[INFO] Request events: CLI=$CLI_REQUEST_EVENTS, Service=$SERVICE_REQUEST_EVENTS"
# Validate core events are identical # Validate that CLI and Service produce identical event patterns
if [[ "$CLI_JOB_EVENTS" -eq "$SERVICE_JOB_EVENTS" ]] && [[ "$CLI_PARTITION_EVENTS" -eq "$SERVICE_PARTITION_EVENTS" ]] && [[ "$CLI_REQUEST_EVENTS" -eq "$SERVICE_REQUEST_EVENTS" ]]; then if [[ "$CLI_JOB_EVENTS" -eq "$SERVICE_JOB_EVENTS" ]] && [[ "$CLI_PARTITION_EVENTS" -eq "$SERVICE_PARTITION_EVENTS" ]] && [[ "$CLI_REQUEST_EVENTS" -eq "$SERVICE_REQUEST_EVENTS" ]]; then
echo "[INFO] ✅ Core build events (job and partition) are identical" echo "[INFO] ✅ Core build events (job, partition, and request) are identical"
else else
echo "[ERROR] ❌ Core build events differ - this indicates a problem" echo "[ERROR] ❌ Core build events differ - CLI and Service should produce identical events"
echo "[ERROR] This indicates the CLI is not properly coordinating analysis and execution phases"
exit 1
fi
# Validate total event counts are identical
if [[ "$CLI_EVENTS" -eq "$SERVICE_EVENTS" ]]; then
echo "[INFO] ✅ Total event counts are identical: $CLI_EVENTS events each"
else
echo "[ERROR] ❌ Total event counts differ: CLI=$CLI_EVENTS, Service=$SERVICE_EVENTS"
echo "[ERROR] CLI and Service should produce identical event patterns for the same operations"
exit 1 exit 1
fi fi

View file

@ -6,7 +6,10 @@ set -euo pipefail
echo "[INFO] Validating E2E test runner setup" echo "[INFO] Validating E2E test runner setup"
# Check if the test runner exists # Check if the test runner exists
RUNNER_PATH="/Users/stuart/Projects/databuild/run_e2e_tests.sh" # Find the runner relative to the workspace root
SCRIPT_DIR="$(dirname "${BASH_SOURCE[0]}")"
WORKSPACE_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
RUNNER_PATH="$WORKSPACE_ROOT/run_e2e_tests.sh"
if [[ ! -f "$RUNNER_PATH" ]]; then if [[ ! -f "$RUNNER_PATH" ]]; then
echo "[ERROR] E2E test runner not found at: $RUNNER_PATH" echo "[ERROR] E2E test runner not found at: $RUNNER_PATH"
exit 1 exit 1