parent 5361e295e0
commit 2cf778a07b
13 changed files with 737 additions and 316 deletions
MODULE.bazel | 14

@@ -3,15 +3,21 @@ module(
     version = "0.1",
 )
 
-bazel_dep(name = "bazel_skylib", version = "1.8.1")
-bazel_dep(name = "platforms", version = "0.0.11")
-bazel_dep(name = "rules_shell", version = "0.4.0")
+bazel_dep(name = "bazel_skylib", version = "1.8.2")
+bazel_dep(name = "platforms", version = "1.0.0")
+bazel_dep(name = "rules_shell", version = "0.6.1")
 bazel_dep(name = "rules_oci", version = "2.2.6")
 bazel_dep(name = "aspect_bazel_lib", version = "2.14.0")
-bazel_dep(name = "rules_rust", version = "0.61.0")
+bazel_dep(name = "rules_rust", version = "0.67.0")
 bazel_dep(name = "rules_proto", version = "7.0.2")
 bazel_dep(name = "protobuf", version = "29.0", repo_name = "com_google_protobuf")
 
+#rust = use_extension("@rules_rust//rust:extensions.bzl", "rust")
+#rust.toolchain(
+# edition = "2024",
+# versions = ["1.91.1"],
+#)
+
 crate = use_extension("@rules_rust//crate_universe:extensions.bzl", "crate")
 crate.spec(
     features = ["derive"],

File diff suppressed because one or more lines are too long
@@ -40,6 +40,7 @@ rust_test(
     name = "databuild_test",
     crate = ":databuild",
     data = ["//databuild/test:test_job_helper"],
+    env = {"RUST_BACKTRACE": "1"},
 )
 
 # Legacy filegroup for backwards compatibility
@@ -6,15 +6,15 @@ use std::error::Error;
 use std::fmt::Debug;
 use std::time::{SystemTime, UNIX_EPOCH};
 use crate::build_state::BuildState;
-use crate::util::current_timestamp;
+use crate::util::{current_timestamp, DatabuildError};
 
 pub trait BELStorage {
-    fn append_event(&mut self, event: &Event) -> Result<u64, Box<dyn Error>>;
+    fn append_event(&mut self, event: &Event) -> Result<u64, DatabuildError>;
     fn list_events(
         &self,
         since_idx: u64,
         limit: u64,
-    ) -> Result<Vec<DataBuildEvent>, Box<dyn Error>>;
+    ) -> Result<Vec<DataBuildEvent>, DatabuildError>;
 }
 
 #[derive(Debug, Clone)]

@@ -35,7 +35,7 @@ impl MemoryBELStorage {
 }
 
 impl BELStorage for MemoryBELStorage {
-    fn append_event(&mut self, event: &Event) -> Result<u64, Box<dyn Error>> {
+    fn append_event(&mut self, event: &Event) -> Result<u64, DatabuildError> {
         let timestamp = current_timestamp();
         let dbe = DataBuildEvent {
             timestamp,

@@ -50,7 +50,7 @@ impl BELStorage for MemoryBELStorage {
         &self,
         since_idx: u64,
         limit: u64,
-    ) -> Result<Vec<DataBuildEvent>, Box<dyn Error>> {
+    ) -> Result<Vec<DataBuildEvent>, DatabuildError> {
         Ok(self
             .events
             .iter()

@@ -67,7 +67,7 @@ struct SqliteBELStorage {
 }
 
 impl SqliteBELStorage {
-    fn create(database_url: &str) -> Result<SqliteBELStorage, Box<dyn Error>> {
+    fn create(database_url: &str) -> Result<SqliteBELStorage, DatabuildError> {
         let connection = Connection::open(database_url)?;
 
         // Create the events table

@@ -85,7 +85,7 @@ impl SqliteBELStorage {
 }
 
 impl BELStorage for SqliteBELStorage {
-    fn append_event(&mut self, event: &Event) -> Result<u64, Box<dyn Error>> {
+    fn append_event(&mut self, event: &Event) -> Result<u64, DatabuildError> {
         let now = SystemTime::now();
         let duration_since_epoch = now.duration_since(UNIX_EPOCH).expect("Time went backwards");
         let timestamp = duration_since_epoch.as_nanos() as u64;

@@ -113,7 +113,7 @@ impl BELStorage for SqliteBELStorage {
         &self,
         since_idx: u64,
         limit: u64,
-    ) -> Result<Vec<DataBuildEvent>, Box<dyn Error>> {
+    ) -> Result<Vec<DataBuildEvent>, DatabuildError> {
         let mut stmt = self.connection.prepare(
             "SELECT event_id, timestamp, event_data FROM events
              WHERE timestamp > ?1

@@ -164,8 +164,8 @@ impl<S: BELStorage + Debug> BuildEventLog<S> {
         BuildEventLog { storage, state }
     }
 
-    pub fn append_event(&mut self, event: &Event) -> Result<u64, Box<dyn Error>> {
-        self.state.handle_event(&event);
+    pub fn append_event(&mut self, event: &Event) -> Result<u64, DatabuildError> {
+        self.state.handle_event(&event)?;
         let idx = self.storage.append_event(event)?;
         Ok(idx)
     }
@@ -1,10 +1,15 @@
 use crate::data_build_event::Event;
-use crate::util::current_timestamp;
+use crate::data_deps::{missing_deps_to_want_events, WantTimestamps};
+use crate::job_run::{DepMissJobRun, SubProcessBackend};
+use crate::util::{current_timestamp, DatabuildError};
 use crate::{
-    JobRunDetail, ListJobRunsRequest, ListJobRunsResponse, ListPartitionsRequest,
-    ListPartitionsResponse, ListTaintsRequest, ListTaintsResponse, ListWantsRequest,
-    ListWantsResponse, PartitionDetail, PartitionRef, PartitionStatusCode, TaintDetail, WantDetail,
-    WantStatusCode,
+    JobRunBufferEventV1, JobRunCancelEventV1, JobRunDetail, JobRunFailureEventV1,
+    JobRunHeartbeatEventV1, JobRunMissingDepsEventV1,
+    JobRunStatusCode, JobRunSuccessEventV1, ListJobRunsRequest, ListJobRunsResponse,
+    ListPartitionsRequest, ListPartitionsResponse, ListTaintsRequest, ListTaintsResponse,
+    ListWantsRequest, ListWantsResponse, PartitionDetail, PartitionRef, PartitionStatusCode,
+    TaintCreateEventV1, TaintDeleteEventV1, TaintDetail, WantCancelEventV1, WantCreateEventV1,
+    WantDetail, WantStatusCode,
 };
 use rusqlite::types::FromSql;
 use rusqlite::ToSql;
@@ -35,10 +40,10 @@ and updates, which is exceptionally fast.
 
 #[derive(Debug, Clone)]
 pub struct BuildState {
-    pub wants: BTreeMap<String, WantDetail>,
-    pub taints: BTreeMap<String, TaintDetail>,
-    pub partitions: BTreeMap<String, PartitionDetail>,
-    pub job_runs: BTreeMap<String, JobRunDetail>,
+    wants: BTreeMap<String, WantDetail>,
+    taints: BTreeMap<String, TaintDetail>,
+    partitions: BTreeMap<String, PartitionDetail>,
+    job_runs: BTreeMap<String, JobRunDetail>,
 }
 
 impl Default for BuildState {
@@ -53,21 +58,202 @@ impl Default for BuildState {
 }
 
 impl BuildState {
-    pub fn handle_event(&mut self, event: &Event) -> () {
-        match event {
-            Event::WantCreateV1(e) => {
-                self.wants.insert(e.want_id.clone(), e.clone().into());
-            }
-            Event::WantCancelV1(e) => {
-                if let Some(want) = self.wants.get_mut(&e.want_id) {
-                    want.status = Some(WantStatusCode::WantCanceled.into());
-                    want.last_updated_timestamp = current_timestamp();
-                }
-            }
-            _ => (),
-        }
-    }
-
+    pub fn count_job_runs(&self) -> usize {
+        self.job_runs.len()
+    }
+
+    /// Handles reacting to events, updating state, and erroring if its an invalid state transition
+    pub fn handle_event(&mut self, event: &Event) -> Result<(), DatabuildError> {
+        match event {
+            // JobRun events
+            Event::JobRunBufferV1(e) => self.handle_job_run_buffer(e),
+            Event::JobRunHeartbeatV1(e) => self.handle_job_run_heartbeat(e),
+            Event::JobRunFailureV1(e) => self.handle_job_run_failure(e),
+            Event::JobRunCancelV1(e) => self.handle_job_run_cancel(e),
+            Event::JobRunSuccessV1(e) => self.handle_job_run_success(e),
+            Event::JobRunMissingDepsV1(e) => self.handle_job_run_dep_miss(e),
+            // Want events
+            Event::WantCreateV1(e) => self.handle_want_create(e),
+            Event::WantCancelV1(e) => self.handle_want_cancel(e),
+            // Taint events
+            Event::TaintCreateV1(e) => self.handle_taint_create(e),
+            Event::TaintDeleteV1(e) => self.handle_taint_delete(e),
+            // Ruh roh!
+            _ => panic!("Unhandled event type! {:?}", event),
+        }
+    }
+
+    fn handle_want_create(&mut self, event: &WantCreateEventV1) -> Result<(), DatabuildError> {
+        self.wants
+            .insert(event.want_id.clone(), event.clone().into());
+        Ok(())
+    }
+
+    fn handle_want_cancel(&mut self, event: &WantCancelEventV1) -> Result<(), DatabuildError> {
+        if let Some(want) = self.wants.get_mut(&event.want_id) {
+            want.status = Some(WantStatusCode::WantCanceled.into());
+            want.last_updated_timestamp = current_timestamp();
+        }
+        Ok(())
+    }
+
+    fn handle_job_run_buffer(&mut self, event: &JobRunBufferEventV1) -> Result<(), DatabuildError> {
+        // No job run should exist
+        if self.job_runs.get(&event.job_run_id).is_some() {
+            Err(format!("Job run ID collision on job run ID {}", event.job_run_id).into())
+        } else {
+            // Create job run to be inserted
+            let job_run: JobRunDetail = event.clone().into();
+
+            for pref in job_run.building_partitions.iter() {
+                // Update all wants that point to this partition ref to `Building`
+                // Query notes: "update all wants that point to this partition to building"
+                if let Some(want) = self.wants.get_mut(&pref.r#ref) {
+                    want.status = Some(WantStatusCode::WantBuilding.into());
+                }
+            }
+
+            self.job_runs.insert(event.job_run_id.clone(), job_run.clone());
+            println!("Inserted job run: {:?}", job_run);
+            Ok(())
+        }
+    }
+
+    fn update_job_run_status(
+        &mut self,
+        job_run_id: &str,
+        status: JobRunStatusCode,
+    ) -> Result<(), DatabuildError> {
+        if let Some(job_run) = self.job_runs.get_mut(job_run_id) {
+            job_run.last_heartbeat_at = Some(current_timestamp());
+            job_run.status = Some(status.into());
+            Ok(())
+        } else {
+            Err(format!("Job run ID {} not found", job_run_id).into())
+        }
+    }
+
+    fn update_partition_status(
+        &mut self,
+        pref: &PartitionRef,
+        status: PartitionStatusCode,
+        job_run_id: Option<&str>,
+    ) -> Result<(), DatabuildError> {
+        if let Some(partition) = self.partitions.get_mut(&pref.r#ref) {
+            partition.status = Some(status.clone().into());
+            partition.last_updated_timestamp = Some(current_timestamp());
+            if let Some(job_run_id) = job_run_id.map(str::to_string) {
+                if !partition.job_run_ids.contains(&job_run_id) {
+                    partition.job_run_ids.push(job_run_id);
+                }
+            }
+        } else {
+            // Partition doesn't exist yet, needs to be inserted
+            let want_ids = if let Some(jrid) = job_run_id {
+                let job_run = self.get_job_run(jrid).expect("Job run must exist for partition");
+                job_run.servicing_wants.iter().map(|wap| wap.want_id.clone()).collect()
+            } else {
+                vec![]
+            };
+
+            let partition = PartitionDetail {
+                r#ref: Some(pref.clone()),
+                status: Some(status.into()),
+                last_updated_timestamp: Some(current_timestamp()),
+                job_run_ids: job_run_id.map(|jrid| vec![jrid.to_string()]).unwrap_or(vec![]),
+                want_ids,
+                ..PartitionDetail::default()
+            };
+            self.partitions.insert(pref.r#ref.clone(), partition);
+        };
+
+        self.update_wants_for_partition(&pref)
+    }
+
+    /// Walks the state from this want ID to update its status.
+    fn update_want_status(&mut self, want_id: &str) -> Result<(), DatabuildError> {
+        if let Some(want) = self.wants.get(want_id) {
+            let details: Vec<Option<PartitionDetail>> = want
+                .upstreams
+                .iter()
+                .map(|pref| self.get_partition(&pref.r#ref))
+                .collect();
+            let status: WantStatusCode = details.into();
+            if let Some(mut_want) = self.wants.get_mut(want_id) {
+                mut_want.status = Some(status.into());
+                mut_want.last_updated_timestamp = current_timestamp();
+            }
+            Ok(())
+        } else {
+            Err(format!("Want id {} not found", want_id).into())
+        }
+    }
+
+    fn handle_job_run_heartbeat(
+        &mut self,
+        event: &JobRunHeartbeatEventV1,
+    ) -> Result<(), DatabuildError> {
+        self.update_job_run_status(&event.job_run_id, JobRunStatusCode::JobRunRunning)
+    }
+
+    fn handle_job_run_success(
+        &mut self,
+        event: &JobRunSuccessEventV1,
+    ) -> Result<(), DatabuildError> {
+        println!("Job run success event: {:?}", event);
+        self.update_job_run_status(&event.job_run_id, JobRunStatusCode::JobRunSucceeded)?;
+        let job_run = self.get_job_run(&event.job_run_id).unwrap();
+        // Update partitions being build by this job
+        for pref in job_run.building_partitions {
+            self.update_partition_status(&pref, PartitionStatusCode::PartitionLive, Some(&event.job_run_id))?;
+        }
+        Ok(())
+    }
+
+    fn update_wants_for_partition(&mut self, pref: &PartitionRef) -> Result<(), DatabuildError> {
+        // todo!("Go to every want that references this partition and update its status")
+        let want_ids = self
+            .partitions
+            .get(&pref.r#ref)
+            .map(|p| p.want_ids.clone())
+            .ok_or(format!("Partition for ref {} not found", pref.r#ref))?;
+        for want_id in want_ids.iter() {
+            self.update_want_status(want_id)?;
+        }
+        Ok(())
+    }
+
+    fn handle_job_run_failure(
+        &mut self,
+        event: &JobRunFailureEventV1,
+    ) -> Result<(), DatabuildError> {
+        self.update_job_run_status(&event.job_run_id, JobRunStatusCode::JobRunFailed)?;
+        let job_run = self.get_job_run(&event.job_run_id).unwrap();
+        for pref in job_run.building_partitions {
+            self.update_partition_status(&pref, PartitionStatusCode::PartitionFailed, Some(&event.job_run_id))?;
+        }
+        Ok(())
+    }
+
+    fn handle_job_run_cancel(&mut self, event: &JobRunCancelEventV1) -> Result<(), DatabuildError> {
+        todo!("should update already inserted job run, partition status, want status")
+    }
+
+    fn handle_job_run_dep_miss(
+        &mut self,
+        event: &JobRunMissingDepsEventV1,
+    ) -> Result<(), DatabuildError> {
+        todo!("should update already inserted job run, schedule wants...?")
+    }
+
+    fn handle_taint_create(&mut self, event: &TaintCreateEventV1) -> Result<(), DatabuildError> {
+        todo!("...?")
+    }
+
+    fn handle_taint_delete(&mut self, event: &TaintDeleteEventV1) -> Result<(), DatabuildError> {
+        todo!("...?")
+    }
+
     fn with_wants(self, wants: BTreeMap<String, WantDetail>) -> Self {
         Self { wants, ..self }
     }
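A minimal sketch of how the new `BuildState::handle_event` dispatch is meant to be driven. This is not part of the commit; crate name and module paths are illustrative, and it leans on items the diff does show (`Default` for `BuildState`, `From<&str> for PartitionRef`, `count_job_runs`).

    // Sketch only: `databuild` paths are assumptions, not the crate's actual layout.
    use databuild::build_state::BuildState;
    use databuild::data_build_event::Event;
    use databuild::util::DatabuildError;
    use databuild::{JobRunBufferEventV1, WantCreateEventV1};

    fn main() -> Result<(), DatabuildError> {
        let mut state = BuildState::default();

        // WantCreateV1 inserts a WantDetail keyed by want_id.
        let want = WantCreateEventV1 {
            want_id: "want-1".into(),
            partitions: vec!["data/alpha".into()],
            ..Default::default()
        };
        state.handle_event(&Event::WantCreateV1(want))?;

        // JobRunBufferV1 inserts a JobRunDetail for the buffered job.
        let buffer = JobRunBufferEventV1 {
            job_run_id: "job-1".into(),
            building_partitions: vec!["data/alpha".into()],
            ..Default::default()
        };
        state.handle_event(&Event::JobRunBufferV1(buffer.clone()))?;
        assert_eq!(state.count_job_runs(), 1);

        // Re-buffering the same job_run_id is an invalid transition and now
        // surfaces as a DatabuildError instead of silently mutating state.
        assert!(state.handle_event(&Event::JobRunBufferV1(buffer)).is_err());
        Ok(())
    }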
@@ -186,6 +372,39 @@ impl BuildState {
             .collect(),
         )
     }
+
+    /// Maps a dep miss into the BEL events it implies, so that the job can be run successfully later
+    pub fn dep_miss_to_events(
+        &self,
+        dep_miss: &DepMissJobRun<SubProcessBackend>,
+    ) -> Result<Vec<Event>, DatabuildError> {
+        let mut events = vec![];
+        // Append literal job run dep miss
+        events.push(dep_miss.state.to_event(&dep_miss.id()));
+        // Append wants from dep miss
+        let job_run_detail = self
+            .get_job_run(&dep_miss.job_run_id.to_string())
+            .ok_or(format!(
+                "Unable to find job run with id `{}`",
+                dep_miss.job_run_id
+            ))?;
+        // Infer data/SLA timestamps from upstream want
+        let want_timestamps: WantTimestamps = job_run_detail
+            .servicing_wants
+            .iter()
+            .flat_map(|wap| self.get_want(&wap.want_id).map(|w| w.into()))
+            .reduce(|a: WantTimestamps, b: WantTimestamps| a.merge(b))
+            .ok_or(format!("No servicing wants found"))?;
+        // Create wants from dep misses
+        let want_events = missing_deps_to_want_events(
+            dep_miss.state.missing_deps.clone(),
+            &dep_miss.job_run_id,
+            want_timestamps,
+        );
+        events.extend(want_events);
+
+        Ok(events)
+    }
 }
 
 /// The status of partitions required by a want to build (sensed from dep miss job run)
@@ -53,7 +53,8 @@ message WantAttributedPartitions {
 message JobRunBufferEventV1 {
   string job_run_id = 1;
   string job_label = 2;
-  repeated WantAttributedPartitions want_attributed_partitions = 3;
+  repeated PartitionRef building_partitions = 3;
+  repeated WantAttributedPartitions want_attributed_partitions = 4;
   // TODO how do we handle buffer definition? Start simple, noop until we want something here?
 }
 // Just indicates that job has entered queue

@@ -189,6 +190,7 @@ message PartitionDetail {
   optional uint64 last_updated_timestamp = 3;
   // IDs that associate the partition with other objects
   repeated string job_run_ids = 4;
+  // Wants that reference this partition
   repeated string want_ids = 5;
   repeated string taint_ids = 6;
 }

@@ -225,7 +227,8 @@ message JobRunDetail {
   string id = 1;
   JobRunStatus status = 2;
   optional uint64 last_heartbeat_at = 3;
-  repeated WantAttributedPartitions servicing_wants = 4;
+  repeated PartitionRef building_partitions = 4;
+  repeated WantAttributedPartitions servicing_wants = 5;
 }
 
 
@@ -1,10 +1,7 @@
 use crate::data_build_event::Event;
 use crate::util::current_timestamp;
-use crate::{
-    event_source, EventSource, JobRunStatus, JobRunStatusCode, JobTriggeredEvent,
-    ManuallyTriggeredEvent, PartitionRef, PartitionStatus, PartitionStatusCode, WantCancelEventV1,
-    WantCreateEventV1, WantDetail, WantStatus, WantStatusCode,
-};
+use crate::{event_source, EventSource, JobRunBufferEventV1, JobRunDetail, JobRunStatus, JobRunStatusCode, JobTriggeredEvent, ManuallyTriggeredEvent, PartitionDetail, PartitionRef, PartitionStatus, PartitionStatusCode, WantAttributedPartitions, WantCancelEventV1, WantCreateEventV1, WantDetail, WantStatus, WantStatusCode};
+use crate::PartitionStatusCode::{PartitionFailed, PartitionLive};
 
 impl From<&WantCreateEventV1> for WantDetail {
     fn from(e: &WantCreateEventV1) -> Self {

@@ -38,6 +35,15 @@ impl From<WantCancelEventV1> for Event {
     }
 }
 
+impl From<WantCreateEventV1> for WantAttributedPartitions {
+    fn from(value: WantCreateEventV1) -> Self {
+        Self {
+            want_id: value.want_id,
+            partitions: value.partitions,
+        }
+    }
+}
+
 impl From<WantStatusCode> for WantStatus {
     fn from(code: WantStatusCode) -> Self {
         WantStatus {
@@ -47,6 +53,50 @@ impl From<WantStatusCode> for WantStatus {
     }
 }
 
+impl From<JobRunBufferEventV1> for JobRunDetail {
+    fn from(value: JobRunBufferEventV1) -> Self {
+        Self {
+            id: value.job_run_id,
+            status: Some(JobRunStatusCode::JobRunQueued.into()),
+            last_heartbeat_at: None,
+            building_partitions: value.building_partitions,
+            servicing_wants: value.want_attributed_partitions,
+        }
+    }
+}
+
+
+pub fn want_status_matches_any(pds: &Vec<Option<PartitionDetail>>, status: PartitionStatusCode) -> bool {
+    pds.iter()
+        .any(|pd| pd.clone()
+            .map(|pd| pd.status == Some(status.into()))
+            .unwrap_or(false))
+}
+
+pub fn want_status_matches_all(pds: &Vec<Option<PartitionDetail>>, status: PartitionStatusCode) -> bool {
+    pds.iter()
+        .all(|pd| pd.clone()
+            .map(|pd| pd.status == Some(status.into()))
+            .unwrap_or(false))
+}
+
+/// Merges a list of partition details into a single status code.
+/// Takes the lowest state as the want status.
+impl Into<WantStatusCode> for Vec<Option<PartitionDetail>> {
+
+    fn into(self) -> WantStatusCode {
+        if want_status_matches_any(&self, PartitionFailed) {
+            WantStatusCode::WantFailed
+        } else if want_status_matches_all(&self, PartitionLive) {
+            WantStatusCode::WantSuccessful
+        } else if self.iter().any(|pd| pd.is_none()) {
+            WantStatusCode::WantBuilding
+        } else {
+            WantStatusCode::WantIdle
+        }
+    }
+}
+
 impl From<&str> for PartitionRef {
     fn from(value: &str) -> Self {
         Self {
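The merge added just above reads as a small truth table: any failed upstream fails the want, all live succeeds it, any missing partition keeps it building, otherwise it stays idle. A hedged usage sketch (crate path illustrative, prost-style `Default`/`PartialEq` assumed on the generated types):

    // Sketch only: demonstrates the Vec<Option<PartitionDetail>> -> WantStatusCode merge.
    use databuild::{PartitionDetail, PartitionStatusCode, WantStatusCode};

    fn main() {
        let live = PartitionDetail {
            status: Some(PartitionStatusCode::PartitionLive.into()),
            ..Default::default()
        };
        let failed = PartitionDetail {
            status: Some(PartitionStatusCode::PartitionFailed.into()),
            ..Default::default()
        };

        // Every upstream partition live -> the want is successful.
        let s: WantStatusCode = vec![Some(live.clone()), Some(live.clone())].into();
        assert_eq!(s, WantStatusCode::WantSuccessful);

        // Any failed partition wins -> the want is failed.
        let s: WantStatusCode = vec![Some(live.clone()), Some(failed)].into();
        assert_eq!(s, WantStatusCode::WantFailed);

        // A partition that doesn't exist yet (None) -> the want is still building.
        let s: WantStatusCode = vec![Some(live), None].into();
        assert_eq!(s, WantStatusCode::WantBuilding);
    }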
@@ -88,3 +138,12 @@ impl From<JobTriggeredEvent> for EventSource {
         }
     }
 }
+
+impl From<&WantDetail> for WantAttributedPartitions {
+    fn from(value: &WantDetail) -> Self {
+        Self {
+            want_id: value.want_id.clone(),
+            partitions: value.partitions.clone(),
+        }
+    }
+}
@@ -4,6 +4,7 @@ use crate::{
     EventSource, JobRunCancelEventV1, JobRunFailureEventV1, JobRunMissingDepsEventV1, JobRunStatus,
     JobRunSuccessEventV1, MissingDeps, ReadDeps,
 };
+use crate::util::DatabuildError;
 use std::collections::HashMap;
 use std::error::Error;
 use std::io::{BufRead, BufReader};

@@ -34,21 +35,21 @@ pub trait JobRunBackend: Sized {
     fn start(
         not_started: Self::NotStartedState,
         env: Option<HashMap<String, String>>,
-    ) -> Result<Self::RunningState, Box<dyn Error>>;
+    ) -> Result<Self::RunningState, DatabuildError>;
 
     /// Poll a running job for state changes
     fn poll(
         running: &mut Self::RunningState,
     ) -> Result<
         PollResult<Self::CompletedState, Self::FailedState, Self::DepMissState>,
-        Box<dyn Error>,
+        DatabuildError,
     >;
 
     /// Cancel a running job
     fn cancel_job(
         running: Self::RunningState,
         source: EventSource,
-    ) -> Result<Self::CanceledState, Box<dyn Error>>;
+    ) -> Result<Self::CanceledState, DatabuildError>;
 }
 
 /// Result of polling a running job

@@ -91,14 +92,14 @@ impl<B: JobRunBackend> NotStartedJobRun<B> {
         }
     }
 
-    pub fn run(self) -> Result<RunningJobRun<B>, Box<dyn Error>> {
+    pub fn run(self) -> Result<RunningJobRun<B>, DatabuildError> {
         self.run_with_env(None)
     }
 
     pub fn run_with_env(
         self,
         env: Option<HashMap<String, String>>,
-    ) -> Result<RunningJobRun<B>, Box<dyn Error>> {
+    ) -> Result<RunningJobRun<B>, DatabuildError> {
         let running_state = B::start(self.state, env)?;
         Ok(JobRun {
             job_run_id: self.job_run_id,

@@ -110,7 +111,7 @@ impl<B: JobRunBackend> NotStartedJobRun<B> {
 
 // Methods available only on Running state
 impl<B: JobRunBackend> RunningJobRun<B> {
-    pub fn visit(&mut self) -> Result<JobRunVisitResult<B>, Box<dyn Error>> {
+    pub fn visit(&mut self) -> Result<JobRunVisitResult<B>, DatabuildError> {
         match B::poll(&mut self.state)? {
             PollResult::StillRunning => Ok(JobRunVisitResult::StillRunning),
             PollResult::Completed(completed_state) => {

@@ -140,7 +141,7 @@ impl<B: JobRunBackend> RunningJobRun<B> {
         }
     }
 
-    pub fn cancel(self, source: EventSource) -> Result<CanceledJobRun<B>, Box<dyn Error>> {
+    pub fn cancel(self, source: EventSource) -> Result<CanceledJobRun<B>, DatabuildError> {
         let canceled_state = B::cancel_job(self.state, source)?;
         Ok(JobRun {
             job_run_id: self.job_run_id,

@@ -223,7 +224,7 @@ impl JobRunBackend for SubProcessBackend {
     fn start(
         not_started: Self::NotStartedState,
         env: Option<HashMap<String, String>>,
-    ) -> Result<Self::RunningState, Box<dyn Error>> {
+    ) -> Result<Self::RunningState, DatabuildError> {
         let process = Command::new(not_started.entry_point)
             .args(not_started.args)
             .stdout(Stdio::piped())

@@ -241,7 +242,7 @@ impl JobRunBackend for SubProcessBackend {
         running: &mut Self::RunningState,
     ) -> Result<
         PollResult<Self::CompletedState, Self::FailedState, Self::DepMissState>,
-        Box<dyn Error>,
+        DatabuildError,
     > {
         // Non-blocking check for exit status
         if let Some(exit_status) = running.process.try_wait()? {

@@ -309,7 +310,7 @@ impl JobRunBackend for SubProcessBackend {
     fn cancel_job(
         mut running: Self::RunningState,
         source: EventSource,
-    ) -> Result<Self::CanceledState, Box<dyn Error>> {
+    ) -> Result<Self::CanceledState, DatabuildError> {
         // Kill the process
         running.process.kill()?;
 
@@ -1,30 +1,35 @@
 use crate::build_event_log::{BELStorage, BuildEventLog, MemoryBELStorage};
 use crate::build_state::BuildState;
 use crate::data_build_event::Event;
-use crate::data_deps::{missing_deps_to_want_events, WantTimestamps};
 use crate::job::JobConfiguration;
 use crate::job_run::{
     CompletedJobRun, DepMissJobRun, FailedJobRun, NotStartedJobRun, RunningJobRun,
     SubProcessBackend,
 };
-use crate::{PartitionRef, WantDetail};
+use crate::{JobRunBufferEventV1, PartitionRef, WantDetail};
 use std::collections::HashMap;
 use std::error::Error;
 use std::fmt::Debug;
+use crate::util::DatabuildError;
 
 /**
 Orchestrator turns wants, config, and BEL state into scheduled jobs. It uses lightweight threads +
-the visitor pattern to monitor job exec progress and liveness, and adds
-*/
+the visitor pattern to monitor job exec progress and liveness.
+
+JTBDs:
+- Orchestrator turns job run dep miss failures into derivative wants for the missed partitions
+- Orchestrator turns schedulable wants into job runs to build the requested partitions
+- Orchestrator polls queued and active job runs, keeping track of their state, and scheduling queued
+  jobs when possible
+*/
 struct Orchestrator<S: BELStorage + Debug> {
-    bel: BuildEventLog<S>,
-    not_started_jobs: Vec<NotStartedJobRun<SubProcessBackend>>,
-    running_jobs: Vec<RunningJobRun<SubProcessBackend>>,
-    completed_jobs: Vec<CompletedJobRun<SubProcessBackend>>,
-    failed_jobs: Vec<FailedJobRun<SubProcessBackend>>,
-    dep_miss_jobs: Vec<DepMissJobRun<SubProcessBackend>>,
-    config: OrchestratorConfig,
+    pub bel: BuildEventLog<S>,
+    pub not_started_jobs: Vec<NotStartedJobRun<SubProcessBackend>>,
+    pub running_jobs: Vec<RunningJobRun<SubProcessBackend>>,
+    pub completed_jobs: Vec<CompletedJobRun<SubProcessBackend>>,
+    pub failed_jobs: Vec<FailedJobRun<SubProcessBackend>>,
+    pub dep_miss_jobs: Vec<DepMissJobRun<SubProcessBackend>>,
+    pub config: OrchestratorConfig,
 }
 
 impl Default for Orchestrator<MemoryBELStorage> {
@@ -134,10 +139,22 @@ impl<S: BELStorage + Debug> Orchestrator<S> {
         }
     }
 
-    /** Continuously invoked function to watch job run status */
-    fn poll_job_runs(&mut self) -> Result<(), Box<dyn Error>> {
+    fn job_runs_count(&self) -> usize {
+        self.not_started_jobs.len()
+            + self.running_jobs.len()
+            + self.completed_jobs.len()
+            + self.failed_jobs.len()
+            + self.dep_miss_jobs.len()
+    }
+
+    /// Visits individual job runs, appending resulting events, and moving runs between run status
+    /// containers.
+    fn poll_job_runs(&mut self) -> Result<(), DatabuildError> {
         use crate::job_run::JobRunVisitResult;
 
+        // Coherence check setup
+        let total_runs_count = self.job_runs_count();
+
         // First, start any not-started jobs
         while let Some(job) = self.not_started_jobs.pop() {
             let running = job.run()?;
|||
|
||||
// Visit running jobs and transition them to terminal states
|
||||
let mut still_running = Vec::new();
|
||||
// TODO make sure that failure in the middle can't mess up build state - likely need to
|
||||
// refactor here (e.g. turn state changes into data, commit them after all have been
|
||||
// calculated and validated)
|
||||
for mut job in self.running_jobs.drain(..) {
|
||||
match job.visit()? {
|
||||
JobRunVisitResult::StillRunning => {
|
||||
println!("Still running job: {:?}", job.id());
|
||||
still_running.push(job);
|
||||
}
|
||||
JobRunVisitResult::Completed(completed) => {
|
||||
// Emit success event
|
||||
let event: Event = completed.state.to_event(&completed.id());
|
||||
println!("Completed job: {:?}", completed.id());
|
||||
let result = run_complete_to_events(&self.bel.state, &completed)?;
|
||||
for event in result.events {
|
||||
self.bel.append_event(&event)?;
|
||||
}
|
||||
// Move job to completed
|
||||
self.completed_jobs.push(completed);
|
||||
}
|
||||
JobRunVisitResult::Failed(failed) => {
|
||||
// Emit failure event
|
||||
println!("Failed job: {:?}", failed.id());
|
||||
let event: Event = failed.state.to_event(&failed.id());
|
||||
self.bel.append_event(&event)?;
|
||||
self.failed_jobs.push(failed);
|
||||
}
|
||||
JobRunVisitResult::DepMiss(dep_miss) => {
|
||||
for event in dep_miss_to_events(&self.bel.state, &dep_miss)? {
|
||||
println!("Dep miss job: {:?}", dep_miss.job_run_id);
|
||||
for event in self.bel.state.dep_miss_to_events(&dep_miss)? {
|
||||
self.bel.append_event(&event)?;
|
||||
}
|
||||
// Record missing upstream status in want details
|
||||
|
|
@@ -174,11 +201,18 @@ impl<S: BELStorage + Debug> Orchestrator<S> {
         }
         self.running_jobs = still_running;
 
+        // Panic because this should never happen
+        assert_eq!(
+            self.job_runs_count(),
+            total_runs_count,
+            "Detected job run count change during job run visit (should never happen)"
+        );
+
         Ok(())
     }
 
     /** Continuously invoked function to watch wants and schedule new jobs */
-    fn poll_wants(&mut self) -> Result<(), Box<dyn Error>> {
+    fn poll_wants(&mut self) -> Result<(), DatabuildError> {
         // Collect unhandled wants, group by job that handles each partition,
         let schedulability = self.bel.state.schedulable_wants();
         println!("schedulability: {:?}", schedulability);
@@ -202,8 +236,17 @@ impl<S: BELStorage + Debug> Orchestrator<S> {
                 )
                 .into())
             } else {
+                // Spawn jobs and add events
                 for wg in grouped_wants.want_groups {
-                    self.not_started_jobs.push(wg.spawn()?);
+                    let job_run = wg.spawn()?;
+                    let job_buffer_event = Event::JobRunBufferV1(JobRunBufferEventV1 {
+                        job_run_id: job_run.job_run_id.into(),
+                        job_label: wg.job.label,
+                        building_partitions: wg.wants.iter().map(|w| w.partitions.clone()).flatten().collect(),
+                        want_attributed_partitions: wg.wants.iter().map(|w| w.into()).collect(),
+                    });
+                    self.bel.append_event(&job_buffer_event)?;
+                    self.not_started_jobs.push(job_run);
                 }
 
                 Ok(())
@@ -236,50 +279,48 @@ impl<S: BELStorage + Debug> Orchestrator<S> {
         }
     }
 
-    fn step(&mut self) -> Result<(), Box<dyn Error>> {
+    fn step(&mut self) -> Result<(), DatabuildError> {
         self.poll_job_runs()?;
         self.poll_wants()?;
         Ok(())
     }
 
     /** Entrypoint for running jobs */
-    pub fn join(&mut self) -> Result<(), Box<dyn Error>> {
+    pub fn join(&mut self) -> Result<(), DatabuildError> {
         loop {
             self.step()?
         }
     }
 }
 
-fn dep_miss_to_events(
-    bel_state: &BuildState,
-    dep_miss: &DepMissJobRun<SubProcessBackend>,
-) -> Result<Vec<Event>, Box<dyn Error>> {
-    let mut events = vec![];
-    // Append literal job run dep miss
-    events.push(dep_miss.state.to_event(&dep_miss.id()));
-    // Append wants from dep miss
-    let job_run_detail = bel_state
-        .get_job_run(&dep_miss.job_run_id.to_string())
-        .ok_or(format!(
-            "Unable to find job run with id `{}`",
-            dep_miss.job_run_id
-        ))?;
-    // Infer data/SLA timestamps from upstream want
-    let want_timestamps: WantTimestamps = job_run_detail
-        .servicing_wants
-        .iter()
-        .flat_map(|wap| bel_state.get_want(&wap.want_id).map(|w| w.into()))
-        .reduce(|a: WantTimestamps, b: WantTimestamps| a.merge(b))
-        .ok_or(format!("No servicing wants found"))?;
-    // Create wants from dep misses
-    let want_events = missing_deps_to_want_events(
-        dep_miss.state.missing_deps.clone(),
-        &dep_miss.job_run_id,
-        want_timestamps,
-    );
-    events.extend(want_events);
-
-    Ok(events)
+#[derive(Default, Clone, Debug)]
+pub struct JobRunCompleteResult {
+    /// Events to append to the BEL from this job completing
+    pub events: Vec<Event>,
+}
+
+/// Handle successful run completion:
+/// - Adding run success event
+/// - Updating status for partitions actually built by the job
+fn run_complete_to_events(
+    bel_state: &BuildState,
+    completed: &CompletedJobRun<SubProcessBackend>,
+) -> Result<JobRunCompleteResult, DatabuildError> {
+    let mut events = vec![
+        // Event marking completion of job
+        completed.state.to_event(&completed.id()),
+    ];
+    // let job_detail = bel_state
+    //     .get_job_run(&completed.job_run_id.to_string())
+    //     .ok_or(format!(
+    //         "No job run found for id `{}`",
+    //         completed.job_run_id
+    //     ))?;
+
+    Ok(JobRunCompleteResult {
+        // built_partitions: job_detail.building_partitions,
+        events,
+    })
 }
 
 #[cfg(test)]
@@ -405,6 +446,7 @@ mod tests {
             orchestrator.bel.append_event(&e).expect("append");
         }
         assert_eq!(orchestrator.not_started_jobs.len(), 0);
+        assert_eq!(orchestrator.bel.state.count_job_runs(), 0);
 
         // When
        assert_eq!(orchestrator.bel.state.schedulable_wants().0.len(), 1);
@@ -425,7 +467,8 @@ mod tests {
                 .args,
             vec!["data/alpha"],
             "should have scheduled alpha job"
-        )
+        );
+        assert_eq!(orchestrator.bel.state.count_job_runs(), 1);
     }
 
     // Use case: A schedulable want that can't be matched to a job should return an error
@@ -453,9 +496,10 @@ mod tests {
     mod want_create {
         use crate::data_build_event::Event;
         use crate::job_run::{DepMissJobRun, SubProcessDepMiss};
-        use crate::orchestrator::dep_miss_to_events;
         use crate::orchestrator::tests::{build_orchestrator, setup_scenario_a_to_b};
-        use crate::{JobRunDetail, MissingDeps, WantAttributedPartitions, WantCreateEventV1};
+        use crate::{
+            JobRunBufferEventV1, MissingDeps, WantAttributedPartitions, WantCreateEventV1,
+        };
         use std::marker::PhantomData;
         use uuid::Uuid;
 
@@ -465,33 +509,37 @@ mod tests {
            // Given a
            let mut orchestrator = setup_scenario_a_to_b(build_orchestrator());
            // Add event for originating want
+           let want_create = WantCreateEventV1::sample();
+           let building_partitions = vec!["data/beta".into()];
            orchestrator
                .bel
                .append_event(&Event::WantCreateV1(WantCreateEventV1 {
-                   partitions: vec!["data/beta".into()],
-                   ..WantCreateEventV1::sample()
+                   partitions: building_partitions.clone(),
+                   ..want_create.clone()
                }))
                .expect("event append");
            // Create failed job run detail
+           let want_attributed_partitions: Vec<WantAttributedPartitions> =
+               vec![want_create.clone().into()];
            let job_run_id = Uuid::new_v4();
-           let job_run = JobRunDetail {
-               servicing_wants: orchestrator
-                   .bel
-                   .state
-                   .wants
-                   .values()
-                   .map(|w| WantAttributedPartitions {
-                       want_id: w.want_id.clone(),
-                       partitions: w.partitions.clone(),
-                   })
-                   .collect(),
-               ..JobRunDetail::default()
+           let job_run = JobRunBufferEventV1 {
+               job_run_id: job_run_id.into(),
+               building_partitions: building_partitions.clone(),
+               want_attributed_partitions: want_attributed_partitions.clone(),
+               ..JobRunBufferEventV1::default()
            };
            orchestrator
                .bel
-               .state
-               .job_runs
-               .insert(job_run_id.into(), job_run);
+               .append_event(&Event::JobRunBufferV1(job_run))
+               .expect("event append");
+
+           // Job runs should not be empty
+           orchestrator
+               .bel
+               .state
+               .get_job_run(&job_run_id.to_string())
+               .expect("job run should exist");
 
            // Add event for job failure
            let dep_miss_job_run = DepMissJobRun {
                job_run_id,
@@ -507,7 +555,12 @@ mod tests {
            };
 
            // When calculating events from dep miss
-           let events = dep_miss_to_events(&orchestrator.bel.state, &dep_miss_job_run).unwrap();
+           // TODO this needs to be migrated - orchestrator shouldn't contain mapping logic
+           let events = orchestrator
+               .bel
+               .state
+               .dep_miss_to_events(&dep_miss_job_run)
+               .unwrap();
 
            // Should have scheduled a job for alpha
            assert_eq!(
@@ -537,12 +590,65 @@ mod tests {
 
    /// Orchestrator needs to be able to achieve high level orchestration use cases.
    mod orchestration {
+       use crate::data_build_event::Event;
        use crate::orchestrator::tests::{build_orchestrator, setup_scenario_a_to_b};
+       use crate::{PartitionStatusCode, WantCreateEventV1};
+       use std::thread;
+       use std::time::Duration;
 
        /// Use case: should run a job to produce a partition in reaction to a want, then have the
        /// want fulfilled.
        #[test]
        #[ignore]
        fn test_want_builds_partition() {
-           todo!()
+           // Given
+           let mut orchestrator = setup_scenario_a_to_b(build_orchestrator());
+           // Add event for originating want
+           let partition = "data/alpha";
+           orchestrator
+               .bel
+               .append_event(&Event::WantCreateV1(WantCreateEventV1 {
+                   partitions: vec![partition.into()],
+                   ..WantCreateEventV1::sample()
+               }))
+               .expect("event append");
+
+           // When
+           // Poll wants then schedule pending jobs
+           orchestrator.poll_wants().expect("stage unscheduled jobs based on wants failed");
+           assert_eq!(orchestrator.not_started_jobs.len(), 1);
+           // poll job runs should start job run
+           orchestrator.poll_job_runs().expect("should start run");
+           assert_eq!(orchestrator.running_jobs.len(), 1);
+           assert_eq!(orchestrator.bel.state.count_job_runs(), 1);
+           thread::sleep(Duration::from_millis(1));
+           // Should still be running after 1ms
+           orchestrator.poll_job_runs().expect("should still be running");
+           assert_eq!(orchestrator.running_jobs.len(), 1);
+           assert_eq!(orchestrator.bel.state.count_job_runs(), 1);
+           println!("STATE: {:?}", orchestrator.bel.state);
+
+           // Wait for it to complete
+           thread::sleep(Duration::from_millis(10));
+           orchestrator.poll_job_runs().expect("should be able to poll existing job run");
+
+           // Job run should have succeeded
+           assert!(orchestrator.not_started_jobs.is_empty());
+           assert!(orchestrator.failed_jobs.is_empty());
+           assert!(orchestrator.dep_miss_jobs.is_empty());
+           assert!(orchestrator.running_jobs.is_empty());
+           assert_eq!(orchestrator.completed_jobs.len(), 1);
+
+           // Build state should show partition as live
+           assert_eq!(
+               orchestrator
+                   .bel
+                   .state
+                   .get_partition(partition)
+                   .unwrap()
+                   .status,
+               Some(PartitionStatusCode::PartitionLive.into()),
+               "partition should be live after job run completion"
+           );
        }
 
        // Use case: a graph with multi-hop deps should achieve the multi-hop build
@@ -1,7 +1,75 @@
 use std::time::{SystemTime, UNIX_EPOCH};
+use std::backtrace::Backtrace;
 
 pub fn current_timestamp() -> u64 {
     let now = SystemTime::now();
     let duration_since_epoch = now.duration_since(UNIX_EPOCH).expect("Time went backwards");
     duration_since_epoch.as_nanos() as u64
 }
+
+fn maybe_backtrace() -> Backtrace {
+    if std::env::var("RUST_BACKTRACE").is_ok() {
+        Backtrace::force_capture()
+    } else {
+        Backtrace::disabled()
+    }
+}
+
+#[derive(Debug)]
+pub struct DatabuildError {
+    msg: String,
+    source: Option<Box<dyn std::error::Error + Send + Sync>>,
+    backtrace: Backtrace,
+}
+
+impl DatabuildError {
+    fn new(msg: impl Into<String>) -> Self {
+        Self {
+            msg: msg.into(),
+            source: None,
+            backtrace: maybe_backtrace()
+        }
+    }
+}
+
+impl From<std::io::Error> for DatabuildError {
+    fn from(err: std::io::Error) -> Self {
+        Self {
+            msg: err.to_string(),
+            source: Some(Box::new(err)),
+            backtrace: maybe_backtrace()
+        }
+    }
+}
+
+impl From<rusqlite::Error> for DatabuildError {
+    fn from(err: rusqlite::Error) -> Self {
+        Self {
+            msg: err.to_string(),
+            source: Some(Box::new(err)),
+            backtrace: maybe_backtrace()
+        }
+    }
+}
+
+impl From<prost::EncodeError> for DatabuildError {
+    fn from(err: prost::EncodeError) -> Self {
+        Self {
+            msg: err.to_string(),
+            source: Some(Box::new(err)),
+            backtrace: maybe_backtrace()
+        }
+    }
+}
+
+impl From<String> for DatabuildError {
+    fn from(value: String) -> Self {
+        Self::new(value)
+    }
+}
+
+impl std::fmt::Display for DatabuildError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.msg)
+    }
+}
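`DatabuildError` is designed to flow through `?` via its `From` impls and to be built from ad-hoc strings, which is the pattern the new `BuildState` handlers rely on. A brief usage sketch; the helper function below is hypothetical:

    // Hypothetical helper, only to show the conversion paths.
    fn read_len(path: &str) -> Result<u64, DatabuildError> {
        let bytes = std::fs::read(path)?; // std::io::Error -> DatabuildError via From
        if bytes.is_empty() {
            return Err(format!("`{}` was empty", path).into()); // String -> DatabuildError
        }
        Ok(bytes.len() as u64)
    }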
docs/ideas/metadata.md (new file) | 2

@@ -0,0 +1,2 @@
+
+It would be cool to have user-defined partition/want/job-run metadata, and allow querying of this metadata. Basic example: adding a `run_url` to a job or `adls_location` to a partition. More advanced: adding a `dbx_cores` field to job runs, and using querying over job runs downstream from a want to control parallelism down to the number-of-cores-used level.
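One possible shape for that metadata, purely as a sketch; nothing like this exists in the schema yet and the type below is hypothetical:

    // Hypothetical: free-form, user-defined metadata keyed by name.
    use std::collections::BTreeMap;

    #[derive(Debug, Default)]
    struct PartitionMetadata {
        // e.g. "adls_location" -> "abfss://container/path", "run_url" -> "https://ci/run/123"
        entries: BTreeMap<String, String>,
    }

    fn main() {
        let mut meta = PartitionMetadata::default();
        meta.entries
            .insert("run_url".to_string(), "https://ci/run/123".to_string());
        // A query layer could then filter partitions or job runs on these keys.
        assert!(meta.entries.contains_key("run_url"));
    }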
docs/ideas/querying.md (new file) | 16

@@ -0,0 +1,16 @@
+
+Querying seems to be a fundamental factor of the problem. For instance:
+
+- Upon canceling a want, canceling all wants it spawned, and the jobs attached to them.
+- Answering the question, "what in-progress job runs were spawned by this want?"
+- Answering, "why was this partition built?"
+- Answering, "what partitions needed to be built and jobs run to fulfill this want?"
+- Answering, "what jobs produce the partitions missed by this job run?"
+
+Let's start prefixing functions that should probably be mostly queries with `query_`.
+
+
+Notes on JTBDs and queries:
+
+- When a want is schedulable (query), map the requested partitions to the job runs that create them (query), and start them
+-
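A sketch of what the `query_` convention could look like, written as if it lived inside build_state.rs; the signature is hypothetical and nothing in the codebase defines it yet:

    // Hypothetical query helper over the in-memory BuildState maps.
    impl BuildState {
        /// "Which job runs were spawned to service this want?"
        pub fn query_job_runs_for_want(&self, want_id: &str) -> Vec<&JobRunDetail> {
            self.job_runs
                .values()
                .filter(|jr| {
                    jr.servicing_wants
                        .iter()
                        .any(|wap| wap.want_id == want_id)
                })
                .collect()
        }
    }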
@@ -129,40 +129,6 @@ def parse_crate_specs(module_content):
            crates[package] = crate_info
 
     return crates
-    """Extract crate specifications from MODULE.bazel content."""
-    crates = {}
-
-    # Find all crate.spec() calls
-    spec_pattern = r'crate\.spec\(\s*(.*?)\s*\)'
-    specs = re.findall(spec_pattern, module_content, re.DOTALL)
-
-    for spec in specs:
-        # Parse the spec parameters
-        package_match = re.search(r'package\s*=\s*"([^"]+)"', spec)
-        version_match = re.search(r'version\s*=\s*"([^"]+)"', spec)
-        features_match = re.search(r'features\s*=\s*\[(.*?)\]', spec, re.DOTALL)
-        default_features_match = re.search(r'default_features\s*=\s*False', spec)
-
-        if package_match and version_match:
-            package = package_match.group(1)
-            version = version_match.group(1)
-
-            crate_info = {"version": version}
-
-            # Handle features
-            if features_match:
-                features_str = features_match.group(1)
-                features = [f.strip().strip('"') for f in features_str.split(',') if f.strip()]
-                if features:
-                    crate_info["features"] = features
-
-            # Handle default-features = false
-            if default_features_match:
-                crate_info["default-features"] = False
-
-            crates[package] = crate_info
-
-    return crates
 
 def generate_cargo_toml(crates, structure, project_name="databuild"):
     """Generate Cargo.toml content from parsed crates and project structure."""

@@ -170,7 +136,7 @@ def generate_cargo_toml(crates, structure, project_name="databuild"):
        f'[package]',
        f'name = "{project_name}"',
        f'version = "0.1.0"',
-       f'edition = "2021"',
+       f'edition = "2024"',
        f'',
        f'# Generated from MODULE.bazel for IDE support only',
        f'# Actual dependencies are managed by Bazel',