refactor top level job run interface to represent job states as states

2025-10-16 18:47:23 -07:00 · 2025-10-16 18:47:23 -07:00 · eeb90d0386
commit eeb90d0386
parent 6572d4e3bd
3 changed files with 368 additions and 278 deletions
--- a/databuild/job.rs
+++ b/databuild/job.rs
@ -1,7 +1,6 @@
-use crate::job_run::{spawn_job_run, JobRun, JobRunConfig};
+use crate::job_run::{NotStartedJobRun, SubProcessBackend};
 use crate::{PartitionRef, WantDetail};
 use regex::Regex;
-use std::error::Error;

 #[derive(Debug, Clone)]
 pub struct JobConfiguration {
@ -12,12 +11,11 @@ pub struct JobConfiguration {

 impl JobConfiguration {
    /** Launch job to build the partitions specified by the provided wants. */
-    pub fn spawn(&self, wants: Vec<WantDetail>) -> Result<Box<dyn JobRun>, Box<dyn Error>> {
+    pub fn spawn(&self, wants: Vec<WantDetail>) -> Result<NotStartedJobRun<SubProcessBackend>, std::io::Error> {
        let wanted_refs: Vec<PartitionRef> =
            wants.iter().flat_map(|want| want.partitions.clone()).collect();
        let args: Vec<String> = wanted_refs.iter().map(|pref| pref.r#ref.clone()).collect();
-        let config = JobRunConfig::SubProcess { entry_point: self.entry_point.clone(), args };
-        spawn_job_run(config)
+        Ok(NotStartedJobRun::spawn(self.entry_point.clone(), args))
    }

    pub fn matches(&self, refs: &PartitionRef) -> bool {
--- a/databuild/job_run.rs
+++ b/databuild/job_run.rs
@ -1,4 +1,5 @@
 use std::collections::HashMap;
+use std::marker::PhantomData;
 use crate::data_build_event::Event;
 use crate::{JobRunHeartbeatEventV1, JobRunStatus, JobRunStatusCode, JobRunSuccessEventV1, JobRunFailureEventV1, JobRunCancelEventV1, EventSource};
 use std::error::Error;
@ -9,227 +10,309 @@ use uuid::Uuid;
 // TODO log to /var/log/databuild/jobruns/$JOB_RUN_ID/, and rotate over max size (e.g. only ever use 1GB for logs)
 // Leave door open to background log processor that tails job logs, but don't include in jobrun concept

-pub trait JobRun {
-    fn id(&self) -> Uuid;
-    /**
-    Visit is responsible for observing the state of the job run,
-    */
-    fn visit(&mut self) -> Result<JobRunPollResult, Box<dyn Error>>;
-    fn cancel(&mut self, source: EventSource) -> Result<JobRunCancelEventV1, Box<dyn Error>>;
-    fn run_with_env(&mut self, env: Option<HashMap<String, String>>) -> Result<JobRunHeartbeatEventV1, Box<dyn Error>>;
-    fn run(&mut self) -> Result<JobRunHeartbeatEventV1, Box<dyn Error>> {
+/// Backend trait that defines the state types and transition logic for different job run implementations
+pub trait JobRunBackend: Sized {
+    type NotStartedState;
+    type RunningState;
+    type CompletedState;
+    type FailedState;
+    type CanceledState;
+
+    /// Create a new not-started job run
+    fn create(entry_point: String, args: Vec<String>) -> Self::NotStartedState;
+
+    /// Transition from NotStarted to Running
+    fn start(
+        not_started: Self::NotStartedState,
+        env: Option<HashMap<String, String>>
+    ) -> Result<Self::RunningState, Box<dyn Error>>;
+
+    /// Poll a running job for state changes
+    fn poll(
+        running: &mut Self::RunningState
+    ) -> Result<PollResult<Self::CompletedState, Self::FailedState>, Box<dyn Error>>;
+
+    /// Cancel a running job
+    fn cancel_job(
+        running: Self::RunningState,
+        source: EventSource
+    ) -> Result<Self::CanceledState, Box<dyn Error>>;
+}
+
+/// Result of polling a running job
+pub enum PollResult<C, F> {
+    StillRunning,
+    Completed(C),
+    Failed(F),
+}
+
+/// Generic JobRun that works with any backend, parameterized by state
+pub struct JobRun<B: JobRunBackend, S> {
+    pub job_run_id: Uuid,
+    pub state: S,
+    _backend: PhantomData<B>,
+}
+
+/// Type aliases for specific states
+pub type NotStartedJobRun<B> = JobRun<B, <B as JobRunBackend>::NotStartedState>;
+pub type RunningJobRun<B> = JobRun<B, <B as JobRunBackend>::RunningState>;
+pub type CompletedJobRun<B> = JobRun<B, <B as JobRunBackend>::CompletedState>;
+pub type FailedJobRun<B> = JobRun<B, <B as JobRunBackend>::FailedState>;
+pub type CanceledJobRun<B> = JobRun<B, <B as JobRunBackend>::CanceledState>;
+
+// Methods available on all JobRun states
+impl<B: JobRunBackend, S> JobRun<B, S> {
+    pub fn id(&self) -> Uuid {
+        self.job_run_id
+    }
+}
+
+// Methods available only on NotStarted state
+impl<B: JobRunBackend> NotStartedJobRun<B> {
+    pub fn spawn(entry_point: String, args: Vec<String>) -> Self {
+        JobRun {
+            job_run_id: Uuid::new_v4(),
+            state: B::create(entry_point, args),
+            _backend: PhantomData,
+        }
+    }
+
+    pub fn run(self) -> Result<RunningJobRun<B>, Box<dyn Error>> {
        self.run_with_env(None)
    }
+
+    pub fn run_with_env(
+        self,
+        env: Option<HashMap<String, String>>
+    ) -> Result<RunningJobRun<B>, Box<dyn Error>> {
+        let running_state = B::start(self.state, env)?;
+        Ok(JobRun {
+            job_run_id: self.job_run_id,
+            state: running_state,
+            _backend: PhantomData,
+        })
+    }
+}
+
+// Methods available only on Running state
+impl<B: JobRunBackend> RunningJobRun<B> {
+    pub fn visit(&mut self) -> Result<JobRunVisitResult<B>, Box<dyn Error>> {
+        match B::poll(&mut self.state)? {
+            PollResult::StillRunning => Ok(JobRunVisitResult::StillRunning),
+            PollResult::Completed(completed_state) => {
+                let job_run_id = self.job_run_id;
+                Ok(JobRunVisitResult::Completed(JobRun {
+                    job_run_id,
+                    state: completed_state,
+                    _backend: PhantomData,
+                }))
+            }
+            PollResult::Failed(failed_state) => {
+                let job_run_id = self.job_run_id;
+                Ok(JobRunVisitResult::Failed(JobRun {
+                    job_run_id,
+                    state: failed_state,
+                    _backend: PhantomData,
+                }))
+            }
+        }
+    }
+
+    pub fn cancel(self, source: EventSource) -> Result<CanceledJobRun<B>, Box<dyn Error>> {
+        let canceled_state = B::cancel_job(self.state, source)?;
+        Ok(JobRun {
+            job_run_id: self.job_run_id,
+            state: canceled_state,
+            _backend: PhantomData,
+        })
+    }
+}
+
+/// Result of visiting a running job
+pub enum JobRunVisitResult<B: JobRunBackend> {
+    StillRunning,
+    Completed(CompletedJobRun<B>),
+    Failed(FailedJobRun<B>),
 }

 pub enum JobRunConfig {
    SubProcess { entry_point: String, args: Vec<String> },
 }

-pub fn spawn_job_run(config: JobRunConfig) -> Result<Box<dyn JobRun>, Box<dyn Error>> {
-    match config {
-        JobRunConfig::SubProcess { entry_point, args } => Ok(SubProcessJobRun::spawn(entry_point, args)?),
-        _ => Err("No impl for this job config type".into()),
-    }
-}
+// ===== SubProcess Backend Implementation =====

-pub struct SubProcessJobRun {
-    pub job_run_id: Uuid,
+/// SubProcess backend for running jobs as local subprocesses
+pub struct SubProcessBackend;
+
+/// NotStarted state for SubProcess backend
+pub struct SubProcessNotStarted {
    pub entry_point: String,
    pub args: Vec<String>,
-    pub state: JobRunState,
 }

-enum JobRunState {
-    NotStarted,
-    Running {
-        process: Child,
-        stdout_buffer: Vec<String>,
-    },
-    Completed {
-        exit_code: i32,
-        stdout_buffer: Vec<String>,
-    },
-    Failed {
-        exit_code: i32,
-        reason: String,
-        stdout_buffer: Vec<String>,
-    },
-    Canceled {
-        source: EventSource,
-        stdout_buffer: Vec<String>,
-    },
+/// Running state for SubProcess backend
+pub struct SubProcessRunning {
+    pub process: Child,
+    pub stdout_buffer: Vec<String>,
 }

-impl JobRun for SubProcessJobRun {
-    fn id(&self) -> Uuid {
-        self.job_run_id
+/// Completed state for SubProcess backend
+pub struct SubProcessCompleted {
+    pub exit_code: i32,
+    pub stdout_buffer: Vec<String>,
+}
+
+/// Failed state for SubProcess backend
+pub struct SubProcessFailed {
+    pub exit_code: i32,
+    pub reason: String,
+    pub stdout_buffer: Vec<String>,
+}
+
+/// Canceled state for SubProcess backend
+pub struct SubProcessCanceled {
+    pub source: EventSource,
+    pub stdout_buffer: Vec<String>,
+}
+
+impl JobRunBackend for SubProcessBackend {
+    type NotStartedState = SubProcessNotStarted;
+    type RunningState = SubProcessRunning;
+    type CompletedState = SubProcessCompleted;
+    type FailedState = SubProcessFailed;
+    type CanceledState = SubProcessCanceled;
+
+    fn create(entry_point: String, args: Vec<String>) -> Self::NotStartedState {
+        SubProcessNotStarted { entry_point, args }
    }

-    fn visit(&mut self) -> Result<JobRunPollResult, Box<dyn Error>> {
-        let mut new_events = Vec::new();
-
-        match &mut self.state {
-            JobRunState::Running { process, stdout_buffer } => {
-                // Non-blocking check for exit status
-                if let Some(exit_status) = process.try_wait()? {
-                    // Read any remaining stdout
-                    if let Some(stdout) = process.stdout.take() {
-                        let reader = BufReader::new(stdout);
-                        for line in reader.lines() {
-                            // TODO we should write lines to the job's file logs
-                            if let Ok(line) = line {
-                                stdout_buffer.push(line);
-                            }
-                        }
-                    }
-
-                    // Take ownership of the current state to transition
-                    let old_state = std::mem::replace(&mut self.state, JobRunState::NotStarted);
-                    let stdout_buf = if let JobRunState::Running { stdout_buffer, .. } = old_state {
-                        stdout_buffer
-                    } else {
-                        Vec::new()
-                    };
-
-                    // Check exit status and transition to terminal state
-                    match exit_status.code() {
-                        Some(0) => {
-                            // Success case
-                            self.state = JobRunState::Completed {
-                                exit_code: 0,
-                                stdout_buffer: stdout_buf,
-                            };
-                            new_events.push(Event::JobRunSuccessV1(JobRunSuccessEventV1 {
-                                job_run_id: self.job_run_id.to_string(),
-                            }));
-                            return Ok(JobRunPollResult {
-                                new_events,
-                                status: JobRunStatusCode::JobRunSucceeded.into(),
-                            });
-                        }
-                        Some(code) => {
-                            // Failed with exit code
-                            let reason = format!("Job failed with exit code {}", code);
-                            self.state = JobRunState::Failed {
-                                exit_code: code,
-                                reason: reason.clone(),
-                                stdout_buffer: stdout_buf,
-                            };
-                            new_events.push(Event::JobRunFailureV1(JobRunFailureEventV1 {
-                                job_run_id: self.job_run_id.to_string(),
-                                reason,
-                            }));
-                            return Ok(JobRunPollResult {
-                                new_events,
-                                status: JobRunStatusCode::JobRunFailed.into(),
-                            });
-                        }
-                        None => {
-                            // Terminated by signal (Unix) - treat as failure
-                            let reason = format!("Job terminated by signal: {}", exit_status);
-                            self.state = JobRunState::Failed {
-                                exit_code: -1,
-                                reason: reason.clone(),
-                                stdout_buffer: stdout_buf,
-                            };
-                            new_events.push(Event::JobRunFailureV1(JobRunFailureEventV1 {
-                                job_run_id: self.job_run_id.to_string(),
-                                reason,
-                            }));
-                            return Ok(JobRunPollResult {
-                                new_events,
-                                status: JobRunStatusCode::JobRunFailed.into(),
-                            });
-                        }
-                    }
-                }
-
-                // Still running
-                Ok(JobRunPollResult {
-                    new_events,
-                    status: JobRunStatusCode::JobRunRunning.into(),
-                })
-            }
-            _ => Err("visit() called on non-running job".into()),
-        }
-    }
-
-    fn cancel(&mut self, source: EventSource) -> Result<JobRunCancelEventV1, Box<dyn Error>> {
-        match std::mem::replace(&mut self.state, JobRunState::NotStarted) {
-            JobRunState::Running { mut process, stdout_buffer } => {
-                // Kill the process
-                process.kill()?;
-
-                // Wait for it to actually terminate
-                process.wait()?;
-
-                // Transition to Canceled state
-                self.state = JobRunState::Canceled {
-                    source: source.clone(),
-                    stdout_buffer,
-                };
-
-                Ok(JobRunCancelEventV1 {
-                    job_run_id: self.job_run_id.to_string(),
-                    source: Some(source),
-                    comment: Some("Job was canceled".to_string()),
-                })
-            }
-            other_state => {
-                // Restore the state and error
-                self.state = other_state;
-                Err("cancel() called on non-running job".into())
-            }
-        }
-    }
-
-    /// Mostly for test purposes
-    fn run_with_env(&mut self, env: Option<HashMap<String, String>>) -> Result<JobRunHeartbeatEventV1, Box<dyn Error>> {
-        match &self.state {
-            JobRunState::NotStarted => {
-                let process = Command::new(self.entry_point.clone())
-                    .args(self.args.clone())
+    fn start(
+        not_started: Self::NotStartedState,
+        env: Option<HashMap<String, String>>
+    ) -> Result<Self::RunningState, Box<dyn Error>> {
+        let process = Command::new(not_started.entry_point)
+            .args(not_started.args)
            .stdout(Stdio::piped())
            .stderr(Stdio::piped())
            .envs(env.unwrap_or_default())
            .spawn()?;

-                self.state = JobRunState::Running {
+        Ok(SubProcessRunning {
            process,
            stdout_buffer: Vec::new(),
-                };
+        })
+    }

-                // TODO should this return the event now? Or enqueue it? No sense in waiting I suppose, and orchestrator should just handle it?
-                Ok(JobRunHeartbeatEventV1 { job_run_id: self.id().to_string() })
-            }
-            _ => Err("run() called on already-running or completed job".into())
+    fn poll(
+        running: &mut Self::RunningState
+    ) -> Result<PollResult<Self::CompletedState, Self::FailedState>, Box<dyn Error>> {
+        // Non-blocking check for exit status
+        if let Some(exit_status) = running.process.try_wait()? {
+            // Read any remaining stdout
+            if let Some(stdout) = running.process.stdout.take() {
+                let reader = BufReader::new(stdout);
+                for line in reader.lines() {
+                    // TODO we should write lines to the job's file logs
+                    if let Ok(line) = line {
+                        running.stdout_buffer.push(line);
+                    }
                }
            }
-}

-impl SubProcessJobRun {
-    pub fn spawn(entry_point: String, args: Vec<String>) -> Result<Box<dyn JobRun>, Box<dyn Error>> {
-        Ok(Box::new(SubProcessJobRun {
-            job_run_id: Uuid::new_v4(),
-            entry_point,
-            args,
-            state: JobRunState::NotStarted,
+            // Take ownership of stdout_buffer
+            let stdout_buffer = std::mem::take(&mut running.stdout_buffer);
+
+            // Check exit status and return appropriate result
+            match exit_status.code() {
+                Some(0) => {
+                    // Success case
+                    Ok(PollResult::Completed(SubProcessCompleted {
+                        exit_code: 0,
+                        stdout_buffer,
                    }))
                }
+                Some(code) => {
+                    // Failed with exit code
+                    let reason = format!("Job failed with exit code {}", code);
+                    Ok(PollResult::Failed(SubProcessFailed {
+                        exit_code: code,
+                        reason,
+                        stdout_buffer,
+                    }))
+                }
+                None => {
+                    // Terminated by signal (Unix) - treat as failure
+                    let reason = format!("Job terminated by signal: {}", exit_status);
+                    Ok(PollResult::Failed(SubProcessFailed {
+                        exit_code: -1,
+                        reason,
+                        stdout_buffer,
+                    }))
+                }
+            }
+        } else {
+            // Still running
+            Ok(PollResult::StillRunning)
+        }
+    }
+
+    fn cancel_job(
+        mut running: Self::RunningState,
+        source: EventSource
+    ) -> Result<Self::CanceledState, Box<dyn Error>> {
+        // Kill the process
+        running.process.kill()?;
+
+        // Wait for it to actually terminate
+        running.process.wait()?;
+
+        // Return canceled state
+        Ok(SubProcessCanceled {
+            source,
+            stdout_buffer: running.stdout_buffer,
+        })
+    }
 }

+// Helper functions to convert between states and events
+impl SubProcessCompleted {
+    pub fn to_event(&self, job_run_id: &Uuid) -> Event {
+        Event::JobRunSuccessV1(JobRunSuccessEventV1 {
+            job_run_id: job_run_id.to_string(),
+        })
+    }
+}
+
+impl SubProcessFailed {
+    pub fn to_event(&self, job_run_id: &Uuid) -> Event {
+        Event::JobRunFailureV1(JobRunFailureEventV1 {
+            job_run_id: job_run_id.to_string(),
+            reason: self.reason.clone(),
+        })
+    }
+}
+
+impl SubProcessCanceled {
+    pub fn to_event(&self, job_run_id: &Uuid) -> JobRunCancelEventV1 {
+        JobRunCancelEventV1 {
+            job_run_id: job_run_id.to_string(),
+            source: Some(self.source.clone()),
+            comment: Some("Job was canceled".to_string()),
+        }
+    }
+}
+
+// Old JobRunPollResult structure - kept for compatibility during migration
 pub struct JobRunPollResult {
-    pub new_events: Vec<Event>,  // Parsed BEL events, not raw lines
+    pub new_events: Vec<Event>,
    pub status: JobRunStatus,
 }

 mod tests {
    use std::collections::HashMap;
    use crate::data_build_event::Event;
-    use crate::job_run::{JobRun, SubProcessJobRun};
-    use crate::{JobRunStatusCode, ManuallyTriggeredEvent};
-    use uuid::Uuid;
+    use crate::job_run::{JobRunVisitResult, NotStartedJobRun, SubProcessBackend};
+    use crate::{ManuallyTriggeredEvent};

    fn test_helper_path() -> String {
        std::env::var("TEST_SRCDIR")
@ -241,35 +324,29 @@ mod tests {
    #[test]
    fn test_job_run_success_returns_job_run_success_event() {
        // Spawn a job run that will succeed (exit code 0)
-        let mut job_run = SubProcessJobRun::spawn(test_helper_path(), vec![]).unwrap();
+        let job_run: NotStartedJobRun<SubProcessBackend> = NotStartedJobRun::spawn(test_helper_path(), vec![]);

-        // Start the job
-        job_run.run().unwrap();
+        // Start the job - this consumes the NotStarted and returns Running
+        let mut running_job = job_run.run().unwrap();

        // Poll until we get completion
        loop {
-            let result = job_run.visit().unwrap();
-
-            // Check if we got a success event
-            let has_success = result.new_events.iter().any(|event| {
-                matches!(event, Event::JobRunSuccessV1(_))
-            });
-
-            if has_success {
-                let expected = JobRunStatusCode::JobRunSucceeded as i32;
-                assert!(matches!(result.status.code, expected));
+            match running_job.visit().unwrap() {
+                JobRunVisitResult::Completed(completed) => {
+                    // Generate the event from the completed state
+                    let event = completed.state.to_event(&completed.id());
+                    assert!(matches!(event, Event::JobRunSuccessV1(_)));
                    break;
                }
-
-            // If job is still running, sleep briefly and poll again
-            let expected = JobRunStatusCode::JobRunRunning as i32;
-            if matches!(result.status.code, expected) {
+                JobRunVisitResult::Failed(failed) => {
+                    panic!("Job failed unexpectedly: {}", failed.state.reason);
+                }
+                JobRunVisitResult::StillRunning => {
+                    // Sleep briefly and poll again
                    std::thread::sleep(std::time::Duration::from_millis(10));
                    continue;
                }
-
-            // If we got here, job failed when it shouldn't have
-            panic!("Job failed unexpectedly: {:?}", result.status);
+            }
        }
    }

@ -277,41 +354,32 @@ mod tests {
    #[test]
    fn test_job_run_failure_returns_job_run_failure_event() {
        // Spawn a job run
-        let mut job_run = SubProcessJobRun::spawn(test_helper_path(), vec![]).unwrap();
+        let job_run: NotStartedJobRun<SubProcessBackend> = NotStartedJobRun::spawn(test_helper_path(), vec![]);

        // Start the job with an exit code that indicates failure (non-zero)
        let env: HashMap<String, String> = HashMap::from([
            ("DATABUILD_TEST_EXIT_CODE".to_string(), "1".to_string())
        ]);
-        job_run.run_with_env(Some(env)).unwrap();
-
+        let mut running_job = job_run.run_with_env(Some(env)).unwrap();

        // Poll until we get completion
        loop {
-            let result = job_run.visit().unwrap();
-
-            // Check if we got a success event
-            if result.new_events.iter().any(|event| {
-                matches!(event, Event::JobRunSuccessV1(_))
-            }) {
+            match running_job.visit().unwrap() {
+                JobRunVisitResult::Completed(_) => {
                    panic!("Job succeeded unexpectedly");
-            };
-
-            if result.new_events.iter().any(|event| {
-                matches!(event, Event::JobRunFailureV1(_))
-            })  {
+                }
+                JobRunVisitResult::Failed(failed) => {
+                    // Generate the event from the failed state
+                    let event = failed.state.to_event(&failed.id());
+                    assert!(matches!(event, Event::JobRunFailureV1(_)));
                    break;
                }
-
-            // If job is still running, sleep briefly and poll again
-            let expected = JobRunStatusCode::JobRunRunning as i32;
-            if matches!(result.status.code, expected) {
+                JobRunVisitResult::StillRunning => {
+                    // Sleep briefly and poll again
                    std::thread::sleep(std::time::Duration::from_millis(10));
                    continue;
                }
-
-            // If we got here, job failed when it shouldn't have
-            panic!("Job failed unexpectedly: {:?}", result.status);
+            }
        }
    }

@ -322,28 +390,32 @@ mod tests {
    fn test_job_run_cancel_returns_job_run_cancel_event() {
        use std::fs;
        use crate::ManuallyTriggeredEvent;
+        use uuid::Uuid;

        // Create a temp file path for the test
        let temp_file = format!("/tmp/databuild_test_cancel_{}", Uuid::new_v4());

        // Spawn a job run that will sleep for 1 second and write a file
-        let mut job_run = SubProcessJobRun::spawn(test_helper_path(), vec![]).unwrap();
+        let job_run: NotStartedJobRun<SubProcessBackend> = NotStartedJobRun::spawn(test_helper_path(), vec![]);

        let env: HashMap<String, String> = HashMap::from([
            ("DATABUILD_TEST_SLEEP_MS".to_string(), "1000".to_string()),
            ("DATABUILD_TEST_OUTPUT_FILE".to_string(), temp_file.clone()),
            ("DATABUILD_TEST_FILE_CONTENT".to_string(), "completed".to_string()),
        ]);
-        job_run.run_with_env(Some(env)).unwrap();
+        let running_job = job_run.run_with_env(Some(env)).unwrap();

        // Give it a tiny bit of time to start
        std::thread::sleep(std::time::Duration::from_millis(10));

-        // Cancel the job before it can complete - this returns the cancel event
-        let cancel_event = job_run.cancel(ManuallyTriggeredEvent { user: "test_user".into() }.into()).unwrap();
+        // Cancel the job before it can complete - this consumes the running job and returns canceled
+        let canceled_job = running_job.cancel(ManuallyTriggeredEvent { user: "test_user".into() }.into()).unwrap();
+
+        // Generate the cancel event from the canceled state
+        let cancel_event = canceled_job.state.to_event(&canceled_job.id());

        // Verify we got the cancel event
-        assert_eq!(cancel_event.job_run_id, job_run.id().to_string());
+        assert_eq!(cancel_event.job_run_id, canceled_job.id().to_string());
        assert!(cancel_event.source.is_some());
        assert_eq!(cancel_event.comment, Some("Job was canceled".to_string()));

--- a/databuild/orchestrator.rs
+++ b/databuild/orchestrator.rs
@ -1,7 +1,7 @@
 use crate::build_event_log::{BELStorage, BuildEventLog, MemoryBELStorage};
 use crate::job::JobConfiguration;
-use crate::job_run::JobRun;
-use crate::{JobRunStatusCode, PartitionRef, WantDetail};
+use crate::job_run::{NotStartedJobRun, RunningJobRun, CompletedJobRun, FailedJobRun, SubProcessBackend};
+use crate::{PartitionRef, WantDetail};
 use std::collections::HashMap;
 use std::error::Error;
 use std::fmt::Debug;
@ -13,7 +13,10 @@ the visitor pattern to monitor job exec progress and liveness, and adds

 struct Orchestrator<S: BELStorage + Debug> {
    bel: BuildEventLog<S>,
-    job_runs: Vec<Box<dyn JobRun>>,
+    not_started_jobs: Vec<NotStartedJobRun<SubProcessBackend>>,
+    running_jobs: Vec<RunningJobRun<SubProcessBackend>>,
+    completed_jobs: Vec<CompletedJobRun<SubProcessBackend>>,
+    failed_jobs: Vec<FailedJobRun<SubProcessBackend>>,
    config: OrchestratorConfig,
 }

@ -21,7 +24,10 @@ impl Default for Orchestrator<MemoryBELStorage> {
    fn default() -> Self {
        Self {
            bel: Default::default(),
-            job_runs: Default::default(),
+            not_started_jobs: Default::default(),
+            running_jobs: Default::default(),
+            completed_jobs: Default::default(),
+            failed_jobs: Default::default(),
            config: Default::default(),
        }
    }
@ -31,7 +37,10 @@ impl Orchestrator<MemoryBELStorage> {
    fn copy(&self) -> Self {
        Self {
            bel: self.bel.clone(),
-            job_runs: Default::default(),
+            not_started_jobs: Default::default(),
+            running_jobs: Default::default(),
+            completed_jobs: Default::default(),
+            failed_jobs: Default::default(),
            config: self.config.clone(),
        }
    }
@ -39,27 +48,18 @@ impl Orchestrator<MemoryBELStorage> {

 impl<S: BELStorage + Debug> Orchestrator<S> {
    fn with_config(self, config: OrchestratorConfig) -> Self {
-        Self {
-            bel: self.bel,
-            job_runs: self.job_runs,
-            config,
-        }
+        Self { config, ..self }
    }

    fn with_jobs(self, jobs: Vec<JobConfiguration>) -> Self {
        Self {
-            bel: self.bel,
-            job_runs: self.job_runs,
            config: self.config.with_jobs(jobs),
+            ..self
        }
    }

    fn with_bel(self, bel: BuildEventLog<S>) -> Self {
-        Self {
-            bel,
-            job_runs: self.job_runs,
-            config: self.config,
-        }
+        Self { bel, ..self }
    }
 }

@ -110,30 +110,46 @@ impl<S: BELStorage + Debug> Orchestrator<S> {
    fn new(storage: S, config: OrchestratorConfig) -> Self {
        Self {
            bel: BuildEventLog::new(storage, Default::default()),
-            job_runs: Vec::new(),
+            not_started_jobs: Vec::new(),
+            running_jobs: Vec::new(),
+            completed_jobs: Vec::new(),
+            failed_jobs: Vec::new(),
            config,
        }
    }

    /** Continuously invoked function to watch job run status */
    fn poll_job_runs(&mut self) -> Result<(), Box<dyn Error>> {
-        // Visit existing jobs, remove completed
-        self.job_runs.retain_mut(|jr| {
-            // Append emitted events
-            let result = jr
-                .visit()
-                .expect("Job visit failed");
-            result.new_events
-                .iter()
-                .for_each(|event| {
-                    self.bel
-                        .append_event(&event)
-                        .expect("Failed to append event");
-                });
+        use crate::job_run::JobRunVisitResult;

-            // Retain job run if it doesn't yet have an exit code (still running)
-            result.status.code == JobRunStatusCode::JobRunRunning as i32
-        });
+        // First, start any not-started jobs
+        while let Some(job) = self.not_started_jobs.pop() {
+            let running = job.run()?;
+            self.running_jobs.push(running);
+        }
+
+        // Visit running jobs and transition them to terminal states
+        let mut still_running = Vec::new();
+        for mut job in self.running_jobs.drain(..) {
+            match job.visit()? {
+                JobRunVisitResult::StillRunning => {
+                    still_running.push(job);
+                }
+                JobRunVisitResult::Completed(completed) => {
+                    // Emit success event
+                    let event = completed.state.to_event(&completed.id());
+                    self.bel.append_event(&event)?;
+                    self.completed_jobs.push(completed);
+                }
+                JobRunVisitResult::Failed(failed) => {
+                    // Emit failure event
+                    let event = failed.state.to_event(&failed.id());
+                    self.bel.append_event(&event)?;
+                    self.failed_jobs.push(failed);
+                }
+            }
+        }
+        self.running_jobs = still_running;

        Ok(())
    }
@ -152,18 +168,18 @@ impl<S: BELStorage + Debug> Orchestrator<S> {
            .collect();
        let grouped_wants = Orchestrator::<S>::group_wants(&self.config, &schedulable_wants);

-        if !grouped_wants.want_groups.is_empty() {
+        if !grouped_wants.unhandled_wants.is_empty() {
            // All wants must be mapped to jobs that can be handled
            // TODO we probably want to handle this gracefully in the near future
            Err(format!(
                "Unable to map following wants: {:?}",
-                &grouped_wants.want_groups
+                &grouped_wants.unhandled_wants
            )
            .into())
        } else {
            for wg in grouped_wants.want_groups {
                let job_run = wg.job.spawn(wg.wants)?;
-                self.job_runs.push(job_run);
+                self.not_started_jobs.push(job_run);
            }

            Ok(())
@ -256,11 +272,15 @@ mod tests {
        #[test]
        fn test_empty_wants_noop() {
            let mut orchestrator = build_orchestrator();
-            assert!(orchestrator.job_runs.is_empty()); // Should init with no work to do
+            // Should init with no work to do
+            assert!(orchestrator.not_started_jobs.is_empty());
+            assert!(orchestrator.running_jobs.is_empty());
            orchestrator
                .poll_wants()
                .expect("shouldn't fail to poll empty wants");
-            assert!(orchestrator.job_runs.is_empty()); // Should still be empty since no work to do
+            // Should still be empty since no work to do
+            assert!(orchestrator.not_started_jobs.is_empty());
+            assert!(orchestrator.running_jobs.is_empty());
        }

        // Use case: Some schedulable wants with jobs that can be matched should launch those jobs