use crate::build_event_log::{BELStorage, BuildEventLog, MemoryBELStorage};
use crate::data_build_event::Event;
use crate::job::JobConfiguration;
use crate::job_run::SubProcessBackend;
use crate::util::DatabuildError;
use crate::{JobRunBufferEventV1, PartitionRef, WantDetail};
use std::collections::HashMap;
use std::fmt::Debug;

/**
The Orchestrator turns wants, config, and BEL state into scheduled jobs. It uses lightweight
threads + the visitor pattern to monitor job execution progress and liveness.

JTBDs:
- Turn job run dep miss failures into derivative wants for the missed partitions
- Turn schedulable wants into job runs to build the requested partitions
- Poll queued and active job runs, keeping track of their state, and scheduling queued jobs
  when possible
*/
struct Orchestrator<S: BELStorage + Debug> {
    pub bel: BuildEventLog<S>,
    pub config: OrchestratorConfig,
    pub job_runs: Vec<crate::job_run::JobRun<SubProcessBackend>>,
}
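
// Illustrative lifecycle (a sketch using only the methods defined in this module; `storage` and
// `jobs` stand in for caller-provided values):
//
//     let mut orchestrator = Orchestrator::new(storage, OrchestratorConfig::default().with_jobs(jobs));
//     // `join` loops `step`, which runs poll_job_runs() and then poll_wants() on each iteration.
//     orchestrator.join()?;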

impl Default for Orchestrator<MemoryBELStorage> {
    fn default() -> Self {
        Self {
            bel: Default::default(),
            config: Default::default(),
            job_runs: Default::default(),
        }
    }
}

impl Orchestrator<MemoryBELStorage> {
    /// Clones the BEL and config into a fresh orchestrator; active job runs are not carried over.
    fn copy(&self) -> Self {
        Self {
            bel: self.bel.clone(),
            config: self.config.clone(),
            job_runs: Default::default(),
        }
    }
}

impl<S: BELStorage + Debug> Orchestrator<S> {
    fn with_config(self, config: OrchestratorConfig) -> Self {
        Self { config, ..self }
    }

    fn with_jobs(self, jobs: Vec<JobConfiguration>) -> Self {
        Self {
            config: self.config.with_jobs(jobs),
            ..self
        }
    }

    fn with_bel(self, bel: BuildEventLog<S>) -> Self {
        Self { bel, ..self }
    }
}

#[derive(Debug, Clone)]
struct OrchestratorConfig {
    jobs: Vec<JobConfiguration>,
}

impl Default for OrchestratorConfig {
    fn default() -> Self {
        Self {
            jobs: Vec::default(),
        }
    }
}

impl OrchestratorConfig {
    fn job_configuration_for_label(&self, label: &str) -> Option<JobConfiguration> {
        self.jobs.iter().find(|job| job.label == label).cloned()
    }

    fn match_job_partition(&self, pref: &PartitionRef) -> Option<JobConfiguration> {
        self.jobs.iter().find(|job| job.matches(pref)).cloned()
    }

    fn with_jobs(self, jobs: Vec<JobConfiguration>) -> Self {
        Self { jobs }
    }

    fn with_job(self, job: JobConfiguration) -> Self {
        Self { jobs: vec![job] }
    }
}
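
// Builder-style configuration (illustrative; `job_a` and `job_b` stand in for JobConfiguration
// values):
//
//     let config = OrchestratorConfig::default().with_jobs(vec![job_a, job_b]);
//
// Note that `with_job(job)` replaces the whole job list with just `job`, and `with_jobs(jobs)`
// replaces it with `jobs`; neither appends to an existing list.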

#[derive(Debug, Clone)]
struct WantGroup {
    job: JobConfiguration,
    wants: Vec<WantDetail>,
}

#[derive(Debug, Clone)]
struct GroupedWants {
    want_groups: Vec<WantGroup>,
    unhandled_wants: Vec<WantDetail>,
}

impl GroupedWants {
    pub fn validate(&self) -> Result<(), DatabuildError> {
        if !self.unhandled_wants.is_empty() {
            // All wants must be mapped to jobs that can handle them
            // TODO we probably want to handle this gracefully in the near future
            Err(format!("Unable to map following wants: {:?}", self.unhandled_wants).into())
        } else {
            Ok(())
        }
    }
}

impl<S: BELStorage + Debug> Orchestrator<S> {
    fn new(storage: S, config: OrchestratorConfig) -> Self {
        Self {
            bel: BuildEventLog::new(storage, Default::default()),
            config,
            job_runs: Vec::new(),
        }
    }

    fn schedule_queued_jobs(&mut self) -> Result<(), DatabuildError> {
        use crate::job_run::JobRun;

        let mut new_jobs = Vec::new();
        for job in self.job_runs.drain(..) {
            let transitioned = match job {
                JobRun::NotStarted(not_started) => JobRun::Running(not_started.run(None)?),
                other => other, // Pass through all other states unchanged
            };
            new_jobs.push(transitioned);
        }
        self.job_runs = new_jobs;
        Ok(())
    }

    /// Visits individual job runs, appending resulting events, and moving runs between run status
    /// containers. Either jobs are still running, or they are moved to terminal states.
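    ///
    /// Transition summary (mirrors the match in the body):
    /// - `Running` + `StillRunning` -> stays `Running`
    /// - `Running` + `Completed`/`Failed`/`DepMiss` -> the corresponding event is appended to the
    ///   BEL and the run is kept in its terminal variant
    /// - all other variants pass through unchanged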
    fn poll_job_runs(&mut self) -> Result<(), DatabuildError> {
        use crate::job_run::{JobRun, VisitResult};

        self.schedule_queued_jobs()?;

        // Visit all running jobs using type-safe transitions
        let mut new_jobs = Vec::new();
        for job in self.job_runs.drain(..) {
            let transitioned = match job {
                JobRun::Running(running) => match running.visit()? {
                    VisitResult::StillRunning(still_running) => {
                        println!("Still running job: {:?}", still_running.job_run_id);
                        JobRun::Running(still_running)
                    }
                    VisitResult::Completed(completed) => {
                        println!("Completed job: {:?}", completed.job_run_id);
                        let event = completed.state.to_event(&completed.job_run_id);
                        self.bel.append_event(&event)?;
                        JobRun::Completed(completed)
                    }
                    VisitResult::Failed(failed) => {
                        println!("Failed job: {:?}", failed.job_run_id);
                        let event = failed.state.to_event(&failed.job_run_id);
                        self.bel.append_event(&event)?;
                        JobRun::Failed(failed)
                    }
                    VisitResult::DepMiss(dep_miss) => {
                        println!("Dep miss job: {:?}", dep_miss.job_run_id);
                        let event = dep_miss.state.to_event(&dep_miss.job_run_id);
                        self.bel.append_event(&event)?;
                        JobRun::DepMiss(dep_miss)
                    }
                },
                other => other, // Pass through all non-running states unchanged
            };
            new_jobs.push(transitioned);
        }
        self.job_runs = new_jobs;

        Ok(())
    }

    /** Continuously invoked function to watch wants and schedule new jobs */
    fn poll_wants(&mut self) -> Result<(), DatabuildError> {
        // Collect unhandled wants, group by the job that handles each partition
        let schedulable_wants = self.bel.state.wants_schedulability().schedulable_wants();
        let grouped_wants = Orchestrator::<S>::group_wants(&self.config, &schedulable_wants);
        grouped_wants.validate()?;

        // Spawn jobs and add events
        for wg in grouped_wants.want_groups {
            self.queue_job(wg)?;
        }

        Ok(())
    }

    fn group_wants(config: &OrchestratorConfig, wants: &Vec<WantDetail>) -> GroupedWants {
        let mut want_groups: HashMap<String, Vec<WantDetail>> = Default::default();
        let mut unhandled_wants: Vec<WantDetail> = Default::default();
        wants.iter().for_each(|want| {
            want.partitions.iter().for_each(|pref| {
                let matched_job = config.match_job_partition(pref);
                match matched_job {
                    None => unhandled_wants.push(want.clone()),
                    Some(jc) => want_groups.entry(jc.label).or_default().push(want.clone()),
                }
            });
        });
        GroupedWants {
            want_groups: want_groups
                .iter()
                .map(|(k, v)| WantGroup {
                    job: config
                        .job_configuration_for_label(k)
                        .unwrap_or_else(|| panic!("Job configuration not found for label `{}`", k)),
                    wants: v.to_owned(),
                })
                .collect(),
            unhandled_wants,
        }
    }

    fn queue_job(&mut self, wg: WantGroup) -> Result<(), DatabuildError> {
        use crate::job_run::JobRun;

        // Compute args from wants the same way JobConfiguration::spawn() does
        let wanted_refs: Vec<crate::PartitionRef> = wg
            .wants
            .iter()
            .flat_map(|want| want.partitions.clone())
            .collect();
        let args: Vec<String> = wanted_refs.iter().map(|pref| pref.r#ref.clone()).collect();
        let job_run = JobRun::spawn(wg.job.entry_point.clone(), args);

        // Create job run buffer event
        let job_buffer_event = Event::JobRunBufferV1(JobRunBufferEventV1 {
            job_run_id: job_run.job_run_id().to_string(),
            job_label: wg.job.label,
            building_partitions: wg
                .wants
                .iter()
                .flat_map(|w| w.partitions.clone())
                .collect(),
            want_attributed_partitions: wg.wants.iter().map(|w| w.into()).collect(),
        });
        self.bel.append_event(&job_buffer_event)?;
        self.job_runs.push(job_run);

        Ok(())
    }

    fn step(&mut self) -> Result<(), DatabuildError> {
        self.poll_job_runs()?;
        self.poll_wants()?;
        Ok(())
    }

    // Helper methods for tests to count jobs by state
    #[cfg(test)]
    fn count_running_jobs(&self) -> usize {
        use crate::job_run::JobRun;
        self.job_runs.iter().filter(|j| matches!(j, JobRun::Running(_))).count()
    }

    #[cfg(test)]
    fn count_terminal_jobs(&self) -> usize {
        self.job_runs.iter().filter(|j| j.is_terminal()).count()
    }

    #[cfg(test)]
    fn count_not_started_jobs(&self) -> usize {
        use crate::job_run::JobRun;
        self.job_runs.iter().filter(|j| matches!(j, JobRun::NotStarted(_))).count()
    }

    #[cfg(test)]
    fn count_dep_miss_jobs(&self) -> usize {
        use crate::job_run::JobRun;
        self.job_runs.iter().filter(|j| matches!(j, JobRun::DepMiss(_))).count()
    }

    #[cfg(test)]
    fn count_completed_jobs(&self) -> usize {
        use crate::job_run::JobRun;
        self.job_runs.iter().filter(|j| matches!(j, JobRun::Completed(_))).count()
    }

    /** Entrypoint for running jobs */
    pub fn join(&mut self) -> Result<(), DatabuildError> {
        loop {
            self.step()?
        }
    }
}

#[cfg(test)]
mod tests {
    use crate::WantCreateEventV1;
    use crate::build_event_log::MemoryBELStorage;
    use crate::job::JobConfiguration;
    use crate::mock_job_run::MockJobRun;
    use crate::orchestrator::{Orchestrator, OrchestratorConfig};
    use crate::util::current_timestamp;
    use uuid::Uuid;

    fn build_orchestrator() -> Orchestrator<MemoryBELStorage> {
        Orchestrator::default()
    }

    impl WantCreateEventV1 {
        /// A naive random want for testing purposes
        pub fn sample() -> Self {
            Self {
                want_id: Uuid::new_v4().to_string(),
                partitions: vec![],
                data_timestamp: current_timestamp(),
                ttl_seconds: 1000,
                sla_seconds: 1000,
                source: None,
                comment: Some("test want".to_string()),
            }
        }
    }

    /// Scenario 1
    /// A test scenario that simulates a databuild application with 2 jobs, alpha and beta, where
    /// beta depends on a single output from alpha, and alpha has no deps.
    fn setup_scenario_a_to_b(
        mut orchestrator: Orchestrator<MemoryBELStorage>,
    ) -> Orchestrator<MemoryBELStorage> {
        // Define test jobs
        orchestrator.config = OrchestratorConfig {
            jobs: vec![
                JobConfiguration {
                    label: "alpha".to_string(),
                    patterns: vec!["data/alpha".to_string()],
                    entry_point: MockJobRun::bin_path(),
                },
                JobConfiguration {
                    label: "beta".to_string(),
                    patterns: vec!["data/beta".to_string()],
                    entry_point: MockJobRun::bin_path(),
                },
            ],
        };
        orchestrator
    }

    // The orchestrator needs to be able to actually execute job runs
    mod run_jobs {
        // Use case: the orchestrator should be able to execute a spawned-process job
        #[test]
        #[ignore] // TODO define this interface
        fn test_spawned_process_job() {
            todo!()
        }
    }

    // The orchestrator relies on polling job run status to react to job completions that imply
    // key outcomes like:
    // - Success: partitions produced, other job runs may be schedulable
    // - Dep miss: wants need to be created
    // - Failure: engineer likely needs to react
    mod poll_job_runs {
        // Use case: we find a job that has completed, BEL should be written with appropriate event
        // (both for success and fail cases)
        #[test]
        #[ignore]
        fn test_job_completion_events() {
            todo!()
        }

        // Use case: a job has written new stdout, it should produce a new heartbeat event in the BEL
        // TODO - we should come back here later and ensure we have a minimum heartbeat period
        #[test]
        #[ignore]
        fn test_heartbeat_from_stdout() {
            todo!()
        }
    }

    // The orchestrator polls wants so that it can react to new wants created by users, or to wants
    // created by itself (for dep miss job run failures)
    mod poll_wants {
        use crate::WantCreateEventV1;
        use crate::data_build_event::Event;
        use crate::orchestrator::tests::{build_orchestrator, setup_scenario_a_to_b};

        // Use case: Empty schedulable wants is a valid case, and should create no new jobs.
        #[test]
        fn test_empty_wants_noop() {
            let mut orchestrator = build_orchestrator();
            // Should init with no work to do
            assert_eq!(orchestrator.count_not_started_jobs(), 0);
            assert_eq!(orchestrator.count_running_jobs(), 0);
            orchestrator
                .poll_wants()
                .expect("shouldn't fail to poll empty wants");
            // Should still be empty since no work to do
            assert_eq!(orchestrator.count_not_started_jobs(), 0);
            assert_eq!(orchestrator.count_running_jobs(), 0);
        }

        // Use case: Some schedulable wants with jobs that can be matched should launch those jobs
        // (but in this case using a noop/mock child process)
        #[test]
        fn test_schedulable_wants_should_schedule() {
            // Given
            let mut orchestrator = setup_scenario_a_to_b(build_orchestrator());
            let events = vec![Event::WantCreateV1(WantCreateEventV1 {
                partitions: vec!["data/alpha".into()],
                ..WantCreateEventV1::sample()
            })];
            assert_eq!(orchestrator.bel.state.wants_schedulability().0.len(), 0);
            for e in events {
                orchestrator.bel.append_event(&e).expect("append");
            }
            assert_eq!(orchestrator.count_not_started_jobs(), 0);
            assert_eq!(orchestrator.bel.state.count_job_runs(), 0);

            // When
            assert_eq!(orchestrator.bel.state.wants_schedulability().0.len(), 1);
            orchestrator
                .poll_wants()
                .expect("shouldn't fail to poll wants");

            // Should schedule alpha job
            assert_eq!(orchestrator.count_not_started_jobs(), 1);
            // Verify the job has the right args by checking the first NotStarted job
            use crate::job_run::JobRun;
            let not_started_job = orchestrator
                .job_runs
                .iter()
                .find(|j| matches!(j, JobRun::NotStarted(_)))
                .unwrap();
            if let JobRun::NotStarted(job) = not_started_job {
                assert_eq!(job.state.args, vec!["data/alpha"], "should have scheduled alpha job");
            }
            assert_eq!(orchestrator.bel.state.count_job_runs(), 1);
        }

        // Use case: A schedulable want that can't be matched to any job should surface an error
        #[test]
        fn test_schedulable_want_no_matching_job() {
            // Given: an orchestrator with no configured jobs and a want it cannot match
            let mut orchestrator = build_orchestrator();
            let events = vec![Event::WantCreateV1(WantCreateEventV1 {
                partitions: vec!["data/alpha".into()],
                ..WantCreateEventV1::sample()
            })];
            for e in events {
                orchestrator.bel.append_event(&e).expect("append");
            }

            // When
            let result = orchestrator.poll_wants();

            // Then: the unmatched want is reported as an error and no jobs are scheduled
            assert!(result.is_err(), "unmatched schedulable want should surface as an error");
            assert_eq!(orchestrator.count_not_started_jobs(), 0);
        }
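
        // Batching sketch (illustrative): two independent wants for the same partition should be
        // grouped under the single matching job and result in one queued job run. Relies on
        // `WantCreateEventV1::sample()` giving each want a distinct id.
        #[test]
        fn test_multiple_wants_one_partition_batch_into_one_job() {
            // Given: scenario config plus two wants for data/alpha
            let mut orchestrator = setup_scenario_a_to_b(build_orchestrator());
            for _ in 0..2 {
                orchestrator
                    .bel
                    .append_event(&Event::WantCreateV1(WantCreateEventV1 {
                        partitions: vec!["data/alpha".into()],
                        ..WantCreateEventV1::sample()
                    }))
                    .expect("append");
            }

            // When
            orchestrator
                .poll_wants()
                .expect("shouldn't fail to poll wants");

            // Then: both wants are served by a single queued job run for the alpha job
            assert_eq!(orchestrator.count_not_started_jobs(), 1);
            assert_eq!(orchestrator.bel.state.count_job_runs(), 1);
        }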
    }

    /// Orchestrator want creation is the means of data dependency propagation, allowing the
    /// orchestrator to create wants for the partitions needed by the jobs that produce the
    /// already-wanted partitions.
    mod want_create {

        // /// Use case: The orchestrator should map a failed job into a set of wants
        // #[test]
        // fn test_job_fail_want_mapping() {
        //     // Given a
        //     let mut orchestrator = setup_scenario_a_to_b(build_orchestrator());
        //     // Add event for originating want
        //     let want_create = WantCreateEventV1::sample();
        //     let building_partitions = vec!["data/beta".into()];
        //     orchestrator
        //         .bel
        //         .append_event(&Event::WantCreateV1(WantCreateEventV1 {
        //             partitions: building_partitions.clone(),
        //             ..want_create.clone()
        //         }))
        //         .expect("event append");
        //     // Create failed job run detail
        //     let want_attributed_partitions: Vec<WantAttributedPartitions> =
        //         vec![want_create.clone().into()];
        //     let job_run_id = Uuid::new_v4();
        //     let job_run = JobRunBufferEventV1 {
        //         job_run_id: job_run_id.into(),
        //         building_partitions: building_partitions.clone(),
        //         want_attributed_partitions: want_attributed_partitions.clone(),
        //         ..JobRunBufferEventV1::default()
        //     };
        //     orchestrator
        //         .bel
        //         .append_event(&Event::JobRunBufferV1(job_run))
        //         .expect("event append");
        //
        //     // Job runs should not be empty
        //     orchestrator
        //         .bel
        //         .state
        //         .get_job_run(&job_run_id.to_string())
        //         .expect("job run should exist");
        //
        //     // Add event for job failure
        //     let dep_miss_job_run = DepMissJobRun {
        //         job_run_id,
        //         state: SubProcessDepMiss {
        //             stdout_buffer: vec![],
        //             missing_deps: vec![MissingDeps {
        //                 impacted: vec!["data/beta".into()],
        //                 missing: vec!["data/alpha".into()],
        //             }],
        //             read_deps: vec![],
        //         },
        //         _backend: PhantomData,
        //     };
        //
        //     // When calculating events from dep miss
        //     // TODO this needs to be migrated - orchestrator shouldn't contain mapping logic
        //     let dep_miss_event = dep_miss_job_run.state.to_event(&dep_miss_job_run.id());
        //     let events = orchestrator
        //         .bel
        //         .state
        //         .handle_job_run_dep_miss(&dep_miss_event)
        //         .unwrap();
        //
        //     // Should have scheduled a job for alpha
        //     assert_eq!(
        //         events
        //             .iter()
        //             .filter(|e| match e {
        //                 Event::WantCreateV1(e) => e.partitions.contains(&"data/alpha".into()),
        //                 _ => false,
        //             })
        //             .count(),
        //         1
        //     );
        //     assert!(
        //         orchestrator.not_started_jobs.is_empty(),
        //         "shouldn't have scheduled yet"
        //     );
        //
        //     // Should schedule job after we poll wants
        //     orchestrator.poll_wants().expect("poll wants");
        //     assert_eq!(
        //         orchestrator.not_started_jobs.len(),
        //         1,
        //         "should have scheduled job"
        //     );
        // }
    }

    /// Orchestrator needs to be able to achieve high level orchestration use cases.
    mod orchestration {
        use crate::data_build_event::Event;
        use crate::orchestrator::tests::{build_orchestrator, setup_scenario_a_to_b};
        use crate::{PartitionStatusCode, WantCreateEventV1};
        use std::thread;
        use std::time::Duration;

        /// Use case: should run a job to produce a partition in reaction to a want, then have the
        /// want fulfilled.
        #[test]
        fn test_want_builds_partition() {
            // Given
            let mut orchestrator = setup_scenario_a_to_b(build_orchestrator());
            // Add event for originating want
            let partition = "data/alpha";
            orchestrator
                .bel
                .append_event(&Event::WantCreateV1(WantCreateEventV1 {
                    partitions: vec![partition.into()],
                    ..WantCreateEventV1::sample()
                }))
                .expect("event append");

            // When
            // Poll wants then schedule pending jobs
            orchestrator
                .poll_wants()
                .expect("stage unscheduled jobs based on wants failed");
            assert_eq!(orchestrator.count_not_started_jobs(), 1);
            // step should start job run
            orchestrator.step().expect("should start run");
            assert_eq!(orchestrator.count_running_jobs(), 1);
            assert_eq!(orchestrator.bel.state.count_job_runs(), 1);
            thread::sleep(Duration::from_millis(1));
            // Should still be running after 1ms
            orchestrator
                .step()
                .expect("should still be running");
            assert_eq!(orchestrator.count_running_jobs(), 1);
            assert_eq!(orchestrator.bel.state.count_job_runs(), 1);
            println!("STATE: {:?}", orchestrator.bel.state);

            // Wait for it to complete
            thread::sleep(Duration::from_millis(10));
            orchestrator
                .step()
                .expect("should be able to poll existing job run");

            // Job run should have succeeded
            assert_eq!(orchestrator.count_not_started_jobs(), 0);
            assert_eq!(orchestrator.count_completed_jobs(), 1);

            // Build state should show partition as live
            assert_eq!(
                orchestrator
                    .bel
                    .state
                    .get_partition(partition)
                    .unwrap()
                    .status,
                Some(PartitionStatusCode::PartitionLive.into()),
                "partition should be live after job run completion"
            );
        }

        /// Helper to wait for running jobs to complete with timeout
        fn wait_for_jobs_to_complete<S: crate::build_event_log::BELStorage + std::fmt::Debug>(
            orchestrator: &mut crate::orchestrator::Orchestrator<S>,
            max_steps: usize,
        ) -> Result<(), String> {
            use std::thread;
            use std::time::Duration;

            for _i in 0..max_steps {
                thread::sleep(Duration::from_millis(50));
                if orchestrator.count_running_jobs() == 0 {
                    return Ok(());
                }
                orchestrator
                    .step()
                    .map_err(|e| format!("step failed: {}", e))?;
            }

            Err(format!("Jobs did not complete after {} steps", max_steps))
        }

        // Use case: a graph with multi-hop deps should achieve the multi-hop build
        // - Job B depends on part_a produced by job A
        // - Job B should be attempted, fail, and create a want for part_a
        // - Job A should be attempted, succeed, and produce part_a
        // - Job B should be attempted, succeed, and produce part_b
        #[test]
        fn test_multi_hop_want_builds_partition() {
            use crate::job::JobConfiguration;
            use crate::orchestrator::OrchestratorConfig;
            use std::fs;
            use std::os::unix::fs::PermissionsExt;

            // Clean up marker file from any previous runs
            let marker_file = "/tmp/databuild_test_alpha_complete";
            let _ = fs::remove_file(marker_file);

            // Create inline test scripts in /tmp
            let alpha_script = "/tmp/test_job_alpha.sh";
            let beta_script = "/tmp/test_job_beta.sh";

            // Alpha job: creates marker file and outputs success
            fs::write(
                alpha_script,
                r#"#!/bin/bash
touch /tmp/databuild_test_alpha_complete
echo '{"DataDepLogLine":{"Success":{"version":"1","produced_partitions":["data/alpha"]}}}'
"#,
            )
            .unwrap();

            // Beta job: checks for alpha marker, outputs dep miss if not found
            fs::write(beta_script, r#"#!/bin/bash
if [ ! -f /tmp/databuild_test_alpha_complete ]; then
echo 'DATABUILD_MISSING_DEPS_JSON:{"version":"1","missing_deps":[{"impacted":[{"ref":"data/beta"}],"missing":[{"ref":"data/alpha"}]}]}'
exit 1
fi
echo 'Beta succeeded'
"#).unwrap();

            // Make scripts executable
            fs::set_permissions(alpha_script, fs::Permissions::from_mode(0o755)).unwrap();
            fs::set_permissions(beta_script, fs::Permissions::from_mode(0o755)).unwrap();

            // Given: Set up orchestrator with alpha and beta jobs using test scripts
            let mut orchestrator = build_orchestrator();
            orchestrator.config = OrchestratorConfig {
                jobs: vec![
                    JobConfiguration {
                        label: "alpha".to_string(),
                        patterns: vec!["data/alpha".to_string()],
                        entry_point: alpha_script.to_string(),
                    },
                    JobConfiguration {
                        label: "beta".to_string(),
                        patterns: vec!["data/beta".to_string()],
                        entry_point: beta_script.to_string(),
                    },
                ],
            };

            let partition_beta = "data/beta";
            let partition_alpha = "data/alpha";

            // Create initial want for beta partition
            orchestrator
                .bel
                .append_event(&Event::WantCreateV1(WantCreateEventV1 {
                    partitions: vec![partition_beta.into()],
                    ..WantCreateEventV1::sample()
                }))
                .expect("event append");

            // When: Run orchestrator steps to let it naturally handle the multi-hop build
            // Step 1: Should schedule beta job (want -> not_started_jobs)
            orchestrator.step().expect("step 1");
            assert_eq!(
                orchestrator.count_not_started_jobs(),
                1,
                "beta job should be queued"
            );

            // Step 2: Should start beta job (not_started_jobs -> running_jobs)
            orchestrator.step().expect("step 2");
            assert_eq!(
                orchestrator.count_running_jobs(),
                1,
                "beta job should be running"
            );

            // Step 3: Beta job detects missing alpha dep and creates want
            wait_for_jobs_to_complete(&mut orchestrator, 10).expect("beta job should complete");
            // (Beta should now be in dep_miss state, and a want for alpha should be created)
            assert_eq!(
                orchestrator.count_dep_miss_jobs(),
                1,
                "beta should have dep miss"
            );

            // Step 4: Should schedule and start alpha job
            // (dep miss handler created the alpha want, which will be picked up by poll_wants)
            orchestrator.step().expect("step 4");
            assert_eq!(
                orchestrator.count_running_jobs(),
                1,
                "alpha job should be running"
            );

            // Step 5: Alpha completes successfully
            wait_for_jobs_to_complete(&mut orchestrator, 10).expect("alpha job should complete");
            assert_eq!(
                orchestrator.count_completed_jobs(),
                1,
                "alpha should complete"
            );
            assert_eq!(
                orchestrator
                    .bel
                    .state
                    .get_partition(partition_alpha)
                    .unwrap()
                    .status,
                Some(PartitionStatusCode::PartitionLive.into()),
                "alpha partition should be live"
            );

            // Step 6: Beta is rescheduled and started (want -> running_jobs)
            orchestrator.step().expect("step 6");
            assert_eq!(orchestrator.count_running_jobs(), 1, "beta should be running");

            // Step 7: Beta completes successfully
            wait_for_jobs_to_complete(&mut orchestrator, 10).expect("beta job should complete");

            // Then: Verify both partitions are live and both jobs completed
            assert_eq!(
                orchestrator.count_completed_jobs(),
                2,
                "both jobs should complete"
            );
            assert_eq!(
                orchestrator.count_dep_miss_jobs(),
                1,
                "should have one dep miss"
            );

            assert_eq!(
                orchestrator
                    .bel
                    .state
                    .get_partition(partition_alpha)
                    .unwrap()
                    .status,
                Some(PartitionStatusCode::PartitionLive.into()),
                "alpha partition should be live"
            );
            assert_eq!(
                orchestrator
                    .bel
                    .state
                    .get_partition(partition_beta)
                    .unwrap()
                    .status,
                Some(PartitionStatusCode::PartitionLive.into()),
                "beta partition should be live after multi-hop build"
            );

            // Cleanup
            let _ = fs::remove_file(marker_file);
            let _ = fs::remove_file(alpha_script);
            let _ = fs::remove_file(beta_script);
        }
    }

    // The orchestrator groups wants to enable efficient execution. Many individual wants may
    // reference the same partitions, or many different partitions may be referenced by many
    // different wants. The orchestrator needs to be able to achieve job run batching, where a
    // single job run builds multiple partitions from multiple different wants.
    mod want_grouping {
        use super::super::*;
        use crate::build_event_log::MemoryBELStorage;
        use crate::{PartitionRef, WantDetail};

        fn create_job_config(label: &str, pattern: &str) -> JobConfiguration {
            JobConfiguration {
                label: label.to_string(),
                patterns: vec![pattern.to_string()],
                entry_point: "test_entrypoint".to_string(),
            }
        }

        fn create_want_detail(want_id: &str, partition_refs: Vec<&str>) -> WantDetail {
            WantDetail {
                want_id: want_id.to_string(),
                partitions: partition_refs
                    .iter()
                    .map(|r| PartitionRef {
                        r#ref: r.to_string(),
                    })
                    .collect(),
                upstreams: vec![],
                data_timestamp: 0,
                ttl_seconds: 0,
                sla_seconds: 0,
                source: None,
                comment: None,
                status: None,
                last_updated_timestamp: 0,
            }
        }

        #[test]
        fn test_group_wants_empty_config_empty_wants() {
            let config = OrchestratorConfig { jobs: vec![] };
            let wants = vec![];

            let result = Orchestrator::<MemoryBELStorage>::group_wants(&config, &wants);

            assert!(result.want_groups.is_empty());
            assert!(result.unhandled_wants.is_empty());
        }
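
        // Illustrative sketch of GroupedWants::validate: a grouping with no unhandled wants
        // passes, while any unhandled want surfaces as an error (the same failure path that
        // poll_wants propagates).
        #[test]
        fn test_grouped_wants_validate_unhandled_is_error() {
            let ok = GroupedWants {
                want_groups: vec![],
                unhandled_wants: vec![],
            };
            assert!(ok.validate().is_ok());

            let unhandled = GroupedWants {
                want_groups: vec![],
                unhandled_wants: vec![create_want_detail("want1", vec!["partition1"])],
            };
            assert!(unhandled.validate().is_err());
        }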

        #[test]
        fn test_group_wants_one_want_matches_job() {
            let job_config = create_job_config("test_job", "partition.*");
            let config = OrchestratorConfig {
                jobs: vec![job_config.clone()],
            };
            let want = create_want_detail("want1", vec!["partition1"]);
            let wants = vec![want.clone()];

            let result = Orchestrator::<MemoryBELStorage>::group_wants(&config, &wants);

            assert!(result.unhandled_wants.is_empty());
            assert_eq!(result.want_groups.len(), 1);
            assert_eq!(result.want_groups[0].job.label, "test_job");
            assert_eq!(result.want_groups[0].wants.len(), 1);
            assert_eq!(result.want_groups[0].wants[0].want_id, "want1");
        }

        #[test]
        fn test_group_wants_one_unmatching_want() {
            let job_config = create_job_config("test_job", "^test_pattern$");
            let config = OrchestratorConfig {
                jobs: vec![job_config],
            };
            let want = create_want_detail("want1", vec!["different_partition"]);
            let wants = vec![want.clone()];

            let result = Orchestrator::<MemoryBELStorage>::group_wants(&config, &wants);

            assert_eq!(result.unhandled_wants.len(), 1);
            assert_eq!(result.unhandled_wants[0].want_id, "want1");
            assert!(result.want_groups.is_empty());
        }

        #[test]
        fn test_group_wants_multiple_wants_different_jobs() {
            let job_config1 = create_job_config("job1", "pattern1.*");
            let job_config2 = create_job_config("job2", "pattern2.*");
            let config = OrchestratorConfig {
                jobs: vec![job_config1, job_config2],
            };

            let want1 = create_want_detail("want1", vec!["pattern1_partition"]);
            let want2 = create_want_detail("want2", vec!["pattern1_other"]);
            let want3 = create_want_detail("want3", vec!["pattern2_partition"]);
            let wants = vec![want1, want2, want3];

            let result = Orchestrator::<MemoryBELStorage>::group_wants(&config, &wants);

            assert!(result.unhandled_wants.is_empty());
            assert_eq!(result.want_groups.len(), 2);

            // Find job1 group
            let job1_group = result
                .want_groups
                .iter()
                .find(|wg| wg.job.label == "job1")
                .unwrap();
            assert_eq!(job1_group.wants.len(), 2);

            // Find job2 group
            let job2_group = result
                .want_groups
                .iter()
                .find(|wg| wg.job.label == "job2")
                .unwrap();
            assert_eq!(job2_group.wants.len(), 1);
        }
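
        // Sketch of the current behavior for a want spanning two jobs' patterns: group_wants
        // groups per matched partition, so the same want lands in both jobs' groups and each
        // group is later queued as its own job run.
        #[test]
        fn test_group_wants_one_want_spanning_two_jobs() {
            let config = OrchestratorConfig {
                jobs: vec![
                    create_job_config("job1", "pattern1.*"),
                    create_job_config("job2", "pattern2.*"),
                ],
            };
            let want = create_want_detail("want1", vec!["pattern1_partition", "pattern2_partition"]);
            let wants = vec![want];

            let result = Orchestrator::<MemoryBELStorage>::group_wants(&config, &wants);

            assert!(result.unhandled_wants.is_empty());
            assert_eq!(result.want_groups.len(), 2);
            for group in &result.want_groups {
                assert_eq!(group.wants.len(), 1);
                assert_eq!(group.wants[0].want_id, "want1");
            }
        }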
    }
}