refactor top level job run interface to represent job states as states
This commit is contained in:
parent
6572d4e3bd
commit
eeb90d0386
3 changed files with 368 additions and 278 deletions
|
|
@ -1,7 +1,6 @@
|
|||
use crate::job_run::{spawn_job_run, JobRun, JobRunConfig};
|
||||
use crate::job_run::{NotStartedJobRun, SubProcessBackend};
|
||||
use crate::{PartitionRef, WantDetail};
|
||||
use regex::Regex;
|
||||
use std::error::Error;
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct JobConfiguration {
|
||||
|
|
@ -12,12 +11,11 @@ pub struct JobConfiguration {
|
|||
|
||||
impl JobConfiguration {
|
||||
/** Launch job to build the partitions specified by the provided wants. */
|
||||
pub fn spawn(&self, wants: Vec<WantDetail>) -> Result<Box<dyn JobRun>, Box<dyn Error>> {
|
||||
pub fn spawn(&self, wants: Vec<WantDetail>) -> Result<NotStartedJobRun<SubProcessBackend>, std::io::Error> {
|
||||
let wanted_refs: Vec<PartitionRef> =
|
||||
wants.iter().flat_map(|want| want.partitions.clone()).collect();
|
||||
let args: Vec<String> = wanted_refs.iter().map(|pref| pref.r#ref.clone()).collect();
|
||||
let config = JobRunConfig::SubProcess { entry_point: self.entry_point.clone(), args };
|
||||
spawn_job_run(config)
|
||||
Ok(NotStartedJobRun::spawn(self.entry_point.clone(), args))
|
||||
}
|
||||
|
||||
pub fn matches(&self, refs: &PartitionRef) -> bool {
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
use std::collections::HashMap;
|
||||
use std::marker::PhantomData;
|
||||
use crate::data_build_event::Event;
|
||||
use crate::{JobRunHeartbeatEventV1, JobRunStatus, JobRunStatusCode, JobRunSuccessEventV1, JobRunFailureEventV1, JobRunCancelEventV1, EventSource};
|
||||
use std::error::Error;
|
||||
|
|
@ -9,227 +10,309 @@ use uuid::Uuid;
|
|||
// TODO log to /var/log/databuild/jobruns/$JOB_RUN_ID/, and rotate over max size (e.g. only ever use 1GB for logs)
|
||||
// Leave door open to background log processor that tails job logs, but don't include in jobrun concept
|
||||
|
||||
pub trait JobRun {
|
||||
fn id(&self) -> Uuid;
|
||||
/**
|
||||
Visit is responsible for observing the state of the job run,
|
||||
*/
|
||||
fn visit(&mut self) -> Result<JobRunPollResult, Box<dyn Error>>;
|
||||
fn cancel(&mut self, source: EventSource) -> Result<JobRunCancelEventV1, Box<dyn Error>>;
|
||||
fn run_with_env(&mut self, env: Option<HashMap<String, String>>) -> Result<JobRunHeartbeatEventV1, Box<dyn Error>>;
|
||||
fn run(&mut self) -> Result<JobRunHeartbeatEventV1, Box<dyn Error>> {
|
||||
/// Backend trait that defines the state types and transition logic for different job run implementations
|
||||
pub trait JobRunBackend: Sized {
|
||||
type NotStartedState;
|
||||
type RunningState;
|
||||
type CompletedState;
|
||||
type FailedState;
|
||||
type CanceledState;
|
||||
|
||||
/// Create a new not-started job run
|
||||
fn create(entry_point: String, args: Vec<String>) -> Self::NotStartedState;
|
||||
|
||||
/// Transition from NotStarted to Running
|
||||
fn start(
|
||||
not_started: Self::NotStartedState,
|
||||
env: Option<HashMap<String, String>>
|
||||
) -> Result<Self::RunningState, Box<dyn Error>>;
|
||||
|
||||
/// Poll a running job for state changes
|
||||
fn poll(
|
||||
running: &mut Self::RunningState
|
||||
) -> Result<PollResult<Self::CompletedState, Self::FailedState>, Box<dyn Error>>;
|
||||
|
||||
/// Cancel a running job
|
||||
fn cancel_job(
|
||||
running: Self::RunningState,
|
||||
source: EventSource
|
||||
) -> Result<Self::CanceledState, Box<dyn Error>>;
|
||||
}
|
||||
|
||||
/// Result of polling a running job
|
||||
pub enum PollResult<C, F> {
|
||||
StillRunning,
|
||||
Completed(C),
|
||||
Failed(F),
|
||||
}
|
||||
|
||||
/// Generic JobRun that works with any backend, parameterized by state
|
||||
pub struct JobRun<B: JobRunBackend, S> {
|
||||
pub job_run_id: Uuid,
|
||||
pub state: S,
|
||||
_backend: PhantomData<B>,
|
||||
}
|
||||
|
||||
/// Type aliases for specific states
|
||||
pub type NotStartedJobRun<B> = JobRun<B, <B as JobRunBackend>::NotStartedState>;
|
||||
pub type RunningJobRun<B> = JobRun<B, <B as JobRunBackend>::RunningState>;
|
||||
pub type CompletedJobRun<B> = JobRun<B, <B as JobRunBackend>::CompletedState>;
|
||||
pub type FailedJobRun<B> = JobRun<B, <B as JobRunBackend>::FailedState>;
|
||||
pub type CanceledJobRun<B> = JobRun<B, <B as JobRunBackend>::CanceledState>;
|
||||
|
||||
// Methods available on all JobRun states
|
||||
impl<B: JobRunBackend, S> JobRun<B, S> {
|
||||
pub fn id(&self) -> Uuid {
|
||||
self.job_run_id
|
||||
}
|
||||
}
|
||||
|
||||
// Methods available only on NotStarted state
|
||||
impl<B: JobRunBackend> NotStartedJobRun<B> {
|
||||
pub fn spawn(entry_point: String, args: Vec<String>) -> Self {
|
||||
JobRun {
|
||||
job_run_id: Uuid::new_v4(),
|
||||
state: B::create(entry_point, args),
|
||||
_backend: PhantomData,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn run(self) -> Result<RunningJobRun<B>, Box<dyn Error>> {
|
||||
self.run_with_env(None)
|
||||
}
|
||||
|
||||
pub fn run_with_env(
|
||||
self,
|
||||
env: Option<HashMap<String, String>>
|
||||
) -> Result<RunningJobRun<B>, Box<dyn Error>> {
|
||||
let running_state = B::start(self.state, env)?;
|
||||
Ok(JobRun {
|
||||
job_run_id: self.job_run_id,
|
||||
state: running_state,
|
||||
_backend: PhantomData,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// Methods available only on Running state
|
||||
impl<B: JobRunBackend> RunningJobRun<B> {
|
||||
pub fn visit(&mut self) -> Result<JobRunVisitResult<B>, Box<dyn Error>> {
|
||||
match B::poll(&mut self.state)? {
|
||||
PollResult::StillRunning => Ok(JobRunVisitResult::StillRunning),
|
||||
PollResult::Completed(completed_state) => {
|
||||
let job_run_id = self.job_run_id;
|
||||
Ok(JobRunVisitResult::Completed(JobRun {
|
||||
job_run_id,
|
||||
state: completed_state,
|
||||
_backend: PhantomData,
|
||||
}))
|
||||
}
|
||||
PollResult::Failed(failed_state) => {
|
||||
let job_run_id = self.job_run_id;
|
||||
Ok(JobRunVisitResult::Failed(JobRun {
|
||||
job_run_id,
|
||||
state: failed_state,
|
||||
_backend: PhantomData,
|
||||
}))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn cancel(self, source: EventSource) -> Result<CanceledJobRun<B>, Box<dyn Error>> {
|
||||
let canceled_state = B::cancel_job(self.state, source)?;
|
||||
Ok(JobRun {
|
||||
job_run_id: self.job_run_id,
|
||||
state: canceled_state,
|
||||
_backend: PhantomData,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Result of visiting a running job
|
||||
pub enum JobRunVisitResult<B: JobRunBackend> {
|
||||
StillRunning,
|
||||
Completed(CompletedJobRun<B>),
|
||||
Failed(FailedJobRun<B>),
|
||||
}
|
||||
|
||||
pub enum JobRunConfig {
|
||||
SubProcess { entry_point: String, args: Vec<String> },
|
||||
}
|
||||
|
||||
pub fn spawn_job_run(config: JobRunConfig) -> Result<Box<dyn JobRun>, Box<dyn Error>> {
|
||||
match config {
|
||||
JobRunConfig::SubProcess { entry_point, args } => Ok(SubProcessJobRun::spawn(entry_point, args)?),
|
||||
_ => Err("No impl for this job config type".into()),
|
||||
}
|
||||
}
|
||||
// ===== SubProcess Backend Implementation =====
|
||||
|
||||
pub struct SubProcessJobRun {
|
||||
pub job_run_id: Uuid,
|
||||
/// SubProcess backend for running jobs as local subprocesses
|
||||
pub struct SubProcessBackend;
|
||||
|
||||
/// NotStarted state for SubProcess backend
|
||||
pub struct SubProcessNotStarted {
|
||||
pub entry_point: String,
|
||||
pub args: Vec<String>,
|
||||
pub state: JobRunState,
|
||||
}
|
||||
|
||||
enum JobRunState {
|
||||
NotStarted,
|
||||
Running {
|
||||
process: Child,
|
||||
stdout_buffer: Vec<String>,
|
||||
},
|
||||
Completed {
|
||||
exit_code: i32,
|
||||
stdout_buffer: Vec<String>,
|
||||
},
|
||||
Failed {
|
||||
exit_code: i32,
|
||||
reason: String,
|
||||
stdout_buffer: Vec<String>,
|
||||
},
|
||||
Canceled {
|
||||
source: EventSource,
|
||||
stdout_buffer: Vec<String>,
|
||||
},
|
||||
/// Running state for SubProcess backend
|
||||
pub struct SubProcessRunning {
|
||||
pub process: Child,
|
||||
pub stdout_buffer: Vec<String>,
|
||||
}
|
||||
|
||||
impl JobRun for SubProcessJobRun {
|
||||
fn id(&self) -> Uuid {
|
||||
self.job_run_id
|
||||
/// Completed state for SubProcess backend
|
||||
pub struct SubProcessCompleted {
|
||||
pub exit_code: i32,
|
||||
pub stdout_buffer: Vec<String>,
|
||||
}
|
||||
|
||||
/// Failed state for SubProcess backend
|
||||
pub struct SubProcessFailed {
|
||||
pub exit_code: i32,
|
||||
pub reason: String,
|
||||
pub stdout_buffer: Vec<String>,
|
||||
}
|
||||
|
||||
/// Canceled state for SubProcess backend
|
||||
pub struct SubProcessCanceled {
|
||||
pub source: EventSource,
|
||||
pub stdout_buffer: Vec<String>,
|
||||
}
|
||||
|
||||
impl JobRunBackend for SubProcessBackend {
|
||||
type NotStartedState = SubProcessNotStarted;
|
||||
type RunningState = SubProcessRunning;
|
||||
type CompletedState = SubProcessCompleted;
|
||||
type FailedState = SubProcessFailed;
|
||||
type CanceledState = SubProcessCanceled;
|
||||
|
||||
fn create(entry_point: String, args: Vec<String>) -> Self::NotStartedState {
|
||||
SubProcessNotStarted { entry_point, args }
|
||||
}
|
||||
|
||||
fn visit(&mut self) -> Result<JobRunPollResult, Box<dyn Error>> {
|
||||
let mut new_events = Vec::new();
|
||||
|
||||
match &mut self.state {
|
||||
JobRunState::Running { process, stdout_buffer } => {
|
||||
// Non-blocking check for exit status
|
||||
if let Some(exit_status) = process.try_wait()? {
|
||||
// Read any remaining stdout
|
||||
if let Some(stdout) = process.stdout.take() {
|
||||
let reader = BufReader::new(stdout);
|
||||
for line in reader.lines() {
|
||||
// TODO we should write lines to the job's file logs
|
||||
if let Ok(line) = line {
|
||||
stdout_buffer.push(line);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Take ownership of the current state to transition
|
||||
let old_state = std::mem::replace(&mut self.state, JobRunState::NotStarted);
|
||||
let stdout_buf = if let JobRunState::Running { stdout_buffer, .. } = old_state {
|
||||
stdout_buffer
|
||||
} else {
|
||||
Vec::new()
|
||||
};
|
||||
|
||||
// Check exit status and transition to terminal state
|
||||
match exit_status.code() {
|
||||
Some(0) => {
|
||||
// Success case
|
||||
self.state = JobRunState::Completed {
|
||||
exit_code: 0,
|
||||
stdout_buffer: stdout_buf,
|
||||
};
|
||||
new_events.push(Event::JobRunSuccessV1(JobRunSuccessEventV1 {
|
||||
job_run_id: self.job_run_id.to_string(),
|
||||
}));
|
||||
return Ok(JobRunPollResult {
|
||||
new_events,
|
||||
status: JobRunStatusCode::JobRunSucceeded.into(),
|
||||
});
|
||||
}
|
||||
Some(code) => {
|
||||
// Failed with exit code
|
||||
let reason = format!("Job failed with exit code {}", code);
|
||||
self.state = JobRunState::Failed {
|
||||
exit_code: code,
|
||||
reason: reason.clone(),
|
||||
stdout_buffer: stdout_buf,
|
||||
};
|
||||
new_events.push(Event::JobRunFailureV1(JobRunFailureEventV1 {
|
||||
job_run_id: self.job_run_id.to_string(),
|
||||
reason,
|
||||
}));
|
||||
return Ok(JobRunPollResult {
|
||||
new_events,
|
||||
status: JobRunStatusCode::JobRunFailed.into(),
|
||||
});
|
||||
}
|
||||
None => {
|
||||
// Terminated by signal (Unix) - treat as failure
|
||||
let reason = format!("Job terminated by signal: {}", exit_status);
|
||||
self.state = JobRunState::Failed {
|
||||
exit_code: -1,
|
||||
reason: reason.clone(),
|
||||
stdout_buffer: stdout_buf,
|
||||
};
|
||||
new_events.push(Event::JobRunFailureV1(JobRunFailureEventV1 {
|
||||
job_run_id: self.job_run_id.to_string(),
|
||||
reason,
|
||||
}));
|
||||
return Ok(JobRunPollResult {
|
||||
new_events,
|
||||
status: JobRunStatusCode::JobRunFailed.into(),
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Still running
|
||||
Ok(JobRunPollResult {
|
||||
new_events,
|
||||
status: JobRunStatusCode::JobRunRunning.into(),
|
||||
})
|
||||
}
|
||||
_ => Err("visit() called on non-running job".into()),
|
||||
}
|
||||
}
|
||||
|
||||
fn cancel(&mut self, source: EventSource) -> Result<JobRunCancelEventV1, Box<dyn Error>> {
|
||||
match std::mem::replace(&mut self.state, JobRunState::NotStarted) {
|
||||
JobRunState::Running { mut process, stdout_buffer } => {
|
||||
// Kill the process
|
||||
process.kill()?;
|
||||
|
||||
// Wait for it to actually terminate
|
||||
process.wait()?;
|
||||
|
||||
// Transition to Canceled state
|
||||
self.state = JobRunState::Canceled {
|
||||
source: source.clone(),
|
||||
stdout_buffer,
|
||||
};
|
||||
|
||||
Ok(JobRunCancelEventV1 {
|
||||
job_run_id: self.job_run_id.to_string(),
|
||||
source: Some(source),
|
||||
comment: Some("Job was canceled".to_string()),
|
||||
})
|
||||
}
|
||||
other_state => {
|
||||
// Restore the state and error
|
||||
self.state = other_state;
|
||||
Err("cancel() called on non-running job".into())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Mostly for test purposes
|
||||
fn run_with_env(&mut self, env: Option<HashMap<String, String>>) -> Result<JobRunHeartbeatEventV1, Box<dyn Error>> {
|
||||
match &self.state {
|
||||
JobRunState::NotStarted => {
|
||||
let process = Command::new(self.entry_point.clone())
|
||||
.args(self.args.clone())
|
||||
fn start(
|
||||
not_started: Self::NotStartedState,
|
||||
env: Option<HashMap<String, String>>
|
||||
) -> Result<Self::RunningState, Box<dyn Error>> {
|
||||
let process = Command::new(not_started.entry_point)
|
||||
.args(not_started.args)
|
||||
.stdout(Stdio::piped())
|
||||
.stderr(Stdio::piped())
|
||||
.envs(env.unwrap_or_default())
|
||||
.spawn()?;
|
||||
|
||||
self.state = JobRunState::Running {
|
||||
Ok(SubProcessRunning {
|
||||
process,
|
||||
stdout_buffer: Vec::new(),
|
||||
};
|
||||
})
|
||||
}
|
||||
|
||||
// TODO should this return the event now? Or enqueue it? No sense in waiting I suppose, and orchestrator should just handle it?
|
||||
Ok(JobRunHeartbeatEventV1 { job_run_id: self.id().to_string() })
|
||||
}
|
||||
_ => Err("run() called on already-running or completed job".into())
|
||||
fn poll(
|
||||
running: &mut Self::RunningState
|
||||
) -> Result<PollResult<Self::CompletedState, Self::FailedState>, Box<dyn Error>> {
|
||||
// Non-blocking check for exit status
|
||||
if let Some(exit_status) = running.process.try_wait()? {
|
||||
// Read any remaining stdout
|
||||
if let Some(stdout) = running.process.stdout.take() {
|
||||
let reader = BufReader::new(stdout);
|
||||
for line in reader.lines() {
|
||||
// TODO we should write lines to the job's file logs
|
||||
if let Ok(line) = line {
|
||||
running.stdout_buffer.push(line);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl SubProcessJobRun {
|
||||
pub fn spawn(entry_point: String, args: Vec<String>) -> Result<Box<dyn JobRun>, Box<dyn Error>> {
|
||||
Ok(Box::new(SubProcessJobRun {
|
||||
job_run_id: Uuid::new_v4(),
|
||||
entry_point,
|
||||
args,
|
||||
state: JobRunState::NotStarted,
|
||||
// Take ownership of stdout_buffer
|
||||
let stdout_buffer = std::mem::take(&mut running.stdout_buffer);
|
||||
|
||||
// Check exit status and return appropriate result
|
||||
match exit_status.code() {
|
||||
Some(0) => {
|
||||
// Success case
|
||||
Ok(PollResult::Completed(SubProcessCompleted {
|
||||
exit_code: 0,
|
||||
stdout_buffer,
|
||||
}))
|
||||
}
|
||||
Some(code) => {
|
||||
// Failed with exit code
|
||||
let reason = format!("Job failed with exit code {}", code);
|
||||
Ok(PollResult::Failed(SubProcessFailed {
|
||||
exit_code: code,
|
||||
reason,
|
||||
stdout_buffer,
|
||||
}))
|
||||
}
|
||||
None => {
|
||||
// Terminated by signal (Unix) - treat as failure
|
||||
let reason = format!("Job terminated by signal: {}", exit_status);
|
||||
Ok(PollResult::Failed(SubProcessFailed {
|
||||
exit_code: -1,
|
||||
reason,
|
||||
stdout_buffer,
|
||||
}))
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Still running
|
||||
Ok(PollResult::StillRunning)
|
||||
}
|
||||
}
|
||||
|
||||
fn cancel_job(
|
||||
mut running: Self::RunningState,
|
||||
source: EventSource
|
||||
) -> Result<Self::CanceledState, Box<dyn Error>> {
|
||||
// Kill the process
|
||||
running.process.kill()?;
|
||||
|
||||
// Wait for it to actually terminate
|
||||
running.process.wait()?;
|
||||
|
||||
// Return canceled state
|
||||
Ok(SubProcessCanceled {
|
||||
source,
|
||||
stdout_buffer: running.stdout_buffer,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// Helper functions to convert between states and events
|
||||
impl SubProcessCompleted {
|
||||
pub fn to_event(&self, job_run_id: &Uuid) -> Event {
|
||||
Event::JobRunSuccessV1(JobRunSuccessEventV1 {
|
||||
job_run_id: job_run_id.to_string(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl SubProcessFailed {
|
||||
pub fn to_event(&self, job_run_id: &Uuid) -> Event {
|
||||
Event::JobRunFailureV1(JobRunFailureEventV1 {
|
||||
job_run_id: job_run_id.to_string(),
|
||||
reason: self.reason.clone(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl SubProcessCanceled {
|
||||
pub fn to_event(&self, job_run_id: &Uuid) -> JobRunCancelEventV1 {
|
||||
JobRunCancelEventV1 {
|
||||
job_run_id: job_run_id.to_string(),
|
||||
source: Some(self.source.clone()),
|
||||
comment: Some("Job was canceled".to_string()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Old JobRunPollResult structure - kept for compatibility during migration
|
||||
pub struct JobRunPollResult {
|
||||
pub new_events: Vec<Event>, // Parsed BEL events, not raw lines
|
||||
pub new_events: Vec<Event>,
|
||||
pub status: JobRunStatus,
|
||||
}
|
||||
|
||||
mod tests {
|
||||
use std::collections::HashMap;
|
||||
use crate::data_build_event::Event;
|
||||
use crate::job_run::{JobRun, SubProcessJobRun};
|
||||
use crate::{JobRunStatusCode, ManuallyTriggeredEvent};
|
||||
use uuid::Uuid;
|
||||
use crate::job_run::{JobRunVisitResult, NotStartedJobRun, SubProcessBackend};
|
||||
use crate::{ManuallyTriggeredEvent};
|
||||
|
||||
fn test_helper_path() -> String {
|
||||
std::env::var("TEST_SRCDIR")
|
||||
|
|
@ -241,35 +324,29 @@ mod tests {
|
|||
#[test]
|
||||
fn test_job_run_success_returns_job_run_success_event() {
|
||||
// Spawn a job run that will succeed (exit code 0)
|
||||
let mut job_run = SubProcessJobRun::spawn(test_helper_path(), vec![]).unwrap();
|
||||
let job_run: NotStartedJobRun<SubProcessBackend> = NotStartedJobRun::spawn(test_helper_path(), vec![]);
|
||||
|
||||
// Start the job
|
||||
job_run.run().unwrap();
|
||||
// Start the job - this consumes the NotStarted and returns Running
|
||||
let mut running_job = job_run.run().unwrap();
|
||||
|
||||
// Poll until we get completion
|
||||
loop {
|
||||
let result = job_run.visit().unwrap();
|
||||
|
||||
// Check if we got a success event
|
||||
let has_success = result.new_events.iter().any(|event| {
|
||||
matches!(event, Event::JobRunSuccessV1(_))
|
||||
});
|
||||
|
||||
if has_success {
|
||||
let expected = JobRunStatusCode::JobRunSucceeded as i32;
|
||||
assert!(matches!(result.status.code, expected));
|
||||
match running_job.visit().unwrap() {
|
||||
JobRunVisitResult::Completed(completed) => {
|
||||
// Generate the event from the completed state
|
||||
let event = completed.state.to_event(&completed.id());
|
||||
assert!(matches!(event, Event::JobRunSuccessV1(_)));
|
||||
break;
|
||||
}
|
||||
|
||||
// If job is still running, sleep briefly and poll again
|
||||
let expected = JobRunStatusCode::JobRunRunning as i32;
|
||||
if matches!(result.status.code, expected) {
|
||||
JobRunVisitResult::Failed(failed) => {
|
||||
panic!("Job failed unexpectedly: {}", failed.state.reason);
|
||||
}
|
||||
JobRunVisitResult::StillRunning => {
|
||||
// Sleep briefly and poll again
|
||||
std::thread::sleep(std::time::Duration::from_millis(10));
|
||||
continue;
|
||||
}
|
||||
|
||||
// If we got here, job failed when it shouldn't have
|
||||
panic!("Job failed unexpectedly: {:?}", result.status);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -277,41 +354,32 @@ mod tests {
|
|||
#[test]
|
||||
fn test_job_run_failure_returns_job_run_failure_event() {
|
||||
// Spawn a job run
|
||||
let mut job_run = SubProcessJobRun::spawn(test_helper_path(), vec![]).unwrap();
|
||||
let job_run: NotStartedJobRun<SubProcessBackend> = NotStartedJobRun::spawn(test_helper_path(), vec![]);
|
||||
|
||||
// Start the job with an exit code that indicates failure (non-zero)
|
||||
let env: HashMap<String, String> = HashMap::from([
|
||||
("DATABUILD_TEST_EXIT_CODE".to_string(), "1".to_string())
|
||||
]);
|
||||
job_run.run_with_env(Some(env)).unwrap();
|
||||
|
||||
let mut running_job = job_run.run_with_env(Some(env)).unwrap();
|
||||
|
||||
// Poll until we get completion
|
||||
loop {
|
||||
let result = job_run.visit().unwrap();
|
||||
|
||||
// Check if we got a success event
|
||||
if result.new_events.iter().any(|event| {
|
||||
matches!(event, Event::JobRunSuccessV1(_))
|
||||
}) {
|
||||
match running_job.visit().unwrap() {
|
||||
JobRunVisitResult::Completed(_) => {
|
||||
panic!("Job succeeded unexpectedly");
|
||||
};
|
||||
|
||||
if result.new_events.iter().any(|event| {
|
||||
matches!(event, Event::JobRunFailureV1(_))
|
||||
}) {
|
||||
}
|
||||
JobRunVisitResult::Failed(failed) => {
|
||||
// Generate the event from the failed state
|
||||
let event = failed.state.to_event(&failed.id());
|
||||
assert!(matches!(event, Event::JobRunFailureV1(_)));
|
||||
break;
|
||||
}
|
||||
|
||||
// If job is still running, sleep briefly and poll again
|
||||
let expected = JobRunStatusCode::JobRunRunning as i32;
|
||||
if matches!(result.status.code, expected) {
|
||||
JobRunVisitResult::StillRunning => {
|
||||
// Sleep briefly and poll again
|
||||
std::thread::sleep(std::time::Duration::from_millis(10));
|
||||
continue;
|
||||
}
|
||||
|
||||
// If we got here, job failed when it shouldn't have
|
||||
panic!("Job failed unexpectedly: {:?}", result.status);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -322,28 +390,32 @@ mod tests {
|
|||
fn test_job_run_cancel_returns_job_run_cancel_event() {
|
||||
use std::fs;
|
||||
use crate::ManuallyTriggeredEvent;
|
||||
use uuid::Uuid;
|
||||
|
||||
// Create a temp file path for the test
|
||||
let temp_file = format!("/tmp/databuild_test_cancel_{}", Uuid::new_v4());
|
||||
|
||||
// Spawn a job run that will sleep for 1 second and write a file
|
||||
let mut job_run = SubProcessJobRun::spawn(test_helper_path(), vec![]).unwrap();
|
||||
let job_run: NotStartedJobRun<SubProcessBackend> = NotStartedJobRun::spawn(test_helper_path(), vec![]);
|
||||
|
||||
let env: HashMap<String, String> = HashMap::from([
|
||||
("DATABUILD_TEST_SLEEP_MS".to_string(), "1000".to_string()),
|
||||
("DATABUILD_TEST_OUTPUT_FILE".to_string(), temp_file.clone()),
|
||||
("DATABUILD_TEST_FILE_CONTENT".to_string(), "completed".to_string()),
|
||||
]);
|
||||
job_run.run_with_env(Some(env)).unwrap();
|
||||
let running_job = job_run.run_with_env(Some(env)).unwrap();
|
||||
|
||||
// Give it a tiny bit of time to start
|
||||
std::thread::sleep(std::time::Duration::from_millis(10));
|
||||
|
||||
// Cancel the job before it can complete - this returns the cancel event
|
||||
let cancel_event = job_run.cancel(ManuallyTriggeredEvent { user: "test_user".into() }.into()).unwrap();
|
||||
// Cancel the job before it can complete - this consumes the running job and returns canceled
|
||||
let canceled_job = running_job.cancel(ManuallyTriggeredEvent { user: "test_user".into() }.into()).unwrap();
|
||||
|
||||
// Generate the cancel event from the canceled state
|
||||
let cancel_event = canceled_job.state.to_event(&canceled_job.id());
|
||||
|
||||
// Verify we got the cancel event
|
||||
assert_eq!(cancel_event.job_run_id, job_run.id().to_string());
|
||||
assert_eq!(cancel_event.job_run_id, canceled_job.id().to_string());
|
||||
assert!(cancel_event.source.is_some());
|
||||
assert_eq!(cancel_event.comment, Some("Job was canceled".to_string()));
|
||||
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
use crate::build_event_log::{BELStorage, BuildEventLog, MemoryBELStorage};
|
||||
use crate::job::JobConfiguration;
|
||||
use crate::job_run::JobRun;
|
||||
use crate::{JobRunStatusCode, PartitionRef, WantDetail};
|
||||
use crate::job_run::{NotStartedJobRun, RunningJobRun, CompletedJobRun, FailedJobRun, SubProcessBackend};
|
||||
use crate::{PartitionRef, WantDetail};
|
||||
use std::collections::HashMap;
|
||||
use std::error::Error;
|
||||
use std::fmt::Debug;
|
||||
|
|
@ -13,7 +13,10 @@ the visitor pattern to monitor job exec progress and liveness, and adds
|
|||
|
||||
struct Orchestrator<S: BELStorage + Debug> {
|
||||
bel: BuildEventLog<S>,
|
||||
job_runs: Vec<Box<dyn JobRun>>,
|
||||
not_started_jobs: Vec<NotStartedJobRun<SubProcessBackend>>,
|
||||
running_jobs: Vec<RunningJobRun<SubProcessBackend>>,
|
||||
completed_jobs: Vec<CompletedJobRun<SubProcessBackend>>,
|
||||
failed_jobs: Vec<FailedJobRun<SubProcessBackend>>,
|
||||
config: OrchestratorConfig,
|
||||
}
|
||||
|
||||
|
|
@ -21,7 +24,10 @@ impl Default for Orchestrator<MemoryBELStorage> {
|
|||
fn default() -> Self {
|
||||
Self {
|
||||
bel: Default::default(),
|
||||
job_runs: Default::default(),
|
||||
not_started_jobs: Default::default(),
|
||||
running_jobs: Default::default(),
|
||||
completed_jobs: Default::default(),
|
||||
failed_jobs: Default::default(),
|
||||
config: Default::default(),
|
||||
}
|
||||
}
|
||||
|
|
@ -31,7 +37,10 @@ impl Orchestrator<MemoryBELStorage> {
|
|||
fn copy(&self) -> Self {
|
||||
Self {
|
||||
bel: self.bel.clone(),
|
||||
job_runs: Default::default(),
|
||||
not_started_jobs: Default::default(),
|
||||
running_jobs: Default::default(),
|
||||
completed_jobs: Default::default(),
|
||||
failed_jobs: Default::default(),
|
||||
config: self.config.clone(),
|
||||
}
|
||||
}
|
||||
|
|
@ -39,27 +48,18 @@ impl Orchestrator<MemoryBELStorage> {
|
|||
|
||||
impl<S: BELStorage + Debug> Orchestrator<S> {
|
||||
fn with_config(self, config: OrchestratorConfig) -> Self {
|
||||
Self {
|
||||
bel: self.bel,
|
||||
job_runs: self.job_runs,
|
||||
config,
|
||||
}
|
||||
Self { config, ..self }
|
||||
}
|
||||
|
||||
fn with_jobs(self, jobs: Vec<JobConfiguration>) -> Self {
|
||||
Self {
|
||||
bel: self.bel,
|
||||
job_runs: self.job_runs,
|
||||
config: self.config.with_jobs(jobs),
|
||||
..self
|
||||
}
|
||||
}
|
||||
|
||||
fn with_bel(self, bel: BuildEventLog<S>) -> Self {
|
||||
Self {
|
||||
bel,
|
||||
job_runs: self.job_runs,
|
||||
config: self.config,
|
||||
}
|
||||
Self { bel, ..self }
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -110,30 +110,46 @@ impl<S: BELStorage + Debug> Orchestrator<S> {
|
|||
fn new(storage: S, config: OrchestratorConfig) -> Self {
|
||||
Self {
|
||||
bel: BuildEventLog::new(storage, Default::default()),
|
||||
job_runs: Vec::new(),
|
||||
not_started_jobs: Vec::new(),
|
||||
running_jobs: Vec::new(),
|
||||
completed_jobs: Vec::new(),
|
||||
failed_jobs: Vec::new(),
|
||||
config,
|
||||
}
|
||||
}
|
||||
|
||||
/** Continuously invoked function to watch job run status */
|
||||
fn poll_job_runs(&mut self) -> Result<(), Box<dyn Error>> {
|
||||
// Visit existing jobs, remove completed
|
||||
self.job_runs.retain_mut(|jr| {
|
||||
// Append emitted events
|
||||
let result = jr
|
||||
.visit()
|
||||
.expect("Job visit failed");
|
||||
result.new_events
|
||||
.iter()
|
||||
.for_each(|event| {
|
||||
self.bel
|
||||
.append_event(&event)
|
||||
.expect("Failed to append event");
|
||||
});
|
||||
use crate::job_run::JobRunVisitResult;
|
||||
|
||||
// Retain job run if it doesn't yet have an exit code (still running)
|
||||
result.status.code == JobRunStatusCode::JobRunRunning as i32
|
||||
});
|
||||
// First, start any not-started jobs
|
||||
while let Some(job) = self.not_started_jobs.pop() {
|
||||
let running = job.run()?;
|
||||
self.running_jobs.push(running);
|
||||
}
|
||||
|
||||
// Visit running jobs and transition them to terminal states
|
||||
let mut still_running = Vec::new();
|
||||
for mut job in self.running_jobs.drain(..) {
|
||||
match job.visit()? {
|
||||
JobRunVisitResult::StillRunning => {
|
||||
still_running.push(job);
|
||||
}
|
||||
JobRunVisitResult::Completed(completed) => {
|
||||
// Emit success event
|
||||
let event = completed.state.to_event(&completed.id());
|
||||
self.bel.append_event(&event)?;
|
||||
self.completed_jobs.push(completed);
|
||||
}
|
||||
JobRunVisitResult::Failed(failed) => {
|
||||
// Emit failure event
|
||||
let event = failed.state.to_event(&failed.id());
|
||||
self.bel.append_event(&event)?;
|
||||
self.failed_jobs.push(failed);
|
||||
}
|
||||
}
|
||||
}
|
||||
self.running_jobs = still_running;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
|
@ -152,18 +168,18 @@ impl<S: BELStorage + Debug> Orchestrator<S> {
|
|||
.collect();
|
||||
let grouped_wants = Orchestrator::<S>::group_wants(&self.config, &schedulable_wants);
|
||||
|
||||
if !grouped_wants.want_groups.is_empty() {
|
||||
if !grouped_wants.unhandled_wants.is_empty() {
|
||||
// All wants must be mapped to jobs that can be handled
|
||||
// TODO we probably want to handle this gracefully in the near future
|
||||
Err(format!(
|
||||
"Unable to map following wants: {:?}",
|
||||
&grouped_wants.want_groups
|
||||
&grouped_wants.unhandled_wants
|
||||
)
|
||||
.into())
|
||||
} else {
|
||||
for wg in grouped_wants.want_groups {
|
||||
let job_run = wg.job.spawn(wg.wants)?;
|
||||
self.job_runs.push(job_run);
|
||||
self.not_started_jobs.push(job_run);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
|
|
@ -256,11 +272,15 @@ mod tests {
|
|||
#[test]
|
||||
fn test_empty_wants_noop() {
|
||||
let mut orchestrator = build_orchestrator();
|
||||
assert!(orchestrator.job_runs.is_empty()); // Should init with no work to do
|
||||
// Should init with no work to do
|
||||
assert!(orchestrator.not_started_jobs.is_empty());
|
||||
assert!(orchestrator.running_jobs.is_empty());
|
||||
orchestrator
|
||||
.poll_wants()
|
||||
.expect("shouldn't fail to poll empty wants");
|
||||
assert!(orchestrator.job_runs.is_empty()); // Should still be empty since no work to do
|
||||
// Should still be empty since no work to do
|
||||
assert!(orchestrator.not_started_jobs.is_empty());
|
||||
assert!(orchestrator.running_jobs.is_empty());
|
||||
}
|
||||
|
||||
// Use case: Some schedulable wants with jobs that can be matched should launch those jobs
|
||||
|
|
|
|||
Loading…
Reference in a new issue