Exec wrapper metrics and heartbeat
This commit is contained in:
parent
f1bd273816
commit
79f316e0db
5 changed files with 670 additions and 180 deletions
|
|
@ -131,6 +131,10 @@ crate.spec(
|
|||
package = "rust-embed",
|
||||
version = "8.0",
|
||||
)
|
||||
crate.spec(
|
||||
package = "sysinfo",
|
||||
version = "0.30",
|
||||
)
|
||||
crate.from_specs()
|
||||
use_repo(crate, "crates")
|
||||
|
||||
|
|
|
|||
File diff suppressed because one or more lines are too long
|
|
@ -9,6 +9,7 @@ rust_binary(
|
|||
"@crates//:serde",
|
||||
"@crates//:serde_json",
|
||||
"@crates//:uuid",
|
||||
"@crates//:sysinfo",
|
||||
],
|
||||
)
|
||||
|
||||
|
|
@ -20,5 +21,6 @@ rust_test(
|
|||
"@crates//:serde",
|
||||
"@crates//:serde_json",
|
||||
"@crates//:uuid",
|
||||
"@crates//:sysinfo",
|
||||
],
|
||||
)
|
||||
|
|
|
|||
|
|
@ -1,10 +1,13 @@
|
|||
use std::env;
|
||||
use std::io::{self, Read, Write};
|
||||
use std::process::{Command, Stdio};
|
||||
use std::time::{SystemTime, UNIX_EPOCH};
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::time::{SystemTime, UNIX_EPOCH, Duration};
|
||||
use std::thread;
|
||||
// All serialization handled by protobuf serde derives
|
||||
use serde_json;
|
||||
use uuid::Uuid;
|
||||
use sysinfo::{System, ProcessRefreshKind, Pid};
|
||||
|
||||
// Import protobuf types from databuild
|
||||
use databuild::{
|
||||
|
|
@ -141,13 +144,128 @@ impl JobWrapper {
|
|||
}
|
||||
|
||||
let mut child = cmd.spawn()?;
|
||||
let child_pid = child.id();
|
||||
|
||||
// Send the config to the job
|
||||
if let Some(stdin) = child.stdin.as_mut() {
|
||||
stdin.write_all(buffer.as_bytes())?;
|
||||
}
|
||||
|
||||
let output = child.wait_with_output()?;
|
||||
// Start heartbeat thread
|
||||
let heartbeat_job_id = self.job_id.clone();
|
||||
let heartbeat_partition_ref = partition_ref.clone();
|
||||
let heartbeat_sequence = Arc::new(Mutex::new(0u64));
|
||||
let heartbeat_sequence_clone = heartbeat_sequence.clone();
|
||||
|
||||
let heartbeat_handle = thread::spawn(move || {
|
||||
let mut system = System::new_all();
|
||||
let pid = Pid::from(child_pid as usize);
|
||||
|
||||
loop {
|
||||
thread::sleep(Duration::from_secs(30));
|
||||
|
||||
// Refresh process info
|
||||
system.refresh_processes_specifics(ProcessRefreshKind::new());
|
||||
|
||||
// Check if process still exists
|
||||
if let Some(process) = system.process(pid) {
|
||||
let memory_mb = process.memory() / 1024 / 1024;
|
||||
let cpu_percent = process.cpu_usage();
|
||||
|
||||
// Create heartbeat event with metrics
|
||||
let mut metadata = std::collections::HashMap::new();
|
||||
metadata.insert("memory_usage_mb".to_string(), memory_mb.to_string());
|
||||
metadata.insert("cpu_usage_percent".to_string(), cpu_percent.to_string());
|
||||
|
||||
// Get next sequence number for heartbeat
|
||||
let seq = {
|
||||
let mut seq_lock = heartbeat_sequence_clone.lock().unwrap();
|
||||
*seq_lock += 1;
|
||||
*seq_lock
|
||||
};
|
||||
|
||||
let heartbeat_event = JobLogEntry {
|
||||
timestamp: JobWrapper::get_timestamp(),
|
||||
job_id: heartbeat_job_id.clone(),
|
||||
partition_ref: heartbeat_partition_ref.clone(),
|
||||
sequence_number: seq,
|
||||
content: Some(job_log_entry::Content::JobEvent(WrapperJobEvent {
|
||||
event_type: "heartbeat".to_string(),
|
||||
job_status: None,
|
||||
exit_code: None,
|
||||
metadata,
|
||||
})),
|
||||
};
|
||||
|
||||
// Print the heartbeat (thread-safe since println! is synchronized)
|
||||
println!("{}", serde_json::to_string(&heartbeat_event).unwrap());
|
||||
} else {
|
||||
// Process no longer exists, exit heartbeat thread
|
||||
break;
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
// Track metrics while job is running
|
||||
let job_start_time = SystemTime::now();
|
||||
let mut system = System::new_all();
|
||||
let pid = Pid::from(child_pid as usize);
|
||||
|
||||
let mut peak_memory_mb = 0u64;
|
||||
let mut cpu_samples = Vec::new();
|
||||
let mut stdout_buffer = Vec::new();
|
||||
let mut stderr_buffer = Vec::new();
|
||||
|
||||
// Poll process status and metrics
|
||||
let (output, peak_memory_mb, total_cpu_ms, job_duration) = loop {
|
||||
// Check if process has exited
|
||||
match child.try_wait()? {
|
||||
Some(status) => {
|
||||
// Process has exited, collect any remaining output
|
||||
if let Some(mut stdout) = child.stdout.take() {
|
||||
stdout.read_to_end(&mut stdout_buffer)?;
|
||||
}
|
||||
if let Some(mut stderr) = child.stderr.take() {
|
||||
stderr.read_to_end(&mut stderr_buffer)?;
|
||||
}
|
||||
|
||||
// Calculate final metrics
|
||||
let job_duration = job_start_time.elapsed()
|
||||
.map_err(|e| io::Error::new(io::ErrorKind::Other, format!("Time calculation error: {}", e)))?;
|
||||
let total_cpu_ms = (cpu_samples.iter().sum::<f32>() * 10.0) as u64; // Convert to milliseconds
|
||||
|
||||
// Stop heartbeat thread
|
||||
drop(heartbeat_handle);
|
||||
|
||||
// Update sequence number to account for heartbeats
|
||||
let heartbeat_count = heartbeat_sequence.lock().unwrap();
|
||||
self.sequence_number = self.sequence_number.max(*heartbeat_count);
|
||||
drop(heartbeat_count);
|
||||
|
||||
// Create output struct to match original behavior
|
||||
let output = std::process::Output {
|
||||
status,
|
||||
stdout: stdout_buffer,
|
||||
stderr: stderr_buffer,
|
||||
};
|
||||
|
||||
break (output, peak_memory_mb, total_cpu_ms, job_duration);
|
||||
}
|
||||
None => {
|
||||
// Process still running, collect metrics
|
||||
system.refresh_processes_specifics(ProcessRefreshKind::new());
|
||||
|
||||
if let Some(process) = system.process(pid) {
|
||||
let memory_mb = process.memory() / 1024 / 1024;
|
||||
peak_memory_mb = peak_memory_mb.max(memory_mb);
|
||||
cpu_samples.push(process.cpu_usage());
|
||||
}
|
||||
|
||||
// Sleep briefly before next poll (100ms)
|
||||
thread::sleep(Duration::from_millis(100));
|
||||
}
|
||||
}
|
||||
};
|
||||
let success = output.status.success();
|
||||
let exit_code = output.status.code().unwrap_or(-1);
|
||||
|
||||
|
|
@ -170,6 +288,20 @@ impl JobWrapper {
|
|||
}));
|
||||
}
|
||||
|
||||
// Emit job summary with resource metrics
|
||||
let mut summary_metadata = std::collections::HashMap::new();
|
||||
summary_metadata.insert("runtime_ms".to_string(), job_duration.as_millis().to_string());
|
||||
summary_metadata.insert("peak_memory_mb".to_string(), peak_memory_mb.to_string());
|
||||
summary_metadata.insert("total_cpu_ms".to_string(), total_cpu_ms.to_string());
|
||||
summary_metadata.insert("exit_code".to_string(), exit_code.to_string());
|
||||
|
||||
self.emit_log(&partition_ref, job_log_entry::Content::JobEvent(WrapperJobEvent {
|
||||
event_type: "job_summary".to_string(),
|
||||
job_status: None,
|
||||
exit_code: Some(exit_code),
|
||||
metadata: summary_metadata,
|
||||
}));
|
||||
|
||||
if success {
|
||||
// Following the state diagram: wrapper_monitor_task -> zero exit -> emit_task_success
|
||||
self.emit_log(&partition_ref, job_log_entry::Content::JobEvent(WrapperJobEvent {
|
||||
|
|
|
|||
File diff suppressed because one or more lines are too long
Loading…
Reference in a new issue