Exec wrapper metrics and heartbeat

This commit is contained in:
Stuart Axelbrooke 2025-07-28 03:55:40 -07:00
parent f1bd273816
commit 79f316e0db
5 changed files with 670 additions and 180 deletions

View file

@@ -131,6 +131,10 @@ crate.spec(
package = "rust-embed", package = "rust-embed",
version = "8.0", version = "8.0",
) )
crate.spec(
package = "sysinfo",
version = "0.30",
)
crate.from_specs() crate.from_specs()
use_repo(crate, "crates") use_repo(crate, "crates")

File diff suppressed because one or more lines are too long

View file

@@ -9,6 +9,7 @@ rust_binary(
"@crates//:serde", "@crates//:serde",
"@crates//:serde_json", "@crates//:serde_json",
"@crates//:uuid", "@crates//:uuid",
"@crates//:sysinfo",
], ],
) )
@@ -20,5 +21,6 @@ rust_test(
"@crates//:serde", "@crates//:serde",
"@crates//:serde_json", "@crates//:serde_json",
"@crates//:uuid", "@crates//:uuid",
"@crates//:sysinfo",
], ],
) )

View file

@@ -1,10 +1,13 @@
use std::env; use std::env;
use std::io::{self, Read, Write}; use std::io::{self, Read, Write};
use std::process::{Command, Stdio}; use std::process::{Command, Stdio};
use std::time::{SystemTime, UNIX_EPOCH}; use std::sync::{Arc, Mutex};
use std::time::{SystemTime, UNIX_EPOCH, Duration};
use std::thread;
// All serialization handled by protobuf serde derives // All serialization handled by protobuf serde derives
use serde_json; use serde_json;
use uuid::Uuid; use uuid::Uuid;
use sysinfo::{System, ProcessRefreshKind, Pid};
// Import protobuf types from databuild // Import protobuf types from databuild
use databuild::{ use databuild::{
@@ -141,13 +144,128 @@ impl JobWrapper {
} }
let mut child = cmd.spawn()?; let mut child = cmd.spawn()?;
let child_pid = child.id();
// Send the config to the job // Send the config to the job
if let Some(stdin) = child.stdin.as_mut() { if let Some(stdin) = child.stdin.as_mut() {
stdin.write_all(buffer.as_bytes())?; stdin.write_all(buffer.as_bytes())?;
} }
let output = child.wait_with_output()?; // Start heartbeat thread
let heartbeat_job_id = self.job_id.clone();
let heartbeat_partition_ref = partition_ref.clone();
let heartbeat_sequence = Arc::new(Mutex::new(0u64));
let heartbeat_sequence_clone = heartbeat_sequence.clone();
let heartbeat_handle = thread::spawn(move || {
let mut system = System::new_all();
let pid = Pid::from(child_pid as usize);
loop {
thread::sleep(Duration::from_secs(30));
// Refresh process info
system.refresh_processes_specifics(ProcessRefreshKind::new());
// Check if process still exists
if let Some(process) = system.process(pid) {
let memory_mb = process.memory() / 1024 / 1024;
let cpu_percent = process.cpu_usage();
// Create heartbeat event with metrics
let mut metadata = std::collections::HashMap::new();
metadata.insert("memory_usage_mb".to_string(), memory_mb.to_string());
metadata.insert("cpu_usage_percent".to_string(), cpu_percent.to_string());
// Get next sequence number for heartbeat
let seq = {
let mut seq_lock = heartbeat_sequence_clone.lock().unwrap();
*seq_lock += 1;
*seq_lock
};
let heartbeat_event = JobLogEntry {
timestamp: JobWrapper::get_timestamp(),
job_id: heartbeat_job_id.clone(),
partition_ref: heartbeat_partition_ref.clone(),
sequence_number: seq,
content: Some(job_log_entry::Content::JobEvent(WrapperJobEvent {
event_type: "heartbeat".to_string(),
job_status: None,
exit_code: None,
metadata,
})),
};
// Print the heartbeat (thread-safe since println! is synchronized)
println!("{}", serde_json::to_string(&heartbeat_event).unwrap());
} else {
// Process no longer exists, exit heartbeat thread
break;
}
}
});
// Track metrics while job is running
let job_start_time = SystemTime::now();
let mut system = System::new_all();
let pid = Pid::from(child_pid as usize);
let mut peak_memory_mb = 0u64;
let mut cpu_samples = Vec::new();
let mut stdout_buffer = Vec::new();
let mut stderr_buffer = Vec::new();
// Poll process status and metrics
let (output, peak_memory_mb, total_cpu_ms, job_duration) = loop {
// Check if process has exited
match child.try_wait()? {
Some(status) => {
// Process has exited, collect any remaining output
if let Some(mut stdout) = child.stdout.take() {
stdout.read_to_end(&mut stdout_buffer)?;
}
if let Some(mut stderr) = child.stderr.take() {
stderr.read_to_end(&mut stderr_buffer)?;
}
// Calculate final metrics
let job_duration = job_start_time.elapsed()
.map_err(|e| io::Error::new(io::ErrorKind::Other, format!("Time calculation error: {}", e)))?;
let total_cpu_ms = (cpu_samples.iter().sum::<f32>() * 10.0) as u64; // Convert to milliseconds
// Stop heartbeat thread
drop(heartbeat_handle);
// Update sequence number to account for heartbeats
let heartbeat_count = heartbeat_sequence.lock().unwrap();
self.sequence_number = self.sequence_number.max(*heartbeat_count);
drop(heartbeat_count);
// Create output struct to match original behavior
let output = std::process::Output {
status,
stdout: stdout_buffer,
stderr: stderr_buffer,
};
break (output, peak_memory_mb, total_cpu_ms, job_duration);
}
None => {
// Process still running, collect metrics
system.refresh_processes_specifics(ProcessRefreshKind::new());
if let Some(process) = system.process(pid) {
let memory_mb = process.memory() / 1024 / 1024;
peak_memory_mb = peak_memory_mb.max(memory_mb);
cpu_samples.push(process.cpu_usage());
}
// Sleep briefly before next poll (100ms)
thread::sleep(Duration::from_millis(100));
}
}
};
let success = output.status.success(); let success = output.status.success();
let exit_code = output.status.code().unwrap_or(-1); let exit_code = output.status.code().unwrap_or(-1);
@@ -170,6 +288,20 @@ impl JobWrapper {
})); }));
} }
// Emit job summary with resource metrics
let mut summary_metadata = std::collections::HashMap::new();
summary_metadata.insert("runtime_ms".to_string(), job_duration.as_millis().to_string());
summary_metadata.insert("peak_memory_mb".to_string(), peak_memory_mb.to_string());
summary_metadata.insert("total_cpu_ms".to_string(), total_cpu_ms.to_string());
summary_metadata.insert("exit_code".to_string(), exit_code.to_string());
self.emit_log(&partition_ref, job_log_entry::Content::JobEvent(WrapperJobEvent {
event_type: "job_summary".to_string(),
job_status: None,
exit_code: Some(exit_code),
metadata: summary_metadata,
}));
if success { if success {
// Following the state diagram: wrapper_monitor_task -> zero exit -> emit_task_success // Following the state diagram: wrapper_monitor_task -> zero exit -> emit_task_success
self.emit_log(&partition_ref, job_log_entry::Content::JobEvent(WrapperJobEvent { self.emit_log(&partition_ref, job_log_entry::Content::JobEvent(WrapperJobEvent {

File diff suppressed because one or more lines are too long