Exec wrapper metrics and heartbeat
parent f1bd273816
commit 79f316e0db
5 changed files with 670 additions and 180 deletions
@@ -131,6 +131,10 @@ crate.spec(
     package = "rust-embed",
     version = "8.0",
 )
+crate.spec(
+    package = "sysinfo",
+    version = "0.30",
+)
 crate.from_specs()
 use_repo(crate, "crates")

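For context, a minimal standalone sketch of how the newly added sysinfo 0.30 dependency gets exercised. It samples this process's own PID (an assumption for illustration; the wrapper code further down applies the same calls to the spawned child's PID):

use std::{thread, time::Duration};
use sysinfo::{Pid, ProcessRefreshKind, System};

fn main() {
    // Sample our own process; the wrapper uses the child's PID instead.
    let pid = Pid::from(std::process::id() as usize);
    let mut system = System::new_all();

    // cpu_usage() is a delta, so it only becomes meaningful after two
    // refreshes a short interval apart.
    system.refresh_processes_specifics(ProcessRefreshKind::everything());
    thread::sleep(Duration::from_millis(200));
    system.refresh_processes_specifics(ProcessRefreshKind::everything());

    if let Some(process) = system.process(pid) {
        // memory() reports bytes in sysinfo 0.30, hence /1024/1024 for MB,
        // matching the conversion used in the wrapper diff below.
        println!("memory_mb={}", process.memory() / 1024 / 1024);
        println!("cpu_percent={:.1}", process.cpu_usage());
    }
}
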
File diff suppressed because one or more lines are too long
@@ -9,6 +9,7 @@ rust_binary(
         "@crates//:serde",
         "@crates//:serde_json",
         "@crates//:uuid",
+        "@crates//:sysinfo",
     ],
 )

@@ -20,5 +21,6 @@ rust_test(
         "@crates//:serde",
         "@crates//:serde_json",
         "@crates//:uuid",
+        "@crates//:sysinfo",
     ],
 )

@@ -1,10 +1,13 @@
 use std::env;
 use std::io::{self, Read, Write};
 use std::process::{Command, Stdio};
-use std::time::{SystemTime, UNIX_EPOCH};
+use std::sync::{Arc, Mutex};
+use std::time::{SystemTime, UNIX_EPOCH, Duration};
+use std::thread;
 // All serialization handled by protobuf serde derives
 use serde_json;
 use uuid::Uuid;
+use sysinfo::{System, ProcessRefreshKind, Pid};

 // Import protobuf types from databuild
 use databuild::{

@@ -141,13 +144,128 @@ impl JobWrapper {
         }

         let mut child = cmd.spawn()?;
+        let child_pid = child.id();

         // Send the config to the job
         if let Some(stdin) = child.stdin.as_mut() {
             stdin.write_all(buffer.as_bytes())?;
         }

-        let output = child.wait_with_output()?;
+        // Start heartbeat thread
+        let heartbeat_job_id = self.job_id.clone();
+        let heartbeat_partition_ref = partition_ref.clone();
+        let heartbeat_sequence = Arc::new(Mutex::new(0u64));
+        let heartbeat_sequence_clone = heartbeat_sequence.clone();
+
+        let heartbeat_handle = thread::spawn(move || {
+            let mut system = System::new_all();
+            let pid = Pid::from(child_pid as usize);
+
+            loop {
+                thread::sleep(Duration::from_secs(30));
+
+                // Refresh process info
+                system.refresh_processes_specifics(ProcessRefreshKind::new());
+
+                // Check if process still exists
+                if let Some(process) = system.process(pid) {
+                    let memory_mb = process.memory() / 1024 / 1024;
+                    let cpu_percent = process.cpu_usage();
+
+                    // Create heartbeat event with metrics
+                    let mut metadata = std::collections::HashMap::new();
+                    metadata.insert("memory_usage_mb".to_string(), memory_mb.to_string());
+                    metadata.insert("cpu_usage_percent".to_string(), cpu_percent.to_string());
+
+                    // Get next sequence number for heartbeat
+                    let seq = {
+                        let mut seq_lock = heartbeat_sequence_clone.lock().unwrap();
+                        *seq_lock += 1;
+                        *seq_lock
+                    };
+
+                    let heartbeat_event = JobLogEntry {
+                        timestamp: JobWrapper::get_timestamp(),
+                        job_id: heartbeat_job_id.clone(),
+                        partition_ref: heartbeat_partition_ref.clone(),
+                        sequence_number: seq,
+                        content: Some(job_log_entry::Content::JobEvent(WrapperJobEvent {
+                            event_type: "heartbeat".to_string(),
+                            job_status: None,
+                            exit_code: None,
+                            metadata,
+                        })),
+                    };
+
+                    // Print the heartbeat (thread-safe since println! is synchronized)
+                    println!("{}", serde_json::to_string(&heartbeat_event).unwrap());
+                } else {
+                    // Process no longer exists, exit heartbeat thread
+                    break;
+                }
+            }
+        });
+
+        // Track metrics while job is running
+        let job_start_time = SystemTime::now();
+        let mut system = System::new_all();
+        let pid = Pid::from(child_pid as usize);
+
+        let mut peak_memory_mb = 0u64;
+        let mut cpu_samples = Vec::new();
+        let mut stdout_buffer = Vec::new();
+        let mut stderr_buffer = Vec::new();
+
+        // Poll process status and metrics
+        let (output, peak_memory_mb, total_cpu_ms, job_duration) = loop {
+            // Check if process has exited
+            match child.try_wait()? {
+                Some(status) => {
+                    // Process has exited, collect any remaining output
+                    if let Some(mut stdout) = child.stdout.take() {
+                        stdout.read_to_end(&mut stdout_buffer)?;
+                    }
+                    if let Some(mut stderr) = child.stderr.take() {
+                        stderr.read_to_end(&mut stderr_buffer)?;
+                    }
+
+                    // Calculate final metrics
+                    let job_duration = job_start_time.elapsed()
+                        .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("Time calculation error: {}", e)))?;
+                    let total_cpu_ms = (cpu_samples.iter().sum::<f32>() * 10.0) as u64; // Convert to milliseconds
+
+                    // Stop heartbeat thread
+                    drop(heartbeat_handle);
+
+                    // Update sequence number to account for heartbeats
+                    let heartbeat_count = heartbeat_sequence.lock().unwrap();
+                    self.sequence_number = self.sequence_number.max(*heartbeat_count);
+                    drop(heartbeat_count);
+
+                    // Create output struct to match original behavior
+                    let output = std::process::Output {
+                        status,
+                        stdout: stdout_buffer,
+                        stderr: stderr_buffer,
+                    };
+
+                    break (output, peak_memory_mb, total_cpu_ms, job_duration);
+                }
+                None => {
+                    // Process still running, collect metrics
+                    system.refresh_processes_specifics(ProcessRefreshKind::new());
+
+                    if let Some(process) = system.process(pid) {
+                        let memory_mb = process.memory() / 1024 / 1024;
+                        peak_memory_mb = peak_memory_mb.max(memory_mb);
+                        cpu_samples.push(process.cpu_usage());
+                    }
+
+                    // Sleep briefly before next poll (100ms)
+                    thread::sleep(Duration::from_millis(100));
+                }
+            }
+        };
+
         let success = output.status.success();
         let exit_code = output.status.code().unwrap_or(-1);

@@ -170,6 +288,20 @@ impl JobWrapper {
             }));
         }

+        // Emit job summary with resource metrics
+        let mut summary_metadata = std::collections::HashMap::new();
+        summary_metadata.insert("runtime_ms".to_string(), job_duration.as_millis().to_string());
+        summary_metadata.insert("peak_memory_mb".to_string(), peak_memory_mb.to_string());
+        summary_metadata.insert("total_cpu_ms".to_string(), total_cpu_ms.to_string());
+        summary_metadata.insert("exit_code".to_string(), exit_code.to_string());
+
+        self.emit_log(&partition_ref, job_log_entry::Content::JobEvent(WrapperJobEvent {
+            event_type: "job_summary".to_string(),
+            job_status: None,
+            exit_code: Some(exit_code),
+            metadata: summary_metadata,
+        }));
+
         if success {
             // Following the state diagram: wrapper_monitor_task -> zero exit -> emit_task_success
             self.emit_log(&partition_ref, job_log_entry::Content::JobEvent(WrapperJobEvent {
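For orientation, with these changes the wrapper's stdout would interleave heartbeat and job_summary entries with the existing job events, roughly like the example output below. The field names and nesting are an assumption about how the protobuf serde derives render JobLogEntry and WrapperJobEvent, and the values are invented placeholders, so treat this as an illustrative sketch rather than the exact wire format:

{"timestamp": 1718000000, "job_id": "example-job-id", "partition_ref": "example/partition", "sequence_number": 7, "job_event": {"event_type": "heartbeat", "job_status": null, "exit_code": null, "metadata": {"memory_usage_mb": "512", "cpu_usage_percent": "87.5"}}}
{"timestamp": 1718000042, "job_id": "example-job-id", "partition_ref": "example/partition", "sequence_number": 9, "job_event": {"event_type": "job_summary", "job_status": null, "exit_code": 0, "metadata": {"runtime_ms": "41870", "peak_memory_mb": "640", "total_cpu_ms": "12030", "exit_code": "0"}}}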
File diff suppressed because one or more lines are too long