523 lines
No EOL
20 KiB
Rust
523 lines
No EOL
20 KiB
Rust
use crate::{JobLogEntry, job_log_entry, WrapperJobEvent};
|
|
use std::collections::HashMap;
|
|
|
|
/// Template for metric extraction from job events
|
|
#[derive(Debug, Clone)]
|
|
pub struct MetricTemplate {
|
|
pub name: String,
|
|
pub help: String,
|
|
pub metric_type: MetricType,
|
|
pub extractor: MetricExtractor,
|
|
pub labels: Vec<String>, // Static label names for this metric
|
|
}
|
|
|
|
/// Prometheus metric types.
///
/// Derives `PartialEq`/`Eq`/`Hash` so values can be compared in assertions and used as
/// map or set keys.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum MetricType {
    /// Prometheus `counter`.
    Counter,
    /// Prometheus `gauge`.
    Gauge,
    /// Prometheus `histogram`.
    Histogram,
    /// Prometheus `summary`.
    Summary,
}
|
|
|
|
/// Strategy for extracting metric values from job events
|
|
#[derive(Debug, Clone)]
|
|
pub enum MetricExtractor {
|
|
/// Extract from job event metadata by key
|
|
EventMetadata {
|
|
event_type: String,
|
|
metadata_key: String,
|
|
/// Optional conversion function name for non-numeric values
|
|
converter: Option<MetricConverter>,
|
|
},
|
|
/// Count occurrences of specific event types
|
|
EventCount {
|
|
event_type: String,
|
|
},
|
|
/// Extract job duration from start/end events
|
|
JobDuration,
|
|
/// Extract peak memory from job summary
|
|
PeakMemory,
|
|
/// Extract total CPU time from job summary
|
|
TotalCpuTime,
|
|
/// Extract exit code from job events
|
|
ExitCode,
|
|
}
|
|
|
|
/// Converters for non-numeric metadata values.
///
/// Derives `PartialEq` so converters can be compared in assertions (`Eq` is not possible
/// because the status mapping holds `f64` values).
#[derive(Debug, Clone, PartialEq)]
pub enum MetricConverter {
    /// Convert boolean strings (`true`/`1`/`yes`, `false`/`0`/`no`, case-insensitive) to 1/0.
    BoolToFloat,
    /// Convert status strings to numeric codes via an exact-match lookup table.
    StatusToCode(HashMap<String, f64>),
    /// Parse duration strings like "123ms", "45s" or "2.5m" to seconds.
    DurationToSeconds,
}
|
|
|
|
/// Result of metric extraction
|
|
#[derive(Debug)]
|
|
pub struct ExtractedMetric {
|
|
pub name: String,
|
|
pub value: f64,
|
|
pub labels: HashMap<String, String>,
|
|
pub help: String,
|
|
pub metric_type: MetricType,
|
|
}
|
|
|
|
impl MetricTemplate {
|
|
/// Extract a metric from a job log entry if applicable
|
|
pub fn extract(&self, entry: &JobLogEntry) -> Option<ExtractedMetric> {
|
|
let value = match &self.extractor {
|
|
MetricExtractor::EventMetadata { event_type, metadata_key, converter } => {
|
|
if let Some(job_log_entry::Content::JobEvent(event)) = &entry.content {
|
|
if event.event_type == *event_type {
|
|
if let Some(raw_value) = event.metadata.get(metadata_key) {
|
|
self.convert_value(raw_value, converter)?
|
|
} else {
|
|
return None;
|
|
}
|
|
} else {
|
|
return None;
|
|
}
|
|
} else {
|
|
return None;
|
|
}
|
|
},
|
|
MetricExtractor::EventCount { event_type } => {
|
|
if let Some(job_log_entry::Content::JobEvent(event)) = &entry.content {
|
|
if event.event_type == *event_type {
|
|
1.0
|
|
} else {
|
|
return None;
|
|
}
|
|
} else {
|
|
return None;
|
|
}
|
|
},
|
|
MetricExtractor::JobDuration => {
|
|
if let Some(job_log_entry::Content::JobEvent(event)) = &entry.content {
|
|
if event.event_type == "job_summary" {
|
|
if let Some(runtime_str) = event.metadata.get("runtime_ms") {
|
|
runtime_str.parse::<f64>().ok()? / 1000.0 // Convert to seconds
|
|
} else {
|
|
return None;
|
|
}
|
|
} else {
|
|
return None;
|
|
}
|
|
} else {
|
|
return None;
|
|
}
|
|
},
|
|
MetricExtractor::PeakMemory => {
|
|
if let Some(job_log_entry::Content::JobEvent(event)) = &entry.content {
|
|
if event.event_type == "job_summary" {
|
|
if let Some(memory_str) = event.metadata.get("peak_memory_mb") {
|
|
memory_str.parse::<f64>().ok()?
|
|
} else {
|
|
return None;
|
|
}
|
|
} else {
|
|
return None;
|
|
}
|
|
} else {
|
|
return None;
|
|
}
|
|
},
|
|
MetricExtractor::TotalCpuTime => {
|
|
if let Some(job_log_entry::Content::JobEvent(event)) = &entry.content {
|
|
if event.event_type == "job_summary" {
|
|
if let Some(cpu_str) = event.metadata.get("total_cpu_ms") {
|
|
cpu_str.parse::<f64>().ok()? / 1000.0 // Convert to seconds
|
|
} else {
|
|
return None;
|
|
}
|
|
} else {
|
|
return None;
|
|
}
|
|
} else {
|
|
return None;
|
|
}
|
|
},
|
|
MetricExtractor::ExitCode => {
|
|
if let Some(job_log_entry::Content::JobEvent(event)) = &entry.content {
|
|
if let Some(exit_code) = event.exit_code {
|
|
exit_code as f64
|
|
} else {
|
|
return None;
|
|
}
|
|
} else {
|
|
return None;
|
|
}
|
|
},
|
|
};
|
|
|
|
// Generate labels for this metric
|
|
let mut labels = HashMap::new();
|
|
|
|
// Always include job_id as a label (but this is excluded by default for cardinality safety)
|
|
labels.insert("job_id".to_string(), entry.job_id.clone());
|
|
|
|
// Extract job label from manifest if available - this is the low-cardinality identifier
|
|
if let Some(job_log_entry::Content::Manifest(manifest)) = &entry.content {
|
|
if let Some(task) = &manifest.task {
|
|
if let Some(job) = &task.job {
|
|
labels.insert("job_label".to_string(), job.label.clone());
|
|
}
|
|
}
|
|
}
|
|
|
|
// Add job status and job label if available from job events
|
|
if let Some(job_log_entry::Content::JobEvent(event)) = &entry.content {
|
|
if let Some(job_status) = &event.job_status {
|
|
labels.insert("job_status".to_string(), job_status.clone());
|
|
}
|
|
if let Some(job_label) = &event.job_label {
|
|
labels.insert("job_label".to_string(), job_label.clone());
|
|
}
|
|
}
|
|
|
|
Some(ExtractedMetric {
|
|
name: self.name.clone(),
|
|
value,
|
|
labels,
|
|
help: self.help.clone(),
|
|
metric_type: self.metric_type.clone(),
|
|
})
|
|
}
|
|
|
|
fn convert_value(&self, raw_value: &str, converter: &Option<MetricConverter>) -> Option<f64> {
|
|
match converter {
|
|
None => raw_value.parse().ok(),
|
|
Some(MetricConverter::BoolToFloat) => {
|
|
match raw_value.to_lowercase().as_str() {
|
|
"true" | "1" | "yes" => Some(1.0),
|
|
"false" | "0" | "no" => Some(0.0),
|
|
_ => None,
|
|
}
|
|
},
|
|
Some(MetricConverter::StatusToCode(mapping)) => {
|
|
mapping.get(raw_value).copied()
|
|
},
|
|
Some(MetricConverter::DurationToSeconds) => {
|
|
// Parse formats like "123ms", "45s", "2.5m"
|
|
if raw_value.ends_with("ms") {
|
|
raw_value.trim_end_matches("ms").parse::<f64>().ok().map(|v| v / 1000.0)
|
|
} else if raw_value.ends_with("s") {
|
|
raw_value.trim_end_matches("s").parse::<f64>().ok()
|
|
} else if raw_value.ends_with("m") {
|
|
raw_value.trim_end_matches("m").parse::<f64>().ok().map(|v| v * 60.0)
|
|
} else {
|
|
raw_value.parse::<f64>().ok()
|
|
}
|
|
},
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
/// Get standard DataBuild metric templates
|
|
pub fn get_standard_metrics() -> Vec<MetricTemplate> {
|
|
vec![
|
|
// Job execution metrics
|
|
MetricTemplate {
|
|
name: "databuild_job_duration_seconds".to_string(),
|
|
help: "Duration of job execution in seconds".to_string(),
|
|
metric_type: MetricType::Histogram,
|
|
extractor: MetricExtractor::JobDuration,
|
|
labels: vec!["job_label".to_string()],
|
|
},
|
|
MetricTemplate {
|
|
name: "databuild_job_peak_memory_mb".to_string(),
|
|
help: "Peak memory usage of job in megabytes".to_string(),
|
|
metric_type: MetricType::Gauge,
|
|
extractor: MetricExtractor::PeakMemory,
|
|
labels: vec!["job_label".to_string()],
|
|
},
|
|
MetricTemplate {
|
|
name: "databuild_job_cpu_time_seconds".to_string(),
|
|
help: "Total CPU time consumed by job in seconds".to_string(),
|
|
metric_type: MetricType::Counter,
|
|
extractor: MetricExtractor::TotalCpuTime,
|
|
labels: vec!["job_label".to_string()],
|
|
},
|
|
MetricTemplate {
|
|
name: "databuild_job_exit_code".to_string(),
|
|
help: "Exit code of job execution".to_string(),
|
|
metric_type: MetricType::Gauge,
|
|
extractor: MetricExtractor::ExitCode,
|
|
labels: vec!["job_label".to_string(), "job_status".to_string()],
|
|
},
|
|
|
|
// Job event counters
|
|
MetricTemplate {
|
|
name: "databuild_job_events_total".to_string(),
|
|
help: "Total number of job events".to_string(),
|
|
metric_type: MetricType::Counter,
|
|
extractor: MetricExtractor::EventCount { event_type: "task_success".to_string() },
|
|
labels: vec!["job_label".to_string()],
|
|
},
|
|
MetricTemplate {
|
|
name: "databuild_job_failures_total".to_string(),
|
|
help: "Total number of job failures".to_string(),
|
|
metric_type: MetricType::Counter,
|
|
extractor: MetricExtractor::EventCount { event_type: "task_failed".to_string() },
|
|
labels: vec!["job_label".to_string()],
|
|
},
|
|
MetricTemplate {
|
|
name: "databuild_heartbeats_total".to_string(),
|
|
help: "Total number of heartbeat events".to_string(),
|
|
metric_type: MetricType::Counter,
|
|
extractor: MetricExtractor::EventCount { event_type: "heartbeat".to_string() },
|
|
labels: vec!["job_label".to_string()],
|
|
},
|
|
]
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{PartitionRef, log_message, LogMessage};

    /// Build a template with no static labels around `extractor`.
    fn template(
        name: &str,
        help: &str,
        metric_type: MetricType,
        extractor: MetricExtractor,
    ) -> MetricTemplate {
        MetricTemplate {
            name: name.to_string(),
            help: help.to_string(),
            metric_type,
            extractor,
            labels: vec![],
        }
    }

    /// Build a completed `job_summary` event entry carrying the given summary metadata.
    fn create_test_job_summary_entry(
        job_id: &str,
        runtime_ms: &str,
        memory_mb: &str,
        cpu_ms: &str,
        exit_code: i32,
    ) -> JobLogEntry {
        let metadata: HashMap<String, String> = vec![
            ("runtime_ms", runtime_ms.to_string()),
            ("peak_memory_mb", memory_mb.to_string()),
            ("total_cpu_ms", cpu_ms.to_string()),
            ("exit_code", exit_code.to_string()),
        ]
        .into_iter()
        .map(|(key, value)| (key.to_string(), value))
        .collect();

        let event = WrapperJobEvent {
            event_type: "job_summary".to_string(),
            job_status: Some("JOB_COMPLETED".to_string()),
            exit_code: Some(exit_code),
            metadata,
            job_label: None,
        };

        JobLogEntry {
            timestamp: "1234567890".to_string(),
            job_id: job_id.to_string(),
            outputs: vec![PartitionRef { r#str: "reviews/date=2025-01-27".to_string() }],
            sequence_number: 1,
            content: Some(job_log_entry::Content::JobEvent(event)),
        }
    }

    /// Build a `task_success` event entry with empty metadata.
    fn create_test_task_success_entry(job_id: &str) -> JobLogEntry {
        let event = WrapperJobEvent {
            event_type: "task_success".to_string(),
            job_status: Some("JOB_COMPLETED".to_string()),
            exit_code: Some(0),
            metadata: HashMap::new(),
            job_label: None,
        };

        JobLogEntry {
            timestamp: "1234567890".to_string(),
            job_id: job_id.to_string(),
            outputs: vec![PartitionRef { r#str: "podcasts/date=2025-01-27".to_string() }],
            sequence_number: 2,
            content: Some(job_log_entry::Content::JobEvent(event)),
        }
    }

    #[test]
    fn test_job_duration_extraction() {
        let duration = template(
            "test_duration",
            "Test duration",
            MetricType::Histogram,
            MetricExtractor::JobDuration,
        );
        let entry = create_test_job_summary_entry("test-job", "2500", "64.5", "1200", 0);

        let metric = duration.extract(&entry).unwrap();

        assert_eq!(metric.name, "test_duration");
        assert_eq!(metric.value, 2.5); // 2500ms -> 2.5s
        assert_eq!(metric.labels.get("job_id").unwrap(), "test-job");
        // Note: this entry's event has `job_label: None`, so no job_label label is expected.
    }

    #[test]
    fn test_memory_extraction() {
        let memory = template(
            "test_memory",
            "Test memory",
            MetricType::Gauge,
            MetricExtractor::PeakMemory,
        );
        let entry = create_test_job_summary_entry("test-job", "2500", "128.75", "1200", 0);

        let metric = memory.extract(&entry).unwrap();

        assert_eq!(metric.value, 128.75);
    }

    #[test]
    fn test_cpu_time_extraction() {
        let cpu = template(
            "test_cpu",
            "Test CPU",
            MetricType::Counter,
            MetricExtractor::TotalCpuTime,
        );
        let entry = create_test_job_summary_entry("test-job", "2500", "64.5", "1500", 0);

        let metric = cpu.extract(&entry).unwrap();

        assert_eq!(metric.value, 1.5); // 1500ms -> 1.5s
    }

    #[test]
    fn test_exit_code_extraction() {
        let exit_code = template(
            "test_exit_code",
            "Test exit code",
            MetricType::Gauge,
            MetricExtractor::ExitCode,
        );
        let entry = create_test_job_summary_entry("test-job", "2500", "64.5", "1200", 42);

        let metric = exit_code.extract(&entry).unwrap();

        assert_eq!(metric.value, 42.0);
        assert_eq!(metric.labels.get("job_status").unwrap(), "JOB_COMPLETED");
    }

    #[test]
    fn test_event_count_extraction() {
        let success_count = template(
            "test_success_count",
            "Test success count",
            MetricType::Counter,
            MetricExtractor::EventCount { event_type: "task_success".to_string() },
        );
        let entry = create_test_task_success_entry("test-job");

        let metric = success_count.extract(&entry).unwrap();

        assert_eq!(metric.value, 1.0);
        // Note: this entry's event has `job_label: None`, so no job_label label is expected.
    }

    #[test]
    fn test_event_metadata_extraction() {
        let runtime = template(
            "test_runtime",
            "Test runtime from metadata",
            MetricType::Gauge,
            MetricExtractor::EventMetadata {
                event_type: "job_summary".to_string(),
                metadata_key: "runtime_ms".to_string(),
                converter: None,
            },
        );
        let entry = create_test_job_summary_entry("test-job", "3000", "64.5", "1200", 0);

        let metric = runtime.extract(&entry).unwrap();

        assert_eq!(metric.value, 3000.0);
    }

    #[test]
    fn test_bool_converter() {
        let bool_template = template(
            "test_bool",
            "Test bool",
            MetricType::Gauge,
            MetricExtractor::EventMetadata {
                event_type: "test_event".to_string(),
                metadata_key: "success".to_string(),
                converter: Some(MetricConverter::BoolToFloat),
            },
        );
        let converter = Some(MetricConverter::BoolToFloat);
        let cases: [(&str, Option<f64>); 5] = [
            ("true", Some(1.0)),
            ("false", Some(0.0)),
            ("yes", Some(1.0)),
            ("no", Some(0.0)),
            ("invalid", None),
        ];

        for (raw, expected) in cases.iter() {
            assert_eq!(bool_template.convert_value(raw, &converter), *expected, "input {:?}", raw);
        }
    }

    #[test]
    fn test_duration_converter() {
        let duration_template = template(
            "test_duration",
            "Test duration",
            MetricType::Gauge,
            MetricExtractor::EventMetadata {
                event_type: "test_event".to_string(),
                metadata_key: "duration".to_string(),
                converter: Some(MetricConverter::DurationToSeconds),
            },
        );
        let converter = Some(MetricConverter::DurationToSeconds);
        let cases: [(&str, Option<f64>); 4] = [
            ("1000ms", Some(1.0)),
            ("5s", Some(5.0)),
            ("2.5m", Some(150.0)),
            ("42", Some(42.0)),
        ];

        for (raw, expected) in cases.iter() {
            assert_eq!(
                duration_template.convert_value(raw, &converter),
                *expected,
                "input {:?}",
                raw
            );
        }
    }

    #[test]
    fn test_standard_metrics() {
        let metrics = get_standard_metrics();
        assert!(!metrics.is_empty());

        // Verify we have the key metrics
        let expected_names = [
            "databuild_job_duration_seconds",
            "databuild_job_peak_memory_mb",
            "databuild_job_cpu_time_seconds",
            "databuild_job_failures_total",
        ];
        for expected in expected_names.iter() {
            assert!(
                metrics.iter().any(|m| m.name == *expected),
                "missing standard metric {}",
                expected
            );
        }
    }

    #[test]
    fn test_no_extraction_for_wrong_event_type() {
        let failures = template(
            "test_metric",
            "Test",
            MetricType::Counter,
            MetricExtractor::EventCount { event_type: "task_failed".to_string() },
        );
        // This is task_success, not task_failed
        let entry = create_test_task_success_entry("test-job");

        assert!(failures.extract(&entry).is_none());
    }

    #[test]
    fn test_no_extraction_for_log_entries() {
        let duration = template(
            "test_metric",
            "Test",
            MetricType::Counter,
            MetricExtractor::JobDuration,
        );

        // Create a log entry instead of job event
        let log = LogMessage {
            level: log_message::LogLevel::Info as i32,
            message: "Test log message".to_string(),
            fields: HashMap::new(),
        };
        let entry = JobLogEntry {
            timestamp: "1234567890".to_string(),
            job_id: "test-job".to_string(),
            outputs: vec![PartitionRef { r#str: "test/partition".to_string() }],
            sequence_number: 1,
            content: Some(job_log_entry::Content::Log(log)),
        };

        assert!(duration.extract(&entry).is_none());
    }
}