Add rust execute impl
parent f2567f7567
commit 91d5fd26bc

2 changed files with 445 additions and 0 deletions
@@ -13,6 +13,21 @@ go_binary(
     visibility = ["//visibility:public"],
 )
 
+rust_binary(
+    name = "execute_rs",
+    srcs = ["execute.rs"],
+    edition = "2021",
+    deps = [
+        "//databuild:structs",
+        "@crates//:serde",
+        "@crates//:serde_json",
+        "@crates//:log",
+        "@crates//:simple_logger",
+        "@crates//:crossbeam-channel",
+    ],
+    visibility = ["//visibility:public"],
+)
+
 rust_binary(
     name = "analyze",
     srcs = ["analyze.rs"],
databuild/graph/execute.rs (new file, 430 lines)

@@ -0,0 +1,430 @@
use structs::{DataDepType, JobConfig, JobGraph, Task};
use crossbeam_channel::{Receiver, Sender};
use log::{debug, error, info, warn};
use serde::{Deserialize, Serialize};
use std::collections::{HashMap, HashSet};
use std::io::{Read, Write};
use std::path::{Path, PathBuf};
use std::process::{Command, Stdio};
use std::sync::Arc;
use std::thread;
use std::time::{Duration, Instant};

const NUM_WORKERS: usize = 4;
const LOG_INTERVAL: Duration = Duration::from_secs(5);
const FAIL_FAST: bool = true; // Same default as the Go version

#[derive(Debug, Clone, PartialEq, Eq)]
enum TaskState {
    Pending,
    Running,
    Succeeded,
    Failed,
}

#[derive(Debug, Clone)]
struct TaskExecutionResult {
    task_key: String,
    job_label: String, // For logging
    success: bool,
    stdout: String,
    stderr: String,
    duration: Duration,
    error_message: Option<String>,
}

// Generates a unique key for a task based on its JobLabel, input and output references.
// Mirrors the Go implementation's getTaskKey.
fn get_task_key(task: &Task) -> String {
    let mut key_parts = Vec::new();
    key_parts.push(task.job_label.clone());

    for input_dep in &task.config.inputs {
        key_parts.push(format!("input:{}", input_dep.reference));
    }
    for output_ref in &task.config.outputs {
        key_parts.push(format!("output:{}", output_ref));
    }
    key_parts.join("|")
}
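
// Illustrative example of the key format produced above (hypothetical label and
// references, not taken from this repository): a task with job_label
// "//jobs:daily_report", one input reference "data/raw" and one output reference
// "data/report" yields the key
//   "//jobs:daily_report|input:data/raw|output:data/report"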

// Resolves the executable path from runfiles.
// Mirrors the Go implementation's resolveExecutableFromRunfiles.
fn resolve_executable_from_runfiles(job_label: &str) -> PathBuf {
    let mut target_name = job_label.to_string();
    if let Some(colon_index) = job_label.rfind(':') {
        target_name = job_label[colon_index + 1..].to_string();
    } else if let Some(name) = Path::new(job_label).file_name().and_then(|n| n.to_str()) {
        target_name = name.to_string();
    }

    let exec_name = format!("{}.exec", target_name);

    if let Ok(runfiles_dir_str) = std::env::var("RUNFILES_DIR") {
        let path = PathBuf::from(runfiles_dir_str).join("_main").join(&exec_name);
        debug!("Resolved executable path (RUNFILES_DIR): {}", path.display());
        return path;
    }

    if let Ok(current_exe) = std::env::current_exe() {
        let mut runfiles_dir_path = PathBuf::from(format!("{}.runfiles", current_exe.display()));
        if !runfiles_dir_path.is_dir() { // Bazel often puts it next to the binary
            if let Some(parent) = current_exe.parent() {
                runfiles_dir_path = parent.join(format!("{}.runfiles", current_exe.file_name().unwrap_or_default().to_string_lossy()));
            }
        }

        if runfiles_dir_path.is_dir() {
            let path = runfiles_dir_path.join("_main").join(&exec_name);
            debug!("Resolved executable path (derived RUNFILES_DIR): {}", path.display());
            return path;
        } else {
            warn!("Warning: RUNFILES_DIR not found or invalid, and derived path {} is not a directory.", runfiles_dir_path.display());
        }
    } else {
        warn!("Warning: Could not determine current executable path.");
    }

    let fallback_path = PathBuf::from(format!("{}.exec", job_label));
    warn!("Falling back to direct executable path: {}", fallback_path.display());
    fallback_path
}
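
// Illustrative resolution sketch (hypothetical label, assuming a standard Bazel
// runfiles layout): for job_label "//jobs:daily_report" the target name becomes
// "daily_report" and the expected runfile is "daily_report.exec", so with
// RUNFILES_DIR=/tmp/run the resolved path is /tmp/run/_main/daily_report.exec.
// If neither RUNFILES_DIR nor a "<current_exe>.runfiles" directory can be found,
// the function falls back to "<job_label>.exec".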

fn worker(
    task_rx: Receiver<Arc<Task>>,
    result_tx: Sender<TaskExecutionResult>,
    worker_id: usize,
) {
    info!("[Worker {}] Starting", worker_id);
    while let Ok(task) = task_rx.recv() {
        let task_key = get_task_key(&task);
        info!("[Worker {}] Starting job: {} (Key: {})", worker_id, task.job_label, task_key);
        let start_time = Instant::now();

        let exec_path = resolve_executable_from_runfiles(&task.job_label);

        let config_json = match serde_json::to_string(&task.config) {
            Ok(json) => json,
            Err(e) => {
                let err_msg = format!("Failed to serialize task config for {}: {}", task.job_label, e);
                error!("[Worker {}] {}", worker_id, err_msg);
                result_tx
                    .send(TaskExecutionResult {
                        task_key,
                        job_label: task.job_label.clone(),
                        success: false,
                        stdout: String::new(),
                        stderr: err_msg.clone(),
                        duration: start_time.elapsed(),
                        error_message: Some(err_msg),
                    })
                    .unwrap_or_else(|e| error!("[Worker {}] Failed to send error result: {}", worker_id, e));
                continue;
            }
        };

        let mut cmd = Command::new(&exec_path);
        cmd.stdin(Stdio::piped())
            .stdout(Stdio::piped())
            .stderr(Stdio::piped());

        // Set environment variables from the current process's environment
        // This mirrors the Go `cmd.Env = os.Environ()` behavior.
        // Task-specific env vars from task.config.env are passed via JSON through stdin.
        cmd.env_clear(); // Start with no environment variables
        for (key, value) in std::env::vars() {
            cmd.env(key, value); // Add current process's environment variables
        }

        match cmd.spawn() {
            Ok(mut child) => {
                if let Some(mut child_stdin) = child.stdin.take() {
                    if let Err(e) = child_stdin.write_all(config_json.as_bytes()) {
                        let err_msg = format!("[Worker {}] Failed to write to stdin for {}: {}", worker_id, task.job_label, e);
                        error!("{}", err_msg);
                        // Ensure child is killed if stdin write fails before wait
                        let _ = child.kill();
                        let _ = child.wait(); // Reap the child
                        result_tx.send(TaskExecutionResult {
                            task_key,
                            job_label: task.job_label.clone(),
                            success: false,
                            stdout: String::new(),
                            stderr: err_msg.clone(),
                            duration: start_time.elapsed(),
                            error_message: Some(err_msg),
                        })
                        .unwrap_or_else(|e| error!("[Worker {}] Failed to send error result: {}", worker_id, e));
                        continue;
                    }
                    drop(child_stdin); // Close stdin to signal EOF to the child
                } else {
                    let err_msg = format!("[Worker {}] Failed to get stdin for {}", worker_id, task.job_label);
                    error!("{}", err_msg);
                    result_tx.send(TaskExecutionResult {
                        task_key,
                        job_label: task.job_label.clone(),
                        success: false,
                        stdout: String::new(),
                        stderr: err_msg.clone(),
                        duration: start_time.elapsed(),
                        error_message: Some(err_msg),
                    })
                    .unwrap_or_else(|e| error!("[Worker {}] Failed to send error result: {}", worker_id, e));
                    continue;
                }

                match child.wait_with_output() {
                    Ok(output) => {
                        let duration = start_time.elapsed();
                        let success = output.status.success();
                        let stdout = String::from_utf8_lossy(&output.stdout).to_string();
                        let stderr = String::from_utf8_lossy(&output.stderr).to_string();

                        if success {
                            info!(
                                "[Worker {}] Job succeeded: {} (Duration: {:?})",
                                worker_id, task.job_label, duration
                            );
                        } else {
                            error!(
                                "[Worker {}] Job failed: {} (Duration: {:?}, Status: {:?})\nStdout: {}\nStderr: {}",
                                worker_id, task.job_label, duration, output.status, stdout, stderr
                            );
                        }
                        result_tx
                            .send(TaskExecutionResult {
                                task_key,
                                job_label: task.job_label.clone(),
                                success,
                                stdout,
                                stderr,
                                duration,
                                error_message: if success { None } else { Some(format!("Exited with status: {:?}", output.status)) },
                            })
                            .unwrap_or_else(|e| error!("[Worker {}] Failed to send result: {}", worker_id, e));
                    }
                    Err(e) => {
                        let err_msg = format!("[Worker {}] Failed to execute or wait for {}: {}", worker_id, task.job_label, e);
                        error!("{}", err_msg);
                        result_tx
                            .send(TaskExecutionResult {
                                task_key,
                                job_label: task.job_label.clone(),
                                success: false,
                                stdout: String::new(),
                                stderr: err_msg.clone(),
                                duration: start_time.elapsed(),
                                error_message: Some(err_msg),
                            })
                            .unwrap_or_else(|e| error!("[Worker {}] Failed to send execution error result: {}", worker_id, e));
                    }
                }
            }
            Err(e) => {
                let err_msg = format!("[Worker {}] Failed to spawn command for {}: {} (Path: {:?})", worker_id, task.job_label, e, exec_path);
                error!("{}", err_msg);
                result_tx
                    .send(TaskExecutionResult {
                        task_key,
                        job_label: task.job_label.clone(),
                        success: false,
                        stdout: String::new(),
                        stderr: err_msg.clone(),
                        duration: start_time.elapsed(),
                        error_message: Some(err_msg),
                    })
                    .unwrap_or_else(|e| error!("[Worker {}] Failed to send spawn error result: {}", worker_id, e));
            }
        }
    }
    info!("[Worker {}] Exiting", worker_id);
}
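
// Protocol sketch (a summary of the loop above, not additional behavior): each
// worker pulls Arc<Task> values off task_rx, serializes the task's config to JSON,
// writes it to the child process's stdin and closes it, then waits for the child
// and reports a TaskExecutionResult on result_tx. The invoked job binary is
// presumably expected to read its JobConfig as JSON from stdin.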

fn is_task_ready(task: &Task, completed_outputs: &HashSet<String>) -> bool {
    for dep in &task.config.inputs {
        if dep.dep_type == DataDepType::Materialize {
            if !completed_outputs.contains(&dep.reference) {
                return false;
            }
        }
    }
    true
}
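
// Readiness sketch (hypothetical references): a task whose inputs include a
// Materialize dependency on "data/clean" stays Pending until some other task
// listing "data/clean" among its outputs has succeeded, at which point main()
// adds "data/clean" to completed_outputs and the task can be dispatched.
// Inputs with other dependency types are not gated here.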

fn log_status_summary(
    task_states: &HashMap<String, TaskState>,
    original_tasks_by_key: &HashMap<String, Arc<Task>>,
) {
    let mut pending_tasks = Vec::new();
    let mut running_tasks = Vec::new();
    let mut succeeded_tasks = Vec::new();
    let mut failed_tasks = Vec::new();

    for (key, state) in task_states {
        let label = original_tasks_by_key.get(key).map_or_else(|| key.as_str(), |t| t.job_label.as_str());
        match state {
            TaskState::Pending => pending_tasks.push(label),
            TaskState::Running => running_tasks.push(label),
            TaskState::Succeeded => succeeded_tasks.push(label),
            TaskState::Failed => failed_tasks.push(label),
        }
    }

    info!("Task Status Summary:");
    info!(" Pending ({}): {:?}", pending_tasks.len(), pending_tasks);
    info!(" Running ({}): {:?}", running_tasks.len(), running_tasks);
    info!(" Succeeded ({}): {:?}", succeeded_tasks.len(), succeeded_tasks);
    info!(" Failed ({}): {:?}", failed_tasks.len(), failed_tasks);
}


fn main() -> Result<(), Box<dyn std::error::Error>> {
    simple_logger::SimpleLogger::new().with_level(log::LevelFilter::Info).init()?;

    let mut buffer = String::new();
    std::io::stdin().read_to_string(&mut buffer)?;
    let graph: JobGraph = serde_json::from_str(&buffer)?;

    info!("Executing job graph with {} nodes", graph.nodes.len());

    let mut task_states: HashMap<String, TaskState> = HashMap::new();
    let mut original_tasks_by_key: HashMap<String, Arc<Task>> = HashMap::new();
    let graph_nodes_arc: Vec<Arc<Task>> = graph.nodes.into_iter().map(Arc::new).collect();


    for task_node in &graph_nodes_arc {
        let key = get_task_key(task_node);
        task_states.insert(key.clone(), TaskState::Pending);
        original_tasks_by_key.insert(key, task_node.clone());
    }

    let mut completed_outputs: HashSet<String> = HashSet::new();
    let mut job_results: Vec<TaskExecutionResult> = Vec::new();

    let (task_tx, task_rx): (Sender<Arc<Task>>, Receiver<Arc<Task>>) = crossbeam_channel::unbounded();
    let (result_tx, result_rx): (Sender<TaskExecutionResult>, Receiver<TaskExecutionResult>) = crossbeam_channel::unbounded();

    let mut worker_handles = Vec::new();
    for i in 0..NUM_WORKERS {
        let task_rx_clone = task_rx.clone();
        let result_tx_clone = result_tx.clone();
        worker_handles.push(thread::spawn(move || {
            worker(task_rx_clone, result_tx_clone, i + 1);
        }));
    }
    // Drop the original result_tx so the channel closes when all workers are done
    // if result_rx is the only remaining receiver.
    drop(result_tx);


    let mut last_log_time = Instant::now();
    let mut active_tasks_count = 0;
    let mut fail_fast_triggered = false;

    loop {
        // 1. Process results
        while let Ok(result) = result_rx.try_recv() {
            active_tasks_count -= 1;
            info!(
                "Received result for task {}: Success: {}",
                result.job_label, result.success
            );

            let current_state = if result.success {
                TaskState::Succeeded
            } else {
                TaskState::Failed
            };
            task_states.insert(result.task_key.clone(), current_state);

            if result.success {
                if let Some(original_task) = original_tasks_by_key.get(&result.task_key) {
                    for output_ref in &original_task.config.outputs {
                        completed_outputs.insert(output_ref.clone());
                    }
                }
            } else {
                if FAIL_FAST {
                    warn!("Fail-fast enabled and task {} failed. Shutting down.", result.job_label);
                    fail_fast_triggered = true;
                }
            }
            job_results.push(result);
        }

        // 2. Check for fail-fast break
        if fail_fast_triggered && active_tasks_count == 0 { // Wait for running tasks to finish if fail fast
            info!("All active tasks completed after fail-fast trigger.");
            break;
        }
        if fail_fast_triggered && active_tasks_count > 0 {
            // Don't schedule new tasks, just wait for active ones or log
        } else if !fail_fast_triggered { // Only dispatch if not in fail-fast shutdown
            // 3. Dispatch ready tasks
            for task_node in &graph_nodes_arc {
                let task_key = get_task_key(task_node);
                if task_states.get(&task_key) == Some(&TaskState::Pending) {
                    if is_task_ready(task_node, &completed_outputs) {
                        info!("Dispatching task: {}", task_node.job_label);
                        task_states.insert(task_key.clone(), TaskState::Running);
                        task_tx.send(task_node.clone())?;
                        active_tasks_count += 1;
                    }
                }
            }
        }


        // 4. Periodic logging
        if last_log_time.elapsed() >= LOG_INTERVAL {
            log_status_summary(&task_states, &original_tasks_by_key);
            last_log_time = Instant::now();
        }

        // 5. Check completion
        let all_done = task_states.values().all(|s| *s == TaskState::Succeeded || *s == TaskState::Failed);
        if active_tasks_count == 0 && all_done {
            info!("All tasks are in a terminal state and no tasks are active.");
            break;
        }

        // Avoid busy-waiting if no events, give channels time
        // Select would be better here, but for simplicity:
        thread::sleep(Duration::from_millis(50));
    }

    info!("Shutting down workers...");
    drop(task_tx); // Signal workers to stop by closing the task channel

    for handle in worker_handles {
        handle.join().expect("Failed to join worker thread");
    }
    info!("All workers finished.");

    // Final processing of any remaining results (should be minimal if loop logic is correct)
    while let Ok(result) = result_rx.try_recv() {
        active_tasks_count -= 1; // Should be 0
        info!(
            "Received late result for task {}: Success: {}",
            result.job_label, result.success
        );
        // Update state for completeness, though it might not affect overall outcome now
        let current_state = if result.success { TaskState::Succeeded } else { TaskState::Failed };
        task_states.insert(result.task_key.clone(), current_state);
        job_results.push(result);
    }


    let success_count = job_results.iter().filter(|r| r.success).count();
    let failure_count = job_results.len() - success_count;

    info!("Execution complete: {} succeeded, {} failed", success_count, failure_count);

    if failure_count > 0 || fail_fast_triggered {
        error!("Execution finished with errors.");
        std::process::exit(1);
    }

    Ok(())
}
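
// Usage sketch (an assumption for illustration, not part of this commit): the
// binary reads a JSON-serialized JobGraph on stdin, so an invocation along the
// lines of `bazel run //<package>:execute_rs < graph.json` is the expected shape,
// where graph.json might come from the neighboring `analyze` target. The exact
// package path and JSON field names are defined by //databuild:structs and are
// not shown in this diff.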