fix locking, and update multihop example
This commit is contained in:
parent
f7c196c9b3
commit
9a072ff74d
5 changed files with 120 additions and 109 deletions
|
|
@ -232,6 +232,39 @@ async fn cmd_serve(port: u16, database: &str, config_path: &str, config: &Databu
|
|||
// Initialize logging
|
||||
tracing_subscriber::fmt::init();
|
||||
|
||||
// Acquire and hold the server lock for the duration of the server
|
||||
let mut server_lock = ServerLock::new(&config.graph_label).unwrap_or_else(|e| {
|
||||
eprintln!("Failed to create server lock: {}", e);
|
||||
std::process::exit(1);
|
||||
});
|
||||
|
||||
// Try to acquire exclusive lock
|
||||
match server_lock.try_lock() {
|
||||
Ok(true) => {
|
||||
// Write our state
|
||||
let config_hash = ServerLock::hash_config(Path::new(config_path)).unwrap_or_default();
|
||||
let state = lib::server_lock::ServerLockState {
|
||||
pid: std::process::id(),
|
||||
port,
|
||||
started_at: ServerLock::now_millis(),
|
||||
config_hash,
|
||||
};
|
||||
if let Err(e) = server_lock.write_state(&state) {
|
||||
eprintln!("Failed to write server state: {}", e);
|
||||
}
|
||||
}
|
||||
Ok(false) => {
|
||||
// Another server is holding the lock - this shouldn't happen in daemon mode
|
||||
// but could happen if user manually runs serve while another server is running
|
||||
eprintln!("Another server is already running for graph '{}'. Use 'databuild stop' first.", config.graph_label);
|
||||
std::process::exit(1);
|
||||
}
|
||||
Err(e) => {
|
||||
eprintln!("Failed to acquire server lock: {}", e);
|
||||
std::process::exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
println!("Loaded configuration from: {}", config_path);
|
||||
println!(" Graph: {}", config.graph_label);
|
||||
println!(" Jobs: {}", config.jobs.len());
|
||||
|
|
|
|||
|
|
@ -78,9 +78,9 @@ impl DatabuildConfig {
|
|||
/// If `bel_uri` is not set, returns the default path `.databuild/${graph_label}/bel.sqlite`.
|
||||
/// Relative paths are not resolved here - that's the caller's responsibility.
|
||||
pub fn effective_bel_uri(&self) -> String {
|
||||
self.bel_uri.clone().unwrap_or_else(|| {
|
||||
format!(".databuild/{}/bel.sqlite", self.graph_label)
|
||||
})
|
||||
self.bel_uri
|
||||
.clone()
|
||||
.unwrap_or_else(|| format!(".databuild/{}/bel.sqlite", self.graph_label))
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -134,7 +134,10 @@ mod tests {
|
|||
assert_eq!(config.effective_bel_uri(), ".databuild/my_graph/bel.sqlite");
|
||||
|
||||
// Custom: uses provided value
|
||||
let config = DatabuildConfig::from_json(r#"{ "graph_label": "my_graph", "bel_uri": "postgresql://localhost/db" }"#).unwrap();
|
||||
let config = DatabuildConfig::from_json(
|
||||
r#"{ "graph_label": "my_graph", "bel_uri": "postgresql://localhost/db" }"#,
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(config.effective_bel_uri(), "postgresql://localhost/db");
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@
|
|||
//!
|
||||
//! Implements the classic double-fork pattern to create a proper Unix daemon.
|
||||
|
||||
use crate::server_lock::{ServerLock, ServerLockState};
|
||||
use crate::server_lock::ServerLock;
|
||||
use crate::util::DatabuildError;
|
||||
use std::fs::OpenOptions;
|
||||
use std::path::Path;
|
||||
|
|
@ -108,68 +108,51 @@ pub fn ensure_server_running(
|
|||
graph_label: &str,
|
||||
config_hash: &str,
|
||||
) -> Result<DaemonizeResult, DatabuildError> {
|
||||
let mut lock = ServerLock::new(graph_label)?;
|
||||
let lock = ServerLock::new(graph_label)?;
|
||||
|
||||
// Try to acquire the lock
|
||||
if lock.try_lock()? {
|
||||
// We got the lock, so no server is running
|
||||
// Find an available port
|
||||
let port = find_available_port(3538)?;
|
||||
|
||||
// Write initial state (we'll update after server starts)
|
||||
let state = ServerLockState {
|
||||
pid: std::process::id(),
|
||||
port,
|
||||
started_at: ServerLock::now_millis(),
|
||||
config_hash: config_hash.to_string(),
|
||||
};
|
||||
lock.write_state(&state)?;
|
||||
|
||||
// Spawn the daemon
|
||||
let log_path = lock.log_path();
|
||||
let mut child = spawn_daemon(config_path, port, &log_path)?;
|
||||
|
||||
// Update state with actual PID
|
||||
let pid = child.id();
|
||||
let state = ServerLockState {
|
||||
pid,
|
||||
port,
|
||||
started_at: ServerLock::now_millis(),
|
||||
config_hash: config_hash.to_string(),
|
||||
};
|
||||
lock.write_state(&state)?;
|
||||
|
||||
// Release the lock so the daemon can grab it
|
||||
drop(lock);
|
||||
|
||||
// Wait for server to become healthy
|
||||
wait_for_health(port, 10000)?;
|
||||
|
||||
Ok(DaemonizeResult::Started { port })
|
||||
} else {
|
||||
// Server is already running, read the port
|
||||
let state = lock
|
||||
.read_state()?
|
||||
.ok_or_else(|| DatabuildError::from("Lock held but no state found"))?;
|
||||
|
||||
// Verify server is actually healthy
|
||||
if !health_check(state.port) {
|
||||
// First, check if there's already a running server by reading existing state
|
||||
if let Some(state) = lock.read_state()? {
|
||||
// Check if that process is still running
|
||||
if ServerLock::is_process_running(state.pid) {
|
||||
// Verify server is actually healthy
|
||||
if health_check(state.port) {
|
||||
// Check if config has changed
|
||||
if state.config_hash != config_hash {
|
||||
eprintln!(
|
||||
"Warning: Config has changed since server started.\n\
|
||||
Run 'databuild stop && databuild serve' to apply changes."
|
||||
);
|
||||
}
|
||||
return Ok(DaemonizeResult::AlreadyRunning { port: state.port });
|
||||
}
|
||||
// Process exists but not healthy - might still be starting up
|
||||
// Wait a bit and check again
|
||||
if wait_for_health(state.port, 5000).is_ok() {
|
||||
return Ok(DaemonizeResult::AlreadyRunning { port: state.port });
|
||||
}
|
||||
// Still unhealthy, will need to be stopped manually
|
||||
return Err(DatabuildError::from(format!(
|
||||
"Server at port {} appears unhealthy. Try 'databuild stop' and retry.",
|
||||
state.port
|
||||
)));
|
||||
} else {
|
||||
// Stale lock file - process is gone, clean up
|
||||
lock.remove_stale_lock()?;
|
||||
}
|
||||
|
||||
// Check if config has changed
|
||||
if state.config_hash != config_hash {
|
||||
eprintln!(
|
||||
"Warning: Config has changed since server started.\n\
|
||||
Run 'databuild stop && databuild serve' to apply changes."
|
||||
);
|
||||
}
|
||||
|
||||
Ok(DaemonizeResult::AlreadyRunning { port: state.port })
|
||||
}
|
||||
|
||||
// No server running - start one
|
||||
// Find an available port
|
||||
let port = find_available_port(3538)?;
|
||||
|
||||
// Spawn the daemon - it will acquire its own lock
|
||||
let log_path = lock.log_path();
|
||||
let _child = spawn_daemon(config_path, port, &log_path)?;
|
||||
|
||||
// Wait for server to become healthy (which implies it has acquired the lock)
|
||||
wait_for_health(port, 10000)?;
|
||||
|
||||
Ok(DaemonizeResult::Started { port })
|
||||
}
|
||||
|
||||
/// Stop a running server.
|
||||
|
|
|
|||
|
|
@ -45,8 +45,9 @@ impl ServerLock {
|
|||
/// Creates the .databuild/${graph_label}/ directory if it doesn't exist.
|
||||
pub fn new_in_dir(base_dir: &Path, graph_label: &str) -> Result<Self, DatabuildError> {
|
||||
let graph_dir = base_dir.join(".databuild").join(graph_label);
|
||||
fs::create_dir_all(&graph_dir)
|
||||
.map_err(|e| DatabuildError::from(format!("Failed to create graph directory: {}", e)))?;
|
||||
fs::create_dir_all(&graph_dir).map_err(|e| {
|
||||
DatabuildError::from(format!("Failed to create graph directory: {}", e))
|
||||
})?;
|
||||
|
||||
let lock_path = graph_dir.join("server.lock");
|
||||
|
||||
|
|
@ -88,7 +89,10 @@ impl ServerLock {
|
|||
// Lock is held by another process
|
||||
Ok(false)
|
||||
}
|
||||
Err(e) => Err(DatabuildError::from(format!("Failed to acquire lock: {}", e))),
|
||||
Err(e) => Err(DatabuildError::from(format!(
|
||||
"Failed to acquire lock: {}",
|
||||
e
|
||||
))),
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -151,9 +155,7 @@ impl ServerLock {
|
|||
#[cfg(unix)]
|
||||
{
|
||||
use std::os::unix::process::CommandExt;
|
||||
unsafe {
|
||||
libc::kill(pid as i32, 0) == 0
|
||||
}
|
||||
unsafe { libc::kill(pid as i32, 0) == 0 }
|
||||
}
|
||||
#[cfg(not(unix))]
|
||||
{
|
||||
|
|
@ -278,7 +280,10 @@ mod tests {
|
|||
let temp = tempdir().unwrap();
|
||||
let lock = ServerLock::new_in_dir(temp.path(), "nonexistent_graph").unwrap();
|
||||
let state = lock.read_state().unwrap();
|
||||
assert!(state.is_none(), "Reading nonexistent lock file should return None");
|
||||
assert!(
|
||||
state.is_none(),
|
||||
"Reading nonexistent lock file should return None"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
@ -295,6 +300,9 @@ mod tests {
|
|||
.unwrap()
|
||||
.as_millis() as u64;
|
||||
|
||||
assert!(now >= before && now <= after, "now_millis should be between before and after");
|
||||
assert!(
|
||||
now >= before && now <= after,
|
||||
"now_millis should be between before and after"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -4,71 +4,55 @@ set -euo pipefail
|
|||
# Navigate to repository root
|
||||
cd "$(dirname "$0")/.."
|
||||
|
||||
# Configuration
|
||||
PORT=3050
|
||||
DB_PATH="/tmp/databuild_multihop.db"
|
||||
CONFIG="examples/multihop/config.json"
|
||||
FLAG_FILE="/tmp/databuild_multihop_alpha_complete"
|
||||
PID_FILE="/tmp/databuild_multihop.pid"
|
||||
|
||||
echo "=== DataBuild Multi-Hop Example ==="
|
||||
echo
|
||||
|
||||
# Clean up previous state
|
||||
echo "Cleaning up previous state..."
|
||||
rm -f "$DB_PATH" "$FLAG_FILE" "$PID_FILE"
|
||||
pkill -f "databuild.*serve.*port $PORT" || true
|
||||
sleep 1
|
||||
rm -f "$FLAG_FILE"
|
||||
rm -rf .databuild/multihop/
|
||||
./bazel-bin/databuild/databuild --config "$CONFIG" stop 2>/dev/null || true
|
||||
|
||||
# Build the binary
|
||||
echo "Building databuild CLI..."
|
||||
bazel build //databuild:databuild
|
||||
|
||||
# Start the server in background
|
||||
echo "Starting databuild server on port $PORT..."
|
||||
./bazel-bin/databuild/databuild serve \
|
||||
--port $PORT \
|
||||
--database "$DB_PATH" \
|
||||
--config examples/multihop/config.json &
|
||||
echo
|
||||
echo "=== Starting server ==="
|
||||
echo
|
||||
|
||||
SERVER_PID=$!
|
||||
echo $SERVER_PID > "$PID_FILE"
|
||||
echo "Server started with PID $SERVER_PID"
|
||||
# Start the server by making a request (triggers auto-start)
|
||||
OUTPUT=$(./bazel-bin/databuild/databuild --config "$CONFIG" wants list 2>&1)
|
||||
|
||||
# Wait for server to be ready
|
||||
echo "Waiting for server to start..."
|
||||
sleep 2
|
||||
|
||||
# Test server health
|
||||
if curl -s http://localhost:$PORT/health > /dev/null 2>&1; then
|
||||
echo "Server is ready!"
|
||||
else
|
||||
echo "WARNING: Server health check failed, but continuing..."
|
||||
fi
|
||||
# Extract port from status
|
||||
PORT=$(./bazel-bin/databuild/databuild --config "$CONFIG" status 2>&1 | grep "Port:" | awk '{print $2}')
|
||||
|
||||
echo
|
||||
echo "=== Server is running ==="
|
||||
echo "Server running at: http://127.0.0.1:${PORT}"
|
||||
echo
|
||||
echo "You can now interact with the server:"
|
||||
echo "=== Ready to run example ==="
|
||||
echo
|
||||
echo "Try the following commands:"
|
||||
echo
|
||||
echo " # Check server status"
|
||||
echo " ./bazel-bin/databuild/databuild --config $CONFIG status"
|
||||
echo
|
||||
echo " # Create a want for data/beta (triggers dependency chain)"
|
||||
echo " ./bazel-bin/databuild/databuild --server http://localhost:$PORT want data/beta"
|
||||
echo " ./bazel-bin/databuild/databuild --config $CONFIG want data/beta"
|
||||
echo
|
||||
echo " # Monitor wants"
|
||||
echo " ./bazel-bin/databuild/databuild --server http://localhost:$PORT wants list"
|
||||
echo " ./bazel-bin/databuild/databuild --config $CONFIG wants list"
|
||||
echo
|
||||
echo " # Monitor job runs"
|
||||
echo " ./bazel-bin/databuild/databuild --server http://localhost:$PORT job-runs list"
|
||||
echo " ./bazel-bin/databuild/databuild --config $CONFIG job-runs list"
|
||||
echo
|
||||
echo " # Monitor partitions"
|
||||
echo " ./bazel-bin/databuild/databuild --server http://localhost:$PORT partitions list"
|
||||
echo " ./bazel-bin/databuild/databuild --config $CONFIG partitions list"
|
||||
echo
|
||||
echo "To stop the server:"
|
||||
echo " kill $SERVER_PID"
|
||||
echo " # or: pkill -f 'databuild.*serve.*port $PORT'"
|
||||
echo " # Stop the server when done"
|
||||
echo " ./bazel-bin/databuild/databuild --config $CONFIG stop"
|
||||
echo
|
||||
echo "Server logs will appear below. Press Ctrl+C to stop."
|
||||
echo "=========================================="
|
||||
echo
|
||||
|
||||
# Wait for the server process
|
||||
wait $SERVER_PID
|
||||
echo "==========================================="
|
||||
|
|
|
|||
Loading…
Reference in a new issue