Delta backend: compaction and vacuum

Stuart Axelbrooke 2025-08-06 16:17:50 -07:00
parent 5de1f25587
commit 57ad5c41a5
5 changed files with 3994 additions and 138 deletions


@@ -136,6 +136,7 @@ crate.spec(
    version = "0.30",
)
crate.spec(
    features = ["datafusion"],
    package = "deltalake",
    version = "0.27",
)
@@ -143,6 +144,10 @@ crate.spec(
    package = "parquet",
    version = "55.2",
)
crate.spec(
    package = "chrono",
    version = "0.4",
)
crate.from_specs()
use_repo(crate, "crates")

File diff suppressed because one or more lines are too long


@@ -56,6 +56,7 @@ rust_library(
        "@crates//:aide",
        "@crates//:axum",
        "@crates//:axum-jsonschema",
        "@crates//:chrono",
        "@crates//:deltalake",
        "@crates//:log",
        "@crates//:parquet",


@@ -1,6 +1,8 @@
use super::*;
use async_trait::async_trait;
use deltalake::{DeltaTableBuilder, DeltaOps, open_table, writer::RecordBatchWriter, writer::DeltaWriter};
use deltalake::{DeltaTableBuilder, DeltaOps, open_table, writer::RecordBatchWriter, writer::DeltaWriter,
    operations::optimize::OptimizeBuilder, operations::vacuum::VacuumBuilder};
use chrono::Duration;
use deltalake::arrow::array::{Array, RecordBatch, StringArray, Int64Array};
use deltalake::arrow::datatypes::{DataType, Field, Schema as ArrowSchema};
use deltalake::kernel::{StructField, DataType as DeltaDataType};
@@ -375,6 +377,91 @@ impl DeltaBuildEventLog {
).map_err(|e| BuildEventLogError::SerializationError(format!("Failed to create RecordBatch: {}", e)))
}
    /// Check if compaction should be triggered based on file count, and run it in the background
    async fn maybe_compact_on_file_count(&self) {
        match self.should_compact().await {
            Ok(true) => {
                // Spawn a background compaction task to avoid blocking writes
                let table_path = self.table_path.clone();
                tokio::spawn(async move {
                    if let Err(e) = Self::run_compaction(&table_path).await {
                        log::warn!("Background compaction failed for {}: {}", table_path, e);
                    } else {
                        log::info!("Background compaction completed for {}", table_path);
                    }
                });
            }
            Ok(false) => {
                // No compaction needed
            }
            Err(e) => {
                log::warn!("Failed to check compaction status: {}", e);
            }
        }
    }
    /// Check if the table should be compacted based on a file count threshold
    async fn should_compact(&self) -> Result<bool> {
        // Configurable threshold - default to 50 files
        let threshold = std::env::var("DATABUILD_DELTA_COMPACT_THRESHOLD")
            .unwrap_or_else(|_| "50".to_string())
            .parse::<usize>()
            .unwrap_or(50);
        // Try to open the table to check file count
        match open_table(&self.table_path).await {
            Ok(table) => {
                let file_uris: Vec<String> = table.get_file_uris()
                    .map_err(|e| BuildEventLogError::DatabaseError(format!("Failed to get file URIs: {}", e)))?
                    .collect();
                let file_count = file_uris.len();
                log::debug!("Delta table {} has {} files (threshold: {})", self.table_path, file_count, threshold);
                Ok(file_count > threshold)
            }
            Err(e) => {
                log::debug!("Could not check file count for compaction: {}", e);
                Ok(false) // Don't compact if we can't check
            }
        }
    }
    /// Run compaction on the table using Delta's native optimize + vacuum operations
    async fn run_compaction(table_path: &str) -> Result<()> {
        let table = open_table(table_path).await
            .map_err(|e| BuildEventLogError::DatabaseError(format!("Failed to open table for compaction: {}", e)))?;
        // Step 1: Optimize (merge small files into larger ones)
        let table_state = table.state.clone()
            .ok_or_else(|| BuildEventLogError::DatabaseError("Table state is None".to_string()))?;
        let (table_after_optimize, optimize_metrics) = OptimizeBuilder::new(table.log_store(), table_state)
            .await
            .map_err(|e| BuildEventLogError::DatabaseError(format!("Failed to run optimization: {}", e)))?;
        log::info!("Optimize completed for {}: {:?}", table_path, optimize_metrics);
        // `table` still holds the pre-optimize snapshot, so this reports the file count before compaction
        let files_before: Vec<String> = table.get_file_uris()
            .map_err(|e| BuildEventLogError::DatabaseError(format!("Failed to count files: {}", e)))?
            .collect();
        log::info!("Files before compaction: {}", files_before.len());
        // Step 2: Vacuum with zero retention to delete the files left behind by optimize right away.
        // Retention enforcement must be disabled, otherwise vacuum rejects retention periods shorter
        // than the table's configured minimum (one week by default).
        let table_state_after_optimize = table_after_optimize.state.clone()
            .ok_or_else(|| BuildEventLogError::DatabaseError("Table state after optimize is None".to_string()))?;
        let (final_table, vacuum_metrics) = VacuumBuilder::new(table_after_optimize.log_store(), table_state_after_optimize)
            .with_retention_period(Duration::zero())
            .with_enforce_retention_duration(false)
            .await
            .map_err(|e| BuildEventLogError::DatabaseError(format!("Failed to run vacuum: {}", e)))?;
        let files_after: Vec<String> = final_table.get_file_uris()
            .map_err(|e| BuildEventLogError::DatabaseError(format!("Failed to count files: {}", e)))?
            .collect();
        log::info!("Files after compaction: {}", files_after.len());
        log::info!("Compaction completed for {}: optimize_metrics={:?}, vacuum_metrics={:?}",
            table_path, optimize_metrics, vacuum_metrics);
        Ok(())
    }
}
#[async_trait]
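Aside, not part of the diff: the compaction trigger above is tuned through the DATABUILD_DELTA_COMPACT_THRESHOLD environment variable read in should_compact. A minimal standalone sketch of how that lookup resolves (the helper name and main function are hypothetical):

fn compact_threshold() -> usize {
    // Mirrors the parsing chain in should_compact: an unset or unparsable
    // value falls back to the default of 50 files.
    std::env::var("DATABUILD_DELTA_COMPACT_THRESHOLD")
        .unwrap_or_else(|_| "50".to_string())
        .parse::<usize>()
        .unwrap_or(50)
}

fn main() {
    // e.g. launch the backend with: DATABUILD_DELTA_COMPACT_THRESHOLD=100 <binary>
    println!("compaction threshold: {} files", compact_threshold());
}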
@@ -410,6 +497,9 @@ impl BuildEventLog for DeltaBuildEventLog {
        writer.flush_and_commit(&mut table).await
            .map_err(|e| BuildEventLogError::DatabaseError(format!("Failed to commit: {}", e)))?;
        // Check whether compaction is due; the compaction itself runs in a spawned background task
        self.maybe_compact_on_file_count().await;
        Ok(())
}
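
Aside, not part of the diff: the same optimize + vacuum sequence used by run_compaction can also be driven by hand, for example from a one-off maintenance binary. A minimal sketch against the deltalake 0.27 API; the table path is hypothetical, and with_enforce_retention_duration(false) is assumed to be needed because vacuum normally rejects retention periods below the table's configured minimum:

use chrono::Duration;
use deltalake::open_table;
use deltalake::operations::optimize::OptimizeBuilder;
use deltalake::operations::vacuum::VacuumBuilder;

#[tokio::main]
async fn main() -> Result<(), deltalake::DeltaTableError> {
    // Hypothetical local table path; in the backend this is self.table_path.
    let table = open_table("./data/build_events").await?;
    println!("files before: {}", table.get_file_uris()?.count());

    // Step 1: merge small files into larger ones.
    let state = table.state.clone().expect("table has no state");
    let (table, metrics) = OptimizeBuilder::new(table.log_store(), state).await?;
    println!("optimize metrics: {:?}", metrics);

    // Step 2: delete the now-unreferenced small files immediately. Retention
    // enforcement is disabled so the zero retention period is accepted.
    let state = table.state.clone().expect("table has no state");
    let (table, vacuum_metrics) = VacuumBuilder::new(table.log_store(), state)
        .with_retention_period(Duration::zero())
        .with_enforce_retention_duration(false)
        .await?;
    println!("vacuum metrics: {:?}", vacuum_metrics);
    println!("files after: {}", table.get_file_uris()?.count());
    Ok(())
}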

File diff suppressed because one or more lines are too long