Reorganize repo

Stuart Axelbrooke 2025-05-07 17:37:29 -07:00
parent f5eeeeb3dd
commit f2567f7567
25 changed files with 113 additions and 125 deletions

@@ -1,24 +1,6 @@
- filegroup(
- name = "jq",
- srcs = ["//runtime:jq"],
- visibility = ["//visibility:public"],
- )
- filegroup(
- name = "proto",
- srcs = ["databuild.proto"],
- visibility = ["//visibility:public"],
- )
- # Expose Cargo.toml for crate_universe
- exports_files(
- ["Cargo.toml"],
- visibility = ["//visibility:public"],
- )
- # Create an empty Cargo.lock file that will be generated by Bazel
- exports_files(
- ["Cargo.lock"],
+ filegroup(
+ name = "jq",
+ srcs = ["//databuild/runtime:jq"],
visibility = ["//visibility:public"],
)

File diff suppressed because one or more lines are too long

@@ -37,14 +37,14 @@ In DataBuild, `Job`s are the atomic unit of data processing, representing the ma
Jobs are assumed to be idempotent and independent, such that two jobs configured to produce separate partitions can run without interaction. These assumptions allow jobs to state only their immediate upstream and output data dependencies (the partitions they consume and produce) and, within a graph, leave no ambiguity about what must be done to produce a desired partition.
- Jobs are implemented via the [`databuild_job`](./rules.bzl) Bazel rule. An extremely basic job definition can be found in the [basic_job example](./examples/basic_job/).
+ Jobs are implemented via the [`databuild_job`](databuild/rules.bzl) Bazel rule. An extremely basic job definition can be found in the [basic_job example](./examples/basic_job/).
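To make that concrete, a job target is only a few lines of BUILD file. The sketch below is illustrative: aside from `name`, the attribute names are assumptions for the example, not the rule's confirmed API (see the linked basic_job example for a real definition).

```starlark
# Hypothetical BUILD.bazel declaring one DataBuild job.
load("@databuild//databuild:rules.bzl", "databuild_job")

databuild_job(
    name = "generate_number",
    # Executable run with a JobConfig describing the partitions to produce;
    # `binary` is an assumed attribute name, used here for illustration.
    binary = ":generate_number_bin",
)
```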
## Graphs
A `Graph` is the composition of jobs and partitions via their data dependencies. Graphs answer "what partitions does a job require to produce its outputs?" and "what job must be run to produce a given partition?" Defining a graph requires only the list of involved jobs and a lookup executable that transforms desired partitions into the job(s) that produce them.
- Graphs expose two entrypoints: `graph.analyze`, which produces the literal `JobGraph` specifying the structure of the build graph to be executed to build a specific set of partitions (enabling visualization, planning, precondition checking, etc.); and `graph.build`, which runs the build process for a set of requested partitions (relying on `graph.analyze` to plan). Other entrypoints are described in the [graph README](./graph/README.md).
+ Graphs expose two entrypoints: `graph.analyze`, which produces the literal `JobGraph` specifying the structure of the build graph to be executed to build a specific set of partitions (enabling visualization, planning, precondition checking, etc.); and `graph.build`, which runs the build process for a set of requested partitions (relying on `graph.analyze` to plan). Other entrypoints are described in the [graph README](databuild/graph/README.md).
- Graphs are implemented via the [`databuild_graph`](./rules.bzl) Bazel rule. A basic graph definition can be found in the [basic_graph example](./examples/basic_graph/).
+ Graphs are implemented via the [`databuild_graph`](databuild/rules.bzl) Bazel rule. A basic graph definition can be found in the [basic_graph example](./examples/basic_graph/).
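Composing jobs into a graph then looks roughly like the sketch below; the `jobs` and `lookup` attribute names are inferred from the description above and should be treated as assumptions rather than confirmed API (the basic_graph example shows real usage).

```starlark
# Hypothetical BUILD.bazel wiring jobs into a DataBuild graph.
load("@databuild//databuild:rules.bzl", "databuild_graph")

databuild_graph(
    name = "basic_graph",
    # Jobs the graph may schedule (assumed attribute name).
    jobs = [":generate_number"],
    # Executable that maps requested partitions to the jobs that
    # produce them (assumed attribute name).
    lookup = ":partition_lookup",
)
```

The resulting target exposes runnable entrypoints such as `basic_graph.analyze` and `basic_graph.build`, which the example test script later in this diff exercises.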
### Implementing a Graph
To make a fully described graph, engineers must define:

databuild/BUILD.bazel Normal file

@@ -0,0 +1,20 @@
load("@rules_rust//rust:defs.bzl", "rust_library")
filegroup(
name = "proto",
srcs = ["databuild.proto"],
visibility = ["//visibility:public"],
)
rust_library(
name = "structs",
srcs = [
"structs.rs",
],
deps = [
"@crates//:serde",
],
edition = "2021",
visibility = ["//visibility:public"],
)

@@ -18,6 +18,7 @@ rust_binary(
srcs = ["analyze.rs"],
edition = "2021",
deps = [
"//databuild:structs",
"@crates//:serde",
"@crates//:serde_json",
"@crates//:log",

@@ -3,67 +3,9 @@ use std::env;
use std::process::{Command, exit};
use std::sync::{Arc, Mutex};
use std::thread;
- use serde::{Deserialize, Serialize};
- use serde_json::{self, json};
- use log::{info, error};
- use simple_logger::SimpleLogger;
- use std::str::FromStr;
- // Data structures that mirror the Go implementation
- #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
- #[serde(rename_all = "lowercase")]
- enum DataDepType {
- Query,
- Materialize,
- }
- impl FromStr for DataDepType {
- type Err = String;
- fn from_str(s: &str) -> Result<Self, Self::Err> {
- match s.to_lowercase().as_str() {
- "query" => Ok(DataDepType::Query),
- "materialize" => Ok(DataDepType::Materialize),
- _ => Err(format!("Unknown DataDepType: {}", s)),
- }
- }
- }
- #[derive(Debug, Clone, Serialize, Deserialize)]
- struct DataDep {
- #[serde(rename = "depType")]
- dep_type: DataDepType,
- #[serde(rename = "ref")]
- reference: String,
- }
- #[derive(Debug, Clone, Serialize, Deserialize)]
- struct JobConfig {
- inputs: Vec<DataDep>,
- outputs: Vec<String>,
- args: Vec<String>,
- env: HashMap<String, String>,
- }
- #[derive(Debug, Clone, Serialize, Deserialize)]
- struct Task {
- #[serde(rename = "jobLabel")]
- job_label: String,
- config: JobConfig,
- }
- #[derive(Debug, Serialize, Deserialize)]
- struct JobGraph {
- outputs: Vec<String>,
- nodes: Vec<Task>,
- }
- // Function to convert a job label to a configuration path
- fn job_label_to_cfg_path(job_label: &str) -> String {
- let without_prefix = job_label.replace("//", "");
- let with_slash = without_prefix.replace(":", "/");
- format!(".{}.cfg", with_slash)
- }
+ use structs::*;
// Configure a job to produce the desired outputs
fn configure(job_label: &str, output_refs: &[String]) -> Result<Vec<Task>, String> {

@@ -5,7 +5,7 @@ set -e
%{PREFIX}
- EXECUTABLE_BINARY="$(rlocation "databuild+/graph/$(basename "%{EXECUTABLE_PATH}")_")/execute"
+ EXECUTABLE_BINARY="$(rlocation "databuild+/databuild/graph/$(basename "%{EXECUTABLE_PATH}")_")/execute"
# Run the execution
exec "${EXECUTABLE_BINARY}" "$@"

@@ -7,7 +7,7 @@ set -e
# Locate the Rust binary using its standard runfiles path
# Assumes workspace name is 'databuild'
- EXECUTABLE_BINARY="$(rlocation "databuild/graph/analyze")"
+ EXECUTABLE_BINARY="$(rlocation "databuild/databuild/graph/analyze")"
# Run the analysis
exec "${EXECUTABLE_BINARY}" "$@"

@@ -1,5 +1,5 @@
sh_test(
name = "analyze_test",
srcs = ["analyze_test.sh"],
data = ["//graph:analyze"],
data = ["//databuild/graph:analyze"],
)

@@ -1,3 +1,3 @@
#!/usr/bin/env bash
- DATABUILD_MODE=import_test DATABUILD_JOB_LOOKUP_PATH=foo DATABUILD_CANDIDATE_JOBS=bar graph/analyze
+ DATABUILD_MODE=import_test DATABUILD_JOB_LOOKUP_PATH=foo DATABUILD_CANDIDATE_JOBS=bar databuild/graph/analyze

@@ -4,7 +4,7 @@ set -e
%{RUNFILES_PREFIX}
EXECUTE_BINARY="$(rlocation "_main/$(basename "%{EXECUTE_PATH}")")"
- JQ="$(rlocation "databuild+/runtime/$(basename "%{JQ_PATH}")")"
+ JQ="$(rlocation "databuild+/databuild/runtime/$(basename "%{JQ_PATH}")")"
# First argument should be the path to a config file
CONFIG_FILE=${1:-}

@@ -104,7 +104,7 @@ _databuild_job_cfg_rule = rule(
mandatory = True,
),
"_template": attr.label(
default = "@databuild//runtime:simple_executable_wrapper.sh.tpl",
default = "@databuild//databuild/runtime:simple_executable_wrapper.sh.tpl",
allow_single_file = True,
),
"_bash_runfiles": attr.label(
@@ -169,11 +169,11 @@ _databuild_job_exec_rule = rule(
cfg = "target",
),
"_template": attr.label(
default = "@databuild//job:execute_wrapper.sh.tpl",
default = "@databuild//databuild/job:execute_wrapper.sh.tpl",
allow_single_file = True,
),
"_jq": attr.label(
default = "@databuild//runtime:jq",
default = "@databuild//databuild/runtime:jq",
executable = True,
cfg = "target",
),
@@ -323,7 +323,7 @@ _databuild_graph_lookup = rule(
cfg = "target",
),
"_template": attr.label(
default = "@databuild//runtime:simple_executable_wrapper.sh.tpl",
default = "@databuild//databuild/runtime:simple_executable_wrapper.sh.tpl",
allow_single_file = True,
),
"_bash_runfiles": attr.label(
@@ -410,7 +410,7 @@ _databuild_graph_analyze = rule(
allow_empty = False,
),
"_template": attr.label(
default = "@databuild//graph:rust_analyze_wrapper.sh.tpl",
default = "@databuild//databuild/graph:rust_analyze_wrapper.sh.tpl",
allow_single_file = True,
),
"_bash_runfiles": attr.label(
@@ -418,7 +418,7 @@ _databuild_graph_analyze = rule(
allow_files = True,
),
"_analyze": attr.label(
default = "@databuild//graph:analyze",
default = "@databuild//databuild/graph:analyze",
executable = True,
cfg = "target",
),
@@ -501,7 +501,7 @@ _databuild_graph_mermaid = rule(
allow_empty = False,
),
"_template": attr.label(
default = "@databuild//graph:go_analyze_wrapper.sh.tpl",
default = "@databuild//databuild/graph:rust_analyze_wrapper.sh.tpl",
allow_single_file = True,
),
"_bash_runfiles": attr.label(
@@ -509,7 +509,7 @@ _databuild_graph_mermaid = rule(
allow_files = True,
),
"_analyze": attr.label(
default = "@databuild//graph:analyze",
default = "@databuild//databuild/graph:analyze",
executable = True,
cfg = "target",
),
@@ -564,7 +564,7 @@ _databuild_graph_exec = rule(
allow_empty = False,
),
"_template": attr.label(
default = "@databuild//graph:go_exec_wrapper.sh.tpl",
default = "@databuild//databuild/graph:go_exec_wrapper.sh.tpl",
allow_single_file = True,
),
"_bash_runfiles": attr.label(
@@ -572,7 +572,7 @@ _databuild_graph_exec = rule(
allow_files = True,
),
"_execute": attr.label(
default = "@databuild//graph:execute",
default = "@databuild//databuild/graph:execute",
executable = True,
cfg = "target",
)

databuild/structs.rs Normal file

@@ -0,0 +1,52 @@
use std::collections::HashMap;
use serde::{Deserialize, Serialize};
use std::str::FromStr;
// Data structures that mirror the Go implementation
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum DataDepType {
Query,
Materialize,
}
impl FromStr for DataDepType {
type Err = String;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s.to_lowercase().as_str() {
"query" => Ok(DataDepType::Query),
"materialize" => Ok(DataDepType::Materialize),
_ => Err(format!("Unknown DataDepType: {}", s)),
}
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DataDep {
#[serde(rename = "depType")]
pub dep_type: DataDepType,
#[serde(rename = "ref")]
pub reference: String,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct JobConfig {
pub inputs: Vec<DataDep>,
pub outputs: Vec<String>,
pub args: Vec<String>,
pub env: HashMap<String, String>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Task {
#[serde(rename = "jobLabel")]
pub job_label: String,
pub config: JobConfig,
}
#[derive(Debug, Serialize, Deserialize)]
pub struct JobGraph {
pub outputs: Vec<String>,
pub nodes: Vec<Task>,
}

@@ -1,4 +1,4 @@
load("@databuild//:rules.bzl", "databuild_graph", "databuild_job")
load("@databuild//databuild:rules.bzl", "databuild_graph", "databuild_job")
load("@rules_java//java:defs.bzl", "java_binary")
platform(

@@ -2,7 +2,9 @@
set -e
# Test the .exec rule
+ echo exec
basic_graph.exec < <(basic_graph.analyze /tmp/databuild_test/examples/basic_graph/generated_number/pippin_salem_sadie)
# Test the .build rule
- basic_graph.build /tmp/databuild_test/examples/basic_graph/generated_number/pippin_salem_sadie
+ echo build
+ basic_graph.build /tmp/databuild_test/examples/basic_graph/generated_number/pippin_salem_sadie

@@ -1,4 +1,4 @@
load("@databuild//:rules.bzl", "databuild_job")
load("@databuild//databuild:rules.bzl", "databuild_job")
databuild_job(
name = "test_job",

@@ -1,5 +1,5 @@
load("//:py_repl.bzl", "py_repl")
load("@databuild//:rules.bzl", "databuild_job")
load("@databuild//databuild:rules.bzl", "databuild_job")
load("@rules_python//python:pip.bzl", "compile_pip_requirements")
load("@pypi//:requirements.bzl", "requirement")

@@ -1,11 +0,0 @@
#!/bin/bash
set -e
%{RUNFILES_PREFIX}
%{PREFIX}
EXECUTABLE_BINARY="$(rlocation "databuild+/graph/$(basename "%{EXECUTABLE_PATH}")_")/analyze"
# Run the configuration
exec "${EXECUTABLE_BINARY}" "$@"