Reorganize repo

Stuart Axelbrooke 2025-05-07 17:37:29 -07:00
parent f5eeeeb3dd
commit f2567f7567
25 changed files with 113 additions and 125 deletions

@@ -1,24 +1,6 @@
- filegroup(
- name = "jq",
- srcs = ["//runtime:jq"],
- visibility = ["//visibility:public"],
- )
- filegroup(
- name = "proto",
- srcs = ["databuild.proto"],
- visibility = ["//visibility:public"],
- )
- # Expose Cargo.toml for crate_universe
- exports_files(
- ["Cargo.toml"],
- visibility = ["//visibility:public"],
- )
- # Create an empty Cargo.lock file that will be generated by Bazel
- exports_files(
- ["Cargo.lock"],
+ filegroup(
+ name = "jq",
+ srcs = ["//databuild/runtime:jq"],
visibility = ["//visibility:public"],
)

File diff suppressed because one or more lines are too long

@@ -37,14 +37,14 @@ In DataBuild, `Job`s are the atomic unit of data processing, representing the ma
Jobs are assumed to be idempotent and independent, such that two jobs configured to produce separate partitions can run without interaction. These assumptions allow jobs to state only their immediate upstream and output data dependencies (the partitions they consume and produce) and, within a graph, leave no ambiguity about what must be done to produce a desired partition.
- Jobs are implemented via the [`databuild_job`](./rules.bzl) Bazel rule. An extremely basic job definition can be found in the [basic_job example](./examples/basic_job/).
+ Jobs are implemented via the [`databuild_job`](databuild/rules.bzl) Bazel rule. An extremely basic job definition can be found in the [basic_job example](./examples/basic_job/).
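To make that concrete, a job target is only a few lines of BUILD file. The sketch below is illustrative: aside from `name`, the attribute names are assumptions for the example, not the rule's confirmed API (see the linked basic_job example for a real definition).

```starlark
# Hypothetical BUILD.bazel declaring one DataBuild job.
load("@databuild//databuild:rules.bzl", "databuild_job")

databuild_job(
    name = "generate_number",
    # Executable run with a JobConfig describing the partitions to produce;
    # `binary` is an assumed attribute name, used here for illustration.
    binary = ":generate_number_bin",
)
```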
## Graphs
A `Graph` is the composition of jobs and partitions via their data dependencies. Graphs answer "what partitions does a job require to produce its outputs?" and "what job must be run to produce a given partition?" Defining a graph requires only the list of involved jobs and a lookup executable that transforms desired partitions into the job(s) that produce them.
- Graphs expose two entrypoints: `graph.analyze`, which produces the literal `JobGraph` specifying the structure of the build graph to be executed to build a specific set of partitions (enabling visualization, planning, precondition checking, etc.); and `graph.build`, which runs the build process for a set of requested partitions (relying on `graph.analyze` to plan). Other entrypoints are described in the [graph README](./graph/README.md).
+ Graphs expose two entrypoints: `graph.analyze`, which produces the literal `JobGraph` specifying the structure of the build graph to be executed to build a specific set of partitions (enabling visualization, planning, precondition checking, etc.); and `graph.build`, which runs the build process for a set of requested partitions (relying on `graph.analyze` to plan). Other entrypoints are described in the [graph README](databuild/graph/README.md).
- Graphs are implemented via the [`databuild_graph`](./rules.bzl) Bazel rule. A basic graph definition can be found in the [basic_graph example](./examples/basic_graph/).
+ Graphs are implemented via the [`databuild_graph`](databuild/rules.bzl) Bazel rule. A basic graph definition can be found in the [basic_graph example](./examples/basic_graph/).
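Composing jobs into a graph then looks roughly like the sketch below; the `jobs` and `lookup` attribute names are inferred from the description above and should be treated as assumptions rather than confirmed API (the basic_graph example shows real usage).

```starlark
# Hypothetical BUILD.bazel wiring jobs into a DataBuild graph.
load("@databuild//databuild:rules.bzl", "databuild_graph")

databuild_graph(
    name = "basic_graph",
    # Jobs the graph may schedule (assumed attribute name).
    jobs = [":generate_number"],
    # Executable that maps requested partitions to the jobs that
    # produce them (assumed attribute name).
    lookup = ":partition_lookup",
)
```

The resulting target exposes runnable entrypoints such as `basic_graph.analyze` and `basic_graph.build`, which the example test script later in this diff exercises.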
### Implementing a Graph
To make a fully described graph, engineers must define:

databuild/BUILD.bazel Normal file

@@ -0,0 +1,20 @@
load("@rules_rust//rust:defs.bzl", "rust_library")
filegroup(
name = "proto",
srcs = ["databuild.proto"],
visibility = ["//visibility:public"],
)
rust_library(
name = "structs",
srcs = [
"structs.rs",
],
deps = [
"@crates//:serde",
],
edition = "2021",
visibility = ["//visibility:public"],
)

@@ -18,6 +18,7 @@ rust_binary(
srcs = ["analyze.rs"],
edition = "2021",
deps = [
"//databuild:structs",
"@crates//:serde",
"@crates//:serde_json",
"@crates//:log",

@@ -3,67 +3,9 @@ use std::env;
use std::process::{Command, exit};
use std::sync::{Arc, Mutex};
use std::thread;
- use serde::{Deserialize, Serialize};
- use serde_json::{self, json};
- use log::{info, error};
- use simple_logger::SimpleLogger;
- use std::str::FromStr;
- // Data structures that mirror the Go implementation
- #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
- #[serde(rename_all = "lowercase")]
- enum DataDepType {
- Query,
- Materialize,
- }
- impl FromStr for DataDepType {
- type Err = String;
- fn from_str(s: &str) -> Result<Self, Self::Err> {
- match s.to_lowercase().as_str() {
- "query" => Ok(DataDepType::Query),
- "materialize" => Ok(DataDepType::Materialize),
- _ => Err(format!("Unknown DataDepType: {}", s)),
- }
- }
- }
- #[derive(Debug, Clone, Serialize, Deserialize)]
- struct DataDep {
- #[serde(rename = "depType")]
- dep_type: DataDepType,
- #[serde(rename = "ref")]
- reference: String,
- }
- #[derive(Debug, Clone, Serialize, Deserialize)]
- struct JobConfig {
- inputs: Vec<DataDep>,
- outputs: Vec<String>,
- args: Vec<String>,
- env: HashMap<String, String>,
- }
- #[derive(Debug, Clone, Serialize, Deserialize)]
- struct Task {
- #[serde(rename = "jobLabel")]
- job_label: String,
- config: JobConfig,
- }
- #[derive(Debug, Serialize, Deserialize)]
- struct JobGraph {
- outputs: Vec<String>,
- nodes: Vec<Task>,
- }
- // Function to convert a job label to a configuration path
- fn job_label_to_cfg_path(job_label: &str) -> String {
- let without_prefix = job_label.replace("//", "");
- let with_slash = without_prefix.replace(":", "/");
- format!(".{}.cfg", with_slash)
- }
+ use structs::*;
// Configure a job to produce the desired outputs
fn configure(job_label: &str, output_refs: &[String]) -> Result<Vec<Task>, String> {

@@ -5,7 +5,7 @@ set -e
%{PREFIX}
- EXECUTABLE_BINARY="$(rlocation "databuild+/graph/$(basename "%{EXECUTABLE_PATH}")_")/execute"
+ EXECUTABLE_BINARY="$(rlocation "databuild+/databuild/graph/$(basename "%{EXECUTABLE_PATH}")_")/execute"
# Run the execution
exec "${EXECUTABLE_BINARY}" "$@"

@@ -7,7 +7,7 @@ set -e
# Locate the Rust binary using its standard runfiles path
# Assumes workspace name is 'databuild'
- EXECUTABLE_BINARY="$(rlocation "databuild/graph/analyze")"
+ EXECUTABLE_BINARY="$(rlocation "databuild/databuild/graph/analyze")"
# Run the analysis
exec "${EXECUTABLE_BINARY}" "$@"

@@ -1,5 +1,5 @@
sh_test(
name = "analyze_test",
srcs = ["analyze_test.sh"],
data = ["//graph:analyze"],
data = ["//databuild/graph:analyze"],
)

@@ -1,3 +1,3 @@
#!/usr/bin/env bash
- DATABUILD_MODE=import_test DATABUILD_JOB_LOOKUP_PATH=foo DATABUILD_CANDIDATE_JOBS=bar graph/analyze
+ DATABUILD_MODE=import_test DATABUILD_JOB_LOOKUP_PATH=foo DATABUILD_CANDIDATE_JOBS=bar databuild/graph/analyze

@@ -4,7 +4,7 @@ set -e
%{RUNFILES_PREFIX}
EXECUTE_BINARY="$(rlocation "_main/$(basename "%{EXECUTE_PATH}")")"
- JQ="$(rlocation "databuild+/runtime/$(basename "%{JQ_PATH}")")"
+ JQ="$(rlocation "databuild+/databuild/runtime/$(basename "%{JQ_PATH}")")"
# First argument should be the path to a config file
CONFIG_FILE=${1:-}

@@ -104,7 +104,7 @@ _databuild_job_cfg_rule = rule(
mandatory = True,
),
"_template": attr.label(
default = "@databuild//runtime:simple_executable_wrapper.sh.tpl",
default = "@databuild//databuild/runtime:simple_executable_wrapper.sh.tpl",
allow_single_file = True,
),
"_bash_runfiles": attr.label(
@@ -169,11 +169,11 @@ _databuild_job_exec_rule = rule(
cfg = "target",
),
"_template": attr.label(
default = "@databuild//job:execute_wrapper.sh.tpl",
default = "@databuild//databuild/job:execute_wrapper.sh.tpl",
allow_single_file = True,
),
"_jq": attr.label(
default = "@databuild//runtime:jq",
default = "@databuild//databuild/runtime:jq",
executable = True,
cfg = "target",
),
@@ -323,7 +323,7 @@ _databuild_graph_lookup = rule(
cfg = "target",
),
"_template": attr.label(
default = "@databuild//runtime:simple_executable_wrapper.sh.tpl",
default = "@databuild//databuild/runtime:simple_executable_wrapper.sh.tpl",
allow_single_file = True,
),
"_bash_runfiles": attr.label(
@@ -410,7 +410,7 @@ _databuild_graph_analyze = rule(
allow_empty = False,
),
"_template": attr.label(
default = "@databuild//graph:rust_analyze_wrapper.sh.tpl",
default = "@databuild//databuild/graph:rust_analyze_wrapper.sh.tpl",
allow_single_file = True,
),
"_bash_runfiles": attr.label(
@@ -418,7 +418,7 @@ _databuild_graph_analyze = rule(
allow_files = True,
),
"_analyze": attr.label(
default = "@databuild//graph:analyze",
default = "@databuild//databuild/graph:analyze",
executable = True,
cfg = "target",
),
@@ -501,7 +501,7 @@ _databuild_graph_mermaid = rule(
allow_empty = False,
),
"_template": attr.label(
default = "@databuild//graph:go_analyze_wrapper.sh.tpl",
default = "@databuild//databuild/graph:rust_analyze_wrapper.sh.tpl",
allow_single_file = True,
),
"_bash_runfiles": attr.label(
@@ -509,7 +509,7 @@ _databuild_graph_mermaid = rule(
allow_files = True,
),
"_analyze": attr.label(
default = "@databuild//graph:analyze",
default = "@databuild//databuild/graph:analyze",
executable = True,
cfg = "target",
),
@@ -564,7 +564,7 @@ _databuild_graph_exec = rule(
allow_empty = False,
),
"_template": attr.label(
default = "@databuild//graph:go_exec_wrapper.sh.tpl",
default = "@databuild//databuild/graph:go_exec_wrapper.sh.tpl",
allow_single_file = True,
),
"_bash_runfiles": attr.label(
@@ -572,7 +572,7 @@ _databuild_graph_exec = rule(
allow_files = True,
),
"_execute": attr.label(
default = "@databuild//graph:execute",
default = "@databuild//databuild/graph:execute",
executable = True,
cfg = "target",
)

databuild/structs.rs Normal file

@@ -0,0 +1,52 @@
use std::collections::HashMap;
use serde::{Deserialize, Serialize};
use std::str::FromStr;
// Data structures that mirror the Go implementation
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum DataDepType {
Query,
Materialize,
}
impl FromStr for DataDepType {
type Err = String;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s.to_lowercase().as_str() {
"query" => Ok(DataDepType::Query),
"materialize" => Ok(DataDepType::Materialize),
_ => Err(format!("Unknown DataDepType: {}", s)),
}
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DataDep {
#[serde(rename = "depType")]
pub dep_type: DataDepType,
#[serde(rename = "ref")]
pub reference: String,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct JobConfig {
pub inputs: Vec<DataDep>,
pub outputs: Vec<String>,
pub args: Vec<String>,
pub env: HashMap<String, String>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Task {
#[serde(rename = "jobLabel")]
pub job_label: String,
pub config: JobConfig,
}
#[derive(Debug, Serialize, Deserialize)]
pub struct JobGraph {
pub outputs: Vec<String>,
pub nodes: Vec<Task>,
}

@@ -1,4 +1,4 @@
load("@databuild//:rules.bzl", "databuild_graph", "databuild_job")
load("@databuild//databuild:rules.bzl", "databuild_graph", "databuild_job")
load("@rules_java//java:defs.bzl", "java_binary")
platform(

@@ -2,7 +2,9 @@
set -e
# Test the .exec rule
+ echo exec
basic_graph.exec < <(basic_graph.analyze /tmp/databuild_test/examples/basic_graph/generated_number/pippin_salem_sadie)
# Test the .build rule
- basic_graph.build /tmp/databuild_test/examples/basic_graph/generated_number/pippin_salem_sadie
+ echo build
+ basic_graph.build /tmp/databuild_test/examples/basic_graph/generated_number/pippin_salem_sadie

@@ -1,4 +1,4 @@
load("@databuild//:rules.bzl", "databuild_job")
load("@databuild//databuild:rules.bzl", "databuild_job")
databuild_job(
name = "test_job",

@@ -1,5 +1,5 @@
load("//:py_repl.bzl", "py_repl")
load("@databuild//:rules.bzl", "databuild_job")
load("@databuild//databuild:rules.bzl", "databuild_job")
load("@rules_python//python:pip.bzl", "compile_pip_requirements")
load("@pypi//:requirements.bzl", "requirement")

@@ -1,11 +0,0 @@
#!/bin/bash
set -e
%{RUNFILES_PREFIX}
%{PREFIX}
EXECUTABLE_BINARY="$(rlocation "databuild+/graph/$(basename "%{EXECUTABLE_PATH}")_")/analyze"
# Run the configuration
exec "${EXECUTABLE_BINARY}" "$@"