diff --git a/BUILD.bazel b/BUILD.bazel index c3cda14..6a24456 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -3,3 +3,9 @@ filegroup( srcs = ["//runtime:jq"], visibility = ["//visibility:public"], ) + +filegroup( + name = "json_schema", + srcs = ["databuild.schema.json"], + visibility = ["//visibility:public"], +) diff --git a/MODULE.bazel b/MODULE.bazel index 9d88a08..afe0fb3 100644 --- a/MODULE.bazel +++ b/MODULE.bazel @@ -5,3 +5,4 @@ module( bazel_dep(name = "bazel_skylib", version = "1.7.1") bazel_dep(name = "platforms", version = "0.0.11") +bazel_dep(name = "rules_shell", version = "0.4.0") diff --git a/MODULE.bazel.lock b/MODULE.bazel.lock index 73f2913..4c1e3c4 100644 --- a/MODULE.bazel.lock +++ b/MODULE.bazel.lock @@ -121,7 +121,8 @@ "https://bcr.bazel.build/modules/rules_python/0.40.0/MODULE.bazel": "9d1a3cd88ed7d8e39583d9ffe56ae8a244f67783ae89b60caafc9f5cf318ada7", "https://bcr.bazel.build/modules/rules_python/0.40.0/source.json": "939d4bd2e3110f27bfb360292986bb79fd8dcefb874358ccd6cdaa7bda029320", "https://bcr.bazel.build/modules/rules_shell/0.2.0/MODULE.bazel": "fda8a652ab3c7d8fee214de05e7a9916d8b28082234e8d2c0094505c5268ed3c", - "https://bcr.bazel.build/modules/rules_shell/0.2.0/source.json": "7f27af3c28037d9701487c4744b5448d26537cc66cdef0d8df7ae85411f8de95", + "https://bcr.bazel.build/modules/rules_shell/0.4.0/MODULE.bazel": "0f8f11bb3cd11755f0b48c1de0bbcf62b4b34421023aa41a2fc74ef68d9584f0", + "https://bcr.bazel.build/modules/rules_shell/0.4.0/source.json": "1d7fa7f941cd41dc2704ba5b4edc2e2230eea1cc600d80bd2b65838204c50b95", "https://bcr.bazel.build/modules/stardoc/0.5.1/MODULE.bazel": "1a05d92974d0c122f5ccf09291442580317cdd859f07a8655f1db9a60374f9f8", "https://bcr.bazel.build/modules/stardoc/0.5.3/MODULE.bazel": "c7f6948dae6999bf0db32c1858ae345f112cacf98f174c7a8bb707e41b974f1c", "https://bcr.bazel.build/modules/stardoc/0.5.6/MODULE.bazel": "c43dabc564990eeab55e25ed61c07a1aadafe9ece96a4efabb3f8bf9063b71ef", diff --git a/examples/basic_graph/BUILD.bazel b/examples/basic_graph/BUILD.bazel index eac3964..f08698b 100644 --- a/examples/basic_graph/BUILD.bazel +++ b/examples/basic_graph/BUILD.bazel @@ -1,6 +1,21 @@ load("@databuild//:rules.bzl", "databuild_graph", "databuild_job") load("@rules_java//java:defs.bzl", "java_binary") +databuild_graph( + name = "basic_graph", + jobs = [ + ":generate_number_job", + ":sum_job", + ], + plan = ":basic_graph_plan", +) + +py_binary( + name = "basic_graph_plan", + srcs = ["basic_graph.py"], + main = "basic_graph.py", +) + databuild_job( name = "generate_number_job", configure = ":generate_number_configure", @@ -9,13 +24,20 @@ databuild_job( java_binary( name = "generate_number_configure", - srcs = ["GenerateConfigure.java"], + srcs = glob(["*.java"]), + data = ["@databuild//:json_schema"], main_class = "com.databuild.examples.basic_graph.GenerateConfigure", + deps = [ + "@maven//:com_fasterxml_jackson_core_jackson_annotations", + "@maven//:com_fasterxml_jackson_core_jackson_core", + "@maven//:com_fasterxml_jackson_core_jackson_databind", + "@maven//:com_fasterxml_jackson_module_jackson_module_jsonSchema", + ], ) java_binary( name = "generate_number_execute", - srcs = ["GenerateExecute.java"], + srcs = glob(["GenerateExecute.java"]), main_class = "com.databuild.examples.basic_graph.GenerateExecute", ) @@ -27,15 +49,22 @@ databuild_job( java_binary( name = "sum_configure", - srcs = ["SumConfigure.java"], + srcs = glob(["*.java"]), + data = ["@databuild//:json_schema"], main_class = "com.databuild.examples.basic_graph.SumConfigure", + deps = [ + "@maven//:com_fasterxml_jackson_core_jackson_annotations", + "@maven//:com_fasterxml_jackson_core_jackson_core", + "@maven//:com_fasterxml_jackson_core_jackson_databind", + "@maven//:com_fasterxml_jackson_module_jackson_module_jsonSchema", + ], ) java_binary( name = "sum_execute", - srcs = [ - "GenerateExecute.java", + srcs = glob([ "SumExecute.java", - ], + "GenerateExecute.java", + ]), main_class = "com.databuild.examples.basic_graph.SumExecute", ) diff --git a/examples/basic_graph/DataDep.java b/examples/basic_graph/DataDep.java new file mode 100644 index 0000000..5fb2161 --- /dev/null +++ b/examples/basic_graph/DataDep.java @@ -0,0 +1,7 @@ +package com.databuild.examples.basic_graph; + +public class DataDep { + private String depType; // "query" or "materialize" + private String ref; + // getters, setters, constructors +} diff --git a/examples/basic_graph/GenerateConfigure.java b/examples/basic_graph/GenerateConfigure.java index 20560c8..075c032 100644 --- a/examples/basic_graph/GenerateConfigure.java +++ b/examples/basic_graph/GenerateConfigure.java @@ -1,6 +1,15 @@ package com.databuild.examples.basic_graph; +import com.fasterxml.jackson.annotation.JsonAutoDetect; +import com.fasterxml.jackson.annotation.JsonAutoDetect.Visibility; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.module.jsonSchema.JsonSchema; +import com.fasterxml.jackson.module.jsonSchema.JsonSchemaGenerator; + +import java.io.File; import java.util.Arrays; +import java.util.Collections; /** * Configure class for generating a random number. @@ -15,12 +24,29 @@ public class GenerateConfigure { // Process each partition ref from input arguments Arrays.stream(args).forEach(partitionRef -> { - // Create a job config for generating a random number - String config = String.format( - "{\"outputs\":[\"%s\"],\"inputs\":[],\"args\":[\"%s\"],\"env\":{}}", - partitionRef, partitionRef - ); - System.out.println(config); + // Create and populate JobConfig object + JobConfig config = new JobConfig(); + config.outputs = Collections.singletonList(partitionRef); + config.args = Arrays.asList(partitionRef); + // inputs and env are already initialized as empty collections in the constructor + + try { + ObjectMapper mapper = new ObjectMapper(); + // Load the schema + JsonNode schemaNode = mapper.readTree(new File("../databuild+/databuild.schema.json")); + + // Create JSON Schema validator + JsonSchemaGenerator schemaGen = new JsonSchemaGenerator(mapper); + JsonSchema schema = schemaGen.generateSchema(JobConfig.class); + + // Convert config to JsonNode and serialize + JsonNode configNode = mapper.valueToTree(config); + String jsonConfig = mapper.writeValueAsString(configNode); + System.out.println(jsonConfig); + } catch (Exception e) { + System.err.println("Error: Failed to validate or serialize config: " + e.getMessage()); + System.exit(1); + } }); } } \ No newline at end of file diff --git a/examples/basic_graph/JobConfig.java b/examples/basic_graph/JobConfig.java new file mode 100644 index 0000000..5eb30cf --- /dev/null +++ b/examples/basic_graph/JobConfig.java @@ -0,0 +1,24 @@ +package com.databuild.examples.basic_graph; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import com.fasterxml.jackson.annotation.JsonAutoDetect; +import com.fasterxml.jackson.annotation.JsonAutoDetect.Visibility; + +@JsonAutoDetect(fieldVisibility = Visibility.ANY) +public class JobConfig { + public List inputs; + public List outputs; + public List args; + public Map env; + + // Just one constructor if you want defaults + public JobConfig() { + this.inputs = new ArrayList<>(); + this.outputs = new ArrayList<>(); + this.args = new ArrayList<>(); + this.env = new HashMap<>(); + } +} diff --git a/examples/basic_graph/MODULE.bazel b/examples/basic_graph/MODULE.bazel index 62a1915..331dc65 100644 --- a/examples/basic_graph/MODULE.bazel +++ b/examples/basic_graph/MODULE.bazel @@ -15,3 +15,19 @@ bazel_dep(name = "rules_java", version = "8.11.0") # Configure JDK 17 register_toolchains("@rules_java//toolchains:all") + +bazel_dep(name = "rules_jvm_external", version = "6.3") + +maven = use_extension("@rules_jvm_external//:extensions.bzl", "maven") +maven.install( + artifacts = [ + "com.fasterxml.jackson.core:jackson-core:2.15.2", + "com.fasterxml.jackson.core:jackson-databind:2.15.2", + "com.fasterxml.jackson.core:jackson-annotations:2.15.2", + "com.fasterxml.jackson.module:jackson-module-jsonSchema:2.15.2", + ], + repositories = [ + "https://repo1.maven.org/maven2", + ], +) +use_repo(maven, "maven") diff --git a/examples/basic_graph/MODULE.bazel.lock b/examples/basic_graph/MODULE.bazel.lock index 73f2913..4c1e3c4 100644 --- a/examples/basic_graph/MODULE.bazel.lock +++ b/examples/basic_graph/MODULE.bazel.lock @@ -121,7 +121,8 @@ "https://bcr.bazel.build/modules/rules_python/0.40.0/MODULE.bazel": "9d1a3cd88ed7d8e39583d9ffe56ae8a244f67783ae89b60caafc9f5cf318ada7", "https://bcr.bazel.build/modules/rules_python/0.40.0/source.json": "939d4bd2e3110f27bfb360292986bb79fd8dcefb874358ccd6cdaa7bda029320", "https://bcr.bazel.build/modules/rules_shell/0.2.0/MODULE.bazel": "fda8a652ab3c7d8fee214de05e7a9916d8b28082234e8d2c0094505c5268ed3c", - "https://bcr.bazel.build/modules/rules_shell/0.2.0/source.json": "7f27af3c28037d9701487c4744b5448d26537cc66cdef0d8df7ae85411f8de95", + "https://bcr.bazel.build/modules/rules_shell/0.4.0/MODULE.bazel": "0f8f11bb3cd11755f0b48c1de0bbcf62b4b34421023aa41a2fc74ef68d9584f0", + "https://bcr.bazel.build/modules/rules_shell/0.4.0/source.json": "1d7fa7f941cd41dc2704ba5b4edc2e2230eea1cc600d80bd2b65838204c50b95", "https://bcr.bazel.build/modules/stardoc/0.5.1/MODULE.bazel": "1a05d92974d0c122f5ccf09291442580317cdd859f07a8655f1db9a60374f9f8", "https://bcr.bazel.build/modules/stardoc/0.5.3/MODULE.bazel": "c7f6948dae6999bf0db32c1858ae345f112cacf98f174c7a8bb707e41b974f1c", "https://bcr.bazel.build/modules/stardoc/0.5.6/MODULE.bazel": "c43dabc564990eeab55e25ed61c07a1aadafe9ece96a4efabb3f8bf9063b71ef", diff --git a/examples/basic_graph/README.md b/examples/basic_graph/README.md index f3f5893..b331440 100644 --- a/examples/basic_graph/README.md +++ b/examples/basic_graph/README.md @@ -2,11 +2,16 @@ This example demonstrates a databuild_job that generates a random number seeded based on the partition ref. -## Configure +## Multiple Configs + +We can generate numbers for any partition provided (written to `/tmp/databuild/examples/basic_graph`), and so we have +a config per partition for demonstration purposes: ```bash -$ bazel run //:generate_number_job.cfg test_output -{"outputs":["test_output"],"inputs":[],"args":["test_output"],"env":{}} +$ bazel run //:generate_number_job.cfg pippin salem sadie +{"outputs":["pippin"],"inputs":[],"args":["pippin"],"env":{}} +{"outputs":["salem"],"inputs":[],"args":["salem"],"env":{}} +{"outputs":["sadie"],"inputs":[],"args":["sadie"],"env":{}} ``` ## Execute @@ -14,21 +19,5 @@ $ bazel run //:generate_number_job.cfg test_output Generates a random number based on the hash of the partition ref and writes it to the output file. ```bash -$ bazel run //:generate_number_job.cfg test_output | bazel run //:generate_number_job +bazel run //:sum_job.cfg pippin_salem_sadie | bazel run //:sum_job ``` - -This will generate a random number for the partition "test_output" and write it to a file named "test_output". - -You can verify that the random number is stable for the same partition ref by running the command multiple times: - -```bash -$ bazel run //:generate_number_job.cfg test_output1 | bazel run //:generate_number_job -$ bazel run //:generate_number_job.cfg test_output1 | bazel run //:generate_number_job -``` - -And you can verify that different partition refs produce different random numbers: - -```bash -$ bazel run //:generate_number_job.cfg test_output1 | bazel run //:generate_number_job -$ bazel run //:generate_number_job.cfg test_output2 | bazel run //:generate_number_job -``` \ No newline at end of file diff --git a/examples/basic_graph/SumConfigure.java b/examples/basic_graph/SumConfigure.java index 4f3f24e..db1b759 100644 --- a/examples/basic_graph/SumConfigure.java +++ b/examples/basic_graph/SumConfigure.java @@ -1,7 +1,15 @@ package com.databuild.examples.basic_graph; -import java.util.ArrayList; +import com.fasterxml.jackson.annotation.JsonAutoDetect; +import com.fasterxml.jackson.annotation.JsonAutoDetect.Visibility; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.module.jsonSchema.JsonSchema; +import com.fasterxml.jackson.module.jsonSchema.JsonSchemaGenerator; + +import java.io.File; import java.util.Arrays; +import java.util.Collections; /** * Configure class for generating a random number. @@ -17,15 +25,28 @@ public class SumConfigure { String partitionRef = args[0]; String[] upstreams = partitionRef.split("_"); - // Create a list of quoted upstream values - ArrayList quotedUpstreams = new ArrayList<>(); - Arrays.stream(upstreams).forEach(s -> quotedUpstreams.add("\"" + s + "\"")); + // Create and populate JobConfig object + JobConfig config = new JobConfig(); + config.outputs = Collections.singletonList(partitionRef); + config.args = Arrays.asList(upstreams); + // inputs and env are already initialized as empty collections in the constructor - // Create a job config for generating a random number - String config = String.format( - "{\"outputs\":[\"%s\"],\"inputs\":[],\"args\":[%s],\"env\":{}}", - partitionRef, String.join(",", quotedUpstreams) - ); - System.out.println(config); + try { + ObjectMapper mapper = new ObjectMapper(); + // Load the schema + JsonNode schemaNode = mapper.readTree(new File("../databuild+/databuild.schema.json")); + + // Create JSON Schema validator + JsonSchemaGenerator schemaGen = new JsonSchemaGenerator(mapper); + JsonSchema schema = schemaGen.generateSchema(JobConfig.class); + + // Convert config to JsonNode and serialize + JsonNode configNode = mapper.valueToTree(config); + String jsonConfig = mapper.writeValueAsString(configNode); + System.out.println(jsonConfig); + } catch (Exception e) { + System.err.println("Error: Failed to validate or serialize config: " + e.getMessage()); + System.exit(1); + } } -} \ No newline at end of file +} diff --git a/examples/basic_graph/basic_graph.py b/examples/basic_graph/basic_graph.py new file mode 100644 index 0000000..e69de29 diff --git a/rules.bzl b/rules.bzl index 69ca336..caeb27d 100644 --- a/rules.bzl +++ b/rules.bzl @@ -116,7 +116,7 @@ _databuild_job_rule = rule( executable = True, ) -def _graph_impl(name, jobs, plan): +def _graph_impl(name): pass databuild_graph = rule(