Implement remaining test app jobs

2025-07-30 22:53:52 -07:00 · 2025-07-30 22:53:52 -07:00 · 6d55d54267
commit 6d55d54267
parent 63f9518486
13 changed files with 439 additions and 9 deletions
--- a/MODULE.bazel
+++ b/MODULE.bazel
@ -215,6 +215,24 @@ pip.parse(
 )
 use_repo(pip, "pypi")

+# OCI (Docker images)
+oci = use_extension("@rules_oci//oci:extensions.bzl", "oci")
+
+# Declare external images you need to pull
+oci.pull(
+    name = "debian",
+    image = "docker.io/library/python",
+    platforms = [
+        "linux/arm64/v8",
+        "linux/amd64",
+    ],
+    # NOTE: a tag is mutable upstream; add `digest = "sha256:..."` to truly pin the image for reproducible pulls
+    tag = "3.12-bookworm",
+)
+
+# For each oci.pull call, repeat the "name" here to expose them as dependencies
+use_repo(oci, "debian", "debian_linux_amd64", "debian_linux_arm64_v8")
+
 # Ruff
 # macOS ARM64 (Apple Silicon)
 http_file(
--- a/MODULE.bazel.lock
+++ b/MODULE.bazel.lock
@ -567,11 +567,54 @@
    "@@rules_oci+//oci:extensions.bzl%oci": {
      "general": {
        "bzlTransitiveDigest": "KHcdN2ovRQGX1MKsH0nGoGPFd/84U43tssN2jImCeJU=",
-        "usagesDigest": "/O1PwnnkqSBmI9Oe08ZYYqjM4IS8JR+/9rjgzVTNDaQ=",
+        "usagesDigest": "Y6oSW43ZgWvZTMtL3eDjcxyo58BCPzyiFhH+D+xVgwM=",
        "recordedFileInputs": {},
        "recordedDirentsInputs": {},
        "envVariables": {},
        "generatedRepoSpecs": {
+          "debian_linux_arm64_v8": {
+            "repoRuleId": "@@rules_oci+//oci/private:pull.bzl%oci_pull",
+            "attributes": {
+              "www_authenticate_challenges": {},
+              "scheme": "https",
+              "registry": "index.docker.io",
+              "repository": "library/python",
+              "identifier": "3.12-bookworm",
+              "platform": "linux/arm64/v8",
+              "target_name": "debian_linux_arm64_v8",
+              "bazel_tags": []
+            }
+          },
+          "debian_linux_amd64": {
+            "repoRuleId": "@@rules_oci+//oci/private:pull.bzl%oci_pull",
+            "attributes": {
+              "www_authenticate_challenges": {},
+              "scheme": "https",
+              "registry": "index.docker.io",
+              "repository": "library/python",
+              "identifier": "3.12-bookworm",
+              "platform": "linux/amd64",
+              "target_name": "debian_linux_amd64",
+              "bazel_tags": []
+            }
+          },
+          "debian": {
+            "repoRuleId": "@@rules_oci+//oci/private:pull.bzl%oci_alias",
+            "attributes": {
+              "target_name": "debian",
+              "www_authenticate_challenges": {},
+              "scheme": "https",
+              "registry": "index.docker.io",
+              "repository": "library/python",
+              "identifier": "3.12-bookworm",
+              "platforms": {
+                "@@platforms//cpu:arm64": "@debian_linux_arm64_v8",
+                "@@platforms//cpu:x86_64": "@debian_linux_amd64"
+              },
+              "bzlmod_repository": "debian",
+              "reproducible": true
+            }
+          },
          "oci_crane_darwin_amd64": {
            "repoRuleId": "@@rules_oci+//oci:repositories.bzl%crane_repositories",
            "attributes": {
@ -687,7 +730,11 @@
          }
        },
        "moduleExtensionMetadata": {
-          "explicitRootModuleDirectDeps": [],
+          "explicitRootModuleDirectDeps": [
+            "debian",
+            "debian_linux_arm64_v8",
+            "debian_linux_amd64"
+          ],
          "explicitRootModuleDirectDevDeps": [],
          "useAllRepos": "NO",
          "reproducible": false
--- a/databuild/test/app/BUILD.bazel
+++ b/databuild/test/app/BUILD.bazel
@ -22,6 +22,20 @@ py_test(
    deps = [":job_src"],
 )

+# Unit tests for each new job's configure() (test sources live next to the job code).
+py_test(
+    name = "test_aggregate_color_votes",
+    srcs = ["jobs/aggregate_color_votes/test.py"],
+    main = "jobs/aggregate_color_votes/test.py",
+    deps = [":job_src"],
+)
+
+py_test(
+    name = "test_color_vote_report_calc",
+    srcs = ["jobs/color_vote_report_calc/test.py"],
+    main = "jobs/color_vote_report_calc/test.py",
+    deps = [":job_src"],
+)
+
 # Bazel-defined
 ## Graph
 databuild_graph(
@ -29,7 +43,8 @@ databuild_graph(
    jobs = [
        ":ingest_color_votes",
        ":trailing_color_votes",
-        # TODO
+        ":aggregate_color_votes",
+        ":color_vote_report_calc",
    ],
    lookup = ":bazel_graph_lookup",
 )
@ -67,10 +82,30 @@ py_binary(
 )

 ## Aggregate Color Votes
-# TODO
+# Job target listed in the databuild_graph `jobs` above; `binary` points at the py_binary below.
+databuild_job(
+    name = "aggregate_color_votes",
+    binary = ":aggregate_color_votes_binary",
+)
+
+# Entrypoint script; main.py dispatches on a `config` or `exec` first argument.
+py_binary(
+    name = "aggregate_color_votes_binary",
+    srcs = ["jobs/aggregate_color_votes/main.py"],
+    main = "jobs/aggregate_color_votes/main.py",
+    deps = [":job_src"],
+)

 ## Color Vote Report Calc
-# TODO
+# Job target listed in the databuild_graph `jobs` above; `binary` points at the py_binary below.
+databuild_job(
+    name = "color_vote_report_calc",
+    binary = ":color_vote_report_calc_binary",
+)
+
+# Entrypoint script; main.py dispatches on a `config` or `exec` first argument.
+py_binary(
+    name = "color_vote_report_calc_binary",
+    srcs = ["jobs/color_vote_report_calc/main.py"],
+    main = "jobs/color_vote_report_calc/main.py",
+    deps = [":job_src"],
+)

 # Python-DSL-defined

--- a/databuild/test/app/jobs/aggregate_color_votes/config.py
+++ b/databuild/test/app/jobs/aggregate_color_votes/config.py
@ -0,0 +1,42 @@
+from databuild.proto import PartitionRef, JobConfigureResponse, JobConfig
+from databuild.test.app.colors import COLORS
+from datetime import date
+
+def configure(outputs: list[PartitionRef]) -> JobConfigureResponse:
+    """Build one JobConfig per requested aggregate partition.
+
+    Each output ref must look like ``<aggregate_type>/<iso-date>``. The job's
+    inputs are the per-color partitions ``<input_prefix>/<date>/<color>`` for
+    every entry in COLORS, and the date and aggregate type are handed to the
+    job through the DATA_DATE / AGGREGATE_TYPE environment variables.
+
+    Raises:
+        ValueError: if a ref does not have exactly two "/"-separated parts,
+            if the date part is not an ISO date, or if the aggregate type is
+            unknown.
+    """
+    # Aggregate output type -> prefix of the per-color input partitions.
+    prefix_by_type = {
+        "daily_votes": "daily_color_votes",
+        "votes_1w": "color_votes_1w",
+        "votes_1m": "color_votes_1m",
+    }
+
+    job_configs = []
+    for requested in outputs:
+        segments = requested.str.split("/")
+        if len(segments) != 2:
+            raise ValueError(f"Invalid output partition format: {requested.str}")
+
+        output_type, data_date = segments
+        date.fromisoformat(data_date)  # Raises ValueError on a malformed date
+
+        if output_type not in prefix_by_type:
+            raise ValueError(f"Unknown output type: {output_type}")
+        input_prefix = prefix_by_type[output_type]
+
+        job_configs.append(JobConfig(
+            outputs=[requested],
+            inputs=[
+                PartitionRef(str=f"{input_prefix}/{data_date}/{color}")
+                for color in COLORS
+            ],
+            args=[],
+            env={
+                "DATA_DATE": data_date,
+                "AGGREGATE_TYPE": output_type
+            }
+        ))
+
+    return JobConfigureResponse(configs=job_configs)
--- a/databuild/test/app/jobs/aggregate_color_votes/execute.py
+++ b/databuild/test/app/jobs/aggregate_color_votes/execute.py
@ -0,0 +1,26 @@
+from databuild.test.app import dal
+from databuild.proto import PartitionRef
+from databuild.test.app.colors import COLORS
+
+def execute(data_date: str, aggregate_type: str):
+    """Sum the votes of every color partition for one date and write the total.
+
+    Reads ``<input_prefix>/<data_date>/<color>`` for each color in COLORS, adds
+    up the ``votes`` field of all returned records, and writes a single record
+    to ``<aggregate_type>/<data_date>``.
+
+    Raises:
+        ValueError: if ``aggregate_type`` is not one of the known aggregates.
+    """
+    # Aggregate type -> prefix of the per-color input partitions.
+    prefix_by_type = {
+        "daily_votes": "daily_color_votes",
+        "votes_1w": "color_votes_1w",
+        "votes_1m": "color_votes_1m",
+    }
+    if aggregate_type not in prefix_by_type:
+        raise ValueError(f"Unknown aggregate type: {aggregate_type}")
+    input_prefix = prefix_by_type[aggregate_type]
+
+    color_partitions = [
+        PartitionRef(str=f"{input_prefix}/{data_date}/{color}")
+        for color in COLORS
+    ]
+    records = dal.read(*color_partitions)
+    total_votes = sum(row["votes"] for row in records)
+
+    dal.write(
+        PartitionRef(str=f"{aggregate_type}/{data_date}"),
+        [{"data_date": data_date, "votes": total_votes}],
+    )
--- a/databuild/test/app/jobs/aggregate_color_votes/main.py
+++ b/databuild/test/app/jobs/aggregate_color_votes/main.py
@ -0,0 +1,20 @@
+"""Main entrypoint for the aggregate_color_votes job for use with bazel-defined graph."""
+
+import sys
+import os
+import json
+from databuild.proto import PartitionRef
+from databuild.test.app.jobs.aggregate_color_votes.config import configure
+from databuild.test.app.jobs.aggregate_color_votes.execute import execute
+
+if __name__ == "__main__":
+    # argv[1] selects the phase; any further arguments are partition refs.
+    command = sys.argv[1]
+    if command == "config":
+        requested = [PartitionRef(str=raw_ref) for raw_ref in sys.argv[2:]]
+        # The configure response is reported as JSON on stdout.
+        print(json.dumps(configure(requested).to_dict()))
+    elif command == "exec":
+        # Execution parameters come from the environment, not from argv.
+        execute(os.environ["DATA_DATE"], os.environ["AGGREGATE_TYPE"])
+    else:
+        raise Exception(f"Invalid command `{command}`")
--- a/databuild/test/app/jobs/aggregate_color_votes/test.py
+++ b/databuild/test/app/jobs/aggregate_color_votes/test.py
@ -0,0 +1,59 @@
+import unittest
+from databuild.proto import PartitionRef
+from databuild.test.app.jobs.aggregate_color_votes.config import configure
+from databuild.test.app.colors import COLORS
+
+class TestAggregateColorVotesConfig(unittest.TestCase):
+    """Tests for aggregate_color_votes.configure()."""
+
+    def _assert_single_config(self, output_str, input_prefix):
+        """Configure one output and verify the whole resulting JobConfig."""
+        aggregate_type, data_date = output_str.split("/")
+        response = configure([PartitionRef(str=output_str)])
+
+        self.assertEqual(len(response.configs), 1)
+        config = response.configs[0]
+        self.assertEqual([out.str for out in config.outputs], [output_str])
+        self.assertEqual(config.env["AGGREGATE_TYPE"], aggregate_type)
+        self.assertEqual(config.env["DATA_DATE"], data_date)
+        # Compare the full list so missing, extra or reordered inputs all fail
+        # (an index-by-index check would silently accept extra inputs).
+        self.assertEqual(
+            [inp.str for inp in config.inputs],
+            [f"{input_prefix}/{data_date}/{color}" for color in COLORS],
+        )
+
+    def test_configure_daily_votes(self):
+        self._assert_single_config("daily_votes/2024-01-15", "daily_color_votes")
+
+    def test_configure_weekly_votes(self):
+        self._assert_single_config("votes_1w/2024-01-21", "color_votes_1w")
+
+    def test_configure_monthly_votes(self):
+        self._assert_single_config("votes_1m/2024-01-31", "color_votes_1m")
+
+    def test_configure_multiple_outputs(self):
+        outputs = [
+            PartitionRef(str="daily_votes/2024-01-15"),
+            PartitionRef(str="votes_1w/2024-01-21")
+        ]
+        response = configure(outputs)
+
+        self.assertEqual(len(response.configs), 2)  # One config per output
+        # Configs are produced in request order, one aggregate type each.
+        self.assertEqual(
+            [config.env["AGGREGATE_TYPE"] for config in response.configs],
+            ["daily_votes", "votes_1w"],
+        )
+        self.assertEqual(
+            [config.env["DATA_DATE"] for config in response.configs],
+            ["2024-01-15", "2024-01-21"],
+        )
+
+    def test_configure_rejects_wrong_number_of_parts(self):
+        with self.assertRaises(ValueError):
+            configure([PartitionRef(str="daily_votes/2024-01-15/red")])
+
+    def test_configure_rejects_unknown_output_type(self):
+        with self.assertRaises(ValueError):
+            configure([PartitionRef(str="hourly_votes/2024-01-15")])
+
+    def test_configure_rejects_malformed_date(self):
+        with self.assertRaises(ValueError):
+            configure([PartitionRef(str="daily_votes/not-a-date")])
+
+if __name__ == "__main__":
+    unittest.main()
--- a/databuild/test/app/jobs/color_vote_report_calc/config.py
+++ b/databuild/test/app/jobs/color_vote_report_calc/config.py
@ -0,0 +1,48 @@
+from databuild.proto import PartitionRef, JobConfigureResponse, JobConfig
+from datetime import date
+from collections import defaultdict
+
+def configure(outputs: list[PartitionRef]) -> JobConfigureResponse:
+    """Build a single JobConfig that covers every requested report partition.
+
+    Each output ref must look like ``color_vote_report/<iso-date>/<color>``.
+    The config's inputs are, in this order: the three total-vote aggregates
+    (daily, 1w, 1m) for each distinct date in first-requested order, then the
+    three per-color partitions for each requested date/color pair in request
+    order. The output refs are also passed to the job as ``args``; ``env`` is
+    empty.
+
+    Raises:
+        ValueError: if a ref does not have the expected three-part shape or
+            its date part is not an ISO date.
+    """
+    # Parse and validate every ref exactly once, keeping request order.
+    requested = []
+    for output in outputs:
+        parts = output.str.split("/")
+        if len(parts) != 3 or parts[0] != "color_vote_report":
+            raise ValueError(f"Invalid output partition format: {output.str}")
+        _, data_date, color = parts
+        date.fromisoformat(data_date)  # Raises ValueError on a malformed date
+        requested.append((data_date, color))
+
+    # De-duplicate dates in first-seen order. Iterating a set here would make
+    # the order of the emitted inputs depend on string hash randomization and
+    # therefore differ between runs.
+    unique_dates = list(dict.fromkeys(data_date for data_date, _ in requested))
+
+    inputs = []
+
+    # Total vote aggregates, once per distinct date.
+    for data_date in unique_dates:
+        inputs.extend([
+            PartitionRef(str=f"daily_votes/{data_date}"),
+            PartitionRef(str=f"votes_1w/{data_date}"),
+            PartitionRef(str=f"votes_1m/{data_date}")
+        ])
+
+    # Color-specific inputs for every requested date/color combination.
+    for data_date, color in requested:
+        inputs.extend([
+            PartitionRef(str=f"daily_color_votes/{data_date}/{color}"),
+            PartitionRef(str=f"color_votes_1w/{data_date}/{color}"),
+            PartitionRef(str=f"color_votes_1m/{data_date}/{color}")
+        ])
+
+    # Single job config for all outputs - pass output partition refs as args
+    config = JobConfig(
+        outputs=outputs,
+        inputs=inputs,
+        args=[output.str for output in outputs],
+        env={}
+    )
+
+    return JobConfigureResponse(configs=[config])
--- a/databuild/test/app/jobs/color_vote_report_calc/execute.py
+++ b/databuild/test/app/jobs/color_vote_report_calc/execute.py
@ -0,0 +1,51 @@
+from databuild.test.app import dal
+from databuild.proto import PartitionRef
+
+def execute(output_partition_strs: list[str]):
+    """Write one vote-share report record per requested output partition.
+
+    For every ref, the second and third "/"-separated parts are taken as the
+    date and color. The daily, 1-week and 1-month vote totals for that date
+    and the matching per-color partitions are read, the ``votes`` field of the
+    first record of each is used, and a single report record holding the raw
+    counts plus the color's percentage of each total (0 when the total is not
+    positive) is written to the output ref.
+    """
+    def percent_of(color_votes, total_votes):
+        # Guard against dividing by a zero (or negative) total.
+        return (color_votes / total_votes * 100) if total_votes > 0 else 0
+
+    report_refs = [PartitionRef(str=ref_str) for ref_str in output_partition_strs]
+
+    for report_ref in report_refs:
+        segments = report_ref.str.split("/")
+        data_date, color = segments[1], segments[2]
+
+        # Totals first, then the color-specific partitions. Every read passes
+        # empty_ok=False (presumably so a missing partition fails the job).
+        source_refs = [
+            f"daily_votes/{data_date}",
+            f"votes_1w/{data_date}",
+            f"votes_1m/{data_date}",
+            f"daily_color_votes/{data_date}/{color}",
+            f"color_votes_1w/{data_date}/{color}",
+            f"color_votes_1m/{data_date}/{color}",
+        ]
+        partitions = [
+            dal.read(PartitionRef(str=source), empty_ok=False)
+            for source in source_refs
+        ]
+
+        # Only the first record of each partition contributes its vote count.
+        (
+            daily_total_votes,
+            weekly_total_votes,
+            monthly_total_votes,
+            daily_color_votes,
+            weekly_color_votes,
+            monthly_color_votes,
+        ) = [rows[0]["votes"] for rows in partitions]
+
+        dal.write(report_ref, [{
+            "color": color,
+            "data_date": data_date,
+            "daily_total_votes": daily_total_votes,
+            "weekly_total_votes": weekly_total_votes,
+            "monthly_total_votes": monthly_total_votes,
+            "daily_color_votes": daily_color_votes,
+            "weekly_color_votes": weekly_color_votes,
+            "monthly_color_votes": monthly_color_votes,
+            "daily_percent": percent_of(daily_color_votes, daily_total_votes),
+            "weekly_percent": percent_of(weekly_color_votes, weekly_total_votes),
+            "monthly_percent": percent_of(monthly_color_votes, monthly_total_votes)
+        }])
--- a/databuild/test/app/jobs/color_vote_report_calc/main.py
+++ b/databuild/test/app/jobs/color_vote_report_calc/main.py
@ -0,0 +1,20 @@
+"""Main entrypoint for the color_vote_report_calc job for use with bazel-defined graph."""
+
+import sys
+import os
+import json
+from databuild.proto import PartitionRef
+from databuild.test.app.jobs.color_vote_report_calc.config import configure
+from databuild.test.app.jobs.color_vote_report_calc.execute import execute
+
+if __name__ == "__main__":
+    # argv[1] selects the phase; any further arguments are partition refs.
+    command = sys.argv[1]
+    if command == "config":
+        requested = [PartitionRef(str=raw_ref) for raw_ref in sys.argv[2:]]
+        # The configure response is reported as JSON on stdout.
+        print(json.dumps(configure(requested).to_dict()))
+    elif command == "exec":
+        # The output partition refs to build arrive as the remaining arguments.
+        execute(sys.argv[2:])
+    else:
+        raise Exception(f"Invalid command `{command}`")
--- a/databuild/test/app/jobs/color_vote_report_calc/test.py
+++ b/databuild/test/app/jobs/color_vote_report_calc/test.py
@ -0,0 +1,60 @@
+import unittest
+from databuild.proto import PartitionRef
+from databuild.test.app.jobs.color_vote_report_calc.config import configure
+
+class TestColorVoteReportCalcConfig(unittest.TestCase):
+    """Checks the single JobConfig produced by color_vote_report_calc.configure()."""
+
+    def _only_config(self, *ref_strs):
+        """Configure the given output refs and return the one expected config."""
+        response = configure([PartitionRef(str=ref_str) for ref_str in ref_strs])
+        self.assertEqual(len(response.configs), 1)  # Always a single config
+        return response.configs[0]
+
+    def test_configure_single_output(self):
+        report = "color_vote_report/2024-01-15/red"
+        config = self._only_config(report)
+
+        self.assertEqual(len(config.outputs), 1)
+        self.assertEqual(config.args, [report])
+
+        # Both the date totals and the color-specific partitions are required.
+        input_strs = [ref.str for ref in config.inputs]
+        for needed in (
+            "daily_votes/2024-01-15",
+            "votes_1w/2024-01-15",
+            "votes_1m/2024-01-15",
+            "daily_color_votes/2024-01-15/red",
+            "color_votes_1w/2024-01-15/red",
+            "color_votes_1m/2024-01-15/red",
+        ):
+            self.assertIn(needed, input_strs)
+
+    def test_configure_multiple_outputs_same_date(self):
+        reports = (
+            "color_vote_report/2024-01-15/red",
+            "color_vote_report/2024-01-15/blue",
+        )
+        config = self._only_config(*reports)
+
+        self.assertEqual(len(config.outputs), 2)
+        self.assertEqual(set(config.args), set(reports))
+
+    def test_configure_multiple_dates(self):
+        config = self._only_config(
+            "color_vote_report/2024-01-15/red",
+            "color_vote_report/2024-01-16/red",
+        )
+
+        self.assertEqual(len(config.outputs), 2)
+
+        # Total vote inputs must be present for both requested dates.
+        input_strs = [ref.str for ref in config.inputs]
+        for needed in ("daily_votes/2024-01-15", "daily_votes/2024-01-16"):
+            self.assertIn(needed, input_strs)
+
+if __name__ == "__main__":
+    unittest.main()
--- a/databuild/test/app/jobs/ingest_color_votes/main.py
+++ b/databuild/test/app/jobs/ingest_color_votes/main.py
@ -2,17 +2,19 @@

 import sys
 import os
+import json
 from databuild.proto import PartitionRef
 from databuild.test.app.jobs.ingest_color_votes.config import configure
 from databuild.test.app.jobs.ingest_color_votes.execute import execute

 if __name__ == "__main__":
    if sys.argv[1] == "config":
-        configure([
+        response = configure([
            PartitionRef(str=raw_ref)
            for raw_ref in sys.argv[2:]
        ])
-    elif sys.argv[1] == "execute":
+        # Emit the configure response as JSON on stdout.
+        print(json.dumps(response.to_dict()))
+    # The execution sub-command is now `exec` (previously `execute`).
+    elif sys.argv[1] == "exec":
        execute(os.environ["DATA_DATE"], os.environ["COLOR"])
    else:
        raise Exception(f"Invalid command `{sys.argv[1]}`")
--- a/databuild/test/app/jobs/trailing_color_votes/main.py
+++ b/databuild/test/app/jobs/trailing_color_votes/main.py
@ -2,17 +2,19 @@

 import sys
 import os
+import json
 from databuild.proto import PartitionRef
 from databuild.test.app.jobs.trailing_color_votes.config import configure
 from databuild.test.app.jobs.trailing_color_votes.execute import execute

 if __name__ == "__main__":
    if sys.argv[1] == "config":
-        configure([
+        response = configure([
            PartitionRef(str=raw_ref)
            for raw_ref in sys.argv[2:]
        ])
-    elif sys.argv[1] == "execute":
+        # Emit the configure response as JSON on stdout.
+        print(json.dumps(response.to_dict()))
+    # The execution sub-command is now `exec` (previously `execute`).
+    elif sys.argv[1] == "exec":
        execute(os.environ["DATA_DATE"], os.environ["COLOR"])
    else:
        raise Exception(f"Invalid command `{sys.argv[1]}`")