databuild/rules.bzl


# Bash preamble embedded in the wrapper scripts generated by the rules in this
# file (passed as the %{RUNFILES_PREFIX} template substitution or prepended
# directly to written scripts).
#
# It does two things:
#   1. If RUNFILES_DIR is unset or empty, exports it as
#      "<resolved path of the running script>.runfiles". Note that the shell
#      variable named SCRIPT_DIR holds the resolved script *path*, not a
#      directory.
#   2. Sources Bazel's runfiles.bash, trying RUNFILES_DIR, the runfiles
#      manifest and "$0"-relative locations in turn, and exits with status 1
#      if none of them works. Scripts in this file call `rlocation` after
#      this preamble.
#
# The preamble leaves `set -u`, `set -o pipefail` and `set -e` enabled for the
# remainder of the script it is embedded in.
RUNFILES_PREFIX = """
# ================= BEGIN RUNFILES INIT =================

# TODO should this be extracted to shared init script
# Get the directory where the script is located
if [[ -z "${RUNFILES_DIR:-}" ]]; then
    SCRIPT_DIR="$(readlink -f "${BASH_SOURCE[0]}")"
    # Set RUNFILES_DIR relative to the script location
    export RUNFILES_DIR="${SCRIPT_DIR}.runfiles"
fi

# --- begin runfiles.bash initialization v3 ---
# Copy-pasted from the Bazel Bash runfiles library v3.
set -uo pipefail; set +e; f=bazel_tools/tools/bash/runfiles/runfiles.bash
source "${RUNFILES_DIR:-/dev/null}/$f" 2>/dev/null || \
  source "$(grep -sm1 "^$f " "${RUNFILES_MANIFEST_FILE:-/dev/null}" | cut -f2- -d' ')" 2>/dev/null || \
  source "$0.runfiles/$f" 2>/dev/null || \
  source "$(grep -sm1 "^$f " "$0.runfiles_manifest" | cut -f2- -d' ')" 2>/dev/null || \
  source "$(grep -sm1 "^$f " "$0.exe.runfiles_manifest" | cut -f2- -d' ')" 2>/dev/null || \
  { echo>&2 "ERROR: cannot find $f"; exit 1; }; f=; set -e
# --- end runfiles.bash initialization v3 ---

# ================== END RUNFILES INIT ==================

"""

def databuild_job(
        name,
        configure,
        execute,
        visibility = None):
    """Creates a DataBuild job: configure wrapper, execute wrapper, and a combined target.

    Three targets are declared:
      * `<name>.cfg`: executable wrapper script around `configure`.
      * `<name>.exec`: executable wrapper script around `execute`.
      * `<name>`: executable script that pipes the output of `<name>.cfg`
        into `<name>.exec`, and provides `DataBuildJobInfo`.

    Args:
        name: Name of the job target; also the prefix of the `.cfg` and
            `.exec` helper targets.
        configure: Executable target that implements the configuration logic.
        execute: Executable target that implements the execution logic.
        visibility: Visibility specification applied to all three generated
            targets.
    """
    cfg_name = name + ".cfg"
    exec_name = name + ".exec"

    # Wrapper around the user's configure binary.
    _databuild_job_cfg_rule(
        name = cfg_name,
        configure = configure,
        visibility = visibility,
    )

    # Wrapper around the user's execute binary.
    _databuild_job_exec_rule(
        name = exec_name,
        execute = execute,
        visibility = visibility,
    )

    # Job target that configures, then executes; this is the target that
    # carries DataBuildJobInfo for graph rules to consume.
    _databuild_job_rule(
        name = name,
        configure = ":%s.cfg" % name,
        execute = ":%s.exec" % name,
        visibility = visibility,
    )

def _databuild_job_cfg_impl(ctx):
    """Generates an executable wrapper script around the `configure` binary.

    The wrapper is produced by expanding the `_template` file with the
    runfiles preamble, an empty prefix, and the path of the configure
    executable.

    Args:
        ctx: Rule context.

    Returns:
        A list with a single DefaultInfo whose executable is the generated
        wrapper and whose runfiles contain the configure binary, its default
        runfiles, and the Bash runfiles library.
    """
    configure_target = ctx.attr.configure
    wrapper = ctx.actions.declare_file(ctx.label.name)

    # NOTE(review): `.path` is substituted here, whereas scripts written
    # directly in this file resolve `short_path` through rlocation - confirm
    # this is what the template expects.
    substitutions = {
        "%{EXECUTABLE_PATH}": configure_target.files_to_run.executable.path,
        "%{RUNFILES_PREFIX}": RUNFILES_PREFIX,
        "%{PREFIX}": "",
    }
    ctx.actions.expand_template(
        template = ctx.file._template,
        output = wrapper,
        substitutions = substitutions,
        is_executable = True,
    )

    wrapper_runfiles = ctx.runfiles(files = [ctx.executable.configure]).merge_all([
        configure_target.default_runfiles,
        ctx.attr._bash_runfiles.default_runfiles,
    ])

    return [DefaultInfo(executable = wrapper, runfiles = wrapper_runfiles)]

# Rule behind the `<name>.cfg` target created by `databuild_job`: an executable
# wrapper script around the `configure` binary (see `_databuild_job_cfg_impl`).
_databuild_job_cfg_rule = rule(
    implementation = _databuild_job_cfg_impl,
    attrs = {
        "configure": attr.label(
            doc = "Target that implements the configuration logic",
            executable = True,
            cfg = "target",
            mandatory = True,
        ),
        # Shell template expanded into the wrapper script; the implementation
        # fills in %{EXECUTABLE_PATH}, %{RUNFILES_PREFIX} and %{PREFIX}.
        "_template": attr.label(
            default = "@databuild//runtime:simple_executable_wrapper.sh.tpl",
            allow_single_file = True,
        ),
        # Bazel's Bash runfiles library. Its runfiles are merged into the
        # wrapper's runfiles so the RUNFILES_PREFIX preamble can source
        # runfiles.bash.
        "_bash_runfiles": attr.label(
            default = Label("@bazel_tools//tools/bash/runfiles"),
            allow_files = True,
        ),
    },
    executable = True,
)

def _databuild_job_exec_impl(ctx):
    """Generates an executable wrapper script around the `execute` binary.

    The wrapper is produced by expanding the `_template` file with the
    runfiles preamble and the paths of the jq and execute executables.

    Args:
        ctx: Rule context.

    Returns:
        A list with a single DefaultInfo whose executable is the generated
        wrapper and whose runfiles contain jq, the execute binary, their
        default runfiles, and the Bash runfiles library.
    """
    execute_target = ctx.attr.execute
    jq_target = ctx.attr._jq
    wrapper = ctx.actions.declare_file(ctx.label.name)

    # NOTE(review): `.path` is substituted here, whereas scripts written
    # directly in this file resolve `short_path` through rlocation - confirm
    # this is what the template expects.
    ctx.actions.expand_template(
        template = ctx.file._template,
        output = wrapper,
        substitutions = {
            "%{JQ_PATH}": jq_target.files_to_run.executable.path,
            "%{EXECUTE_PATH}": execute_target.files_to_run.executable.path,
            "%{RUNFILES_PREFIX}": RUNFILES_PREFIX,
        },
        is_executable = True,
    )

    direct_files = [ctx.executable._jq, ctx.executable.execute]
    wrapper_runfiles = ctx.runfiles(files = direct_files).merge_all([
        execute_target.default_runfiles,
        jq_target.default_runfiles,
        ctx.attr._bash_runfiles.default_runfiles,
    ])

    return [DefaultInfo(executable = wrapper, runfiles = wrapper_runfiles)]

# Provider returned by `_databuild_job_rule` and consumed by the graph rules
# (`_databuild_graph_analyze`, `_databuild_graph_exec`) through their `jobs`
# attribute.
DataBuildJobInfo = provider(
    doc = "Information about a DataBuild job",
    fields = {
        "configure": "Target that implements the configuration logic",
        "execute": "File: the job's generated executable script, which pipes configure output into the execute wrapper",
        "deps": "List of dependencies (other DataBuildJobInfo providers); not set by the job rule in this file",
    },
)

# Rule behind the `<name>.exec` target created by `databuild_job`: an
# executable wrapper script around the `execute` binary (see
# `_databuild_job_exec_impl`).
_databuild_job_exec_rule = rule(
    implementation = _databuild_job_exec_impl,
    attrs = {
        "execute": attr.label(
            doc = "Target that implements the execution logic",
            mandatory = True,
            executable = True,
            cfg = "target",
        ),
        # Shell template expanded into the wrapper script; the implementation
        # fills in %{JQ_PATH}, %{EXECUTE_PATH} and %{RUNFILES_PREFIX}.
        "_template": attr.label(
            default = "@databuild//job:execute_wrapper.sh.tpl",
            allow_single_file = True,
        ),
        # jq executable shipped in the wrapper's runfiles; its path is handed
        # to the template as %{JQ_PATH}.
        "_jq": attr.label(
            default = "@databuild//runtime:jq",
            executable = True,
            cfg = "target",
        ),
        # Bazel's Bash runfiles library. Its runfiles are merged into the
        # wrapper's runfiles so the RUNFILES_PREFIX preamble can source
        # runfiles.bash.
        "_bash_runfiles": attr.label(
            default = Label("@bazel_tools//tools/bash/runfiles"),
            allow_files = True,
        ),
    },
    executable = True,
)

def _databuild_job_impl(ctx):
    """Wraps the configure and execute targets in a shell script.

    The generated script runs the configure executable with the script's own
    arguments and pipes its stdout into the execute executable. Both
    executables are located at run time via `rlocation`.

    Args:
        ctx: Rule context.

    Returns:
        DefaultInfo for the generated script (with the runfiles of both
        wrapped targets) and a DataBuildJobInfo exposing the configure target
        and the generated script.
    """
    script = ctx.actions.declare_file(ctx.label.name)

    # Only this fragment goes through `.format`; RUNFILES_PREFIX contains
    # literal `${...}` shell expansions and must not be formatted.
    #
    # "$@" and the rlocation results are quoted so that arguments and paths
    # containing whitespace or glob characters are passed through intact.
    #
    # NOTE(review): the `_main/` prefix assumes both targets live in the main
    # repository - confirm before using jobs from external repositories.
    pipeline = """
"$(rlocation _main/{configure_path})" "$@" | "$(rlocation _main/{execute_path})"
        """.format(
        configure_path = ctx.attr.configure.files_to_run.executable.short_path,
        execute_path = ctx.attr.execute.files_to_run.executable.short_path,
    )

    ctx.actions.write(
        output = script,
        is_executable = True,
        content = RUNFILES_PREFIX + pipeline,
    )

    runfiles = ctx.runfiles(
        files = [ctx.executable.execute, ctx.executable.configure],
    ).merge_all([
        ctx.attr.execute.default_runfiles,
        ctx.attr.configure.default_runfiles,
    ])

    return [
        DefaultInfo(
            executable = script,
            runfiles = runfiles,
        ),
        DataBuildJobInfo(
            # Target: graph analysis reads its executable and runfiles.
            configure = ctx.attr.configure,
            # File: the combined configure-then-execute script.
            execute = script,
        ),
    ]

# Rule behind the main `<name>` target created by `databuild_job`: an
# executable script that pipes configure output into execute, and the provider
# of DataBuildJobInfo (see `_databuild_job_impl`).
_databuild_job_rule = rule(
    implementation = _databuild_job_impl,
    attrs = {
        "configure": attr.label(
            doc = "Target that implements the configuration logic",
            mandatory = True,
            executable = True,
            # TODO all these cfg = "exec" settings are probably a problem for deployment
            # NOTE(review): `configure` uses the exec configuration while
            # `execute` below uses the target configuration - confirm this
            # asymmetry is intended.
            cfg = "exec",
        ),
        "execute": attr.label(
            doc = "Target that implements the execution logic",
            mandatory = True,
            executable = True,
            cfg = "target",
        ),
    },
    executable = True,
)

def databuild_graph(name, jobs, lookup, visibility = None):
    """Creates a databuild graph and its helper targets.

    Four targets are declared:
      * `<name>.lookup`: wrapper script around the `lookup` executable.
      * `<name>.analyze`: graph analysis script over `jobs`, using the lookup
        wrapper.
      * `<name>.exec`: graph execution script over `jobs`.
      * `<name>.build`: script that pipes `<name>.analyze` into `<name>.exec`.

    Args:
        name: Prefix for the generated targets.
        jobs: List of job targets; each must provide DataBuildJobInfo.
        lookup: Executable target that implements job lookup for desired
            partition refs.
        visibility: Visibility specification applied to all generated targets.
    """
    lookup_target = "%s.lookup" % name
    analyze_target = "%s.analyze" % name
    exec_target = "%s.exec" % name
    build_target = "%s.build" % name

    _databuild_graph_lookup(
        name = lookup_target,
        lookup = lookup,
        visibility = visibility,
    )

    _databuild_graph_analyze(
        name = analyze_target,
        lookup = lookup_target,
        jobs = jobs,
        visibility = visibility,
    )

    _databuild_graph_exec(
        name = exec_target,
        jobs = jobs,
        visibility = visibility,
    )

    _databuild_graph_build(
        name = build_target,
        analyze = analyze_target,
        exec = exec_target,
        visibility = visibility,
    )


# TODO there feels like a lot of boilerplate around wrapping a target with a script - can this be simplified?
def _databuild_graph_lookup_impl(ctx):
    """Generates an executable wrapper script around the `lookup` binary.

    Args:
        ctx: Rule context.

    Returns:
        A list with a single DefaultInfo whose executable is the generated
        wrapper and whose runfiles contain the lookup binary, its default
        runfiles, and the Bash runfiles library.
    """
    lookup_target = ctx.attr.lookup
    wrapper = ctx.actions.declare_file(ctx.label.name)

    substitutions = {
        "%{RUNFILES_PREFIX}": RUNFILES_PREFIX,
        "%{PREFIX}": "",
        "%{EXECUTABLE_PATH}": lookup_target.files_to_run.executable.path,
    }
    ctx.actions.expand_template(
        template = ctx.file._template,
        output = wrapper,
        substitutions = substitutions,
        is_executable = True,
    )

    wrapper_runfiles = ctx.runfiles(files = [ctx.executable.lookup]).merge_all([
        lookup_target.default_runfiles,
        ctx.attr._bash_runfiles.default_runfiles,
    ])

    return [DefaultInfo(executable = wrapper, runfiles = wrapper_runfiles)]

# Rule behind the `<name>.lookup` target created by `databuild_graph`: an
# executable wrapper script around the `lookup` binary (see
# `_databuild_graph_lookup_impl`).
_databuild_graph_lookup = rule(
    implementation = _databuild_graph_lookup_impl,
    attrs = {
        "lookup": attr.label(
            doc = "Target that implements job lookup for desired partition refs",
            mandatory = True,
            executable = True,
            cfg = "exec",
        ),
        # Shell template expanded into the wrapper script; the implementation
        # fills in %{RUNFILES_PREFIX}, %{PREFIX} and %{EXECUTABLE_PATH}.
        "_template": attr.label(
            default = "@databuild//runtime:simple_executable_wrapper.sh.tpl",
            allow_single_file = True,
        ),
        # Bazel's Bash runfiles library. Its runfiles are merged into the
        # wrapper's runfiles so the RUNFILES_PREFIX preamble can source
        # runfiles.bash.
        "_bash_runfiles": attr.label(
            default = Label("@bazel_tools//tools/bash/runfiles"),
            allow_files = True,
        ),
    },
    executable = True,
)

def _databuild_graph_analyze_impl(ctx):
    """Generates the graph analysis script for a set of candidate jobs.

    The script exports three environment variables before running the
    `_analyze` executable (via the template's %{PREFIX} substitution):
      * DATABUILD_CANDIDATE_JOBS: JSON object mapping each job label
        (`//package:name`) to the rlocation-resolved path of that job's
        configure executable.
      * DATABUILD_MODE: the literal `plan`.
      * DATABUILD_JOB_LOOKUP_PATH: rlocation-resolved path of the lookup
        executable.

    Args:
        ctx: Rule context.

    Returns:
        A list with a single DefaultInfo whose executable is the generated
        script and whose runfiles cover the lookup and analyze executables,
        every job, and every job's configure target.
    """
    script = ctx.actions.declare_file(ctx.label.name)

    # Job label -> shell expression yielding the job's configure executable.
    # The `$(rlocation ...)` text is expanded by bash when the script runs,
    # not at analysis time.
    config_paths = {
        "//" + job.label.package + ":" + job.label.name:
            "$(rlocation _main/" + job[DataBuildJobInfo].configure.files_to_run.executable.short_path + ")"
        for job in ctx.attr.jobs
    }

    # Rendered as JSON with backslash-escaped double quotes, because the value
    # is embedded inside a double-quoted shell string below.
    candidate_jobs_json = "{" + ",".join([
        '\\"%s\\":\\"%s\\"' % (label, path_expr)
        for label, path_expr in config_paths.items()
    ]) + "}"

    env_setup = """
export DATABUILD_CANDIDATE_JOBS="{candidate_jobs_json}"
export DATABUILD_MODE=plan
export DATABUILD_JOB_LOOKUP_PATH=$(rlocation _main/{lookup_path})
    """.format(
        candidate_jobs_json = candidate_jobs_json,
        lookup_path = ctx.attr.lookup.files_to_run.executable.short_path,
    )

    ctx.actions.expand_template(
        template = ctx.file._template,
        output = script,
        substitutions = {
            "%{EXECUTABLE_PATH}": ctx.attr._analyze.files_to_run.executable.path,
            "%{RUNFILES_PREFIX}": RUNFILES_PREFIX,
            "%{PREFIX}": env_setup,
        },
        is_executable = True,
    )

    # Configure targets of all candidate jobs; the generated script refers to
    # their executables, so they and their runfiles must be shipped too.
    configure_targets = [job[DataBuildJobInfo].configure for job in ctx.attr.jobs]
    configure_executables = [
        target.files_to_run.executable
        for target in configure_targets
    ]

    runfiles = ctx.runfiles(
        files = [ctx.executable.lookup, ctx.executable._analyze] + configure_executables,
    ).merge_all(
        [
            ctx.attr.lookup.default_runfiles,
            ctx.attr._analyze.default_runfiles,
            ctx.attr._bash_runfiles.default_runfiles,
        ] +
        [job.default_runfiles for job in ctx.attr.jobs] +
        [target.default_runfiles for target in configure_targets],
    )

    return [
        DefaultInfo(
            executable = script,
            runfiles = runfiles,
        ),
    ]


# Rule behind the `<name>.analyze` target created by `databuild_graph`: an
# executable script that sets up the planning environment and runs the
# `_analyze` binary (see `_databuild_graph_analyze_impl`).
_databuild_graph_analyze = rule(
    implementation = _databuild_graph_analyze_impl,
    attrs = {
        "lookup": attr.label(
            doc = "Target that implements job lookup for desired partition refs",
            mandatory = True,
            executable = True,
            cfg = "exec",
        ),
        "jobs": attr.label_list(
            doc = "The list of jobs that are candidates for building partitions in this databuild graph",
            allow_empty = False,
            # The implementation indexes job[DataBuildJobInfo] unconditionally;
            # declaring the requirement reports a clear error at the offending
            # label instead of a failure inside the implementation.
            providers = [DataBuildJobInfo],
        ),
        # Shell template expanded into the script; the implementation fills in
        # %{EXECUTABLE_PATH}, %{RUNFILES_PREFIX} and %{PREFIX}.
        "_template": attr.label(
            default = "@databuild//graph:go_analyze_wrapper.sh.tpl",
            allow_single_file = True,
        ),
        # Bazel's Bash runfiles library. Its runfiles are merged into the
        # script's runfiles so the RUNFILES_PREFIX preamble can source
        # runfiles.bash.
        "_bash_runfiles": attr.label(
            default = Label("@bazel_tools//tools/bash/runfiles"),
            allow_files = True,
        ),
        # Graph analysis binary wrapped by the generated script.
        "_analyze": attr.label(
            default = "@databuild//graph:analyze",
            executable = True,
            cfg = "target",
        ),
    },
    executable = True,
)

def _databuild_graph_exec_impl(ctx):
    """Generates the graph execution script for a set of candidate jobs.

    The script is the `_template` file expanded with the runfiles preamble, an
    empty prefix, and the path of the `_execute` executable.

    Args:
        ctx: Rule context.

    Returns:
        A list with a single DefaultInfo whose executable is the generated
        script and whose runfiles cover the `_execute` executable, each job's
        script, and each job's default runfiles.
    """
    script = ctx.actions.declare_file(ctx.label.name)

    ctx.actions.expand_template(
        template = ctx.file._template,
        output = script,
        substitutions = {
            "%{EXECUTABLE_PATH}": ctx.attr._execute.files_to_run.executable.path,
            "%{RUNFILES_PREFIX}": RUNFILES_PREFIX,
            "%{PREFIX}": "",
        },
        is_executable = True,
    )

    # DataBuildJobInfo.execute is used as a File here (it is passed in
    # `files =` below), so it carries no runfiles of its own. Everything a job
    # script needs at run time comes from the job target's default_runfiles.
    job_scripts = [job[DataBuildJobInfo].execute for job in ctx.attr.jobs]

    runfiles = ctx.runfiles(
        files = [ctx.executable._execute] + job_scripts,
    ).merge_all(
        [
            ctx.attr._execute.default_runfiles,
            ctx.attr._bash_runfiles.default_runfiles,
        ] +
        [job.default_runfiles for job in ctx.attr.jobs],
    )

    return [
        DefaultInfo(
            executable = script,
            runfiles = runfiles,
        ),
    ]

# Rule behind the `<name>.exec` target created by `databuild_graph`: an
# executable script that runs the `_execute` binary with all candidate jobs
# available in its runfiles (see `_databuild_graph_exec_impl`).
_databuild_graph_exec = rule(
    implementation = _databuild_graph_exec_impl,
    attrs = {
        "jobs": attr.label_list(
            doc = "The list of jobs that are candidates for building partitions in this databuild graph",
            allow_empty = False,
            # The implementation indexes job[DataBuildJobInfo] unconditionally;
            # declaring the requirement reports a clear error at the offending
            # label instead of a failure inside the implementation.
            providers = [DataBuildJobInfo],
        ),
        # Shell template expanded into the script; the implementation fills in
        # %{EXECUTABLE_PATH}, %{RUNFILES_PREFIX} and %{PREFIX}.
        "_template": attr.label(
            default = "@databuild//graph:go_exec_wrapper.sh.tpl",
            allow_single_file = True,
        ),
        # Bazel's Bash runfiles library. Its runfiles are merged into the
        # script's runfiles so the RUNFILES_PREFIX preamble can source
        # runfiles.bash.
        "_bash_runfiles": attr.label(
            default = Label("@bazel_tools//tools/bash/runfiles"),
            allow_files = True,
        ),
        # Graph execution binary wrapped by the generated script.
        "_execute": attr.label(
            default = "@databuild//graph:execute",
            executable = True,
            cfg = "target",
        ),
    },
    executable = True,
)

def _databuild_graph_build_impl(ctx):
    """Wraps the analyze and execute targets in a shell script.

    The generated script runs the analyze executable with the script's own
    arguments and pipes its stdout into the exec executable. Both executables
    are located at run time via `rlocation`.

    Args:
        ctx: Rule context.

    Returns:
        A list with a single DefaultInfo whose executable is the generated
        script and whose runfiles contain both wrapped executables and their
        default runfiles.
    """
    script = ctx.actions.declare_file(ctx.label.name)

    # Only this fragment goes through `.format`; RUNFILES_PREFIX contains
    # literal `${...}` shell expansions and must not be formatted.
    #
    # "$@" and the rlocation results are quoted so that arguments and paths
    # containing whitespace or glob characters are passed through intact.
    #
    # NOTE(review): the `_main/` prefix assumes both targets live in the main
    # repository - confirm before using graphs from external repositories.
    pipeline = """
"$(rlocation _main/{analyze_path})" "$@" | "$(rlocation _main/{exec_path})"
        """.format(
        analyze_path = ctx.attr.analyze.files_to_run.executable.short_path,
        exec_path = ctx.attr.exec.files_to_run.executable.short_path,
    )

    ctx.actions.write(
        output = script,
        is_executable = True,
        content = RUNFILES_PREFIX + pipeline,
    )

    runfiles = ctx.runfiles(
        files = [ctx.executable.analyze, ctx.executable.exec],
    ).merge_all([
        ctx.attr.analyze.default_runfiles,
        ctx.attr.exec.default_runfiles,
    ])

    return [
        DefaultInfo(
            executable = script,
            runfiles = runfiles,
        ),
    ]

# Rule behind the `<name>.build` target created by `databuild_graph`: an
# executable script that pipes the output of the analyze target into the exec
# target (see `_databuild_graph_build_impl`).
_databuild_graph_build = rule(
    implementation = _databuild_graph_build_impl,
    attrs = {
        "analyze": attr.label(
            doc = "Target that implements the graph analysis logic",
            mandatory = True,
            executable = True,
            # NOTE(review): `analyze` uses the exec configuration while `exec`
            # below uses the target configuration - confirm this asymmetry is
            # intended.
            cfg = "exec",
        ),
        "exec": attr.label(
            doc = "Target that implements the graph execution logic",
            mandatory = True,
            executable = True,
            cfg = "target",
        ),
    },
    executable = True,
)

#def _graph_impl(name):
#    """
#
#    """
#
#    # Lets do this
#    pass
#
#databuild_graph = rule(
#    implementation = _graph_impl,
#    attrs = {
#        "jobs": attr.label_list(
#            doc = "The list of jobs that are candidates for building partitions in this databuild graph",
#            allow_empty = False,
#        ),
#        "plan": attr.label(
#            doc = "The binary that is run to produce a `JobGraph` that builds the requested partition refs",
#            executable = True,
#            cfg = "exec",
#        ),
#    },
#)