Parallel analyze
This commit is contained in:
parent
0cb2737701
commit
13fc7a7c63
5 changed files with 248 additions and 385 deletions
|
|
@ -1,9 +0,0 @@
|
|||
|
||||
directories:
|
||||
./
|
||||
|
||||
targets:
|
||||
//runtime/...
|
||||
//:all
|
||||
//job/...
|
||||
//graph/...
|
||||
2
.gitignore
vendored
2
.gitignore
vendored
|
|
@ -2,3 +2,5 @@ bazel-*
|
|||
.ijwb
|
||||
databuild.iml
|
||||
.idea
|
||||
.DS_Store
|
||||
examples/podcast_reviews/data
|
||||
|
|
|
|||
File diff suppressed because one or more lines are too long
6
examples/podcast_reviews/README.md
Normal file
6
examples/podcast_reviews/README.md
Normal file
|
|
@ -0,0 +1,6 @@
|
|||
|
||||
# Podcast Reviews Example
|
||||
|
||||
## Input Data
|
||||
|
||||
Download the dataset from [Kaggle](https://www.kaggle.com/datasets/thoughtvector/podcastreviews/versions/28?select=database.sqlite) and place it at `examples/podcast_reviews/data/ingest/database.sqlite`.
|
||||
110
graph/analyze.go
110
graph/analyze.go
|
|
@ -7,6 +7,9 @@ import (
|
|||
"os/exec"
|
||||
"strings"
|
||||
"log"
|
||||
"sync"
|
||||
"runtime"
|
||||
"strconv"
|
||||
)
|
||||
|
||||
// DataDepType represents the type of data dependency
|
||||
|
|
@ -146,6 +149,83 @@ func resolve(outputRefs []string) (map[string][]string, error) {
|
|||
return result, nil
|
||||
}
|
||||
|
||||
// configureParallel configures multiple jobs in parallel
|
||||
func configureParallel(jobRefs map[string][]string, numWorkers int) ([]Task, error) {
|
||||
var wg sync.WaitGroup
|
||||
tasksChan := make(chan []Task, len(jobRefs))
|
||||
errorChan := make(chan error, len(jobRefs))
|
||||
jobsChan := make(chan struct {
|
||||
jobLabel string
|
||||
producedRefs []string
|
||||
}, len(jobRefs))
|
||||
|
||||
// Use a mutex to protect access to the error variable
|
||||
var mu sync.Mutex
|
||||
var firstErr error
|
||||
|
||||
// Fill the jobs channel
|
||||
for jobLabel, producedRefs := range jobRefs {
|
||||
jobsChan <- struct {
|
||||
jobLabel string
|
||||
producedRefs []string
|
||||
}{jobLabel, producedRefs}
|
||||
}
|
||||
close(jobsChan)
|
||||
|
||||
// Start workers
|
||||
for i := 0; i < numWorkers; i++ {
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
for job := range jobsChan {
|
||||
// Check if an error has already occurred
|
||||
mu.Lock()
|
||||
if firstErr != nil {
|
||||
mu.Unlock()
|
||||
return
|
||||
}
|
||||
mu.Unlock()
|
||||
|
||||
tasks, err := configure(job.jobLabel, job.producedRefs)
|
||||
if err != nil {
|
||||
mu.Lock()
|
||||
if firstErr == nil {
|
||||
firstErr = err
|
||||
errorChan <- err
|
||||
}
|
||||
mu.Unlock()
|
||||
return
|
||||
}
|
||||
tasksChan <- tasks
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
// Wait for all workers to finish
|
||||
go func() {
|
||||
wg.Wait()
|
||||
close(tasksChan)
|
||||
close(errorChan)
|
||||
}()
|
||||
|
||||
// Collect results
|
||||
var allTasks []Task
|
||||
for tasks := range tasksChan {
|
||||
allTasks = append(allTasks, tasks...)
|
||||
}
|
||||
|
||||
// Check for errors
|
||||
select {
|
||||
case err := <-errorChan:
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
default:
|
||||
}
|
||||
|
||||
return allTasks, nil
|
||||
}
|
||||
|
||||
// plan creates a job graph for given output references
|
||||
func plan(outputRefs []string) (*JobGraph, error) {
|
||||
log.Printf("Starting planning for %d output refs: %v", len(outputRefs), outputRefs)
|
||||
|
|
@ -156,6 +236,20 @@ func plan(outputRefs []string) (*JobGraph, error) {
|
|||
epoch := 0
|
||||
var nodes []Task
|
||||
|
||||
// Determine the number of workers based on available CPU cores or environment variable
|
||||
numWorkers := runtime.NumCPU()
|
||||
if workerEnv := os.Getenv("DATABUILD_PARALLEL_WORKERS"); workerEnv != "" {
|
||||
if parsedWorkers, err := strconv.Atoi(workerEnv); err != nil {
|
||||
log.Printf("Warning: Invalid DATABUILD_PARALLEL_WORKERS value '%s', using default: %d", workerEnv, numWorkers)
|
||||
} else if parsedWorkers < 1 {
|
||||
numWorkers = 1
|
||||
log.Printf("Warning: DATABUILD_PARALLEL_WORKERS must be at least 1, using: %d", numWorkers)
|
||||
} else {
|
||||
numWorkers = parsedWorkers
|
||||
}
|
||||
}
|
||||
log.Printf("Using %d workers for parallel execution", numWorkers)
|
||||
|
||||
for len(unhandledRefs) > 0 {
|
||||
if epoch >= 1000 {
|
||||
log.Printf("Planning timeout: still planning after %d epochs, giving up", epoch)
|
||||
|
|
@ -173,16 +267,14 @@ func plan(outputRefs []string) (*JobGraph, error) {
|
|||
return nil, err
|
||||
}
|
||||
|
||||
// Configure jobs
|
||||
var newNodes []Task
|
||||
for jobLabel, producedRefs := range jobRefs {
|
||||
tasks, err := configure(jobLabel, producedRefs)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
newNodes = append(newNodes, tasks...)
|
||||
// Configure jobs in parallel
|
||||
newNodes, err := configureParallel(jobRefs, numWorkers)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Remove handled refs
|
||||
// Remove handled refs
|
||||
for _, producedRefs := range jobRefs {
|
||||
for _, ref := range producedRefs {
|
||||
delete(unhandledRefs, ref)
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in a new issue