package executor

import (
	"context"
	"errors"
	"fmt"
	"log/slog"
	"path/filepath"
	"strings"
	"sync"
	"time"

	"github.com/thepeterstone/claudomator/internal/storage"
	"github.com/thepeterstone/claudomator/internal/task"

	"github.com/google/uuid"
)

// LogPather is an optional interface runners can implement to provide the log
// directory for an execution before it starts. The pool uses this to persist
// log paths at CreateExecution time rather than waiting until execution ends.
type LogPather interface {
	// ExecLogDir returns the log directory for the execution with the given
	// ID. The pool ignores an empty return value and leaves the execution's
	// log paths unset.
	ExecLogDir(execID string) string
}

// Runner executes a single task and returns the result.
type Runner interface {
	// Run executes t, recording details on exec. The pool cancels ctx when
	// the task's timeout elapses or when Pool.Cancel is called for the task.
	Run(ctx context.Context, t *task.Task, exec *storage.Execution) error
}

// workItem is an entry in the pool's internal work queue.
type workItem struct {
	ctx  context.Context    // caller's context, handed to the worker goroutine
	task *task.Task         // task to execute
	exec *storage.Execution // non-nil for resume submissions
}

// Pool manages a bounded set of concurrent task workers.
type Pool struct {
	maxConcurrent   int               // upper bound on simultaneously occupied worker slots
	runners         map[string]Runner // agent type → runner
	store           *storage.DB
	logger          *slog.Logger
	depPollInterval time.Duration // how often waitForDependencies polls; defaults to 5s

	mu             sync.Mutex                    // guards active, activePerAgent, rateLimited and cancels
	active         int                           // occupied worker slots
	activePerAgent map[string]int                // agent type → number of running workers
	rateLimited    map[string]time.Time          // agent type → time until which it is rate limited
	cancels        map[string]context.CancelFunc // task ID → cancel func of its running execution
	resultCh       chan *Result                  // completed executions; exposed through Results
	workCh         chan workItem                 // internal bounded queue; Submit enqueues here
	doneCh         chan struct{}                 // signals when a worker slot is freed

	// Questions is initialised by NewPool; the pool code in this file does
	// not otherwise touch it.
	Questions *QuestionRegistry
	// Classifier is optional. When non-nil, execute consults it to choose the
	// agent type and model for a task before a runner is selected.
	Classifier *Classifier
}

// Result is emitted when a task execution completes.
type Result struct { TaskID string Execution *storage.Execution Err error } func NewPool(maxConcurrent int, runners map[string]Runner, store *storage.DB, logger *slog.Logger) *Pool { if maxConcurrent < 1 { maxConcurrent = 1 } p := &Pool{ maxConcurrent: maxConcurrent, runners: runners, store: store, logger: logger, depPollInterval: 5 * time.Second, activePerAgent: make(map[string]int), rateLimited: make(map[string]time.Time), cancels: make(map[string]context.CancelFunc), resultCh: make(chan *Result, maxConcurrent*2), workCh: make(chan workItem, maxConcurrent*10+100), doneCh: make(chan struct{}, maxConcurrent), Questions: NewQuestionRegistry(), } go p.dispatch() return p } // dispatch is a long-running goroutine that reads from the internal work queue // and launches goroutines as soon as a pool slot is available. This prevents // tasks from being rejected when the pool is temporarily at capacity. func (p *Pool) dispatch() { for item := range p.workCh { for { p.mu.Lock() if p.active < p.maxConcurrent { p.active++ p.mu.Unlock() if item.exec != nil { go p.executeResume(item.ctx, item.task, item.exec) } else { go p.execute(item.ctx, item.task) } break } p.mu.Unlock() <-p.doneCh // wait for a worker to finish } } } // Submit enqueues a task for execution. Returns an error only if the internal // work queue is full. When the pool is at capacity the task is buffered and // dispatched as soon as a slot becomes available. func (p *Pool) Submit(ctx context.Context, t *task.Task) error { select { case p.workCh <- workItem{ctx: ctx, task: t}: return nil default: return fmt.Errorf("executor work queue full (capacity %d)", cap(p.workCh)) } } // Results returns the channel for reading execution results. func (p *Pool) Results() <-chan *Result { return p.resultCh } // Cancel requests cancellation of a running task. Returns false if the task // is not currently running in this pool. 
func (p *Pool) Cancel(taskID string) bool { p.mu.Lock() cancel, ok := p.cancels[taskID] p.mu.Unlock() if !ok { return false } cancel() return true } // SubmitResume re-queues a blocked task using the provided resume execution. // The execution must have ResumeSessionID and ResumeAnswer set. func (p *Pool) SubmitResume(ctx context.Context, t *task.Task, exec *storage.Execution) error { if t.State != task.StateBlocked && t.State != task.StateTimedOut { return fmt.Errorf("task %s must be in BLOCKED or TIMED_OUT state to resume (current: %s)", t.ID, t.State) } if exec.ResumeSessionID == "" { return fmt.Errorf("resume execution for task %s must have a ResumeSessionID", t.ID) } select { case p.workCh <- workItem{ctx: ctx, task: t, exec: exec}: return nil default: return fmt.Errorf("executor work queue full (capacity %d)", cap(p.workCh)) } } func (p *Pool) getRunner(t *task.Task) (Runner, error) { agentType := t.Agent.Type if agentType == "" { agentType = "claude" // Default for backward compatibility } runner, ok := p.runners[agentType] if !ok { return nil, fmt.Errorf("unsupported agent type: %q", agentType) } return runner, nil } func (p *Pool) executeResume(ctx context.Context, t *task.Task, exec *storage.Execution) { agentType := t.Agent.Type if agentType == "" { agentType = "claude" } p.mu.Lock() p.activePerAgent[agentType]++ p.mu.Unlock() defer func() { p.mu.Lock() p.active-- p.activePerAgent[agentType]-- p.mu.Unlock() select { case p.doneCh <- struct{}{}: default: } }() runner, err := p.getRunner(t) if err != nil { p.logger.Error("failed to get runner for resume", "error", err, "taskID", t.ID) p.resultCh <- &Result{TaskID: t.ID, Execution: exec, Err: err} return } // Pre-populate log paths. 
if lp, ok := runner.(LogPather); ok { if logDir := lp.ExecLogDir(exec.ID); logDir != "" { exec.StdoutPath = filepath.Join(logDir, "stdout.log") exec.StderrPath = filepath.Join(logDir, "stderr.log") exec.ArtifactDir = logDir } } exec.StartTime = time.Now().UTC() exec.Status = "RUNNING" if err := p.store.CreateExecution(exec); err != nil { p.logger.Error("failed to create resume execution record", "error", err) } if err := p.store.UpdateTaskState(t.ID, task.StateRunning); err != nil { p.logger.Error("failed to update task state", "error", err) } var cancel context.CancelFunc if t.Timeout.Duration > 0 { ctx, cancel = context.WithTimeout(ctx, t.Timeout.Duration) } else { ctx, cancel = context.WithCancel(ctx) } p.mu.Lock() p.cancels[t.ID] = cancel p.mu.Unlock() defer func() { cancel() p.mu.Lock() delete(p.cancels, t.ID) p.mu.Unlock() }() err = runner.Run(ctx, t, exec) exec.EndTime = time.Now().UTC() if err != nil { if isRateLimitError(err) { p.mu.Lock() retryAfter := parseRetryAfter(err.Error()) if retryAfter == 0 { retryAfter = 1 * time.Minute } p.rateLimited[agentType] = time.Now().Add(retryAfter) p.mu.Unlock() } var blockedErr *BlockedError if errors.As(err, &blockedErr) { exec.Status = "BLOCKED" p.store.UpdateTaskState(t.ID, task.StateBlocked) p.store.UpdateTaskQuestion(t.ID, blockedErr.QuestionJSON) } else if ctx.Err() == context.DeadlineExceeded { exec.Status = "TIMED_OUT" exec.ErrorMsg = "execution timed out" p.store.UpdateTaskState(t.ID, task.StateTimedOut) } else if ctx.Err() == context.Canceled { exec.Status = "CANCELLED" exec.ErrorMsg = "execution cancelled" p.store.UpdateTaskState(t.ID, task.StateCancelled) } else if isQuotaExhausted(err) { exec.Status = "BUDGET_EXCEEDED" exec.ErrorMsg = err.Error() p.store.UpdateTaskState(t.ID, task.StateBudgetExceeded) } else { exec.Status = "FAILED" exec.ErrorMsg = err.Error() p.store.UpdateTaskState(t.ID, task.StateFailed) } } else { if t.ParentTaskID == "" { exec.Status = "READY" p.store.UpdateTaskState(t.ID, 
task.StateReady) } else { exec.Status = "COMPLETED" p.store.UpdateTaskState(t.ID, task.StateCompleted) } } if updateErr := p.store.UpdateExecution(exec); updateErr != nil { p.logger.Error("failed to update resume execution", "error", updateErr) } p.resultCh <- &Result{TaskID: t.ID, Execution: exec, Err: err} } // ActiveCount returns the number of currently running tasks. func (p *Pool) ActiveCount() int { p.mu.Lock() defer p.mu.Unlock() return p.active } func (p *Pool) execute(ctx context.Context, t *task.Task) { // 1. Classification if p.Classifier != nil { p.mu.Lock() activeTasks := make(map[string]int) rateLimited := make(map[string]bool) now := time.Now() for agent := range p.runners { activeTasks[agent] = p.activePerAgent[agent] rateLimited[agent] = now.Before(p.rateLimited[agent]) } status := SystemStatus{ ActiveTasks: activeTasks, RateLimited: rateLimited, } p.mu.Unlock() cls, err := p.Classifier.Classify(ctx, t.Name, t.Agent.Instructions, status) if err == nil { p.logger.Info("task classified", "taskID", t.ID, "agent", cls.AgentType, "model", cls.Model, "reason", cls.Reason) t.Agent.Type = cls.AgentType t.Agent.Model = cls.Model } else { p.logger.Error("classification failed", "error", err, "taskID", t.ID) } } agentType := t.Agent.Type if agentType == "" { agentType = "claude" } p.mu.Lock() p.activePerAgent[agentType]++ p.mu.Unlock() defer func() { p.mu.Lock() p.active-- p.activePerAgent[agentType]-- p.mu.Unlock() select { case p.doneCh <- struct{}{}: default: } }() runner, err := p.getRunner(t) if err != nil { p.logger.Error("failed to get runner", "error", err, "taskID", t.ID) now := time.Now().UTC() exec := &storage.Execution{ ID: uuid.New().String(), TaskID: t.ID, StartTime: now, EndTime: now, Status: "FAILED", ErrorMsg: err.Error(), } if createErr := p.store.CreateExecution(exec); createErr != nil { p.logger.Error("failed to create execution record", "error", createErr) } p.store.UpdateTaskState(t.ID, task.StateFailed) p.resultCh <- &Result{TaskID: 
t.ID, Execution: exec, Err: err} return } // Wait for all dependencies to complete before starting execution. if len(t.DependsOn) > 0 { if err := p.waitForDependencies(ctx, t); err != nil { now := time.Now().UTC() exec := &storage.Execution{ ID: uuid.New().String(), TaskID: t.ID, StartTime: now, EndTime: now, Status: "FAILED", ErrorMsg: err.Error(), } if createErr := p.store.CreateExecution(exec); createErr != nil { p.logger.Error("failed to create execution record", "error", createErr) } p.store.UpdateTaskState(t.ID, task.StateFailed) p.resultCh <- &Result{TaskID: t.ID, Execution: exec, Err: err} return } } execID := uuid.New().String() exec := &storage.Execution{ ID: execID, TaskID: t.ID, StartTime: time.Now().UTC(), Status: "RUNNING", } // Pre-populate log paths so they're available in the DB immediately — // before the subprocess starts — enabling live tailing and debugging. if lp, ok := runner.(LogPather); ok { if logDir := lp.ExecLogDir(execID); logDir != "" { exec.StdoutPath = filepath.Join(logDir, "stdout.log") exec.StderrPath = filepath.Join(logDir, "stderr.log") exec.ArtifactDir = logDir } } // Record execution start. if err := p.store.CreateExecution(exec); err != nil { p.logger.Error("failed to create execution record", "error", err) } if err := p.store.UpdateTaskState(t.ID, task.StateRunning); err != nil { p.logger.Error("failed to update task state", "error", err) } // Apply task timeout and register cancel so callers can stop this task. var cancel context.CancelFunc if t.Timeout.Duration > 0 { ctx, cancel = context.WithTimeout(ctx, t.Timeout.Duration) } else { ctx, cancel = context.WithCancel(ctx) } p.mu.Lock() p.cancels[t.ID] = cancel p.mu.Unlock() defer func() { cancel() p.mu.Lock() delete(p.cancels, t.ID) p.mu.Unlock() }() // Inject prior failure history so the agent knows what went wrong before. priorExecs, priorErr := p.store.ListExecutions(t.ID) t = withFailureHistory(t, priorExecs, priorErr) // Run the task. 
err = runner.Run(ctx, t, exec) exec.EndTime = time.Now().UTC() if err != nil { if isRateLimitError(err) { p.mu.Lock() retryAfter := parseRetryAfter(err.Error()) if retryAfter == 0 { retryAfter = 1 * time.Minute } p.rateLimited[agentType] = time.Now().Add(retryAfter) p.mu.Unlock() } var blockedErr *BlockedError if errors.As(err, &blockedErr) { exec.Status = "BLOCKED" p.store.UpdateTaskState(t.ID, task.StateBlocked) p.store.UpdateTaskQuestion(t.ID, blockedErr.QuestionJSON) } else if ctx.Err() == context.DeadlineExceeded { exec.Status = "TIMED_OUT" exec.ErrorMsg = "execution timed out" p.store.UpdateTaskState(t.ID, task.StateTimedOut) } else if ctx.Err() == context.Canceled { exec.Status = "CANCELLED" exec.ErrorMsg = "execution cancelled" p.store.UpdateTaskState(t.ID, task.StateCancelled) } else if isQuotaExhausted(err) { exec.Status = "BUDGET_EXCEEDED" exec.ErrorMsg = err.Error() p.store.UpdateTaskState(t.ID, task.StateBudgetExceeded) } else { exec.Status = "FAILED" exec.ErrorMsg = err.Error() p.store.UpdateTaskState(t.ID, task.StateFailed) } } else { if t.ParentTaskID == "" { exec.Status = "READY" p.store.UpdateTaskState(t.ID, task.StateReady) } else { exec.Status = "COMPLETED" p.store.UpdateTaskState(t.ID, task.StateCompleted) } } if updateErr := p.store.UpdateExecution(exec); updateErr != nil { p.logger.Error("failed to update execution", "error", updateErr) } p.resultCh <- &Result{TaskID: t.ID, Execution: exec, Err: err} } // terminalFailureStates are dependency states that cause the waiting task to fail immediately. var terminalFailureStates = map[task.State]bool{ task.StateFailed: true, task.StateTimedOut: true, task.StateCancelled: true, task.StateBudgetExceeded: true, } // withFailureHistory returns a shallow copy of t with prior failed execution // error messages prepended to SystemPromptAppend so the agent knows what went // wrong in previous attempts. 
func withFailureHistory(t *task.Task, execs []*storage.Execution, err error) *task.Task { if err != nil || len(execs) == 0 { return t } var failures []storage.Execution for _, e := range execs { if (e.Status == "FAILED" || e.Status == "TIMED_OUT") && e.ErrorMsg != "" { failures = append(failures, *e) } } if len(failures) == 0 { return t } var sb strings.Builder sb.WriteString("## Prior Attempt History\n\n") sb.WriteString("This task has failed before. Do not repeat the same mistakes.\n\n") for i, f := range failures { fmt.Fprintf(&sb, "**Attempt %d** (%s) — %s:\n%s\n\n", i+1, f.StartTime.Format("2006-01-02 15:04 UTC"), f.Status, f.ErrorMsg) } sb.WriteString("---\n\n") copy := *t copy.Agent = t.Agent if copy.Agent.SystemPromptAppend != "" { copy.Agent.SystemPromptAppend = sb.String() + copy.Agent.SystemPromptAppend } else { copy.Agent.SystemPromptAppend = sb.String() } return © } // waitForDependencies polls storage until all tasks in t.DependsOn reach COMPLETED, // or until a dependency enters a terminal failure state or the context is cancelled. func (p *Pool) waitForDependencies(ctx context.Context, t *task.Task) error { for { allDone := true for _, depID := range t.DependsOn { dep, err := p.store.GetTask(depID) if err != nil { return fmt.Errorf("dependency %q not found: %w", depID, err) } if dep.State == task.StateCompleted { continue } if terminalFailureStates[dep.State] { return fmt.Errorf("dependency %q ended in state %s", depID, dep.State) } allDone = false } if allDone { return nil } select { case <-ctx.Done(): return ctx.Err() case <-time.After(p.depPollInterval): } } }