// Package executor runs queued tasks on a bounded pool of concurrent
// workers, delegating each task to an agent-specific Runner and recording
// execution state through a storage backend.
package executor

import (
	"context"
	"errors"
	"fmt"
	"log/slog"
	"path/filepath"
	"strings"
	"sync"
	"time"

	"github.com/thepeterstone/claudomator/internal/storage"
	"github.com/thepeterstone/claudomator/internal/task"
	"github.com/google/uuid"
)

// Store is the subset of storage.DB methods used by the Pool.
// Defining it as an interface allows test doubles to be injected.
type Store interface {
	GetTask(id string) (*task.Task, error)
	ListTasks(filter storage.TaskFilter) ([]*task.Task, error)
	ListSubtasks(parentID string) ([]*task.Task, error)
	ListExecutions(taskID string) ([]*storage.Execution, error)
	CreateExecution(e *storage.Execution) error
	UpdateExecution(e *storage.Execution) error
	UpdateTaskState(id string, newState task.State) error
	UpdateTaskQuestion(taskID, questionJSON string) error
	UpdateTaskSummary(taskID, summary string) error
	AppendTaskInteraction(taskID string, interaction task.Interaction) error
	UpdateTaskAgent(id string, agent task.AgentConfig) error
}

// LogPather is an optional interface runners can implement to provide the log
// directory for an execution before it starts. The pool uses this to persist
// log paths at CreateExecution time rather than waiting until execution ends.
type LogPather interface {
	ExecLogDir(execID string) string
}

// Runner executes a single task and returns the result.
type Runner interface {
	Run(ctx context.Context, t *task.Task, exec *storage.Execution) error
}

// workItem is an entry in the pool's internal work queue.
type workItem struct {
	ctx  context.Context
	task *task.Task
	exec *storage.Execution // non-nil for resume submissions
}

// Pool manages a bounded set of concurrent task workers.
type Pool struct {
	maxConcurrent   int
	runners         map[string]Runner
	store           Store
	logger          *slog.Logger
	depPollInterval time.Duration // how often waitForDependencies polls; defaults to 5s

	// mu guards the mutable maps and counters below.
	mu             sync.Mutex
	active         int
	activePerAgent map[string]int
	rateLimited    map[string]time.Time          // agentType -> until
	cancels        map[string]context.CancelFunc // taskID → cancel

	resultCh chan *Result
	workCh   chan workItem // internal bounded queue; Submit enqueues here
	doneCh   chan struct{} // signals when a worker slot is freed

	Questions  *QuestionRegistry
	Classifier *Classifier
}

// Result is emitted when a task execution completes.
type Result struct {
	TaskID    string
	Execution *storage.Execution
	Err       error
}

// NewPool creates a Pool that runs at most maxConcurrent tasks at once
// (values below 1 are clamped to 1) and starts the background dispatcher
// goroutine. The runners map keys agent types to their implementations.
func NewPool(maxConcurrent int, runners map[string]Runner, store Store, logger *slog.Logger) *Pool {
	if maxConcurrent < 1 {
		maxConcurrent = 1
	}
	p := &Pool{
		maxConcurrent:   maxConcurrent,
		runners:         runners,
		store:           store,
		logger:          logger,
		depPollInterval: 5 * time.Second,
		activePerAgent:  make(map[string]int),
		rateLimited:     make(map[string]time.Time),
		cancels:         make(map[string]context.CancelFunc),
		resultCh:        make(chan *Result, maxConcurrent*2),
		// Work queue is intentionally much larger than the worker count so
		// bursts of submissions are buffered rather than rejected.
		workCh: make(chan workItem, maxConcurrent*10+100),
		doneCh: make(chan struct{}, maxConcurrent),
		Questions: NewQuestionRegistry(),
	}
	go p.dispatch()
	return p
}

// dispatch is a long-running goroutine that reads from the internal work queue
// and launches goroutines as soon as a pool slot is available. This prevents
// tasks from being rejected when the pool is temporarily at capacity.
func (p *Pool) dispatch() {
	for item := range p.workCh {
		for {
			p.mu.Lock()
			if p.active < p.maxConcurrent {
				// Claim the slot under the lock; the worker's deferred
				// cleanup releases it and signals doneCh.
				p.active++
				p.mu.Unlock()
				if item.exec != nil {
					go p.executeResume(item.ctx, item.task, item.exec)
				} else {
					go p.execute(item.ctx, item.task)
				}
				break
			}
			p.mu.Unlock()
			<-p.doneCh // wait for a worker to finish
		}
	}
}

// Submit enqueues a task for execution. Returns an error only if the internal
// work queue is full.
When the pool is at capacity the task is buffered and // dispatched as soon as a slot becomes available. func (p *Pool) Submit(ctx context.Context, t *task.Task) error { select { case p.workCh <- workItem{ctx: ctx, task: t}: return nil default: return fmt.Errorf("executor work queue full (capacity %d)", cap(p.workCh)) } } // Results returns the channel for reading execution results. func (p *Pool) Results() <-chan *Result { return p.resultCh } // Cancel requests cancellation of a running task. Returns false if the task // is not currently running in this pool. func (p *Pool) Cancel(taskID string) bool { p.mu.Lock() cancel, ok := p.cancels[taskID] p.mu.Unlock() if !ok { return false } cancel() return true } // resumablePoolStates are the task states that may be submitted for session resume. var resumablePoolStates = map[task.State]bool{ task.StateBlocked: true, task.StateTimedOut: true, task.StateCancelled: true, task.StateFailed: true, task.StateBudgetExceeded: true, } // SubmitResume re-queues a blocked or interrupted task using the provided resume execution. // The execution must have ResumeSessionID and ResumeAnswer set. 
// SubmitResume validates the task's state and the resume execution, then
// enqueues the pair onto the work queue. Returns an error if the task is not
// in a resumable state, the execution lacks a session ID, or the queue is full.
func (p *Pool) SubmitResume(ctx context.Context, t *task.Task, exec *storage.Execution) error {
	if !resumablePoolStates[t.State] {
		return fmt.Errorf("task %s must be in a resumable state to resume (current: %s)", t.ID, t.State)
	}
	if exec.ResumeSessionID == "" {
		return fmt.Errorf("resume execution for task %s must have a ResumeSessionID", t.ID)
	}
	select {
	case p.workCh <- workItem{ctx: ctx, task: t, exec: exec}:
		return nil
	default:
		return fmt.Errorf("executor work queue full (capacity %d)", cap(p.workCh))
	}
}

// getRunner resolves the Runner for the task's agent type, defaulting to
// "claude" when the task does not specify one.
func (p *Pool) getRunner(t *task.Task) (Runner, error) {
	agentType := t.Agent.Type
	if agentType == "" {
		agentType = "claude" // Default for backward compatibility
	}
	runner, ok := p.runners[agentType]
	if !ok {
		return nil, fmt.Errorf("unsupported agent type: %q", agentType)
	}
	return runner, nil
}

// executeResume runs a resume execution for a previously blocked/interrupted
// task. The caller (dispatch) has already incremented p.active; the deferred
// cleanup here decrements it and signals doneCh to free the pool slot.
func (p *Pool) executeResume(ctx context.Context, t *task.Task, exec *storage.Execution) {
	agentType := t.Agent.Type
	if agentType == "" {
		agentType = "claude"
	}
	p.mu.Lock()
	p.activePerAgent[agentType]++
	p.mu.Unlock()
	defer func() {
		p.mu.Lock()
		p.active--
		p.activePerAgent[agentType]--
		if p.activePerAgent[agentType] == 0 {
			delete(p.activePerAgent, agentType)
		}
		p.mu.Unlock()
		// Non-blocking send: doneCh is buffered to maxConcurrent, so a full
		// buffer means dispatch already has enough wakeup signals queued.
		select {
		case p.doneCh <- struct{}{}:
		default:
		}
	}()
	runner, err := p.getRunner(t)
	if err != nil {
		p.logger.Error("failed to get runner for resume", "error", err, "taskID", t.ID)
		p.resultCh <- &Result{TaskID: t.ID, Execution: exec, Err: err}
		return
	}
	// Pre-populate log paths.
	if lp, ok := runner.(LogPather); ok {
		if logDir := lp.ExecLogDir(exec.ID); logDir != "" {
			exec.StdoutPath = filepath.Join(logDir, "stdout.log")
			exec.StderrPath = filepath.Join(logDir, "stderr.log")
			exec.ArtifactDir = logDir
		}
	}
	exec.StartTime = time.Now().UTC()
	exec.Status = "RUNNING"
	if err := p.store.CreateExecution(exec); err != nil {
		p.logger.Error("failed to create resume execution record", "error", err)
	}
	if err := p.store.UpdateTaskState(t.ID, task.StateRunning); err != nil {
		p.logger.Error("failed to update task state", "error", err)
	}
	// Apply the task's timeout (if any) and register a cancel func so
	// Cancel(taskID) can stop this run.
	var cancel context.CancelFunc
	if t.Timeout.Duration > 0 {
		ctx, cancel = context.WithTimeout(ctx, t.Timeout.Duration)
	} else {
		ctx, cancel = context.WithCancel(ctx)
	}
	p.mu.Lock()
	p.cancels[t.ID] = cancel
	p.mu.Unlock()
	defer func() {
		cancel()
		p.mu.Lock()
		delete(p.cancels, t.ID)
		p.mu.Unlock()
	}()
	err = runner.Run(ctx, t, exec)
	exec.EndTime = time.Now().UTC()
	p.handleRunResult(ctx, t, exec, err, agentType)
}

// handleRunResult applies the shared post-run error-classification and
// state-update logic used by both execute() and executeResume(). It sets
// exec.Status and exec.ErrorMsg, updates storage, and emits the result to
// resultCh. The caller must set exec.EndTime before calling.
func (p *Pool) handleRunResult(ctx context.Context, t *task.Task, exec *storage.Execution, err error, agentType string) {
	if err != nil {
		// Rate-limit / quota errors mark the agent unavailable until the
		// parsed (or default) retry deadline, independent of how the error
		// is classified below.
		if isRateLimitError(err) || isQuotaExhausted(err) {
			p.mu.Lock()
			retryAfter := parseRetryAfter(err.Error())
			if retryAfter == 0 {
				if isQuotaExhausted(err) {
					retryAfter = 5 * time.Hour
				} else {
					retryAfter = 1 * time.Minute
				}
			}
			p.rateLimited[agentType] = time.Now().Add(retryAfter)
			p.logger.Info("agent rate limited", "agent", agentType, "retryAfter", retryAfter, "quotaExhausted", isQuotaExhausted(err))
			p.mu.Unlock()
		}
		// Classification order matters: BLOCKED beats timeout/cancel, which
		// beat quota, which beats generic failure.
		var blockedErr *BlockedError
		if errors.As(err, &blockedErr) {
			exec.Status = "BLOCKED"
			exec.SandboxDir = blockedErr.SandboxDir // preserve so resume runs in same dir
			if err := p.store.UpdateTaskState(t.ID, task.StateBlocked); err != nil {
				p.logger.Error("failed to update task state", "taskID", t.ID, "state", task.StateBlocked, "error", err)
			}
			if err := p.store.UpdateTaskQuestion(t.ID, blockedErr.QuestionJSON); err != nil {
				p.logger.Error("failed to update task question", "taskID", t.ID, "error", err)
			}
		} else if ctx.Err() == context.DeadlineExceeded {
			exec.Status = "TIMED_OUT"
			exec.ErrorMsg = "execution timed out"
			if err := p.store.UpdateTaskState(t.ID, task.StateTimedOut); err != nil {
				p.logger.Error("failed to update task state", "taskID", t.ID, "state", task.StateTimedOut, "error", err)
			}
		} else if ctx.Err() == context.Canceled {
			exec.Status = "CANCELLED"
			exec.ErrorMsg = "execution cancelled"
			if err := p.store.UpdateTaskState(t.ID, task.StateCancelled); err != nil {
				p.logger.Error("failed to update task state", "taskID", t.ID, "state", task.StateCancelled, "error", err)
			}
		} else if isQuotaExhausted(err) {
			exec.Status = "BUDGET_EXCEEDED"
			exec.ErrorMsg = err.Error()
			if err := p.store.UpdateTaskState(t.ID, task.StateBudgetExceeded); err != nil {
				p.logger.Error("failed to update task state", "taskID", t.ID, "state", task.StateBudgetExceeded, "error", err)
			}
		} else {
			exec.Status = "FAILED"
			exec.ErrorMsg = err.Error()
			if err := p.store.UpdateTaskState(t.ID, task.StateFailed); err != nil {
				p.logger.Error("failed to update task state", "taskID", t.ID, "state", task.StateFailed, "error", err)
			}
		}
	} else {
		// Success. Root tasks with subtasks go BLOCKED (await children);
		// root tasks without subtasks go READY; subtasks go COMPLETED and
		// may unblock their parent.
		if t.ParentTaskID == "" {
			subtasks, subErr := p.store.ListSubtasks(t.ID)
			if subErr != nil {
				p.logger.Error("failed to list subtasks", "taskID", t.ID, "error", subErr)
			}
			if subErr == nil && len(subtasks) > 0 {
				exec.Status = "BLOCKED"
				if err := p.store.UpdateTaskState(t.ID, task.StateBlocked); err != nil {
					p.logger.Error("failed to update task state", "taskID", t.ID, "state", task.StateBlocked, "error", err)
				}
			} else {
				exec.Status = "READY"
				if err := p.store.UpdateTaskState(t.ID, task.StateReady); err != nil {
					p.logger.Error("failed to update task state", "taskID", t.ID, "state", task.StateReady, "error", err)
				}
			}
		} else {
			exec.Status = "COMPLETED"
			if err := p.store.UpdateTaskState(t.ID, task.StateCompleted); err != nil {
				p.logger.Error("failed to update task state", "taskID", t.ID, "state", task.StateCompleted, "error", err)
			}
			p.maybeUnblockParent(t.ParentTaskID)
		}
	}
	// Persist a summary: prefer the runner-provided one, else extract from
	// the stdout log when available.
	summary := exec.Summary
	if summary == "" && exec.StdoutPath != "" {
		summary = extractSummary(exec.StdoutPath)
	}
	if summary != "" {
		if summaryErr := p.store.UpdateTaskSummary(t.ID, summary); summaryErr != nil {
			p.logger.Error("failed to update task summary", "taskID", t.ID, "error", summaryErr)
		}
	}
	if updateErr := p.store.UpdateExecution(exec); updateErr != nil {
		p.logger.Error("failed to update execution", "error", updateErr)
	}
	p.resultCh <- &Result{TaskID: t.ID, Execution: exec, Err: err}
}

// ActiveCount returns the number of currently running tasks.
func (p *Pool) ActiveCount() int {
	p.mu.Lock()
	defer p.mu.Unlock()
	return p.active
}

// pickAgent selects the best agent from the given SystemStatus using explicit
// load balancing: prefer the available (non-rate-limited) agent with the fewest
// active tasks. If all agents are rate-limited, fall back to fewest active.
func pickAgent(status SystemStatus) string {
	best := ""
	bestActive := -1
	// First pass: only consider non-rate-limited agents.
	// Ties on active count break alphabetically (agent < best) so the
	// choice is deterministic despite random map iteration order.
	for agent, active := range status.ActiveTasks {
		if status.RateLimited[agent] {
			continue
		}
		if bestActive == -1 || active < bestActive || (active == bestActive && agent < best) {
			best = agent
			bestActive = active
		}
	}
	if best != "" {
		return best
	}
	// Fallback: all rate-limited — pick least active anyway.
	for agent, active := range status.ActiveTasks {
		if bestActive == -1 || active < bestActive || (active == bestActive && agent < best) {
			best = agent
			bestActive = active
		}
	}
	return best
}

// execute runs a freshly submitted task: selects an agent, waits for
// dependencies, records the execution, runs it, and classifies the result.
// The caller (dispatch) has already incremented p.active.
func (p *Pool) execute(ctx context.Context, t *task.Task) {
	// 1. Load-balanced agent selection + model classification.
	// Snapshot per-agent load and rate-limit status under the lock,
	// expiring any rate limits whose deadline has passed.
	p.mu.Lock()
	activeTasks := make(map[string]int)
	rateLimited := make(map[string]bool)
	now := time.Now()
	for agent := range p.runners {
		activeTasks[agent] = p.activePerAgent[agent]
		if deadline, ok := p.rateLimited[agent]; ok && now.After(deadline) {
			delete(p.rateLimited, agent)
		}
		rateLimited[agent] = now.Before(p.rateLimited[agent])
	}
	status := SystemStatus{
		ActiveTasks: activeTasks,
		RateLimited: rateLimited,
	}
	p.mu.Unlock()
	// If a specific agent is already requested, skip selection and classification.
	skipClassification := t.Agent.Type == "claude" || t.Agent.Type == "gemini"
	if !skipClassification {
		// Deterministically pick the agent with fewest active tasks.
		selectedAgent := pickAgent(status)
		if selectedAgent != "" {
			t.Agent.Type = selectedAgent
		}
		if p.Classifier != nil {
			cls, err := p.Classifier.Classify(ctx, t.Name, t.Agent.Instructions, status, t.Agent.Type)
			if err == nil {
				p.logger.Info("task classified", "taskID", t.ID, "agent", t.Agent.Type, "model", cls.Model, "reason", cls.Reason)
				t.Agent.Model = cls.Model
			} else {
				// Classification failure is non-fatal; run with defaults.
				p.logger.Error("classification failed", "error", err, "taskID", t.ID)
			}
		}
	}
	// Persist the assigned agent (and model) to the database before running.
	if err := p.store.UpdateTaskAgent(t.ID, t.Agent); err != nil {
		p.logger.Error("failed to persist agent config", "error", err, "taskID", t.ID)
	}
	agentType := t.Agent.Type
	if agentType == "" {
		agentType = "claude"
	}
	p.mu.Lock()
	if deadline, ok := p.rateLimited[agentType]; ok && time.Now().After(deadline) {
		delete(p.rateLimited, agentType)
	}
	p.activePerAgent[agentType]++
	p.mu.Unlock()
	defer func() {
		p.mu.Lock()
		p.active--
		p.activePerAgent[agentType]--
		if p.activePerAgent[agentType] == 0 {
			delete(p.activePerAgent, agentType)
		}
		p.mu.Unlock()
		// Non-blocking: doneCh is buffered to maxConcurrent, so dropping a
		// signal here only happens when dispatch already has wakeups queued.
		select {
		case p.doneCh <- struct{}{}:
		default:
		}
	}()
	runner, err := p.getRunner(t)
	if err != nil {
		p.logger.Error("failed to get runner", "error", err, "taskID", t.ID)
		// Record a zero-duration FAILED execution so the attempt is visible.
		now := time.Now().UTC()
		exec := &storage.Execution{
			ID:        uuid.New().String(),
			TaskID:    t.ID,
			StartTime: now,
			EndTime:   now,
			Status:    "FAILED",
			ErrorMsg:  err.Error(),
		}
		if createErr := p.store.CreateExecution(exec); createErr != nil {
			p.logger.Error("failed to create execution record", "error", createErr)
		}
		if err := p.store.UpdateTaskState(t.ID, task.StateFailed); err != nil {
			p.logger.Error("failed to update task state", "taskID", t.ID, "state", task.StateFailed, "error", err)
		}
		p.resultCh <- &Result{TaskID: t.ID, Execution: exec, Err: err}
		return
	}
	// Wait for all dependencies to complete before starting execution.
	if len(t.DependsOn) > 0 {
		if err := p.waitForDependencies(ctx, t); err != nil {
			// A dependency failed (or the context ended); record a
			// zero-duration FAILED execution and bail out.
			now := time.Now().UTC()
			exec := &storage.Execution{
				ID:        uuid.New().String(),
				TaskID:    t.ID,
				StartTime: now,
				EndTime:   now,
				Status:    "FAILED",
				ErrorMsg:  err.Error(),
			}
			if createErr := p.store.CreateExecution(exec); createErr != nil {
				p.logger.Error("failed to create execution record", "error", createErr)
			}
			if err := p.store.UpdateTaskState(t.ID, task.StateFailed); err != nil {
				p.logger.Error("failed to update task state", "taskID", t.ID, "state", task.StateFailed, "error", err)
			}
			p.resultCh <- &Result{TaskID: t.ID, Execution: exec, Err: err}
			return
		}
	}
	execID := uuid.New().String()
	exec := &storage.Execution{
		ID:        execID,
		TaskID:    t.ID,
		StartTime: time.Now().UTC(),
		Status:    "RUNNING",
	}
	// Pre-populate log paths so they're available in the DB immediately —
	// before the subprocess starts — enabling live tailing and debugging.
	if lp, ok := runner.(LogPather); ok {
		if logDir := lp.ExecLogDir(execID); logDir != "" {
			exec.StdoutPath = filepath.Join(logDir, "stdout.log")
			exec.StderrPath = filepath.Join(logDir, "stderr.log")
			exec.ArtifactDir = logDir
		}
	}
	// Record execution start.
	if err := p.store.CreateExecution(exec); err != nil {
		p.logger.Error("failed to create execution record", "error", err)
	}
	if err := p.store.UpdateTaskState(t.ID, task.StateRunning); err != nil {
		p.logger.Error("failed to update task state", "error", err)
	}
	// Apply task timeout and register cancel so callers can stop this task.
	var cancel context.CancelFunc
	if t.Timeout.Duration > 0 {
		ctx, cancel = context.WithTimeout(ctx, t.Timeout.Duration)
	} else {
		ctx, cancel = context.WithCancel(ctx)
	}
	p.mu.Lock()
	p.cancels[t.ID] = cancel
	p.mu.Unlock()
	defer func() {
		cancel()
		p.mu.Lock()
		delete(p.cancels, t.ID)
		p.mu.Unlock()
	}()
	// Inject prior failure history so the agent knows what went wrong before.
	priorExecs, priorErr := p.store.ListExecutions(t.ID)
	t = withFailureHistory(t, priorExecs, priorErr)
	// Run the task.
	err = runner.Run(ctx, t, exec)
	exec.EndTime = time.Now().UTC()
	p.handleRunResult(ctx, t, exec, err, agentType)
}

// RecoverStaleRunning marks any tasks stuck in RUNNING state (from a previous
// server crash or restart) as FAILED, then immediately re-queues them for
// retry. It also closes any open RUNNING execution records for those tasks.
// Call this once on server startup.
func (p *Pool) RecoverStaleRunning(ctx context.Context) {
	tasks, err := p.store.ListTasks(storage.TaskFilter{State: task.StateRunning})
	if err != nil {
		p.logger.Error("RecoverStaleRunning: list tasks", "error", err)
		return
	}
	for _, t := range tasks {
		p.logger.Warn("recovering stale RUNNING task", "taskID", t.ID, "name", t.Name)
		// Close any open execution records.
		execs, err := p.store.ListExecutions(t.ID)
		if err == nil {
			for _, e := range execs {
				if e.Status == "RUNNING" {
					e.Status = "FAILED"
					e.ErrorMsg = "server restarted while task was running"
					e.EndTime = time.Now().UTC()
					if updateErr := p.store.UpdateExecution(e); updateErr != nil {
						p.logger.Error("RecoverStaleRunning: update execution", "error", updateErr, "execID", e.ID)
					}
				}
			}
		}
		if err := p.store.UpdateTaskState(t.ID, task.StateFailed); err != nil {
			p.logger.Error("RecoverStaleRunning: update task state", "error", err, "taskID", t.ID)
			continue
		}
		// Re-queue so the task retries automatically. Submit expects QUEUED state.
		if err := p.store.UpdateTaskState(t.ID, task.StateQueued); err != nil {
			p.logger.Error("RecoverStaleRunning: set queued", "error", err, "taskID", t.ID)
			continue
		}
		t.State = task.StateQueued
		if err := p.Submit(ctx, t); err != nil {
			p.logger.Error("RecoverStaleRunning: re-queue", "error", err, "taskID", t.ID)
		}
	}
}

// RecoverStaleQueued re-submits any tasks that are stuck in QUEUED state from
// a previous server instance. Call this once on server startup, after
// RecoverStaleRunning.
func (p *Pool) RecoverStaleQueued(ctx context.Context) { tasks, err := p.store.ListTasks(storage.TaskFilter{State: task.StateQueued}) if err != nil { p.logger.Error("RecoverStaleQueued: list tasks", "error", err) return } for _, t := range tasks { p.logger.Info("resubmitting stale QUEUED task", "taskID", t.ID, "name", t.Name) if err := p.Submit(ctx, t); err != nil { p.logger.Error("RecoverStaleQueued: submit", "error", err, "taskID", t.ID) } } } // terminalFailureStates are dependency states that cause the waiting task to fail immediately. var terminalFailureStates = map[task.State]bool{ task.StateFailed: true, task.StateTimedOut: true, task.StateCancelled: true, task.StateBudgetExceeded: true, } // withFailureHistory returns a shallow copy of t with prior failed execution // error messages prepended to SystemPromptAppend so the agent knows what went // wrong in previous attempts. func withFailureHistory(t *task.Task, execs []*storage.Execution, err error) *task.Task { if err != nil || len(execs) == 0 { return t } var failures []storage.Execution for _, e := range execs { if (e.Status == "FAILED" || e.Status == "TIMED_OUT") && e.ErrorMsg != "" { failures = append(failures, *e) } } if len(failures) == 0 { return t } var sb strings.Builder sb.WriteString("## Prior Attempt History\n\n") sb.WriteString("This task has failed before. Do not repeat the same mistakes.\n\n") for i, f := range failures { fmt.Fprintf(&sb, "**Attempt %d** (%s) — %s:\n%s\n\n", i+1, f.StartTime.Format("2006-01-02 15:04 UTC"), f.Status, f.ErrorMsg) } sb.WriteString("---\n\n") copy := *t copy.Agent = t.Agent if copy.Agent.SystemPromptAppend != "" { copy.Agent.SystemPromptAppend = sb.String() + copy.Agent.SystemPromptAppend } else { copy.Agent.SystemPromptAppend = sb.String() } return © } // maybeUnblockParent transitions the parent task from BLOCKED to READY if all // of its subtasks are in the COMPLETED state. If any subtask is not COMPLETED // (including FAILED, CANCELLED, RUNNING, etc.) 
the parent stays BLOCKED. func (p *Pool) maybeUnblockParent(parentID string) { parent, err := p.store.GetTask(parentID) if err != nil { p.logger.Error("maybeUnblockParent: get parent", "parentID", parentID, "error", err) return } if parent.State != task.StateBlocked { return } subtasks, err := p.store.ListSubtasks(parentID) if err != nil { p.logger.Error("maybeUnblockParent: list subtasks", "parentID", parentID, "error", err) return } for _, sub := range subtasks { if sub.State != task.StateCompleted { return } } if err := p.store.UpdateTaskState(parentID, task.StateReady); err != nil { p.logger.Error("maybeUnblockParent: update parent state", "parentID", parentID, "error", err) } } // waitForDependencies polls storage until all tasks in t.DependsOn reach COMPLETED, // or until a dependency enters a terminal failure state or the context is cancelled. func (p *Pool) waitForDependencies(ctx context.Context, t *task.Task) error { for { allDone := true for _, depID := range t.DependsOn { dep, err := p.store.GetTask(depID) if err != nil { return fmt.Errorf("dependency %q not found: %w", depID, err) } if dep.State == task.StateCompleted { continue } if terminalFailureStates[dep.State] { return fmt.Errorf("dependency %q ended in state %s", depID, dep.State) } allDone = false } if allDone { return nil } select { case <-ctx.Done(): return ctx.Err() case <-time.After(p.depPollInterval): } } }