package executor

import (
	"context"
	"errors"
	"fmt"
	"log/slog"
	"path/filepath"
	"strings"
	"sync"
	"time"

	"github.com/thepeterstone/claudomator/internal/storage"
	"github.com/thepeterstone/claudomator/internal/task"
	"github.com/google/uuid"
)

// Store is the subset of storage.DB methods used by the Pool.
// Defining it as an interface allows test doubles to be injected.
type Store interface {
	GetTask(id string) (*task.Task, error)
	ListTasks(filter storage.TaskFilter) ([]*task.Task, error)
	ListSubtasks(parentID string) ([]*task.Task, error)
	ListExecutions(taskID string) ([]*storage.Execution, error)
	CreateExecution(e *storage.Execution) error
	UpdateExecution(e *storage.Execution) error
	UpdateTaskState(id string, newState task.State) error
	UpdateTaskQuestion(taskID, questionJSON string) error
	UpdateTaskSummary(taskID, summary string) error
	AppendTaskInteraction(taskID string, interaction task.Interaction) error
	UpdateTaskAgent(id string, agent task.AgentConfig) error
}

// LogPather is an optional interface runners can implement to provide the log
// directory for an execution before it starts. The pool uses this to persist
// log paths at CreateExecution time rather than waiting until execution ends.
type LogPather interface {
	ExecLogDir(execID string) string
}

// Runner executes a single task and returns the result.
type Runner interface {
	Run(ctx context.Context, t *task.Task, exec *storage.Execution) error
}

// workItem is an entry in the pool's internal work queue.
type workItem struct {
	ctx  context.Context
	task *task.Task
	exec *storage.Execution // non-nil for resume submissions
}

// Pool manages a bounded set of concurrent task workers.
type Pool struct {
	maxConcurrent   int
	runners         map[string]Runner
	store           Store
	logger          *slog.Logger
	depPollInterval time.Duration // how often waitForDependencies polls; defaults to 5s

	mu             sync.Mutex
	active         int
	activePerAgent map[string]int
	rateLimited    map[string]time.Time // agentType -> until
	cancels        map[string]context.CancelFunc // taskID → cancel
	resultCh       chan *Result
	workCh         chan workItem  // internal bounded queue; Submit enqueues here
	doneCh         chan struct{}  // signals when a worker slot is freed
	Questions      *QuestionRegistry
	Classifier     *Classifier
}

// Result is emitted when a task execution completes.
type Result struct {
	TaskID    string
	Execution *storage.Execution
	Err       error
}

func NewPool(maxConcurrent int, runners map[string]Runner, store Store, logger *slog.Logger) *Pool {
	if maxConcurrent < 1 {
		maxConcurrent = 1
	}
	p := &Pool{
		maxConcurrent:   maxConcurrent,
		runners:         runners,
		store:           store,
		logger:          logger,
		depPollInterval: 5 * time.Second,
		activePerAgent:  make(map[string]int),
		rateLimited:     make(map[string]time.Time),
		cancels:         make(map[string]context.CancelFunc),
		resultCh:        make(chan *Result, maxConcurrent*2),
		workCh:          make(chan workItem, maxConcurrent*10+100),
		doneCh:          make(chan struct{}, maxConcurrent),
		Questions:       NewQuestionRegistry(),
	}
	go p.dispatch()
	return p
}

// dispatch is a long-running goroutine that reads from the internal work queue
// and launches goroutines as soon as a pool slot is available. This prevents
// tasks from being rejected when the pool is temporarily at capacity.
func (p *Pool) dispatch() {
	for item := range p.workCh {
		for {
			p.mu.Lock()
			if p.active < p.maxConcurrent {
				p.active++
				p.mu.Unlock()
				if item.exec != nil {
					go p.executeResume(item.ctx, item.task, item.exec)
				} else {
					go p.execute(item.ctx, item.task)
				}
				break
			}
			p.mu.Unlock()
			<-p.doneCh // wait for a worker to finish
		}
	}
}

// Submit enqueues a task for execution. Returns an error only if the internal
// work queue is full. When the pool is at capacity the task is buffered and
// dispatched as soon as a slot becomes available.
func (p *Pool) Submit(ctx context.Context, t *task.Task) error {
	select {
	case p.workCh <- workItem{ctx: ctx, task: t}:
		return nil
	default:
		return fmt.Errorf("executor work queue full (capacity %d)", cap(p.workCh))
	}
}

// Results returns the channel for reading execution results.
func (p *Pool) Results() <-chan *Result {
	return p.resultCh
}

// Cancel requests cancellation of a running task. Returns false if the task
// is not currently running in this pool.
func (p *Pool) Cancel(taskID string) bool {
	p.mu.Lock()
	cancel, ok := p.cancels[taskID]
	p.mu.Unlock()
	if !ok {
		return false
	}
	cancel()
	return true
}

// resumablePoolStates are the task states that may be submitted for session resume.
var resumablePoolStates = map[task.State]bool{
	task.StateBlocked:       true,
	task.StateTimedOut:      true,
	task.StateCancelled:     true,
	task.StateFailed:        true,
	task.StateBudgetExceeded: true,
}

// SubmitResume re-queues a blocked or interrupted task using the provided resume execution.
// The execution must have ResumeSessionID and ResumeAnswer set.
func (p *Pool) SubmitResume(ctx context.Context, t *task.Task, exec *storage.Execution) error {
	if !resumablePoolStates[t.State] {
		return fmt.Errorf("task %s must be in a resumable state to resume (current: %s)", t.ID, t.State)
	}
	if exec.ResumeSessionID == "" {
		return fmt.Errorf("resume execution for task %s must have a ResumeSessionID", t.ID)
	}
	select {
	case p.workCh <- workItem{ctx: ctx, task: t, exec: exec}:
		return nil
	default:
		return fmt.Errorf("executor work queue full (capacity %d)", cap(p.workCh))
	}
}

func (p *Pool) getRunner(t *task.Task) (Runner, error) {
	agentType := t.Agent.Type
	if agentType == "" {
		agentType = "claude" // Default for backward compatibility
	}
	runner, ok := p.runners[agentType]
	if !ok {
		return nil, fmt.Errorf("unsupported agent type: %q", agentType)
	}
	return runner, nil
}

func (p *Pool) executeResume(ctx context.Context, t *task.Task, exec *storage.Execution) {
	agentType := t.Agent.Type
	if agentType == "" {
		agentType = "claude"
	}

	p.mu.Lock()
	p.activePerAgent[agentType]++
	p.mu.Unlock()

	defer func() {
		p.mu.Lock()
		p.active--
		p.activePerAgent[agentType]--
		if p.activePerAgent[agentType] == 0 {
			delete(p.activePerAgent, agentType)
		}
		p.mu.Unlock()
		select {
		case p.doneCh <- struct{}{}:
		default:
		}
	}()

	runner, err := p.getRunner(t)
	if err != nil {
		p.logger.Error("failed to get runner for resume", "error", err, "taskID", t.ID)
		p.resultCh <- &Result{TaskID: t.ID, Execution: exec, Err: err}
		return
	}

	// Pre-populate log paths.
	if lp, ok := runner.(LogPather); ok {
		if logDir := lp.ExecLogDir(exec.ID); logDir != "" {
			exec.StdoutPath = filepath.Join(logDir, "stdout.log")
			exec.StderrPath = filepath.Join(logDir, "stderr.log")
			exec.ArtifactDir = logDir
		}
	}
	exec.StartTime = time.Now().UTC()
	exec.Status = "RUNNING"

	if err := p.store.CreateExecution(exec); err != nil {
		p.logger.Error("failed to create resume execution record", "error", err)
	}
	if err := p.store.UpdateTaskState(t.ID, task.StateRunning); err != nil {
		p.logger.Error("failed to update task state", "error", err)
	}

	var cancel context.CancelFunc
	if t.Timeout.Duration > 0 {
		ctx, cancel = context.WithTimeout(ctx, t.Timeout.Duration)
	} else {
		ctx, cancel = context.WithCancel(ctx)
	}
	p.mu.Lock()
	p.cancels[t.ID] = cancel
	p.mu.Unlock()
	defer func() {
		cancel()
		p.mu.Lock()
		delete(p.cancels, t.ID)
		p.mu.Unlock()
	}()

	err = runner.Run(ctx, t, exec)
	exec.EndTime = time.Now().UTC()

	p.handleRunResult(ctx, t, exec, err, agentType)
}

// handleRunResult applies the shared post-run error-classification and
// state-update logic used by both execute() and executeResume(). It sets
// exec.Status and exec.ErrorMsg, updates storage, and emits the result to
// resultCh. The caller must set exec.EndTime before calling.
func (p *Pool) handleRunResult(ctx context.Context, t *task.Task, exec *storage.Execution, err error, agentType string) {
	if err != nil {
		if isRateLimitError(err) || isQuotaExhausted(err) {
			p.mu.Lock()
			retryAfter := parseRetryAfter(err.Error())
			if retryAfter == 0 {
				if isQuotaExhausted(err) {
					retryAfter = 5 * time.Hour
				} else {
					retryAfter = 1 * time.Minute
				}
			}
			p.rateLimited[agentType] = time.Now().Add(retryAfter)
			p.logger.Info("agent rate limited", "agent", agentType, "retryAfter", retryAfter, "quotaExhausted", isQuotaExhausted(err))
			p.mu.Unlock()
		}

		var blockedErr *BlockedError
		if errors.As(err, &blockedErr) {
			exec.Status = "BLOCKED"
			exec.SandboxDir = blockedErr.SandboxDir // preserve so resume runs in same dir
			if err := p.store.UpdateTaskState(t.ID, task.StateBlocked); err != nil {
				p.logger.Error("failed to update task state", "taskID", t.ID, "state", task.StateBlocked, "error", err)
			}
			if err := p.store.UpdateTaskQuestion(t.ID, blockedErr.QuestionJSON); err != nil {
				p.logger.Error("failed to update task question", "taskID", t.ID, "error", err)
			}
		} else if ctx.Err() == context.DeadlineExceeded {
			exec.Status = "TIMED_OUT"
			exec.ErrorMsg = "execution timed out"
			if err := p.store.UpdateTaskState(t.ID, task.StateTimedOut); err != nil {
				p.logger.Error("failed to update task state", "taskID", t.ID, "state", task.StateTimedOut, "error", err)
			}
		} else if ctx.Err() == context.Canceled {
			exec.Status = "CANCELLED"
			exec.ErrorMsg = "execution cancelled"
			if err := p.store.UpdateTaskState(t.ID, task.StateCancelled); err != nil {
				p.logger.Error("failed to update task state", "taskID", t.ID, "state", task.StateCancelled, "error", err)
			}
		} else if isQuotaExhausted(err) {
			exec.Status = "BUDGET_EXCEEDED"
			exec.ErrorMsg = err.Error()
			if err := p.store.UpdateTaskState(t.ID, task.StateBudgetExceeded); err != nil {
				p.logger.Error("failed to update task state", "taskID", t.ID, "state", task.StateBudgetExceeded, "error", err)
			}
		} else {
			exec.Status = "FAILED"
			exec.ErrorMsg = err.Error()
			if err := p.store.UpdateTaskState(t.ID, task.StateFailed); err != nil {
				p.logger.Error("failed to update task state", "taskID", t.ID, "state", task.StateFailed, "error", err)
			}
		}
	} else {
		if t.ParentTaskID == "" {
			subtasks, subErr := p.store.ListSubtasks(t.ID)
			if subErr != nil {
				p.logger.Error("failed to list subtasks", "taskID", t.ID, "error", subErr)
			}
			if subErr == nil && len(subtasks) > 0 {
				exec.Status = "BLOCKED"
				if err := p.store.UpdateTaskState(t.ID, task.StateBlocked); err != nil {
					p.logger.Error("failed to update task state", "taskID", t.ID, "state", task.StateBlocked, "error", err)
				}
			} else {
				exec.Status = "READY"
				if err := p.store.UpdateTaskState(t.ID, task.StateReady); err != nil {
					p.logger.Error("failed to update task state", "taskID", t.ID, "state", task.StateReady, "error", err)
				}
			}
		} else {
			exec.Status = "COMPLETED"
			if err := p.store.UpdateTaskState(t.ID, task.StateCompleted); err != nil {
				p.logger.Error("failed to update task state", "taskID", t.ID, "state", task.StateCompleted, "error", err)
			}
			p.maybeUnblockParent(t.ParentTaskID)
		}
	}

	summary := exec.Summary
	if summary == "" && exec.StdoutPath != "" {
		summary = extractSummary(exec.StdoutPath)
	}
	if summary != "" {
		if summaryErr := p.store.UpdateTaskSummary(t.ID, summary); summaryErr != nil {
			p.logger.Error("failed to update task summary", "taskID", t.ID, "error", summaryErr)
		}
	}
	if updateErr := p.store.UpdateExecution(exec); updateErr != nil {
		p.logger.Error("failed to update execution", "error", updateErr)
	}
	p.resultCh <- &Result{TaskID: t.ID, Execution: exec, Err: err}
}

// ActiveCount returns the number of currently running tasks.
func (p *Pool) ActiveCount() int {
	p.mu.Lock()
	defer p.mu.Unlock()
	return p.active
}

// pickAgent selects the best agent from the given SystemStatus using explicit
// load balancing: prefer the available (non-rate-limited) agent with the fewest
// active tasks. If all agents are rate-limited, fall back to fewest active.
func pickAgent(status SystemStatus) string {
	best := ""
	bestActive := -1

	// First pass: only consider non-rate-limited agents.
	for agent, active := range status.ActiveTasks {
		if status.RateLimited[agent] {
			continue
		}
		if bestActive == -1 || active < bestActive || (active == bestActive && agent < best) {
			best = agent
			bestActive = active
		}
	}
	if best != "" {
		return best
	}

	// Fallback: all rate-limited — pick least active anyway.
	for agent, active := range status.ActiveTasks {
		if bestActive == -1 || active < bestActive || (active == bestActive && agent < best) {
			best = agent
			bestActive = active
		}
	}
	return best
}

func (p *Pool) execute(ctx context.Context, t *task.Task) {
	// 1. Load-balanced agent selection + model classification.
	p.mu.Lock()
	activeTasks := make(map[string]int)
	rateLimited := make(map[string]bool)
	now := time.Now()
	for agent := range p.runners {
		activeTasks[agent] = p.activePerAgent[agent]
		if deadline, ok := p.rateLimited[agent]; ok && now.After(deadline) {
			delete(p.rateLimited, agent)
		}
		rateLimited[agent] = now.Before(p.rateLimited[agent])
	}
	status := SystemStatus{
		ActiveTasks: activeTasks,
		RateLimited: rateLimited,
	}
	p.mu.Unlock()

	// If a specific agent is already requested, skip selection and classification.
	skipClassification := t.Agent.Type == "claude" || t.Agent.Type == "gemini"

	if !skipClassification {
		// Deterministically pick the agent with fewest active tasks.
		selectedAgent := pickAgent(status)
		if selectedAgent != "" {
			t.Agent.Type = selectedAgent
		}

		if p.Classifier != nil {
			cls, err := p.Classifier.Classify(ctx, t.Name, t.Agent.Instructions, status, t.Agent.Type)
			if err == nil {
				p.logger.Info("task classified", "taskID", t.ID, "agent", t.Agent.Type, "model", cls.Model, "reason", cls.Reason)
				t.Agent.Model = cls.Model
			} else {
				p.logger.Error("classification failed", "error", err, "taskID", t.ID)
			}
		}
	}

	// Persist the assigned agent (and model) to the database before running.
	if err := p.store.UpdateTaskAgent(t.ID, t.Agent); err != nil {
		p.logger.Error("failed to persist agent config", "error", err, "taskID", t.ID)
	}

	agentType := t.Agent.Type
	if agentType == "" {
		agentType = "claude"
	}

	p.mu.Lock()
	if deadline, ok := p.rateLimited[agentType]; ok && time.Now().After(deadline) {
		delete(p.rateLimited, agentType)
	}
	p.activePerAgent[agentType]++
	p.mu.Unlock()

	defer func() {
		p.mu.Lock()
		p.active--
		p.activePerAgent[agentType]--
		if p.activePerAgent[agentType] == 0 {
			delete(p.activePerAgent, agentType)
		}
		p.mu.Unlock()
		select {
		case p.doneCh <- struct{}{}:
		default:
		}
	}()

	runner, err := p.getRunner(t)
	if err != nil {
		p.logger.Error("failed to get runner", "error", err, "taskID", t.ID)
		now := time.Now().UTC()
		exec := &storage.Execution{
			ID:        uuid.New().String(),
			TaskID:    t.ID,
			StartTime: now,
			EndTime:   now,
			Status:    "FAILED",
			ErrorMsg:  err.Error(),
		}
		if createErr := p.store.CreateExecution(exec); createErr != nil {
			p.logger.Error("failed to create execution record", "error", createErr)
		}
		if err := p.store.UpdateTaskState(t.ID, task.StateFailed); err != nil {
			p.logger.Error("failed to update task state", "taskID", t.ID, "state", task.StateFailed, "error", err)
		}
		p.resultCh <- &Result{TaskID: t.ID, Execution: exec, Err: err}
		return
	}

	// Wait for all dependencies to complete before starting execution.
	if len(t.DependsOn) > 0 {
		if err := p.waitForDependencies(ctx, t); err != nil {
			now := time.Now().UTC()
			exec := &storage.Execution{
				ID:        uuid.New().String(),
				TaskID:    t.ID,
				StartTime: now,
				EndTime:   now,
				Status:    "FAILED",
				ErrorMsg:  err.Error(),
			}
			if createErr := p.store.CreateExecution(exec); createErr != nil {
				p.logger.Error("failed to create execution record", "error", createErr)
			}
			if err := p.store.UpdateTaskState(t.ID, task.StateFailed); err != nil {
				p.logger.Error("failed to update task state", "taskID", t.ID, "state", task.StateFailed, "error", err)
			}
			p.resultCh <- &Result{TaskID: t.ID, Execution: exec, Err: err}
			return
		}
	}

	execID := uuid.New().String()
	exec := &storage.Execution{
		ID:        execID,
		TaskID:    t.ID,
		StartTime: time.Now().UTC(),
		Status:    "RUNNING",
	}

	// Pre-populate log paths so they're available in the DB immediately —
	// before the subprocess starts — enabling live tailing and debugging.
	if lp, ok := runner.(LogPather); ok {
		if logDir := lp.ExecLogDir(execID); logDir != "" {
			exec.StdoutPath = filepath.Join(logDir, "stdout.log")
			exec.StderrPath = filepath.Join(logDir, "stderr.log")
			exec.ArtifactDir = logDir
		}
	}

	// Record execution start.
	if err := p.store.CreateExecution(exec); err != nil {
		p.logger.Error("failed to create execution record", "error", err)
	}
	if err := p.store.UpdateTaskState(t.ID, task.StateRunning); err != nil {
		p.logger.Error("failed to update task state", "error", err)
	}

	// Apply task timeout and register cancel so callers can stop this task.
	var cancel context.CancelFunc
	if t.Timeout.Duration > 0 {
		ctx, cancel = context.WithTimeout(ctx, t.Timeout.Duration)
	} else {
		ctx, cancel = context.WithCancel(ctx)
	}
	p.mu.Lock()
	p.cancels[t.ID] = cancel
	p.mu.Unlock()
	defer func() {
		cancel()
		p.mu.Lock()
		delete(p.cancels, t.ID)
		p.mu.Unlock()
	}()

	// Inject prior failure history so the agent knows what went wrong before.
	priorExecs, priorErr := p.store.ListExecutions(t.ID)
	t = withFailureHistory(t, priorExecs, priorErr)

	// Run the task.
	err = runner.Run(ctx, t, exec)
	exec.EndTime = time.Now().UTC()

	p.handleRunResult(ctx, t, exec, err, agentType)
}

// RecoverStaleRunning marks any tasks stuck in RUNNING state (from a previous
// server crash or restart) as FAILED, then immediately re-queues them for
// retry. It also closes any open RUNNING execution records for those tasks.
// Call this once on server startup.
func (p *Pool) RecoverStaleRunning(ctx context.Context) {
	tasks, err := p.store.ListTasks(storage.TaskFilter{State: task.StateRunning})
	if err != nil {
		p.logger.Error("RecoverStaleRunning: list tasks", "error", err)
		return
	}
	for _, t := range tasks {
		p.logger.Warn("recovering stale RUNNING task", "taskID", t.ID, "name", t.Name)
		// Close any open execution records.
		execs, err := p.store.ListExecutions(t.ID)
		if err == nil {
			for _, e := range execs {
				if e.Status == "RUNNING" {
					e.Status = "FAILED"
					e.ErrorMsg = "server restarted while task was running"
					e.EndTime = time.Now().UTC()
					if updateErr := p.store.UpdateExecution(e); updateErr != nil {
						p.logger.Error("RecoverStaleRunning: update execution", "error", updateErr, "execID", e.ID)
					}
				}
			}
		}
		if err := p.store.UpdateTaskState(t.ID, task.StateFailed); err != nil {
			p.logger.Error("RecoverStaleRunning: update task state", "error", err, "taskID", t.ID)
			continue
		}
		// Re-queue so the task retries automatically. Submit expects QUEUED state.
		if err := p.store.UpdateTaskState(t.ID, task.StateQueued); err != nil {
			p.logger.Error("RecoverStaleRunning: set queued", "error", err, "taskID", t.ID)
			continue
		}
		t.State = task.StateQueued
		if err := p.Submit(ctx, t); err != nil {
			p.logger.Error("RecoverStaleRunning: re-queue", "error", err, "taskID", t.ID)
		}
	}
}

// RecoverStaleQueued re-submits any tasks that are stuck in QUEUED state from
// a previous server instance. Call this once on server startup, after
// RecoverStaleRunning.
func (p *Pool) RecoverStaleQueued(ctx context.Context) {
	tasks, err := p.store.ListTasks(storage.TaskFilter{State: task.StateQueued})
	if err != nil {
		p.logger.Error("RecoverStaleQueued: list tasks", "error", err)
		return
	}
	for _, t := range tasks {
		p.logger.Info("resubmitting stale QUEUED task", "taskID", t.ID, "name", t.Name)
		if err := p.Submit(ctx, t); err != nil {
			p.logger.Error("RecoverStaleQueued: submit", "error", err, "taskID", t.ID)
		}
	}
}

// terminalFailureStates are dependency states that cause the waiting task to fail immediately.
var terminalFailureStates = map[task.State]bool{
	task.StateFailed:         true,
	task.StateTimedOut:       true,
	task.StateCancelled:      true,
	task.StateBudgetExceeded: true,
}

// withFailureHistory returns a shallow copy of t with prior failed execution
// error messages prepended to SystemPromptAppend so the agent knows what went
// wrong in previous attempts.
func withFailureHistory(t *task.Task, execs []*storage.Execution, err error) *task.Task {
	if err != nil || len(execs) == 0 {
		return t
	}

	var failures []storage.Execution
	for _, e := range execs {
		if (e.Status == "FAILED" || e.Status == "TIMED_OUT") && e.ErrorMsg != "" {
			failures = append(failures, *e)
		}
	}
	if len(failures) == 0 {
		return t
	}

	var sb strings.Builder
	sb.WriteString("## Prior Attempt History\n\n")
	sb.WriteString("This task has failed before. Do not repeat the same mistakes.\n\n")
	for i, f := range failures {
		fmt.Fprintf(&sb, "**Attempt %d** (%s) — %s:\n%s\n\n",
			i+1, f.StartTime.Format("2006-01-02 15:04 UTC"), f.Status, f.ErrorMsg)
	}
	sb.WriteString("---\n\n")

	copy := *t
	copy.Agent = t.Agent
	if copy.Agent.SystemPromptAppend != "" {
		copy.Agent.SystemPromptAppend = sb.String() + copy.Agent.SystemPromptAppend
	} else {
		copy.Agent.SystemPromptAppend = sb.String()
	}
	return &copy
}

// maybeUnblockParent transitions the parent task from BLOCKED to READY if all
// of its subtasks are in the COMPLETED state. If any subtask is not COMPLETED
// (including FAILED, CANCELLED, RUNNING, etc.) the parent stays BLOCKED.
func (p *Pool) maybeUnblockParent(parentID string) {
	parent, err := p.store.GetTask(parentID)
	if err != nil {
		p.logger.Error("maybeUnblockParent: get parent", "parentID", parentID, "error", err)
		return
	}
	if parent.State != task.StateBlocked {
		return
	}
	subtasks, err := p.store.ListSubtasks(parentID)
	if err != nil {
		p.logger.Error("maybeUnblockParent: list subtasks", "parentID", parentID, "error", err)
		return
	}
	for _, sub := range subtasks {
		if sub.State != task.StateCompleted {
			return
		}
	}
	if err := p.store.UpdateTaskState(parentID, task.StateReady); err != nil {
		p.logger.Error("maybeUnblockParent: update parent state", "parentID", parentID, "error", err)
	}
}

// waitForDependencies polls storage until all tasks in t.DependsOn reach COMPLETED,
// or until a dependency enters a terminal failure state or the context is cancelled.
func (p *Pool) waitForDependencies(ctx context.Context, t *task.Task) error {
	for {
		allDone := true
		for _, depID := range t.DependsOn {
			dep, err := p.store.GetTask(depID)
			if err != nil {
				return fmt.Errorf("dependency %q not found: %w", depID, err)
			}
			if dep.State == task.StateCompleted {
				continue
			}
			if terminalFailureStates[dep.State] {
				return fmt.Errorf("dependency %q ended in state %s", depID, dep.State)
			}
			allDone = false
		}
		if allDone {
			return nil
		}
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-time.After(p.depPollInterval):
		}
	}
}