package executor

import (
	"context"
	"errors"
	"fmt"
	"log/slog"
	"path/filepath"
	"strings"
	"sync"
	"time"

	"github.com/thepeterstone/claudomator/internal/storage"
	"github.com/thepeterstone/claudomator/internal/task"
	"github.com/google/uuid"
)

// LogPather is an optional interface runners can implement to provide the log
// directory for an execution before it starts. The pool uses this to persist
// log paths at CreateExecution time rather than waiting until execution ends.
//
// When ExecLogDir returns a non-empty directory, the pool records
// <dir>/stdout.log and <dir>/stderr.log as the execution's output paths and
// <dir> itself as its artifact directory. An empty result leaves those fields
// untouched.
type LogPather interface {
	// ExecLogDir returns the log directory for the execution with the given ID.
	// The pool ignores an empty string.
	ExecLogDir(execID string) string
}

// Runner executes a single task and returns the result.
//
// The pool calls Run with a context that is cancelled by Pool.Cancel and, when
// the task has a positive timeout, expires after that timeout. A nil error
// means success. An error that unwraps to *BlockedError marks the task as
// blocked; other errors are classified by the pool (timeout, cancellation,
// quota exhaustion, or plain failure).
type Runner interface {
	Run(ctx context.Context, t *task.Task, exec *storage.Execution) error
}

// workItem is an entry in the pool's internal work queue.
//
// The dispatcher inspects exec to pick the code path: a nil exec starts a
// fresh execution, a non-nil exec is handed to the resume path together with
// the task.
type workItem struct {
	ctx  context.Context
	task *task.Task
	exec *storage.Execution // non-nil for resume submissions
}

// Pool manages a bounded set of concurrent task workers.
//
// Work enters through Submit/SubmitResume, is buffered in workCh, and is
// started by the dispatch goroutine once fewer than maxConcurrent workers are
// active. Each finished worker publishes a Result on resultCh and then pings
// doneCh so the dispatcher re-checks capacity.
//
// mu guards active, activePerAgent, rateLimited and cancels. runners is keyed
// by agent type. Classifier is optional: when non-nil it is consulted before a
// fresh execution to choose the agent type and model. Questions is initialised
// by NewPool.
type Pool struct {
	maxConcurrent   int
	runners         map[string]Runner
	store           *storage.DB
	logger          *slog.Logger
	depPollInterval time.Duration // how often waitForDependencies polls; defaults to 5s

	mu             sync.Mutex
	active         int
	activePerAgent map[string]int
	rateLimited    map[string]time.Time // agentType -> until
	cancels        map[string]context.CancelFunc // taskID → cancel
	resultCh       chan *Result
	workCh         chan workItem  // internal bounded queue; Submit enqueues here
	doneCh         chan struct{}  // signals when a worker slot is freed
	Questions      *QuestionRegistry
	Classifier     *Classifier
}

// Result is emitted when a task execution completes.
//
// Execution is the record the pool created (or, for resumes, the record the
// caller supplied). Err is nil on success; otherwise it is the error returned
// by the runner, or the runner-lookup / dependency-wait error that prevented
// the runner from being invoked.
type Result struct {
	TaskID    string
	Execution *storage.Execution
	Err       error
}

// NewPool builds a Pool that runs at most maxConcurrent tasks at once and
// starts its dispatcher goroutine. Values of maxConcurrent below 1 are treated
// as 1. The result channel is sized at twice the concurrency limit and the
// work queue at ten times the limit plus 100.
func NewPool(maxConcurrent int, runners map[string]Runner, store *storage.DB, logger *slog.Logger) *Pool {
	limit := maxConcurrent
	if limit <= 0 {
		limit = 1
	}

	p := new(Pool)
	p.maxConcurrent = limit
	p.runners = runners
	p.store = store
	p.logger = logger
	p.depPollInterval = 5 * time.Second

	p.activePerAgent = map[string]int{}
	p.rateLimited = map[string]time.Time{}
	p.cancels = map[string]context.CancelFunc{}

	p.resultCh = make(chan *Result, 2*limit)
	p.workCh = make(chan workItem, 10*limit+100)
	p.doneCh = make(chan struct{}, limit)

	p.Questions = NewQuestionRegistry()

	go p.dispatch()
	return p
}

// dispatch drains the work queue for the lifetime of the pool. For every
// queued item it blocks until a worker slot can be claimed, then starts the
// matching worker goroutine. Because items wait here instead of being
// rejected, a pool that is momentarily full never drops submitted work.
func (p *Pool) dispatch() {
	for next := range p.workCh {
		// Claim a slot, sleeping on doneCh between attempts. doneCh is only a
		// wake-up hint; capacity is always re-checked under the mutex.
		for claimed := false; !claimed; {
			p.mu.Lock()
			claimed = p.active < p.maxConcurrent
			if claimed {
				p.active++
			}
			p.mu.Unlock()

			if !claimed {
				<-p.doneCh
			}
		}

		if next.exec == nil {
			go p.execute(next.ctx, next.task)
			continue
		}
		go p.executeResume(next.ctx, next.task, next.exec)
	}
}

// Submit places t on the pool's work queue without blocking. The task is run
// as soon as a worker slot is free; a full pool only delays it. The sole error
// condition is a full work queue.
func (p *Pool) Submit(ctx context.Context, t *task.Task) error {
	item := workItem{ctx: ctx, task: t}
	select {
	case p.workCh <- item:
	default:
		return fmt.Errorf("executor work queue full (capacity %d)", cap(p.workCh))
	}
	return nil
}

// Results exposes the receive-only side of the channel on which the pool
// publishes one Result per finished execution.
func (p *Pool) Results() <-chan *Result {
	var results <-chan *Result = p.resultCh
	return results
}

// Cancel asks the running task identified by taskID to stop by invoking its
// registered cancel function. It reports whether such a task was found; false
// means the task is not currently running in this pool.
func (p *Pool) Cancel(taskID string) bool {
	p.mu.Lock()
	stop, running := p.cancels[taskID]
	p.mu.Unlock()

	// Invoke the cancel func outside the lock.
	if running {
		stop()
	}
	return running
}

// SubmitResume re-queues a blocked task using the provided resume execution.
// The execution must have ResumeSessionID and ResumeAnswer set (only
// ResumeSessionID is validated here).
//
// It returns an error, without enqueueing anything, when:
//   - t or exec is nil,
//   - t is not in the BLOCKED or TIMED_OUT state,
//   - exec.ResumeSessionID is empty, or
//   - the internal work queue is full.
func (p *Pool) SubmitResume(ctx context.Context, t *task.Task, exec *storage.Execution) error {
	// Guard against nil inputs: dereferencing them below would panic, and the
	// dispatcher treats a nil exec as a fresh (non-resume) submission.
	if t == nil {
		return errors.New("resume requires a non-nil task")
	}
	if exec == nil {
		return fmt.Errorf("resume of task %s requires a non-nil execution", t.ID)
	}
	if t.State != task.StateBlocked && t.State != task.StateTimedOut {
		return fmt.Errorf("task %s must be in BLOCKED or TIMED_OUT state to resume (current: %s)", t.ID, t.State)
	}
	if exec.ResumeSessionID == "" {
		return fmt.Errorf("resume execution for task %s must have a ResumeSessionID", t.ID)
	}
	select {
	case p.workCh <- workItem{ctx: ctx, task: t, exec: exec}:
		return nil
	default:
		return fmt.Errorf("executor work queue full (capacity %d)", cap(p.workCh))
	}
}

// getRunner looks up the runner registered for the task's agent type. An empty
// agent type falls back to "claude" for backward compatibility. An error is
// returned when no runner is registered under the resolved type.
func (p *Pool) getRunner(t *task.Task) (Runner, error) {
	kind := t.Agent.Type
	if kind == "" {
		kind = "claude"
	}
	if r, found := p.runners[kind]; found {
		return r, nil
	}
	return nil, fmt.Errorf("unsupported agent type: %q", kind)
}

// executeResume runs a resume submission in a worker slot that dispatch has
// already claimed (p.active was incremented before this goroutine started).
//
// It records exec as RUNNING, registers a cancel func so Cancel can stop the
// task, invokes the runner, and maps the outcome to an execution status and
// task state:
//   - error unwrapping to *BlockedError: BLOCKED (question stored on the task)
//   - context deadline exceeded: TIMED_OUT
//   - context cancelled: CANCELLED
//   - isQuotaExhausted(err): BUDGET_EXCEEDED
//   - any other error: FAILED
//   - success: READY for tasks without a parent, COMPLETED otherwise
//
// A rate-limit error additionally marks the agent type as rate limited until
// the parsed retry-after (default one minute) has elapsed. Exactly one Result
// is sent on resultCh before the slot is released. Failures to persist state
// are logged and do not abort the run.
func (p *Pool) executeResume(ctx context.Context, t *task.Task, exec *storage.Execution) {
	agentType := t.Agent.Type
	if agentType == "" {
		agentType = "claude" // same default getRunner applies
	}

	p.mu.Lock()
	p.activePerAgent[agentType]++
	p.mu.Unlock()

	// Release the slot claimed by dispatch and nudge it. The send is
	// non-blocking because doneCh is only a hint; dispatch re-checks p.active.
	defer func() {
		p.mu.Lock()
		p.active--
		p.activePerAgent[agentType]--
		p.mu.Unlock()
		select {
		case p.doneCh <- struct{}{}:
		default:
		}
	}()

	// setState persists a task state transition; a storage failure is logged
	// rather than dropped so lost transitions are visible.
	setState := func(s task.State) {
		if stateErr := p.store.UpdateTaskState(t.ID, s); stateErr != nil {
			p.logger.Error("failed to update task state", "error", stateErr, "taskID", t.ID, "state", s)
		}
	}

	runner, err := p.getRunner(t)
	if err != nil {
		p.logger.Error("failed to get runner for resume", "error", err, "taskID", t.ID)
		p.resultCh <- &Result{TaskID: t.ID, Execution: exec, Err: err}
		return
	}

	// Pre-populate log paths.
	if lp, ok := runner.(LogPather); ok {
		if logDir := lp.ExecLogDir(exec.ID); logDir != "" {
			exec.StdoutPath = filepath.Join(logDir, "stdout.log")
			exec.StderrPath = filepath.Join(logDir, "stderr.log")
			exec.ArtifactDir = logDir
		}
	}
	exec.StartTime = time.Now().UTC()
	exec.Status = "RUNNING"

	if err := p.store.CreateExecution(exec); err != nil {
		p.logger.Error("failed to create resume execution record", "error", err)
	}
	setState(task.StateRunning)

	// Apply the task timeout (if any) and register the cancel func so Cancel
	// can reach this run.
	var cancel context.CancelFunc
	if t.Timeout.Duration > 0 {
		ctx, cancel = context.WithTimeout(ctx, t.Timeout.Duration)
	} else {
		ctx, cancel = context.WithCancel(ctx)
	}
	p.mu.Lock()
	p.cancels[t.ID] = cancel
	p.mu.Unlock()
	defer func() {
		cancel()
		p.mu.Lock()
		delete(p.cancels, t.ID)
		p.mu.Unlock()
	}()

	err = runner.Run(ctx, t, exec)
	exec.EndTime = time.Now().UTC()

	if err != nil && isRateLimitError(err) {
		// Parse outside the lock; only the map write needs protection.
		retryAfter := parseRetryAfter(err.Error())
		if retryAfter == 0 {
			retryAfter = 1 * time.Minute
		}
		p.mu.Lock()
		p.rateLimited[agentType] = time.Now().Add(retryAfter)
		p.mu.Unlock()
	}

	// Case order matters: a blocked runner wins over context state, and
	// context state wins over quota/failure classification.
	var blockedErr *BlockedError
	switch {
	case err == nil && t.ParentTaskID == "":
		exec.Status = "READY"
		setState(task.StateReady)
	case err == nil:
		exec.Status = "COMPLETED"
		setState(task.StateCompleted)
	case errors.As(err, &blockedErr):
		exec.Status = "BLOCKED"
		setState(task.StateBlocked)
		// NOTE(review): any return value of UpdateTaskQuestion is not inspected
		// here; confirm its signature and log failures if it returns an error.
		p.store.UpdateTaskQuestion(t.ID, blockedErr.QuestionJSON)
	case errors.Is(ctx.Err(), context.DeadlineExceeded):
		exec.Status = "TIMED_OUT"
		exec.ErrorMsg = "execution timed out"
		setState(task.StateTimedOut)
	case errors.Is(ctx.Err(), context.Canceled):
		exec.Status = "CANCELLED"
		exec.ErrorMsg = "execution cancelled"
		setState(task.StateCancelled)
	case isQuotaExhausted(err):
		exec.Status = "BUDGET_EXCEEDED"
		exec.ErrorMsg = err.Error()
		setState(task.StateBudgetExceeded)
	default:
		exec.Status = "FAILED"
		exec.ErrorMsg = err.Error()
		setState(task.StateFailed)
	}

	if updateErr := p.store.UpdateExecution(exec); updateErr != nil {
		p.logger.Error("failed to update resume execution", "error", updateErr)
	}
	p.resultCh <- &Result{TaskID: t.ID, Execution: exec, Err: err}
}

// ActiveCount reports how many worker slots are currently claimed, read under
// the pool mutex.
func (p *Pool) ActiveCount() int {
	p.mu.Lock()
	n := p.active
	p.mu.Unlock()
	return n
}

// execute runs a fresh (non-resume) task in a worker slot that dispatch has
// already claimed (p.active was incremented before this goroutine started).
//
// Steps, in order:
//  1. If a Classifier is configured, ask it to pick the agent type and model
//     based on a snapshot of per-agent load and rate-limit status. A
//     classification error is logged and the task's existing agent settings
//     are kept.
//  2. Resolve the runner and wait for every task in t.DependsOn to complete.
//     If either step fails, a zero-duration FAILED execution is recorded, the
//     task is marked failed, and the error is published as the Result.
//  3. Create the RUNNING execution record (with log paths when the runner
//     implements LogPather), apply the task timeout, and register a cancel
//     func so Cancel can stop the run.
//  4. Prepend prior failure history to the agent prompt and invoke the runner.
//  5. Map the outcome to an execution status and task state: BLOCKED,
//     TIMED_OUT, CANCELLED, BUDGET_EXCEEDED or FAILED on error; READY for
//     tasks without a parent and COMPLETED otherwise on success.
//
// Exactly one Result is sent on resultCh before the slot is released. Failures
// to persist state are logged and do not abort the run.
func (p *Pool) execute(ctx context.Context, t *task.Task) {
	// 1. Classification
	if p.Classifier != nil {
		// Snapshot load and rate-limit status under the lock, classify outside it.
		p.mu.Lock()
		now := time.Now()
		activeTasks := make(map[string]int, len(p.runners))
		rateLimited := make(map[string]bool, len(p.runners))
		for agent := range p.runners {
			activeTasks[agent] = p.activePerAgent[agent]
			rateLimited[agent] = now.Before(p.rateLimited[agent])
		}
		p.mu.Unlock()

		status := SystemStatus{
			ActiveTasks: activeTasks,
			RateLimited: rateLimited,
		}
		cls, err := p.Classifier.Classify(ctx, t.Name, t.Agent.Instructions, status)
		if err != nil {
			p.logger.Error("classification failed", "error", err, "taskID", t.ID)
		} else {
			p.logger.Info("task classified", "taskID", t.ID, "agent", cls.AgentType, "model", cls.Model, "reason", cls.Reason)
			t.Agent.Type = cls.AgentType
			t.Agent.Model = cls.Model
		}
	}

	agentType := t.Agent.Type
	if agentType == "" {
		agentType = "claude" // same default getRunner applies
	}

	p.mu.Lock()
	p.activePerAgent[agentType]++
	p.mu.Unlock()

	// Release the slot claimed by dispatch and nudge it. The send is
	// non-blocking because doneCh is only a hint; dispatch re-checks p.active.
	defer func() {
		p.mu.Lock()
		p.active--
		p.activePerAgent[agentType]--
		p.mu.Unlock()
		select {
		case p.doneCh <- struct{}{}:
		default:
		}
	}()

	// setState persists a task state transition; a storage failure is logged
	// rather than dropped so lost transitions are visible.
	setState := func(s task.State) {
		if stateErr := p.store.UpdateTaskState(t.ID, s); stateErr != nil {
			p.logger.Error("failed to update task state", "error", stateErr, "taskID", t.ID, "state", s)
		}
	}

	// failBeforeRun records a zero-duration FAILED execution for a task that
	// never reached its runner, marks the task failed and publishes the result.
	failBeforeRun := func(cause error) {
		now := time.Now().UTC()
		failed := &storage.Execution{
			ID:        uuid.New().String(),
			TaskID:    t.ID,
			StartTime: now,
			EndTime:   now,
			Status:    "FAILED",
			ErrorMsg:  cause.Error(),
		}
		if createErr := p.store.CreateExecution(failed); createErr != nil {
			p.logger.Error("failed to create execution record", "error", createErr)
		}
		setState(task.StateFailed)
		p.resultCh <- &Result{TaskID: t.ID, Execution: failed, Err: cause}
	}

	runner, err := p.getRunner(t)
	if err != nil {
		p.logger.Error("failed to get runner", "error", err, "taskID", t.ID)
		failBeforeRun(err)
		return
	}

	// Wait for all dependencies to complete before starting execution.
	if len(t.DependsOn) > 0 {
		if err := p.waitForDependencies(ctx, t); err != nil {
			failBeforeRun(err)
			return
		}
	}

	execID := uuid.New().String()
	exec := &storage.Execution{
		ID:        execID,
		TaskID:    t.ID,
		StartTime: time.Now().UTC(),
		Status:    "RUNNING",
	}

	// Pre-populate log paths so they're available in the DB immediately —
	// before the subprocess starts — enabling live tailing and debugging.
	if lp, ok := runner.(LogPather); ok {
		if logDir := lp.ExecLogDir(execID); logDir != "" {
			exec.StdoutPath = filepath.Join(logDir, "stdout.log")
			exec.StderrPath = filepath.Join(logDir, "stderr.log")
			exec.ArtifactDir = logDir
		}
	}

	// Record execution start.
	if err := p.store.CreateExecution(exec); err != nil {
		p.logger.Error("failed to create execution record", "error", err)
	}
	setState(task.StateRunning)

	// Apply task timeout and register cancel so callers can stop this task.
	var cancel context.CancelFunc
	if t.Timeout.Duration > 0 {
		ctx, cancel = context.WithTimeout(ctx, t.Timeout.Duration)
	} else {
		ctx, cancel = context.WithCancel(ctx)
	}
	p.mu.Lock()
	p.cancels[t.ID] = cancel
	p.mu.Unlock()
	defer func() {
		cancel()
		p.mu.Lock()
		delete(p.cancels, t.ID)
		p.mu.Unlock()
	}()

	// Inject prior failure history so the agent knows what went wrong before.
	// From here on t may be a copy carrying the augmented prompt.
	priorExecs, priorErr := p.store.ListExecutions(t.ID)
	t = withFailureHistory(t, priorExecs, priorErr)

	// Run the task.
	err = runner.Run(ctx, t, exec)
	exec.EndTime = time.Now().UTC()

	if err != nil && isRateLimitError(err) {
		// Parse outside the lock; only the map write needs protection.
		retryAfter := parseRetryAfter(err.Error())
		if retryAfter == 0 {
			retryAfter = 1 * time.Minute
		}
		p.mu.Lock()
		p.rateLimited[agentType] = time.Now().Add(retryAfter)
		p.mu.Unlock()
	}

	// Case order matters: a blocked runner wins over context state, and
	// context state wins over quota/failure classification.
	var blockedErr *BlockedError
	switch {
	case err == nil && t.ParentTaskID == "":
		exec.Status = "READY"
		setState(task.StateReady)
	case err == nil:
		exec.Status = "COMPLETED"
		setState(task.StateCompleted)
	case errors.As(err, &blockedErr):
		exec.Status = "BLOCKED"
		setState(task.StateBlocked)
		// NOTE(review): any return value of UpdateTaskQuestion is not inspected
		// here; confirm its signature and log failures if it returns an error.
		p.store.UpdateTaskQuestion(t.ID, blockedErr.QuestionJSON)
	case errors.Is(ctx.Err(), context.DeadlineExceeded):
		exec.Status = "TIMED_OUT"
		exec.ErrorMsg = "execution timed out"
		setState(task.StateTimedOut)
	case errors.Is(ctx.Err(), context.Canceled):
		exec.Status = "CANCELLED"
		exec.ErrorMsg = "execution cancelled"
		setState(task.StateCancelled)
	case isQuotaExhausted(err):
		exec.Status = "BUDGET_EXCEEDED"
		exec.ErrorMsg = err.Error()
		setState(task.StateBudgetExceeded)
	default:
		exec.Status = "FAILED"
		exec.ErrorMsg = err.Error()
		setState(task.StateFailed)
	}

	if updateErr := p.store.UpdateExecution(exec); updateErr != nil {
		p.logger.Error("failed to update execution", "error", updateErr)
	}

	p.resultCh <- &Result{TaskID: t.ID, Execution: exec, Err: err}
}

// terminalFailureStates are dependency states that cause the waiting task to fail immediately.
// waitForDependencies consults this set on every poll: a dependency found in
// any of these states aborts the wait with an error instead of polling again.
var terminalFailureStates = map[task.State]bool{
	task.StateFailed:         true,
	task.StateTimedOut:       true,
	task.StateCancelled:      true,
	task.StateBudgetExceeded: true,
}

// withFailureHistory returns a shallow copy of t with prior failed execution
// error messages prepended to SystemPromptAppend so the agent knows what went
// wrong in previous attempts.
//
// Only executions whose status is FAILED or TIMED_OUT and whose ErrorMsg is
// non-empty are listed, numbered in the order they appear in execs. Start
// times are converted to UTC before formatting so the "UTC" label in the
// output is accurate. Nil entries in execs are skipped.
//
// t itself is returned unchanged when err is non-nil (the history could not be
// loaded), when execs is empty, or when no execution qualifies.
func withFailureHistory(t *task.Task, execs []*storage.Execution, err error) *task.Task {
	if err != nil || len(execs) == 0 {
		return t
	}

	var sb strings.Builder
	attempts := 0
	for _, e := range execs {
		if e == nil || e.ErrorMsg == "" {
			continue
		}
		if e.Status != "FAILED" && e.Status != "TIMED_OUT" {
			continue
		}
		if attempts == 0 {
			// Emit the header lazily so nothing is built when no attempt qualifies.
			sb.WriteString("## Prior Attempt History\n\n")
			sb.WriteString("This task has failed before. Do not repeat the same mistakes.\n\n")
		}
		attempts++
		// "UTC" in the layout is literal text, so convert explicitly.
		fmt.Fprintf(&sb, "**Attempt %d** (%s) — %s:\n%s\n\n",
			attempts, e.StartTime.UTC().Format("2006-01-02 15:04 UTC"), e.Status, e.ErrorMsg)
	}
	if attempts == 0 {
		return t
	}
	sb.WriteString("---\n\n")

	// Shallow copy: the caller's task keeps its original prompt.
	withHistory := *t
	withHistory.Agent.SystemPromptAppend = sb.String() + t.Agent.SystemPromptAppend
	return &withHistory
}

// waitForDependencies blocks until every task listed in t.DependsOn has reached
// the COMPLETED state, re-reading each dependency from storage once per
// depPollInterval. It returns early with an error when a dependency cannot be
// loaded, when a dependency is in one of the terminalFailureStates, or when ctx
// is done (in which case ctx.Err() is returned).
func (p *Pool) waitForDependencies(ctx context.Context, t *task.Task) error {
	for {
		pending := 0
		for _, id := range t.DependsOn {
			dep, err := p.store.GetTask(id)
			switch {
			case err != nil:
				return fmt.Errorf("dependency %q not found: %w", id, err)
			case dep.State == task.StateCompleted:
				// Satisfied; nothing to wait for.
			case terminalFailureStates[dep.State]:
				return fmt.Errorf("dependency %q ended in state %s", id, dep.State)
			default:
				pending++
			}
		}
		if pending == 0 {
			return nil
		}

		// Sleep until the next poll unless the context ends first.
		select {
		case <-time.After(p.depPollInterval):
		case <-ctx.Done():
			return ctx.Err()
		}
	}
}