From cf83444a9d341ae362e65a9f995100c69176887c Mon Sep 17 00:00:00 2001 From: Peter Stone Date: Thu, 5 Mar 2026 18:51:50 +0000 Subject: Rescue work from claudomator-work: question/answer, ratelimit, start-next-task Merges features developed in /site/doot.terst.org/claudomator-work (a stale clone) into the canonical repo: - executor: QuestionRegistry for human-in-the-loop answers, rate limit detection and exponential backoff retry (ratelimit.go, question.go) - executor/claude.go: process group isolation (SIGKILL orphans on cancel), os.Pipe for reliable stdout drain, backoff retry on rate limits - api/scripts.go: POST /api/scripts/start-next-task handler - api/server.go: startNextTaskScript field, answer-question route, BroadcastQuestion for WebSocket question events - web: Cancel/Restart buttons, question banner UI, log viewer, validate section, WebSocket auto-connect All tests pass. Co-Authored-By: Claude Sonnet 4.6 --- internal/executor/claude.go | 98 +++++++++++++++++++++++++++++++++------------ 1 file changed, 73 insertions(+), 25 deletions(-) (limited to 'internal/executor/claude.go') diff --git a/internal/executor/claude.go b/internal/executor/claude.go index 7b3884c..0029331 100644 --- a/internal/executor/claude.go +++ b/internal/executor/claude.go @@ -10,6 +10,9 @@ import ( "os" "os/exec" "path/filepath" + "sync" + "syscall" + "time" "github.com/thepeterstone/claudomator/internal/storage" "github.com/thepeterstone/claudomator/internal/task" @@ -31,71 +34,116 @@ func (r *ClaudeRunner) binaryPath() string { } // Run executes a claude -p invocation, streaming output to log files. +// It retries up to 3 times on rate-limit errors using exponential backoff. func (r *ClaudeRunner) Run(ctx context.Context, t *task.Task, e *storage.Execution) error { args := r.buildArgs(t) - cmd := exec.CommandContext(ctx, r.binaryPath(), args...) - cmd.Env = append(os.Environ(), - "CLAUDOMATOR_API_URL="+r.APIURL, - "CLAUDOMATOR_TASK_ID="+t.ID, - ) if t.Claude.WorkingDir != "" { if _, err := os.Stat(t.Claude.WorkingDir); err != nil { return fmt.Errorf("working_dir %q: %w", t.Claude.WorkingDir, err) } - cmd.Dir = t.Claude.WorkingDir } - // Setup log directory for this execution. + // Setup log directory once; retries overwrite the log files. logDir := filepath.Join(r.LogDir, e.ID) if err := os.MkdirAll(logDir, 0700); err != nil { return fmt.Errorf("creating log dir: %w", err) } - - stdoutPath := filepath.Join(logDir, "stdout.log") - stderrPath := filepath.Join(logDir, "stderr.log") - e.StdoutPath = stdoutPath - e.StderrPath = stderrPath + e.StdoutPath = filepath.Join(logDir, "stdout.log") + e.StderrPath = filepath.Join(logDir, "stderr.log") e.ArtifactDir = logDir - stdoutFile, err := os.Create(stdoutPath) + attempt := 0 + return runWithBackoff(ctx, 3, 5*time.Second, func() error { + if attempt > 0 { + delay := 5 * time.Second * (1 << (attempt - 1)) + r.Logger.Warn("rate-limited by Claude API, retrying", + "attempt", attempt, + "delay", delay, + ) + } + attempt++ + return r.execOnce(ctx, t, args, e) + }) +} + +// execOnce runs the claude subprocess once, streaming output to e's log paths. +func (r *ClaudeRunner) execOnce(ctx context.Context, t *task.Task, args []string, e *storage.Execution) error { + cmd := exec.CommandContext(ctx, r.binaryPath(), args...) + cmd.Env = append(os.Environ(), + "CLAUDOMATOR_API_URL="+r.APIURL, + "CLAUDOMATOR_TASK_ID="+t.ID, + ) + // Put the subprocess in its own process group so we can SIGKILL the entire + // group (MCP servers, bash children, etc.) on cancellation. + cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true} + if t.Claude.WorkingDir != "" { + cmd.Dir = t.Claude.WorkingDir + } + + stdoutFile, err := os.Create(e.StdoutPath) if err != nil { return fmt.Errorf("creating stdout log: %w", err) } defer stdoutFile.Close() - stderrFile, err := os.Create(stderrPath) + stderrFile, err := os.Create(e.StderrPath) if err != nil { return fmt.Errorf("creating stderr log: %w", err) } defer stderrFile.Close() - stdoutPipe, err := cmd.StdoutPipe() + // Use os.Pipe for stdout so we own the read-end lifetime. + // cmd.StdoutPipe() would add the read-end to closeAfterWait, causing + // cmd.Wait() to close it before our goroutine finishes reading. + stdoutR, stdoutW, err := os.Pipe() if err != nil { return fmt.Errorf("creating stdout pipe: %w", err) } - stderrPipe, err := cmd.StderrPipe() - if err != nil { - return fmt.Errorf("creating stderr pipe: %w", err) - } + cmd.Stdout = stdoutW // *os.File — not added to closeAfterStart/Wait + cmd.Stderr = stderrFile if err := cmd.Start(); err != nil { + stdoutW.Close() + stdoutR.Close() return fmt.Errorf("starting claude: %w", err) } + // Close our write-end immediately; the subprocess holds its own copy. + // The goroutine below gets EOF when the subprocess exits. + stdoutW.Close() + + // killDone is closed when cmd.Wait() returns, stopping the pgid-kill goroutine. + killDone := make(chan struct{}) + go func() { + select { + case <-ctx.Done(): + // SIGKILL the entire process group to reap orphan children. + syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL) + case <-killDone: + } + }() - // Stream output to log files and parse cost info. + // Stream stdout to the log file and parse cost. + // wg ensures costUSD is fully written before we read it after cmd.Wait(). var costUSD float64 + var wg sync.WaitGroup + wg.Add(1) go func() { - costUSD = streamAndParseCost(stdoutPipe, stdoutFile, r.Logger) + defer wg.Done() + costUSD = streamAndParseCost(stdoutR, stdoutFile, r.Logger) + stdoutR.Close() }() - go io.Copy(stderrFile, stderrPipe) - if err := cmd.Wait(); err != nil { - if exitErr, ok := err.(*exec.ExitError); ok { + waitErr := cmd.Wait() + close(killDone) // stop the pgid-kill goroutine + wg.Wait() // drain remaining stdout before reading costUSD + + if waitErr != nil { + if exitErr, ok := waitErr.(*exec.ExitError); ok { e.ExitCode = exitErr.ExitCode() } e.CostUSD = costUSD - return fmt.Errorf("claude exited with error: %w", err) + return fmt.Errorf("claude exited with error: %w", waitErr) } e.ExitCode = 0 -- cgit v1.2.3