From 4029fdd82bdd657ed862c89f20eb03ff2594cde9 Mon Sep 17 00:00:00 2001 From: Peter Stone Date: Sat, 14 Mar 2026 07:37:20 +0000 Subject: fix: surface agent stderr, auto-retry restart-killed tasks, handle stale sandboxes #1 - Diagnostics: tailFile() reads last 20 lines of subprocess stderr and appends to error message when claude/gemini exits non-zero. Previously all exit-1 failures were opaque; now the error_msg carries the actual subprocess output. #4 - Restart recovery: RecoverStaleRunning() now re-queues tasks after marking them FAILED, so tasks killed by a server restart automatically retry on the next boot rather than staying permanently FAILED. #2 - Stale sandbox: If a resume execution's preserved SandboxDir no longer exists (e.g. /tmp purge after reboot), clone a fresh sandbox instead of failing immediately with "no such file or directory". Co-Authored-By: Claude Sonnet 4.6 --- internal/cli/serve.go | 2 +- internal/executor/claude.go | 41 +++++++++++++++++- internal/executor/claude_test.go | 86 ++++++++++++++++++++++++++++++++++++++ internal/executor/executor.go | 17 ++++++-- internal/executor/executor_test.go | 28 +++++++++---- internal/executor/gemini.go | 3 ++ 6 files changed, 164 insertions(+), 13 deletions(-) diff --git a/internal/cli/serve.go b/internal/cli/serve.go index fd9fda8..2d47630 100644 --- a/internal/cli/serve.go +++ b/internal/cli/serve.go @@ -76,7 +76,7 @@ func serve(addr string) error { if cfg.GeminiBinaryPath != "" { pool.Classifier = &executor.Classifier{GeminiBinaryPath: cfg.GeminiBinaryPath} } - pool.RecoverStaleRunning() + pool.RecoverStaleRunning(context.Background()) pool.RecoverStaleQueued(context.Background()) srv := api.NewServer(store, pool, logger, cfg.ClaudeBinaryPath, cfg.GeminiBinaryPath) diff --git a/internal/executor/claude.go b/internal/executor/claude.go index 626a854..5a5b35e 100644 --- a/internal/executor/claude.go +++ b/internal/executor/claude.go @@ -106,7 +106,23 @@ func (r *ClaudeRunner) Run(ctx context.Context, t *task.Task, e *storage.Executi effectiveWorkingDir := projectDir if e.ResumeSessionID != "" { if e.SandboxDir != "" { - effectiveWorkingDir = e.SandboxDir + if _, statErr := os.Stat(e.SandboxDir); statErr == nil { + effectiveWorkingDir = e.SandboxDir + } else { + // Preserved sandbox was cleaned up (e.g. /tmp purge after reboot). + // Clone a fresh sandbox so the task can run rather than fail immediately. + r.Logger.Warn("preserved sandbox missing, cloning fresh", "sandbox", e.SandboxDir, "project_dir", projectDir) + e.SandboxDir = "" + if projectDir != "" { + var err error + sandboxDir, err = setupSandbox(projectDir) + if err != nil { + return fmt.Errorf("setting up sandbox: %w", err) + } + effectiveWorkingDir = sandboxDir + r.Logger.Info("fresh sandbox created for resume", "sandbox", sandboxDir, "project_dir", projectDir) + } + } } } else if projectDir != "" { var err error @@ -399,6 +415,9 @@ func (r *ClaudeRunner) execOnce(ctx context.Context, args []string, workingDir s if isRateLimitError(streamErr) || isQuotaExhausted(streamErr) { return streamErr } + if tail := tailFile(e.StderrPath, 20); tail != "" { + return fmt.Errorf("claude exited with error: %w\nstderr:\n%s", waitErr, tail) + } return fmt.Errorf("claude exited with error: %w", waitErr) } @@ -578,3 +597,23 @@ func permissionDenialError(msg map[string]interface{}) error { } return nil } + +// tailFile returns the last n lines of the file at path, or empty string if +// the file cannot be read. Used to surface subprocess stderr on failure. +func tailFile(path string, n int) string { + f, err := os.Open(path) + if err != nil { + return "" + } + defer f.Close() + + var lines []string + scanner := bufio.NewScanner(f) + for scanner.Scan() { + lines = append(lines, scanner.Text()) + if len(lines) > n { + lines = lines[1:] + } + } + return strings.Join(lines, "\n") +} diff --git a/internal/executor/claude_test.go b/internal/executor/claude_test.go index 7ab0802..9bb873f 100644 --- a/internal/executor/claude_test.go +++ b/internal/executor/claude_test.go @@ -3,6 +3,7 @@ package executor import ( "context" "errors" + "fmt" "io" "log/slog" "os" @@ -579,6 +580,63 @@ func TestClaudeRunner_Run_ResumeUsesStoredSandboxDir(t *testing.T) { } } +func TestClaudeRunner_Run_StaleSandboxDir_ClonesAfresh(t *testing.T) { + logDir := t.TempDir() + projectDir := t.TempDir() + initGitRepo(t, projectDir) + + cwdFile := filepath.Join(logDir, "cwd.txt") + scriptPath := filepath.Join(t.TempDir(), "fake-claude.sh") + script := "#!/bin/sh\nprintf '%s' \"$PWD\" > " + cwdFile + "\n" + if err := os.WriteFile(scriptPath, []byte(script), 0755); err != nil { + t.Fatalf("write script: %v", err) + } + + r := &ClaudeRunner{ + BinaryPath: scriptPath, + Logger: slog.New(slog.NewTextHandler(io.Discard, nil)), + LogDir: logDir, + } + tk := &task.Task{ + Agent: task.AgentConfig{ + Type: "claude", + ProjectDir: projectDir, + SkipPlanning: true, + }, + } + // Point to a sandbox that no longer exists (e.g. /tmp was purged). + staleSandbox := filepath.Join(t.TempDir(), "gone") + e := &storage.Execution{ + ID: "resume-exec-2", + TaskID: "task-2", + ResumeSessionID: "session-abc", + ResumeAnswer: "ok", + SandboxDir: staleSandbox, + } + + if err := r.Run(context.Background(), tk, e); err != nil { + t.Fatalf("Run with stale sandbox: %v", err) + } + + got, err := os.ReadFile(cwdFile) + if err != nil { + t.Fatalf("cwd file not written: %v", err) + } + // Should have run in a fresh sandbox (not the stale path, not the raw projectDir). + // The sandbox is removed after teardown, so we only check what it wasn't. + cwd := string(got) + if cwd == staleSandbox { + t.Error("ran in stale sandbox dir that doesn't exist") + } + if cwd == projectDir { + t.Error("ran directly in project_dir; expected a fresh sandbox clone") + } + // cwd should look like a claudomator sandbox path. + if !strings.Contains(cwd, "claudomator-sandbox-") { + t.Errorf("expected sandbox path, got %q", cwd) + } +} + func TestIsCompletionReport(t *testing.T) { tests := []struct { name string @@ -621,6 +679,34 @@ func TestIsCompletionReport(t *testing.T) { } } +func TestTailFile_ReturnsLastNLines(t *testing.T) { + f, err := os.CreateTemp("", "tailfile-*") + if err != nil { + t.Fatal(err) + } + defer os.Remove(f.Name()) + for i := 1; i <= 30; i++ { + fmt.Fprintf(f, "line %d\n", i) + } + f.Close() + + got := tailFile(f.Name(), 5) + lines := strings.Split(got, "\n") + if len(lines) != 5 { + t.Fatalf("want 5 lines, got %d: %q", len(lines), got) + } + if lines[0] != "line 26" || lines[4] != "line 30" { + t.Errorf("want lines 26-30, got: %q", got) + } +} + +func TestTailFile_MissingFile_ReturnsEmpty(t *testing.T) { + got := tailFile("/nonexistent/path/file.log", 10) + if got != "" { + t.Errorf("want empty string for missing file, got %q", got) + } +} + func TestGitSafe_PrependsSafeDirectory(t *testing.T) { got := gitSafe("-C", "/some/path", "status") want := []string{"-c", "safe.directory=*", "-C", "/some/path", "status"} diff --git a/internal/executor/executor.go b/internal/executor/executor.go index 475d150..7674fe6 100644 --- a/internal/executor/executor.go +++ b/internal/executor/executor.go @@ -567,9 +567,10 @@ func (p *Pool) execute(ctx context.Context, t *task.Task) { } // RecoverStaleRunning marks any tasks stuck in RUNNING state (from a previous -// server crash or restart) as FAILED. It also closes any open RUNNING execution -// records for those tasks. Call this once on server startup. -func (p *Pool) RecoverStaleRunning() { +// server crash or restart) as FAILED, then immediately re-queues them for +// retry. It also closes any open RUNNING execution records for those tasks. +// Call this once on server startup. +func (p *Pool) RecoverStaleRunning(ctx context.Context) { tasks, err := p.store.ListTasks(storage.TaskFilter{State: task.StateRunning}) if err != nil { p.logger.Error("RecoverStaleRunning: list tasks", "error", err) @@ -593,6 +594,16 @@ func (p *Pool) RecoverStaleRunning() { } if err := p.store.UpdateTaskState(t.ID, task.StateFailed); err != nil { p.logger.Error("RecoverStaleRunning: update task state", "error", err, "taskID", t.ID) + continue + } + // Re-queue so the task retries automatically. Submit expects QUEUED state. + if err := p.store.UpdateTaskState(t.ID, task.StateQueued); err != nil { + p.logger.Error("RecoverStaleRunning: set queued", "error", err, "taskID", t.ID) + continue + } + t.State = task.StateQueued + if err := p.Submit(ctx, t); err != nil { + p.logger.Error("RecoverStaleRunning: re-queue", "error", err, "taskID", t.ID) } } } diff --git a/internal/executor/executor_test.go b/internal/executor/executor_test.go index f6d0179..a6c4ad8 100644 --- a/internal/executor/executor_test.go +++ b/internal/executor/executor_test.go @@ -596,15 +596,9 @@ func TestPool_RecoverStaleRunning(t *testing.T) { Status: "RUNNING", }) - pool.RecoverStaleRunning() + pool.RecoverStaleRunning(context.Background()) - recovered, err := store.GetTask(tk.ID) - if err != nil { - t.Fatalf("get task: %v", err) - } - if recovered.State != task.StateFailed { - t.Errorf("state: want FAILED, got %q", recovered.State) - } + // Execution record should be closed as FAILED. execs, _ := store.ListExecutions(tk.ID) if len(execs) == 0 || execs[0].Status != "FAILED" { t.Errorf("execution status: want FAILED, got %+v", execs) @@ -612,6 +606,24 @@ func TestPool_RecoverStaleRunning(t *testing.T) { if execs[0].ErrorMsg == "" { t.Error("expected non-empty error message on recovered execution") } + + // Task should be re-queued for retry and complete. + select { + case result := <-pool.Results(): + if result.TaskID != tk.ID { + t.Errorf("unexpected task in results: %s", result.TaskID) + } + case <-time.After(2 * time.Second): + t.Fatal("timed out waiting for stale RUNNING task to be re-queued and run") + } + recovered, err := store.GetTask(tk.ID) + if err != nil { + t.Fatalf("get task: %v", err) + } + // Top-level tasks (no parent) go to READY after a successful run. + if recovered.State != task.StateReady { + t.Errorf("state after re-queue: want READY, got %q", recovered.State) + } } func TestPool_RecoverStaleQueued_ResubmitsToPool(t *testing.T) { diff --git a/internal/executor/gemini.go b/internal/executor/gemini.go index c30cd66..2db3218 100644 --- a/internal/executor/gemini.go +++ b/internal/executor/gemini.go @@ -150,6 +150,9 @@ func (r *GeminiRunner) execOnce(ctx context.Context, args []string, workingDir s if exitErr, ok := waitErr.(*exec.ExitError); ok { e.ExitCode = exitErr.ExitCode() } + if tail := tailFile(e.StderrPath, 20); tail != "" { + return fmt.Errorf("gemini exited with error: %w\nstderr:\n%s", waitErr, tail) + } return fmt.Errorf("gemini exited with error: %w", waitErr) } -- cgit v1.2.3