From 4029fdd82bdd657ed862c89f20eb03ff2594cde9 Mon Sep 17 00:00:00 2001
From: Peter Stone
Date: Sat, 14 Mar 2026 07:37:20 +0000
Subject: fix: surface agent stderr, auto-retry restart-killed tasks, handle stale sandboxes

#1 - Diagnostics: tailFile() reads last 20 lines of subprocess stderr and
appends to error message when claude/gemini exits non-zero. Previously all
exit-1 failures were opaque; now the error_msg carries the actual subprocess
output.

#4 - Restart recovery: RecoverStaleRunning() now re-queues tasks after marking
them FAILED, so tasks killed by a server restart automatically retry on the
next boot rather than staying permanently FAILED.

#2 - Stale sandbox: If a resume execution's preserved SandboxDir no longer
exists (e.g. /tmp purge after reboot), clone a fresh sandbox instead of
failing immediately with "no such file or directory".

Co-Authored-By: Claude Sonnet 4.6
---
 internal/executor/claude.go | 41 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 40 insertions(+), 1 deletion(-)

(limited to 'internal/executor/claude.go')

diff --git a/internal/executor/claude.go b/internal/executor/claude.go
index 626a854..5a5b35e 100644
--- a/internal/executor/claude.go
+++ b/internal/executor/claude.go
@@ -106,7 +106,23 @@ func (r *ClaudeRunner) Run(ctx context.Context, t *task.Task, e *storage.Executi
 	effectiveWorkingDir := projectDir
 	if e.ResumeSessionID != "" {
 		if e.SandboxDir != "" {
-			effectiveWorkingDir = e.SandboxDir
+			if _, statErr := os.Stat(e.SandboxDir); statErr == nil {
+				effectiveWorkingDir = e.SandboxDir
+			} else {
+				// Preserved sandbox was cleaned up (e.g. /tmp purge after reboot).
+				// Clone a fresh sandbox so the task can run rather than fail immediately.
+				r.Logger.Warn("preserved sandbox missing, cloning fresh", "sandbox", e.SandboxDir, "project_dir", projectDir)
+				e.SandboxDir = ""
+				if projectDir != "" {
+					var err error
+					sandboxDir, err = setupSandbox(projectDir)
+					if err != nil {
+						return fmt.Errorf("setting up sandbox: %w", err)
+					}
+					effectiveWorkingDir = sandboxDir
+					r.Logger.Info("fresh sandbox created for resume", "sandbox", sandboxDir, "project_dir", projectDir)
+				}
+			}
 		}
 	} else if projectDir != "" {
 		var err error
@@ -399,6 +415,9 @@ func (r *ClaudeRunner) execOnce(ctx context.Context, args []string, workingDir s
 		if isRateLimitError(streamErr) || isQuotaExhausted(streamErr) {
 			return streamErr
 		}
+		if tail := tailFile(e.StderrPath, 20); tail != "" {
+			return fmt.Errorf("claude exited with error: %w\nstderr:\n%s", waitErr, tail)
+		}
 		return fmt.Errorf("claude exited with error: %w", waitErr)
 	}
 
@@ -578,3 +597,23 @@ func permissionDenialError(msg map[string]interface{}) error {
 	}
 	return nil
 }
+
+// tailFile returns the last n lines of the file at path, or empty string if
+// the file cannot be read. Used to surface subprocess stderr on failure.
+func tailFile(path string, n int) string {
+	f, err := os.Open(path)
+	if err != nil {
+		return ""
+	}
+	defer f.Close()
+
+	var lines []string
+	scanner := bufio.NewScanner(f)
+	for scanner.Scan() {
+		lines = append(lines, scanner.Text())
+		if len(lines) > n {
+			lines = lines[1:]
+		}
+	}
+	return strings.Join(lines, "\n")
+}
--
cgit v1.2.3