fix: surface agent stderr, auto-retry restart-killed tasks, handle stale sandboxes

#1 - Diagnostics: tailFile() reads the last 20 lines of subprocess stderr and appends them to the error message when claude/gemini exits non-zero. Previously all exit-1 failures were opaque; now the error_msg carries the actual subprocess output.

#4 - Restart recovery: RecoverStaleRunning() now re-queues tasks after marking them FAILED, so tasks killed by a server restart automatically retry on the next boot rather than staying permanently FAILED.

#2 - Stale sandbox: if a resume execution's preserved SandboxDir no longer exists (e.g. /tmp purge after reboot), clone a fresh sandbox instead of failing immediately with "no such file or directory".

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
author: Peter Stone <thepeterstone@gmail.com> 2026-03-14 07:37:20 +0000
committer: Peter Stone <thepeterstone@gmail.com> 2026-03-14 07:37:20 +0000
commit: 4029fdd82bdd657ed862c89f20eb03ff2594cde9 (patch)
tree: 5725975ffa6825018605ee336ebe8a7e3f02b1d4 /internal/executor/claude.go
parent: 02b35218d9aadcaa6a3b52f218b71577ab72c811 (diff)
1 files changed, 40 insertions, 1 deletions
diff --git a/internal/executor/claude.go b/internal/executor/claude.go
index 626a854..5a5b35e 100644
--- a/internal/executor/claude.go
+++ b/internal/executor/claude.go
@@ -106,7 +106,23 @@ func (r *ClaudeRunner) Run(ctx context.Context, t *task.Task, e *storage.Executi
 	effectiveWorkingDir := projectDir
 	if e.ResumeSessionID != "" {
 		if e.SandboxDir != "" {
-			effectiveWorkingDir = e.SandboxDir
+			if _, statErr := os.Stat(e.SandboxDir); statErr == nil {
+				effectiveWorkingDir = e.SandboxDir
+			} else {
+				// Preserved sandbox was cleaned up (e.g. /tmp purge after reboot).
+				// Clone a fresh sandbox so the task can run rather than fail immediately.
+				r.Logger.Warn("preserved sandbox missing, cloning fresh", "sandbox", e.SandboxDir, "project_dir", projectDir)
+				e.SandboxDir = ""
+				if projectDir != "" {
+					var err error
+					sandboxDir, err = setupSandbox(projectDir)
+					if err != nil {
+						return fmt.Errorf("setting up sandbox: %w", err)
+					}
+					effectiveWorkingDir = sandboxDir
+					r.Logger.Info("fresh sandbox created for resume", "sandbox", sandboxDir, "project_dir", projectDir)
+				}
+			}
 		}
 	} else if projectDir != "" {
 		var err error
@@ -399,6 +415,9 @@ func (r *ClaudeRunner) execOnce(ctx context.Context, args []string, workingDir s
 		if isRateLimitError(streamErr) || isQuotaExhausted(streamErr) {
 			return streamErr
 		}
+		if tail := tailFile(e.StderrPath, 20); tail != "" {
+			return fmt.Errorf("claude exited with error: %w\nstderr:\n%s", waitErr, tail)
+		}
 		return fmt.Errorf("claude exited with error: %w", waitErr)
 	}
 
@@ -578,3 +597,23 @@ func permissionDenialError(msg map[string]interface{}) error {
 	}
 	return nil
 }
+
+// tailFile returns up to the last n lines of the file at path joined with "\n", or ""
+// if the file cannot be opened or n <= 0. Used to surface subprocess stderr on failure.
+func tailFile(path string, n int) string {
+	f, err := os.Open(path)
+	if err != nil {
+		return "" // best effort: an unopenable file is reported as "no output"
+	}
+	defer f.Close()
+
+	var lines []string // sliding window holding at most the n most recent lines
+	scanner := bufio.NewScanner(f)
+	for scanner.Scan() {
+		lines = append(lines, scanner.Text())
+		if len(lines) > n {
+			lines = lines[1:] // drop the oldest line so the window never exceeds n
+		}
+	}
+	return strings.Join(lines, "\n") // NOTE(review): scanner.Err() is never checked, so a read error or an over-long line silently ends the scan early
+}
author	Peter Stone <thepeterstone@gmail.com>	2026-03-14 07:37:20 +0000
committer	Peter Stone <thepeterstone@gmail.com>	2026-03-14 07:37:20 +0000
commit	4029fdd82bdd657ed862c89f20eb03ff2594cde9 (patch)
tree	5725975ffa6825018605ee336ebe8a7e3f02b1d4 /internal/executor/claude.go
parent	02b35218d9aadcaa6a3b52f218b71577ab72c811 (diff)