fix: surface agent stderr, auto-retry restart-killed tasks, handle stale sandboxes

#1 - Diagnostics: tailFile() reads the last 20 lines of subprocess stderr and
appends them to the error message when claude/gemini exits non-zero. Previously
all exit-1 failures were opaque; now the error_msg carries the actual
subprocess output.

#4 - Restart recovery: RecoverStaleRunning() now re-queues tasks after marking
them FAILED, so tasks killed by a server restart automatically retry on the
next boot rather than staying permanently FAILED.

#2 - Stale sandbox: If a resume execution's preserved SandboxDir no longer
exists (e.g. /tmp purge after reboot), clone a fresh sandbox instead of failing
immediately with "no such file or directory".

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
author: Peter Stone <thepeterstone@gmail.com> 2026-03-14 07:37:20 +0000
committer: Peter Stone <thepeterstone@gmail.com> 2026-03-14 07:37:20 +0000
commit: 4029fdd82bdd657ed862c89f20eb03ff2594cde9 (patch)
tree: 5725975ffa6825018605ee336ebe8a7e3f02b1d4 /internal/executor/executor.go
parent: 02b35218d9aadcaa6a3b52f218b71577ab72c811 (diff)
1 files changed, 14 insertions, 3 deletions
diff --git a/internal/executor/executor.go b/internal/executor/executor.go
index 475d150..7674fe6 100644
--- a/internal/executor/executor.go
+++ b/internal/executor/executor.go
@@ -567,9 +567,10 @@ func (p *Pool) execute(ctx context.Context, t *task.Task) {
 }
 
 // RecoverStaleRunning marks any tasks stuck in RUNNING state (from a previous
-// server crash or restart) as FAILED. It also closes any open RUNNING execution
-// records for those tasks. Call this once on server startup.
-func (p *Pool) RecoverStaleRunning() {
+// server crash or restart) as FAILED, then immediately re-queues them for
+// retry. It also closes any open RUNNING execution records for those tasks.
+// Call this once on server startup.
+func (p *Pool) RecoverStaleRunning(ctx context.Context) {
 	tasks, err := p.store.ListTasks(storage.TaskFilter{State: task.StateRunning})
 	if err != nil {
 		p.logger.Error("RecoverStaleRunning: list tasks", "error", err)
@@ -593,6 +594,16 @@ func (p *Pool) RecoverStaleRunning() {
 		}
 		if err := p.store.UpdateTaskState(t.ID, task.StateFailed); err != nil {
 			p.logger.Error("RecoverStaleRunning: update task state", "error", err, "taskID", t.ID)
+			continue
+		}
+		// Re-queue so the task retries automatically. Submit expects QUEUED state.
+		if err := p.store.UpdateTaskState(t.ID, task.StateQueued); err != nil {
+			p.logger.Error("RecoverStaleRunning: set queued", "error", err, "taskID", t.ID)
+			continue
+		}
+		t.State = task.StateQueued
+		if err := p.Submit(ctx, t); err != nil {
+			p.logger.Error("RecoverStaleRunning: re-queue", "error", err, "taskID", t.ID)
 		}
 	}
 }
author	Peter Stone <thepeterstone@gmail.com>	2026-03-14 07:37:20 +0000
committer	Peter Stone <thepeterstone@gmail.com>	2026-03-14 07:37:20 +0000
commit	4029fdd82bdd657ed862c89f20eb03ff2594cde9 (patch)
tree	5725975ffa6825018605ee336ebe8a7e3f02b1d4 /internal/executor/executor.go
parent	02b35218d9aadcaa6a3b52f218b71577ab72c811 (diff)