fix: surface agent stderr, auto-retry restart-killed tasks, handle stale sandboxes

#1 - Diagnostics: tailFile() reads the last 20 lines of subprocess stderr and appends to error message when claude/gemini exits non-zero. Previously all exit-1 failures were opaque; now the error_msg carries the actual subprocess output.

#4 - Restart recovery: RecoverStaleRunning() now re-queues tasks after marking them FAILED, so tasks killed by a server restart automatically retry on the next boot rather than staying permanently FAILED.

#2 - Stale sandbox: If a resume execution's preserved SandboxDir no longer exists (e.g. /tmp purge after reboot), clone a fresh sandbox instead of failing immediately with "no such file or directory".

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
author: Peter Stone <thepeterstone@gmail.com> 2026-03-14 07:37:20 +0000
committer: Peter Stone <thepeterstone@gmail.com> 2026-03-14 07:37:20 +0000
commit: 4029fdd82bdd657ed862c89f20eb03ff2594cde9 (patch)
tree: 5725975ffa6825018605ee336ebe8a7e3f02b1d4 /internal/executor/executor_test.go
parent: 02b35218d9aadcaa6a3b52f218b71577ab72c811 (diff)
1 files changed, 20 insertions, 8 deletions
diff --git a/internal/executor/executor_test.go b/internal/executor/executor_test.go
index f6d0179..a6c4ad8 100644
--- a/internal/executor/executor_test.go
+++ b/internal/executor/executor_test.go
@@ -596,15 +596,9 @@ func TestPool_RecoverStaleRunning(t *testing.T) {
 		Status:    "RUNNING",
 	})
 
-	pool.RecoverStaleRunning()
+	pool.RecoverStaleRunning(context.Background())
 
-	recovered, err := store.GetTask(tk.ID)
-	if err != nil {
-		t.Fatalf("get task: %v", err)
-	}
-	if recovered.State != task.StateFailed {
-		t.Errorf("state: want FAILED, got %q", recovered.State)
-	}
+	// Execution record should be closed as FAILED.
 	execs, _ := store.ListExecutions(tk.ID)
 	if len(execs) == 0 || execs[0].Status != "FAILED" {
 		t.Errorf("execution status: want FAILED, got %+v", execs)
@@ -612,6 +606,24 @@ func TestPool_RecoverStaleRunning(t *testing.T) {
 	if execs[0].ErrorMsg == "" {
 		t.Error("expected non-empty error message on recovered execution")
 	}
+
+	// Task should be re-queued for retry and complete.
+	select {
+	case result := <-pool.Results():
+		if result.TaskID != tk.ID {
+			t.Errorf("unexpected task in results: %s", result.TaskID)
+		}
+	case <-time.After(2 * time.Second):
+		t.Fatal("timed out waiting for stale RUNNING task to be re-queued and run")
+	}
+	recovered, err := store.GetTask(tk.ID)
+	if err != nil {
+		t.Fatalf("get task: %v", err)
+	}
+	// Top-level tasks (no parent) go to READY after a successful run.
+	if recovered.State != task.StateReady {
+		t.Errorf("state after re-queue: want READY, got %q", recovered.State)
+	}
 }
 
 func TestPool_RecoverStaleQueued_ResubmitsToPool(t *testing.T) {
author	Peter Stone <thepeterstone@gmail.com>	2026-03-14 07:37:20 +0000
committer	Peter Stone <thepeterstone@gmail.com>	2026-03-14 07:37:20 +0000
commit	4029fdd82bdd657ed862c89f20eb03ff2594cde9 (patch)
tree	5725975ffa6825018605ee336ebe8a7e3f02b1d4 /internal/executor/executor_test.go
parent	02b35218d9aadcaa6a3b52f218b71577ab72c811 (diff)