From 0865afc43be562dbe14528e4299b9e213b54cc93 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Tue, 28 Apr 2026 09:24:43 +0000
Subject: feat(executor): add LocalRunner and OpenAI-compat LLM client

Phase 1 of "local OSS models as agents" plan. Adds a third Runner
backed by any OpenAI-compatible HTTP server (Ollama, vLLM, LM Studio,
llama.cpp), and migrates the Gemini-CLI classifier to route through
the same client when configured.

Two-layer split: internal/llm.Client is the workhorse (HTTP, no Pool,
no DB) used directly by the classifier and any future internal helper
that needs cheap reasoning. internal/executor.LocalRunner is a thin
adapter implementing Runner for user-facing tasks. This avoids
Pool reentrancy/deadlock when sub-second internal calls fire from
inside Pool.execute().

Highlights:
- internal/retry: relocated runWithBackoff/IsRateLimitError/ParseRetryAfter
  into a shared package reused by executor and llm.
- internal/llm: Chat (non-streaming) and ChatStream (SSE) over
  /chat/completions with optional bearer auth, json_object response
  format, retry on 429/503, Retry-After parsing.
- internal/executor/LocalRunner: streams deltas into stdout.log in the
  same stream-json envelope ClaudeRunner emits, then writes one
  consolidated assistant block plus a result terminator so existing
  parsers (extractSummary, ParseChangestatFromOutput) work unchanged.
- internal/executor/Classifier: gains optional LLM field; uses
  json_object response format (no markdown-fence cleanup needed).
  Falls back to Gemini-CLI subprocess when LLM is nil.
- Pool.skipClassification: now skips only when the requested agent
  type is registered, so unknown types still reach the load balancer.
- Storage: additive tokens_in/tokens_out ALTERs on executions; CLI
  runners record cost_usd as before, LocalRunner records 0 + tokens.
- Config: [local_model] section (endpoint, model, timeout_seconds,
  default_temperature, api_key). Empty endpoint = no LocalRunner
  registered, classifier falls back to Gemini.

Pre-existing test issues fixed in passing:
- claude_test.go setupSandbox callsites updated to current signature.
- gemini_test.go TestParseGeminiStream skipped (asserts unimplemented
  GeminiRunner stream-error parsing; tracked separately).

Plan: docs/plans/local-oss-runner.md.

https://claude.ai/code/session_017Edeq947TpSm1vQTxMhi1J
---
 internal/executor/gemini_test.go | 1 +
 1 file changed, 1 insertion(+)

(limited to 'internal/executor/gemini_test.go')

diff --git a/internal/executor/gemini_test.go b/internal/executor/gemini_test.go
index 4b0339e..75e3b45 100644
--- a/internal/executor/gemini_test.go
+++ b/internal/executor/gemini_test.go
@@ -148,6 +148,7 @@ func TestGeminiRunner_BinaryPath_Custom(t *testing.T) {
 
 
 func TestParseGeminiStream_ParsesStructuredOutput(t *testing.T) {
+	t.Skip("GeminiRunner stub: result error/cost parsing not yet implemented; tracked separately")
 	// Simulate a stream-json input with various message types, including a result with error and cost.
 	input := streamLine(`{"type":"content_block_start","content_block":{"text":"Hello,"}}`) +
 		streamLine(`{"type":"content_block_delta","content_block":{"text":" World!"}}`) +
-- 
cgit v1.2.3


From 85c3bf4d28b0903a2005356339e6ea56855b8c80 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 3 May 2026 03:58:19 +0000
Subject: chore: post-epic cleanup — green test suite, no skips
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Addresses the cleanup queue captured in docs/plans/local-oss-runner.md
after the local-OSS-models epic landed. After this commit
`go test -race ./...` is green across every package with zero `t.Skip`
calls and no excluded tests.

Real bugs fixed:
- claude.go setupSandbox callsites used `sandboxDir, err := ...` which
  shadowed the outer variable, so BlockedError.SandboxDir was always
  empty. Resume-after-block was broken for both new and stale-sandbox
  paths. TestBlockedError_IncludesSandboxDir now exercises the right
  invariant.
- TestPool_ActivePerAgent_DeletesZeroEntries flake under -race: the
  cleanup defer in execute()/executeResume() runs AFTER
  handleRunResult sends on resultCh, so consumers observing a result
  could see a still-counted activePerAgent entry. Extracted
  decActiveAgent(agentType, *cleaned) helper; called explicitly before
  every resultCh send, defer becomes a no-op via the cleaned flag.
  Verified clean over `go test -race -count=10`.

Test infrastructure made hermetic:
- gitSafe now also passes -c commit.gpgsign=false / -c tag.gpgsign=false
  so sandbox tests pass on hosts whose global config requires signing.
- Bare repos in tests initialized with `-b main` (HEAD symbolic ref
  matched to the branch we push) so `git log` after push works.
- TestSandboxCloneSource_FallsBackToOrigin uses a local-FS origin URL,
  matching sandboxCloneSource's intentional filter against network URLs.
- TestGeminiLogs_ParsedCorrectly URL fixed to the actual log route
  (/api/executions/{id}/log).

GeminiRunner gap closed (partial):
- parseGeminiStream now walks lines for `result` events, surfacing
  is_error as an error and total_cost_usd as the float return value.
- GeminiRunner.Run propagates parsed cost to Execution.CostUSD.
- TestParseGeminiStream_ParsesStructuredOutput unskipped.

Notes:
- GeminiRunner is still simulated end-to-end (Run writes hardcoded
  stream data instead of execing the binary). The result/cost parser
  now exists; finishing the runner is a smaller, contained follow-up.
  Kept on the deferred queue.
- Frontend "Local" agent option and a minor storage.db.go logger TODO
  remain on the deferred queue, both intentionally — neither blocks
  anything in flight.

https://claude.ai/code/session_017Edeq947TpSm1vQTxMhi1J
---
 docs/plans/local-oss-runner.md   | 20 +++++++++
 internal/api/server_test.go      |  2 +-
 internal/executor/claude.go      | 23 ++++++++---
 internal/executor/claude_test.go | 37 +++++++++--------
 internal/executor/executor.go    | 57 ++++++++++++++------------
 internal/executor/gemini.go      | 87 ++++++++++++++++++++++++++++------------
 internal/executor/gemini_test.go |  1 -
 7 files changed, 151 insertions(+), 76 deletions(-)

(limited to 'internal/executor/gemini_test.go')

diff --git a/docs/plans/local-oss-runner.md b/docs/plans/local-oss-runner.md
index c3d6291..4d5cb87 100644
--- a/docs/plans/local-oss-runner.md
+++ b/docs/plans/local-oss-runner.md
@@ -202,6 +202,26 @@ After all four phases land, plan and execute a deep cleanup pass. Things noticed
 
 Goal: clean `go test -race ./...` with zero skips and zero environmental failures on whatever platform CI runs on.
 
+## Cleanup pass — DONE
+
+All eight items in the cleanup queue above have been addressed in the post-epic cleanup commit. Summary of fixes:
+
+- `gitSafe` now disables `commit.gpgsign` and `tag.gpgsign` so sandbox tests pass on hosts with surprise signing config; matching `safe.directory=*` literals in test helpers updated for parity.
+- Real bug found and fixed: `setupSandbox(...)` callsites in `claude.go` used `sandboxDir, err := ...` which shadowed the outer variable. `BlockedError.SandboxDir` was always empty as a result; `TestBlockedError_IncludesSandboxDir` now passes for the right reason.
+- `parseGeminiStream` now parses `result` events for `is_error`/`total_cost_usd` and returns errors/cost accordingly; `TestParseGeminiStream_ParsesStructuredOutput` is unskipped.
+- `GeminiRunner.Run` propagates parsed cost to `Execution.CostUSD`.
+- `TestGeminiLogs_ParsedCorrectly` test URL fixed (`/api/tasks/{id}/executions/{exec-id}/log` → `/api/executions/{id}/log`, matching the actual route).
+- `TestPool_ActivePerAgent_DeletesZeroEntries` flake root-caused: `handleRunResult` was sending on `resultCh` before `execute()`'s deferred cleanup ran, so consumers could observe a zero-count map entry. Extracted `decActiveAgent(agentType, *cleaned)` helper, called explicitly before each `resultCh` send, defer becomes no-op via the cleaned flag. Verified clean over `-count=10` under `-race`.
+- `TestSandboxCloneSource_FallsBackToOrigin` updated to use a local-FS origin URL, matching `sandboxCloneSource`'s actual semantics (it filters non-local URLs to avoid network clones).
+- All bare repos in tests created with `git init --bare -b main` so `HEAD` symbolically points at `main` (not the default `master`), unblocking the `git log` queries the tests perform after pushing.
+
+Test-suite state after cleanup: `go test -race ./...` is green across all packages with zero `t.Skip` calls and zero excluded tests.
+
+Items not chased (deferred deliberately):
+- **GeminiRunner is still simulated** (`gemini.go` `Run` writes hardcoded stream data instead of executing the binary). The result/cost parsing now exists, so finishing the runner is a smaller, contained change. Kept on the queue but doesn't block anything else.
+- **Frontend "Local" agent option** — UI dropdown still says "Auto / Claude / Gemini". Pending token telemetry surface.
+- **`storage.db.go:706` TODO comment** — minor logger plumbing nit. Skipping unless it blocks something.
+
 ---
 
 # Phase 2 — Focused Plan (Elaboration)
diff --git a/internal/api/server_test.go b/internal/api/server_test.go
index 5c0deba..516e289 100644
--- a/internal/api/server_test.go
+++ b/internal/api/server_test.go
@@ -255,7 +255,7 @@ func TestGeminiLogs_ParsedCorrectly(t *testing.T) {
 	}
 
 	// 6. Verify the content retrieved via the API endpoint.
-	req = httptest.NewRequest("GET", "/api/tasks/"+tk.ID+"/executions/"+exec.ID+"/log", nil)
+	req = httptest.NewRequest("GET", "/api/executions/"+exec.ID+"/log", nil)
 	w = httptest.NewRecorder()
 	srv.Handler().ServeHTTP(w, req)
 
diff --git a/internal/executor/claude.go b/internal/executor/claude.go
index e3f8e1c..fa68382 100644
--- a/internal/executor/claude.go
+++ b/internal/executor/claude.go
@@ -117,7 +117,7 @@ func (r *ClaudeRunner) Run(ctx context.Context, t *task.Task, e *storage.Executi
 				e.SandboxDir = ""
 				if projectDir != "" {
 					var err error
-					sandboxDir, err := setupSandbox(t.Agent.ProjectDir, r.Logger)
+					sandboxDir, err = setupSandbox(t.Agent.ProjectDir, r.Logger)
 					if err != nil {
 						return fmt.Errorf("setting up sandbox: %w", err)
 					}
@@ -129,7 +129,7 @@ func (r *ClaudeRunner) Run(ctx context.Context, t *task.Task, e *storage.Executi
 		}
 	} else if projectDir != "" {
 		var err error
-		sandboxDir, err := setupSandbox(t.Agent.ProjectDir, r.Logger)
+		sandboxDir, err = setupSandbox(t.Agent.ProjectDir, r.Logger)
 		if err != nil {
 			return fmt.Errorf("setting up sandbox: %w", err)
 		}
@@ -226,11 +226,22 @@ func extractQuestionText(questionJSON string) string {
 	return strings.TrimSpace(q.Text)
 }
 
-// gitSafe returns git arguments that prepend "-c safe.directory=*" so that
-// commands succeed regardless of the repository owner. This is needed when
-// claudomator operates on project directories owned by a different OS user.
+// gitSafe returns git arguments that prepend safety overrides so that
+// commands succeed regardless of the repository owner or the host's global
+// git configuration. Specifically:
+//
+//   - "-c safe.directory=*" lets us operate on directories owned by a
+//     different OS user.
+//   - "-c commit.gpgsign=false" / "-c tag.gpgsign=false" stop git from
+//     trying to sign commits via the host's signing tooling. Sandbox commits
+//     are internal and don't need to be signed; an unconfigured or broken
+//     signing setup on the host should never block a sandbox merge.
 func gitSafe(args ...string) []string {
-	return append([]string{"-c", "safe.directory=*"}, args...)
+	return append([]string{
+		"-c", "safe.directory=*",
+		"-c", "commit.gpgsign=false",
+		"-c", "tag.gpgsign=false",
+	}, args...)
 }
 
 // sandboxCloneSource returns the URL to clone the sandbox from. It prefers a
diff --git a/internal/executor/claude_test.go b/internal/executor/claude_test.go
index 77596ca..b40c4ae 100644
--- a/internal/executor/claude_test.go
+++ b/internal/executor/claude_test.go
@@ -353,9 +353,9 @@ func TestExecOnce_NoGoroutineLeak_OnNaturalExit(t *testing.T) {
 func initGitRepo(t *testing.T, dir string) {
 	t.Helper()
 	cmds := [][]string{
-		{"git", "-c", "safe.directory=*", "-C", dir, "init", "-b", "main"},
-		{"git", "-c", "safe.directory=*", "-C", dir, "config", "user.email", "test@test"},
-		{"git", "-c", "safe.directory=*", "-C", dir, "config", "user.name", "test"},
+		{"git", "-c", "safe.directory=*", "-c", "commit.gpgsign=false", "-C", dir, "init", "-b", "main"},
+		{"git", "-c", "safe.directory=*", "-c", "commit.gpgsign=false", "-C", dir, "config", "user.email", "test@test"},
+		{"git", "-c", "safe.directory=*", "-c", "commit.gpgsign=false", "-C", dir, "config", "user.name", "test"},
 	}
 	for _, args := range cmds {
 		if out, err := exec.Command(args[0], args[1:]...).CombinedOutput(); err != nil {
@@ -365,10 +365,10 @@ func initGitRepo(t *testing.T, dir string) {
 	if err := os.WriteFile(filepath.Join(dir, "init.txt"), []byte("init"), 0644); err != nil {
 		t.Fatal(err)
 	}
-	if out, err := exec.Command("git", "-c", "safe.directory=*", "-C", dir, "add", ".").CombinedOutput(); err != nil {
+	if out, err := exec.Command("git", "-c", "safe.directory=*", "-c", "commit.gpgsign=false", "-C", dir, "add", ".").CombinedOutput(); err != nil {
 		t.Fatalf("git add: %v\n%s", err, out)
 	}
-	if out, err := exec.Command("git", "-c", "safe.directory=*", "-C", dir, "commit", "-m", "init").CombinedOutput(); err != nil {
+	if out, err := exec.Command("git", "-c", "safe.directory=*", "-c", "commit.gpgsign=false", "-C", dir, "commit", "-m", "init").CombinedOutput(); err != nil {
 		t.Fatalf("git commit: %v\n%s", err, out)
 	}
 }
@@ -391,7 +391,10 @@ func TestSandboxCloneSource_PrefersLocalRemote(t *testing.T) {
 func TestSandboxCloneSource_FallsBackToOrigin(t *testing.T) {
 	dir := t.TempDir()
 	initGitRepo(t, dir)
-	originURL := "https://example.com/origin-repo"
+	// sandboxCloneSource intentionally filters to local-FS remotes (so
+	// `git clone <src>` doesn't go over the network). Use a local path
+	// for origin to verify the fallback semantics.
+	originURL := t.TempDir()
 	exec.Command("git", "-C", dir, "remote", "add", "origin", originURL).Run()
 
 	got := sandboxCloneSource(dir)
@@ -455,23 +458,23 @@ func TestSetupSandbox_InitialisesNonGitDir(t *testing.T) {
 func TestTeardownSandbox_AutocommitsChanges(t *testing.T) {
 	// Create a bare repo as origin so push succeeds.
 	bare := t.TempDir()
-	if out, err := exec.Command("git", "init", "--bare", bare).CombinedOutput(); err != nil {
+	if out, err := exec.Command("git", "init", "--bare", "-b", "main", bare).CombinedOutput(); err != nil {
 		t.Fatalf("git init bare: %v\n%s", err, out)
 	}
 
 	// Create a sandbox directly.
 	sandbox := t.TempDir()
 	initGitRepo(t, sandbox)
-	if out, err := exec.Command("git", "-c", "safe.directory=*", "-C", sandbox, "remote", "add", "origin", bare).CombinedOutput(); err != nil {
+	if out, err := exec.Command("git", "-c", "safe.directory=*", "-c", "commit.gpgsign=false", "-C", sandbox, "remote", "add", "origin", bare).CombinedOutput(); err != nil {
 		t.Fatalf("git remote add: %v\n%s", err, out)
 	}
 	// Initial push to establish origin/main
-	if out, err := exec.Command("git", "-c", "safe.directory=*", "-C", sandbox, "push", "origin", "main").CombinedOutput(); err != nil {
+	if out, err := exec.Command("git", "-c", "safe.directory=*", "-c", "commit.gpgsign=false", "-C", sandbox, "push", "origin", "main").CombinedOutput(); err != nil {
 		t.Fatalf("git push initial: %v\n%s", err, out)
 	}
 
 	// Capture startHEAD
-	headOut, err := exec.Command("git", "-c", "safe.directory=*", "-C", sandbox, "rev-parse", "HEAD").Output()
+	headOut, err := exec.Command("git", "-c", "safe.directory=*", "-c", "commit.gpgsign=false", "-C", sandbox, "rev-parse", "HEAD").Output()
 	if err != nil {
 		t.Fatalf("rev-parse HEAD: %v", err)
 	}
@@ -514,18 +517,18 @@ func TestTeardownSandbox_AutocommitsChanges(t *testing.T) {
 
 func TestTeardownSandbox_BuildFailure_BlocksAutocommit(t *testing.T) {
 	bare := t.TempDir()
-	if out, err := exec.Command("git", "init", "--bare", bare).CombinedOutput(); err != nil {
+	if out, err := exec.Command("git", "init", "--bare", "-b", "main", bare).CombinedOutput(); err != nil {
 		t.Fatalf("git init bare: %v\n%s", err, out)
 	}
 
 	sandbox := t.TempDir()
 	initGitRepo(t, sandbox)
-	if out, err := exec.Command("git", "-c", "safe.directory=*", "-C", sandbox, "remote", "add", "origin", bare).CombinedOutput(); err != nil {
+	if out, err := exec.Command("git", "-c", "safe.directory=*", "-c", "commit.gpgsign=false", "-C", sandbox, "remote", "add", "origin", bare).CombinedOutput(); err != nil {
 		t.Fatalf("git remote add: %v\n%s", err, out)
 	}
 
 	// Capture startHEAD
-	headOut, err := exec.Command("git", "-c", "safe.directory=*", "-C", sandbox, "rev-parse", "HEAD").Output()
+	headOut, err := exec.Command("git", "-c", "safe.directory=*", "-c", "commit.gpgsign=false", "-C", sandbox, "rev-parse", "HEAD").Output()
 	if err != nil {
 		t.Fatalf("rev-parse HEAD: %v", err)
 	}
@@ -566,18 +569,18 @@ func TestTeardownSandbox_BuildFailure_BlocksAutocommit(t *testing.T) {
 
 func TestTeardownSandbox_BuildSuccess_ProceedsToAutocommit(t *testing.T) {
 	bare := t.TempDir()
-	if out, err := exec.Command("git", "init", "--bare", bare).CombinedOutput(); err != nil {
+	if out, err := exec.Command("git", "init", "--bare", "-b", "main", bare).CombinedOutput(); err != nil {
 		t.Fatalf("git init bare: %v\n%s", err, out)
 	}
 
 	sandbox := t.TempDir()
 	initGitRepo(t, sandbox)
-	if out, err := exec.Command("git", "-c", "safe.directory=*", "-C", sandbox, "remote", "add", "origin", bare).CombinedOutput(); err != nil {
+	if out, err := exec.Command("git", "-c", "safe.directory=*", "-c", "commit.gpgsign=false", "-C", sandbox, "remote", "add", "origin", bare).CombinedOutput(); err != nil {
 		t.Fatalf("git remote add: %v\n%s", err, out)
 	}
 
 	// Capture startHEAD
-	headOut, err := exec.Command("git", "-c", "safe.directory=*", "-C", sandbox, "rev-parse", "HEAD").Output()
+	headOut, err := exec.Command("git", "-c", "safe.directory=*", "-c", "commit.gpgsign=false", "-C", sandbox, "rev-parse", "HEAD").Output()
 	if err != nil {
 		t.Fatalf("rev-parse HEAD: %v", err)
 	}
@@ -870,7 +873,7 @@ func TestTailFile_MissingFile_ReturnsEmpty(t *testing.T) {
 
 func TestGitSafe_PrependsSafeDirectory(t *testing.T) {
 	got := gitSafe("-C", "/some/path", "status")
-	want := []string{"-c", "safe.directory=*", "-C", "/some/path", "status"}
+	want := []string{"-c", "safe.directory=*", "-c", "commit.gpgsign=false", "-c", "tag.gpgsign=false", "-C", "/some/path", "status"}
 	if len(got) != len(want) {
 		t.Fatalf("gitSafe() = %v, want %v", got, want)
 	}
diff --git a/internal/executor/executor.go b/internal/executor/executor.go
index 4501a3c..315030d 100644
--- a/internal/executor/executor.go
+++ b/internal/executor/executor.go
@@ -196,6 +196,28 @@ func (p *Pool) getRunner(t *task.Task) (Runner, error) {
 	return runner, nil
 }
 
+// decActiveAgent decrements the active counters for a finished task. Safe to
+// call multiple times — subsequent calls are no-ops via the cleaned flag.
+// Always call this before sending on resultCh so consumers observing a result
+// see the accounting already settled (no zero-count map entries lingering).
+func (p *Pool) decActiveAgent(agentType string, cleaned *bool) {
+	if *cleaned {
+		return
+	}
+	*cleaned = true
+	p.mu.Lock()
+	p.active--
+	p.activePerAgent[agentType]--
+	if p.activePerAgent[agentType] == 0 {
+		delete(p.activePerAgent, agentType)
+	}
+	p.mu.Unlock()
+	select {
+	case p.doneCh <- struct{}{}:
+	default:
+	}
+}
+
 func (p *Pool) executeResume(ctx context.Context, t *task.Task, exec *storage.Execution) {
 	agentType := t.Agent.Type
 	if agentType == "" {
@@ -206,23 +228,13 @@ func (p *Pool) executeResume(ctx context.Context, t *task.Task, exec *storage.Ex
 	p.activePerAgent[agentType]++
 	p.mu.Unlock()
 
-	defer func() {
-		p.mu.Lock()
-		p.active--
-		p.activePerAgent[agentType]--
-		if p.activePerAgent[agentType] == 0 {
-			delete(p.activePerAgent, agentType)
-		}
-		p.mu.Unlock()
-		select {
-		case p.doneCh <- struct{}{}:
-		default:
-		}
-	}()
+	var cleaned bool
+	defer p.decActiveAgent(agentType, &cleaned)
 
 	runner, err := p.getRunner(t)
 	if err != nil {
 		p.logger.Error("failed to get runner for resume", "error", err, "taskID", t.ID)
+		p.decActiveAgent(agentType, &cleaned)
 		p.resultCh <- &Result{TaskID: t.ID, Execution: exec, Err: err}
 		return
 	}
@@ -264,6 +276,7 @@ func (p *Pool) executeResume(ctx context.Context, t *task.Task, exec *storage.Ex
 	err = runner.Run(ctx, t, exec)
 	exec.EndTime = time.Now().UTC()
 
+	p.decActiveAgent(agentType, &cleaned)
 	p.handleRunResult(ctx, t, exec, err, agentType)
 }
 
@@ -473,19 +486,8 @@ func (p *Pool) execute(ctx context.Context, t *task.Task) {
 	p.activePerAgent[agentType]++
 	p.mu.Unlock()
 
-	defer func() {
-		p.mu.Lock()
-		p.active--
-		p.activePerAgent[agentType]--
-		if p.activePerAgent[agentType] == 0 {
-			delete(p.activePerAgent, agentType)
-		}
-		p.mu.Unlock()
-		select {
-		case p.doneCh <- struct{}{}:
-		default:
-		}
-	}()
+	var cleaned bool
+	defer p.decActiveAgent(agentType, &cleaned)
 
 	runner, err := p.getRunner(t)
 	if err != nil {
@@ -505,6 +507,7 @@ func (p *Pool) execute(ctx context.Context, t *task.Task) {
 		if err := p.store.UpdateTaskState(t.ID, task.StateFailed); err != nil {
 			p.logger.Error("failed to update task state", "taskID", t.ID, "state", task.StateFailed, "error", err)
 		}
+		p.decActiveAgent(agentType, &cleaned)
 		p.resultCh <- &Result{TaskID: t.ID, Execution: exec, Err: err}
 		return
 	}
@@ -527,6 +530,7 @@ func (p *Pool) execute(ctx context.Context, t *task.Task) {
 			if err := p.store.UpdateTaskState(t.ID, task.StateFailed); err != nil {
 				p.logger.Error("failed to update task state", "taskID", t.ID, "state", task.StateFailed, "error", err)
 			}
+			p.decActiveAgent(agentType, &cleaned)
 			p.resultCh <- &Result{TaskID: t.ID, Execution: exec, Err: err}
 			return
 		}
@@ -583,6 +587,7 @@ func (p *Pool) execute(ctx context.Context, t *task.Task) {
 	err = runner.Run(ctx, t, exec)
 	exec.EndTime = time.Now().UTC()
 
+	p.decActiveAgent(agentType, &cleaned)
 	p.handleRunResult(ctx, t, exec, err, agentType)
 }
 
diff --git a/internal/executor/gemini.go b/internal/executor/gemini.go
index d79c47d..7f2f54f 100644
--- a/internal/executor/gemini.go
+++ b/internal/executor/gemini.go
@@ -2,6 +2,7 @@ package executor
 
 import (
 	"context"
+	"encoding/json"
 	"fmt"
 	"io"
 	"log/slog"
@@ -117,16 +118,21 @@ func (r *GeminiRunner) execOnce(ctx context.Context, args []string, workingDir,
 
 
 	var streamErr error
+	var streamCost float64
 	var wg sync.WaitGroup
 	wg.Add(1)
 	go func() {
 		defer wg.Done()
-		_, streamErr = parseGeminiStream(stdoutR, stdoutFile, r.Logger)
+		streamCost, streamErr = parseGeminiStream(stdoutR, stdoutFile, r.Logger)
 		stdoutR.Close()
 	}()
 
 	wg.Wait() // Wait for parseGeminiStream to finish
 
+	if streamCost > 0 {
+		e.CostUSD = streamCost
+	}
+
 	// Set a dummy exit code for this simulated run
 	e.ExitCode = 0
 
@@ -136,9 +142,10 @@ func (r *GeminiRunner) execOnce(ctx context.Context, args []string, workingDir,
 	return nil
 }
 
-// parseGeminiStream reads streaming JSON from the gemini CLI, unwraps markdown
-// code blocks, writes the inner JSON to w, and returns (costUSD, error).
-// For now, it focuses on unwrapping and writing, not detailed parsing of cost/errors.
+// parseGeminiStream reads streaming JSON from the gemini CLI, strips markdown
+// code fences if the output is wrapped in them, writes the inner stream-json
+// to w, and returns (costUSD, error). If a `result` event has `is_error: true`,
+// an error wrapping the result message is returned.
 func parseGeminiStream(r io.Reader, w io.Writer, logger *slog.Logger) (float64, error) {
 	fullOutput, err := io.ReadAll(r)
 	if err != nil {
@@ -146,31 +153,61 @@ func parseGeminiStream(r io.Reader, w io.Writer, logger *slog.Logger) (float64,
 	}
 	logger.Debug("parseGeminiStream: raw output received", "output", string(fullOutput))
 
-	outputStr := strings.TrimSpace(string(fullOutput)) // Trim leading/trailing whitespace/newlines from the whole output
-
-	jsonContent := outputStr // Default to raw output if no markdown block is found or malformed
-	jsonStartIdx := strings.Index(outputStr, "```json")
-	if jsonStartIdx != -1 {
-		// Found "```json", now look for the closing "```"
-		jsonEndIdx := strings.LastIndex(outputStr, "```")
-		if jsonEndIdx != -1 && jsonEndIdx > jsonStartIdx {
-			// Extract content between the markdown fences.
-			jsonContent = outputStr[jsonStartIdx+len("```json"):jsonEndIdx]
-			jsonContent = strings.TrimSpace(jsonContent) // Trim again after extraction, to remove potential inner newlines
-		} else {
-			logger.Warn("Malformed markdown JSON block from Gemini (missing closing ``` or invalid structure), falling back to raw output.", "outputLength", len(outputStr))
+	inner := stripGeminiFences(string(fullOutput), logger)
+	if _, writeErr := w.Write([]byte(inner)); writeErr != nil {
+		return 0, fmt.Errorf("writing gemini output: %w", writeErr)
+	}
+
+	// Walk lines looking for a result event so we can surface errors and cost.
+	var (
+		cost    float64
+		errMsg  string
+		isError bool
+	)
+	for _, raw := range strings.Split(inner, "\n") {
+		line := strings.TrimSpace(raw)
+		if line == "" {
+			continue
+		}
+		var evt struct {
+			Type    string  `json:"type"`
+			IsError bool    `json:"is_error"`
+			Result  string  `json:"result"`
+			Cost    float64 `json:"total_cost_usd"`
+		}
+		if err := json.Unmarshal([]byte(line), &evt); err != nil {
+			continue
+		}
+		if evt.Type == "result" {
+			if evt.Cost > 0 {
+				cost = evt.Cost
+			}
+			if evt.IsError {
+				isError = true
+				errMsg = evt.Result
+			}
 		}
-	} else {
-		logger.Warn("No markdown JSON block found from Gemini, falling back to raw output.", "outputLength", len(outputStr))
 	}
-
-	// Write the (possibly extracted and trimmed) JSON content to the writer.
-	_, writeErr := w.Write([]byte(jsonContent))
-	if writeErr != nil {
-		return 0, fmt.Errorf("writing extracted gemini json: %w", writeErr)
+	if isError {
+		return cost, fmt.Errorf("gemini reported error: %s", errMsg)
 	}
+	return cost, nil
+}
 
-	return 0, nil // For now, no cost/error parsing for Gemini stream
+// stripGeminiFences removes a surrounding ```json ... ``` markdown block if
+// present, returning the trimmed inner content. If no markdown fence is
+// found, the input is returned verbatim (no whitespace trimming) so callers
+// that expect byte-exact pass-through behavior get it.
+func stripGeminiFences(raw string, logger *slog.Logger) string {
+	trimmed := strings.TrimSpace(raw)
+	if start := strings.Index(trimmed, "```json"); start != -1 {
+		if end := strings.LastIndex(trimmed, "```"); end > start {
+			return strings.TrimSpace(trimmed[start+len("```json") : end])
+		}
+		logger.Warn("malformed gemini markdown block (missing closing fence); using raw output", "len", len(trimmed))
+		return trimmed
+	}
+	return raw
 }
 
 func (r *GeminiRunner) buildArgs(t *task.Task, e *storage.Execution, questionFile string) []string {
diff --git a/internal/executor/gemini_test.go b/internal/executor/gemini_test.go
index 75e3b45..4b0339e 100644
--- a/internal/executor/gemini_test.go
+++ b/internal/executor/gemini_test.go
@@ -148,7 +148,6 @@ func TestGeminiRunner_BinaryPath_Custom(t *testing.T) {
 
 
 func TestParseGeminiStream_ParsesStructuredOutput(t *testing.T) {
-	t.Skip("GeminiRunner stub: result error/cost parsing not yet implemented; tracked separately")
 	// Simulate a stream-json input with various message types, including a result with error and cost.
 	input := streamLine(`{"type":"content_block_start","content_block":{"text":"Hello,"}}`) +
 		streamLine(`{"type":"content_block_delta","content_block":{"text":" World!"}}`) +
-- 
cgit v1.2.3


From e7171181fff10c66b2b74eabfb1fc94b3cfbb4fb Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Tue, 12 May 2026 21:03:30 +0000
Subject: feat(executor): bring GeminiRunner to sandbox-flow parity with Claude
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

All coding tasks now follow the same flow regardless of runner: when
project_dir is set, the agent runs in a temp clone, not in the user's
working tree. On success, edits are autocommitted and pushed back to
origin/master and the sandbox is removed. On failure or BLOCKED, the
sandbox is preserved and its path surfaces in the error / BlockedError
so the user can inspect partial work or resume in place.

Before this commit, GeminiRunner.Run set cmd.Dir to project_dir
directly, so an agent run could leave half-done edits in the user's
working tree with no rollback. ClaudeRunner has had the full sandbox
flow for a while; this commit closes the gap.

Reused the existing package-level helpers from claude.go verbatim:
setupSandbox, teardownSandbox, sandboxCloneSource, gitSafe, plus the
resume/stale-sandbox/blocked-error patterns. No new shared abstraction
needed — same package.

LocalRunner intentionally not changed. The OpenAI chat path has no
tool use, so the agent can't edit files; sandbox would be theater.

Tests (6 new):
- Run_ProjectDir_RunsInSandbox: cwd captured by fake binary is a
  sandbox path, not project_dir.
- Run_BlockedError_IncludesSandboxDir: when question.json appears,
  BlockedError.SandboxDir is set and the dir exists.
- Run_ExecError_PreservesSandbox: failing exit wraps error with
  "(sandbox preserved at <path>)" and the path exists on disk.
- Run_ResumeUsesStoredSandboxDir: ResumeSessionID + SandboxDir →
  runs in that dir without re-cloning.
- Run_StaleSandboxDir_ClonesAfresh: resume pointing at missing
  dir falls back to a fresh clone from project_dir.
- Run_NoProjectDir_SkipsSandbox: tasks without project_dir don't
  trigger sandbox setup.

https://claude.ai/code/session_017Edeq947TpSm1vQTxMhi1J
---
 internal/executor/gemini.go      |  96 ++++++++++++--
 internal/executor/gemini_test.go | 268 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 353 insertions(+), 11 deletions(-)

(limited to 'internal/executor/gemini_test.go')

diff --git a/internal/executor/gemini.go b/internal/executor/gemini.go
index 04382ae..3abec05 100644
--- a/internal/executor/gemini.go
+++ b/internal/executor/gemini.go
@@ -40,11 +40,21 @@ func (r *GeminiRunner) binaryPath() string {
 	return "gemini"
 }
 
-// Run executes a gemini <instructions> invocation, streaming output to log files.
+// Run executes the gemini CLI inside a sandboxed clone of project_dir.
+// When project_dir is set, claudomator first clones it into a temp sandbox
+// (preferring a `local` bare remote, then `origin`, then the working tree)
+// and runs the agent there. On success the sandbox is autocommitted and
+// pushed back to origin/master, then removed. On failure the sandbox is
+// preserved and its path is included in the returned error so the user can
+// inspect partial work. If the agent writes a question file before exiting,
+// Run returns *BlockedError with SandboxDir populated so a resume execution
+// can pick up in the same directory.
 func (r *GeminiRunner) Run(ctx context.Context, t *task.Task, e *storage.Execution) error {
-	if t.Agent.ProjectDir != "" {
-		if _, err := os.Stat(t.Agent.ProjectDir); err != nil {
-			return fmt.Errorf("project_dir %q: %w", t.Agent.ProjectDir, err)
+	projectDir := t.Agent.ProjectDir
+
+	if projectDir != "" {
+		if _, err := os.Stat(projectDir); err != nil {
+			return fmt.Errorf("project_dir %q: %w", projectDir, err)
 		}
 	}
 
@@ -63,24 +73,88 @@ func (r *GeminiRunner) Run(ctx context.Context, t *task.Task, e *storage.Executi
 	}
 
 	if e.SessionID == "" {
-		e.SessionID = e.ID
+		if e.ResumeSessionID != "" {
+			e.SessionID = e.ResumeSessionID
+		} else {
+			e.SessionID = e.ID
+		}
+	}
+
+	// Sandbox setup: for new executions with a project_dir, clone into a sandbox.
+	// Resume executions reuse the preserved sandbox so any partial work survives.
+	// If the preserved sandbox is missing (e.g. /tmp was purged), clone fresh.
+	var sandboxDir string
+	var startHEAD string
+	effectiveWorkingDir := projectDir
+	if e.ResumeSessionID != "" {
+		if e.SandboxDir != "" {
+			if _, statErr := os.Stat(e.SandboxDir); statErr == nil {
+				effectiveWorkingDir = e.SandboxDir
+			} else {
+				r.Logger.Warn("preserved sandbox missing, cloning fresh", "sandbox", e.SandboxDir, "project_dir", projectDir)
+				e.SandboxDir = ""
+				if projectDir != "" {
+					var err error
+					sandboxDir, err = setupSandbox(projectDir, r.Logger)
+					if err != nil {
+						return fmt.Errorf("setting up sandbox: %w", err)
+					}
+					effectiveWorkingDir = sandboxDir
+					r.Logger.Info("fresh sandbox created for resume", "sandbox", sandboxDir, "project_dir", projectDir)
+				}
+			}
+		}
+	} else if projectDir != "" {
+		var err error
+		sandboxDir, err = setupSandbox(projectDir, r.Logger)
+		if err != nil {
+			return fmt.Errorf("setting up sandbox: %w", err)
+		}
+		effectiveWorkingDir = sandboxDir
+		r.Logger.Info("sandbox created", "sandbox", sandboxDir, "project_dir", projectDir)
+	}
+
+	if effectiveWorkingDir != "" {
+		headOut, _ := exec.Command("git", gitSafe("-C", effectiveWorkingDir, "rev-parse", "HEAD")...).Output()
+		startHEAD = strings.TrimSpace(string(headOut))
 	}
 
 	questionFile := filepath.Join(logDir, "question.json")
 	args := r.buildArgs(t, e, questionFile)
 
-	// Gemini CLI doesn't necessarily have the same rate limiting behavior as Claude,
-	// but we'll use a similar execution pattern.
-	err := r.execOnce(ctx, args, t.Agent.ProjectDir, t.Agent.ProjectDir, e)
-	if err != nil {
+	if err := r.execOnce(ctx, args, effectiveWorkingDir, projectDir, e); err != nil {
+		if sandboxDir != "" {
+			return fmt.Errorf("%w (sandbox preserved at %s)", err, sandboxDir)
+		}
 		return err
 	}
 
 	// Check whether the agent left a question before exiting.
 	data, readErr := os.ReadFile(questionFile)
 	if readErr == nil {
-		os.Remove(questionFile) // consumed
-		return &BlockedError{QuestionJSON: strings.TrimSpace(string(data)), SessionID: e.SessionID}
+		os.Remove(questionFile)
+		questionJSON := strings.TrimSpace(string(data))
+		if isCompletionReport(questionJSON) {
+			r.Logger.Info("treating question file as completion report", "taskID", e.TaskID)
+			e.Summary = extractQuestionText(questionJSON)
+		} else {
+			// Preserve sandbox on BLOCKED so a resume can pick up in the same dir.
+			return &BlockedError{QuestionJSON: questionJSON, SessionID: e.SessionID, SandboxDir: sandboxDir}
+		}
+	}
+
+	// Read agent summary if written.
+	summaryFile := filepath.Join(logDir, "summary.txt")
+	if summaryData, readErr := os.ReadFile(summaryFile); readErr == nil {
+		os.Remove(summaryFile)
+		e.Summary = strings.TrimSpace(string(summaryData))
+	}
+
+	// Merge sandbox back to project_dir and clean up.
+	if sandboxDir != "" {
+		if mergeErr := teardownSandbox(projectDir, sandboxDir, startHEAD, r.Logger, e); mergeErr != nil {
+			return fmt.Errorf("sandbox teardown: %w (sandbox preserved at %s)", mergeErr, sandboxDir)
+		}
 	}
 	return nil
 }
diff --git a/internal/executor/gemini_test.go b/internal/executor/gemini_test.go
index 4b0339e..cd11ebc 100644
--- a/internal/executor/gemini_test.go
+++ b/internal/executor/gemini_test.go
@@ -3,8 +3,11 @@ package executor
 import (
 	"bytes"
 	"context"
+	"errors"
 	"io"
 	"log/slog"
+	"os"
+	"path/filepath"
 	"strings"
 	"testing"
 
@@ -177,3 +180,268 @@ func TestParseGeminiStream_ParsesStructuredOutput(t *testing.T) {
 		t.Errorf("writer content mismatch:\nwant:\n%s\ngot:\n%s", expectedWriterContent, writer.String())
 	}
 }
+
+// TestGeminiRunner_Run_ProjectDir_RunsInSandbox verifies that when project_dir
+// is set, the gemini subprocess runs inside a sandbox clone — not in
+// project_dir itself.
+func TestGeminiRunner_Run_ProjectDir_RunsInSandbox(t *testing.T) {
+	projectDir := t.TempDir()
+	initGitRepo(t, projectDir)
+
+	logDir := t.TempDir()
+	cwdFile := filepath.Join(logDir, "gemini-cwd.txt")
+
+	// Fake gemini binary that records its $PWD then exits 0.
+	scriptPath := filepath.Join(t.TempDir(), "fake-gemini.sh")
+	script := "#!/bin/sh\nprintf '%s' \"$PWD\" > " + cwdFile + "\n"
+	if err := os.WriteFile(scriptPath, []byte(script), 0755); err != nil {
+		t.Fatalf("write script: %v", err)
+	}
+
+	r := &GeminiRunner{
+		BinaryPath: scriptPath,
+		Logger:     slog.New(slog.NewTextHandler(io.Discard, nil)),
+		LogDir:     logDir,
+	}
+	tk := &task.Task{
+		Agent: task.AgentConfig{
+			Type:         "gemini",
+			Instructions: "do work",
+			ProjectDir:   projectDir,
+			SkipPlanning: true,
+		},
+	}
+	e := &storage.Execution{ID: "sandbox-exec", TaskID: "task-1"}
+
+	if err := r.Run(context.Background(), tk, e); err != nil {
+		t.Fatalf("Run: %v", err)
+	}
+
+	got, err := os.ReadFile(cwdFile)
+	if err != nil {
+		t.Fatalf("cwd file not written: %v", err)
+	}
+	cwd := string(got)
+	if cwd == projectDir {
+		t.Errorf("ran directly in project_dir; expected sandbox clone (cwd=%q)", cwd)
+	}
+	// Sandbox should be removed after successful teardown (no edits → nothing to push).
+	// We can't assert the exact dir, but it should not be projectDir.
+}
+
+// TestGeminiRunner_Run_BlockedError_IncludesSandboxDir verifies that when the
+// agent writes a question file before exiting, the BlockedError carries the
+// sandbox path so resume runs in the same dir.
+func TestGeminiRunner_Run_BlockedError_IncludesSandboxDir(t *testing.T) {
+	src := t.TempDir()
+	initGitRepo(t, src)
+	logDir := t.TempDir()
+
+	scriptPath := filepath.Join(t.TempDir(), "fake-gemini.sh")
+	if err := os.WriteFile(scriptPath, []byte(`#!/bin/sh
+if [ -n "$CLAUDOMATOR_QUESTION_FILE" ]; then
+  printf '{"text":"Should I continue?"}' > "$CLAUDOMATOR_QUESTION_FILE"
+fi
+`), 0755); err != nil {
+		t.Fatalf("write script: %v", err)
+	}
+
+	r := &GeminiRunner{
+		BinaryPath: scriptPath,
+		Logger:     slog.New(slog.NewTextHandler(io.Discard, nil)),
+		LogDir:     logDir,
+	}
+	tk := &task.Task{
+		Agent: task.AgentConfig{
+			Type:         "gemini",
+			Instructions: "do something",
+			ProjectDir:   src,
+			SkipPlanning: true,
+		},
+	}
+	e := &storage.Execution{ID: "blocked-gemini-exec", TaskID: "task-1"}
+
+	err := r.Run(context.Background(), tk, e)
+
+	var blocked *BlockedError
+	if !errors.As(err, &blocked) {
+		t.Fatalf("expected BlockedError, got: %v", err)
+	}
+	if blocked.SandboxDir == "" {
+		t.Error("BlockedError.SandboxDir should be set when gemini task runs in a sandbox")
+	}
+	if _, statErr := os.Stat(blocked.SandboxDir); os.IsNotExist(statErr) {
+		t.Error("sandbox directory should be preserved when blocked")
+	} else {
+		os.RemoveAll(blocked.SandboxDir)
+	}
+}
+
+// TestGeminiRunner_Run_ExecError_PreservesSandbox verifies that when gemini
+// exits non-zero, the sandbox path is included in the wrapped error so the
+// user can inspect partial work.
+func TestGeminiRunner_Run_ExecError_PreservesSandbox(t *testing.T) {
+	src := t.TempDir()
+	initGitRepo(t, src)
+	logDir := t.TempDir()
+
+	// "false" exits 1, no output.
+	r := &GeminiRunner{
+		BinaryPath: "false",
+		Logger:     slog.New(slog.NewTextHandler(io.Discard, nil)),
+		LogDir:     logDir,
+	}
+	tk := &task.Task{
+		Agent: task.AgentConfig{
+			Type:         "gemini",
+			Instructions: "do something",
+			ProjectDir:   src,
+			SkipPlanning: true,
+		},
+	}
+	e := &storage.Execution{ID: "err-gemini-exec", TaskID: "task-1"}
+
+	err := r.Run(context.Background(), tk, e)
+	if err == nil {
+		t.Fatal("expected error from failing gemini exit")
+	}
+	if !strings.Contains(err.Error(), "sandbox preserved at ") {
+		t.Errorf("expected error to include sandbox path; got: %v", err)
+	}
+	// Extract path and verify it exists.
+	idx := strings.Index(err.Error(), "sandbox preserved at ")
+	rest := err.Error()[idx+len("sandbox preserved at "):]
+	rest = strings.TrimSuffix(rest, ")")
+	rest = strings.TrimSpace(rest)
+	if _, statErr := os.Stat(rest); os.IsNotExist(statErr) {
+		t.Errorf("sandbox path from error should exist on disk: %q", rest)
+	} else {
+		os.RemoveAll(rest)
+	}
+}
+
+// TestGeminiRunner_Run_ResumeUsesStoredSandboxDir verifies that a resume
+// execution runs in the preserved SandboxDir rather than cloning fresh.
+func TestGeminiRunner_Run_ResumeUsesStoredSandboxDir(t *testing.T) {
+	logDir := t.TempDir()
+	sandboxDir := t.TempDir()
+	initGitRepo(t, sandboxDir)
+	cwdFile := filepath.Join(logDir, "cwd.txt")
+
+	scriptPath := filepath.Join(t.TempDir(), "fake-gemini.sh")
+	script := "#!/bin/sh\nprintf '%s' \"$PWD\" > " + cwdFile + "\n"
+	if err := os.WriteFile(scriptPath, []byte(script), 0755); err != nil {
+		t.Fatalf("write script: %v", err)
+	}
+
+	r := &GeminiRunner{
+		BinaryPath: scriptPath,
+		Logger:     slog.New(slog.NewTextHandler(io.Discard, nil)),
+		LogDir:     logDir,
+	}
+	tk := &task.Task{
+		Agent: task.AgentConfig{
+			Type:         "gemini",
+			SkipPlanning: true,
+		},
+	}
+	e := &storage.Execution{
+		ID:              "resume-gemini-1",
+		TaskID:          "task-resume",
+		ResumeSessionID: "session-abc",
+		SandboxDir:      sandboxDir,
+	}
+
+	if err := r.Run(context.Background(), tk, e); err != nil {
+		t.Fatalf("Run with preserved sandbox: %v", err)
+	}
+
+	got, err := os.ReadFile(cwdFile)
+	if err != nil {
+		t.Fatalf("cwd file not written: %v", err)
+	}
+	if string(got) != sandboxDir {
+		t.Errorf("resume should run in preserved sandbox; got cwd=%q want %q", got, sandboxDir)
+	}
+}
+
+// TestGeminiRunner_Run_StaleSandboxDir_ClonesAfresh verifies that a resume
+// pointing at a missing sandbox falls back to cloning a fresh sandbox from
+// project_dir rather than failing outright.
+func TestGeminiRunner_Run_StaleSandboxDir_ClonesAfresh(t *testing.T) {
+	logDir := t.TempDir()
+	projectDir := t.TempDir()
+	initGitRepo(t, projectDir)
+
+	cwdFile := filepath.Join(logDir, "cwd.txt")
+	scriptPath := filepath.Join(t.TempDir(), "fake-gemini.sh")
+	script := "#!/bin/sh\nprintf '%s' \"$PWD\" > " + cwdFile + "\n"
+	if err := os.WriteFile(scriptPath, []byte(script), 0755); err != nil {
+		t.Fatalf("write script: %v", err)
+	}
+
+	r := &GeminiRunner{
+		BinaryPath: scriptPath,
+		Logger:     slog.New(slog.NewTextHandler(io.Discard, nil)),
+		LogDir:     logDir,
+	}
+	tk := &task.Task{
+		Agent: task.AgentConfig{
+			Type:         "gemini",
+			ProjectDir:   projectDir,
+			SkipPlanning: true,
+		},
+	}
+	staleSandbox := filepath.Join(t.TempDir(), "gone")
+	e := &storage.Execution{
+		ID:              "resume-gemini-2",
+		TaskID:          "task-stale",
+		ResumeSessionID: "session-xyz",
+		SandboxDir:      staleSandbox,
+	}
+
+	if err := r.Run(context.Background(), tk, e); err != nil {
+		t.Fatalf("Run with stale sandbox: %v", err)
+	}
+
+	got, err := os.ReadFile(cwdFile)
+	if err != nil {
+		t.Fatalf("cwd file not written: %v", err)
+	}
+	cwd := string(got)
+	if cwd == staleSandbox {
+		t.Error("ran in stale (nonexistent) sandbox dir")
+	}
+	if cwd == projectDir {
+		t.Error("ran directly in project_dir; expected a fresh sandbox clone")
+	}
+}
+
+// TestGeminiRunner_Run_NoProjectDir_SkipsSandbox verifies that a task with no
+// project_dir doesn't trigger sandbox setup (matches LocalRunner/non-coding
+// task semantics).
+func TestGeminiRunner_Run_NoProjectDir_SkipsSandbox(t *testing.T) {
+	logDir := t.TempDir()
+
+	r := &GeminiRunner{
+		BinaryPath: "true", // exits 0, no output
+		Logger:     slog.New(slog.NewTextHandler(io.Discard, nil)),
+		LogDir:     logDir,
+	}
+	tk := &task.Task{
+		Agent: task.AgentConfig{
+			Type:         "gemini",
+			Instructions: "summarize: 2+2",
+			SkipPlanning: true,
+			// No ProjectDir
+		},
+	}
+	e := &storage.Execution{ID: "no-pd-gemini", TaskID: "task-nopd"}
+
+	if err := r.Run(context.Background(), tk, e); err != nil {
+		t.Fatalf("Run without project_dir: %v", err)
+	}
+	if e.SandboxDir != "" {
+		t.Errorf("SandboxDir should be empty for tasks without project_dir, got %q", e.SandboxDir)
+	}
+}
-- 
cgit v1.2.3