From 7df4f06ae0e3ae80bd967bf53cbec36e58b4a3bd Mon Sep 17 00:00:00 2001
From: Peter Stone <thepeterstone@gmail.com>
Date: Wed, 18 Mar 2026 23:56:20 +0000
Subject: feat: containerized execution with agent tooling and deployment fixes

- ContainerRunner replaces ClaudeRunner/GeminiRunner; all agent types run
  in Docker containers via claudomator-agent:latest
- Writable agentHome staging dir (/home/agent) satisfies home-dir
  requirements for both claude and gemini CLIs without exposing host creds
- Copy .credentials.json and .claude.json into staging dir at run time;
  GEMINI_API_KEY passed via env file
- Fix git clone: remove MkdirTemp-created dir before cloning (git rejects
  pre-existing dirs even when empty)
- Replace localhost with host.docker.internal in APIURL so container can
  reach host API; add --add-host=host.docker.internal:host-gateway
- Run container as --user=$(uid):$(gid) so host-owned workspace files are
  readable; chmod workspace 0755 and instructions file 0644 after clone
- Pre-create .gemini/ in staging dir to avoid atomic-rename ENOENT on first
  gemini-cli run
- Add ct CLI tool to container image: pre-built Bash wrapper for
  Claudomator API (ct task submit/create/run/wait/status/list)
- Document ct tool in CLAUDE.md agent instructions section
- Add drain-failed-tasks script: retries failed tasks on a 5-minute interval
- Update Dockerfile: Node 22 via NodeSource, Go 1.24, gemini-cli,
  git safe.directory=*, default ~/.claude.json

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 CLAUDE.md                           |  52 ++++++++-
 images/agent-base/Dockerfile        |  43 +++++---
 images/agent-base/tools/ct          | 210 ++++++++++++++++++++++++++++++++++++
 internal/cli/serve.go               |  50 ++++-----
 internal/executor/container.go      |  79 ++++++++++----
 internal/executor/container_test.go |   9 +-
 scripts/drain-failed-tasks          |  22 ++++
 7 files changed, 398 insertions(+), 67 deletions(-)
 create mode 100644 images/agent-base/tools/ct
 create mode 100644 scripts/drain-failed-tasks
diff --git a/CLAUDE.md b/CLAUDE.md
index 2cb37a8..d804a96 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -53,14 +53,14 @@ Config defaults to `~/.claudomator/config.toml`. Data is stored in `~/.claudomat
 
 ## Architecture
 
-**Pipeline:** CLI/API → `executor.Pool` → `executor.ClaudeRunner` → `claude -p` subprocess → SQLite + log files
+**Pipeline:** CLI/API → `executor.Pool` → `executor.ContainerRunner` → Docker container → SQLite + log files
 
 ### Packages
 
 | Package | Role |
 |---|---|
 | `internal/task` | `Task` struct, YAML parsing, state machine, validation |
-| `internal/executor` | `Pool` (bounded goroutine pool) + `ClaudeRunner` (subprocess manager) |
+| `internal/executor` | `Pool` (bounded goroutine pool) + `ContainerRunner` (Docker-based executor) |
 | `internal/storage` | SQLite wrapper; stores tasks and execution records |
 | `internal/api` | HTTP server (REST + WebSocket via `internal/api.Hub`) |
 | `internal/reporter` | Formats and emits execution results |
@@ -72,9 +72,9 @@ Config defaults to `~/.claudomator/config.toml`. Data is stored in `~/.claudomat
 **Task execution:**
 1. Task created via `POST /api/tasks` or YAML file (`task.ParseFile`)
 2. `POST /api/tasks/{id}/run` → `executor.Pool.Submit()` → goroutine in pool
-3. `ClaudeRunner.Run()` invokes `claude -p <instructions> --output-format stream-json`
-4. stdout streamed to `~/.claudomator/executions/<exec-id>/stdout.log`; cost parsed from stream-json
-5. Execution result written to SQLite; broadcast via WebSocket to connected clients
+3. `ContainerRunner.Run()` clones `repository_url`, runs `docker run claudomator-agent:latest`
+4. Agent runs `claude -p` inside the container; stdout streamed to `executions/<exec-id>/stdout.log`
+5. On success, runner pushes commits back to the remote; execution result written to SQLite + WebSocket broadcast
 
 **State machine** (`task.ValidTransition`):
 `PENDING` → `QUEUED` → `RUNNING` → `COMPLETED | FAILED | TIMED_OUT | CANCELLED | BUDGET_EXCEEDED`
@@ -166,6 +166,48 @@ A task is created for:
 
 Tasks are tagged `["ci", "auto"]`, capped at $3 USD, and use tools: Read, Edit, Bash, Glob, Grep.
 
+## Agent Tooling (`ct` CLI)
+
+Agents running inside containers have access to `ct`, a pre-built CLI for interacting with the Claudomator API. It is installed at `/usr/local/bin/ct` in the container image. **Use `ct` to create and manage subtasks — do not attempt raw `curl` API calls.**
+
+### Environment (injected automatically)
+
+| Variable | Purpose |
+|---|---|
+| `CLAUDOMATOR_API_URL` | Base URL of the Claudomator API (e.g. `http://host.docker.internal:8484`) |
+| `CLAUDOMATOR_TASK_ID` | ID of the currently-running task; used as the default `parent_task_id` for new subtasks |
+
+### Commands
+
+```bash
+# Create a subtask and immediately queue it (returns task ID)
+ct task submit --name "Fix tests" --instructions "Run tests and fix any failures." [--model sonnet] [--budget 3.0]
+
+# Create, queue, and wait for completion (exit code 0=COMPLETED, 2=BLOCKED, 1=any other terminal state or wait timeout)
+ct task submit --name "Fix tests" --instructions "..." --wait
+
+# Read instructions from a file instead of inline
+ct task submit --name "Fix tests" --file /workspace/subtask-instructions.txt --wait
+
+# Lower-level: create only (returns task ID), then run separately
+TASK_ID=$(ct task create --name "..." --instructions "...")
+ct task run "$TASK_ID"
+ct task wait "$TASK_ID" --timeout 600
+
+# Check status of any task
+ct task status <task-id>
+
+# List recent tasks
+ct task list
+```
+
+### Notes
+
+- Default model is `sonnet`; default budget is `$3.00 USD`. Override with `--model` / `--budget`.
+- `ct task wait` polls every 5 seconds, prints the task's terminal state on stdout, and gives up after 300 seconds unless `--timeout` is set.
+- Subtasks inherit the current task as their parent automatically (via `$CLAUDOMATOR_TASK_ID`).
+- Override parent with `--parent <task-id>` if needed.
+
 ## ADRs
 
 See `docs/adr/001-language-and-architecture.md` for the Go + SQLite + WebSocket rationale.
diff --git a/images/agent-base/Dockerfile b/images/agent-base/Dockerfile
index 6fb253c..0e8057c 100644
--- a/images/agent-base/Dockerfile
+++ b/images/agent-base/Dockerfile
@@ -1,45 +1,58 @@
 # Claudomator Agent Base Image
 FROM ubuntu:24.04
 
-# Avoid interactive prompts
 ENV DEBIAN_FRONTEND=noninteractive
 
-# Install core build and dev tools
+# Base system tools
 RUN apt-get update && apt-get install -y \
     git \
     curl \
     make \
     wget \
-    nodejs \
-    npm \
     sqlite3 \
     jq \
     sudo \
+    ca-certificates \
     && rm -rf /var/lib/apt/lists/*
 
-# Install Go 1.22+
-RUN wget https://go.dev/dl/go1.22.1.linux-amd64.tar.gz && \
-    tar -C /usr/local -xzf go1.22.1.linux-amd64.tar.gz && \
-    rm go1.22.1.linux-amd64.tar.gz
+# Node.js 22 via NodeSource
+RUN curl -fsSL https://deb.nodesource.com/setup_22.x | bash - \
+    && apt-get install -y nodejs \
+    && rm -rf /var/lib/apt/lists/*
+
+# Go 1.24
+RUN wget -q https://go.dev/dl/go1.24.1.linux-amd64.tar.gz && \
+    tar -C /usr/local -xzf go1.24.1.linux-amd64.tar.gz && \
+    rm go1.24.1.linux-amd64.tar.gz
 ENV PATH=$PATH:/usr/local/go/bin
 
-# Install Claude CLI
+# Claude Code CLI
 RUN npm install -g @anthropic-ai/claude-code
 
-# Install specific node tools
+# Gemini CLI
+RUN npm install -g @google/gemini-cli
+
+# CSS build tools (for claudomator itself)
 RUN npm install -g postcss-cli tailwindcss autoprefixer
 
+# Git: allow operations on any directory (agents clone into /workspace/*)
+RUN git config --system safe.directory '*'
+
+# Claudomator agent CLI tools (ct)
+COPY tools/ct /usr/local/bin/ct
+RUN chmod +x /usr/local/bin/ct
+
 # Setup workspace
 WORKDIR /workspace
 
-# Add a user claudomator-agent
+# Agent user with passwordless sudo
 RUN useradd -m claudomator-agent && \
     echo "claudomator-agent ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers
 
-# Ensure /usr/local/bin is writable for npm or use a different path
-# @anthropic-ai/claude-code might need some extra setup or just work
-
 USER claudomator-agent
 
-# Default command
+# Create a default empty config to satisfy the CLI if no mount is provided
+RUN mkdir -p /home/claudomator-agent/.claude && \
+    echo '{}' > /home/claudomator-agent/.claude.json
+
 CMD ["/bin/bash"]
diff --git a/images/agent-base/tools/ct b/images/agent-base/tools/ct
new file mode 100644
index 0000000..46d9613
--- /dev/null
+++ b/images/agent-base/tools/ct
@@ -0,0 +1,210 @@
+#!/bin/bash
+# ct - Claudomator CLI for agents running inside containers
+#
+# Usage:
+#   ct task create --name "..." --instructions "..."  # create subtask (parent auto-set)
+#   ct task run <task-id>                             # queue a task for execution
+#   ct task wait <task-id> [--timeout 300]            # poll until done, print status
+#   ct task status <task-id>                          # print current state
+#   ct task list                                      # list recent tasks
+#
+# Environment (injected by ContainerRunner):
+#   CLAUDOMATOR_API_URL   base URL of the Claudomator API
+#   CLAUDOMATOR_TASK_ID   ID of the currently running task (used as default parent)
+
+set -euo pipefail
+
+API="${CLAUDOMATOR_API_URL:-http://host.docker.internal:8484}"
+PARENT="${CLAUDOMATOR_TASK_ID:-}"
+
+_api() {  # _api METHOD PATH [curl args...]: call the Claudomator API and print the response body
+  local method="$1"; shift  # curl flags below: -sS = quiet but still report errors, -f = non-zero exit on HTTP >= 400
+  local path="$1"; shift  # whatever remains in "$@" is handed to curl unchanged; --max-time bounds a hung request
+  curl -sS -f --max-time 60 -X "$method" "${API}${path}" \
+    -H "Content-Type: application/json" \
+    "$@"
+}
+
+_require() {  # abort the script with exit 1 unless tool $1 can be found on PATH
+  local tool="$1"
+  command -v "$tool" >/dev/null 2>&1 && return 0
+  echo "ct: required tool '$tool' not found" >&2
+  exit 1
+}
+
+_require curl
+_require jq
+
+cmd_task_create() {  # ct task create: POST a new task to the API and print its ID on stdout
+  local name="" instructions="" instructions_file="" model="" budget="" parent="$PARENT"
+
+  while [[ $# -gt 0 ]]; do  # every flag takes exactly one value
+    case "$1" in
+      --name)         name="$2";             shift 2 ;;
+      --instructions) instructions="$2";     shift 2 ;;
+      --file)         instructions_file="$2"; shift 2 ;;
+      --model)        model="$2";            shift 2 ;;
+      --budget)       budget="$2";           shift 2 ;;
+      --parent)       parent="$2";           shift 2 ;;  # overrides the $CLAUDOMATOR_TASK_ID default
+      *) echo "ct task create: unknown flag $1" >&2; exit 1 ;;
+    esac
+  done
+
+  if [[ -z "$name" ]]; then
+    echo "ct task create: --name is required" >&2; exit 1
+  fi
+
+  if [[ -n "$instructions_file" ]]; then  # --file takes precedence over --instructions
+    instructions=$(cat "$instructions_file")
+  fi
+
+  if [[ -z "$instructions" ]]; then
+    echo "ct task create: --instructions or --file is required" >&2; exit 1
+  fi
+
+  local payload  # defaults: model "sonnet", budget 3.0; --argjson means the budget must be a valid JSON number
+  payload=$(jq -n \
+    --arg name "$name" \
+    --arg instructions "$instructions" \
+    --arg parent "$parent" \
+    --arg model "${model:-sonnet}" \
+    --argjson budget "${budget:-3.0}" \
+    '{
+      name: $name,
+      parent_task_id: $parent,
+      agent: {
+        type: "claude",
+        model: $model,
+        instructions: $instructions,
+        max_budget_usd: $budget
+      }
+    }')
+
+  local response  # _api fails on transport errors and HTTP >= 400; report that instead of letting set -e exit silently
+  response=$(_api POST /api/tasks -d "$payload") || { echo "ct task create: API request failed (${API}/api/tasks)" >&2; exit 1; }
+  local task_id  # a body that is not JSON must reach the error branch below rather than abort inside jq
+  task_id=$(jq -r '.id // empty' <<<"$response" 2>/dev/null) || task_id=""
+
+  if [[ -z "$task_id" ]]; then
+    echo "ct task create: API error: $(jq -r '.error // .' <<<"$response" 2>/dev/null || echo "$response")" >&2
+    exit 1
+  fi
+
+  echo "$task_id"
+}
+
+cmd_task_run() {  # ct task run <task-id>: queue an existing task and print the API's reply message
+  local task_id="${1:-}"
+  if [[ -z "$task_id" ]]; then
+    echo "ct task run: task-id required" >&2; exit 1
+  fi
+
+  local response  # without the explicit check below, set -e would end the script with no explanation
+  response=$(_api POST "/api/tasks/${task_id}/run") || { echo "ct task run: API request failed for task ${task_id}" >&2; exit 1; }
+  echo "$response" | jq -r '.message // .error // .'
+}
+
+cmd_task_wait() {  # ct task wait <task-id> [--timeout SECS]: poll until done; exit 0=COMPLETED, 2=BLOCKED, 1=otherwise
+  local task_id="${1:-}"
+  local timeout=300  # seconds; used when --timeout is not given
+  shift || true  # drop the task-id; tolerate an empty argument list
+
+  while [[ $# -gt 0 ]]; do
+    case "$1" in
+      --timeout) timeout="$2"; shift 2 ;;
+      *) echo "ct task wait: unknown flag $1" >&2; exit 1 ;;
+    esac
+  done
+
+  [[ -n "$task_id" ]] || { echo "ct task wait: task-id required" >&2; exit 1; }
+  # Reject non-numeric timeouts up front rather than failing inside the arithmetic below.
+  [[ "$timeout" =~ ^[0-9]+$ ]] || { echo "ct task wait: --timeout must be a whole number of seconds" >&2; exit 1; }
+
+  local deadline=$(( $(date +%s) + timeout ))
+  local interval=5  # seconds between polls
+
+  while true; do
+    local response
+    response=$(_api GET "/api/tasks/${task_id}" 2>/dev/null) || true  # a failed request is retried on the next poll
+
+    local state  # an empty or non-JSON body yields an empty state instead of aborting the loop via set -e
+    state=$(jq -r '.state // "UNKNOWN"' <<<"$response" 2>/dev/null) || state=""
+
+    case "$state" in
+      COMPLETED|FAILED|TIMED_OUT|CANCELLED|BUDGET_EXCEEDED)
+        echo "$state"
+        [[ "$state" == "COMPLETED" ]] && exit 0 || exit 1
+        ;;
+      BLOCKED)
+        echo "BLOCKED"
+        exit 2
+        ;;
+    esac
+
+    if [[ $(date +%s) -ge $deadline ]]; then
+      echo "ct task wait: timed out after ${timeout}s (state: ${state:-UNKNOWN})" >&2
+      exit 1
+    fi
+
+    sleep "$interval"
+  done
+}
+
+cmd_task_status() {  # ct task status <task-id>: print the task's current state
+  local id="${1:-}"
+  [[ -n "$id" ]] || {
+    echo "ct task status: task-id required" >&2; exit 1
+  }
+  _api GET "/api/tasks/${id}" | jq -r '.state'
+}
+
+cmd_task_list() {  # ct task list: print "STATE<TAB>ID<TAB>NAME" for each element of the API response, sorted as text
+  _api GET "/api/tasks" | jq -r '.[] | "\(.state)\t\(.id)\t\(.name)"' | sort
+}
+
+# ct task submit: create a subtask, queue it, and print its ID; with --wait, also block until it finishes.
+cmd_task_submit() {
+  local should_wait=false
+  local create_args=()
+
+  # --wait is consumed here; every other argument is forwarded to cmd_task_create.
+  local arg
+  for arg in "$@"; do
+    if [[ "$arg" == "--wait" ]]; then should_wait=true; else create_args+=("$arg"); fi
+  done
+
+  local new_id
+  new_id=$(cmd_task_create "${create_args[@]}")
+  cmd_task_run "$new_id" >/dev/null
+  echo "$new_id"
+
+  # cmd_task_wait ends the script itself, using the task's outcome as the exit code.
+  if [[ "$should_wait" == true ]]; then
+    cmd_task_wait "$new_id"
+  fi
+}
+
+# Dispatch: ct <resource> <command> [args...]. Usage errors go to stderr, like the script's other error messages.
+if [[ $# -lt 2 ]]; then
+  echo "Usage: ct <resource> <command> [args...]" >&2
+  echo "  ct task create --name NAME (--instructions TEXT | --file FILE) [--model MODEL] [--budget N] [--parent ID]" >&2
+  echo "  ct task submit <same flags as create> [--wait]" >&2
+  echo "  ct task run <id>" >&2
+  echo "  ct task wait <id> [--timeout 300]" >&2
+  echo "  ct task status <id>" >&2
+  echo "  ct task list" >&2
+  exit 1
+fi
+
+resource="$1"; shift
+command="$1"; shift
+
+case "${resource}/${command}" in
+  task/create) cmd_task_create "$@" ;;
+  task/run)    cmd_task_run    "$@" ;;
+  task/wait)   cmd_task_wait   "$@" ;;
+  task/status) cmd_task_status "$@" ;;
+  task/list)   cmd_task_list        ;;
+  task/submit) cmd_task_submit "$@" ;;
+  *) echo "ct: unknown command: ${resource} ${command}" >&2; exit 1 ;;
+esac
diff --git a/internal/cli/serve.go b/internal/cli/serve.go
index 2ee020d..98e7524 100644
--- a/internal/cli/serve.go
+++ b/internal/cli/serve.go
@@ -75,36 +75,38 @@ func serve(addr string) error {
 		apiURL = "http://" + addr
 	}
 
+	// Resolve the claude config dir from HOME so the container can mount credentials.
+	claudeConfigDir := filepath.Join(os.Getenv("HOME"), ".claude")
+
 	runners := map[string]executor.Runner{
+		// ContainerRunner: binaries are resolved via PATH inside the container image,
+		// so ClaudeBinary/GeminiBinary are left empty (host paths would not exist inside).
 		"claude": &executor.ContainerRunner{
-			Image:        cfg.ClaudeImage,
-			Logger:       logger,
-			LogDir:       cfg.LogDir,
-			APIURL:       apiURL,
-			DropsDir:     cfg.DropsDir,
-			SSHAuthSock:  cfg.SSHAuthSock,
-			ClaudeBinary: cfg.ClaudeBinaryPath,
-			GeminiBinary: cfg.GeminiBinaryPath,
+			Image:           cfg.ClaudeImage,
+			Logger:          logger,
+			LogDir:          cfg.LogDir,
+			APIURL:          apiURL,
+			DropsDir:        cfg.DropsDir,
+			SSHAuthSock:     cfg.SSHAuthSock,
+			ClaudeConfigDir: claudeConfigDir,
 		},
 		"gemini": &executor.ContainerRunner{
-			Image:        cfg.GeminiImage,
-			Logger:       logger,
-			LogDir:       cfg.LogDir,
-			APIURL:       apiURL,
-			DropsDir:     cfg.DropsDir,
-			SSHAuthSock:  cfg.SSHAuthSock,
-			ClaudeBinary: cfg.ClaudeBinaryPath,
-			GeminiBinary: cfg.GeminiBinaryPath,
+			Image:           cfg.GeminiImage,
+			Logger:          logger,
+			LogDir:          cfg.LogDir,
+			APIURL:          apiURL,
+			DropsDir:        cfg.DropsDir,
+			SSHAuthSock:     cfg.SSHAuthSock,
+			ClaudeConfigDir: claudeConfigDir,
 		},
 		"container": &executor.ContainerRunner{
-			Image:        "claudomator-agent:latest",
-			Logger:       logger,
-			LogDir:       cfg.LogDir,
-			APIURL:       apiURL,
-			DropsDir:     cfg.DropsDir,
-			SSHAuthSock:  cfg.SSHAuthSock,
-			ClaudeBinary: cfg.ClaudeBinaryPath,
-			GeminiBinary: cfg.GeminiBinaryPath,
+			Image:           "claudomator-agent:latest",
+			Logger:          logger,
+			LogDir:          cfg.LogDir,
+			APIURL:          apiURL,
+			DropsDir:        cfg.DropsDir,
+			SSHAuthSock:     cfg.SSHAuthSock,
+			ClaudeConfigDir: claudeConfigDir,
 		},
 	}
 
diff --git a/internal/executor/container.go b/internal/executor/container.go
index 45758d2..c43e201 100644
--- a/internal/executor/container.go
+++ b/internal/executor/container.go
@@ -22,9 +22,10 @@ type ContainerRunner struct {
 	LogDir       string
 	APIURL       string
 	DropsDir     string
-	SSHAuthSock  string // optional path to host SSH agent
-	ClaudeBinary string // optional path to claude binary in container
-	GeminiBinary string // optional path to gemini binary in container
+	SSHAuthSock     string // optional path to host SSH agent
+	ClaudeBinary    string // optional path to claude binary in container
+	GeminiBinary    string // optional path to gemini binary in container
+	ClaudeConfigDir string // host path to ~/.claude; mounted into container for auth credentials
 	// Command allows mocking exec.CommandContext for tests.
 	Command func(ctx context.Context, name string, arg ...string) *exec.Cmd
 }
@@ -50,9 +51,14 @@ func (r *ContainerRunner) Run(ctx context.Context, t *task.Task, e *storage.Exec
 		repoURL = t.Agent.RepositoryURL
 	}
 	if repoURL == "" {
-		// Fallback to project_dir if repository_url is not set (legacy support)
+		// Fallback to project_dir if repository_url is not set (legacy support).
+		// Prefer the 'local' bare remote so that git push succeeds after execution
+		// (pushing to a non-bare working copy on a checked-out branch is rejected by git).
 		if t.Agent.ProjectDir != "" {
 			repoURL = t.Agent.ProjectDir
+			if out, err2 := exec.Command("git", "-C", t.Agent.ProjectDir, "remote", "get-url", "local").Output(); err2 == nil {
+				repoURL = strings.TrimSpace(string(out))
+			}
 		} else {
 			return fmt.Errorf("task %s has no repository_url or project_dir", t.ID)
 		}
@@ -82,6 +88,7 @@ func (r *ContainerRunner) Run(ctx context.Context, t *task.Task, e *storage.Exec
 		if err != nil {
 			return fmt.Errorf("creating workspace: %w", err)
 		}
+		// chmod applied after clone; see step 2.
 	}
 
 	// Note: workspace is only removed on success. On failure, it's preserved for debugging.
@@ -96,18 +103,18 @@ func (r *ContainerRunner) Run(ctx context.Context, t *task.Task, e *storage.Exec
 		}
 	}()
 
-	// 2. Clone repo into workspace if not resuming
+	// 2. Clone repo into workspace if not resuming.
+	// git clone requires the target directory to not exist; remove the MkdirTemp-created dir first.
 	if !isResume {
+		if err := os.Remove(workspace); err != nil {
+			return fmt.Errorf("removing workspace before clone: %w", err)
+		}
 		r.Logger.Info("cloning repository", "url", repoURL, "workspace", workspace)
 		if out, err := r.command(ctx, "git", "clone", repoURL, workspace).CombinedOutput(); err != nil {
-			// If it looks like a remote URL, fail fast.
-			if strings.HasPrefix(repoURL, "http") || strings.HasPrefix(repoURL, "git@") || strings.HasPrefix(repoURL, "ssh://") {
-				return fmt.Errorf("git clone failed for remote repository: %w\n%s", err, string(out))
-			}
-			r.Logger.Warn("git clone failed, attempting fallback init", "url", repoURL, "error", err)
-			if initErr := r.fallbackGitInit(repoURL, workspace); initErr != nil {
-				return fmt.Errorf("git clone and fallback init failed: %w\n%s", err, string(out))
-			}
+			return fmt.Errorf("git clone failed: %w\n%s", err, string(out))
+		}
+		if err = os.Chmod(workspace, 0755); err != nil {
+			return fmt.Errorf("chmod cloned workspace: %w", err)
 		}
 	}
 	e.SandboxDir = workspace
@@ -140,18 +147,39 @@ func (r *ContainerRunner) Run(ctx context.Context, t *task.Task, e *storage.Exec
 
 	// Write API keys to a temporary env file to avoid exposure in 'ps' or 'docker inspect'
 	envFile := filepath.Join(workspace, ".claudomator-env")
-	envContent := fmt.Sprintf("ANTHROPIC_API_KEY=%s\nGOOGLE_API_KEY=%s\n", os.Getenv("ANTHROPIC_API_KEY"), os.Getenv("GOOGLE_API_KEY"))
+	envContent := fmt.Sprintf("ANTHROPIC_API_KEY=%s\nGOOGLE_API_KEY=%s\nGEMINI_API_KEY=%s\n", os.Getenv("ANTHROPIC_API_KEY"), os.Getenv("GOOGLE_API_KEY"), os.Getenv("GEMINI_API_KEY"))
 	if err := os.WriteFile(envFile, []byte(envContent), 0600); err != nil {
 		return fmt.Errorf("writing env file: %w", err)
 	}
 
 	// Inject custom instructions via file to avoid CLI length limits
 	instructionsFile := filepath.Join(workspace, ".claudomator-instructions.txt")
-	if err := os.WriteFile(instructionsFile, []byte(t.Agent.Instructions), 0600); err != nil {
+	if err := os.WriteFile(instructionsFile, []byte(t.Agent.Instructions), 0644); err != nil {
 		return fmt.Errorf("writing instructions: %w", err)
 	}
 
-	args := r.buildDockerArgs(workspace, e.TaskID)
+	// Set up a writable $HOME staging dir so any agent tool (claude, gemini, etc.)
+	// can freely create subdirs (session-env, .gemini, .cache, …) without hitting
+	// a non-existent or read-only home. We copy only the claude credentials into it.
+	agentHome := filepath.Join(workspace, ".agent-home")
+	if err := os.MkdirAll(filepath.Join(agentHome, ".claude"), 0755); err != nil {
+		return fmt.Errorf("creating agent home staging dir: %w", err)
+	}
+	if err := os.MkdirAll(filepath.Join(agentHome, ".gemini"), 0755); err != nil {
+		return fmt.Errorf("creating .gemini dir: %w", err)
+	}
+	if r.ClaudeConfigDir != "" {
+		// credentials
+		if srcData, readErr := os.ReadFile(filepath.Join(r.ClaudeConfigDir, ".credentials.json")); readErr == nil {
+			_ = os.WriteFile(filepath.Join(agentHome, ".claude", ".credentials.json"), srcData, 0600)
+		}
+		// settings (used by claude CLI; copy so it can write updates without hitting the host)
+		if srcData, readErr := os.ReadFile(filepath.Join(filepath.Dir(r.ClaudeConfigDir), ".claude.json")); readErr == nil {
+			_ = os.WriteFile(filepath.Join(agentHome, ".claude.json"), srcData, 0644)
+		}
+	}
+
+	args := r.buildDockerArgs(workspace, agentHome, e.TaskID)
 	innerCmd := r.buildInnerCmd(t, e, isResume)
 
 	fullArgs := append(args, image)
@@ -240,9 +268,8 @@ func (r *ContainerRunner) Run(ctx context.Context, t *task.Task, e *storage.Exec
 
 	// 5. Post-execution: push changes if successful
 	if waitErr == nil && streamErr == nil {
-		// Check if there are any commits to push (Issue 10)
-		// We use rev-list to see if HEAD is ahead of origin/HEAD.
-		// If origin/HEAD doesn't exist (e.g. fresh init), we just attempt to push.
+		// Check if there are any commits to push (HEAD ahead of origin/HEAD).
+		// If origin/HEAD doesn't exist (e.g. fresh clone with no commits), we attempt push anyway.
 		hasCommits := true
 		if out, err := r.command(ctx, "git", "-C", workspace, "rev-list", "origin/HEAD..HEAD").CombinedOutput(); err == nil {
 			if len(strings.TrimSpace(string(out))) == 0 {
@@ -272,15 +299,25 @@ func (r *ContainerRunner) Run(ctx context.Context, t *task.Task, e *storage.Exec
 	return nil
 }
 
-func (r *ContainerRunner) buildDockerArgs(workspace, taskID string) []string {
+func (r *ContainerRunner) buildDockerArgs(workspace, claudeHome, taskID string) []string {
 	// --env-file takes a HOST path.
 	hostEnvFile := filepath.Join(workspace, ".claudomator-env")
+
+	// Replace localhost with host.docker.internal so the container can reach the host API.
+	apiURL := strings.ReplaceAll(r.APIURL, "localhost", "host.docker.internal")
+
 	args := []string{
 		"run", "--rm",
+		// Allow container to reach the host via host.docker.internal.
+		"--add-host=host.docker.internal:host-gateway",
+		// Run as the current process UID:GID so the container can read host-owned files.
+		fmt.Sprintf("--user=%d:%d", os.Getuid(), os.Getgid()),
 		"-v", workspace + ":/workspace",
+		"-v", claudeHome + ":/home/agent",
 		"-w", "/workspace",
 		"--env-file", hostEnvFile,
-		"-e", "CLAUDOMATOR_API_URL=" + r.APIURL,
+		"-e", "HOME=/home/agent",
+		"-e", "CLAUDOMATOR_API_URL=" + apiURL,
 		"-e", "CLAUDOMATOR_TASK_ID=" + taskID,
 		"-e", "CLAUDOMATOR_DROP_DIR=" + r.DropsDir,
 	}
diff --git a/internal/executor/container_test.go b/internal/executor/container_test.go
index d4d591e..f97f2b5 100644
--- a/internal/executor/container_test.go
+++ b/internal/executor/container_test.go
@@ -23,14 +23,19 @@ func TestContainerRunner_BuildDockerArgs(t *testing.T) {
 	workspace := "/tmp/ws"
 	taskID := "task-123"
 
-	args := runner.buildDockerArgs(workspace, taskID)
+	agentHome := "/tmp/ws/.agent-home"
+	args := runner.buildDockerArgs(workspace, agentHome, taskID)
 
 	expected := []string{
 		"run", "--rm",
+		"--add-host=host.docker.internal:host-gateway",
+		fmt.Sprintf("--user=%d:%d", os.Getuid(), os.Getgid()),
 		"-v", "/tmp/ws:/workspace",
+		"-v", "/tmp/ws/.agent-home:/home/agent",
 		"-w", "/workspace",
 		"--env-file", "/tmp/ws/.claudomator-env",
-		"-e", "CLAUDOMATOR_API_URL=http://localhost:8484",
+		"-e", "HOME=/home/agent",
+		"-e", "CLAUDOMATOR_API_URL=http://host.docker.internal:8484",
 		"-e", "CLAUDOMATOR_TASK_ID=task-123",
 		"-e", "CLAUDOMATOR_DROP_DIR=/data/drops",
 		"-v", "/tmp/ssh.sock:/tmp/ssh-auth.sock",
diff --git a/scripts/drain-failed-tasks b/scripts/drain-failed-tasks
new file mode 100644
index 0000000..4bb6992
--- /dev/null
+++ b/scripts/drain-failed-tasks
@@ -0,0 +1,22 @@
+#!/bin/bash
+# drain-failed-tasks — retry failed tasks by running start-next-task every 5 minutes
+# Usage: ./scripts/drain-failed-tasks [iterations]
+# Default: 29 iterations
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"  # absolute directory of this script; start-next-task is run from the same directory
+ITERATIONS="${1:-29}"  # first positional argument, or 29 when omitted
+INTERVAL=300  # 5 minutes
+
+echo "Running start-next-task every ${INTERVAL}s for ${ITERATIONS} iterations"
+
+for ((i=1; i<=ITERATIONS; i++)); do
+    echo "[$(date '+%H:%M:%S')] Iteration ${i}/${ITERATIONS}"
+    "$SCRIPT_DIR/start-next-task" || true  # a failing run must not end the loop (set -e is active)
+    if [[ $i -lt $ITERATIONS ]]; then  # no sleep after the final iteration
+        sleep "$INTERVAL"
+    fi
+done
+
+echo "[$(date '+%H:%M:%S')] Done."
-- 
cgit v1.2.3