From 7df4f06ae0e3ae80bd967bf53cbec36e58b4a3bd Mon Sep 17 00:00:00 2001 From: Peter Stone Date: Wed, 18 Mar 2026 23:56:20 +0000 Subject: feat: containerized execution with agent tooling and deployment fixes - ContainerRunner replaces ClaudeRunner/GeminiRunner; all agent types run in Docker containers via claudomator-agent:latest - Writable agentHome staging dir (/home/agent) satisfies home-dir requirements for both claude and gemini CLIs without exposing host creds - Copy .credentials.json and .claude.json into staging dir at run time; GEMINI_API_KEY passed via env file - Fix git clone: remove MkdirTemp-created dir before cloning (git rejects pre-existing dirs even when empty) - Replace localhost with host.docker.internal in APIURL so container can reach host API; add --add-host=host.docker.internal:host-gateway - Run container as --user=$(uid):$(gid) so host-owned workspace files are readable; chmod workspace 0755 and instructions file 0644 after clone - Pre-create .gemini/ in staging dir to avoid atomic-rename ENOENT on first gemini-cli run - Add ct CLI tool to container image: pre-built Bash wrapper for Claudomator API (ct task submit/create/run/wait/status/list) - Document ct tool in CLAUDE.md agent instructions section - Add drain-failed-tasks script: retries failed tasks on a 5-minute interval - Update Dockerfile: Node 22 via NodeSource, Go 1.24, gemini-cli, git safe.directory=*, default ~/.claude.json Co-Authored-By: Claude Sonnet 4.6 --- CLAUDE.md | 52 ++++++++- images/agent-base/Dockerfile | 43 +++++--- images/agent-base/tools/ct | 210 ++++++++++++++++++++++++++++++++++++ internal/cli/serve.go | 50 ++++----- internal/executor/container.go | 79 ++++++++++---- internal/executor/container_test.go | 9 +- scripts/drain-failed-tasks | 22 ++++ 7 files changed, 398 insertions(+), 67 deletions(-) create mode 100644 images/agent-base/tools/ct create mode 100644 scripts/drain-failed-tasks diff --git a/CLAUDE.md b/CLAUDE.md index 2cb37a8..d804a96 100644 --- a/CLAUDE.md 
+++ b/CLAUDE.md @@ -53,14 +53,14 @@ Config defaults to `~/.claudomator/config.toml`. Data is stored in `~/.claudomat ## Architecture -**Pipeline:** CLI/API → `executor.Pool` → `executor.ClaudeRunner` → `claude -p` subprocess → SQLite + log files +**Pipeline:** CLI/API → `executor.Pool` → `executor.ContainerRunner` → Docker container → SQLite + log files ### Packages | Package | Role | |---|---| | `internal/task` | `Task` struct, YAML parsing, state machine, validation | -| `internal/executor` | `Pool` (bounded goroutine pool) + `ClaudeRunner` (subprocess manager) | +| `internal/executor` | `Pool` (bounded goroutine pool) + `ContainerRunner` (Docker-based executor) | | `internal/storage` | SQLite wrapper; stores tasks and execution records | | `internal/api` | HTTP server (REST + WebSocket via `internal/api.Hub`) | | `internal/reporter` | Formats and emits execution results | @@ -72,9 +72,9 @@ Config defaults to `~/.claudomator/config.toml`. Data is stored in `~/.claudomat **Task execution:** 1. Task created via `POST /api/tasks` or YAML file (`task.ParseFile`) 2. `POST /api/tasks/{id}/run` → `executor.Pool.Submit()` → goroutine in pool -3. `ClaudeRunner.Run()` invokes `claude -p --output-format stream-json` -4. stdout streamed to `~/.claudomator/executions/<id>/stdout.log`; cost parsed from stream-json -5. Execution result written to SQLite; broadcast via WebSocket to connected clients +3. `ContainerRunner.Run()` clones `repository_url`, runs `docker run claudomator-agent:latest` +4. Agent runs `claude -p` inside the container; stdout streamed to `executions/<id>/stdout.log` +5. On success, runner pushes commits back to the remote; execution result written to SQLite + WebSocket broadcast **State machine** (`task.ValidTransition`): `PENDING` → `QUEUED` → `RUNNING` → `COMPLETED | FAILED | TIMED_OUT | CANCELLED | BUDGET_EXCEEDED` @@ -166,6 +166,48 @@ A task is created for: Tasks are tagged `["ci", "auto"]`, capped at $3 USD, and use tools: Read, Edit, Bash, Glob, Grep. 
+## Agent Tooling (`ct` CLI) + +Agents running inside containers have access to `ct`, a pre-built CLI for interacting with the Claudomator API. It is installed at `/usr/local/bin/ct` in the container image. **Use `ct` to create and manage subtasks — do not attempt raw `curl` API calls.** + +### Environment (injected automatically) + +| Variable | Purpose | +|---|---| +| `CLAUDOMATOR_API_URL` | Base URL of the Claudomator API (e.g. `http://host.docker.internal:8484`) | +| `CLAUDOMATOR_TASK_ID` | ID of the currently-running task; used as the default `parent_task_id` for new subtasks | + +### Commands + +```bash +# Create a subtask and immediately queue it (returns task ID) +ct task submit --name "Fix tests" --instructions "Run tests and fix any failures." [--model sonnet] [--budget 3.0] + +# Create, queue, and wait for completion (exits 0=COMPLETED, 1=FAILED, 2=BLOCKED) +ct task submit --name "Fix tests" --instructions "..." --wait + +# Read instructions from a file instead of inline +ct task submit --name "Fix tests" --file /workspace/subtask-instructions.txt --wait + +# Lower-level: create only (returns task ID), then run separately +TASK_ID=$(ct task create --name "..." --instructions "...") +ct task run "$TASK_ID" +ct task wait "$TASK_ID" --timeout 600 + +# Check status of any task +ct task status <task-id> + +# List recent tasks +ct task list +``` + +### Notes + +- Default model is `sonnet`; default budget is `$3.00 USD`. Override with `--model` / `--budget`. +- `ct task wait` polls every 5 seconds, prints the task's terminal state on stdout, and exits 0 for `COMPLETED`, 2 for `BLOCKED`, and 1 for any other terminal state or on timeout (default 300 seconds). +- Subtasks inherit the current task as their parent automatically (via `$CLAUDOMATOR_TASK_ID`). +- Override parent with `--parent <task-id>` if needed. + ## ADRs See `docs/adr/001-language-and-architecture.md` for the Go + SQLite + WebSocket rationale. 
diff --git a/images/agent-base/Dockerfile b/images/agent-base/Dockerfile index 6fb253c..0e8057c 100644 --- a/images/agent-base/Dockerfile +++ b/images/agent-base/Dockerfile @@ -1,45 +1,58 @@ # Claudomator Agent Base Image FROM ubuntu:24.04 -# Avoid interactive prompts ENV DEBIAN_FRONTEND=noninteractive -# Install core build and dev tools +# Base system tools RUN apt-get update && apt-get install -y \ git \ curl \ make \ wget \ - nodejs \ - npm \ sqlite3 \ jq \ sudo \ + ca-certificates \ && rm -rf /var/lib/apt/lists/* -# Install Go 1.22+ -RUN wget https://go.dev/dl/go1.22.1.linux-amd64.tar.gz && \ - tar -C /usr/local -xzf go1.22.1.linux-amd64.tar.gz && \ - rm go1.22.1.linux-amd64.tar.gz +# Node.js 22 via NodeSource +RUN curl -fsSL https://deb.nodesource.com/setup_22.x | bash - \ + && apt-get install -y nodejs \ + && rm -rf /var/lib/apt/lists/* + +# Go 1.24 +RUN wget -q https://go.dev/dl/go1.24.1.linux-amd64.tar.gz && \ + tar -C /usr/local -xzf go1.24.1.linux-amd64.tar.gz && \ + rm go1.24.1.linux-amd64.tar.gz ENV PATH=$PATH:/usr/local/go/bin -# Install Claude CLI +# Claude Code CLI RUN npm install -g @anthropic-ai/claude-code -# Install specific node tools +# Gemini CLI +RUN npm install -g @google/gemini-cli + +# CSS build tools (for claudomator itself) RUN npm install -g postcss-cli tailwindcss autoprefixer +# Git: allow operations on any directory (agents clone into /workspace/*) +RUN git config --system safe.directory '*' + +# Claudomator agent CLI tools (ct) +COPY tools/ct /usr/local/bin/ct +RUN chmod +x /usr/local/bin/ct + # Setup workspace WORKDIR /workspace -# Add a user claudomator-agent +# Agent user with passwordless sudo RUN useradd -m claudomator-agent && \ echo "claudomator-agent ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers -# Ensure /usr/local/bin is writable for npm or use a different path -# @anthropic-ai/claude-code might need some extra setup or just work - USER claudomator-agent -# Default command +# Create a default empty config to satisfy the CLI if 
no mount is provided +RUN mkdir -p /home/claudomator-agent/.claude && \ + echo '{}' > /home/claudomator-agent/.claude.json + CMD ["/bin/bash"] diff --git a/images/agent-base/tools/ct b/images/agent-base/tools/ct new file mode 100644 index 0000000..46d9613 --- /dev/null +++ b/images/agent-base/tools/ct @@ -0,0 +1,210 @@ +#!/bin/bash +# ct - Claudomator CLI for agents running inside containers +# +# Usage: +# ct task create --name "..." --instructions "..." # create subtask (parent auto-set) +# ct task run <task-id> # queue a task for execution +# ct task wait <task-id> [--timeout 300] # poll until done, print status +# ct task status <task-id> # print current state +# ct task list # list recent tasks +# +# Environment (injected by ContainerRunner): +# CLAUDOMATOR_API_URL base URL of the Claudomator API +# CLAUDOMATOR_TASK_ID ID of the currently running task (used as default parent) + +set -euo pipefail + +API="${CLAUDOMATOR_API_URL:-http://host.docker.internal:8484}" +PARENT="${CLAUDOMATOR_TASK_ID:-}" + +_api() { + local method="$1"; shift + local path="$1"; shift + curl -sf -X "$method" "${API}${path}" \ + -H "Content-Type: application/json" \ + "$@" +} + +_require() { + if ! 
command -v "$1" &>/dev/null; then + echo "ct: required tool '$1' not found" >&2 + exit 1 + fi +} + +_require curl +_require jq + +cmd_task_create() { + local name="" instructions="" instructions_file="" model="" budget="" parent="$PARENT" + + while [[ $# -gt 0 ]]; do + case "$1" in + --name) name="$2"; shift 2 ;; + --instructions) instructions="$2"; shift 2 ;; + --file) instructions_file="$2"; shift 2 ;; + --model) model="$2"; shift 2 ;; + --budget) budget="$2"; shift 2 ;; + --parent) parent="$2"; shift 2 ;; + *) echo "ct task create: unknown flag $1" >&2; exit 1 ;; + esac + done + + if [[ -z "$name" ]]; then + echo "ct task create: --name is required" >&2; exit 1 + fi + + if [[ -n "$instructions_file" ]]; then + instructions=$(cat "$instructions_file") + fi + + if [[ -z "$instructions" ]]; then + echo "ct task create: --instructions or --file is required" >&2; exit 1 + fi + + local payload + payload=$(jq -n \ + --arg name "$name" \ + --arg instructions "$instructions" \ + --arg parent "$parent" \ + --arg model "${model:-sonnet}" \ + --argjson budget "${budget:-3.0}" \ + '{ + name: $name, + parent_task_id: $parent, + agent: { + type: "claude", + model: $model, + instructions: $instructions, + max_budget_usd: $budget + } + }') + + local response + response=$(_api POST /api/tasks -d "$payload") + local task_id + task_id=$(echo "$response" | jq -r '.id // empty') + + if [[ -z "$task_id" ]]; then + echo "ct task create: API error: $(echo "$response" | jq -r '.error // .')" >&2 + exit 1 + fi + + echo "$task_id" +} + +cmd_task_run() { + local task_id="${1:-}" + if [[ -z "$task_id" ]]; then + echo "ct task run: task-id required" >&2; exit 1 + fi + + local response + response=$(_api POST "/api/tasks/${task_id}/run") + echo "$response" | jq -r '.message // .error // .' 
+} + +cmd_task_wait() { + local task_id="${1:-}" + local timeout=300 + shift || true + + while [[ $# -gt 0 ]]; do + case "$1" in + --timeout) timeout="$2"; shift 2 ;; + *) echo "ct task wait: unknown flag $1" >&2; exit 1 ;; + esac + done + + if [[ -z "$task_id" ]]; then + echo "ct task wait: task-id required" >&2; exit 1 + fi + + local deadline=$(( $(date +%s) + timeout )) + local interval=5 + + while true; do + local response + response=$(_api GET "/api/tasks/${task_id}" 2>/dev/null) || true + + local state + state=$(echo "$response" | jq -r '.state // "UNKNOWN"') + + case "$state" in + COMPLETED|FAILED|TIMED_OUT|CANCELLED|BUDGET_EXCEEDED) + echo "$state" + [[ "$state" == "COMPLETED" ]] && exit 0 || exit 1 + ;; + BLOCKED) + echo "BLOCKED" + exit 2 + ;; + esac + + if [[ $(date +%s) -ge $deadline ]]; then + echo "ct task wait: timed out after ${timeout}s (state: $state)" >&2 + exit 1 + fi + + sleep "$interval" + done +} + +cmd_task_status() { + local task_id="${1:-}" + if [[ -z "$task_id" ]]; then + echo "ct task status: task-id required" >&2; exit 1 + fi + _api GET "/api/tasks/${task_id}" | jq -r '.state' +} + +cmd_task_list() { + _api GET "/api/tasks" | jq -r '.[] | "\(.state)\t\(.id)\t\(.name)"' | sort +} + +# create-and-run shorthand: create a subtask and immediately queue it, then optionally wait +cmd_task_submit() { + local wait=false + local args=() + + while [[ $# -gt 0 ]]; do + case "$1" in + --wait) wait=true; shift ;; + *) args+=("$1"); shift ;; + esac + done + + local task_id + task_id=$(cmd_task_create "${args[@]}") + cmd_task_run "$task_id" >/dev/null + echo "$task_id" + + if $wait; then + cmd_task_wait "$task_id" + fi +} + +# Dispatch +if [[ $# -lt 2 ]]; then + echo "Usage: ct [args...]" + echo " ct task create --name NAME --instructions TEXT [--file FILE] [--model MODEL] [--budget N]" + echo " ct task submit --name NAME --instructions TEXT [--wait]" + echo " ct task run " + echo " ct task wait [--timeout 300]" + echo " ct task status " + echo " ct 
task list" + exit 1 +fi + +resource="$1"; shift +command="$1"; shift + +case "${resource}/${command}" in + task/create) cmd_task_create "$@" ;; + task/run) cmd_task_run "$@" ;; + task/wait) cmd_task_wait "$@" ;; + task/status) cmd_task_status "$@" ;; + task/list) cmd_task_list ;; + task/submit) cmd_task_submit "$@" ;; + *) echo "ct: unknown command: ${resource} ${command}" >&2; exit 1 ;; +esac diff --git a/internal/cli/serve.go b/internal/cli/serve.go index 2ee020d..98e7524 100644 --- a/internal/cli/serve.go +++ b/internal/cli/serve.go @@ -75,36 +75,38 @@ func serve(addr string) error { apiURL = "http://" + addr } + // Resolve the claude config dir from HOME so the container can mount credentials. + claudeConfigDir := filepath.Join(os.Getenv("HOME"), ".claude") + runners := map[string]executor.Runner{ + // ContainerRunner: binaries are resolved via PATH inside the container image, + // so ClaudeBinary/GeminiBinary are left empty (host paths would not exist inside). "claude": &executor.ContainerRunner{ - Image: cfg.ClaudeImage, - Logger: logger, - LogDir: cfg.LogDir, - APIURL: apiURL, - DropsDir: cfg.DropsDir, - SSHAuthSock: cfg.SSHAuthSock, - ClaudeBinary: cfg.ClaudeBinaryPath, - GeminiBinary: cfg.GeminiBinaryPath, + Image: cfg.ClaudeImage, + Logger: logger, + LogDir: cfg.LogDir, + APIURL: apiURL, + DropsDir: cfg.DropsDir, + SSHAuthSock: cfg.SSHAuthSock, + ClaudeConfigDir: claudeConfigDir, }, "gemini": &executor.ContainerRunner{ - Image: cfg.GeminiImage, - Logger: logger, - LogDir: cfg.LogDir, - APIURL: apiURL, - DropsDir: cfg.DropsDir, - SSHAuthSock: cfg.SSHAuthSock, - ClaudeBinary: cfg.ClaudeBinaryPath, - GeminiBinary: cfg.GeminiBinaryPath, + Image: cfg.GeminiImage, + Logger: logger, + LogDir: cfg.LogDir, + APIURL: apiURL, + DropsDir: cfg.DropsDir, + SSHAuthSock: cfg.SSHAuthSock, + ClaudeConfigDir: claudeConfigDir, }, "container": &executor.ContainerRunner{ - Image: "claudomator-agent:latest", - Logger: logger, - LogDir: cfg.LogDir, - APIURL: apiURL, - DropsDir: 
cfg.DropsDir, - SSHAuthSock: cfg.SSHAuthSock, - ClaudeBinary: cfg.ClaudeBinaryPath, - GeminiBinary: cfg.GeminiBinaryPath, + Image: "claudomator-agent:latest", + Logger: logger, + LogDir: cfg.LogDir, + APIURL: apiURL, + DropsDir: cfg.DropsDir, + SSHAuthSock: cfg.SSHAuthSock, + ClaudeConfigDir: claudeConfigDir, }, } diff --git a/internal/executor/container.go b/internal/executor/container.go index 45758d2..c43e201 100644 --- a/internal/executor/container.go +++ b/internal/executor/container.go @@ -22,9 +22,10 @@ type ContainerRunner struct { LogDir string APIURL string DropsDir string - SSHAuthSock string // optional path to host SSH agent - ClaudeBinary string // optional path to claude binary in container - GeminiBinary string // optional path to gemini binary in container + SSHAuthSock string // optional path to host SSH agent + ClaudeBinary string // optional path to claude binary in container + GeminiBinary string // optional path to gemini binary in container + ClaudeConfigDir string // host path to ~/.claude; mounted into container for auth credentials // Command allows mocking exec.CommandContext for tests. Command func(ctx context.Context, name string, arg ...string) *exec.Cmd } @@ -50,9 +51,14 @@ func (r *ContainerRunner) Run(ctx context.Context, t *task.Task, e *storage.Exec repoURL = t.Agent.RepositoryURL } if repoURL == "" { - // Fallback to project_dir if repository_url is not set (legacy support) + // Fallback to project_dir if repository_url is not set (legacy support). + // Prefer the 'local' bare remote so that git push succeeds after execution + // (pushing to a non-bare working copy on a checked-out branch is rejected by git). 
if t.Agent.ProjectDir != "" { repoURL = t.Agent.ProjectDir + if out, err2 := exec.Command("git", "-C", t.Agent.ProjectDir, "remote", "get-url", "local").Output(); err2 == nil { + repoURL = strings.TrimSpace(string(out)) + } } else { return fmt.Errorf("task %s has no repository_url or project_dir", t.ID) } @@ -82,6 +88,7 @@ func (r *ContainerRunner) Run(ctx context.Context, t *task.Task, e *storage.Exec if err != nil { return fmt.Errorf("creating workspace: %w", err) } + // chmod applied after clone; see step 2. } // Note: workspace is only removed on success. On failure, it's preserved for debugging. @@ -96,18 +103,18 @@ func (r *ContainerRunner) Run(ctx context.Context, t *task.Task, e *storage.Exec } }() - // 2. Clone repo into workspace if not resuming + // 2. Clone repo into workspace if not resuming. + // git clone requires the target directory to not exist; remove the MkdirTemp-created dir first. if !isResume { + if err := os.Remove(workspace); err != nil { + return fmt.Errorf("removing workspace before clone: %w", err) + } r.Logger.Info("cloning repository", "url", repoURL, "workspace", workspace) if out, err := r.command(ctx, "git", "clone", repoURL, workspace).CombinedOutput(); err != nil { - // If it looks like a remote URL, fail fast. 
- if strings.HasPrefix(repoURL, "http") || strings.HasPrefix(repoURL, "git@") || strings.HasPrefix(repoURL, "ssh://") { - return fmt.Errorf("git clone failed for remote repository: %w\n%s", err, string(out)) - } - r.Logger.Warn("git clone failed, attempting fallback init", "url", repoURL, "error", err) - if initErr := r.fallbackGitInit(repoURL, workspace); initErr != nil { - return fmt.Errorf("git clone and fallback init failed: %w\n%s", err, string(out)) - } + return fmt.Errorf("git clone failed: %w\n%s", err, string(out)) + } + if err = os.Chmod(workspace, 0755); err != nil { + return fmt.Errorf("chmod cloned workspace: %w", err) } } e.SandboxDir = workspace @@ -140,18 +147,39 @@ func (r *ContainerRunner) Run(ctx context.Context, t *task.Task, e *storage.Exec // Write API keys to a temporary env file to avoid exposure in 'ps' or 'docker inspect' envFile := filepath.Join(workspace, ".claudomator-env") - envContent := fmt.Sprintf("ANTHROPIC_API_KEY=%s\nGOOGLE_API_KEY=%s\n", os.Getenv("ANTHROPIC_API_KEY"), os.Getenv("GOOGLE_API_KEY")) + envContent := fmt.Sprintf("ANTHROPIC_API_KEY=%s\nGOOGLE_API_KEY=%s\nGEMINI_API_KEY=%s\n", os.Getenv("ANTHROPIC_API_KEY"), os.Getenv("GOOGLE_API_KEY"), os.Getenv("GEMINI_API_KEY")) if err := os.WriteFile(envFile, []byte(envContent), 0600); err != nil { return fmt.Errorf("writing env file: %w", err) } // Inject custom instructions via file to avoid CLI length limits instructionsFile := filepath.Join(workspace, ".claudomator-instructions.txt") - if err := os.WriteFile(instructionsFile, []byte(t.Agent.Instructions), 0600); err != nil { + if err := os.WriteFile(instructionsFile, []byte(t.Agent.Instructions), 0644); err != nil { return fmt.Errorf("writing instructions: %w", err) } - args := r.buildDockerArgs(workspace, e.TaskID) + // Set up a writable $HOME staging dir so any agent tool (claude, gemini, etc.) + // can freely create subdirs (session-env, .gemini, .cache, …) without hitting + // a non-existent or read-only home. 
We copy only the claude credentials into it. + agentHome := filepath.Join(workspace, ".agent-home") + if err := os.MkdirAll(filepath.Join(agentHome, ".claude"), 0755); err != nil { + return fmt.Errorf("creating agent home staging dir: %w", err) + } + if err := os.MkdirAll(filepath.Join(agentHome, ".gemini"), 0755); err != nil { + return fmt.Errorf("creating .gemini dir: %w", err) + } + if r.ClaudeConfigDir != "" { + // credentials + if srcData, readErr := os.ReadFile(filepath.Join(r.ClaudeConfigDir, ".credentials.json")); readErr == nil { + _ = os.WriteFile(filepath.Join(agentHome, ".claude", ".credentials.json"), srcData, 0600) + } + // settings (used by claude CLI; copy so it can write updates without hitting the host) + if srcData, readErr := os.ReadFile(filepath.Join(filepath.Dir(r.ClaudeConfigDir), ".claude.json")); readErr == nil { + _ = os.WriteFile(filepath.Join(agentHome, ".claude.json"), srcData, 0644) + } + } + + args := r.buildDockerArgs(workspace, agentHome, e.TaskID) innerCmd := r.buildInnerCmd(t, e, isResume) fullArgs := append(args, image) @@ -240,9 +268,8 @@ func (r *ContainerRunner) Run(ctx context.Context, t *task.Task, e *storage.Exec // 5. Post-execution: push changes if successful if waitErr == nil && streamErr == nil { - // Check if there are any commits to push (Issue 10) - // We use rev-list to see if HEAD is ahead of origin/HEAD. - // If origin/HEAD doesn't exist (e.g. fresh init), we just attempt to push. + // Check if there are any commits to push (HEAD ahead of origin/HEAD). + // If origin/HEAD doesn't exist (e.g. fresh clone with no commits), we attempt push anyway. 
hasCommits := true if out, err := r.command(ctx, "git", "-C", workspace, "rev-list", "origin/HEAD..HEAD").CombinedOutput(); err == nil { if len(strings.TrimSpace(string(out))) == 0 { @@ -272,15 +299,25 @@ func (r *ContainerRunner) Run(ctx context.Context, t *task.Task, e *storage.Exec return nil } -func (r *ContainerRunner) buildDockerArgs(workspace, taskID string) []string { +func (r *ContainerRunner) buildDockerArgs(workspace, claudeHome, taskID string) []string { // --env-file takes a HOST path. hostEnvFile := filepath.Join(workspace, ".claudomator-env") + + // Replace localhost with host.docker.internal so the container can reach the host API. + apiURL := strings.ReplaceAll(r.APIURL, "localhost", "host.docker.internal") + args := []string{ "run", "--rm", + // Allow container to reach the host via host.docker.internal. + "--add-host=host.docker.internal:host-gateway", + // Run as the current process UID:GID so the container can read host-owned files. + fmt.Sprintf("--user=%d:%d", os.Getuid(), os.Getgid()), "-v", workspace + ":/workspace", + "-v", claudeHome + ":/home/agent", "-w", "/workspace", "--env-file", hostEnvFile, - "-e", "CLAUDOMATOR_API_URL=" + r.APIURL, + "-e", "HOME=/home/agent", + "-e", "CLAUDOMATOR_API_URL=" + apiURL, "-e", "CLAUDOMATOR_TASK_ID=" + taskID, "-e", "CLAUDOMATOR_DROP_DIR=" + r.DropsDir, } diff --git a/internal/executor/container_test.go b/internal/executor/container_test.go index d4d591e..f97f2b5 100644 --- a/internal/executor/container_test.go +++ b/internal/executor/container_test.go @@ -23,14 +23,19 @@ func TestContainerRunner_BuildDockerArgs(t *testing.T) { workspace := "/tmp/ws" taskID := "task-123" - args := runner.buildDockerArgs(workspace, taskID) + agentHome := "/tmp/ws/.agent-home" + args := runner.buildDockerArgs(workspace, agentHome, taskID) expected := []string{ "run", "--rm", + "--add-host=host.docker.internal:host-gateway", + fmt.Sprintf("--user=%d:%d", os.Getuid(), os.Getgid()), "-v", "/tmp/ws:/workspace", + "-v", 
"/tmp/ws/.agent-home:/home/agent", "-w", "/workspace", "--env-file", "/tmp/ws/.claudomator-env", - "-e", "CLAUDOMATOR_API_URL=http://localhost:8484", + "-e", "HOME=/home/agent", + "-e", "CLAUDOMATOR_API_URL=http://host.docker.internal:8484", "-e", "CLAUDOMATOR_TASK_ID=task-123", "-e", "CLAUDOMATOR_DROP_DIR=/data/drops", "-v", "/tmp/ssh.sock:/tmp/ssh-auth.sock", diff --git a/scripts/drain-failed-tasks b/scripts/drain-failed-tasks new file mode 100644 index 0000000..4bb6992 --- /dev/null +++ b/scripts/drain-failed-tasks @@ -0,0 +1,22 @@ +#!/bin/bash +# drain-failed-tasks — retry failed tasks by running start-next-task every 5 minutes +# Usage: ./scripts/drain-failed-tasks [iterations] +# Default: 29 iterations + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +ITERATIONS="${1:-29}" +INTERVAL=300 # 5 minutes + +echo "Running start-next-task every ${INTERVAL}s for ${ITERATIONS} iterations" + +for ((i=1; i<=ITERATIONS; i++)); do + echo "[$(date '+%H:%M:%S')] Iteration ${i}/${ITERATIONS}" + "$SCRIPT_DIR/start-next-task" || true + if [[ $i -lt $ITERATIONS ]]; then + sleep "$INTERVAL" + fi +done + +echo "[$(date '+%H:%M:%S')] Done." -- cgit v1.2.3