summaryrefslogtreecommitdiff
path: root/internal/executor/container.go
diff options
context:
space:
mode:
authorClaudomator Agent <agent@claudomator.local>2026-03-21 23:18:50 +0000
committerClaudomator Agent <agent@claudomator.local>2026-03-21 23:18:50 +0000
commit8dca9bbb0baee59ffe0d3127180ef0958dda8b91 (patch)
treee887036f4cce0f10694c5b9a29f4b4dc251769ba /internal/executor/container.go
parent9e35f7e4087cfa6017cb65ec6a7036f394f5eb22 (diff)
feat: executor reliability — per-agent limit, drain gate, pre-flight creds, auth recovery
- maxPerAgent=1: only 1 in-flight execution per agent type at a time; excess tasks are requeued after 30s - Drain gate: after 2 consecutive failures the agent is drained and a question is set on the task; reset on first success; POST /api/pool/agents/{agent}/undrain to acknowledge - Pre-flight credential check: verify .credentials.json and .claude.json exist in agentHome before spinning up a container - Auth error auto-recovery: detect auth errors (Not logged in, OAuth token has expired, etc.) and retry once after running sync-credentials and re-copying fresh credentials - Extracted runContainer() helper from ContainerRunner.Run() to support the retry flow - Wire CredentialSyncCmd in serve.go for all three ContainerRunner instances - Tests: TestPool_MaxPerAgent_*, TestPool_ConsecutiveFailures_*, TestPool_Undrain_*, TestContainerRunner_Missing{Credentials,Settings}_FailsFast, TestIsAuthError_*, TestContainerRunner_AuthError_SyncsAndRetries Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Diffstat (limited to 'internal/executor/container.go')
-rw-r--r--internal/executor/container.go128
1 files changed, 92 insertions, 36 deletions
diff --git a/internal/executor/container.go b/internal/executor/container.go
index 5421108..d9ed8ef 100644
--- a/internal/executor/container.go
+++ b/internal/executor/container.go
@@ -2,6 +2,7 @@ package executor
import (
"context"
+ "errors"
"fmt"
"log/slog"
"os"
@@ -22,14 +23,26 @@ type ContainerRunner struct {
LogDir string
APIURL string
DropsDir string
- SSHAuthSock string // optional path to host SSH agent
- ClaudeBinary string // optional path to claude binary in container
- GeminiBinary string // optional path to gemini binary in container
- ClaudeConfigDir string // host path to ~/.claude; mounted into container for auth credentials
+ SSHAuthSock string // optional path to host SSH agent
+ ClaudeBinary string // optional path to claude binary in container
+ GeminiBinary string // optional path to gemini binary in container
+ ClaudeConfigDir string // host path to ~/.claude; mounted into container for auth credentials
+ CredentialSyncCmd string // optional path to sync-credentials script for auth-error auto-recovery
// Command allows mocking exec.CommandContext for tests.
Command func(ctx context.Context, name string, arg ...string) *exec.Cmd
}
+func isAuthError(err error) bool {
+ if err == nil {
+ return false
+ }
+ s := err.Error()
+ return strings.Contains(s, "Not logged in") ||
+ strings.Contains(s, "OAuth token has expired") ||
+ strings.Contains(s, "authentication_error") ||
+ strings.Contains(s, "Please run /login")
+}
+
func (r *ContainerRunner) command(ctx context.Context, name string, arg ...string) *exec.Cmd {
if r.Command != nil {
return r.Command(ctx, name, arg...)
@@ -51,14 +64,6 @@ func (r *ContainerRunner) Run(ctx context.Context, t *task.Task, e *storage.Exec
return fmt.Errorf("task %s has no repository_url", t.ID)
}
- image := t.Agent.ContainerImage
- if image == "" {
- image = r.Image
- }
- if image == "" {
- image = "claudomator-agent:latest"
- }
-
// 1. Setup workspace on host
var workspace string
isResume := false
@@ -106,6 +111,81 @@ func (r *ContainerRunner) Run(ctx context.Context, t *task.Task, e *storage.Exec
}
e.SandboxDir = workspace
+ // Set up a writable $HOME staging dir so any agent tool (claude, gemini, etc.)
+ // can freely create subdirs (session-env, .gemini, .cache, …) without hitting
+ // a non-existent or read-only home. We copy only the claude credentials into it.
+ agentHome := filepath.Join(workspace, ".agent-home")
+ if err := os.MkdirAll(filepath.Join(agentHome, ".claude"), 0755); err != nil {
+ return fmt.Errorf("creating agent home staging dir: %w", err)
+ }
+ if err := os.MkdirAll(filepath.Join(agentHome, ".gemini"), 0755); err != nil {
+ return fmt.Errorf("creating .gemini dir: %w", err)
+ }
+ if r.ClaudeConfigDir != "" {
+ // credentials
+ if srcData, readErr := os.ReadFile(filepath.Join(r.ClaudeConfigDir, ".credentials.json")); readErr == nil {
+ _ = os.WriteFile(filepath.Join(agentHome, ".claude", ".credentials.json"), srcData, 0600)
+ }
+ // settings (used by claude CLI; copy so it can write updates without hitting the host)
+ if srcData, readErr := os.ReadFile(filepath.Join(r.ClaudeConfigDir, ".claude.json")); readErr == nil {
+ _ = os.WriteFile(filepath.Join(agentHome, ".claude.json"), srcData, 0644)
+ }
+ }
+
+ // Pre-flight: verify credentials were actually copied before spinning up a container.
+ if r.ClaudeConfigDir != "" {
+ credsPath := filepath.Join(agentHome, ".claude", ".credentials.json")
+ settingsPath := filepath.Join(agentHome, ".claude.json")
+ if _, err := os.Stat(credsPath); os.IsNotExist(err) {
+ return fmt.Errorf("credentials not found at %s — run sync-credentials", r.ClaudeConfigDir)
+ }
+ if _, err := os.Stat(settingsPath); os.IsNotExist(err) {
+ return fmt.Errorf("claude settings (.claude.json) not found at %s — run sync-credentials", r.ClaudeConfigDir)
+ }
+ }
+
+ // Run container (with auth retry on failure).
+ runErr := r.runContainer(ctx, t, e, workspace, agentHome, isResume)
+ if runErr != nil && isAuthError(runErr) && r.CredentialSyncCmd != "" {
+ r.Logger.Warn("auth failure detected, syncing credentials and retrying once", "taskID", t.ID)
+ syncOut, syncErr := r.command(ctx, r.CredentialSyncCmd).CombinedOutput()
+ if syncErr != nil {
+ r.Logger.Warn("sync-credentials failed", "error", syncErr, "output", string(syncOut))
+ }
+ // Re-copy credentials into agentHome with fresh files.
+ if srcData, readErr := os.ReadFile(filepath.Join(r.ClaudeConfigDir, ".credentials.json")); readErr == nil {
+ _ = os.WriteFile(filepath.Join(agentHome, ".claude", ".credentials.json"), srcData, 0600)
+ }
+ if srcData, readErr := os.ReadFile(filepath.Join(r.ClaudeConfigDir, ".claude.json")); readErr == nil {
+ _ = os.WriteFile(filepath.Join(agentHome, ".claude.json"), srcData, 0644)
+ }
+ runErr = r.runContainer(ctx, t, e, workspace, agentHome, isResume)
+ }
+
+ if runErr == nil {
+ success = true
+ }
+ var blockedErr *BlockedError
+ if errors.As(runErr, &blockedErr) {
+ isBlocked = true
+ success = true // preserve workspace for resumption
+ }
+ return runErr
+}
+
+// runContainer runs the docker container for the given task and handles log setup,
+// environment files, instructions, and post-execution git operations.
+func (r *ContainerRunner) runContainer(ctx context.Context, t *task.Task, e *storage.Execution, workspace, agentHome string, isResume bool) error {
+ repoURL := t.RepositoryURL
+
+ image := t.Agent.ContainerImage
+ if image == "" {
+ image = r.Image
+ }
+ if image == "" {
+ image = "claudomator-agent:latest"
+ }
+
// 3. Prepare logs
logDir := r.ExecLogDir(e.ID)
if logDir == "" {
@@ -145,27 +225,6 @@ func (r *ContainerRunner) Run(ctx context.Context, t *task.Task, e *storage.Exec
return fmt.Errorf("writing instructions: %w", err)
}
- // Set up a writable $HOME staging dir so any agent tool (claude, gemini, etc.)
- // can freely create subdirs (session-env, .gemini, .cache, …) without hitting
- // a non-existent or read-only home. We copy only the claude credentials into it.
- agentHome := filepath.Join(workspace, ".agent-home")
- if err := os.MkdirAll(filepath.Join(agentHome, ".claude"), 0755); err != nil {
- return fmt.Errorf("creating agent home staging dir: %w", err)
- }
- if err := os.MkdirAll(filepath.Join(agentHome, ".gemini"), 0755); err != nil {
- return fmt.Errorf("creating .gemini dir: %w", err)
- }
- if r.ClaudeConfigDir != "" {
- // credentials
- if srcData, readErr := os.ReadFile(filepath.Join(r.ClaudeConfigDir, ".credentials.json")); readErr == nil {
- _ = os.WriteFile(filepath.Join(agentHome, ".claude", ".credentials.json"), srcData, 0600)
- }
- // settings (used by claude CLI; copy so it can write updates without hitting the host)
- if srcData, readErr := os.ReadFile(filepath.Join(r.ClaudeConfigDir, ".claude.json")); readErr == nil {
- _ = os.WriteFile(filepath.Join(agentHome, ".claude.json"), srcData, 0644)
- }
- }
-
args := r.buildDockerArgs(workspace, agentHome, e.TaskID)
innerCmd := r.buildInnerCmd(t, e, isResume)
@@ -233,8 +292,6 @@ func (r *ContainerRunner) Run(ctx context.Context, t *task.Task, e *storage.Exec
r.Logger.Info("treating question file as completion report", "taskID", e.TaskID)
e.Summary = extractQuestionText(questionJSON)
} else {
- isBlocked = true
- success = true // We consider BLOCKED as a "success" for workspace preservation
if e.SessionID == "" {
r.Logger.Warn("missing session ID; resume will start fresh", "taskID", e.TaskID)
}
@@ -278,7 +335,6 @@ func (r *ContainerRunner) Run(ctx context.Context, t *task.Task, e *storage.Exec
}
r.Logger.Info("no new commits to push", "taskID", t.ID)
}
- success = true
}
if waitErr != nil {