feat: executor reliability — per-agent limit, drain gate, pre-flight creds, auth recovery

- maxPerAgent=1: only 1 in-flight execution per agent type at a time; excess tasks are requeued after 30s
- Drain gate: after 2 consecutive failures the agent is drained and a question is set on the task; reset on first success; POST /api/pool/agents/{agent}/undrain to acknowledge
- Pre-flight credential check: verify .credentials.json and .claude.json exist in agentHome before spinning up a container
- Auth error auto-recovery: detect auth errors (Not logged in, OAuth token has expired, etc.) and retry once after running sync-credentials and re-copying fresh credentials
- Extracted runContainer() helper from ContainerRunner.Run() to support the retry flow
- Wire CredentialSyncCmd in serve.go for all three ContainerRunner instances
- Tests: TestPool_MaxPerAgent_*, TestPool_ConsecutiveFailures_*, TestPool_Undrain_*, TestContainerRunner_Missing{Credentials,Settings}_FailsFast, TestIsAuthError_*, TestContainerRunner_AuthError_SyncsAndRetries

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
author: Claudomator Agent <agent@claudomator.local> 2026-03-21 23:18:50 +0000
committer: Claudomator Agent <agent@claudomator.local> 2026-03-21 23:18:50 +0000
commit: 8dca9bbb0baee59ffe0d3127180ef0958dda8b91 (patch)
tree: e887036f4cce0f10694c5b9a29f4b4dc251769ba /internal/executor/container_test.go
parent: 9e35f7e4087cfa6017cb65ec6a7036f394f5eb22 (diff)
1 files changed, 171 insertions, 0 deletions
diff --git a/internal/executor/container_test.go b/internal/executor/container_test.go
index be80b51..b6946ef 100644
--- a/internal/executor/container_test.go
+++ b/internal/executor/container_test.go
@@ -7,6 +7,7 @@ import (
 	"log/slog"
 	"os"
 	"os/exec"
+	"path/filepath"
 	"strings"
 	"testing"
 
@@ -343,3 +344,173 @@ func TestGitSafe_PrependsSafeDirectory(t *testing.T) {
 		}
 	}
 }
+
+// TestContainerRunner_MissingCredentials_FailsFast expects Run to return a
+// "credentials not found" error when ClaudeConfigDir has no .credentials.json.
+func TestContainerRunner_MissingCredentials_FailsFast(t *testing.T) {
+	logger := slog.New(slog.NewTextHandler(io.Discard, nil))
+
+	// Only .claude.json exists; .credentials.json is deliberately left missing.
+	claudeConfigDir := t.TempDir()
+	if err := os.WriteFile(filepath.Join(claudeConfigDir, ".claude.json"), []byte("{}"), 0644); err != nil {
+		t.Fatal(err)
+	}
+
+	runner := &ContainerRunner{
+		Logger:          logger,
+		Image:           "busybox",
+		ClaudeConfigDir: claudeConfigDir,
+		Command: func(ctx context.Context, name string, arg ...string) *exec.Cmd {
+			// Fake "git clone" by creating the destination directory (the last argument).
+			if name == "git" && len(arg) > 0 && arg[0] == "clone" {
+				if err := os.MkdirAll(arg[len(arg)-1], 0755); err != nil {
+					t.Errorf("creating fake clone dir: %v", err)
+				}
+			}
+			return exec.Command("true")
+		},
+	}
+
+	tk := &task.Task{
+		ID:            "test-missing-creds",
+		RepositoryURL: "https://github.com/example/repo.git",
+		Agent:         task.AgentConfig{Type: "claude"},
+	}
+	e := &storage.Execution{ID: "test-exec", TaskID: "test-missing-creds"}
+
+	err := runner.Run(context.Background(), tk, e)
+	if err == nil {
+		t.Fatal("expected error due to missing credentials, got nil")
+	}
+	if !strings.Contains(err.Error(), "credentials not found") {
+		t.Errorf("expected 'credentials not found' error, got: %v", err)
+	}
+}
+
+// TestContainerRunner_MissingSettings_FailsFast expects Run to return a "claude settings"
+// error when ClaudeConfigDir holds .credentials.json but no .claude.json.
+func TestContainerRunner_MissingSettings_FailsFast(t *testing.T) {
+	logger := slog.New(slog.NewTextHandler(io.Discard, nil))
+	claudeConfigDir := t.TempDir()
+	if err := os.WriteFile(filepath.Join(claudeConfigDir, ".credentials.json"), []byte("{}"), 0600); err != nil {
+		t.Fatal(err)
+	}
+
+	runner := &ContainerRunner{
+		Logger:          logger,
+		Image:           "busybox",
+		ClaudeConfigDir: claudeConfigDir,
+		Command: func(ctx context.Context, name string, arg ...string) *exec.Cmd {
+			// Fake "git clone" by creating the destination directory (the last argument).
+			if name == "git" && len(arg) > 0 && arg[0] == "clone" {
+				if err := os.MkdirAll(arg[len(arg)-1], 0755); err != nil {
+					t.Errorf("creating fake clone dir: %v", err)
+				}
+			}
+			return exec.Command("true")
+		},
+	}
+
+	tk := &task.Task{
+		ID:            "test-missing-settings",
+		RepositoryURL: "https://github.com/example/repo.git",
+		Agent:         task.AgentConfig{Type: "claude"},
+	}
+	e := &storage.Execution{ID: "test-exec-2", TaskID: "test-missing-settings"}
+
+	err := runner.Run(context.Background(), tk, e)
+	if err == nil {
+		t.Fatal("expected error due to missing settings, got nil")
+	}
+	if !strings.Contains(err.Error(), "claude settings") {
+		t.Errorf("expected 'claude settings' error, got: %v", err)
+	}
+}
+
+// TestIsAuthError_DetectsAllVariants runs isAuthError over auth and non-auth messages.
+func TestIsAuthError_DetectsAllVariants(t *testing.T) {
+	cases := []struct {
+		msg  string
+		want bool
+	}{
+		{msg: "Not logged in", want: true},
+		{msg: "OAuth token has expired", want: true},
+		{msg: "authentication_error: invalid token", want: true},
+		{msg: "Please run /login to authenticate", want: true},
+		{msg: "container execution failed: exit status 1", want: false},
+		{msg: "git clone failed", want: false},
+		{msg: "", want: false},
+	}
+	for _, tc := range cases {
+		var err error // stays nil for the empty message
+		if tc.msg != "" {
+			err = fmt.Errorf("%s", tc.msg)
+		}
+		if got := isAuthError(err); got != tc.want {
+			t.Errorf("isAuthError(%q) = %v, want %v", tc.msg, got, tc.want)
+		}
+	}
+}
+
+// TestContainerRunner_AuthError_SyncsAndRetries checks that an auth failure from the first
+// docker invocation leads to the credential sync command and a second docker invocation.
+func TestContainerRunner_AuthError_SyncsAndRetries(t *testing.T) {
+	logger := slog.New(slog.NewTextHandler(io.Discard, nil))
+	mustWrite := func(path, content string, perm os.FileMode) {
+		t.Helper()
+		if err := os.WriteFile(path, []byte(content), perm); err != nil {
+			t.Fatalf("writing fixture %s: %v", path, err)
+		}
+	}
+
+	// The sync script leaves a marker file behind so the test can tell it ran.
+	syncDir := t.TempDir()
+	syncMarker := filepath.Join(syncDir, "sync-called")
+	syncScript := filepath.Join(syncDir, "sync-creds")
+	mustWrite(syncScript, "#!/bin/sh\ntouch "+syncMarker+"\n", 0755)
+
+	claudeConfigDir := t.TempDir()
+	mustWrite(filepath.Join(claudeConfigDir, ".credentials.json"), `{"token":"fresh"}`, 0600)
+	mustWrite(filepath.Join(claudeConfigDir, ".claude.json"), "{}", 0644)
+
+	callCount := 0
+	runner := &ContainerRunner{
+		Logger:            logger,
+		Image:             "busybox",
+		ClaudeConfigDir:   claudeConfigDir,
+		CredentialSyncCmd: syncScript,
+		Command: func(ctx context.Context, name string, arg ...string) *exec.Cmd {
+			switch {
+			case name == "git" && len(arg) > 0 && arg[0] == "clone":
+				// Fake the clone by creating the destination directory (the last argument).
+				if err := os.MkdirAll(arg[len(arg)-1], 0755); err != nil {
+					t.Errorf("creating fake clone dir: %v", err)
+				}
+			case name == "docker":
+				callCount++
+				if callCount == 1 {
+					return exec.Command("sh", "-c", "echo 'Not logged in' >&2; exit 1")
+				}
+				return exec.Command("sh", "-c", "exit 0") // later docker calls "succeed"
+			case name == syncScript:
+				return exec.Command("sh", "-c", "touch "+syncMarker)
+			}
+			return exec.Command("true")
+		},
+	}
+
+	tk := &task.Task{
+		ID:            "auth-retry-test",
+		RepositoryURL: "https://github.com/example/repo.git",
+		Agent:         task.AgentConfig{Type: "claude", Instructions: "test"},
+	}
+	e := &storage.Execution{ID: "auth-retry-exec", TaskID: "auth-retry-test"}
+
+	// Result deliberately ignored: the retried run may still fail later (git push etc.).
+	_ = runner.Run(context.Background(), tk, e)
+	if callCount < 2 {
+		t.Errorf("expected docker to be called at least twice (original + retry), got %d", callCount)
+	}
+	if _, err := os.Stat(syncMarker); err != nil {
+		t.Errorf("expected sync-credentials to be called, but marker file not found: %v", err)
+	}
+}
author	Claudomator Agent <agent@claudomator.local>	2026-03-21 23:18:50 +0000
committer	Claudomator Agent <agent@claudomator.local>	2026-03-21 23:18:50 +0000
commit	8dca9bbb0baee59ffe0d3127180ef0958dda8b91 (patch)
tree	e887036f4cce0f10694c5b9a29f4b4dc251769ba /internal/executor/container_test.go
parent	9e35f7e4087cfa6017cb65ec6a7036f394f5eb22 (diff)