diff options
| -rw-r--r-- | docs/adr/002-task-state-machine.md | 18 | ||||
| -rw-r--r-- | docs/packages_old/task.md | 262 | ||||
| -rw-r--r-- | internal/api/docs/RAW_NARRATIVE.md | 117 | ||||
| -rw-r--r-- | internal/api/elaborate.go | 70 | ||||
| -rw-r--r-- | internal/api/elaborate_test.go | 178 | ||||
| -rw-r--r-- | internal/api/server.go | 36 | ||||
| -rw-r--r-- | internal/api/server_test.go | 123 | ||||
| -rw-r--r-- | internal/cli/serve.go | 1 | ||||
| -rw-r--r-- | internal/executor/claude.go | 8 | ||||
| -rw-r--r-- | internal/executor/executor.go | 43 | ||||
| -rw-r--r-- | internal/executor/executor_test.go | 58 | ||||
| -rw-r--r-- | internal/executor/preamble.go | 11 | ||||
| -rw-r--r-- | internal/executor/summary.go | 57 | ||||
| -rw-r--r-- | internal/executor/summary_test.go | 49 | ||||
| -rw-r--r-- | internal/storage/db.go | 66 | ||||
| -rw-r--r-- | internal/task/task.go | 51 | ||||
| -rw-r--r-- | web/app.js | 84 | ||||
| -rw-r--r-- | web/style.css | 38 | ||||
| -rw-r--r-- | web/test/task-actions.test.mjs | 89 |
19 files changed, 1254 insertions, 105 deletions
diff --git a/docs/adr/002-task-state-machine.md b/docs/adr/002-task-state-machine.md index 310c337..6910f6a 100644 --- a/docs/adr/002-task-state-machine.md +++ b/docs/adr/002-task-state-machine.md @@ -66,13 +66,13 @@ True terminal state (no outgoing transitions): `COMPLETED`. All other non-succes | `QUEUED` | `RUNNING` | Pool goroutine starts execution | | `QUEUED` | `CANCELLED` | `POST /api/tasks/{id}/cancel` | | `RUNNING` | `READY` | Runner exits 0, no question file, top-level task (`parent_task_id == ""`), and task has no subtasks | -| `RUNNING` | `BLOCKED` | Runner exits 0, no question file, top-level task (`parent_task_id == ""`), and task has subtasks | +| `RUNNING` | `BLOCKED` (subtasks) | Runner exits 0, no question file, top-level task (`parent_task_id == ""`), and task has subtasks | +| `RUNNING` | `BLOCKED` (question) | Runner exits 0 but left a `question.json` file in log dir (any task type) | | `RUNNING` | `COMPLETED` | Runner exits 0, no question file, subtask (`parent_task_id != ""`) | | `RUNNING` | `FAILED` | Runner exits non-zero or stream signals `is_error: true` | | `RUNNING` | `TIMED_OUT` | Context deadline exceeded (`context.DeadlineExceeded`) | | `RUNNING` | `CANCELLED` | Context cancelled (`context.Canceled`) | | `RUNNING` | `BUDGET_EXCEEDED` | `--max-budget-usd` exceeded (signalled by runner) | -| `RUNNING` | `BLOCKED` | Runner exits 0 but left a `question.json` file in log dir | | `READY` | `COMPLETED` | `POST /api/tasks/{id}/accept` | | `READY` | `PENDING` | `POST /api/tasks/{id}/reject` (with optional comment) | | `FAILED` | `QUEUED` | Retry (manual re-run via `POST /api/tasks/{id}/run`) | @@ -85,7 +85,7 @@ True terminal state (no outgoing transitions): `COMPLETED`. All other non-succes ## Implementation **Validation:** `task.ValidTransition(from, to State) bool` -(`internal/task/task.go:93`) — called by API handlers before every state change. +(`internal/task/task.go:123`) — called by API handlers before every state change. **State writes:** `storage.DB.UpdateTaskState(id, state)` — single source of write; called by both API handlers and the executor pool. @@ -158,9 +158,9 @@ Task lifecycle changes produce WebSocket broadcasts to all connected clients: ## Known Limitations and Edge Cases -- **`BUDGET_EXCEEDED` transition.** `BUDGET_EXCEEDED` appears in `terminalFailureStates` - (used by `waitForDependencies`) but has no outgoing transitions in `ValidTransition`, - making it permanently terminal. There is no `/resume` endpoint for it. +- **`BUDGET_EXCEEDED` retry.** `BUDGET_EXCEEDED → QUEUED` is a valid transition (retry via + `POST /run`), matching `FAILED` and `CANCELLED` behaviour. However, there is no dedicated + `/resume` endpoint for it — callers must use the standard `/run` restart path. - **Retry enforcement.** `RetryConfig.MaxAttempts` is stored but not enforced by the pool. The API allows unlimited manual retries via `POST /run` from `FAILED`. @@ -178,9 +178,9 @@ Task lifecycle changes produce WebSocket broadcasts to all connected clients: | Concern | File | Lines | |---|---|---| -| State constants | `internal/task/task.go` | 7–18 | -| `ValidTransition` | `internal/task/task.go` | 93–109 | -| State machine tests | `internal/task/task_test.go` | 8–72 | +| State constants | `internal/task/task.go` | 9–20 | +| `ValidTransition` | `internal/task/task.go` | 107–130 | +| State machine tests | `internal/task/task_test.go` | 8–75 | | Pool execute | `internal/executor/executor.go` | 194–303 | | Pool executeResume | `internal/executor/executor.go` | 116–185 | | Dependency wait | `internal/executor/executor.go` | 305–340 | diff --git a/docs/packages_old/task.md b/docs/packages_old/task.md new file mode 100644 index 0000000..923cd56 --- /dev/null +++ b/docs/packages_old/task.md @@ -0,0 +1,262 @@ +# Package: task + +`internal/task` — Task definition, parsing, validation, and state machine. + +--- + +## 1. Overview + +A **Task** is the central unit of work in Claudomator. It describes what an agent should do (`agent.instructions`), how it should be run (timeout, retry, priority), and how it relates to other tasks (`depends_on`, `parent_task_id`). Tasks are defined in YAML files, parsed into `Task` structs, persisted in SQLite, and driven through a state machine from `PENDING` to a terminal state. + +--- + +## 2. Task Struct + +```go +type Task struct { + ID string // UUID; auto-generated if omitted in YAML + ParentTaskID string // ID of parent task (subtask linkage); empty for root tasks + Name string // Human-readable label; required + Description string // Optional longer description + Agent AgentConfig // How to invoke the agent + Timeout Duration // Maximum wall-clock run time (e.g. "30m"); 0 = no limit + Retry RetryConfig // Retry policy + Priority Priority // "high" | "normal" | "low"; default "normal" + Tags []string // Arbitrary labels for filtering + DependsOn []string // Task IDs that must reach COMPLETED before this queues + State State // Current lifecycle state; not read from YAML (yaml:"-") + RejectionComment string // Set by RejectTask; not read from YAML (yaml:"-") + QuestionJSON string // Pending question from a READY agent; not read from YAML (yaml:"-") + CreatedAt time.Time // Set on parse; not read from YAML (yaml:"-") + UpdatedAt time.Time // Updated on every state change; not read from YAML (yaml:"-") +} +``` + +Fields tagged `yaml:"-"` are runtime-only and are never parsed from task YAML files. + +--- + +## 3. AgentConfig Struct + +```go +type AgentConfig struct { + Type string // Agent implementation: "claude", "gemini", etc. + Model string // Model identifier passed to the agent binary + ContextFiles []string // Files injected into agent context at start + Instructions string // Prompt / task description sent to the agent; required + ProjectDir string // Working directory for the agent process + MaxBudgetUSD float64 // Spending cap in USD; 0 = unlimited; must be >= 0 + PermissionMode string // One of: default | acceptEdits | bypassPermissions | plan | dontAsk | delegate + AllowedTools []string // Whitelist of tool names the agent may use + DisallowedTools []string // Blacklist of tool names the agent may not use + SystemPromptAppend string // Text appended to the agent's system prompt + AdditionalArgs []string // Extra CLI arguments forwarded to the agent binary + SkipPlanning bool // When true, bypass the planning phase +} +``` + +--- + +## 4. RetryConfig Struct + +```go +type RetryConfig struct { + MaxAttempts int // Total attempts including the first; default 1 (no retry) + Backoff string // "linear" or "exponential"; default "exponential" +} +``` + +--- + +## 5. YAML Task File Format — Single Task + +```yaml +# Unique identifier. Optional: auto-generated UUID if omitted. +id: "fix-login-bug" + +# Human-readable label. Required. +name: "Fix login redirect bug" + +# Optional description shown in the UI. +description: "Users are redirected to /home instead of /dashboard after login." + +# Agent configuration. +agent: + # Agent type: "claude", "gemini", etc. Can be omitted for auto-classification. + type: "claude" + + # Model to use. Empty = agent default. + model: "claude-opus-4-6" + + # Files loaded into the agent's context before execution. + context_files: + - "src/auth/login.go" + - "docs/design/auth.md" + + # Prompt sent to the agent. Required. + instructions: | + Fix the post-login redirect in src/auth/login.go so that users are + sent to /dashboard instead of /home. Add a regression test. + + # Working directory for the agent process. Empty = server working directory. + project_dir: "/workspace/myapp" + + # USD spending cap. 0 = no limit. + max_budget_usd: 1.00 + + # Permission mode. Valid values: default | acceptEdits | bypassPermissions | plan | dontAsk | delegate + permission_mode: "acceptEdits" + + # Tool whitelist. Empty = all tools allowed. + allowed_tools: + - "Edit" + - "Read" + - "Bash" + + # Tool blacklist. + disallowed_tools: + - "WebFetch" + + # Appended to the agent's system prompt. + system_prompt_append: "Always write tests before implementation." + + # Extra arguments forwarded verbatim to the agent binary. + additional_args: + - "--verbose" + + # Skip the planning phase. + skip_planning: false + +# Maximum run time. Accepts Go duration strings: "30m", "1h30m", "45s". +# 0 or omitted = no limit. +timeout: "30m" + +# Retry policy. +retry: + # Total attempts (including first). Must be >= 1. + max_attempts: 3 + # "linear" or "exponential". + backoff: "exponential" + +# Scheduling priority: "high", "normal" (default), or "low". +priority: "normal" + +# Arbitrary string labels for filtering. +tags: + - "bug" + - "auth" + +# Task IDs that must be COMPLETED before this task is queued. +depends_on: + - "setup-test-db" +``` + +--- + +## 6. Batch File Format + +A batch file wraps multiple tasks under a `tasks` key. Each entry is a full task definition (same fields as above). All tasks are parsed and initialized together. + +```yaml +tasks: + - name: "Step 1 — scaffold" + agent: + instructions: "Create the initial project structure." + priority: "high" + + - name: "Step 2 — implement" + agent: + instructions: "Implement the feature described in docs/feature.md." + depends_on: + - "step-1-id" + + - name: "Step 3 — test" + agent: + instructions: "Write and run integration tests." + depends_on: + - "step-2-id" + retry: + max_attempts: 2 + backoff: "linear" +``` + +`ParseFile` tries the batch format first; if no `tasks` key is present it falls back to single-task parsing. + +--- + +## 7. State Constants + +| Constant | Value | Meaning | +|-------------------|------------------|-------------------------------------------------------------------------| +| `StatePending` | `PENDING` | Newly created; awaiting classification or human approval. | +| `StateQueued` | `QUEUED` | Accepted and waiting for an available agent slot. | +| `StateRunning` | `RUNNING` | Agent process is actively executing. | +| `StateReady` | `READY` | Agent has paused and is awaiting human input (question / approval). | +| `StateCompleted` | `COMPLETED` | Agent finished successfully. Terminal — no further transitions allowed. | +| `StateFailed` | `FAILED` | Agent exited with a non-zero code or internal error. | +| `StateTimedOut` | `TIMED_OUT` | Execution exceeded the configured `timeout`. | +| `StateCancelled` | `CANCELLED` | Explicitly cancelled by the user or scheduler. | +| `StateBudgetExceeded` | `BUDGET_EXCEEDED` | Agent hit the `max_budget_usd` cap before finishing. | +| `StateBlocked` | `BLOCKED` | Waiting on a dependency task that has not yet completed. | + +--- + +## 8. State Machine — Valid Transitions + +| From | To | Condition / trigger | +|--------------------|--------------------|--------------------------------------------------------------| +| `PENDING` | `QUEUED` | Task approved and eligible for scheduling. | +| `PENDING` | `CANCELLED` | Cancelled before being queued. | +| `QUEUED` | `RUNNING` | Agent slot becomes available; execution starts. | +| `QUEUED` | `CANCELLED` | Cancelled while waiting in the queue. | +| `RUNNING` | `READY` | Agent pauses and emits a question for human input. | +| `RUNNING` | `COMPLETED` | Agent exits successfully. | +| `RUNNING` | `FAILED` | Agent exits with an error. | +| `RUNNING` | `TIMED_OUT` | Wall-clock time exceeds `timeout`. | +| `RUNNING` | `CANCELLED` | Cancelled mid-execution. | +| `RUNNING` | `BUDGET_EXCEEDED` | Cumulative cost exceeds `max_budget_usd`. | +| `RUNNING` | `BLOCKED` | Runtime dependency check fails. | +| `READY` | `COMPLETED` | Human answer accepted; task finishes. | +| `READY` | `PENDING` | Answer rejected; task returns to pending for re-approval. | +| `FAILED` | `QUEUED` | Retry requested (re-enqueue). | +| `TIMED_OUT` | `QUEUED` | Retry or resume requested. | +| `CANCELLED` | `QUEUED` | Restart requested. | +| `BUDGET_EXCEEDED` | `QUEUED` | Retry with higher or no budget. | +| `BLOCKED` | `QUEUED` | Dependency became satisfied; task re-queued. | +| `BLOCKED` | `READY` | Dependency resolved but human review required. | +| `COMPLETED` | *(none)* | Terminal state. | + +--- + +## 9. Key Functions + +### `ParseFile(path string) ([]Task, error)` +Reads a YAML file at `path` and returns one or more tasks. Tries batch format (`tasks:` key) first; falls back to single-task format. Auto-assigns UUIDs, default priority, default retry config, and sets `State = PENDING` on all returned tasks. + +### `Parse(data []byte) ([]Task, error)` +Same as `ParseFile` but operates on raw bytes instead of a file path. + +### `ValidTransition(from, to State) bool` +Returns `true` if the transition from `from` to `to` is permitted by the state machine. Used by `storage.DB.UpdateTaskState` to enforce transitions atomically inside a transaction. + +### `Validate(t *Task) error` +Validates a task's fields. Returns `*ValidationError` (implementing `error`) with all failures collected, or `nil` if valid. + +--- + +## 10. Validation Rules + +The `Validate` function enforces the following rules: + +| Rule | Details | +|------|---------| +| `name` required | Must be non-empty. | +| `agent.instructions` required | Must be non-empty. | +| `agent.max_budget_usd` non-negative | Must be `>= 0`. | +| `timeout` non-negative | Must be `>= 0` (zero means no limit). | +| `retry.max_attempts >= 1` | Must be at least 1. | +| `retry.backoff` valid values | Must be empty, `"linear"`, or `"exponential"`. | +| `priority` valid values | Must be empty, `"high"`, `"normal"`, or `"low"`. | +| `agent.permission_mode` valid values | Must be empty or one of: `default`, `acceptEdits`, `bypassPermissions`, `plan`, `dontAsk`, `delegate`. | + +Multiple failures are collected and returned together in a single `ValidationError`. diff --git a/internal/api/docs/RAW_NARRATIVE.md b/internal/api/docs/RAW_NARRATIVE.md index 8fe69b6..3c7768a 100644 --- a/internal/api/docs/RAW_NARRATIVE.md +++ b/internal/api/docs/RAW_NARRATIVE.md @@ -1,4 +1,121 @@ +--- 2026-03-10T09:33:34Z --- +run the Go test suite with race detector and fail if coverage < 80% + +--- 2026-03-10T09:33:34Z --- +do something + +--- 2026-03-10T09:33:34Z --- +do something + +--- 2026-03-10T16:46:39Z --- +run the Go test suite with race detector and fail if coverage < 80% + +--- 2026-03-10T16:46:39Z --- +do something + +--- 2026-03-10T16:46:39Z --- +do something + +--- 2026-03-10T17:16:31Z --- +run the Go test suite with race detector and fail if coverage < 80% + +--- 2026-03-10T17:16:31Z --- +do something + +--- 2026-03-10T17:16:31Z --- +do something + +--- 2026-03-10T17:25:16Z --- +run the Go test suite with race detector and fail if coverage < 80% + +--- 2026-03-10T17:25:16Z --- +do something + +--- 2026-03-10T17:25:16Z --- +do something + +--- 2026-03-10T23:54:53Z --- +run the Go test suite with race detector and fail if coverage < 80% + +--- 2026-03-10T23:54:53Z --- +do something + +--- 2026-03-10T23:54:53Z --- +do something + +--- 2026-03-10T23:55:54Z --- +run the Go test suite with race detector and fail if coverage < 80% + +--- 2026-03-10T23:55:54Z --- +do something + +--- 2026-03-10T23:55:54Z --- +do something + +--- 2026-03-10T23:56:06Z --- +run the Go test suite with race detector and fail if coverage < 80% + +--- 2026-03-10T23:56:06Z --- +do something + +--- 2026-03-10T23:56:06Z --- +do something + +--- 2026-03-10T23:57:26Z --- +run the Go test suite with race detector and fail if coverage < 80% + +--- 2026-03-10T23:57:26Z --- +do something + +--- 2026-03-10T23:57:26Z --- +do something + +--- 2026-03-11T07:40:17Z --- +run the Go test suite with race detector and fail if coverage < 80% + +--- 2026-03-11T07:40:17Z --- +do something + +--- 2026-03-11T07:40:17Z --- +do something + +--- 2026-03-11T08:25:03Z --- +run the Go test suite with race detector and fail if coverage < 80% + +--- 2026-03-11T08:25:04Z --- +do something + +--- 2026-03-11T08:25:04Z --- +do something + +--- 2026-03-12T21:00:28Z --- +generate a report + +--- 2026-03-12T21:00:33Z --- +generate a report + +--- 2026-03-12T21:00:34Z --- +run the Go test suite with race detector and fail if coverage < 80% + +--- 2026-03-12T21:00:34Z --- +do something + +--- 2026-03-12T21:00:34Z --- +do something + +--- 2026-03-13T02:27:38Z --- +generate a report + +--- 2026-03-13T02:27:38Z --- +run the Go test suite with race detector and fail if coverage < 80% + +--- 2026-03-13T02:27:38Z --- +do something + +--- 2026-03-13T02:27:38Z --- +do something + --- 2026-03-11T19:04:51Z --- run the Go test suite with race detector and fail if coverage < 80% diff --git a/internal/api/elaborate.go b/internal/api/elaborate.go index eb686bf..c6d08f4 100644 --- a/internal/api/elaborate.go +++ b/internal/api/elaborate.go @@ -9,6 +9,7 @@ import ( "os" "os/exec" "path/filepath" + "sort" "strings" "time" ) @@ -32,10 +33,10 @@ Output ONLY a valid JSON object matching this schema (no markdown fences, no pro "agent": { "type": "claude" | "gemini", "model": string — "sonnet" for claude, "gemini-2.5-flash-lite" for gemini, - "instructions": string — detailed, step-by-step instructions for the agent, + "instructions": string — detailed, step-by-step instructions for the agent. Must end with a "## Acceptance Criteria" section listing measurable conditions that define success. For coding tasks, include TDD requirements (write failing tests first, then implement), ` + workDirLine + ` "max_budget_usd": number — conservative estimate (0.25–5.00), - "allowed_tools": array — only tools the task genuinely needs + "allowed_tools": array — every tool the task genuinely needs. Include "Write" if creating files, "Edit" if modifying files, "Read" if reading files, "Bash" for shell/git/test commands, "Grep"/"Glob" for searching. }, "timeout": string — e.g. "15m", "priority": string — "normal" | "high" | "low", @@ -62,6 +63,69 @@ type elaboratedAgent struct { AllowedTools []string `json:"allowed_tools"` } +// sanitizeElaboratedTask enforces tool completeness and dev practice compliance. +// It modifies t in place, inferring missing tools from instruction keywords and +// appending required sections when they are absent. +func sanitizeElaboratedTask(t *elaboratedTask) { + lower := strings.ToLower(t.Agent.Instructions) + + // Build current tool set. + toolSet := make(map[string]bool, len(t.Agent.AllowedTools)) + for _, tool := range t.Agent.AllowedTools { + toolSet[tool] = true + } + + // Infer missing tools from instruction keywords. + type rule struct { + tool string + keywords []string + } + rules := []rule{ + {"Write", []string{"create file", "write file", "new file", "write to", "save to", "output to", "generate file", "creates a file", "create a new file"}}, + {"Edit", []string{"edit", "modify", "refactor", "replace", "patch"}}, + {"Read", []string{"read", "inspect", "examine", "look at the file"}}, + {"Bash", []string{"run", "execute", "bash", "shell", "command", "build", "compile", "git", "install", "make"}}, + {"Grep", []string{"search for", "grep", "find in", "locate in"}}, + {"Glob", []string{"find file", "list file", "search file"}}, + } + for _, r := range rules { + if toolSet[r.tool] { + continue + } + for _, kw := range r.keywords { + if strings.Contains(lower, kw) { + toolSet[r.tool] = true + break + } + } + } + // Edit without Read is almost always wrong. + if toolSet["Edit"] && !toolSet["Read"] { + toolSet["Read"] = true + } + // Rebuild the list only when tools were added. + if len(toolSet) > len(t.Agent.AllowedTools) { + tools := make([]string, 0, len(toolSet)) + for tool := range toolSet { + tools = append(tools, tool) + } + sort.Strings(tools) + t.Agent.AllowedTools = tools + } + + // Append an acceptance criteria section when none is present. + if !strings.Contains(lower, "acceptance") && + !strings.Contains(lower, "done when") && + !strings.Contains(lower, "success criteria") { + t.Agent.Instructions += "\n\n## Acceptance Criteria\nBefore finishing, verify all stated goals are met, tests pass (if applicable), and no unintended side effects were introduced." + } + + // Append a TDD reminder for coding tasks that do not already mention tests. + if (toolSet["Edit"] || toolSet["Write"]) && !strings.Contains(lower, "test") { + t.Agent.Instructions += "\n\n## Dev Practices\nFollow TDD: write a failing test first, then implement the minimum code to make it pass. Commit all changes before finishing." + } +} + // claudeJSONResult is the top-level object returned by `claude --output-format json`. type claudeJSONResult struct { Result string `json:"result"` @@ -214,5 +278,7 @@ func (s *Server) handleElaborateTask(w http.ResponseWriter, r *http.Request) { result.Agent.Type = "claude" } + sanitizeElaboratedTask(&result) + writeJSON(w, http.StatusOK, result) } diff --git a/internal/api/elaborate_test.go b/internal/api/elaborate_test.go index 330c111..9ae2e98 100644 --- a/internal/api/elaborate_test.go +++ b/internal/api/elaborate_test.go @@ -30,6 +30,184 @@ func createFakeClaude(t *testing.T, output string, exitCode int) string { return script } +// hasTool is a test helper that reports whether name is in the tools slice. +func hasTool(tools []string, name string) bool { + for _, t := range tools { + if t == name { + return true + } + } + return false +} + +// --- sanitizeElaboratedTask unit tests --- + +func TestSanitize_AddsWriteWhenInstructionsMentionFileCreation(t *testing.T) { + task := &elaboratedTask{ + Agent: elaboratedAgent{ + Instructions: "Create a new file called output.txt with the results.", + AllowedTools: []string{"Bash"}, + }, + } + sanitizeElaboratedTask(task) + if !hasTool(task.Agent.AllowedTools, "Write") { + t.Errorf("expected Write in allowed_tools, got %v", task.Agent.AllowedTools) + } +} + +func TestSanitize_AddsReadWhenEditIsPresent(t *testing.T) { + task := &elaboratedTask{ + Agent: elaboratedAgent{ + Instructions: "Modify the configuration file.", + AllowedTools: []string{"Edit"}, + }, + } + sanitizeElaboratedTask(task) + if !hasTool(task.Agent.AllowedTools, "Read") { + t.Errorf("expected Read added alongside Edit, got %v", task.Agent.AllowedTools) + } +} + +func TestSanitize_NoDuplicateTools(t *testing.T) { + task := &elaboratedTask{ + Agent: elaboratedAgent{ + Instructions: "Run go test ./...", + AllowedTools: []string{"Bash"}, + }, + } + sanitizeElaboratedTask(task) + count := 0 + for _, tool := range task.Agent.AllowedTools { + if tool == "Bash" { + count++ + } + } + if count != 1 { + t.Errorf("Bash duplicated in allowed_tools: %v", task.Agent.AllowedTools) + } +} + +func TestSanitize_AddsAcceptanceCriteriaWhenMissing(t *testing.T) { + task := &elaboratedTask{ + Agent: elaboratedAgent{ + Instructions: "Do something useful with the codebase.", + AllowedTools: []string{"Bash"}, + }, + } + sanitizeElaboratedTask(task) + lower := strings.ToLower(task.Agent.Instructions) + if !strings.Contains(lower, "acceptance") && !strings.Contains(lower, "done when") { + t.Error("expected acceptance criteria section appended to instructions") + } +} + +func TestSanitize_NoopWhenAcceptanceCriteriaAlreadyPresent(t *testing.T) { + original := "Do something.\n\n## Acceptance Criteria\n- All tests pass." + task := &elaboratedTask{ + Agent: elaboratedAgent{ + Instructions: original, + AllowedTools: []string{"Bash"}, + }, + } + sanitizeElaboratedTask(task) + if task.Agent.Instructions != original { + t.Errorf("instructions were modified when acceptance criteria were already present") + } +} + +func TestSanitize_AddsTDDReminderForCodingTaskWithoutTestMention(t *testing.T) { + task := &elaboratedTask{ + Agent: elaboratedAgent{ + Instructions: "## Acceptance Criteria\nFix the bug.\n\nModify the handler to return 404 instead of 500.", + AllowedTools: []string{"Edit", "Read"}, + }, + } + sanitizeElaboratedTask(task) + lower := strings.ToLower(task.Agent.Instructions) + if !strings.Contains(lower, "tdd") && !strings.Contains(lower, "test") { + t.Error("expected TDD reminder for coding task without test mention") + } +} + +func TestSanitize_NoTDDReminderWhenTestsAlreadyMentioned(t *testing.T) { + original := "## Acceptance Criteria\nAll tests pass.\n\nEdit the file and run go test ./... to verify." + task := &elaboratedTask{ + Agent: elaboratedAgent{ + Instructions: original, + AllowedTools: []string{"Edit", "Read", "Bash"}, + }, + } + before := task.Agent.Instructions + sanitizeElaboratedTask(task) + // Should NOT add a second TDD block since tests are already mentioned. + // Count occurrences of "tdd" / "test" — just verify no double-append. + if strings.Count(strings.ToLower(task.Agent.Instructions), "tdd") > 1 { + t.Errorf("TDD block added twice; instructions:\n%s", task.Agent.Instructions) + } + _ = before +} + +func TestElaboratePrompt_RequiresAcceptanceCriteria(t *testing.T) { + prompt := buildElaboratePrompt("") + lower := strings.ToLower(prompt) + if !strings.Contains(lower, "acceptance criteria") { + t.Error("elaborate prompt should instruct the model to include acceptance criteria") + } +} + +func TestElaboratePrompt_RequiresAllRelevantTools(t *testing.T) { + prompt := buildElaboratePrompt("") + // Prompt must remind the model to include file-creating tools when needed. + if !strings.Contains(prompt, "Write") { + t.Error("elaborate prompt should mention the Write tool so models know to include it") + } +} + +func TestElaborateTask_SanitizationAppliedToResponse(t *testing.T) { + srv, _ := testServer(t) + + // Elaborator returns a task that needs Write (instructions say "create file") + // but does NOT include it in allowed_tools. + task := elaboratedTask{ + Name: "Generate report", + Description: "Creates a report file.", + Agent: elaboratedAgent{ + Type: "claude", + Model: "sonnet", + Instructions: "Create a new file called report.md with the analysis results.\n\n## Acceptance Criteria\n- report.md exists.", + MaxBudgetUSD: 0.5, + AllowedTools: []string{"Bash"}, // Write intentionally missing + }, + Timeout: "15m", + Priority: "normal", + Tags: []string{"report"}, + } + taskJSON, _ := json.Marshal(task) + wrapper := map[string]string{"result": string(taskJSON)} + wrapperJSON, _ := json.Marshal(wrapper) + + srv.elaborateCmdPath = createFakeClaude(t, string(wrapperJSON), 0) + + body := `{"prompt":"generate a report"}` + req := httptest.NewRequest("POST", "/api/tasks/elaborate", bytes.NewBufferString(body)) + req.Header.Set("Content-Type", "application/json") + w := httptest.NewRecorder() + + srv.Handler().ServeHTTP(w, req) + + if w.Code != http.StatusOK { + t.Fatalf("status: want 200, got %d; body: %s", w.Code, w.Body.String()) + } + + var result elaboratedTask + if err := json.NewDecoder(w.Body).Decode(&result); err != nil { + t.Fatalf("failed to decode response: %v", err) + } + if !hasTool(result.Agent.AllowedTools, "Write") { + t.Errorf("expected Write in sanitized allowed_tools, got %v", result.Agent.AllowedTools) + } +} + func TestElaboratePrompt_ContainsWorkDir(t *testing.T) { prompt := buildElaboratePrompt("/some/custom/path") if !strings.Contains(prompt, "/some/custom/path") { diff --git a/internal/api/server.go b/internal/api/server.go index c545253..df35536 100644 --- a/internal/api/server.go +++ b/internal/api/server.go @@ -24,6 +24,7 @@ type questionStore interface { GetLatestExecution(taskID string) (*storage.Execution, error) UpdateTaskQuestion(taskID, questionJSON string) error UpdateTaskState(id string, newState task.State) error + AppendTaskInteraction(taskID string, interaction task.Interaction) error } // Server provides the REST API and WebSocket endpoint for Claudomator. @@ -250,6 +251,25 @@ func (s *Server) handleAnswerQuestion(w http.ResponseWriter, r *http.Request) { return } + // Record the Q&A interaction before clearing the question. + if tk.QuestionJSON != "" { + var qData struct { + Text string `json:"text"` + Options []string `json:"options"` + } + if jsonErr := json.Unmarshal([]byte(tk.QuestionJSON), &qData); jsonErr == nil { + interaction := task.Interaction{ + QuestionText: qData.Text, + Options: qData.Options, + Answer: input.Answer, + AskedAt: tk.UpdatedAt, + } + if appendErr := s.questionStore.AppendTaskInteraction(taskID, interaction); appendErr != nil { + s.logger.Error("failed to append interaction", "taskID", taskID, "error", appendErr) + } + } + } + // Clear the question and transition to QUEUED. if err := s.questionStore.UpdateTaskQuestion(taskID, ""); err != nil { writeJSON(w, http.StatusInternalServerError, map[string]string{"error": "failed to clear question"}) @@ -277,6 +297,14 @@ func (s *Server) handleAnswerQuestion(w http.ResponseWriter, r *http.Request) { writeJSON(w, http.StatusOK, map[string]string{"message": "task queued for resume", "task_id": taskID}) } +// resumableStates are the task states from which a session-based resume is valid. +var resumableStates = map[task.State]string{ + task.StateTimedOut: "Your previous execution timed out. Please continue where you left off and complete the task.", + task.StateCancelled: "Your previous execution was cancelled. Please continue where you left off and complete the task.", + task.StateFailed: "Your previous execution failed. Please review what happened and continue from where you left off.", + task.StateBudgetExceeded: "Your previous execution exceeded its budget. Please continue where you left off and complete the task.", +} + func (s *Server) handleResumeTimedOutTask(w http.ResponseWriter, r *http.Request) { taskID := r.PathValue("id") @@ -285,8 +313,10 @@ func (s *Server) handleResumeTimedOutTask(w http.ResponseWriter, r *http.Request writeJSON(w, http.StatusNotFound, map[string]string{"error": "task not found"}) return } - if tk.State != task.StateTimedOut { - writeJSON(w, http.StatusConflict, map[string]string{"error": "task is not timed out"}) + + resumeMsg, resumable := resumableStates[tk.State] + if !resumable { + writeJSON(w, http.StatusConflict, map[string]string{"error": "task is not in a resumable state"}) return } @@ -302,7 +332,7 @@ func (s *Server) handleResumeTimedOutTask(w http.ResponseWriter, r *http.Request ID: uuid.New().String(), TaskID: taskID, ResumeSessionID: latest.SessionID, - ResumeAnswer: "Your previous execution timed out. Please continue where you left off and complete the task.", + ResumeAnswer: resumeMsg, } if err := s.pool.SubmitResume(context.Background(), tk, resumeExec); err != nil { writeJSON(w, http.StatusServiceUnavailable, map[string]string{"error": err.Error()}) diff --git a/internal/api/server_test.go b/internal/api/server_test.go index afdc9d2..c90e3b3 100644 --- a/internal/api/server_test.go +++ b/internal/api/server_test.go @@ -894,10 +894,11 @@ func TestServer_CancelTask_Completed_Returns409(t *testing.T) { // mockQuestionStore implements questionStore for testing handleAnswerQuestion. type mockQuestionStore struct { - getTaskFn func(id string) (*task.Task, error) - getLatestExecutionFn func(taskID string) (*storage.Execution, error) - updateTaskQuestionFn func(taskID, questionJSON string) error - updateTaskStateFn func(id string, newState task.State) error + getTaskFn func(id string) (*task.Task, error) + getLatestExecutionFn func(taskID string) (*storage.Execution, error) + updateTaskQuestionFn func(taskID, questionJSON string) error + updateTaskStateFn func(id string, newState task.State) error + appendInteractionFn func(taskID string, interaction task.Interaction) error } func (m *mockQuestionStore) GetTask(id string) (*task.Task, error) { @@ -912,6 +913,12 @@ func (m *mockQuestionStore) UpdateTaskQuestion(taskID, questionJSON string) erro func (m *mockQuestionStore) UpdateTaskState(id string, newState task.State) error { return m.updateTaskStateFn(id, newState) } +func (m *mockQuestionStore) AppendTaskInteraction(taskID string, interaction task.Interaction) error { + if m.appendInteractionFn != nil { + return m.appendInteractionFn(taskID, interaction) + } + return nil +} func TestServer_AnswerQuestion_UpdateQuestionFails_Returns500(t *testing.T) { srv, _ := testServer(t) @@ -1178,6 +1185,114 @@ func TestResumeTimedOut_ResponseShape(t *testing.T) { } } +func TestResumeInterrupted_Cancelled_Success_Returns202(t *testing.T) { + srv, store := testServer(t) + createTaskWithState(t, store, "resume-cancelled-1", task.StateCancelled) + + exec := &storage.Execution{ + ID: "exec-cancelled-1", + TaskID: "resume-cancelled-1", + SessionID: "550e8400-e29b-41d4-a716-446655440030", + Status: "CANCELLED", + } + if err := store.CreateExecution(exec); err != nil { + t.Fatalf("create execution: %v", err) + } + + req := httptest.NewRequest("POST", "/api/tasks/resume-cancelled-1/resume", nil) + w := httptest.NewRecorder() + srv.Handler().ServeHTTP(w, req) + + if w.Code != http.StatusAccepted { + t.Errorf("status: want 202, got %d; body: %s", w.Code, w.Body.String()) + } + got, _ := store.GetTask("resume-cancelled-1") + if got.State != task.StateQueued && got.State != task.StateRunning && got.State != task.StateReady { + t.Errorf("task state: want QUEUED/RUNNING/READY after resume, got %v", got.State) + } +} + +func TestResumeInterrupted_Failed_Success_Returns202(t *testing.T) { + srv, store := testServer(t) + createTaskWithState(t, store, "resume-failed-1", task.StateFailed) + + exec := &storage.Execution{ + ID: "exec-failed-1", + TaskID: "resume-failed-1", + SessionID: "550e8400-e29b-41d4-a716-446655440031", + Status: "FAILED", + } + if err := store.CreateExecution(exec); err != nil { + t.Fatalf("create execution: %v", err) + } + + req := httptest.NewRequest("POST", "/api/tasks/resume-failed-1/resume", nil) + w := httptest.NewRecorder() + srv.Handler().ServeHTTP(w, req) + + if w.Code != http.StatusAccepted { + t.Errorf("status: want 202, got %d; body: %s", w.Code, w.Body.String()) + } + got, _ := store.GetTask("resume-failed-1") + if got.State != task.StateQueued && got.State != task.StateRunning && got.State != task.StateReady { + t.Errorf("task state: want QUEUED/RUNNING/READY after resume, got %v", got.State) + } +} + +func TestResumeInterrupted_BudgetExceeded_Success_Returns202(t *testing.T) { + srv, store := testServer(t) + createTaskWithState(t, store, "resume-budget-1", task.StateBudgetExceeded) + + exec := &storage.Execution{ + ID: "exec-budget-1", + TaskID: "resume-budget-1", + SessionID: "550e8400-e29b-41d4-a716-446655440032", + Status: "BUDGET_EXCEEDED", + } + if err := store.CreateExecution(exec); err != nil { + t.Fatalf("create execution: %v", err) + } + + req := httptest.NewRequest("POST", "/api/tasks/resume-budget-1/resume", nil) + w := httptest.NewRecorder() + srv.Handler().ServeHTTP(w, req) + + if w.Code != http.StatusAccepted { + t.Errorf("status: want 202, got %d; body: %s", w.Code, w.Body.String()) + } + got, _ := store.GetTask("resume-budget-1") + if got.State != task.StateQueued && got.State != task.StateRunning && got.State != task.StateReady { + t.Errorf("task state: want QUEUED/RUNNING/READY after resume, got %v", got.State) + } +} + +func TestResumeInterrupted_NoSession_Returns500(t *testing.T) { + srv, store := testServer(t) + createTaskWithState(t, store, "resume-nosess-1", task.StateCancelled) + + // No execution — no session ID available. + req := httptest.NewRequest("POST", "/api/tasks/resume-nosess-1/resume", nil) + w := httptest.NewRecorder() + srv.Handler().ServeHTTP(w, req) + + if w.Code != http.StatusInternalServerError { + t.Errorf("status: want 500, got %d; body: %s", w.Code, w.Body.String()) + } +} + +func TestResumeInterrupted_WrongState_Returns409(t *testing.T) { + srv, store := testServer(t) + createTaskWithState(t, store, "resume-wrong-1", task.StatePending) + + req := httptest.NewRequest("POST", "/api/tasks/resume-wrong-1/resume", nil) + w := httptest.NewRecorder() + srv.Handler().ServeHTTP(w, req) + + if w.Code != http.StatusConflict { + t.Errorf("status: want 409, got %d; body: %s", w.Code, w.Body.String()) + } +} + func TestRateLimit_ValidateRejectsExcess(t *testing.T) { srv, _ := testServer(t) srv.elaborateLimiter = newIPRateLimiter(0, 1) diff --git a/internal/cli/serve.go b/internal/cli/serve.go index e5bd873..fd9fda8 100644 --- a/internal/cli/serve.go +++ b/internal/cli/serve.go @@ -77,6 +77,7 @@ func serve(addr string) error { pool.Classifier = &executor.Classifier{GeminiBinaryPath: cfg.GeminiBinaryPath} } pool.RecoverStaleRunning() + pool.RecoverStaleQueued(context.Background()) srv := api.NewServer(store, pool, logger, cfg.ClaudeBinaryPath, cfg.GeminiBinaryPath) if cfg.WebhookURL != "" { diff --git a/internal/executor/claude.go b/internal/executor/claude.go index 0e29f7f..a58f1ad 100644 --- a/internal/executor/claude.go +++ b/internal/executor/claude.go @@ -150,6 +150,13 @@ func (r *ClaudeRunner) Run(ctx context.Context, t *task.Task, e *storage.Executi return &BlockedError{QuestionJSON: strings.TrimSpace(string(data)), SessionID: e.SessionID, SandboxDir: sandboxDir} } + // Read agent summary if written. + summaryFile := filepath.Join(logDir, "summary.txt") + if summaryData, readErr := os.ReadFile(summaryFile); readErr == nil { + os.Remove(summaryFile) // consumed + e.Summary = strings.TrimSpace(string(summaryData)) + } + // Merge sandbox back to project_dir and clean up. if sandboxDir != "" { if mergeErr := teardownSandbox(projectDir, sandboxDir, r.Logger); mergeErr != nil { @@ -261,6 +268,7 @@ func (r *ClaudeRunner) execOnce(ctx context.Context, args []string, workingDir s "CLAUDOMATOR_API_URL="+r.APIURL, "CLAUDOMATOR_TASK_ID="+e.TaskID, "CLAUDOMATOR_QUESTION_FILE="+filepath.Join(e.ArtifactDir, "question.json"), + "CLAUDOMATOR_SUMMARY_FILE="+filepath.Join(e.ArtifactDir, "summary.txt"), ) // Put the subprocess in its own process group so we can SIGKILL the entire // group (MCP servers, bash children, etc.) on cancellation. diff --git a/internal/executor/executor.go b/internal/executor/executor.go index 76c8ac7..7ae4e2d 100644 --- a/internal/executor/executor.go +++ b/internal/executor/executor.go @@ -26,6 +26,8 @@ type Store interface { UpdateExecution(e *storage.Execution) error UpdateTaskState(id string, newState task.State) error UpdateTaskQuestion(taskID, questionJSON string) error + UpdateTaskSummary(taskID, summary string) error + AppendTaskInteraction(taskID string, interaction task.Interaction) error } // LogPather is an optional interface runners can implement to provide the log @@ -149,11 +151,20 @@ func (p *Pool) Cancel(taskID string) bool { return true } -// SubmitResume re-queues a blocked task using the provided resume execution. +// resumablePoolStates are the task states that may be submitted for session resume. +var resumablePoolStates = map[task.State]bool{ + task.StateBlocked: true, + task.StateTimedOut: true, + task.StateCancelled: true, + task.StateFailed: true, + task.StateBudgetExceeded: true, +} + +// SubmitResume re-queues a blocked or interrupted task using the provided resume execution. // The execution must have ResumeSessionID and ResumeAnswer set. func (p *Pool) SubmitResume(ctx context.Context, t *task.Task, exec *storage.Execution) error { - if t.State != task.StateBlocked && t.State != task.StateTimedOut { - return fmt.Errorf("task %s must be in BLOCKED or TIMED_OUT state to resume (current: %s)", t.ID, t.State) + if !resumablePoolStates[t.State] { + return fmt.Errorf("task %s must be in a resumable state to resume (current: %s)", t.ID, t.State) } if exec.ResumeSessionID == "" { return fmt.Errorf("resume execution for task %s must have a ResumeSessionID", t.ID) @@ -331,6 +342,15 @@ func (p *Pool) handleRunResult(ctx context.Context, t *task.Task, exec *storage. } } + summary := exec.Summary + if summary == "" && exec.StdoutPath != "" { + summary = extractSummary(exec.StdoutPath) + } + if summary != "" { + if summaryErr := p.store.UpdateTaskSummary(t.ID, summary); summaryErr != nil { + p.logger.Error("failed to update task summary", "taskID", t.ID, "error", summaryErr) + } + } if updateErr := p.store.UpdateExecution(exec); updateErr != nil { p.logger.Error("failed to update execution", "error", updateErr) } @@ -566,6 +586,23 @@ func (p *Pool) RecoverStaleRunning() { } } +// RecoverStaleQueued re-submits any tasks that are stuck in QUEUED state from +// a previous server instance. Call this once on server startup, after +// RecoverStaleRunning. +func (p *Pool) RecoverStaleQueued(ctx context.Context) { + tasks, err := p.store.ListTasks(storage.TaskFilter{State: task.StateQueued}) + if err != nil { + p.logger.Error("RecoverStaleQueued: list tasks", "error", err) + return + } + for _, t := range tasks { + p.logger.Info("resubmitting stale QUEUED task", "taskID", t.ID, "name", t.Name) + if err := p.Submit(ctx, t); err != nil { + p.logger.Error("RecoverStaleQueued: submit", "error", err, "taskID", t.ID) + } + } +} + // terminalFailureStates are dependency states that cause the waiting task to fail immediately. var terminalFailureStates = map[task.State]bool{ task.StateFailed: true, diff --git a/internal/executor/executor_test.go b/internal/executor/executor_test.go index 9448816..7e676eb 100644 --- a/internal/executor/executor_test.go +++ b/internal/executor/executor_test.go @@ -614,6 +614,60 @@ func TestPool_RecoverStaleRunning(t *testing.T) { } } +func TestPool_RecoverStaleQueued_ResubmitsToPool(t *testing.T) { + store := testStore(t) + runner := &mockRunner{} + logger := slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: slog.LevelError})) + pool := NewPool(2, map[string]Runner{"claude": runner}, store, logger) + + // Create a task already in QUEUED state (persisted from before a server restart). + tk := makeTask("stale-queued-1") + tk.State = task.StateQueued + store.CreateTask(tk) + + pool.RecoverStaleQueued(context.Background()) + + // Wait for the pool to pick it up and complete it. + select { + case result := <-pool.Results(): + if result.TaskID != tk.ID { + t.Errorf("unexpected task in results: %s", result.TaskID) + } + case <-time.After(2 * time.Second): + t.Fatal("timed out waiting for stale QUEUED task to complete") + } + + got, err := store.GetTask(tk.ID) + if err != nil { + t.Fatalf("get task: %v", err) + } + if got.State != task.StateCompleted && got.State != task.StateReady { + t.Errorf("state: want COMPLETED or READY, got %q", got.State) + } + if runner.callCount() != 1 { + t.Errorf("runner call count: want 1, got %d", runner.callCount()) + } +} + +func TestPool_RecoverStaleQueued_SkipsNonQueuedTasks(t *testing.T) { + store := testStore(t) + runner := &mockRunner{} + logger := slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: slog.LevelError})) + pool := NewPool(2, map[string]Runner{"claude": runner}, store, logger) + + // PENDING task should NOT be resubmitted. + tk := makeTask("pending-1") + tk.State = task.StatePending + store.CreateTask(tk) + + pool.RecoverStaleQueued(context.Background()) + time.Sleep(50 * time.Millisecond) + + if runner.callCount() != 0 { + t.Errorf("runner should not have been called for PENDING task, got %d calls", runner.callCount()) + } +} + func TestPool_ActivePerAgent_DeletesZeroEntries(t *testing.T) { store := testStore(t) runner := &mockRunner{} @@ -906,6 +960,10 @@ func (m *minimalMockStore) UpdateTaskQuestion(taskID, questionJSON string) error m.mu.Unlock() return nil } +func (m *minimalMockStore) UpdateTaskSummary(taskID, summary string) error { return nil } +func (m *minimalMockStore) AppendTaskInteraction(taskID string, _ task.Interaction) error { + return nil +} func (m *minimalMockStore) lastStateUpdate() (string, task.State, bool) { m.mu.Lock() diff --git a/internal/executor/preamble.go b/internal/executor/preamble.go index 1993361..5e57852 100644 --- a/internal/executor/preamble.go +++ b/internal/executor/preamble.go @@ -48,15 +48,14 @@ The sandbox is rejected if there are any uncommitted modifications. ## Final Summary (mandatory) -Before exiting, write a final summary paragraph (2-5 sentences) as your last output. Start it with "## Summary" on its own line. Describe: -- What was accomplished -- Key decisions made -- Any issues or follow-ups needed +Before exiting, write a brief summary paragraph (2–5 sentences) describing what you did +and the outcome. Write it to the path in $CLAUDOMATOR_SUMMARY_FILE: -This summary will be extracted and displayed in the task UI. + echo "Your summary here." > "$CLAUDOMATOR_SUMMARY_FILE" ---- +This summary is displayed in the task UI so the user knows what happened. +--- ` func withPlanningPreamble(instructions string) string { diff --git a/internal/executor/summary.go b/internal/executor/summary.go new file mode 100644 index 0000000..a942de0 --- /dev/null +++ b/internal/executor/summary.go @@ -0,0 +1,57 @@ +package executor + +import ( + "bufio" + "encoding/json" + "os" + "strings" +) + +// extractSummary reads a stream-json stdout log and returns the text following +// the last "## Summary" heading found in any assistant text block. +// Returns empty string if the file cannot be read or no summary is found. +func extractSummary(stdoutPath string) string { + f, err := os.Open(stdoutPath) + if err != nil { + return "" + } + defer f.Close() + + var last string + scanner := bufio.NewScanner(f) + scanner.Buffer(make([]byte, 1024*1024), 1024*1024) + for scanner.Scan() { + if text := summaryFromLine(scanner.Bytes()); text != "" { + last = text + } + } + return last +} + +// summaryFromLine parses a single stream-json line and returns the text after +// "## Summary" if the line is an assistant text block containing that heading. +func summaryFromLine(line []byte) string { + var event struct { + Type string `json:"type"` + Message struct { + Content []struct { + Type string `json:"type"` + Text string `json:"text"` + } `json:"content"` + } `json:"message"` + } + if err := json.Unmarshal(line, &event); err != nil || event.Type != "assistant" { + return "" + } + for _, block := range event.Message.Content { + if block.Type != "text" { + continue + } + idx := strings.Index(block.Text, "## Summary") + if idx == -1 { + continue + } + return strings.TrimSpace(block.Text[idx+len("## Summary"):]) + } + return "" +} diff --git a/internal/executor/summary_test.go b/internal/executor/summary_test.go new file mode 100644 index 0000000..4a73711 --- /dev/null +++ b/internal/executor/summary_test.go @@ -0,0 +1,49 @@ +package executor + +import ( + "os" + "path/filepath" + "testing" +) + +func TestExtractSummary_WithSummarySection(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "stdout.log") + content := streamLine(`{"type":"assistant","message":{"content":[{"type":"text","text":"## Summary\nThe task was completed successfully."}]}}`) + if err := os.WriteFile(path, []byte(content), 0600); err != nil { + t.Fatal(err) + } + got := extractSummary(path) + want := "The task was completed successfully." + if got != want { + t.Errorf("got %q, want %q", got, want) + } +} + +func TestExtractSummary_NoSummary(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "stdout.log") + content := streamLine(`{"type":"assistant","message":{"content":[{"type":"text","text":"All done, no summary heading."}]}}`) + if err := os.WriteFile(path, []byte(content), 0600); err != nil { + t.Fatal(err) + } + got := extractSummary(path) + if got != "" { + t.Errorf("expected empty string, got %q", got) + } +} + +func TestExtractSummary_MultipleSections_PicksLast(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "stdout.log") + content := streamLine(`{"type":"assistant","message":{"content":[{"type":"text","text":"## Summary\nFirst summary."}]}}`) + + streamLine(`{"type":"assistant","message":{"content":[{"type":"text","text":"## Summary\nFinal summary."}]}}`) + if err := os.WriteFile(path, []byte(content), 0600); err != nil { + t.Fatal(err) + } + got := extractSummary(path) + want := "Final summary." + if got != want { + t.Errorf("got %q, want %q", got, want) + } +} diff --git a/internal/storage/db.go b/internal/storage/db.go index aaf1e09..b8a7085 100644 --- a/internal/storage/db.go +++ b/internal/storage/db.go @@ -81,6 +81,8 @@ func (s *DB) migrate() error { `ALTER TABLE tasks ADD COLUMN question_json TEXT`, `ALTER TABLE executions ADD COLUMN session_id TEXT`, `ALTER TABLE executions ADD COLUMN sandbox_dir TEXT`, + `ALTER TABLE tasks ADD COLUMN summary TEXT`, + `ALTER TABLE tasks ADD COLUMN interactions_json TEXT NOT NULL DEFAULT '[]'`, } for _, m := range migrations { if _, err := s.db.Exec(m); err != nil { @@ -129,13 +131,13 @@ func (s *DB) CreateTask(t *task.Task) error { // GetTask retrieves a task by ID. func (s *DB) GetTask(id string) (*task.Task, error) { - row := s.db.QueryRow(`SELECT id, name, description, config_json, priority, timeout_ns, retry_json, tags_json, depends_on_json, parent_task_id, state, created_at, updated_at, rejection_comment, question_json FROM tasks WHERE id = ?`, id) + row := s.db.QueryRow(`SELECT id, name, description, config_json, priority, timeout_ns, retry_json, tags_json, depends_on_json, parent_task_id, state, created_at, updated_at, rejection_comment, question_json, summary, interactions_json FROM tasks WHERE id = ?`, id) return scanTask(row) } // ListTasks returns tasks matching the given filter. func (s *DB) ListTasks(filter TaskFilter) ([]*task.Task, error) { - query := `SELECT id, name, description, config_json, priority, timeout_ns, retry_json, tags_json, depends_on_json, parent_task_id, state, created_at, updated_at, rejection_comment, question_json FROM tasks WHERE 1=1` + query := `SELECT id, name, description, config_json, priority, timeout_ns, retry_json, tags_json, depends_on_json, parent_task_id, state, created_at, updated_at, rejection_comment, question_json, summary, interactions_json FROM tasks WHERE 1=1` var args []interface{} if filter.State != "" { @@ -167,7 +169,7 @@ func (s *DB) ListTasks(filter TaskFilter) ([]*task.Task, error) { // ListSubtasks returns all tasks whose parent_task_id matches the given ID. func (s *DB) ListSubtasks(parentID string) ([]*task.Task, error) { - rows, err := s.db.Query(`SELECT id, name, description, config_json, priority, timeout_ns, retry_json, tags_json, depends_on_json, parent_task_id, state, created_at, updated_at, rejection_comment, question_json FROM tasks WHERE parent_task_id = ? ORDER BY created_at ASC`, parentID) + rows, err := s.db.Query(`SELECT id, name, description, config_json, priority, timeout_ns, retry_json, tags_json, depends_on_json, parent_task_id, state, created_at, updated_at, rejection_comment, question_json, summary, interactions_json FROM tasks WHERE parent_task_id = ? ORDER BY created_at ASC`, parentID) if err != nil { return nil, err } @@ -220,7 +222,7 @@ func (s *DB) ResetTaskForRetry(id string) (*task.Task, error) { } defer tx.Rollback() //nolint:errcheck - t, err := scanTask(tx.QueryRow(`SELECT id, name, description, config_json, priority, timeout_ns, retry_json, tags_json, depends_on_json, parent_task_id, state, created_at, updated_at, rejection_comment, question_json FROM tasks WHERE id = ?`, id)) + t, err := scanTask(tx.QueryRow(`SELECT id, name, description, config_json, priority, timeout_ns, retry_json, tags_json, depends_on_json, parent_task_id, state, created_at, updated_at, rejection_comment, question_json, summary, interactions_json FROM tasks WHERE id = ?`, id)) if err != nil { if err == sql.ErrNoRows { return nil, fmt.Errorf("task %q not found", id) @@ -355,6 +357,8 @@ type Execution struct { // In-memory only: set when creating a resume execution, not stored in DB. ResumeSessionID string ResumeAnswer string + // In-memory only: populated by the runner after successful execution. + Summary string } // CreateExecution inserts an execution record. @@ -516,6 +520,48 @@ func (s *DB) UpdateTaskQuestion(taskID, questionJSON string) error { return err } +// UpdateTaskSummary stores the agent's final summary paragraph on a task. +func (s *DB) UpdateTaskSummary(taskID, summary string) error { + _, err := s.db.Exec(`UPDATE tasks SET summary = ?, updated_at = ? WHERE id = ?`, + summary, time.Now().UTC(), taskID) + return err +} + +// AppendTaskInteraction appends a Q&A interaction to the task's interaction history. +func (s *DB) AppendTaskInteraction(taskID string, interaction task.Interaction) error { + tx, err := s.db.Begin() + if err != nil { + return err + } + defer tx.Rollback() //nolint:errcheck + + var raw sql.NullString + if err := tx.QueryRow(`SELECT interactions_json FROM tasks WHERE id = ?`, taskID).Scan(&raw); err != nil { + if err == sql.ErrNoRows { + return fmt.Errorf("task %q not found", taskID) + } + return err + } + existing := raw.String + if existing == "" { + existing = "[]" + } + var interactions []task.Interaction + if err := json.Unmarshal([]byte(existing), &interactions); err != nil { + return fmt.Errorf("unmarshaling interactions: %w", err) + } + interactions = append(interactions, interaction) + updated, err := json.Marshal(interactions) + if err != nil { + return fmt.Errorf("marshaling interactions: %w", err) + } + if _, err := tx.Exec(`UPDATE tasks SET interactions_json = ?, updated_at = ? WHERE id = ?`, + string(updated), time.Now().UTC(), taskID); err != nil { + return err + } + return tx.Commit() +} + // UpdateExecution updates a completed execution. func (s *DB) UpdateExecution(e *Execution) error { _, err := s.db.Exec(` @@ -545,11 +591,14 @@ func scanTask(row scanner) (*task.Task, error) { parentTaskID sql.NullString rejectionComment sql.NullString questionJSON sql.NullString + summary sql.NullString + interactionsJSON sql.NullString ) - err := row.Scan(&t.ID, &t.Name, &t.Description, &configJSON, &priority, &timeoutNS, &retryJSON, &tagsJSON, &depsJSON, &parentTaskID, &state, &t.CreatedAt, &t.UpdatedAt, &rejectionComment, &questionJSON) + err := row.Scan(&t.ID, &t.Name, &t.Description, &configJSON, &priority, &timeoutNS, &retryJSON, &tagsJSON, &depsJSON, &parentTaskID, &state, &t.CreatedAt, &t.UpdatedAt, &rejectionComment, &questionJSON, &summary, &interactionsJSON) t.ParentTaskID = parentTaskID.String t.RejectionComment = rejectionComment.String t.QuestionJSON = questionJSON.String + t.Summary = summary.String if err != nil { return nil, err } @@ -568,6 +617,13 @@ func scanTask(row scanner) (*task.Task, error) { if err := json.Unmarshal([]byte(depsJSON), &t.DependsOn); err != nil { return nil, fmt.Errorf("unmarshaling depends_on: %w", err) } + raw := interactionsJSON.String + if raw == "" { + raw = "[]" + } + if err := json.Unmarshal([]byte(raw), &t.Interactions); err != nil { + return nil, fmt.Errorf("unmarshaling interactions: %w", err) + } return &t, nil } diff --git a/internal/task/task.go b/internal/task/task.go index 9968b15..2c57922 100644 --- a/internal/task/task.go +++ b/internal/task/task.go @@ -48,6 +48,14 @@ type RetryConfig struct { Backoff string `yaml:"backoff" json:"backoff"` // "linear", "exponential" } +// Interaction records a single question/answer exchange between an agent and the user. +type Interaction struct { + QuestionText string `json:"question_text"` + Options []string `json:"options,omitempty"` + Answer string `json:"answer,omitempty"` + AskedAt time.Time `json:"asked_at"` +} + type Task struct { ID string `yaml:"id" json:"id"` ParentTaskID string `yaml:"parent_task_id" json:"parent_task_id"` @@ -59,11 +67,13 @@ type Task struct { Priority Priority `yaml:"priority" json:"priority"` Tags []string `yaml:"tags" json:"tags"` DependsOn []string `yaml:"depends_on" json:"depends_on"` - State State `yaml:"-" json:"state"` - RejectionComment string `yaml:"-" json:"rejection_comment,omitempty"` - QuestionJSON string `yaml:"-" json:"question,omitempty"` - CreatedAt time.Time `yaml:"-" json:"created_at"` - UpdatedAt time.Time `yaml:"-" json:"updated_at"` + State State `yaml:"-" json:"state"` + RejectionComment string `yaml:"-" json:"rejection_comment,omitempty"` + QuestionJSON string `yaml:"-" json:"question,omitempty"` + Summary string `yaml:"-" json:"summary,omitempty"` + Interactions []Interaction `yaml:"-" json:"interactions,omitempty"` + CreatedAt time.Time `yaml:"-" json:"created_at"` + UpdatedAt time.Time `yaml:"-" json:"updated_at"` } // Duration wraps time.Duration for YAML unmarshaling from strings like "30m". @@ -94,27 +104,24 @@ type BatchFile struct { } // validTransitions maps each state to the set of states it may transition into. -// Terminal state COMPLETED has no outgoing edges. +// COMPLETED is the only true terminal state (no outgoing edges). // CANCELLED, FAILED, TIMED_OUT, and BUDGET_EXCEEDED all allow re-entry at QUEUED // (restart or retry). -var validTransitions = map[State][]State{ - StatePending: {StateQueued, StateCancelled}, - StateQueued: {StateRunning, StateCancelled}, - StateRunning: {StateReady, StateCompleted, StateFailed, StateTimedOut, StateCancelled, StateBudgetExceeded, StateBlocked}, - StateReady: {StateCompleted, StatePending}, - StateFailed: {StateQueued}, // retry - StateTimedOut: {StateQueued}, // retry or resume - StateCancelled: {StateQueued}, // restart - StateBudgetExceeded: {StateQueued}, // retry - StateBlocked: {StateQueued, StateReady}, +// READY may go back to PENDING on user rejection. +// BLOCKED may advance to READY when all subtasks complete, or back to QUEUED on user answer. +var validTransitions = map[State]map[State]bool{ + StatePending: {StateQueued: true, StateCancelled: true}, + StateQueued: {StateRunning: true, StateCancelled: true}, + StateRunning: {StateReady: true, StateCompleted: true, StateFailed: true, StateTimedOut: true, StateCancelled: true, StateBudgetExceeded: true, StateBlocked: true}, + StateReady: {StateCompleted: true, StatePending: true}, + StateFailed: {StateQueued: true}, // retry + StateTimedOut: {StateQueued: true}, // retry or resume + StateCancelled: {StateQueued: true}, // restart + StateBudgetExceeded: {StateQueued: true}, // retry + StateBlocked: {StateQueued: true, StateReady: true}, } // ValidTransition returns true if moving from the current state to next is allowed. func ValidTransition(from, to State) bool { - for _, allowed := range validTransitions[from] { - if allowed == to { - return true - } - } - return false + return validTransitions[from][to] } @@ -119,8 +119,11 @@ function createTaskCard(task) { } // Footer: action buttons based on state - const RESTART_STATES = new Set(['FAILED', 'CANCELLED', 'BUDGET_EXCEEDED']); - if (task.state === 'PENDING' || task.state === 'RUNNING' || task.state === 'READY' || task.state === 'BLOCKED' || task.state === 'TIMED_OUT' || RESTART_STATES.has(task.state)) { + // Interrupted states (CANCELLED, FAILED, BUDGET_EXCEEDED) show both Resume and Restart. + // TIMED_OUT shows Resume only. Others show a single action. + const RESUME_STATES = new Set(['TIMED_OUT', 'CANCELLED', 'FAILED', 'BUDGET_EXCEEDED']); + const RESTART_STATES = new Set(['CANCELLED', 'FAILED', 'BUDGET_EXCEEDED']); + if (task.state === 'PENDING' || task.state === 'RUNNING' || task.state === 'READY' || task.state === 'BLOCKED' || RESUME_STATES.has(task.state)) { const footer = document.createElement('div'); footer.className = 'task-card-footer'; @@ -161,24 +164,25 @@ function createTaskCard(task) { footer.appendChild(rejectBtn); } else if (task.state === 'BLOCKED') { renderQuestionFooter(task, footer); - } else if (task.state === 'TIMED_OUT') { - const btn = document.createElement('button'); - btn.className = 'btn-resume'; - btn.textContent = 'Resume'; - btn.addEventListener('click', (e) => { - e.stopPropagation(); - handleResume(task.id, btn, footer); - }); - footer.appendChild(btn); - } else if (RESTART_STATES.has(task.state)) { - const btn = document.createElement('button'); - btn.className = 'btn-restart'; - btn.textContent = 'Restart'; - btn.addEventListener('click', (e) => { + } else if (RESUME_STATES.has(task.state)) { + const resumeBtn = document.createElement('button'); + resumeBtn.className = 'btn-resume'; + resumeBtn.textContent = 'Resume'; + resumeBtn.addEventListener('click', (e) => { e.stopPropagation(); - handleRestart(task.id, btn, footer); + handleResume(task.id, resumeBtn, footer); }); - footer.appendChild(btn); + footer.appendChild(resumeBtn); + if (RESTART_STATES.has(task.state)) { + const restartBtn = document.createElement('button'); + restartBtn.className = 'btn-restart'; + restartBtn.textContent = 'Restart'; + restartBtn.addEventListener('click', (e) => { + e.stopPropagation(); + handleRestart(task.id, restartBtn, footer); + }); + footer.appendChild(restartBtn); + } } card.appendChild(footer); @@ -1353,6 +1357,50 @@ function renderTaskPanel(task, executions) { const content = document.getElementById('task-panel-content'); content.innerHTML = ''; + // ── Summary ── + if (task.summary) { + const summarySection = makeSection('Summary'); + const summaryEl = document.createElement('p'); + summaryEl.className = 'task-summary'; + summaryEl.textContent = task.summary; + summarySection.appendChild(summaryEl); + content.appendChild(summarySection); + } + + // ── Q&A History ── + if (task.interactions && task.interactions.length > 0) { + const qaSection = makeSection('Q&A History'); + const qaList = document.createElement('div'); + qaList.className = 'qa-list'; + for (const interaction of task.interactions) { + const qaItem = document.createElement('div'); + qaItem.className = 'qa-item'; + + const qEl = document.createElement('div'); + qEl.className = 'qa-question'; + qEl.textContent = interaction.question_text || '(question)'; + qaItem.appendChild(qEl); + + if (interaction.options && interaction.options.length > 0) { + const opts = document.createElement('div'); + opts.className = 'qa-options'; + opts.textContent = 'Options: ' + interaction.options.join(', '); + qaItem.appendChild(opts); + } + + if (interaction.answer) { + const aEl = document.createElement('div'); + aEl.className = 'qa-answer'; + aEl.textContent = interaction.answer; + qaItem.appendChild(aEl); + } + + qaList.appendChild(qaItem); + } + qaSection.appendChild(qaList); + content.appendChild(qaSection); + } + // ── Overview ── const overview = makeSection('Overview'); const overviewGrid = document.createElement('div'); diff --git a/web/style.css b/web/style.css index feedcce..67e4962 100644 --- a/web/style.css +++ b/web/style.css @@ -1200,6 +1200,44 @@ dialog label select:focus { text-align: right; } +/* ── Task Summary + Q&A History ────────────────────────────── */ + +.task-summary { + color: var(--text); + line-height: 1.6; + margin: 0; + white-space: pre-wrap; +} + +.qa-list { + display: flex; + flex-direction: column; + gap: 0.75rem; +} + +.qa-item { + border-left: 3px solid var(--border); + padding: 0.5rem 0.75rem; + display: flex; + flex-direction: column; + gap: 0.25rem; +} + +.qa-question { + font-weight: 500; + color: var(--text); +} + +.qa-options { + font-size: 0.82rem; + color: var(--text-muted, #94a3b8); +} + +.qa-answer { + color: var(--accent, #60a5fa); + font-style: italic; +} + /* ── Stats tab ────────────────────────────────────────────────────────────── */ [data-panel="stats"] { diff --git a/web/test/task-actions.test.mjs b/web/test/task-actions.test.mjs index c7d666b..a1790fa 100644 --- a/web/test/task-actions.test.mjs +++ b/web/test/task-actions.test.mjs @@ -6,78 +6,101 @@ import { describe, it } from 'node:test'; import assert from 'node:assert/strict'; // ── Logic under test ────────────────────────────────────────────────────────── +// +// Interrupted states (CANCELLED, FAILED, BUDGET_EXCEEDED) get BOTH a Resume +// button and a Restart button. TIMED_OUT gets Resume only. + +const RESUME_STATES = new Set(['TIMED_OUT', 'CANCELLED', 'FAILED', 'BUDGET_EXCEEDED']); +const RESTART_STATES = new Set(['CANCELLED', 'FAILED', 'BUDGET_EXCEEDED']); + +function getCardActions(state) { + const actions = []; + if (state === 'PENDING') return ['run']; + if (state === 'RUNNING') return ['cancel']; + if (state === 'READY') return ['approve']; + if (RESUME_STATES.has(state)) actions.push('resume'); + if (RESTART_STATES.has(state)) actions.push('restart'); + return actions.length ? actions : [null]; +} -const RESTART_STATES = new Set(['FAILED', 'CANCELLED', 'BUDGET_EXCEEDED']); - -function getCardAction(state) { - if (state === 'PENDING') return 'run'; - if (state === 'RUNNING') return 'cancel'; - if (state === 'READY') return 'approve'; - if (state === 'TIMED_OUT') return 'resume'; - if (RESTART_STATES.has(state)) return 'restart'; - return null; +function getResumeEndpoint(state) { + return RESUME_STATES.has(state) ? '/resume' : null; } -function getApiEndpoint(state) { - if (state === 'TIMED_OUT') return '/resume'; - if (RESTART_STATES.has(state)) return '/run'; - return null; +function getRestartEndpoint(state) { + return RESTART_STATES.has(state) ? '/run' : null; } // ── Tests ───────────────────────────────────────────────────────────────────── describe('task card action buttons', () => { it('shows Run button for PENDING', () => { - assert.equal(getCardAction('PENDING'), 'run'); + assert.deepEqual(getCardActions('PENDING'), ['run']); }); it('shows Cancel button for RUNNING', () => { - assert.equal(getCardAction('RUNNING'), 'cancel'); + assert.deepEqual(getCardActions('RUNNING'), ['cancel']); }); - it('shows Restart button for FAILED', () => { - assert.equal(getCardAction('FAILED'), 'restart'); + it('shows Resume AND Restart buttons for FAILED', () => { + assert.deepEqual(getCardActions('FAILED'), ['resume', 'restart']); }); - it('shows Resume button for TIMED_OUT', () => { - assert.equal(getCardAction('TIMED_OUT'), 'resume'); + it('shows Resume AND Restart buttons for CANCELLED', () => { + assert.deepEqual(getCardActions('CANCELLED'), ['resume', 'restart']); }); - it('shows Restart button for CANCELLED', () => { - assert.equal(getCardAction('CANCELLED'), 'restart'); + it('shows Resume AND Restart buttons for BUDGET_EXCEEDED', () => { + assert.deepEqual(getCardActions('BUDGET_EXCEEDED'), ['resume', 'restart']); }); - it('shows Restart button for BUDGET_EXCEEDED', () => { - assert.equal(getCardAction('BUDGET_EXCEEDED'), 'restart'); + it('shows Resume button only for TIMED_OUT (no restart)', () => { + assert.deepEqual(getCardActions('TIMED_OUT'), ['resume']); }); it('shows approve buttons for READY', () => { - assert.equal(getCardAction('READY'), 'approve'); + assert.deepEqual(getCardActions('READY'), ['approve']); }); it('shows no button for COMPLETED', () => { - assert.equal(getCardAction('COMPLETED'), null); + assert.deepEqual(getCardActions('COMPLETED'), [null]); }); it('shows no button for QUEUED', () => { - assert.equal(getCardAction('QUEUED'), null); + assert.deepEqual(getCardActions('QUEUED'), [null]); }); }); describe('task action API endpoints', () => { it('TIMED_OUT uses /resume endpoint', () => { - assert.equal(getApiEndpoint('TIMED_OUT'), '/resume'); + assert.equal(getResumeEndpoint('TIMED_OUT'), '/resume'); + }); + + it('CANCELLED uses /resume endpoint for resume', () => { + assert.equal(getResumeEndpoint('CANCELLED'), '/resume'); + }); + + it('FAILED uses /resume endpoint for resume', () => { + assert.equal(getResumeEndpoint('FAILED'), '/resume'); + }); + + it('BUDGET_EXCEEDED uses /resume endpoint for resume', () => { + assert.equal(getResumeEndpoint('BUDGET_EXCEEDED'), '/resume'); + }); + + it('CANCELLED uses /run endpoint for restart', () => { + assert.equal(getRestartEndpoint('CANCELLED'), '/run'); }); - it('FAILED uses /run endpoint', () => { - assert.equal(getApiEndpoint('FAILED'), '/run'); + it('FAILED uses /run endpoint for restart', () => { + assert.equal(getRestartEndpoint('FAILED'), '/run'); }); - it('CANCELLED uses /run endpoint', () => { - assert.equal(getApiEndpoint('CANCELLED'), '/run'); + it('BUDGET_EXCEEDED uses /run endpoint for restart', () => { + assert.equal(getRestartEndpoint('BUDGET_EXCEEDED'), '/run'); }); - it('BUDGET_EXCEEDED uses /run endpoint', () => { - assert.equal(getApiEndpoint('BUDGET_EXCEEDED'), '/run'); + it('TIMED_OUT has no /run restart endpoint', () => { + assert.equal(getRestartEndpoint('TIMED_OUT'), null); }); }); |
