From b9aba3d242482fa9cd42f2a49b2767a73d4d2213 Mon Sep 17 00:00:00 2001
From: Peter Stone <thepeterstone@gmail.com>
Date: Fri, 13 Mar 2026 03:14:53 +0000
Subject: feat: post-elaboration sanity check for tools, acceptance criteria,
 and dev practices

Add sanitizeElaboratedTask() called after every elaboration response:
- Infers missing allowed_tools from instruction keywords (Write/Edit/Read/Bash/Grep/Glob)
- Auto-adds Read when Edit is present
- Appends Acceptance Criteria section if none present
- Appends TDD reminder for coding tasks without test mention

Also tighten buildElaboratePrompt to require acceptance criteria and
list concrete tool examples, reducing how often the model omits tools.

Fixes class of failures where agents couldn't create files because
the elaborator omitted Write from allowed_tools.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 internal/api/elaborate.go      |  70 +++++++++++++++-
 internal/api/elaborate_test.go | 178 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 246 insertions(+), 2 deletions(-)

(limited to 'internal')

diff --git a/internal/api/elaborate.go b/internal/api/elaborate.go
index eb686bf..c6d08f4 100644
--- a/internal/api/elaborate.go
+++ b/internal/api/elaborate.go
@@ -9,6 +9,7 @@ import (
 	"os"
 	"os/exec"
 	"path/filepath"
+	"sort"
 	"strings"
 	"time"
 )
@@ -32,10 +33,10 @@ Output ONLY a valid JSON object matching this schema (no markdown fences, no pro
   "agent": {
     "type":            "claude" | "gemini",
     "model":           string  — "sonnet" for claude, "gemini-2.5-flash-lite" for gemini,
-    "instructions":    string  — detailed, step-by-step instructions for the agent,
+    "instructions":    string  — detailed, step-by-step instructions for the agent. Must end with a "## Acceptance Criteria" section listing measurable conditions that define success. For coding tasks, include TDD requirements (write failing tests first, then implement),
 ` + workDirLine + `
     "max_budget_usd":  number  — conservative estimate (0.25–5.00),
-    "allowed_tools":   array   — only tools the task genuinely needs
+    "allowed_tools":   array   — every tool the task genuinely needs. Include "Write" if creating files, "Edit" if modifying files, "Read" if reading files, "Bash" for shell/git/test commands, "Grep"/"Glob" for searching.
   },
   "timeout":  string  — e.g. "15m",
   "priority": string  — "normal" | "high" | "low",
@@ -62,6 +63,69 @@ type elaboratedAgent struct {
 	AllowedTools []string `json:"allowed_tools"`
 }
 
+// sanitizeElaboratedTask enforces tool completeness and dev practice compliance.
+// It modifies t in place, inferring missing tools from instruction keywords and
+// appending required sections when they are absent.
+func sanitizeElaboratedTask(t *elaboratedTask) {
+	lower := strings.ToLower(t.Agent.Instructions)
+
+	// Build current tool set.
+	toolSet := make(map[string]bool, len(t.Agent.AllowedTools))
+	for _, tool := range t.Agent.AllowedTools {
+		toolSet[tool] = true
+	}
+
+	// Infer missing tools from instruction keywords.
+	type rule struct {
+		tool     string
+		keywords []string
+	}
+	rules := []rule{
+		{"Write", []string{"create file", "write file", "new file", "write to", "save to", "output to", "generate file", "creates a file", "create a new file"}},
+		{"Edit", []string{"edit", "modify", "refactor", "replace", "patch"}},
+		{"Read", []string{"read", "inspect", "examine", "look at the file"}},
+		{"Bash", []string{"run", "execute", "bash", "shell", "command", "build", "compile", "git", "install", "make"}},
+		{"Grep", []string{"search for", "grep", "find in", "locate in"}},
+		{"Glob", []string{"find file", "list file", "search file"}},
+	}
+	for _, r := range rules {
+		if toolSet[r.tool] {
+			continue
+		}
+		for _, kw := range r.keywords {
+			if strings.Contains(lower, kw) {
+				toolSet[r.tool] = true
+				break
+			}
+		}
+	}
+	// Edit without Read is almost always wrong.
+	if toolSet["Edit"] && !toolSet["Read"] {
+		toolSet["Read"] = true
+	}
+	// Rebuild the list only when tools were added.
+	if len(toolSet) > len(t.Agent.AllowedTools) {
+		tools := make([]string, 0, len(toolSet))
+		for tool := range toolSet {
+			tools = append(tools, tool)
+		}
+		sort.Strings(tools)
+		t.Agent.AllowedTools = tools
+	}
+
+	// Append an acceptance criteria section when none is present.
+	if !strings.Contains(lower, "acceptance") &&
+		!strings.Contains(lower, "done when") &&
+		!strings.Contains(lower, "success criteria") {
+		t.Agent.Instructions += "\n\n## Acceptance Criteria\nBefore finishing, verify all stated goals are met, tests pass (if applicable), and no unintended side effects were introduced."
+	}
+
+	// Append a TDD reminder for coding tasks that do not already mention tests.
+	if (toolSet["Edit"] || toolSet["Write"]) && !strings.Contains(lower, "test") {
+		t.Agent.Instructions += "\n\n## Dev Practices\nFollow TDD: write a failing test first, then implement the minimum code to make it pass. Commit all changes before finishing."
+	}
+}
+
 // claudeJSONResult is the top-level object returned by `claude --output-format json`.
 type claudeJSONResult struct {
 	Result string `json:"result"`
@@ -214,5 +278,7 @@ func (s *Server) handleElaborateTask(w http.ResponseWriter, r *http.Request) {
 		result.Agent.Type = "claude"
 	}
 
+	sanitizeElaboratedTask(&result)
+
 	writeJSON(w, http.StatusOK, result)
 }
diff --git a/internal/api/elaborate_test.go b/internal/api/elaborate_test.go
index 330c111..9ae2e98 100644
--- a/internal/api/elaborate_test.go
+++ b/internal/api/elaborate_test.go
@@ -30,6 +30,184 @@ func createFakeClaude(t *testing.T, output string, exitCode int) string {
 	return script
 }
 
+// hasTool is a test helper that reports whether name is in the tools slice.
+func hasTool(tools []string, name string) bool {
+	for _, t := range tools {
+		if t == name {
+			return true
+		}
+	}
+	return false
+}
+
+// --- sanitizeElaboratedTask unit tests ---
+
+func TestSanitize_AddsWriteWhenInstructionsMentionFileCreation(t *testing.T) {
+	task := &elaboratedTask{
+		Agent: elaboratedAgent{
+			Instructions: "Create a new file called output.txt with the results.",
+			AllowedTools: []string{"Bash"},
+		},
+	}
+	sanitizeElaboratedTask(task)
+	if !hasTool(task.Agent.AllowedTools, "Write") {
+		t.Errorf("expected Write in allowed_tools, got %v", task.Agent.AllowedTools)
+	}
+}
+
+func TestSanitize_AddsReadWhenEditIsPresent(t *testing.T) {
+	task := &elaboratedTask{
+		Agent: elaboratedAgent{
+			Instructions: "Modify the configuration file.",
+			AllowedTools: []string{"Edit"},
+		},
+	}
+	sanitizeElaboratedTask(task)
+	if !hasTool(task.Agent.AllowedTools, "Read") {
+		t.Errorf("expected Read added alongside Edit, got %v", task.Agent.AllowedTools)
+	}
+}
+
+func TestSanitize_NoDuplicateTools(t *testing.T) {
+	task := &elaboratedTask{
+		Agent: elaboratedAgent{
+			Instructions: "Run go test ./...",
+			AllowedTools: []string{"Bash"},
+		},
+	}
+	sanitizeElaboratedTask(task)
+	count := 0
+	for _, tool := range task.Agent.AllowedTools {
+		if tool == "Bash" {
+			count++
+		}
+	}
+	if count != 1 {
+		t.Errorf("Bash duplicated in allowed_tools: %v", task.Agent.AllowedTools)
+	}
+}
+
+func TestSanitize_AddsAcceptanceCriteriaWhenMissing(t *testing.T) {
+	task := &elaboratedTask{
+		Agent: elaboratedAgent{
+			Instructions: "Do something useful with the codebase.",
+			AllowedTools: []string{"Bash"},
+		},
+	}
+	sanitizeElaboratedTask(task)
+	lower := strings.ToLower(task.Agent.Instructions)
+	if !strings.Contains(lower, "acceptance") && !strings.Contains(lower, "done when") {
+		t.Error("expected acceptance criteria section appended to instructions")
+	}
+}
+
+func TestSanitize_NoopWhenAcceptanceCriteriaAlreadyPresent(t *testing.T) {
+	original := "Do something.\n\n## Acceptance Criteria\n- All tests pass."
+	task := &elaboratedTask{
+		Agent: elaboratedAgent{
+			Instructions: original,
+			AllowedTools: []string{"Bash"},
+		},
+	}
+	sanitizeElaboratedTask(task)
+	if task.Agent.Instructions != original {
+		t.Errorf("instructions were modified when acceptance criteria were already present")
+	}
+}
+
+func TestSanitize_AddsTDDReminderForCodingTaskWithoutTestMention(t *testing.T) {
+	task := &elaboratedTask{
+		Agent: elaboratedAgent{
+			Instructions: "## Acceptance Criteria\nFix the bug.\n\nModify the handler to return 404 instead of 500.",
+			AllowedTools: []string{"Edit", "Read"},
+		},
+	}
+	sanitizeElaboratedTask(task)
+	lower := strings.ToLower(task.Agent.Instructions)
+	if !strings.Contains(lower, "tdd") && !strings.Contains(lower, "test") {
+		t.Error("expected TDD reminder for coding task without test mention")
+	}
+}
+
+func TestSanitize_NoTDDReminderWhenTestsAlreadyMentioned(t *testing.T) {
+	original := "## Acceptance Criteria\nAll tests pass.\n\nEdit the file and run go test ./... to verify."
+	task := &elaboratedTask{
+		Agent: elaboratedAgent{
+			Instructions: original,
+			AllowedTools: []string{"Edit", "Read", "Bash"},
+		},
+	}
+	before := task.Agent.Instructions
+	sanitizeElaboratedTask(task)
+	// Should NOT add a second TDD block since tests are already mentioned.
+	// Count occurrences of "tdd" / "test" — just verify no double-append.
+	if strings.Count(strings.ToLower(task.Agent.Instructions), "tdd") > 1 {
+		t.Errorf("TDD block added twice; instructions:\n%s", task.Agent.Instructions)
+	}
+	_ = before
+}
+
+func TestElaboratePrompt_RequiresAcceptanceCriteria(t *testing.T) {
+	prompt := buildElaboratePrompt("")
+	lower := strings.ToLower(prompt)
+	if !strings.Contains(lower, "acceptance criteria") {
+		t.Error("elaborate prompt should instruct the model to include acceptance criteria")
+	}
+}
+
+func TestElaboratePrompt_RequiresAllRelevantTools(t *testing.T) {
+	prompt := buildElaboratePrompt("")
+	// Prompt must remind the model to include file-creating tools when needed.
+	if !strings.Contains(prompt, "Write") {
+		t.Error("elaborate prompt should mention the Write tool so models know to include it")
+	}
+}
+
+func TestElaborateTask_SanitizationAppliedToResponse(t *testing.T) {
+	srv, _ := testServer(t)
+
+	// Elaborator returns a task that needs Write (instructions say "create file")
+	// but does NOT include it in allowed_tools.
+	task := elaboratedTask{
+		Name:        "Generate report",
+		Description: "Creates a report file.",
+		Agent: elaboratedAgent{
+			Type:         "claude",
+			Model:        "sonnet",
+			Instructions: "Create a new file called report.md with the analysis results.\n\n## Acceptance Criteria\n- report.md exists.",
+			MaxBudgetUSD: 0.5,
+			AllowedTools: []string{"Bash"}, // Write intentionally missing
+		},
+		Timeout:  "15m",
+		Priority: "normal",
+		Tags:     []string{"report"},
+	}
+	taskJSON, _ := json.Marshal(task)
+	wrapper := map[string]string{"result": string(taskJSON)}
+	wrapperJSON, _ := json.Marshal(wrapper)
+
+	srv.elaborateCmdPath = createFakeClaude(t, string(wrapperJSON), 0)
+
+	body := `{"prompt":"generate a report"}`
+	req := httptest.NewRequest("POST", "/api/tasks/elaborate", bytes.NewBufferString(body))
+	req.Header.Set("Content-Type", "application/json")
+	w := httptest.NewRecorder()
+
+	srv.Handler().ServeHTTP(w, req)
+
+	if w.Code != http.StatusOK {
+		t.Fatalf("status: want 200, got %d; body: %s", w.Code, w.Body.String())
+	}
+
+	var result elaboratedTask
+	if err := json.NewDecoder(w.Body).Decode(&result); err != nil {
+		t.Fatalf("failed to decode response: %v", err)
+	}
+	if !hasTool(result.Agent.AllowedTools, "Write") {
+		t.Errorf("expected Write in sanitized allowed_tools, got %v", result.Agent.AllowedTools)
+	}
+}
+
 func TestElaboratePrompt_ContainsWorkDir(t *testing.T) {
 	prompt := buildElaboratePrompt("/some/custom/path")
 	if !strings.Contains(prompt, "/some/custom/path") {
-- 
cgit v1.2.3