feat: post-elaboration sanity check for tools, acceptance criteria, and dev practices

Add sanitizeElaboratedTask(), called after every elaboration response:

- Infers missing allowed_tools from instruction keywords (Write/Edit/Read/Bash/Grep/Glob)
- Auto-adds Read when Edit is present
- Appends an Acceptance Criteria section if none is present
- Appends a TDD reminder for coding tasks that do not mention tests

Also tighten buildElaboratePrompt to require acceptance criteria and list concrete tool examples, reducing how often the model omits tools.

Fixes a class of failures where agents couldn't create files because the elaborator omitted Write from allowed_tools.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
author: Peter Stone <thepeterstone@gmail.com> 2026-03-13 03:14:53 +0000
committer: Peter Stone <thepeterstone@gmail.com> 2026-03-13 03:14:53 +0000
commit: b9aba3d242482fa9cd42f2a49b2767a73d4d2213 (patch)
tree: 37bc9a8d598ace8eae2abd82ad9cf67fa7ef7dd1 /internal/api/elaborate.go
parent: 5303a68d67e435da863353cdce09fa2e3a8c2ccd (diff)
1 files changed, 68 insertions, 2 deletions
diff --git a/internal/api/elaborate.go b/internal/api/elaborate.go
index eb686bf..c6d08f4 100644
--- a/internal/api/elaborate.go
+++ b/internal/api/elaborate.go
@@ -9,6 +9,7 @@ import (
 	"os"
 	"os/exec"
 	"path/filepath"
+	"sort"
 	"strings"
 	"time"
 )
@@ -32,10 +33,10 @@ Output ONLY a valid JSON object matching this schema (no markdown fences, no pro
   "agent": {
     "type":            "claude" | "gemini",
     "model":           string  — "sonnet" for claude, "gemini-2.5-flash-lite" for gemini,
-    "instructions":    string  — detailed, step-by-step instructions for the agent,
+    "instructions":    string  — detailed, step-by-step instructions for the agent. Must end with a "## Acceptance Criteria" section listing measurable conditions that define success. For coding tasks, include TDD requirements (write failing tests first, then implement),
 ` + workDirLine + `
     "max_budget_usd":  number  — conservative estimate (0.25–5.00),
-    "allowed_tools":   array   — only tools the task genuinely needs
+    "allowed_tools":   array   — every tool the task genuinely needs. Include "Write" if creating files, "Edit" if modifying files, "Read" if reading files, "Bash" for shell/git/test commands, "Grep"/"Glob" for searching.
   },
   "timeout":  string  — e.g. "15m",
   "priority": string  — "normal" | "high" | "low",
@@ -62,6 +63,69 @@ type elaboratedAgent struct {
 	AllowedTools []string `json:"allowed_tools"`
 }
 
+// sanitizeElaboratedTask updates t in place after elaboration: it adds tools whose keywords occur in the instructions (plus Read
+// whenever Edit is allowed), appends a generic "## Acceptance Criteria" section when the instructions contain no acceptance marker,
+// and appends a "## Dev Practices" TDD reminder when Edit or Write is allowed but the instructions never contain "test".
+func sanitizeElaboratedTask(t *elaboratedTask) {
+	lower := strings.ToLower(t.Agent.Instructions) // snapshot taken before any section is appended below
+
+	// Index the tools already granted so a rule can be skipped when its tool is present.
+	toolSet := make(map[string]bool, len(t.Agent.AllowedTools))
+	for _, tool := range t.Agent.AllowedTools {
+		toolSet[tool] = true
+	}
+
+	// Infer missing tools: a rule fires when any of its keywords is a substring of the lowercased instructions.
+	type rule struct {
+		tool     string
+		keywords []string
+	}
+	rules := []rule{
+		{"Write", []string{"create file", "write file", "new file", "write to", "save to", "output to", "generate file", "creates a file", "create a new file"}},
+		{"Edit", []string{"edit", "modify", "refactor", "replace", "patch"}},
+		{"Read", []string{"read", "inspect", "examine", "look at the file"}}, // NOTE(review): plain substring match, so "read" also hits "thread" or "already"
+		{"Bash", []string{"run", "execute", "bash", "shell", "command", "build", "compile", "git", "install", "make"}},
+		{"Grep", []string{"search for", "grep", "find in", "locate in"}},
+		{"Glob", []string{"find file", "list file", "search file"}},
+	}
+	for _, r := range rules {
+		if toolSet[r.tool] {
+			continue
+		}
+		for _, kw := range r.keywords {
+			if strings.Contains(lower, kw) {
+				toolSet[r.tool] = true
+				break
+			}
+		}
+	}
+	// Edit without Read is almost always wrong, so grant Read alongside Edit.
+	if toolSet["Edit"] && !toolSet["Read"] {
+		toolSet["Read"] = true
+	}
+	// Rebuild the list (sorted) only when the set outgrew it. NOTE(review): duplicate entries in AllowedTools shrink the set and can hide an addition.
+	if len(toolSet) > len(t.Agent.AllowedTools) {
+		tools := make([]string, 0, len(toolSet))
+		for tool := range toolSet {
+			tools = append(tools, tool)
+		}
+		sort.Strings(tools)
+		t.Agent.AllowedTools = tools
+	}
+
+	// Append an acceptance criteria section when none of the marker phrases occurs in the original instructions.
+	if !strings.Contains(lower, "acceptance") &&
+		!strings.Contains(lower, "done when") &&
+		!strings.Contains(lower, "success criteria") {
+		t.Agent.Instructions += "\n\n## Acceptance Criteria\nBefore finishing, verify all stated goals are met, tests pass (if applicable), and no unintended side effects were introduced."
+	}
+
+	// Append a TDD reminder when Edit or Write is allowed and the snapshot lacks "test"; the section appended above is not part of the snapshot.
+	if (toolSet["Edit"] || toolSet["Write"]) && !strings.Contains(lower, "test") {
+		t.Agent.Instructions += "\n\n## Dev Practices\nFollow TDD: write a failing test first, then implement the minimum code to make it pass. Commit all changes before finishing."
+	}
+}
+
 // claudeJSONResult is the top-level object returned by `claude --output-format json`.
 type claudeJSONResult struct {
 	Result string `json:"result"`
@@ -214,5 +278,7 @@ func (s *Server) handleElaborateTask(w http.ResponseWriter, r *http.Request) {
 		result.Agent.Type = "claude"
 	}
 
+	sanitizeElaboratedTask(&result)
+
 	writeJSON(w, http.StatusOK, result)
 }
author	Peter Stone <thepeterstone@gmail.com>	2026-03-13 03:14:53 +0000
committer	Peter Stone <thepeterstone@gmail.com>	2026-03-13 03:14:53 +0000
commit	b9aba3d242482fa9cd42f2a49b2767a73d4d2213 (patch)
tree	37bc9a8d598ace8eae2abd82ad9cf67fa7ef7dd1 /internal/api/elaborate.go
parent	5303a68d67e435da863353cdce09fa2e3a8c2ccd (diff)