diff options
| author | Peter Stone <thepeterstone@gmail.com> | 2026-05-13 04:02:20 +0000 |
|---|---|---|
| committer | Peter Stone <thepeterstone@gmail.com> | 2026-05-13 04:02:20 +0000 |
| commit | 68399a598924775a3ec22a39c2336ae497fb07f3 (patch) | |
| tree | 29ade8224eb51eca47a1d9d03bb4d0d3653a72aa /internal/retry | |
| parent | f01231cc45f41ce2dc37072e77428e467ef3fc15 (diff) | |
| parent | d970c0730ff0dc7d714d3261197d8ba52b5d21f4 (diff) | |
Merges 12 commits from github/main (formerly master) that were developed
independently. Key additions:
- LocalRunner: OpenAI-compatible local LLM execution (Ollama, LM Studio)
- Real GeminiRunner with full sandbox parity to ClaudeRunner
- llm.Client for enriching CI failures and elaboration via local model
- retry.ParseRetryAfter moved to shared package
- tokens_in/tokens_out columns in executions table
Conflict resolutions:
- Kept local main's VAPID/push, stories, projects, agent events schema
- Merged both sets of Config fields (local + LocalModel from github/main)
- Unified activePerAgent accounting (decActiveAgent helper)
- Removed duplicate helpers from claude.go (now in helpers.go)
- Fixed double-decrement bug in handleRunResult vs decActiveAgent
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Diffstat (limited to 'internal/retry')
| -rw-r--r-- | internal/retry/backoff.go | 77 | ||||
| -rw-r--r-- | internal/retry/backoff_test.go | 169 |
2 files changed, 246 insertions, 0 deletions
// ---------------------------------------------------------------------------
// internal/retry/backoff.go (new file, package retry)
// Imports: context, fmt, regexp, strconv, strings, time.
// ---------------------------------------------------------------------------

// Package retry provides exponential-backoff retry helpers used across the
// codebase for rate-limit-aware HTTP/subprocess calls.

// retryAfterRe matches "retry-after", "retry_after" or "retry after"
// (case-insensitive), followed by colons/whitespace and a run of digits.
// The digits are captured as group 1.
var retryAfterRe = regexp.MustCompile(`(?i)retry[-_ ]after[:\s]+(\d+)`)

const (
	// maxBackoffDelay caps the delay computed by exponential backoff. It does
	// not cap an explicit Retry-After hint found in the error message.
	maxBackoffDelay = 5 * time.Minute

	// maxRetryAfterSecs is the largest whole number of seconds that still
	// fits in a time.Duration (int64 nanoseconds).
	maxRetryAfterSecs = int64((1<<63 - 1) / time.Second)
)

// IsRateLimitError returns true if err looks like a transient rate-limit
// (e.g. HTTP 429, "too many requests", "overloaded") that is worth retrying.
//
// Detection is a case-insensitive substring match on err.Error(); a nil error
// is never a rate-limit error.
func IsRateLimitError(err error) bool {
	if err == nil {
		return false
	}
	msg := strings.ToLower(err.Error())
	for _, marker := range []string{"rate limit", "too many requests", "429", "overloaded"} {
		if strings.Contains(msg, marker) {
			return true
		}
	}
	return false
}

// ParseRetryAfter extracts a Retry-After duration from an error message.
//
// The value is interpreted as a whole number of seconds. It returns 0 if no
// retry-after value is found, if the value is not positive, or if it is too
// large to be represented as a time.Duration.
func ParseRetryAfter(msg string) time.Duration {
	m := retryAfterRe.FindStringSubmatch(msg)
	if m == nil {
		return 0
	}
	secs, err := strconv.Atoi(m[1])
	if err != nil || secs <= 0 {
		return 0
	}
	// Multiplying an oversized value by time.Second would wrap around int64
	// and produce a garbage duration; treat it as "no usable hint" instead.
	if int64(secs) > maxRetryAfterSecs {
		return 0
	}
	return time.Duration(secs) * time.Second
}

// backoffDelay returns base doubled attempt times, capped at maxBackoffDelay.
//
// It doubles step by step and stops as soon as the cap is reached, so the
// result cannot overflow regardless of how large attempt or base is. A
// non-positive base yields 0 (no wait).
func backoffDelay(base time.Duration, attempt int) time.Duration {
	if base <= 0 {
		return 0
	}
	delay := base
	for i := 0; i < attempt && delay < maxBackoffDelay; i++ {
		if delay > maxBackoffDelay/2 {
			// One more doubling would exceed the cap.
			return maxBackoffDelay
		}
		delay *= 2
	}
	if delay > maxBackoffDelay {
		return maxBackoffDelay
	}
	return delay
}

// RunWithBackoff calls fn repeatedly on rate-limit errors, using exponential backoff.
//
// maxRetries is the max number of retry attempts (not counting the initial
// call); a negative value is treated as 0, so fn is always called at least
// once. baseDelay is the initial backoff duration (doubled each retry, capped
// at maxBackoffDelay). If the error message carries a Retry-After hint (see
// ParseRetryAfter), that hint replaces the computed delay and is not capped.
//
// It returns nil as soon as fn succeeds, fn's error unchanged when it is not
// a rate-limit error or when retries are exhausted, and an error wrapping
// ctx.Err() if ctx is done while waiting between attempts.
func RunWithBackoff(ctx context.Context, maxRetries int, baseDelay time.Duration, fn func() error) error {
	// A negative retry budget must not skip the initial call: otherwise fn
	// would never run and the caller would see a nil (success) result.
	if maxRetries < 0 {
		maxRetries = 0
	}

	for attempt := 0; ; attempt++ {
		err := fn()
		if err == nil {
			return nil
		}
		if !IsRateLimitError(err) || attempt >= maxRetries {
			return err
		}

		delay := backoffDelay(baseDelay, attempt)
		if ra := ParseRetryAfter(err.Error()); ra > 0 {
			delay = ra
		}

		// Use an explicit timer so it can be released immediately when the
		// context wins the race, instead of lingering until it fires.
		timer := time.NewTimer(delay)
		select {
		case <-ctx.Done():
			timer.Stop()
			return fmt.Errorf("context cancelled during rate-limit backoff: %w", ctx.Err())
		case <-timer.C:
		}
	}
}

// ---------------------------------------------------------------------------
// internal/retry/backoff_test.go (new file, package retry)
// Imports: context, errors, fmt, testing, time.
// ---------------------------------------------------------------------------

// --- IsRateLimitError tests ---

func TestIsRateLimitError_RateLimitMessage(t *testing.T) {
	err := errors.New("claude exited with error: rate limit exceeded")
	if !IsRateLimitError(err) {
		t.Error("want true for 'rate limit exceeded', got false")
	}
}

func TestIsRateLimitError_TooManyRequests(t *testing.T) {
	err := errors.New("too many requests to the API")
	if !IsRateLimitError(err) {
		t.Error("want true for 'too many requests', got false")
	}
}

func TestIsRateLimitError_HTTP429(t *testing.T) {
	err := errors.New("API returned status 429")
	if !IsRateLimitError(err) {
		t.Error("want true for '429', got false")
	}
}

func TestIsRateLimitError_Overloaded(t *testing.T) {
	err := errors.New("API overloaded, please retry later")
	if !IsRateLimitError(err) {
		t.Error("want true for 'overloaded', got false")
	}
}

func TestIsRateLimitError_NonRateLimitError(t *testing.T) {
	err := errors.New("claude exited with error: exit status 1")
	if IsRateLimitError(err) {
		t.Error("want false for non-rate-limit error, got true")
	}
}

func TestIsRateLimitError_NilError(t *testing.T) {
	if IsRateLimitError(nil) {
		t.Error("want false for nil error, got true")
	}
}

// --- ParseRetryAfter tests ---

func TestParseRetryAfter_RetryAfterSeconds(t *testing.T) {
	msg := "rate limit exceeded, retry after 30 seconds"
	d := ParseRetryAfter(msg)
	if d != 30*time.Second {
		t.Errorf("want 30s, got %v", d)
	}
}

func TestParseRetryAfter_RetryAfterHeader(t *testing.T) {
	msg := "rate_limit_error: retry-after: 60"
	d := ParseRetryAfter(msg)
	if d != 60*time.Second {
		t.Errorf("want 60s, got %v", d)
	}
}

func TestParseRetryAfter_NoRetryInfo(t *testing.T) {
	msg := "rate limit exceeded"
	d := ParseRetryAfter(msg)
	if d != 0 {
		t.Errorf("want 0, got %v", d)
	}
}

func TestParseRetryAfter_UnrepresentableValue(t *testing.T) {
	// 99999999999999 s does not fit in a time.Duration; it must be ignored
	// rather than wrapped into an arbitrary value.
	d := ParseRetryAfter("retry after 99999999999999")
	if d != 0 {
		t.Errorf("want 0 for unrepresentable value, got %v", d)
	}
}

// --- backoffDelay tests ---

func TestBackoffDelay_CapsWithoutOverflow(t *testing.T) {
	if d := backoffDelay(time.Millisecond, 3); d != 8*time.Millisecond {
		t.Errorf("want 8ms, got %v", d)
	}
	for _, attempt := range []int{9, 62, 63, 64, 1 << 20} {
		if d := backoffDelay(time.Second, attempt); d != maxBackoffDelay {
			t.Errorf("attempt %d: want %v, got %v", attempt, maxBackoffDelay, d)
		}
	}
}

// --- RunWithBackoff tests ---

func TestRunWithBackoff_SuccessOnFirstTry(t *testing.T) {
	calls := 0
	fn := func() error {
		calls++
		return nil
	}
	err := RunWithBackoff(context.Background(), 3, time.Millisecond, fn)
	if err != nil {
		t.Errorf("want nil error, got %v", err)
	}
	if calls != 1 {
		t.Errorf("want 1 call, got %d", calls)
	}
}

func TestRunWithBackoff_RetriesOnRateLimit(t *testing.T) {
	calls := 0
	fn := func() error {
		calls++
		if calls < 3 {
			return fmt.Errorf("rate limit exceeded")
		}
		return nil
	}
	err := RunWithBackoff(context.Background(), 3, time.Millisecond, fn)
	if err != nil {
		t.Errorf("want nil error, got %v", err)
	}
	if calls != 3 {
		t.Errorf("want 3 calls, got %d", calls)
	}
}

func TestRunWithBackoff_GivesUpAfterMaxRetries(t *testing.T) {
	calls := 0
	rateLimitErr := fmt.Errorf("rate limit exceeded")
	fn := func() error {
		calls++
		return rateLimitErr
	}
	err := RunWithBackoff(context.Background(), 3, time.Millisecond, fn)
	if err == nil {
		t.Fatal("want error after max retries, got nil")
	}
	if calls != 4 {
		t.Errorf("want 4 calls (1 initial + 3 retries), got %d", calls)
	}
}

func TestRunWithBackoff_DoesNotRetryNonRateLimitError(t *testing.T) {
	calls := 0
	fn := func() error {
		calls++
		return fmt.Errorf("permission denied")
	}
	err := RunWithBackoff(context.Background(), 3, time.Millisecond, fn)
	if err == nil {
		t.Fatal("want error, got nil")
	}
	if calls != 1 {
		t.Errorf("want 1 call (no retry for non-rate-limit), got %d", calls)
	}
}

func TestRunWithBackoff_NegativeMaxRetries(t *testing.T) {
	calls := 0
	fn := func() error {
		calls++
		return fmt.Errorf("rate limit exceeded")
	}
	err := RunWithBackoff(context.Background(), -1, time.Millisecond, fn)
	if err == nil {
		t.Fatal("want error from the single attempt, got nil")
	}
	if calls != 1 {
		t.Errorf("want exactly 1 call with negative maxRetries, got %d", calls)
	}
}

func TestRunWithBackoff_ContextCancellation(t *testing.T) {
	ctx, cancel := context.WithCancel(context.Background())
	calls := 0

	fn := func() error {
		calls++
		cancel()
		return fmt.Errorf("rate limit exceeded")
	}

	start := time.Now()
	err := RunWithBackoff(ctx, 3, time.Second, fn)
	elapsed := time.Since(start)

	if err == nil {
		t.Fatal("want error on context cancellation, got nil")
	}
	if elapsed > 500*time.Millisecond {
		t.Errorf("context cancellation too slow: %v (want < 500ms)", elapsed)
	}
	if calls != 1 {
		t.Errorf("want 1 call before cancellation, got %d", calls)
	}
}
