summaryrefslogtreecommitdiff
path: root/internal/retry
diff options
context:
space:
mode:
authorPeter Stone <thepeterstone@gmail.com>2026-05-01 22:14:37 -1000
committerGitHub <noreply@github.com>2026-05-01 22:14:37 -1000
commit99115d8158137083239c45e5a860b718ff4cefa1 (patch)
tree1bf3bd0505eea79375c67af83c7c5fe8c0f274ff /internal/retry
parentc2aa026f6ce1c9e216b99d74f294fc133d5fcddd (diff)
parent50f8fe8c1ff8b82e0bd399e5776e58bda3e57d1c (diff)
Merge pull request #1 from thepeterstone/claude/local-oss-model-agents-MEBqj
Local OSS models as a third runner (epic)
Diffstat (limited to 'internal/retry')
-rw-r--r--internal/retry/backoff.go77
-rw-r--r--internal/retry/backoff_test.go169
2 files changed, 246 insertions, 0 deletions
diff --git a/internal/retry/backoff.go b/internal/retry/backoff.go
new file mode 100644
index 0000000..b91abc4
--- /dev/null
+++ b/internal/retry/backoff.go
@@ -0,0 +1,77 @@
+// Package retry provides exponential-backoff retry helpers used across the
+// codebase for rate-limit-aware HTTP/subprocess calls.
+package retry
+
+import (
+ "context"
+ "fmt"
+ "regexp"
+ "strconv"
+ "strings"
+ "time"
+)
+
// maxBackoffDelay is the ceiling applied to the computed exponential backoff
// delay in RunWithBackoff.
const maxBackoffDelay = 5 * time.Minute

// retryAfterRe finds a "retry after" marker (case-insensitive; the two words
// may be joined by '-', '_' or a space), followed by one or more colons or
// whitespace characters, and captures the run of decimal digits that follows.
var retryAfterRe = regexp.MustCompile(`(?i)retry[-_ ]after[:\s]+(\d+)`)
+
// IsRateLimitError reports whether err looks like a transient rate-limit
// failure that is worth retrying.
//
// The decision is made purely from the lower-cased error text. An error
// matches when its message contains any of:
//   - "rate limit", "rate_limit", "rate-limit" or "ratelimit"
//     (covers prose as well as identifiers such as "rate_limit_error"),
//   - "too many requests",
//   - "overloaded",
//   - the number 429 standing on its own (not embedded in a longer run of
//     digits, so "exit status 1429" does not count).
//
// A nil error is never a rate-limit error.
func IsRateLimitError(err error) bool {
	if err == nil {
		return false
	}
	msg := strings.ToLower(err.Error())
	return strings.Contains(msg, "rate limit") ||
		strings.Contains(msg, "rate_limit") ||
		strings.Contains(msg, "rate-limit") ||
		strings.Contains(msg, "ratelimit") ||
		strings.Contains(msg, "too many requests") ||
		strings.Contains(msg, "overloaded") ||
		hasStandalone429(msg)
}

// hasStandalone429 reports whether msg contains "429" with no decimal digit
// immediately before or after it.
func hasStandalone429(msg string) bool {
	const code = "429"
	isDigit := func(c byte) bool { return c >= '0' && c <= '9' }

	for from := 0; ; {
		rel := strings.Index(msg[from:], code)
		if rel < 0 {
			return false
		}
		start := from + rel
		end := start + len(code)
		if (start == 0 || !isDigit(msg[start-1])) && (end == len(msg) || !isDigit(msg[end])) {
			return true
		}
		// This occurrence is part of a longer number; keep scanning after it.
		from = start + 1
	}
}
+
+// ParseRetryAfter extracts a Retry-After duration from an error message.
+// Returns 0 if no retry-after value is found.
+func ParseRetryAfter(msg string) time.Duration {
+ m := retryAfterRe.FindStringSubmatch(msg)
+ if m == nil {
+ return 0
+ }
+ secs, err := strconv.Atoi(m[1])
+ if err != nil || secs <= 0 {
+ return 0
+ }
+ return time.Duration(secs) * time.Second
+}
+
+// RunWithBackoff calls fn repeatedly on rate-limit errors, using exponential backoff.
+// maxRetries is the max number of retry attempts (not counting the initial call).
+// baseDelay is the initial backoff duration (doubled each retry).
+func RunWithBackoff(ctx context.Context, maxRetries int, baseDelay time.Duration, fn func() error) error {
+ var lastErr error
+ for attempt := 0; attempt <= maxRetries; attempt++ {
+ lastErr = fn()
+ if lastErr == nil {
+ return nil
+ }
+ if !IsRateLimitError(lastErr) {
+ return lastErr
+ }
+ if attempt == maxRetries {
+ break
+ }
+
+ delay := baseDelay * (1 << attempt)
+ if delay > maxBackoffDelay {
+ delay = maxBackoffDelay
+ }
+ if ra := ParseRetryAfter(lastErr.Error()); ra > 0 {
+ delay = ra
+ }
+
+ select {
+ case <-ctx.Done():
+ return fmt.Errorf("context cancelled during rate-limit backoff: %w", ctx.Err())
+ case <-time.After(delay):
+ }
+ }
+ return lastErr
+}
diff --git a/internal/retry/backoff_test.go b/internal/retry/backoff_test.go
new file mode 100644
index 0000000..a963fc2
--- /dev/null
+++ b/internal/retry/backoff_test.go
@@ -0,0 +1,169 @@
+package retry
+
+import (
+ "context"
+ "errors"
+ "fmt"
+ "testing"
+ "time"
+)
+
+// --- IsRateLimitError tests ---
+
+func TestIsRateLimitError_RateLimitMessage(t *testing.T) {
+ err := errors.New("claude exited with error: rate limit exceeded")
+ if !IsRateLimitError(err) {
+ t.Error("want true for 'rate limit exceeded', got false")
+ }
+}
+
+func TestIsRateLimitError_TooManyRequests(t *testing.T) {
+ err := errors.New("too many requests to the API")
+ if !IsRateLimitError(err) {
+ t.Error("want true for 'too many requests', got false")
+ }
+}
+
+func TestIsRateLimitError_HTTP429(t *testing.T) {
+ err := errors.New("API returned status 429")
+ if !IsRateLimitError(err) {
+ t.Error("want true for '429', got false")
+ }
+}
+
+func TestIsRateLimitError_Overloaded(t *testing.T) {
+ err := errors.New("API overloaded, please retry later")
+ if !IsRateLimitError(err) {
+ t.Error("want true for 'overloaded', got false")
+ }
+}
+
+func TestIsRateLimitError_NonRateLimitError(t *testing.T) {
+ err := errors.New("claude exited with error: exit status 1")
+ if IsRateLimitError(err) {
+ t.Error("want false for non-rate-limit error, got true")
+ }
+}
+
+func TestIsRateLimitError_NilError(t *testing.T) {
+ if IsRateLimitError(nil) {
+ t.Error("want false for nil error, got true")
+ }
+}
+
+// --- ParseRetryAfter tests ---
+
+func TestParseRetryAfter_RetryAfterSeconds(t *testing.T) {
+ msg := "rate limit exceeded, retry after 30 seconds"
+ d := ParseRetryAfter(msg)
+ if d != 30*time.Second {
+ t.Errorf("want 30s, got %v", d)
+ }
+}
+
+func TestParseRetryAfter_RetryAfterHeader(t *testing.T) {
+ msg := "rate_limit_error: retry-after: 60"
+ d := ParseRetryAfter(msg)
+ if d != 60*time.Second {
+ t.Errorf("want 60s, got %v", d)
+ }
+}
+
+func TestParseRetryAfter_NoRetryInfo(t *testing.T) {
+ msg := "rate limit exceeded"
+ d := ParseRetryAfter(msg)
+ if d != 0 {
+ t.Errorf("want 0, got %v", d)
+ }
+}
+
+// --- RunWithBackoff tests ---
+
+func TestRunWithBackoff_SuccessOnFirstTry(t *testing.T) {
+ calls := 0
+ fn := func() error {
+ calls++
+ return nil
+ }
+ err := RunWithBackoff(context.Background(), 3, time.Millisecond, fn)
+ if err != nil {
+ t.Errorf("want nil error, got %v", err)
+ }
+ if calls != 1 {
+ t.Errorf("want 1 call, got %d", calls)
+ }
+}
+
+func TestRunWithBackoff_RetriesOnRateLimit(t *testing.T) {
+ calls := 0
+ fn := func() error {
+ calls++
+ if calls < 3 {
+ return fmt.Errorf("rate limit exceeded")
+ }
+ return nil
+ }
+ err := RunWithBackoff(context.Background(), 3, time.Millisecond, fn)
+ if err != nil {
+ t.Errorf("want nil error, got %v", err)
+ }
+ if calls != 3 {
+ t.Errorf("want 3 calls, got %d", calls)
+ }
+}
+
+func TestRunWithBackoff_GivesUpAfterMaxRetries(t *testing.T) {
+ calls := 0
+ rateLimitErr := fmt.Errorf("rate limit exceeded")
+ fn := func() error {
+ calls++
+ return rateLimitErr
+ }
+ err := RunWithBackoff(context.Background(), 3, time.Millisecond, fn)
+ if err == nil {
+ t.Fatal("want error after max retries, got nil")
+ }
+ if calls != 4 {
+ t.Errorf("want 4 calls (1 initial + 3 retries), got %d", calls)
+ }
+}
+
+func TestRunWithBackoff_DoesNotRetryNonRateLimitError(t *testing.T) {
+ calls := 0
+ fn := func() error {
+ calls++
+ return fmt.Errorf("permission denied")
+ }
+ err := RunWithBackoff(context.Background(), 3, time.Millisecond, fn)
+ if err == nil {
+ t.Fatal("want error, got nil")
+ }
+ if calls != 1 {
+ t.Errorf("want 1 call (no retry for non-rate-limit), got %d", calls)
+ }
+}
+
+func TestRunWithBackoff_ContextCancellation(t *testing.T) {
+ ctx, cancel := context.WithCancel(context.Background())
+ calls := 0
+
+ fn := func() error {
+ calls++
+ cancel()
+ return fmt.Errorf("rate limit exceeded")
+ }
+
+ start := time.Now()
+ err := RunWithBackoff(ctx, 3, time.Second, fn)
+ elapsed := time.Since(start)
+
+ if err == nil {
+ t.Fatal("want error on context cancellation, got nil")
+ }
+ if elapsed > 500*time.Millisecond {
+ t.Errorf("context cancellation too slow: %v (want < 500ms)", elapsed)
+ }
+ if calls != 1 {
+ t.Errorf("want 1 call before cancellation, got %d", calls)
+ }
+}