From eeee3b60a1fd8dc8b8b92997f709ef65e4b2097f Mon Sep 17 00:00:00 2001 From: Peter Stone Date: Thu, 19 Mar 2026 23:15:58 +0000 Subject: feat: add errors, throughput, and billing sections to stats dashboard - GET /api/stats?window=7d: pre-aggregated SQL queries for errors, throughput, billing - Errors section: category summary (quota/rate_limit/timeout/git/failed) + failure table - Throughput section: stacked hourly bar chart (completed/failed/other) over 7d - Billing section: KPIs (7d total, avg/day, cost/run) + daily cost bar chart Co-Authored-By: Claude Sonnet 4.6 --- internal/storage/db.go | 135 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 135 insertions(+) (limited to 'internal/storage/db.go') diff --git a/internal/storage/db.go b/internal/storage/db.go index 0d11b4e..1a0e74f 100644 --- a/internal/storage/db.go +++ b/internal/storage/db.go @@ -560,6 +560,141 @@ type RecentExecution struct { StdoutPath string `json:"stdout_path"` } +// ThroughputBucket is one time-bucket of execution counts by outcome. +type ThroughputBucket struct { + Hour string `json:"hour"` // RFC3339 truncated to hour + Completed int `json:"completed"` + Failed int `json:"failed"` + Other int `json:"other"` +} + +// BillingDay is the aggregated cost and run count for a calendar day. +type BillingDay struct { + Day string `json:"day"` // YYYY-MM-DD + CostUSD float64 `json:"cost_usd"` + Runs int `json:"runs"` +} + +// FailedExecution is a failed/timed-out/budget-exceeded execution with its error. +type FailedExecution struct { + ID string `json:"id"` + TaskID string `json:"task_id"` + TaskName string `json:"task_name"` + Status string `json:"status"` + ErrorMsg string `json:"error_msg"` + Category string `json:"category"` // quota | timeout | rate_limit | git | failed + StartedAt time.Time `json:"started_at"` +} + +// DashboardStats is returned by QueryDashboardStats. +type DashboardStats struct { + Throughput []ThroughputBucket `json:"throughput"` + Billing []BillingDay `json:"billing"` + Failures []FailedExecution `json:"failures"` +} + +// QueryDashboardStats returns pre-aggregated stats for the given window. +func (s *DB) QueryDashboardStats(since time.Time) (*DashboardStats, error) { + stats := &DashboardStats{ + Throughput: []ThroughputBucket{}, + Billing: []BillingDay{}, + Failures: []FailedExecution{}, + } + + // Throughput: completions per hour bucket + tpRows, err := s.db.Query(` + SELECT strftime('%Y-%m-%dT%H:00:00Z', start_time) as hour, + SUM(CASE WHEN status IN ('COMPLETED','READY') THEN 1 ELSE 0 END), + SUM(CASE WHEN status IN ('FAILED','TIMED_OUT','BUDGET_EXCEEDED') THEN 1 ELSE 0 END), + SUM(CASE WHEN status NOT IN ('COMPLETED','READY','FAILED','TIMED_OUT','BUDGET_EXCEEDED') THEN 1 ELSE 0 END) + FROM executions + WHERE start_time >= ? AND status NOT IN ('RUNNING','QUEUED','PENDING') + GROUP BY hour ORDER BY hour ASC`, since.UTC()) + if err != nil { + return nil, err + } + defer tpRows.Close() + for tpRows.Next() { + var b ThroughputBucket + if err := tpRows.Scan(&b.Hour, &b.Completed, &b.Failed, &b.Other); err != nil { + return nil, err + } + stats.Throughput = append(stats.Throughput, b) + } + if err := tpRows.Err(); err != nil { + return nil, err + } + + // Billing: cost per day + billRows, err := s.db.Query(` + SELECT date(start_time) as day, COALESCE(SUM(cost_usd),0), COUNT(*) + FROM executions + WHERE start_time >= ? + GROUP BY day ORDER BY day ASC`, since.UTC()) + if err != nil { + return nil, err + } + defer billRows.Close() + for billRows.Next() { + var b BillingDay + if err := billRows.Scan(&b.Day, &b.CostUSD, &b.Runs); err != nil { + return nil, err + } + stats.Billing = append(stats.Billing, b) + } + if err := billRows.Err(); err != nil { + return nil, err + } + + // Failures: recent failed executions with error messages + failRows, err := s.db.Query(` + SELECT e.id, e.task_id, t.name, e.status, COALESCE(e.error_msg,''), e.start_time + FROM executions e JOIN tasks t ON e.task_id = t.id + WHERE e.start_time >= ? AND e.status IN ('FAILED','TIMED_OUT','BUDGET_EXCEEDED') + ORDER BY e.start_time DESC LIMIT 50`, since.UTC()) + if err != nil { + return nil, err + } + defer failRows.Close() + for failRows.Next() { + var f FailedExecution + if err := failRows.Scan(&f.ID, &f.TaskID, &f.TaskName, &f.Status, &f.ErrorMsg, &f.StartedAt); err != nil { + return nil, err + } + f.Category = classifyError(f.Status, f.ErrorMsg) + stats.Failures = append(stats.Failures, f) + } + if err := failRows.Err(); err != nil { + return nil, err + } + + return stats, nil +} + +// classifyError maps a status + error message to a human category. +func classifyError(status, msg string) string { + if status == "TIMED_OUT" { + return "timeout" + } + if status == "BUDGET_EXCEEDED" { + return "quota" + } + low := strings.ToLower(msg) + if strings.Contains(low, "quota") || strings.Contains(low, "exhausted") || strings.Contains(low, "terminalquota") { + return "quota" + } + if strings.Contains(low, "rate limit") || strings.Contains(low, "429") || strings.Contains(low, "too many requests") { + return "rate_limit" + } + if strings.Contains(low, "git push") || strings.Contains(low, "git pull") { + return "git" + } + if strings.Contains(low, "timeout") || strings.Contains(low, "deadline") { + return "timeout" + } + return "failed" +} + // ListRecentExecutions returns executions since the given time, joined with task names. // If taskID is non-empty, only executions for that task are returned. func (s *DB) ListRecentExecutions(since time.Time, limit int, taskID string) ([]*RecentExecution, error) { -- cgit v1.2.3