Merge pull request #187 from rusq/i186-500
Retry for 500-599 errors
rusq authored Feb 2, 2023
2 parents d4b7c94 + 3e282c7 commit f21eab2
Showing 2 changed files with 149 additions and 8 deletions.
45 changes: 39 additions & 6 deletions internal/network/network.go
@@ -3,6 +3,7 @@ package network
 import (
 	"context"
 	"fmt"
+	"net/http"
 	"runtime/trace"
 	"time"
 
@@ -15,7 +16,14 @@ import (
 )
 
 // defNumAttempts is the default number of retry attempts.
-const defNumAttempts = 3
+const (
+	defNumAttempts = 3
+)
+
+// MaxAllowedWaitTime is the maximum time to wait for a transient error.
+// The wait time for a transient error depends on the current retry attempt
+// number and is calculated as: (attempt+2)^3 seconds.
+var MaxAllowedWaitTime = 5 * time.Minute
 
 // Logger is the package logger.
 var Logger logger.Interface = logger.Default
@@ -48,20 +56,45 @@ func WithRetry(ctx context.Context, l *rate.Limiter, maxAttempts int, fn func()
 		}
 
 		tracelogf(ctx, "error", "slackRetry: %s after %d attempts", cbErr, attempt+1)
-		var rle *slack.RateLimitedError
-		if !errors.As(cbErr, &rle) {
-			return fmt.Errorf("callback error: %w", cbErr)
+		var (
+			rle *slack.RateLimitedError
+			sce slack.StatusCodeError
+		)
+		if errors.As(cbErr, &rle) {
+			tracelogf(ctx, "info", "got rate limited, sleeping %s", rle.RetryAfter)
+			time.Sleep(rle.RetryAfter)
+			continue
+		} else if errors.As(cbErr, &sce) {
+			if sce.Code >= http.StatusInternalServerError && sce.Code <= 599 {
+				// possibly transient error
+				delay := waitTime(attempt)
+				tracelogf(ctx, "info", "got server error %d, sleeping %s", sce.Code, delay)
+				time.Sleep(delay)
+				continue
+			}
 		}
 
-		tracelogf(ctx, "info", "got rate limited, sleeping %s", rle.RetryAfter)
-		time.Sleep(rle.RetryAfter)
+		return fmt.Errorf("callback error: %w", cbErr)
 	}
 	if !ok {
 		return ErrRetryFailed
 	}
 	return nil
 }
 
+// waitTime returns the amount of time to wait before retrying depending on
+// the current attempt. The wait time is calculated as (x+2)^3 seconds, where
+// x is the current attempt number. The maximum wait time is capped at 5
+// minutes.
+func waitTime(attempt int) time.Duration {
+	x := attempt + 2 // this is to ensure that we sleep at least 8 seconds.
+	delay := time.Duration(x*x*x) * time.Second
+	if delay > MaxAllowedWaitTime {
+		return MaxAllowedWaitTime
+	}
+	return delay
+}
+
 func tracelogf(ctx context.Context, category string, fmt string, a ...any) {
 	trace.Logf(ctx, category, fmt, a...)
 	l().Debugf(fmt, a...)
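To make the effect of the change concrete, here is a hypothetical caller-side sketch (not part of the commit). It is written as if it lived in the network package itself, since the internal/network path cannot be imported from outside the module; the test name and the fabricated slack.StatusCodeError value are illustrative only. With the new branch, a callback that fails once with a 503 is retried after waitTime(0) = 8 seconds instead of aborting with a callback error:

package network

import (
	"context"
	"testing"

	"github.com/slack-go/slack"
	"golang.org/x/time/rate"
)

// TestSketch5xxRecovery is a hypothetical illustration: a callback that fails
// once with a 503 and then succeeds should be retried by WithRetry and return
// nil. Note that the retry sleeps waitTime(0) = 8 seconds, so this sketch is
// illustrative rather than a fast unit test.
func TestSketch5xxRecovery(t *testing.T) {
	calls := 0
	err := WithRetry(context.Background(), rate.NewLimiter(1, 1), 3, func() error {
		calls++
		if calls == 1 {
			// slack.StatusCodeError is the error shape the new branch
			// inspects via errors.As.
			return slack.StatusCodeError{Code: 503, Status: "503 Service Unavailable"}
		}
		return nil
	})
	if err != nil {
		t.Fatalf("expected success after a retried 503, got %v", err)
	}
	if calls != 2 {
		t.Errorf("expected 2 calls (one failure, one success), got %d", calls)
	}
}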
112 changes: 110 additions & 2 deletions internal/network/network_test.go
@@ -2,11 +2,14 @@ package network
 
 import (
 	"context"
+	"errors"
+	"fmt"
+	"net/http"
+	"net/http/httptest"
 	"reflect"
 	"testing"
+	"time"
 
-	"errors"
-
 	"github.com/slack-go/slack"
 	"golang.org/x/time/rate"
 )
@@ -143,3 +146,108 @@ func Test_withRetry(t *testing.T) {
 		})
 	}
 }
+
+func Test500ErrorHandling(t *testing.T) {
+	t.Parallel()
+
+	var codes = []int{500, 502, 503, 504, 598}
+	for _, code := range codes {
+		var thisCode = code
+		// This test is to ensure that we handle 500 errors correctly.
+		t.Run(fmt.Sprintf("%d error", code), func(t *testing.T) {
+			t.Parallel()
+
+			const (
+				testRetryCount = 1
+				waitThreshold  = 100 * time.Millisecond
+			)
+
+			// Create a test server that returns a 500 error.
+			ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+				w.WriteHeader(thisCode)
+			}))
+			defer ts.Close()
+
+			// Create a new client with the test server as the endpoint.
+			client := slack.New("token", slack.OptionAPIURL(ts.URL+"/"))
+
+			start := time.Now()
+			// Call the client with a retry.
+			err := WithRetry(context.Background(), rate.NewLimiter(1, 1), testRetryCount, func() error {
+				_, err := client.GetConversationHistory(&slack.GetConversationHistoryParameters{})
+				if err == nil {
+					return errors.New("expected error, got nil")
+				}
+				return err
+			})
+			if err == nil {
+				t.Fatal("expected error, got nil")
+			}
+
+			dur := time.Since(start)
+			if dur < waitTime(testRetryCount-1)-waitThreshold || waitTime(testRetryCount-1)+waitThreshold < dur {
t.Errorf("expected duration to be around %s, got %s", waitTime(testRetryCount), dur)
+			}
+
+		})
+	}
+	t.Run("404 error", func(t *testing.T) {
+		t.Parallel()
+
+		const (
+			testRetryCount = 1
+			waitThreshold  = 100 * time.Millisecond
+		)
+
+		// Create a test server that returns a 404 error.
+		ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			w.WriteHeader(404)
+		}))
+		defer ts.Close()
+
+		// Create a new client with the test server as the endpoint.
+		client := slack.New("token", slack.OptionAPIURL(ts.URL+"/"))
+
+		// Call the client with a retry.
+		start := time.Now()
+		err := WithRetry(context.Background(), rate.NewLimiter(1, 1), testRetryCount, func() error {
+			_, err := client.GetConversationHistory(&slack.GetConversationHistoryParameters{})
+			if err == nil {
+				return errors.New("expected error, got nil")
+			}
+			return err
+		})
+		if err == nil {
+			t.Fatal("expected error, got nil")
+		}
+		dur := time.Since(start)
+		if dur > 500*time.Millisecond { // 404 error should not be retried
+			t.Errorf("expected no sleep, but slept for %s", dur)
+		}
+	})
+}
+
+func Test_waitTime(t *testing.T) {
+	type args struct {
+		attempt int
+	}
+	tests := []struct {
+		name string
+		args args
+		want time.Duration
+	}{
+		{"attempt 0", args{0}, 8 * time.Second},
+		{"attempt 1", args{1}, 27 * time.Second},
+		{"attempt 2", args{2}, 64 * time.Second},
{"attempt 2", args{4}, 216 * time.Second},
{"attempt 100", args{5}, MaxAllowedWaitTime}, // check if capped properly
{"attempt 100", args{1000}, MaxAllowedWaitTime},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			if got := waitTime(tt.args.attempt); !reflect.DeepEqual(got, tt.want) {
+				t.Errorf("waitTime() = %v, want %v", got, tt.want)
+			}
+		})
+	}
+}
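For reference, the schedule these tests pin down follows directly from the (attempt+2)^3 rule: attempts 0 through 4 wait 8s, 27s, 64s, 125s and 216s, while attempt 5 would already wait 343s (7 cubed), which exceeds MaxAllowedWaitTime and is therefore clamped to 5 minutes. That is why the last two rows of the Test_waitTime table expect the cap.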
