Merge pull request #187 from rusq/i186-500
Retry for 500-599 errors
rusq authored Feb 2, 2023
2 parents d4b7c94 + 3e282c7 commit f21eab2
Showing 2 changed files with 149 additions and 8 deletions.
45 changes: 39 additions & 6 deletions internal/network/network.go
@@ -3,6 +3,7 @@ package network
 import (
 	"context"
 	"fmt"
+	"net/http"
 	"runtime/trace"
 	"time"
 
@@ -15,7 +16,14 @@ import (
 )
 
 // defNumAttempts is the default number of retry attempts.
-const defNumAttempts = 3
+const (
+	defNumAttempts = 3
+)
+
+// MaxAllowedWaitTime is the maximum time to wait for a transient error.
+// The wait time for a transient error depends on the current retry attempt
+// number and is calculated as: (attempt+2)^3 seconds.
+var MaxAllowedWaitTime = 5 * time.Minute
 
 // Logger is the package logger.
 var Logger logger.Interface = logger.Default
@@ -48,20 +56,45 @@ func WithRetry(ctx context.Context, l *rate.Limiter, maxAttempts int, fn func()
 		}
 
 		tracelogf(ctx, "error", "slackRetry: %s after %d attempts", cbErr, attempt+1)
-		var rle *slack.RateLimitedError
-		if !errors.As(cbErr, &rle) {
-			return fmt.Errorf("callback error: %w", cbErr)
+		var (
+			rle *slack.RateLimitedError
+			sce slack.StatusCodeError
+		)
+		if errors.As(cbErr, &rle) {
+			tracelogf(ctx, "info", "got rate limited, sleeping %s", rle.RetryAfter)
+			time.Sleep(rle.RetryAfter)
+			continue
+		} else if errors.As(cbErr, &sce) {
+			if sce.Code >= http.StatusInternalServerError && sce.Code <= 599 {
+				// possibly transient error
+				delay := waitTime(attempt)
+				tracelogf(ctx, "info", "got server error %d, sleeping %s", sce.Code, delay)
+				time.Sleep(delay)
+				continue
+			}
 		}
 
-		tracelogf(ctx, "info", "got rate limited, sleeping %s", rle.RetryAfter)
-		time.Sleep(rle.RetryAfter)
+		return fmt.Errorf("callback error: %w", cbErr)
 	}
 	if !ok {
 		return ErrRetryFailed
 	}
 	return nil
 }
 
+// waitTime returns the amount of time to wait before retrying depending on
+// the current attempt. The wait time is calculated as (x+2)^3 seconds, where
+// x is the current attempt number. The maximum wait time is capped at 5
+// minutes.
+func waitTime(attempt int) time.Duration {
+	x := attempt + 2 // this is to ensure that we sleep at least 8 seconds.
+	delay := time.Duration(x*x*x) * time.Second
+	if delay > MaxAllowedWaitTime {
+		return MaxAllowedWaitTime
+	}
+	return delay
+}
+
 func tracelogf(ctx context.Context, category string, fmt string, a ...any) {
 	trace.Logf(ctx, category, fmt, a...)
 	l().Debugf(fmt, a...)
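To make the effect of the change concrete, here is a hypothetical caller-side sketch (not part of the commit). It is written as if it lived in the network package itself, since the internal/network path cannot be imported from outside the module; the test name and the fabricated slack.StatusCodeError value are illustrative only. With the new branch, a callback that fails once with a 503 is retried after waitTime(0) = 8 seconds instead of aborting with a callback error:

package network

import (
	"context"
	"testing"

	"github.com/slack-go/slack"
	"golang.org/x/time/rate"
)

// TestSketch5xxRecovery is a hypothetical illustration: a callback that fails
// once with a 503 and then succeeds should be retried by WithRetry and return
// nil. Note that the retry sleeps waitTime(0) = 8 seconds, so this sketch is
// illustrative rather than a fast unit test.
func TestSketch5xxRecovery(t *testing.T) {
	calls := 0
	err := WithRetry(context.Background(), rate.NewLimiter(1, 1), 3, func() error {
		calls++
		if calls == 1 {
			// slack.StatusCodeError is the error shape the new branch
			// inspects via errors.As.
			return slack.StatusCodeError{Code: 503, Status: "503 Service Unavailable"}
		}
		return nil
	})
	if err != nil {
		t.Fatalf("expected success after a retried 503, got %v", err)
	}
	if calls != 2 {
		t.Errorf("expected 2 calls (one failure, one success), got %d", calls)
	}
}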
112 changes: 110 additions & 2 deletions internal/network/network_test.go
@@ -2,11 +2,14 @@ package network
 
 import (
 	"context"
+	"errors"
+	"fmt"
+	"net/http"
+	"net/http/httptest"
 	"reflect"
 	"testing"
+	"time"
 
-	"errors"
-
 	"github.com/slack-go/slack"
 	"golang.org/x/time/rate"
 )
@@ -143,3 +146,108 @@ func Test_withRetry(t *testing.T) {
 		})
 	}
 }
+
+func Test500ErrorHandling(t *testing.T) {
+	t.Parallel()
+
+	var codes = []int{500, 502, 503, 504, 598}
+	for _, code := range codes {
+		var thisCode = code
+		// This test is to ensure that we handle 500 errors correctly.
+		t.Run(fmt.Sprintf("%d error", code), func(t *testing.T) {
+			t.Parallel()
+
+			const (
+				testRetryCount = 1
+				waitThreshold  = 100 * time.Millisecond
+			)
+
+			// Create a test server that returns a 500 error.
+			ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+				w.WriteHeader(thisCode)
+			}))
+			defer ts.Close()
+
+			// Create a new client with the test server as the endpoint.
+			client := slack.New("token", slack.OptionAPIURL(ts.URL+"/"))
+
+			start := time.Now()
+			// Call the client with a retry.
+			err := WithRetry(context.Background(), rate.NewLimiter(1, 1), testRetryCount, func() error {
+				_, err := client.GetConversationHistory(&slack.GetConversationHistoryParameters{})
+				if err == nil {
+					return errors.New("expected error, got nil")
+				}
+				return err
+			})
+			if err == nil {
+				t.Fatal("expected error, got nil")
+			}
+
+			dur := time.Since(start)
+			if dur < waitTime(testRetryCount-1)-waitThreshold || waitTime(testRetryCount-1)+waitThreshold < dur {
t.Errorf("expected duration to be around %s, got %s", waitTime(testRetryCount), dur)
+			}
+
+		})
+	}
+	t.Run("404 error", func(t *testing.T) {
+		t.Parallel()
+
+		const (
+			testRetryCount = 1
+			waitThreshold  = 100 * time.Millisecond
+		)
+
+		// Create a test server that returns a 404 error.
+		ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			w.WriteHeader(404)
+		}))
+		defer ts.Close()
+
+		// Create a new client with the test server as the endpoint.
+		client := slack.New("token", slack.OptionAPIURL(ts.URL+"/"))
+
+		// Call the client with a retry.
+		start := time.Now()
+		err := WithRetry(context.Background(), rate.NewLimiter(1, 1), testRetryCount, func() error {
+			_, err := client.GetConversationHistory(&slack.GetConversationHistoryParameters{})
+			if err == nil {
+				return errors.New("expected error, got nil")
+			}
+			return err
+		})
+		if err == nil {
+			t.Fatal("expected error, got nil")
+		}
+		dur := time.Since(start)
+		if dur > 500*time.Millisecond { // 404 error should not be retried
+			t.Errorf("expected no sleep, but slept for %s", dur)
+		}
+	})
+}
+
+func Test_waitTime(t *testing.T) {
+	type args struct {
+		attempt int
+	}
+	tests := []struct {
+		name string
+		args args
+		want time.Duration
+	}{
+		{"attempt 0", args{0}, 8 * time.Second},
+		{"attempt 1", args{1}, 27 * time.Second},
+		{"attempt 2", args{2}, 64 * time.Second},
{"attempt 2", args{4}, 216 * time.Second},
{"attempt 100", args{5}, MaxAllowedWaitTime}, // check if capped properly
{"attempt 100", args{1000}, MaxAllowedWaitTime},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			if got := waitTime(tt.args.attempt); !reflect.DeepEqual(got, tt.want) {
+				t.Errorf("waitTime() = %v, want %v", got, tt.want)
+			}
+		})
+	}
+}
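For reference, the schedule these tests pin down follows directly from the (attempt+2)^3 rule: attempts 0 through 4 wait 8s, 27s, 64s, 125s and 216s, while attempt 5 would already wait 343s (7 cubed), which exceeds MaxAllowedWaitTime and is therefore clamped to 5 minutes. That is why the last two rows of the Test_waitTime table expect the cap.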
