fix: utf-8 with bom to be rejected

greut · Aug 29, 2023 · ac65d9c · ac65d9c
1 parent 85cb0d0
commit ac65d9c
Show file tree

Hide file tree

Showing 5 changed files with 19 additions and 10 deletions.
diff --git a/probes.go b/probes.go
@@ -45,7 +45,7 @@ func ProbeCharsetOrBinary(ctx context.Context, r *bufio.Reader, charset string)
 	return cs, false, nil
 }
 
-// probeMagic searches for some text-baesd binary files such as PDF.
+// probeMagic searches for some text-based binary files such as PDF.
 func probeMagic(ctx context.Context, bs []byte) bool {
 	log := logr.FromContextOrDiscard(ctx)
 
@@ -115,16 +115,12 @@ func probeCharset(ctx context.Context, bs []byte, charset string) (string, error
 
 	var cs string
 	// The first line may contain the BOM for detecting some encodings
-	if charset != Utf8 && charset != Latin1 {
-		cs = detectCharsetUsingBOM(bs)
+	cs = detectCharsetUsingBOM(bs)
 
-		if charset != "" && cs != charset {
-			return "", ValidationError{
-				Message: fmt.Sprintf("no %s prefix were found, got %q", charset, cs),
-			}
+	if charset != "" && cs != "" && cs != charset {
+		return "", ValidationError{
+			Message: fmt.Sprintf("no %s prefix were found, got %q", charset, cs),
 		}
-
-		log.V(3).Info("detect using BOM", "charset", charset)
 	}
 
 	if cs == "" && charset != "" {

diff --git a/probes_test.go b/probes_test.go
@@ -104,6 +104,14 @@ func TestProbeCharsetOfBinaryFailure(t *testing.T) {
 			Name:    "utf-8 vs latin1",
 			Charset: "latin1",
 			File:    []byte{'h', 'i', ' ', 0xf0, 0x9f, 0x92, 0xa9, '!'},
+		}, {
+			Name:    "utf-8 vs utf-8 bom",
+			Charset: "utf-8 bom",
+			File:    []byte{'h', 'i', ' ', 0xf0, 0x9f, 0x92, 0xa9, '!'},
+		}, {
+			Name:    "utf-8 bom vs utf-8",
+			Charset: "utf-8",
+			File:    []byte{0xef, 0xbb, 0xbf, 'h', 'e', 'l', 'l', 'o', '.'},
 		},
 	}
 

diff --git a/scanner.go b/scanner.go
@@ -50,7 +50,7 @@ func SplitLines(data []byte, atEOF bool) (int, []byte, error) {
 // ReadLines consumes the reader and emits each line via the LineFunc
 //
 // Line numbering starts at 0. Scanner is pretty smart and will reuse
-// its memory structure. This is somehing we explicitly avoid by copying
+// its memory structure. This is something we explicitly avoid by copying
 // the content to a new slice.
 func ReadLines(r io.Reader, fileSize int64, fn LineFunc) []error {
 	errs := make([]error, 0)

diff --git a/testdata/charset/.editorconfig b/testdata/charset/.editorconfig
@@ -3,6 +3,9 @@ root = true
 [utf8.*]
 charset=utf-8
 
+[utf8-bom.*]
+charset=utf-8 bom
+
 [iso-8859-1.*]
 charset=latin1
 

diff --git a/testdata/charset/utf8-bom.txt b/testdata/charset/utf8-bom.txt
@@ -0,0 +1,2 @@
+👇☕
+💩