Skip to content

Commit

Permalink
fix: utf-8 with bom to be rejected
Browse files Browse the repository at this point in the history
  • Loading branch information
greut committed Aug 29, 2023
1 parent 85cb0d0 commit ac65d9c
Show file tree
Hide file tree
Showing 5 changed files with 19 additions and 10 deletions.
14 changes: 5 additions & 9 deletions probes.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ func ProbeCharsetOrBinary(ctx context.Context, r *bufio.Reader, charset string)
return cs, false, nil
}

// probeMagic searches for some text-baesd binary files such as PDF.
// probeMagic searches for some text-based binary files such as PDF.
func probeMagic(ctx context.Context, bs []byte) bool {
log := logr.FromContextOrDiscard(ctx)

Expand Down Expand Up @@ -115,16 +115,12 @@ func probeCharset(ctx context.Context, bs []byte, charset string) (string, error

var cs string
// The first line may contain the BOM for detecting some encodings
if charset != Utf8 && charset != Latin1 {
cs = detectCharsetUsingBOM(bs)
cs = detectCharsetUsingBOM(bs)

if charset != "" && cs != charset {
return "", ValidationError{
Message: fmt.Sprintf("no %s prefix were found, got %q", charset, cs),
}
if charset != "" && cs != "" && cs != charset {
return "", ValidationError{
Message: fmt.Sprintf("no %s prefix were found, got %q", charset, cs),
}

log.V(3).Info("detect using BOM", "charset", charset)
}

if cs == "" && charset != "" {
Expand Down
8 changes: 8 additions & 0 deletions probes_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,14 @@ func TestProbeCharsetOfBinaryFailure(t *testing.T) {
Name: "utf-8 vs latin1",
Charset: "latin1",
File: []byte{'h', 'i', ' ', 0xf0, 0x9f, 0x92, 0xa9, '!'},
}, {
Name: "utf-8 vs utf-8 bom",
Charset: "utf-8 bom",
File: []byte{'h', 'i', ' ', 0xf0, 0x9f, 0x92, 0xa9, '!'},
}, {
Name: "utf-8 bom vs utf-8",
Charset: "utf-8",
File: []byte{0xef, 0xbb, 0xbf, 'h', 'e', 'l', 'l', 'o', '.'},
},
}

Expand Down
2 changes: 1 addition & 1 deletion scanner.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ func SplitLines(data []byte, atEOF bool) (int, []byte, error) {
// ReadLines consumes the reader and emits each line via the LineFunc
//
// Line numbering starts at 0. Scanner is pretty smart and will reuse
// its memory structure. This is somehing we explicitly avoid by copying
// its memory structure. This is something we explicitly avoid by copying
// the content to a new slice.
func ReadLines(r io.Reader, fileSize int64, fn LineFunc) []error {
errs := make([]error, 0)
Expand Down
3 changes: 3 additions & 0 deletions testdata/charset/.editorconfig
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@ root = true
[utf8.*]
charset=utf-8

[utf8-bom.*]
charset=utf-8 bom

[iso-8859-1.*]
charset=latin1

Expand Down
2 changes: 2 additions & 0 deletions testdata/charset/utf8-bom.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
👇☕
💩

0 comments on commit ac65d9c

Please sign in to comment.