diff --git a/README.md b/README.md
index 8a3de03..edfcc7d 100644
--- a/README.md
+++ b/README.md
@@ -26,6 +26,8 @@ $ eclint -exclude "testdata/**/*"
 - `indent_style`
 - `insert_final_newline`
 - `max_line_length` (when using tabs, specify the `tab_width` or `indent_size`)
+  - by default, a UTF-8 charset is assumed and each multi-byte character is
+    counted as one; combining characters, however, are not.
 - `trim_trailing_whitespace`
 - [domain-specific properties][dsl]
   - `line_comment`
@@ -44,7 +46,7 @@ $ eclint -exclude "testdata/**/*"
 ## Missing features
 
 - basic `//nolint` [suffix](https://github.com/golangci/golangci-lint#nolint)
-- doing checks on `rune` rather than `byte`
+- `max_line_length` counting UTF-16 and UTF-32 characters
 - more tests
 - ability to fix
 - etc.
diff --git a/lint.go b/lint.go
index 303ec57..28034c9 100644
--- a/lint.go
+++ b/lint.go
@@ -26,6 +26,7 @@ func validate(r io.Reader, log logr.Logger, def *editorconfig.Definition) []erro
 
 	indentSize, _ := strconv.Atoi(def.IndentSize)
 
+	var charset string
 	var lastLine []byte
 	var lastIndex int
 
@@ -67,13 +68,21 @@ func validate(r io.Reader, log logr.Logger, def *editorconfig.Definition) []erro
 		var err error
 
 		// The first line may contain the BOM for detecting some encodings
-		if index == 1 && def.Charset != "" {
-			ok, err := charsetUsingBOM(def.Charset, data)
-			if err != nil {
-				return err
+		if index == 1 {
+			if def.Charset != "utf-8" && def.Charset != "latin1" {
+				charset = detectCharsetUsingBOM(data)
+
+				if def.Charset != "" && charset != def.Charset {
+					return validationError{
+						error:    fmt.Sprintf("no %s prefix was found (got %q)", def.Charset, charset),
+						position: 1,
+						index:    index,
+						line:     data,
+					}
+				}
 			}
 
-			if !ok {
+			if charset == "" && def.Charset != "" {
 				buf = bytes.NewBuffer(make([]byte, 0))
 			}
 		}
@@ -125,7 +134,17 @@ func validate(r io.Reader, log logr.Logger, def *editorconfig.Definition) []erro
 		}
 
 		if err == nil && maxLength > 0 && tabWidth > 0 {
-			err = maxLineLength(maxLength, tabWidth, data)
+			// Remove any BOM from the first line.
+			d := data
+			if index == 1 && charset != "" {
+				for _, bom := range [][]byte{utf8Bom} {
+					if bytes.HasPrefix(data, bom) {
+						d = data[len(bom):]
+						break
+					}
+				}
+			}
+			err = maxLineLength(maxLength, tabWidth, d)
 		}
 
 		// Enrich the error with the line number
@@ -139,7 +158,7 @@ func validate(r io.Reader, log logr.Logger, def *editorconfig.Definition) []erro
 	})
 
 	if buf != nil && buf.Len() > 0 {
-		err := charset(def.Charset, buf.Bytes())
+		err := detectCharset(def.Charset, buf.Bytes())
 		if err != nil {
 			errs = append(errs, err)
 		}
diff --git a/print.go b/print.go
index f3fe624..2b25905 100644
--- a/print.go
+++ b/print.go
@@ -81,6 +81,11 @@ func errorAt(au aurora.Aurora, line []byte, position int) (string, error) {
 			if err := b.WriteByte(line[i]); err != nil {
 				return "", err
 			}
+
+			// skip 0b10xxxxxx bytes that are UTF-8 continuation markers
+			if (line[i] >> 6) == 0b10 {
+				position++
+			}
 		}
 	}
 
@@ -98,6 +103,10 @@ func errorAt(au aurora.Aurora, line []byte, position int) (string, error) {
 			if err := b.WriteByte(line[i]); err != nil {
 				return "", err
 			}
+
+			if (line[i] >> 6) == 0b10 {
+				position++
+			}
 		}
 	}
 
diff --git a/validators.go b/validators.go
index 33b6c9a..3dd0406 100644
--- a/validators.go
+++ b/validators.go
@@ -14,6 +14,14 @@ const (
 	space = ' '
 )
+var (
+	utf8Bom    = []byte{0xef, 0xbb, 0xbf} // nolint:gochecknoglobals
+	utf16leBom = []byte{0xff, 0xfe}       // nolint:gochecknoglobals
+	utf16beBom = []byte{0xfe, 0xff}       // nolint:gochecknoglobals
+	utf32leBom = []byte{0xff, 0xfe, 0, 0} // nolint:gochecknoglobals
+	utf32beBom = []byte{0, 0, 0xfe, 0xff} // nolint:gochecknoglobals
+)
+
 // validationError is a rich type containing information about the error
 type validationError struct {
 	error    string
 	position int
@@ -63,37 +71,25 @@ func endOfLine(eol string, data []byte) error {
 	return nil
 }
 
-// charsetUsingBOM checks the charset via the first bytes of the first line
-func charsetUsingBOM(charset string, data []byte) (bool, error) {
-	switch charset {
-	case "utf-8 bom":
-		if !bytes.HasPrefix(data, []byte{0xef, 0xbb, 0xbf}) {
-			return false, validationError{error: "no UTF-8 BOM were found"}
-		}
-	case "utf-16le":
-		if !bytes.HasPrefix(data, []byte{0xff, 0xfe}) {
-			return false, validationError{error: "no UTF-16LE BOM were found"}
-		}
-	case "utf-16be":
-		if !bytes.HasPrefix(data, []byte{0xfe, 0xff}) {
-			return false, validationError{error: "no UTF-16BE BOM were found"}
-		}
-	case "utf-32le":
-		if !bytes.HasPrefix(data, []byte{0xff, 0xfe, 0, 0}) {
-			return false, validationError{error: "no UTF-32LE BOM were found"}
-		}
-	case "utf-32be":
-		if !bytes.HasPrefix(data, []byte{0, 0, 0xfe, 0xff}) {
-			return false, validationError{error: "no UTF-32BE BOM were found"}
-		}
-	default:
-		return false, nil
+// detectCharsetUsingBOM checks the charset via the first bytes of the first line
+func detectCharsetUsingBOM(data []byte) string {
+	switch {
+	case bytes.HasPrefix(data, utf32leBom):
+		return "utf-32le"
+	case bytes.HasPrefix(data, utf32beBom):
+		return "utf-32be"
+	case bytes.HasPrefix(data, utf16leBom):
+		return "utf-16le"
+	case bytes.HasPrefix(data, utf16beBom):
+		return "utf-16be"
+	case bytes.HasPrefix(data, utf8Bom):
+		return "utf-8 bom"
 	}
-	return true, nil
+	return ""
 }
 
-// charset checks the file encoding
-func charset(charset string, data []byte) error {
+// detectCharset checks the file encoding
+func detectCharset(charset string, data []byte) error {
 	d := chardet.NewTextDetector()
 	results, err := d.DetectAll(data)
 	if err != nil {
@@ -117,7 +113,8 @@ func charset(charset string, data []byte) error {
 
 	if len(results) > 0 {
 		return validationError{
-			error: fmt.Sprintf("detected charset %q does not match expected %q", results[0].Charset, charset),
+			error:    fmt.Sprintf("detected charset %q does not match expected %q", results[0].Charset, charset),
+			position: 1,
 		}
 	}
 
@@ -228,9 +225,12 @@ func maxLineLength(maxLength int, tabWidth int, data []byte) error {
 		if data[i] == cr || data[i] == lf {
 			break
 		}
-		if data[i] == tab {
+		switch {
+		case data[i] == tab:
 			length += tabWidth
-		} else {
+		case (data[i] >> 6) == 0b10:
+			// skip 0b10xxxxxx bytes that are UTF-8 continuation markers
+		default:
 			length++
 		}
 		if length > maxLength && breakingPosition == 0 {
@@ -240,8 +240,8 @@ func maxLineLength(maxLength int, tabWidth int, data []byte) error {
 
 	if length > maxLength {
 		return validationError{
-			error:    fmt.Sprintf("line is too long (%d > %d)", length+1, maxLength),
-			position: breakingPosition,
+			error:    fmt.Sprintf("line is too long (%d > %d)", length, maxLength),
+			position: breakingPosition + 1,
 		}
 	}
 
diff --git a/validators_test.go b/validators_test.go
index c5652ef..8889069 100644
--- a/validators_test.go
+++ b/validators_test.go
@@ -8,34 +8,6 @@ import (
 	tlogr "github.com/go-logr/logr/testing"
 )
 
-func TestCharsetUsingBOMFailure(t *testing.T) {
-	tests := []struct {
-		Charset string
-	}{
-		{
-			Charset: "utf-8 bom",
-		}, {
-			Charset: "utf-16le",
-		}, {
-			Charset: "utf-16be",
-		}, {
-			Charset: "utf-32le",
-		}, {
-			Charset: "utf-32be",
-		},
-	}
-	for _, tc := range tests {
-		tc := tc
-		t.Run(tc.Charset, func(t *testing.T) {
-			t.Parallel()
-			ok, err := charsetUsingBOM(tc.Charset, []byte{})
-			if ok || err == nil {
-				t.Error("an error was expected")
-			}
-		})
-	}
-}
-
 func TestCharset(t *testing.T) {
 	tests := []struct {
 		Name string
@@ -78,7 +50,7 @@ func TestCharset(t *testing.T) {
 	for _, tc := range tests {
 		tc := tc
 		t.Run(tc.Name, func(t *testing.T) {
-			//t.Parallel()
+			t.Parallel()
 
 			def := &editorconfig.Definition{
 				Charset: tc.Charset,
@@ -402,6 +374,11 @@ func TestMaxLineLength(t *testing.T) {
 			MaxLineLength: 5,
 			TabWidth:      2,
 			Line:          []byte("\t\t.\n"),
+		}, {
+			Name:          "utf-8 encoded characters",
+			MaxLineLength: 1,
+			TabWidth:      0,
+			Line:          []byte("é\n"),
 		},
 	}
 	for _, tc := range tests {
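
Note (not part of the patch): the sketch below is a minimal, hypothetical `lineWidth` helper that illustrates the counting rule this change applies in `maxLineLength` and documents in the README bullet — tabs expand to the tab width, UTF-8 continuation bytes (`0b10xxxxxx`) are skipped so a multi-byte character counts once, and combining characters are still counted as separate runes.

```go
package main

import (
	"fmt"
	"unicode/utf8"
)

// lineWidth counts columns the same way the patched maxLineLength does:
// a tab adds tabWidth, a UTF-8 continuation byte (0b10xxxxxx) adds nothing
// so that a multi-byte character is counted once, and any other byte adds one.
func lineWidth(line []byte, tabWidth int) int {
	width := 0
	for _, b := range line {
		switch {
		case b == '\r' || b == '\n':
			return width
		case b == '\t':
			width += tabWidth
		case b>>6 == 0b10:
			// continuation byte: part of the current character, not a new column
		default:
			width++
		}
	}
	return width
}

func main() {
	fmt.Println(lineWidth([]byte("é\n"), 4))       // 1: two bytes, one character
	fmt.Println(lineWidth([]byte("e\u0301\n"), 4)) // 2: combining accent counted separately
	fmt.Println(utf8.RuneCountInString("e\u0301")) // 2 as well: runes, not grapheme clusters
}
```

Running it prints 1, 2 and 2, which matches the new "utf-8 encoded characters" test case where "é\n" fits within a `max_line_length` of 1.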