Skip to content

Commit

Permalink
Merge branch 'detect-utf8' into 'master'
Browse files Browse the repository at this point in the history
lint: max_line_length supports utf-8

See merge request greut/eclint!8
  • Loading branch information
greut committed Nov 22, 2019
2 parents 3034996 + 9bd91b8 commit 0a58a4e
Show file tree
Hide file tree
Showing 5 changed files with 77 additions and 70 deletions.
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ $ eclint -exclude "testdata/**/*"
- `indent_style`
- `insert_final_newline`
- `max_line_length` (when using tabs, specify the `tab_width` or `indent_size`)
- by default, the UTF-8 charset is assumed and each multi-byte character is
counted as one. Combining characters, however, are each counted separately.
- `trim_trailing_whitespace`
- [domain-specific properties][dsl]
- `line_comment`
Expand All @@ -44,7 +46,7 @@ $ eclint -exclude "testdata/**/*"
## Missing features

- basic `//nolint` [suffix](https://github.com/golangci/golangci-lint#nolint)
- doing checks on `rune` rather than `byte`
- `max_line_length` counting UTF-16 and UTF-32 characters
- more tests
- ability to fix
- etc.
Expand Down
33 changes: 26 additions & 7 deletions lint.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ func validate(r io.Reader, log logr.Logger, def *editorconfig.Definition) []erro

indentSize, _ := strconv.Atoi(def.IndentSize)

var charset string
var lastLine []byte
var lastIndex int

Expand Down Expand Up @@ -67,13 +68,21 @@ func validate(r io.Reader, log logr.Logger, def *editorconfig.Definition) []erro
var err error

// The first line may contain the BOM for detecting some encodings
if index == 1 && def.Charset != "" {
ok, err := charsetUsingBOM(def.Charset, data)
if err != nil {
return err
if index == 1 {
if def.Charset != "utf-8" && def.Charset != "latin1" {
charset = detectCharsetUsingBOM(data)

if def.Charset != "" && charset != def.Charset {
return validationError{
error: fmt.Sprintf("no %s prefix were found (got %q)", def.Charset, charset),
position: 1,
index: index,
line: data,
}
}
}

if !ok {
if charset == "" && def.Charset != "" {
buf = bytes.NewBuffer(make([]byte, 0))
}
}
Expand Down Expand Up @@ -125,7 +134,17 @@ func validate(r io.Reader, log logr.Logger, def *editorconfig.Definition) []erro
}

if err == nil && maxLength > 0 && tabWidth > 0 {
err = maxLineLength(maxLength, tabWidth, data)
// Remove any BOM from the first line.
d := data
if index == 1 && charset != "" {
for _, bom := range [][]byte{utf8Bom} {
if bytes.HasPrefix(data, bom) {
d = data[len(utf8Bom):]
break
}
}
}
err = maxLineLength(maxLength, tabWidth, d)
}

// Enrich the error with the line number
Expand All @@ -139,7 +158,7 @@ func validate(r io.Reader, log logr.Logger, def *editorconfig.Definition) []erro
})

if buf != nil && buf.Len() > 0 {
err := charset(def.Charset, buf.Bytes())
err := detectCharset(def.Charset, buf.Bytes())
if err != nil {
errs = append(errs, err)
}
Expand Down
9 changes: 9 additions & 0 deletions print.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,11 @@ func errorAt(au aurora.Aurora, line []byte, position int) (string, error) {
if err := b.WriteByte(line[i]); err != nil {
return "", err
}

// skip 0b10xxxxxx bytes, which are UTF-8 continuation bytes
if (line[i] >> 6) == 0b10 {
position++
}
}
}

Expand All @@ -98,6 +103,10 @@ func errorAt(au aurora.Aurora, line []byte, position int) (string, error) {
if err := b.WriteByte(line[i]); err != nil {
return "", err
}

if (line[i] >> 6) == 0b10 {
position++
}
}
}

Expand Down
66 changes: 33 additions & 33 deletions validators.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,14 @@ const (
space = ' '
)

// Byte order marks (BOM) looked for at the very beginning of the data, one
// per supported Unicode encoding. The UTF-32LE mark starts with the same two
// bytes as the UTF-16LE one, so prefix checks must try the longer mark first.
var (
	utf8Bom    = []byte("\xef\xbb\xbf")     // nolint:gochecknoglobals
	utf16leBom = []byte("\xff\xfe")         // nolint:gochecknoglobals
	utf16beBom = []byte("\xfe\xff")         // nolint:gochecknoglobals
	utf32leBom = []byte("\xff\xfe\x00\x00") // nolint:gochecknoglobals
	utf32beBom = []byte("\x00\x00\xfe\xff") // nolint:gochecknoglobals
)

// validationError is a rich type containing information about the error
type validationError struct {
error string
Expand Down Expand Up @@ -63,37 +71,25 @@ func endOfLine(eol string, data []byte) error {
return nil
}

// charsetUsingBOM checks the charset via the first bytes of the first line
func charsetUsingBOM(charset string, data []byte) (bool, error) {
switch charset {
case "utf-8 bom":
if !bytes.HasPrefix(data, []byte{0xef, 0xbb, 0xbf}) {
return false, validationError{error: "no UTF-8 BOM were found"}
}
case "utf-16le":
if !bytes.HasPrefix(data, []byte{0xff, 0xfe}) {
return false, validationError{error: "no UTF-16LE BOM were found"}
}
case "utf-16be":
if !bytes.HasPrefix(data, []byte{0xfe, 0xff}) {
return false, validationError{error: "no UTF-16BE BOM were found"}
}
case "utf-32le":
if !bytes.HasPrefix(data, []byte{0xff, 0xfe, 0, 0}) {
return false, validationError{error: "no UTF-32LE BOM were found"}
}
case "utf-32be":
if !bytes.HasPrefix(data, []byte{0, 0, 0xfe, 0xff}) {
return false, validationError{error: "no UTF-32BE BOM were found"}
}
default:
return false, nil
// detectCharsetUsingBOM checks the charset via the first bytes of the first line
func detectCharsetUsingBOM(data []byte) string {
switch {
case bytes.HasPrefix(data, utf32leBom):
return "utf-32le"
case bytes.HasPrefix(data, utf32beBom):
return "utf-32be"
case bytes.HasPrefix(data, utf16leBom):
return "utf-16le"
case bytes.HasPrefix(data, utf16beBom):
return "utf-16be"
case bytes.HasPrefix(data, utf8Bom):
return "utf-8 bom"
}
return true, nil
return ""
}

// charset checks the file encoding
func charset(charset string, data []byte) error {
// detectCharset checks the file encoding
func detectCharset(charset string, data []byte) error {
d := chardet.NewTextDetector()
results, err := d.DetectAll(data)
if err != nil {
Expand All @@ -117,7 +113,8 @@ func charset(charset string, data []byte) error {

if len(results) > 0 {
return validationError{
error: fmt.Sprintf("detected charset %q does not match expected %q", results[0].Charset, charset),
error: fmt.Sprintf("detected charset %q does not match expected %q", results[0].Charset, charset),
position: 1,
}
}

Expand Down Expand Up @@ -228,9 +225,12 @@ func maxLineLength(maxLength int, tabWidth int, data []byte) error {
if data[i] == cr || data[i] == lf {
break
}
if data[i] == tab {
switch {
case data[i] == tab:
length += tabWidth
} else {
case (data[i] >> 6) == 0b10:
// skip 0b10xxxxxx bytes, which are UTF-8 continuation bytes
default:
length++
}
if length > maxLength && breakingPosition == 0 {
Expand All @@ -240,8 +240,8 @@ func maxLineLength(maxLength int, tabWidth int, data []byte) error {

if length > maxLength {
return validationError{
error: fmt.Sprintf("line is too long (%d > %d)", length+1, maxLength),
position: breakingPosition,
error: fmt.Sprintf("line is too long (%d > %d)", length, maxLength),
position: breakingPosition + 1,
}
}

Expand Down
35 changes: 6 additions & 29 deletions validators_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,34 +8,6 @@ import (
tlogr "github.com/go-logr/logr/testing"
)

func TestCharsetUsingBOMFailure(t *testing.T) {
tests := []struct {
Charset string
}{
{
Charset: "utf-8 bom",
}, {
Charset: "utf-16le",
}, {
Charset: "utf-16be",
}, {
Charset: "utf-32le",
}, {
Charset: "utf-32be",
},
}
for _, tc := range tests {
tc := tc
t.Run(tc.Charset, func(t *testing.T) {
t.Parallel()
ok, err := charsetUsingBOM(tc.Charset, []byte{})
if ok || err == nil {
t.Error("an error was expected")
}
})
}
}

func TestCharset(t *testing.T) {
tests := []struct {
Name string
Expand Down Expand Up @@ -78,7 +50,7 @@ func TestCharset(t *testing.T) {
for _, tc := range tests {
tc := tc
t.Run(tc.Name, func(t *testing.T) {
//t.Parallel()
t.Parallel()

def := &editorconfig.Definition{
Charset: tc.Charset,
Expand Down Expand Up @@ -402,6 +374,11 @@ func TestMaxLineLength(t *testing.T) {
MaxLineLength: 5,
TabWidth: 2,
Line: []byte("\t\t.\n"),
}, {
Name: "utf-8 encoded characters",
MaxLineLength: 1,
TabWidth: 0,
Line: []byte("é\n"),
},
}
for _, tc := range tests {
Expand Down

0 comments on commit 0a58a4e

Please sign in to comment.