tools/internal/parser: sanitize input to clean, valid UTF-8 (publicsuffix#2005)

* tools/internal/parser: sanitize input to clean, valid UTF-8

  The PSL's canonical encoding is valid UTF-8 with no BOM. However, to try and report useful lint errors, the parser tries to detect and normalize all forms of UTF-16, as well as UTF-8 with BOM. Anything other than the specified canonical encoding is reported in validation errors.

* tools/internal/parser: make invalid encoding tests easier to read
tawk · Jun 24, 2024 · a2f48e0 · a2f48e0
1 parent bc648fe
commit a2f48e0
Show file tree

Hide file tree

Showing 8 changed files with 553 additions and 40 deletions.
diff --git a/tools/go.mod b/tools/go.mod
@@ -2,4 +2,7 @@ module github.com/publicsuffix/list/tools
 
 go 1.21
 
-require github.com/google/go-cmp v0.6.0
+require (
+	github.com/google/go-cmp v0.6.0
+	golang.org/x/text v0.16.0
+)
diff --git a/tools/go.sum b/tools/go.sum
@@ -1,2 +1,4 @@
 github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
 github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
+golang.org/x/text v0.16.0 h1:a94ExnEXNtEwYLGJSIUxnWoxoRz/ZcCsV63ROupILh4=
+golang.org/x/text v0.16.0/go.mod h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI=
diff --git a/tools/govalidate/govalidate.go b/tools/govalidate/govalidate.go
@@ -30,7 +30,7 @@ func main() {
 		os.Exit(1)
 	}
 
-	psl := parser.Parse(string(bs))
+	psl := parser.Parse(bs)
 
 	for _, err := range psl.Errors {
 		fmt.Println(err)

diff --git a/tools/internal/parser/errors.go b/tools/internal/parser/errors.go
@@ -4,6 +4,61 @@ import (
 	"fmt"
 )
 
+// InvalidEncodingError reports that the input is encoded with
+// something other than UTF-8.
+type InvalidEncodingError struct {
+	Encoding string
+}
+
+func (e InvalidEncodingError) Error() string {
+	return fmt.Sprintf("file uses invalid character encoding %s", e.Encoding)
+}
+
+// UTF8BOMError reports that the input has an unnecessary UTF-8 byte
+// order mark (BOM) at the start.
+type UTF8BOMError struct{}
+
+func (e UTF8BOMError) Error() string {
+	return "file starts with an unnecessary UTF-8 BOM (byte order mark)"
+}
+
+// InvalidUTF8Error reports that a line contains bytes that are not
+// valid UTF-8.
+type InvalidUTF8Error struct {
+	Line Source
+}
+
+func (e InvalidUTF8Error) Error() string {
+	return fmt.Sprintf("found non UTF-8 bytes at %s", e.Line.LocationString())
+}
+
+// DOSNewlineError reports that a line has a DOS style line ending.
+type DOSNewlineError struct {
+	Line Source
+}
+
+func (e DOSNewlineError) Error() string {
+	return fmt.Sprintf("%s has a DOS line ending (\\r\\n instead of just \\n)", e.Line.LocationString())
+}
+
+// TrailingWhitespaceError reports that a line has trailing whitespace.
+type TrailingWhitespaceError struct {
+	Line Source
+}
+
+func (e TrailingWhitespaceError) Error() string {
+	return fmt.Sprintf("%s has trailing whitespace", e.Line.LocationString())
+}
+
+// LeadingWhitespaceError reports that a line has leading whitespace.
+type LeadingWhitespaceError struct {
+	Line Source
+}
+
+func (e LeadingWhitespaceError) Error() string {
+	return fmt.Sprintf("%s has leading whitespace", e.Line.LocationString())
+}
+
 // UnclosedSectionError reports that a file section was not closed
 // properly before EOF.
 type UnclosedSectionError struct {

diff --git a/tools/internal/parser/parser.go b/tools/internal/parser/parser.go
@@ -7,7 +7,7 @@ import (
 	"strings"
 )
 
-// Parse parses src as a PSL file and returns the parse result.
+// Parse parses bs as a PSL file and returns the parse result.
 //
 // The parser tries to keep going when it encounters errors. Parse and
 // validation errors are accumulated in the Errors field of the
@@ -19,15 +19,19 @@ import (
 // submission guidelines
 // (https://github.com/publicsuffix/list/wiki/Guidelines). A File with
 // errors should not be used to calculate public suffixes for FQDNs.
-func Parse(src string) *File {
-	return parseWithExceptions(src, downgradeToWarning)
+func Parse(bs []byte) *File {
+	return parseWithExceptions(bs, downgradeToWarning)
 }
 
-func parseWithExceptions(src string, downgradeToWarning func(error) bool) *File {
+func parseWithExceptions(bs []byte, downgradeToWarning func(error) bool) *File {
+	src, errs := newSource(bs)
 	p := parser{
 		downgradeToWarning: downgradeToWarning,
 	}
-	p.Parse(newSource(src))
+	for _, err := range errs {
+		p.addError(err)
+	}
+	p.Parse(src)
 	p.Validate()
 	return &p.File
 }

diff --git a/tools/internal/parser/parser_test.go b/tools/internal/parser/parser_test.go
@@ -27,19 +27,19 @@ func TestParser(t *testing.T) {
 
 	tests := []struct {
 		name               string
-		psl                string
+		psl                []byte
 		downgradeToWarning func(error) bool
 		want               File
 	}{
 		{
 			name: "empty",
-			psl:  "",
+			psl:  byteLines(""),
 			want: File{},
 		},
 
 		{
 			name: "just_comments",
-			psl: lines(
+			psl: byteLines(
 				"// This is an empty PSL file.",
 				"",
 				"// Here is a second comment.",
@@ -54,7 +54,7 @@ func TestParser(t *testing.T) {
 
 		{
 			name: "just_suffixes",
-			psl: lines(
+			psl: byteLines(
 				"example.com",
 				"other.example.com",
 				"*.example.org",
@@ -87,7 +87,7 @@ func TestParser(t *testing.T) {
 
 		{
 			name: "empty_sections",
-			psl: lines(
+			psl: byteLines(
 				"// ===BEGIN IMAGINARY DOMAINS===",
 				"",
 				"// ===END IMAGINARY DOMAINS===",
@@ -118,7 +118,7 @@ func TestParser(t *testing.T) {
 
 		{
 			name: "missing_section_end",
-			psl: lines(
+			psl: byteLines(
 				"// ===BEGIN ICANN DOMAINS===",
 			),
 			want: File{
@@ -141,7 +141,7 @@ func TestParser(t *testing.T) {
 
 		{
 			name: "nested_sections",
-			psl: lines(
+			psl: byteLines(
 				"// ===BEGIN ICANN DOMAINS===",
 				"// ===BEGIN SECRET DOMAINS===",
 				"// ===END SECRET DOMAINS===",
@@ -188,7 +188,7 @@ func TestParser(t *testing.T) {
 		},
 		{
 			name: "mismatched_sections",
-			psl: lines(
+			psl: byteLines(
 				"// ===BEGIN ICANN DOMAINS===",
 				"",
 				"// ===END PRIVATE DOMAINS===",
@@ -221,7 +221,7 @@ func TestParser(t *testing.T) {
 
 		{
 			name: "unknown_section_header",
-			psl: lines(
+			psl: byteLines(
 				"// ===TRANSFORM DOMAINS===",
 			),
 			want: File{
@@ -240,7 +240,7 @@ func TestParser(t *testing.T) {
 
 		{
 			name: "suffixes_with_unstructured_header",
-			psl: lines(
+			psl: byteLines(
 				"// Unstructured header.",
 				"// I'm just going on about random things.",
 				"example.com",
@@ -271,7 +271,7 @@ func TestParser(t *testing.T) {
 
 		{
 			name: "suffixes_with_canonical_private_header",
-			psl: lines(
+			psl: byteLines(
 				"// DuckCorp Inc: https://example.com",
 				"// Submitted by Not A Duck <[email protected]>",
 				"// Seriously, not a duck",
@@ -307,7 +307,7 @@ func TestParser(t *testing.T) {
 
 		{
 			name: "suffixes_with_entity_and_submitter",
-			psl: lines(
+			psl: byteLines(
 				"// DuckCorp Inc: submitted by Not A Duck <[email protected]>",
 				"example.com",
 			),
@@ -333,7 +333,7 @@ func TestParser(t *testing.T) {
 
 		{
 			name: "suffixes_with_all_separate_lines",
-			psl: lines(
+			psl: byteLines(
 				"// DuckCorp Inc",
 				"// https://example.com",
 				"// Submitted by Not A Duck <[email protected]>",
@@ -366,7 +366,7 @@ func TestParser(t *testing.T) {
 
 		{
 			name: "suffixes_standard_header_submitter_first",
-			psl: lines(
+			psl: byteLines(
 				"// Submitted by Not A Duck <[email protected]>",
 				"// DuckCorp Inc: https://example.com",
 				"example.com",
@@ -396,7 +396,7 @@ func TestParser(t *testing.T) {
 
 		{
 			name: "suffixes_standard_header_leading_unstructured",
-			psl: lines(
+			psl: byteLines(
 				"// This is an unstructured comment.",
 				"// DuckCorp Inc: https://example.com",
 				"// Submitted by Not A Duck <[email protected]>",
@@ -429,7 +429,7 @@ func TestParser(t *testing.T) {
 
 		{
 			name: "legacy_error_downgrade",
-			psl: lines(
+			psl: byteLines(
 				"// https://example.com",
 				"example.com",
 			),
@@ -476,7 +476,7 @@ func TestParser(t *testing.T) {
 			// Regression test for Future Versatile Group, who use a
 			// unicode fullwidth colon in their header.
 			name: "unicode_colon",
-			psl: lines(
+			psl: byteLines(
 				"// Future Versatile Group：https://example.org",
 				"example.com",
 			),
@@ -501,7 +501,7 @@ func TestParser(t *testing.T) {
 			// Regression test for a few blocks that start with "name
 			// (url)" instead of the more common "name: url".
 			name: "url_in_parens",
-			psl: lines(
+			psl: byteLines(
 				"// Parens Appreciation Society (https://example.org)",
 				"example.com",
 			),
@@ -527,7 +527,7 @@ func TestParser(t *testing.T) {
 			// (url)" style don't have a scheme on their URL, so
 			// require a bit more fudging to parse.
 			name: "url_in_parens_no_scheme",
-			psl: lines(
+			psl: byteLines(
 				"// Parens Appreciation Society (hostyhosting.com)",
 				"example.com",
 				"",
@@ -569,7 +569,7 @@ func TestParser(t *testing.T) {
 			// lines, or you might overwrite the correct answer with
 			// something else that happens to have the right shape.
 			name: "accept_first_valid_entity",
-			psl: lines(
+			psl: byteLines(
 				"// cd : https://en.wikipedia.org/wiki/.cd",
 				"// see also: https://www.nic.cd/domain/insertDomain_2.jsp?act=1",
 				"cd",
@@ -645,7 +645,7 @@ func TestParseRealList(t *testing.T) {
 		t.Fatal(err)
 	}
 
-	f := Parse(string(bs))
+	f := Parse(bs)
 
 	for _, err := range f.Errors {
 		t.Errorf("Parse error: %v", err)
@@ -661,7 +661,7 @@ func TestRoundtripRealList(t *testing.T) {
 	if err != nil {
 		t.Fatal(err)
 	}
-	f := Parse(string(bs))
+	f := Parse(bs)
 
 	if len(f.Errors) > 0 {
 		t.Fatal("Parse errors, not attempting to roundtrip")
@@ -700,7 +700,7 @@ func TestRoundtripRealListDetailed(t *testing.T) {
 	if err != nil {
 		t.Fatal(err)
 	}
-	f := Parse(string(bs))
+	f := Parse(bs)
 
 	if len(f.Errors) > 0 {
 		t.Fatal("Parse errors, not attempting to roundtrip")
@@ -761,7 +761,7 @@ func TestExceptionsStillNecessary(t *testing.T) {
 		defer func() { missingEmail = old }()
 		missingEmail = trimmed
 
-		f := Parse(string(bs))
+		f := Parse(bs)
 		if len(f.Errors) == 0 {
 			t.Errorf("missingEmail exception no longer necessary:\n%s", omitted)
 		}