Skip to content

Commit

Permalink
tool/internal/parser: sanitize input to clean, valid UTF-8 (publicsuf…
Browse files Browse the repository at this point in the history
…fix#2005)

* tools/internal/parser: sanitize input to clean, valid UTF-8

The PSL's canonical encoding is valid UTF-8 with no BOM. However, in order to report
useful lint errors, the parser tries to detect and normalize all forms of UTF-16, as well
as UTF-8 with a BOM. Anything other than the specified canonical encoding is reported as a
validation error.

* tools/internal/parser: make invalid encoding tests easier to read
  • Loading branch information
danderson authored Jun 24, 2024
1 parent bc648fe commit a2f48e0
Show file tree
Hide file tree
Showing 8 changed files with 553 additions and 40 deletions.
5 changes: 4 additions & 1 deletion tools/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,7 @@ module github.com/publicsuffix/list/tools

go 1.21

require github.com/google/go-cmp v0.6.0
require (
github.com/google/go-cmp v0.6.0
golang.org/x/text v0.16.0
)
2 changes: 2 additions & 0 deletions tools/go.sum
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
golang.org/x/text v0.16.0 h1:a94ExnEXNtEwYLGJSIUxnWoxoRz/ZcCsV63ROupILh4=
golang.org/x/text v0.16.0/go.mod h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI=
2 changes: 1 addition & 1 deletion tools/govalidate/govalidate.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ func main() {
os.Exit(1)
}

psl := parser.Parse(string(bs))
psl := parser.Parse(bs)

for _, err := range psl.Errors {
fmt.Println(err)
Expand Down
55 changes: 55 additions & 0 deletions tools/internal/parser/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,61 @@ import (
"fmt"
)

// InvalidEncodingError is the error produced when the input's
// character encoding is anything other than UTF-8. Encoding is the
// encoding name that gets included in the error message.
type InvalidEncodingError struct {
	Encoding string
}

// Error implements the error interface.
func (e InvalidEncodingError) Error() string {
	const prefix = "file uses invalid character encoding "
	// Both operands are strings, so Sprint joins them without
	// inserting a separator.
	return fmt.Sprint(prefix, e.Encoding)
}

// UTF8BOMError is the error produced when the input begins with a
// UTF-8 byte order mark (BOM), which is not needed.
type UTF8BOMError struct{}

// Error implements the error interface. The message is fixed, since
// the error carries no fields.
func (UTF8BOMError) Error() string {
	const msg = "file starts with an unnecessary UTF-8 BOM (byte order mark)"
	return msg
}

// InvalidUTF8Error is the error produced when a line of the input
// contains bytes that do not form valid UTF-8. Line identifies the
// offending line.
type InvalidUTF8Error struct {
	Line Source
}

// Error implements the error interface, naming the location of the
// offending line.
func (err InvalidUTF8Error) Error() string {
	where := err.Line.LocationString()
	return fmt.Sprintf("found non UTF-8 bytes at %s", where)
}

// DOSNewlineError is the error produced when a line ends with a DOS
// style line ending. Line identifies the offending line.
type DOSNewlineError struct {
	Line Source
}

// Error implements the error interface, naming the location of the
// offending line.
func (err DOSNewlineError) Error() string {
	// Raw string literal: the message shows the escape sequences
	// literally (backslash-r, backslash-n), not actual control
	// characters.
	const format = `%s has a DOS line ending (\r\n instead of just \n)`
	where := err.Line.LocationString()
	return fmt.Sprintf(format, where)
}

// TrailingWhitespaceError is the error produced when a line ends
// with whitespace. Line identifies the offending line.
type TrailingWhitespaceError struct {
	Line Source
}

// Error implements the error interface, naming the location of the
// offending line.
func (err TrailingWhitespaceError) Error() string {
	where := err.Line.LocationString()
	return fmt.Sprintf("%s has trailing whitespace", where)
}

// LeadingWhitespaceError is the error produced when a line begins
// with whitespace. Line identifies the offending line.
type LeadingWhitespaceError struct {
	Line Source
}

// Error implements the error interface, naming the location of the
// offending line.
func (err LeadingWhitespaceError) Error() string {
	where := err.Line.LocationString()
	return fmt.Sprintf("%s has leading whitespace", where)
}

// UnclosedSectionError reports that a file section was not closed
// properly before EOF.
type UnclosedSectionError struct {
Expand Down
14 changes: 9 additions & 5 deletions tools/internal/parser/parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import (
"strings"
)

// Parse parses src as a PSL file and returns the parse result.
// Parse parses bs as a PSL file and returns the parse result.
//
// The parser tries to keep going when it encounters errors. Parse and
// validation errors are accumulated in the Errors field of the
Expand All @@ -19,15 +19,19 @@ import (
// submission guidelines
// (https://github.com/publicsuffix/list/wiki/Guidelines). A File with
// errors should not be used to calculate public suffixes for FQDNs.
func Parse(src string) *File {
return parseWithExceptions(src, downgradeToWarning)
func Parse(bs []byte) *File {
return parseWithExceptions(bs, downgradeToWarning)
}

func parseWithExceptions(src string, downgradeToWarning func(error) bool) *File {
func parseWithExceptions(bs []byte, downgradeToWarning func(error) bool) *File {
src, errs := newSource(bs)
p := parser{
downgradeToWarning: downgradeToWarning,
}
p.Parse(newSource(src))
for _, err := range errs {
p.addError(err)
}
p.Parse(src)
p.Validate()
return &p.File
}
Expand Down
48 changes: 24 additions & 24 deletions tools/internal/parser/parser_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,19 +27,19 @@ func TestParser(t *testing.T) {

tests := []struct {
name string
psl string
psl []byte
downgradeToWarning func(error) bool
want File
}{
{
name: "empty",
psl: "",
psl: byteLines(""),
want: File{},
},

{
name: "just_comments",
psl: lines(
psl: byteLines(
"// This is an empty PSL file.",
"",
"// Here is a second comment.",
Expand All @@ -54,7 +54,7 @@ func TestParser(t *testing.T) {

{
name: "just_suffixes",
psl: lines(
psl: byteLines(
"example.com",
"other.example.com",
"*.example.org",
Expand Down Expand Up @@ -87,7 +87,7 @@ func TestParser(t *testing.T) {

{
name: "empty_sections",
psl: lines(
psl: byteLines(
"// ===BEGIN IMAGINARY DOMAINS===",
"",
"// ===END IMAGINARY DOMAINS===",
Expand Down Expand Up @@ -118,7 +118,7 @@ func TestParser(t *testing.T) {

{
name: "missing_section_end",
psl: lines(
psl: byteLines(
"// ===BEGIN ICANN DOMAINS===",
),
want: File{
Expand All @@ -141,7 +141,7 @@ func TestParser(t *testing.T) {

{
name: "nested_sections",
psl: lines(
psl: byteLines(
"// ===BEGIN ICANN DOMAINS===",
"// ===BEGIN SECRET DOMAINS===",
"// ===END SECRET DOMAINS===",
Expand Down Expand Up @@ -188,7 +188,7 @@ func TestParser(t *testing.T) {
},
{
name: "mismatched_sections",
psl: lines(
psl: byteLines(
"// ===BEGIN ICANN DOMAINS===",
"",
"// ===END PRIVATE DOMAINS===",
Expand Down Expand Up @@ -221,7 +221,7 @@ func TestParser(t *testing.T) {

{
name: "unknown_section_header",
psl: lines(
psl: byteLines(
"// ===TRANSFORM DOMAINS===",
),
want: File{
Expand All @@ -240,7 +240,7 @@ func TestParser(t *testing.T) {

{
name: "suffixes_with_unstructured_header",
psl: lines(
psl: byteLines(
"// Unstructured header.",
"// I'm just going on about random things.",
"example.com",
Expand Down Expand Up @@ -271,7 +271,7 @@ func TestParser(t *testing.T) {

{
name: "suffixes_with_canonical_private_header",
psl: lines(
psl: byteLines(
"// DuckCorp Inc: https://example.com",
"// Submitted by Not A Duck <[email protected]>",
"// Seriously, not a duck",
Expand Down Expand Up @@ -307,7 +307,7 @@ func TestParser(t *testing.T) {

{
name: "suffixes_with_entity_and_submitter",
psl: lines(
psl: byteLines(
"// DuckCorp Inc: submitted by Not A Duck <[email protected]>",
"example.com",
),
Expand All @@ -333,7 +333,7 @@ func TestParser(t *testing.T) {

{
name: "suffixes_with_all_separate_lines",
psl: lines(
psl: byteLines(
"// DuckCorp Inc",
"// https://example.com",
"// Submitted by Not A Duck <[email protected]>",
Expand Down Expand Up @@ -366,7 +366,7 @@ func TestParser(t *testing.T) {

{
name: "suffixes_standard_header_submitter_first",
psl: lines(
psl: byteLines(
"// Submitted by Not A Duck <[email protected]>",
"// DuckCorp Inc: https://example.com",
"example.com",
Expand Down Expand Up @@ -396,7 +396,7 @@ func TestParser(t *testing.T) {

{
name: "suffixes_standard_header_leading_unstructured",
psl: lines(
psl: byteLines(
"// This is an unstructured comment.",
"// DuckCorp Inc: https://example.com",
"// Submitted by Not A Duck <[email protected]>",
Expand Down Expand Up @@ -429,7 +429,7 @@ func TestParser(t *testing.T) {

{
name: "legacy_error_downgrade",
psl: lines(
psl: byteLines(
"// https://example.com",
"example.com",
),
Expand Down Expand Up @@ -476,7 +476,7 @@ func TestParser(t *testing.T) {
// Regression test for Future Versatile Group, who use a
// unicode fullwidth colon in their header.
name: "unicode_colon",
psl: lines(
psl: byteLines(
"// Future Versatile Group:https://example.org",
"example.com",
),
Expand All @@ -501,7 +501,7 @@ func TestParser(t *testing.T) {
// Regression test for a few blocks that start with "name
// (url)" instead of the more common "name: url".
name: "url_in_parens",
psl: lines(
psl: byteLines(
"// Parens Appreciation Society (https://example.org)",
"example.com",
),
Expand All @@ -527,7 +527,7 @@ func TestParser(t *testing.T) {
// (url)" style don't have a scheme on their URL, so
// require a bit more fudging to parse.
name: "url_in_parens_no_scheme",
psl: lines(
psl: byteLines(
"// Parens Appreciation Society (hostyhosting.com)",
"example.com",
"",
Expand Down Expand Up @@ -569,7 +569,7 @@ func TestParser(t *testing.T) {
// lines, or you might overwrite the correct answer with
// something else that happens to have the right shape.
name: "accept_first_valid_entity",
psl: lines(
psl: byteLines(
"// cd : https://en.wikipedia.org/wiki/.cd",
"// see also: https://www.nic.cd/domain/insertDomain_2.jsp?act=1",
"cd",
Expand Down Expand Up @@ -645,7 +645,7 @@ func TestParseRealList(t *testing.T) {
t.Fatal(err)
}

f := Parse(string(bs))
f := Parse(bs)

for _, err := range f.Errors {
t.Errorf("Parse error: %v", err)
Expand All @@ -661,7 +661,7 @@ func TestRoundtripRealList(t *testing.T) {
if err != nil {
t.Fatal(err)
}
f := Parse(string(bs))
f := Parse(bs)

if len(f.Errors) > 0 {
t.Fatal("Parse errors, not attempting to roundtrip")
Expand Down Expand Up @@ -700,7 +700,7 @@ func TestRoundtripRealListDetailed(t *testing.T) {
if err != nil {
t.Fatal(err)
}
f := Parse(string(bs))
f := Parse(bs)

if len(f.Errors) > 0 {
t.Fatal("Parse errors, not attempting to roundtrip")
Expand Down Expand Up @@ -761,7 +761,7 @@ func TestExceptionsStillNecessary(t *testing.T) {
defer func() { missingEmail = old }()
missingEmail = trimmed

f := Parse(string(bs))
f := Parse(bs)
if len(f.Errors) == 0 {
t.Errorf("missingEmail exception no longer necessary:\n%s", omitted)
}
Expand Down
Loading

0 comments on commit a2f48e0

Please sign in to comment.