forked from publicsuffix/list
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
tool/internal/parser: sanitize input to clean, valid UTF-8 (publicsuffix#2005)
* tool/internal/parser: sanitize input to clean, valid UTF-8 The PSL's canonical encoding is valid UTF-8 with no BOM. However, to try to report useful lint errors, the parser tries to detect and normalize all forms of UTF-16, as well as UTF-8 with BOM. Anything other than the specified canonical encoding is reported in validation errors. * tools/internal/parser: make invalid encoding tests easier to read
- Loading branch information
Showing
8 changed files
with
553 additions
and
40 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,4 @@ | ||
github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= | ||
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= | ||
golang.org/x/text v0.16.0 h1:a94ExnEXNtEwYLGJSIUxnWoxoRz/ZcCsV63ROupILh4= | ||
golang.org/x/text v0.16.0/go.mod h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI= |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -27,19 +27,19 @@ func TestParser(t *testing.T) { | |
|
||
tests := []struct { | ||
name string | ||
psl string | ||
psl []byte | ||
downgradeToWarning func(error) bool | ||
want File | ||
}{ | ||
{ | ||
name: "empty", | ||
psl: "", | ||
psl: byteLines(""), | ||
want: File{}, | ||
}, | ||
|
||
{ | ||
name: "just_comments", | ||
psl: lines( | ||
psl: byteLines( | ||
"// This is an empty PSL file.", | ||
"", | ||
"// Here is a second comment.", | ||
|
@@ -54,7 +54,7 @@ func TestParser(t *testing.T) { | |
|
||
{ | ||
name: "just_suffixes", | ||
psl: lines( | ||
psl: byteLines( | ||
"example.com", | ||
"other.example.com", | ||
"*.example.org", | ||
|
@@ -87,7 +87,7 @@ func TestParser(t *testing.T) { | |
|
||
{ | ||
name: "empty_sections", | ||
psl: lines( | ||
psl: byteLines( | ||
"// ===BEGIN IMAGINARY DOMAINS===", | ||
"", | ||
"// ===END IMAGINARY DOMAINS===", | ||
|
@@ -118,7 +118,7 @@ func TestParser(t *testing.T) { | |
|
||
{ | ||
name: "missing_section_end", | ||
psl: lines( | ||
psl: byteLines( | ||
"// ===BEGIN ICANN DOMAINS===", | ||
), | ||
want: File{ | ||
|
@@ -141,7 +141,7 @@ func TestParser(t *testing.T) { | |
|
||
{ | ||
name: "nested_sections", | ||
psl: lines( | ||
psl: byteLines( | ||
"// ===BEGIN ICANN DOMAINS===", | ||
"// ===BEGIN SECRET DOMAINS===", | ||
"// ===END SECRET DOMAINS===", | ||
|
@@ -188,7 +188,7 @@ func TestParser(t *testing.T) { | |
}, | ||
{ | ||
name: "mismatched_sections", | ||
psl: lines( | ||
psl: byteLines( | ||
"// ===BEGIN ICANN DOMAINS===", | ||
"", | ||
"// ===END PRIVATE DOMAINS===", | ||
|
@@ -221,7 +221,7 @@ func TestParser(t *testing.T) { | |
|
||
{ | ||
name: "unknown_section_header", | ||
psl: lines( | ||
psl: byteLines( | ||
"// ===TRANSFORM DOMAINS===", | ||
), | ||
want: File{ | ||
|
@@ -240,7 +240,7 @@ func TestParser(t *testing.T) { | |
|
||
{ | ||
name: "suffixes_with_unstructured_header", | ||
psl: lines( | ||
psl: byteLines( | ||
"// Unstructured header.", | ||
"// I'm just going on about random things.", | ||
"example.com", | ||
|
@@ -271,7 +271,7 @@ func TestParser(t *testing.T) { | |
|
||
{ | ||
name: "suffixes_with_canonical_private_header", | ||
psl: lines( | ||
psl: byteLines( | ||
"// DuckCorp Inc: https://example.com", | ||
"// Submitted by Not A Duck <[email protected]>", | ||
"// Seriously, not a duck", | ||
|
@@ -307,7 +307,7 @@ func TestParser(t *testing.T) { | |
|
||
{ | ||
name: "suffixes_with_entity_and_submitter", | ||
psl: lines( | ||
psl: byteLines( | ||
"// DuckCorp Inc: submitted by Not A Duck <[email protected]>", | ||
"example.com", | ||
), | ||
|
@@ -333,7 +333,7 @@ func TestParser(t *testing.T) { | |
|
||
{ | ||
name: "suffixes_with_all_separate_lines", | ||
psl: lines( | ||
psl: byteLines( | ||
"// DuckCorp Inc", | ||
"// https://example.com", | ||
"// Submitted by Not A Duck <[email protected]>", | ||
|
@@ -366,7 +366,7 @@ func TestParser(t *testing.T) { | |
|
||
{ | ||
name: "suffixes_standard_header_submitter_first", | ||
psl: lines( | ||
psl: byteLines( | ||
"// Submitted by Not A Duck <[email protected]>", | ||
"// DuckCorp Inc: https://example.com", | ||
"example.com", | ||
|
@@ -396,7 +396,7 @@ func TestParser(t *testing.T) { | |
|
||
{ | ||
name: "suffixes_standard_header_leading_unstructured", | ||
psl: lines( | ||
psl: byteLines( | ||
"// This is an unstructured comment.", | ||
"// DuckCorp Inc: https://example.com", | ||
"// Submitted by Not A Duck <[email protected]>", | ||
|
@@ -429,7 +429,7 @@ func TestParser(t *testing.T) { | |
|
||
{ | ||
name: "legacy_error_downgrade", | ||
psl: lines( | ||
psl: byteLines( | ||
"// https://example.com", | ||
"example.com", | ||
), | ||
|
@@ -476,7 +476,7 @@ func TestParser(t *testing.T) { | |
// Regression test for Future Versatile Group, who use a | ||
// unicode fullwidth colon in their header. | ||
name: "unicode_colon", | ||
psl: lines( | ||
psl: byteLines( | ||
"// Future Versatile Group:https://example.org", | ||
"example.com", | ||
), | ||
|
@@ -501,7 +501,7 @@ func TestParser(t *testing.T) { | |
// Regression test for a few blocks that start with "name | ||
// (url)" instead of the more common "name: url". | ||
name: "url_in_parens", | ||
psl: lines( | ||
psl: byteLines( | ||
"// Parens Appreciation Society (https://example.org)", | ||
"example.com", | ||
), | ||
|
@@ -527,7 +527,7 @@ func TestParser(t *testing.T) { | |
// (url)" style don't have a scheme on their URL, so | ||
// require a bit more fudging to parse. | ||
name: "url_in_parens_no_scheme", | ||
psl: lines( | ||
psl: byteLines( | ||
"// Parens Appreciation Society (hostyhosting.com)", | ||
"example.com", | ||
"", | ||
|
@@ -569,7 +569,7 @@ func TestParser(t *testing.T) { | |
// lines, or you might overwrite the correct answer with | ||
// something else that happens to have the right shape. | ||
name: "accept_first_valid_entity", | ||
psl: lines( | ||
psl: byteLines( | ||
"// cd : https://en.wikipedia.org/wiki/.cd", | ||
"// see also: https://www.nic.cd/domain/insertDomain_2.jsp?act=1", | ||
"cd", | ||
|
@@ -645,7 +645,7 @@ func TestParseRealList(t *testing.T) { | |
t.Fatal(err) | ||
} | ||
|
||
f := Parse(string(bs)) | ||
f := Parse(bs) | ||
|
||
for _, err := range f.Errors { | ||
t.Errorf("Parse error: %v", err) | ||
|
@@ -661,7 +661,7 @@ func TestRoundtripRealList(t *testing.T) { | |
if err != nil { | ||
t.Fatal(err) | ||
} | ||
f := Parse(string(bs)) | ||
f := Parse(bs) | ||
|
||
if len(f.Errors) > 0 { | ||
t.Fatal("Parse errors, not attempting to roundtrip") | ||
|
@@ -700,7 +700,7 @@ func TestRoundtripRealListDetailed(t *testing.T) { | |
if err != nil { | ||
t.Fatal(err) | ||
} | ||
f := Parse(string(bs)) | ||
f := Parse(bs) | ||
|
||
if len(f.Errors) > 0 { | ||
t.Fatal("Parse errors, not attempting to roundtrip") | ||
|
@@ -761,7 +761,7 @@ func TestExceptionsStillNecessary(t *testing.T) { | |
defer func() { missingEmail = old }() | ||
missingEmail = trimmed | ||
|
||
f := Parse(string(bs)) | ||
f := Parse(bs) | ||
if len(f.Errors) == 0 { | ||
t.Errorf("missingEmail exception no longer necessary:\n%s", omitted) | ||
} | ||
|
Oops, something went wrong.