forked from google/webrisk
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathurls.go
517 lines (477 loc) · 13.9 KB
/
urls.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package webrisk
// The logic below deals with extracting patterns from a URL.
// Patterns are all the possible host-suffix and path-prefix fragments for
// the input URL.
//
// For example, the patterns for the given URL are the following:
// input: "http://a.b.c/1/2.html?param=1/2"
// patterns: [
// "a.b.c/1/2.html?param=1/2",
// "a.b.c/1/2.html",
// "a.b.c/1/",
// "a.b.c/",
// "b.c/1/2.html?param=1/2",
// "b.c/1/2.html",
// "b.c/1/",
// "b.c/"
// ]
//
// The process that Web Risk uses predates Chrome and many RFC standards
// and is partly based on how legacy browsers typically parse URLs. Thus, we
// parse URLs in a way that is not strictly standards compliant.
import (
"bytes"
"errors"
"fmt"
"net"
"net/url"
"path"
"regexp"
"strconv"
"strings"
"unicode"
"golang.org/x/net/idna"
)
// Regular expressions used while canonicalizing hosts. They are compiled once
// at package initialization.
var (
// dotsRegexp matches a run of one or more consecutive '.' characters.
// parseHost uses it to collapse repeated dots into a single dot.
dotsRegexp = regexp.MustCompile("[.]+")
// portRegexp matches a ':' followed by decimal digits at the end of the
// string. parseHost uses it to strip a trailing port.
portRegexp = regexp.MustCompile(`:\d+$`)
// possibleIPRegexp matches (case-insensitively) a string made up entirely of
// "0x"-prefixed hexadecimal numbers, decimal digits, and dots. parseIPAddress
// uses it as a pre-filter before attempting numeric parsing.
possibleIPRegexp = regexp.MustCompile(`^(?i)((?:0x[0-9a-f]+|[0-9\.])+)$`)
// trailingSpaceRegexp matches, at the start of the string, four
// dot-separated groups of one to three decimal digits followed by a single
// space. parseIPAddress uses it to drop whatever follows the space.
trailingSpaceRegexp = regexp.MustCompile(`^(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) `)
)
// ValidURL reports whether url can be parsed as a Web Risk compatible URL.
//
// Most clients can (and should) call LookupURLs directly, since it performs
// the same checks internally. ValidURL is useful for pre-screening a batch of
// URLs, because the first parse failure makes LookupURLs stop processing the
// request and return an error.
func ValidURL(url string) bool {
	if parsed, err := parseURL(url); err != nil || parsed == nil {
		return false
	}
	return true
}
// generateHashes expands url into all of its lookup patterns and returns a map
// from the hash of each pattern (as computed by hashFromPattern) to the
// pattern it was derived from. An error from pattern generation is returned
// unchanged together with a nil map.
func generateHashes(url string) (map[hashPrefix]string, error) {
	patterns, err := generatePatterns(url)
	if err != nil {
		return nil, err
	}

	byHash := make(map[hashPrefix]string)
	for i := range patterns {
		pattern := patterns[i]
		byHash[hashFromPattern(pattern)] = pattern
	}
	return byHash, nil
}
// generatePatterns returns every host-suffix/path-prefix combination for the
// input URL. The result is ordered host-major: all paths for the first host,
// then all paths for the second host, and so on.
//
// An error from either the host or the path expansion is returned as-is.
func generatePatterns(url string) ([]string, error) {
	hostSuffixes, hostErr := generateLookupHosts(url)
	if hostErr != nil {
		return nil, hostErr
	}
	pathPrefixes, pathErr := generateLookupPaths(url)
	if pathErr != nil {
		return nil, pathErr
	}

	var combined []string
	for hi := range hostSuffixes {
		for pi := range pathPrefixes {
			combined = append(combined, hostSuffixes[hi]+pathPrefixes[pi])
		}
	}
	return combined, nil
}
// isHex reports whether c is an ASCII hexadecimal digit: '0'-'9', 'a'-'f',
// or 'A'-'F'.
func isHex(c byte) bool {
	return ('0' <= c && c <= '9') ||
		('a' <= c && c <= 'f') ||
		('A' <= c && c <= 'F')
}
// unhex returns the numeric value (0..15) of the hexadecimal digit c.
// Any byte that is not a hexadecimal digit maps to 0.
func unhex(c byte) byte {
	if '0' <= c && c <= '9' {
		return c - '0'
	}
	if 'a' <= c && c <= 'f' {
		return 10 + (c - 'a')
	}
	if 'A' <= c && c <= 'F' {
		return 10 + (c - 'A')
	}
	return 0
}
// isUnicode reports whether s contains at least one byte greater than 0x80.
//
// For legacy reasons the byte 0x80 itself is not treated as a Unicode
// character, so the comparison is strictly greater-than.
func isUnicode(s string) bool {
	for i := 0; i < len(s); i++ {
		if s[i] > 0x80 {
			return true
		}
	}
	return false
}
// split divides s at the first occurrence of the delimiter c.
//
// If s has the form t + c + u, split returns (t, u) when cutc is true and
// (t, c+u) when cutc is false; that is, cutc controls whether the delimiter is
// dropped or kept at the front of the second result.
// If c does not occur in s, split returns (s, "").
func split(s string, c string, cutc bool) (string, string) {
	at := strings.Index(s, c)
	switch {
	case at < 0:
		return s, ""
	case cutc:
		return s[:at], s[at+len(c):]
	default:
		return s[:at], s[at:]
	}
}
// escape returns the percent-encoded form of s.
//
// Bytes at or below the space character (0x20), bytes at or above 0x7f, and
// the characters '#' and '%' are written as "%XX" with uppercase hexadecimal
// digits. Every other byte is copied through unchanged.
func escape(s string) string {
	var out bytes.Buffer
	for i := 0; i < len(s); i++ {
		c := s[i]
		switch {
		case c <= ' ', c >= 0x7f, c == '#', c == '%':
			fmt.Fprintf(&out, "%%%02X", c)
		default:
			out.WriteByte(c)
		}
	}
	return out.String()
}
// unescape decodes one level of percent-encoding in s.
//
// Each "%XY" sequence where X and Y are both hexadecimal digits is replaced by
// the byte with that value. A '%' that is not followed by two hexadecimal
// digits is copied through unchanged, as is every other byte.
func unescape(s string) string {
	var out bytes.Buffer
	for i := 0; i < len(s); {
		// i+2 < len(s) guarantees that both digit positions exist.
		if s[i] == '%' && i+2 < len(s) && isHex(s[i+1]) && isHex(s[i+2]) {
			out.WriteByte(unhex(s[i+1])<<4 | unhex(s[i+2]))
			i += 3
			continue
		}
		out.WriteByte(s[i])
		i++
	}
	return out.String()
}
// recursiveUnescape applies unescape to s repeatedly until a pass leaves the
// string unchanged, and returns that fully decoded string.
//
// At most maxDepth passes are attempted. If the string is still changing after
// that many passes, an error is returned instead.
func recursiveUnescape(s string) (string, error) {
	const maxDepth = 1024

	current := s
	for remaining := maxDepth; remaining > 0; remaining-- {
		decoded := unescape(current)
		if decoded == current {
			return current, nil
		}
		current = decoded
	}
	return "", errors.New("webrisk: unescaping is too recursive")
}
// normalizeEscape brings s into a form that is percent-encoded exactly once:
// it fully decodes s with recursiveUnescape and then encodes the result with
// escape. It reports an error if the string could not be fully decoded.
func normalizeEscape(s string) (string, error) {
	decoded, err := recursiveUnescape(s)
	if err == nil {
		return escape(decoded), nil
	}
	return "", err
}
// getScheme splits url at the first ':' into (scheme, path), where scheme is
// the protocol and path is everything after the colon.
//
// A scheme may contain ASCII letters anywhere, and digits, '+', '-' and '.'
// anywhere except the first position. If any other byte is seen before a ':',
// if the first byte is a digit, '+', '-' or '.', or if there is no ':' at all,
// no scheme is recognized and ("", url) is returned.
//
// Note that a url whose very first byte is ':' yields an empty scheme and the
// remainder after the colon, not the unmodified url.
func getScheme(url string) (scheme, path string) {
	for i := 0; i < len(url); i++ {
		c := url[i]
		letter := ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')
		nonLeading := ('0' <= c && c <= '9') || c == '+' || c == '-' || c == '.'

		if letter || (nonLeading && i > 0) {
			continue
		}
		if c == ':' {
			return url[:i], url[i+1:]
		}
		// Either an invalid scheme character, or a digit/'+'/'-'/'.' in the
		// first position: there is no valid scheme.
		return "", url
	}
	return "", url
}
// parseHost extracts the canonical host from hostish by stripping the
// username, password, and port, and then normalizing what remains.
//
// The steps, in order, are:
//  1. Drop everything up to and including the last '@'.
//  2. If the host starts with '[', require a closing ']'.
//  3. Remove a trailing ":port".
//  4. Convert internationalized hostnames to their IDNA ASCII form.
//  5. Collapse runs of '.' and trim leading and trailing dots.
//  6. If the host parses as an IP address, return its canonical form;
//     otherwise fully unescape it, lowercase ASCII letters, and escape it
//     exactly once.
//
// It returns an error for an unterminated IP-literal, a failed IDNA
// conversion, or a host that cannot be fully unescaped.
func parseHost(hostish string) (host string, err error) {
	host = hostish
	if at := strings.LastIndex(hostish, "@"); at >= 0 {
		host = hostish[at+1:]
	}

	// An IP-literal per RFC 3986 and RFC 6874, for example "[fe80::1]" or
	// "[fe80::1%25en0]", must be closed by a ']'.
	if strings.HasPrefix(host, "[") && strings.LastIndex(host, "]") < 0 {
		return "", errors.New("webrisk: missing ']' in host")
	}

	// Remove the port if it is there.
	host = portRegexp.ReplaceAllString(host, "")

	// Convert internationalized hostnames to IDNA.
	if decoded := unescape(host); isUnicode(decoded) {
		if host, err = idna.ToASCII(decoded); err != nil {
			return "", err
		}
	}

	// Remove any superfluous '.' characters in the hostname.
	host = strings.Trim(dotsRegexp.ReplaceAllString(host, "."), ".")

	// IP addresses have their own canonical form and need nothing further.
	if ip := parseIPAddress(host); ip != "" {
		return ip, nil
	}

	// To escape the host properly, start from its fully unescaped form.
	if host, err = recursiveUnescape(host); err != nil {
		return "", err
	}

	// Lowercase ASCII letters only; every other byte is left untouched.
	var lowered bytes.Buffer
	for i := 0; i < len(host); i++ {
		c := host[i]
		if 'A' <= c && c <= 'Z' {
			c = byte(unicode.ToLower(rune(c)))
		}
		lowered.WriteByte(c)
	}

	// Finally escape the result exactly once.
	return escape(lowered.String()), nil
}
// parseURL parses urlStr into a url.URL whose Scheme, Host, Path and RawQuery
// fields are populated, and reports an error if that is not possible.
//
// For legacy reasons this is a simplified version of the net/url logic. The
// cases where net/url was not helpful are:
//  1. net/url expects no escaped encoding in the host but escaping in the
//     path. Web Risk allows escaped characters in both.
//  2. net/url behaves differently with and without a scheme for absolute
//     paths. Web Risk tests web URLs only, and the scheme is optional; when
//     it is missing, "http" is assumed.
//  3. The fragment is stripped, and the query is kept separately in RawQuery,
//     because the fragment is not needed for building Web Risk patterns.
//
// Errors are returned when the input cannot be fully unescaped, when a scheme
// is present but not followed by "//", when the hostname is empty, or when
// the host cannot be parsed.
func parseURL(urlStr string) (parsedURL *url.URL, err error) {
	// Remove the URL fragment first: the URL is decoded and re-encoded below,
	// and a '#' in a fragment is not friendly to that.
	rest, _ := split(urlStr, "#", true)

	// Strip leading and trailing whitespace, then remove any embedded tabs
	// and CR/LF characters which aren't escaped.
	rest = strings.TrimSpace(rest)
	for _, ctl := range []string{"\t", "\r", "\n"} {
		rest = strings.Replace(rest, ctl, "", -1)
	}

	if rest, err = normalizeEscape(rest); err != nil {
		return nil, err
	}

	parsedURL = new(url.URL)
	parsedURL.Scheme, rest = getScheme(rest)
	rest, parsedURL.RawQuery = split(rest, "?", true)

	var hostish string
	switch {
	case parsedURL.Scheme == "":
		// Add HTTP as scheme if none.
		parsedURL.Scheme = "http"
		hostish, rest = split(rest, "/", false)
	case strings.HasPrefix(rest, "//"):
		hostish, rest = split(rest[2:], "/", false)
	default:
		// An explicit scheme must be followed by "//".
		return nil, errors.New("webrisk: invalid path")
	}
	if hostish == "" {
		return nil, errors.New("webrisk: missing hostname")
	}
	if parsedURL.Host, err = parseHost(hostish); err != nil {
		return nil, err
	}

	// Format the path. path.Clean drops a trailing slash, so put it back when
	// the original had one.
	cleaned := path.Clean(rest)
	switch {
	case cleaned == ".":
		cleaned = "/"
	case strings.HasSuffix(rest, "/") && !strings.HasSuffix(cleaned, "/"):
		cleaned += "/"
	}
	parsedURL.Path = cleaned
	return parsedURL, nil
}
// parseIPAddress returns the canonical dotted-decimal form of iphostname if it
// looks like an IPv4 address written with one to four decimal, octal or
// hexadecimal parts, and "" otherwise.
//
// Every part except the last contributes one byte; the last part fills the
// remaining bytes, so the result always has four components. For example,
// "1.2.3" becomes "1.2.0.3".
func parseIPAddress(iphostname string) string {
	// The Windows resolver allows a 4-part dotted decimal IP address to have a
	// space followed by any old rubbish, so long as the total length of the
	// string doesn't get above 15 characters. So, "10.192.95.89 xy" is
	// resolved to 10.192.95.89. If the string length is greater than 15
	// characters, e.g. "10.192.95.89 xy.wildcard.example.com", it will be
	// resolved through DNS.
	if len(iphostname) <= 15 {
		if m := trailingSpaceRegexp.FindString(iphostname); m != "" {
			iphostname = strings.TrimSpace(m)
		}
	}
	if !possibleIPRegexp.MatchString(iphostname) {
		return ""
	}

	parts := strings.Split(iphostname, ".")
	if len(parts) > 4 {
		return ""
	}

	last := len(parts) - 1
	groups := make([]string, 0, len(parts))
	for i, part := range parts {
		width := 1
		if i == last {
			// The final part absorbs all bytes not claimed by earlier parts.
			width = 5 - len(parts)
		}
		group := canonicalNum(part, width)
		if group == "" {
			return ""
		}
		groups = append(groups, group)
	}
	return strings.Join(groups, ".")
}
// canonicalNum parses s as an unsigned 32-bit integer (the base is inferred
// from the prefix, as with strconv.ParseUint base 0) and renders its n least
// significant bytes as a '.'-separated string of base-10 values, most
// significant byte first. The result is usable as (part of) an IP address.
//
// It returns "" if n is outside 1..4 or if s cannot be parsed. Bytes above
// the n lowest are discarded.
//
// For example:
//
//	s:"01234", n:2 => "2.156"
//	s:"0x10203040", n:4 => "16.32.48.64"
func canonicalNum(s string, n int) string {
	if n < 1 || n > 4 {
		return ""
	}
	value, err := strconv.ParseUint(s, 0, 32)
	if err != nil {
		return ""
	}

	// Fill from the right so the least significant byte lands last.
	octets := make([]string, n)
	for pos := len(octets) - 1; pos >= 0; pos-- {
		octets[pos] = strconv.Itoa(int(value & 0xff))
		value >>= 8
	}
	return strings.Join(octets, ".")
}
// canonicalURL parses u and returns it in the form scheme://hostname/path.
// Fragments and queries are not part of the result. A parse failure is
// returned as the error.
func canonicalURL(u string) (string, error) {
	parsed, err := parseURL(u)
	if err != nil {
		return "", err
	}

	// Assemble the URL ourselves to skip encodings from the net/url package.
	base := parsed.Scheme + "://" + parsed.Host
	if parsed.Path == "" {
		return base + "/", nil
	}
	return base + parsed.Path, nil
}
// canonicalHost parses urlStr and returns only its canonical host, or the
// parse error if urlStr is not a valid URL.
func canonicalHost(urlStr string) (string, error) {
	parsed, err := parseURL(urlStr)
	if err == nil {
		return parsed.Host, nil
	}
	return "", err
}
// generateLookupHosts returns the list of host-suffixes to look up for the
// input URL. The first entry is always the full canonical host; for an IP
// address it is the only entry.
func generateLookupHosts(urlStr string) ([]string, error) {
	// Web Risk policy asks to generate lookup hosts for the URL. Those are
	// formed by the domain and also up to 4 hostname suffixes. The last
	// component, or sometimes the last pair, isn't examined alone since it's
	// the TLD or country code. The database for TLDs is here:
	//	https://publicsuffix.org/list/
	//
	// Note that we do not need to be clever about stopping at the "real" TLD.
	// We just check a few extra components regardless. It's not significantly
	// slower on the server side to check some extra hashes. Also the client
	// does not need to keep a database of TLDs.
	const maxHostComponents = 7

	host, err := canonicalHost(urlStr)
	if err != nil {
		return nil, err
	}

	// IPv4 and IPv6 addresses have no meaningful suffixes.
	if net.ParseIP(strings.Trim(host, "[]")) != nil {
		return []string{host}, nil
	}

	labels := strings.Split(host, ".")

	// Index of the first label of the longest suffix to emit. It is never 0,
	// because the full host is already the first entry.
	first := len(labels) - maxHostComponents
	if first < 1 {
		first = 1
	}

	hosts := []string{host}
	// Stop before the final label so it is never emitted alone.
	for start := first; start < len(labels)-1; start++ {
		hosts = append(hosts, strings.Join(labels[start:], "."))
	}
	return hosts, nil
}
// canonicalPath parses urlStr and returns only its canonical path, or the
// parse error if urlStr is not a valid URL.
//
// Note that this function is not used, but remains to ensure that the
// parsedURL.Path output matches the C++ implementation.
func canonicalPath(urlStr string) (string, error) {
	parsed, err := parseURL(urlStr)
	if err == nil {
		return parsed.Path, nil
	}
	return "", err
}
// generateLookupPaths returns the list of path-prefixes to look up for the
// input URL.
//
// The result always starts with "/", followed by the directory prefixes built
// from the leading path components (each with a trailing slash, bounded by
// maxPathComponents), then the full path if it is not just "/", and finally
// the full path with its query string if the URL has one.
func generateLookupPaths(urlStr string) ([]string, error) {
	const maxPathComponents = 4

	parsed, err := parseURL(urlStr)
	if err != nil {
		return nil, err
	}
	fullPath := parsed.Path

	// Collect the non-empty path segments.
	var segments []string
	for _, seg := range strings.Split(fullPath, "/") {
		if seg == "" {
			continue
		}
		segments = append(segments, seg)
	}

	limit := len(segments)
	if limit > maxPathComponents {
		limit = maxPathComponents
	}

	paths := []string{"/"}
	for n := 1; n < limit; n++ {
		paths = append(paths, "/"+strings.Join(segments[:n], "/")+"/")
	}
	if fullPath != "/" {
		paths = append(paths, fullPath)
	}
	if parsed.RawQuery != "" {
		paths = append(paths, fullPath+"?"+parsed.RawQuery)
	}
	return paths, nil
}