From c6dcf24f67265375fc45da9115624dc94be6f0ad Mon Sep 17 00:00:00 2001 From: StJudeWasHere <707925+StJudeWasHere@users.noreply.github.com> Date: Thu, 3 Oct 2024 09:36:55 +0200 Subject: [PATCH] Parse srcset urls with commas and optional descriptor --- internal/services/html_parser_test.go | 42 +++++++++++++++++++++++++++ internal/services/parser.go | 31 ++++++++++++++++---- 2 files changed, 67 insertions(+), 6 deletions(-) diff --git a/internal/services/html_parser_test.go b/internal/services/html_parser_test.go index 9213bda..f43edaa 100644 --- a/internal/services/html_parser_test.go +++ b/internal/services/html_parser_test.go @@ -438,3 +438,45 @@ func TestRobotsNone(t *testing.T) { t.Error("NewPageReport Nofollow should be true") } } + +func TestSrcset(t *testing.T) { + u, err := url.Parse(testURL) + if err != nil { + fmt.Println(err) + } + + images := []string{ + "https://example.com/logo.png", + "https://example.com/image,c_fill,w_576.jpg", + "https://example.com/image,c_fill,w_276.jpg", + "https://example.com/image,c_fill,w_76.jpg", + } + body := []byte( + ` +
+ + + + `) + statusCode := 200 + headers := &http.Header{ + "Content-Type": []string{"text/html"}, + } + + pageReport, _, err := services.NewHTMLParser(u, statusCode, headers, body, int64(len(body))) + if err != nil { + t.Fatal(err) + } + + if len(pageReport.Images) != len(images) { + t.Errorf("pagereport images len want: %d Got: %d", len(images), len(pageReport.Images)) + } + + for n, i := range images { + if pageReport.Images[n].URL != i { + t.Errorf("pageReport image %d should be %s. Got: %s", n, i, pageReport.Images[n].URL) + } + } + +} diff --git a/internal/services/parser.go b/internal/services/parser.go index 44a0151..60d4126 100644 --- a/internal/services/parser.go +++ b/internal/services/parser.go @@ -7,6 +7,7 @@ import ( "net/http" "net/url" "strings" + "unicode" "github.com/stjudewashere/seonaut/internal/models" @@ -586,19 +587,37 @@ func (p *Parser) headersLocation() string { func (p *Parser) parseSrcSet(srcset string) []string { var imageURLs []string + srcset = strings.Trim(srcset, " ,") if srcset == "" { return imageURLs } - imageSet := strings.Split(srcset, ",") - for _, s := range imageSet { - i := strings.Split(s, " ") - - if len(i) > 0 { - imageURLs = append(imageURLs, strings.TrimSpace(i[0])) + // URLs in srcset strings can contain an optional descriptor. + // Also take into account URLs with commas in them. + parsingURL := true + var currentURL strings.Builder + for _, char := range srcset { + if parsingURL { + if unicode.IsSpace(char) { + if currentURL.Len() > 0 { + parsingURL = false + } + } else if currentURL.Len() > 0 || char != ',' { + currentURL.WriteRune(char) + } + } else { + if char == ',' { + parsingURL = true + imageURLs = append(imageURLs, strings.TrimSpace(currentURL.String())) + currentURL.Reset() + } } } + if currentURL.Len() > 0 { + imageURLs = append(imageURLs, strings.TrimSpace(currentURL.String())) + } + return imageURLs }