Skip to content

Commit

Permalink
Parse srcset urls with commas and optional descriptor
Browse files Browse the repository at this point in the history
  • Loading branch information
StJudeWasHere committed Oct 3, 2024
1 parent 3f87fe7 commit c6dcf24
Show file tree
Hide file tree
Showing 2 changed files with 67 additions and 6 deletions.
42 changes: 42 additions & 0 deletions internal/services/html_parser_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -438,3 +438,45 @@ func TestRobotsNone(t *testing.T) {
t.Error("NewPageReport Nofollow should be true")
}
}

// TestSrcset checks that the HTML parser reports every image referenced by an
// <img> element: the src URL first, followed by the srcset candidates in
// document order. The srcset value used here contains URLs with embedded
// commas, optional width descriptors, and stray leading, doubled and trailing
// commas.
func TestSrcset(t *testing.T) {
	u, err := url.Parse(testURL)
	if err != nil {
		// Nothing below is meaningful without a valid base URL.
		t.Fatal(err)
	}

	images := []string{
		"https://example.com/logo.png",
		"https://example.com/image,c_fill,w_576.jpg",
		"https://example.com/image,c_fill,w_276.jpg",
		"https://example.com/image,c_fill,w_76.jpg",
	}
	body := []byte(
		`<html>
<head></head>
<body>
<img src="` + images[0] + `"
srcset=",` + images[1] + ` 576w, ,` + images[2] + ` 276w,` + images[3] + `,">
</body>
</html>`)
	statusCode := 200
	headers := &http.Header{
		"Content-Type": []string{"text/html"},
	}

	pageReport, _, err := services.NewHTMLParser(u, statusCode, headers, body, int64(len(body)))
	if err != nil {
		t.Fatal(err)
	}

	// Stop here on a length mismatch: the loop below indexes
	// pageReport.Images by position and would panic if it were shorter.
	if len(pageReport.Images) != len(images) {
		t.Fatalf("pagereport images len want: %d Got: %d", len(images), len(pageReport.Images))
	}

	for n, want := range images {
		if got := pageReport.Images[n].URL; got != want {
			t.Errorf("pageReport image %d should be %s. Got: %s", n, want, got)
		}
	}
}
31 changes: 25 additions & 6 deletions internal/services/parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"net/http"
"net/url"
"strings"
"unicode"

"github.com/stjudewashere/seonaut/internal/models"

Expand Down Expand Up @@ -586,19 +587,37 @@ func (p *Parser) headersLocation() string {
func (p *Parser) parseSrcSet(srcset string) []string {
var imageURLs []string

srcset = strings.Trim(srcset, " ,")
if srcset == "" {
return imageURLs
}

imageSet := strings.Split(srcset, ",")
for _, s := range imageSet {
i := strings.Split(s, " ")

if len(i) > 0 {
imageURLs = append(imageURLs, strings.TrimSpace(i[0]))
// URLs in srcset strings can contain an optional descriptor.
// Also take into account URLs with commas in them.
parsingURL := true
var currentURL strings.Builder
for _, char := range srcset {
if parsingURL {
if unicode.IsSpace(char) {
if currentURL.Len() > 0 {
parsingURL = false
}
} else if currentURL.Len() > 0 || char != ',' {
currentURL.WriteRune(char)
}
} else {
if char == ',' {
parsingURL = true
imageURLs = append(imageURLs, strings.TrimSpace(currentURL.String()))
currentURL.Reset()
}
}
}

if currentURL.Len() > 0 {
imageURLs = append(imageURLs, strings.TrimSpace(currentURL.String()))
}

return imageURLs
}

Expand Down

0 comments on commit c6dcf24

Please sign in to comment.