Parse srcset urls with commas and optional descriptor

StJudeWasHere · Oct 3, 2024 · c6dcf24 · c6dcf24
1 parent 3f87fe7
commit c6dcf24
Show file tree

Hide file tree

Showing 2 changed files with 67 additions and 6 deletions.
diff --git a/internal/services/html_parser_test.go b/internal/services/html_parser_test.go
@@ -438,3 +438,45 @@ func TestRobotsNone(t *testing.T) {
 		t.Error("NewPageReport Nofollow should be true")
 	}
 }
+
+func TestSrcset(t *testing.T) {
+	u, err := url.Parse(testURL)
+	if err != nil {
+		fmt.Println(err)
+	}
+
+	images := []string{
+		"https://example.com/logo.png",
+		"https://example.com/image,c_fill,w_576.jpg",
+		"https://example.com/image,c_fill,w_276.jpg",
+		"https://example.com/image,c_fill,w_76.jpg",
+	}
+	body := []byte(
+		`<html>
+		<head></head>
+		<body>
+			<img src="` + images[0] + `"
+			srcset=",` + images[1] + ` 576w, ,` + images[2] + ` 276w,` + images[3] + `,">
+		</body>
+	</html>`)
+	statusCode := 200
+	headers := &http.Header{
+		"Content-Type": []string{"text/html"},
+	}
+
+	pageReport, _, err := services.NewHTMLParser(u, statusCode, headers, body, int64(len(body)))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if len(pageReport.Images) != len(images) {
+		t.Errorf("pagereport images len want: %d Got: %d", len(images), len(pageReport.Images))
+	}
+
+	for n, i := range images {
+		if pageReport.Images[n].URL != i {
+			t.Errorf("pageReport image %d should be %s. Got: %s", n, i, pageReport.Images[n].URL)
+		}
+	}
+
+}
diff --git a/internal/services/parser.go b/internal/services/parser.go
@@ -7,6 +7,7 @@ import (
 	"net/http"
 	"net/url"
 	"strings"
+	"unicode"
 
 	"github.com/stjudewashere/seonaut/internal/models"
 
@@ -586,19 +587,37 @@ func (p *Parser) headersLocation() string {
 func (p *Parser) parseSrcSet(srcset string) []string {
 	var imageURLs []string
 
+	srcset = strings.Trim(srcset, " ,")
 	if srcset == "" {
 		return imageURLs
 	}
 
-	imageSet := strings.Split(srcset, ",")
-	for _, s := range imageSet {
-		i := strings.Split(s, " ")
-
-		if len(i) > 0 {
-			imageURLs = append(imageURLs, strings.TrimSpace(i[0]))
+	// URLs in srcset strings can contain an optional descriptor.
+	// Also take into account URLs with commas in them.
+	parsingURL := true
+	var currentURL strings.Builder
+	for _, char := range srcset {
+		if parsingURL {
+			if unicode.IsSpace(char) {
+				if currentURL.Len() > 0 {
+					parsingURL = false
+				}
+			} else if currentURL.Len() > 0 || char != ',' {
+				currentURL.WriteRune(char)
+			}
+		} else {
+			if char == ',' {
+				parsingURL = true
+				imageURLs = append(imageURLs, strings.TrimSpace(currentURL.String()))
+				currentURL.Reset()
+			}
 		}
 	}
 
+	if currentURL.Len() > 0 {
+		imageURLs = append(imageURLs, strings.TrimSpace(currentURL.String()))
+	}
+
 	return imageURLs
 }