zytedata · PyExplorer · Dec 6, 2024 · Dec 6, 2024 · Dec 6, 2024 · Dec 6, 2024
diff --git a/tests/utils.py b/tests/utils.py
@@ -1,3 +1,72 @@
 from pathlib import Path
 
+import pytest
+from parsel import Selector  # noqa: F401
+
+from zyte_parsers.utils import extract_link, fromstring
+
 TEST_DATA_ROOT = Path(__file__).parent / "data"
+
+
+@pytest.mark.parametrize(
+    "html_input, base_url, expected_output",
+    [  # each case: (html string or Selector, base url, expected link)
+        ("<a href=' http://example.com'>", "", "http://example.com"),
+        ("<a href='foo'>", "http://example.com", "http://example.com/foo"),
+        ("<a href='/foo '>", "http://example.com", "http://example.com/foo"),
+        ("<a href='//foo '>", "http://example.com", "http://foo"),
+        (
+            "<a href='//example.com/foo'>",
+            "http://example.com",
+            "http://example.com/foo",
+        ),
+        # a parsel Selector is accepted in place of an HTML string
+        (
+            Selector(text="<a href='http://example.com'>").css("a")[0],
+            "",
+            "http://example.com",
+        ),
+        # no base url: relative links come back stripped but unresolved
+        ("<a href='foo'>", "", "foo"),
+        ("<a href='/foo '>", "", "/foo"),
+        ("<a href='//foo '>", "", "//foo"),
+        ("<a href='' data-url='http://example.com'>", "", "http://example.com"),
+        ("<a href='http://example.com'>", "", "http://example.com"),
+        # javascript: and empty hrefs yield no link (None)
+        ("<a href='javascript:void(0)'>", "", None),
+        ("<a href=''>", "http://example.com", None),
+    ],
+)
+def test_extract_link(html_input, base_url, expected_output):
+    a_node = fromstring(html_input) if isinstance(html_input, str) else html_input
+    result = extract_link(a_node, base_url)  # force_safe defaults to False
+    assert result == expected_output
+
+
+@pytest.mark.parametrize(
+    "html_input, base_url, expected_output",
+    [
+        # Spaces in the path are percent-encoded
+        (
+            "<a href='/path/to/resource with spaces'>",
+            "http://example.com",
+            "http://example.com/path/to/resource%20with%20spaces",
+        ),
+        # Missing scheme and no base_url: https is assumed
+        (
+            "<a href='//example.com/foo'>",
+            "",
+            "https://example.com/foo",
+        ),
+        # no base url: only links that carry a host ("//foo") gain a scheme
+        ("<a href='foo'>", "", "foo"),
+        ("<a href='/foo '>", "", "/foo"),
+        ("<a href='//foo '>", "", "https://foo"),
+        ("<a href='' data-url='http://example.com'>", "", "http://example.com"),
+        ("<a href='http://example.com'>", "", "http://example.com"),
+    ],
+)
+def test_extract_safe_link(html_input, base_url, expected_output):
+    a_node = fromstring(html_input) if isinstance(html_input, str) else html_input
+    result = extract_link(a_node, base_url, force_safe=True)
+    assert result == expected_output
diff --git a/zyte_parsers/utils.py b/zyte_parsers/utils.py
@@ -1,6 +1,6 @@
 import itertools
 from typing import Any, Callable, Iterable, Optional
-from urllib.parse import urljoin
+from urllib.parse import urljoin, urlparse, urlunparse
 
 import html_text
 from lxml.html import (  # noqa: F401
@@ -11,6 +11,7 @@
 )
 from parsel import Selector  # noqa: F401
 from w3lib.html import strip_html5_whitespace
+from w3lib.url import safe_url_string
 
 from zyte_parsers.api import SelectorOrElement, input_to_element
 
@@ -55,19 +56,22 @@ def strip_urljoin(base_url: Optional[str], url: Optional[str]) -> str:
     return urljoin(base_url or "", url or "")
 
 
-def extract_link(a_node: SelectorOrElement, base_url: str) -> Optional[str]:
+def add_https_to_url(url: str) -> str:  # "//host/x" -> "https://host/x"
+    if url.startswith(("http://", "https://")):
+        return url  # already has a web scheme; returned untouched
+
+    parsed_url = urlparse(url)
+    if not parsed_url.scheme and parsed_url.netloc:  # host but no scheme
+        parsed_url = parsed_url._replace(scheme="https")
+
+    return str(urlunparse(parsed_url))  # anything else is just re-serialized
+
+
+def extract_link(
+    a_node: SelectorOrElement, base_url: str, force_safe: bool = False
+) -> Optional[str]:
     """
     Extract the absolute url link from an ``<a>`` HTML tag.
-
-    >>> extract_link(fromstring("<a href=' http://example.com'"), "")
-    'http://example.com'
-    >>> extract_link(fromstring("<a href='/foo '"), "http://example.com")
-    'http://example.com/foo'
-    >>> extract_link(fromstring("<a href='' data-url='http://example.com'"), "")
-    'http://example.com'
-    >>> extract_link(fromstring("<a href='javascript:void(0)'"), "")
-    >>> extract_link(Selector(text="<a href='http://example.com'").css("a")[0], "")
-    'http://example.com'
     """
     a_node = input_to_element(a_node)
     link = a_node.get("href") or a_node.get("data-url")
@@ -80,7 +84,18 @@ def extract_link(a_node: SelectorOrElement, base_url: str) -> Optional[str]:
     except ValueError:
         link = None
 
-    return link
+    if not link or not force_safe:
+        return link
+
+    try:
+        safe_link = safe_url_string(link)
+    except ValueError:
+        return None
+
+    # add scheme (https) when the link has a host but no scheme (no base url)
+    safe_link = add_https_to_url(safe_link)
+
+    return safe_link
 
 
 def extract_text(