From 0d4669d0bb375a0e82e96cb5530da17bbd6ddf69 Mon Sep 17 00:00:00 2001
From: PyExplorer <stgmont@gmail.com>
Date: Fri, 6 Dec 2024 13:05:40 +0300
Subject: [PATCH 1/8] add tests for extract_link

---
 tests/utils.py | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)
diff --git a/tests/utils.py b/tests/utils.py
index 034ac86..cad6d24 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -1,3 +1,72 @@
 from pathlib import Path
 
+import pytest
+from parsel import Selector  # noqa: F401
+
+from zyte_parsers.utils import extract_link, fromstring
+
 TEST_DATA_ROOT = Path(__file__).parent / "data"
+
+
+@pytest.mark.parametrize(
+    "html_input, base_url, expected_output",
+    [
+        ("<a href=' http://example.com'>", "", "http://example.com"),
+        ("<a href='foo'>", "http://example.com", "http://example.com/foo"),
+        ("<a href='/foo '>", "http://example.com", "http://example.com/foo"),
+        ("<a href='//foo '>", "http://example.com", "http://foo"),
+        (
+            "<a href='//example.com/foo'>",
+            "http://example.com",
+            "http://example.com/foo",
+        ),
+        # Selector input instead of an HTML string
+        (
+            Selector(text="<a href='http://example.com'>").css("a")[0],
+            "",
+            "http://example.com",
+        ),
+        # no base url
+        ("<a href='foo'>", "", "foo"),
+        ("<a href='/foo '>", "", "/foo"),
+        ("<a href='//foo '>", "", "//foo"),
+        ("<a href='' data-url='http://example.com'>", "", "http://example.com"),
+        ("<a href='http://example.com'>", "", "http://example.com"),
+        # invalid or empty url
+        ("<a href='javascript:void(0)'>", "", None),
+        ("<a href=''>", "http://example.com", None),
+    ],
+)
+def test_extract_link(html_input, base_url, expected_output):
+    a_node = fromstring(html_input) if isinstance(html_input, str) else html_input
+    result = extract_link(a_node, base_url)
+    assert result == expected_output
+
+
+@pytest.mark.parametrize(
+    "html_input, base_url, expected_output",
+    [
+        # Spaces in the path
+        (
+            "<a href='/path/to/resource with spaces'>",
+            "http://example.com",
+            "http://example.com/path/to/resource%20with%20spaces",
+        ),
+        # Missing scheme and base_url
+        (
+            "<a href='//example.com/foo'>",
+            "",
+            "https://example.com/foo",
+        ),
+        # no base url
+        ("<a href='foo'>", "", "foo"),
+        ("<a href='/foo '>", "", "/foo"),
+        ("<a href='//foo '>", "", "https://foo"),
+        ("<a href='' data-url='http://example.com'>", "", "http://example.com"),
+        ("<a href='http://example.com'>", "", "http://example.com"),
+    ],
+)
+def test_extract_safe_link(html_input, base_url, expected_output):
+    a_node = fromstring(html_input) if isinstance(html_input, str) else html_input
+    result = extract_link(a_node, base_url, force_safe=True)
+    assert result == expected_output

From 138fb39b78c03787b44feed57af4387206f49f8e Mon Sep 17 00:00:00 2001
From: PyExplorer <stgmont@gmail.com>
Date: Fri, 6 Dec 2024 13:07:07 +0300
Subject: [PATCH 2/8] add force_safe option to extract_link and add https when scheme is missing

---
 zyte_parsers/utils.py | 38 +++++++++++++++++++++++++-------------
 1 file changed, 25 insertions(+), 13 deletions(-)

diff --git a/zyte_parsers/utils.py b/zyte_parsers/utils.py
index a767159..67046a2 100644
--- a/zyte_parsers/utils.py
+++ b/zyte_parsers/utils.py
@@ -1,6 +1,6 @@
 import itertools
 from typing import Any, Callable, Iterable, Optional
-from urllib.parse import urljoin
+from urllib.parse import urljoin, urlparse, urlunparse
 
 import html_text
 from lxml.html import (  # noqa: F401
@@ -11,6 +11,7 @@
 )
 from parsel import Selector  # noqa: F401
 from w3lib.html import strip_html5_whitespace
+from w3lib.url import safe_url_string
 
 from zyte_parsers.api import SelectorOrElement, input_to_element
 
@@ -55,19 +56,19 @@ def strip_urljoin(base_url: Optional[str], url: Optional[str]) -> str:
     return urljoin(base_url or "", url or "")
 
 
-def extract_link(a_node: SelectorOrElement, base_url: str) -> Optional[str]:
+def add_https_to_url(url: str) -> str:
+    parsed_url = urlparse(url)
+    if not parsed_url.scheme and parsed_url.netloc:
+        parsed_url = parsed_url._replace(scheme="https")
+
+    return str(urlunparse(parsed_url))
+
+
+def extract_link(
+    a_node: SelectorOrElement, base_url: str, force_safe=False
+) -> Optional[str]:
     """
     Extract the absolute url link from an ``<a>`` HTML tag.
-
-    >>> extract_link(fromstring("<a href=' http://example.com'"), "")
-    'http://example.com'
-    >>> extract_link(fromstring("<a href='/foo '"), "http://example.com")
-    'http://example.com/foo'
-    >>> extract_link(fromstring("<a href='' data-url='http://example.com'"), "")
-    'http://example.com'
-    >>> extract_link(fromstring("<a href='javascript:void(0)'"), "")
-    >>> extract_link(Selector(text="<a href='http://example.com'").css("a")[0], "")
-    'http://example.com'
     """
     a_node = input_to_element(a_node)
     link = a_node.get("href") or a_node.get("data-url")
@@ -80,7 +81,18 @@ def extract_link(a_node: SelectorOrElement, base_url: str) -> Optional[str]:
     except ValueError:
         link = None
 
-    return link
+    if not force_safe:
+        return link
+
+    try:
+        safe_link = safe_url_string(link)
+    except ValueError:
+        return None
+
+    # add scheme (https) when missing schema and no base url
+    safe_link = add_https_to_url(safe_link)
+
+    return safe_link
 
 
 def extract_text(

From 0fcfabad8de7b8009bbd26a35e0af0a77603347a Mon Sep 17 00:00:00 2001
From: PyExplorer <stgmont@gmail.com>
Date: Fri, 6 Dec 2024 13:16:11 +0300
Subject: [PATCH 3/8] add early return to add_https_to_url

---
 zyte_parsers/utils.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/zyte_parsers/utils.py b/zyte_parsers/utils.py
index 67046a2..1c52e16 100644
--- a/zyte_parsers/utils.py
+++ b/zyte_parsers/utils.py
@@ -57,6 +57,9 @@ def strip_urljoin(base_url: Optional[str], url: Optional[str]) -> str:
 
 
 def add_https_to_url(url: str) -> str:
+    if url.startswith(('http://', 'https://')):
+        return url
+
     parsed_url = urlparse(url)
     if not parsed_url.scheme and parsed_url.netloc:
         parsed_url = parsed_url._replace(scheme="https")

From 4b854dc882fe640fb5a733db776e7a75bf2af821 Mon Sep 17 00:00:00 2001
From: PyExplorer <stgmont@gmail.com>
Date: Fri, 6 Dec 2024 13:22:02 +0300
Subject: [PATCH 4/8] format

---
 zyte_parsers/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/zyte_parsers/utils.py b/zyte_parsers/utils.py
index 1c52e16..468a826 100644
--- a/zyte_parsers/utils.py
+++ b/zyte_parsers/utils.py
@@ -57,7 +57,7 @@ def strip_urljoin(base_url: Optional[str], url: Optional[str]) -> str:
 
 
 def add_https_to_url(url: str) -> str:
-    if url.startswith(('http://', 'https://')):
+    if url.startswith(("http://", "https://")):
         return url
 
     parsed_url = urlparse(url)

From 8c77922c82ed7b4eb233fb164ae4b5dc960d94a2 Mon Sep 17 00:00:00 2001
From: PyExplorer <stgmont@gmail.com>
Date: Fri, 6 Dec 2024 13:26:49 +0300
Subject: [PATCH 5/8] add early return if link is empty/None

---
 zyte_parsers/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/zyte_parsers/utils.py b/zyte_parsers/utils.py
index 468a826..2de6c8f 100644
--- a/zyte_parsers/utils.py
+++ b/zyte_parsers/utils.py
@@ -84,7 +84,7 @@ def extract_link(
     except ValueError:
         link = None
 
-    if not force_safe:
+    if not link or not force_safe:
         return link
 
     try:

From 6bbc25d3bf273457485ee86b2353024efbd4281f Mon Sep 17 00:00:00 2001
From: Shevchenko Taras <stgmont@gmail.com>
Date: Fri, 6 Dec 2024 14:31:43 +0300
Subject: [PATCH 6/8] Update zyte_parsers/utils.py

Co-authored-by: Andrey Rakhmatullin <wrar@wrar.name>
---
 zyte_parsers/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/zyte_parsers/utils.py b/zyte_parsers/utils.py
index 2de6c8f..2ed1498 100644
--- a/zyte_parsers/utils.py
+++ b/zyte_parsers/utils.py
@@ -68,7 +68,7 @@ def add_https_to_url(url: str) -> str:
 
 
 def extract_link(
-    a_node: SelectorOrElement, base_url: str, force_safe=False
+    a_node: SelectorOrElement, base_url: str, force_safe: bool = False
 ) -> Optional[str]:
     """
     Extract the absolute url link from an ``<a>`` HTML tag.

From 209fbe7431fe73d85ae42d53b5a3de47ae986eac Mon Sep 17 00:00:00 2001
From: PyExplorer <stgmont@gmail.com>
Date: Tue, 10 Dec 2024 09:55:13 +0300
Subject: [PATCH 7/8] tune add_https_to_url

---
 zyte_parsers/utils.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/zyte_parsers/utils.py b/zyte_parsers/utils.py
index 2de6c8f..839ced5 100644
--- a/zyte_parsers/utils.py
+++ b/zyte_parsers/utils.py
@@ -61,10 +61,16 @@ def add_https_to_url(url: str) -> str:
         return url
 
     parsed_url = urlparse(url)
-    if not parsed_url.scheme and parsed_url.netloc:
+
+    # If it's a relative URL, return it as-is
+    if not parsed_url.netloc:
+        return url
+
+    # Handle missing scheme
+    if not parsed_url.scheme:
         parsed_url = parsed_url._replace(scheme="https")
 
-    return str(urlunparse(parsed_url))
+    return urlunparse(parsed_url)  # type: ignore
 
 
 def extract_link(

From 9a315165f93d554e84abe9bc1b6def0e7acb6b4c Mon Sep 17 00:00:00 2001
From: PyExplorer <stgmont@gmail.com>
Date: Thu, 12 Dec 2024 11:40:56 +0300
Subject: [PATCH 8/8] remove add_https_to_url

---
 tests/utils.py        |  4 ++--
 zyte_parsers/utils.py | 26 ++------------------------
 2 files changed, 4 insertions(+), 26 deletions(-)

diff --git a/tests/utils.py b/tests/utils.py
index cad6d24..b2af996 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -56,12 +56,12 @@ def test_extract_link(html_input, base_url, expected_output):
         (
             "<a href='//example.com/foo'>",
             "",
-            "https://example.com/foo",
+            "//example.com/foo",
         ),
         # no base url
         ("<a href='foo'>", "", "foo"),
         ("<a href='/foo '>", "", "/foo"),
-        ("<a href='//foo '>", "", "https://foo"),
+        ("<a href='//foo '>", "", "//foo"),
         ("<a href='' data-url='http://example.com'>", "", "http://example.com"),
         ("<a href='http://example.com'>", "", "http://example.com"),
     ],
diff --git a/zyte_parsers/utils.py b/zyte_parsers/utils.py
index 1bbe35b..662ecaf 100644
--- a/zyte_parsers/utils.py
+++ b/zyte_parsers/utils.py
@@ -1,6 +1,6 @@
 import itertools
 from typing import Any, Callable, Iterable, Optional
-from urllib.parse import urljoin, urlparse, urlunparse
+from urllib.parse import urljoin
 
 import html_text
 from lxml.html import (  # noqa: F401
@@ -56,23 +56,6 @@ def strip_urljoin(base_url: Optional[str], url: Optional[str]) -> str:
     return urljoin(base_url or "", url or "")
 
 
-def add_https_to_url(url: str) -> str:
-    if url.startswith(("http://", "https://")):
-        return url
-
-    parsed_url = urlparse(url)
-
-    # If it's a relative URL, return it as-is
-    if not parsed_url.netloc:
-        return url
-
-    # Handle missing scheme
-    if not parsed_url.scheme:
-        parsed_url = parsed_url._replace(scheme="https")
-
-    return urlunparse(parsed_url)  # type: ignore
-
-
 def extract_link(
     a_node: SelectorOrElement, base_url: str, force_safe: bool = False
 ) -> Optional[str]:
@@ -94,15 +77,10 @@ def extract_link(
         return link
 
     try:
-        safe_link = safe_url_string(link)
+        return safe_url_string(link)
     except ValueError:
         return None
 
-    # add scheme (https) when missing schema and no base url
-    safe_link = add_https_to_url(safe_link)
-
-    return safe_link
-
 
 def extract_text(
     node: Optional[SelectorOrElement], guess_layout: bool = False