From 0d4669d0bb375a0e82e96cb5530da17bbd6ddf69 Mon Sep 17 00:00:00 2001 From: PyExplorer Date: Fri, 6 Dec 2024 13:05:40 +0300 Subject: [PATCH 1/8] add tests for extract_link --- tests/utils.py | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/tests/utils.py b/tests/utils.py index 034ac86..cad6d24 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,3 +1,72 @@ from pathlib import Path +import pytest +from parsel import Selector # noqa: F401 + +from zyte_parsers.utils import extract_link, fromstring + TEST_DATA_ROOT = Path(__file__).parent / "data" + + +@pytest.mark.parametrize( + "html_input, base_url, expected_output", + [ + ("", "", "http://example.com"), + ("", "http://example.com", "http://example.com/foo"), + ("", "http://example.com", "http://example.com/foo"), + ("", "http://example.com", "http://foo"), + ( + "", + "http://example.com", + "http://example.com/foo", + ), + # Selector + ( + Selector(text="").css("a")[0], + "", + "http://example.com", + ), + # no base url + ("", "", "foo"), + ("", "", "/foo"), + ("", "", "//foo"), + ("", "", "http://example.com"), + ("", "", "http://example.com"), + # invalid url + ("", "", None), + ("", "http://example.com", None), + ], +) +def test_extract_link(html_input, base_url, expected_output): + a_node = fromstring(html_input) if isinstance(html_input, str) else html_input + result = extract_link(a_node, base_url) + assert result == expected_output + + +@pytest.mark.parametrize( + "html_input, base_url, expected_output", + [ + # Spaces in the path + ( + "", + "http://example.com", + "http://example.com/path/to/resource%20with%20spaces", + ), + # Missing schema and base_url + ( + "", + "", + "https://example.com/foo", + ), + # no base url + ("", "", "foo"), + ("", "", "/foo"), + ("", "", "https://foo"), + ("", "", "http://example.com"), + ("", "", "http://example.com"), + ], +) +def test_extract_safe_link(html_input, base_url, expected_output): + a_node = fromstring(html_input) if isinstance(html_input, str) else html_input + result = extract_link(a_node, base_url, force_safe=True) + assert result == expected_output From 138fb39b78c03787b44feed57af4387206f49f8e Mon Sep 17 00:00:00 2001 From: PyExplorer Date: Fri, 6 Dec 2024 13:07:07 +0300 Subject: [PATCH 2/8] make link from extract_link safe and with missing scheme --- zyte_parsers/utils.py | 38 +++++++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/zyte_parsers/utils.py b/zyte_parsers/utils.py index a767159..67046a2 100644 --- a/zyte_parsers/utils.py +++ b/zyte_parsers/utils.py @@ -1,6 +1,6 @@ import itertools from typing import Any, Callable, Iterable, Optional -from urllib.parse import urljoin +from urllib.parse import urljoin, urlparse, urlunparse import html_text from lxml.html import ( # noqa: F401 @@ -11,6 +11,7 @@ ) from parsel import Selector # noqa: F401 from w3lib.html import strip_html5_whitespace +from w3lib.url import safe_url_string from zyte_parsers.api import SelectorOrElement, input_to_element @@ -55,19 +56,19 @@ def strip_urljoin(base_url: Optional[str], url: Optional[str]) -> str: return urljoin(base_url or "", url or "") -def extract_link(a_node: SelectorOrElement, base_url: str) -> Optional[str]: +def add_https_to_url(url: str) -> str: + parsed_url = urlparse(url) + if not parsed_url.scheme and parsed_url.netloc: + parsed_url = parsed_url._replace(scheme="https") + + return str(urlunparse(parsed_url)) + + +def extract_link( + a_node: SelectorOrElement, base_url: str, force_safe=False +) -> Optional[str]: """ Extract the absolute url link from an ```` HTML tag. - - >>> extract_link(fromstring(">> extract_link(fromstring(">> extract_link(fromstring(">> extract_link(fromstring(">> extract_link(Selector(text=" Optional[str]: except ValueError: link = None - return link + if not force_safe: + return link + + try: + safe_link = safe_url_string(link) + except ValueError: + return None + + # add scheme (https) when missing schema and no base url + safe_link = add_https_to_url(safe_link) + + return safe_link def extract_text( From 0fcfabad8de7b8009bbd26a35e0af0a77603347a Mon Sep 17 00:00:00 2001 From: PyExplorer Date: Fri, 6 Dec 2024 13:16:11 +0300 Subject: [PATCH 3/8] add early return to add_https_to_url --- zyte_parsers/utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/zyte_parsers/utils.py b/zyte_parsers/utils.py index 67046a2..1c52e16 100644 --- a/zyte_parsers/utils.py +++ b/zyte_parsers/utils.py @@ -57,6 +57,9 @@ def strip_urljoin(base_url: Optional[str], url: Optional[str]) -> str: def add_https_to_url(url: str) -> str: + if url.startswith(('http://', 'https://')): + return url + parsed_url = urlparse(url) if not parsed_url.scheme and parsed_url.netloc: parsed_url = parsed_url._replace(scheme="https") From 4b854dc882fe640fb5a733db776e7a75bf2af821 Mon Sep 17 00:00:00 2001 From: PyExplorer Date: Fri, 6 Dec 2024 13:22:02 +0300 Subject: [PATCH 4/8] format --- zyte_parsers/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zyte_parsers/utils.py b/zyte_parsers/utils.py index 1c52e16..468a826 100644 --- a/zyte_parsers/utils.py +++ b/zyte_parsers/utils.py @@ -57,7 +57,7 @@ def strip_urljoin(base_url: Optional[str], url: Optional[str]) -> str: def add_https_to_url(url: str) -> str: - if url.startswith(('http://', 'https://')): + if url.startswith(("http://", "https://")): return url parsed_url = urlparse(url) From 8c77922c82ed7b4eb233fb164ae4b5dc960d94a2 Mon Sep 17 00:00:00 2001 From: PyExplorer Date: Fri, 6 Dec 2024 13:26:49 +0300 Subject: [PATCH 5/8] add early return if link is empty/None --- zyte_parsers/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zyte_parsers/utils.py b/zyte_parsers/utils.py index 468a826..2de6c8f 100644 --- a/zyte_parsers/utils.py +++ b/zyte_parsers/utils.py @@ -84,7 +84,7 @@ def extract_link( except ValueError: link = None - if not force_safe: + if not link or not force_safe: return link try: From 6bbc25d3bf273457485ee86b2353024efbd4281f Mon Sep 17 00:00:00 2001 From: Shevchenko Taras Date: Fri, 6 Dec 2024 14:31:43 +0300 Subject: [PATCH 6/8] Update zyte_parsers/utils.py Co-authored-by: Andrey Rakhmatullin --- zyte_parsers/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zyte_parsers/utils.py b/zyte_parsers/utils.py index 2de6c8f..2ed1498 100644 --- a/zyte_parsers/utils.py +++ b/zyte_parsers/utils.py @@ -68,7 +68,7 @@ def add_https_to_url(url: str) -> str: def extract_link( - a_node: SelectorOrElement, base_url: str, force_safe=False + a_node: SelectorOrElement, base_url: str, force_safe: bool = False ) -> Optional[str]: """ Extract the absolute url link from an ```` HTML tag. From 209fbe7431fe73d85ae42d53b5a3de47ae986eac Mon Sep 17 00:00:00 2001 From: PyExplorer Date: Tue, 10 Dec 2024 09:55:13 +0300 Subject: [PATCH 7/8] tune add_https_to_url --- zyte_parsers/utils.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/zyte_parsers/utils.py b/zyte_parsers/utils.py index 2de6c8f..839ced5 100644 --- a/zyte_parsers/utils.py +++ b/zyte_parsers/utils.py @@ -61,10 +61,16 @@ def add_https_to_url(url: str) -> str: return url parsed_url = urlparse(url) - if not parsed_url.scheme and parsed_url.netloc: + + # If it's a relative URL, return it as-is + if not parsed_url.netloc: + return url + + # Handle missing scheme + if not parsed_url.scheme: parsed_url = parsed_url._replace(scheme="https") - return str(urlunparse(parsed_url)) + return urlunparse(parsed_url) # type: ignore def extract_link( From 9a315165f93d554e84abe9bc1b6def0e7acb6b4c Mon Sep 17 00:00:00 2001 From: PyExplorer Date: Thu, 12 Dec 2024 11:40:56 +0300 Subject: [PATCH 8/8] remove add_https_to_url --- tests/utils.py | 4 ++-- zyte_parsers/utils.py | 26 ++------------------------ 2 files changed, 4 insertions(+), 26 deletions(-) diff --git a/tests/utils.py b/tests/utils.py index cad6d24..b2af996 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -56,12 +56,12 @@ def test_extract_link(html_input, base_url, expected_output): ( "", "", - "https://example.com/foo", + "//example.com/foo", ), # no base url ("", "", "foo"), ("", "", "/foo"), - ("", "", "https://foo"), + ("", "", "//foo"), ("", "", "http://example.com"), ("", "", "http://example.com"), ], diff --git a/zyte_parsers/utils.py b/zyte_parsers/utils.py index 1bbe35b..662ecaf 100644 --- a/zyte_parsers/utils.py +++ b/zyte_parsers/utils.py @@ -1,6 +1,6 @@ import itertools from typing import Any, Callable, Iterable, Optional -from urllib.parse import urljoin, urlparse, urlunparse +from urllib.parse import urljoin import html_text from lxml.html import ( # noqa: F401 @@ -56,23 +56,6 @@ def strip_urljoin(base_url: Optional[str], url: Optional[str]) -> str: return urljoin(base_url or "", url or "") -def add_https_to_url(url: str) -> str: - if url.startswith(("http://", "https://")): - return url - - parsed_url = urlparse(url) - - # If it's a relative URL, return it as-is - if not parsed_url.netloc: - return url - - # Handle missing scheme - if not parsed_url.scheme: - parsed_url = parsed_url._replace(scheme="https") - - return urlunparse(parsed_url) # type: ignore - - def extract_link( a_node: SelectorOrElement, base_url: str, force_safe: bool = False ) -> Optional[str]: @@ -94,15 +77,10 @@ def extract_link( return link try: - safe_link = safe_url_string(link) + return safe_url_string(link) except ValueError: return None - # add scheme (https) when missing schema and no base url - safe_link = add_https_to_url(safe_link) - - return safe_link - def extract_text( node: Optional[SelectorOrElement], guess_layout: bool = False