Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make URL from extract_link safe and add missing scheme #19

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
69 changes: 69 additions & 0 deletions tests/utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,72 @@
from pathlib import Path

import pytest
from parsel import Selector # noqa: F401

from zyte_parsers.utils import extract_link, fromstring

TEST_DATA_ROOT = Path(__file__).parent / "data"


@pytest.mark.parametrize(
"html_input, base_url, expected_output",
[
("<a href=' http://example.com'>", "", "http://example.com"),
("<a href='foo'>", "http://example.com", "http://example.com/foo"),
("<a href='/foo '>", "http://example.com", "http://example.com/foo"),
("<a href='//foo '>", "http://example.com", "http://foo"),
(
"<a href='//example.com/foo'>",
"http://example.com",
"http://example.com/foo",
),
# Selector
(
Selector(text="<a href='http://example.com'>").css("a")[0],
"",
"http://example.com",
),
# no base url
("<a href='foo'>", "", "foo"),
("<a href='/foo '>", "", "/foo"),
("<a href='//foo '>", "", "//foo"),
("<a href='' data-url='http://example.com'>", "", "http://example.com"),
("<a href='http://example.com'>", "", "http://example.com"),
# invalid url
("<a href='javascript:void(0)'>", "", None),
("<a href=''>", "http://example.com", None),
],
)
def test_extract_link(html_input, base_url, expected_output):
a_node = fromstring(html_input) if isinstance(html_input, str) else html_input
result = extract_link(a_node, base_url)
assert result == expected_output


@pytest.mark.parametrize(
"html_input, base_url, expected_output",
[
# Spaces in the path
(
"<a href='/path/to/resource with spaces'>",
"http://example.com",
"http://example.com/path/to/resource%20with%20spaces",
),
# Missing schema and base_url
(
"<a href='//example.com/foo'>",
"",
"//example.com/foo",
),
# no base url
("<a href='foo'>", "", "foo"),
("<a href='/foo '>", "", "/foo"),
("<a href='//foo '>", "", "//foo"),
("<a href='' data-url='http://example.com'>", "", "http://example.com"),
("<a href='http://example.com'>", "", "http://example.com"),
],
)
def test_extract_safe_link(html_input, base_url, expected_output):
a_node = fromstring(html_input) if isinstance(html_input, str) else html_input
result = extract_link(a_node, base_url, force_safe=True)
assert result == expected_output
23 changes: 11 additions & 12 deletions zyte_parsers/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
)
from parsel import Selector # noqa: F401
from w3lib.html import strip_html5_whitespace
from w3lib.url import safe_url_string

from zyte_parsers.api import SelectorOrElement, input_to_element

Expand Down Expand Up @@ -55,19 +56,11 @@ def strip_urljoin(base_url: Optional[str], url: Optional[str]) -> str:
return urljoin(base_url or "", url or "")


def extract_link(a_node: SelectorOrElement, base_url: str) -> Optional[str]:
def extract_link(
a_node: SelectorOrElement, base_url: str, force_safe: bool = False
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As I understand you're adding force_safe to make new behavior optional, I wonder if it's possible to apply that unconditionally to keep API simpler, or there are any cases where you are worried it can break working code?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, actually I don't know where and in what cases this feature is used and decided to add it in such a way to prevent any unpredictable problems. And also for the cases if someone wants to use this without changing the link at all.

) -> Optional[str]:
"""
Extract the absolute url link from an ``<a>`` HTML tag.

>>> extract_link(fromstring("<a href=' http://example.com'"), "")
'http://example.com'
>>> extract_link(fromstring("<a href='/foo '"), "http://example.com")
'http://example.com/foo'
>>> extract_link(fromstring("<a href='' data-url='http://example.com'"), "")
'http://example.com'
>>> extract_link(fromstring("<a href='javascript:void(0)'"), "")
>>> extract_link(Selector(text="<a href='http://example.com'").css("a")[0], "")
'http://example.com'
"""
a_node = input_to_element(a_node)
link = a_node.get("href") or a_node.get("data-url")
Expand All @@ -80,7 +73,13 @@ def extract_link(a_node: SelectorOrElement, base_url: str) -> Optional[str]:
except ValueError:
link = None

return link
if not link or not force_safe:
return link

try:
return safe_url_string(link)
except ValueError:
return None


def extract_text(
Expand Down
Loading