-
Notifications
You must be signed in to change notification settings - Fork 3
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Make URL from extract_link safe and add missing scheme #19
base: main
Are you sure you want to change the base?
Changes from 6 commits
0d4669d
138fb39
0fcfaba
4b854dc
8c77922
6bbc25d
209fbe7
ef9925d
9a31516
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,72 @@ | ||
from pathlib import Path | ||
|
||
import pytest | ||
from parsel import Selector # noqa: F401 | ||
|
||
from zyte_parsers.utils import extract_link, fromstring | ||
|
||
TEST_DATA_ROOT = Path(__file__).parent / "data" | ||
|
||
|
||
@pytest.mark.parametrize( | ||
"html_input, base_url, expected_output", | ||
[ | ||
("<a href=' http://example.com'>", "", "http://example.com"), | ||
("<a href='foo'>", "http://example.com", "http://example.com/foo"), | ||
("<a href='/foo '>", "http://example.com", "http://example.com/foo"), | ||
("<a href='//foo '>", "http://example.com", "http://foo"), | ||
( | ||
"<a href='//example.com/foo'>", | ||
"http://example.com", | ||
"http://example.com/foo", | ||
), | ||
# Selector | ||
( | ||
Selector(text="<a href='http://example.com'>").css("a")[0], | ||
"", | ||
"http://example.com", | ||
), | ||
# no base url | ||
("<a href='foo'>", "", "foo"), | ||
("<a href='/foo '>", "", "/foo"), | ||
("<a href='//foo '>", "", "//foo"), | ||
("<a href='' data-url='http://example.com'>", "", "http://example.com"), | ||
("<a href='http://example.com'>", "", "http://example.com"), | ||
# invalid url | ||
("<a href='javascript:void(0)'>", "", None), | ||
("<a href=''>", "http://example.com", None), | ||
], | ||
) | ||
def test_extract_link(html_input, base_url, expected_output): | ||
a_node = fromstring(html_input) if isinstance(html_input, str) else html_input | ||
result = extract_link(a_node, base_url) | ||
assert result == expected_output | ||
|
||
|
||
@pytest.mark.parametrize( | ||
"html_input, base_url, expected_output", | ||
[ | ||
# Spaces in the path | ||
( | ||
"<a href='/path/to/resource with spaces'>", | ||
"http://example.com", | ||
"http://example.com/path/to/resource%20with%20spaces", | ||
), | ||
# Missing schema and base_url | ||
( | ||
"<a href='//example.com/foo'>", | ||
"", | ||
"https://example.com/foo", | ||
), | ||
# no base url | ||
("<a href='foo'>", "", "foo"), | ||
("<a href='/foo '>", "", "/foo"), | ||
("<a href='//foo '>", "", "https://foo"), | ||
("<a href='' data-url='http://example.com'>", "", "http://example.com"), | ||
("<a href='http://example.com'>", "", "http://example.com"), | ||
], | ||
) | ||
def test_extract_safe_link(html_input, base_url, expected_output): | ||
a_node = fromstring(html_input) if isinstance(html_input, str) else html_input | ||
result = extract_link(a_node, base_url, force_safe=True) | ||
assert result == expected_output |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
import itertools | ||
from typing import Any, Callable, Iterable, Optional | ||
from urllib.parse import urljoin | ||
from urllib.parse import urljoin, urlparse, urlunparse | ||
|
||
import html_text | ||
from lxml.html import ( # noqa: F401 | ||
|
@@ -11,6 +11,7 @@ | |
) | ||
from parsel import Selector # noqa: F401 | ||
from w3lib.html import strip_html5_whitespace | ||
from w3lib.url import safe_url_string | ||
|
||
from zyte_parsers.api import SelectorOrElement, input_to_element | ||
|
||
|
@@ -55,19 +56,22 @@ def strip_urljoin(base_url: Optional[str], url: Optional[str]) -> str: | |
return urljoin(base_url or "", url or "") | ||
|
||
|
||
def extract_link(a_node: SelectorOrElement, base_url: str) -> Optional[str]: | ||
def add_https_to_url(url: str) -> str: | ||
if url.startswith(("http://", "https://")): | ||
return url | ||
|
||
parsed_url = urlparse(url) | ||
if not parsed_url.scheme and parsed_url.netloc: | ||
parsed_url = parsed_url._replace(scheme="https") | ||
|
||
return str(urlunparse(parsed_url)) | ||
|
||
|
||
def extract_link( | ||
a_node: SelectorOrElement, base_url: str, force_safe: bool = False | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As I understand you're adding There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, actually I don't know where and in what cases this feature is used and decided to add it in such a way to prevent any unpredictable problems. And also for the cases if someone wants to use this without changing the link at all. |
||
) -> Optional[str]: | ||
""" | ||
Extract the absolute url link from an ``<a>`` HTML tag. | ||
|
||
>>> extract_link(fromstring("<a href=' http://example.com'"), "") | ||
'http://example.com' | ||
>>> extract_link(fromstring("<a href='/foo '"), "http://example.com") | ||
'http://example.com/foo' | ||
>>> extract_link(fromstring("<a href='' data-url='http://example.com'"), "") | ||
'http://example.com' | ||
>>> extract_link(fromstring("<a href='javascript:void(0)'"), "") | ||
>>> extract_link(Selector(text="<a href='http://example.com'").css("a")[0], "") | ||
'http://example.com' | ||
""" | ||
a_node = input_to_element(a_node) | ||
link = a_node.get("href") or a_node.get("data-url") | ||
|
@@ -80,7 +84,18 @@ def extract_link(a_node: SelectorOrElement, base_url: str) -> Optional[str]: | |
except ValueError: | ||
link = None | ||
|
||
return link | ||
if not link or not force_safe: | ||
return link | ||
|
||
try: | ||
safe_link = safe_url_string(link) | ||
except ValueError: | ||
return None | ||
|
||
# add scheme (https) when missing schema and no base url | ||
safe_link = add_https_to_url(safe_link) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Super minor: I think "no base url" part of the comment does not quite apply, as this can have effect even if base URL is supplied. Also if we take that away we'd just repeat the function name so maybe we could remove the comment completely. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah, in case of existing |
||
|
||
return safe_link | ||
|
||
|
||
def extract_text( | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hm, why do we need this function? If a URL is written without http or https, it won't work anyways in the browser, right (unless it's a relative URL, which uses different rules - adding http/https is wrong for them)?
If so, why are we adding the missing schema? My concern is that it'd change some invalid URLs to URLs which look valid, but which are not.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hm, or is it the
//
example, i.e. the relative schema ("use whatever protocol the page is using")? In this case, adding https could be not 100% precise, as it should be the protocol of the web page itself.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, the idea is to handle such cases like //example.com when
base_url
here https://github.com/zytedata/zyte-parsers/pull/19/files#diff-0db3ea2158d725e2de6295c907fbe44928efc0c94057ef6ca85dbbee4c733819R83 is not provided.But this function doesn't seem to be a very important feature at the moment and we can easily ignore it.
UPDATE: For the simplicity I removed this function.