diff --git a/docs/guides/code_examples/http_crawlers/selectolax_parser.py b/docs/guides/code_examples/http_crawlers/selectolax_parser.py
index 0c38b1e9bf..b8fca8b38c 100644
--- a/docs/guides/code_examples/http_crawlers/selectolax_parser.py
+++ b/docs/guides/code_examples/http_crawlers/selectolax_parser.py
@@ -45,7 +45,7 @@ def is_matching_selector(
@override
def find_links(
- self, parsed_content: LexborHTMLParser, selector: str
+ self, parsed_content: LexborHTMLParser, selector: str, attribute: str
) -> Iterable[str]:
"""Extract href attributes from elements matching the selector.
@@ -54,7 +54,7 @@ def find_links(
link: LexborNode
urls: list[str] = []
for link in parsed_content.css(selector):
- url = link.attributes.get('href')
+ url = link.attributes.get(attribute)
if url:
urls.append(url.strip())
return urls
diff --git a/src/crawlee/_types.py b/src/crawlee/_types.py
index 23090311c2..baa0746009 100644
--- a/src/crawlee/_types.py
+++ b/src/crawlee/_types.py
@@ -387,6 +387,7 @@ def __call__(
self,
*,
selector: str | None = None,
+ attribute: str | None = None,
label: str | None = None,
user_data: dict[str, Any] | None = None,
transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
@@ -411,6 +412,7 @@ def __call__(
self,
*,
selector: str | None = None,
+ attribute: str | None = None,
label: str | None = None,
user_data: dict[str, Any] | None = None,
transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
@@ -428,6 +430,7 @@ def __call__(
- `PlaywrightCrawler` supports CSS and XPath selectors.
- `ParselCrawler` supports CSS selectors.
- `BeautifulSoupCrawler` supports CSS selectors.
+ attribute: Which node attribute to extract the links from.
label: Label for the newly created `Request` objects, used for request routing.
user_data: User data to be provided to the newly created `Request` objects.
transform_request_function: A function that takes `RequestOptions` and returns either:
@@ -457,6 +460,7 @@ def __call__(
self,
*,
selector: str = 'a',
+ attribute: str = 'href',
label: str | None = None,
user_data: dict[str, Any] | None = None,
transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
@@ -470,6 +474,7 @@ def __call__(
- `PlaywrightCrawler` supports CSS and XPath selectors.
- `ParselCrawler` supports CSS selectors.
- `BeautifulSoupCrawler` supports CSS selectors.
+ attribute: Which node attribute to extract the links from.
label: Label for the newly created `Request` objects, used for request routing.
user_data: User data to be provided to the newly created `Request` objects.
transform_request_function: A function that takes `RequestOptions` and returns either:
diff --git a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py
index 7aafa49e2e..f1b87f8ac3 100644
--- a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py
+++ b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py
@@ -176,6 +176,7 @@ def _create_extract_links_function(
async def extract_links(
*,
selector: str = 'a',
+ attribute: str = 'href',
label: str | None = None,
user_data: dict[str, Any] | None = None,
transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction]
@@ -191,10 +192,12 @@ async def extract_links(
kwargs.setdefault('strategy', 'same-hostname')
strategy = kwargs.get('strategy', 'same-hostname')
- links_iterator: Iterator[str] = iter(self._parser.find_links(parsed_content, selector=selector))
+ links_iterator: Iterator[str] = iter(
+ self._parser.find_links(parsed_content, selector=selector, attribute=attribute)
+ )
# Get base URL from tag if present
- extracted_base_urls = list(self._parser.find_links(parsed_content, 'base[href]'))
+ extracted_base_urls = list(self._parser.find_links(parsed_content, 'base[href]', 'href'))
base_url: str = (
str(extracted_base_urls[0])
if extracted_base_urls
diff --git a/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py b/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py
index 13ef57e9ea..5f64565e76 100644
--- a/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py
+++ b/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py
@@ -93,12 +93,13 @@ def is_matching_selector(self, parsed_content: TParseResult, selector: str) -> b
"""
@abstractmethod
- def find_links(self, parsed_content: TParseResult, selector: str) -> Iterable[str]:
+ def find_links(self, parsed_content: TParseResult, selector: str, attribute: str) -> Iterable[str]:
"""Find all links in result using selector.
Args:
parsed_content: Parsed HTTP response. Result of `parse` method.
selector: String used to define matching pattern for finding links.
+ attribute: Which node attribute to extract the links from.
Returns:
Iterable of strings that contain found links.
diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py
index 89eb252d91..53c37416e0 100644
--- a/src/crawlee/crawlers/_basic/_basic_crawler.py
+++ b/src/crawlee/crawlers/_basic/_basic_crawler.py
@@ -997,6 +997,7 @@ def _create_enqueue_links_function(
async def enqueue_links(
*,
selector: str | None = None,
+ attribute: str | None = None,
label: str | None = None,
user_data: dict[str, Any] | None = None,
transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction]
@@ -1010,9 +1011,9 @@ async def enqueue_links(
kwargs.setdefault('strategy', 'same-hostname')
if requests:
- if any((selector, label, user_data, transform_request_function)):
+ if any((selector, attribute, label, user_data, transform_request_function)):
raise ValueError(
- 'You cannot provide `selector`, `label`, `user_data` or '
+ 'You cannot provide `selector`, `attribute`, `label`, `user_data` or '
'`transform_request_function` arguments when `requests` is provided.'
)
# Add directly passed requests.
@@ -1024,6 +1025,7 @@ async def enqueue_links(
await context.add_requests(
await extract_links(
selector=selector or 'a',
+ attribute=attribute or 'href',
label=label,
user_data=user_data,
transform_request_function=transform_request_function,
diff --git a/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py b/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py
index cd264cd946..735444a576 100644
--- a/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py
+++ b/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py
@@ -38,11 +38,11 @@ async def select(self, parsed_content: Tag, selector: str) -> Sequence[Tag]:
return tuple(match for match in parsed_content.select(selector))
@override
- def find_links(self, parsed_content: Tag, selector: str) -> Iterable[str]:
+ def find_links(self, parsed_content: Tag, selector: str, attribute: str) -> Iterable[str]:
link: Tag
urls: list[str] = []
for link in parsed_content.select(selector):
- url = link.attrs.get('href')
+ url = link.attrs.get(attribute)
if url:
urls.append(url.strip())
return urls
diff --git a/src/crawlee/crawlers/_http/_http_parser.py b/src/crawlee/crawlers/_http/_http_parser.py
index 143629dac4..adfcc52aec 100644
--- a/src/crawlee/crawlers/_http/_http_parser.py
+++ b/src/crawlee/crawlers/_http/_http_parser.py
@@ -43,5 +43,7 @@ def is_matching_selector(self, parsed_content: bytes, selector: str) -> bool: #
return False
@override
- def find_links(self, parsed_content: bytes, selector: str) -> Iterable[str]: # Intentional unused argument.
+ def find_links(
+ self, parsed_content: bytes, selector: str, attribute: str
+ ) -> Iterable[str]: # Intentional unused arguments.
return []
diff --git a/src/crawlee/crawlers/_parsel/_parsel_parser.py b/src/crawlee/crawlers/_parsel/_parsel_parser.py
index f9ca19139a..2b58957edc 100644
--- a/src/crawlee/crawlers/_parsel/_parsel_parser.py
+++ b/src/crawlee/crawlers/_parsel/_parsel_parser.py
@@ -37,11 +37,11 @@ def is_matching_selector(self, parsed_content: Selector, selector: str) -> bool:
return parsed_content.type in ('html', 'xml') and parsed_content.css(selector).get() is not None
@override
- def find_links(self, parsed_content: Selector, selector: str) -> Iterable[str]:
+ def find_links(self, parsed_content: Selector, selector: str, attribute: str) -> Iterable[str]:
link: Selector
urls: list[str] = []
for link in parsed_content.css(selector):
- url = link.xpath('@href').get()
+ url = link.xpath(f'@{attribute}').get()
if url:
urls.append(url.strip())
return urls
diff --git a/src/crawlee/crawlers/_playwright/_playwright_crawler.py b/src/crawlee/crawlers/_playwright/_playwright_crawler.py
index 6f4b2b0e9d..cf70ff94bd 100644
--- a/src/crawlee/crawlers/_playwright/_playwright_crawler.py
+++ b/src/crawlee/crawlers/_playwright/_playwright_crawler.py
@@ -373,6 +373,7 @@ def _create_extract_links_function(self, context: PlaywrightPreNavCrawlingContex
async def extract_links(
*,
selector: str = 'a',
+ attribute: str = 'href',
label: str | None = None,
user_data: dict | None = None,
transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction]
@@ -394,7 +395,7 @@ async def extract_links(
elements = await context.page.query_selector_all(selector)
links_iterator: Iterator[str] = iter(
- [url for element in elements if (url := await element.get_attribute('href')) is not None]
+ [url for element in elements if (url := await element.get_attribute(attribute)) is not None]
)
# Get base URL from tag if present
diff --git a/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py b/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py
index 1b8b50777b..ea31cc42b5 100644
--- a/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py
+++ b/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py
@@ -50,20 +50,41 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
await crawler.run(requests)
- first_visited = visit.call_args_list[0][0][0]
- visited = {call[0][0] for call in visit.call_args_list}
-
- assert first_visited == redirect_url
- assert visited == {
- redirect_url,
- str(server_url / 'sub_index'),
- str(server_url / 'page_1'),
- str(server_url / 'page_2'),
- str(server_url / 'page_3'),
- str(server_url / 'page_4'),
- str(server_url / 'base_page'),
- str(server_url / 'base_subpath/page_5'),
- }
+ expected_visit_calls = [
+ mock.call(redirect_url),
+ mock.call(str(server_url / 'sub_index')),
+ mock.call(str(server_url / 'page_1')),
+ mock.call(str(server_url / 'page_2')),
+ mock.call(str(server_url / 'page_3')),
+ mock.call(str(server_url / 'page_4')),
+ mock.call(str(server_url / 'base_page')),
+ mock.call(str(server_url / 'base_subpath/page_5')),
+ ]
+ assert visit.mock_calls[0] == expected_visit_calls[0]
+ visit.assert_has_calls(expected_visit_calls, any_order=True)
+
+
+async def test_enqueue_non_href_links(redirect_server_url: URL, server_url: URL, http_client: HttpClient) -> None:
+ redirect_target = str(server_url / 'start_enqueue_non_href')
+ redirect_url = str(redirect_server_url.with_path('redirect').with_query(url=redirect_target))
+ requests = [redirect_url]
+
+ crawler = BeautifulSoupCrawler(http_client=http_client)
+ visit = mock.Mock()
+
+ @crawler.router.default_handler
+ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
+ visit(context.request.url)
+ await context.enqueue_links(selector='img', attribute='src')
+
+ await crawler.run(requests)
+
+ expected_visit_calls = [
+ mock.call(redirect_url),
+ mock.call(str(server_url / 'base_subpath/image_1')),
+ mock.call(str(server_url / 'image_2')),
+ ]
+ visit.assert_has_calls(expected_visit_calls, any_order=True)
async def test_enqueue_links_selector(server_url: URL, http_client: HttpClient) -> None:
@@ -77,8 +98,11 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
await crawler.run([str(server_url / 'start_enqueue')])
- visited = {call[0][0] for call in visit.call_args_list}
- assert visited == {str(server_url / 'start_enqueue'), str(server_url / 'sub_index')}
+ expected_visit_calls = [
+ mock.call(str(server_url / 'start_enqueue')),
+ mock.call(str(server_url / 'sub_index')),
+ ]
+ visit.assert_has_calls(expected_visit_calls, any_order=True)
async def test_enqueue_links_with_max_crawl(server_url: URL, http_client: HttpClient) -> None:
@@ -128,18 +152,17 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
await crawler.run([str(server_url / 'start_enqueue')])
- visited = {call[0][0] for call in visit.call_args_list}
-
# url /page_3 should not be visited
- assert visited == {
- str(server_url / 'start_enqueue'),
- str(server_url / 'sub_index'),
- str(server_url / 'page_1'),
- str(server_url / 'page_2'),
- str(server_url / 'base_page'),
- str(server_url / 'page_4'),
- str(server_url / 'base_subpath/page_5'),
- }
+ expected_visit_calls = [
+ mock.call(str(server_url / 'start_enqueue')),
+ mock.call(str(server_url / 'sub_index')),
+ mock.call(str(server_url / 'page_1')),
+ mock.call(str(server_url / 'page_2')),
+ mock.call(str(server_url / 'base_page')),
+ mock.call(str(server_url / 'page_4')),
+ mock.call(str(server_url / 'base_subpath/page_5')),
+ ]
+ visit.assert_has_calls(expected_visit_calls, any_order=True)
# # all urls added to `enqueue_links` must have a custom header
assert headers[1]['transform-header'] == 'my-header'
@@ -167,14 +190,14 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
await context.enqueue_links()
await crawler.run([str(server_url / 'start_enqueue')])
- visited = {call[0][0] for call in visit.call_args_list}
- assert visited == {
- str(server_url / 'start_enqueue'),
- str(server_url / 'sub_index'),
- str(server_url / 'base_page'),
- str(server_url / 'base_subpath/page_5'),
- }
+ expected_visit_calls = [
+ mock.call(str(server_url / 'start_enqueue')),
+ mock.call(str(server_url / 'sub_index')),
+ mock.call(str(server_url / 'base_page')),
+ mock.call(str(server_url / 'base_subpath/page_5')),
+ ]
+ visit.assert_has_calls(expected_visit_calls, any_order=True)
async def test_respect_robots_txt_with_problematic_links(server_url: URL, http_client: HttpClient) -> None:
@@ -198,17 +221,19 @@ async def error_handler(context: BasicCrawlingContext, _error: Exception) -> Non
await crawler.run([str(server_url / 'problematic_links')])
- visited = {call[0][0] for call in visit.call_args_list}
- failed = {call[0][0] for call in fail.call_args_list}
-
# Email must be skipped
# https://avatars.githubusercontent.com/apify does not get robots.txt, but is correct for the crawler.
- assert visited == {str(server_url / 'problematic_links'), 'https://avatars.githubusercontent.com/apify'}
+ expected_visit_calls = [
+ mock.call(str(server_url / 'problematic_links')),
+ mock.call('https://avatars.githubusercontent.com/apify'),
+ ]
+ visit.assert_has_calls(expected_visit_calls, any_order=True)
# The budplaceholder.com does not exist.
- assert failed == {
- 'https://budplaceholder.com/',
- }
+ expected_fail_calls = [
+ mock.call('https://budplaceholder.com/'),
+ ]
+ fail.assert_has_calls(expected_fail_calls, any_order=True)
async def test_on_skipped_request(server_url: URL, http_client: HttpClient) -> None:
@@ -225,14 +250,13 @@ async def skipped_hook(url: str, _reason: SkippedReason) -> None:
await crawler.run([str(server_url / 'start_enqueue')])
- skipped = {call[0][0] for call in skip.call_args_list}
-
- assert skipped == {
- str(server_url / 'page_1'),
- str(server_url / 'page_2'),
- str(server_url / 'page_3'),
- str(server_url / 'page_4'),
- }
+ expected_skip_calls = [
+ mock.call(str(server_url / 'page_1')),
+ mock.call(str(server_url / 'page_2')),
+ mock.call(str(server_url / 'page_3')),
+ mock.call(str(server_url / 'page_4')),
+ ]
+ skip.assert_has_calls(expected_skip_calls, any_order=True)
async def test_extract_links(server_url: URL, http_client: HttpClient) -> None:
@@ -250,6 +274,21 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
assert extracted_links[0] == str(server_url / 'page_1')
+async def test_extract_non_href_links(server_url: URL, http_client: HttpClient) -> None:
+ crawler = BeautifulSoupCrawler(http_client=http_client)
+ extracted_links: list[str] = []
+
+ @crawler.router.default_handler
+ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
+ links = await context.extract_links(selector='li', attribute='data-href')
+ extracted_links.extend(request.url for request in links)
+
+ await crawler.run([str(server_url / 'non_href_links')])
+
+ assert len(extracted_links) == 1
+ assert extracted_links[0] == str(server_url / 'page_2')
+
+
@pytest.mark.parametrize(
('queue_name', 'queue_alias', 'by_id'),
[
@@ -444,12 +483,9 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
await crawler.run(requests)
- first_visited = visit.call_args_list[0][0][0]
- visited = {call[0][0] for call in visit.call_args_list}
-
- assert first_visited == start_url
# Only one link should be enqueued from sub_index due to the limit
- assert visited == {
- start_url,
- str(server_url / 'page_3'),
- }
+ expected_visit_calls = [
+ mock.call(start_url),
+ mock.call(str(server_url / 'page_3')),
+ ]
+ visit.assert_has_calls(expected_visit_calls, any_order=True)
diff --git a/tests/unit/crawlers/_parsel/test_parsel_crawler.py b/tests/unit/crawlers/_parsel/test_parsel_crawler.py
index 648b6ee9c0..71d8ada1c5 100644
--- a/tests/unit/crawlers/_parsel/test_parsel_crawler.py
+++ b/tests/unit/crawlers/_parsel/test_parsel_crawler.py
@@ -51,20 +51,41 @@ async def request_handler(context: ParselCrawlingContext) -> None:
await crawler.run(requests)
- first_visited = visit.call_args_list[0][0][0]
- visited = {call[0][0] for call in visit.call_args_list}
-
- assert first_visited == redirect_url
- assert visited == {
- redirect_url,
- str(server_url / 'sub_index'),
- str(server_url / 'page_1'),
- str(server_url / 'page_2'),
- str(server_url / 'page_3'),
- str(server_url / 'page_4'),
- str(server_url / 'base_page'),
- str(server_url / 'base_subpath/page_5'),
- }
+ expected_visit_calls = [
+ mock.call(redirect_url),
+ mock.call(str(server_url / 'sub_index')),
+ mock.call(str(server_url / 'page_1')),
+ mock.call(str(server_url / 'page_2')),
+ mock.call(str(server_url / 'page_3')),
+ mock.call(str(server_url / 'page_4')),
+ mock.call(str(server_url / 'base_page')),
+ mock.call(str(server_url / 'base_subpath/page_5')),
+ ]
+ assert visit.mock_calls[0] == expected_visit_calls[0]
+ visit.assert_has_calls(expected_visit_calls, any_order=True)
+
+
+async def test_enqueue_non_href_links(redirect_server_url: URL, server_url: URL, http_client: HttpClient) -> None:
+ redirect_target = str(server_url / 'start_enqueue_non_href')
+ redirect_url = str(redirect_server_url.with_path('redirect').with_query(url=redirect_target))
+ requests = [redirect_url]
+
+ crawler = ParselCrawler(http_client=http_client)
+ visit = mock.Mock()
+
+ @crawler.router.default_handler
+ async def request_handler(context: ParselCrawlingContext) -> None:
+ visit(context.request.url)
+ await context.enqueue_links(selector='img', attribute='src')
+
+ await crawler.run(requests)
+
+ expected_visit_calls = [
+ mock.call(redirect_url),
+ mock.call(str(server_url / 'base_subpath/image_1')),
+ mock.call(str(server_url / 'image_2')),
+ ]
+ visit.assert_has_calls(expected_visit_calls, any_order=True)
async def test_enqueue_links_with_incompatible_kwargs_raises_error(server_url: URL) -> None:
@@ -97,8 +118,11 @@ async def request_handler(context: ParselCrawlingContext) -> None:
await crawler.run([str(server_url / 'start_enqueue')])
- visited = {call[0][0] for call in visit.call_args_list}
- assert visited == {str(server_url / 'start_enqueue'), str(server_url / 'sub_index')}
+ expected_visit_calls = [
+ mock.call(str(server_url / 'start_enqueue')),
+ mock.call(str(server_url / 'sub_index')),
+ ]
+ visit.assert_has_calls(expected_visit_calls, any_order=True)
async def test_enqueue_links_with_max_crawl(server_url: URL, http_client: HttpClient) -> None:
@@ -147,20 +171,19 @@ async def request_handler(context: ParselCrawlingContext) -> None:
await crawler.run([str(server_url / 'start_enqueue')])
- visited = {call[0][0] for call in visit.call_args_list}
-
# url /page_3 should not be visited
- assert visited == {
- str(server_url / 'start_enqueue'),
- str(server_url / 'sub_index'),
- str(server_url / 'page_1'),
- str(server_url / 'page_2'),
- str(server_url / 'page_4'),
- str(server_url / 'base_page'),
- str(server_url / 'base_subpath/page_5'),
- }
-
- # # all urls added to `enqueue_links` must have a custom header
+ expected_visit_calls = [
+ mock.call(str(server_url / 'start_enqueue')),
+ mock.call(str(server_url / 'sub_index')),
+ mock.call(str(server_url / 'page_1')),
+ mock.call(str(server_url / 'page_2')),
+ mock.call(str(server_url / 'page_4')),
+ mock.call(str(server_url / 'base_page')),
+ mock.call(str(server_url / 'base_subpath/page_5')),
+ ]
+ visit.assert_has_calls(expected_visit_calls, any_order=True)
+
+ # all urls added to `enqueue_links` must have a custom header
assert headers[1]['transform-header'] == 'my-header'
assert headers[2]['transform-header'] == 'my-header'
assert headers[3]['transform-header'] == 'my-header'
@@ -260,14 +283,14 @@ async def request_handler(context: ParselCrawlingContext) -> None:
await context.enqueue_links()
await crawler.run([str(server_url / 'start_enqueue')])
- visited = {call[0][0] for call in visit.call_args_list}
- assert visited == {
- str(server_url / 'start_enqueue'),
- str(server_url / 'sub_index'),
- str(server_url / 'base_page'),
- str(server_url / 'base_subpath/page_5'),
- }
+ expected_visit_calls = [
+ mock.call(str(server_url / 'start_enqueue')),
+ mock.call(str(server_url / 'sub_index')),
+ mock.call(str(server_url / 'base_page')),
+ mock.call(str(server_url / 'base_subpath/page_5')),
+ ]
+ visit.assert_has_calls(expected_visit_calls, any_order=True)
async def test_respect_robots_txt_with_problematic_links(server_url: URL, http_client: HttpClient) -> None:
@@ -291,17 +314,19 @@ async def error_handler(context: BasicCrawlingContext, _error: Exception) -> Non
await crawler.run([str(server_url / 'problematic_links')])
- visited = {call[0][0] for call in visit.call_args_list}
- failed = {call[0][0] for call in fail.call_args_list}
-
# Email must be skipped
# https://avatars.githubusercontent.com/apify does not get robots.txt, but is correct for the crawler.
- assert visited == {str(server_url / 'problematic_links'), 'https://avatars.githubusercontent.com/apify'}
+ expected_visit_calls = [
+ mock.call(str(server_url / 'problematic_links')),
+ mock.call('https://avatars.githubusercontent.com/apify'),
+ ]
+ visit.assert_has_calls(expected_visit_calls, any_order=True)
# The budplaceholder.com does not exist.
- assert failed == {
- 'https://budplaceholder.com/',
- }
+ expected_fail_calls = [
+ mock.call('https://budplaceholder.com/'),
+ ]
+ fail.assert_has_calls(expected_fail_calls, any_order=True)
async def test_on_skipped_request(server_url: URL, http_client: HttpClient) -> None:
@@ -318,14 +343,13 @@ async def skipped_hook(url: str, _reason: SkippedReason) -> None:
await crawler.run([str(server_url / 'start_enqueue')])
- skipped = {call[0][0] for call in skip.call_args_list}
-
- assert skipped == {
- str(server_url / 'page_1'),
- str(server_url / 'page_2'),
- str(server_url / 'page_3'),
- str(server_url / 'page_4'),
- }
+ expected_skip_calls = [
+ mock.call(str(server_url / 'page_1')),
+ mock.call(str(server_url / 'page_2')),
+ mock.call(str(server_url / 'page_3')),
+ mock.call(str(server_url / 'page_4')),
+ ]
+ skip.assert_has_calls(expected_skip_calls, any_order=True)
async def test_extract_links(server_url: URL, http_client: HttpClient) -> None:
@@ -343,6 +367,21 @@ async def request_handler(context: ParselCrawlingContext) -> None:
assert extracted_links[0] == str(server_url / 'page_1')
+async def test_extract_non_href_links(server_url: URL, http_client: HttpClient) -> None:
+ crawler = ParselCrawler(http_client=http_client)
+ extracted_links: list[str] = []
+
+ @crawler.router.default_handler
+ async def request_handler(context: ParselCrawlingContext) -> None:
+ links = await context.extract_links(selector='li', attribute='data-href')
+ extracted_links.extend(request.url for request in links)
+
+ await crawler.run([str(server_url / 'non_href_links')])
+
+ assert len(extracted_links) == 1
+ assert extracted_links[0] == str(server_url / 'page_2')
+
+
@pytest.mark.parametrize(
('queue_name', 'queue_alias', 'by_id'),
[
@@ -461,12 +500,9 @@ async def request_handler(context: ParselCrawlingContext) -> None:
await crawler.run(requests)
- first_visited = visit.call_args_list[0][0][0]
- visited = {call[0][0] for call in visit.call_args_list}
-
- assert first_visited == start_url
# Only one link should be enqueued from sub_index due to the limit
- assert visited == {
- start_url,
- str(server_url / 'page_3'),
- }
+ expected_visit_calls = [
+ mock.call(start_url),
+ mock.call(str(server_url / 'page_3')),
+ ]
+ visit.assert_has_calls(expected_visit_calls, any_order=True)
diff --git a/tests/unit/crawlers/_playwright/test_playwright_crawler.py b/tests/unit/crawlers/_playwright/test_playwright_crawler.py
index e0ada5de1c..8ea80fdef5 100644
--- a/tests/unit/crawlers/_playwright/test_playwright_crawler.py
+++ b/tests/unit/crawlers/_playwright/test_playwright_crawler.py
@@ -89,19 +89,40 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
await crawler.run(requests)
- first_visited = visit.call_args_list[0][0][0]
- visited = {call[0][0] for call in visit.call_args_list[1:]}
-
- assert first_visited == redirect_url
- assert visited == {
- str(server_url / 'sub_index'),
- str(server_url / 'page_1'),
- str(server_url / 'page_2'),
- str(server_url / 'page_3'),
- str(server_url / 'page_4'),
- str(server_url / 'base_page'),
- str(server_url / 'base_subpath/page_5'),
- }
+ expected_visit_calls = [
+ mock.call(redirect_url),
+ mock.call(str(server_url / 'sub_index')),
+ mock.call(str(server_url / 'page_1')),
+ mock.call(str(server_url / 'page_2')),
+ mock.call(str(server_url / 'page_3')),
+ mock.call(str(server_url / 'page_4')),
+ mock.call(str(server_url / 'base_page')),
+ mock.call(str(server_url / 'base_subpath/page_5')),
+ ]
+ assert visit.mock_calls[0] == expected_visit_calls[0]
+ visit.assert_has_calls(expected_visit_calls, any_order=True)
+
+
+async def test_enqueue_non_href_links(redirect_server_url: URL, server_url: URL) -> None:
+ redirect_target = str(server_url / 'start_enqueue_non_href')
+ redirect_url = str(redirect_server_url.with_path('redirect').with_query(url=redirect_target))
+ requests = [redirect_url]
+ crawler = PlaywrightCrawler()
+ visit = mock.Mock()
+
+ @crawler.router.default_handler
+ async def request_handler(context: PlaywrightCrawlingContext) -> None:
+ visit(context.request.url)
+ await context.enqueue_links(selector='img', attribute='src')
+
+ await crawler.run(requests)
+
+ expected_visit_calls = [
+ mock.call(redirect_url),
+ mock.call(str(server_url / 'base_subpath/image_1')),
+ mock.call(str(server_url / 'image_2')),
+ ]
+ visit.assert_has_calls(expected_visit_calls, any_order=True)
async def test_enqueue_links_with_incompatible_kwargs_raises_error(server_url: URL) -> None:
@@ -146,9 +167,11 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
await crawler.run([str(server_url / 'start_enqueue')])
- visited = {call[0][0] for call in visit.call_args_list}
-
- assert visited == {str(server_url / 'start_enqueue'), str(server_url / 'sub_index')}
+ expected_visit_calls = [
+ mock.call(str(server_url / 'start_enqueue')),
+ mock.call(str(server_url / 'sub_index')),
+ ]
+ visit.assert_has_calls(expected_visit_calls, any_order=True)
# all urls added to `enqueue_links` must have a custom header
assert headers[1]['transform-header'] == 'my-header'
@@ -676,14 +699,14 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
await context.enqueue_links()
await crawler.run([str(server_url / 'start_enqueue')])
- visited = {call[0][0] for call in visit.call_args_list}
- assert visited == {
- str(server_url / 'start_enqueue'),
- str(server_url / 'sub_index'),
- str(server_url / 'base_page'),
- str(server_url / 'base_subpath/page_5'),
- }
+ expected_visit_calls = [
+ mock.call(str(server_url / 'start_enqueue')),
+ mock.call(str(server_url / 'sub_index')),
+ mock.call(str(server_url / 'base_page')),
+ mock.call(str(server_url / 'base_subpath/page_5')),
+ ]
+ visit.assert_has_calls(expected_visit_calls, any_order=True)
async def test_respect_robots_txt_with_problematic_links(server_url: URL) -> None:
@@ -706,17 +729,19 @@ async def error_handler(context: BasicCrawlingContext, _error: Exception) -> Non
await crawler.run([str(server_url / 'problematic_links')])
- visited = {call[0][0] for call in visit.call_args_list}
- failed = {call[0][0] for call in fail.call_args_list}
-
# Email must be skipped
# https://avatars.githubusercontent.com/apify does not get robots.txt, but is correct for the crawler.
- assert visited == {str(server_url / 'problematic_links'), 'https://avatars.githubusercontent.com/apify'}
+ expected_visit_calls = [
+ mock.call(str(server_url / 'problematic_links')),
+ mock.call('https://avatars.githubusercontent.com/apify'),
+ ]
+ visit.assert_has_calls(expected_visit_calls, any_order=True)
# The budplaceholder.com does not exist.
- assert failed == {
- 'https://budplaceholder.com/',
- }
+ expected_fail_calls = [
+ mock.call('https://budplaceholder.com/'),
+ ]
+ fail.assert_has_calls(expected_fail_calls, any_order=True)
async def test_on_skipped_request(server_url: URL) -> None:
@@ -733,14 +758,13 @@ async def skipped_hook(url: str, _reason: SkippedReason) -> None:
await crawler.run([str(server_url / 'start_enqueue')])
- skipped = {call[0][0] for call in skip.call_args_list}
-
- assert skipped == {
- str(server_url / 'page_1'),
- str(server_url / 'page_2'),
- str(server_url / 'page_3'),
- str(server_url / 'page_4'),
- }
+ expected_skip_calls = [
+ mock.call(str(server_url / 'page_1')),
+ mock.call(str(server_url / 'page_2')),
+ mock.call(str(server_url / 'page_3')),
+ mock.call(str(server_url / 'page_4')),
+ ]
+ skip.assert_has_calls(expected_skip_calls, any_order=True)
async def test_send_request(server_url: URL) -> None:
@@ -817,6 +841,21 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
assert extracted_links[0] == str(server_url / 'page_1')
+async def test_extract_non_href_links(server_url: URL) -> None:
+ crawler = PlaywrightCrawler()
+ extracted_links: list[str] = []
+
+ @crawler.router.default_handler
+ async def request_handler(context: PlaywrightCrawlingContext) -> None:
+ links = await context.extract_links(selector='li', attribute='data-href')
+ extracted_links.extend(request.url for request in links)
+
+ await crawler.run([str(server_url / 'non_href_links')])
+
+ assert len(extracted_links) == 1
+ assert extracted_links[0] == str(server_url / 'page_2')
+
+
async def test_reduced_logs_from_playwright_navigation_timeout(caplog: pytest.LogCaptureFixture) -> None:
caplog.set_level(logging.INFO)
crawler = PlaywrightCrawler(configure_logging=False)
@@ -1072,12 +1111,9 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
await crawler.run(requests)
- first_visited = visit.call_args_list[0][0][0]
- visited = {call[0][0] for call in visit.call_args_list}
-
- assert first_visited == start_url
# Only one link should be enqueued from sub_index due to the limit
- assert visited == {
- start_url,
- str(server_url / 'page_3'),
- }
+ expected_visit_calls = [
+ mock.call(start_url),
+ mock.call(str(server_url / 'page_3')),
+ ]
+ visit.assert_has_calls(expected_visit_calls, any_order=True)
diff --git a/tests/unit/server.py b/tests/unit/server.py
index dd7edd59e8..69de104149 100644
--- a/tests/unit/server.py
+++ b/tests/unit/server.py
@@ -20,11 +20,13 @@
HELLO_WORLD,
INCAPSULA,
INFINITE_SCROLL,
+ NON_HREF_LINKS,
PROBLEMATIC_LINKS,
RESOURCE_LOADING_PAGE,
ROBOTS_TXT,
SECONDARY_INDEX,
START_ENQUEUE,
+ START_ENQUEUE_NON_HREF,
)
if TYPE_CHECKING:
@@ -101,6 +103,7 @@ async def app(scope: dict[str, Any], receive: Receive, send: Send) -> None:
assert scope['type'] == 'http'
paths: dict[str, PathHandler] = {
'start_enqueue': start_enqueue_endpoint,
+ 'start_enqueue_non_href': start_enqueue_non_href_endpoint,
'sub_index': secondary_index_endpoint,
'incapsula': incapsula_endpoint,
'page_1': generic_response_endpoint,
@@ -108,6 +111,7 @@ async def app(scope: dict[str, Any], receive: Receive, send: Send) -> None:
'page_3': generic_response_endpoint,
'base_page': base_index_endpoint,
'problematic_links': problematic_links_endpoint,
+ 'non_href_links': non_href_links_endpoint,
'set_cookies': set_cookies,
'set_complex_cookies': set_complex_cookies,
'cookies': get_cookies,
@@ -304,6 +308,14 @@ async def problematic_links_endpoint(_scope: dict[str, Any], _receive: Receive,
)
+async def non_href_links_endpoint(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:
+ """Handle requests with a page containing non-href links."""
+ await send_html_response(
+ send,
+ NON_HREF_LINKS,
+ )
+
+
async def redirect_to_url(scope: dict[str, Any], _receive: Receive, send: Send) -> None:
"""Handle requests that should redirect to a specified full URL."""
query_params = get_query_params(scope.get('query_string', b''))
@@ -453,6 +465,16 @@ async def base_index_endpoint(_scope: dict[str, Any], _receive: Receive, send: S
)
+async def start_enqueue_non_href_endpoint(scope: dict[str, Any], _receive: Receive, send: Send) -> None:
+ """Handle requests for the start page containing non-href (e.g. `src`) links."""
+ host = f'http://{get_headers_dict(scope).get("host", "localhost")}'
+ content = START_ENQUEUE_NON_HREF.format(host=host).encode()
+ await send_html_response(
+ send,
+ content,
+ )
+
+
class TestServer(Server):
"""A test HTTP server implementation based on Uvicorn Server."""
diff --git a/tests/unit/server_endpoints.py b/tests/unit/server_endpoints.py
index fede2ee173..b32d136524 100644
--- a/tests/unit/server_endpoints.py
+++ b/tests/unit/server_endpoints.py
@@ -17,6 +17,18 @@
test@test.com