From 61d7c12bb2f1e1b082909788b17486adc2f31ba0 Mon Sep 17 00:00:00 2001 From: Valentin Nazarov Date: Fri, 6 Mar 2026 20:00:19 +0300 Subject: [PATCH 1/5] feat: allow non-href links extract & enqueue --- .../http_crawlers/selectolax_parser.py | 4 ++-- src/crawlee/_types.py | 5 +++++ .../_abstract_http/_abstract_http_crawler.py | 7 +++++-- .../_abstract_http/_abstract_http_parser.py | 3 ++- src/crawlee/crawlers/_basic/_basic_crawler.py | 6 ++++-- .../_beautifulsoup/_beautifulsoup_parser.py | 4 ++-- src/crawlee/crawlers/_http/_http_parser.py | 4 +++- src/crawlee/crawlers/_parsel/_parsel_parser.py | 4 ++-- .../crawlers/_playwright/_playwright_crawler.py | 3 ++- .../_beautifulsoup/test_beautifulsoup_crawler.py | 15 +++++++++++++++ .../unit/crawlers/_parsel/test_parsel_crawler.py | 15 +++++++++++++++ .../_playwright/test_playwright_crawler.py | 15 +++++++++++++++ tests/unit/server.py | 10 ++++++++++ tests/unit/server_endpoints.py | 10 ++++++++++ 14 files changed, 92 insertions(+), 13 deletions(-) diff --git a/docs/guides/code_examples/http_crawlers/selectolax_parser.py b/docs/guides/code_examples/http_crawlers/selectolax_parser.py index 0c38b1e9bf..b8fca8b38c 100644 --- a/docs/guides/code_examples/http_crawlers/selectolax_parser.py +++ b/docs/guides/code_examples/http_crawlers/selectolax_parser.py @@ -45,7 +45,7 @@ def is_matching_selector( @override def find_links( - self, parsed_content: LexborHTMLParser, selector: str + self, parsed_content: LexborHTMLParser, selector: str, attribute: str ) -> Iterable[str]: """Extract href attributes from elements matching the selector. 
@@ -54,7 +54,7 @@ def find_links( link: LexborNode urls: list[str] = [] for link in parsed_content.css(selector): - url = link.attributes.get('href') + url = link.attributes.get(attribute) if url: urls.append(url.strip()) return urls diff --git a/src/crawlee/_types.py b/src/crawlee/_types.py index 23090311c2..baa0746009 100644 --- a/src/crawlee/_types.py +++ b/src/crawlee/_types.py @@ -387,6 +387,7 @@ def __call__( self, *, selector: str | None = None, + attribute: str | None = None, label: str | None = None, user_data: dict[str, Any] | None = None, transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None, @@ -411,6 +412,7 @@ def __call__( self, *, selector: str | None = None, + attribute: str | None = None, label: str | None = None, user_data: dict[str, Any] | None = None, transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None, @@ -428,6 +430,7 @@ def __call__( - `PlaywrightCrawler` supports CSS and XPath selectors. - `ParselCrawler` supports CSS selectors. - `BeautifulSoupCrawler` supports CSS selectors. + attribute: Which node attribute to extract the links from. label: Label for the newly created `Request` objects, used for request routing. user_data: User data to be provided to the newly created `Request` objects. transform_request_function: A function that takes `RequestOptions` and returns either: @@ -457,6 +460,7 @@ def __call__( self, *, selector: str = 'a', + attribute: str = 'href', label: str | None = None, user_data: dict[str, Any] | None = None, transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None, @@ -470,6 +474,7 @@ def __call__( - `PlaywrightCrawler` supports CSS and XPath selectors. - `ParselCrawler` supports CSS selectors. - `BeautifulSoupCrawler` supports CSS selectors. + attribute: Which node attribute to extract the links from. 
label: Label for the newly created `Request` objects, used for request routing. user_data: User data to be provided to the newly created `Request` objects. transform_request_function: A function that takes `RequestOptions` and returns either: diff --git a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py index 7aafa49e2e..d146786af2 100644 --- a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +++ b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py @@ -176,6 +176,7 @@ def _create_extract_links_function( async def extract_links( *, selector: str = 'a', + attribute: str = 'href', label: str | None = None, user_data: dict[str, Any] | None = None, transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] @@ -191,10 +192,12 @@ async def extract_links( kwargs.setdefault('strategy', 'same-hostname') strategy = kwargs.get('strategy', 'same-hostname') - links_iterator: Iterator[str] = iter(self._parser.find_links(parsed_content, selector=selector)) + links_iterator: Iterator[str] = iter( + self._parser.find_links(parsed_content, selector=selector, attribute=attribute) + ) # Get base URL from tag if present - extracted_base_urls = list(self._parser.find_links(parsed_content, 'base[href]')) + extracted_base_urls = list(self._parser.find_links(parsed_content, 'base', 'href')) base_url: str = ( str(extracted_base_urls[0]) if extracted_base_urls diff --git a/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py b/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py index 13ef57e9ea..5f64565e76 100644 --- a/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py +++ b/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py @@ -93,12 +93,13 @@ def is_matching_selector(self, parsed_content: TParseResult, selector: str) -> b """ @abstractmethod - def find_links(self, parsed_content: TParseResult, selector: str) -> 
Iterable[str]: + def find_links(self, parsed_content: TParseResult, selector: str, attribute: str) -> Iterable[str]: """Find all links in result using selector. Args: parsed_content: Parsed HTTP response. Result of `parse` method. selector: String used to define matching pattern for finding links. + attribute: Which node attribute to extract the links from. Returns: Iterable of strings that contain found links. diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 89eb252d91..53c37416e0 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -997,6 +997,7 @@ def _create_enqueue_links_function( async def enqueue_links( *, selector: str | None = None, + attribute: str | None = None, label: str | None = None, user_data: dict[str, Any] | None = None, transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] @@ -1010,9 +1011,9 @@ async def enqueue_links( kwargs.setdefault('strategy', 'same-hostname') if requests: - if any((selector, label, user_data, transform_request_function)): + if any((selector, attribute, label, user_data, transform_request_function)): raise ValueError( - 'You cannot provide `selector`, `label`, `user_data` or ' + 'You cannot provide `selector`, `attribute`, `label`, `user_data` or ' '`transform_request_function` arguments when `requests` is provided.' ) # Add directly passed requests. 
@@ -1024,6 +1025,7 @@ async def enqueue_links( await context.add_requests( await extract_links( selector=selector or 'a', + attribute=attribute or 'href', label=label, user_data=user_data, transform_request_function=transform_request_function, diff --git a/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py b/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py index cd264cd946..735444a576 100644 --- a/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py +++ b/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py @@ -38,11 +38,11 @@ async def select(self, parsed_content: Tag, selector: str) -> Sequence[Tag]: return tuple(match for match in parsed_content.select(selector)) @override - def find_links(self, parsed_content: Tag, selector: str) -> Iterable[str]: + def find_links(self, parsed_content: Tag, selector: str, attribute: str) -> Iterable[str]: link: Tag urls: list[str] = [] for link in parsed_content.select(selector): - url = link.attrs.get('href') + url = link.attrs.get(attribute) if url: urls.append(url.strip()) return urls diff --git a/src/crawlee/crawlers/_http/_http_parser.py b/src/crawlee/crawlers/_http/_http_parser.py index 143629dac4..adfcc52aec 100644 --- a/src/crawlee/crawlers/_http/_http_parser.py +++ b/src/crawlee/crawlers/_http/_http_parser.py @@ -43,5 +43,7 @@ def is_matching_selector(self, parsed_content: bytes, selector: str) -> bool: # return False @override - def find_links(self, parsed_content: bytes, selector: str) -> Iterable[str]: # Intentional unused argument. + def find_links( + self, parsed_content: bytes, selector: str, attribute: str + ) -> Iterable[str]: # Intentional unused argument. 
return [] diff --git a/src/crawlee/crawlers/_parsel/_parsel_parser.py b/src/crawlee/crawlers/_parsel/_parsel_parser.py index f9ca19139a..2b58957edc 100644 --- a/src/crawlee/crawlers/_parsel/_parsel_parser.py +++ b/src/crawlee/crawlers/_parsel/_parsel_parser.py @@ -37,11 +37,11 @@ def is_matching_selector(self, parsed_content: Selector, selector: str) -> bool: return parsed_content.type in ('html', 'xml') and parsed_content.css(selector).get() is not None @override - def find_links(self, parsed_content: Selector, selector: str) -> Iterable[str]: + def find_links(self, parsed_content: Selector, selector: str, attribute: str) -> Iterable[str]: link: Selector urls: list[str] = [] for link in parsed_content.css(selector): - url = link.xpath('@href').get() + url = link.xpath(f'@{attribute}').get() if url: urls.append(url.strip()) return urls diff --git a/src/crawlee/crawlers/_playwright/_playwright_crawler.py b/src/crawlee/crawlers/_playwright/_playwright_crawler.py index 6f4b2b0e9d..cf70ff94bd 100644 --- a/src/crawlee/crawlers/_playwright/_playwright_crawler.py +++ b/src/crawlee/crawlers/_playwright/_playwright_crawler.py @@ -373,6 +373,7 @@ def _create_extract_links_function(self, context: PlaywrightPreNavCrawlingContex async def extract_links( *, selector: str = 'a', + attribute: str = 'href', label: str | None = None, user_data: dict | None = None, transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] @@ -394,7 +395,7 @@ async def extract_links( elements = await context.page.query_selector_all(selector) links_iterator: Iterator[str] = iter( - [url for element in elements if (url := await element.get_attribute('href')) is not None] + [url for element in elements if (url := await element.get_attribute(attribute)) is not None] ) # Get base URL from tag if present diff --git a/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py b/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py index 
1b8b50777b..cae0629c41 100644 --- a/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py +++ b/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py @@ -250,6 +250,21 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None: assert extracted_links[0] == str(server_url / 'page_1') +async def test_extract_non_href_links(server_url: URL, http_client: HttpClient) -> None: + crawler = BeautifulSoupCrawler(http_client=http_client) + extracted_links: list[str] = [] + + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + links = await context.extract_links(selector='li', attribute='data-href') + extracted_links.extend(request.url for request in links) + + await crawler.run([str(server_url / 'non_href_links')]) + + assert len(extracted_links) == 1 + assert extracted_links[0] == str(server_url / 'page_2') + + @pytest.mark.parametrize( ('queue_name', 'queue_alias', 'by_id'), [ diff --git a/tests/unit/crawlers/_parsel/test_parsel_crawler.py b/tests/unit/crawlers/_parsel/test_parsel_crawler.py index 648b6ee9c0..b6e0b0e036 100644 --- a/tests/unit/crawlers/_parsel/test_parsel_crawler.py +++ b/tests/unit/crawlers/_parsel/test_parsel_crawler.py @@ -343,6 +343,21 @@ async def request_handler(context: ParselCrawlingContext) -> None: assert extracted_links[0] == str(server_url / 'page_1') +async def test_extract_non_href_links(server_url: URL, http_client: HttpClient) -> None: + crawler = ParselCrawler(http_client=http_client) + extracted_links: list[str] = [] + + @crawler.router.default_handler + async def request_handler(context: ParselCrawlingContext) -> None: + links = await context.extract_links(selector='li', attribute='data-href') + extracted_links.extend(request.url for request in links) + + await crawler.run([str(server_url / 'non_href_links')]) + + assert len(extracted_links) == 1 + assert extracted_links[0] == str(server_url / 'page_2') + + @pytest.mark.parametrize( 
('queue_name', 'queue_alias', 'by_id'), [ diff --git a/tests/unit/crawlers/_playwright/test_playwright_crawler.py b/tests/unit/crawlers/_playwright/test_playwright_crawler.py index e0ada5de1c..315755412e 100644 --- a/tests/unit/crawlers/_playwright/test_playwright_crawler.py +++ b/tests/unit/crawlers/_playwright/test_playwright_crawler.py @@ -817,6 +817,21 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None: assert extracted_links[0] == str(server_url / 'page_1') +async def test_extract_non_href_links(server_url: URL) -> None: + crawler = PlaywrightCrawler() + extracted_links: list[str] = [] + + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + links = await context.extract_links(selector='li', attribute='data-href') + extracted_links.extend(request.url for request in links) + + await crawler.run([str(server_url / 'non_href_links')]) + + assert len(extracted_links) == 1 + assert extracted_links[0] == str(server_url / 'page_2') + + async def test_reduced_logs_from_playwright_navigation_timeout(caplog: pytest.LogCaptureFixture) -> None: caplog.set_level(logging.INFO) crawler = PlaywrightCrawler(configure_logging=False) diff --git a/tests/unit/server.py b/tests/unit/server.py index dd7edd59e8..386f260159 100644 --- a/tests/unit/server.py +++ b/tests/unit/server.py @@ -20,6 +20,7 @@ HELLO_WORLD, INCAPSULA, INFINITE_SCROLL, + NON_HREF_LINKS, PROBLEMATIC_LINKS, RESOURCE_LOADING_PAGE, ROBOTS_TXT, @@ -108,6 +109,7 @@ async def app(scope: dict[str, Any], receive: Receive, send: Send) -> None: 'page_3': generic_response_endpoint, 'base_page': base_index_endpoint, 'problematic_links': problematic_links_endpoint, + 'non_href_links': non_href_links_endpoint, 'set_cookies': set_cookies, 'set_complex_cookies': set_complex_cookies, 'cookies': get_cookies, @@ -304,6 +306,14 @@ async def problematic_links_endpoint(_scope: dict[str, Any], _receive: Receive, ) +async def non_href_links_endpoint(_scope: 
dict[str, Any], _receive: Receive, send: Send) -> None: + """Handle requests with a page containing non-href links.""" + await send_html_response( + send, + NON_HREF_LINKS, + ) + + async def redirect_to_url(scope: dict[str, Any], _receive: Receive, send: Send) -> None: """Handle requests that should redirect to a specified full URL.""" query_params = get_query_params(scope.get('query_string', b'')) diff --git a/tests/unit/server_endpoints.py b/tests/unit/server_endpoints.py index fede2ee173..9a1e74e79a 100644 --- a/tests/unit/server_endpoints.py +++ b/tests/unit/server_endpoints.py @@ -57,6 +57,16 @@ Apify avatar/a> """ +NON_HREF_LINKS = b"""\ + + Hello + + + +
  • + +""" + GENERIC_RESPONSE = b"""\ Hello From 6665b026c5aefcdb705728e695d89a65f966a8c0 Mon Sep 17 00:00:00 2001 From: Valentin Nazarov Date: Sun, 8 Mar 2026 11:44:40 +0300 Subject: [PATCH 2/5] whoopsie --- src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py index d146786af2..f1b87f8ac3 100644 --- a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +++ b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py @@ -197,7 +197,7 @@ async def extract_links( ) # Get base URL from tag if present - extracted_base_urls = list(self._parser.find_links(parsed_content, 'base', 'href')) + extracted_base_urls = list(self._parser.find_links(parsed_content, 'base[href]', 'href')) base_url: str = ( str(extracted_base_urls[0]) if extracted_base_urls From c651d9e5c13547de735d70ddb09071eaac59a67f Mon Sep 17 00:00:00 2001 From: Valentin Nazarov Date: Mon, 9 Mar 2026 12:10:10 +0300 Subject: [PATCH 3/5] Update tests/unit/server_endpoints.py Co-authored-by: Vlada Dusek --- tests/unit/server_endpoints.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/server_endpoints.py b/tests/unit/server_endpoints.py index 9a1e74e79a..d86005b38d 100644 --- a/tests/unit/server_endpoints.py +++ b/tests/unit/server_endpoints.py @@ -63,7 +63,7 @@ -
  • +
  • """ From d5f94465d9d0b23ee68b05c8b5b79352fd7ffc43 Mon Sep 17 00:00:00 2001 From: Valentin Nazarov Date: Mon, 9 Mar 2026 13:19:38 +0300 Subject: [PATCH 4/5] tests for `context.enqueue_links` --- .../test_beautifulsoup_crawler.py | 26 +++++++++++++++++++ .../crawlers/_parsel/test_parsel_crawler.py | 26 +++++++++++++++++++ .../_playwright/test_playwright_crawler.py | 25 ++++++++++++++++++ tests/unit/server.py | 12 +++++++++ tests/unit/server_endpoints.py | 12 +++++++++ 5 files changed, 101 insertions(+) diff --git a/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py b/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py index cae0629c41..1617ad8ddb 100644 --- a/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py +++ b/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py @@ -66,6 +66,32 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None: } +async def test_enqueue_non_href_links(redirect_server_url: URL, server_url: URL, http_client: HttpClient) -> None: + redirect_target = str(server_url / 'start_enqueue_non_href') + redirect_url = str(redirect_server_url.with_path('redirect').with_query(url=redirect_target)) + requests = [redirect_url] + + crawler = BeautifulSoupCrawler(http_client=http_client) + visit = mock.Mock() + + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + visit(context.request.url) + await context.enqueue_links(selector='img', attribute='src') + + await crawler.run(requests) + + first_visited = visit.call_args_list[0][0][0] + visited = {call[0][0] for call in visit.call_args_list} + + assert first_visited == redirect_url + assert visited == { + redirect_url, + str(server_url / 'base_subpath/image_1'), + str(server_url / 'image_2'), + } + + async def test_enqueue_links_selector(server_url: URL, http_client: HttpClient) -> None: crawler = BeautifulSoupCrawler(http_client=http_client) visit = mock.Mock() diff --git 
a/tests/unit/crawlers/_parsel/test_parsel_crawler.py b/tests/unit/crawlers/_parsel/test_parsel_crawler.py index b6e0b0e036..79cf723b30 100644 --- a/tests/unit/crawlers/_parsel/test_parsel_crawler.py +++ b/tests/unit/crawlers/_parsel/test_parsel_crawler.py @@ -67,6 +67,32 @@ async def request_handler(context: ParselCrawlingContext) -> None: } +async def test_enqueue_non_href_links(redirect_server_url: URL, server_url: URL, http_client: HttpClient) -> None: + redirect_target = str(server_url / 'start_enqueue_non_href') + redirect_url = str(redirect_server_url.with_path('redirect').with_query(url=redirect_target)) + requests = [redirect_url] + + crawler = ParselCrawler(http_client=http_client) + visit = mock.Mock() + + @crawler.router.default_handler + async def request_handler(context: ParselCrawlingContext) -> None: + visit(context.request.url) + await context.enqueue_links(selector='img', attribute='src') + + await crawler.run(requests) + + first_visited = visit.call_args_list[0][0][0] + visited = {call[0][0] for call in visit.call_args_list} + + assert first_visited == redirect_url + assert visited == { + redirect_url, + str(server_url / 'base_subpath/image_1'), + str(server_url / 'image_2'), + } + + async def test_enqueue_links_with_incompatible_kwargs_raises_error(server_url: URL) -> None: """Call `enqueue_links` with arguments that can't be used together.""" crawler = ParselCrawler(max_request_retries=1) diff --git a/tests/unit/crawlers/_playwright/test_playwright_crawler.py b/tests/unit/crawlers/_playwright/test_playwright_crawler.py index 315755412e..c09f20788c 100644 --- a/tests/unit/crawlers/_playwright/test_playwright_crawler.py +++ b/tests/unit/crawlers/_playwright/test_playwright_crawler.py @@ -104,6 +104,31 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None: } +async def test_enqueue_non_href_links(redirect_server_url: URL, server_url: URL) -> None: + redirect_target = str(server_url / 'start_enqueue_non_href') + redirect_url = 
str(redirect_server_url.with_path('redirect').with_query(url=redirect_target)) + requests = [redirect_url] + crawler = PlaywrightCrawler() + visit = mock.Mock() + + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + visit(context.request.url) + await context.enqueue_links(selector='img', attribute='src') + + await crawler.run(requests) + + first_visited = visit.call_args_list[0][0][0] + visited = {call[0][0] for call in visit.call_args_list} + + assert first_visited == redirect_url + assert visited == { + redirect_url, + str(server_url / 'base_subpath/image_1'), + str(server_url / 'image_2'), + } + + async def test_enqueue_links_with_incompatible_kwargs_raises_error(server_url: URL) -> None: """Call `enqueue_links` with arguments that can't be used together.""" crawler = PlaywrightCrawler(max_request_retries=1) diff --git a/tests/unit/server.py b/tests/unit/server.py index 386f260159..69de104149 100644 --- a/tests/unit/server.py +++ b/tests/unit/server.py @@ -26,6 +26,7 @@ ROBOTS_TXT, SECONDARY_INDEX, START_ENQUEUE, + START_ENQUEUE_NON_HREF, ) if TYPE_CHECKING: @@ -102,6 +103,7 @@ async def app(scope: dict[str, Any], receive: Receive, send: Send) -> None: assert scope['type'] == 'http' paths: dict[str, PathHandler] = { 'start_enqueue': start_enqueue_endpoint, + 'start_enqueue_non_href': start_enqueue_non_href_endpoint, 'sub_index': secondary_index_endpoint, 'incapsula': incapsula_endpoint, 'page_1': generic_response_endpoint, @@ -463,6 +465,16 @@ async def base_index_endpoint(_scope: dict[str, Any], _receive: Receive, send: S ) + +async def start_enqueue_non_href_endpoint(_scope: dict[str, Any], _receive: Receive, send: Send) -> None: + """Handle requests for the start page containing non-href links.""" + host = f'http://{get_headers_dict(_scope).get("host", "localhost")}' + content = START_ENQUEUE_NON_HREF.format(host=host).encode() + await send_html_response( + send, + content, + ) + + class TestServer(Server): """A test HTTP
server implementation based on Uvicorn Server.""" diff --git a/tests/unit/server_endpoints.py b/tests/unit/server_endpoints.py index d86005b38d..b32d136524 100644 --- a/tests/unit/server_endpoints.py +++ b/tests/unit/server_endpoints.py @@ -17,6 +17,18 @@ test@test.com """ +START_ENQUEUE_NON_HREF = """\ + + + Hello + + + Link A + Link B + + +""" + SECONDARY_INDEX = b"""\ Hello From 389ec7ecb85292a4e2eb5c1b05dff4723a420a98 Mon Sep 17 00:00:00 2001 From: Valentin Nazarov Date: Wed, 11 Mar 2026 13:43:21 +0300 Subject: [PATCH 5/5] switch to mock.assert_has_calls --- .../test_beautifulsoup_crawler.py | 127 ++++++++--------- .../crawlers/_parsel/test_parsel_crawler.py | 131 +++++++++--------- .../_playwright/test_playwright_crawler.py | 106 +++++++------- 3 files changed, 175 insertions(+), 189 deletions(-) diff --git a/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py b/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py index 1617ad8ddb..ea31cc42b5 100644 --- a/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py +++ b/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py @@ -50,20 +50,18 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None: await crawler.run(requests) - first_visited = visit.call_args_list[0][0][0] - visited = {call[0][0] for call in visit.call_args_list} - - assert first_visited == redirect_url - assert visited == { - redirect_url, - str(server_url / 'sub_index'), - str(server_url / 'page_1'), - str(server_url / 'page_2'), - str(server_url / 'page_3'), - str(server_url / 'page_4'), - str(server_url / 'base_page'), - str(server_url / 'base_subpath/page_5'), - } + expected_visit_calls = [ + mock.call(redirect_url), + mock.call(str(server_url / 'sub_index')), + mock.call(str(server_url / 'page_1')), + mock.call(str(server_url / 'page_2')), + mock.call(str(server_url / 'page_3')), + mock.call(str(server_url / 'page_4')), + mock.call(str(server_url / 'base_page')), + 
mock.call(str(server_url / 'base_subpath/page_5')), + ] + assert visit.mock_calls[0] == expected_visit_calls[0] + visit.assert_has_calls(expected_visit_calls, any_order=True) async def test_enqueue_non_href_links(redirect_server_url: URL, server_url: URL, http_client: HttpClient) -> None: @@ -81,15 +79,12 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None: await crawler.run(requests) - first_visited = visit.call_args_list[0][0][0] - visited = {call[0][0] for call in visit.call_args_list} - - assert first_visited == redirect_url - assert visited == { - redirect_url, - str(server_url / 'base_subpath/image_1'), - str(server_url / 'image_2'), - } + expected_visit_calls = [ + mock.call(redirect_url), + mock.call(str(server_url / 'base_subpath/image_1')), + mock.call(str(server_url / 'image_2')), + ] + visit.assert_has_calls(expected_visit_calls, any_order=True) async def test_enqueue_links_selector(server_url: URL, http_client: HttpClient) -> None: @@ -103,8 +98,11 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None: await crawler.run([str(server_url / 'start_enqueue')]) - visited = {call[0][0] for call in visit.call_args_list} - assert visited == {str(server_url / 'start_enqueue'), str(server_url / 'sub_index')} + expected_visit_calls = [ + mock.call(str(server_url / 'start_enqueue')), + mock.call(str(server_url / 'sub_index')), + ] + visit.assert_has_calls(expected_visit_calls, any_order=True) async def test_enqueue_links_with_max_crawl(server_url: URL, http_client: HttpClient) -> None: @@ -154,18 +152,17 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None: await crawler.run([str(server_url / 'start_enqueue')]) - visited = {call[0][0] for call in visit.call_args_list} - # url /page_3 should not be visited - assert visited == { - str(server_url / 'start_enqueue'), - str(server_url / 'sub_index'), - str(server_url / 'page_1'), - str(server_url / 'page_2'), - str(server_url / 'base_page'), - 
str(server_url / 'page_4'), - str(server_url / 'base_subpath/page_5'), - } + expected_visit_calls = [ + mock.call(str(server_url / 'start_enqueue')), + mock.call(str(server_url / 'sub_index')), + mock.call(str(server_url / 'page_1')), + mock.call(str(server_url / 'page_2')), + mock.call(str(server_url / 'base_page')), + mock.call(str(server_url / 'page_4')), + mock.call(str(server_url / 'base_subpath/page_5')), + ] + visit.assert_has_calls(expected_visit_calls, any_order=True) # # all urls added to `enqueue_links` must have a custom header assert headers[1]['transform-header'] == 'my-header' @@ -193,14 +190,14 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None: await context.enqueue_links() await crawler.run([str(server_url / 'start_enqueue')]) - visited = {call[0][0] for call in visit.call_args_list} - assert visited == { - str(server_url / 'start_enqueue'), - str(server_url / 'sub_index'), - str(server_url / 'base_page'), - str(server_url / 'base_subpath/page_5'), - } + expected_visit_calls = [ + mock.call(str(server_url / 'start_enqueue')), + mock.call(str(server_url / 'sub_index')), + mock.call(str(server_url / 'base_page')), + mock.call(str(server_url / 'base_subpath/page_5')), + ] + visit.assert_has_calls(expected_visit_calls, any_order=True) async def test_respect_robots_txt_with_problematic_links(server_url: URL, http_client: HttpClient) -> None: @@ -224,17 +221,19 @@ async def error_handler(context: BasicCrawlingContext, _error: Exception) -> Non await crawler.run([str(server_url / 'problematic_links')]) - visited = {call[0][0] for call in visit.call_args_list} - failed = {call[0][0] for call in fail.call_args_list} - # Email must be skipped # https://avatars.githubusercontent.com/apify does not get robots.txt, but is correct for the crawler. 
- assert visited == {str(server_url / 'problematic_links'), 'https://avatars.githubusercontent.com/apify'} + expected_visit_calls = [ + mock.call(str(server_url / 'problematic_links')), + mock.call('https://avatars.githubusercontent.com/apify'), + ] + visit.assert_has_calls(expected_visit_calls, any_order=True) # The budplaceholder.com does not exist. - assert failed == { - 'https://budplaceholder.com/', - } + expected_fail_calls = [ + mock.call('https://budplaceholder.com/'), + ] + fail.assert_has_calls(expected_fail_calls, any_order=True) async def test_on_skipped_request(server_url: URL, http_client: HttpClient) -> None: @@ -251,14 +250,13 @@ async def skipped_hook(url: str, _reason: SkippedReason) -> None: await crawler.run([str(server_url / 'start_enqueue')]) - skipped = {call[0][0] for call in skip.call_args_list} - - assert skipped == { - str(server_url / 'page_1'), - str(server_url / 'page_2'), - str(server_url / 'page_3'), - str(server_url / 'page_4'), - } + expected_skip_calls = [ + mock.call(str(server_url / 'page_1')), + mock.call(str(server_url / 'page_2')), + mock.call(str(server_url / 'page_3')), + mock.call(str(server_url / 'page_4')), + ] + skip.assert_has_calls(expected_skip_calls, any_order=True) async def test_extract_links(server_url: URL, http_client: HttpClient) -> None: @@ -485,12 +483,9 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None: await crawler.run(requests) - first_visited = visit.call_args_list[0][0][0] - visited = {call[0][0] for call in visit.call_args_list} - - assert first_visited == start_url # Only one link should be enqueued from sub_index due to the limit - assert visited == { - start_url, - str(server_url / 'page_3'), - } + expected_visit_calls = [ + mock.call(start_url), + mock.call(str(server_url / 'page_3')), + ] + visit.assert_has_calls(expected_visit_calls, any_order=True) diff --git a/tests/unit/crawlers/_parsel/test_parsel_crawler.py b/tests/unit/crawlers/_parsel/test_parsel_crawler.py index 
79cf723b30..71d8ada1c5 100644 --- a/tests/unit/crawlers/_parsel/test_parsel_crawler.py +++ b/tests/unit/crawlers/_parsel/test_parsel_crawler.py @@ -51,20 +51,18 @@ async def request_handler(context: ParselCrawlingContext) -> None: await crawler.run(requests) - first_visited = visit.call_args_list[0][0][0] - visited = {call[0][0] for call in visit.call_args_list} - - assert first_visited == redirect_url - assert visited == { - redirect_url, - str(server_url / 'sub_index'), - str(server_url / 'page_1'), - str(server_url / 'page_2'), - str(server_url / 'page_3'), - str(server_url / 'page_4'), - str(server_url / 'base_page'), - str(server_url / 'base_subpath/page_5'), - } + expected_visit_calls = [ + mock.call(redirect_url), + mock.call(str(server_url / 'sub_index')), + mock.call(str(server_url / 'page_1')), + mock.call(str(server_url / 'page_2')), + mock.call(str(server_url / 'page_3')), + mock.call(str(server_url / 'page_4')), + mock.call(str(server_url / 'base_page')), + mock.call(str(server_url / 'base_subpath/page_5')), + ] + assert visit.mock_calls[0] == expected_visit_calls[0] + visit.assert_has_calls(expected_visit_calls, any_order=True) async def test_enqueue_non_href_links(redirect_server_url: URL, server_url: URL, http_client: HttpClient) -> None: @@ -82,15 +80,12 @@ async def request_handler(context: ParselCrawlingContext) -> None: await crawler.run(requests) - first_visited = visit.call_args_list[0][0][0] - visited = {call[0][0] for call in visit.call_args_list} - - assert first_visited == redirect_url - assert visited == { - redirect_url, - str(server_url / 'base_subpath/image_1'), - str(server_url / 'image_2'), - } + expected_visit_calls = [ + mock.call(redirect_url), + mock.call(str(server_url / 'base_subpath/image_1')), + mock.call(str(server_url / 'image_2')), + ] + visit.assert_has_calls(expected_visit_calls, any_order=True) async def test_enqueue_links_with_incompatible_kwargs_raises_error(server_url: URL) -> None: @@ -123,8 +118,11 @@ async def 
request_handler(context: ParselCrawlingContext) -> None: await crawler.run([str(server_url / 'start_enqueue')]) - visited = {call[0][0] for call in visit.call_args_list} - assert visited == {str(server_url / 'start_enqueue'), str(server_url / 'sub_index')} + expected_visit_calls = [ + mock.call(str(server_url / 'start_enqueue')), + mock.call(str(server_url / 'sub_index')), + ] + visit.assert_has_calls(expected_visit_calls, any_order=True) async def test_enqueue_links_with_max_crawl(server_url: URL, http_client: HttpClient) -> None: @@ -173,20 +171,19 @@ async def request_handler(context: ParselCrawlingContext) -> None: await crawler.run([str(server_url / 'start_enqueue')]) - visited = {call[0][0] for call in visit.call_args_list} - # url /page_3 should not be visited - assert visited == { - str(server_url / 'start_enqueue'), - str(server_url / 'sub_index'), - str(server_url / 'page_1'), - str(server_url / 'page_2'), - str(server_url / 'page_4'), - str(server_url / 'base_page'), - str(server_url / 'base_subpath/page_5'), - } - - # # all urls added to `enqueue_links` must have a custom header + expected_visit_calls = [ + mock.call(str(server_url / 'start_enqueue')), + mock.call(str(server_url / 'sub_index')), + mock.call(str(server_url / 'page_1')), + mock.call(str(server_url / 'page_2')), + mock.call(str(server_url / 'page_4')), + mock.call(str(server_url / 'base_page')), + mock.call(str(server_url / 'base_subpath/page_5')), + ] + visit.assert_has_calls(expected_visit_calls, any_order=True) + + # all urls added to `enqueue_links` must have a custom header assert headers[1]['transform-header'] == 'my-header' assert headers[2]['transform-header'] == 'my-header' assert headers[3]['transform-header'] == 'my-header' @@ -286,14 +283,14 @@ async def request_handler(context: ParselCrawlingContext) -> None: await context.enqueue_links() await crawler.run([str(server_url / 'start_enqueue')]) - visited = {call[0][0] for call in visit.call_args_list} - assert visited == { - 
str(server_url / 'start_enqueue'), - str(server_url / 'sub_index'), - str(server_url / 'base_page'), - str(server_url / 'base_subpath/page_5'), - } + expected_visit_calls = [ + mock.call(str(server_url / 'start_enqueue')), + mock.call(str(server_url / 'sub_index')), + mock.call(str(server_url / 'base_page')), + mock.call(str(server_url / 'base_subpath/page_5')), + ] + visit.assert_has_calls(expected_visit_calls, any_order=True) async def test_respect_robots_txt_with_problematic_links(server_url: URL, http_client: HttpClient) -> None: @@ -317,17 +314,19 @@ async def error_handler(context: BasicCrawlingContext, _error: Exception) -> Non await crawler.run([str(server_url / 'problematic_links')]) - visited = {call[0][0] for call in visit.call_args_list} - failed = {call[0][0] for call in fail.call_args_list} - # Email must be skipped # https://avatars.githubusercontent.com/apify does not get robots.txt, but is correct for the crawler. - assert visited == {str(server_url / 'problematic_links'), 'https://avatars.githubusercontent.com/apify'} + expected_visit_calls = [ + mock.call(str(server_url / 'problematic_links')), + mock.call('https://avatars.githubusercontent.com/apify'), + ] + visit.assert_has_calls(expected_visit_calls, any_order=True) # The budplaceholder.com does not exist. 
- assert failed == { - 'https://budplaceholder.com/', - } + expected_fail_calls = [ + mock.call('https://budplaceholder.com/'), + ] + fail.assert_has_calls(expected_fail_calls, any_order=True) async def test_on_skipped_request(server_url: URL, http_client: HttpClient) -> None: @@ -344,14 +343,13 @@ async def skipped_hook(url: str, _reason: SkippedReason) -> None: await crawler.run([str(server_url / 'start_enqueue')]) - skipped = {call[0][0] for call in skip.call_args_list} - - assert skipped == { - str(server_url / 'page_1'), - str(server_url / 'page_2'), - str(server_url / 'page_3'), - str(server_url / 'page_4'), - } + expected_skip_calls = [ + mock.call(str(server_url / 'page_1')), + mock.call(str(server_url / 'page_2')), + mock.call(str(server_url / 'page_3')), + mock.call(str(server_url / 'page_4')), + ] + skip.assert_has_calls(expected_skip_calls, any_order=True) async def test_extract_links(server_url: URL, http_client: HttpClient) -> None: @@ -502,12 +500,9 @@ async def request_handler(context: ParselCrawlingContext) -> None: await crawler.run(requests) - first_visited = visit.call_args_list[0][0][0] - visited = {call[0][0] for call in visit.call_args_list} - - assert first_visited == start_url # Only one link should be enqueued from sub_index due to the limit - assert visited == { - start_url, - str(server_url / 'page_3'), - } + expected_visit_calls = [ + mock.call(start_url), + mock.call(str(server_url / 'page_3')), + ] + visit.assert_has_calls(expected_visit_calls, any_order=True) diff --git a/tests/unit/crawlers/_playwright/test_playwright_crawler.py b/tests/unit/crawlers/_playwright/test_playwright_crawler.py index c09f20788c..8ea80fdef5 100644 --- a/tests/unit/crawlers/_playwright/test_playwright_crawler.py +++ b/tests/unit/crawlers/_playwright/test_playwright_crawler.py @@ -89,19 +89,18 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None: await crawler.run(requests) - first_visited = visit.call_args_list[0][0][0] - visited = 
{call[0][0] for call in visit.call_args_list[1:]} - - assert first_visited == redirect_url - assert visited == { - str(server_url / 'sub_index'), - str(server_url / 'page_1'), - str(server_url / 'page_2'), - str(server_url / 'page_3'), - str(server_url / 'page_4'), - str(server_url / 'base_page'), - str(server_url / 'base_subpath/page_5'), - } + expected_visit_calls = [ + mock.call(redirect_url), + mock.call(str(server_url / 'sub_index')), + mock.call(str(server_url / 'page_1')), + mock.call(str(server_url / 'page_2')), + mock.call(str(server_url / 'page_3')), + mock.call(str(server_url / 'page_4')), + mock.call(str(server_url / 'base_page')), + mock.call(str(server_url / 'base_subpath/page_5')), + ] + assert visit.mock_calls[0] == expected_visit_calls[0] + visit.assert_has_calls(expected_visit_calls, any_order=True) async def test_enqueue_non_href_links(redirect_server_url: URL, server_url: URL) -> None: @@ -118,15 +117,12 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None: await crawler.run(requests) - first_visited = visit.call_args_list[0][0][0] - visited = {call[0][0] for call in visit.call_args_list} - - assert first_visited == redirect_url - assert visited == { - redirect_url, - str(server_url / 'base_subpath/image_1'), - str(server_url / 'image_2'), - } + expected_visit_calls = [ + mock.call(redirect_url), + mock.call(str(server_url / 'base_subpath/image_1')), + mock.call(str(server_url / 'image_2')), + ] + visit.assert_has_calls(expected_visit_calls, any_order=True) async def test_enqueue_links_with_incompatible_kwargs_raises_error(server_url: URL) -> None: @@ -171,9 +167,11 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None: await crawler.run([str(server_url / 'start_enqueue')]) - visited = {call[0][0] for call in visit.call_args_list} - - assert visited == {str(server_url / 'start_enqueue'), str(server_url / 'sub_index')} + expected_visit_calls = [ + mock.call(str(server_url / 'start_enqueue')), + 
mock.call(str(server_url / 'sub_index')), + ] + visit.assert_has_calls(expected_visit_calls, any_order=True) # all urls added to `enqueue_links` must have a custom header assert headers[1]['transform-header'] == 'my-header' @@ -701,14 +699,14 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None: await context.enqueue_links() await crawler.run([str(server_url / 'start_enqueue')]) - visited = {call[0][0] for call in visit.call_args_list} - assert visited == { - str(server_url / 'start_enqueue'), - str(server_url / 'sub_index'), - str(server_url / 'base_page'), - str(server_url / 'base_subpath/page_5'), - } + expected_visit_calls = [ + mock.call(str(server_url / 'start_enqueue')), + mock.call(str(server_url / 'sub_index')), + mock.call(str(server_url / 'base_page')), + mock.call(str(server_url / 'base_subpath/page_5')), + ] + visit.assert_has_calls(expected_visit_calls, any_order=True) async def test_respect_robots_txt_with_problematic_links(server_url: URL) -> None: @@ -731,17 +729,19 @@ async def error_handler(context: BasicCrawlingContext, _error: Exception) -> Non await crawler.run([str(server_url / 'problematic_links')]) - visited = {call[0][0] for call in visit.call_args_list} - failed = {call[0][0] for call in fail.call_args_list} - # Email must be skipped # https://avatars.githubusercontent.com/apify does not get robots.txt, but is correct for the crawler. - assert visited == {str(server_url / 'problematic_links'), 'https://avatars.githubusercontent.com/apify'} + expected_visit_calls = [ + mock.call(str(server_url / 'problematic_links')), + mock.call('https://avatars.githubusercontent.com/apify'), + ] + visit.assert_has_calls(expected_visit_calls, any_order=True) # The budplaceholder.com does not exist. 
- assert failed == { - 'https://budplaceholder.com/', - } + expected_fail_calls = [ + mock.call('https://budplaceholder.com/'), + ] + fail.assert_has_calls(expected_fail_calls, any_order=True) async def test_on_skipped_request(server_url: URL) -> None: @@ -758,14 +758,13 @@ async def skipped_hook(url: str, _reason: SkippedReason) -> None: await crawler.run([str(server_url / 'start_enqueue')]) - skipped = {call[0][0] for call in skip.call_args_list} - - assert skipped == { - str(server_url / 'page_1'), - str(server_url / 'page_2'), - str(server_url / 'page_3'), - str(server_url / 'page_4'), - } + expected_skip_calls = [ + mock.call(str(server_url / 'page_1')), + mock.call(str(server_url / 'page_2')), + mock.call(str(server_url / 'page_3')), + mock.call(str(server_url / 'page_4')), + ] + skip.assert_has_calls(expected_skip_calls, any_order=True) async def test_send_request(server_url: URL) -> None: @@ -1112,12 +1111,9 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None: await crawler.run(requests) - first_visited = visit.call_args_list[0][0][0] - visited = {call[0][0] for call in visit.call_args_list} - - assert first_visited == start_url # Only one link should be enqueued from sub_index due to the limit - assert visited == { - start_url, - str(server_url / 'page_3'), - } + expected_visit_calls = [ + mock.call(start_url), + mock.call(str(server_url / 'page_3')), + ] + visit.assert_has_calls(expected_visit_calls, any_order=True)