diff --git a/docs/guides/code_examples/http_crawlers/selectolax_parser.py b/docs/guides/code_examples/http_crawlers/selectolax_parser.py index 0c38b1e9bf..b8fca8b38c 100644 --- a/docs/guides/code_examples/http_crawlers/selectolax_parser.py +++ b/docs/guides/code_examples/http_crawlers/selectolax_parser.py @@ -45,7 +45,7 @@ def is_matching_selector( @override def find_links( - self, parsed_content: LexborHTMLParser, selector: str + self, parsed_content: LexborHTMLParser, selector: str, attribute: str ) -> Iterable[str]: """Extract href attributes from elements matching the selector. @@ -54,7 +54,7 @@ def find_links( link: LexborNode urls: list[str] = [] for link in parsed_content.css(selector): - url = link.attributes.get('href') + url = link.attributes.get(attribute) if url: urls.append(url.strip()) return urls diff --git a/src/crawlee/_types.py b/src/crawlee/_types.py index 23090311c2..baa0746009 100644 --- a/src/crawlee/_types.py +++ b/src/crawlee/_types.py @@ -387,6 +387,7 @@ def __call__( self, *, selector: str | None = None, + attribute: str | None = None, label: str | None = None, user_data: dict[str, Any] | None = None, transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None, @@ -411,6 +412,7 @@ def __call__( self, *, selector: str | None = None, + attribute: str | None = None, label: str | None = None, user_data: dict[str, Any] | None = None, transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None, @@ -428,6 +430,7 @@ def __call__( - `PlaywrightCrawler` supports CSS and XPath selectors. - `ParselCrawler` supports CSS selectors. - `BeautifulSoupCrawler` supports CSS selectors. + attribute: Which node attribute to extract the links from. label: Label for the newly created `Request` objects, used for request routing. user_data: User data to be provided to the newly created `Request` objects. 
transform_request_function: A function that takes `RequestOptions` and returns either: @@ -457,6 +460,7 @@ def __call__( self, *, selector: str = 'a', + attribute: str = 'href', label: str | None = None, user_data: dict[str, Any] | None = None, transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None, @@ -470,6 +474,7 @@ def __call__( - `PlaywrightCrawler` supports CSS and XPath selectors. - `ParselCrawler` supports CSS selectors. - `BeautifulSoupCrawler` supports CSS selectors. + attribute: Which node attribute to extract the links from. label: Label for the newly created `Request` objects, used for request routing. user_data: User data to be provided to the newly created `Request` objects. transform_request_function: A function that takes `RequestOptions` and returns either: diff --git a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py index 7aafa49e2e..f1b87f8ac3 100644 --- a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +++ b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py @@ -176,6 +176,7 @@ def _create_extract_links_function( async def extract_links( *, selector: str = 'a', + attribute: str = 'href', label: str | None = None, user_data: dict[str, Any] | None = None, transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] @@ -191,10 +192,12 @@ async def extract_links( kwargs.setdefault('strategy', 'same-hostname') strategy = kwargs.get('strategy', 'same-hostname') - links_iterator: Iterator[str] = iter(self._parser.find_links(parsed_content, selector=selector)) + links_iterator: Iterator[str] = iter( + self._parser.find_links(parsed_content, selector=selector, attribute=attribute) + ) # Get base URL from tag if present - extracted_base_urls = list(self._parser.find_links(parsed_content, 'base[href]')) + extracted_base_urls = 
list(self._parser.find_links(parsed_content, 'base[href]', 'href')) base_url: str = ( str(extracted_base_urls[0]) if extracted_base_urls diff --git a/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py b/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py index 13ef57e9ea..5f64565e76 100644 --- a/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py +++ b/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py @@ -93,12 +93,13 @@ def is_matching_selector(self, parsed_content: TParseResult, selector: str) -> b """ @abstractmethod - def find_links(self, parsed_content: TParseResult, selector: str) -> Iterable[str]: + def find_links(self, parsed_content: TParseResult, selector: str, attribute: str) -> Iterable[str]: """Find all links in result using selector. Args: parsed_content: Parsed HTTP response. Result of `parse` method. selector: String used to define matching pattern for finding links. + attribute: Which node attribute to extract the links from. Returns: Iterable of strings that contain found links. 
diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 89eb252d91..53c37416e0 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -997,6 +997,7 @@ def _create_enqueue_links_function( async def enqueue_links( *, selector: str | None = None, + attribute: str | None = None, label: str | None = None, user_data: dict[str, Any] | None = None, transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] @@ -1010,9 +1011,9 @@ async def enqueue_links( kwargs.setdefault('strategy', 'same-hostname') if requests: - if any((selector, label, user_data, transform_request_function)): + if any((selector, attribute, label, user_data, transform_request_function)): raise ValueError( - 'You cannot provide `selector`, `label`, `user_data` or ' + 'You cannot provide `selector`, `attribute`, `label`, `user_data` or ' '`transform_request_function` arguments when `requests` is provided.' ) # Add directly passed requests. 
@@ -1024,6 +1025,7 @@ async def enqueue_links( await context.add_requests( await extract_links( selector=selector or 'a', + attribute=attribute or 'href', label=label, user_data=user_data, transform_request_function=transform_request_function, diff --git a/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py b/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py index cd264cd946..735444a576 100644 --- a/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py +++ b/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py @@ -38,11 +38,11 @@ async def select(self, parsed_content: Tag, selector: str) -> Sequence[Tag]: return tuple(match for match in parsed_content.select(selector)) @override - def find_links(self, parsed_content: Tag, selector: str) -> Iterable[str]: + def find_links(self, parsed_content: Tag, selector: str, attribute: str) -> Iterable[str]: link: Tag urls: list[str] = [] for link in parsed_content.select(selector): - url = link.attrs.get('href') + url = link.attrs.get(attribute) if url: urls.append(url.strip()) return urls diff --git a/src/crawlee/crawlers/_http/_http_parser.py b/src/crawlee/crawlers/_http/_http_parser.py index 143629dac4..adfcc52aec 100644 --- a/src/crawlee/crawlers/_http/_http_parser.py +++ b/src/crawlee/crawlers/_http/_http_parser.py @@ -43,5 +43,7 @@ def is_matching_selector(self, parsed_content: bytes, selector: str) -> bool: # return False @override - def find_links(self, parsed_content: bytes, selector: str) -> Iterable[str]: # Intentional unused argument. + def find_links( + self, parsed_content: bytes, selector: str, attribute: str + ) -> Iterable[str]: # Intentional unused argument. 
return [] diff --git a/src/crawlee/crawlers/_parsel/_parsel_parser.py b/src/crawlee/crawlers/_parsel/_parsel_parser.py index f9ca19139a..2b58957edc 100644 --- a/src/crawlee/crawlers/_parsel/_parsel_parser.py +++ b/src/crawlee/crawlers/_parsel/_parsel_parser.py @@ -37,11 +37,11 @@ def is_matching_selector(self, parsed_content: Selector, selector: str) -> bool: return parsed_content.type in ('html', 'xml') and parsed_content.css(selector).get() is not None @override - def find_links(self, parsed_content: Selector, selector: str) -> Iterable[str]: + def find_links(self, parsed_content: Selector, selector: str, attribute: str) -> Iterable[str]: link: Selector urls: list[str] = [] for link in parsed_content.css(selector): - url = link.xpath('@href').get() + url = link.xpath(f'@{attribute}').get() if url: urls.append(url.strip()) return urls diff --git a/src/crawlee/crawlers/_playwright/_playwright_crawler.py b/src/crawlee/crawlers/_playwright/_playwright_crawler.py index 6f4b2b0e9d..cf70ff94bd 100644 --- a/src/crawlee/crawlers/_playwright/_playwright_crawler.py +++ b/src/crawlee/crawlers/_playwright/_playwright_crawler.py @@ -373,6 +373,7 @@ def _create_extract_links_function(self, context: PlaywrightPreNavCrawlingContex async def extract_links( *, selector: str = 'a', + attribute: str = 'href', label: str | None = None, user_data: dict | None = None, transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] @@ -394,7 +395,7 @@ async def extract_links( elements = await context.page.query_selector_all(selector) links_iterator: Iterator[str] = iter( - [url for element in elements if (url := await element.get_attribute('href')) is not None] + [url for element in elements if (url := await element.get_attribute(attribute)) is not None] ) # Get base URL from tag if present diff --git a/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py b/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py index 
1b8b50777b..ea31cc42b5 100644 --- a/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py +++ b/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py @@ -50,20 +50,41 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None: await crawler.run(requests) - first_visited = visit.call_args_list[0][0][0] - visited = {call[0][0] for call in visit.call_args_list} - - assert first_visited == redirect_url - assert visited == { - redirect_url, - str(server_url / 'sub_index'), - str(server_url / 'page_1'), - str(server_url / 'page_2'), - str(server_url / 'page_3'), - str(server_url / 'page_4'), - str(server_url / 'base_page'), - str(server_url / 'base_subpath/page_5'), - } + expected_visit_calls = [ + mock.call(redirect_url), + mock.call(str(server_url / 'sub_index')), + mock.call(str(server_url / 'page_1')), + mock.call(str(server_url / 'page_2')), + mock.call(str(server_url / 'page_3')), + mock.call(str(server_url / 'page_4')), + mock.call(str(server_url / 'base_page')), + mock.call(str(server_url / 'base_subpath/page_5')), + ] + assert visit.mock_calls[0] == expected_visit_calls[0] + visit.assert_has_calls(expected_visit_calls, any_order=True) + + +async def test_enqueue_non_href_links(redirect_server_url: URL, server_url: URL, http_client: HttpClient) -> None: + redirect_target = str(server_url / 'start_enqueue_non_href') + redirect_url = str(redirect_server_url.with_path('redirect').with_query(url=redirect_target)) + requests = [redirect_url] + + crawler = BeautifulSoupCrawler(http_client=http_client) + visit = mock.Mock() + + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + visit(context.request.url) + await context.enqueue_links(selector='img', attribute='src') + + await crawler.run(requests) + + expected_visit_calls = [ + mock.call(redirect_url), + mock.call(str(server_url / 'base_subpath/image_1')), + mock.call(str(server_url / 'image_2')), + ] + 
visit.assert_has_calls(expected_visit_calls, any_order=True) async def test_enqueue_links_selector(server_url: URL, http_client: HttpClient) -> None: @@ -77,8 +98,11 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None: await crawler.run([str(server_url / 'start_enqueue')]) - visited = {call[0][0] for call in visit.call_args_list} - assert visited == {str(server_url / 'start_enqueue'), str(server_url / 'sub_index')} + expected_visit_calls = [ + mock.call(str(server_url / 'start_enqueue')), + mock.call(str(server_url / 'sub_index')), + ] + visit.assert_has_calls(expected_visit_calls, any_order=True) async def test_enqueue_links_with_max_crawl(server_url: URL, http_client: HttpClient) -> None: @@ -128,18 +152,17 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None: await crawler.run([str(server_url / 'start_enqueue')]) - visited = {call[0][0] for call in visit.call_args_list} - # url /page_3 should not be visited - assert visited == { - str(server_url / 'start_enqueue'), - str(server_url / 'sub_index'), - str(server_url / 'page_1'), - str(server_url / 'page_2'), - str(server_url / 'base_page'), - str(server_url / 'page_4'), - str(server_url / 'base_subpath/page_5'), - } + expected_visit_calls = [ + mock.call(str(server_url / 'start_enqueue')), + mock.call(str(server_url / 'sub_index')), + mock.call(str(server_url / 'page_1')), + mock.call(str(server_url / 'page_2')), + mock.call(str(server_url / 'base_page')), + mock.call(str(server_url / 'page_4')), + mock.call(str(server_url / 'base_subpath/page_5')), + ] + visit.assert_has_calls(expected_visit_calls, any_order=True) # # all urls added to `enqueue_links` must have a custom header assert headers[1]['transform-header'] == 'my-header' @@ -167,14 +190,14 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None: await context.enqueue_links() await crawler.run([str(server_url / 'start_enqueue')]) - visited = {call[0][0] for call in visit.call_args_list} - 
assert visited == { - str(server_url / 'start_enqueue'), - str(server_url / 'sub_index'), - str(server_url / 'base_page'), - str(server_url / 'base_subpath/page_5'), - } + expected_visit_calls = [ + mock.call(str(server_url / 'start_enqueue')), + mock.call(str(server_url / 'sub_index')), + mock.call(str(server_url / 'base_page')), + mock.call(str(server_url / 'base_subpath/page_5')), + ] + visit.assert_has_calls(expected_visit_calls, any_order=True) async def test_respect_robots_txt_with_problematic_links(server_url: URL, http_client: HttpClient) -> None: @@ -198,17 +221,19 @@ async def error_handler(context: BasicCrawlingContext, _error: Exception) -> Non await crawler.run([str(server_url / 'problematic_links')]) - visited = {call[0][0] for call in visit.call_args_list} - failed = {call[0][0] for call in fail.call_args_list} - # Email must be skipped # https://avatars.githubusercontent.com/apify does not get robots.txt, but is correct for the crawler. - assert visited == {str(server_url / 'problematic_links'), 'https://avatars.githubusercontent.com/apify'} + expected_visit_calls = [ + mock.call(str(server_url / 'problematic_links')), + mock.call('https://avatars.githubusercontent.com/apify'), + ] + visit.assert_has_calls(expected_visit_calls, any_order=True) # The budplaceholder.com does not exist. 
- assert failed == { - 'https://budplaceholder.com/', - } + expected_fail_calls = [ + mock.call('https://budplaceholder.com/'), + ] + fail.assert_has_calls(expected_fail_calls, any_order=True) async def test_on_skipped_request(server_url: URL, http_client: HttpClient) -> None: @@ -225,14 +250,13 @@ async def skipped_hook(url: str, _reason: SkippedReason) -> None: await crawler.run([str(server_url / 'start_enqueue')]) - skipped = {call[0][0] for call in skip.call_args_list} - - assert skipped == { - str(server_url / 'page_1'), - str(server_url / 'page_2'), - str(server_url / 'page_3'), - str(server_url / 'page_4'), - } + expected_skip_calls = [ + mock.call(str(server_url / 'page_1')), + mock.call(str(server_url / 'page_2')), + mock.call(str(server_url / 'page_3')), + mock.call(str(server_url / 'page_4')), + ] + skip.assert_has_calls(expected_skip_calls, any_order=True) async def test_extract_links(server_url: URL, http_client: HttpClient) -> None: @@ -250,6 +274,21 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None: assert extracted_links[0] == str(server_url / 'page_1') +async def test_extract_non_href_links(server_url: URL, http_client: HttpClient) -> None: + crawler = BeautifulSoupCrawler(http_client=http_client) + extracted_links: list[str] = [] + + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + links = await context.extract_links(selector='li', attribute='data-href') + extracted_links.extend(request.url for request in links) + + await crawler.run([str(server_url / 'non_href_links')]) + + assert len(extracted_links) == 1 + assert extracted_links[0] == str(server_url / 'page_2') + + @pytest.mark.parametrize( ('queue_name', 'queue_alias', 'by_id'), [ @@ -444,12 +483,9 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None: await crawler.run(requests) - first_visited = visit.call_args_list[0][0][0] - visited = {call[0][0] for call in visit.call_args_list} - 
- assert first_visited == start_url # Only one link should be enqueued from sub_index due to the limit - assert visited == { - start_url, - str(server_url / 'page_3'), - } + expected_visit_calls = [ + mock.call(start_url), + mock.call(str(server_url / 'page_3')), + ] + visit.assert_has_calls(expected_visit_calls, any_order=True) diff --git a/tests/unit/crawlers/_parsel/test_parsel_crawler.py b/tests/unit/crawlers/_parsel/test_parsel_crawler.py index 648b6ee9c0..71d8ada1c5 100644 --- a/tests/unit/crawlers/_parsel/test_parsel_crawler.py +++ b/tests/unit/crawlers/_parsel/test_parsel_crawler.py @@ -51,20 +51,41 @@ async def request_handler(context: ParselCrawlingContext) -> None: await crawler.run(requests) - first_visited = visit.call_args_list[0][0][0] - visited = {call[0][0] for call in visit.call_args_list} - - assert first_visited == redirect_url - assert visited == { - redirect_url, - str(server_url / 'sub_index'), - str(server_url / 'page_1'), - str(server_url / 'page_2'), - str(server_url / 'page_3'), - str(server_url / 'page_4'), - str(server_url / 'base_page'), - str(server_url / 'base_subpath/page_5'), - } + expected_visit_calls = [ + mock.call(redirect_url), + mock.call(str(server_url / 'sub_index')), + mock.call(str(server_url / 'page_1')), + mock.call(str(server_url / 'page_2')), + mock.call(str(server_url / 'page_3')), + mock.call(str(server_url / 'page_4')), + mock.call(str(server_url / 'base_page')), + mock.call(str(server_url / 'base_subpath/page_5')), + ] + assert visit.mock_calls[0] == expected_visit_calls[0] + visit.assert_has_calls(expected_visit_calls, any_order=True) + + +async def test_enqueue_non_href_links(redirect_server_url: URL, server_url: URL, http_client: HttpClient) -> None: + redirect_target = str(server_url / 'start_enqueue_non_href') + redirect_url = str(redirect_server_url.with_path('redirect').with_query(url=redirect_target)) + requests = [redirect_url] + + crawler = ParselCrawler(http_client=http_client) + visit = mock.Mock() + + 
@crawler.router.default_handler + async def request_handler(context: ParselCrawlingContext) -> None: + visit(context.request.url) + await context.enqueue_links(selector='img', attribute='src') + + await crawler.run(requests) + + expected_visit_calls = [ + mock.call(redirect_url), + mock.call(str(server_url / 'base_subpath/image_1')), + mock.call(str(server_url / 'image_2')), + ] + visit.assert_has_calls(expected_visit_calls, any_order=True) async def test_enqueue_links_with_incompatible_kwargs_raises_error(server_url: URL) -> None: @@ -97,8 +118,11 @@ async def request_handler(context: ParselCrawlingContext) -> None: await crawler.run([str(server_url / 'start_enqueue')]) - visited = {call[0][0] for call in visit.call_args_list} - assert visited == {str(server_url / 'start_enqueue'), str(server_url / 'sub_index')} + expected_visit_calls = [ + mock.call(str(server_url / 'start_enqueue')), + mock.call(str(server_url / 'sub_index')), + ] + visit.assert_has_calls(expected_visit_calls, any_order=True) async def test_enqueue_links_with_max_crawl(server_url: URL, http_client: HttpClient) -> None: @@ -147,20 +171,19 @@ async def request_handler(context: ParselCrawlingContext) -> None: await crawler.run([str(server_url / 'start_enqueue')]) - visited = {call[0][0] for call in visit.call_args_list} - # url /page_3 should not be visited - assert visited == { - str(server_url / 'start_enqueue'), - str(server_url / 'sub_index'), - str(server_url / 'page_1'), - str(server_url / 'page_2'), - str(server_url / 'page_4'), - str(server_url / 'base_page'), - str(server_url / 'base_subpath/page_5'), - } - - # # all urls added to `enqueue_links` must have a custom header + expected_visit_calls = [ + mock.call(str(server_url / 'start_enqueue')), + mock.call(str(server_url / 'sub_index')), + mock.call(str(server_url / 'page_1')), + mock.call(str(server_url / 'page_2')), + mock.call(str(server_url / 'page_4')), + mock.call(str(server_url / 'base_page')), + mock.call(str(server_url / 
'base_subpath/page_5')), + ] + visit.assert_has_calls(expected_visit_calls, any_order=True) + + # all urls added to `enqueue_links` must have a custom header assert headers[1]['transform-header'] == 'my-header' assert headers[2]['transform-header'] == 'my-header' assert headers[3]['transform-header'] == 'my-header' @@ -260,14 +283,14 @@ async def request_handler(context: ParselCrawlingContext) -> None: await context.enqueue_links() await crawler.run([str(server_url / 'start_enqueue')]) - visited = {call[0][0] for call in visit.call_args_list} - assert visited == { - str(server_url / 'start_enqueue'), - str(server_url / 'sub_index'), - str(server_url / 'base_page'), - str(server_url / 'base_subpath/page_5'), - } + expected_visit_calls = [ + mock.call(str(server_url / 'start_enqueue')), + mock.call(str(server_url / 'sub_index')), + mock.call(str(server_url / 'base_page')), + mock.call(str(server_url / 'base_subpath/page_5')), + ] + visit.assert_has_calls(expected_visit_calls, any_order=True) async def test_respect_robots_txt_with_problematic_links(server_url: URL, http_client: HttpClient) -> None: @@ -291,17 +314,19 @@ async def error_handler(context: BasicCrawlingContext, _error: Exception) -> Non await crawler.run([str(server_url / 'problematic_links')]) - visited = {call[0][0] for call in visit.call_args_list} - failed = {call[0][0] for call in fail.call_args_list} - # Email must be skipped # https://avatars.githubusercontent.com/apify does not get robots.txt, but is correct for the crawler. - assert visited == {str(server_url / 'problematic_links'), 'https://avatars.githubusercontent.com/apify'} + expected_visit_calls = [ + mock.call(str(server_url / 'problematic_links')), + mock.call('https://avatars.githubusercontent.com/apify'), + ] + visit.assert_has_calls(expected_visit_calls, any_order=True) # The budplaceholder.com does not exist. 
- assert failed == { - 'https://budplaceholder.com/', - } + expected_fail_calls = [ + mock.call('https://budplaceholder.com/'), + ] + fail.assert_has_calls(expected_fail_calls, any_order=True) async def test_on_skipped_request(server_url: URL, http_client: HttpClient) -> None: @@ -318,14 +343,13 @@ async def skipped_hook(url: str, _reason: SkippedReason) -> None: await crawler.run([str(server_url / 'start_enqueue')]) - skipped = {call[0][0] for call in skip.call_args_list} - - assert skipped == { - str(server_url / 'page_1'), - str(server_url / 'page_2'), - str(server_url / 'page_3'), - str(server_url / 'page_4'), - } + expected_skip_calls = [ + mock.call(str(server_url / 'page_1')), + mock.call(str(server_url / 'page_2')), + mock.call(str(server_url / 'page_3')), + mock.call(str(server_url / 'page_4')), + ] + skip.assert_has_calls(expected_skip_calls, any_order=True) async def test_extract_links(server_url: URL, http_client: HttpClient) -> None: @@ -343,6 +367,21 @@ async def request_handler(context: ParselCrawlingContext) -> None: assert extracted_links[0] == str(server_url / 'page_1') +async def test_extract_non_href_links(server_url: URL, http_client: HttpClient) -> None: + crawler = ParselCrawler(http_client=http_client) + extracted_links: list[str] = [] + + @crawler.router.default_handler + async def request_handler(context: ParselCrawlingContext) -> None: + links = await context.extract_links(selector='li', attribute='data-href') + extracted_links.extend(request.url for request in links) + + await crawler.run([str(server_url / 'non_href_links')]) + + assert len(extracted_links) == 1 + assert extracted_links[0] == str(server_url / 'page_2') + + @pytest.mark.parametrize( ('queue_name', 'queue_alias', 'by_id'), [ @@ -461,12 +500,9 @@ async def request_handler(context: ParselCrawlingContext) -> None: await crawler.run(requests) - first_visited = visit.call_args_list[0][0][0] - visited = {call[0][0] for call in visit.call_args_list} - - assert first_visited == 
start_url # Only one link should be enqueued from sub_index due to the limit - assert visited == { - start_url, - str(server_url / 'page_3'), - } + expected_visit_calls = [ + mock.call(start_url), + mock.call(str(server_url / 'page_3')), + ] + visit.assert_has_calls(expected_visit_calls, any_order=True) diff --git a/tests/unit/crawlers/_playwright/test_playwright_crawler.py b/tests/unit/crawlers/_playwright/test_playwright_crawler.py index e0ada5de1c..8ea80fdef5 100644 --- a/tests/unit/crawlers/_playwright/test_playwright_crawler.py +++ b/tests/unit/crawlers/_playwright/test_playwright_crawler.py @@ -89,19 +89,40 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None: await crawler.run(requests) - first_visited = visit.call_args_list[0][0][0] - visited = {call[0][0] for call in visit.call_args_list[1:]} - - assert first_visited == redirect_url - assert visited == { - str(server_url / 'sub_index'), - str(server_url / 'page_1'), - str(server_url / 'page_2'), - str(server_url / 'page_3'), - str(server_url / 'page_4'), - str(server_url / 'base_page'), - str(server_url / 'base_subpath/page_5'), - } + expected_visit_calls = [ + mock.call(redirect_url), + mock.call(str(server_url / 'sub_index')), + mock.call(str(server_url / 'page_1')), + mock.call(str(server_url / 'page_2')), + mock.call(str(server_url / 'page_3')), + mock.call(str(server_url / 'page_4')), + mock.call(str(server_url / 'base_page')), + mock.call(str(server_url / 'base_subpath/page_5')), + ] + assert visit.mock_calls[0] == expected_visit_calls[0] + visit.assert_has_calls(expected_visit_calls, any_order=True) + + +async def test_enqueue_non_href_links(redirect_server_url: URL, server_url: URL) -> None: + redirect_target = str(server_url / 'start_enqueue_non_href') + redirect_url = str(redirect_server_url.with_path('redirect').with_query(url=redirect_target)) + requests = [redirect_url] + crawler = PlaywrightCrawler() + visit = mock.Mock() + + @crawler.router.default_handler + async def 
request_handler(context: PlaywrightCrawlingContext) -> None: + visit(context.request.url) + await context.enqueue_links(selector='img', attribute='src') + + await crawler.run(requests) + + expected_visit_calls = [ + mock.call(redirect_url), + mock.call(str(server_url / 'base_subpath/image_1')), + mock.call(str(server_url / 'image_2')), + ] + visit.assert_has_calls(expected_visit_calls, any_order=True) async def test_enqueue_links_with_incompatible_kwargs_raises_error(server_url: URL) -> None: @@ -146,9 +167,11 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None: await crawler.run([str(server_url / 'start_enqueue')]) - visited = {call[0][0] for call in visit.call_args_list} - - assert visited == {str(server_url / 'start_enqueue'), str(server_url / 'sub_index')} + expected_visit_calls = [ + mock.call(str(server_url / 'start_enqueue')), + mock.call(str(server_url / 'sub_index')), + ] + visit.assert_has_calls(expected_visit_calls, any_order=True) # all urls added to `enqueue_links` must have a custom header assert headers[1]['transform-header'] == 'my-header' @@ -676,14 +699,14 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None: await context.enqueue_links() await crawler.run([str(server_url / 'start_enqueue')]) - visited = {call[0][0] for call in visit.call_args_list} - assert visited == { - str(server_url / 'start_enqueue'), - str(server_url / 'sub_index'), - str(server_url / 'base_page'), - str(server_url / 'base_subpath/page_5'), - } + expected_visit_calls = [ + mock.call(str(server_url / 'start_enqueue')), + mock.call(str(server_url / 'sub_index')), + mock.call(str(server_url / 'base_page')), + mock.call(str(server_url / 'base_subpath/page_5')), + ] + visit.assert_has_calls(expected_visit_calls, any_order=True) async def test_respect_robots_txt_with_problematic_links(server_url: URL) -> None: @@ -706,17 +729,19 @@ async def error_handler(context: BasicCrawlingContext, _error: Exception) -> Non await 
crawler.run([str(server_url / 'problematic_links')]) - visited = {call[0][0] for call in visit.call_args_list} - failed = {call[0][0] for call in fail.call_args_list} - # Email must be skipped # https://avatars.githubusercontent.com/apify does not get robots.txt, but is correct for the crawler. - assert visited == {str(server_url / 'problematic_links'), 'https://avatars.githubusercontent.com/apify'} + expected_visit_calls = [ + mock.call(str(server_url / 'problematic_links')), + mock.call('https://avatars.githubusercontent.com/apify'), + ] + visit.assert_has_calls(expected_visit_calls, any_order=True) # The budplaceholder.com does not exist. - assert failed == { - 'https://budplaceholder.com/', - } + expected_fail_calls = [ + mock.call('https://budplaceholder.com/'), + ] + fail.assert_has_calls(expected_fail_calls, any_order=True) async def test_on_skipped_request(server_url: URL) -> None: @@ -733,14 +758,13 @@ async def skipped_hook(url: str, _reason: SkippedReason) -> None: await crawler.run([str(server_url / 'start_enqueue')]) - skipped = {call[0][0] for call in skip.call_args_list} - - assert skipped == { - str(server_url / 'page_1'), - str(server_url / 'page_2'), - str(server_url / 'page_3'), - str(server_url / 'page_4'), - } + expected_skip_calls = [ + mock.call(str(server_url / 'page_1')), + mock.call(str(server_url / 'page_2')), + mock.call(str(server_url / 'page_3')), + mock.call(str(server_url / 'page_4')), + ] + skip.assert_has_calls(expected_skip_calls, any_order=True) async def test_send_request(server_url: URL) -> None: @@ -817,6 +841,21 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None: assert extracted_links[0] == str(server_url / 'page_1') +async def test_extract_non_href_links(server_url: URL) -> None: + crawler = PlaywrightCrawler() + extracted_links: list[str] = [] + + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + links = await 
context.extract_links(selector='li', attribute='data-href') + extracted_links.extend(request.url for request in links) + + await crawler.run([str(server_url / 'non_href_links')]) + + assert len(extracted_links) == 1 + assert extracted_links[0] == str(server_url / 'page_2') + + async def test_reduced_logs_from_playwright_navigation_timeout(caplog: pytest.LogCaptureFixture) -> None: caplog.set_level(logging.INFO) crawler = PlaywrightCrawler(configure_logging=False) @@ -1072,12 +1111,9 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None: await crawler.run(requests) - first_visited = visit.call_args_list[0][0][0] - visited = {call[0][0] for call in visit.call_args_list} - - assert first_visited == start_url # Only one link should be enqueued from sub_index due to the limit - assert visited == { - start_url, - str(server_url / 'page_3'), - } + expected_visit_calls = [ + mock.call(start_url), + mock.call(str(server_url / 'page_3')), + ] + visit.assert_has_calls(expected_visit_calls, any_order=True) diff --git a/tests/unit/server.py b/tests/unit/server.py index dd7edd59e8..69de104149 100644 --- a/tests/unit/server.py +++ b/tests/unit/server.py @@ -20,11 +20,13 @@ HELLO_WORLD, INCAPSULA, INFINITE_SCROLL, + NON_HREF_LINKS, PROBLEMATIC_LINKS, RESOURCE_LOADING_PAGE, ROBOTS_TXT, SECONDARY_INDEX, START_ENQUEUE, + START_ENQUEUE_NON_HREF, ) if TYPE_CHECKING: @@ -101,6 +103,7 @@ async def app(scope: dict[str, Any], receive: Receive, send: Send) -> None: assert scope['type'] == 'http' paths: dict[str, PathHandler] = { 'start_enqueue': start_enqueue_endpoint, + 'start_enqueue_non_href': start_enqueue_non_href_endpoint, 'sub_index': secondary_index_endpoint, 'incapsula': incapsula_endpoint, 'page_1': generic_response_endpoint, @@ -108,6 +111,7 @@ async def app(scope: dict[str, Any], receive: Receive, send: Send) -> None: 'page_3': generic_response_endpoint, 'base_page': base_index_endpoint, 'problematic_links': problematic_links_endpoint, + 'non_href_links': 
non_href_links_endpoint, 'set_cookies': set_cookies, 'set_complex_cookies': set_complex_cookies, 'cookies': get_cookies, @@ -304,6 +308,14 @@ async def problematic_links_endpoint(_scope: dict[str, Any], _receive: Receive, ) +async def non_href_links_endpoint(_scope: dict[str, Any], _receive: Receive, send: Send) -> None: + """Handle requests with a page containing non-href links.""" + await send_html_response( + send, + NON_HREF_LINKS, + ) + + async def redirect_to_url(scope: dict[str, Any], _receive: Receive, send: Send) -> None: """Handle requests that should redirect to a specified full URL.""" query_params = get_query_params(scope.get('query_string', b'')) @@ -453,6 +465,16 @@ async def base_index_endpoint(_scope: dict[str, Any], _receive: Receive, send: S ) +async def start_enqueue_non_href_endpoint(_scope: dict[str, Any], _receive: Receive, send: Send) -> None: + """Handle requests for the start page whose links use non-href attributes (e.g. image `src`).""" + host = f'http://{get_headers_dict(_scope).get("host", "localhost")}' + content = START_ENQUEUE_NON_HREF.format(host=host).encode() + await send_html_response( + send, + content, + ) + + class TestServer(Server): """A test HTTP server implementation based on Uvicorn Server.""" diff --git a/tests/unit/server_endpoints.py b/tests/unit/server_endpoints.py index fede2ee173..b32d136524 100644 --- a/tests/unit/server_endpoints.py +++ b/tests/unit/server_endpoints.py @@ -17,6 +17,18 @@ test@test.com """ +START_ENQUEUE_NON_HREF = """\ + + + Hello + + + Link A + Link B + + +""" + SECONDARY_INDEX = b"""\ Hello @@ -57,6 +69,16 @@ Apify avatar/a> """ +NON_HREF_LINKS = b"""\ + + Hello + + + +
  • + +""" + GENERIC_RESPONSE = b"""\ Hello