Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/crawlee/_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,10 +165,10 @@ class EnqueueLinksKwargs(TypedDict):
combination of protocol, domain, and port, ensuring a strict scope for the crawl.
"""

include: NotRequired[list[re.Pattern | Glob]]
include: NotRequired[Sequence[re.Pattern | Glob]]
"""List of regular expressions or globs that URLs must match to be enqueued."""

exclude: NotRequired[list[re.Pattern | Glob]]
exclude: NotRequired[Sequence[re.Pattern | Glob]]
"""List of regular expressions or globs that URLs must not match to be enqueued."""


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ def create_parsed_http_crawler_class(
class _ParsedHttpCrawler(AbstractHttpCrawler):
def __init__(
self,
parser: AbstractHttpParser[TParseResult, TSelectResult] = static_parser,
parser: AbstractHttpParser[TParseResult, TSelectResult] = static_parser, # ty: ignore[invalid-parameter-default]
**kwargs: Unpack[BasicCrawlerOptions[ParsedHttpCrawlingContext[TParseResult]]],
) -> None:
kwargs['_context_pipeline'] = self._create_static_content_crawler_pipeline()
Expand Down
2 changes: 1 addition & 1 deletion src/crawlee/proxy_configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ async def _pick_url(
if inspect.isawaitable(result):
result = await result

return URL(result) if result is not None else None, None
return URL(str(result)) if result is not None else None, None
except Exception as e:
raise ValueError('The provided "new_url_function" did not return a valid URL') from e

Expand Down
3 changes: 2 additions & 1 deletion src/crawlee/request_loaders/_request_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,9 @@ def __init__(
logger=logger,
)

self._requests: AsyncIterator[str | Request]
if isinstance(requests, AsyncIterable):
self._requests = requests.__aiter__()
self._requests = requests.__aiter__() # ty: ignore[invalid-assignment]
elif requests is None:
self._requests = self._iterate_in_threadpool([])
else:
Expand Down
6 changes: 4 additions & 2 deletions src/crawlee/storage_clients/_redis/_utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from collections.abc import Awaitable
from pathlib import Path
from typing import TypeVar, overload
from typing import TypeVar, cast, overload

T = TypeVar('T')

Expand All @@ -13,7 +13,9 @@ async def await_redis_response(response: T) -> T: ...

async def await_redis_response(response: Awaitable[T] | T) -> T:
"""Solve the problem of ambiguous typing for redis."""
return await response if isinstance(response, Awaitable) else response
if isinstance(response, Awaitable):
return cast('T', await response)
return response


def read_lua_script(script_name: str) -> str:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ async def test_static_crawler_actor_at_apify(
'start_url': default_start_url,
'install_project': False,
},
output_dir=tmp_path,
output_dir=str(tmp_path),
)

patch_crawlee_version_in_project(
Expand Down
7 changes: 6 additions & 1 deletion tests/unit/request_loaders/test_sitemap_request_loader.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import asyncio
import base64
import gzip
from typing import TYPE_CHECKING

from yarl import URL

Expand All @@ -9,6 +10,9 @@
from crawlee.request_loaders._sitemap_request_loader import SitemapRequestLoader
from crawlee.storages import KeyValueStore

if TYPE_CHECKING:
from crawlee._types import JsonSerializable

BASIC_SITEMAP = """
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
Expand Down Expand Up @@ -182,7 +186,8 @@ async def test_transform_request_function(server_url: URL, http_client: HttpClie
sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode()))

def transform_request(request_options: RequestOptions) -> RequestOptions | RequestTransformAction:
request_options['user_data'] = {'transformed': True}
user_data: dict[str, JsonSerializable] = {'transformed': True}
request_options['user_data'] = user_data
return request_options

sitemap_loader = SitemapRequestLoader(
Expand Down
963 changes: 506 additions & 457 deletions uv.lock

Large diffs are not rendered by default.

Loading
Loading