Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 74 additions & 2 deletions src/crawlee/browsers/_browser_pool.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from crawlee.browsers._types import BrowserType, CrawleePage

if TYPE_CHECKING:
from collections.abc import Mapping, Sequence
from collections.abc import Awaitable, Callable, Mapping, Sequence
from pathlib import Path
from types import TracebackType

Expand Down Expand Up @@ -99,6 +99,13 @@ def __init__(
self._pages = WeakValueDictionary[str, CrawleePage]() # Track the pages in the pool
self._plugins_cycle = itertools.cycle(self._plugins) # Cycle through the plugins

self._pre_page_create_hooks: list[
Callable[[str, BrowserController, dict[str, Any], ProxyInfo | None], Awaitable[None]]
] = []
self._post_page_create_hooks: list[Callable[[CrawleePage, BrowserController], Awaitable[None]]] = []
self._pre_page_close_hooks: list[Callable[[CrawleePage, BrowserController], Awaitable[None]]] = []
self._post_page_close_hooks: list[Callable[[str, BrowserController], Awaitable[None]]] = []

# Flag to indicate the context state.
self._active = False

Expand Down Expand Up @@ -301,9 +308,15 @@ async def _get_new_page(
try:
if not browser_controller:
browser_controller = await asyncio.wait_for(self._launch_new_browser(plugin), timeout)
browser_new_context_options = dict(plugin.browser_new_context_options)

await self._execute_hooks(
self._pre_page_create_hooks, page_id, browser_controller, browser_new_context_options, proxy_info
)

page = await asyncio.wait_for(
browser_controller.new_page(
browser_new_context_options=plugin.browser_new_context_options,
browser_new_context_options=browser_new_context_options,
proxy_info=proxy_info,
),
timeout,
Expand All @@ -319,6 +332,11 @@ async def _get_new_page(
crawlee_page = CrawleePage(id=page_id, page=page, browser_type=plugin.browser_type)
self._pages[page_id] = crawlee_page
self._total_pages_count += 1

await self._execute_hooks(self._post_page_create_hooks, crawlee_page, browser_controller)

self._override_page_close(crawlee_page, browser_controller)

return crawlee_page

def _pick_browser_with_free_capacity(
Expand Down Expand Up @@ -357,3 +375,57 @@ async def _close_inactive_browsers(self) -> None:
if not browser.pages:
await browser.close()
self._inactive_browsers.remove(browser)

async def _execute_hooks(self, hooks: list[Callable[..., Awaitable[None]]], *args: Any) -> None:
"""Execute the provided hooks with the given arguments."""
for hook in hooks:
await hook(*args)

def _override_page_close(self, crawlee_page: CrawleePage, browser_controller: BrowserController) -> None:
"""Override the page's close method to execute pre and post close hooks."""
if self._pre_page_close_hooks or self._post_page_close_hooks:
original_close = crawlee_page.page.close

async def close_with_hooks(*args: Any, **kwargs: Any) -> None:
await self._execute_hooks(self._pre_page_close_hooks, crawlee_page, browser_controller)
await original_close(*args, **kwargs)
await self._execute_hooks(self._post_page_close_hooks, crawlee_page.id, browser_controller)

crawlee_page.page.close: Callable[..., Awaitable[None]] = close_with_hooks
Comment on lines +386 to +394
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is not expected that a closing hook will be added after the page has already been opened.


def pre_page_create_hook(
    self, hook: Callable[[str, BrowserController, dict[str, Any], ProxyInfo | None], Awaitable[None]]
) -> Callable[[str, BrowserController, dict[str, Any], ProxyInfo | None], Awaitable[None]]:
    """Register a hook to be called just before a new page is created.

    The hook receives the page ID, `BrowserController`, `browser_new_context_options`, and `ProxyInfo`.
    Note that depending on the `BrowserController` implementation, `browser_new_context_options` may not
    apply to every page individually. For example, `PlaywrightBrowserController` with
    ``use_incognito_pages=False`` shares a single context across all pages, so the options are applied
    only when the context is first created.

    Args:
        hook: The async callable to register.

    Returns:
        The same `hook`, unchanged, so the method can be used as a decorator without rebinding the
        decorated name to `None`.
    """
    self._pre_page_create_hooks.append(hook)
    return hook

def post_page_create_hook(
    self, hook: Callable[[CrawleePage, BrowserController], Awaitable[None]]
) -> Callable[[CrawleePage, BrowserController], Awaitable[None]]:
    """Register a hook to be called right after a new page is created.

    The hook receives the newly created `CrawleePage` and the `BrowserController`. Use it to apply
    changes to all pages, such as injecting scripts or configuring request interception.

    Args:
        hook: The async callable to register.

    Returns:
        The same `hook`, unchanged, so the method can be used as a decorator without rebinding the
        decorated name to `None`.
    """
    self._post_page_create_hooks.append(hook)
    return hook

def pre_page_close_hook(
    self, hook: Callable[[CrawleePage, BrowserController], Awaitable[None]]
) -> Callable[[CrawleePage, BrowserController], Awaitable[None]]:
    """Register a hook to be called just before a page is closed.

    The hook receives the `CrawleePage` and the `BrowserController`. Use it to collect last-second data,
    such as taking a screenshot or saving page state before the page is destroyed.

    Args:
        hook: The async callable to register.

    Returns:
        The same `hook`, unchanged, so the method can be used as a decorator without rebinding the
        decorated name to `None`.
    """
    self._pre_page_close_hooks.append(hook)
    return hook

def post_page_close_hook(
    self, hook: Callable[[str, BrowserController], Awaitable[None]]
) -> Callable[[str, BrowserController], Awaitable[None]]:
    """Register a hook to be called right after a page is closed.

    The hook receives the page ID and the `BrowserController`. Use it for cleanup or logging
    after a page's lifecycle ends.

    Args:
        hook: The async callable to register.

    Returns:
        The same `hook`, unchanged, so the method can be used as a decorator without rebinding the
        decorated name to `None`.
    """
    self._post_page_close_hooks.append(hook)
    return hook
162 changes: 162 additions & 0 deletions tests/unit/browsers/test_browser_pool.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,23 @@
from __future__ import annotations

from typing import TYPE_CHECKING
from unittest.mock import AsyncMock

import pytest

from crawlee.browsers import BrowserPool, PlaywrightBrowserPlugin
from crawlee.browsers._browser_controller import BrowserController
from crawlee.browsers._types import CrawleePage
from tests.unit.utils import run_alone_on_mac

if TYPE_CHECKING:
from collections.abc import Mapping
from typing import Any

from yarl import URL

from crawlee.proxy_configuration import ProxyInfo


async def test_default_plugin_new_page_creation(server_url: URL) -> None:
async with BrowserPool() as browser_pool:
Expand Down Expand Up @@ -192,3 +200,157 @@ async def test_browser_pool_retire_browser_after_page_count(
assert first_browser is second_browser
else:
assert first_browser is not second_browser


async def test_pre_page_create_hook_is_called() -> None:
    """A pre-create hook is awaited once and can change the context options used for the new page."""
    recorder = AsyncMock()

    async with BrowserPool() as pool:

        @pool.pre_page_create_hook
        async def hook(
            page_id: str,
            controller: BrowserController,
            browser_new_context_options: dict[str, Any],
            proxy_info: ProxyInfo | None,
        ) -> None:
            await recorder(page_id, controller, browser_new_context_options, proxy_info)

            # Mutate the options in place; the user-agent assertion below checks that it took effect.
            browser_new_context_options['user_agent'] = 'Modified User-Agent'

            # The hook runs before the page has been created on the controller.
            assert len(controller.pages) == 0

        created_page = await pool.new_page()
        reported_user_agent = await created_page.page.evaluate('navigator.userAgent')

        await created_page.page.close()

    assert reported_user_agent == 'Modified User-Agent'

    recorder.assert_awaited_once()
    seen_page_id, seen_controller, _, seen_proxy_info = recorder.call_args.args

    assert isinstance(seen_page_id, str)
    assert created_page.id == seen_page_id
    assert isinstance(seen_controller, BrowserController)
    assert seen_proxy_info is None


async def test_post_page_create_hook_is_called() -> None:
    """A post-create hook is awaited once with the new page and can act on it before it is returned."""
    recorder = AsyncMock()

    async with BrowserPool() as pool:

        @pool.post_page_create_hook
        async def hook(crawlee_page: CrawleePage, controller: BrowserController) -> None:
            await recorder(crawlee_page, controller)
            # Leave a marker in the page so the test can verify the hook touched this exact page.
            await crawlee_page.page.evaluate('window.__hook_applied = true')

            assert isinstance(crawlee_page, CrawleePage)

            # The page already exists on the controller when the hook runs.
            assert len(controller.pages) == 1

        created_page = await pool.new_page()

        marker = await created_page.page.evaluate('window.__hook_applied')

        await created_page.page.close()

    assert marker is True

    recorder.assert_awaited_once()
    seen_page, seen_controller = recorder.call_args.args

    assert created_page is seen_page
    assert isinstance(seen_controller, BrowserController)


async def test_pre_page_close_hook() -> None:
    """A pre-close hook is awaited once, while the page is still open."""
    recorder = AsyncMock()

    async with BrowserPool() as pool:

        @pool.pre_page_close_hook
        async def hook(crawlee_page: CrawleePage, controller: BrowserController) -> None:
            await recorder(crawlee_page, controller)

            # The hook runs before the actual close, so the page must still be open and tracked.
            assert not crawlee_page.page.is_closed()
            assert len(controller.pages) == 1

        created_page = await pool.new_page()
        await created_page.page.close()

    recorder.assert_awaited_once()
    assert created_page.page.is_closed()


async def test_post_page_close_hook() -> None:
    """A post-close hook is awaited once with the closed page's ID, after the page has been closed."""
    call_mock = AsyncMock()

    async with BrowserPool() as browser_pool:

        @browser_pool.post_page_close_hook
        async def hook(page_id: str, controller: BrowserController) -> None:
            await call_mock(page_id, controller)

            # The hook runs after the close, so the controller no longer tracks the page.
            assert len(controller.pages) == 0

        test_page = await browser_pool.new_page()
        await test_page.page.close()

    # Check the call count first: if the hook never ran, `call_args` is None and unpacking it would
    # raise a TypeError instead of a clear assertion failure.
    call_mock.assert_awaited_once()
    page_id, controller = call_mock.call_args[0]

    assert test_page.id == page_id
    assert isinstance(controller, BrowserController)


async def test_page_hooks_execution_order() -> None:
    """The four hook kinds fire in lifecycle order across one page's creation and close."""
    events: list[str] = []

    async with BrowserPool() as pool:

        @pool.pre_page_create_hook
        async def pre_create(
            _page_id: str,
            _controller: BrowserController,
            _browser_new_context_options: Mapping[str, Any],
            _proxy_info: ProxyInfo | None,
        ) -> None:
            events.append('pre_create')

        @pool.post_page_create_hook
        async def post_create(_crawlee_page: CrawleePage, _controller: BrowserController) -> None:
            events.append('post_create')

        @pool.pre_page_close_hook
        async def pre_close(_crawlee_page: CrawleePage, _controller: BrowserController) -> None:
            events.append('pre_close')

        @pool.post_page_close_hook
        async def post_close(_page_id: str, _controller: BrowserController) -> None:
            events.append('post_close')

        # One full page lifecycle: create, then close.
        created_page = await pool.new_page()
        await created_page.page.close()

    expected_order = ['pre_create', 'post_create', 'pre_close', 'post_close']
    assert events == expected_order


async def test_multiple_hooks_all_called() -> None:
    """Several hooks of the same kind all run, in the order they were registered."""
    events: list[str] = []

    async with BrowserPool() as pool:

        @pool.post_page_create_hook
        async def first(_crawlee_page: CrawleePage, _controller: BrowserController) -> None:
            events.append('first')

        @pool.post_page_create_hook
        async def second(_crawlee_page: CrawleePage, _controller: BrowserController) -> None:
            events.append('second')

        created_page = await pool.new_page()
        await created_page.page.close()

    expected_order = ['first', 'second']
    assert events == expected_order
Loading