From 33b2559b499ec112169eab943ba84cc03f451018 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Thu, 12 Mar 2026 02:22:00 +0000 Subject: [PATCH 1/2] add browser page hooks --- src/crawlee/browsers/_browser_pool.py | 76 ++++++++++++++++++++++++++- 1 file changed, 74 insertions(+), 2 deletions(-) diff --git a/src/crawlee/browsers/_browser_pool.py b/src/crawlee/browsers/_browser_pool.py index 0f27634683..03807ad670 100644 --- a/src/crawlee/browsers/_browser_pool.py +++ b/src/crawlee/browsers/_browser_pool.py @@ -19,7 +19,7 @@ from crawlee.browsers._types import BrowserType, CrawleePage if TYPE_CHECKING: - from collections.abc import Mapping, Sequence + from collections.abc import Awaitable, Callable, Mapping, Sequence from pathlib import Path from types import TracebackType @@ -99,6 +99,13 @@ def __init__( self._pages = WeakValueDictionary[str, CrawleePage]() # Track the pages in the pool self._plugins_cycle = itertools.cycle(self._plugins) # Cycle through the plugins + self._pre_page_create_hooks: list[ + Callable[[str, BrowserController, Mapping[str, Any], ProxyInfo | None], Awaitable[None]] + ] = [] + self._post_page_create_hooks: list[Callable[[CrawleePage, BrowserController], Awaitable[None]]] = [] + self._pre_page_close_hooks: list[Callable[[CrawleePage, BrowserController], Awaitable[None]]] = [] + self._post_page_close_hooks: list[Callable[[str, BrowserController], Awaitable[None]]] = [] + # Flag to indicate the context state. 
self._active = False @@ -301,9 +308,15 @@ async def _get_new_page( try: if not browser_controller: browser_controller = await asyncio.wait_for(self._launch_new_browser(plugin), timeout) + browser_new_context_options = plugin.browser_new_context_options + + await self._execute_hooks( + self._pre_page_create_hooks, page_id, browser_controller, browser_new_context_options, proxy_info + ) + page = await asyncio.wait_for( browser_controller.new_page( - browser_new_context_options=plugin.browser_new_context_options, + browser_new_context_options=browser_new_context_options, proxy_info=proxy_info, ), timeout, @@ -319,6 +332,11 @@ async def _get_new_page( crawlee_page = CrawleePage(id=page_id, page=page, browser_type=plugin.browser_type) self._pages[page_id] = crawlee_page self._total_pages_count += 1 + + await self._execute_hooks(self._post_page_create_hooks, crawlee_page, browser_controller) + + self._override_page_close(crawlee_page, browser_controller) + return crawlee_page def _pick_browser_with_free_capacity( @@ -357,3 +375,57 @@ async def _close_inactive_browsers(self) -> None: if not browser.pages: await browser.close() self._inactive_browsers.remove(browser) + + async def _execute_hooks(self, hooks: list[Callable[..., Awaitable[None]]], *args: Any) -> None: + """Execute the provided hooks with the given arguments.""" + for hook in hooks: + await hook(*args) + + def _override_page_close(self, crawlee_page: CrawleePage, browser_controller: BrowserController) -> None: + """Override the page's close method to execute pre and post close hooks.""" + if self._pre_page_close_hooks or self._post_page_close_hooks: + original_close = crawlee_page.page.close + + async def close_with_hooks(*args: Any, **kwargs: Any) -> None: + await self._execute_hooks(self._pre_page_close_hooks, crawlee_page, browser_controller) + await original_close(*args, **kwargs) + await self._execute_hooks(self._post_page_close_hooks, crawlee_page.id, browser_controller) + + crawlee_page.page.close: 
Callable[..., Awaitable[None]] = close_with_hooks + + def pre_page_create_hook( + self, hook: Callable[[str, BrowserController, Mapping[str, Any], ProxyInfo | None], Awaitable[None]] + ) -> None: + """Register a hook to be called just before a new page is created. + + The hook receives the page ID, `BrowserController`, `browser_new_context_options`, and `ProxyInfo`. + Note that depending on the `BrowserController` implementation, `browser_new_context_options` may not + apply to every page individually. For example, `PlaywrightBrowserController` with + `use_incognito_pages=False` shares a single context across all pages, so the options are applied + only when the context is first created. + """ + self._pre_page_create_hooks.append(hook) + + def post_page_create_hook(self, hook: Callable[[CrawleePage, BrowserController], Awaitable[None]]) -> None: + """Register a hook to be called right after a new page is created. + + The hook receives the newly created `CrawleePage` and the `BrowserController`. Use it to apply + changes to all pages, such as injecting scripts or configuring request interception. + """ + self._post_page_create_hooks.append(hook) + + def pre_page_close_hook(self, hook: Callable[[CrawleePage, BrowserController], Awaitable[None]]) -> None: + """Register a hook to be called just before a page is closed. + + The hook receives the `CrawleePage` and the `BrowserController`. Use it to collect last-second data, + such as taking a screenshot or saving page state before the page is destroyed. + """ + self._pre_page_close_hooks.append(hook) + + def post_page_close_hook(self, hook: Callable[[str, BrowserController], Awaitable[None]]) -> None: + """Register a hook to be called right after a page is closed. + + The hook receives the page ID and the `BrowserController`. Use it for cleanup or logging + after a page's lifecycle ends. 
+ """ + self._post_page_close_hooks.append(hook) From a07ea343ea4349d5ebb0302b1287d65678c02e1d Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Fri, 13 Mar 2026 01:53:43 +0000 Subject: [PATCH 2/2] pass mutable context options to pre-create hooks, add hook tests --- src/crawlee/browsers/_browser_pool.py | 6 +- tests/unit/browsers/test_browser_pool.py | 162 +++++++++++++++++++++++ 2 files changed, 165 insertions(+), 3 deletions(-) diff --git a/src/crawlee/browsers/_browser_pool.py b/src/crawlee/browsers/_browser_pool.py index 03807ad670..fb9e670928 100644 --- a/src/crawlee/browsers/_browser_pool.py +++ b/src/crawlee/browsers/_browser_pool.py @@ -100,7 +100,7 @@ def __init__( self._plugins_cycle = itertools.cycle(self._plugins) # Cycle through the plugins self._pre_page_create_hooks: list[ - Callable[[str, BrowserController, Mapping[str, Any], ProxyInfo | None], Awaitable[None]] + Callable[[str, BrowserController, dict[str, Any], ProxyInfo | None], Awaitable[None]] ] = [] self._post_page_create_hooks: list[Callable[[CrawleePage, BrowserController], Awaitable[None]]] = [] self._pre_page_close_hooks: list[Callable[[CrawleePage, BrowserController], Awaitable[None]]] = [] @@ -308,7 +308,7 @@ async def _get_new_page( try: if not browser_controller: browser_controller = await asyncio.wait_for(self._launch_new_browser(plugin), timeout) - browser_new_context_options = plugin.browser_new_context_options + browser_new_context_options = dict(plugin.browser_new_context_options) await self._execute_hooks( self._pre_page_create_hooks, page_id, browser_controller, browser_new_context_options, proxy_info @@ -394,7 +394,7 @@ async def close_with_hooks(*args: Any, **kwargs: Any) -> None: crawlee_page.page.close: Callable[..., Awaitable[None]] = close_with_hooks def pre_page_create_hook( - self, hook: Callable[[str, BrowserController, Mapping[str, Any], ProxyInfo | None], Awaitable[None]] + self, hook: Callable[[str, BrowserController, dict[str, Any], ProxyInfo | None], Awaitable[None]] ) -> None: """Register a hook to be called just 
before a new page is created. diff --git a/tests/unit/browsers/test_browser_pool.py b/tests/unit/browsers/test_browser_pool.py index 6fc878d0f6..56e4706adc 100644 --- a/tests/unit/browsers/test_browser_pool.py +++ b/tests/unit/browsers/test_browser_pool.py @@ -1,15 +1,23 @@ from __future__ import annotations from typing import TYPE_CHECKING +from unittest.mock import AsyncMock import pytest from crawlee.browsers import BrowserPool, PlaywrightBrowserPlugin +from crawlee.browsers._browser_controller import BrowserController +from crawlee.browsers._types import CrawleePage from tests.unit.utils import run_alone_on_mac if TYPE_CHECKING: + from collections.abc import Mapping + from typing import Any + from yarl import URL + from crawlee.proxy_configuration import ProxyInfo + async def test_default_plugin_new_page_creation(server_url: URL) -> None: async with BrowserPool() as browser_pool: @@ -192,3 +200,157 @@ async def test_browser_pool_retire_browser_after_page_count( assert first_browser is second_browser else: assert first_browser is not second_browser + + +async def test_pre_page_create_hook_is_called() -> None: + call_mock = AsyncMock() + + async with BrowserPool() as browser_pool: + + @browser_pool.pre_page_create_hook + async def hook( + page_id: str, + controller: BrowserController, + browser_new_context_options: dict[str, Any], + proxy_info: ProxyInfo | None, + ) -> None: + await call_mock(page_id, controller, browser_new_context_options, proxy_info) + + browser_new_context_options['user_agent'] = 'Modified User-Agent' + + assert len(controller.pages) == 0 + + test_page = await browser_pool.new_page() + user_agent = await test_page.page.evaluate('navigator.userAgent') + + await test_page.page.close() + + assert user_agent == 'Modified User-Agent' + + call_mock.assert_awaited_once() + page_id, controller, _, proxy_info = call_mock.call_args[0] + + assert isinstance(page_id, str) + assert test_page.id == page_id + assert isinstance(controller, BrowserController) 
+ assert proxy_info is None + + +async def test_post_page_create_hook_is_called() -> None: + call_mock = AsyncMock() + + async with BrowserPool() as browser_pool: + + @browser_pool.post_page_create_hook + async def hook(crawlee_page: CrawleePage, controller: BrowserController) -> None: + await call_mock(crawlee_page, controller) + await crawlee_page.page.evaluate('window.__hook_applied = true') + + assert isinstance(crawlee_page, CrawleePage) + + assert len(controller.pages) == 1 + + test_page = await browser_pool.new_page() + + js_result = await test_page.page.evaluate('window.__hook_applied') + + await test_page.page.close() + + assert js_result is True + + call_mock.assert_awaited_once() + crawlee_page, controller = call_mock.call_args[0] + + assert test_page is crawlee_page + assert isinstance(controller, BrowserController) + + +async def test_pre_page_close_hook() -> None: + call_mock = AsyncMock() + + async with BrowserPool() as browser_pool: + + @browser_pool.pre_page_close_hook + async def hook(crawlee_page: CrawleePage, controller: BrowserController) -> None: + await call_mock(crawlee_page, controller) + + assert not crawlee_page.page.is_closed() + assert len(controller.pages) == 1 + + test_page = await browser_pool.new_page() + await test_page.page.close() + + call_mock.assert_awaited_once() + assert test_page.page.is_closed() + + +async def test_post_page_close_hook() -> None: + call_mock = AsyncMock() + + async with BrowserPool() as browser_pool: + + @browser_pool.post_page_close_hook + async def hook(page_id: str, controller: BrowserController) -> None: + await call_mock(page_id, controller) + + assert len(controller.pages) == 0 + + test_page = await browser_pool.new_page() + await test_page.page.close() + + page_id, controller = call_mock.call_args[0] + + call_mock.assert_awaited_once() + assert test_page.id == page_id + assert isinstance(controller, BrowserController) + + +async def test_page_hooks_execution_order() -> None: + call_order: list[str] = 
[] + + async with BrowserPool() as browser_pool: + + @browser_pool.pre_page_create_hook + async def pre_create( + _page_id: str, + _controller: BrowserController, + _browser_new_context_options: Mapping[str, Any], + _proxy_info: ProxyInfo | None, + ) -> None: + call_order.append('pre_create') + + @browser_pool.post_page_create_hook + async def post_create(_crawlee_page: CrawleePage, _controller: BrowserController) -> None: + call_order.append('post_create') + + @browser_pool.pre_page_close_hook + async def pre_close(_crawlee_page: CrawleePage, _controller: BrowserController) -> None: + call_order.append('pre_close') + + @browser_pool.post_page_close_hook + async def post_close(_page_id: str, _controller: BrowserController) -> None: + call_order.append('post_close') + + page = await browser_pool.new_page() + await page.page.close() + + assert call_order == ['pre_create', 'post_create', 'pre_close', 'post_close'] + + +async def test_multiple_hooks_all_called() -> None: + call_order: list[str] = [] + + async with BrowserPool() as browser_pool: + + @browser_pool.post_page_create_hook + async def first(_crawlee_page: CrawleePage, _controller: BrowserController) -> None: + call_order.append('first') + + @browser_pool.post_page_create_hook + async def second(_crawlee_page: CrawleePage, _controller: BrowserController) -> None: + call_order.append('second') + + page = await browser_pool.new_page() + await page.page.close() + + assert call_order == ['first', 'second']