Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
"""

import io
import os
import sys
from typing import Any, BinaryIO, Optional

Expand All @@ -25,10 +26,14 @@
_dependency_exc_info = sys.exc_info()


def _extract_images_from_page(page: Any) -> list[dict]:
def _extract_images_from_page(page: Any, max_dimension: int = 1500) -> list[dict]:
"""
Extract images from a PDF page by rendering page regions.

Args:
page: PDF page object from pdfplumber
max_dimension: Maximum width/height for resized images (default: 1500)

Returns:
List of dicts with 'stream', 'bbox', 'name', 'y_pos' keys
"""
Expand Down Expand Up @@ -74,6 +79,13 @@ def _extract_images_from_page(page: Any) -> list[dict]:
if pil_img.mode not in ("RGB", "L"):
pil_img = pil_img.convert("RGB")

# Resize large images to reduce LLM Vision API load
# Target: at most max_dimension pixels per side (1500 by default), aspect ratio preserved
if pil_img.width > max_dimension or pil_img.height > max_dimension:
scale = min(max_dimension / pil_img.width, max_dimension / pil_img.height)
new_size = (int(pil_img.width * scale), int(pil_img.height * scale))
pil_img = pil_img.resize(new_size, Image.LANCZOS)

# Save to stream as PNG
img_stream = io.BytesIO()
pil_img.save(img_stream, format="PNG")
Expand Down Expand Up @@ -132,9 +144,19 @@ class PdfConverterWithOCR(DocumentConverter):
Maintains document structure while extracting text from images inline.
"""

def __init__(self, ocr_service: Optional[LLMVisionOCRService] = None):
def __init__(
    self,
    ocr_service: Optional[LLMVisionOCRService] = None,
    max_image_dimension: Optional[int] = None,
):
    """
    Create the converter and resolve the image size limit.

    Args:
        ocr_service: OCR service stored on the instance; may be None.
        max_image_dimension: Maximum width/height in pixels for extracted
            images. When None, the MARKITDOWN_MAX_IMAGE_DIMENSION
            environment variable is read, falling back to 1500 if unset.

    Raises:
        ValueError: If the resolved limit is not a positive integer,
            including when the environment variable is not a valid integer.
    """
    super().__init__()
    self.ocr_service = ocr_service
    # Explicit argument wins; otherwise the environment variable; otherwise 1500.
    if max_image_dimension is None:
        raw_limit = os.environ.get("MARKITDOWN_MAX_IMAGE_DIMENSION", "1500")
        try:
            max_image_dimension = int(raw_limit)
        except ValueError as exc:
            # Name the variable so a misconfigured environment is easy to diagnose.
            raise ValueError(
                "MARKITDOWN_MAX_IMAGE_DIMENSION must be a positive integer, "
                f"got {raw_limit!r}"
            ) from exc
    if max_image_dimension <= 0:
        raise ValueError("max_image_dimension must be a positive integer")
    self.max_image_dimension = max_image_dimension

def accepts(
self,
Expand All @@ -161,6 +183,8 @@ def convert(
stream_info: StreamInfo,
**kwargs: Any,
) -> DocumentConverterResult:
# Get progress callback if provided
progress_callback = kwargs.get("progress_callback")
if _dependency_exc_info is not None:
raise MissingDependencyException(
MISSING_DEPENDENCY_MESSAGE.format(
Expand All @@ -185,7 +209,14 @@ def convert(

try:
with pdfplumber.open(pdf_bytes) as pdf:
total_pages = len(pdf.pages)

for page_num, page in enumerate(pdf.pages, 1):
# Report progress if callback provided
if progress_callback:
progress = int((page_num / total_pages) * 100)
progress_callback(progress, f"Parsing page {page_num}/{total_pages}")

markdown_content.append(f"\n## Page {page_num}\n")

# If OCR is enabled, interleave text and images by position
Expand Down Expand Up @@ -306,29 +337,35 @@ def convert(
# treat as scanned PDF and OCR full pages
if ocr_service and (not markdown or not markdown.strip()):
pdf_bytes.seek(0)
markdown = self._ocr_full_pages(pdf_bytes, ocr_service)
markdown = self._ocr_full_pages(pdf_bytes, ocr_service, progress_callback)

return DocumentConverterResult(markdown=markdown)

def _extract_page_images(self, pdf_bytes: io.BytesIO, page_num: int) -> list[dict]:
def _extract_page_images(
self, pdf_bytes: io.BytesIO, page_num: int, max_dimension: Optional[int] = None
) -> list[dict]:
"""
Extract images from a PDF page using pdfplumber.

Args:
pdf_bytes: PDF file as BytesIO
page_num: Page number (1-indexed)
max_dimension: Maximum width/height for resized images (optional, uses instance default if not provided)

Returns:
List of image info dicts with 'stream', 'bbox', 'name', 'y_pos'
"""
images = []

# Use provided max_dimension or fall back to instance default
dimension_limit = max_dimension if max_dimension is not None else self.max_image_dimension

try:
pdf_bytes.seek(0)
with pdfplumber.open(pdf_bytes) as pdf:
if page_num <= len(pdf.pages):
page = pdf.pages[page_num - 1] # 0-indexed
images = _extract_images_from_page(page)
images = _extract_images_from_page(page, dimension_limit)
except Exception:
pass

Expand All @@ -338,7 +375,8 @@ def _extract_page_images(self, pdf_bytes: io.BytesIO, page_num: int) -> list[dic
return images

def _ocr_full_pages(
self, pdf_bytes: io.BytesIO, ocr_service: LLMVisionOCRService
self, pdf_bytes: io.BytesIO, ocr_service: LLMVisionOCRService,
progress_callback: Optional[Any] = None
) -> str:
"""
Fallback for scanned PDFs: Convert entire pages to images and OCR them.
Expand All @@ -347,6 +385,7 @@ def _ocr_full_pages(
Args:
pdf_bytes: PDF file as BytesIO
ocr_service: OCR service to use
progress_callback: Optional callback for progress updates (progress, message)

Returns:
Markdown text extracted from OCR of full pages
Expand All @@ -356,7 +395,13 @@ def _ocr_full_pages(
try:
pdf_bytes.seek(0)
with pdfplumber.open(pdf_bytes) as pdf:
total_pages = len(pdf.pages)
for page_num, page in enumerate(pdf.pages, 1):
# Report progress if callback provided
if progress_callback:
progress = int((page_num / total_pages) * 100)
progress_callback(progress, f"OCR page {page_num}/{total_pages}")

try:
markdown_parts.append(f"\n## Page {page_num}\n")

Expand Down Expand Up @@ -391,7 +436,13 @@ def _ocr_full_pages(

pdf_bytes.seek(0)
doc = fitz.open(stream=pdf_bytes.read(), filetype="pdf")
for page_num in range(1, doc.page_count + 1):
total_pages = doc.page_count
for page_num in range(1, total_pages + 1):
# Report progress if callback provided
if progress_callback:
progress = int((page_num / total_pages) * 100)
progress_callback(progress, f"OCR page {page_num}/{total_pages}")

try:
markdown_parts.append(f"\n## Page {page_num}\n")
page = doc[page_num - 1]
Expand Down