simonw · bbaros · May 20, 2025
diff --git a/README.md b/README.md
@@ -35,6 +35,7 @@ pdf-to-images:<path>?dpi=N&format=jpg|png&quality=Q
 - `dpi=N`: (optional) Dots per inch to use when rendering the PDF pages, which affects the resolution of the output images. Defaults to `300` if omitted.
 - `format=jpg|png`: (optional) Image format to use for the output. Can be either `jpg` (default) or `png`.
 - `quality=Q`: (optional) JPEG quality factor between 1 and 100. Only applies when using JPG format. Defaults to `30` if omitted. Higher values produce better quality but larger file sizes.
+- `image_count_constraint=N`: (optional) Maximum number of images to produce. If the PDF has more pages than this, consecutive pages are stitched vertically into a single image so that the total stays within the limit. This is useful for large documents with many pages. Defaults to `-1`, which means no stitching (one image per page).
 
 ### More examples
 
@@ -68,6 +69,11 @@ Combine multiple parameters:
 llm -f 'pdf-to-images:document.pdf?dpi=450&format=jpg&quality=75' 'OCR'
 ```
 
+Convert a large PDF document into at most 50 images, stitching pages together as needed:
+```bash
+llm -f 'pdf-to-images:document.pdf?dpi=300&image_count_constraint=50' 'summarize'
+```
+
 ## Development
 
 To set up this plugin locally, first checkout the code. Then create a new virtual environment:

diff --git a/llm_pdf_to_images.py b/llm_pdf_to_images.py
@@ -1,5 +1,7 @@
 import os
 import tempfile
+import io
+from PIL import Image
 import fitz
 from urllib.parse import urlparse, parse_qs
 import llm
@@ -9,17 +11,18 @@
 def register_fragment_loaders(register):
     """
     Register the "pdf-to-images" fragment loader.
-    Usage: pdf-to-images:/path/to/file.pdf?dpi=300&format=jpg&quality=80
+    Usage: pdf-to-images:/path/to/file.pdf?dpi=300&format=jpg&quality=80&image_count_constraint=50
     """
     register("pdf-to-images", pdf_to_images_loader)
 
 
 def pdf_to_images_loader(argument: str):
     """
-    Fragment loader "pdf-to-images:<path>?dpi=N&format=jpg|png&quality=Q"
+    Fragment loader "pdf-to-images:<path>?dpi=N&format=jpg|png&quality=Q&image_count_constraint=P"
       - dpi: render resolution (dots per inch), default 300
       - format: "jpg" (default) or "png"
       - quality: JPEG quality 1–100, default 30
+      - image_count_constraint: Max number of images to create, concatenate pages where needed, default -1 (no concatenation)
     """
     parts = urlparse(argument)
     pdf_path = parts.path
@@ -29,6 +32,7 @@ def pdf_to_images_loader(argument: str):
     dpi = int(params.get("dpi", ["300"])[0])
     img_format = params.get("format", ["jpg"])[0].lower()
     quality = int(params.get("quality", ["30"])[0])
+    image_count_constraint = int(params.get("image_count_constraint", ["-1"])[0])
 
     if not os.path.exists(pdf_path):
         raise ValueError(f"PDF file not found: {pdf_path}")
@@ -44,23 +48,62 @@ def pdf_to_images_loader(argument: str):
     out_dir = tempfile.mkdtemp(prefix="llm_pdf_to_images_")
 
     attachments = []
-    for page_number, page in enumerate(doc, start=1):
-        pix = page.get_pixmap(matrix=matrix)
-
-        if img_format in ("jpg", "jpeg"):
-            image_bytes = pix.tobytes(output="jpg", jpg_quality=quality)
-            ext = "jpg"
-        elif img_format == "png":
-            image_bytes = pix.tobytes(output="png")
-            ext = "png"
-        else:
-            raise ValueError(f"Unsupported image format: {img_format}")
-
-        out_name = f"page_{page_number:03d}.{ext}"
-        out_path = os.path.join(out_dir, out_name)
-        with open(out_path, "wb") as img_file:
-            img_file.write(image_bytes)
+
+    # Resolve the output format once; "jpeg" remains an accepted alias of "jpg".
+    if img_format in ("jpg", "jpeg"):
+        ext = "jpg"
+    elif img_format == "png":
+        ext = "png"
+    else:
+        raise ValueError(f"Unsupported image format: {img_format}")
+
+    num_pages = len(doc)
+    if image_count_constraint > 0 and num_pages > image_count_constraint:
+        # Ceiling division: pages per image needed to stay within the limit.
+        pages_per_image = -(-num_pages // image_count_constraint)
+    else:
+        pages_per_image = 1
+
+    for start in range(0, num_pages, pages_per_image):
+        stop = min(start + pages_per_image, num_pages)
+        # Each output file is named after the first page it contains.
+        out_path = os.path.join(out_dir, f"page_{start + 1:03d}.{ext}")
+
+        if stop - start == 1:
+            # Single page: write the rendered bytes directly, as before, so the
+            # image is not decoded and lossily re-encoded a second time.
+            pix = doc[start].get_pixmap(matrix=matrix, alpha=False)  # type: ignore
+            if ext == "jpg":
+                image_bytes = pix.tobytes(output="jpg", jpg_quality=quality)
+            else:
+                image_bytes = pix.tobytes(output="png")
+            with open(out_path, "wb") as img_file:
+                img_file.write(image_bytes)
+        else:
+            # Several pages: stack the raw RGB pixels vertically, encode once.
+            page_images = []
+            for page_index in range(start, stop):
+                pix = doc[page_index].get_pixmap(matrix=matrix, alpha=False)  # type: ignore
+                page_images.append(
+                    Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
+                )
+
+            # White canvas: pages narrower than the widest get no black band.
+            canvas_size = (
+                max(img.width for img in page_images),
+                sum(img.height for img in page_images),
+            )
+            stitched_img = Image.new("RGB", canvas_size, "white")
+            y_offset = 0
+            for img in page_images:
+                stitched_img.paste(img, (0, y_offset))
+                y_offset += img.height
+
+            if ext == "jpg":
+                stitched_img.save(out_path, format="JPEG", quality=quality)
+            else:
+                stitched_img.save(out_path, format="PNG")
 
         attachments.append(llm.Attachment(path=out_path))
 
     return attachments
diff --git a/pyproject.toml b/pyproject.toml
@@ -3,14 +3,11 @@ name = "llm-pdf-to-images"
 version = "0.1"
 description = "LLM fragment plugin to load a PDF as a sequence of images"
 readme = "README.md"
-authors = [{name = "Simon Willison"}]
+authors = [{ name = "Simon Willison" }]
 license = "Apache-2.0"
 classifiers = []
 requires-python = ">=3.9"
-dependencies = [
-    "llm",
-    "PyMuPDF"
-]
+dependencies = ["llm", "PyMuPDF", "Pillow"]
 
 [build-system]
 requires = ["setuptools"]

diff --git a/tests/hundred-pages.pdf b/tests/hundred-pages.pdf
diff --git a/tests/test_pdf_to_images.py b/tests/test_pdf_to_images.py
@@ -15,3 +15,31 @@ def test_pdf_to_images():
     # Now delete them
     out_dir = os.path.dirname(attachments[0].path)
     shutil.rmtree(out_dir)
+
+
+def test_large_pdf_to_images_50():
+    path = os.path.join(os.path.dirname(__file__), "hundred-pages.pdf")
+    attachments = pdf_to_images_loader(path + "?image_count_constraint=50")
+    assert isinstance(attachments, list)
+    assert len(attachments) == 50
+    assert all(isinstance(attachment, llm.Attachment) for attachment in attachments)
+    assert attachments[0].path.endswith("page_001.jpg")
+    # Files are named after their first page: the 1st image covers pages 1-2, so the 2nd starts at page 3
+    assert attachments[1].path.endswith("page_003.jpg")
+    # Remove the temporary output directory the loader created
+    out_dir = os.path.dirname(attachments[0].path)
+    shutil.rmtree(out_dir)
+
+
+def test_large_pdf_to_images_40():
+    path = os.path.join(os.path.dirname(__file__), "hundred-pages.pdf")
+    attachments = pdf_to_images_loader(path + "?image_count_constraint=40")
+    assert isinstance(attachments, list)
+    assert len(attachments) == 34
+    assert all(isinstance(attachment, llm.Attachment) for attachment in attachments)
+    assert attachments[0].path.endswith("page_001.jpg")
+    # The limit of 40 is met by stitching 3 pages per image (hence 34 images, not 40), so the 2nd image starts at page 4
+    assert attachments[1].path.endswith("page_004.jpg")
+    # Remove the temporary output directory the loader created
+    out_dir = os.path.dirname(attachments[0].path)
+    shutil.rmtree(out_dir)