Skip to content

Adding image concatenation for large PDFs #4

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ pdf-to-images:<path>?dpi=N&format=jpg|png&quality=Q
- `dpi=N`: (optional) Dots per inch to use when rendering the PDF pages, which affects the resolution of the output images. Defaults to `300` if omitted.
- `format=jpg|png`: (optional) Image format to use for the output. Can be either `jpg` (default) or `png`.
- `quality=Q`: (optional) JPEG quality factor between 1 and 100. Only applies when using JPG format. Defaults to `30` if omitted. Higher values produce better quality but larger file sizes.
- `image_count_constraint=N`: (optional) Maximum number of images to produce. If the PDF has more pages than this, consecutive pages are stitched together vertically into a single image so that the total number of images stays within the limit. This is useful for large documents with many pages. Defaults to `-1`, which means no concatenation.

### More examples

Expand Down Expand Up @@ -68,6 +69,11 @@ Combine multiple parameters:
llm -f 'pdf-to-images:document.pdf?dpi=450&format=jpg&quality=75' 'OCR'
```

Convert a large PDF document:
```bash
llm -f 'pdf-to-images:document.pdf?dpi=300&image_count_constraint=50' 'summarize'
```

## Development

To set up this plugin locally, first checkout the code. Then create a new virtual environment:
Expand Down
91 changes: 75 additions & 16 deletions llm_pdf_to_images.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import os
import tempfile
import io
from PIL import Image
import fitz
from urllib.parse import urlparse, parse_qs
import llm
Expand All @@ -9,17 +11,18 @@
def register_fragment_loaders(register):
"""
Register the "pdf-to-images" fragment loader.
Usage: pdf-to-images:/path/to/file.pdf?dpi=300&format=jpg&quality=80
Usage: pdf-to-images:/path/to/file.pdf?dpi=300&format=jpg&quality=80&image_count_constraint=50
"""
register("pdf-to-images", pdf_to_images_loader)


def pdf_to_images_loader(argument: str):
"""
Fragment loader "pdf-to-images:<path>?dpi=N&format=jpg|png&quality=Q"
Fragment loader "pdf-to-images:<path>?dpi=N&format=jpg|png&quality=Q&image_count_constraint=P"
- dpi: render resolution (dots per inch), default 300
- format: "jpg" (default) or "png"
- quality: JPEG quality 1–100, default 30
- image_count_constraint: Max number of images to create, concatenate pages where needed, default -1 (no concatenation)
"""
parts = urlparse(argument)
pdf_path = parts.path
Expand All @@ -29,6 +32,7 @@ def pdf_to_images_loader(argument: str):
dpi = int(params.get("dpi", ["300"])[0])
img_format = params.get("format", ["jpg"])[0].lower()
quality = int(params.get("quality", ["30"])[0])
image_count_constraint = int(params.get("image_count_constraint", ["-1"])[0])

if not os.path.exists(pdf_path):
raise ValueError(f"PDF file not found: {pdf_path}")
Expand All @@ -44,23 +48,78 @@ def pdf_to_images_loader(argument: str):
out_dir = tempfile.mkdtemp(prefix="llm_pdf_to_images_")

attachments = []
for page_number, page in enumerate(doc, start=1):
pix = page.get_pixmap(matrix=matrix)

if img_format in ("jpg", "jpeg"):
image_bytes = pix.tobytes(output="jpg", jpg_quality=quality)
ext = "jpg"
elif img_format == "png":
image_bytes = pix.tobytes(output="png")
ext = "png"

num_pages = len(doc)

if image_count_constraint > 0 and num_pages > image_count_constraint:
# Round up to ensure we don't exceed the threshold
compression_ratio = -(-num_pages // image_count_constraint)
else:
compression_ratio = 1

# Create an in-memory list to hold images for possible stitching
page_images = []

# iterate over pages
for i in range(0, num_pages, compression_ratio):
end_idx = min(i + compression_ratio, num_pages)

# Process each page in the current group
for j in range(i, end_idx):
page = doc[j]
# Use get_pixmap with correct keyword arguments
pix = page.get_pixmap(matrix=matrix, alpha=False) # type: ignore

if img_format == "jpg":
# Encode the pixmap as JPEG bytes in memory using the requested quality
image_bytes = pix.tobytes(output="jpeg", jpg_quality=quality)
ext = "jpg"
elif img_format == "png":
image_bytes = pix.tobytes(output="png")
ext = "png"
else:
raise ValueError(f"Unsupported image format: {img_format}")

# Convert to PIL Image for potential stitching
img = Image.open(io.BytesIO(image_bytes))
page_images.append(img)

# If we're compressing pages and have multiple images, stitch them
if compression_ratio > 1 and len(page_images) > 1:
# Stitch images vertically
total_width = max(img.width for img in page_images)
total_height = sum(img.height for img in page_images)

stitched_img = Image.new("RGB", (total_width, total_height))

y_offset = 0
for img in page_images:
stitched_img.paste(img, (0, y_offset))
y_offset += img.height

# Save the stitched image
out_name = f"page_{i + 1:03d}.{ext}"
out_path = os.path.join(out_dir, out_name)

if img_format == "jpg":
stitched_img.save(out_path, format="JPEG", quality=quality)
else:
stitched_img.save(out_path, format="PNG")
else:
raise ValueError(f"Unsupported image format: {img_format}")
# Just use the single image (no stitching needed)
out_name = f"page_{i + 1:03d}.{ext}"
out_path = os.path.join(out_dir, out_name)

out_name = f"page_{page_number:03d}.{ext}"
out_path = os.path.join(out_dir, out_name)
with open(out_path, "wb") as img_file:
img_file.write(image_bytes)
# Save the first image
if page_images:
if img_format == "jpg":
page_images[0].save(out_path, format="JPEG", quality=quality)
else:
page_images[0].save(out_path, format="PNG")

attachments.append(llm.Attachment(path=out_path))

# Clear the images list for the next group
page_images = []

return attachments
7 changes: 2 additions & 5 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,11 @@ name = "llm-pdf-to-images"
version = "0.1"
description = "LLM fragment plugin to load a PDF as a sequence of images"
readme = "README.md"
authors = [{name = "Simon Willison"}]
authors = [{ name = "Simon Willison" }]
license = "Apache-2.0"
classifiers = []
requires-python = ">=3.9"
dependencies = [
"llm",
"PyMuPDF"
]
dependencies = ["llm", "PyMuPDF", "Pillow"]

[build-system]
requires = ["setuptools"]
Expand Down
Binary file added tests/hundred-pages.pdf
Binary file not shown.
28 changes: 28 additions & 0 deletions tests/test_pdf_to_images.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,31 @@ def test_pdf_to_images():
# Now delete them
out_dir = os.path.dirname(attachments[0].path)
shutil.rmtree(out_dir)


def test_large_pdf_to_images_50():
    """Limit the hundred-pages.pdf fixture to 50 images and check the grouping.

    With image_count_constraint=50 the loader is expected to return exactly
    50 attachments, each named after the first page of the group it covers.
    """
    path = os.path.join(os.path.dirname(__file__), "hundred-pages.pdf")
    attachments = pdf_to_images_loader(path + "?image_count_constraint=50")
    try:
        assert isinstance(attachments, list)
        assert len(attachments) == 50
        assert all(isinstance(attachment, llm.Attachment) for attachment in attachments)
        assert attachments[0].path.endswith("page_001.jpg")
        # The 2nd image should start at page 3, as the first image covers pages 1-2
        assert attachments[1].path.endswith("page_003.jpg")
    finally:
        # Clean up in `finally` so a failing assertion does not leak the
        # temporary output directory created by the loader.
        if isinstance(attachments, list) and attachments:
            shutil.rmtree(os.path.dirname(attachments[0].path))


def test_large_pdf_to_images_40():
    """Limit the hundred-pages.pdf fixture to 40 images and check the grouping.

    With image_count_constraint=40 the loader is expected to group pages in
    threes, which yields 34 attachments (fewer than the limit), each named
    after the first page of the group it covers.
    """
    path = os.path.join(os.path.dirname(__file__), "hundred-pages.pdf")
    attachments = pdf_to_images_loader(path + "?image_count_constraint=40")
    try:
        assert isinstance(attachments, list)
        assert len(attachments) == 34
        assert all(isinstance(attachment, llm.Attachment) for attachment in attachments)
        assert attachments[0].path.endswith("page_001.jpg")
        # The 2nd image should start at page 4, as the first image covers pages 1-3
        assert attachments[1].path.endswith("page_004.jpg")
    finally:
        # Clean up in `finally` so a failing assertion does not leak the
        # temporary output directory created by the loader.
        if isinstance(attachments, list) and attachments:
            shutil.rmtree(os.path.dirname(attachments[0].path))