Skip to content

Commit dc79f43

Browse files
Authored Apr 30, 2025
Search: truncate contents before indexing (#12146)
Our search index is growing fast, mostly because we now support all types of inputs, not just Sphinx and MkDocs. This means that even malformed content may be indexed, which can result in invalid or excessively long content being added to the index.
1 parent 16183e9 commit dc79f43

File tree

2 files changed

+54
-0
lines changed

2 files changed

+54
-0
lines changed
 

‎readthedocs/search/parsers.py

+16
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,12 @@
1616
class GenericParser:
1717
# Limit that matches the ``index.mapping.nested_objects.limit`` ES setting.
1818
max_inner_documents = 10000
19+
# Limit the size of the contents to be indexed,
20+
# to avoid filling the index with too much data.
21+
# The limit may be exceeded if the content is too large,
22+
# or if the content is malformed.
23+
# A raw approximation of bytes based on the number of characters (~1.5 MB).
24+
max_content_length = int(1.5 * 1024 * 1024)
1925

2026
# Block level elements have an implicit line break before and after them.
2127
# List taken from: https://www.w3schools.com/htmL/html_blocks.asp.
@@ -148,6 +154,16 @@ def _parse_content(self, content):
148154
content = content.strip().split()
149155
content = (text.strip() for text in content)
150156
content = " ".join(text for text in content if text)
157+
if len(content) > self.max_content_length:
158+
log.info(
159+
"Content too long, truncating.",
160+
project_slug=self.project.slug,
161+
version_slug=self.version.slug,
162+
content_length=len(content),
163+
limit=self.max_content_length,
164+
)
165+
content = content[: self.max_content_length]
166+
151167
return content
152168

153169
def _parse_sections(self, title, body):

‎readthedocs/search/tests/test_parsers.py

+38
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from readthedocs.builds.storage import BuildMediaFileSystemStorage
1010
from readthedocs.projects.constants import GENERIC, MKDOCS, SPHINX
1111
from readthedocs.projects.models import HTMLFile, Project
12+
from readthedocs.search.parsers import GenericParser
1213

1314
data_path = Path(__file__).parent.resolve() / "data"
1415

@@ -344,3 +345,40 @@ def test_generic_pelican_default_theme(self, storage_open, storage_exists):
344345
parsed_json = [file.processed_json]
345346
expected_json = json.load(open(data_path / "pelican/out/default.json"))
346347
assert parsed_json == expected_json
348+
349+
@mock.patch.object(BuildMediaFileSystemStorage, "exists")
350+
@mock.patch.object(BuildMediaFileSystemStorage, "open")
351+
def test_truncate_content(self, storage_open, storage_exists):
352+
html_content = """
353+
<!DOCTYPE html>
354+
<html>
355+
<head>
356+
<meta charset="utf-8">
357+
<title>Title of the page</title>
358+
</head>
359+
<body>
360+
"""
361+
# More than ~1.5 MB of content
362+
html_content += "A" * (GenericParser.max_content_length + 100) + "!" + "B" * 1000
363+
html_content += "</body></html>"
364+
storage_open.side_effect = self._mock_open(html_content)
365+
storage_exists.return_value = True
366+
367+
self.version.save()
368+
369+
page_file = get(
370+
HTMLFile,
371+
project=self.project,
372+
version=self.version,
373+
path="page.html",
374+
)
375+
376+
parsed_json = page_file.processed_json
377+
assert parsed_json["path"] == "page.html"
378+
assert parsed_json["title"] == "Title of the page"
379+
assert len(parsed_json["sections"]) == 1
380+
section = parsed_json["sections"][0]
381+
assert section["title"] == "Title of the page"
382+
assert len(section["content"]) <= GenericParser.max_content_length
383+
assert section["content"].startswith("A")
384+
assert not section["content"].endswith("B")

0 commit comments

Comments
 (0)
Please sign in to comment.