Skip to content

Commit dc79f43

Browse files
Authored Apr 30, 2025
Search: truncate contents before indexing (#12146)
Our search index is growing fast, mostly because we now support all types of inputs, not just Sphinx and MkDocs. This means that even malformed content may be indexed, which can result in invalid or excessively long content being added to the index.
1 parent 16183e9 commit dc79f43

File tree

2 files changed

+54
-0
lines changed

2 files changed

+54
-0
lines changed
 

‎readthedocs/search/parsers.py

+16
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,12 @@
1616
class GenericParser:
1717
# Limit that matches the ``index.mapping.nested_objects.limit`` ES setting.
1818
max_inner_documents = 10000
19+
# Limit the size of the contents to be indexed,
20+
# to avoid filling the index with too much data.
21+
# The limit may be exceeded if the content is too large,
22+
# or if the content is malformed.
23+
# A raw approximation of bytes based on the number of characters (~1.5 MB).
24+
max_content_length = int(1.5 * 1024 * 1024)
1925

2026
# Block level elements have an implicit line break before and after them.
2127
# List taken from: https://www.w3schools.com/htmL/html_blocks.asp.
@@ -148,6 +154,16 @@ def _parse_content(self, content):
148154
content = content.strip().split()
149155
content = (text.strip() for text in content)
150156
content = " ".join(text for text in content if text)
157+
if len(content) > self.max_content_length:
158+
log.info(
159+
"Content too long, truncating.",
160+
project_slug=self.project.slug,
161+
version_slug=self.version.slug,
162+
content_length=len(content),
163+
limit=self.max_content_length,
164+
)
165+
content = content[: self.max_content_length]
166+
151167
return content
152168

153169
def _parse_sections(self, title, body):

‎readthedocs/search/tests/test_parsers.py

+38
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from readthedocs.builds.storage import BuildMediaFileSystemStorage
1010
from readthedocs.projects.constants import GENERIC, MKDOCS, SPHINX
1111
from readthedocs.projects.models import HTMLFile, Project
12+
from readthedocs.search.parsers import GenericParser
1213

1314
data_path = Path(__file__).parent.resolve() / "data"
1415

@@ -344,3 +345,40 @@ def test_generic_pelican_default_theme(self, storage_open, storage_exists):
344345
parsed_json = [file.processed_json]
345346
expected_json = json.load(open(data_path / "pelican/out/default.json"))
346347
assert parsed_json == expected_json
348+
349+
@mock.patch.object(BuildMediaFileSystemStorage, "exists")
350+
@mock.patch.object(BuildMediaFileSystemStorage, "open")
351+
def test_truncate_content(self, storage_open, storage_exists):
352+
html_content = """
353+
<!DOCTYPE html>
354+
<html>
355+
<head>
356+
<meta charset="utf-8">
357+
<title>Title of the page</title>
358+
</head>
359+
<body>
360+
"""
361+
# More than ~1.5 MB of content
362+
html_content += "A" * (GenericParser.max_content_length + 100) + "!" + "B" * 1000
363+
html_content += "</body></html>"
364+
storage_open.side_effect = self._mock_open(html_content)
365+
storage_exists.return_value = True
366+
367+
self.version.save()
368+
369+
page_file = get(
370+
HTMLFile,
371+
project=self.project,
372+
version=self.version,
373+
path="page.html",
374+
)
375+
376+
parsed_json = page_file.processed_json
377+
assert parsed_json["path"] == "page.html"
378+
assert parsed_json["title"] == "Title of the page"
379+
assert len(parsed_json["sections"]) == 1
380+
section = parsed_json["sections"][0]
381+
assert section["title"] == "Title of the page"
382+
assert len(section["content"]) <= GenericParser.max_content_length
383+
assert section["content"].startswith("A")
384+
assert not section["content"].endswith("B")

0 commit comments

Comments
 (0)
Please sign in to comment.