Search: truncate contents before indexing #12146

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account


Merged (1 commit) on Apr 30, 2025

16 changes: 16 additions & 0 deletions readthedocs/search/parsers.py
@@ -16,6 +16,12 @@
class GenericParser:
    # Limit that matches the ``index.mapping.nested_objects.limit`` ES setting.
    max_inner_documents = 10000
    # Limit the size of the contents to be indexed,
    # to avoid filling the index with too much data.
    # The limit is applied to the number of characters, as a raw
    # approximation of bytes (~1.5 MB); the actual byte size may
    # exceed it for content with multi-byte characters.
    max_content_length = int(1.5 * 1024 * 1024)

    # Block level elements have an implicit line break before and after them.
    # List taken from: https://www.w3schools.com/htmL/html_blocks.asp.
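
A note on the arithmetic in the comment above: the limit counts characters, while index size is measured in bytes, so the two only coincide for pure-ASCII content. A tiny illustration (assuming UTF-8 encoding):

>>> int(1.5 * 1024 * 1024)          # the character limit
1572864
>>> s = "café"                      # 4 characters...
>>> len(s), len(s.encode("utf-8"))  # ...but 5 bytes in UTF-8
(4, 5)
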
@@ -148,6 +154,16 @@ def _parse_content(self, content):
        content = content.strip().split()
        content = (text.strip() for text in content)
        content = " ".join(text for text in content if text)
        if len(content) > self.max_content_length:
            log.info(
                "Content too long, truncating.",
                project_slug=self.project.slug,
                version_slug=self.version.slug,
                content_length=len(content),
                limit=self.max_content_length,
            )
            content = content[: self.max_content_length]

        return content

    def _parse_sections(self, title, body):
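
For reference, a minimal standalone sketch of the normalization-and-truncation step added above. This is a simplification, not the real parser: the actual code is a method on GenericParser and logs through the project's structured logger with project/version slugs, while this sketch uses the stdlib logger and a hypothetical parse_content function.

import logging

log = logging.getLogger(__name__)

# Same value as GenericParser.max_content_length: a character count
# used as a rough stand-in for ~1.5 MB.
MAX_CONTENT_LENGTH = int(1.5 * 1024 * 1024)  # 1,572,864 characters


def parse_content(content, max_content_length=MAX_CONTENT_LENGTH):
    """Collapse whitespace runs to single spaces, then truncate."""
    # str.split() with no arguments strips the string and drops
    # empty tokens, mirroring the chained generators above.
    content = " ".join(content.split())
    if len(content) > max_content_length:
        log.info(
            "Content too long, truncating. length=%d limit=%d",
            len(content),
            max_content_length,
        )
        content = content[:max_content_length]
    return content

Truncating after whitespace normalization means the limit applies to the text that is actually indexed, not to the raw HTML size.
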
38 changes: 38 additions & 0 deletions readthedocs/search/tests/test_parsers.py
@@ -9,6 +9,7 @@
from readthedocs.builds.storage import BuildMediaFileSystemStorage
from readthedocs.projects.constants import GENERIC, MKDOCS, SPHINX
from readthedocs.projects.models import HTMLFile, Project
from readthedocs.search.parsers import GenericParser

data_path = Path(__file__).parent.resolve() / "data"

@@ -344,3 +345,40 @@ def test_generic_pelican_default_theme(self, storage_open, storage_exists):
        parsed_json = [file.processed_json]
        expected_json = json.load(open(data_path / "pelican/out/default.json"))
        assert parsed_json == expected_json

@mock.patch.object(BuildMediaFileSystemStorage, "exists")
@mock.patch.object(BuildMediaFileSystemStorage, "open")
def test_truncate_content(self, storage_open, storage_exists):
html_content = """
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Title of the page</title>
</head>
<body>
"""
# More than ~1.5 MB of content
html_content += "A" * (GenericParser.max_content_length + 100) + "!" + "B" * 1000
html_content += "</body></html>"
storage_open.side_effect = self._mock_open(html_content)
storage_exists.return_value = True

self.version.save()

page_file = get(
HTMLFile,
project=self.project,
version=self.version,
path="page.html",
)

parsed_json = page_file.processed_json
assert parsed_json["path"] == "page.html"
assert parsed_json["title"] == "Title of the page"
assert len(parsed_json["sections"]) == 1
section = parsed_json["sections"][0]
assert section["title"] == "Title of the page"
assert len(section["content"]) <= GenericParser.max_content_length
assert section["content"].startswith("A")
assert not section["content"].endswith("B")