Don't precompute placeholder replacements in raw HTML post-processor

pawamoy · pawamoy · commit 6113aad68a0a · 2025-02-21T16:33:41.000+01:00
Previously, the raw HTML post-processor would precompute all possible replacements for placeholders in a string, based on the HTML stash. It would then apply a regular expression substitution using these replacements. Finally, if the text changed, it would recurse, and do all that again. This was inefficient because the replacements were recomputed each time it recursed, and because only a few of them would be used anyway. This change moves the recursion into the regular expression substitution, so that: 1. the regular expression does minimal work on the text (rather than re-scanning text already scanned in previous frames); 2. more importantly, replacements aren't computed ahead of time anymore (let alone *several times*), and are only fetched from the HTML stash as placeholders are found in the text. The substitution function relies on the ordering of the regular expression groups: we make sure to match `<p>PLACEHOLDER</p>` first, before `PLACEHOLDER`. The presence of a wrapping `p` tag indicates whether or not to wrap the substitution result again (also depending on whether the substituted HTML is a block-level tag). Issue-1507: #1507
diff --git a/markdown/postprocessors.py b/markdown/postprocessors.py
@@ -73,37 +73,26 @@ class RawHtmlPostprocessor(Postprocessor):
 
     def run(self, text: str) -> str:
         """ Iterate over html stash and restore html. """
-        replacements = OrderedDict()
-        for i in range(self.md.htmlStash.html_counter):
-            html = self.stash_to_string(self.md.htmlStash.rawHtmlBlocks[i])
-            if self.isblocklevel(html):
-                replacements["<p>{}</p>".format(
-                    self.md.htmlStash.get_placeholder(i))] = html
-            replacements[self.md.htmlStash.get_placeholder(i)] = html
-
         def substitute_match(m: re.Match[str]) -> str:
-            key = m.group(0)
-
-            if key not in replacements:
-                if key[3:-4] in replacements:
-                    return f'<p>{ replacements[key[3:-4]] }</p>'
-                else:
-                    return key
-
-            return replacements[key]
-
-        if replacements:
+            if key := m.group(1):
+                wrapped = True
+            else:
+                key = m.group(2)
+                wrapped = False
+            if (key := int(key)) >= len(self.md.htmlStash.rawHtmlBlocks):
+                return m.group(0)
+            html = self.stash_to_string(self.md.htmlStash.rawHtmlBlocks[key])
+            if self.isblocklevel(html) or not wrapped:
+                return pattern.sub(substitute_match, html)
+            return pattern.sub(substitute_match, f"<p>{html}</p>")
+
+        if self.md.htmlStash.html_counter:
             base_placeholder = util.HTML_PLACEHOLDER % r'([0-9]+)'
             pattern = re.compile(f'<p>{ base_placeholder }</p>|{ base_placeholder }')
-            processed_text = pattern.sub(substitute_match, text)
+            return pattern.sub(substitute_match, text)
         else:
             return text
 
-        if processed_text == text:
-            return processed_text
-        else:
-            return self.run(processed_text)
-
     def isblocklevel(self, html: str) -> bool:
         """ Check is block of HTML is block-level. """
         m = self.BLOCK_LEVEL_REGEX.match(html)