Optimise getting content HTML by only parsing the necessary tags

2022-07-04 18:55:18 +01:00 · 2022-07-04 18:55:18 +01:00 · ccb481726c
commit ccb481726c
parent 66ccd52b15
2 changed files with 12 additions and 14 deletions
--- a/website/common/models.py
+++ b/website/common/models.py
@@ -75,7 +75,7 @@ class BaseContentMixin(models.Model):
    @cached_property
    def content_html(self) -> str:
-        return get_content_html(self.body)
+        return get_content_html(self.body_html)
    @cached_property
    def plain_text(self) -> str:
--- a/website/common/streamfield.py
+++ b/website/common/streamfield.py
@@ -1,7 +1,6 @@
 from itertools import product
 from typing import Iterable
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, SoupStrainer
 from django.utils import lorem_ipsum
 from django.utils.html import format_html_join
 from django.utils.text import slugify
@@ -78,20 +77,19 @@ def get_blocks() -> list[tuple[str, blocks.BaseBlock]]:
    ]
-def get_content_blocks(value: blocks.StreamValue) -> Iterable[blocks.BaseBlock]:
+def get_content_html(html: str) -> str:
    for block in value:
        if not isinstance(block.block_type, IGNORE_PLAINTEXT_BLOCKS):
            yield block
 def get_content_html(value: blocks.StreamValue) -> str:
    """
    Get the HTML of just original content (eg not embeds etc)
    """
-    html = ""
+    block_classes = [
-    for block in get_content_blocks(value):
+        f"block-{block_name}"
-        html += str(block)
+        for block_name, block in get_blocks()
-    return html
+        if not isinstance(block, IGNORE_PLAINTEXT_BLOCKS)
    ]
    return str(
        BeautifulSoup(html, "lxml", parse_only=SoupStrainer(class_=block_classes))
    )
 def add_heading_anchors(html: str) -> str: