From ccb481726cd28448442f159dee2ee0c2bd8a04a1 Mon Sep 17 00:00:00 2001
From: Jake Howard <git@theorangeone.net>
Date: Mon, 4 Jul 2022 18:55:18 +0100
Subject: [PATCH] Optimise getting content HTML by only parsing the necessary
 tags

---
 website/common/models.py      |  2 +-
 website/common/streamfield.py | 24 +++++++++++-------------
 2 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/website/common/models.py b/website/common/models.py
index 9c17039..a4ec578 100644
--- a/website/common/models.py
+++ b/website/common/models.py
@@ -75,7 +75,7 @@ class BaseContentMixin(models.Model):
 
     @cached_property
     def content_html(self) -> str:
-        return get_content_html(self.body)
+        return get_content_html(self.body_html)  # helper now takes an HTML string; body_html is presumably the rendered body - confirm
 
     @cached_property
     def plain_text(self) -> str:
diff --git a/website/common/streamfield.py b/website/common/streamfield.py
index 4b5ab2d..646ddd3 100644
--- a/website/common/streamfield.py
+++ b/website/common/streamfield.py
@@ -1,7 +1,6 @@
 from itertools import product
-from typing import Iterable
 
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, SoupStrainer
 from django.utils import lorem_ipsum
 from django.utils.html import format_html_join
 from django.utils.text import slugify
@@ -78,20 +77,19 @@ def get_blocks() -> list[tuple[str, blocks.BaseBlock]]:
     ]
 
 
-def get_content_blocks(value: blocks.StreamValue) -> Iterable[blocks.BaseBlock]:
-    for block in value:
-        if not isinstance(block.block_type, IGNORE_PLAINTEXT_BLOCKS):
-            yield block
-
-
-def get_content_html(value: blocks.StreamValue) -> str:
+def get_content_html(html: str) -> str:
     """
     Get the HTML of just original content (eg not embeds etc)
     """
-    html = ""
-    for block in get_content_blocks(value):
-        html += str(block)
-    return html
+    block_classes = [  # "block-<name>" strings; presumably the wrapper classes in the rendered stream HTML - confirm
+        f"block-{block_name}"
+        for block_name, block in get_blocks()
+        if not isinstance(block, IGNORE_PLAINTEXT_BLOCKS)  # leave out block types listed in IGNORE_PLAINTEXT_BLOCKS
+    ]
+
+    return str(  # serialise what the strainer kept: only elements carrying one of block_classes are parsed
+        BeautifulSoup(html, "lxml", parse_only=SoupStrainer(class_=block_classes))
+    )
 
 
 def add_heading_anchors(html: str) -> str: