From 4702afd5dd198d12dedc3ce415442a3fe8e27139 Mon Sep 17 00:00:00 2001
From: Jake Howard
Date: Sun, 3 Jul 2022 23:10:57 +0100
Subject: [PATCH] Reduce complexity trying to save computation on rendering
 streamfield

This replaces more custom iteration with caching, which will end up faster
anyway, and is more drop-in with the new structure. Sadly it still renders
the content twice, as `get_content_html` requires access to the blocks.
---
 requirements.txt              |  1 +
 website/common/models.py      | 21 ++++++++----
 website/common/streamfield.py | 64 +++++++++++++++++++----------------
 website/common/utils.py       | 34 ++++++++++++-------
 4 files changed, 72 insertions(+), 48 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index ae9475f..4b7df25 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,3 +5,4 @@ whitenoise[brotli]==6.2.0
 pygments==2.12.0
 beautifulsoup4==4.9.3
 lxml==4.9.0
+more-itertools==8.13.0
diff --git a/website/common/models.py b/website/common/models.py
index 965005d..9c17039 100644
--- a/website/common/models.py
+++ b/website/common/models.py
@@ -9,8 +9,10 @@ from wagtail.fields import StreamField
 from wagtail.images import get_image_model_string
 from wagtail.models import Page

-from .streamfield import get_blocks, get_html, get_word_count, truncate_streamfield
-from .utils import TocEntry, add_heading_anchors, get_table_of_contents
+from website.common.utils import count_words
+
+from .streamfield import add_heading_anchors, get_blocks, get_content_html
+from .utils import TocEntry, extract_text, get_table_of_contents, truncate_string


 class BasePage(Page):
@@ -50,8 +52,7 @@ class BaseContentMixin(models.Model):

     @cached_property
     def table_of_contents(self) -> list[TocEntry]:
-        html = "".join(get_html(self.body))
-        return get_table_of_contents(html)
+        return get_table_of_contents(self.content_html)

     @cached_property
     def reading_time(self) -> int:
@@ -62,16 +63,24 @@

     @cached_property
     def word_count(self) -> int:
-        return get_word_count(self.body)
+        return count_words(self.plain_text)

     @cached_property
     def summary(self) -> str:
-        return truncate_streamfield(self.body, 50)
+        return truncate_string(self.plain_text, 50)

     @cached_property
     def body_html(self) -> str:
         return add_heading_anchors(str(self.body))

+    @cached_property
+    def content_html(self) -> str:
+        return get_content_html(self.body)
+
+    @cached_property
+    def plain_text(self) -> str:
+        return extract_text(self.content_html)
+

 class ContentPage(BasePage, BaseContentMixin):  # type: ignore[misc]
     subpage_types: list[Any] = []
diff --git a/website/common/streamfield.py b/website/common/streamfield.py
index aa00bda..a99aa28 100644
--- a/website/common/streamfield.py
+++ b/website/common/streamfield.py
@@ -1,17 +1,17 @@
-from typing import Iterator
+from itertools import product
+from typing import Iterable

 from bs4 import BeautifulSoup
 from django.utils import lorem_ipsum
 from django.utils.html import format_html_join
-from django.utils.text import smart_split
+from django.utils.text import slugify
 from wagtail import blocks
 from wagtail.embeds.blocks import EmbedBlock
 from wagtail.images.blocks import ImageChooserBlock

+from website.common.utils import HEADER_TAGS
 from website.contrib.code_block.blocks import CodeBlock

-IGNORE_PLAINTEXT_BLOCKS = (blocks.RawHTMLBlock, EmbedBlock)
-
 RICH_TEXT_FEATURES = [
     "h1",
     "h2",
@@ -64,6 +64,10 @@ class ImageCaptionBlock(blocks.StructBlock):
     template = "common/blocks/image-caption.html"


+IGNORE_PLAINTEXT_BLOCKS = (blocks.RawHTMLBlock, EmbedBlock, ImageCaptionBlock)
+IGNORE_HEADING_BLOCKS = (*IGNORE_PLAINTEXT_BLOCKS, LoremBlock)
+
+
 def get_blocks() -> list[tuple[str, blocks.BaseBlock]]:
     return [
         ("embed", EmbedBlock()),
@@ -75,34 +79,36 @@ def get_blocks() -> list[tuple[str, blocks.BaseBlock]]:
     ]


-def extract_text(html: str) -> str:
-    return " ".join(BeautifulSoup(html, "lxml").find_all(text=True))
-
-
-def get_html(value: blocks.StreamValue) -> Iterator[str]:
+def get_content_blocks(value: blocks.StreamValue) -> Iterable[blocks.BaseBlock]:
     for block in value:
-        if isinstance(block.block_type, IGNORE_PLAINTEXT_BLOCKS):
-            continue
-        yield str(block)
+        if not isinstance(block.block_type, IGNORE_PLAINTEXT_BLOCKS):
+            yield block


-def get_plain_text(value: blocks.StreamValue) -> Iterator[str]:
-    for html_chunk in get_html(value):
-        yield extract_text(html_chunk)
+def get_content_html(value: blocks.StreamValue) -> str:
+    """
+    Get the HTML of just original content (eg not embeds etc)
+    """
+    html = ""
+    for block in get_content_blocks(value):
+        html += str(block)
+    return html


-def truncate_streamfield(value: blocks.StreamValue, words: int) -> str:
-    collected_words: list[str] = []
-    for block_text in get_plain_text(value):
-        collected_words.extend(smart_split(block_text))
-        if len(collected_words) >= words:
-            break
+def add_heading_anchors(html: str) -> str:
+    targets: list[str] = [
+        f".block-{block_name} {header_tag}"
+        for header_tag, block_name in product(
+            HEADER_TAGS,
+            [b[0] for b in get_blocks() if not isinstance(b[1], IGNORE_HEADING_BLOCKS)],
+        )
+    ]

-    return " ".join(collected_words[:words])
-
-
-def get_word_count(value: blocks.StreamValue) -> int:
-    count = 0
-    for chunk in get_plain_text(value):
-        count += len(list(smart_split(chunk)))
-    return count
+    soup = BeautifulSoup(html, "lxml")
+    for tag in soup.select(", ".join(targets)):
+        slug = slugify(tag.text)
+        anchor = soup.new_tag("a", href="#" + slug, id=slug)
+        anchor.string = "#"
+        anchor.attrs["class"] = "heading-anchor"
+        tag.insert(0, anchor)
+    return str(soup)
diff --git a/website/common/utils.py b/website/common/utils.py
index c57586f..1d57fa0 100644
--- a/website/common/utils.py
+++ b/website/common/utils.py
@@ -1,11 +1,12 @@
 from dataclasses import dataclass
-from itertools import pairwise
+from itertools import islice, pairwise
 from typing import Type

 from bs4 import BeautifulSoup
 from django.conf import settings
 from django.http.request import HttpRequest
-from django.utils.text import slugify
+from django.utils.text import slugify, smart_split
+from more_itertools import ilen
 from wagtail.models import Page
 from wagtail.models import get_page_models as get_wagtail_page_models

@@ -56,17 +57,6 @@ def get_table_of_contents(html: str) -> list[TocEntry]:
     return root.children


-def add_heading_anchors(html: str) -> str:
-    soup = BeautifulSoup(html, "lxml")
-    for tag in soup.find_all(HEADER_TAGS):
-        slug = slugify(tag.text)
-        anchor = soup.new_tag("a", href="#" + slug, id=slug)
-        anchor.string = "#"
-        anchor.attrs["class"] = "heading-anchor"
-        tag.insert(0, anchor)
-    return str(soup)
-
-
 def get_page_models() -> list[Type[Page]]:
     page_models = get_wagtail_page_models().copy()
     page_models.remove(Page)
@@ -75,3 +65,21 @@

 def show_toolbar_callback(request: HttpRequest) -> bool:
     return settings.DEBUG
+
+
+def count_words(text: str) -> int:
+    """
+    Count the number of words in the text, without duplicating the item in memory
+    """
+    return ilen(smart_split(text))
+
+
+def extract_text(html: str) -> str:
+    """
+    Get the plain text of some HTML.
+    """
+    return " ".join(BeautifulSoup(html, "lxml").find_all(text=True))
+
+
+def truncate_string(text: str, words: int) -> str:
+    return " ".join(islice(smart_split(text), words))
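
For reviewers, a minimal standalone sketch of how the new helpers compose once this patch is applied. The `Article` class, the sample HTML and the use of `functools.cached_property` (rather than Django's `cached_property` used on the models) are illustrative stand-ins for `BaseContentMixin`, not code from this commit.

    from functools import cached_property
    from itertools import islice

    from bs4 import BeautifulSoup
    from django.utils.text import smart_split
    from more_itertools import ilen


    def extract_text(html: str) -> str:
        # Flatten the rendered block HTML into plain text, as in website/common/utils.py.
        return " ".join(BeautifulSoup(html, "lxml").find_all(text=True))


    def count_words(text: str) -> int:
        # ilen() consumes the smart_split() generator without building a list in memory.
        return ilen(smart_split(text))


    def truncate_string(text: str, words: int) -> str:
        # islice() stops after `words` tokens, so the text is never fully split up front.
        return " ".join(islice(smart_split(text), words))


    class Article:
        # Illustrative stand-in for BaseContentMixin: each property is computed once
        # per instance and then cached, mirroring the @cached_property chain in the patch.
        def __init__(self, content_html: str) -> None:
            self.content_html = content_html

        @cached_property
        def plain_text(self) -> str:
            return extract_text(self.content_html)

        @cached_property
        def word_count(self) -> int:
            return count_words(self.plain_text)

        @cached_property
        def summary(self) -> str:
            return truncate_string(self.plain_text, 50)


    article = Article("<h2>Heading</h2><p>Some body copy with a handful of words.</p>")
    print(article.word_count)  # extract_text runs once; word_count and summary reuse plain_text
    print(article.summary)

Each property caches after its first access, so `plain_text` is derived from `content_html` once and both `word_count` and `summary` reuse it; the double render the commit message mentions is separate, coming from `body_html` rendering `str(self.body)` while `get_content_html` renders the non-ignored blocks again.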