Reduce complexity trying to save computation on rendering streamfield

This replaces more of the custom iteration with caching, which will end up faster anyway and fits more cleanly into the new structure.

Sadly it still renders the content twice, as `get_content_html` requires access to the blocks.
This commit is contained in:
Jake Howard 2022-07-03 23:10:57 +01:00
parent ebfb909c98
commit 4702afd5dd
Signed by: jake
GPG Key ID: 57AFB45680EDD477
4 changed files with 72 additions and 48 deletions

View File

@@ -5,3 +5,4 @@ whitenoise[brotli]==6.2.0
pygments==2.12.0
beautifulsoup4==4.9.3
lxml==4.9.0
more-itertools==8.13.0

View File

@@ -9,8 +9,10 @@ from wagtail.fields import StreamField
from wagtail.images import get_image_model_string
from wagtail.models import Page
from .streamfield import get_blocks, get_html, get_word_count, truncate_streamfield
from .utils import TocEntry, add_heading_anchors, get_table_of_contents
from website.common.utils import count_words
from .streamfield import add_heading_anchors, get_blocks, get_content_html
from .utils import TocEntry, extract_text, get_table_of_contents, truncate_string
class BasePage(Page):
@@ -50,8 +52,7 @@ class BaseContentMixin(models.Model):
@cached_property
def table_of_contents(self) -> list[TocEntry]:
    """Table of contents built from the cached content-only HTML."""
    # The pre-refactor body (joining get_html(self.body)) was left above the
    # new return as unreachable diff residue; only the content_html path remains.
    return get_table_of_contents(self.content_html)
@cached_property
def reading_time(self) -> int:
@@ -62,16 +63,24 @@ class BaseContentMixin(models.Model):
@cached_property
def word_count(self) -> int:
    """Number of words in the page's plain-text content."""
    # The removed get_word_count(self.body) line was left above the new return
    # as unreachable diff residue; the cached plain_text path is the live one.
    return count_words(self.plain_text)
@cached_property
def summary(self) -> str:
    """First 50 words of the plain-text content."""
    # The removed truncate_streamfield(self.body, 50) line was left above the
    # new return as unreachable diff residue; the plain_text path is the live one.
    return truncate_string(self.plain_text, 50)
@cached_property
def body_html(self) -> str:
    """Fully rendered body with self-link anchors injected into headings."""
    return add_heading_anchors(str(self.body))
@cached_property
def content_html(self) -> str:
    """HTML of just the original content blocks (embeds etc. excluded)."""
    return get_content_html(self.body)
@cached_property
def plain_text(self) -> str:
    """Plain-text rendering of the content-only HTML."""
    return extract_text(self.content_html)
class ContentPage(BasePage, BaseContentMixin): # type: ignore[misc]
subpage_types: list[Any] = []

View File

@@ -1,17 +1,17 @@
from typing import Iterator
from itertools import product
from typing import Iterable
from bs4 import BeautifulSoup
from django.utils import lorem_ipsum
from django.utils.html import format_html_join
from django.utils.text import smart_split
from django.utils.text import slugify
from wagtail import blocks
from wagtail.embeds.blocks import EmbedBlock
from wagtail.images.blocks import ImageChooserBlock
from website.common.utils import HEADER_TAGS
from website.contrib.code_block.blocks import CodeBlock
IGNORE_PLAINTEXT_BLOCKS = (blocks.RawHTMLBlock, EmbedBlock)
RICH_TEXT_FEATURES = [
"h1",
"h2",
@@ -64,6 +64,10 @@ class ImageCaptionBlock(blocks.StructBlock):
template = "common/blocks/image-caption.html"
# Blocks whose output is not original content, so they are skipped when
# extracting plain text (and, with LoremBlock, when anchoring headings).
IGNORE_PLAINTEXT_BLOCKS = (blocks.RawHTMLBlock, EmbedBlock, ImageCaptionBlock)
IGNORE_HEADING_BLOCKS = (*IGNORE_PLAINTEXT_BLOCKS, LoremBlock)
def get_blocks() -> list[tuple[str, blocks.BaseBlock]]:
return [
("embed", EmbedBlock()),
@@ -75,34 +79,36 @@ def get_blocks() -> list[tuple[str, blocks.BaseBlock]]:
]
def extract_text(html: str) -> str:
    """Get the plain text of some HTML, joining text nodes with spaces."""
    # NOTE(review): `text=True` is a deprecated alias of `string=True` in
    # BeautifulSoup >= 4.4 — confirm the installed bs4 version before changing.
    return " ".join(BeautifulSoup(html, "lxml").find_all(text=True))
def get_html(value: blocks.StreamValue) -> Iterator[str]:
def get_content_blocks(value: blocks.StreamValue) -> Iterable[blocks.BaseBlock]:
    """Yield the stream's blocks that hold original content, skipping embed-style blocks."""
    # The removed guard-and-continue body was left interleaved with the added
    # lines as diff residue; this is the coherent post-commit generator.
    for block in value:
        # NOTE(review): on Wagtail stream children, `block_type` is normally the
        # block *name* (a str), so this isinstance test may never match —
        # `block.block` looks like the intended target. Confirm before changing.
        if not isinstance(block.block_type, IGNORE_PLAINTEXT_BLOCKS):
            yield block
def get_plain_text(value: blocks.StreamValue) -> Iterator[str]:
for html_chunk in get_html(value):
yield extract_text(html_chunk)
def get_content_html(value: blocks.StreamValue) -> str:
    """
    Get the HTML of just the original content (eg not embeds etc).
    """
    # str.join builds the result in one pass instead of repeated `+=`
    # concatenation, which is quadratic in the worst case.
    return "".join(str(block) for block in get_content_blocks(value))
def add_heading_anchors(html: str) -> str:
    """
    Insert a "#" self-link anchor at the start of every heading that belongs
    to an anchor-eligible content block.
    """
    # This span of the diff interleaves the removed truncate_streamfield /
    # get_word_count helpers with the two halves of the added function; this is
    # the coherent post-commit add_heading_anchors.
    # One CSS selector per (header tag, eligible block name) pair.
    targets: list[str] = [
        f".block-{block_name} {header_tag}"
        for header_tag, block_name in product(
            HEADER_TAGS,
            [b[0] for b in get_blocks() if not isinstance(b[1], IGNORE_HEADING_BLOCKS)],
        )
    ]
    soup = BeautifulSoup(html, "lxml")
    for tag in soup.select(", ".join(targets)):
        slug = slugify(tag.text)
        # NOTE(review): repeated heading text yields duplicate ids — confirm acceptable.
        anchor = soup.new_tag("a", href="#" + slug, id=slug)
        anchor.string = "#"
        anchor.attrs["class"] = "heading-anchor"
        tag.insert(0, anchor)
    return str(soup)

View File

@@ -1,11 +1,12 @@
from dataclasses import dataclass
from itertools import pairwise
from itertools import islice, pairwise
from typing import Type
from bs4 import BeautifulSoup
from django.conf import settings
from django.http.request import HttpRequest
from django.utils.text import slugify
from django.utils.text import slugify, smart_split
from more_itertools import ilen
from wagtail.models import Page
from wagtail.models import get_page_models as get_wagtail_page_models
@@ -56,17 +57,6 @@ def get_table_of_contents(html: str) -> list[TocEntry]:
return root.children
def add_heading_anchors(html: str) -> str:
    """Prefix every header tag in *html* with a "#" self-link anchor."""
    soup = BeautifulSoup(html, "lxml")
    for tag in soup.find_all(HEADER_TAGS):
        slug = slugify(tag.text)
        # The anchor links to itself: id and href share the heading's slug.
        anchor = soup.new_tag("a", href="#" + slug, id=slug)
        anchor.string = "#"
        anchor.attrs["class"] = "heading-anchor"
        tag.insert(0, anchor)
    return str(soup)
def get_page_models() -> list[Type[Page]]:
page_models = get_wagtail_page_models().copy()
page_models.remove(Page)
@@ -75,3 +65,21 @@ def get_page_models() -> list[Type[Page]]:
def show_toolbar_callback(request: HttpRequest) -> bool:
    """Show the debug toolbar whenever DEBUG is on (the request is unused)."""
    return settings.DEBUG
def count_words(text: str) -> int:
    """
    Count the words in *text* by streaming over the split, so the full word
    list is never materialized in memory.
    """
    return sum(1 for _ in smart_split(text))
def extract_text(html: str) -> str:
    """
    Get the plain text of some HTML, joining its text nodes with spaces.
    """
    # `string=True` replaces the `text=True` alias deprecated in bs4 4.4;
    # both select every NavigableString in the tree.
    return " ".join(BeautifulSoup(html, "lxml").find_all(string=True))
def truncate_string(text: str, words: int) -> str:
    """Return the first *words* words of *text*, joined by single spaces."""
    return " ".join(islice(smart_split(text), words))