Reduce the complexity introduced by trying to save computation when rendering the streamfield

This replaces more of the custom iteration with caching, which should end up faster anyway and drops into the new structure more easily. Sadly, it still renders the content twice, because `get_content_html` requires access to the individual blocks.
2022-07-03 23:10:57 +01:00 · 2022-07-03 23:10:57 +01:00 · 4702afd5dd
commit 4702afd5dd
parent ebfb909c98
4 changed files with 72 additions and 48 deletions
--- a/requirements.txt
+++ b/requirements.txt
@ -5,3 +5,4 @@ whitenoise[brotli]==6.2.0
 pygments==2.12.0
 beautifulsoup4==4.9.3
 lxml==4.9.0
 more-itertools==8.13.0
--- a/website/common/models.py
+++ b/website/common/models.py
@ -9,8 +9,10 @@ from wagtail.fields import StreamField
 from wagtail.images import get_image_model_string
 from wagtail.models import Page
-from .streamfield import get_blocks, get_html, get_word_count, truncate_streamfield
+from website.common.utils import count_words
-from .utils import TocEntry, add_heading_anchors, get_table_of_contents
+
 from .streamfield import add_heading_anchors, get_blocks, get_content_html
 from .utils import TocEntry, extract_text, get_table_of_contents, truncate_string
 class BasePage(Page):
@ -50,8 +52,7 @@ class BaseContentMixin(models.Model):
    @cached_property
    def table_of_contents(self) -> list[TocEntry]:
-        html = "".join(get_html(self.body))
+        return get_table_of_contents(self.content_html)
        return get_table_of_contents(html)
    @cached_property
    def reading_time(self) -> int:
@ -62,16 +63,24 @@ class BaseContentMixin(models.Model):
    @cached_property
    def word_count(self) -> int:
-        return get_word_count(self.body)
+        return count_words(self.plain_text)
    @cached_property
    def summary(self) -> str:
-        return truncate_streamfield(self.body, 50)
+        return truncate_string(self.plain_text, 50)
    @cached_property
    def body_html(self) -> str:
        """
        The body rendered to a string, with anchor links added to its headings.
        """
        rendered_body = str(self.body)
        return add_heading_anchors(rendered_body)
    @cached_property
    def content_html(self) -> str:
        """
        HTML of the body as produced by `get_content_html`.
        """
        body = self.body
        return get_content_html(body)
    @cached_property
    def plain_text(self) -> str:
        """
        Text extracted from `content_html`, i.e. the content without its markup.
        """
        content = self.content_html
        return extract_text(content)
 class ContentPage(BasePage, BaseContentMixin):  # type: ignore[misc]
    subpage_types: list[Any] = []
--- a/website/common/streamfield.py
+++ b/website/common/streamfield.py
@ -1,17 +1,17 @@
-from typing import Iterator
+from itertools import product
 from typing import Iterable
 from bs4 import BeautifulSoup
 from django.utils import lorem_ipsum
 from django.utils.html import format_html_join
-from django.utils.text import smart_split
+from django.utils.text import slugify
 from wagtail import blocks
 from wagtail.embeds.blocks import EmbedBlock
 from wagtail.images.blocks import ImageChooserBlock
 from website.common.utils import HEADER_TAGS
 from website.contrib.code_block.blocks import CodeBlock
 IGNORE_PLAINTEXT_BLOCKS = (blocks.RawHTMLBlock, EmbedBlock)
 RICH_TEXT_FEATURES = [
    "h1",
    "h2",
@ -64,6 +64,10 @@ class ImageCaptionBlock(blocks.StructBlock):
        template = "common/blocks/image-caption.html"
 IGNORE_PLAINTEXT_BLOCKS = (blocks.RawHTMLBlock, EmbedBlock, ImageCaptionBlock)
 IGNORE_HEADING_BLOCKS = (*IGNORE_PLAINTEXT_BLOCKS, LoremBlock)
 def get_blocks() -> list[tuple[str, blocks.BaseBlock]]:
    return [
        ("embed", EmbedBlock()),
@ -75,34 +79,36 @@ def get_blocks() -> list[tuple[str, blocks.BaseBlock]]:
    ]
-def extract_text(html: str) -> str:
+def get_content_blocks(value: blocks.StreamValue) -> Iterable[blocks.BaseBlock]:
    return " ".join(BeautifulSoup(html, "lxml").find_all(text=True))
 def get_html(value: blocks.StreamValue) -> Iterator[str]:
    for block in value:
-        if isinstance(block.block_type, IGNORE_PLAINTEXT_BLOCKS):
+        if not isinstance(block.block_type, IGNORE_PLAINTEXT_BLOCKS):
-            continue
+            yield block
        yield str(block)
-def get_plain_text(value: blocks.StreamValue) -> Iterator[str]:
+def get_content_html(value: blocks.StreamValue) -> str:
-    for html_chunk in get_html(value):
+    """
-        yield extract_text(html_chunk)
+    Get the HTML of just original content (eg not embeds etc)
    """
    html = ""
    for block in get_content_blocks(value):
        html += str(block)
    return html
-def truncate_streamfield(value: blocks.StreamValue, words: int) -> str:
+def add_heading_anchors(html: str) -> str:
-    collected_words: list[str] = []
+    targets: list[str] = [
-    for block_text in get_plain_text(value):
+        f".block-{block_name} {header_tag}"
-        collected_words.extend(smart_split(block_text))
+        for header_tag, block_name in product(
-        if len(collected_words) >= words:
+            HEADER_TAGS,
-            break
+            [b[0] for b in get_blocks() if not isinstance(b[1], IGNORE_HEADING_BLOCKS)],
        )
    ]
-    return " ".join(collected_words[:words])
+    soup = BeautifulSoup(html, "lxml")
-
+    for tag in soup.select(", ".join(targets)):
-
+        slug = slugify(tag.text)
-def get_word_count(value: blocks.StreamValue) -> int:
+        anchor = soup.new_tag("a", href="#" + slug, id=slug)
-    count = 0
+        anchor.string = "#"
-    for chunk in get_plain_text(value):
+        anchor.attrs["class"] = "heading-anchor"
-        count += len(list(smart_split(chunk)))
+        tag.insert(0, anchor)
-    return count
+    return str(soup)
--- a/website/common/utils.py
+++ b/website/common/utils.py
@ -1,11 +1,12 @@
 from dataclasses import dataclass
-from itertools import pairwise
+from itertools import islice, pairwise
 from typing import Type
 from bs4 import BeautifulSoup
 from django.conf import settings
 from django.http.request import HttpRequest
-from django.utils.text import slugify
+from django.utils.text import slugify, smart_split
 from more_itertools import ilen
 from wagtail.models import Page
 from wagtail.models import get_page_models as get_wagtail_page_models
@ -56,17 +57,6 @@ def get_table_of_contents(html: str) -> list[TocEntry]:
    return root.children
 def add_heading_anchors(html: str) -> str:
    """
    Prepend a "#" anchor link to every heading found in `html`.

    Each anchor's `href` and `id` are derived from the slugified heading
    text, and the anchor carries the `heading-anchor` class. Returns the
    serialised document after the anchors have been inserted.
    """
    document = BeautifulSoup(html, "lxml")
    for heading in document.find_all(HEADER_TAGS):
        # Slugify before inserting the anchor so the "#" is not part of the slug.
        heading_slug = slugify(heading.text)
        link = document.new_tag("a", href="#" + heading_slug, id=heading_slug)
        link.attrs["class"] = "heading-anchor"
        link.string = "#"
        heading.insert(0, link)
    return str(document)
 def get_page_models() -> list[Type[Page]]:
    page_models = get_wagtail_page_models().copy()
    page_models.remove(Page)
@ -75,3 +65,21 @@ def get_page_models() -> list[Type[Page]]:
 def show_toolbar_callback(request: HttpRequest) -> bool:
    """
    Report whether the toolbar should be shown: exactly when `settings.DEBUG` is set.

    `request` is accepted but not inspected.
    NOTE(review): presumably the debug toolbar's visibility hook -- confirm in settings.
    """
    debug_enabled = settings.DEBUG
    return debug_enabled
 def count_words(text: str) -> int:
    """
    Return the number of words in `text`.

    The words yielded by `smart_split` are counted with `ilen` as they are
    produced, so no list of the words is built in memory.
    """
    word_stream = smart_split(text)
    return ilen(word_stream)
 def extract_text(html: str) -> str:
    """
    Get the plain text of some HTML.

    The markup is parsed with the lxml parser and every string node in the
    resulting document is joined with a single space.
    """
    soup = BeautifulSoup(html, "lxml")
    # `string=True` is the current name of the legacy `text=True` filter
    # (renamed in Beautiful Soup 4.4.0); it selects the same string nodes.
    return " ".join(soup.find_all(string=True))
 def truncate_string(text: str, words: int) -> str:
    """
    Return the leading words of `text` (at most `words` of them), joined by single spaces.
    """
    leading_words = islice(smart_split(text), words)
    return " ".join(leading_words)