Reduce the complexity of trying to save computation when rendering the streamfield

This replaces the more custom iteration with cached properties, which will end up faster anyway and is more of a drop-in fit for the new structure.

Sadly it still renders the content twice (once for `body_html` and once for `content_html`), as `get_content_html` requires access to the individual blocks.
This commit is contained in:
Jake Howard 2022-07-03 23:10:57 +01:00
parent ebfb909c98
commit 4702afd5dd
Signed by: jake
GPG key ID: 57AFB45680EDD477
4 changed files with 72 additions and 48 deletions

View file

@ -5,3 +5,4 @@ whitenoise[brotli]==6.2.0
pygments==2.12.0 pygments==2.12.0
beautifulsoup4==4.9.3 beautifulsoup4==4.9.3
lxml==4.9.0 lxml==4.9.0
more-itertools==8.13.0

View file

@ -9,8 +9,10 @@ from wagtail.fields import StreamField
from wagtail.images import get_image_model_string from wagtail.images import get_image_model_string
from wagtail.models import Page from wagtail.models import Page
from .streamfield import get_blocks, get_html, get_word_count, truncate_streamfield from website.common.utils import count_words
from .utils import TocEntry, add_heading_anchors, get_table_of_contents
from .streamfield import add_heading_anchors, get_blocks, get_content_html
from .utils import TocEntry, extract_text, get_table_of_contents, truncate_string
class BasePage(Page): class BasePage(Page):
@ -50,8 +52,7 @@ class BaseContentMixin(models.Model):
@cached_property @cached_property
def table_of_contents(self) -> list[TocEntry]: def table_of_contents(self) -> list[TocEntry]:
html = "".join(get_html(self.body)) return get_table_of_contents(self.content_html)
return get_table_of_contents(html)
@cached_property @cached_property
def reading_time(self) -> int: def reading_time(self) -> int:
@ -62,16 +63,24 @@ class BaseContentMixin(models.Model):
@cached_property @cached_property
def word_count(self) -> int: def word_count(self) -> int:
return get_word_count(self.body) return count_words(self.plain_text)
@cached_property @cached_property
def summary(self) -> str: def summary(self) -> str:
return truncate_streamfield(self.body, 50) return truncate_string(self.plain_text, 50)
@cached_property @cached_property
def body_html(self) -> str: def body_html(self) -> str:
return add_heading_anchors(str(self.body)) return add_heading_anchors(str(self.body))
@cached_property
def content_html(self) -> str:
return get_content_html(self.body)
@cached_property
def plain_text(self) -> str:
return extract_text(self.content_html)
class ContentPage(BasePage, BaseContentMixin): # type: ignore[misc] class ContentPage(BasePage, BaseContentMixin): # type: ignore[misc]
subpage_types: list[Any] = [] subpage_types: list[Any] = []

View file

@ -1,17 +1,17 @@
from typing import Iterator from itertools import product
from typing import Iterable
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from django.utils import lorem_ipsum from django.utils import lorem_ipsum
from django.utils.html import format_html_join from django.utils.html import format_html_join
from django.utils.text import smart_split from django.utils.text import slugify
from wagtail import blocks from wagtail import blocks
from wagtail.embeds.blocks import EmbedBlock from wagtail.embeds.blocks import EmbedBlock
from wagtail.images.blocks import ImageChooserBlock from wagtail.images.blocks import ImageChooserBlock
from website.common.utils import HEADER_TAGS
from website.contrib.code_block.blocks import CodeBlock from website.contrib.code_block.blocks import CodeBlock
IGNORE_PLAINTEXT_BLOCKS = (blocks.RawHTMLBlock, EmbedBlock)
RICH_TEXT_FEATURES = [ RICH_TEXT_FEATURES = [
"h1", "h1",
"h2", "h2",
@ -64,6 +64,10 @@ class ImageCaptionBlock(blocks.StructBlock):
template = "common/blocks/image-caption.html" template = "common/blocks/image-caption.html"
IGNORE_PLAINTEXT_BLOCKS = (blocks.RawHTMLBlock, EmbedBlock, ImageCaptionBlock)
IGNORE_HEADING_BLOCKS = (*IGNORE_PLAINTEXT_BLOCKS, LoremBlock)
def get_blocks() -> list[tuple[str, blocks.BaseBlock]]: def get_blocks() -> list[tuple[str, blocks.BaseBlock]]:
return [ return [
("embed", EmbedBlock()), ("embed", EmbedBlock()),
@ -75,34 +79,36 @@ def get_blocks() -> list[tuple[str, blocks.BaseBlock]]:
] ]
def extract_text(html: str) -> str: def get_content_blocks(value: blocks.StreamValue) -> Iterable[blocks.BaseBlock]:
return " ".join(BeautifulSoup(html, "lxml").find_all(text=True))
def get_html(value: blocks.StreamValue) -> Iterator[str]:
for block in value: for block in value:
if isinstance(block.block_type, IGNORE_PLAINTEXT_BLOCKS): if not isinstance(block.block_type, IGNORE_PLAINTEXT_BLOCKS):
continue yield block
yield str(block)
def get_plain_text(value: blocks.StreamValue) -> Iterator[str]: def get_content_html(value: blocks.StreamValue) -> str:
for html_chunk in get_html(value): """
yield extract_text(html_chunk) Get the HTML of just original content (eg not embeds etc)
"""
html = ""
for block in get_content_blocks(value):
html += str(block)
return html
def truncate_streamfield(value: blocks.StreamValue, words: int) -> str: def add_heading_anchors(html: str) -> str:
collected_words: list[str] = [] targets: list[str] = [
for block_text in get_plain_text(value): f".block-{block_name} {header_tag}"
collected_words.extend(smart_split(block_text)) for header_tag, block_name in product(
if len(collected_words) >= words: HEADER_TAGS,
break [b[0] for b in get_blocks() if not isinstance(b[1], IGNORE_HEADING_BLOCKS)],
)
]
return " ".join(collected_words[:words]) soup = BeautifulSoup(html, "lxml")
for tag in soup.select(", ".join(targets)):
slug = slugify(tag.text)
def get_word_count(value: blocks.StreamValue) -> int: anchor = soup.new_tag("a", href="#" + slug, id=slug)
count = 0 anchor.string = "#"
for chunk in get_plain_text(value): anchor.attrs["class"] = "heading-anchor"
count += len(list(smart_split(chunk))) tag.insert(0, anchor)
return count return str(soup)

View file

@ -1,11 +1,12 @@
from dataclasses import dataclass from dataclasses import dataclass
from itertools import pairwise from itertools import islice, pairwise
from typing import Type from typing import Type
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from django.conf import settings from django.conf import settings
from django.http.request import HttpRequest from django.http.request import HttpRequest
from django.utils.text import slugify from django.utils.text import slugify, smart_split
from more_itertools import ilen
from wagtail.models import Page from wagtail.models import Page
from wagtail.models import get_page_models as get_wagtail_page_models from wagtail.models import get_page_models as get_wagtail_page_models
@ -56,17 +57,6 @@ def get_table_of_contents(html: str) -> list[TocEntry]:
return root.children return root.children
def add_heading_anchors(html: str) -> str:
soup = BeautifulSoup(html, "lxml")
for tag in soup.find_all(HEADER_TAGS):
slug = slugify(tag.text)
anchor = soup.new_tag("a", href="#" + slug, id=slug)
anchor.string = "#"
anchor.attrs["class"] = "heading-anchor"
tag.insert(0, anchor)
return str(soup)
def get_page_models() -> list[Type[Page]]: def get_page_models() -> list[Type[Page]]:
page_models = get_wagtail_page_models().copy() page_models = get_wagtail_page_models().copy()
page_models.remove(Page) page_models.remove(Page)
@ -75,3 +65,21 @@ def get_page_models() -> list[Type[Page]]:
def show_toolbar_callback(request: HttpRequest) -> bool: def show_toolbar_callback(request: HttpRequest) -> bool:
return settings.DEBUG return settings.DEBUG
def count_words(text: str) -> int:
    """
    Return how many words ``text`` contains.

    The pieces produced by ``smart_split`` are only tallied, never collected
    into a list, so the words are not duplicated in memory.
    """
    words = smart_split(text)
    return ilen(words)
def extract_text(html: str) -> str:
    """
    Get the plain text of some HTML.

    The markup is parsed with the ``lxml`` parser and every string node found
    is joined with a single space. Nodes are not stripped, so whitespace that
    was present in the markup is preserved in the result.
    """
    soup = BeautifulSoup(html, "lxml")
    # `string=` is the current name of the filter formerly called `text=`
    # (renamed in Beautiful Soup 4.4.0); it selects the same nodes.
    return " ".join(soup.find_all(string=True))
def truncate_string(text: str, words: int) -> str:
    """
    Return at most the first ``words`` words of ``text``, joined by single spaces.

    Splitting is delegated to ``smart_split``; only the leading ``words``
    pieces are taken from it.
    """
    leading_words = islice(smart_split(text), words)
    return " ".join(leading_words)