Reduce complexity trying to save computation on rendering streamfield

This replaces more of the custom iteration with caching, which will end up faster anyway and fits more cleanly into the new structure.

Sadly it still renders the content twice, as `get_content_html` requires access to the blocks.
This commit is contained in:
Jake Howard 2022-07-03 23:10:57 +01:00
parent ebfb909c98
commit 4702afd5dd
Signed by: jake
GPG Key ID: 57AFB45680EDD477
4 changed files with 72 additions and 48 deletions

View File

@@ -5,3 +5,4 @@ whitenoise[brotli]==6.2.0
pygments==2.12.0
beautifulsoup4==4.9.3
lxml==4.9.0
more-itertools==8.13.0

View File

@@ -9,8 +9,10 @@ from wagtail.fields import StreamField
from wagtail.images import get_image_model_string
from wagtail.models import Page
from .streamfield import get_blocks, get_html, get_word_count, truncate_streamfield
from .utils import TocEntry, add_heading_anchors, get_table_of_contents
from website.common.utils import count_words
from .streamfield import add_heading_anchors, get_blocks, get_content_html
from .utils import TocEntry, extract_text, get_table_of_contents, truncate_string
class BasePage(Page):
@@ -50,8 +52,7 @@ class BaseContentMixin(models.Model):
@cached_property
def table_of_contents(self) -> list[TocEntry]:
    """Table of contents built from the cached content-only HTML."""
    # The pre-refactor body (joining get_html(self.body)) was left above the
    # new return as unreachable diff residue; only the content_html path remains.
    return get_table_of_contents(self.content_html)
@cached_property
def reading_time(self) -> int:
@@ -62,16 +63,24 @@ class BaseContentMixin(models.Model):
@cached_property
def word_count(self) -> int:
    """Number of words in the page's plain-text content."""
    # The removed get_word_count(self.body) line was left above the new return
    # as unreachable diff residue; the cached plain_text path is the live one.
    return count_words(self.plain_text)
@cached_property
def summary(self) -> str:
    """First 50 words of the plain-text content."""
    # The removed truncate_streamfield(self.body, 50) line was left above the
    # new return as unreachable diff residue; the plain_text path is the live one.
    return truncate_string(self.plain_text, 50)
@cached_property
def body_html(self) -> str:
    """Fully rendered body with self-link anchors injected into headings."""
    return add_heading_anchors(str(self.body))
@cached_property
def content_html(self) -> str:
    """HTML of just the original content blocks (embeds etc. excluded)."""
    return get_content_html(self.body)
@cached_property
def plain_text(self) -> str:
    """Plain-text rendering of the content-only HTML."""
    return extract_text(self.content_html)
class ContentPage(BasePage, BaseContentMixin): # type: ignore[misc]
subpage_types: list[Any] = []

View File

@@ -1,17 +1,17 @@
from typing import Iterator
from itertools import product
from typing import Iterable
from bs4 import BeautifulSoup
from django.utils import lorem_ipsum
from django.utils.html import format_html_join
from django.utils.text import smart_split
from django.utils.text import slugify
from wagtail import blocks
from wagtail.embeds.blocks import EmbedBlock
from wagtail.images.blocks import ImageChooserBlock
from website.common.utils import HEADER_TAGS
from website.contrib.code_block.blocks import CodeBlock
IGNORE_PLAINTEXT_BLOCKS = (blocks.RawHTMLBlock, EmbedBlock)
RICH_TEXT_FEATURES = [
"h1",
"h2",
@@ -64,6 +64,10 @@ class ImageCaptionBlock(blocks.StructBlock):
template = "common/blocks/image-caption.html"
# Blocks whose output is not original content, so they are skipped when
# extracting plain text (and, with LoremBlock, when anchoring headings).
IGNORE_PLAINTEXT_BLOCKS = (blocks.RawHTMLBlock, EmbedBlock, ImageCaptionBlock)
IGNORE_HEADING_BLOCKS = (*IGNORE_PLAINTEXT_BLOCKS, LoremBlock)
def get_blocks() -> list[tuple[str, blocks.BaseBlock]]:
return [
("embed", EmbedBlock()),
@@ -75,34 +79,36 @@ def get_blocks() -> list[tuple[str, blocks.BaseBlock]]:
]
def extract_text(html: str) -> str:
    """Get the plain text of some HTML, joining text nodes with spaces."""
    # NOTE(review): `text=True` is a deprecated alias of `string=True` in
    # BeautifulSoup >= 4.4 — confirm the installed bs4 version before changing.
    return " ".join(BeautifulSoup(html, "lxml").find_all(text=True))
def get_html(value: blocks.StreamValue) -> Iterator[str]:
def get_content_blocks(value: blocks.StreamValue) -> Iterable[blocks.BaseBlock]:
    """Yield the stream's blocks that hold original content, skipping embed-style blocks."""
    # The removed guard-and-continue body was left interleaved with the added
    # lines as diff residue; this is the coherent post-commit generator.
    for block in value:
        # NOTE(review): on Wagtail stream children, `block_type` is normally the
        # block *name* (a str), so this isinstance test may never match —
        # `block.block` looks like the intended target. Confirm before changing.
        if not isinstance(block.block_type, IGNORE_PLAINTEXT_BLOCKS):
            yield block
def get_plain_text(value: blocks.StreamValue) -> Iterator[str]:
for html_chunk in get_html(value):
yield extract_text(html_chunk)
def get_content_html(value: blocks.StreamValue) -> str:
    """
    Get the HTML of just the original content (eg not embeds etc).
    """
    # str.join builds the result in one pass instead of repeated `+=`
    # concatenation, which is quadratic in the worst case.
    return "".join(str(block) for block in get_content_blocks(value))
def add_heading_anchors(html: str) -> str:
    """
    Insert a "#" self-link anchor at the start of every heading that belongs
    to an anchor-eligible content block.
    """
    # This span of the diff interleaves the removed truncate_streamfield /
    # get_word_count helpers with the two halves of the added function; this is
    # the coherent post-commit add_heading_anchors.
    # One CSS selector per (header tag, eligible block name) pair.
    targets: list[str] = [
        f".block-{block_name} {header_tag}"
        for header_tag, block_name in product(
            HEADER_TAGS,
            [b[0] for b in get_blocks() if not isinstance(b[1], IGNORE_HEADING_BLOCKS)],
        )
    ]
    soup = BeautifulSoup(html, "lxml")
    for tag in soup.select(", ".join(targets)):
        slug = slugify(tag.text)
        # NOTE(review): repeated heading text yields duplicate ids — confirm acceptable.
        anchor = soup.new_tag("a", href="#" + slug, id=slug)
        anchor.string = "#"
        anchor.attrs["class"] = "heading-anchor"
        tag.insert(0, anchor)
    return str(soup)

View File

@@ -1,11 +1,12 @@
from dataclasses import dataclass
from itertools import pairwise
from itertools import islice, pairwise
from typing import Type
from bs4 import BeautifulSoup
from django.conf import settings
from django.http.request import HttpRequest
from django.utils.text import slugify
from django.utils.text import slugify, smart_split
from more_itertools import ilen
from wagtail.models import Page
from wagtail.models import get_page_models as get_wagtail_page_models
@@ -56,17 +57,6 @@ def get_table_of_contents(html: str) -> list[TocEntry]:
return root.children
def add_heading_anchors(html: str) -> str:
    """Prefix every header tag in *html* with a "#" self-link anchor."""
    soup = BeautifulSoup(html, "lxml")
    for tag in soup.find_all(HEADER_TAGS):
        slug = slugify(tag.text)
        # The anchor links to itself: id and href share the heading's slug.
        anchor = soup.new_tag("a", href="#" + slug, id=slug)
        anchor.string = "#"
        anchor.attrs["class"] = "heading-anchor"
        tag.insert(0, anchor)
    return str(soup)
def get_page_models() -> list[Type[Page]]:
page_models = get_wagtail_page_models().copy()
page_models.remove(Page)
@@ -75,3 +65,21 @@ def get_page_models() -> list[Type[Page]]:
def show_toolbar_callback(request: HttpRequest) -> bool:
    """Show the debug toolbar whenever DEBUG is on (the request is unused)."""
    return settings.DEBUG
def count_words(text: str) -> int:
    """
    Count the words in *text* by streaming over the split, so the full word
    list is never materialized in memory.
    """
    return sum(1 for _ in smart_split(text))
def extract_text(html: str) -> str:
    """
    Get the plain text of some HTML, joining its text nodes with spaces.
    """
    # `string=True` replaces the `text=True` alias deprecated in bs4 4.4;
    # both select every NavigableString in the tree.
    return " ".join(BeautifulSoup(html, "lxml").find_all(string=True))
def truncate_string(text: str, words: int) -> str:
    """Return the first *words* words of *text*, joined by single spaces."""
    return " ".join(islice(smart_split(text), words))