Reduce complexity, trying to save computation when rendering streamfields
This replaces more custom iteration with caching, which ends up faster anyway, and fits more cleanly into the new structure. Sadly it still renders the content twice, as `get_content_html` requires access to the blocks.
This commit is contained in:
parent
ebfb909c98
commit
4702afd5dd
4 changed files with 72 additions and 48 deletions
|
@ -5,3 +5,4 @@ whitenoise[brotli]==6.2.0
|
|||
pygments==2.12.0
|
||||
beautifulsoup4==4.9.3
|
||||
lxml==4.9.0
|
||||
more-itertools==8.13.0
|
||||
|
|
|
@ -9,8 +9,10 @@ from wagtail.fields import StreamField
|
|||
from wagtail.images import get_image_model_string
|
||||
from wagtail.models import Page
|
||||
|
||||
from .streamfield import get_blocks, get_html, get_word_count, truncate_streamfield
|
||||
from .utils import TocEntry, add_heading_anchors, get_table_of_contents
|
||||
from website.common.utils import count_words
|
||||
|
||||
from .streamfield import add_heading_anchors, get_blocks, get_content_html
|
||||
from .utils import TocEntry, extract_text, get_table_of_contents, truncate_string
|
||||
|
||||
|
||||
class BasePage(Page):
|
||||
|
@ -50,8 +52,7 @@ class BaseContentMixin(models.Model):
|
|||
|
||||
@cached_property
def table_of_contents(self) -> list[TocEntry]:
    """Build the nested table of contents from the cached content HTML."""
    return get_table_of_contents(self.content_html)
|
||||
|
||||
@cached_property
|
||||
def reading_time(self) -> int:
|
||||
|
@ -62,16 +63,24 @@ class BaseContentMixin(models.Model):
|
|||
|
||||
@cached_property
def word_count(self) -> int:
    """Number of words in the plain-text rendering of the content."""
    return count_words(self.plain_text)
|
||||
|
||||
@cached_property
def summary(self) -> str:
    """The first 50 words of the plain text, e.g. for listings and previews."""
    return truncate_string(self.plain_text, 50)
|
||||
|
||||
@cached_property
def body_html(self) -> str:
    """The full rendered body, with anchor links inserted into headings."""
    rendered = str(self.body)
    return add_heading_anchors(rendered)
|
||||
|
||||
@cached_property
def content_html(self) -> str:
    """HTML for just the content blocks (embeds, raw HTML etc are excluded)."""
    return get_content_html(self.body)
|
||||
|
||||
@cached_property
def plain_text(self) -> str:
    """The content HTML reduced to plain text, cached per instance."""
    return extract_text(self.content_html)
|
||||
|
||||
|
||||
class ContentPage(BasePage, BaseContentMixin): # type: ignore[misc]
|
||||
subpage_types: list[Any] = []
|
||||
|
|
|
@ -1,17 +1,17 @@
|
|||
from typing import Iterator
|
||||
from itertools import product
|
||||
from typing import Iterable
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from django.utils import lorem_ipsum
|
||||
from django.utils.html import format_html_join
|
||||
from django.utils.text import smart_split
|
||||
from django.utils.text import slugify
|
||||
from wagtail import blocks
|
||||
from wagtail.embeds.blocks import EmbedBlock
|
||||
from wagtail.images.blocks import ImageChooserBlock
|
||||
|
||||
from website.common.utils import HEADER_TAGS
|
||||
from website.contrib.code_block.blocks import CodeBlock
|
||||
|
||||
IGNORE_PLAINTEXT_BLOCKS = (blocks.RawHTMLBlock, EmbedBlock)
|
||||
|
||||
RICH_TEXT_FEATURES = [
|
||||
"h1",
|
||||
"h2",
|
||||
|
@ -64,6 +64,10 @@ class ImageCaptionBlock(blocks.StructBlock):
|
|||
template = "common/blocks/image-caption.html"
|
||||
|
||||
|
||||
IGNORE_PLAINTEXT_BLOCKS = (blocks.RawHTMLBlock, EmbedBlock, ImageCaptionBlock)
|
||||
IGNORE_HEADING_BLOCKS = (*IGNORE_PLAINTEXT_BLOCKS, LoremBlock)
|
||||
|
||||
|
||||
def get_blocks() -> list[tuple[str, blocks.BaseBlock]]:
|
||||
return [
|
||||
("embed", EmbedBlock()),
|
||||
|
@ -75,34 +79,36 @@ def get_blocks() -> list[tuple[str, blocks.BaseBlock]]:
|
|||
]
|
||||
|
||||
|
||||
def extract_text(html: str) -> str:
|
||||
return " ".join(BeautifulSoup(html, "lxml").find_all(text=True))
|
||||
|
||||
|
||||
def get_content_blocks(value: blocks.StreamValue) -> Iterable[blocks.BaseBlock]:
    """
    Yield the bound blocks which count as real content (skipping embeds,
    raw HTML etc).

    NOTE(review): iterating a Wagtail ``StreamValue`` yields bound blocks
    whose ``.block_type`` is the block's *name* (a string), so the original
    ``isinstance(block.block_type, IGNORE_PLAINTEXT_BLOCKS)`` check could
    never match and nothing was filtered. The block *definition* lives on
    ``.block`` — confirm against the Wagtail version in use.
    """
    for block in value:
        # Skip blocks whose definition is one of the non-content types.
        if not isinstance(block.block, IGNORE_PLAINTEXT_BLOCKS):
            yield block
|
||||
|
||||
|
||||
def get_plain_text(value: blocks.StreamValue) -> Iterator[str]:
|
||||
for html_chunk in get_html(value):
|
||||
yield extract_text(html_chunk)
|
||||
def get_content_html(value: blocks.StreamValue) -> str:
    """
    Get the HTML of just the original content (eg not embeds etc).

    Uses ``str.join`` rather than repeated ``+=`` concatenation, which is
    quadratic in the worst case on non-CPython interpreters.
    """
    return "".join(str(block) for block in get_content_blocks(value))
|
||||
|
||||
|
||||
def truncate_streamfield(value: blocks.StreamValue, words: int) -> str:
|
||||
collected_words: list[str] = []
|
||||
for block_text in get_plain_text(value):
|
||||
collected_words.extend(smart_split(block_text))
|
||||
if len(collected_words) >= words:
|
||||
break
|
||||
def add_heading_anchors(html: str) -> str:
    """
    Insert a "#" anchor link (with a slugified ``id``) at the start of each
    heading rendered by a heading-capable block, so headings can be deep-linked.
    """
    # Names of block types which may contain headings.
    heading_block_names = [
        name
        for name, block in get_blocks()
        if not isinstance(block, IGNORE_HEADING_BLOCKS)
    ]
    # One CSS selector per (header tag, block type) combination.
    selectors = [
        f".block-{name} {tag}"
        for tag, name in product(HEADER_TAGS, heading_block_names)
    ]

    soup = BeautifulSoup(html, "lxml")
    for heading in soup.select(", ".join(selectors)):
        slug = slugify(heading.text)
        link = soup.new_tag("a", href="#" + slug, id=slug)
        link.string = "#"
        link.attrs["class"] = "heading-anchor"
        heading.insert(0, link)
    return str(soup)
|
||||
|
|
|
@ -1,11 +1,12 @@
|
|||
from dataclasses import dataclass
|
||||
from itertools import pairwise
|
||||
from itertools import islice, pairwise
|
||||
from typing import Type
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from django.conf import settings
|
||||
from django.http.request import HttpRequest
|
||||
from django.utils.text import slugify
|
||||
from django.utils.text import slugify, smart_split
|
||||
from more_itertools import ilen
|
||||
from wagtail.models import Page
|
||||
from wagtail.models import get_page_models as get_wagtail_page_models
|
||||
|
||||
|
@ -56,17 +57,6 @@ def get_table_of_contents(html: str) -> list[TocEntry]:
|
|||
return root.children
|
||||
|
||||
|
||||
def add_heading_anchors(html: str) -> str:
|
||||
soup = BeautifulSoup(html, "lxml")
|
||||
for tag in soup.find_all(HEADER_TAGS):
|
||||
slug = slugify(tag.text)
|
||||
anchor = soup.new_tag("a", href="#" + slug, id=slug)
|
||||
anchor.string = "#"
|
||||
anchor.attrs["class"] = "heading-anchor"
|
||||
tag.insert(0, anchor)
|
||||
return str(soup)
|
||||
|
||||
|
||||
def get_page_models() -> list[Type[Page]]:
|
||||
page_models = get_wagtail_page_models().copy()
|
||||
page_models.remove(Page)
|
||||
|
@ -75,3 +65,21 @@ def get_page_models() -> list[Type[Page]]:
|
|||
|
||||
def show_toolbar_callback(request: HttpRequest) -> bool:
|
||||
return settings.DEBUG
|
||||
|
||||
|
||||
def count_words(text: str) -> int:
    """
    Count the number of words in the text, without duplicating the item in memory
    """
    # Consume the split lazily; equivalent to more_itertools.ilen.
    return sum(1 for _ in smart_split(text))
|
||||
|
||||
|
||||
def extract_text(html: str) -> str:
    """
    Get the plain text of some HTML.
    """
    # `string=True` is the modern spelling of `text=True`, which has been a
    # deprecated alias for `find_all` since Beautiful Soup 4.4.
    return " ".join(BeautifulSoup(html, "lxml").find_all(string=True))
|
||||
|
||||
|
||||
def truncate_string(text: str, words: int) -> str:
    """Return the first `words` words of `text`, separated by single spaces."""
    first_words = islice(smart_split(text), words)
    return " ".join(first_words)
|
||||
|
|
Loading…
Reference in a new issue