Reduce the complexity of trying to save computation when rendering the streamfield
This replaces more custom iteration with caching, which will end up being faster anyway and is more of a drop-in fit for the new structure. Sadly, it still renders the content twice, as `get_content_html` requires access to the blocks.
This commit is contained in:
parent
ebfb909c98
commit
4702afd5dd
4 changed files with 72 additions and 48 deletions
|
@ -5,3 +5,4 @@ whitenoise[brotli]==6.2.0
|
||||||
pygments==2.12.0
|
pygments==2.12.0
|
||||||
beautifulsoup4==4.9.3
|
beautifulsoup4==4.9.3
|
||||||
lxml==4.9.0
|
lxml==4.9.0
|
||||||
|
more-itertools==8.13.0
|
||||||
|
|
|
@ -9,8 +9,10 @@ from wagtail.fields import StreamField
|
||||||
from wagtail.images import get_image_model_string
|
from wagtail.images import get_image_model_string
|
||||||
from wagtail.models import Page
|
from wagtail.models import Page
|
||||||
|
|
||||||
from .streamfield import get_blocks, get_html, get_word_count, truncate_streamfield
|
from website.common.utils import count_words
|
||||||
from .utils import TocEntry, add_heading_anchors, get_table_of_contents
|
|
||||||
|
from .streamfield import add_heading_anchors, get_blocks, get_content_html
|
||||||
|
from .utils import TocEntry, extract_text, get_table_of_contents, truncate_string
|
||||||
|
|
||||||
|
|
||||||
class BasePage(Page):
|
class BasePage(Page):
|
||||||
|
@ -50,8 +52,7 @@ class BaseContentMixin(models.Model):
|
||||||
|
|
||||||
@cached_property
|
@cached_property
|
||||||
def table_of_contents(self) -> list[TocEntry]:
|
def table_of_contents(self) -> list[TocEntry]:
|
||||||
html = "".join(get_html(self.body))
|
return get_table_of_contents(self.content_html)
|
||||||
return get_table_of_contents(html)
|
|
||||||
|
|
||||||
@cached_property
|
@cached_property
|
||||||
def reading_time(self) -> int:
|
def reading_time(self) -> int:
|
||||||
|
@ -62,16 +63,24 @@ class BaseContentMixin(models.Model):
|
||||||
|
|
||||||
@cached_property
|
@cached_property
|
||||||
def word_count(self) -> int:
|
def word_count(self) -> int:
|
||||||
return get_word_count(self.body)
|
return count_words(self.plain_text)
|
||||||
|
|
||||||
@cached_property
|
@cached_property
|
||||||
def summary(self) -> str:
|
def summary(self) -> str:
|
||||||
return truncate_streamfield(self.body, 50)
|
return truncate_string(self.plain_text, 50)
|
||||||
|
|
||||||
@cached_property
|
@cached_property
|
||||||
def body_html(self) -> str:
|
def body_html(self) -> str:
|
||||||
return add_heading_anchors(str(self.body))
|
return add_heading_anchors(str(self.body))
|
||||||
|
|
||||||
|
@cached_property
|
||||||
|
def content_html(self) -> str:
|
||||||
|
return get_content_html(self.body)
|
||||||
|
|
||||||
|
@cached_property
|
||||||
|
def plain_text(self) -> str:
|
||||||
|
return extract_text(self.content_html)
|
||||||
|
|
||||||
|
|
||||||
class ContentPage(BasePage, BaseContentMixin): # type: ignore[misc]
|
class ContentPage(BasePage, BaseContentMixin): # type: ignore[misc]
|
||||||
subpage_types: list[Any] = []
|
subpage_types: list[Any] = []
|
||||||
|
|
|
@ -1,17 +1,17 @@
|
||||||
from typing import Iterator
|
from itertools import product
|
||||||
|
from typing import Iterable
|
||||||
|
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from django.utils import lorem_ipsum
|
from django.utils import lorem_ipsum
|
||||||
from django.utils.html import format_html_join
|
from django.utils.html import format_html_join
|
||||||
from django.utils.text import smart_split
|
from django.utils.text import slugify
|
||||||
from wagtail import blocks
|
from wagtail import blocks
|
||||||
from wagtail.embeds.blocks import EmbedBlock
|
from wagtail.embeds.blocks import EmbedBlock
|
||||||
from wagtail.images.blocks import ImageChooserBlock
|
from wagtail.images.blocks import ImageChooserBlock
|
||||||
|
|
||||||
|
from website.common.utils import HEADER_TAGS
|
||||||
from website.contrib.code_block.blocks import CodeBlock
|
from website.contrib.code_block.blocks import CodeBlock
|
||||||
|
|
||||||
IGNORE_PLAINTEXT_BLOCKS = (blocks.RawHTMLBlock, EmbedBlock)
|
|
||||||
|
|
||||||
RICH_TEXT_FEATURES = [
|
RICH_TEXT_FEATURES = [
|
||||||
"h1",
|
"h1",
|
||||||
"h2",
|
"h2",
|
||||||
|
@ -64,6 +64,10 @@ class ImageCaptionBlock(blocks.StructBlock):
|
||||||
template = "common/blocks/image-caption.html"
|
template = "common/blocks/image-caption.html"
|
||||||
|
|
||||||
|
|
||||||
|
IGNORE_PLAINTEXT_BLOCKS = (blocks.RawHTMLBlock, EmbedBlock, ImageCaptionBlock)
|
||||||
|
IGNORE_HEADING_BLOCKS = (*IGNORE_PLAINTEXT_BLOCKS, LoremBlock)
|
||||||
|
|
||||||
|
|
||||||
def get_blocks() -> list[tuple[str, blocks.BaseBlock]]:
|
def get_blocks() -> list[tuple[str, blocks.BaseBlock]]:
|
||||||
return [
|
return [
|
||||||
("embed", EmbedBlock()),
|
("embed", EmbedBlock()),
|
||||||
|
@ -75,34 +79,36 @@ def get_blocks() -> list[tuple[str, blocks.BaseBlock]]:
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
def extract_text(html: str) -> str:
|
def get_content_blocks(value: blocks.StreamValue) -> Iterable[blocks.BaseBlock]:
|
||||||
return " ".join(BeautifulSoup(html, "lxml").find_all(text=True))
|
|
||||||
|
|
||||||
|
|
||||||
def get_html(value: blocks.StreamValue) -> Iterator[str]:
|
|
||||||
for block in value:
|
for block in value:
|
||||||
if isinstance(block.block_type, IGNORE_PLAINTEXT_BLOCKS):
|
if not isinstance(block.block_type, IGNORE_PLAINTEXT_BLOCKS):
|
||||||
continue
|
yield block
|
||||||
yield str(block)
|
|
||||||
|
|
||||||
|
|
||||||
def get_plain_text(value: blocks.StreamValue) -> Iterator[str]:
|
def get_content_html(value: blocks.StreamValue) -> str:
|
||||||
for html_chunk in get_html(value):
|
"""
|
||||||
yield extract_text(html_chunk)
|
Get the HTML of just original content (eg not embeds etc)
|
||||||
|
"""
|
||||||
|
html = ""
|
||||||
|
for block in get_content_blocks(value):
|
||||||
|
html += str(block)
|
||||||
|
return html
|
||||||
|
|
||||||
|
|
||||||
def truncate_streamfield(value: blocks.StreamValue, words: int) -> str:
|
def add_heading_anchors(html: str) -> str:
|
||||||
collected_words: list[str] = []
|
targets: list[str] = [
|
||||||
for block_text in get_plain_text(value):
|
f".block-{block_name} {header_tag}"
|
||||||
collected_words.extend(smart_split(block_text))
|
for header_tag, block_name in product(
|
||||||
if len(collected_words) >= words:
|
HEADER_TAGS,
|
||||||
break
|
[b[0] for b in get_blocks() if not isinstance(b[1], IGNORE_HEADING_BLOCKS)],
|
||||||
|
)
|
||||||
|
]
|
||||||
|
|
||||||
return " ".join(collected_words[:words])
|
soup = BeautifulSoup(html, "lxml")
|
||||||
|
for tag in soup.select(", ".join(targets)):
|
||||||
|
slug = slugify(tag.text)
|
||||||
def get_word_count(value: blocks.StreamValue) -> int:
|
anchor = soup.new_tag("a", href="#" + slug, id=slug)
|
||||||
count = 0
|
anchor.string = "#"
|
||||||
for chunk in get_plain_text(value):
|
anchor.attrs["class"] = "heading-anchor"
|
||||||
count += len(list(smart_split(chunk)))
|
tag.insert(0, anchor)
|
||||||
return count
|
return str(soup)
|
||||||
|
|
|
@ -1,11 +1,12 @@
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from itertools import pairwise
|
from itertools import islice, pairwise
|
||||||
from typing import Type
|
from typing import Type
|
||||||
|
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
from django.http.request import HttpRequest
|
from django.http.request import HttpRequest
|
||||||
from django.utils.text import slugify
|
from django.utils.text import slugify, smart_split
|
||||||
|
from more_itertools import ilen
|
||||||
from wagtail.models import Page
|
from wagtail.models import Page
|
||||||
from wagtail.models import get_page_models as get_wagtail_page_models
|
from wagtail.models import get_page_models as get_wagtail_page_models
|
||||||
|
|
||||||
|
@ -56,17 +57,6 @@ def get_table_of_contents(html: str) -> list[TocEntry]:
|
||||||
return root.children
|
return root.children
|
||||||
|
|
||||||
|
|
||||||
def add_heading_anchors(html: str) -> str:
|
|
||||||
soup = BeautifulSoup(html, "lxml")
|
|
||||||
for tag in soup.find_all(HEADER_TAGS):
|
|
||||||
slug = slugify(tag.text)
|
|
||||||
anchor = soup.new_tag("a", href="#" + slug, id=slug)
|
|
||||||
anchor.string = "#"
|
|
||||||
anchor.attrs["class"] = "heading-anchor"
|
|
||||||
tag.insert(0, anchor)
|
|
||||||
return str(soup)
|
|
||||||
|
|
||||||
|
|
||||||
def get_page_models() -> list[Type[Page]]:
|
def get_page_models() -> list[Type[Page]]:
|
||||||
page_models = get_wagtail_page_models().copy()
|
page_models = get_wagtail_page_models().copy()
|
||||||
page_models.remove(Page)
|
page_models.remove(Page)
|
||||||
|
@ -75,3 +65,21 @@ def get_page_models() -> list[Type[Page]]:
|
||||||
|
|
||||||
def show_toolbar_callback(request: HttpRequest) -> bool:
|
def show_toolbar_callback(request: HttpRequest) -> bool:
|
||||||
return settings.DEBUG
|
return settings.DEBUG
|
||||||
|
|
||||||
|
|
||||||
|
def count_words(text: str) -> int:
|
||||||
|
"""
|
||||||
|
Count the number of words in the text, without duplicating the item in memory
|
||||||
|
"""
|
||||||
|
return ilen(smart_split(text))
|
||||||
|
|
||||||
|
|
||||||
|
def extract_text(html: str) -> str:
|
||||||
|
"""
|
||||||
|
Get the plain text of some HTML.
|
||||||
|
"""
|
||||||
|
return " ".join(BeautifulSoup(html, "lxml").find_all(text=True))
|
||||||
|
|
||||||
|
|
||||||
|
def truncate_string(text: str, words: int) -> str:
|
||||||
|
return " ".join(islice(smart_split(text), words))
|
||||||
|
|
Loading…
Reference in a new issue