diff --git a/requirements.txt b/requirements.txt
index ae9475f..4b7df25 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,3 +5,4 @@ whitenoise[brotli]==6.2.0
 pygments==2.12.0
 beautifulsoup4==4.9.3
 lxml==4.9.0
+more-itertools==8.13.0
diff --git a/website/common/models.py b/website/common/models.py
index 965005d..9c17039 100644
--- a/website/common/models.py
+++ b/website/common/models.py
@@ -9,8 +9,10 @@ from wagtail.fields import StreamField
 from wagtail.images import get_image_model_string
 from wagtail.models import Page
 
-from .streamfield import get_blocks, get_html, get_word_count, truncate_streamfield
-from .utils import TocEntry, add_heading_anchors, get_table_of_contents
+from website.common.utils import count_words
+
+from .streamfield import add_heading_anchors, get_blocks, get_content_html
+from .utils import TocEntry, extract_text, get_table_of_contents, truncate_string
 
 
 class BasePage(Page):
@@ -50,8 +52,7 @@ class BaseContentMixin(models.Model):
 
     @cached_property
     def table_of_contents(self) -> list[TocEntry]:
-        html = "".join(get_html(self.body))
-        return get_table_of_contents(html)
+        return get_table_of_contents(self.content_html)
 
     @cached_property
     def reading_time(self) -> int:
@@ -62,16 +63,24 @@ class BaseContentMixin(models.Model):
 
     @cached_property
     def word_count(self) -> int:
-        return get_word_count(self.body)
+        return count_words(self.plain_text)
 
     @cached_property
     def summary(self) -> str:
-        return truncate_streamfield(self.body, 50)
+        return truncate_string(self.plain_text, 50)
 
     @cached_property
     def body_html(self) -> str:
         return add_heading_anchors(str(self.body))
 
+    @cached_property
+    def content_html(self) -> str:
+        return get_content_html(self.body)
+
+    @cached_property
+    def plain_text(self) -> str:
+        return extract_text(self.content_html)
+
 
 class ContentPage(BasePage, BaseContentMixin):  # type: ignore[misc]
     subpage_types: list[Any] = []
diff --git a/website/common/streamfield.py b/website/common/streamfield.py
index aa00bda..a99aa28 100644
--- a/website/common/streamfield.py
+++ b/website/common/streamfield.py
@@ -1,17 +1,17 @@
-from typing import Iterator
+from itertools import product
+from typing import Iterable
 
 from bs4 import BeautifulSoup
 from django.utils import lorem_ipsum
 from django.utils.html import format_html_join
-from django.utils.text import smart_split
+from django.utils.text import slugify
 from wagtail import blocks
 from wagtail.embeds.blocks import EmbedBlock
 from wagtail.images.blocks import ImageChooserBlock
 
+from website.common.utils import HEADER_TAGS
 from website.contrib.code_block.blocks import CodeBlock
 
-IGNORE_PLAINTEXT_BLOCKS = (blocks.RawHTMLBlock, EmbedBlock)
-
 RICH_TEXT_FEATURES = [
     "h1",
     "h2",
@@ -64,6 +64,10 @@ class ImageCaptionBlock(blocks.StructBlock):
         template = "common/blocks/image-caption.html"
 
 
+IGNORE_PLAINTEXT_BLOCKS = (blocks.RawHTMLBlock, EmbedBlock, ImageCaptionBlock)
+IGNORE_HEADING_BLOCKS = (*IGNORE_PLAINTEXT_BLOCKS, LoremBlock)
+
+
 def get_blocks() -> list[tuple[str, blocks.BaseBlock]]:
     return [
         ("embed", EmbedBlock()),
@@ -75,34 +79,37 @@ def get_blocks() -> list[tuple[str, blocks.BaseBlock]]:
     ]
 
 
-def extract_text(html: str) -> str:
-    return " ".join(BeautifulSoup(html, "lxml").find_all(text=True))
-
-
-def get_html(value: blocks.StreamValue) -> Iterator[str]:
+def get_content_blocks(value: blocks.StreamValue) -> Iterable[blocks.BaseBlock]:
     for block in value:
-        if isinstance(block.block_type, IGNORE_PLAINTEXT_BLOCKS):
-            continue
-        yield str(block)
+        # `block.block` is the block definition; `block_type` is only its name.
+        if not isinstance(block.block, IGNORE_PLAINTEXT_BLOCKS):
+            yield block
 
 
-def get_plain_text(value: blocks.StreamValue) -> Iterator[str]:
-    for html_chunk in get_html(value):
-        yield extract_text(html_chunk)
+def get_content_html(value: blocks.StreamValue) -> str:
+    """
+    Get the HTML of just original content (eg not embeds etc)
+    """
+    html = ""
+    for block in get_content_blocks(value):
+        html += str(block)
+    return html
 
 
-def truncate_streamfield(value: blocks.StreamValue, words: int) -> str:
-    collected_words: list[str] = []
-    for block_text in get_plain_text(value):
-        collected_words.extend(smart_split(block_text))
-        if len(collected_words) >= words:
-            break
+def add_heading_anchors(html: str) -> str:
+    targets: list[str] = [
+        f".block-{block_name} {header_tag}"
+        for header_tag, block_name in product(
+            HEADER_TAGS,
+            [b[0] for b in get_blocks() if not isinstance(b[1], IGNORE_HEADING_BLOCKS)],
+        )
+    ]
 
-    return " ".join(collected_words[:words])
-
-
-def get_word_count(value: blocks.StreamValue) -> int:
-    count = 0
-    for chunk in get_plain_text(value):
-        count += len(list(smart_split(chunk)))
-    return count
+    soup = BeautifulSoup(html, "lxml")
+    for tag in soup.select(", ".join(targets)):
+        slug = slugify(tag.text)
+        anchor = soup.new_tag("a", href="#" + slug, id=slug)
+        anchor.string = "#"
+        anchor.attrs["class"] = "heading-anchor"
+        tag.insert(0, anchor)
+    return str(soup)
diff --git a/website/common/utils.py b/website/common/utils.py
index c57586f..1d57fa0 100644
--- a/website/common/utils.py
+++ b/website/common/utils.py
@@ -1,11 +1,12 @@
 from dataclasses import dataclass
-from itertools import pairwise
+from itertools import islice, pairwise
 from typing import Type
 
 from bs4 import BeautifulSoup
 from django.conf import settings
 from django.http.request import HttpRequest
-from django.utils.text import slugify
+from django.utils.text import slugify, smart_split
+from more_itertools import ilen
 from wagtail.models import Page
 from wagtail.models import get_page_models as get_wagtail_page_models
 
@@ -56,17 +57,6 @@ def get_table_of_contents(html: str) -> list[TocEntry]:
     return root.children
 
 
-def add_heading_anchors(html: str) -> str:
-    soup = BeautifulSoup(html, "lxml")
-    for tag in soup.find_all(HEADER_TAGS):
-        slug = slugify(tag.text)
-        anchor = soup.new_tag("a", href="#" + slug, id=slug)
-        anchor.string = "#"
-        anchor.attrs["class"] = "heading-anchor"
-        tag.insert(0, anchor)
-    return str(soup)
-
-
 def get_page_models() -> list[Type[Page]]:
     page_models = get_wagtail_page_models().copy()
     page_models.remove(Page)
@@ -75,3 +65,21 @@ def get_page_models() -> list[Type[Page]]:
 
 def show_toolbar_callback(request: HttpRequest) -> bool:
     return settings.DEBUG
+
+
+def count_words(text: str) -> int:
+    """
+    Count the number of words in the text, without duplicating the item in memory
+    """
+    return ilen(smart_split(text))
+
+
+def extract_text(html: str) -> str:
+    """
+    Get the plain text of some HTML.
+    """
+    return " ".join(BeautifulSoup(html, "lxml").find_all(text=True))
+
+
+def truncate_string(text: str, words: int) -> str:
+    return " ".join(islice(smart_split(text), words))