website/website/common/streamfield.py

from itertools import product

from bs4 import BeautifulSoup, SoupStrainer
from django.utils import lorem_ipsum
from django.utils.html import format_html_join
from django.utils.text import slugify
from wagtail import blocks
from wagtail.embeds.blocks import EmbedBlock
from wagtail.images.blocks import ImageChooserBlock

from website.common.utils import HEADER_TAGS
from website.contrib.code_block.blocks import CodeBlock

# Inline formatting plus ordered/unordered lists, but no headings.
RICH_TEXT_FEATURES_SIMPLE = [
    "bold",
    "italic",
    "ol",
    "ul",
    "link",
    "document-link",
    "code",
    "strikethrough",
]

# Inline formatting only: the "simple" set with the list features removed.
RICH_TEXT_FEATURES_PLAIN = [
    feature for feature in RICH_TEXT_FEATURES_SIMPLE if feature not in ("ol", "ul")
]

# The full feature set: headings h2-h6, everything in the "simple" set, and
# the snippet link/embed features.
RICH_TEXT_FEATURES = [
    *(f"h{level}" for level in range(2, 7)),
    *RICH_TEXT_FEATURES_SIMPLE,
    "snippet-link",
    "snippet-embed",
]


class LoremBlock(blocks.StructBlock):
    """
    Placeholder block which renders generated lorem ipsum text.
    """

    # How many paragraphs to generate; at least one is required.
    paragraphs = blocks.IntegerBlock(min_value=1)

    def render(self, value: dict, context: dict | None = None) -> str:
        """
        Render `value["paragraphs"]` lorem ipsum paragraphs, each wrapped in a
        `<p>` element and separated by a blank line.

        `context` is accepted for compatibility with the block API but is not
        used.
        """
        count = value["paragraphs"]
        # format_html_join expects one argument tuple per rendered fragment.
        fragments = ((text,) for text in lorem_ipsum.paragraphs(count))
        return format_html_join("\n\n", "<p>{}</p>", fragments)

    class Meta:
        icon = "openquote"
        label = "Lorem Ipsum"


class ImageCaptionBlock(blocks.StructBlock):
    """
    An image paired with a rich text caption.
    """

    image = ImageChooserBlock()
    # Captions are limited to the inline-only feature set (no lists or headings).
    caption = blocks.RichTextBlock(features=RICH_TEXT_FEATURES_PLAIN)

    class Meta:
        # Wagtail block options: editor icon/label, and the template used to
        # render the block.
        icon = "image"
        label = "Image with caption"
        template = "common/blocks/image-caption.html"


class TangentBlock(blocks.StructBlock):
    """
    A named aside, made up of a short name and rich text content.
    """

    # Short name for the tangent, capped at 64 characters.
    name = blocks.CharBlock(max_length=64)
    # Content may use inline formatting and lists, but not headings.
    content = blocks.RichTextBlock(features=RICH_TEXT_FEATURES_SIMPLE)

    class Meta:
        # Wagtail block options: editor icon/label, and the template used to
        # render the block.
        icon = "comment"
        label = "Tangent"
        template = "common/blocks/tangent.html"


# Block types whose output is left out when extracting the page's original
# content in `get_content_html`.
IGNORE_PLAINTEXT_BLOCKS = (blocks.RawHTMLBlock, EmbedBlock, ImageCaptionBlock)
# Block types whose headings never receive anchors in `add_heading_anchors`:
# everything ignored for plain text, plus generated lorem ipsum.
IGNORE_HEADING_BLOCKS = IGNORE_PLAINTEXT_BLOCKS + (LoremBlock,)


def get_blocks() -> list[tuple[str, blocks.Block]]:
    """
    Return the `(name, block)` pairs available to content stream fields.

    Fresh block instances are created on every call. The names are also used
    by `get_content_html` and `add_heading_anchors` to build the
    `block-<name>` CSS classes they match against.
    """
    # NOTE: the annotation uses `blocks.Block` (the base class of block
    # instances) rather than `blocks.BaseBlock`, which is the metaclass.
    return [
        ("embed", EmbedBlock()),
        ("rich_text", blocks.RichTextBlock(features=RICH_TEXT_FEATURES)),
        ("lorem", LoremBlock()),
        ("html", blocks.RawHTMLBlock()),
        ("image", ImageCaptionBlock()),
        ("code", CodeBlock()),
        ("tangent", TangentBlock()),
    ]


def get_content_html(html: str) -> str:
    """
    Return only the parts of `html` which are original content.

    Elements are kept when they carry a `block-<name>` class for a block type
    which is not one of `IGNORE_PLAINTEXT_BLOCKS` (eg embeds are dropped).
    """
    content_classes = [
        f"block-{name}"
        for name, block_type in get_blocks()
        if not isinstance(block_type, IGNORE_PLAINTEXT_BLOCKS)
    ]

    # Restrict parsing to the wanted elements rather than filtering afterwards.
    only_content = SoupStrainer(class_=content_classes)
    parsed = BeautifulSoup(html, "lxml", parse_only=only_content)
    return str(parsed)


def add_heading_anchors(html: str) -> str:
    """
    Prepend a "#" permalink anchor to headings inside heading-eligible blocks.

    Headings are matched with the selector `.block-<name> <tag>` for every tag
    in `HEADER_TAGS` and every block from `get_blocks()` which is not one of
    `IGNORE_HEADING_BLOCKS`. Each anchor's `href` and `id` are derived from the
    slugified heading text.

    Headings whose text slugifies to an empty string are left untouched, as
    they would otherwise receive an empty `id` and an `href` of just "#".

    NOTE: headings with identical text produce identical slugs, and therefore
    duplicate `id` attributes.

    Returns the serialised document after the anchors have been inserted.
    """
    block_names = [
        block_name
        for block_name, block in get_blocks()
        if not isinstance(block, IGNORE_HEADING_BLOCKS)
    ]
    targets: list[str] = [
        f".block-{block_name} {header_tag}"
        for header_tag, block_name in product(HEADER_TAGS, block_names)
    ]

    soup = BeautifulSoup(html, "lxml")
    for tag in soup.select(", ".join(targets)):
        # Slugify before inserting the anchor, so its "#" text isn't included.
        slug = slugify(tag.text)
        if not slug:
            # Nothing usable to link to (eg the heading has no slug-safe text).
            continue
        anchor = soup.new_tag("a", href="#" + slug, id=slug)
        anchor.string = "#"
        anchor.attrs["class"] = "heading-anchor"
        tag.insert(0, anchor)
    return str(soup)
Reduce complexity trying to save computation on rendering streamfield This replaces more custom iteration with caching, which will end up faster anyway, and is more drop-in with the new structure. Sadly it still renders the content twice, as `get_content_html` requires access to the blocks. 2022-07-03 23:10:57 +01:00			`from itertools import product`
Add basic word count and reading time 2022-06-26 19:25:30 +01:00
Optimise getting content HTML by only parsing the necessary tags 2022-07-04 18:55:18 +01:00			`from bs4 import BeautifulSoup, SoupStrainer`
Create basic streamfield 2022-06-26 18:37:04 +01:00			`from django.utils import lorem_ipsum`
Use bs4 to extract plain text from HTML It's slower, but it keeps spaces between tags, which is what we want. 2022-06-30 21:32:47 +01:00			`from django.utils.html import format_html_join`
Reduce complexity trying to save computation on rendering streamfield This replaces more custom iteration with caching, which will end up faster anyway, and is more drop-in with the new structure. Sadly it still renders the content twice, as `get_content_html` requires access to the blocks. 2022-07-03 23:10:57 +01:00			`from django.utils.text import slugify`
Create basic streamfield 2022-06-26 18:37:04 +01:00			`from wagtail import blocks`
			`from wagtail.embeds.blocks import EmbedBlock`
Add image figure block 2022-06-27 20:40:55 +01:00			`from wagtail.images.blocks import ImageChooserBlock`
Create basic streamfield 2022-06-26 18:37:04 +01:00
Reduce complexity trying to save computation on rendering streamfield This replaces more custom iteration with caching, which will end up faster anyway, and is more drop-in with the new structure. Sadly it still renders the content twice, as `get_content_html` requires access to the blocks. 2022-07-03 23:10:57 +01:00			`from website.common.utils import HEADER_TAGS`
Add code block 2022-06-27 23:29:55 +01:00			`from website.contrib.code_block.blocks import CodeBlock`

Setup features for richtext 2022-06-27 19:58:08 +01:00			`RICH_TEXT_FEATURES = [`
			`"h2",`
			`"h3",`
			`"h4",`
			`"h5",`
			`"h6",`
			`"bold",`
			`"italic",`
			`"ol",`
			`"ul",`
			`"link",`
			`"document-link",`
			`"code",`
			`"strikethrough",`
Add reusable referral links 2022-07-14 21:41:43 +01:00			`"snippet-link",`
			`"snippet-embed",`
Setup features for richtext 2022-06-27 19:58:08 +01:00			`]`

Add a tangent block 2022-07-05 09:03:45 +01:00			`RICH_TEXT_FEATURES_PLAIN = [`
			`"bold",`
			`"italic",`
			`"link",`
			`"document-link",`
			`"code",`
			`"strikethrough",`
			`]`

Add image figure block 2022-06-27 20:40:55 +01:00			`RICH_TEXT_FEATURES_SIMPLE = [`
			`"bold",`
			`"italic",`
Add a tangent block 2022-07-05 09:03:45 +01:00			`"ol",`
			`"ul",`
Add image figure block 2022-06-27 20:40:55 +01:00			`"link",`
			`"document-link",`
			`"code",`
			`"strikethrough",`
			`]`

Create basic streamfield 2022-06-26 18:37:04 +01:00
			`class LoremBlock(blocks.StructBlock):`
			`paragraphs = blocks.IntegerBlock(min_value=1)`

Add basic word count and reading time 2022-06-26 19:25:30 +01:00			`def render(self, value: dict, context: dict \| None = None) -> str:`
Create basic streamfield 2022-06-26 18:37:04 +01:00			`return format_html_join(`
			`"\n\n",`
			`"<p>{}</p>",`
			`[(paragraph,) for paragraph in lorem_ipsum.paragraphs(value["paragraphs"])],`
			`)`

			`class Meta:`
			`icon = "openquote"`
			`label = "Lorem Ipsum"`


Add image figure block 2022-06-27 20:40:55 +01:00			`class ImageCaptionBlock(blocks.StructBlock):`
			`image = ImageChooserBlock()`
Add a tangent block 2022-07-05 09:03:45 +01:00			`caption = blocks.RichTextBlock(features=RICH_TEXT_FEATURES_PLAIN)`
Add image figure block 2022-06-27 20:40:55 +01:00
			`class Meta:`
			`icon = "image"`
			`label = "Image with caption"`
			`template = "common/blocks/image-caption.html"`


Add a tangent block 2022-07-05 09:03:45 +01:00			`class TangentBlock(blocks.StructBlock):`
			`name = blocks.CharBlock(max_length=64)`
			`content = blocks.RichTextBlock(features=RICH_TEXT_FEATURES_SIMPLE)`

			`class Meta:`
			`icon = "comment"`
			`label = "Tangent"`
			`template = "common/blocks/tangent.html"`


Reduce complexity trying to save computation on rendering streamfield This replaces more custom iteration with caching, which will end up faster anyway, and is more drop-in with the new structure. Sadly it still renders the content twice, as `get_content_html` requires access to the blocks. 2022-07-03 23:10:57 +01:00			`IGNORE_PLAINTEXT_BLOCKS = (blocks.RawHTMLBlock, EmbedBlock, ImageCaptionBlock)`
			`IGNORE_HEADING_BLOCKS = (*IGNORE_PLAINTEXT_BLOCKS, LoremBlock)`


Create basic streamfield 2022-06-26 18:37:04 +01:00			`def get_blocks() -> list[tuple[str, blocks.BaseBlock]]:`
			`return [`
			`("embed", EmbedBlock()),`
Setup features for richtext 2022-06-27 19:58:08 +01:00			`("rich_text", blocks.RichTextBlock(features=RICH_TEXT_FEATURES)),`
Create basic streamfield 2022-06-26 18:37:04 +01:00			`("lorem", LoremBlock()),`
			`("html", blocks.RawHTMLBlock()),`
Add image figure block 2022-06-27 20:40:55 +01:00			`("image", ImageCaptionBlock()),`
Add code block 2022-06-27 23:29:55 +01:00			`("code", CodeBlock()),`
Add a tangent block 2022-07-05 09:03:45 +01:00			`("tangent", TangentBlock()),`
Create basic streamfield 2022-06-26 18:37:04 +01:00			`]`
Add basic word count and reading time 2022-06-26 19:25:30 +01:00

Optimise getting content HTML by only parsing the necessary tags 2022-07-04 18:55:18 +01:00			`def get_content_html(html: str) -> str:`
Reduce complexity trying to save computation on rendering streamfield This replaces more custom iteration with caching, which will end up faster anyway, and is more drop-in with the new structure. Sadly it still renders the content twice, as `get_content_html` requires access to the blocks. 2022-07-03 23:10:57 +01:00			`"""`
			`Get the HTML of just original content (eg not embeds etc)`
			`"""`
Optimise getting content HTML by only parsing the necessary tags 2022-07-04 18:55:18 +01:00			`block_classes = [`
			`f"block-{block_name}"`
			`for block_name, block in get_blocks()`
			`if not isinstance(block, IGNORE_PLAINTEXT_BLOCKS)`
			`]`

			`return str(`
			`BeautifulSoup(html, "lxml", parse_only=SoupStrainer(class_=block_classes))`
			`)`
Reduce complexity trying to save computation on rendering streamfield This replaces more custom iteration with caching, which will end up faster anyway, and is more drop-in with the new structure. Sadly it still renders the content twice, as `get_content_html` requires access to the blocks. 2022-07-03 23:10:57 +01:00

			`def add_heading_anchors(html: str) -> str:`
			`targets: list[str] = [`
			`f".block-{block_name} {header_tag}"`
			`for header_tag, block_name in product(`
			`HEADER_TAGS,`
			`[b[0] for b in get_blocks() if not isinstance(b[1], IGNORE_HEADING_BLOCKS)],`
			`)`
			`]`
Add summary to content 2022-06-26 19:52:20 +01:00
Reduce complexity trying to save computation on rendering streamfield This replaces more custom iteration with caching, which will end up faster anyway, and is more drop-in with the new structure. Sadly it still renders the content twice, as `get_content_html` requires access to the blocks. 2022-07-03 23:10:57 +01:00			`soup = BeautifulSoup(html, "lxml")`
			`for tag in soup.select(", ".join(targets)):`
			`slug = slugify(tag.text)`
			`anchor = soup.new_tag("a", href="#" + slug, id=slug)`
			`anchor.string = "#"`
			`anchor.attrs["class"] = "heading-anchor"`
			`tag.insert(0, anchor)`
			`return str(soup)`