website/website/common/streamfield.py

from itertools import product

from bs4 import BeautifulSoup, SoupStrainer
from django.utils import lorem_ipsum
from django.utils.html import format_html_join
from wagtail import blocks
from wagtail.contrib.typed_table_block.blocks import TypedTableBlock
from wagtail.embeds.blocks import EmbedBlock
from wagtail.images.blocks import ImageChooserBlock

from website.common.utils import HEADER_TAGS, heading_id
from website.contrib.code_block.blocks import CodeBlock
from website.contrib.mermaid_block.blocks import MermaidBlock


class LoremBlock(blocks.StructBlock):
    """Placeholder block that renders a chosen number of lorem ipsum paragraphs."""

    # How many paragraphs to generate; must be at least 1.
    paragraphs = blocks.IntegerBlock(min_value=1)

    def render(self, value: dict, context: dict | None = None) -> str:
        """
        Return ``value["paragraphs"]`` generated paragraphs, each wrapped in ``<p>``.

        ``context`` is accepted but not used by this implementation.
        """
        generated = lorem_ipsum.paragraphs(value["paragraphs"])
        # format_html_join expects one tuple of format arguments per item.
        format_args = ((text,) for text in generated)
        return format_html_join("\n\n", "<p>{}</p>", format_args)

    class Meta:
        icon = "openquote"
        label = "Lorem Ipsum"


class ImageCaptionBlock(blocks.StructBlock):
    """An image with an optional rich text caption."""

    # The image, selected via an ImageChooserBlock.
    image = ImageChooserBlock()
    # Optional caption; uses the "plain" rich text editor configuration.
    caption = blocks.RichTextBlock(editor="plain", required=False)

    class Meta:
        icon = "image"
        label = "Image with caption"
        # Template used to render this block.
        template = "common/blocks/image-caption.html"


class TangentBlock(blocks.StructBlock):
    """A named aside ("tangent") with rich text content."""

    # Name of the tangent, limited to 64 characters.
    name = blocks.CharBlock(max_length=64)
    # Body of the tangent; uses the "simple" rich text editor configuration.
    content = blocks.RichTextBlock(editor="simple")

    class Meta:
        icon = "comment"
        label = "Tangent"
        # Template used to render this block.
        template = "common/blocks/tangent.html"


class IFrameBlock(blocks.StructBlock):
    """A URL to show in an iframe, with an optional rich text caption."""

    # Address of the page to embed.
    url = blocks.URLBlock()
    # Optional caption; uses the "plain" rich text editor configuration.
    caption = blocks.RichTextBlock(editor="plain", required=False)

    class Meta:
        icon = "link-external"
        label = "IFrame"
        # Template used to render this block.
        template = "common/blocks/iframe.html"


# Block types that get_content_html() leaves out when collecting the HTML of
# "original content" (embeds, raw HTML, images, code, diagrams, iframes).
IGNORE_PLAINTEXT_BLOCKS = (
    blocks.RawHTMLBlock,
    EmbedBlock,
    ImageCaptionBlock,
    CodeBlock,
    MermaidBlock,
    IFrameBlock,
)
# Block types whose headings add_heading_anchors() does not touch: everything
# ignored for plaintext, plus LoremBlock.
IGNORE_HEADING_BLOCKS = (*IGNORE_PLAINTEXT_BLOCKS, LoremBlock)


def get_blocks() -> list[tuple[str, blocks.BaseBlock]]:
    """
    Build the ``(name, block)`` pairs available to the streamfield.

    A fresh set of block instances is created on every call. The order of the
    returned list is the order in which the blocks are declared here.
    """
    stream_blocks = [
        ("embed", EmbedBlock()),
        ("rich_text", blocks.RichTextBlock()),
        ("lorem", LoremBlock()),
        ("html", blocks.RawHTMLBlock()),
        ("image", ImageCaptionBlock()),
        ("code", CodeBlock()),
        ("tangent", TangentBlock()),
        ("mermaid", MermaidBlock()),
    ]

    # Cell types offered by the typed table block.
    table_cell_types = [
        ("rich_text", blocks.RichTextBlock(editor="plain")),
        ("numeric", blocks.FloatBlock()),
        ("text", blocks.CharBlock()),
    ]
    stream_blocks.append(("table", TypedTableBlock(table_cell_types)))

    stream_blocks.append(("iframe", IFrameBlock()))
    return stream_blocks


def get_content_html(html: str) -> str:
    """
    Get the HTML of just original content (eg not embeds etc)

    Only elements carrying a ``block-<name>`` class are parsed, where ``<name>``
    is any block from get_blocks() that is not one of IGNORE_PLAINTEXT_BLOCKS.
    NOTE(review): presumably the rendered streamfield wraps each block in an
    element with that class - confirm against the rendering templates.
    """
    content_classes = []
    for name, block in get_blocks():
        if isinstance(block, IGNORE_PLAINTEXT_BLOCKS):
            continue
        content_classes.append(f"block-{name}")

    # Restrict parsing to the wanted blocks instead of the whole document.
    only_content = SoupStrainer(class_=content_classes)
    parsed = BeautifulSoup(html, "lxml", parse_only=only_content)
    return str(parsed)


def add_heading_anchors(html: str) -> str:
    """
    Prepend a "#" link anchor to every heading inside eligible blocks.

    Headings are matched with the selector ``.block-<name> <tag>`` for each tag
    in HEADER_TAGS and each block from get_blocks() that is not one of
    IGNORE_HEADING_BLOCKS. The anchor's ``id`` and the fragment of its ``href``
    both come from heading_id() applied to the heading's text. Returns the
    serialised document as parsed by the "lxml" parser.
    """
    anchorable_blocks = [
        name
        for name, block in get_blocks()
        if not isinstance(block, IGNORE_HEADING_BLOCKS)
    ]
    selectors = [
        f".block-{name} {header_tag}"
        for header_tag in HEADER_TAGS
        for name in anchorable_blocks
    ]
    heading_selector = ", ".join(selectors)

    document = BeautifulSoup(html, "lxml")
    for heading in document.select(heading_selector):
        # Read the text before inserting the anchor so "#" is not part of the slug.
        fragment = heading_id(heading.text)
        link = document.new_tag("a", href="#" + fragment, id=fragment)
        link.string = "#"
        link["class"] = "heading-anchor"
        heading.insert(0, link)
    return str(document)
Reduce complexity trying to save computation on rendering streamfield This replaces more custom iteration with caching, which will end up faster anyway, and is more drop-in with the new structure. Sadly it still renders the content twice, as `get_content_html` requires access to the blocks. 2022-07-03 23:10:57 +01:00			`from itertools import product`
Add basic word count and reading time 2022-06-26 19:25:30 +01:00
Optimise getting content HTML by only parsing the necessary tags 2022-07-04 18:55:18 +01:00			`from bs4 import BeautifulSoup, SoupStrainer`
Create basic streamfield 2022-06-26 18:37:04 +01:00			`from django.utils import lorem_ipsum`
Use bs4 to extract plain text from HTML It's slower, but it keeps spaces between tags, which is what we want. 2022-06-30 21:32:47 +01:00			`from django.utils.html import format_html_join`
Create basic streamfield 2022-06-26 18:37:04 +01:00			`from wagtail import blocks`
Add typed table block 2022-08-19 13:48:45 +01:00			`from wagtail.contrib.typed_table_block.blocks import TypedTableBlock`
Create basic streamfield 2022-06-26 18:37:04 +01:00			`from wagtail.embeds.blocks import EmbedBlock`
Add image figure block 2022-06-27 20:40:55 +01:00			`from wagtail.images.blocks import ImageChooserBlock`
Create basic streamfield 2022-06-26 18:37:04 +01:00
Ensure heading ids are always valid ids 2022-09-23 15:35:32 +01:00			`from website.common.utils import HEADER_TAGS, heading_id`
Add code block 2022-06-27 23:29:55 +01:00			`from website.contrib.code_block.blocks import CodeBlock`
Add mermaid embed 2022-07-15 09:56:22 +01:00			`from website.contrib.mermaid_block.blocks import MermaidBlock`
Add code block 2022-06-27 23:29:55 +01:00
Create basic streamfield 2022-06-26 18:37:04 +01:00
			`class LoremBlock(blocks.StructBlock):`
			`paragraphs = blocks.IntegerBlock(min_value=1)`

Add basic word count and reading time 2022-06-26 19:25:30 +01:00			`def render(self, value: dict, context: dict | None = None) -> str:`
Create basic streamfield 2022-06-26 18:37:04 +01:00			`return format_html_join(`
			`"\n\n",`
			`"<p>{}</p>",`
			`[(paragraph,) for paragraph in lorem_ipsum.paragraphs(value["paragraphs"])],`
			`)`

			`class Meta:`
			`icon = "openquote"`
			`label = "Lorem Ipsum"`


Add image figure block 2022-06-27 20:40:55 +01:00			`class ImageCaptionBlock(blocks.StructBlock):`
			`image = ImageChooserBlock()`
Define rich text features in settings This removes them from migrations, avoiding the need to create new ones when they change 2022-09-03 21:07:27 +01:00			`caption = blocks.RichTextBlock(editor="plain", required=False)`
Add image figure block 2022-06-27 20:40:55 +01:00
			`class Meta:`
			`icon = "image"`
			`label = "Image with caption"`
			`template = "common/blocks/image-caption.html"`


Add a tangent block 2022-07-05 09:03:45 +01:00			`class TangentBlock(blocks.StructBlock):`
			`name = blocks.CharBlock(max_length=64)`
Define rich text features in settings This removes them from migrations, avoiding the need to create new ones when they change 2022-09-03 21:07:27 +01:00			`content = blocks.RichTextBlock(editor="simple")`
Add a tangent block 2022-07-05 09:03:45 +01:00
			`class Meta:`
			`icon = "comment"`
			`label = "Tangent"`
			`template = "common/blocks/tangent.html"`


Add iframe block 2022-09-08 14:31:01 +01:00			`class IFrameBlock(blocks.StructBlock):`
			`url = blocks.URLBlock()`
			`caption = blocks.RichTextBlock(editor="plain", required=False)`

			`class Meta:`
			`icon = "link-external"`
			`label = "IFrame"`
			`template = "common/blocks/iframe.html"`


Don't show code in plaintext 2022-07-29 09:09:35 +01:00			`IGNORE_PLAINTEXT_BLOCKS = (`
			`blocks.RawHTMLBlock,`
			`EmbedBlock,`
			`ImageCaptionBlock,`
			`CodeBlock,`
Ignore more blocks when getting plaintext 2022-09-25 21:54:02 +01:00			`MermaidBlock,`
			`IFrameBlock,`
Don't show code in plaintext 2022-07-29 09:09:35 +01:00			`)`
Reduce complexity trying to save computation on rendering streamfield This replaces more custom iteration with caching, which will end up faster anyway, and is more drop-in with the new structure. Sadly it still renders the content twice, as `get_content_html` requires access to the blocks. 2022-07-03 23:10:57 +01:00			`IGNORE_HEADING_BLOCKS = (*IGNORE_PLAINTEXT_BLOCKS, LoremBlock)`


Create basic streamfield 2022-06-26 18:37:04 +01:00			`def get_blocks() -> list[tuple[str, blocks.BaseBlock]]:`
			`return [`
			`("embed", EmbedBlock()),`
Define rich text features in settings This removes them from migrations, avoiding the need to create new ones when they change 2022-09-03 21:07:27 +01:00			`("rich_text", blocks.RichTextBlock()),`
Create basic streamfield 2022-06-26 18:37:04 +01:00			`("lorem", LoremBlock()),`
			`("html", blocks.RawHTMLBlock()),`
Add image figure block 2022-06-27 20:40:55 +01:00			`("image", ImageCaptionBlock()),`
Add code block 2022-06-27 23:29:55 +01:00			`("code", CodeBlock()),`
Add a tangent block 2022-07-05 09:03:45 +01:00			`("tangent", TangentBlock()),`
Add mermaid embed 2022-07-15 09:56:22 +01:00			`("mermaid", MermaidBlock()),`
Add typed table block 2022-08-19 13:48:45 +01:00			`(`
			`"table",`
			`TypedTableBlock(`
			`[`
			`(`
			`"rich_text",`
Define rich text features in settings This removes them from migrations, avoiding the need to create new ones when they change 2022-09-03 21:07:27 +01:00			`blocks.RichTextBlock(editor="plain"),`
Add typed table block 2022-08-19 13:48:45 +01:00			`),`
			`("numeric", blocks.FloatBlock()),`
			`("text", blocks.CharBlock()),`
			`]`
			`),`
			`),`
Add iframe block 2022-09-08 14:31:01 +01:00			`("iframe", IFrameBlock()),`
Create basic streamfield 2022-06-26 18:37:04 +01:00			`]`
Add basic word count and reading time 2022-06-26 19:25:30 +01:00

Optimise getting content HTML by only parsing the necessary tags 2022-07-04 18:55:18 +01:00			`def get_content_html(html: str) -> str:`
Reduce complexity trying to save computation on rendering streamfield This replaces more custom iteration with caching, which will end up faster anyway, and is more drop-in with the new structure. Sadly it still renders the content twice, as `get_content_html` requires access to the blocks. 2022-07-03 23:10:57 +01:00			`"""`
			`Get the HTML of just original content (eg not embeds etc)`
			`"""`
Optimise getting content HTML by only parsing the necessary tags 2022-07-04 18:55:18 +01:00			`block_classes = [`
			`f"block-{block_name}"`
			`for block_name, block in get_blocks()`
			`if not isinstance(block, IGNORE_PLAINTEXT_BLOCKS)`
			`]`

			`return str(`
			`BeautifulSoup(html, "lxml", parse_only=SoupStrainer(class_=block_classes))`
			`)`
Reduce complexity trying to save computation on rendering streamfield This replaces more custom iteration with caching, which will end up faster anyway, and is more drop-in with the new structure. Sadly it still renders the content twice, as `get_content_html` requires access to the blocks. 2022-07-03 23:10:57 +01:00

			`def add_heading_anchors(html: str) -> str:`
			`targets: list[str] = [`
			`f".block-{block_name} {header_tag}"`
			`for header_tag, block_name in product(`
			`HEADER_TAGS,`
			`[b[0] for b in get_blocks() if not isinstance(b[1], IGNORE_HEADING_BLOCKS)],`
			`)`
			`]`
Add summary to content 2022-06-26 19:52:20 +01:00
Reduce complexity trying to save computation on rendering streamfield This replaces more custom iteration with caching, which will end up faster anyway, and is more drop-in with the new structure. Sadly it still renders the content twice, as `get_content_html` requires access to the blocks. 2022-07-03 23:10:57 +01:00			`soup = BeautifulSoup(html, "lxml")`
			`for tag in soup.select(", ".join(targets)):`
Ensure heading ids are always valid ids 2022-09-23 15:35:32 +01:00			`slug = heading_id(tag.text)`
Reduce complexity trying to save computation on rendering streamfield This replaces more custom iteration with caching, which will end up faster anyway, and is more drop-in with the new structure. Sadly it still renders the content twice, as `get_content_html` requires access to the blocks. 2022-07-03 23:10:57 +01:00			`anchor = soup.new_tag("a", href="#" + slug, id=slug)`
			`anchor.string = "#"`
			`anchor.attrs["class"] = "heading-anchor"`
			`tag.insert(0, anchor)`
			`return str(soup)`