From ccb481726cd28448442f159dee2ee0c2bd8a04a1 Mon Sep 17 00:00:00 2001
From: Jake Howard
Date: Mon, 4 Jul 2022 18:55:18 +0100
Subject: [PATCH] Optimise getting content HTML by only parsing the necessary tags

---
 website/common/models.py      |  2 +-
 website/common/streamfield.py | 24 +++++++++++-------------
 2 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/website/common/models.py b/website/common/models.py
index 9c17039..a4ec578 100644
--- a/website/common/models.py
+++ b/website/common/models.py
@@ -75,7 +75,7 @@ class BaseContentMixin(models.Model):
 
     @cached_property
     def content_html(self) -> str:
-        return get_content_html(self.body)
+        return get_content_html(self.body_html)
 
     @cached_property
     def plain_text(self) -> str:
diff --git a/website/common/streamfield.py b/website/common/streamfield.py
index 4b5ab2d..646ddd3 100644
--- a/website/common/streamfield.py
+++ b/website/common/streamfield.py
@@ -1,7 +1,6 @@
 from itertools import product
-from typing import Iterable
 
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, SoupStrainer
 from django.utils import lorem_ipsum
 from django.utils.html import format_html_join
 from django.utils.text import slugify
@@ -78,20 +77,19 @@ def get_blocks() -> list[tuple[str, blocks.BaseBlock]]:
     ]
 
 
-def get_content_blocks(value: blocks.StreamValue) -> Iterable[blocks.BaseBlock]:
-    for block in value:
-        if not isinstance(block.block_type, IGNORE_PLAINTEXT_BLOCKS):
-            yield block
-
-
-def get_content_html(value: blocks.StreamValue) -> str:
+def get_content_html(html: str) -> str:
     """
     Get the HTML of just original content (eg not embeds etc)
     """
-    html = ""
-    for block in get_content_blocks(value):
-        html += str(block)
-    return html
+    block_classes = [
+        f"block-{block_name}"
+        for block_name, block in get_blocks()
+        if not isinstance(block, IGNORE_PLAINTEXT_BLOCKS)
+    ]
+
+    return str(
+        BeautifulSoup(html, "lxml", parse_only=SoupStrainer(class_=block_classes))
+    )
 
 
 def add_heading_anchors(html: str) -> str: