Use bs4 to extract plain text from HTML

It's slower, but it keeps spaces between tags, which is what we want.
2022-06-30 21:32:47 +01:00 · 2022-06-30 21:32:47 +01:00 · 8a7dba4ca0
commit 8a7dba4ca0
parent 5a90a9963f
2 changed files with 9 additions and 2 deletions
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,3 +3,5 @@ wagtail==3.0
 django-environ==0.8.1
 whitenoise[brotli]==6.2.0
 pygments==2.12.0
+beautifulsoup4==4.9.3
+lxml==4.9.0
--- a/website/common/streamfield.py
+++ b/website/common/streamfield.py
@@ -1,7 +1,8 @@
 from typing import Iterator

+from bs4 import BeautifulSoup
 from django.utils import lorem_ipsum
-from django.utils.html import format_html_join, strip_tags
+from django.utils.html import format_html_join
 from django.utils.text import smart_split
 from wagtail import blocks
 from wagtail.embeds.blocks import EmbedBlock
@@ -74,11 +75,15 @@ def get_blocks() -> list[tuple[str, blocks.BaseBlock]]:
    ]


+def extract_text(html: str) -> str:
+    return " ".join(BeautifulSoup(html, "lxml").findAll(text=True))
+
+
 def get_plain_text(value: blocks.StreamValue) -> Iterator[str]:
    for block in value:
        if isinstance(block.block_type, IGNORE_PLAINTEXT_BLOCKS):
            continue
-        yield strip_tags(str(block))
+        yield extract_text(str(block))


 def truncate_streamfield(value: blocks.StreamValue, words: int) -> str: