Use bs4 to extract plain text from HTML

It's slower, but it keeps spaces between tags, which is what we want.
This commit is contained in:
Jake Howard 2022-06-30 21:32:47 +01:00
parent 5a90a9963f
commit 8a7dba4ca0
Signed by: jake
GPG key ID: 57AFB45680EDD477
2 changed files with 9 additions and 2 deletions

View file

@ -3,3 +3,5 @@ wagtail==3.0
django-environ==0.8.1
whitenoise[brotli]==6.2.0
pygments==2.12.0
beautifulsoup4==4.9.3
lxml==4.9.0

View file

@ -1,7 +1,8 @@
from typing import Iterator
from bs4 import BeautifulSoup
from django.utils import lorem_ipsum
from django.utils.html import format_html_join, strip_tags
from django.utils.html import format_html_join
from django.utils.text import smart_split
from wagtail import blocks
from wagtail.embeds.blocks import EmbedBlock
@ -74,11 +75,15 @@ def get_blocks() -> list[tuple[str, blocks.BaseBlock]]:
]
def extract_text(html: str) -> str:
    """Return the text content of *html* with the markup removed.

    The markup is parsed with BeautifulSoup's ``lxml`` parser and every
    string node in the resulting tree is joined with a single space, so
    text from adjacent tags does not run together.
    """
    # ``find_all(string=True)`` is the current spelling of the legacy
    # ``findAll(text=True)`` call and matches the same string nodes.
    return " ".join(BeautifulSoup(html, "lxml").find_all(string=True))
def get_plain_text(value: blocks.StreamValue) -> Iterator[str]:
    """Yield the plain text of each block in *value*, one string per block.

    Blocks whose ``block_type`` is an instance of ``IGNORE_PLAINTEXT_BLOCKS``
    are skipped. Every other block is rendered with ``str()`` and passed
    through ``extract_text`` to drop the markup.
    """
    for block in value:
        # NOTE(review): in Wagtail, a stream child's ``block_type`` appears to
        # be the block's name (a str); if IGNORE_PLAINTEXT_BLOCKS holds block
        # classes this check would never match -- confirm whether
        # ``block.block`` was intended.
        if isinstance(block.block_type, IGNORE_PLAINTEXT_BLOCKS):
            continue
        # Only the bs4-based extraction is yielded; also yielding the old
        # ``strip_tags`` result would emit every block's text twice.
        yield extract_text(str(block))
def truncate_streamfield(value: blocks.StreamValue, words: int) -> str: