Use bs4 to extract plain text from HTML
It's slower, but it keeps spaces between tags, which is what we want.
This commit is contained in:
parent
5a90a9963f
commit
8a7dba4ca0
2 changed files with 9 additions and 2 deletions
|
@ -3,3 +3,5 @@ wagtail==3.0
|
|||
django-environ==0.8.1
|
||||
whitenoise[brotli]==6.2.0
|
||||
pygments==2.12.0
|
||||
beautifulsoup4==4.9.3
|
||||
lxml==4.9.0
|
||||
|
|
|
@ -1,7 +1,8 @@
|
|||
from typing import Iterator
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from django.utils import lorem_ipsum
|
||||
from django.utils.html import format_html_join, strip_tags
|
||||
from django.utils.html import format_html_join
|
||||
from django.utils.text import smart_split
|
||||
from wagtail import blocks
|
||||
from wagtail.embeds.blocks import EmbedBlock
|
||||
|
@ -74,11 +75,15 @@ def get_blocks() -> list[tuple[str, blocks.BaseBlock]]:
|
|||
]
|
||||
|
||||
|
||||
def extract_text(html: str) -> str:
    """Return the human-visible text of *html*, with text nodes joined by spaces.

    The markup is parsed with the ``lxml`` tree builder and every text node
    is joined with a single space, so words in adjacent tags do not run
    together (e.g. ``<p>a</p><p>b</p>`` gives ``"a b"``, not ``"ab"``).

    ``get_text`` is used instead of ``findAll(text=True)``: the latter is the
    legacy spelling and matches every string node, so the contents of HTML
    comments and doctype declarations would leak into the extracted text.
    """
    return BeautifulSoup(html, "lxml").get_text(separator=" ")
|
||||
|
||||
|
||||
def get_plain_text(value: blocks.StreamValue) -> Iterator[str]:
    """Yield the plain text of each block in *value*, one string per block.

    Blocks matched by ``IGNORE_PLAINTEXT_BLOCKS`` are skipped. Every other
    block is rendered with ``str()`` and reduced to text by ``extract_text``,
    which keeps a space between the contents of adjacent tags.
    """
    for block in value:
        # NOTE(review): in Wagtail, a stream child's ``block_type`` appears to
        # be the block's *name* (a str); if IGNORE_PLAINTEXT_BLOCKS holds block
        # classes this check never matches and ``block.block`` may be what was
        # intended -- confirm against the constant's definition.
        if isinstance(block.block_type, IGNORE_PLAINTEXT_BLOCKS):
            continue
        yield extract_text(str(block))
|
||||
|
||||
|
||||
def truncate_streamfield(value: blocks.StreamValue, words: int) -> str:
|
||||
|
|
Loading…
Reference in a new issue