Use bs4 to extract plain text from HTML
It's slower, but it keeps spaces between tags, which is what we want.
This commit is contained in:
parent
5a90a9963f
commit
8a7dba4ca0
2 changed files with 9 additions and 2 deletions
|
@@ -3,3 +3,5 @@ wagtail==3.0
|
||||||
django-environ==0.8.1
|
django-environ==0.8.1
|
||||||
whitenoise[brotli]==6.2.0
|
whitenoise[brotli]==6.2.0
|
||||||
pygments==2.12.0
|
pygments==2.12.0
|
||||||
|
beautifulsoup4==4.9.3
|
||||||
|
lxml==4.9.0
|
||||||
|
|
|
@@ -1,7 +1,8 @@
|
||||||
from typing import Iterator
|
from typing import Iterator
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
from django.utils import lorem_ipsum
|
from django.utils import lorem_ipsum
|
||||||
from django.utils.html import format_html_join, strip_tags
|
from django.utils.html import format_html_join
|
||||||
from django.utils.text import smart_split
|
from django.utils.text import smart_split
|
||||||
from wagtail import blocks
|
from wagtail import blocks
|
||||||
from wagtail.embeds.blocks import EmbedBlock
|
from wagtail.embeds.blocks import EmbedBlock
|
||||||
|
@@ -74,11 +75,15 @@ def get_blocks() -> list[tuple[str, blocks.BaseBlock]]:
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def extract_text(html: str) -> str:
|
||||||
|
return " ".join(BeautifulSoup(html, "lxml").findAll(text=True))
|
||||||
|
|
||||||
|
|
||||||
def get_plain_text(value: blocks.StreamValue) -> Iterator[str]:
|
def get_plain_text(value: blocks.StreamValue) -> Iterator[str]:
|
||||||
for block in value:
|
for block in value:
|
||||||
if isinstance(block.block_type, IGNORE_PLAINTEXT_BLOCKS):
|
if isinstance(block.block_type, IGNORE_PLAINTEXT_BLOCKS):
|
||||||
continue
|
continue
|
||||||
yield strip_tags(str(block))
|
yield extract_text(str(block))
|
||||||
|
|
||||||
|
|
||||||
def truncate_streamfield(value: blocks.StreamValue, words: int) -> str:
|
def truncate_streamfield(value: blocks.StreamValue, words: int) -> str:
|
||||||
|
|
Loading…
Reference in a new issue