Account for spaces after tags when extracting text

This commit is contained in:
Jake Howard 2022-10-21 17:46:13 +01:00
parent c8f01870d4
commit 0971fcd8a3
Signed by: jake
GPG key ID: 57AFB45680EDD477
2 changed files with 6 additions and 3 deletions

View file

@@ -101,6 +101,10 @@ class ExtractTextTestCase(SimpleTestCase):
             extract_text("<p>Paragraph 1</p>\n<p>Paragraph 2</p>"),
             "Paragraph 1 Paragraph 2",
         )
+        self.assertEqual(
+            extract_text("New <a>stuff</a>."),
+            "New stuff.",
+        )
 
     def test_plain_text(self) -> None:
         self.assertEqual(extract_text("Hello there!"), "Hello there!")

View file

@@ -76,10 +76,9 @@ def extract_text(html: str) -> str:
     """
     Get the plain text of some HTML.
     """
-    lines = (
-        text.strip(" \n") for text in BeautifulSoup(html, "lxml").find_all(text=True)
+    return (
+        BeautifulSoup(html.replace("<p", " <p"), "lxml").get_text().replace("\n", " ")
     )
-    return " ".join(line for line in lines if line)
 
 
 def truncate_string(text: str, words: int) -> str: