From 0971fcd8a33580ec84d9d30dfb10c7ce3da9111d Mon Sep 17 00:00:00 2001 From: Jake Howard Date: Fri, 21 Oct 2022 17:46:13 +0100 Subject: [PATCH] Account for spaces after tags when extracting text --- website/common/tests/test_utils.py | 4 ++++ website/common/utils.py | 5 ++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/website/common/tests/test_utils.py b/website/common/tests/test_utils.py index 6e67e98..5f92a38 100644 --- a/website/common/tests/test_utils.py +++ b/website/common/tests/test_utils.py @@ -101,6 +101,10 @@ class ExtractTextTestCase(SimpleTestCase): extract_text("

Paragraph 1

\n

Paragraph 2

"), "Paragraph 1 Paragraph 2", ) + self.assertEqual( + extract_text("New stuff."), + "New stuff.", + ) def test_plain_text(self) -> None: self.assertEqual(extract_text("Hello there!"), "Hello there!") diff --git a/website/common/utils.py b/website/common/utils.py index 06a9e38..4ea5a3c 100644 --- a/website/common/utils.py +++ b/website/common/utils.py @@ -76,10 +76,9 @@ def extract_text(html: str) -> str: """ Get the plain text of some HTML. """ - lines = ( - text.strip(" \n") for text in BeautifulSoup(html, "lxml").find_all(text=True) + return ( + BeautifulSoup(html.replace(" str: