Patch summary (text before the first "diff --git" line is ignored by git apply):
  * website/common/utils.py: extract_text() now strips spaces and newlines
    from every text node and drops the empty ones before joining the rest
    with single spaces.
  * website/common/tests.py: imports extract_text and adds
    ExtractTextTestCase, covering HTML input and plain-text input.
NOTE(review): the line structure of this patch was restored from the hunk
line counts. The <p>...</p> markup inside the test_extracts_text literals had
been stripped and is reconstructed here -- confirm against the original
commit (blob 7bd88b3) before applying.

diff --git a/website/common/tests.py b/website/common/tests.py
index d9d73fa..7bd88b3 100644
--- a/website/common/tests.py
+++ b/website/common/tests.py
@@ -2,7 +2,7 @@ from django.test import SimpleTestCase
 
 from .embed import YouTubeLiteEmbedFinder
 from .models import BasePage
-from .utils import get_page_models, get_table_of_contents
+from .utils import extract_text, get_page_models, get_table_of_contents
 
 
 class BasePageTestCase(SimpleTestCase):
@@ -92,3 +92,15 @@ class TableOfContentsTestCase(SimpleTestCase):
         sub_entry = first_entry.children[1]
         self.assertEqual(len(sub_entry.children), 1)
         self.assertEqual([entry.title for entry in sub_entry.children], ["2.2.1"])
+
+
+class ExtractTextTestCase(SimpleTestCase):
+    def test_extracts_text(self) -> None:
+        self.assertEqual(extract_text("<p>Hello there!</p>"), "Hello there!")
+        self.assertEqual(
+            extract_text("<p>Paragraph 1</p>\n<p>Paragraph 2</p>"),
+            "Paragraph 1 Paragraph 2",
+        )
+
+    def test_plain_text(self) -> None:
+        self.assertEqual(extract_text("Hello there!"), "Hello there!")
diff --git a/website/common/utils.py b/website/common/utils.py
index 1d57fa0..abfcde8 100644
--- a/website/common/utils.py
+++ b/website/common/utils.py
@@ -78,7 +78,10 @@ def extract_text(html: str) -> str:
     """
     Get the plain text of some HTML.
     """
-    return " ".join(BeautifulSoup(html, "lxml").find_all(text=True))
+    lines = (
+        text.strip(" \n") for text in BeautifulSoup(html, "lxml").find_all(text=True)
+    )
+    return " ".join(line for line in lines if line)
 
 
 def truncate_string(text: str, words: int) -> str: