From 60e4913e432b748385b2c21b2a6faa340077f731 Mon Sep 17 00:00:00 2001
From: Jake Howard
Date: Mon, 4 Jul 2022 09:21:12 +0100
Subject: [PATCH] Add basic tests for extract_text

Cover extract_text with an HTML input, a multi-paragraph HTML input and a
plain-text input.

extract_text now strips spaces and newlines from every text node and drops
the nodes that end up empty before joining with a single space, so the
newline between two paragraphs no longer leaks into the result as extra
whitespace.

---
 website/common/tests.py | 14 +++++++++++++-
 website/common/utils.py |  5 ++++-
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/website/common/tests.py b/website/common/tests.py
index d9d73fa..7bd88b3 100644
--- a/website/common/tests.py
+++ b/website/common/tests.py
@@ -2,7 +2,7 @@ from django.test import SimpleTestCase
 
 from .embed import YouTubeLiteEmbedFinder
 from .models import BasePage
-from .utils import get_page_models, get_table_of_contents
+from .utils import extract_text, get_page_models, get_table_of_contents
 
 
 class BasePageTestCase(SimpleTestCase):
@@ -92,3 +92,15 @@ class TableOfContentsTestCase(SimpleTestCase):
         sub_entry = first_entry.children[1]
         self.assertEqual(len(sub_entry.children), 1)
         self.assertEqual([entry.title for entry in sub_entry.children], ["2.2.1"])
+
+
+class ExtractTextTestCase(SimpleTestCase):
+    def test_extracts_text(self) -> None:
+        self.assertEqual(extract_text("<p>Hello there!</p>"), "Hello there!")
+        self.assertEqual(
+            extract_text("<p>Paragraph 1</p>\n<p>Paragraph 2</p>"),
+            "Paragraph 1 Paragraph 2",
+        )
+
+    def test_plain_text(self) -> None:
+        self.assertEqual(extract_text("Hello there!"), "Hello there!")
diff --git a/website/common/utils.py b/website/common/utils.py
index 1d57fa0..abfcde8 100644
--- a/website/common/utils.py
+++ b/website/common/utils.py
@@ -78,7 +78,10 @@ def extract_text(html: str) -> str:
     """
     Get the plain text of some HTML.
     """
-    return " ".join(BeautifulSoup(html, "lxml").find_all(text=True))
+    lines = (
+        text.strip(" \n") for text in BeautifulSoup(html, "lxml").find_all(text=True)
+    )
+    return " ".join(line for line in lines if line)
 
 
 def truncate_string(text: str, words: int) -> str: