From 60e4913e432b748385b2c21b2a6faa340077f731 Mon Sep 17 00:00:00 2001
From: Jake Howard <git@theorangeone.net>
Date: Mon, 4 Jul 2022 09:21:12 +0100
Subject: [PATCH] Add basic tests for extract_text

---
 website/common/tests.py | 14 +++++++++++++-
 website/common/utils.py |  5 ++++-
 2 files changed, 17 insertions(+), 2 deletions(-)
diff --git a/website/common/tests.py b/website/common/tests.py
index d9d73fa..7bd88b3 100644
--- a/website/common/tests.py
+++ b/website/common/tests.py
@@ -2,7 +2,7 @@ from django.test import SimpleTestCase
 
 from .embed import YouTubeLiteEmbedFinder
 from .models import BasePage
-from .utils import get_page_models, get_table_of_contents
+from .utils import extract_text, get_page_models, get_table_of_contents
 
 
 class BasePageTestCase(SimpleTestCase):
@@ -92,3 +92,15 @@ class TableOfContentsTestCase(SimpleTestCase):
         sub_entry = first_entry.children[1]
         self.assertEqual(len(sub_entry.children), 1)
         self.assertEqual([entry.title for entry in sub_entry.children], ["2.2.1"])
+
+
+class ExtractTextTestCase(SimpleTestCase):  # extract_text: HTML to plain text
+    def test_extracts_text(self) -> None:  # tags dropped, inner text is kept
+        self.assertEqual(extract_text("<p><b>Hello</b> there!</p>"), "Hello there!")
+        self.assertEqual(  # newline between paragraphs yields a single space
+            extract_text("<p>Paragraph 1</p>\n<p>Paragraph 2</p>"),
+            "Paragraph 1 Paragraph 2",
+        )
+
+    def test_plain_text(self) -> None:  # input without markup is unchanged
+        self.assertEqual(extract_text("Hello there!"), "Hello there!")
diff --git a/website/common/utils.py b/website/common/utils.py
index 1d57fa0..abfcde8 100644
--- a/website/common/utils.py
+++ b/website/common/utils.py
@@ -78,7 +78,10 @@ def extract_text(html: str) -> str:
     """
     Get the plain text of some HTML.
     """
-    return " ".join(BeautifulSoup(html, "lxml").find_all(text=True))
+    lines = (
+        text.strip(" \n") for text in BeautifulSoup(html, "lxml").find_all(text=True)
+    )
+    return " ".join(line for line in lines if line)
 
 
 def truncate_string(text: str, words: int) -> str: