From 0971fcd8a33580ec84d9d30dfb10c7ce3da9111d Mon Sep 17 00:00:00 2001
From: Jake Howard <git@theorangeone.net>
Date: Fri, 21 Oct 2022 17:46:13 +0100
Subject: [PATCH] Account for spaces after tags when extracting text

---
 website/common/tests/test_utils.py | 4 ++++
 website/common/utils.py            | 5 ++---
 2 files changed, 6 insertions(+), 3 deletions(-)
diff --git a/website/common/tests/test_utils.py b/website/common/tests/test_utils.py
index 6e67e98..5f92a38 100644
--- a/website/common/tests/test_utils.py
+++ b/website/common/tests/test_utils.py
@@ -101,6 +101,10 @@ class ExtractTextTestCase(SimpleTestCase):
             extract_text("<p>Paragraph 1</p>\n<p>Paragraph 2</p>"),
             "Paragraph 1 Paragraph 2",
         )
+        self.assertEqual(
+            extract_text("New <a>stuff</a>."),
+            "New stuff.",
+        )
 
     def test_plain_text(self) -> None:
         self.assertEqual(extract_text("Hello there!"), "Hello there!")
diff --git a/website/common/utils.py b/website/common/utils.py
index 06a9e38..4ea5a3c 100644
--- a/website/common/utils.py
+++ b/website/common/utils.py
@@ -76,10 +76,9 @@ def extract_text(html: str) -> str:
     """
     Get the plain text of some HTML.
     """
-    lines = (
-        text.strip(" \n") for text in BeautifulSoup(html, "lxml").find_all(text=True)
+    return (
+        BeautifulSoup(html.replace("<p", " <p"), "lxml").get_text().replace("\n", " ")
     )
-    return " ".join(line for line in lines if line)
 
 
 def truncate_string(text: str, words: int) -> str: