Account for spaces after tags when extracting text
This commit is contained in:
parent
c8f01870d4
commit
0971fcd8a3
2 changed files with 6 additions and 3 deletions
|
@@ -101,6 +101,10 @@ class ExtractTextTestCase(SimpleTestCase):
|
||||||
extract_text("<p>Paragraph 1</p>\n<p>Paragraph 2</p>"),
|
extract_text("<p>Paragraph 1</p>\n<p>Paragraph 2</p>"),
|
||||||
"Paragraph 1 Paragraph 2",
|
"Paragraph 1 Paragraph 2",
|
||||||
)
|
)
|
||||||
|
self.assertEqual(
|
||||||
|
extract_text("New <a>stuff</a>."),
|
||||||
|
"New stuff.",
|
||||||
|
)
|
||||||
|
|
||||||
def test_plain_text(self) -> None:
|
def test_plain_text(self) -> None:
|
||||||
self.assertEqual(extract_text("Hello there!"), "Hello there!")
|
self.assertEqual(extract_text("Hello there!"), "Hello there!")
|
||||||
|
|
|
@@ -76,10 +76,9 @@ def extract_text(html: str) -> str:
|
||||||
"""
|
"""
|
||||||
Get the plain text of some HTML.
|
Get the plain text of some HTML.
|
||||||
"""
|
"""
|
||||||
lines = (
|
return (
|
||||||
text.strip(" \n") for text in BeautifulSoup(html, "lxml").find_all(text=True)
|
BeautifulSoup(html.replace("<p", " <p"), "lxml").get_text().replace("\n", " ")
|
||||||
)
|
)
|
||||||
return " ".join(line for line in lines if line)
|
|
||||||
|
|
||||||
|
|
||||||
def truncate_string(text: str, words: int) -> str:
|
def truncate_string(text: str, words: int) -> str:
|
||||||
|
|
Loading…
Reference in a new issue