Only parse header tags when looking for ToC

2022-07-04 18:56:11 +01:00 · 2022-07-04 18:56:11 +01:00 · 65044361a6
commit 65044361a6
parent ccb481726c
2 changed files with 5 additions and 5 deletions
--- a/website/common/tests.py
+++ b/website/common/tests.py
@@ -74,8 +74,10 @@ class TableOfContentsTestCase(SimpleTestCase):
        toc = get_table_of_contents(
            """
        <h2>2</h2>
+        <p>2 content</p>
        <h3>2.1</h3>
        <h3>2.2</h3>
+        <p>2.2 content</p>
        <h5>2.2.1</h5>
        <h3>2.3</h3>
        """
--- a/website/common/utils.py
+++ b/website/common/utils.py
@@ -2,7 +2,7 @@ from dataclasses import dataclass
 from itertools import islice, pairwise
 from typing import Type

-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, SoupStrainer
 from django.conf import settings
 from django.http.request import HttpRequest
 from django.utils.text import slugify, smart_split
@@ -22,12 +22,10 @@ class TocEntry:


 def get_table_of_contents(html: str) -> list[TocEntry]:
-    soup = BeautifulSoup(html, "lxml")
-
-    headings = soup.find_all(HEADER_TAGS)
+    soup = BeautifulSoup(html, "lxml", parse_only=SoupStrainer(HEADER_TAGS))

    heading_levels = [
-        TocEntry(tag.text, slugify(tag.text), int(tag.name[1]), []) for tag in headings
+        TocEntry(tag.text, slugify(tag.text), int(tag.name[1]), []) for tag in soup
    ]

    # Abort if there are no headings