Only parse header tags when looking for ToC

This commit is contained in:
Jake Howard 2022-07-04 18:56:11 +01:00
parent ccb481726c
commit 65044361a6
Signed by: jake
GPG key ID: 57AFB45680EDD477
2 changed files with 5 additions and 5 deletions

View file

@@ -74,8 +74,10 @@ class TableOfContentsTestCase(SimpleTestCase):
toc = get_table_of_contents(
"""
<h2>2</h2>
<p>2 content</p>
<h3>2.1</h3>
<h3>2.2</h3>
<p>2.2 content</p>
<h5>2.2.1</h5>
<h3>2.3</h3>
"""

View file

@@ -2,7 +2,7 @@ from dataclasses import dataclass
from itertools import islice, pairwise
from typing import Type
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, SoupStrainer
from django.conf import settings
from django.http.request import HttpRequest
from django.utils.text import slugify, smart_split
@@ -22,12 +22,10 @@ class TocEntry:
def get_table_of_contents(html: str) -> list[TocEntry]:
soup = BeautifulSoup(html, "lxml")
headings = soup.find_all(HEADER_TAGS)
soup = BeautifulSoup(html, "lxml", parse_only=SoupStrainer(HEADER_TAGS))
heading_levels = [
TocEntry(tag.text, slugify(tag.text), int(tag.name[1]), []) for tag in headings
TocEntry(tag.text, slugify(tag.text), int(tag.name[1]), []) for tag in soup
]
# Abort if there are no headings