Only parse header tags when looking for ToC

This commit is contained in:
Jake Howard 2022-07-04 18:56:11 +01:00
parent ccb481726c
commit 65044361a6
Signed by: jake
GPG key ID: 57AFB45680EDD477
2 changed files with 5 additions and 5 deletions

View file

@ -74,8 +74,10 @@ class TableOfContentsTestCase(SimpleTestCase):
toc = get_table_of_contents( toc = get_table_of_contents(
""" """
<h2>2</h2> <h2>2</h2>
<p>2 content</p>
<h3>2.1</h3> <h3>2.1</h3>
<h3>2.2</h3> <h3>2.2</h3>
<p>2.2 content</p>
<h5>2.2.1</h5> <h5>2.2.1</h5>
<h3>2.3</h3> <h3>2.3</h3>
""" """

View file

@ -2,7 +2,7 @@ from dataclasses import dataclass
from itertools import islice, pairwise from itertools import islice, pairwise
from typing import Type from typing import Type
from bs4 import BeautifulSoup from bs4 import BeautifulSoup, SoupStrainer
from django.conf import settings from django.conf import settings
from django.http.request import HttpRequest from django.http.request import HttpRequest
from django.utils.text import slugify, smart_split from django.utils.text import slugify, smart_split
@ -22,12 +22,10 @@ class TocEntry:
def get_table_of_contents(html: str) -> list[TocEntry]: def get_table_of_contents(html: str) -> list[TocEntry]:
soup = BeautifulSoup(html, "lxml") soup = BeautifulSoup(html, "lxml", parse_only=SoupStrainer(HEADER_TAGS))
headings = soup.find_all(HEADER_TAGS)
heading_levels = [ heading_levels = [
TocEntry(tag.text, slugify(tag.text), int(tag.name[1]), []) for tag in headings TocEntry(tag.text, slugify(tag.text), int(tag.name[1]), []) for tag in soup
] ]
# Abort if there are no headings # Abort if there are no headings