Only parse header tags when looking for ToC
This commit is contained in:
parent
ccb481726c
commit
65044361a6
2 changed files with 5 additions and 5 deletions
|
@@ -74,8 +74,10 @@ class TableOfContentsTestCase(SimpleTestCase):
|
||||||
toc = get_table_of_contents(
|
toc = get_table_of_contents(
|
||||||
"""
|
"""
|
||||||
<h2>2</h2>
|
<h2>2</h2>
|
||||||
|
<p>2 content</p>
|
||||||
<h3>2.1</h3>
|
<h3>2.1</h3>
|
||||||
<h3>2.2</h3>
|
<h3>2.2</h3>
|
||||||
|
<p>2.2 content</p>
|
||||||
<h5>2.2.1</h5>
|
<h5>2.2.1</h5>
|
||||||
<h3>2.3</h3>
|
<h3>2.3</h3>
|
||||||
"""
|
"""
|
||||||
|
|
|
@@ -2,7 +2,7 @@ from dataclasses import dataclass
|
||||||
from itertools import islice, pairwise
|
from itertools import islice, pairwise
|
||||||
from typing import Type
|
from typing import Type
|
||||||
|
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup, SoupStrainer
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
from django.http.request import HttpRequest
|
from django.http.request import HttpRequest
|
||||||
from django.utils.text import slugify, smart_split
|
from django.utils.text import slugify, smart_split
|
||||||
|
@@ -22,12 +22,10 @@ class TocEntry:
|
||||||
|
|
||||||
|
|
||||||
def get_table_of_contents(html: str) -> list[TocEntry]:
|
def get_table_of_contents(html: str) -> list[TocEntry]:
|
||||||
soup = BeautifulSoup(html, "lxml")
|
soup = BeautifulSoup(html, "lxml", parse_only=SoupStrainer(HEADER_TAGS))
|
||||||
|
|
||||||
headings = soup.find_all(HEADER_TAGS)
|
|
||||||
|
|
||||||
heading_levels = [
|
heading_levels = [
|
||||||
TocEntry(tag.text, slugify(tag.text), int(tag.name[1]), []) for tag in headings
|
TocEntry(tag.text, slugify(tag.text), int(tag.name[1]), []) for tag in soup
|
||||||
]
|
]
|
||||||
|
|
||||||
# Abort if there are no headings
|
# Abort if there are no headings
|
||||||
|
|
Loading…
Reference in a new issue