Get table of contents from body
parent 8a7dba4ca0
commit af2dba84cd
3 changed files with 47 additions and 20 deletions
@@ -9,7 +9,7 @@ from wagtail.fields import StreamField
 from wagtail.images import get_image_model_string
 from wagtail.models import Page

-from .streamfield import get_blocks, get_word_count, truncate_streamfield
+from .streamfield import get_blocks, get_html, get_word_count, truncate_streamfield
 from .utils import TocEntry, get_table_of_contents


@@ -50,7 +50,8 @@ class BaseContentMixin(models.Model):

     @cached_property
     def table_of_contents(self) -> list[TocEntry]:
-        return get_table_of_contents()
+        html = "".join(get_html(self.body))
+        return get_table_of_contents(html)

     @cached_property
     def reading_time(self) -> int:
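For context, a minimal sketch of the data flow this property now sets up, with a list of pre-rendered HTML fragments standing in for get_html(self.body). The import path and the fragment strings are illustrative, not part of the commit:

# Illustrative only: "content.utils" is a placeholder for wherever get_table_of_contents lives.
from content.utils import get_table_of_contents

# get_html(self.body) yields one rendered HTML string per StreamField block;
# the property joins them into a single document before parsing headings.
fragments = ["<h2>Intro</h2><p>hello</p>", "<h2>Usage</h2><h3>Install</h3>"]
html = "".join(fragments)            # mirrors "".join(get_html(self.body))
toc = get_table_of_contents(html)    # same call the property makes

Because table_of_contents is a cached_property, the join and parse happen at most once per page instance.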
@@ -76,14 +76,19 @@ def get_blocks() -> list[tuple[str, blocks.BaseBlock]]:


 def extract_text(html: str) -> str:
-    return " ".join(BeautifulSoup(html, "lxml").findAll(text=True))
+    return " ".join(BeautifulSoup(html, "lxml").find_all(text=True))


-def get_plain_text(value: blocks.StreamValue) -> Iterator[str]:
+def get_html(value: blocks.StreamValue) -> Iterator[str]:
     for block in value:
         if isinstance(block.block_type, IGNORE_PLAINTEXT_BLOCKS):
             continue
-        yield extract_text(str(block))
+        yield str(block)
+
+
+def get_plain_text(value: blocks.StreamValue) -> Iterator[str]:
+    for html_chunk in get_html(value):
+        yield extract_text(html_chunk)


 def truncate_streamfield(value: blocks.StreamValue, words: int) -> str:
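A standalone check of the extraction step, using made-up HTML chunks in place of a real StreamValue; after this change get_plain_text is simply extract_text mapped over get_html:

from bs4 import BeautifulSoup

def extract_text(html: str) -> str:
    # Same call as in the diff: find_all(text=True) collects every text node.
    return " ".join(BeautifulSoup(html, "lxml").find_all(text=True))

chunks = ["<h2>Intro</h2>", "<p>Some <em>rich</em> text.</p>"]  # as if yielded by get_html
print([extract_text(chunk) for chunk in chunks])                # what get_plain_text would yield
# ['Intro', 'Some  rich  text.']  (exact whitespace depends on the markup)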
@@ -1,32 +1,53 @@
-from typing import NamedTuple, Type
+from dataclasses import dataclass
+from itertools import pairwise
+from typing import Type

+from bs4 import BeautifulSoup
 from django.conf import settings
 from django.http.request import HttpRequest
 from wagtail.models import Page
 from wagtail.models import get_page_models as get_wagtail_page_models


-class TocEntry(NamedTuple):
+@dataclass
+class TocEntry:
     title: str
     slug: str
+    level: int
     children: list


-def get_table_of_contents() -> list[TocEntry]:
-    return [
-        TocEntry(
-            "Title 1",
-            "title-1",
-            [
-                TocEntry("Title 1.1", "title-11", []),
-                TocEntry("Title 1.2", "title-12", []),
-                TocEntry("Title 1.3", "title-13", []),
-            ],
-        ),
-        TocEntry("Title 2", "title-2", []),
-        TocEntry("Title 3", "title-3", []),
+def get_table_of_contents(html: str) -> list[TocEntry]:
+    soup = BeautifulSoup(html, "lxml")
+
+    headings = soup.find_all(["h2", "h3", "h4", "h5", "h6"])
+
+    heading_levels = [
+        TocEntry(tag.text, tag.text, int(tag.name[1]), []) for tag in headings
     ]
+
+    # Ensure heading levels are sequential
+    for heading, next_heading in pairwise(heading_levels):
+        if next_heading.level - heading.level > 1:
+            next_heading.level = heading.level + 1
+
+    # Lower heading levels to 0
+    min_level = min([h.level for h in heading_levels])
+    for heading in heading_levels:
+        heading.level -= min_level
+
+    # A dummy root node, so we can pretend this is a tree
+    root = TocEntry("", "", 0, [])
+
+    # https://stackoverflow.com/a/44015834
+    for heading in heading_levels:
+        last = root
+        for _ in range(heading.level):
+            last = last.children[-1]
+        last.children.append(heading)
+
+    return root.children


 def get_page_models() -> list[Type[Page]]:
     page_models = get_wagtail_page_models().copy()
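To make the tree-building behaviour concrete, a hedged usage sketch. The import path and the sample HTML are made up; the assertions follow from the code above (note that, as written, both title and slug are set to tag.text):

# "content.utils" is a placeholder for this module's real import path.
from content.utils import get_table_of_contents

toc = get_table_of_contents("<h2>One</h2><h3>One point one</h3><h2>Two</h2>")

assert [entry.title for entry in toc] == ["One", "Two"]          # h2s become top-level entries
assert [c.title for c in toc[0].children] == ["One point one"]   # the h3 nests under the first h2
assert toc[0].level == 0 and toc[0].children[0].level == 1       # levels are re-based so h2 -> 0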