Get table of contents from body

This commit is contained in:
Jake Howard 2022-06-30 23:27:50 +01:00
parent 8a7dba4ca0
commit af2dba84cd
Signed by: jake
GPG key ID: 57AFB45680EDD477
3 changed files with 47 additions and 20 deletions

View file

@ -9,7 +9,7 @@ from wagtail.fields import StreamField
from wagtail.images import get_image_model_string
from wagtail.models import Page
from .streamfield import get_blocks, get_word_count, truncate_streamfield
from .streamfield import get_blocks, get_html, get_word_count, truncate_streamfield
from .utils import TocEntry, get_table_of_contents
@ -50,7 +50,8 @@ class BaseContentMixin(models.Model):
@cached_property
def table_of_contents(self) -> list[TocEntry]:
    """
    Table of contents built from the headings in this page's body.

    The HTML chunks produced by ``get_html(self.body)`` are concatenated and
    handed to ``get_table_of_contents``, which returns the top-level entries
    (nested headings hang off each entry's ``children``). Wrapped in
    ``cached_property`` so the body is only rendered and parsed on first
    access.
    """
    html = "".join(get_html(self.body))
    return get_table_of_contents(html)
@cached_property
def reading_time(self) -> int:

View file

@ -76,14 +76,19 @@ def get_blocks() -> list[tuple[str, blocks.BaseBlock]]:
def extract_text(html: str) -> str:
    """
    Return the text of ``html`` with the markup removed.

    The fragment is parsed with the ``lxml`` parser and every string node
    found in it is joined with a single space.
    """
    # `string=True` is the current spelling of the deprecated `text=True`
    # filter; both match every string node in the document.
    return " ".join(BeautifulSoup(html, "lxml").find_all(string=True))
def get_html(value: blocks.StreamValue) -> Iterator[str]:
    """
    Yield the rendered markup of each block in ``value``, one string per block.

    Blocks whose ``block_type`` is an instance of ``IGNORE_PLAINTEXT_BLOCKS``
    are skipped. Each remaining block is rendered with ``str(block)``; callers
    in this module treat the result as HTML (it is fed to BeautifulSoup).
    """
    for block in value:
        if isinstance(block.block_type, IGNORE_PLAINTEXT_BLOCKS):
            continue
        yield str(block)
def get_plain_text(value: blocks.StreamValue) -> Iterator[str]:
    """
    Yield the plain text of each block in ``value``, one string per block.

    Every chunk produced by ``get_html`` is passed through ``extract_text``;
    blocks that ``get_html`` skips are therefore skipped here too.
    """
    yield from (extract_text(fragment) for fragment in get_html(value))
def truncate_streamfield(value: blocks.StreamValue, words: int) -> str:

View file

@ -1,32 +1,53 @@
from typing import NamedTuple, Type
from dataclasses import dataclass
from itertools import pairwise
from typing import Type
from bs4 import BeautifulSoup
from django.conf import settings
from django.http.request import HttpRequest
from wagtail.models import Page
from wagtail.models import get_page_models as get_wagtail_page_models
@dataclass
class TocEntry:
    """
    One heading in a table of contents.

    This is a mutable dataclass rather than a ``NamedTuple`` because
    ``get_table_of_contents`` rewrites ``level`` and appends to ``children``
    while it builds the tree.
    """

    # Heading text shown to the reader.
    title: str
    # Identifier for the heading; get_table_of_contents currently passes the
    # heading text here as well.
    slug: str
    # Nesting depth; get_table_of_contents normalises this so that the
    # shallowest heading is 0.
    level: int
    # Entries nested directly beneath this one, in document order.
    children: list["TocEntry"]
def get_table_of_contents(html: str) -> list[TocEntry]:
    """
    Build a nested table of contents from the headings in ``html``.

    ``h2``-``h6`` tags are collected in document order and turned into
    ``TocEntry`` objects whose level comes from the digit in the tag name.
    Levels are then normalised so that:

    * the shallowest heading present is level 0, and
    * no heading is more than one level deeper than the heading before it
      (the first heading is always level 0).

    Returns the level-0 entries; deeper headings are reachable through each
    entry's ``children``. Returns an empty list when ``html`` contains no
    headings.
    """
    soup = BeautifulSoup(html, "lxml")

    # NOTE(review): the slug is the raw heading text, not a URL-safe slug -
    # confirm this matches how the templates build heading anchors.
    entries = [
        TocEntry(tag.text, tag.text, int(tag.name[1]), [])
        for tag in soup.find_all(["h2", "h3", "h4", "h5", "h6"])
    ]

    # Nothing to nest, and min() below would raise on an empty sequence.
    if not entries:
        return []

    min_level = min(entry.level for entry in entries)

    # A dummy root node, so we can pretend this is a tree
    root = TocEntry("", "", 0, [])

    # Level of the previously placed heading; -1 forces the first one to 0.
    previous_level = -1
    for entry in entries:
        # Lower levels so the shallowest heading is 0, then make sure we never
        # skip a level going down. This guarantees the parent chain walked
        # below always exists, even when the document opens with a deeper
        # heading than one that follows it (e.g. h3 before h2).
        entry.level = min(entry.level - min_level, previous_level + 1)
        previous_level = entry.level

        # https://stackoverflow.com/a/44015834
        parent = root
        for _ in range(entry.level):
            parent = parent.children[-1]
        parent.children.append(entry)

    return root.children
def get_page_models() -> list[Type[Page]]:
page_models = get_wagtail_page_models().copy()