From af2dba84cd060ee8a8c5fa0108432cccecbf6fc6 Mon Sep 17 00:00:00 2001 From: Jake Howard Date: Thu, 30 Jun 2022 23:27:50 +0100 Subject: [PATCH] Get table of contents from body --- website/common/models.py | 5 ++-- website/common/streamfield.py | 11 +++++--- website/common/utils.py | 51 ++++++++++++++++++++++++----------- 3 files changed, 47 insertions(+), 20 deletions(-) diff --git a/website/common/models.py b/website/common/models.py index c628b3c..38b2c43 100644 --- a/website/common/models.py +++ b/website/common/models.py @@ -9,7 +9,7 @@ from wagtail.fields import StreamField from wagtail.images import get_image_model_string from wagtail.models import Page -from .streamfield import get_blocks, get_word_count, truncate_streamfield +from .streamfield import get_blocks, get_html, get_word_count, truncate_streamfield from .utils import TocEntry, get_table_of_contents @@ -50,7 +50,8 @@ class BaseContentMixin(models.Model): @cached_property def table_of_contents(self) -> list[TocEntry]: - return get_table_of_contents() + html = "".join(get_html(self.body)) + return get_table_of_contents(html) @cached_property def reading_time(self) -> int: diff --git a/website/common/streamfield.py b/website/common/streamfield.py index 15c3477..aa00bda 100644 --- a/website/common/streamfield.py +++ b/website/common/streamfield.py @@ -76,14 +76,19 @@ def get_blocks() -> list[tuple[str, blocks.BaseBlock]]: def extract_text(html: str) -> str: - return " ".join(BeautifulSoup(html, "lxml").findAll(text=True)) + return " ".join(BeautifulSoup(html, "lxml").find_all(text=True)) -def get_plain_text(value: blocks.StreamValue) -> Iterator[str]: +def get_html(value: blocks.StreamValue) -> Iterator[str]: for block in value: if isinstance(block.block_type, IGNORE_PLAINTEXT_BLOCKS): continue - yield extract_text(str(block)) + yield str(block) + + +def get_plain_text(value: blocks.StreamValue) -> Iterator[str]: + for html_chunk in get_html(value): + yield extract_text(html_chunk) def truncate_streamfield(value: blocks.StreamValue, words: int) -> str: diff --git a/website/common/utils.py b/website/common/utils.py index 6056c3a..1e19d24 100644 --- a/website/common/utils.py +++ b/website/common/utils.py @@ -1,32 +1,53 @@ -from typing import NamedTuple, Type +from dataclasses import dataclass +from itertools import pairwise +from typing import Type +from bs4 import BeautifulSoup from django.conf import settings from django.http.request import HttpRequest from wagtail.models import Page from wagtail.models import get_page_models as get_wagtail_page_models -class TocEntry(NamedTuple): +@dataclass +class TocEntry: title: str slug: str + level: int children: list -def get_table_of_contents() -> list[TocEntry]: - return [ - TocEntry( - "Title 1", - "title-1", - [ - TocEntry("Title 1.1", "title-11", []), - TocEntry("Title 1.2", "title-12", []), - TocEntry("Title 1.3", "title-13", []), - ], - ), - TocEntry("Title 2", "title-2", []), - TocEntry("Title 3", "title-3", []), +def get_table_of_contents(html: str) -> list[TocEntry]: + soup = BeautifulSoup(html, "lxml") + + headings = soup.find_all(["h2", "h3", "h4", "h5", "h6"]) + + heading_levels = [ + TocEntry(tag.text, tag.text, int(tag.name[1]), []) for tag in headings ] + # Ensure heading levels are sequential + for heading, next_heading in pairwise(heading_levels): + if next_heading.level - heading.level > 1: + next_heading.level = heading.level + 1 + + # Lower heading levels to 0 + min_level = min([h.level for h in heading_levels]) + for heading in heading_levels: + heading.level -= min_level + + # A dummy root node, so we can pretend this is a tree + root = TocEntry("", "", 0, []) + + # https://stackoverflow.com/a/44015834 + for heading in heading_levels: + last = root + for _ in range(heading.level): + last = last.children[-1] + last.children.append(heading) + + return root.children + def get_page_models() -> list[Type[Page]]: page_models = get_wagtail_page_models().copy()