Get table of contents from body

2022-06-30 23:27:50 +01:00 · 2022-06-30 23:27:50 +01:00 · af2dba84cd
commit af2dba84cd
parent 8a7dba4ca0
3 changed files with 47 additions and 20 deletions
--- a/website/common/models.py
+++ b/website/common/models.py
@@ -9,7 +9,7 @@ from wagtail.fields import StreamField
 from wagtail.images import get_image_model_string
 from wagtail.models import Page
-from .streamfield import get_blocks, get_word_count, truncate_streamfield
+from .streamfield import get_blocks, get_html, get_word_count, truncate_streamfield
 from .utils import TocEntry, get_table_of_contents
@@ -50,7 +50,8 @@ class BaseContentMixin(models.Model):
    @cached_property
    def table_of_contents(self) -> list[TocEntry]:
-        return get_table_of_contents()
+        html = "".join(get_html(self.body))
        return get_table_of_contents(html)
    @cached_property
    def reading_time(self) -> int:
--- a/website/common/streamfield.py
+++ b/website/common/streamfield.py
@@ -76,14 +76,19 @@ def get_blocks() -> list[tuple[str, blocks.BaseBlock]]:
 def extract_text(html: str) -> str:
-    return " ".join(BeautifulSoup(html, "lxml").findAll(text=True))
+    return " ".join(BeautifulSoup(html, "lxml").find_all(text=True))
-def get_plain_text(value: blocks.StreamValue) -> Iterator[str]:
+def get_html(value: blocks.StreamValue) -> Iterator[str]:
    for block in value:
        if isinstance(block.block_type, IGNORE_PLAINTEXT_BLOCKS):
            continue
-        yield extract_text(str(block))
+        yield str(block)
 def get_plain_text(value: blocks.StreamValue) -> Iterator[str]:
    for html_chunk in get_html(value):
        yield extract_text(html_chunk)
 def truncate_streamfield(value: blocks.StreamValue, words: int) -> str:
--- a/website/common/utils.py
+++ b/website/common/utils.py
@@ -1,32 +1,53 @@
-from typing import NamedTuple, Type
+from dataclasses import dataclass
 from itertools import pairwise
 from typing import Type
 from bs4 import BeautifulSoup
 from django.conf import settings
 from django.http.request import HttpRequest
 from wagtail.models import Page
 from wagtail.models import get_page_models as get_wagtail_page_models
-class TocEntry(NamedTuple):
+@dataclass
 class TocEntry:
    title: str
    slug: str
    level: int
    children: list
-def get_table_of_contents() -> list[TocEntry]:
+def get_table_of_contents(html: str) -> list[TocEntry]:
-    return [
+    soup = BeautifulSoup(html, "lxml")
-        TocEntry(
+
-            "Title 1",
+    headings = soup.find_all(["h2", "h3", "h4", "h5", "h6"])
-            "title-1",
+
-            [
+    heading_levels = [
-                TocEntry("Title 1.1", "title-11", []),
+        TocEntry(tag.text, tag.text, int(tag.name[1]), []) for tag in headings
                TocEntry("Title 1.2", "title-12", []),
                TocEntry("Title 1.3", "title-13", []),
            ],
        ),
        TocEntry("Title 2", "title-2", []),
        TocEntry("Title 3", "title-3", []),
    ]
    # Ensure heading levels are sequential
    for heading, next_heading in pairwise(heading_levels):
        if next_heading.level - heading.level > 1:
            next_heading.level = heading.level + 1
    # Lower heading levels to 0
    min_level = min([h.level for h in heading_levels])
    for heading in heading_levels:
        heading.level -= min_level
    # A dummy root node, so we can pretend this is a tree
    root = TocEntry("", "", 0, [])
    # https://stackoverflow.com/a/44015834
    for heading in heading_levels:
        last = root
        for _ in range(heading.level):
            last = last.children[-1]
        last.children.append(heading)
    return root.children
 def get_page_models() -> list[Type[Page]]:
    page_models = get_wagtail_page_models().copy()