Get table of contents from body

2022-06-30 23:27:50 +01:00 · 2022-06-30 23:27:50 +01:00 · af2dba84cd
commit af2dba84cd
parent 8a7dba4ca0
3 changed files with 47 additions and 20 deletions
--- a/website/common/models.py
+++ b/website/common/models.py
@@ -9,7 +9,7 @@ from wagtail.fields import StreamField
 from wagtail.images import get_image_model_string
 from wagtail.models import Page

-from .streamfield import get_blocks, get_word_count, truncate_streamfield
+from .streamfield import get_blocks, get_html, get_word_count, truncate_streamfield
 from .utils import TocEntry, get_table_of_contents


@@ -50,7 +50,8 @@ class BaseContentMixin(models.Model):

    @cached_property
    def table_of_contents(self) -> list[TocEntry]:
-        return get_table_of_contents()
+        html = "".join(get_html(self.body))
+        return get_table_of_contents(html)

    @cached_property
    def reading_time(self) -> int:
--- a/website/common/streamfield.py
+++ b/website/common/streamfield.py
@@ -76,14 +76,19 @@ def get_blocks() -> list[tuple[str, blocks.BaseBlock]]:


 def extract_text(html: str) -> str:
-    return " ".join(BeautifulSoup(html, "lxml").findAll(text=True))
+    return " ".join(BeautifulSoup(html, "lxml").find_all(text=True))


-def get_plain_text(value: blocks.StreamValue) -> Iterator[str]:
+def get_html(value: blocks.StreamValue) -> Iterator[str]:
    for block in value:
        if isinstance(block.block_type, IGNORE_PLAINTEXT_BLOCKS):
            continue
-        yield extract_text(str(block))
+        yield str(block)
+
+
+def get_plain_text(value: blocks.StreamValue) -> Iterator[str]:
+    for html_chunk in get_html(value):
+        yield extract_text(html_chunk)


 def truncate_streamfield(value: blocks.StreamValue, words: int) -> str:
--- a/website/common/utils.py
+++ b/website/common/utils.py
@@ -1,32 +1,53 @@
-from typing import NamedTuple, Type
+from dataclasses import dataclass
+from itertools import pairwise
+from typing import Type

+from bs4 import BeautifulSoup
 from django.conf import settings
 from django.http.request import HttpRequest
 from wagtail.models import Page
 from wagtail.models import get_page_models as get_wagtail_page_models


-class TocEntry(NamedTuple):
+@dataclass
+class TocEntry:
    title: str
    slug: str
+    level: int
    children: list


-def get_table_of_contents() -> list[TocEntry]:
-    return [
-        TocEntry(
-            "Title 1",
-            "title-1",
-            [
-                TocEntry("Title 1.1", "title-11", []),
-                TocEntry("Title 1.2", "title-12", []),
-                TocEntry("Title 1.3", "title-13", []),
-            ],
-        ),
-        TocEntry("Title 2", "title-2", []),
-        TocEntry("Title 3", "title-3", []),
+def get_table_of_contents(html: str) -> list[TocEntry]:
+    soup = BeautifulSoup(html, "lxml")
+
+    headings = soup.find_all(["h2", "h3", "h4", "h5", "h6"])
+
+    heading_levels = [
+        TocEntry(tag.text, tag.text, int(tag.name[1]), []) for tag in headings
    ]

+    # Ensure heading levels are sequential
+    for heading, next_heading in pairwise(heading_levels):
+        if next_heading.level - heading.level > 1:
+            next_heading.level = heading.level + 1
+
+    # Lower heading levels to 0
+    min_level = min([h.level for h in heading_levels])
+    for heading in heading_levels:
+        heading.level -= min_level
+
+    # A dummy root node, so we can pretend this is a tree
+    root = TocEntry("", "", 0, [])
+
+    # https://stackoverflow.com/a/44015834
+    for heading in heading_levels:
+        last = root
+        for _ in range(heading.level):
+            last = last.children[-1]
+        last.children.append(heading)
+
+    return root.children
+

 def get_page_models() -> list[Type[Page]]:
    page_models = get_wagtail_page_models().copy()