From af2dba84cd060ee8a8c5fa0108432cccecbf6fc6 Mon Sep 17 00:00:00 2001
From: Jake Howard <git@theorangeone.net>
Date: Thu, 30 Jun 2022 23:27:50 +0100
Subject: [PATCH] Get table of contents from body

---
 website/common/models.py      |  5 ++--
 website/common/streamfield.py | 11 +++++---
 website/common/utils.py       | 51 ++++++++++++++++++++++++-----------
 3 files changed, 47 insertions(+), 20 deletions(-)

diff --git a/website/common/models.py b/website/common/models.py
index c628b3c..38b2c43 100644
--- a/website/common/models.py
+++ b/website/common/models.py
@@ -9,7 +9,7 @@ from wagtail.fields import StreamField
 from wagtail.images import get_image_model_string
 from wagtail.models import Page
 
-from .streamfield import get_blocks, get_word_count, truncate_streamfield
+from .streamfield import get_blocks, get_html, get_word_count, truncate_streamfield
 from .utils import TocEntry, get_table_of_contents
 
 
@@ -50,7 +50,8 @@ class BaseContentMixin(models.Model):
 
     @cached_property
     def table_of_contents(self) -> list[TocEntry]:
-        return get_table_of_contents()
+        html = "".join(get_html(self.body))
+        return get_table_of_contents(html)
 
     @cached_property
     def reading_time(self) -> int:
diff --git a/website/common/streamfield.py b/website/common/streamfield.py
index 15c3477..aa00bda 100644
--- a/website/common/streamfield.py
+++ b/website/common/streamfield.py
@@ -76,14 +76,22 @@ def get_blocks() -> list[tuple[str, blocks.BaseBlock]]:
 
 
 def extract_text(html: str) -> str:
-    return " ".join(BeautifulSoup(html, "lxml").findAll(text=True))
+    """Return every text node in ``html``, joined with single spaces."""
+    return " ".join(BeautifulSoup(html, "lxml").find_all(string=True))
 
 
-def get_plain_text(value: blocks.StreamValue) -> Iterator[str]:
+def get_html(value: blocks.StreamValue) -> Iterator[str]:
+    """Yield each block in ``value`` as a string, skipping ignored block types."""
     for block in value:
         if isinstance(block.block_type, IGNORE_PLAINTEXT_BLOCKS):
             continue
-        yield extract_text(str(block))
+        yield str(block)
+
+
+def get_plain_text(value: blocks.StreamValue) -> Iterator[str]:
+    """Yield the text content of each chunk produced by ``get_html``."""
+    for html_chunk in get_html(value):
+        yield extract_text(html_chunk)
 
 
 def truncate_streamfield(value: blocks.StreamValue, words: int) -> str:
diff --git a/website/common/utils.py b/website/common/utils.py
index 6056c3a..1e19d24 100644
--- a/website/common/utils.py
+++ b/website/common/utils.py
@@ -1,32 +1,64 @@
-from typing import NamedTuple, Type
+from dataclasses import dataclass
+from typing import Type
 
+from bs4 import BeautifulSoup
 from django.conf import settings
 from django.http.request import HttpRequest
 from wagtail.models import Page
 from wagtail.models import get_page_models as get_wagtail_page_models
 
 
-class TocEntry(NamedTuple):
+@dataclass
+class TocEntry:
+    """A heading in a table of contents, along with its nested sub-headings."""
+
     title: str
     slug: str
+    # Depth in the table of contents tree (0 for top-level entries)
+    level: int
     children: list
 
 
-def get_table_of_contents() -> list[TocEntry]:
-    return [
-        TocEntry(
-            "Title 1",
-            "title-1",
-            [
-                TocEntry("Title 1.1", "title-11", []),
-                TocEntry("Title 1.2", "title-12", []),
-                TocEntry("Title 1.3", "title-13", []),
-            ],
-        ),
-        TocEntry("Title 2", "title-2", []),
-        TocEntry("Title 3", "title-3", []),
+def get_table_of_contents(html: str) -> list[TocEntry]:
+    """
+    Build a nested table of contents from the ``h2``-``h6`` headings in ``html``.
+
+    Each heading is nested under the closest preceding heading with a smaller
+    tag level (an ``h3`` under an ``h2``, for example), so skipped or
+    out-of-order heading levels cannot break the tree. ``TocEntry.level`` is
+    set to the entry's depth in the returned tree (0 for top-level entries).
+    Returns an empty list if ``html`` contains no headings.
+    """
+    soup = BeautifulSoup(html, "lxml")
+
+    headings = soup.find_all(["h2", "h3", "h4", "h5", "h6"])
+
+    heading_levels = [
+        TocEntry(tag.text, tag.text, int(tag.name[1]), []) for tag in headings
     ]
 
+    # A dummy root node, so we can pretend this is a tree
+    root = TocEntry("", "", 0, [])
+
+    # Headings which can still receive children, as (tag level, entry) pairs.
+    # The root's tag level of 0 is below any real heading, so it is never popped.
+    ancestors = [(0, root)]
+
+    for heading in heading_levels:
+        tag_level = heading.level
+
+        # Headings at the same or a deeper tag level can't be this one's parent
+        while ancestors[-1][0] >= tag_level:
+            ancestors.pop()
+
+        ancestors[-1][1].children.append(heading)
+
+        # Expose the depth in the tree, rather than the raw tag level
+        heading.level = len(ancestors) - 1
+        ancestors.append((tag_level, heading))
+
+    return root.children
+
 
 def get_page_models() -> list[Type[Page]]:
     page_models = get_wagtail_page_models().copy()