From e4476e1b2aca856d387c61f4193ec42f6b837604 Mon Sep 17 00:00:00 2001
From: Jake Howard <git@theorangeone.net>
Date: Fri, 23 Sep 2022 15:35:32 +0100
Subject: [PATCH] Ensure heading ids are always valid ids

---
 website/blog/models.py                        |  5 ++++-
 .../templates/blog/blog_post_list_page.html   |  2 +-
 website/common/streamfield.py                 |  5 ++---
 website/common/tests/test_utils.py            | 19 ++++++++++++++++++-
 website/common/utils.py                       | 15 ++++++++++++++-
 5 files changed, 39 insertions(+), 7 deletions(-)
diff --git a/website/blog/models.py b/website/blog/models.py
index 419797e..7fa1b14 100644
--- a/website/blog/models.py
+++ b/website/blog/models.py
@@ -37,7 +37,10 @@ class BlogPostListPage(BaseListingPage):
             reverse=True,
         )
 
-        return [TocEntry(post_month, post_month, 0, []) for post_month in post_months]
+        return [
+            TocEntry(post_month, "date-" + post_month, 0, [])
+            for post_month in post_months
+        ]
 
     def get_listing_pages(self) -> models.QuerySet:
         return prefetch_for_listing(
diff --git a/website/blog/templates/blog/blog_post_list_page.html b/website/blog/templates/blog/blog_post_list_page.html
index b02e049..9aa0ecf 100644
--- a/website/blog/templates/blog/blog_post_list_page.html
+++ b/website/blog/templates/blog/blog_post_list_page.html
@@ -4,7 +4,7 @@
   <section class="container">
     {% for page in listing_pages %}
       {% ifchanged %}
-        <h3 id="{{ page.date|date:'Y-m' }}" class="date-header">
+        <h3 id="date-{{ page.date|date:'Y-m' }}" class="date-header">
           <time datetime="{{ page.date|date:'Y-m' }}" title='{{ page.date|date:"F Y" }}'>
             {{ page.date|date:"Y-m" }}
           </time>
diff --git a/website/common/streamfield.py b/website/common/streamfield.py
index 1866aa6..7f6f018 100644
--- a/website/common/streamfield.py
+++ b/website/common/streamfield.py
@@ -3,13 +3,12 @@ from itertools import product
 from bs4 import BeautifulSoup, SoupStrainer
 from django.utils import lorem_ipsum
 from django.utils.html import format_html_join
-from django.utils.text import slugify
 from wagtail import blocks
 from wagtail.contrib.typed_table_block.blocks import TypedTableBlock
 from wagtail.embeds.blocks import EmbedBlock
 from wagtail.images.blocks import ImageChooserBlock
 
-from website.common.utils import HEADER_TAGS
+from website.common.utils import HEADER_TAGS, heading_id
 from website.contrib.code_block.blocks import CodeBlock
 from website.contrib.mermaid_block.blocks import MermaidBlock
 
@@ -121,7 +120,7 @@ def add_heading_anchors(html: str) -> str:
 
     soup = BeautifulSoup(html, "lxml")
     for tag in soup.select(", ".join(targets)):
-        slug = slugify(tag.text)
+        slug = heading_id(tag.text)
         anchor = soup.new_tag("a", href="#" + slug, id=slug)
         anchor.string = "#"
         anchor.attrs["class"] = "heading-anchor"
diff --git a/website/common/tests/test_utils.py b/website/common/tests/test_utils.py
index e2d80de..6e67e98 100644
--- a/website/common/tests/test_utils.py
+++ b/website/common/tests/test_utils.py
@@ -3,7 +3,12 @@ from django.test import SimpleTestCase
 from wagtail.rich_text import features as richtext_feature_registry
 
 from website.common.embed import YouTubeLiteEmbedFinder
-from website.common.utils import count_words, extract_text, get_table_of_contents
+from website.common.utils import (
+    count_words,
+    extract_text,
+    get_table_of_contents,
+    heading_id,
+)
 
 
 class YouTubeLiteEmbedFinderTestCase(SimpleTestCase):
@@ -35,6 +40,7 @@ class TableOfContentsTestCase(SimpleTestCase):
 
         self.assertEqual(len(toc), 3)
         self.assertEqual([entry.title for entry in toc], ["2", "3", "4"])
+        self.assertEqual([entry.slug for entry in toc], ["ref-2", "ref-3", "ref-4"])
 
         first_entry = toc[0]
         self.assertEqual(len(first_entry.children), 3)
@@ -78,6 +84,10 @@ class TableOfContentsTestCase(SimpleTestCase):
         self.assertEqual(
             [entry.title for entry in first_entry.children], ["2.1", "2.2", "2.3"]
         )
+        self.assertEqual(
+            [entry.slug for entry in first_entry.children],
+            ["ref-21", "ref-22", "ref-23"],
+        )
 
         sub_entry = first_entry.children[1]
         self.assertEqual(len(sub_entry.children), 1)
@@ -111,3 +121,10 @@ class RichTextFeaturesTestCase(SimpleTestCase):
                     self.assertIsNotNone(
                         richtext_feature_registry.get_editor_plugin("draftail", feature)
                     )
+
+
+class HeadingIDTestCase(SimpleTestCase):
+    def test_headings(self) -> None:
+        cases = {"123": "ref-123", "test": "test", "Look, a title!": "look-a-title"}
+        for heading, expected in cases.items():
+            self.assertEqual(heading_id(heading), expected)
diff --git a/website/common/utils.py b/website/common/utils.py
index e8b063e..4f4ded7 100644
--- a/website/common/utils.py
+++ b/website/common/utils.py
@@ -26,7 +26,7 @@ def get_table_of_contents(html: str) -> list[TocEntry]:
     soup = BeautifulSoup(html, "lxml", parse_only=SoupStrainer(HEADER_TAGS))
 
     heading_levels = [
-        TocEntry(tag.text, slugify(tag.text), int(tag.name[1]), []) for tag in soup
+        TocEntry(tag.text, heading_id(tag.text), int(tag.name[1]), []) for tag in soup
     ]
 
     # Abort if there are no headings
@@ -95,3 +95,16 @@ def prefetch_for_listing(queryset: PageQuerySet) -> PageQuerySet:
     different page models is a pain.
     """
     return queryset.select_related("hero_image", "hero_unsplash_photo")
+
+
+def heading_id(heading: str) -> str:
+    """
+    Convert a heading into an identifier which is valid for a HTML id attribute
+
+    Slugs starting with a digit are prefixed with "ref-"; headings with nothing
+    slug-able (empty, or punctuation only such as "!!!") give "".
+    """
+    slug = slugify(heading) if heading else ""
+    if slug[:1].isdigit():  # slice, not slug[0]: the slug may be empty
+        return "ref-" + slug
+    return slug