website/website/common/utils.py

from dataclasses import dataclass
from itertools import islice, pairwise
from typing import Iterable, Optional, Type

import requests
from bs4 import BeautifulSoup, SoupStrainer
from django.conf import settings
from django.http.request import HttpRequest
from django.utils.text import re_words, slugify
from django_cache_decorator import django_cache_decorator
from wagtail.models import Page, Site
from wagtail.models import get_page_models as get_wagtail_page_models

HEADER_TAGS = ["h2", "h3", "h4", "h5", "h6"]

requests_session = requests.Session()


@dataclass
class TocEntry:
    title: str
    slug: str
    level: int
    children: list["TocEntry"]


def get_table_of_contents(html: str) -> list[TocEntry]:
    """
    Build a nested table of contents from the heading tags in some HTML.
    """
    soup = BeautifulSoup(html, "lxml", parse_only=SoupStrainer(HEADER_TAGS))

    heading_levels = [
        TocEntry(tag.text, heading_id(tag.text), int(tag.name[1]), []) for tag in soup
    ]

    # Abort if there are no headings
    if not heading_levels:
        return []

    # Ensure heading levels are sequential
    for heading, next_heading in pairwise(heading_levels):
        if next_heading.level - heading.level > 1:
            next_heading.level = heading.level + 1

    # Lower heading levels to 0
    min_level = min(h.level for h in heading_levels)
    for heading in heading_levels:
        heading.level -= min_level

    # A dummy root node, so we can pretend this is a tree
    root = TocEntry("", "", 0, [])

    # https://stackoverflow.com/a/44015834
    for heading in heading_levels:
        last = root
        for _ in range(heading.level):
            last = last.children[-1]
        last.children.append(heading)

    return root.children
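

# Illustrative sketch (not part of the original module): for HTML such as
# "<h2>Intro</h2><h3>Setup</h3><h2>Usage</h2>", get_table_of_contents should
# return a nested structure roughly like:
#
#     [
#         TocEntry(title="Intro", slug="intro", level=0, children=[
#             TocEntry(title="Setup", slug="setup", level=1, children=[]),
#         ]),
#         TocEntry(title="Usage", slug="usage", level=0, children=[]),
#     ]
#
# The h2s are the lowest level present, so they are normalised to 0, and the h3
# becomes a child of the h2 that precedes it.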


def get_page_models() -> list[Type[Page]]:
    page_models = get_wagtail_page_models().copy()
    page_models.remove(Page)
    return page_models


def show_toolbar_callback(request: HttpRequest) -> bool:
    return settings.DEBUG


def split_words(text: str) -> Iterable[str]:
    for word in re_words.split(text):
        if word and word.strip():
            yield word.strip()


def count_words(text: str) -> int:
    """
    Count the number of words in the text, without duplicating it in memory
    """
    return sum(1 for _ in split_words(text))


def extract_text(html: str) -> str:
    """
    Get the plain text of some HTML.
    """
    return (
        BeautifulSoup(html.replace("<p", " <p"), "lxml").get_text().replace("\n", " ")
    )


def truncate_string(text: str, words: int) -> str:
    return " ".join(islice(split_words(text), words))


def heading_id(heading: str) -> str:
    """
    Convert a heading into an identifier which is valid for an HTML id attribute
    """
    if not heading:
        return ""

    slug = slugify(heading)

    if slug and slug[0].isdigit():
        return "ref-" + slug

    return slug
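

# Illustrative examples (assuming django.utils.text.slugify behaviour); the
# "ref-" prefix avoids ids that start with a digit, which are awkward to target
# with CSS selectors:
#
#     heading_id("Getting Started")     # -> "getting-started"
#     heading_id("2. Getting Started")  # -> "ref-2-getting-started"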


@django_cache_decorator(time=300)
def get_site_title() -> str:
    return Site.objects.values_list("site_name", flat=True).first()


@django_cache_decorator(time=21600)
def get_url_mime_type(url: str) -> Optional[str]:
    try:
        return requests_session.head(url).headers.get("Content-Type")
    except requests.exceptions.RequestException:
        return None
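

# A hedged usage sketch: the HEAD request goes through the shared
# requests_session and the result is cached for six hours by
# django_cache_decorator; the URL below is purely illustrative.
#
#     get_url_mime_type("https://example.com/cat.png")  # -> "image/png", or None on error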