Get table of contents from body
This commit is contained in:
parent
8a7dba4ca0
commit
af2dba84cd
3 changed files with 47 additions and 20 deletions
|
@ -9,7 +9,7 @@ from wagtail.fields import StreamField
|
||||||
from wagtail.images import get_image_model_string
|
from wagtail.images import get_image_model_string
|
||||||
from wagtail.models import Page
|
from wagtail.models import Page
|
||||||
|
|
||||||
from .streamfield import get_blocks, get_word_count, truncate_streamfield
|
from .streamfield import get_blocks, get_html, get_word_count, truncate_streamfield
|
||||||
from .utils import TocEntry, get_table_of_contents
|
from .utils import TocEntry, get_table_of_contents
|
||||||
|
|
||||||
|
|
||||||
|
@ -50,7 +50,8 @@ class BaseContentMixin(models.Model):
|
||||||
|
|
||||||
@cached_property
def table_of_contents(self) -> list[TocEntry]:
    """Return the nested table of contents built from this page's body.

    Each body block is converted to a string via ``get_html``; the chunks
    are concatenated and handed to ``get_table_of_contents``.
    """
    rendered_blocks = get_html(self.body)
    return get_table_of_contents("".join(rendered_blocks))
|
||||||
|
|
||||||
@cached_property
|
@cached_property
|
||||||
def reading_time(self) -> int:
|
def reading_time(self) -> int:
|
||||||
|
|
|
@ -76,14 +76,19 @@ def get_blocks() -> list[tuple[str, blocks.BaseBlock]]:
|
||||||
|
|
||||||
|
|
||||||
def extract_text(html: str) -> str:
    """Return every text node found in ``html``, joined by single spaces.

    The markup is parsed with BeautifulSoup's ``lxml`` parser.
    """
    soup = BeautifulSoup(html, "lxml")
    # ``string=`` is the current name of the keyword formerly spelled
    # ``text=``; the old spelling is deprecated in recent Beautiful Soup.
    return " ".join(soup.find_all(string=True))
|
||||||
|
|
||||||
|
|
||||||
def get_html(value: blocks.StreamValue) -> Iterator[str]:
    """Yield ``str(block)`` for each block in ``value``, one string per block.

    Blocks whose ``block_type`` passes the ``IGNORE_PLAINTEXT_BLOCKS``
    isinstance check are skipped.
    """
    # NOTE(review): in Wagtail, a stream child's ``block_type`` appears to be
    # the block's *name* (a str) — confirm this isinstance check matches what
    # IGNORE_PLAINTEXT_BLOCKS actually contains.
    yield from (
        str(child)
        for child in value
        if not isinstance(child.block_type, IGNORE_PLAINTEXT_BLOCKS)
    )
|
||||||
|
|
||||||
|
|
||||||
|
def get_plain_text(value: blocks.StreamValue) -> Iterator[str]:
    """Yield the plain text of each block in ``value``.

    Every chunk produced by ``get_html`` is passed through ``extract_text``;
    chunks are processed lazily, one at a time.
    """
    yield from map(extract_text, get_html(value))
|
||||||
|
|
||||||
|
|
||||||
def truncate_streamfield(value: blocks.StreamValue, words: int) -> str:
|
def truncate_streamfield(value: blocks.StreamValue, words: int) -> str:
|
||||||
|
|
|
@ -1,32 +1,53 @@
|
||||||
from typing import NamedTuple, Type
|
from dataclasses import dataclass
|
||||||
|
from itertools import pairwise
|
||||||
|
from typing import Type
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
from django.http.request import HttpRequest
|
from django.http.request import HttpRequest
|
||||||
from wagtail.models import Page
|
from wagtail.models import Page
|
||||||
from wagtail.models import get_page_models as get_wagtail_page_models
|
from wagtail.models import get_page_models as get_wagtail_page_models
|
||||||
|
|
||||||
|
|
||||||
# A single heading in a table of contents, together with its sub-headings.
# This is a mutable dataclass: get_table_of_contents() rewrites ``level`` and
# appends to ``children`` after the entries have been created.
@dataclass
class TocEntry:
    # Heading text.
    title: str
    # Anchor identifier for the heading.
    slug: str
    # Heading depth; get_table_of_contents() initialises it from the tag
    # number (2-6 for h2-h6) and then rebases it so the shallowest is 0.
    level: int
    # Nested entries that sit directly beneath this heading.
    children: list["TocEntry"]
|
||||||
|
|
||||||
|
|
||||||
def get_table_of_contents(html: str) -> list[TocEntry]:
    """Build a nested table of contents from the h2-h6 headings in ``html``.

    Headings are read in document order. Their levels are rebased so that the
    shallowest heading present becomes depth 0, and each heading is allowed to
    sit at most one level deeper than the heading before it (the first heading
    is always placed at depth 0). ``TocEntry.level`` holds the resulting depth.

    Returns the list of top-level entries, each carrying its descendants in
    ``children``. Returns an empty list when ``html`` contains no headings.
    """
    soup = BeautifulSoup(html, "lxml")

    headings = soup.find_all(["h2", "h3", "h4", "h5", "h6"])
    if not headings:
        # min() below would raise ValueError on an empty sequence.
        return []

    # NOTE(review): the slug is set to the raw heading text, not a slugified
    # value or the tag's ``id`` — confirm this is what the templates expect.
    entries = [
        TocEntry(tag.text, tag.text, int(tag.name[1]), []) for tag in headings
    ]

    min_level = min(entry.level for entry in entries)

    # A dummy root node, so we can pretend this is a tree
    root = TocEntry("", "", 0, [])

    # Starting at -1 forces the first heading to depth 0, so the descent below
    # never indexes into an empty ``children`` list (e.g. an h3 before an h2).
    previous_depth = -1

    # https://stackoverflow.com/a/44015834
    for entry in entries:
        # Rebase to 0 and make sure depths never jump by more than one level.
        entry.level = min(entry.level - min_level, previous_depth + 1)
        previous_depth = entry.level

        parent = root
        for _ in range(entry.level):
            parent = parent.children[-1]
        parent.children.append(entry)

    return root.children
|
||||||
|
|
||||||
|
|
||||||
def get_page_models() -> list[Type[Page]]:
|
def get_page_models() -> list[Type[Page]]:
|
||||||
page_models = get_wagtail_page_models().copy()
|
page_models = get_wagtail_page_models().copy()
|
||||||
|
|
Loading…
Reference in a new issue