Add similar content based on text Trigram similarity

This commit is contained in:
Jake Howard 2023-06-02 15:33:15 +01:00
parent 5d4c095227
commit 6ab0ff3fa7
Signed by: jake
GPG key ID: 57AFB45680EDD477
6 changed files with 129 additions and 0 deletions

View file

@ -0,0 +1,16 @@
// Layout for the "Similar content" section: a centred, vertical stack of
// listing items underneath the main content.
section#similar-content {
  display: flex;
  flex-direction: column;
  align-items: center;
  margin-top: 2rem;

  // Take the heading colour from the surrounding section.
  h2 {
    color: inherit;
  }

  .media {
    // `desktop` is a mixin defined outside this file (presumably the
    // framework's desktop breakpoint - confirm).
    @include desktop {
      // Unitless factor rather than `85%`: percentage arguments to scale()
      // are only part of the newer CSS Transforms Level 2 syntax, while the
      // numeric form is understood by every browser.
      transform: scale(0.85);
    }
  }
}

View file

@ -20,6 +20,7 @@
@import "404"; @import "404";
@import "password_required"; @import "password_required";
@import "commento"; @import "commento";
@import "similar_content";
html, html,
body { body {

View file

@ -0,0 +1,13 @@
# Generated by Django 4.1.9 on 2023-06-02 12:36
from django.contrib.postgres.operations import TrigramExtension
from django.db import migrations
class Migration(migrations.Migration):
    """Run ``TrigramExtension`` so trigram similarity lookups are available."""

    dependencies = [
        (
            "blog",
            "0004_alter_blogpostcollectionlistpage_body_and_more",
        ),
    ]

    operations = [
        TrigramExtension(),
    ]

View file

@ -1,5 +1,6 @@
from typing import Any, Optional, Type from typing import Any, Optional, Type
from django.contrib.postgres.search import TrigramSimilarity
from django.db import models from django.db import models
from django.utils import timezone from django.utils import timezone
from django.utils.functional import cached_property from django.utils.functional import cached_property
@ -66,6 +67,37 @@ class BlogPostPage(BaseContentPage):
def tag_list_page_url(self) -> Optional[str]: def tag_list_page_url(self) -> Optional[str]:
return SingletonPageCache.get_url(BlogPostTagListPage) return SingletonPageCache.get_url(BlogPostTagListPage)
def get_similar_posts(self) -> models.QuerySet:
    """Return up to three listed posts most similar to this one.

    Candidates are the listing pages of the ``BlogPostListPage`` (this page
    itself excluded). Each candidate is annotated with:

    * ``title_similarity`` - trigram similarity between the titles.
    * ``subtitle_similarity`` - trigram similarity between the subtitles,
      or a neutral ``1`` when this page has no subtitle.
    * ``tag_similarity`` - fraction of this page's tags that the candidate
      shares, or a neutral ``1`` when this page has no tags.
    * ``similarity`` - the weighted product of the three, used for ordering.

    Returns:
        A queryset ordered by descending ``similarity`` and sliced to three
        results, or an empty queryset when no ``BlogPostListPage`` exists.
    """
    # Imported locally so this method does not depend on the module's
    # import block being changed.
    from django.db.models.functions import Cast

    try:
        listing_pages = BlogPostListPage.objects.get().get_listing_pages()
    except BlogPostListPage.DoesNotExist:
        return BlogPostPage.objects.none()

    # If this page has no subtitle, ignore it as part of similarity
    # (1 is the neutral element of the product below).
    if self.subtitle:
        subtitle_similarity = TrigramSimilarity("subtitle", self.subtitle)
    else:
        subtitle_similarity = models.Value(1)

    page_tags = list(self.tags.values_list("id", flat=True))
    if page_tags:
        shared_tag_count = models.Count(
            "tags", filter=models.Q(tags__in=page_tags)
        )
        # Cast to float before dividing: COUNT() is an integer, and an
        # integer / integer division is truncated by the database, which
        # would turn every partial tag overlap into 0.
        tag_similarity = Cast(
            shared_tag_count, output_field=models.FloatField()
        ) / len(page_tags)
    else:
        # If this page has no tags, ignore them as part of similarity.
        tag_similarity = models.Value(1)

    similar_posts = (
        listing_pages.exclude(id=self.id)
        .annotate(
            title_similarity=TrigramSimilarity("title", self.title),
            subtitle_similarity=subtitle_similarity,
        )
        .annotate(tag_similarity=tag_similarity)
        .annotate(
            similarity=(models.F("tag_similarity") * 2)
            * (models.F("title_similarity") * 10)
            * (models.F("subtitle_similarity"))
        )
        .order_by("-similarity")
    )

    return similar_posts[:3]
class BlogPostTagListPage(BaseListingPage): class BlogPostTagListPage(BaseListingPage):
max_count = 1 max_count = 1

View file

@ -1 +1,19 @@
{% extends "common/content_page.html" %} {% extends "common/content_page.html" %}
{% load cache util_tags %}

{% block post_content %}
  {{ block.super }}

  {# Fragment is cached per page id and per preview state, with a jittered TTL. #}
  {% cache FRAGMENT_CACHE_TTL|jitter:FRAGMENT_CACHE_TTL_JITTER "similar-content" page.id request.is_preview %}
    {# Evaluate the queryset once and reuse it for both the emptiness check and the loop. #}
    {% with similar_posts=page.get_similar_posts %}
      {# Skip the whole section, heading included, when there is nothing to show. #}
      {% if similar_posts %}
        <section class="container similar-content" id="similar-content">
          <h2 class="subtitle is-size-2">Similar content</h2>

          {# The loop variable deliberately shadows `page`; presumably listing-item.html reads `page` - confirm. #}
          {% for page in similar_posts %}
            {% block listing_item %}
              {% include "common/listing-item.html" %}
            {% endblock %}
          {% endfor %}
        </section>
      {% endif %}
    {% endwith %}
  {% endcache %}
{% endblock %}

49
website/blog/tests.py Normal file
View file

@ -0,0 +1,49 @@
from django.test import TestCase
from website.home.models import HomePage
from .factories import BlogPostListPageFactory, BlogPostPageFactory
class BlogPostPageTestCase(TestCase):
    """Smoke and query-count tests for a single blog post page."""

    @classmethod
    def setUpTestData(cls) -> None:
        """Create one blog post under a blog listing page under the home page."""
        home = HomePage.objects.get()
        listing = BlogPostListPageFactory(parent=home)

        cls.home_page = home
        cls.blog_post_list_page = listing
        cls.page = BlogPostPageFactory(parent=listing)

    def test_accessible(self) -> None:
        """The post page responds with HTTP 200."""
        status = self.client.get(self.page.url).status_code
        self.assertEqual(status, 200)

    def test_queries(self) -> None:
        """Rendering the post page issues exactly 45 queries."""
        # The URL lookup stays inside the context manager so that any
        # queries it performs are counted, as before.
        with self.assertNumQueries(45):
            url = self.page.url
            self.client.get(url)
class BlogPostListPageTestCase(TestCase):
    """Smoke and query-count tests for the blog listing page and its feed."""

    @classmethod
    def setUpTestData(cls) -> None:
        """Create a blog listing page containing two blog posts."""
        cls.home_page = HomePage.objects.get()
        cls.page = BlogPostListPageFactory(parent=cls.home_page)

        for _ in range(2):
            BlogPostPageFactory(parent=cls.page)

    def test_accessible(self) -> None:
        """The listing responds with 200, lists both posts and links the feed."""
        response = self.client.get(self.page.url)
        listed = response.context["listing_pages"]

        self.assertEqual(response.status_code, 200)
        self.assertEqual(len(listed), 2)
        self.assertContains(response, self.page.reverse_subpage("feed"))

    def test_queries(self) -> None:
        """Rendering the listing page issues exactly 44 queries."""
        # The URL lookup stays inside the context manager so that any
        # queries it performs are counted, as before.
        with self.assertNumQueries(44):
            url = self.page.url
            self.client.get(url)

    def test_feed_accessible(self) -> None:
        """The feed subpage responds with RSS within 12 queries."""
        # The feed URL is built inside the context manager so that any
        # queries from the URL lookup are counted, as before.
        with self.assertNumQueries(12):
            feed_url = self.page.url + self.page.reverse_subpage("feed")
            response = self.client.get(feed_url)

        self.assertEqual(response.status_code, 200)
        self.assertEqual(
            response["Content-Type"], "application/rss+xml; charset=utf-8"
        )