From 41a04af8dc5fb5f42642a60d1a550ded08c5d15a Mon Sep 17 00:00:00 2001 From: Jake Howard Date: Mon, 1 Jul 2024 22:34:58 +0100 Subject: [PATCH] Fix pickle errors for metadata --- website/blog/models.py | 4 ++-- website/blog/tests.py | 14 +++++++++++++- website/common/utils.py | 11 ++++++++--- 3 files changed, 23 insertions(+), 6 deletions(-) diff --git a/website/blog/models.py b/website/blog/models.py index b33c1f3..97df387 100644 --- a/website/blog/models.py +++ b/website/blog/models.py @@ -7,7 +7,7 @@ from django.db.models.functions import Cast, Coalesce from django.http import HttpRequest, HttpResponse, HttpResponsePermanentRedirect from django.utils import timezone from django.utils.functional import cached_property -from metadata_parser import MetadataParser +from metadata_parser import ParsedResult from modelcluster.fields import ParentalManyToManyField from wagtail.admin.panels import FieldPanel from wagtail.models import Page, PageQuerySet, Site @@ -239,7 +239,7 @@ class ExternalBlogPostPage(BaseContentPage): return tags @cached_property - def metadata(self) -> MetadataParser: + def metadata(self) -> ParsedResult: return get_page_metadata(self.external_url) @cached_property diff --git a/website/blog/tests.py b/website/blog/tests.py index fa66c55..848888e 100644 --- a/website/blog/tests.py +++ b/website/blog/tests.py @@ -1,3 +1,5 @@ +import pickle + from django.test import TestCase from django.urls import reverse @@ -96,7 +98,9 @@ class ExternalBlogPostPageTestCase(TestCase): def setUpTestData(cls) -> None: cls.home_page = HomePage.objects.get() cls.blog_post_list_page = BlogPostListPageFactory(parent=cls.home_page) - cls.page = ExternalBlogPostPageFactory(parent=cls.blog_post_list_page) + cls.page = ExternalBlogPostPageFactory( + parent=cls.blog_post_list_page, external_url="https://example.com" + ) def test_redirects(self) -> None: with self.assertNumQueries(10): @@ -107,3 +111,11 @@ class ExternalBlogPostPageTestCase(TestCase): status_code=301, fetch_redirect_response=False, ) + + def test_metadata(self) -> None: + metadata = self.page.metadata + + self.assertIsNone(metadata.soup) + + # Confirm it can pickle + pickle.dumps(metadata) diff --git a/website/common/utils.py b/website/common/utils.py index 2e83fd9..6aef1f2 100644 --- a/website/common/utils.py +++ b/website/common/utils.py @@ -11,7 +11,7 @@ from django.http import QueryDict from django.http.request import HttpRequest from django.utils.text import slugify from django_cache_decorator import django_cache_decorator -from metadata_parser import MetadataParser +from metadata_parser import MetadataParser, ParsedResult from wagtail.models import Page, Site from wagtail.models import get_page_models as get_wagtail_page_models @@ -128,8 +128,13 @@ def get_ai_robots_txt() -> str: @django_cache_decorator(time=21600) -def get_page_metadata(url: str) -> MetadataParser: - return MetadataParser(url=url, search_head_only=True) +def get_page_metadata(url: str) -> ParsedResult: + metadata = MetadataParser(url=url, search_head_only=True).parsed_result + + # HACK: BeautifulSoup doesn't pickle nicely, and so can't be cached + metadata.soup = None + + return metadata def extend_query_params(url: str, params: dict[str, Any]) -> str: