Fix pickle errors for metadata

This commit is contained in:
Jake Howard 2024-07-01 22:34:58 +01:00
parent d242f94024
commit 41a04af8dc
Signed by: jake
GPG key ID: 57AFB45680EDD477
3 changed files with 23 additions and 6 deletions

View file

@@ -7,7 +7,7 @@ from django.db.models.functions import Cast, Coalesce
from django.http import HttpRequest, HttpResponse, HttpResponsePermanentRedirect from django.http import HttpRequest, HttpResponse, HttpResponsePermanentRedirect
from django.utils import timezone from django.utils import timezone
from django.utils.functional import cached_property from django.utils.functional import cached_property
from metadata_parser import MetadataParser from metadata_parser import ParsedResult
from modelcluster.fields import ParentalManyToManyField from modelcluster.fields import ParentalManyToManyField
from wagtail.admin.panels import FieldPanel from wagtail.admin.panels import FieldPanel
from wagtail.models import Page, PageQuerySet, Site from wagtail.models import Page, PageQuerySet, Site
@@ -239,7 +239,7 @@ class ExternalBlogPostPage(BaseContentPage):
return tags return tags
@cached_property @cached_property
def metadata(self) -> MetadataParser: def metadata(self) -> ParsedResult:
return get_page_metadata(self.external_url) return get_page_metadata(self.external_url)
@cached_property @cached_property

View file

@@ -1,3 +1,5 @@
import pickle
from django.test import TestCase from django.test import TestCase
from django.urls import reverse from django.urls import reverse
@@ -96,7 +98,9 @@ class ExternalBlogPostPageTestCase(TestCase):
def setUpTestData(cls) -> None: def setUpTestData(cls) -> None:
cls.home_page = HomePage.objects.get() cls.home_page = HomePage.objects.get()
cls.blog_post_list_page = BlogPostListPageFactory(parent=cls.home_page) cls.blog_post_list_page = BlogPostListPageFactory(parent=cls.home_page)
cls.page = ExternalBlogPostPageFactory(parent=cls.blog_post_list_page) cls.page = ExternalBlogPostPageFactory(
parent=cls.blog_post_list_page, external_url="https://example.com"
)
def test_redirects(self) -> None: def test_redirects(self) -> None:
with self.assertNumQueries(10): with self.assertNumQueries(10):
@@ -107,3 +111,11 @@ class ExternalBlogPostPageTestCase(TestCase):
status_code=301, status_code=301,
fetch_redirect_response=False, fetch_redirect_response=False,
) )
def test_metadata(self) -> None:
metadata = self.page.metadata
self.assertIsNone(metadata.soup)
# Confirm it can pickle
pickle.dumps(metadata)

View file

@@ -11,7 +11,7 @@ from django.http import QueryDict
from django.http.request import HttpRequest from django.http.request import HttpRequest
from django.utils.text import slugify from django.utils.text import slugify
from django_cache_decorator import django_cache_decorator from django_cache_decorator import django_cache_decorator
from metadata_parser import MetadataParser from metadata_parser import MetadataParser, ParsedResult
from wagtail.models import Page, Site from wagtail.models import Page, Site
from wagtail.models import get_page_models as get_wagtail_page_models from wagtail.models import get_page_models as get_wagtail_page_models
@@ -128,8 +128,13 @@ def get_ai_robots_txt() -> str:
@django_cache_decorator(time=21600) @django_cache_decorator(time=21600)
def get_page_metadata(url: str) -> MetadataParser: def get_page_metadata(url: str) -> ParsedResult:
return MetadataParser(url=url, search_head_only=True) metadata = MetadataParser(url=url, search_head_only=True).parsed_result
# HACK: BeautifulSoup doesn't pickle nicely, and so can't be cached
metadata.soup = None
return metadata
def extend_query_params(url: str, params: dict[str, Any]) -> str: def extend_query_params(url: str, params: dict[str, Any]) -> str: