Fix pickle errors for metadata

This commit is contained in:
Jake Howard 2024-07-01 22:34:58 +01:00
parent d242f94024
commit 41a04af8dc
Signed by: jake
GPG Key ID: 57AFB45680EDD477
3 changed files with 23 additions and 6 deletions

View File

@@ -7,7 +7,7 @@ from django.db.models.functions import Cast, Coalesce
from django.http import HttpRequest, HttpResponse, HttpResponsePermanentRedirect
from django.utils import timezone
from django.utils.functional import cached_property
from metadata_parser import MetadataParser
from metadata_parser import ParsedResult
from modelcluster.fields import ParentalManyToManyField
from wagtail.admin.panels import FieldPanel
from wagtail.models import Page, PageQuerySet, Site
@@ -239,7 +239,7 @@ class ExternalBlogPostPage(BaseContentPage):
return tags
@cached_property
def metadata(self) -> MetadataParser:
def metadata(self) -> ParsedResult:
    """Return the parsed metadata looked up for this page's ``external_url``.

    The lookup is delegated to ``get_page_metadata``.
    """
    parsed = get_page_metadata(self.external_url)
    return parsed
@cached_property

View File

@@ -1,3 +1,5 @@
import pickle
from django.test import TestCase
from django.urls import reverse
@@ -96,7 +98,9 @@ class ExternalBlogPostPageTestCase(TestCase):
def setUpTestData(cls) -> None:
cls.home_page = HomePage.objects.get()
cls.blog_post_list_page = BlogPostListPageFactory(parent=cls.home_page)
cls.page = ExternalBlogPostPageFactory(parent=cls.blog_post_list_page)
cls.page = ExternalBlogPostPageFactory(
parent=cls.blog_post_list_page, external_url="https://example.com"
)
def test_redirects(self) -> None:
with self.assertNumQueries(10):
@@ -107,3 +111,11 @@ class ExternalBlogPostPageTestCase(TestCase):
status_code=301,
fetch_redirect_response=False,
)
def test_metadata(self) -> None:
    """Check the page metadata carries no soup and can be pickled."""
    parsed = self.page.metadata

    # The soup attribute is expected to be cleared on the returned result.
    self.assertIsNone(parsed.soup)

    # Confirm it can pickle: dumps() raising here would fail the test.
    pickle.dumps(parsed)

View File

@@ -11,7 +11,7 @@ from django.http import QueryDict
from django.http.request import HttpRequest
from django.utils.text import slugify
from django_cache_decorator import django_cache_decorator
from metadata_parser import MetadataParser
from metadata_parser import MetadataParser, ParsedResult
from wagtail.models import Page, Site
from wagtail.models import get_page_models as get_wagtail_page_models
@@ -128,8 +128,13 @@ def get_ai_robots_txt() -> str:
@django_cache_decorator(time=21600)
def get_page_metadata(url: str) -> MetadataParser:
return MetadataParser(url=url, search_head_only=True)
def get_page_metadata(url: str) -> ParsedResult:
    """Parse the metadata of ``url`` and return the result with its soup cleared.

    Args:
        url: Address handed to ``MetadataParser``.

    Returns:
        The parser's ``parsed_result`` with its ``soup`` attribute set to
        ``None``.
    """
    parser = MetadataParser(url=url, search_head_only=True)
    result = parser.parsed_result

    # HACK: BeautifulSoup doesn't pickle nicely, and so can't be cached.
    # Dropping the soup reference leaves a result that can be pickled.
    result.soup = None

    return result
def extend_query_params(url: str, params: dict[str, Any]) -> str: