From 25314dd613ecaed102567b2aa3745de34817aaf3 Mon Sep 17 00:00:00 2001 From: Christian Grothoff Date: Wed, 19 Oct 2022 08:27:10 +0200 Subject: try to fix #7390: title extraction for 'Only the Free World Can Stand up to Microsoft' and related issues --- talermerchantdemos/blog/content.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'talermerchantdemos') diff --git a/talermerchantdemos/blog/content.py b/talermerchantdemos/blog/content.py index 8de89f4..ba69248 100644 --- a/talermerchantdemos/blog/content.py +++ b/talermerchantdemos/blog/content.py @@ -104,10 +104,21 @@ def add_from_html(resource_name, lang): teaser = soup.find("p", attrs={"id": ["teaser"]}) if teaser is None: paragraphs = soup.find_all("p") - if len(paragraphs) > 0: - teaser = paragraphs[0].get_text() - if (len(paragraphs) > 1) and (len(teaser) < 100): - teaser2 = paragraphs[1].get_text() + lists = soup.find_all("li") + if (len(paragraphs) > 0) and (len(lists) > 0): + if (paragraphs[0].sourcepos > lists[0].sourcepos): + titleat = lists + else: + titleat = paragraphs + else: + if (len(paragraphs) > 0): + titleat = paragraphs + else: + titleat = lists + if len(titleat) > 0: + teaser = titleat[0].prettify() + if (len(titleat) > 1) and (len(teaser) < 100): + teaser2 = titleat[1].prettify() if len(teaser2) > len(teaser): teaser = teaser2 else: -- cgit v1.2.3