commit 25314dd613ecaed102567b2aa3745de34817aaf3 parent a09b7bde1827b2f8c59e9f7a4227c3e204489d84 Author: Christian Grothoff <christian@grothoff.org> Date: Wed, 19 Oct 2022 08:27:10 +0200 try to fix #7390: title extraction for 'Only the Free World Can Stand up to Microsoft' and related issues Diffstat:
| M | talermerchantdemos/blog/content.py | | | 19 | +++++++++++++++---- |
1 file changed, 15 insertions(+), 4 deletions(-)
diff --git a/talermerchantdemos/blog/content.py b/talermerchantdemos/blog/content.py @@ -104,10 +104,21 @@ def add_from_html(resource_name, lang): teaser = soup.find("p", attrs={"id": ["teaser"]}) if teaser is None: paragraphs = soup.find_all("p") - if len(paragraphs) > 0: - teaser = paragraphs[0].get_text() - if (len(paragraphs) > 1) and (len(teaser) < 100): - teaser2 = paragraphs[1].get_text() + lists = soup.find_all("li") + if (len(paragraphs) > 0) and (len(lists) > 0): + if (paragraphs[0].sourcepos > lists[0].sourcepos): + titleat = lists + else: + titleat = paragraphs + else: + if (len(paragraphs) > 0): + titleat = paragraphs + else: + titleat = lists + if len(titleat) > 0: + teaser = titelat[0].prettify() + if (len(titleat) > 1) and (len(teaser) < 100): + teaser2 = titleat[1].prettify() if len(teaser2) > len(teaser): teaser = teaser2 else: