From ea081314ab0a1b74e31f36fd2a163d6b84d2b9b0 Mon Sep 17 00:00:00 2001 From: Christian Grothoff Date: Thu, 20 Oct 2022 23:52:09 +0200 Subject: simplify extraction --- talermerchantdemos/blog/content.py | 30 ++++++------------------------ 1 file changed, 6 insertions(+), 24 deletions(-) (limited to 'talermerchantdemos/blog/content.py') diff --git a/talermerchantdemos/blog/content.py b/talermerchantdemos/blog/content.py index bb5fad6..875a5fa 100644 --- a/talermerchantdemos/blog/content.py +++ b/talermerchantdemos/blog/content.py @@ -104,34 +104,16 @@ def add_from_html(resource_name, lang): teaser = soup.find("p", attrs={"id": ["teaser"]}) if teaser is None: paragraphs = soup.find_all("p") - lists = soup.find_all("li") - if (len(paragraphs) > 0) and (len(lists) > 0): - if (paragraphs[0].sourceline > lists[0].sourceline): - titleat = lists - else: - titleat = paragraphs - else: - if (len(paragraphs) > 0): - titleat = paragraphs - else: - titleat = lists if len(titleat) > 0: - if (titleat[0].tag == 'li'): - teaser = titleat[0].contents[0].prettify() - else: - teaser = titleat[0].prettify() - if (len(titleat) > 1) and (len(teaser) < 100): - if (titleat[1].tag == 'li'): - teaser2 = titleat[1].contents[0].prettify() - else: - teaser2 = titleat[1].prettify() - if len(teaser2) > len(teaser): - teaser = teaser2 + teaser = paragraphs[0].prettify() + if len(teaser) < 100: + LOGGER.warning("Cannot extract adequate teaser from '%s'", resource_name) + return else: LOGGER.warning("Cannot extract teaser from '%s'", resource_name) - teaser = "" + return else: - teaser = teaser.get_text() + teaser = teaser.prettify() re_proc = re.compile("^/[^/][^/]/essay/[^/]+/data/[^/]+$") imgs = soup.find_all("img") extra_files = [] -- cgit v1.2.3