diff options
author | Christian Grothoff <christian@grothoff.org> | 2022-10-20 23:52:09 +0200 |
---|---|---|
committer | Christian Grothoff <christian@grothoff.org> | 2022-10-20 23:52:09 +0200 |
commit | ea081314ab0a1b74e31f36fd2a163d6b84d2b9b0 (patch) | |
tree | e0d1de9aab5fb5cc66d51d3c32746947b594fc18 | |
parent | b0c67aa4aeaf03ec7deac4f879811646c7308071 (diff) | |
download | taler-merchant-demos-ea081314ab0a1b74e31f36fd2a163d6b84d2b9b0.tar.gz taler-merchant-demos-ea081314ab0a1b74e31f36fd2a163d6b84d2b9b0.tar.bz2 taler-merchant-demos-ea081314ab0a1b74e31f36fd2a163d6b84d2b9b0.zip |
simplify extraction
-rw-r--r-- | talermerchantdemos/blog/content.py | 30 |
1 file changed, 6 insertions(+), 24 deletions(-)
diff --git a/talermerchantdemos/blog/content.py b/talermerchantdemos/blog/content.py
index bb5fad6..875a5fa 100644
--- a/talermerchantdemos/blog/content.py
+++ b/talermerchantdemos/blog/content.py
@@ -104,34 +104,16 @@ def add_from_html(resource_name, lang):
     teaser = soup.find("p", attrs={"id": ["teaser"]})
     if teaser is None:
         paragraphs = soup.find_all("p")
-        lists = soup.find_all("li")
-        if (len(paragraphs) > 0) and (len(lists) > 0):
-            if (paragraphs[0].sourceline > lists[0].sourceline):
-                titleat = lists
-            else:
-                titleat = paragraphs
-        else:
-            if (len(paragraphs) > 0):
-                titleat = paragraphs
-            else:
-                titleat = lists
         if len(titleat) > 0:
-            if (titleat[0].tag == 'li'):
-                teaser = titleat[0].contents[0].prettify()
-            else:
-                teaser = titleat[0].prettify()
-            if (len(titleat) > 1) and (len(teaser) < 100):
-                if (titleat[1].tag == 'li'):
-                    teaser2 = titleat[1].contents[0].prettify()
-                else:
-                    teaser2 = titleat[1].prettify()
-                if len(teaser2) > len(teaser):
-                    teaser = teaser2
+            teaser = paragraphs[0].prettify()
+            if len(teaser) < 100:
+                LOGGER.warning("Cannot extract adequate teaser from '%s'", resource_name)
+                return
         else:
             LOGGER.warning("Cannot extract teaser from '%s'", resource_name)
-            teaser = ""
+            return
     else:
-        teaser = teaser.get_text()
+        teaser = teaser.prettify()
     re_proc = re.compile("^/[^/][^/]/essay/[^/]+/data/[^/]+$")
     imgs = soup.find_all("img")
     extra_files = []