summary | refs | log | tree | commit | diff
path: root/talermerchantdemos/blog/content.py
diff options
context:
space:
mode:
Diffstat (limited to 'talermerchantdemos/blog/content.py')
-rw-r--r-- talermerchantdemos/blog/content.py 24
1 files changed, 12 insertions, 12 deletions
diff --git a/talermerchantdemos/blog/content.py b/talermerchantdemos/blog/content.py
index d049718..1b2a466 100644
--- a/talermerchantdemos/blog/content.py
+++ b/talermerchantdemos/blog/content.py
@@ -96,25 +96,24 @@ def add_from_html(resource_name, lang):
title_el = soup.find("h2")
if title_el is None:
LOGGER.warning("Cannot extract title from '%s'", resource_name)
- title = resource_name
- else:
- title = title_el.get_text().strip()
- slug = quote(title.replace(" ", "_"), safe="")
+ return
+ title = title_el.get_text().strip()
+ slug = title.replace(" ", "_")
+ slug = re.sub(r'[^a-zA-Z0-9_]+', "-", slug)
teaser = soup.find("p", attrs={"id": ["teaser"]})
if teaser is None:
paragraphs = soup.find_all("p")
if len(paragraphs) > 0:
- teaser = paragraphs[0].get_text()
- if (len(paragraphs) > 1) and (len(teaser) < 100):
- teaser2 = paragraphs[1].get_text()
- if len(teaser2) > len(teaser):
- teaser = teaser2
+ teaser = paragraphs[0].prettify()
+ if len(teaser) < 100:
+ LOGGER.warning("Cannot extract adequate teaser from '%s'", resource_name)
+ return
else:
LOGGER.warning("Cannot extract teaser from '%s'", resource_name)
- teaser = ""
+ return
else:
- teaser = teaser.get_text()
+ teaser = teaser.prettify()
re_proc = re.compile("^/[^/][^/]/essay/[^/]+/data/[^/]+$")
imgs = soup.find_all("img")
extra_files = []
@@ -141,4 +140,5 @@ for l in listdir(resource_filename("talermerchantdemos", "blog/articles/")):
if l in {"en", "ar", "zh", "fr", "hi", "it", "ja", "ko", "pt", "pt_BR", "ru", "tr", "de", "sv", "es"}:
LOGGER.info("importing %s" % l)
for a in listdir(resource_filename("talermerchantdemos", "blog/articles/" + l)):
- add_from_html("blog/articles/" + l + "/" + a, l)
+ if os.path.isfile(resource_filename("talermerchantdemos", "blog/articles/" + l + "/" + a)):
+ add_from_html("blog/articles/" + l + "/" + a, l)