content.py (5314B)
1 ## 2 # This file is part of GNU TALER. 3 # Copyright (C) 2014-2020 Taler Systems SA 4 # 5 # TALER is free software; you can redistribute it and/or modify it under the 6 # terms of the GNU Lesser General Public License as published by the Free Software 7 # Foundation; either version 2.1, or (at your option) any later version. 8 # 9 # TALER is distributed in the hope that it will be useful, but WITHOUT ANY 10 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 11 # A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 12 # 13 # You should have received a copy of the GNU Lesser General Public License along with 14 # GNU TALER; see the file COPYING. If not, see <http://www.gnu.org/licenses/> 15 # 16 # @author Florian Dold 17 # @brief Define content and associated metadata that is served on the blog. 18 19 from collections import OrderedDict, namedtuple 20 import logging 21 import re 22 from bs4 import BeautifulSoup 23 24 import importlib.resources 25 26 logger = logging.getLogger(__name__) 27 noisy_logger = logging.getLogger("chardet.charsetprober") 28 noisy_logger.setLevel(logging.INFO) 29 Article = namedtuple("Article", "slug title teaser main_file extra_files lang") 30 31 ## 32 # @var if a article is added to this list, then it will 33 # be made available in the blog. 34 # ARTICLES is a dict mapping a languguage ('en') to an OrderedDict() of 35 # articles available in that language. 36 ARTICLES = {} 37 38 articles_per_lang = {} 39 40 41 ## 42 # Add article to the list of the available articles. 43 # 44 # @param slug article's title with all the spaces converted to underscores. 45 # @param title article's title. 46 # @param teaser a short description of the main article's content. 47 # @param main_file path to the article's HTML file. 48 # @param extra_file collection of extra files associated with the 49 # article, like images and sounds. 
# @param lang language of the article
def add_article(slug, title, teaser, main_file, extra_files, lang="en"):
    # Fetch the per-language mapping, creating it on first use.
    per_lang = ARTICLES.setdefault(lang, OrderedDict())
    per_lang[slug] = Article(
        slug=slug,
        title=title,
        teaser=teaser,
        main_file=main_file,
        extra_files=extra_files,
        lang=lang,
    )
    # Every call is counted, including one that replaces an existing slug.
    articles_per_lang[lang] = articles_per_lang.get(lang, 0) + 1


##
# Return contents of an article.
#
# @param article the Article whose main file should be read.
# @return text contents of the article's main file
def get_article_contents(article):
    main_file = article.main_file
    return main_file.read_text()


##
# Extract information from an HTML file, and use this information
# to make the article available in the blog.  The title and the
# teaser are both taken from the HTML itself; the teaser is normally
# found under the very first P tag.
#
# @param resource_name the (HTML) article file to parse.
# @param lang language the article is registered under.
def add_from_html(resource_name, lang):
    """Parse an HTML article and register it through add_article().

    The title is the text of the first <h2> element.  The slug is the
    title with spaces turned into underscores and every remaining run of
    characters outside [a-zA-Z0-9_] replaced by a single "-".  The teaser
    is the <p id="teaser"> element when there is one; otherwise it is the
    first <p> of the document, which is only accepted if its prettified
    markup is at least 100 characters long.  When no title or no
    acceptable teaser can be found, a warning is logged and the article
    is not registered.

    @param resource_name resource holding the article; it must provide
           read_bytes() and is stored as the article's main_file.
    @param lang language the article is registered under.
    """
    soup = BeautifulSoup(resource_name.read_bytes(), "html.parser")
    title_el = soup.find("h2")
    if title_el is None:
        logger.warning("Cannot extract title from '%s'", resource_name)
        return
    title = title_el.get_text().strip()
    slug = re.sub(r"[^a-zA-Z0-9_]+", "-", title.replace(" ", "_"))

    teaser_el = soup.find("p", attrs={"id": ["teaser"]})
    if teaser_el is not None:
        teaser = teaser_el.prettify()
    else:
        # No explicit teaser: fall back to the first paragraph.
        first_paragraph = soup.find("p")
        if first_paragraph is None:
            logger.warning("Cannot extract teaser from '%s'", resource_name)
            return
        teaser = first_paragraph.prettify()
        if len(teaser) < 100:
            logger.warning(
                "Cannot extract adequate teaser from '%s'", resource_name
            )
            return

    # NOTE: collecting access-regulated images (<img> elements src'd as
    # "<slug>/data/<file>") into extra_files is currently disabled, so
    # every article is registered without extra files.
    extra_files = []
    add_article(slug, title, teaser, resource_name, extra_files, lang)


pkgfiles = importlib.resources.files("talermerchantdemos")

supported_langs = {
    "en",
    "ar",
    "zh",
    "fr",
    "hi",
    "it",
    "ja",
    "ko",
    "pt",
    "pt_BR",
    "ru",
    "tr",
    "de",
    "sv",
    "es",
}

# Upper bound on the number of articles registered per language.
MAX_ARTICLES_PER_LANG = 50

for lang_dir in pkgfiles.joinpath("blog/articles/").iterdir():
    lang = lang_dir.name
    if lang not in supported_langs:
        continue
    logger.info("importing %s", lang_dir)
    for article_file in lang_dir.iterdir():
        if not article_file.is_file():
            continue
        # Stop once the cap is reached.  The check must be ">=": with ">"
        # a 51st article would still be registered.
        if articles_per_lang.get(lang, 0) >= MAX_ARTICLES_PER_LANG:
            break
        add_from_html(article_file, lang)