taler-merchant-demos

Python-based Frontends for the Demonstration Web site
Log | Files | Refs | Submodules | README | LICENSE

content.py (5314B)


      1 ##
      2 # This file is part of GNU TALER.
      3 # Copyright (C) 2014-2020 Taler Systems SA
      4 #
      5 # TALER is free software; you can redistribute it and/or modify it under the
      6 # terms of the GNU Lesser General Public License as published by the Free Software
      7 # Foundation; either version 2.1, or (at your option) any later version.
      8 #
      9 # TALER is distributed in the hope that it will be useful, but WITHOUT ANY
     10 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
     11 # A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more details.
     12 #
     13 # You should have received a copy of the GNU Lesser General Public License along with
     14 # GNU TALER; see the file COPYING.  If not, see <http://www.gnu.org/licenses/>
     15 #
     16 # @author Florian Dold
     17 # @brief Define content and associated metadata that is served on the blog.
     18 
     19 from collections import OrderedDict, namedtuple
     20 import logging
     21 import re
     22 from bs4 import BeautifulSoup
     23 
     24 import importlib.resources
     25 
     26 logger = logging.getLogger(__name__)
     27 noisy_logger = logging.getLogger("chardet.charsetprober")
     28 noisy_logger.setLevel(logging.INFO)
     29 Article = namedtuple("Article", "slug title teaser main_file extra_files lang")
     30 
     31 ##
     32 # @var if a article is added to this list, then it will
     33 #      be made available in the blog.
     34 #      ARTICLES is a dict mapping a languguage ('en') to an OrderedDict() of
     35 #      articles available in that language.
     36 ARTICLES = {}
     37 
     38 articles_per_lang = {}
     39 
     40 
     41 ##
     42 # Add article to the list of the available articles.
     43 #
     44 # @param slug article's title with all the spaces converted to underscores.
     45 # @param title article's title.
     46 # @param teaser a short description of the main article's content.
     47 # @param main_file path to the article's HTML file.
     48 # @param extra_file collection of extra files associated with the
     49 #        article, like images and sounds.
     50 # @param lang language of the arcile
     51 def add_article(slug, title, teaser, main_file, extra_files, lang="en"):
     52     if not (lang in ARTICLES):
     53         ARTICLES[lang] = OrderedDict()
     54     ARTICLES[lang][slug] = Article(slug, title, teaser, main_file, extra_files, lang)
     55     articles_per_lang.setdefault(lang, 0)
     56     articles_per_lang[lang] += 1
     57 
     58 
     59 ##
     60 # Return contents of an article.
     61 #
     62 # @param article the article filename.
     63 # @return text contents of the article
     64 def get_article_contents(article):
     65     return article.main_file.read_text()
     66 
     67 
     68 ##
     69 # Extract information from HTML file, and use these informations
     70 # to make the article available in the blog.
     71 #
     72 # @param resource_name path to the (HTML) article.
     73 # @param teaser_paragraph position of the teaser paragraph in the
     74 #        article's list of all the P tags.  Defaults to zero, as normally
     75 #        this information is found under the very first P tag.
     76 # @param title article's title; normally, this bit is extracted from the
     77 #        HTML itself, so give it here if a explicit title needs to be
     78 #        specified.
     79 def add_from_html(resource_name, lang):
     80     soup = BeautifulSoup(resource_name.read_bytes(), "html.parser")
     81     title_el = soup.find("h2")
     82     if title_el is None:
     83         logger.warning("Cannot extract title from '%s'", resource_name)
     84         return
     85     title = title_el.get_text().strip()
     86     slug = title.replace(" ", "_")
     87     slug = re.sub(r"[^a-zA-Z0-9_]+", "-", slug)
     88 
     89     teaser = soup.find("p", attrs={"id": ["teaser"]})
     90     if teaser is None:
     91         paragraphs = soup.find_all("p")
     92         if len(paragraphs) > 0:
     93             teaser = paragraphs[0].prettify()
     94             if len(teaser) < 100:
     95                 logger.warning(
     96                     "Cannot extract adequate teaser from '%s'", resource_name
     97                 )
     98                 return
     99         else:
    100             logger.warning("Cannot extract teaser from '%s'", resource_name)
    101             return
    102     else:
    103         teaser = teaser.prettify()
    104     re_proc = re.compile("^/[^/][^/]/essay/[^/]+/data/[^/]+$")
    105     imgs = soup.find_all("img")
    106     extra_files = []
    107     #for img in imgs:
    108     #    # We require that any image whose access is regulated is src'd
    109     #    # as "<slug>/data/img.png". We also need to check if the <slug>
    110     #    # component actually matches the article's slug
    111     #    if re_proc.match(img["src"]):
    112     #        if img["src"].split(os.sep)[2] == slug:
    113     #            LOGGER.info(
    114     #                "extra file for %s is %s" % (slug, os.path.basename(img["src"]))
    115     #            )
    116     #            extra_files.append(os.path.basename(img["src"]))
    117     #        else:
    118     #            LOGGER.warning(
    119     #                "Image src and slug don't match: '%s' != '%s'"
    120     #                % (img["src"].split(os.sep)[2], slug)
    121     #            )
    122     add_article(slug, title, teaser, resource_name, extra_files, lang)
    123 
    124 
# Resource root of the "talermerchantdemos" package; the article files
# are looked up beneath it.
pkgfiles = importlib.resources.files("talermerchantdemos")
    126 
    127 supported_langs = {
    128     "en",
    129     "ar",
    130     "zh",
    131     "fr",
    132     "hi",
    133     "it",
    134     "ja",
    135     "ko",
    136     "pt",
    137     "pt_BR",
    138     "ru",
    139     "tr",
    140     "de",
    141     "sv",
    142     "es",
    143 }
    144 
    145 for l in pkgfiles.joinpath("blog/articles/").iterdir():
    146     lang = l.name
    147     if lang not in supported_langs:
    148         continue
    149     logger.info("importing %s" % l)
    150     for a in l.iterdir():
    151         if not a.is_file():
    152             continue
    153         # Max 50 articles per language
    154         if articles_per_lang.get(lang, 0) > 50:
    155             break
    156         add_from_html(a, lang)