##
# This file is part of GNU TALER.
# Copyright (C) 2014-2016 INRIA
#
# TALER is free software; you can redistribute it and/or modify it under the
# terms of the GNU Lesser General Public License as published by the Free Software
# Foundation; either version 2.1, or (at your option) any later version.
#
# TALER is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
# A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License along with
# GNU TALER; see the file COPYING.  If not, see <http://www.gnu.org/licenses/>
#
# @author Florian Dold
# @brief Define content and associated metadata that is served on the blog.

from collections import OrderedDict, namedtuple
import logging
import os
import re
from bs4 import BeautifulSoup
from pkg_resources import resource_stream, resource_filename

LOGGER = logging.getLogger(__name__)
NOISY_LOGGER = logging.getLogger("chardet.charsetprober")
NOISY_LOGGER.setLevel(logging.INFO)
Article = namedtuple("Article", "slug title teaser main_file extra_files")
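# Fields: slug (title with spaces replaced by underscores), title,
# teaser (short description of the article), main_file (path to the
# article's HTML file), extra_files (image files served with it).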

##
# @var if an article is added to this dict, it will
#      be made available on the blog.
ARTICLES = OrderedDict()


##
# Add an article to the list of available articles.
#
# @param slug article's title with all spaces converted to underscores.
# @param title article's title.
# @param teaser a short description of the article's main content.
# @param main_file path to the article's HTML file.
# @param extra_files collection of extra files associated with the
#        article, like images and sounds.
def add_article(slug, title, teaser, main_file, extra_files):
    ARTICLES[slug] = Article(slug, title, teaser, main_file, extra_files)

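# A usage sketch with hypothetical values (not one of the articles
# actually served below):
#
#   add_article("Example_Title", "Example Title", "A short teaser.",
#               "blog/articles/example.html", ["figure.png"])
#   ARTICLES["Example_Title"].main_file  # -> "blog/articles/example.html"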

##
# Build the file path of an image.
#
# @param image the image filename.
# @return the path to the image file.
def get_image_file(image):
    filex = resource_filename("talermerchantdemos", os.path.join("blog/data/", image))
    return os.path.abspath(filex)
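
# For instance, with a hypothetical image name, assuming the package
# is installed under some prefix:
#
#   get_image_file("figure.png")
#   # -> "<prefix>/talermerchantdemos/blog/data/figure.png"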


##
# Build the file name of an article.
#
# @param article the Article whose main HTML file is looked up.
# @return the base name of the article's HTML file.
def get_article_file(article):
    filex = resource_filename("talermerchantdemos", article.main_file)
    return os.path.basename(filex)
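
# E.g., for the first article registered below, whose main_file is
# "blog/articles/scrap1_U.0.html", this returns "scrap1_U.0.html".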


##
# Extract information from an HTML file, and use this information
# to make the article available on the blog.
#
# @param resource_name path to the (HTML) article.
# @param teaser_paragraph position of the teaser paragraph in the
#        article's list of all the P tags.  Defaults to zero, as this
#        information is normally found in the very first P tag.
# @param title article's title; normally extracted from the HTML
#        itself, so pass it here only if an explicit title needs to
#        be specified.
def add_from_html(resource_name, teaser_paragraph=0, title=None):
    res = resource_stream("talermerchantdemos", resource_name)
    soup = BeautifulSoup(res, 'html.parser')
    res.close()
    if title is None:
        title_el = soup.find("h1", attrs={"class": ["chapter", "unnumbered"]})
        if title_el is None:
            LOGGER.warning("Can't extract title from '%s'", resource_name)
            title = resource_name
        else:
            title = title_el.get_text().strip()
    slug = title.replace(" ", "_")
    paragraphs = soup.find_all("p")

    teaser = soup.find("p", attrs={"id": ["teaser"]})
    if teaser is None:
        teaser = paragraphs[teaser_paragraph].get_text()
    else:
        teaser = teaser.get_text()
    re_proc = re.compile("^/essay/[^/]+/data/[^/]+$")
    imgs = soup.find_all("img")
    extra_files = []
    for img in imgs:
        # We require that any image whose access is regulated is src'd
        # as "/essay/<slug>/data/<filename>".  We also need to check
        # that the <slug> component actually matches the article's slug.
        if re_proc.match(img['src']):
            # Split on "/" explicitly: the src is a URL path, so the
            # platform-dependent os.sep would break this on Windows.
            if img['src'].split('/')[2] == slug:
                LOGGER.info(
                    "extra file for %s is %s",
                    slug, os.path.basename(img['src'])
                )
                extra_files.append(os.path.basename(img['src']))
            else:
                LOGGER.warning(
                    "Image src and slug don't match: '%s' != '%s'",
                    img['src'].split('/')[2], slug
                )
    add_article(slug, title, teaser, resource_name, extra_files)

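# A sketch of the HTML input add_from_html() expects (hypothetical
# article; the real scrap1_*.html files may differ in detail):
#
#   <h1 class="chapter">The Example Essay</h1>
#   <p id="teaser">Optional explicit teaser paragraph.</p>
#   <p>First paragraph, used as the teaser when no id="teaser" exists.</p>
#   <img src="/essay/The_Example_Essay/data/figure.png">
#
# This would register slug "The_Example_Essay" with extra file
# "figure.png".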

add_from_html("blog/articles/scrap1_U.0.html", 0)
add_from_html("blog/articles/scrap1_U.1.html", 0)
add_from_html("blog/articles/scrap1_1.html", 1)
add_from_html("blog/articles/scrap1_2.html")
add_from_html("blog/articles/scrap1_3.html")
add_from_html("blog/articles/scrap1_4.html")
add_from_html("blog/articles/scrap1_5.html")
add_from_html("blog/articles/scrap1_6.html")
add_from_html("blog/articles/scrap1_7.html")
add_from_html("blog/articles/scrap1_8.html")
add_from_html("blog/articles/scrap1_9.html")
add_from_html("blog/articles/scrap1_10.html")
add_from_html("blog/articles/scrap1_11.html")
add_from_html("blog/articles/scrap1_12.html")
add_from_html("blog/articles/scrap1_13.html", 1)
add_from_html("blog/articles/scrap1_14.html")
add_from_html("blog/articles/scrap1_15.html")
add_from_html("blog/articles/scrap1_16.html")
add_from_html("blog/articles/scrap1_17.html")
add_from_html("blog/articles/scrap1_18.html")
add_from_html("blog/articles/scrap1_19.html")
add_from_html("blog/articles/scrap1_20.html", 1)
add_from_html("blog/articles/scrap1_21.html")
add_from_html("blog/articles/scrap1_22.html")
add_from_html("blog/articles/scrap1_23.html")
add_from_html("blog/articles/scrap1_24.html")
add_from_html("blog/articles/scrap1_25.html", 1)
add_from_html("blog/articles/scrap1_26.html", 1)
add_from_html("blog/articles/scrap1_27.html")
add_from_html("blog/articles/scrap1_28.html", 1)
add_from_html("blog/articles/scrap1_29.html")
add_from_html("blog/articles/scrap1_30.html", 1)
add_from_html("blog/articles/scrap1_31.html", 1)
add_from_html("blog/articles/scrap1_32.html")
add_from_html("blog/articles/scrap1_33.html")
add_from_html("blog/articles/scrap1_34.html")
add_from_html("blog/articles/scrap1_35.html")
add_from_html("blog/articles/scrap1_36.html")
add_from_html("blog/articles/scrap1_37.html")
add_from_html("blog/articles/scrap1_38.html")
add_from_html("blog/articles/scrap1_39.html")
add_from_html("blog/articles/scrap1_40.html")
add_from_html("blog/articles/scrap1_41.html")
add_from_html("blog/articles/scrap1_42.html")
add_from_html("blog/articles/scrap1_43.html", 2)
add_from_html("blog/articles/scrap1_46.html", 1)
add_from_html("blog/articles/scrap1_47.html")