quickjs-tart

quickjs-based runtime for wallet-core logic
Log | Files | Refs | README | LICENSE

htmltitle.cpp (6500B)


      1 /***************************************************************************
      2  *                                  _   _ ____  _
      3  *  Project                     ___| | | |  _ \| |
      4  *                             / __| | | | |_) | |
      5  *                            | (__| |_| |  _ <| |___
      6  *                             \___|\___/|_| \_\_____|
      7  *
      8  * Copyright (C) Daniel Stenberg, <daniel@haxx.se>, et al.
      9  *
     10  * This software is licensed as described in the file COPYING, which
     11  * you should have received as part of this distribution. The terms
     12  * are also available at https://curl.se/docs/copyright.html.
     13  *
     14  * You may opt to use, copy, modify, merge, publish, distribute and/or sell
     15  * copies of the Software, and permit persons to whom the Software is
     16  * furnished to do so, under the terms of the COPYING file.
     17  *
     18  * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
     19  * KIND, either express or implied.
     20  *
     21  * SPDX-License-Identifier: curl
     22  *
     23  ***************************************************************************/
     24 /* <DESC>
     25  * Get a web page, extract the title with libxml.
     26  * </DESC>
     27 
     28  Written by Lars Nilsson
     29 
     30  GNU C++ compile command line suggestion (edit paths accordingly):
     31 
     32  g++ -Wall -I/opt/curl/include -I/opt/libxml/include/libxml2 htmltitle.cpp \
     33  -o htmltitle -L/opt/curl/lib -L/opt/libxml/lib -lcurl -lxml2
     34 */
     35 #include <stdio.h>
     36 #include <string.h>
     37 #include <stdlib.h>
     38 #include <string>
     39 #include <curl/curl.h>
     40 #include <libxml/HTMLparser.h>
     41 
     42 //
     43 //  Case-insensitive string comparison
     44 //
     45 
     46 #ifdef _WIN32
     47 #define COMPARE(a, b) (!_stricmp((a), (b)))
     48 #else
     49 #define COMPARE(a, b) (!strcasecmp((a), (b)))
     50 #endif
     51 
     52 //
     53 //  libxml callback context structure
     54 //
     55 
     56 struct Context
     57 {
     58   Context(): addTitle(false) { }
     59 
     60   bool addTitle;
     61   std::string title;
     62 };
     63 
     64 //
     65 //  libcurl variables for error strings and returned data
     66 
     67 static char errorBuffer[CURL_ERROR_SIZE];
     68 static std::string buffer;
     69 
     70 //
     71 //  libcurl write callback function
     72 //
     73 
     74 static size_t writer(char *data, size_t size, size_t nmemb,
     75                      std::string *writerData)
     76 {
     77   if(writerData == NULL)
     78     return 0;
     79 
     80   writerData->append(data, size*nmemb);
     81 
     82   return size * nmemb;
     83 }
     84 
     85 //
     86 //  libcurl connection initialization
     87 //
     88 
     89 static bool init(CURL *&conn, const char *url)
     90 {
     91   CURLcode code;
     92 
     93   conn = curl_easy_init();
     94 
     95   if(conn == NULL) {
     96     fprintf(stderr, "Failed to create CURL connection\n");
     97     return false;
     98   }
     99 
    100   code = curl_easy_setopt(conn, CURLOPT_ERRORBUFFER, errorBuffer);
    101   if(code != CURLE_OK) {
    102     fprintf(stderr, "Failed to set error buffer [%d]\n", code);
    103     return false;
    104   }
    105 
    106   code = curl_easy_setopt(conn, CURLOPT_URL, url);
    107   if(code != CURLE_OK) {
    108     fprintf(stderr, "Failed to set URL [%s]\n", errorBuffer);
    109     return false;
    110   }
    111 
    112   code = curl_easy_setopt(conn, CURLOPT_FOLLOWLOCATION, 1L);
    113   if(code != CURLE_OK) {
    114     fprintf(stderr, "Failed to set redirect option [%s]\n", errorBuffer);
    115     return false;
    116   }
    117 
    118   code = curl_easy_setopt(conn, CURLOPT_WRITEFUNCTION, writer);
    119   if(code != CURLE_OK) {
    120     fprintf(stderr, "Failed to set writer [%s]\n", errorBuffer);
    121     return false;
    122   }
    123 
    124   code = curl_easy_setopt(conn, CURLOPT_WRITEDATA, &buffer);
    125   if(code != CURLE_OK) {
    126     fprintf(stderr, "Failed to set write data [%s]\n", errorBuffer);
    127     return false;
    128   }
    129 
    130   return true;
    131 }
    132 
    133 //
    134 //  libxml start element callback function
    135 //
    136 
    137 static void StartElement(void *voidContext,
    138                          const xmlChar *name,
    139                          const xmlChar **attributes)
    140 {
    141   Context *context = static_cast<Context *>(voidContext);
    142 
    143   if(COMPARE(reinterpret_cast<const char *>(name), "TITLE")) {
    144     context->title = "";
    145     context->addTitle = true;
    146   }
    147   (void) attributes;
    148 }
    149 
    150 //
    151 //  libxml end element callback function
    152 //
    153 
    154 static void EndElement(void *voidContext,
    155                        const xmlChar *name)
    156 {
    157   Context *context = static_cast<Context *>(voidContext);
    158 
    159   if(COMPARE(reinterpret_cast<const char *>(name), "TITLE"))
    160     context->addTitle = false;
    161 }
    162 
    163 //
    164 //  Text handling helper function
    165 //
    166 
    167 static void handleCharacters(Context *context,
    168                              const xmlChar *chars,
    169                              int length)
    170 {
    171   if(context->addTitle)
    172     context->title.append(reinterpret_cast<const char *>(chars),
    173                           (unsigned long)length);
    174 }
    175 
    176 //
    177 //  libxml PCDATA callback function
    178 //
    179 
    180 static void Characters(void *voidContext,
    181                        const xmlChar *chars,
    182                        int length)
    183 {
    184   Context *context = static_cast<Context *>(voidContext);
    185 
    186   handleCharacters(context, chars, length);
    187 }
    188 
    189 //
    190 //  libxml CDATA callback function
    191 //
    192 
    193 static void cdata(void *voidContext,
    194                   const xmlChar *chars,
    195                   int length)
    196 {
    197   Context *context = static_cast<Context *>(voidContext);
    198 
    199   handleCharacters(context, chars, length);
    200 }
    201 
//
//  libxml SAX callback structure
//
//  Positional initializer for the handler table that parseHtml passes to
//  htmlCreatePushParserCtxt.  Only four slots are populated (StartElement,
//  EndElement, Characters and cdata); every other slot is NULL or 0.
//
//  NOTE(review): the slot names in the trailing comments follow the
//  xmlSAXHandler member order from libxml2's parser.h; confirm them against
//  the installed headers before relying on them.  The order of the entries
//  must not be changed.
//

static htmlSAXHandler saxHandler =
{
  NULL,          // internalSubset
  NULL,          // isStandalone
  NULL,          // hasInternalSubset
  NULL,          // hasExternalSubset
  NULL,          // resolveEntity
  NULL,          // getEntity
  NULL,          // entityDecl
  NULL,          // notationDecl
  NULL,          // attributeDecl
  NULL,          // elementDecl
  NULL,          // unparsedEntityDecl
  NULL,          // setDocumentLocator
  NULL,          // startDocument
  NULL,          // endDocument
  StartElement,  // startElement
  EndElement,    // endElement
  NULL,          // reference
  Characters,    // characters
  NULL,          // ignorableWhitespace
  NULL,          // processingInstruction
  NULL,          // comment
  NULL,          // warning
  NULL,          // error
  NULL,          // fatalError
  NULL,          // getParameterEntity
  cdata,         // cdataBlock
  NULL,          // externalSubset
  0,             // initialized
  0,             // _private
  0,             // startElementNs
  0,             // endElementNs
  NULL           // serror
};
    241 
    242 //
    243 //  Parse given (assumed to be) HTML text and return the title
    244 //
    245 
    246 static void parseHtml(const std::string &html,
    247                       std::string &title)
    248 {
    249   htmlParserCtxtPtr ctxt;
    250   Context context;
    251 
    252   ctxt = htmlCreatePushParserCtxt(&saxHandler, &context, "", 0, "",
    253                                   XML_CHAR_ENCODING_NONE);
    254 
    255   htmlParseChunk(ctxt, html.c_str(), (int)html.size(), 0);
    256   htmlParseChunk(ctxt, "", 0, 1);
    257 
    258   htmlFreeParserCtxt(ctxt);
    259 
    260   title = context.title;
    261 }
    262 
    263 int main(int argc, char *argv[])
    264 {
    265   CURL *conn = NULL;
    266   CURLcode code;
    267   std::string title;
    268 
    269   // Ensure one argument is given
    270 
    271   if(argc != 2) {
    272     fprintf(stderr, "Usage: %s <url>\n", argv[0]);
    273     return EXIT_FAILURE;
    274   }
    275 
    276   curl_global_init(CURL_GLOBAL_DEFAULT);
    277 
    278   // Initialize CURL connection
    279 
    280   if(!init(conn, argv[1])) {
    281     fprintf(stderr, "Connection initialization failed\n");
    282     return EXIT_FAILURE;
    283   }
    284 
    285   // Retrieve content for the URL
    286 
    287   code = curl_easy_perform(conn);
    288   curl_easy_cleanup(conn);
    289 
    290   if(code != CURLE_OK) {
    291     fprintf(stderr, "Failed to get '%s' [%s]\n", argv[1], errorBuffer);
    292     return EXIT_FAILURE;
    293   }
    294 
    295   // Parse the (assumed) HTML code
    296   parseHtml(buffer, title);
    297 
    298   // Display the extracted title
    299   printf("Title: %s\n", title.c_str());
    300 
    301   return EXIT_SUCCESS;
    302 }