crawler.c
/***************************************************************************
 *                                  _   _ ____  _
 *  Project                     ___| | | |  _ \| |
 *                             / __| | | | |_) | |
 *                            | (__| |_| |  _ <| |___
 *                             \___|\___/|_| \_\_____|
 *
 * Copyright (C) Jeroen Ooms <jeroenooms@gmail.com>
 *
 * This software is licensed as described in the file COPYING, which
 * you should have received as part of this distribution. The terms
 * are also available at https://curl.se/docs/copyright.html.
 *
 * You may opt to use, copy, modify, merge, publish, distribute and/or sell
 * copies of the Software, and permit persons to whom the Software is
 * furnished to do so, under the terms of the COPYING file.
 *
 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
 * KIND, either express or implied.
 *
 * SPDX-License-Identifier: curl
 *
 * To compile:
 *   gcc crawler.c $(pkg-config --cflags --libs libxml-2.0 libcurl)
 *
 */
/* <DESC>
 * Web crawler based on curl and libxml2 to stress-test curl with
 * hundreds of concurrent connections to various servers.
 * </DESC>
 */

#include <libxml/HTMLparser.h>
#include <libxml/xpath.h>
#include <libxml/uri.h>
#include <curl/curl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <signal.h>

/* Parameters */
static int max_con = 200;
static int max_total = 20000;
static int max_requests = 500;
static size_t max_link_per_page = 5;
static int follow_relative_links = 0;
static const char *start_page = "https://www.reuters.com";

static int pending_interrupt = 0;
static void sighandler(int dummy)
{
  (void)dummy;
  pending_interrupt = 1;
}

/* resizable buffer */
struct memory {
  char *buf;
  size_t size;
};

static size_t grow_buffer(void *contents, size_t sz, size_t nmemb, void *ctx)
{
  size_t realsize = sz * nmemb;
  struct memory *mem = (struct memory*) ctx;
  char *ptr = realloc(mem->buf, mem->size + realsize);
  if(!ptr) {
    /* out of memory */
    printf("not enough memory (realloc returned NULL)\n");
    return 0;
  }
  mem->buf = ptr;
  memcpy(&(mem->buf[mem->size]), contents, realsize);
  mem->size += realsize;
  return realsize;
}
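
/* A note on the callback contract: libcurl aborts a transfer with
 * CURLE_WRITE_ERROR whenever the write callback returns anything other
 * than the number of bytes it was handed, so returning 0 above is enough
 * to cancel the download when realloc fails. */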

static CURL *make_handle(const char *url)
{
  CURL *handle = curl_easy_init();
  struct memory *mem;

  /* Important: ask for HTTP/2 over HTTPS, falling back to HTTP/1.1 */
  curl_easy_setopt(handle, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2TLS);
  curl_easy_setopt(handle, CURLOPT_URL, url);

  /* buffer body; the buffer is also stored as CURLOPT_PRIVATE so it can
     be retrieved with CURLINFO_PRIVATE once the transfer completes */
  mem = malloc(sizeof(*mem));
  mem->size = 0;
  mem->buf = malloc(1);
  curl_easy_setopt(handle, CURLOPT_WRITEFUNCTION, grow_buffer);
  curl_easy_setopt(handle, CURLOPT_WRITEDATA, mem);
  curl_easy_setopt(handle, CURLOPT_PRIVATE, mem);

  /* For completeness */
  curl_easy_setopt(handle, CURLOPT_ACCEPT_ENCODING, "");
  curl_easy_setopt(handle, CURLOPT_FOLLOWLOCATION, 1L);
  /* only allow redirects to HTTP and HTTPS URLs */
  curl_easy_setopt(handle, CURLOPT_REDIR_PROTOCOLS_STR, "http,https");
  curl_easy_setopt(handle, CURLOPT_AUTOREFERER, 1L);
  curl_easy_setopt(handle, CURLOPT_MAXREDIRS, 10L);
  /* each transfer needs to be done within 20 seconds! */
  curl_easy_setopt(handle, CURLOPT_TIMEOUT_MS, 20000L);
  /* connect fast or fail */
  curl_easy_setopt(handle, CURLOPT_CONNECTTIMEOUT_MS, 2000L);
  /* skip files larger than a gigabyte */
  curl_easy_setopt(handle, CURLOPT_MAXFILESIZE_LARGE,
                   (curl_off_t)1024*1024*1024);
  curl_easy_setopt(handle, CURLOPT_COOKIEFILE, "");
  curl_easy_setopt(handle, CURLOPT_FILETIME, 1L);
  curl_easy_setopt(handle, CURLOPT_USERAGENT, "mini crawler");
  curl_easy_setopt(handle, CURLOPT_HTTPAUTH, CURLAUTH_ANY);
  curl_easy_setopt(handle, CURLOPT_UNRESTRICTED_AUTH, 1L);
  curl_easy_setopt(handle, CURLOPT_PROXYAUTH, CURLAUTH_ANY);
  curl_easy_setopt(handle, CURLOPT_EXPECT_100_TIMEOUT_MS, 0L);
  return handle;
}

/* HREF finder implemented in libxml2 but could be any HTML parser */
static size_t follow_links(CURLM *multi_handle, struct memory *mem,
                           const char *url)
{
  int opts = HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | \
             HTML_PARSE_NOWARNING | HTML_PARSE_NONET;
  htmlDocPtr doc = htmlReadMemory(mem->buf, (int)mem->size, url, NULL, opts);
  size_t count;
  int i;
  xmlChar *xpath;
  xmlNodeSetPtr nodeset;
  xmlXPathContextPtr context;
  xmlXPathObjectPtr result;
  if(!doc)
    return 0;
  xpath = (xmlChar*) "//a/@href";
  context = xmlXPathNewContext(doc);
  result = xmlXPathEvalExpression(xpath, context);
  xmlXPathFreeContext(context);
  if(!result) {
    xmlFreeDoc(doc);
    return 0;
  }
  nodeset = result->nodesetval;
  if(xmlXPathNodeSetIsEmpty(nodeset)) {
    xmlXPathFreeObject(result);
    xmlFreeDoc(doc);
    return 0;
  }
  count = 0;
  for(i = 0; i < nodeset->nodeNr; i++) {
    /* pick a random href from the set (duplicates are possible); dividing
       by RAND_MAX + 1 keeps the index strictly below nodeNr */
    double r = rand();
    int x = (int)(r / ((double)RAND_MAX + 1.0) * nodeset->nodeNr);
    const xmlNode *node = nodeset->nodeTab[x]->xmlChildrenNode;
    xmlChar *href = xmlNodeListGetString(doc, node, 1);
    char *link;
    if(follow_relative_links) {
      xmlChar *orig = href;
      href = xmlBuildURI(href, (xmlChar *) url);
      xmlFree(orig);
    }
    link = (char *) href;
    if(!link)
      continue;
    if(strlen(link) < 20) {
      xmlFree(link);
      continue;
    }
    if(!strncmp(link, "http://", 7) || !strncmp(link, "https://", 8)) {
      curl_multi_add_handle(multi_handle, make_handle(link));
      if(++count == max_link_per_page) {
        xmlFree(link);
        break;
      }
    }
    xmlFree(link);
  }
  xmlXPathFreeObject(result);
  xmlFreeDoc(doc);
  return count;
}

static int is_html(char *ctype)
{
  /* the header may carry parameters, e.g. "text/html; charset=utf-8" */
  return ctype != NULL && strstr(ctype, "text/html") != NULL;
}
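
/* follow_links() draws from rand() but nothing ever seeds the generator,
 * so every run crawls the same pseudo-random link sequence. A minimal
 * sketch of how one might vary runs, assuming reproducible crawls are not
 * wanted (requires <time.h>, placed near the top of main()):
 *
 *   srand((unsigned int)time(NULL));
 */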

int main(void)
{
  CURLM *multi_handle;
  int msgs_left;
  int pending;
  int complete;
  int still_running;

  signal(SIGINT, sighandler);
  LIBXML_TEST_VERSION
  curl_global_init(CURL_GLOBAL_DEFAULT);
  multi_handle = curl_multi_init();
  curl_multi_setopt(multi_handle, CURLMOPT_MAX_TOTAL_CONNECTIONS, max_con);
  curl_multi_setopt(multi_handle, CURLMOPT_MAX_HOST_CONNECTIONS, 6L);

  /* enable HTTP/2 multiplexing if available */
#ifdef CURLPIPE_MULTIPLEX
  curl_multi_setopt(multi_handle, CURLMOPT_PIPELINING, CURLPIPE_MULTIPLEX);
#endif

  /* seed the crawl with the start page */
  curl_multi_add_handle(multi_handle, make_handle(start_page));

  pending = 0;
  complete = 0;
  still_running = 1;
  while(still_running && !pending_interrupt) {
    int numfds;
    CURLMsg *m;

    curl_multi_wait(multi_handle, NULL, 0, 1000, &numfds);
    curl_multi_perform(multi_handle, &still_running);

    /* See how the transfers went */
    m = NULL;
    while((m = curl_multi_info_read(multi_handle, &msgs_left))) {
      if(m->msg == CURLMSG_DONE) {
        CURL *handle = m->easy_handle;
        char *url;
        struct memory *mem;
        curl_easy_getinfo(handle, CURLINFO_PRIVATE, &mem);
        curl_easy_getinfo(handle, CURLINFO_EFFECTIVE_URL, &url);
        if(m->data.result == CURLE_OK) {
          long res_status;
          curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &res_status);
          if(res_status == 200) {
            char *ctype;
            curl_easy_getinfo(handle, CURLINFO_CONTENT_TYPE, &ctype);
            printf("[%d] HTTP 200 (%s): %s\n", complete,
                   ctype ? ctype : "unknown", url);
            if(is_html(ctype) && mem->size > 100) {
              if(pending < max_requests && (complete + pending) < max_total) {
                pending += (int) follow_links(multi_handle, mem, url);
                still_running = 1;
              }
            }
          }
          else {
            printf("[%d] HTTP %d: %s\n", complete, (int) res_status, url);
          }
        }
        else {
          printf("[%d] Connection failure: %s\n", complete, url);
        }
        curl_multi_remove_handle(multi_handle, handle);
        curl_easy_cleanup(handle);
        free(mem->buf);
        free(mem);
        complete++;
        pending--;
      }
    }
  }
  curl_multi_cleanup(multi_handle);
  curl_global_cleanup();
  return 0;
}
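
/*
 * Example build and run (the output name is arbitrary):
 *
 *   gcc crawler.c $(pkg-config --cflags --libs libxml-2.0 libcurl) -o crawler
 *   ./crawler
 *
 * Press Ctrl-C to stop early: the SIGINT handler only sets
 * pending_interrupt, and the main loop notices the flag and winds down.
 */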