crawler.c
/***************************************************************************
 *                                  _   _ ____  _
 *  Project                     ___| | | |  _ \| |
 *                             / __| | | | |_) | |
 *                            | (__| |_| |  _ <| |___
 *                             \___|\___/|_| \_\_____|
 *
 * Copyright (C) Jeroen Ooms <jeroenooms@gmail.com>
 *
 * This software is licensed as described in the file COPYING, which
 * you should have received as part of this distribution. The terms
 * are also available at https://curl.se/docs/copyright.html.
 *
 * You may opt to use, copy, modify, merge, publish, distribute and/or sell
 * copies of the Software, and permit persons to whom the Software is
 * furnished to do so, under the terms of the COPYING file.
 *
 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
 * KIND, either express or implied.
 *
 * SPDX-License-Identifier: curl
 *
 * To compile:
 *   gcc crawler.c $(pkg-config --cflags --libs libxml-2.0 libcurl)
 *
 */
/* <DESC>
 * Web crawler based on curl and libxml2 to stress-test curl with
 * hundreds of concurrent connections to various servers.
 * </DESC>
 */

#include <libxml/HTMLparser.h>
#include <libxml/xpath.h>
#include <libxml/uri.h>
#include <curl/curl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <signal.h>

/* Parameters */
static int max_con = 200;
static int max_total = 20000;
static int max_requests = 500;
static size_t max_link_per_page = 5;
static int follow_relative_links = 0;
static const char *start_page = "https://www.reuters.com";

static int pending_interrupt = 0;
static void sighandler(int dummy)
{
  (void)dummy;
  pending_interrupt = 1;
}

/* resizable buffer */
struct memory {
  char *buf;
  size_t size;
};

static size_t grow_buffer(void *contents, size_t sz, size_t nmemb, void *ctx)
{
  size_t realsize = sz * nmemb;
  struct memory *mem = (struct memory*) ctx;
  char *ptr = realloc(mem->buf, mem->size + realsize);
  if(!ptr) {
    /* out of memory */
    printf("not enough memory (realloc returned NULL)\n");
    return 0;
  }
  mem->buf = ptr;
  memcpy(&(mem->buf[mem->size]), contents, realsize);
  mem->size += realsize;
  return realsize;
}
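
/* A note on the callback contract: libcurl aborts a transfer with
 * CURLE_WRITE_ERROR whenever the write callback returns anything other
 * than the number of bytes it was handed, so returning 0 above is enough
 * to cancel the download when realloc fails. */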

static CURL *make_handle(const char *url)
{
  CURL *handle = curl_easy_init();
  struct memory *mem;

  /* Important: ask for HTTP/2 over HTTPS, falling back to HTTP/1.1 */
  curl_easy_setopt(handle, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2TLS);
  curl_easy_setopt(handle, CURLOPT_URL, url);

  /* buffer body; the buffer is also stored as CURLOPT_PRIVATE so it can
     be retrieved with CURLINFO_PRIVATE once the transfer completes */
  mem = malloc(sizeof(*mem));
  mem->size = 0;
  mem->buf = malloc(1);
  curl_easy_setopt(handle, CURLOPT_WRITEFUNCTION, grow_buffer);
  curl_easy_setopt(handle, CURLOPT_WRITEDATA, mem);
  curl_easy_setopt(handle, CURLOPT_PRIVATE, mem);

  /* For completeness */
  curl_easy_setopt(handle, CURLOPT_ACCEPT_ENCODING, "");
  curl_easy_setopt(handle, CURLOPT_FOLLOWLOCATION, 1L);
  /* only allow redirects to HTTP and HTTPS URLs */
  curl_easy_setopt(handle, CURLOPT_REDIR_PROTOCOLS_STR, "http,https");
  curl_easy_setopt(handle, CURLOPT_AUTOREFERER, 1L);
  curl_easy_setopt(handle, CURLOPT_MAXREDIRS, 10L);
  /* each transfer needs to be done within 20 seconds! */
  curl_easy_setopt(handle, CURLOPT_TIMEOUT_MS, 20000L);
  /* connect fast or fail */
  curl_easy_setopt(handle, CURLOPT_CONNECTTIMEOUT_MS, 2000L);
  /* skip files larger than a gigabyte */
  curl_easy_setopt(handle, CURLOPT_MAXFILESIZE_LARGE,
                   (curl_off_t)1024*1024*1024);
  curl_easy_setopt(handle, CURLOPT_COOKIEFILE, "");
  curl_easy_setopt(handle, CURLOPT_FILETIME, 1L);
  curl_easy_setopt(handle, CURLOPT_USERAGENT, "mini crawler");
  curl_easy_setopt(handle, CURLOPT_HTTPAUTH, CURLAUTH_ANY);
  curl_easy_setopt(handle, CURLOPT_UNRESTRICTED_AUTH, 1L);
  curl_easy_setopt(handle, CURLOPT_PROXYAUTH, CURLAUTH_ANY);
  curl_easy_setopt(handle, CURLOPT_EXPECT_100_TIMEOUT_MS, 0L);
  return handle;
}

/* HREF finder implemented in libxml2 but could be any HTML parser */
static size_t follow_links(CURLM *multi_handle, struct memory *mem,
                           const char *url)
{
  int opts = HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | \
             HTML_PARSE_NOWARNING | HTML_PARSE_NONET;
  htmlDocPtr doc = htmlReadMemory(mem->buf, (int)mem->size, url, NULL, opts);
  size_t count;
  int i;
  xmlChar *xpath;
  xmlNodeSetPtr nodeset;
  xmlXPathContextPtr context;
  xmlXPathObjectPtr result;
  if(!doc)
    return 0;
  xpath = (xmlChar*) "//a/@href";
  context = xmlXPathNewContext(doc);
  result = xmlXPathEvalExpression(xpath, context);
  xmlXPathFreeContext(context);
  if(!result) {
    xmlFreeDoc(doc);
    return 0;
  }
  nodeset = result->nodesetval;
  if(xmlXPathNodeSetIsEmpty(nodeset)) {
    xmlXPathFreeObject(result);
    xmlFreeDoc(doc);
    return 0;
  }
  count = 0;
  for(i = 0; i < nodeset->nodeNr; i++) {
    /* pick a random href from the set (duplicates are possible); dividing
       by RAND_MAX + 1 keeps the index strictly below nodeNr */
    double r = rand();
    int x = (int)(r / ((double)RAND_MAX + 1.0) * nodeset->nodeNr);
    const xmlNode *node = nodeset->nodeTab[x]->xmlChildrenNode;
    xmlChar *href = xmlNodeListGetString(doc, node, 1);
    char *link;
    if(follow_relative_links) {
      xmlChar *orig = href;
      href = xmlBuildURI(href, (xmlChar *) url);
      xmlFree(orig);
    }
    link = (char *) href;
    if(!link)
      continue;
    if(strlen(link) < 20) {
      xmlFree(link);
      continue;
    }
    if(!strncmp(link, "http://", 7) || !strncmp(link, "https://", 8)) {
      curl_multi_add_handle(multi_handle, make_handle(link));
      if(++count == max_link_per_page) {
        xmlFree(link);
        break;
      }
    }
    xmlFree(link);
  }
  xmlXPathFreeObject(result);
  xmlFreeDoc(doc);
  return count;
}

static int is_html(char *ctype)
{
  /* the header may carry parameters, e.g. "text/html; charset=utf-8" */
  return ctype != NULL && strstr(ctype, "text/html") != NULL;
}
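
/* follow_links() draws from rand() but nothing ever seeds the generator,
 * so every run crawls the same pseudo-random link sequence. A minimal
 * sketch of how one might vary runs, assuming reproducible crawls are not
 * wanted (requires <time.h>, placed near the top of main()):
 *
 *   srand((unsigned int)time(NULL));
 */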

int main(void)
{
  CURLM *multi_handle;
  int msgs_left;
  int pending;
  int complete;
  int still_running;

  signal(SIGINT, sighandler);
  LIBXML_TEST_VERSION
  curl_global_init(CURL_GLOBAL_DEFAULT);
  multi_handle = curl_multi_init();
  curl_multi_setopt(multi_handle, CURLMOPT_MAX_TOTAL_CONNECTIONS, max_con);
  curl_multi_setopt(multi_handle, CURLMOPT_MAX_HOST_CONNECTIONS, 6L);

  /* enable HTTP/2 multiplexing if available */
#ifdef CURLPIPE_MULTIPLEX
  curl_multi_setopt(multi_handle, CURLMOPT_PIPELINING, CURLPIPE_MULTIPLEX);
#endif

  /* seed the crawl with the start page */
  curl_multi_add_handle(multi_handle, make_handle(start_page));

  pending = 0;
  complete = 0;
  still_running = 1;
  while(still_running && !pending_interrupt) {
    int numfds;
    CURLMsg *m;

    curl_multi_wait(multi_handle, NULL, 0, 1000, &numfds);
    curl_multi_perform(multi_handle, &still_running);

    /* See how the transfers went */
    m = NULL;
    while((m = curl_multi_info_read(multi_handle, &msgs_left))) {
      if(m->msg == CURLMSG_DONE) {
        CURL *handle = m->easy_handle;
        char *url;
        struct memory *mem;
        curl_easy_getinfo(handle, CURLINFO_PRIVATE, &mem);
        curl_easy_getinfo(handle, CURLINFO_EFFECTIVE_URL, &url);
        if(m->data.result == CURLE_OK) {
          long res_status;
          curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &res_status);
          if(res_status == 200) {
            char *ctype;
            curl_easy_getinfo(handle, CURLINFO_CONTENT_TYPE, &ctype);
            printf("[%d] HTTP 200 (%s): %s\n", complete,
                   ctype ? ctype : "unknown", url);
            if(is_html(ctype) && mem->size > 100) {
              if(pending < max_requests && (complete + pending) < max_total) {
                pending += (int) follow_links(multi_handle, mem, url);
                still_running = 1;
              }
            }
          }
          else {
            printf("[%d] HTTP %d: %s\n", complete, (int) res_status, url);
          }
        }
        else {
          printf("[%d] Connection failure: %s\n", complete, url);
        }
        curl_multi_remove_handle(multi_handle, handle);
        curl_easy_cleanup(handle);
        free(mem->buf);
        free(mem);
        complete++;
        pending--;
      }
    }
  }
  curl_multi_cleanup(multi_handle);
  curl_global_cleanup();
  return 0;
}
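
/*
 * Example build and run (the output name is arbitrary):
 *
 *   gcc crawler.c $(pkg-config --cflags --libs libxml-2.0 libcurl) -o crawler
 *   ./crawler
 *
 * Press Ctrl-C to stop early: the SIGINT handler only sets
 * pending_interrupt, and the main loop notices the flag and winds down.
 */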