htmltitle.cpp (6500B)
1 /*************************************************************************** 2 * _ _ ____ _ 3 * Project ___| | | | _ \| | 4 * / __| | | | |_) | | 5 * | (__| |_| | _ <| |___ 6 * \___|\___/|_| \_\_____| 7 * 8 * Copyright (C) Daniel Stenberg, <daniel@haxx.se>, et al. 9 * 10 * This software is licensed as described in the file COPYING, which 11 * you should have received as part of this distribution. The terms 12 * are also available at https://curl.se/docs/copyright.html. 13 * 14 * You may opt to use, copy, modify, merge, publish, distribute and/or sell 15 * copies of the Software, and permit persons to whom the Software is 16 * furnished to do so, under the terms of the COPYING file. 17 * 18 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY 19 * KIND, either express or implied. 20 * 21 * SPDX-License-Identifier: curl 22 * 23 ***************************************************************************/ 24 /* <DESC> 25 * Get a web page, extract the title with libxml. 26 * </DESC> 27 28 Written by Lars Nilsson 29 30 GNU C++ compile command line suggestion (edit paths accordingly): 31 32 g++ -Wall -I/opt/curl/include -I/opt/libxml/include/libxml2 htmltitle.cpp \ 33 -o htmltitle -L/opt/curl/lib -L/opt/libxml/lib -lcurl -lxml2 34 */ 35 #include <stdio.h> 36 #include <string.h> 37 #include <stdlib.h> 38 #include <string> 39 #include <curl/curl.h> 40 #include <libxml/HTMLparser.h> 41 42 // 43 // Case-insensitive string comparison 44 // 45 46 #ifdef _WIN32 47 #define COMPARE(a, b) (!_stricmp((a), (b))) 48 #else 49 #define COMPARE(a, b) (!strcasecmp((a), (b))) 50 #endif 51 52 // 53 // libxml callback context structure 54 // 55 56 struct Context 57 { 58 Context(): addTitle(false) { } 59 60 bool addTitle; 61 std::string title; 62 }; 63 64 // 65 // libcurl variables for error strings and returned data 66 67 static char errorBuffer[CURL_ERROR_SIZE]; 68 static std::string buffer; 69 70 // 71 // libcurl write callback function 72 // 73 74 static size_t writer(char *data, size_t size, size_t nmemb, 75 std::string *writerData) 76 { 77 if(writerData == NULL) 78 return 0; 79 80 writerData->append(data, size*nmemb); 81 82 return size * nmemb; 83 } 84 85 // 86 // libcurl connection initialization 87 // 88 89 static bool init(CURL *&conn, const char *url) 90 { 91 CURLcode code; 92 93 conn = curl_easy_init(); 94 95 if(conn == NULL) { 96 fprintf(stderr, "Failed to create CURL connection\n"); 97 return false; 98 } 99 100 code = curl_easy_setopt(conn, CURLOPT_ERRORBUFFER, errorBuffer); 101 if(code != CURLE_OK) { 102 fprintf(stderr, "Failed to set error buffer [%d]\n", code); 103 return false; 104 } 105 106 code = curl_easy_setopt(conn, CURLOPT_URL, url); 107 if(code != CURLE_OK) { 108 fprintf(stderr, "Failed to set URL [%s]\n", errorBuffer); 109 return false; 110 } 111 112 code = curl_easy_setopt(conn, CURLOPT_FOLLOWLOCATION, 1L); 113 if(code != CURLE_OK) { 114 fprintf(stderr, "Failed to set redirect option [%s]\n", errorBuffer); 115 return false; 116 } 117 118 code = curl_easy_setopt(conn, CURLOPT_WRITEFUNCTION, writer); 119 if(code != CURLE_OK) { 120 fprintf(stderr, "Failed to set writer [%s]\n", errorBuffer); 121 return false; 122 } 123 124 code = curl_easy_setopt(conn, CURLOPT_WRITEDATA, &buffer); 125 if(code != CURLE_OK) { 126 fprintf(stderr, "Failed to set write data [%s]\n", errorBuffer); 127 return false; 128 } 129 130 return true; 131 } 132 133 // 134 // libxml start element callback function 135 // 136 137 static void StartElement(void *voidContext, 138 const xmlChar *name, 139 const xmlChar **attributes) 140 { 141 Context *context = static_cast<Context *>(voidContext); 142 143 if(COMPARE(reinterpret_cast<const char *>(name), "TITLE")) { 144 context->title = ""; 145 context->addTitle = true; 146 } 147 (void) attributes; 148 } 149 150 // 151 // libxml end element callback function 152 // 153 154 static void EndElement(void *voidContext, 155 const xmlChar *name) 156 { 157 Context *context = static_cast<Context *>(voidContext); 158 159 if(COMPARE(reinterpret_cast<const char *>(name), "TITLE")) 160 context->addTitle = false; 161 } 162 163 // 164 // Text handling helper function 165 // 166 167 static void handleCharacters(Context *context, 168 const xmlChar *chars, 169 int length) 170 { 171 if(context->addTitle) 172 context->title.append(reinterpret_cast<const char *>(chars), 173 (unsigned long)length); 174 } 175 176 // 177 // libxml PCDATA callback function 178 // 179 180 static void Characters(void *voidContext, 181 const xmlChar *chars, 182 int length) 183 { 184 Context *context = static_cast<Context *>(voidContext); 185 186 handleCharacters(context, chars, length); 187 } 188 189 // 190 // libxml CDATA callback function 191 // 192 193 static void cdata(void *voidContext, 194 const xmlChar *chars, 195 int length) 196 { 197 Context *context = static_cast<Context *>(voidContext); 198 199 handleCharacters(context, chars, length); 200 } 201 202 // 203 // libxml SAX callback structure 204 // 205 206 static htmlSAXHandler saxHandler = 207 { 208 NULL, 209 NULL, 210 NULL, 211 NULL, 212 NULL, 213 NULL, 214 NULL, 215 NULL, 216 NULL, 217 NULL, 218 NULL, 219 NULL, 220 NULL, 221 NULL, 222 StartElement, 223 EndElement, 224 NULL, 225 Characters, 226 NULL, 227 NULL, 228 NULL, 229 NULL, 230 NULL, 231 NULL, 232 NULL, 233 cdata, 234 NULL, 235 0, 236 0, 237 0, 238 0, 239 NULL 240 }; 241 242 // 243 // Parse given (assumed to be) HTML text and return the title 244 // 245 246 static void parseHtml(const std::string &html, 247 std::string &title) 248 { 249 htmlParserCtxtPtr ctxt; 250 Context context; 251 252 ctxt = htmlCreatePushParserCtxt(&saxHandler, &context, "", 0, "", 253 XML_CHAR_ENCODING_NONE); 254 255 htmlParseChunk(ctxt, html.c_str(), (int)html.size(), 0); 256 htmlParseChunk(ctxt, "", 0, 1); 257 258 htmlFreeParserCtxt(ctxt); 259 260 title = context.title; 261 } 262 263 int main(int argc, char *argv[]) 264 { 265 CURL *conn = NULL; 266 CURLcode code; 267 std::string title; 268 269 // Ensure one argument is given 270 271 if(argc != 2) { 272 fprintf(stderr, "Usage: %s <url>\n", argv[0]); 273 return EXIT_FAILURE; 274 } 275 276 curl_global_init(CURL_GLOBAL_DEFAULT); 277 278 // Initialize CURL connection 279 280 if(!init(conn, argv[1])) { 281 fprintf(stderr, "Connection initialization failed\n"); 282 return EXIT_FAILURE; 283 } 284 285 // Retrieve content for the URL 286 287 code = curl_easy_perform(conn); 288 curl_easy_cleanup(conn); 289 290 if(code != CURLE_OK) { 291 fprintf(stderr, "Failed to get '%s' [%s]\n", argv[1], errorBuffer); 292 return EXIT_FAILURE; 293 } 294 295 // Parse the (assumed) HTML code 296 parseHtml(buffer, title); 297 298 // Display the extracted title 299 printf("Title: %s\n", title.c_str()); 300 301 return EXIT_SUCCESS; 302 }