1/*************************************************************************** 2 * _ _ ____ _ 3 * Project ___| | | | _ \| | 4 * / __| | | | |_) | | 5 * | (__| |_| | _ <| |___ 6 * \___|\___/|_| \_\_____| 7 * 8 * Copyright (C) 1998 - 2011, Daniel Stenberg, <daniel@haxx.se>, et al. 9 * 10 * This software is licensed as described in the file COPYING, which 11 * you should have received as part of this distribution. The terms 12 * are also available at http://curl.haxx.se/docs/copyright.html. 13 * 14 * You may opt to use, copy, modify, merge, publish, distribute and/or sell 15 * copies of the Software, and permit persons to whom the Software is 16 * furnished to do so, under the terms of the COPYING file. 17 * 18 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY 19 * KIND, either express or implied. 20 * 21 ***************************************************************************/ 22// Get a web page, parse it with libxml. 23// 24// Written by Lars Nilsson 25// 26// GNU C++ compile command line suggestion (edit paths accordingly): 27// 28// g++ -Wall -I/opt/curl/include -I/opt/libxml/include/libxml2 htmltitle.cc \ 29// -o htmltitle -L/opt/curl/lib -L/opt/libxml/lib -lcurl -lxml2 30 31#include <stdio.h> 32#include <string.h> 33#include <stdlib.h> 34#include <string> 35#include <curl/curl.h> 36#include <libxml/HTMLparser.h> 37 38// 39// Case-insensitive string comparison 40// 41 42#ifdef _MSC_VER 43#define COMPARE(a, b) (!stricmp((a), (b))) 44#else 45#define COMPARE(a, b) (!strcasecmp((a), (b))) 46#endif 47 48// 49// libxml callback context structure 50// 51 52struct Context 53{ 54 Context(): addTitle(false) { } 55 56 bool addTitle; 57 std::string title; 58}; 59 60// 61// libcurl variables for error strings and returned data 62 63static char errorBuffer[CURL_ERROR_SIZE]; 64static std::string buffer; 65 66// 67// libcurl write callback function 68// 69 70static int writer(char *data, size_t size, size_t nmemb, 71 std::string *writerData) 72{ 73 if (writerData == NULL) 74 return 0; 75 76 writerData->append(data, size*nmemb); 77 78 return size * nmemb; 79} 80 81// 82// libcurl connection initialization 83// 84 85static bool init(CURL *&conn, char *url) 86{ 87 CURLcode code; 88 89 conn = curl_easy_init(); 90 91 if (conn == NULL) 92 { 93 fprintf(stderr, "Failed to create CURL connection\n"); 94 95 exit(EXIT_FAILURE); 96 } 97 98 code = curl_easy_setopt(conn, CURLOPT_ERRORBUFFER, errorBuffer); 99 if (code != CURLE_OK) 100 { 101 fprintf(stderr, "Failed to set error buffer [%d]\n", code); 102 103 return false; 104 } 105 106 code = curl_easy_setopt(conn, CURLOPT_URL, url); 107 if (code != CURLE_OK) 108 { 109 fprintf(stderr, "Failed to set URL [%s]\n", errorBuffer); 110 111 return false; 112 } 113 114 code = curl_easy_setopt(conn, CURLOPT_FOLLOWLOCATION, 1L); 115 if (code != CURLE_OK) 116 { 117 fprintf(stderr, "Failed to set redirect option [%s]\n", errorBuffer); 118 119 return false; 120 } 121 122 code = curl_easy_setopt(conn, CURLOPT_WRITEFUNCTION, writer); 123 if (code != CURLE_OK) 124 { 125 fprintf(stderr, "Failed to set writer [%s]\n", errorBuffer); 126 127 return false; 128 } 129 130 code = curl_easy_setopt(conn, CURLOPT_WRITEDATA, &buffer); 131 if (code != CURLE_OK) 132 { 133 fprintf(stderr, "Failed to set write data [%s]\n", errorBuffer); 134 135 return false; 136 } 137 138 return true; 139} 140 141// 142// libxml start element callback function 143// 144 145static void StartElement(void *voidContext, 146 const xmlChar *name, 147 const xmlChar **attributes) 148{ 149 Context *context = (Context *)voidContext; 150 151 if (COMPARE((char *)name, "TITLE")) 152 { 153 context->title = ""; 154 context->addTitle = true; 155 } 156 (void) attributes; 157} 158 159// 160// libxml end element callback function 161// 162 163static void EndElement(void *voidContext, 164 const xmlChar *name) 165{ 166 Context *context = (Context *)voidContext; 167 168 if (COMPARE((char *)name, "TITLE")) 169 context->addTitle = false; 170} 171 172// 173// Text handling helper function 174// 175 176static void handleCharacters(Context *context, 177 const xmlChar *chars, 178 int length) 179{ 180 if (context->addTitle) 181 context->title.append((char *)chars, length); 182} 183 184// 185// libxml PCDATA callback function 186// 187 188static void Characters(void *voidContext, 189 const xmlChar *chars, 190 int length) 191{ 192 Context *context = (Context *)voidContext; 193 194 handleCharacters(context, chars, length); 195} 196 197// 198// libxml CDATA callback function 199// 200 201static void cdata(void *voidContext, 202 const xmlChar *chars, 203 int length) 204{ 205 Context *context = (Context *)voidContext; 206 207 handleCharacters(context, chars, length); 208} 209 210// 211// libxml SAX callback structure 212// 213 214static htmlSAXHandler saxHandler = 215{ 216 NULL, 217 NULL, 218 NULL, 219 NULL, 220 NULL, 221 NULL, 222 NULL, 223 NULL, 224 NULL, 225 NULL, 226 NULL, 227 NULL, 228 NULL, 229 NULL, 230 StartElement, 231 EndElement, 232 NULL, 233 Characters, 234 NULL, 235 NULL, 236 NULL, 237 NULL, 238 NULL, 239 NULL, 240 NULL, 241 cdata, 242 NULL 243}; 244 245// 246// Parse given (assumed to be) HTML text and return the title 247// 248 249static void parseHtml(const std::string &html, 250 std::string &title) 251{ 252 htmlParserCtxtPtr ctxt; 253 Context context; 254 255 ctxt = htmlCreatePushParserCtxt(&saxHandler, &context, "", 0, "", 256 XML_CHAR_ENCODING_NONE); 257 258 htmlParseChunk(ctxt, html.c_str(), html.size(), 0); 259 htmlParseChunk(ctxt, "", 0, 1); 260 261 htmlFreeParserCtxt(ctxt); 262 263 title = context.title; 264} 265 266int main(int argc, char *argv[]) 267{ 268 CURL *conn = NULL; 269 CURLcode code; 270 std::string title; 271 272 // Ensure one argument is given 273 274 if (argc != 2) 275 { 276 fprintf(stderr, "Usage: %s <url>\n", argv[0]); 277 278 exit(EXIT_FAILURE); 279 } 280 281 curl_global_init(CURL_GLOBAL_DEFAULT); 282 283 // Initialize CURL connection 284 285 if (!init(conn, argv[1])) 286 { 287 fprintf(stderr, "Connection initializion failed\n"); 288 289 exit(EXIT_FAILURE); 290 } 291 292 // Retrieve content for the URL 293 294 code = curl_easy_perform(conn); 295 curl_easy_cleanup(conn); 296 297 if (code != CURLE_OK) 298 { 299 fprintf(stderr, "Failed to get '%s' [%s]\n", argv[1], errorBuffer); 300 301 exit(EXIT_FAILURE); 302 } 303 304 // Parse the (assumed) HTML code 305 306 parseHtml(buffer, title); 307 308 // Display the extracted title 309 310 printf("Title: %s\n", title.c_str()); 311 312 return EXIT_SUCCESS; 313} 314