1/***************************************************************************
2 *                                  _   _ ____  _
3 *  Project                     ___| | | |  _ \| |
4 *                             / __| | | | |_) | |
5 *                            | (__| |_| |  _ <| |___
6 *                             \___|\___/|_| \_\_____|
7 *
8 * Copyright (C) 1998 - 2011, Daniel Stenberg, <daniel@haxx.se>, et al.
9 *
10 * This software is licensed as described in the file COPYING, which
11 * you should have received as part of this distribution. The terms
12 * are also available at http://curl.haxx.se/docs/copyright.html.
13 *
14 * You may opt to use, copy, modify, merge, publish, distribute and/or sell
15 * copies of the Software, and permit persons to whom the Software is
16 * furnished to do so, under the terms of the COPYING file.
17 *
18 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
19 * KIND, either express or implied.
20 *
21 ***************************************************************************/
22// Get a web page, parse it with libxml.
23//
24// Written by Lars Nilsson
25//
26// GNU C++ compile command line suggestion (edit paths accordingly):
27//
28// g++ -Wall -I/opt/curl/include -I/opt/libxml/include/libxml2 htmltitle.cc \
29// -o htmltitle -L/opt/curl/lib -L/opt/libxml/lib -lcurl -lxml2
30
#include <limits.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <string>
#include <curl/curl.h>
#include <libxml/HTMLparser.h>
37
//
//  Case-insensitive string comparison
//
//  COMPARE(a, b) is true when the two C strings are equal ignoring case.
//  The comparison routine differs by compiler: stricmp under MSVC,
//  strcasecmp everywhere else.
//

#if defined(_MSC_VER)
#define COMPARE(a, b) (stricmp((a), (b)) == 0)
#else
#define COMPARE(a, b) (strcasecmp((a), (b)) == 0)
#endif
47
//
//  libxml callback context structure
//
//  One instance is handed to the SAX callbacks as their opaque context
//  pointer; it carries the title text gathered so far.
//

struct Context
{
  // Begin outside any TITLE element with no text collected.
  Context()
    : addTitle(false),
      title()
  {
  }

  bool addTitle;      // true while character data should be added to title
  std::string title;  // text accumulated by handleCharacters()
};
59
//
//  libcurl variables for error strings and returned data
//

// Registered with libcurl via CURLOPT_ERRORBUFFER in init(); its contents
// are printed in the failure messages of init() and main().
static char errorBuffer[CURL_ERROR_SIZE];

// Destination of the download: init() registers it as CURLOPT_WRITEDATA,
// writer() appends to it, and main() hands it to parseHtml().
static std::string buffer;
65
//
//  libcurl write callback function
//
//  Appends the size*nmemb bytes found at `data` to the std::string that was
//  registered through CURLOPT_WRITEDATA.
//
//  Returns the number of bytes consumed.  The return type is size_t, which
//  is the type libcurl's write-callback prototype requires; returning int
//  could truncate the count.  When no destination string was supplied, 0 is
//  returned so that libcurl sees a short write and aborts the transfer.
//

static size_t writer(char *data, size_t size, size_t nmemb,
                     std::string *writerData)
{
  if (writerData == NULL)
    return 0;

  const size_t total = size * nmemb;

  writerData->append(data, total);

  return total;
}
80
81//
82//  libcurl connection initialization
83//
84
85static bool init(CURL *&conn, char *url)
86{
87  CURLcode code;
88
89  conn = curl_easy_init();
90
91  if (conn == NULL)
92  {
93    fprintf(stderr, "Failed to create CURL connection\n");
94
95    exit(EXIT_FAILURE);
96  }
97
98  code = curl_easy_setopt(conn, CURLOPT_ERRORBUFFER, errorBuffer);
99  if (code != CURLE_OK)
100  {
101    fprintf(stderr, "Failed to set error buffer [%d]\n", code);
102
103    return false;
104  }
105
106  code = curl_easy_setopt(conn, CURLOPT_URL, url);
107  if (code != CURLE_OK)
108  {
109    fprintf(stderr, "Failed to set URL [%s]\n", errorBuffer);
110
111    return false;
112  }
113
114  code = curl_easy_setopt(conn, CURLOPT_FOLLOWLOCATION, 1L);
115  if (code != CURLE_OK)
116  {
117    fprintf(stderr, "Failed to set redirect option [%s]\n", errorBuffer);
118
119    return false;
120  }
121
122  code = curl_easy_setopt(conn, CURLOPT_WRITEFUNCTION, writer);
123  if (code != CURLE_OK)
124  {
125    fprintf(stderr, "Failed to set writer [%s]\n", errorBuffer);
126
127    return false;
128  }
129
130  code = curl_easy_setopt(conn, CURLOPT_WRITEDATA, &buffer);
131  if (code != CURLE_OK)
132  {
133    fprintf(stderr, "Failed to set write data [%s]\n", errorBuffer);
134
135    return false;
136  }
137
138  return true;
139}
140
141//
142//  libxml start element callback function
143//
144
145static void StartElement(void *voidContext,
146                         const xmlChar *name,
147                         const xmlChar **attributes)
148{
149  Context *context = (Context *)voidContext;
150
151  if (COMPARE((char *)name, "TITLE"))
152  {
153    context->title = "";
154    context->addTitle = true;
155  }
156  (void) attributes;
157}
158
159//
160//  libxml end element callback function
161//
162
163static void EndElement(void *voidContext,
164                       const xmlChar *name)
165{
166  Context *context = (Context *)voidContext;
167
168  if (COMPARE((char *)name, "TITLE"))
169    context->addTitle = false;
170}
171
172//
173//  Text handling helper function
174//
175
176static void handleCharacters(Context *context,
177                             const xmlChar *chars,
178                             int length)
179{
180  if (context->addTitle)
181    context->title.append((char *)chars, length);
182}
183
184//
185//  libxml PCDATA callback function
186//
187
188static void Characters(void *voidContext,
189                       const xmlChar *chars,
190                       int length)
191{
192  Context *context = (Context *)voidContext;
193
194  handleCharacters(context, chars, length);
195}
196
197//
198//  libxml CDATA callback function
199//
200
201static void cdata(void *voidContext,
202                  const xmlChar *chars,
203                  int length)
204{
205  Context *context = (Context *)voidContext;
206
207  handleCharacters(context, chars, length);
208}
209
//
//  libxml SAX callback structure
//
//  Positional initializer: the order of the entries must match the member
//  order of libxml2's SAX handler structure.  Only the four callbacks
//  defined in this file are installed; every other slot is NULL.  The slot
//  names in the comments follow libxml2's xmlSAXHandler declaration.
//

static htmlSAXHandler saxHandler =
{
  NULL,          // internalSubset
  NULL,          // isStandalone
  NULL,          // hasInternalSubset
  NULL,          // hasExternalSubset
  NULL,          // resolveEntity
  NULL,          // getEntity
  NULL,          // entityDecl
  NULL,          // notationDecl
  NULL,          // attributeDecl
  NULL,          // elementDecl
  NULL,          // unparsedEntityDecl
  NULL,          // setDocumentLocator
  NULL,          // startDocument
  NULL,          // endDocument
  StartElement,  // startElement
  EndElement,    // endElement
  NULL,          // reference
  Characters,    // characters
  NULL,          // ignorableWhitespace
  NULL,          // processingInstruction
  NULL,          // comment
  NULL,          // warning
  NULL,          // error
  NULL,          // fatalError
  NULL,          // getParameterEntity
  cdata,         // cdataBlock
  NULL           // externalSubset
};
244
245//
246//  Parse given (assumed to be) HTML text and return the title
247//
248
249static void parseHtml(const std::string &html,
250                      std::string &title)
251{
252  htmlParserCtxtPtr ctxt;
253  Context context;
254
255  ctxt = htmlCreatePushParserCtxt(&saxHandler, &context, "", 0, "",
256                                  XML_CHAR_ENCODING_NONE);
257
258  htmlParseChunk(ctxt, html.c_str(), html.size(), 0);
259  htmlParseChunk(ctxt, "", 0, 1);
260
261  htmlFreeParserCtxt(ctxt);
262
263  title = context.title;
264}
265
266int main(int argc, char *argv[])
267{
268  CURL *conn = NULL;
269  CURLcode code;
270  std::string title;
271
272  // Ensure one argument is given
273
274  if (argc != 2)
275  {
276    fprintf(stderr, "Usage: %s <url>\n", argv[0]);
277
278    exit(EXIT_FAILURE);
279  }
280
281  curl_global_init(CURL_GLOBAL_DEFAULT);
282
283  // Initialize CURL connection
284
285  if (!init(conn, argv[1]))
286  {
287    fprintf(stderr, "Connection initializion failed\n");
288
289    exit(EXIT_FAILURE);
290  }
291
292  // Retrieve content for the URL
293
294  code = curl_easy_perform(conn);
295  curl_easy_cleanup(conn);
296
297  if (code != CURLE_OK)
298  {
299    fprintf(stderr, "Failed to get '%s' [%s]\n", argv[1], errorBuffer);
300
301    exit(EXIT_FAILURE);
302  }
303
304  // Parse the (assumed) HTML code
305
306  parseHtml(buffer, title);
307
308  // Display the extracted title
309
310  printf("Title: %s\n", title.c_str());
311
312  return EXIT_SUCCESS;
313}
314