1/* 2 * Copyright 2004-2008, Fran��ois Revol, <revol@free.fr>. 3 * Distributed under the terms of the MIT License. 4 */ 5 6#include <errno.h> 7#include <sys/param.h> 8#include <stdio.h> 9#include <stdlib.h> 10#include <string.h> 11#include <unistd.h> 12#include <OS.h> 13#include "google_request.h" 14#include "string_utils.h" 15 16#define TESTME 17 18#ifdef _KERNEL_MODE 19#define printf dprintf 20#undef TESTME 21#endif 22 23#define DBG "googlefs: parse_html: " 24 25#ifdef TESTME 26#define BUFSZ (128*1024) 27int dbgstep = 0; 28#define PRST printf(DBG "step %d\n", dbgstep++) 29#else 30#define PRST {} 31#endif 32 33//old 34//#define G_BEGIN_URL "<p class=g><a class=l href=\"" 35//#define G_BEGIN_URL "<div class=g><a class=l href=\"" 36//#define G_BEGIN_URL "<div class=g><a href=\"" 37#define G_BEGIN_URL "<li class=g><h3 class=r><a href=\"" 38//#define G_END_URL "\">" 39#define G_END_URL "\" class=l>" 40//#define G_BEGIN_NAME 41#define G_END_NAME "</a>" 42#define G_BEGIN_SNIPSET /*"<td class=j>"*/"<font size=-1>" 43#define G_END_SNIPSET "<br>" 44#define G_BEGIN_CACHESIM " <a class=fl href=\"" 45#define G_END_CACHESIM "\">" 46 47int google_parse_results(const char *html, size_t htmlsize, struct google_result **results) 48{ 49 struct google_result *res = NULL, *nres = NULL, *prev = NULL; 50 char *p, *q; 51 char *nextresult = NULL; 52 long numres = 0; 53 long maxres = 0; 54 long startid = 0; 55 long lastid = 0; 56 int done = 0; 57 int err = ENOMEM; 58 59 if (!html || !results) 60 return EINVAL; 61 /* sanity checks */ 62 printf(DBG"sanity check...\n"); 63 PRST; 64 /* google now sends <!doctype html><head> sometimes... */ 65 if (strstr(html, "<!doctype html><head>") != html) { 66 if (strstr(html, "<html><head>") != html) 67 return EINVAL; 68 } 69 PRST; 70// p = strstr(html, "<title>Google Search:"); 71 p = strstr(html, "Google"); 72 if (!p) return EINVAL; 73 PRST; 74 p = strstr(html, "<body"); 75 if (!p) return EINVAL; 76 PRST; 77 p = strstr(html, "> Results <b>"); 78 if (!p) return EINVAL; 79 PRST; 80 p+= strlen("> Results <b>"); 81 startid = strtol(p, &p, 10); 82 if (!p) return EINVAL; 83 PRST; 84 p = strstr(html, "</b> - <b>"); 85 p+= strlen("</b> - <b>"); 86 if (!p) return EINVAL; 87 PRST; 88 lastid = strtol(p, &p, 10); 89 if (!p) return EINVAL; 90 PRST; 91 maxres = lastid - startid + 1; 92 printf(DBG"getting %ld results (%ld to %ld)\n", maxres, startid, lastid); 93 94 p = strstr(html, "Search Results<"); 95 if (!p) return EINVAL; 96 PRST; 97 98 99 printf(DBG"parsing...\n"); 100 do { 101 char *item; 102 long itemlen; 103 char *tmp; 104 int i; 105#ifdef TESTME 106 dbgstep = 0; 107#endif 108 nres = malloc(sizeof(struct google_result)); 109 if (!nres) { 110 // XXX: cleanup! 111 goto err0; 112 } 113 memset(nres, 0, sizeof(struct google_result)); 114 nres->id = startid + numres; //- 1; 115 116 PRST; 117 /* find url */ 118 // <p class=g><a href=URL> 119 if (!p) break; 120 if (nextresult) 121 p = nextresult; 122 else 123 p = strstr(p, G_BEGIN_URL); 124 if (!p) break; 125 PRST; 126 p+= strlen(G_BEGIN_URL); 127 nextresult = strstr(p, G_BEGIN_URL); 128 //printf(DBG"[%ld] found token 1\n", numres); 129 item = p; 130 p = strstr(p, G_END_URL); 131 if (!p) break; 132 PRST; 133 p+= strlen(G_END_URL); 134 //printf(DBG"[%ld] found token 2\n", numres); 135 itemlen = p - item - strlen(G_END_URL); 136 itemlen = MIN(GR_MAX_URL-1, itemlen); 137 strncpy(nres->url, item, itemlen); 138 nres->url[itemlen] = '\0'; 139 140 /* find name */ 141 //<b>Google</b> Web APIs - FAQ</a><table 142 item = p; 143 p = strstr(p, G_END_NAME); 144 if (!p) break; 145 PRST; 146 p+= strlen(G_END_NAME); 147 //printf(DBG"[%ld] found token 3\n", numres); 148 itemlen = p - item - strlen(G_END_NAME); 149 //itemlen = MIN(GR_MAX_NAME-1, itemlen); 150 itemlen = MIN(GR_MAX_NAME*4-1, itemlen); 151 q = malloc(itemlen+1); 152 if (!q) 153 goto err0; 154 strncpy(q, item, itemlen); 155 q[itemlen] = '\0'; 156 /* strip <*b> off */ 157 PRST; 158 while ((tmp = strstr(q, "<b>"))) 159 strcpy(tmp, tmp + 3); 160 while ((tmp = strstr(q, "</b>"))) 161 strcpy(tmp, tmp + 4); 162 /* strip <*em> off */ 163 PRST; 164 while ((tmp = strstr(q, "<em>"))) 165 strcpy(tmp, tmp + 4); 166 while ((tmp = strstr(q, "</em>"))) 167 strcpy(tmp, tmp + 5); 168 /* strip &foo; */ 169 tmp = unentitify_string(q); 170 free(q); 171 if (!tmp) 172 goto err0; 173 strncpy(nres->name, tmp, GR_MAX_NAME-1); 174 nres->name[GR_MAX_NAME-1] = '\0'; 175 free(tmp); 176 PRST; 177 178#if 0 179 /* find snipset */ 180 //<td class=j><font size=-1><b>...</b> a custom Java client library, documentation on <b>how</b> <b>to</b> use the <b>...</b> You can find it at http://<b>api</b>.<b>google</b>.com/GoogleSearch.wsdl <b>...</b> need to get started is in <b>googleapi</b>.jar <b>...</b> <br> 181 if (!p) break; 182 q = strstr(p, G_BEGIN_SNIPSET); 183 if (q && (!nextresult || (q < nextresult))) { 184 p = q; 185 p+= strlen(G_BEGIN_SNIPSET); 186 //printf(DBG"[%ld] found token 4\n", numres); 187 item = p; 188 p = strstr(p, G_END_SNIPSET); 189 if (!p) break; 190 p+= strlen(G_END_SNIPSET); 191 //printf(DBG"[%ld] found token 5\n", numres); 192 itemlen = p - item - strlen(G_END_SNIPSET); 193 itemlen = MIN(GR_MAX_URL-1, itemlen); 194 strncpy(nres->snipset, item, itemlen); 195 nres->snipset[itemlen] = '\0'; 196 /* strip &foo; */ 197 tmp = unentitify_string(nres->snipset); 198 if (!tmp) 199 break; 200 strncpy(nres->snipset, tmp, GR_MAX_SNIPSET-1); 201 nres->snipset[GR_MAX_SNIPSET-1] = '\0'; 202 free(tmp); 203 /* strip <*b> off */ 204 while ((tmp = strstr(nres->snipset, "<b>"))) 205 strcpy(tmp, tmp + 3); 206 while ((tmp = strstr(nres->snipset, "</b>"))) 207 strcpy(tmp, tmp + 4); 208 while ((tmp = strstr(nres->snipset, "\r"))) 209 strcpy(tmp, tmp + 1); 210 while ((tmp = strstr(nres->snipset, "\n"))) 211 *tmp = ' '; 212 } 213 214#endif 215 /* find cache/similar url */ 216 // <a class=fl href="http://216.239.59.104/search?q=cache:vR7BaPWutnkJ:www.google.com/apis/api_faq.html+google+api++help+%22frequently+asked%22+-plop&hl=en&lr=lang_en&ie=UTF-8">Cached</a> 217 for (i = 0; i < 2; i++) { 218 if (!p) break; 219 q = strstr(p, G_BEGIN_CACHESIM); 220 if (q && nextresult && (q > nextresult)) { 221 p = q; 222 printf(DBG"[%ld] cache/sim beyond next\n", numres); 223 p = nextresult; /* reset */ 224 } else if (q && (!nextresult || (q < nextresult))) { 225 int iscache; 226 p = q; 227 p+= strlen(G_BEGIN_CACHESIM); 228 //printf(DBG"[%ld] found token 6\n", numres); 229 item = p; 230 p = strstr(p, G_END_CACHESIM); 231 if (!p) break; 232 p+= strlen(G_END_CACHESIM); 233 //printf(DBG"[%ld] found token 7\n", numres); 234 itemlen = p - item - strlen(G_END_CACHESIM); 235 itemlen = MIN(GR_MAX_URL-1, itemlen); 236 if (!strncmp(p, "Cached", 6)) { 237 strncpy(nres->cache_url, item, itemlen); 238 nres->cache_url[itemlen] = '\0'; 239 } else if (!strncmp(p, "Similar", 7)) { 240 strncpy(nres->similar_url, item, itemlen); 241 nres->similar_url[itemlen] = '\0'; 242 } 243// else 244// break; 245 } 246 } 247 248 numres++; 249 if (!prev) 250 res = nres; 251 else 252 prev->next = nres; 253 prev = nres; 254 nres = NULL; 255 } while (!done || numres < maxres); 256 *results = res; 257 return numres; 258err0: 259 free(nres); 260 while (res) { 261 nres = res->next; 262 free(res); 263 res = nres; 264 } 265 return err; 266} 267 268#ifdef TESTME 269int main(int argc, char **argv) 270{ 271 struct google_result *results; 272 struct google_result *tag1 = 0xaaaa5555, *res = NULL, *tag2 = 0x5555aaaa; 273 size_t len; 274 char *p; 275 int err; 276 277 p = malloc(BUFSZ+8); 278 len = read(0, p+4, BUFSZ); 279 p[BUFSZ+4-1] = '\0'; 280 *(uint32 *)p = 0xa5a5a5a5; 281 *(uint32 *)(&p[BUFSZ+4]) = 0x5a5a5a5a; 282 err = google_parse_results(p+4, len, &results); 283 printf("error 0x%08lx\n", err); 284 if (err < 0) 285 return 1; 286 res = results; 287 while (res) { 288 printf("[%ld]:\nURL='%s'\nNAME='%s'\nSNIPSET='%s'\nCACHE='%s'\nSIMILAR='%s'\n\n", res->id, res->url, res->name, res->snipset, res->cache_url, res->similar_url); 289 res = res->next; 290 } 291 printf("before = 0x%08lx:0x%08lx, after = 0x%08lx:0x%08lx\n", 0xa5a5a5a5, *(uint32 *)p, 0x5a5a5a5a, *(uint32 *)(&p[BUFSZ+4])); 292 printf("before = 0x%08lx:0x%08lx, after = 0x%08lx:0x%08lx\n", 0xaaaa5555, tag1, 0x5555aaaa, tag2); 293 return 0; 294} 295#endif 296