1/*
2 * Copyright 2004-2008, François Revol, <revol@free.fr>.
3 * Distributed under the terms of the MIT License.
4 */
5
6#include <errno.h>
7#include <sys/param.h>
8#include <stdio.h>
9#include <stdlib.h>
10#include <string.h>
11#include <unistd.h>
12#include <OS.h>
13#include "google_request.h"
14#include "string_utils.h"
15
/* Build the standalone test harness (the main() at the end of this file) by default. */
#define TESTME

/*
 * In kernel mode debug output goes through dprintf() and the test harness
 * is disabled.
 */
#ifdef _KERNEL_MODE
#define printf dprintf
#undef TESTME
#endif

/* Prefix prepended to every debug message emitted from this file. */
#define DBG "googlefs: parse_html: "

/*
 * PRST traces the progress of the parser: with TESTME it prints and then
 * increments the global step counter, otherwise it expands to an empty
 * statement. The empty form is written as do {} while (0) so that "PRST;"
 * stays a single statement even as the body of an unbraced if/else.
 */
#ifdef TESTME
/* Size in bytes of the input buffer used by the test harness. */
#define BUFSZ (128*1024)
/* Step counter reported by PRST. */
int dbgstep = 0;
#define PRST printf(DBG "step %d\n", dbgstep++)
#else
#define PRST do {} while (0)
#endif

/*
 * Markers delimiting the fields of one search result in the HTML page.
 *
 * Earlier values of G_BEGIN_URL, kept for reference:
 *   "<p class=g><a class=l href=\""
 *   "<div class=g><a class=l href=\""
 *   "<div class=g><a href=\""
 * Earlier value of G_END_URL:
 *   "\">"
 * Earlier value of G_BEGIN_SNIPSET:
 *   "<td class=j>"
 */
/* start / end of the result URL */
#define G_BEGIN_URL "<li class=g><h3 class=r><a href=\""
#define G_END_URL "\" class=l>"
/* the result name starts right after G_END_URL, so there is no G_BEGIN_NAME */
#define G_END_NAME "</a>"
/* start / end of the result snippet */
#define G_BEGIN_SNIPSET "<font size=-1>"
#define G_END_SNIPSET "<br>"
/* start / end of the URL of a "Cached" or "Similar" link */
#define G_BEGIN_CACHESIM " <a class=fl href=\""
#define G_END_CACHESIM "\">"
46
47int google_parse_results(const char *html, size_t htmlsize, struct google_result **results)
48{
49	struct google_result *res = NULL, *nres = NULL, *prev = NULL;
50	char *p, *q;
51	char *nextresult = NULL;
52	long numres = 0;
53	long maxres = 0;
54	long startid = 0;
55	long lastid = 0;
56	int done = 0;
57	int err = ENOMEM;
58
59	if (!html || !results)
60		return EINVAL;
61	/* sanity checks */
62	printf(DBG"sanity check...\n");
63	PRST;
64	/* google now sends <!doctype html><head> sometimes... */
65	if (strstr(html, "<!doctype html><head>") != html) {
66		if (strstr(html, "<html><head>") != html)
67			return EINVAL;
68	}
69	PRST;
70//	p = strstr(html, "<title>Google Search:");
71	p = strstr(html, "Google");
72	if (!p) return EINVAL;
73	PRST;
74	p = strstr(html, "<body");
75	if (!p) return EINVAL;
76	PRST;
77	p = strstr(html, ">&nbsp;Results <b>");
78	if (!p) return EINVAL;
79	PRST;
80	p+= strlen(">&nbsp;Results <b>");
81	startid = strtol(p, &p, 10);
82	if (!p) return EINVAL;
83	PRST;
84	p = strstr(html, "</b> - <b>");
85	p+= strlen("</b> - <b>");
86	if (!p) return EINVAL;
87	PRST;
88	lastid = strtol(p, &p, 10);
89	if (!p) return EINVAL;
90	PRST;
91	maxres = lastid - startid + 1;
92	printf(DBG"getting %ld results (%ld to %ld)\n", maxres, startid, lastid);
93
94	p = strstr(html, "Search Results<");
95	if (!p) return EINVAL;
96	PRST;
97
98
99	printf(DBG"parsing...\n");
100	do {
101		char *item;
102		long itemlen;
103		char *tmp;
104		int i;
105#ifdef TESTME
106		dbgstep = 0;
107#endif
108		nres = malloc(sizeof(struct google_result));
109		if (!nres) {
110			// XXX: cleanup!
111			goto err0;
112		}
113		memset(nres, 0, sizeof(struct google_result));
114		nres->id = startid + numres; //- 1;
115
116		PRST;
117		/* find url */
118		// <p class=g><a href=URL>
119		if (!p) break;
120		if (nextresult)
121			p = nextresult;
122		else
123			p = strstr(p, G_BEGIN_URL);
124		if (!p) break;
125		PRST;
126		p+= strlen(G_BEGIN_URL);
127		nextresult = strstr(p, G_BEGIN_URL);
128		//printf(DBG"[%ld] found token 1\n", numres);
129		item = p;
130		p = strstr(p, G_END_URL);
131		if (!p) break;
132		PRST;
133		p+= strlen(G_END_URL);
134		//printf(DBG"[%ld] found token 2\n", numres);
135		itemlen = p - item - strlen(G_END_URL);
136		itemlen = MIN(GR_MAX_URL-1, itemlen);
137		strncpy(nres->url, item, itemlen);
138		nres->url[itemlen] = '\0';
139
140		/* find name */
141		//<b>Google</b> Web APIs - FAQ</a><table
142		item = p;
143		p = strstr(p, G_END_NAME);
144		if (!p) break;
145		PRST;
146		p+= strlen(G_END_NAME);
147		//printf(DBG"[%ld] found token 3\n", numres);
148		itemlen = p - item - strlen(G_END_NAME);
149		//itemlen = MIN(GR_MAX_NAME-1, itemlen);
150		itemlen = MIN(GR_MAX_NAME*4-1, itemlen);
151		q = malloc(itemlen+1);
152		if (!q)
153			goto err0;
154		strncpy(q, item, itemlen);
155		q[itemlen] = '\0';
156		/* strip <*b> off */
157		PRST;
158		while ((tmp = strstr(q, "<b>")))
159			strcpy(tmp, tmp + 3);
160		while ((tmp = strstr(q, "</b>")))
161			strcpy(tmp, tmp + 4);
162		/* strip <*em> off */
163		PRST;
164		while ((tmp = strstr(q, "<em>")))
165			strcpy(tmp, tmp + 4);
166		while ((tmp = strstr(q, "</em>")))
167			strcpy(tmp, tmp + 5);
168		/* strip &foo; */
169		tmp = unentitify_string(q);
170		free(q);
171		if (!tmp)
172			goto err0;
173		strncpy(nres->name, tmp, GR_MAX_NAME-1);
174		nres->name[GR_MAX_NAME-1] = '\0';
175		free(tmp);
176		PRST;
177
178#if 0
179		/* find snipset */
180		//<td class=j><font size=-1><b>...</b> a custom Java client library, documentation on <b>how</b> <b>to</b> use the <b>...</b> You can find it at http://<b>api</b>.<b>google</b>.com/GoogleSearch.wsdl <b>...</b> need to get started is in <b>googleapi</b>.jar <b>...</b> <br>
181		if (!p) break;
182		q = strstr(p, G_BEGIN_SNIPSET);
183		if (q && (!nextresult || (q < nextresult))) {
184			p = q;
185			p+= strlen(G_BEGIN_SNIPSET);
186			//printf(DBG"[%ld] found token 4\n", numres);
187			item = p;
188			p = strstr(p, G_END_SNIPSET);
189			if (!p) break;
190			p+= strlen(G_END_SNIPSET);
191			//printf(DBG"[%ld] found token 5\n", numres);
192			itemlen = p - item - strlen(G_END_SNIPSET);
193			itemlen = MIN(GR_MAX_URL-1, itemlen);
194			strncpy(nres->snipset, item, itemlen);
195			nres->snipset[itemlen] = '\0';
196			/* strip &foo; */
197			tmp = unentitify_string(nres->snipset);
198			if (!tmp)
199				break;
200			strncpy(nres->snipset, tmp, GR_MAX_SNIPSET-1);
201			nres->snipset[GR_MAX_SNIPSET-1] = '\0';
202			free(tmp);
203			/* strip <*b> off */
204			while ((tmp = strstr(nres->snipset, "<b>")))
205				strcpy(tmp, tmp + 3);
206			while ((tmp = strstr(nres->snipset, "</b>")))
207				strcpy(tmp, tmp + 4);
208			while ((tmp = strstr(nres->snipset, "\r")))
209				strcpy(tmp, tmp + 1);
210			while ((tmp = strstr(nres->snipset, "\n")))
211				*tmp = ' ';
212		}
213
214#endif
215		/* find cache/similar url */
216		//  <a class=fl href="http://216.239.59.104/search?q=cache:vR7BaPWutnkJ:www.google.com/apis/api_faq.html+google+api++help+%22frequently+asked%22+-plop&hl=en&lr=lang_en&ie=UTF-8">Cached</a>
217		for (i = 0; i < 2; i++) {
218			if (!p) break;
219			q = strstr(p, G_BEGIN_CACHESIM);
220			if (q && nextresult && (q > nextresult)) {
221				p = q;
222				printf(DBG"[%ld] cache/sim beyond next\n", numres);
223				p = nextresult; /* reset */
224			} else if (q && (!nextresult || (q < nextresult))) {
225				int iscache;
226				p = q;
227				p+= strlen(G_BEGIN_CACHESIM);
228				//printf(DBG"[%ld] found token 6\n", numres);
229				item = p;
230				p = strstr(p, G_END_CACHESIM);
231				if (!p) break;
232				p+= strlen(G_END_CACHESIM);
233				//printf(DBG"[%ld] found token 7\n", numres);
234				itemlen = p - item - strlen(G_END_CACHESIM);
235				itemlen = MIN(GR_MAX_URL-1, itemlen);
236				if (!strncmp(p, "Cached", 6)) {
237					strncpy(nres->cache_url, item, itemlen);
238					nres->cache_url[itemlen] = '\0';
239				} else if (!strncmp(p, "Similar", 7)) {
240					strncpy(nres->similar_url, item, itemlen);
241					nres->similar_url[itemlen] = '\0';
242				}
243//				 else
244//					break;
245			}
246		}
247
248		numres++;
249		if (!prev)
250			res = nres;
251		else
252			prev->next = nres;
253		prev = nres;
254		nres = NULL;
255	} while (!done || numres < maxres);
256	*results = res;
257	return numres;
258err0:
259	free(nres);
260	while (res) {
261		nres = res->next;
262		free(res);
263		res = nres;
264	}
265	return err;
266}
267
268#ifdef TESTME
269int main(int argc, char **argv)
270{
271	struct google_result *results;
272	struct google_result *tag1 = 0xaaaa5555, *res = NULL, *tag2 = 0x5555aaaa;
273	size_t len;
274	char *p;
275	int err;
276
277	p = malloc(BUFSZ+8);
278	len = read(0, p+4, BUFSZ);
279	p[BUFSZ+4-1] = '\0';
280	*(uint32 *)p = 0xa5a5a5a5;
281	*(uint32 *)(&p[BUFSZ+4]) = 0x5a5a5a5a;
282	err = google_parse_results(p+4, len, &results);
283	printf("error 0x%08lx\n", err);
284	if (err < 0)
285		return 1;
286	res = results;
287	while (res) {
288		printf("[%ld]:\nURL='%s'\nNAME='%s'\nSNIPSET='%s'\nCACHE='%s'\nSIMILAR='%s'\n\n", res->id, res->url, res->name, res->snipset, res->cache_url, res->similar_url);
289		res = res->next;
290	}
291	printf("before = 0x%08lx:0x%08lx, after = 0x%08lx:0x%08lx\n", 0xa5a5a5a5, *(uint32 *)p, 0x5a5a5a5a, *(uint32 *)(&p[BUFSZ+4]));
292	printf("before = 0x%08lx:0x%08lx, after = 0x%08lx:0x%08lx\n", 0xaaaa5555, tag1, 0x5555aaaa, tag2);
293	return 0;
294}
295#endif
296