/*
 * Copyright 2004-2008, François Revol, <revol@free.fr>.
 * Distributed under the terms of the MIT License.
 */
5
6#include <errno.h>
7#include <sys/param.h>
8#include <stdio.h>
9#include <stdlib.h>
10#include <string.h>
11#include <unistd.h>
12#include <OS.h>
13#include <KernelExport.h>
14#include "duckduckgo_request.h"
15#include "string_utils.h"
16
/* Build the stand-alone test driver (main() at the end of this file)... */
#define TESTME

/* ...except in kernel builds, where output goes through dprintf() instead. */
#ifdef _KERNEL_MODE
#define printf dprintf
#undef TESTME
#endif

/* Prefix for every debug message emitted by the parser. */
#define DBG "duckduckgofs: parse_html: "

#ifdef TESTME
/* Size of the data area the test driver reads the page into. */
#define BUFSZ (128*1024)
/* Step counter printed by PRST; the parser resets it for each result. */
int dbgstep = 0;
/* Trace macro: prints the current parsing step and advances the counter. */
#define PRST printf(DBG "step %d\n", dbgstep++)
#else
/*
 * No-op trace. do/while(0) keeps "PRST;" a single statement, so it stays
 * safe inside an unbraced if/else (a bare "{}" followed by ';' is not).
 */
#define PRST do { } while (0)
#endif

/* Markers delimiting the fields of one result in the returned HTML. */
#define G_BEGIN_URL "<a rel=\"nofollow\" class=\"result__a\" href=\""
#define G_END_URL "\">"
/* The name has no begin marker: it starts right after G_END_URL. */
#define G_END_NAME "</a>"
#define G_BEGIN_SNIPSET "<a class=\"result__snippet\""
#define G_END_SNIPSET "</a>"
#define G_BEGIN_CACHESIM " <a class=fl href=\""
#define G_END_CACHESIM "\">"
42
43int duckduckgo_parse_results(const char *html, size_t htmlsize, long *nextid, struct duckduckgo_result **results)
44{
45	struct duckduckgo_result *res = NULL, *nres = NULL, *prev = NULL;
46	char *p, *q;
47	char *nextresult = NULL;
48	long numres = 0;
49	long maxres = 1000;
50	//long startid = 0;
51	int done = 0;
52	int err = ENOMEM;
53
54	if (!html || !results)
55		return EINVAL;
56	/* sanity checks */
57	printf(DBG"sanity check...\n");
58	PRST;
59	if (strstr(html, "<!DOCTYPE html PUBLIC") != html) {
60		return EINVAL;
61	}
62	PRST;
63	p = strstr(html, "DuckDuckGo");
64	if (!p) return EINVAL;
65	PRST;
66	p = strstr(html, "<body");
67	if (!p) return EINVAL;
68	PRST;
69
70	/*
71	p = strstr(html, "Search Results<");
72	if (!p) return EINVAL;
73	PRST;
74	*/
75
76
77	printf(DBG"parsing...\n");
78	do {
79		char *item;
80		unsigned long itemlen;
81		char *tmp;
82		char *urlp;
83		int i;
84#ifdef TESTME
85		dbgstep = 0;
86#endif
87		nres = malloc(sizeof(struct duckduckgo_result));
88		if (!nres) {
89			// XXX: cleanup!
90			goto err0;
91		}
92		memset(nres, 0, sizeof(struct duckduckgo_result));
93		nres->id = (*nextid)++; //- 1;
94
95		PRST;
96		/* find url */
97		// <p class=g><a href=URL>
98		if (!p) break;
99		if (nextresult)
100			p = nextresult;
101		else
102			p = strstr(p, G_BEGIN_URL);
103		if (!p) break;
104		PRST;
105		p+= strlen(G_BEGIN_URL);
106		nextresult = strstr(p, G_BEGIN_URL);
107		//printf(DBG"[%ld] found token 1\n", numres);
108		item = p;
109		p = strstr(p, G_END_URL);
110		if (!p) break;
111		PRST;
112		p+= strlen(G_END_URL);
113		//printf(DBG"[%ld] found token 2\n", numres);
114		itemlen = GR_MAX_URL-1;
115		urlp = nres->url;
116		itemlen = MIN(itemlen, p - item - strlen(G_END_URL));
117		strncpy(urlp, item, itemlen);
118		urlp[itemlen] = '\0';
119
120		/* find name */
121		item = p;
122		p = strstr(p, G_END_NAME);
123		if (!p) break;
124		PRST;
125		p+= strlen(G_END_NAME);
126		//printf(DBG"[%ld] found token 3\n", numres);
127		itemlen = p - item - strlen(G_END_NAME);
128		//itemlen = MIN(GR_MAX_NAME-1, itemlen);
129		itemlen = MIN(GR_MAX_NAME*4-1, itemlen);
130		q = malloc(itemlen+1);
131		if (!q)
132			goto err0;
133		strncpy(q, item, itemlen);
134		q[itemlen] = '\0';
135		/* strip <*b> off */
136		PRST;
137		while ((tmp = strstr(q, "<b>")))
138			strcpy(tmp, tmp + 3);
139		while ((tmp = strstr(q, "</b>")))
140			strcpy(tmp, tmp + 4);
141		/* strip <*em> off */
142		PRST;
143		while ((tmp = strstr(q, "<em>")))
144			strcpy(tmp, tmp + 4);
145		while ((tmp = strstr(q, "</em>")))
146			strcpy(tmp, tmp + 5);
147		/* strip &foo; */
148		tmp = unentitify_string(q);
149		free(q);
150		if (!tmp)
151			goto err0;
152		strncpy(nres->name, tmp, GR_MAX_NAME-1);
153		nres->name[GR_MAX_NAME-1] = '\0';
154		free(tmp);
155		PRST;
156
157#if 0
158		/* find snipset */
159		if (!p) break;
160		q = strstr(p, G_BEGIN_SNIPSET);
161		if (q && (!nextresult || (q < nextresult))) {
162			p = q;
163			p+= strlen(G_BEGIN_SNIPSET);
164			//printf(DBG"[%ld] found token 4\n", numres);
165			item = p;
166			p = strstr(p, G_END_SNIPSET);
167			if (!p) break;
168			p+= strlen(G_END_SNIPSET);
169			//printf(DBG"[%ld] found token 5\n", numres);
170			itemlen = p - item - strlen(G_END_SNIPSET);
171			itemlen = MIN(GR_MAX_URL-1, itemlen);
172			strncpy(nres->snipset, item, itemlen);
173			nres->snipset[itemlen] = '\0';
174			/* strip &foo; */
175			tmp = unentitify_string(nres->snipset);
176			if (!tmp)
177				break;
178			strncpy(nres->snipset, tmp, GR_MAX_SNIPSET-1);
179			nres->snipset[GR_MAX_SNIPSET-1] = '\0';
180			free(tmp);
181			/* strip <*b> off */
182			while ((tmp = strstr(nres->snipset, "<b>")))
183				strcpy(tmp, tmp + 3);
184			while ((tmp = strstr(nres->snipset, "</b>")))
185				strcpy(tmp, tmp + 4);
186			while ((tmp = strstr(nres->snipset, "\r")))
187				strcpy(tmp, tmp + 1);
188			while ((tmp = strstr(nres->snipset, "\n")))
189				*tmp = ' ';
190		}
191
192#endif
193		/* find cache/similar url */
194		for (i = 0; i < 2; i++) {
195			if (!p) break;
196			q = strstr(p, G_BEGIN_CACHESIM);
197			if (q && nextresult && (q > nextresult)) {
198				p = q;
199				printf(DBG"[%ld] cache/sim beyond next\n", numres);
200				p = nextresult; /* reset */
201			} else if (q && (!nextresult || (q < nextresult))) {
202				//int iscache;
203				p = q;
204				p+= strlen(G_BEGIN_CACHESIM);
205				//printf(DBG"[%ld] found token 6\n", numres);
206				item = p;
207				p = strstr(p, G_END_CACHESIM);
208				if (!p) break;
209				p+= strlen(G_END_CACHESIM);
210				//printf(DBG"[%ld] found token 7\n", numres);
211				itemlen = p - item - strlen(G_END_CACHESIM);
212				itemlen = MIN(GR_MAX_URL-1, itemlen);
213				if (!strncmp(p, "Cached", 6)) {
214					strncpy(nres->cache_url, item, itemlen);
215					nres->cache_url[itemlen] = '\0';
216				} else if (!strncmp(p, "Similar", 7)) {
217					strncpy(nres->similar_url, item, itemlen);
218					nres->similar_url[itemlen] = '\0';
219				}
220//				 else
221//					break;
222			}
223		}
224
225		numres++;
226		if (!prev)
227			res = nres;
228		else
229			prev->next = nres;
230		prev = nres;
231		nres = NULL;
232	} while (!done || numres < maxres);
233	*results = res;
234	return numres;
235err0:
236	free(nres);
237	while (res) {
238		nres = res->next;
239		free(res);
240		res = nres;
241	}
242	return err;
243}
244
#ifdef TESTME
/*
 * Test driver: reads an HTML page from stdin, runs the parser on it and
 * prints every result. Guard words placed before and after the data area,
 * and pointer tags declared around `res`, are printed next to their expected
 * values at the end so that an overrun shows up in the output.
 */
int main(int argc, char **argv)
{
	struct duckduckgo_result *results = NULL;
	struct duckduckgo_result *tag1 = (void*)0xaaaa5555, *res = NULL, *tag2 = (void*)0x5555aaaa;
	const uint32 headguard = 0xa5a5a5a5, tailguard = 0x5a5a5a5a;
	uint32 head, tail;
	size_t len = 0;
	ssize_t got;
	char *p;
	int err;
	long nextid = 0;

	(void)argc;
	(void)argv;

	/* layout: 4 guard bytes | BUFSZ data bytes | 4 guard bytes */
	p = malloc(BUFSZ+8);
	if (!p) {
		perror("malloc");
		return 1;
	}
	memcpy(p, &headguard, sizeof(headguard));
	memcpy(p + BUFSZ + 4, &tailguard, sizeof(tailguard));

	/* read until EOF, keeping one data byte free for the terminator */
	while (len < (size_t)BUFSZ - 1) {
		got = read(0, p + 4 + len, (size_t)BUFSZ - 1 - len);
		if (got < 0) {
			if (errno == EINTR)
				continue;
			perror("read");
			free(p);
			return 1;
		}
		if (got == 0)
			break;
		len += (size_t)got;
	}
	/* terminate right after the data actually read; the parser uses strstr() */
	p[4 + len] = '\0';

	err = duckduckgo_parse_results(p+4, len, &nextid, &results);
	printf("error 0x%08x\n", (unsigned int)err);
	/*
	 * NOTE(review): assumes error codes are negative (as on Haiku/BeOS);
	 * where errno values are positive this check never triggers.
	 */
	if (err < 0) {
		free(p);
		return 1;
	}
	res = results;
	while (res) {
		printf("[%ld]:\nURL='%s'\nNAME='%s'\nSNIPSET='%s'\nCACHE='%s'\nSIMILAR='%s'\n\n", res->id, res->url, res->name, res->snipset, res->cache_url, res->similar_url);
		res = res->next;
	}

	memcpy(&head, p, sizeof(head));
	memcpy(&tail, p + BUFSZ + 4, sizeof(tail));
	printf("before = 0x%08x:0x%08x, after = 0x%08x:0x%08x\n", 0xa5a5a5a5, (unsigned int)head, 0x5a5a5a5a, (unsigned int)tail);
	printf("before = 0x%08x:%p, after = 0x%08x:%p\n", 0xaaaa5555, (void *)tag1, 0x5555aaaa, (void *)tag2);

	/* the parser malloc()s each node; release the list and the page buffer */
	while (results) {
		res = results->next;
		free(results);
		results = res;
	}
	free(p);
	return 0;
}
#endif
274