mansearch.c revision 1.12
1/*	$Id: mansearch.c,v 1.12 2014/01/19 23:09:13 schwarze Exp $ */
2/*
3 * Copyright (c) 2012 Kristaps Dzonsons <kristaps@bsd.lv>
4 * Copyright (c) 2013, 2014 Ingo Schwarze <schwarze@openbsd.org>
5 *
6 * Permission to use, copy, modify, and distribute this software for any
7 * purpose with or without fee is hereby granted, provided that the above
8 * copyright notice and this permission notice appear in all copies.
9 *
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 */
18#include <assert.h>
19#include <fcntl.h>
20#include <getopt.h>
21#include <limits.h>
22#include <regex.h>
23#include <stdio.h>
24#include <stdint.h>
25#include <stddef.h>
26#include <stdlib.h>
27#include <string.h>
28#include <unistd.h>
29
30#include <ohash.h>
31#include <sqlite3.h>
32
33#include "mandoc.h"
34#include "manpath.h"
35#include "mansearch.h"
36
37extern int mansearch_keymax;
38extern const char *const mansearch_keynames[];
39
40#define	SQL_BIND_TEXT(_db, _s, _i, _v) \
41	do { if (SQLITE_OK != sqlite3_bind_text \
42		((_s), (_i)++, (_v), -1, SQLITE_STATIC)) \
43		fprintf(stderr, "%s\n", sqlite3_errmsg((_db))); \
44	} while (0)
45#define	SQL_BIND_INT64(_db, _s, _i, _v) \
46	do { if (SQLITE_OK != sqlite3_bind_int64 \
47		((_s), (_i)++, (_v))) \
48		fprintf(stderr, "%s\n", sqlite3_errmsg((_db))); \
49	} while (0)
50#define	SQL_BIND_BLOB(_db, _s, _i, _v) \
51	do { if (SQLITE_OK != sqlite3_bind_blob \
52		((_s), (_i)++, (&_v), sizeof(_v), SQLITE_STATIC)) \
53		fprintf(stderr, "%s\n", sqlite3_errmsg((_db))); \
54	} while (0)
55
56struct	expr {
57	uint64_t 	 bits;    /* type-mask */
58	const char	*substr;  /* to search for, if applicable */
59	regex_t		 regexp;  /* compiled regexp, if applicable */
60	int		 open;    /* opening parentheses before */
61	int		 and;	  /* logical AND before */
62	int		 close;   /* closing parentheses after */
63	struct expr	*next;    /* next in sequence */
64};
65
66struct	match {
67	uint64_t	 id; /* identifier in database */
68	int		 form; /* 0 == catpage */
69};
70
71static	void		 buildnames(struct manpage *, sqlite3 *,
72				sqlite3_stmt *, uint64_t,
73				const char *, int form);
74static	char		*buildoutput(sqlite3 *, sqlite3_stmt *,
75				 uint64_t, uint64_t);
76static	void		*hash_alloc(size_t, void *);
77static	void		 hash_free(void *, size_t, void *);
78static	void		*hash_halloc(size_t, void *);
79static	struct expr	*exprcomp(const struct mansearch *,
80				int, char *[]);
81static	void		 exprfree(struct expr *);
82static	struct expr	*exprspec(struct expr *, uint64_t,
83				 const char *, const char *);
84static	struct expr	*exprterm(const struct mansearch *, char *, int);
85static	void		 sql_append(char **sql, size_t *sz,
86				const char *newstr, int count);
87static	void		 sql_match(sqlite3_context *context,
88				int argc, sqlite3_value **argv);
89static	void		 sql_regexp(sqlite3_context *context,
90				int argc, sqlite3_value **argv);
91static	char		*sql_statement(const struct expr *);
92
93int
94mansearch(const struct mansearch *search,
95		const struct manpaths *paths,
96		int argc, char *argv[],
97		const char *outkey,
98		struct manpage **res, size_t *sz)
99{
100	int		 fd, rc, c, indexbit;
101	int64_t		 id;
102	uint64_t	 outbit, iterbit;
103	char		 buf[PATH_MAX];
104	char		*sql;
105	struct manpage	*mpage;
106	struct expr	*e, *ep;
107	sqlite3		*db;
108	sqlite3_stmt	*s, *s2;
109	struct match	*mp;
110	struct ohash_info info;
111	struct ohash	 htab;
112	unsigned int	 idx;
113	size_t		 i, j, cur, maxres;
114
115	memset(&info, 0, sizeof(struct ohash_info));
116
117	info.halloc = hash_halloc;
118	info.alloc = hash_alloc;
119	info.hfree = hash_free;
120	info.key_offset = offsetof(struct match, id);
121
122	*sz = cur = maxres = 0;
123	sql = NULL;
124	*res = NULL;
125	fd = -1;
126	e = NULL;
127	rc = 0;
128
129	if (0 == argc)
130		goto out;
131	if (NULL == (e = exprcomp(search, argc, argv)))
132		goto out;
133
134	outbit = 0;
135	if (NULL != outkey) {
136		for (indexbit = 0, iterbit = 1;
137		     indexbit < mansearch_keymax;
138		     indexbit++, iterbit <<= 1) {
139			if (0 == strcasecmp(outkey,
140			    mansearch_keynames[indexbit])) {
141				outbit = iterbit;
142				break;
143			}
144		}
145	}
146
147	/*
148	 * Save a descriptor to the current working directory.
149	 * Since pathnames in the "paths" variable might be relative,
150	 * and we'll be chdir()ing into them, we need to keep a handle
151	 * on our current directory from which to start the chdir().
152	 */
153
154	if (NULL == getcwd(buf, PATH_MAX)) {
155		perror(NULL);
156		goto out;
157	} else if (-1 == (fd = open(buf, O_RDONLY, 0))) {
158		perror(buf);
159		goto out;
160	}
161
162	sql = sql_statement(e);
163
164	/*
165	 * Loop over the directories (containing databases) for us to
166	 * search.
167	 * Don't let missing/bad databases/directories phase us.
168	 * In each, try to open the resident database and, if it opens,
169	 * scan it for our match expression.
170	 */
171
172	for (i = 0; i < paths->sz; i++) {
173		if (-1 == fchdir(fd)) {
174			perror(buf);
175			free(*res);
176			break;
177		} else if (-1 == chdir(paths->paths[i])) {
178			perror(paths->paths[i]);
179			continue;
180		}
181
182		c =  sqlite3_open_v2
183			(MANDOC_DB, &db,
184			 SQLITE_OPEN_READONLY, NULL);
185
186		if (SQLITE_OK != c) {
187			perror(MANDOC_DB);
188			sqlite3_close(db);
189			continue;
190		}
191
192		/*
193		 * Define the SQL functions for substring
194		 * and regular expression matching.
195		 */
196
197		c = sqlite3_create_function(db, "match", 2,
198		    SQLITE_ANY, NULL, sql_match, NULL, NULL);
199		assert(SQLITE_OK == c);
200		c = sqlite3_create_function(db, "regexp", 2,
201		    SQLITE_ANY, NULL, sql_regexp, NULL, NULL);
202		assert(SQLITE_OK == c);
203
204		j = 1;
205		c = sqlite3_prepare_v2(db, sql, -1, &s, NULL);
206		if (SQLITE_OK != c)
207			fprintf(stderr, "%s\n", sqlite3_errmsg(db));
208
209		for (ep = e; NULL != ep; ep = ep->next) {
210			if (NULL == ep->substr) {
211				SQL_BIND_BLOB(db, s, j, ep->regexp);
212			} else
213				SQL_BIND_TEXT(db, s, j, ep->substr);
214			SQL_BIND_INT64(db, s, j, ep->bits);
215		}
216
217		memset(&htab, 0, sizeof(struct ohash));
218		ohash_init(&htab, 4, &info);
219
220		/*
221		 * Hash each entry on its [unique] document identifier.
222		 * This is a uint64_t.
223		 * Instead of using a hash function, simply convert the
224		 * uint64_t to a uint32_t, the hash value's type.
225		 * This gives good performance and preserves the
226		 * distribution of buckets in the table.
227		 */
228		while (SQLITE_ROW == (c = sqlite3_step(s))) {
229			id = sqlite3_column_int64(s, 1);
230			idx = ohash_lookup_memory
231				(&htab, (char *)&id,
232				 sizeof(uint64_t), (uint32_t)id);
233
234			if (NULL != ohash_find(&htab, idx))
235				continue;
236
237			mp = mandoc_calloc(1, sizeof(struct match));
238			mp->id = id;
239			mp->form = sqlite3_column_int(s, 0);
240			ohash_insert(&htab, idx, mp);
241		}
242
243		if (SQLITE_DONE != c)
244			fprintf(stderr, "%s\n", sqlite3_errmsg(db));
245
246		sqlite3_finalize(s);
247
248		c = sqlite3_prepare_v2(db,
249		    "SELECT * FROM mlinks WHERE pageid=?",
250		    -1, &s, NULL);
251		if (SQLITE_OK != c)
252			fprintf(stderr, "%s\n", sqlite3_errmsg(db));
253
254		c = sqlite3_prepare_v2(db,
255		    "SELECT * FROM keys WHERE pageid=? AND bits & ?",
256		    -1, &s2, NULL);
257		if (SQLITE_OK != c)
258			fprintf(stderr, "%s\n", sqlite3_errmsg(db));
259
260		for (mp = ohash_first(&htab, &idx);
261				NULL != mp;
262				mp = ohash_next(&htab, &idx)) {
263			if (cur + 1 > maxres) {
264				maxres += 1024;
265				*res = mandoc_realloc
266					(*res, maxres * sizeof(struct manpage));
267			}
268			mpage = *res + cur;
269			mpage->form = mp->form;
270			buildnames(mpage, db, s, mp->id,
271			    paths->paths[i], mp->form);
272			mpage->output = outbit ?
273			    buildoutput(db, s2, mp->id, outbit) : NULL;
274
275			free(mp);
276			cur++;
277		}
278
279		sqlite3_finalize(s);
280		sqlite3_finalize(s2);
281		sqlite3_close(db);
282		ohash_delete(&htab);
283	}
284	rc = 1;
285out:
286	exprfree(e);
287	if (-1 != fd)
288		close(fd);
289	free(sql);
290	*sz = cur;
291	return(rc);
292}
293
294static void
295buildnames(struct manpage *mpage, sqlite3 *db, sqlite3_stmt *s,
296		uint64_t id, const char *path, int form)
297{
298	char		*newnames;
299	const char	*oldnames, *sep1, *name, *sec, *sep2, *arch, *fsec;
300	size_t		 i;
301	int		 c;
302
303	mpage->names = NULL;
304	i = 1;
305	SQL_BIND_INT64(db, s, i, id);
306	while (SQLITE_ROW == (c = sqlite3_step(s))) {
307
308		/* Assemble the list of names. */
309
310		if (NULL == mpage->names) {
311			oldnames = "";
312			sep1 = "";
313		} else {
314			oldnames = mpage->names;
315			sep1 = ", ";
316		}
317		sec = sqlite3_column_text(s, 0);
318		arch = sqlite3_column_text(s, 1);
319		name = sqlite3_column_text(s, 2);
320		sep2 = '\0' == *arch ? "" : "/";
321		if (-1 == asprintf(&newnames, "%s%s%s(%s%s%s)",
322		    oldnames, sep1, name, sec, sep2, arch)) {
323			perror(0);
324			exit((int)MANDOCLEVEL_SYSERR);
325		}
326		free(mpage->names);
327		mpage->names = newnames;
328
329		/* Also save the first file name encountered. */
330
331		if (NULL != mpage->file)
332			continue;
333
334		if (form) {
335			sep1 = "man";
336			fsec = sec;
337		} else {
338			sep1 = "cat";
339			fsec = "0";
340		}
341		if (-1 == asprintf(&mpage->file, "%s/%s%s%s%s/%s.%s",
342		    path, sep1, sec, sep2, arch, name, fsec)) {
343			perror(0);
344			exit((int)MANDOCLEVEL_SYSERR);
345		}
346	}
347	if (SQLITE_DONE != c)
348		fprintf(stderr, "%s\n", sqlite3_errmsg(db));
349	sqlite3_reset(s);
350}
351
352static char *
353buildoutput(sqlite3 *db, sqlite3_stmt *s, uint64_t id, uint64_t outbit)
354{
355	char		*output, *newoutput;
356	const char	*oldoutput, *sep1, *data;
357	size_t		 i;
358	int		 c;
359
360	output = NULL;
361	i = 1;
362	SQL_BIND_INT64(db, s, i, id);
363	SQL_BIND_INT64(db, s, i, outbit);
364	while (SQLITE_ROW == (c = sqlite3_step(s))) {
365		if (NULL == output) {
366			oldoutput = "";
367			sep1 = "";
368		} else {
369			oldoutput = output;
370			sep1 = " # ";
371		}
372		data = sqlite3_column_text(s, 1);
373		if (-1 == asprintf(&newoutput, "%s%s%s",
374		    oldoutput, sep1, data)) {
375			perror(0);
376			exit((int)MANDOCLEVEL_SYSERR);
377		}
378		free(output);
379		output = newoutput;
380	}
381	if (SQLITE_DONE != c)
382		fprintf(stderr, "%s\n", sqlite3_errmsg(db));
383	sqlite3_reset(s);
384	return(output);
385}
386
387/*
388 * Implement substring match as an application-defined SQL function.
389 * Using the SQL LIKE or GLOB operators instead would be a bad idea
390 * because that would require escaping metacharacters in the string
391 * being searched for.
392 */
393static void
394sql_match(sqlite3_context *context, int argc, sqlite3_value **argv)
395{
396
397	assert(2 == argc);
398	sqlite3_result_int(context, NULL != strcasestr(
399	    (const char *)sqlite3_value_text(argv[1]),
400	    (const char *)sqlite3_value_text(argv[0])));
401}
402
403/*
404 * Implement regular expression match
405 * as an application-defined SQL function.
406 */
407static void
408sql_regexp(sqlite3_context *context, int argc, sqlite3_value **argv)
409{
410
411	assert(2 == argc);
412	sqlite3_result_int(context, !regexec(
413	    (regex_t *)sqlite3_value_blob(argv[0]),
414	    (const char *)sqlite3_value_text(argv[1]),
415	    0, NULL, 0));
416}
417
418static void
419sql_append(char **sql, size_t *sz, const char *newstr, int count)
420{
421	size_t		 newsz;
422
423	newsz = 1 < count ? (size_t)count : strlen(newstr);
424	*sql = mandoc_realloc(*sql, *sz + newsz + 1);
425	if (1 < count)
426		memset(*sql + *sz, *newstr, (size_t)count);
427	else
428		memcpy(*sql + *sz, newstr, newsz);
429	*sz += newsz;
430	(*sql)[*sz] = '\0';
431}
432
433/*
434 * Prepare the search SQL statement.
435 */
436static char *
437sql_statement(const struct expr *e)
438{
439	char		*sql;
440	size_t		 sz;
441	int		 needop;
442
443	sql = mandoc_strdup("SELECT * FROM mpages WHERE ");
444	sz = strlen(sql);
445
446	for (needop = 0; NULL != e; e = e->next) {
447		if (e->and)
448			sql_append(&sql, &sz, " AND ", 1);
449		else if (needop)
450			sql_append(&sql, &sz, " OR ", 1);
451		if (e->open)
452			sql_append(&sql, &sz, "(", e->open);
453		sql_append(&sql, &sz, NULL == e->substr ?
454		    "id IN (SELECT pageid FROM keys "
455		    "WHERE key REGEXP ? AND bits & ?)" :
456		    "id IN (SELECT pageid FROM keys "
457		    "WHERE key MATCH ? AND bits & ?)", 1);
458		if (e->close)
459			sql_append(&sql, &sz, ")", e->close);
460		needop = 1;
461	}
462
463	return(sql);
464}
465
466/*
467 * Compile a set of string tokens into an expression.
468 * Tokens in "argv" are assumed to be individual expression atoms (e.g.,
469 * "(", "foo=bar", etc.).
470 */
471static struct expr *
472exprcomp(const struct mansearch *search, int argc, char *argv[])
473{
474	int		 i, toopen, logic, igncase, toclose;
475	struct expr	*first, *next, *cur;
476
477	first = cur = NULL;
478	logic = igncase = toclose = 0;
479	toopen = 1;
480
481	for (i = 0; i < argc; i++) {
482		if (0 == strcmp("(", argv[i])) {
483			if (igncase)
484				goto fail;
485			toopen++;
486			toclose++;
487			continue;
488		} else if (0 == strcmp(")", argv[i])) {
489			if (toopen || logic || igncase || NULL == cur)
490				goto fail;
491			cur->close++;
492			if (0 > --toclose)
493				goto fail;
494			continue;
495		} else if (0 == strcmp("-a", argv[i])) {
496			if (toopen || logic || igncase || NULL == cur)
497				goto fail;
498			logic = 1;
499			continue;
500		} else if (0 == strcmp("-o", argv[i])) {
501			if (toopen || logic || igncase || NULL == cur)
502				goto fail;
503			logic = 2;
504			continue;
505		} else if (0 == strcmp("-i", argv[i])) {
506			if (igncase)
507				goto fail;
508			igncase = 1;
509			continue;
510		}
511		next = exprterm(search, argv[i], !igncase);
512		if (NULL == next)
513			goto fail;
514		next->open = toopen;
515		next->and = (1 == logic);
516		if (NULL != first) {
517			cur->next = next;
518			cur = next;
519		} else
520			cur = first = next;
521		toopen = logic = igncase = 0;
522	}
523	if (toopen || logic || igncase || toclose)
524		goto fail;
525
526	cur->close++;
527	cur = exprspec(cur, TYPE_arch, search->arch, "^(%s|any)$");
528	exprspec(cur, TYPE_sec, search->sec, "^%s$");
529
530	return(first);
531
532fail:
533	if (NULL != first)
534		exprfree(first);
535	return(NULL);
536}
537
538static struct expr *
539exprspec(struct expr *cur, uint64_t key, const char *value,
540		const char *format)
541{
542	char	 errbuf[BUFSIZ];
543	char	*cp;
544	int	 irc;
545
546	if (NULL == value)
547		return(cur);
548
549	if (-1 == asprintf(&cp, format, value)) {
550		perror(0);
551		exit((int)MANDOCLEVEL_SYSERR);
552	}
553	cur->next = mandoc_calloc(1, sizeof(struct expr));
554	cur = cur->next;
555	cur->and = 1;
556	cur->bits = key;
557	if (0 != (irc = regcomp(&cur->regexp, cp,
558	    REG_EXTENDED | REG_NOSUB | REG_ICASE))) {
559		regerror(irc, &cur->regexp, errbuf, sizeof(errbuf));
560		fprintf(stderr, "regcomp: %s\n", errbuf);
561		cur->substr = value;
562	}
563	free(cp);
564	return(cur);
565}
566
567static struct expr *
568exprterm(const struct mansearch *search, char *buf, int cs)
569{
570	char		 errbuf[BUFSIZ];
571	struct expr	*e;
572	char		*key, *v;
573	uint64_t	 iterbit;
574	int		 i, irc;
575
576	if ('\0' == *buf)
577		return(NULL);
578
579	e = mandoc_calloc(1, sizeof(struct expr));
580
581	/*"whatis" mode uses an opaque string and default fields. */
582
583	if (MANSEARCH_WHATIS & search->flags) {
584		e->substr = buf;
585		e->bits = search->deftype;
586		return(e);
587	}
588
589	/*
590	 * If no =~ is specified, search with equality over names and
591	 * descriptions.
592	 * If =~ begins the phrase, use name and description fields.
593	 */
594
595	if (NULL == (v = strpbrk(buf, "=~"))) {
596		e->substr = buf;
597		e->bits = search->deftype;
598		return(e);
599	} else if (v == buf)
600		e->bits = search->deftype;
601
602	if ('~' == *v++) {
603		if (NULL != strstr(buf, "arch"))
604			cs = 0;
605		if (0 != (irc = regcomp(&e->regexp, v,
606		    REG_EXTENDED | REG_NOSUB | (cs ? 0 : REG_ICASE)))) {
607			regerror(irc, &e->regexp, errbuf, sizeof(errbuf));
608			fprintf(stderr, "regcomp: %s\n", errbuf);
609			free(e);
610			return(NULL);
611		}
612	} else
613		e->substr = v;
614	v[-1] = '\0';
615
616	/*
617	 * Parse out all possible fields.
618	 * If the field doesn't resolve, bail.
619	 */
620
621	while (NULL != (key = strsep(&buf, ","))) {
622		if ('\0' == *key)
623			continue;
624		for (i = 0, iterbit = 1;
625		     i < mansearch_keymax;
626		     i++, iterbit <<= 1) {
627			if (0 == strcasecmp(key,
628			    mansearch_keynames[i])) {
629				e->bits |= iterbit;
630				break;
631			}
632		}
633		if (i == mansearch_keymax) {
634			if (strcasecmp(key, "any")) {
635				free(e);
636				return(NULL);
637			}
638			e->bits |= ~0ULL;
639		}
640	}
641
642	return(e);
643}
644
645static void
646exprfree(struct expr *p)
647{
648	struct expr	*pp;
649
650	while (NULL != p) {
651		pp = p->next;
652		free(p);
653		p = pp;
654	}
655}
656
657static void *
658hash_halloc(size_t sz, void *arg)
659{
660
661	return(mandoc_calloc(sz, 1));
662}
663
664static void *
665hash_alloc(size_t sz, void *arg)
666{
667
668	return(mandoc_malloc(sz));
669}
670
671static void
672hash_free(void *p, size_t sz, void *arg)
673{
674
675	free(p);
676}
677