mansearch.c revision 1.6
1/*	$Id: mansearch.c,v 1.6 2014/01/05 03:06:36 schwarze Exp $ */
2/*
3 * Copyright (c) 2012 Kristaps Dzonsons <kristaps@bsd.lv>
4 * Copyright (c) 2013, 2014 Ingo Schwarze <schwarze@openbsd.org>
5 *
6 * Permission to use, copy, modify, and distribute this software for any
7 * purpose with or without fee is hereby granted, provided that the above
8 * copyright notice and this permission notice appear in all copies.
9 *
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 */
18#include <assert.h>
19#include <fcntl.h>
20#include <getopt.h>
21#include <limits.h>
22#include <regex.h>
23#include <stdio.h>
24#include <stdint.h>
25#include <stddef.h>
26#include <stdlib.h>
27#include <string.h>
28#include <unistd.h>
29
30#include <ohash.h>
31#include <sqlite3.h>
32
33#include "mandoc.h"
34#include "manpath.h"
35#include "mansearch.h"
36
/*
 * Bind one value to a parameter of a prepared statement.
 * All three helpers post-increment the index argument, so it has to
 * be a modifiable lvalue.  A failing bind is reported on stderr with
 * the database's error message and is otherwise ignored.
 * The values are bound with SQLITE_STATIC.
 */
#define	SQL_BIND_TEXT(_db, _s, _i, _v) \
	do { \
		if (SQLITE_OK != sqlite3_bind_text((_s), (_i)++, \
		    (_v), -1, SQLITE_STATIC)) \
			fprintf(stderr, "%s\n", sqlite3_errmsg((_db))); \
	} while (0)
#define	SQL_BIND_INT64(_db, _s, _i, _v) \
	do { \
		if (SQLITE_OK != sqlite3_bind_int64((_s), (_i)++, (_v))) \
			fprintf(stderr, "%s\n", sqlite3_errmsg((_db))); \
	} while (0)
/* Binds the raw bytes of the object _v itself, not what it points to. */
#define	SQL_BIND_BLOB(_db, _s, _i, _v) \
	do { \
		if (SQLITE_OK != sqlite3_bind_blob((_s), (_i)++, \
		    (&_v), sizeof(_v), SQLITE_STATIC)) \
			fprintf(stderr, "%s\n", sqlite3_errmsg((_db))); \
	} while (0)
52
/*
 * One term of a compiled search expression; terms form a singly
 * linked list in the order they were given.
 * A term is a substring search when "substr" is set and a regular
 * expression search otherwise.
 */
struct	expr {
	/* mask of the key types this term applies to */
	uint64_t	 bits;
	/* string to search for, if applicable */
	const char	*substr;
	/* compiled regular expression, if applicable */
	regex_t		 regexp;
	/* number of opening parentheses before this term */
	int		 open;
	/* non-zero: joined to the previous term by logical AND */
	int		 and;
	/* number of closing parentheses after this term */
	int		 close;
	/* next term in sequence */
	struct expr	*next;
};
62
/*
 * A page found by the search query, kept in a hash table keyed
 * on "id" until it is copied into the result array.
 */
struct	match {
	/* identifier in database; used as the hash key */
	uint64_t	 id;
	/* relative filepath of manpage */
	char		*file;
	/* description of manpage */
	char		*desc;
	/* 0 == catpage */
	int		 form;
};
69
/*
 * Associates the name of a search key as typed by the user
 * with the bit mask selecting that key.
 */
struct	type {
	uint64_t	 bits;	/* mask for this key */
	const char	*name;	/* key name, compared case-insensitively */
};
74
75static	const struct type types[] = {
76	{ TYPE_An,  "An" },
77	{ TYPE_Ar,  "Ar" },
78	{ TYPE_At,  "At" },
79	{ TYPE_Bsx, "Bsx" },
80	{ TYPE_Bx,  "Bx" },
81	{ TYPE_Cd,  "Cd" },
82	{ TYPE_Cm,  "Cm" },
83	{ TYPE_Dv,  "Dv" },
84	{ TYPE_Dx,  "Dx" },
85	{ TYPE_Em,  "Em" },
86	{ TYPE_Er,  "Er" },
87	{ TYPE_Ev,  "Ev" },
88	{ TYPE_Fa,  "Fa" },
89	{ TYPE_Fl,  "Fl" },
90	{ TYPE_Fn,  "Fn" },
91	{ TYPE_Fn,  "Fo" },
92	{ TYPE_Ft,  "Ft" },
93	{ TYPE_Fx,  "Fx" },
94	{ TYPE_Ic,  "Ic" },
95	{ TYPE_In,  "In" },
96	{ TYPE_Lb,  "Lb" },
97	{ TYPE_Li,  "Li" },
98	{ TYPE_Lk,  "Lk" },
99	{ TYPE_Ms,  "Ms" },
100	{ TYPE_Mt,  "Mt" },
101	{ TYPE_Nd,  "Nd" },
102	{ TYPE_Nm,  "Nm" },
103	{ TYPE_Nx,  "Nx" },
104	{ TYPE_Ox,  "Ox" },
105	{ TYPE_Pa,  "Pa" },
106	{ TYPE_Rs,  "Rs" },
107	{ TYPE_Sh,  "Sh" },
108	{ TYPE_Ss,  "Ss" },
109	{ TYPE_St,  "St" },
110	{ TYPE_Sy,  "Sy" },
111	{ TYPE_Tn,  "Tn" },
112	{ TYPE_Va,  "Va" },
113	{ TYPE_Va,  "Vt" },
114	{ TYPE_Xr,  "Xr" },
115	{ TYPE_sec, "sec" },
116	{ TYPE_arch,"arch" },
117	{ ~0ULL,    "any" },
118	{ 0ULL, NULL }
119};
120
121static	char		*buildnames(sqlite3 *, sqlite3_stmt *, uint64_t);
122static	char		*buildoutput(sqlite3 *, sqlite3_stmt *,
123				 uint64_t, uint64_t);
124static	void		*hash_alloc(size_t, void *);
125static	void		 hash_free(void *, size_t, void *);
126static	void		*hash_halloc(size_t, void *);
127static	struct expr	*exprcomp(const struct mansearch *,
128				int, char *[]);
129static	void		 exprfree(struct expr *);
130static	struct expr	*exprspec(struct expr *, uint64_t,
131				 const char *, const char *);
132static	struct expr	*exprterm(const struct mansearch *, char *, int);
133static	void		 sql_append(char **sql, size_t *sz,
134				const char *newstr, int count);
135static	void		 sql_match(sqlite3_context *context,
136				int argc, sqlite3_value **argv);
137static	void		 sql_regexp(sqlite3_context *context,
138				int argc, sqlite3_value **argv);
139static	char		*sql_statement(const struct expr *);
140
141int
142mansearch(const struct mansearch *search,
143		const struct manpaths *paths,
144		int argc, char *argv[],
145		const char *outkey,
146		struct manpage **res, size_t *sz)
147{
148	int		 fd, rc, c, ibit;
149	int64_t		 id;
150	uint64_t	 outbit;
151	char		 buf[PATH_MAX];
152	char		*sql;
153	struct manpage	*mpage;
154	struct expr	*e, *ep;
155	sqlite3		*db;
156	sqlite3_stmt	*s, *s2;
157	struct match	*mp;
158	struct ohash_info info;
159	struct ohash	 htab;
160	unsigned int	 idx;
161	size_t		 i, j, cur, maxres;
162
163	memset(&info, 0, sizeof(struct ohash_info));
164
165	info.halloc = hash_halloc;
166	info.alloc = hash_alloc;
167	info.hfree = hash_free;
168	info.key_offset = offsetof(struct match, id);
169
170	*sz = cur = maxres = 0;
171	sql = NULL;
172	*res = NULL;
173	fd = -1;
174	e = NULL;
175	rc = 0;
176
177	if (0 == argc)
178		goto out;
179	if (NULL == (e = exprcomp(search, argc, argv)))
180		goto out;
181
182	outbit = 0;
183	if (NULL != outkey) {
184		for (ibit = 0; types[ibit].bits; ibit++) {
185			if (0 == strcasecmp(types[ibit].name, outkey)) {
186				outbit = types[ibit].bits;
187				break;
188			}
189		}
190	}
191
192	/*
193	 * Save a descriptor to the current working directory.
194	 * Since pathnames in the "paths" variable might be relative,
195	 * and we'll be chdir()ing into them, we need to keep a handle
196	 * on our current directory from which to start the chdir().
197	 */
198
199	if (NULL == getcwd(buf, PATH_MAX)) {
200		perror(NULL);
201		goto out;
202	} else if (-1 == (fd = open(buf, O_RDONLY, 0))) {
203		perror(buf);
204		goto out;
205	}
206
207	sql = sql_statement(e);
208
209	/*
210	 * Loop over the directories (containing databases) for us to
211	 * search.
212	 * Don't let missing/bad databases/directories phase us.
213	 * In each, try to open the resident database and, if it opens,
214	 * scan it for our match expression.
215	 */
216
217	for (i = 0; i < paths->sz; i++) {
218		if (-1 == fchdir(fd)) {
219			perror(buf);
220			free(*res);
221			break;
222		} else if (-1 == chdir(paths->paths[i])) {
223			perror(paths->paths[i]);
224			continue;
225		}
226
227		c =  sqlite3_open_v2
228			(MANDOC_DB, &db,
229			 SQLITE_OPEN_READONLY, NULL);
230
231		if (SQLITE_OK != c) {
232			perror(MANDOC_DB);
233			sqlite3_close(db);
234			continue;
235		}
236
237		/*
238		 * Define the SQL functions for substring
239		 * and regular expression matching.
240		 */
241
242		c = sqlite3_create_function(db, "match", 2,
243		    SQLITE_ANY, NULL, sql_match, NULL, NULL);
244		assert(SQLITE_OK == c);
245		c = sqlite3_create_function(db, "regexp", 2,
246		    SQLITE_ANY, NULL, sql_regexp, NULL, NULL);
247		assert(SQLITE_OK == c);
248
249		j = 1;
250		c = sqlite3_prepare_v2(db, sql, -1, &s, NULL);
251		if (SQLITE_OK != c)
252			fprintf(stderr, "%s\n", sqlite3_errmsg(db));
253
254		for (ep = e; NULL != ep; ep = ep->next) {
255			if (NULL == ep->substr) {
256				SQL_BIND_BLOB(db, s, j, ep->regexp);
257			} else
258				SQL_BIND_TEXT(db, s, j, ep->substr);
259			SQL_BIND_INT64(db, s, j, ep->bits);
260		}
261
262		memset(&htab, 0, sizeof(struct ohash));
263		ohash_init(&htab, 4, &info);
264
265		/*
266		 * Hash each entry on its [unique] document identifier.
267		 * This is a uint64_t.
268		 * Instead of using a hash function, simply convert the
269		 * uint64_t to a uint32_t, the hash value's type.
270		 * This gives good performance and preserves the
271		 * distribution of buckets in the table.
272		 */
273		while (SQLITE_ROW == (c = sqlite3_step(s))) {
274			id = sqlite3_column_int64(s, 5);
275			idx = ohash_lookup_memory
276				(&htab, (char *)&id,
277				 sizeof(uint64_t), (uint32_t)id);
278
279			if (NULL != ohash_find(&htab, idx))
280				continue;
281
282			mp = mandoc_calloc(1, sizeof(struct match));
283			mp->id = id;
284			mp->file = mandoc_strdup
285				((char *)sqlite3_column_text(s, 0));
286			mp->desc = mandoc_strdup
287				((char *)sqlite3_column_text(s, 3));
288			mp->form = sqlite3_column_int(s, 4);
289			ohash_insert(&htab, idx, mp);
290		}
291
292		if (SQLITE_DONE != c)
293			fprintf(stderr, "%s\n", sqlite3_errmsg(db));
294
295		sqlite3_finalize(s);
296
297		c = sqlite3_prepare_v2(db,
298		    "SELECT * FROM mlinks WHERE pageid=?",
299		    -1, &s, NULL);
300		if (SQLITE_OK != c)
301			fprintf(stderr, "%s\n", sqlite3_errmsg(db));
302
303		c = sqlite3_prepare_v2(db,
304		    "SELECT * FROM keys WHERE pageid=? AND bits & ?",
305		    -1, &s2, NULL);
306		if (SQLITE_OK != c)
307			fprintf(stderr, "%s\n", sqlite3_errmsg(db));
308
309		for (mp = ohash_first(&htab, &idx);
310				NULL != mp;
311				mp = ohash_next(&htab, &idx)) {
312			if (cur + 1 > maxres) {
313				maxres += 1024;
314				*res = mandoc_realloc
315					(*res, maxres * sizeof(struct manpage));
316			}
317			mpage = *res + cur;
318			if (-1 == asprintf(&mpage->file, "%s/%s",
319			    paths->paths[i], mp->file)) {
320				perror(0);
321				exit((int)MANDOCLEVEL_SYSERR);
322			}
323			mpage->desc = mp->desc;
324			mpage->form = mp->form;
325			mpage->names = buildnames(db, s, mp->id);
326			mpage->output = outbit ?
327			    buildoutput(db, s2, mp->id, outbit) : NULL;
328
329			free(mp->file);
330			free(mp);
331			cur++;
332		}
333
334		sqlite3_finalize(s);
335		sqlite3_finalize(s2);
336		sqlite3_close(db);
337		ohash_delete(&htab);
338	}
339	rc = 1;
340out:
341	exprfree(e);
342	if (-1 != fd)
343		close(fd);
344	free(sql);
345	*sz = cur;
346	return(rc);
347}
348
349static char *
350buildnames(sqlite3 *db, sqlite3_stmt *s, uint64_t id)
351{
352	char		*names, *newnames;
353	const char	*oldnames, *sep1, *name, *sec, *sep2, *arch;
354	size_t		 i;
355	int		 c;
356
357	names = NULL;
358	i = 1;
359	SQL_BIND_INT64(db, s, i, id);
360	while (SQLITE_ROW == (c = sqlite3_step(s))) {
361		if (NULL == names) {
362			oldnames = "";
363			sep1 = "";
364		} else {
365			oldnames = names;
366			sep1 = ", ";
367		}
368		sec = sqlite3_column_text(s, 1);
369		arch = sqlite3_column_text(s, 2);
370		name = sqlite3_column_text(s, 3);
371		sep2 = '\0' == *arch ? "" : "/";
372		if (-1 == asprintf(&newnames, "%s%s%s(%s%s%s)",
373		    oldnames, sep1, name, sec, sep2, arch)) {
374			perror(0);
375			exit((int)MANDOCLEVEL_SYSERR);
376		}
377		free(names);
378		names = newnames;
379	}
380	if (SQLITE_DONE != c)
381		fprintf(stderr, "%s\n", sqlite3_errmsg(db));
382	sqlite3_reset(s);
383	return(names);
384}
385
386static char *
387buildoutput(sqlite3 *db, sqlite3_stmt *s, uint64_t id, uint64_t outbit)
388{
389	char		*output, *newoutput;
390	const char	*oldoutput, *sep1, *data;
391	size_t		 i;
392	int		 c;
393
394	output = NULL;
395	i = 1;
396	SQL_BIND_INT64(db, s, i, id);
397	SQL_BIND_INT64(db, s, i, outbit);
398	while (SQLITE_ROW == (c = sqlite3_step(s))) {
399		if (NULL == output) {
400			oldoutput = "";
401			sep1 = "";
402		} else {
403			oldoutput = output;
404			sep1 = " # ";
405		}
406		data = sqlite3_column_text(s, 1);
407		if (-1 == asprintf(&newoutput, "%s%s%s",
408		    oldoutput, sep1, data)) {
409			perror(0);
410			exit((int)MANDOCLEVEL_SYSERR);
411		}
412		free(output);
413		output = newoutput;
414	}
415	if (SQLITE_DONE != c)
416		fprintf(stderr, "%s\n", sqlite3_errmsg(db));
417	sqlite3_reset(s);
418	return(output);
419}
420
421/*
422 * Implement substring match as an application-defined SQL function.
423 * Using the SQL LIKE or GLOB operators instead would be a bad idea
424 * because that would require escaping metacharacters in the string
425 * being searched for.
426 */
427static void
428sql_match(sqlite3_context *context, int argc, sqlite3_value **argv)
429{
430
431	assert(2 == argc);
432	sqlite3_result_int(context, NULL != strcasestr(
433	    (const char *)sqlite3_value_text(argv[1]),
434	    (const char *)sqlite3_value_text(argv[0])));
435}
436
437/*
438 * Implement regular expression match
439 * as an application-defined SQL function.
440 */
441static void
442sql_regexp(sqlite3_context *context, int argc, sqlite3_value **argv)
443{
444
445	assert(2 == argc);
446	sqlite3_result_int(context, !regexec(
447	    (regex_t *)sqlite3_value_blob(argv[0]),
448	    (const char *)sqlite3_value_text(argv[1]),
449	    0, NULL, 0));
450}
451
/*
 * Grow the string *sql, whose current length is *sz, and keep it
 * NUL-terminated; *sz is updated to the new length.
 * With a count greater than one, the first character of "newstr"
 * is appended "count" times; otherwise, all of "newstr" is appended
 * once.
 */
static void
sql_append(char **sql, size_t *sz, const char *newstr, int count)
{
	size_t		 add;
	char		*tail;

	add = count > 1 ? (size_t)count : strlen(newstr);
	*sql = mandoc_realloc(*sql, *sz + add + 1);

	tail = *sql + *sz;
	if (count > 1)
		memset(tail, *newstr, add);
	else
		memcpy(tail, newstr, add);
	tail[add] = '\0';

	*sz += add;
}
466
467/*
468 * Prepare the search SQL statement.
469 */
470static char *
471sql_statement(const struct expr *e)
472{
473	char		*sql;
474	size_t		 sz;
475	int		 needop;
476
477	sql = mandoc_strdup("SELECT * FROM mpages WHERE ");
478	sz = strlen(sql);
479
480	for (needop = 0; NULL != e; e = e->next) {
481		if (e->and)
482			sql_append(&sql, &sz, " AND ", 1);
483		else if (needop)
484			sql_append(&sql, &sz, " OR ", 1);
485		if (e->open)
486			sql_append(&sql, &sz, "(", e->open);
487		sql_append(&sql, &sz, NULL == e->substr ?
488		    "id IN (SELECT pageid FROM keys "
489		    "WHERE key REGEXP ? AND bits & ?)" :
490		    "id IN (SELECT pageid FROM keys "
491		    "WHERE key MATCH ? AND bits & ?)", 1);
492		if (e->close)
493			sql_append(&sql, &sz, ")", e->close);
494		needop = 1;
495	}
496
497	return(sql);
498}
499
500/*
501 * Compile a set of string tokens into an expression.
502 * Tokens in "argv" are assumed to be individual expression atoms (e.g.,
503 * "(", "foo=bar", etc.).
504 */
505static struct expr *
506exprcomp(const struct mansearch *search, int argc, char *argv[])
507{
508	int		 i, toopen, logic, igncase, toclose;
509	struct expr	*first, *next, *cur;
510
511	first = cur = NULL;
512	logic = igncase = toclose = 0;
513	toopen = 1;
514
515	for (i = 0; i < argc; i++) {
516		if (0 == strcmp("(", argv[i])) {
517			if (igncase)
518				goto fail;
519			toopen++;
520			toclose++;
521			continue;
522		} else if (0 == strcmp(")", argv[i])) {
523			if (toopen || logic || igncase || NULL == cur)
524				goto fail;
525			cur->close++;
526			if (0 > --toclose)
527				goto fail;
528			continue;
529		} else if (0 == strcmp("-a", argv[i])) {
530			if (toopen || logic || igncase || NULL == cur)
531				goto fail;
532			logic = 1;
533			continue;
534		} else if (0 == strcmp("-o", argv[i])) {
535			if (toopen || logic || igncase || NULL == cur)
536				goto fail;
537			logic = 2;
538			continue;
539		} else if (0 == strcmp("-i", argv[i])) {
540			if (igncase)
541				goto fail;
542			igncase = 1;
543			continue;
544		}
545		next = exprterm(search, argv[i], !igncase);
546		if (NULL == next)
547			goto fail;
548		next->open = toopen;
549		next->and = (1 == logic);
550		if (NULL != first) {
551			cur->next = next;
552			cur = next;
553		} else
554			cur = first = next;
555		toopen = logic = igncase = 0;
556	}
557	if (toopen || logic || igncase || toclose)
558		goto fail;
559
560	cur->close++;
561	cur = exprspec(cur, TYPE_arch, search->arch, "^(%s|any)$");
562	exprspec(cur, TYPE_sec, search->sec, "^%s$");
563
564	return(first);
565
566fail:
567	if (NULL != first)
568		exprfree(first);
569	return(NULL);
570}
571
572static struct expr *
573exprspec(struct expr *cur, uint64_t key, const char *value,
574		const char *format)
575{
576	char	 errbuf[BUFSIZ];
577	char	*cp;
578	int	 irc;
579
580	if (NULL == value)
581		return(cur);
582
583	if (-1 == asprintf(&cp, format, value)) {
584		perror(0);
585		exit((int)MANDOCLEVEL_SYSERR);
586	}
587	cur->next = mandoc_calloc(1, sizeof(struct expr));
588	cur = cur->next;
589	cur->and = 1;
590	cur->bits = key;
591	if (0 != (irc = regcomp(&cur->regexp, cp,
592	    REG_EXTENDED | REG_NOSUB | REG_ICASE))) {
593		regerror(irc, &cur->regexp, errbuf, sizeof(errbuf));
594		fprintf(stderr, "regcomp: %s\n", errbuf);
595		cur->substr = value;
596	}
597	free(cp);
598	return(cur);
599}
600
601static struct expr *
602exprterm(const struct mansearch *search, char *buf, int cs)
603{
604	char		 errbuf[BUFSIZ];
605	struct expr	*e;
606	char		*key, *v;
607	size_t		 i;
608	int		 irc;
609
610	if ('\0' == *buf)
611		return(NULL);
612
613	e = mandoc_calloc(1, sizeof(struct expr));
614
615	/*"whatis" mode uses an opaque string and default fields. */
616
617	if (MANSEARCH_WHATIS & search->flags) {
618		e->substr = buf;
619		e->bits = search->deftype;
620		return(e);
621	}
622
623	/*
624	 * If no =~ is specified, search with equality over names and
625	 * descriptions.
626	 * If =~ begins the phrase, use name and description fields.
627	 */
628
629	if (NULL == (v = strpbrk(buf, "=~"))) {
630		e->substr = buf;
631		e->bits = search->deftype;
632		return(e);
633	} else if (v == buf)
634		e->bits = search->deftype;
635
636	if ('~' == *v++) {
637		if (0 != (irc = regcomp(&e->regexp, v,
638		    REG_EXTENDED | REG_NOSUB | (cs ? 0 : REG_ICASE)))) {
639			regerror(irc, &e->regexp, errbuf, sizeof(errbuf));
640			fprintf(stderr, "regcomp: %s\n", errbuf);
641			free(e);
642			return(NULL);
643		}
644	} else
645		e->substr = v;
646	v[-1] = '\0';
647
648	/*
649	 * Parse out all possible fields.
650	 * If the field doesn't resolve, bail.
651	 */
652
653	while (NULL != (key = strsep(&buf, ","))) {
654		if ('\0' == *key)
655			continue;
656		i = 0;
657		while (types[i].bits &&
658			strcasecmp(types[i].name, key))
659			i++;
660		if (0 == types[i].bits) {
661			free(e);
662			return(NULL);
663		}
664		e->bits |= types[i].bits;
665	}
666
667	return(e);
668}
669
670static void
671exprfree(struct expr *p)
672{
673	struct expr	*pp;
674
675	while (NULL != p) {
676		pp = p->next;
677		free(p);
678		p = pp;
679	}
680}
681
/*
 * Allocation callback installed as "halloc" in the ohash_info:
 * returns "sz" bytes obtained from mandoc_calloc().
 * The "arg" pointer is not used.
 */
static void *
hash_halloc(size_t sz, void *arg)
{
	void		*p;

	p = mandoc_calloc(sz, 1);
	return(p);
}
688
/*
 * Allocation callback installed as "alloc" in the ohash_info:
 * returns "sz" bytes obtained from mandoc_malloc().
 * The "arg" pointer is not used.
 */
static void *
hash_alloc(size_t sz, void *arg)
{
	void		*p;

	p = mandoc_malloc(sz);
	return(p);
}
695
/*
 * Deallocation callback installed as "hfree" in the ohash_info:
 * hands "p" to free().  Neither "sz" nor "arg" is used.
 */
static void
hash_free(void *p, size_t sz, void *arg)
{

	if (NULL != p)
		free(p);
}
702