mdoc.c revision 1.23
1/*	$Id: mdoc.c,v 1.23 2009/08/22 17:21:24 schwarze Exp $ */
2/*
3 * Copyright (c) 2008, 2009 Kristaps Dzonsons <kristaps@kth.se>
4 *
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 */
17#include <assert.h>
18#include <ctype.h>
19#include <stdarg.h>
20#include <stdio.h>
21#include <stdlib.h>
22#include <string.h>
23
24#include "libmdoc.h"
25
/*
 * Human-readable text for every error/warning code.  mdoc_err() looks a
 * message up by casting its enum merr argument to an index into this
 * table, so the entries must stay in the same order as the enum
 * constants named in the trailing comments (the enum itself is declared
 * outside this file).
 */
26const	char *const __mdoc_merrnames[MERRMAX] = {
27	"trailing whitespace", /* ETAILWS */
28	"unexpected quoted parameter", /* EQUOTPARM */
29	"unterminated quoted parameter", /* EQUOTTERM */
30	"system: malloc error", /* EMALLOC */
31	"argument parameter suggested", /* EARGVAL */
32	"macro not callable", /* ENOCALL */
33	"macro disallowed in prologue", /* EBODYPROL */
34	"macro disallowed in body", /* EPROLBODY */
35	"text disallowed in prologue", /* ETEXTPROL */
36	"blank line disallowed", /* ENOBLANK */
37	"text parameter too long", /* ETOOLONG */
38	"invalid escape sequence", /* EESCAPE */
39	"invalid character", /* EPRINT */
40	"document has no body", /* ENODAT */
41	"document has no prologue", /* ENOPROLOGUE */
42	"expected line arguments", /* ELINE */
43	"invalid AT&T argument", /* EATT */
44	"default name not yet set", /* ENAME */
45	"missing list type", /* ELISTTYPE */
46	"missing display type", /* EDISPTYPE */
47	"too many display types", /* EMULTIDISP */
48	"too many list types", /* EMULTILIST */
49	"NAME section must be first", /* ESECNAME */
50	"badly-formed NAME section", /* ENAMESECINC */
51	"argument repeated", /* EARGREP */
52	"expected boolean parameter", /* EBOOL */
53	"inconsistent column syntax", /* ECOLMIS */
54	"nested display invalid", /* ENESTDISP */
55	"width argument missing", /* EMISSWIDTH */
56	"invalid section for this manual section", /* EWRONGMSEC */
57	"section out of conventional order", /* ESECOOO */
58	"section repeated", /* ESECREP */
59	"invalid standard argument", /* EBADSTAND */
60	"multi-line arguments discouraged", /* ENOMULTILINE */
61	"multi-line arguments suggested", /* EMULTILINE */
62	"line arguments discouraged", /* ENOLINE */
63	"prologue macro out of conventional order", /* EPROLOOO */
64	"prologue macro repeated", /* EPROLREP */
65	"invalid manual section", /* EBADMSEC */
66	"invalid section", /* EBADSEC */
67	"invalid font mode", /* EFONT */
68	"invalid date syntax", /* EBADDATE */
69	"invalid number format", /* ENUMFMT */
70	"superfluous width argument", /* ENOWIDTH */
71	"system: utsname error", /* EUTSNAME */
72	"obsolete macro", /* EOBS */
73	"macro-like parameter", /* EMACPARM */
74	"end-of-line scope violation", /* EIMPBRK */
75	"empty macro ignored", /* EIGNE */
76	"unclosed explicit scope", /* EOPEN */
77	"unterminated quoted phrase", /* EQUOTPHR */
78	"closure macro without prior context", /* ENOCTX */
79	"invalid whitespace after control character", /* ESPACE */
80	"no description found for library" /* ELIB */
81};
82
/*
 * Textual name of every macro, one entry per slot up to MDOC_MAX.
 * NOTE(review): the order presumably mirrors the macro token constants
 * declared outside this file -- confirm before inserting or reordering.
 * The LINTED markers precede rows that contain the "\%" escape.
 */
83const	char *const __mdoc_macronames[MDOC_MAX] = {
84	"Ap",		"Dd",		"Dt",		"Os",
85	"Sh",		"Ss",		"Pp",		"D1",
86	"Dl",		"Bd",		"Ed",		"Bl",
87	"El",		"It",		"Ad",		"An",
88	"Ar",		"Cd",		"Cm",		"Dv",
89	"Er",		"Ev",		"Ex",		"Fa",
90	"Fd",		"Fl",		"Fn",		"Ft",
91	"Ic",		"In",		"Li",		"Nd",
92	"Nm",		"Op",		"Ot",		"Pa",
93	"Rv",		"St",		"Va",		"Vt",
94	/* LINTED */
95	"Xr",		"\%A",		"\%B",		"\%D",
96	/* LINTED */
97	"\%I",		"\%J",		"\%N",		"\%O",
98	/* LINTED */
99	"\%P",		"\%R",		"\%T",		"\%V",
100	"Ac",		"Ao",		"Aq",		"At",
101	"Bc",		"Bf",		"Bo",		"Bq",
102	"Bsx",		"Bx",		"Db",		"Dc",
103	"Do",		"Dq",		"Ec",		"Ef",
104	"Em",		"Eo",		"Fx",		"Ms",
105	"No",		"Ns",		"Nx",		"Ox",
106	"Pc",		"Pf",		"Po",		"Pq",
107	"Qc",		"Ql",		"Qo",		"Qq",
108	"Re",		"Rs",		"Sc",		"So",
109	"Sq",		"Sm",		"Sx",		"Sy",
110	"Tn",		"Ux",		"Xc",		"Xo",
111	"Fo",		"Fc",		"Oo",		"Oc",
112	"Bk",		"Ek",		"Bt",		"Hf",
113	"Fr",		"Ud",		"Lb",		"Lp",
114	"Lk",		"Mt",		"Brq",		"Bro",
115	/* LINTED */
116	"Brc",		"\%C",		"Es",		"En",
117	/* LINTED */
118	"Dx",		"\%Q",		"br",		"sp"
119	};
120
/*
 * Textual name of every macro argument, one entry per slot up to
 * MDOC_ARG_MAX.
 * NOTE(review): the order presumably mirrors the argument constants
 * declared outside this file -- confirm before inserting or reordering.
 */
121const	char *const __mdoc_argnames[MDOC_ARG_MAX] = {
122	"split",		"nosplit",		"ragged",
123	"unfilled",		"literal",		"file",
124	"offset",		"bullet",		"dash",
125	"hyphen",		"item",			"enum",
126	"tag",			"diag",			"hang",
127	"ohang",		"inset",		"column",
128	"width",		"compact",		"std",
129	"filled",		"words",		"emphasis",
130	"symbolic",		"nested"
131	};
132
/*
 * Non-static pointers to the macro and argument name tables, giving
 * other translation units read-only access to them.
 */
133const	char * const *mdoc_macronames = __mdoc_macronames;
134const	char * const *mdoc_argnames = __mdoc_argnames;
135
/* Forward declarations of the file-local helpers defined further down. */
136static	void		  mdoc_free1(struct mdoc *);
137static	int		  mdoc_alloc1(struct mdoc *);
138static	struct mdoc_node *node_alloc(struct mdoc *, int, int,
139				int, enum mdoc_type);
140static	int		  node_append(struct mdoc *,
141				struct mdoc_node *);
142static	int		  parsetext(struct mdoc *, int, char *);
143static	int		  parsemacro(struct mdoc *, int, char *);
144static	int		  macrowarn(struct mdoc *, int, const char *);
145static	int		  pstring(struct mdoc *, int, int,
146				const char *, size_t);
147
148
149const struct mdoc_node *
150mdoc_node(const struct mdoc *m)
151{
152
153	return(MDOC_HALT & m->flags ? NULL : m->first);
154}
155
156
157const struct mdoc_meta *
158mdoc_meta(const struct mdoc *m)
159{
160
161	return(MDOC_HALT & m->flags ? NULL : &m->meta);
162}
163
164
165/*
166 * Frees volatile resources (parse tree, meta-data, fields).
167 */
168static void
169mdoc_free1(struct mdoc *mdoc)
170{
171
172	if (mdoc->first)
173		mdoc_node_freelist(mdoc->first);
174	if (mdoc->meta.title)
175		free(mdoc->meta.title);
176	if (mdoc->meta.os)
177		free(mdoc->meta.os);
178	if (mdoc->meta.name)
179		free(mdoc->meta.name);
180	if (mdoc->meta.arch)
181		free(mdoc->meta.arch);
182	if (mdoc->meta.vol)
183		free(mdoc->meta.vol);
184}
185
186
187/*
188 * Allocate all volatile resources (parse tree, meta-data, fields).
189 */
190static int
191mdoc_alloc1(struct mdoc *mdoc)
192{
193
194	bzero(&mdoc->meta, sizeof(struct mdoc_meta));
195	mdoc->flags = 0;
196	mdoc->lastnamed = mdoc->lastsec = SEC_NONE;
197	mdoc->last = calloc(1, sizeof(struct mdoc_node));
198	if (NULL == mdoc->last)
199		return(0);
200
201	mdoc->first = mdoc->last;
202	mdoc->last->type = MDOC_ROOT;
203	mdoc->next = MDOC_NEXT_CHILD;
204	return(1);
205}
206
207
/*
 * Prepare the parser for another document: release the volatile state
 * with mdoc_free1() and build it afresh with mdoc_alloc1().  Data that
 * lives across parses is left untouched.  The result of mdoc_alloc1()
 * is passed back to the caller unchanged.
 */
int
mdoc_reset(struct mdoc *mdoc)
{
	int		 rc;

	mdoc_free1(mdoc);
	rc = mdoc_alloc1(mdoc);

	return(rc);
}
221
222
223/*
224 * Completely free up all volatile and non-volatile parse resources.
225 * After invocation, the pointer is no longer usable.
226 */
227void
228mdoc_free(struct mdoc *mdoc)
229{
230
231	mdoc_free1(mdoc);
232	if (mdoc->htab)
233		mdoc_hash_free(mdoc->htab);
234	free(mdoc);
235}
236
237
238/*
239 * Allocate volatile and non-volatile parse resources.
240 */
241struct mdoc *
242mdoc_alloc(void *data, int pflags, const struct mdoc_cb *cb)
243{
244	struct mdoc	*p;
245
246	if (NULL == (p = calloc(1, sizeof(struct mdoc))))
247		return(NULL);
248	if (cb)
249		(void)memcpy(&p->cb, cb, sizeof(struct mdoc_cb));
250
251	p->data = data;
252	p->pflags = pflags;
253
254	if (NULL == (p->htab = mdoc_hash_alloc())) {
255		free(p);
256		return(NULL);
257	} else if (mdoc_alloc1(p))
258		return(p);
259
260	free(p);
261	return(NULL);
262}
263
264
265/*
266 * Climb back up the parse tree, validating open scopes.  Mostly calls
267 * through to macro_end() in macro.c.
268 */
269int
270mdoc_endparse(struct mdoc *m)
271{
272
273	if (MDOC_HALT & m->flags)
274		return(0);
275	else if (mdoc_macroend(m))
276		return(1);
277	m->flags |= MDOC_HALT;
278	return(0);
279}
280
281
282/*
283 * Main parse routine.  Parses a single line -- really just hands off to
284 * the macro (parsemacro()) or text parser (parsetext()).
285 */
286int
287mdoc_parseln(struct mdoc *m, int ln, char *buf)
288{
289
290	if (MDOC_HALT & m->flags)
291		return(0);
292
293	return('.' == *buf ? parsemacro(m, ln, buf) :
294			parsetext(m, ln, buf));
295}
296
297
298int
299mdoc_verr(struct mdoc *mdoc, int ln, int pos,
300		const char *fmt, ...)
301{
302	char		 buf[256];
303	va_list		 ap;
304
305	if (NULL == mdoc->cb.mdoc_err)
306		return(0);
307
308	va_start(ap, fmt);
309	(void)vsnprintf(buf, sizeof(buf) - 1, fmt, ap);
310	va_end(ap);
311
312	return((*mdoc->cb.mdoc_err)(mdoc->data, ln, pos, buf));
313}
314
315
316int
317mdoc_vwarn(struct mdoc *mdoc, int ln, int pos, const char *fmt, ...)
318{
319	char		 buf[256];
320	va_list		 ap;
321
322	if (NULL == mdoc->cb.mdoc_warn)
323		return(0);
324
325	va_start(ap, fmt);
326	(void)vsnprintf(buf, sizeof(buf) - 1, fmt, ap);
327	va_end(ap);
328
329	return((*mdoc->cb.mdoc_warn)(mdoc->data, ln, pos, buf));
330}
331
332
333int
334mdoc_err(struct mdoc *m, int line, int pos, int iserr, enum merr type)
335{
336	const char	*p;
337
338	p = __mdoc_merrnames[(int)type];
339	assert(p);
340
341	if (iserr)
342		return(mdoc_verr(m, line, pos, p));
343
344	return(mdoc_vwarn(m, line, pos, p));
345}
346
347
348int
349mdoc_macro(struct mdoc *m, int tok,
350		int ln, int pp, int *pos, char *buf)
351{
352
353	if (MDOC_PROLOGUE & mdoc_macros[tok].flags &&
354			MDOC_PBODY & m->flags)
355		return(mdoc_perr(m, ln, pp, EPROLBODY));
356	if ( ! (MDOC_PROLOGUE & mdoc_macros[tok].flags) &&
357			! (MDOC_PBODY & m->flags))
358		return(mdoc_perr(m, ln, pp, EBODYPROL));
359
360	if (1 != pp && ! (MDOC_CALLABLE & mdoc_macros[tok].flags))
361		return(mdoc_perr(m, ln, pp, ENOCALL));
362
363	return((*mdoc_macros[tok].fp)(m, tok, ln, pp, pos, buf));
364}
365
366
367static int
368node_append(struct mdoc *mdoc, struct mdoc_node *p)
369{
370
371	assert(mdoc->last);
372	assert(mdoc->first);
373	assert(MDOC_ROOT != p->type);
374
375	switch (mdoc->next) {
376	case (MDOC_NEXT_SIBLING):
377		mdoc->last->next = p;
378		p->prev = mdoc->last;
379		p->parent = mdoc->last->parent;
380		break;
381	case (MDOC_NEXT_CHILD):
382		mdoc->last->child = p;
383		p->parent = mdoc->last;
384		break;
385	default:
386		abort();
387		/* NOTREACHED */
388	}
389
390	p->parent->nchild++;
391
392	if ( ! mdoc_valid_pre(mdoc, p))
393		return(0);
394	if ( ! mdoc_action_pre(mdoc, p))
395		return(0);
396
397	switch (p->type) {
398	case (MDOC_HEAD):
399		assert(MDOC_BLOCK == p->parent->type);
400		p->parent->head = p;
401		break;
402	case (MDOC_TAIL):
403		assert(MDOC_BLOCK == p->parent->type);
404		p->parent->tail = p;
405		break;
406	case (MDOC_BODY):
407		assert(MDOC_BLOCK == p->parent->type);
408		p->parent->body = p;
409		break;
410	default:
411		break;
412	}
413
414	mdoc->last = p;
415
416	switch (p->type) {
417	case (MDOC_TEXT):
418		if ( ! mdoc_valid_post(mdoc))
419			return(0);
420		if ( ! mdoc_action_post(mdoc))
421			return(0);
422		break;
423	default:
424		break;
425	}
426
427	return(1);
428}
429
430
431static struct mdoc_node *
432node_alloc(struct mdoc *m, int line,
433		int pos, int tok, enum mdoc_type type)
434{
435	struct mdoc_node *p;
436
437	if (NULL == (p = calloc(1, sizeof(struct mdoc_node)))) {
438		(void)mdoc_nerr(m, m->last, EMALLOC);
439		return(NULL);
440	}
441
442	p->sec = m->lastsec;
443	p->line = line;
444	p->pos = pos;
445	p->tok = tok;
446	if (MDOC_TEXT != (p->type = type))
447		assert(p->tok >= 0);
448
449	return(p);
450}
451
452
453int
454mdoc_tail_alloc(struct mdoc *m, int line, int pos, int tok)
455{
456	struct mdoc_node *p;
457
458	p = node_alloc(m, line, pos, tok, MDOC_TAIL);
459	if (NULL == p)
460		return(0);
461	return(node_append(m, p));
462}
463
464
465int
466mdoc_head_alloc(struct mdoc *m, int line, int pos, int tok)
467{
468	struct mdoc_node *p;
469
470	assert(m->first);
471	assert(m->last);
472
473	p = node_alloc(m, line, pos, tok, MDOC_HEAD);
474	if (NULL == p)
475		return(0);
476	return(node_append(m, p));
477}
478
479
480int
481mdoc_body_alloc(struct mdoc *m, int line, int pos, int tok)
482{
483	struct mdoc_node *p;
484
485	p = node_alloc(m, line, pos, tok, MDOC_BODY);
486	if (NULL == p)
487		return(0);
488	return(node_append(m, p));
489}
490
491
492int
493mdoc_block_alloc(struct mdoc *m, int line, int pos,
494		int tok, struct mdoc_arg *args)
495{
496	struct mdoc_node *p;
497
498	p = node_alloc(m, line, pos, tok, MDOC_BLOCK);
499	if (NULL == p)
500		return(0);
501	p->args = args;
502	if (p->args)
503		(args->refcnt)++;
504	return(node_append(m, p));
505}
506
507
508int
509mdoc_elem_alloc(struct mdoc *m, int line, int pos,
510		int tok, struct mdoc_arg *args)
511{
512	struct mdoc_node *p;
513
514	p = node_alloc(m, line, pos, tok, MDOC_ELEM);
515	if (NULL == p)
516		return(0);
517	p->args = args;
518	if (p->args)
519		(args->refcnt)++;
520	return(node_append(m, p));
521}
522
523
524static int
525pstring(struct mdoc *m, int line, int pos, const char *p, size_t len)
526{
527	struct mdoc_node *n;
528	size_t		  sv;
529
530	n = node_alloc(m, line, pos, -1, MDOC_TEXT);
531	if (NULL == n)
532		return(mdoc_nerr(m, m->last, EMALLOC));
533
534	n->string = malloc(len + 1);
535	if (NULL == n->string) {
536		free(n);
537		return(mdoc_nerr(m, m->last, EMALLOC));
538	}
539
540	sv = strlcpy(n->string, p, len + 1);
541
542	/* Prohibit truncation. */
543	assert(sv < len + 1);
544
545	return(node_append(m, n));
546}
547
548
/*
 * Append a text node containing a copy of the whole NUL-terminated
 * string p.  Thin wrapper around pstring() that supplies the string's
 * own length.
 */
int
mdoc_word_alloc(struct mdoc *m, int line, int pos, const char *p)
{
	size_t		 len;

	len = strlen(p);

	return(pstring(m, line, pos, p, len));
}
555
556
557void
558mdoc_node_free(struct mdoc_node *p)
559{
560
561	if (p->parent)
562		p->parent->nchild--;
563	if (p->string)
564		free(p->string);
565	if (p->args)
566		mdoc_argv_free(p->args);
567	free(p);
568}
569
570
571void
572mdoc_node_freelist(struct mdoc_node *p)
573{
574
575	if (p->child)
576		mdoc_node_freelist(p->child);
577	if (p->next)
578		mdoc_node_freelist(p->next);
579
580	assert(0 == p->nchild);
581	mdoc_node_free(p);
582}
583
584
585/*
586 * Parse free-form text, that is, a line that does not begin with the
587 * control character.
588 */
589static int
590parsetext(struct mdoc *m, int line, char *buf)
591{
592	int		 i, j;
593
594	if (SEC_NONE == m->lastnamed)
595		return(mdoc_perr(m, line, 0, ETEXTPROL));
596
597	/*
598	 * If in literal mode, then pass the buffer directly to the
599	 * back-end, as it should be preserved as a single term.
600	 */
601
602	if (MDOC_LITERAL & m->flags) {
603		if ( ! mdoc_word_alloc(m, line, 0, buf))
604			return(0);
605		m->next = MDOC_NEXT_SIBLING;
606		return(1);
607	}
608
609	/* Disallow blank/white-space lines in non-literal mode. */
610
611	for (i = 0; ' ' == buf[i]; i++)
612		/* Skip leading whitespace. */ ;
613	if (0 == buf[i])
614		return(mdoc_perr(m, line, 0, ENOBLANK));
615
616	/*
617	 * Break apart a free-form line into tokens.  Spaces are
618	 * stripped out of the input.
619	 */
620
621	for (j = i; buf[i]; i++) {
622		if (' ' != buf[i])
623			continue;
624
625		/* Escaped whitespace. */
626		if (i && ' ' == buf[i] && '\\' == buf[i - 1])
627			continue;
628
629		buf[i++] = 0;
630		if ( ! pstring(m, line, j, &buf[j], (size_t)(i - j)))
631			return(0);
632		m->next = MDOC_NEXT_SIBLING;
633
634		for ( ; ' ' == buf[i]; i++)
635			/* Skip trailing whitespace. */ ;
636
637		j = i;
638		if (0 == buf[i])
639			break;
640	}
641
642	if (j != i && ! pstring(m, line, j, &buf[j], (size_t)(i - j)))
643		return(0);
644
645	m->next = MDOC_NEXT_SIBLING;
646	return(1);
647}
648
649
650
651
652static int
653macrowarn(struct mdoc *m, int ln, const char *buf)
654{
655	if ( ! (MDOC_IGN_MACRO & m->pflags))
656		return(mdoc_verr(m, ln, 0,
657				"unknown macro: %s%s",
658				buf, strlen(buf) > 3 ? "..." : ""));
659	return(mdoc_vwarn(m, ln, 0, "unknown macro: %s%s",
660				buf, strlen(buf) > 3 ? "..." : ""));
661}
662
663
664/*
665 * Parse a macro line, that is, a line beginning with the control
666 * character.
667 */
668int
669parsemacro(struct mdoc *m, int ln, char *buf)
670{
671	int		  i, c;
672	char		  mac[5];
673
674	/* Empty lines are ignored. */
675
676	if (0 == buf[1])
677		return(1);
678
679	if (' ' == buf[1]) {
680		i = 2;
681		while (buf[i] && ' ' == buf[i])
682			i++;
683		if (0 == buf[i])
684			return(1);
685		return(mdoc_perr(m, ln, 1, ESPACE));
686	}
687
688	/* Copy the first word into a nil-terminated buffer. */
689
690	for (i = 1; i < 5; i++) {
691		if (0 == (mac[i - 1] = buf[i]))
692			break;
693		else if (' ' == buf[i])
694			break;
695	}
696
697	mac[i - 1] = 0;
698
699	if (i == 5 || i <= 2) {
700		if ( ! macrowarn(m, ln, mac))
701			goto err;
702		return(1);
703	}
704
705	if (MDOC_MAX == (c = mdoc_hash_find(m->htab, mac))) {
706		if ( ! macrowarn(m, ln, mac))
707			goto err;
708		return(1);
709	}
710
711	/* The macro is sane.  Jump to the next word. */
712
713	while (buf[i] && ' ' == buf[i])
714		i++;
715
716	/* Begin recursive parse sequence. */
717
718	if ( ! mdoc_macro(m, c, ln, 1, &i, buf))
719		goto err;
720
721	return(1);
722
723err:	/* Error out. */
724
725	m->flags |= MDOC_HALT;
726	return(0);
727}
728