1294113Sbapt/*	$Id: mdoc.c,v 1.256 2015/10/30 19:04:16 schwarze Exp $ */
2241675Suqs/*
3241675Suqs * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
4279527Sbapt * Copyright (c) 2010, 2012-2015 Ingo Schwarze <schwarze@openbsd.org>
5241675Suqs *
6241675Suqs * Permission to use, copy, modify, and distribute this software for any
7241675Suqs * purpose with or without fee is hereby granted, provided that the above
8241675Suqs * copyright notice and this permission notice appear in all copies.
9241675Suqs *
10294113Sbapt * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
11241675Suqs * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12294113Sbapt * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
13241675Suqs * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14241675Suqs * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15241675Suqs * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16241675Suqs * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17241675Suqs */
18241675Suqs#include "config.h"
19241675Suqs
20241675Suqs#include <sys/types.h>
21241675Suqs
22241675Suqs#include <assert.h>
23274880Sbapt#include <ctype.h>
24241675Suqs#include <stdarg.h>
25241675Suqs#include <stdio.h>
26241675Suqs#include <stdlib.h>
27241675Suqs#include <string.h>
28241675Suqs#include <time.h>
29241675Suqs
30294113Sbapt#include "mandoc_aux.h"
31294113Sbapt#include "mandoc.h"
32294113Sbapt#include "roff.h"
33241675Suqs#include "mdoc.h"
34294113Sbapt#include "libmandoc.h"
35294113Sbapt#include "roff_int.h"
36241675Suqs#include "libmdoc.h"
37241675Suqs
/*
 * Printable names of all mdoc macros.
 * The table holds MDOC_MAX + 1 strings: a leading "text" entry
 * followed by the macro names proper.
 * NOTE(review): the order presumably mirrors the macro token
 * enumeration in the mdoc headers -- confirm before reordering
 * or inserting entries.
 */
const	char *const __mdoc_macronames[MDOC_MAX + 1] = {
	"text",
	"Ap",		"Dd",		"Dt",		"Os",
	"Sh",		"Ss",		"Pp",		"D1",
	"Dl",		"Bd",		"Ed",		"Bl",
	"El",		"It",		"Ad",		"An",
	"Ar",		"Cd",		"Cm",		"Dv",
	"Er",		"Ev",		"Ex",		"Fa",
	"Fd",		"Fl",		"Fn",		"Ft",
	"Ic",		"In",		"Li",		"Nd",
	"Nm",		"Op",		"Ot",		"Pa",
	"Rv",		"St",		"Va",		"Vt",
	"Xr",		"%A",		"%B",		"%D",
	"%I",		"%J",		"%N",		"%O",
	"%P",		"%R",		"%T",		"%V",
	"Ac",		"Ao",		"Aq",		"At",
	"Bc",		"Bf",		"Bo",		"Bq",
	"Bsx",		"Bx",		"Db",		"Dc",
	"Do",		"Dq",		"Ec",		"Ef",
	"Em",		"Eo",		"Fx",		"Ms",
	"No",		"Ns",		"Nx",		"Ox",
	"Pc",		"Pf",		"Po",		"Pq",
	"Qc",		"Ql",		"Qo",		"Qq",
	"Re",		"Rs",		"Sc",		"So",
	"Sq",		"Sm",		"Sx",		"Sy",
	"Tn",		"Ux",		"Xc",		"Xo",
	"Fo",		"Fc",		"Oo",		"Oc",
	"Bk",		"Ek",		"Bt",		"Hf",
	"Fr",		"Ud",		"Lb",		"Lp",
	"Lk",		"Mt",		"Brq",		"Bro",
	"Brc",		"%C",		"Es",		"En",
	"Dx",		"%Q",		"br",		"sp",
	"%U",		"Ta",		"ll",
};
72241675Suqs
/*
 * Printable names of the macro arguments, MDOC_ARG_MAX slots.
 * NOTE(review): the order presumably mirrors the argument
 * enumeration in the mdoc headers -- confirm before reordering.
 */
const	char *const __mdoc_argnames[MDOC_ARG_MAX] = {
	"split",		"nosplit",		"ragged",
	"unfilled",		"literal",		"file",
	"offset",		"bullet",		"dash",
	"hyphen",		"item",			"enum",
	"tag",			"diag",			"hang",
	"ohang",		"inset",		"column",
	"width",		"compact",		"std",
	"filled",		"words",		"emphasis",
	"symbolic",		"nested",		"centered"
	};
84241675Suqs
/*
 * Public views of the name tables.
 * mdoc_macronames points one element past the start of
 * __mdoc_macronames, so index -1 yields "text" and index 0
 * yields "Ap".  mdoc_argnames aliases __mdoc_argnames unshifted.
 */
const	char * const *mdoc_macronames = __mdoc_macronames + 1;
const	char * const *mdoc_argnames = __mdoc_argnames;
87241675Suqs
88294113Sbaptstatic	int		  mdoc_ptext(struct roff_man *, int, char *, int);
89294113Sbaptstatic	int		  mdoc_pmacro(struct roff_man *, int, char *, int);
90241675Suqs
91274880Sbapt
92241675Suqs/*
93241675Suqs * Main parse routine.  Parses a single line -- really just hands off to
94241675Suqs * the macro (mdoc_pmacro()) or text parser (mdoc_ptext()).
95241675Suqs */
96241675Suqsint
97294113Sbaptmdoc_parseln(struct roff_man *mdoc, int ln, char *buf, int offs)
98241675Suqs{
99241675Suqs
100294113Sbapt	if (mdoc->last->type != ROFFT_EQN || ln > mdoc->last->line)
101275432Sbapt		mdoc->flags |= MDOC_NEWLINE;
102241675Suqs
103241675Suqs	/*
104241675Suqs	 * Let the roff nS register switch SYNOPSIS mode early,
105241675Suqs	 * such that the parser knows at all times
106241675Suqs	 * whether this mode is on or off.
107241675Suqs	 * Note that this mode is also switched by the Sh macro.
108241675Suqs	 */
109261344Suqs	if (roff_getreg(mdoc->roff, "nS"))
110261344Suqs		mdoc->flags |= MDOC_SYNOPSIS;
111261344Suqs	else
112261344Suqs		mdoc->flags &= ~MDOC_SYNOPSIS;
113241675Suqs
114294113Sbapt	return roff_getcontrol(mdoc->roff, buf, &offs) ?
115274880Sbapt	    mdoc_pmacro(mdoc, ln, buf, offs) :
116294113Sbapt	    mdoc_ptext(mdoc, ln, buf, offs);
117241675Suqs}
118241675Suqs
119275432Sbaptvoid
120241675Suqsmdoc_macro(MACRO_PROT_ARGS)
121241675Suqs{
122294113Sbapt	assert(tok > TOKEN_NONE && tok < MDOC_MAX);
123241675Suqs
124275432Sbapt	(*mdoc_macros[tok].fp)(mdoc, tok, line, ppos, pos, buf);
125241675Suqs}
126241675Suqs
127275432Sbaptvoid
128294113Sbaptmdoc_tail_alloc(struct roff_man *mdoc, int line, int pos, int tok)
129241675Suqs{
130294113Sbapt	struct roff_node *p;
131241675Suqs
132294113Sbapt	p = roff_node_alloc(mdoc, line, pos, ROFFT_TAIL, tok);
133294113Sbapt	roff_node_append(mdoc, p);
134294113Sbapt	mdoc->next = ROFF_NEXT_CHILD;
135241675Suqs}
136241675Suqs
137294113Sbaptstruct roff_node *
138294113Sbaptmdoc_endbody_alloc(struct roff_man *mdoc, int line, int pos, int tok,
139294113Sbapt		struct roff_node *body, enum mdoc_endbody end)
140241675Suqs{
141294113Sbapt	struct roff_node *p;
142241675Suqs
143279527Sbapt	body->flags |= MDOC_ENDED;
144279527Sbapt	body->parent->flags |= MDOC_ENDED;
145294113Sbapt	p = roff_node_alloc(mdoc, line, pos, ROFFT_BODY, tok);
146279527Sbapt	p->body = body;
147261344Suqs	p->norm = body->norm;
148241675Suqs	p->end = end;
149294113Sbapt	roff_node_append(mdoc, p);
150294113Sbapt	mdoc->next = ROFF_NEXT_SIBLING;
151294113Sbapt	return p;
152241675Suqs}
153241675Suqs
154294113Sbaptstruct roff_node *
155294113Sbaptmdoc_block_alloc(struct roff_man *mdoc, int line, int pos,
156294113Sbapt	int tok, struct mdoc_arg *args)
157241675Suqs{
158294113Sbapt	struct roff_node *p;
159241675Suqs
160294113Sbapt	p = roff_node_alloc(mdoc, line, pos, ROFFT_BLOCK, tok);
161241675Suqs	p->args = args;
162241675Suqs	if (p->args)
163241675Suqs		(args->refcnt)++;
164241675Suqs
165241675Suqs	switch (tok) {
166274880Sbapt	case MDOC_Bd:
167274880Sbapt	case MDOC_Bf:
168274880Sbapt	case MDOC_Bl:
169274880Sbapt	case MDOC_En:
170274880Sbapt	case MDOC_Rs:
171241675Suqs		p->norm = mandoc_calloc(1, sizeof(union mdoc_data));
172241675Suqs		break;
173241675Suqs	default:
174241675Suqs		break;
175241675Suqs	}
176294113Sbapt	roff_node_append(mdoc, p);
177294113Sbapt	mdoc->next = ROFF_NEXT_CHILD;
178294113Sbapt	return p;
179241675Suqs}
180241675Suqs
181275432Sbaptvoid
182294113Sbaptmdoc_elem_alloc(struct roff_man *mdoc, int line, int pos,
183294113Sbapt	int tok, struct mdoc_arg *args)
184241675Suqs{
185294113Sbapt	struct roff_node *p;
186241675Suqs
187294113Sbapt	p = roff_node_alloc(mdoc, line, pos, ROFFT_ELEM, tok);
188241675Suqs	p->args = args;
189241675Suqs	if (p->args)
190241675Suqs		(args->refcnt)++;
191241675Suqs
192241675Suqs	switch (tok) {
193274880Sbapt	case MDOC_An:
194241675Suqs		p->norm = mandoc_calloc(1, sizeof(union mdoc_data));
195241675Suqs		break;
196241675Suqs	default:
197241675Suqs		break;
198241675Suqs	}
199294113Sbapt	roff_node_append(mdoc, p);
200294113Sbapt	mdoc->next = ROFF_NEXT_CHILD;
201241675Suqs}
202241675Suqs
203275432Sbaptvoid
204294113Sbaptmdoc_node_relink(struct roff_man *mdoc, struct roff_node *p)
205241675Suqs{
206241675Suqs
207294113Sbapt	roff_node_unlink(mdoc, p);
208294113Sbapt	p->prev = p->next = NULL;
209294113Sbapt	roff_node_append(mdoc, p);
210241675Suqs}
211241675Suqs
212241675Suqs/*
213241675Suqs * Parse free-form text, that is, a line that does not begin with the
214241675Suqs * control character.
215241675Suqs */
216241675Suqsstatic int
217294113Sbaptmdoc_ptext(struct roff_man *mdoc, int line, char *buf, int offs)
218241675Suqs{
219294113Sbapt	struct roff_node *n;
220241675Suqs	char		 *c, *ws, *end;
221241675Suqs
222261344Suqs	assert(mdoc->last);
223261344Suqs	n = mdoc->last;
224241675Suqs
225241675Suqs	/*
226241675Suqs	 * Divert directly to list processing if we're encountering a
227294113Sbapt	 * columnar ROFFT_BLOCK with or without a prior ROFFT_BLOCK entry
228294113Sbapt	 * (a ROFFT_BODY means it's already open, in which case we should
229241675Suqs	 * process within its context in the normal way).
230241675Suqs	 */
231241675Suqs
232294113Sbapt	if (n->tok == MDOC_Bl && n->type == ROFFT_BODY &&
233279527Sbapt	    n->end == ENDBODY_NOT && n->norm->Bl.type == LIST_column) {
234241675Suqs		/* `Bl' is open without any children. */
235261344Suqs		mdoc->flags |= MDOC_FREECOL;
236275432Sbapt		mdoc_macro(mdoc, MDOC_It, line, offs, &offs, buf);
237294113Sbapt		return 1;
238241675Suqs	}
239241675Suqs
240294113Sbapt	if (n->tok == MDOC_It && n->type == ROFFT_BLOCK &&
241274880Sbapt	    NULL != n->parent &&
242274880Sbapt	    MDOC_Bl == n->parent->tok &&
243274880Sbapt	    LIST_column == n->parent->norm->Bl.type) {
244241675Suqs		/* `Bl' has block-level `It' children. */
245261344Suqs		mdoc->flags |= MDOC_FREECOL;
246275432Sbapt		mdoc_macro(mdoc, MDOC_It, line, offs, &offs, buf);
247294113Sbapt		return 1;
248241675Suqs	}
249241675Suqs
250241675Suqs	/*
251241675Suqs	 * Search for the beginning of unescaped trailing whitespace (ws)
252241675Suqs	 * and for the first character not to be output (end).
253241675Suqs	 */
254241675Suqs
255241675Suqs	/* FIXME: replace with strcspn(). */
256241675Suqs	ws = NULL;
257241675Suqs	for (c = end = buf + offs; *c; c++) {
258241675Suqs		switch (*c) {
259241675Suqs		case ' ':
260241675Suqs			if (NULL == ws)
261241675Suqs				ws = c;
262241675Suqs			continue;
263241675Suqs		case '\t':
264241675Suqs			/*
265241675Suqs			 * Always warn about trailing tabs,
266241675Suqs			 * even outside literal context,
267241675Suqs			 * where they should be put on the next line.
268241675Suqs			 */
269241675Suqs			if (NULL == ws)
270241675Suqs				ws = c;
271241675Suqs			/*
272241675Suqs			 * Strip trailing tabs in literal context only;
273241675Suqs			 * outside, they affect the next line.
274241675Suqs			 */
275261344Suqs			if (MDOC_LITERAL & mdoc->flags)
276241675Suqs				continue;
277241675Suqs			break;
278241675Suqs		case '\\':
279241675Suqs			/* Skip the escaped character, too, if any. */
280241675Suqs			if (c[1])
281241675Suqs				c++;
282241675Suqs			/* FALLTHROUGH */
283241675Suqs		default:
284241675Suqs			ws = NULL;
285241675Suqs			break;
286241675Suqs		}
287241675Suqs		end = c + 1;
288241675Suqs	}
289241675Suqs	*end = '\0';
290241675Suqs
291241675Suqs	if (ws)
292274880Sbapt		mandoc_msg(MANDOCERR_SPACE_EOL, mdoc->parse,
293274880Sbapt		    line, (int)(ws-buf), NULL);
294241675Suqs
295275432Sbapt	if (buf[offs] == '\0' && ! (mdoc->flags & MDOC_LITERAL)) {
296274880Sbapt		mandoc_msg(MANDOCERR_FI_BLANK, mdoc->parse,
297274880Sbapt		    line, (int)(c - buf), NULL);
298241675Suqs
299241675Suqs		/*
300241675Suqs		 * Insert a `sp' in the case of a blank line.  Technically,
301241675Suqs		 * blank lines aren't allowed, but enough manuals assume this
302241675Suqs		 * behaviour that we want to work around it.
303241675Suqs		 */
304294113Sbapt		roff_elem_alloc(mdoc, line, offs, MDOC_sp);
305294113Sbapt		mdoc->last->flags |= MDOC_VALID | MDOC_ENDED;
306294113Sbapt		mdoc->next = ROFF_NEXT_SIBLING;
307294113Sbapt		return 1;
308241675Suqs	}
309241675Suqs
310294113Sbapt	roff_word_alloc(mdoc, line, offs, buf+offs);
311241675Suqs
312275432Sbapt	if (mdoc->flags & MDOC_LITERAL)
313294113Sbapt		return 1;
314241675Suqs
315241675Suqs	/*
316241675Suqs	 * End-of-sentence check.  If the last character is an unescaped
317241675Suqs	 * EOS character, then flag the node as being the end of a
318241675Suqs	 * sentence.  The front-end will know how to interpret this.
319241675Suqs	 */
320241675Suqs
321241675Suqs	assert(buf < end);
322241675Suqs
323274880Sbapt	if (mandoc_eos(buf+offs, (size_t)(end-buf-offs)))
324261344Suqs		mdoc->last->flags |= MDOC_EOS;
325294113Sbapt	return 1;
326241675Suqs}
327241675Suqs
328241675Suqs/*
329241675Suqs * Parse a macro line, that is, a line beginning with the control
330241675Suqs * character.
331241675Suqs */
332241675Suqsstatic int
333294113Sbaptmdoc_pmacro(struct roff_man *mdoc, int ln, char *buf, int offs)
334241675Suqs{
335294113Sbapt	struct roff_node *n;
336275432Sbapt	const char	 *cp;
337294113Sbapt	int		  tok;
338241675Suqs	int		  i, sv;
339241675Suqs	char		  mac[5];
340241675Suqs
341241675Suqs	sv = offs;
342241675Suqs
343274880Sbapt	/*
344241675Suqs	 * Copy the first word into a nil-terminated buffer.
345275432Sbapt	 * Stop when a space, tab, escape, or eoln is encountered.
346241675Suqs	 */
347241675Suqs
348241675Suqs	i = 0;
349275432Sbapt	while (i < 4 && strchr(" \t\\", buf[offs]) == NULL)
350241675Suqs		mac[i++] = buf[offs++];
351241675Suqs
352241675Suqs	mac[i] = '\0';
353241675Suqs
354294113Sbapt	tok = (i > 1 && i < 4) ? mdoc_hash_find(mac) : TOKEN_NONE;
355241675Suqs
356294113Sbapt	if (tok == TOKEN_NONE) {
357274880Sbapt		mandoc_msg(MANDOCERR_MACRO, mdoc->parse,
358274880Sbapt		    ln, sv, buf + sv - 1);
359294113Sbapt		return 1;
360241675Suqs	}
361241675Suqs
362275432Sbapt	/* Skip a leading escape sequence or tab. */
363241675Suqs
364275432Sbapt	switch (buf[offs]) {
365275432Sbapt	case '\\':
366275432Sbapt		cp = buf + offs + 1;
367275432Sbapt		mandoc_escape(&cp, NULL, NULL);
368275432Sbapt		offs = cp - buf;
369275432Sbapt		break;
370275432Sbapt	case '\t':
371241675Suqs		offs++;
372275432Sbapt		break;
373275432Sbapt	default:
374275432Sbapt		break;
375275432Sbapt	}
376241675Suqs
377241675Suqs	/* Jump to the next non-whitespace word. */
378241675Suqs
379241675Suqs	while (buf[offs] && ' ' == buf[offs])
380241675Suqs		offs++;
381241675Suqs
382274880Sbapt	/*
383241675Suqs	 * Trailing whitespace.  Note that tabs are allowed to be passed
384241675Suqs	 * into the parser as "text", so we only warn about spaces here.
385241675Suqs	 */
386241675Suqs
387241675Suqs	if ('\0' == buf[offs] && ' ' == buf[offs - 1])
388274880Sbapt		mandoc_msg(MANDOCERR_SPACE_EOL, mdoc->parse,
389274880Sbapt		    ln, offs - 1, NULL);
390241675Suqs
391241675Suqs	/*
392241675Suqs	 * If an initial macro or a list invocation, divert directly
393241675Suqs	 * into macro processing.
394241675Suqs	 */
395241675Suqs
396275432Sbapt	if (NULL == mdoc->last || MDOC_It == tok || MDOC_El == tok) {
397275432Sbapt		mdoc_macro(mdoc, tok, ln, sv, &offs, buf);
398294113Sbapt		return 1;
399275432Sbapt	}
400241675Suqs
401261344Suqs	n = mdoc->last;
402261344Suqs	assert(mdoc->last);
403241675Suqs
404241675Suqs	/*
405241675Suqs	 * If the first macro of a `Bl -column', open an `It' block
406241675Suqs	 * context around the parsed macro.
407241675Suqs	 */
408241675Suqs
409294113Sbapt	if (n->tok == MDOC_Bl && n->type == ROFFT_BODY &&
410279527Sbapt	    n->end == ENDBODY_NOT && n->norm->Bl.type == LIST_column) {
411261344Suqs		mdoc->flags |= MDOC_FREECOL;
412275432Sbapt		mdoc_macro(mdoc, MDOC_It, ln, sv, &sv, buf);
413294113Sbapt		return 1;
414241675Suqs	}
415241675Suqs
416241675Suqs	/*
417241675Suqs	 * If we're following a block-level `It' within a `Bl -column'
418241675Suqs	 * context (perhaps opened in the above block or in ptext()),
419241675Suqs	 * then open an `It' block context around the parsed macro.
420241675Suqs	 */
421241675Suqs
422294113Sbapt	if (n->tok == MDOC_It && n->type == ROFFT_BLOCK &&
423274880Sbapt	    NULL != n->parent &&
424274880Sbapt	    MDOC_Bl == n->parent->tok &&
425274880Sbapt	    LIST_column == n->parent->norm->Bl.type) {
426261344Suqs		mdoc->flags |= MDOC_FREECOL;
427275432Sbapt		mdoc_macro(mdoc, MDOC_It, ln, sv, &sv, buf);
428294113Sbapt		return 1;
429241675Suqs	}
430241675Suqs
431241675Suqs	/* Normal processing of a macro. */
432241675Suqs
433275432Sbapt	mdoc_macro(mdoc, tok, ln, sv, &offs, buf);
434241675Suqs
435274880Sbapt	/* In quick mode (for mandocdb), abort after the NAME section. */
436241675Suqs
437274880Sbapt	if (mdoc->quick && MDOC_Sh == tok &&
438274880Sbapt	    SEC_NAME != mdoc->last->sec)
439294113Sbapt		return 2;
440241675Suqs
441294113Sbapt	return 1;
442241675Suqs}
443241675Suqs
444241675Suqsenum mdelim
445241675Suqsmdoc_isdelim(const char *p)
446241675Suqs{
447241675Suqs
448241675Suqs	if ('\0' == p[0])
449294113Sbapt		return DELIM_NONE;
450241675Suqs
451241675Suqs	if ('\0' == p[1])
452241675Suqs		switch (p[0]) {
453274880Sbapt		case '(':
454274880Sbapt		case '[':
455294113Sbapt			return DELIM_OPEN;
456274880Sbapt		case '|':
457294113Sbapt			return DELIM_MIDDLE;
458274880Sbapt		case '.':
459274880Sbapt		case ',':
460274880Sbapt		case ';':
461274880Sbapt		case ':':
462274880Sbapt		case '?':
463274880Sbapt		case '!':
464274880Sbapt		case ')':
465274880Sbapt		case ']':
466294113Sbapt			return DELIM_CLOSE;
467241675Suqs		default:
468294113Sbapt			return DELIM_NONE;
469241675Suqs		}
470241675Suqs
471241675Suqs	if ('\\' != p[0])
472294113Sbapt		return DELIM_NONE;
473241675Suqs
474241675Suqs	if (0 == strcmp(p + 1, "."))
475294113Sbapt		return DELIM_CLOSE;
476261344Suqs	if (0 == strcmp(p + 1, "fR|\\fP"))
477294113Sbapt		return DELIM_MIDDLE;
478241675Suqs
479294113Sbapt	return DELIM_NONE;
480241675Suqs}
481274880Sbapt
/*
 * Validate the parsed document: reset the current node to the
 * first node of the tree, run mdoc_node_validate() from there,
 * and finally call mdoc_state_reset().
 */
void
mdoc_validate(struct roff_man *mdoc)
{

	mdoc->last = mdoc->first;
	mdoc_node_validate(mdoc);
	mdoc_state_reset(mdoc);
}
490