1/*	$Id: mdoc.c,v 1.274 2018/12/31 07:46:07 schwarze Exp $ */
2/*
3 * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
4 * Copyright (c) 2010, 2012-2018 Ingo Schwarze <schwarze@openbsd.org>
5 *
6 * Permission to use, copy, modify, and distribute this software for any
7 * purpose with or without fee is hereby granted, provided that the above
8 * copyright notice and this permission notice appear in all copies.
9 *
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 */
18#include "config.h"
19
20#include <sys/types.h>
21
22#include <assert.h>
23#include <ctype.h>
24#include <stdarg.h>
25#include <stdio.h>
26#include <stdlib.h>
27#include <string.h>
28#include <time.h>
29
30#include "mandoc_aux.h"
31#include "mandoc.h"
32#include "roff.h"
33#include "mdoc.h"
34#include "libmandoc.h"
35#include "roff_int.h"
36#include "libmdoc.h"
37
38const	char *const __mdoc_argnames[MDOC_ARG_MAX] = {
39	"split",		"nosplit",		"ragged",
40	"unfilled",		"literal",		"file",
41	"offset",		"bullet",		"dash",
42	"hyphen",		"item",			"enum",
43	"tag",			"diag",			"hang",
44	"ohang",		"inset",		"column",
45	"width",		"compact",		"std",
46	"filled",		"words",		"emphasis",
47	"symbolic",		"nested",		"centered"
48};
49const	char * const *mdoc_argnames = __mdoc_argnames;
50
/*
 * File-local line parsers, defined further down:
 * mdoc_ptext() handles text lines, mdoc_pmacro() handles lines
 * starting with the control character.  Both are dispatched
 * from mdoc_parseln().
 */
static	int		  mdoc_ptext(struct roff_man *, int, char *, int);
static	int		  mdoc_pmacro(struct roff_man *, int, char *, int);
53
54
55/*
56 * Main parse routine.  Parses a single line -- really just hands off to
57 * the macro (mdoc_pmacro()) or text parser (mdoc_ptext()).
58 */
59int
60mdoc_parseln(struct roff_man *mdoc, int ln, char *buf, int offs)
61{
62
63	if (mdoc->last->type != ROFFT_EQN || ln > mdoc->last->line)
64		mdoc->flags |= MDOC_NEWLINE;
65
66	/*
67	 * Let the roff nS register switch SYNOPSIS mode early,
68	 * such that the parser knows at all times
69	 * whether this mode is on or off.
70	 * Note that this mode is also switched by the Sh macro.
71	 */
72	if (roff_getreg(mdoc->roff, "nS"))
73		mdoc->flags |= MDOC_SYNOPSIS;
74	else
75		mdoc->flags &= ~MDOC_SYNOPSIS;
76
77	return roff_getcontrol(mdoc->roff, buf, &offs) ?
78	    mdoc_pmacro(mdoc, ln, buf, offs) :
79	    mdoc_ptext(mdoc, ln, buf, offs);
80}
81
82void
83mdoc_tail_alloc(struct roff_man *mdoc, int line, int pos, enum roff_tok tok)
84{
85	struct roff_node *p;
86
87	p = roff_node_alloc(mdoc, line, pos, ROFFT_TAIL, tok);
88	roff_node_append(mdoc, p);
89	mdoc->next = ROFF_NEXT_CHILD;
90}
91
92struct roff_node *
93mdoc_endbody_alloc(struct roff_man *mdoc, int line, int pos,
94    enum roff_tok tok, struct roff_node *body)
95{
96	struct roff_node *p;
97
98	body->flags |= NODE_ENDED;
99	body->parent->flags |= NODE_ENDED;
100	p = roff_node_alloc(mdoc, line, pos, ROFFT_BODY, tok);
101	p->body = body;
102	p->norm = body->norm;
103	p->end = ENDBODY_SPACE;
104	roff_node_append(mdoc, p);
105	mdoc->next = ROFF_NEXT_SIBLING;
106	return p;
107}
108
109struct roff_node *
110mdoc_block_alloc(struct roff_man *mdoc, int line, int pos,
111    enum roff_tok tok, struct mdoc_arg *args)
112{
113	struct roff_node *p;
114
115	p = roff_node_alloc(mdoc, line, pos, ROFFT_BLOCK, tok);
116	p->args = args;
117	if (p->args)
118		(args->refcnt)++;
119
120	switch (tok) {
121	case MDOC_Bd:
122	case MDOC_Bf:
123	case MDOC_Bl:
124	case MDOC_En:
125	case MDOC_Rs:
126		p->norm = mandoc_calloc(1, sizeof(union mdoc_data));
127		break;
128	default:
129		break;
130	}
131	roff_node_append(mdoc, p);
132	mdoc->next = ROFF_NEXT_CHILD;
133	return p;
134}
135
136void
137mdoc_elem_alloc(struct roff_man *mdoc, int line, int pos,
138     enum roff_tok tok, struct mdoc_arg *args)
139{
140	struct roff_node *p;
141
142	p = roff_node_alloc(mdoc, line, pos, ROFFT_ELEM, tok);
143	p->args = args;
144	if (p->args)
145		(args->refcnt)++;
146
147	switch (tok) {
148	case MDOC_An:
149		p->norm = mandoc_calloc(1, sizeof(union mdoc_data));
150		break;
151	default:
152		break;
153	}
154	roff_node_append(mdoc, p);
155	mdoc->next = ROFF_NEXT_CHILD;
156}
157
158/*
159 * Parse free-form text, that is, a line that does not begin with the
160 * control character.
161 */
162static int
163mdoc_ptext(struct roff_man *mdoc, int line, char *buf, int offs)
164{
165	struct roff_node *n;
166	const char	 *cp, *sp;
167	char		 *c, *ws, *end;
168
169	n = mdoc->last;
170
171	/*
172	 * If a column list contains plain text, assume an implicit item
173	 * macro.  This can happen one or more times at the beginning
174	 * of such a list, intermixed with non-It mdoc macros and with
175	 * nodes generated on the roff level, for example by tbl.
176	 */
177
178	if ((n->tok == MDOC_Bl && n->type == ROFFT_BODY &&
179	     n->end == ENDBODY_NOT && n->norm->Bl.type == LIST_column) ||
180	    (n->parent != NULL && n->parent->tok == MDOC_Bl &&
181	     n->parent->norm->Bl.type == LIST_column)) {
182		mdoc->flags |= MDOC_FREECOL;
183		(*mdoc_macro(MDOC_It)->fp)(mdoc, MDOC_It,
184		    line, offs, &offs, buf);
185		return 1;
186	}
187
188	/*
189	 * Search for the beginning of unescaped trailing whitespace (ws)
190	 * and for the first character not to be output (end).
191	 */
192
193	/* FIXME: replace with strcspn(). */
194	ws = NULL;
195	for (c = end = buf + offs; *c; c++) {
196		switch (*c) {
197		case ' ':
198			if (NULL == ws)
199				ws = c;
200			continue;
201		case '\t':
202			/*
203			 * Always warn about trailing tabs,
204			 * even outside literal context,
205			 * where they should be put on the next line.
206			 */
207			if (NULL == ws)
208				ws = c;
209			/*
210			 * Strip trailing tabs in literal context only;
211			 * outside, they affect the next line.
212			 */
213			if (mdoc->flags & ROFF_NOFILL)
214				continue;
215			break;
216		case '\\':
217			/* Skip the escaped character, too, if any. */
218			if (c[1])
219				c++;
220			/* FALLTHROUGH */
221		default:
222			ws = NULL;
223			break;
224		}
225		end = c + 1;
226	}
227	*end = '\0';
228
229	if (ws)
230		mandoc_msg(MANDOCERR_SPACE_EOL, line, (int)(ws - buf), NULL);
231
232	/*
233	 * Blank lines are allowed in no-fill mode
234	 * and cancel preceding \c,
235	 * but add a single vertical space elsewhere.
236	 */
237
238	if (buf[offs] == '\0' && (mdoc->flags & ROFF_NOFILL) == 0) {
239		switch (mdoc->last->type) {
240		case ROFFT_TEXT:
241			sp = mdoc->last->string;
242			cp = end = strchr(sp, '\0') - 2;
243			if (cp < sp || cp[0] != '\\' || cp[1] != 'c')
244				break;
245			while (cp > sp && cp[-1] == '\\')
246				cp--;
247			if ((end - cp) % 2)
248				break;
249			*end = '\0';
250			return 1;
251		default:
252			break;
253		}
254		mandoc_msg(MANDOCERR_FI_BLANK, line, (int)(c - buf), NULL);
255		roff_elem_alloc(mdoc, line, offs, ROFF_sp);
256		mdoc->last->flags |= NODE_VALID | NODE_ENDED;
257		mdoc->next = ROFF_NEXT_SIBLING;
258		return 1;
259	}
260
261	roff_word_alloc(mdoc, line, offs, buf+offs);
262
263	if (mdoc->flags & ROFF_NOFILL)
264		return 1;
265
266	/*
267	 * End-of-sentence check.  If the last character is an unescaped
268	 * EOS character, then flag the node as being the end of a
269	 * sentence.  The front-end will know how to interpret this.
270	 */
271
272	assert(buf < end);
273
274	if (mandoc_eos(buf+offs, (size_t)(end-buf-offs)))
275		mdoc->last->flags |= NODE_EOS;
276
277	for (c = buf + offs; c != NULL; c = strchr(c + 1, '.')) {
278		if (c - buf < offs + 2)
279			continue;
280		if (end - c < 3)
281			break;
282		if (c[1] != ' ' ||
283		    isalnum((unsigned char)c[-2]) == 0 ||
284		    isalnum((unsigned char)c[-1]) == 0 ||
285		    (c[-2] == 'n' && c[-1] == 'c') ||
286		    (c[-2] == 'v' && c[-1] == 's'))
287			continue;
288		c += 2;
289		if (*c == ' ')
290			c++;
291		if (*c == ' ')
292			c++;
293		if (isupper((unsigned char)(*c)))
294			mandoc_msg(MANDOCERR_EOS, line, (int)(c - buf), NULL);
295	}
296
297	return 1;
298}
299
300/*
301 * Parse a macro line, that is, a line beginning with the control
302 * character.
303 */
304static int
305mdoc_pmacro(struct roff_man *mdoc, int ln, char *buf, int offs)
306{
307	struct roff_node *n;
308	const char	 *cp;
309	size_t		  sz;
310	enum roff_tok	  tok;
311	int		  sv;
312
313	/* Determine the line macro. */
314
315	sv = offs;
316	tok = TOKEN_NONE;
317	for (sz = 0; sz < 4 && strchr(" \t\\", buf[offs]) == NULL; sz++)
318		offs++;
319	if (sz == 2 || sz == 3)
320		tok = roffhash_find(mdoc->mdocmac, buf + sv, sz);
321	if (tok == TOKEN_NONE) {
322		mandoc_msg(MANDOCERR_MACRO, ln, sv, "%s", buf + sv - 1);
323		return 1;
324	}
325
326	/* Skip a leading escape sequence or tab. */
327
328	switch (buf[offs]) {
329	case '\\':
330		cp = buf + offs + 1;
331		mandoc_escape(&cp, NULL, NULL);
332		offs = cp - buf;
333		break;
334	case '\t':
335		offs++;
336		break;
337	default:
338		break;
339	}
340
341	/* Jump to the next non-whitespace word. */
342
343	while (buf[offs] == ' ')
344		offs++;
345
346	/*
347	 * Trailing whitespace.  Note that tabs are allowed to be passed
348	 * into the parser as "text", so we only warn about spaces here.
349	 */
350
351	if ('\0' == buf[offs] && ' ' == buf[offs - 1])
352		mandoc_msg(MANDOCERR_SPACE_EOL, ln, offs - 1, NULL);
353
354	/*
355	 * If an initial macro or a list invocation, divert directly
356	 * into macro processing.
357	 */
358
359	n = mdoc->last;
360	if (n == NULL || tok == MDOC_It || tok == MDOC_El) {
361		(*mdoc_macro(tok)->fp)(mdoc, tok, ln, sv, &offs, buf);
362		return 1;
363	}
364
365	/*
366	 * If a column list contains a non-It macro, assume an implicit
367	 * item macro.  This can happen one or more times at the
368	 * beginning of such a list, intermixed with text lines and
369	 * with nodes generated on the roff level, for example by tbl.
370	 */
371
372	if ((n->tok == MDOC_Bl && n->type == ROFFT_BODY &&
373	     n->end == ENDBODY_NOT && n->norm->Bl.type == LIST_column) ||
374	    (n->parent != NULL && n->parent->tok == MDOC_Bl &&
375	     n->parent->norm->Bl.type == LIST_column)) {
376		mdoc->flags |= MDOC_FREECOL;
377		(*mdoc_macro(MDOC_It)->fp)(mdoc, MDOC_It, ln, sv, &sv, buf);
378		return 1;
379	}
380
381	/* Normal processing of a macro. */
382
383	(*mdoc_macro(tok)->fp)(mdoc, tok, ln, sv, &offs, buf);
384
385	/* In quick mode (for mandocdb), abort after the NAME section. */
386
387	if (mdoc->quick && MDOC_Sh == tok &&
388	    SEC_NAME != mdoc->last->sec)
389		return 2;
390
391	return 1;
392}
393
394enum mdelim
395mdoc_isdelim(const char *p)
396{
397
398	if ('\0' == p[0])
399		return DELIM_NONE;
400
401	if ('\0' == p[1])
402		switch (p[0]) {
403		case '(':
404		case '[':
405			return DELIM_OPEN;
406		case '|':
407			return DELIM_MIDDLE;
408		case '.':
409		case ',':
410		case ';':
411		case ':':
412		case '?':
413		case '!':
414		case ')':
415		case ']':
416			return DELIM_CLOSE;
417		default:
418			return DELIM_NONE;
419		}
420
421	if ('\\' != p[0])
422		return DELIM_NONE;
423
424	if (0 == strcmp(p + 1, "."))
425		return DELIM_CLOSE;
426	if (0 == strcmp(p + 1, "fR|\\fP"))
427		return DELIM_MIDDLE;
428
429	return DELIM_NONE;
430}
431