1/*	$Id: mandoc.c,v 1.103 2017/07/03 13:40:19 schwarze Exp $ */
2/*
3 * Copyright (c) 2008-2011, 2014 Kristaps Dzonsons <kristaps@bsd.lv>
4 * Copyright (c) 2011-2015, 2017 Ingo Schwarze <schwarze@openbsd.org>
5 *
6 * Permission to use, copy, modify, and distribute this software for any
7 * purpose with or without fee is hereby granted, provided that the above
8 * copyright notice and this permission notice appear in all copies.
9 *
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 */
18#include "config.h"
19
20#include <sys/types.h>
21
22#include <assert.h>
23#include <ctype.h>
24#include <errno.h>
25#include <limits.h>
26#include <stdlib.h>
27#include <stdio.h>
28#include <string.h>
29#include <time.h>
30
31#include "mandoc_aux.h"
32#include "mandoc.h"
33#include "roff.h"
34#include "libmandoc.h"
35
36static	int	 a2time(time_t *, const char *, const char *);
37static	char	*time2a(time_t);
38
39
40enum mandoc_esc
41mandoc_escape(const char **end, const char **start, int *sz)
42{
43	const char	*local_start;
44	int		 local_sz;
45	char		 term;
46	enum mandoc_esc	 gly;
47
48	/*
49	 * When the caller doesn't provide return storage,
50	 * use local storage.
51	 */
52
53	if (NULL == start)
54		start = &local_start;
55	if (NULL == sz)
56		sz = &local_sz;
57
58	/*
59	 * Beyond the backslash, at least one input character
60	 * is part of the escape sequence.  With one exception
61	 * (see below), that character won't be returned.
62	 */
63
64	gly = ESCAPE_ERROR;
65	*start = ++*end;
66	*sz = 0;
67	term = '\0';
68
69	switch ((*start)[-1]) {
70	/*
71	 * First the glyphs.  There are several different forms of
72	 * these, but each eventually returns a substring of the glyph
73	 * name.
74	 */
75	case '(':
76		gly = ESCAPE_SPECIAL;
77		*sz = 2;
78		break;
79	case '[':
80		gly = ESCAPE_SPECIAL;
81		term = ']';
82		break;
83	case 'C':
84		if ('\'' != **start)
85			return ESCAPE_ERROR;
86		*start = ++*end;
87		gly = ESCAPE_SPECIAL;
88		term = '\'';
89		break;
90
91	/*
92	 * Escapes taking no arguments at all.
93	 */
94	case 'd':
95	case 'u':
96	case ',':
97	case '/':
98		return ESCAPE_IGNORE;
99	case 'p':
100		return ESCAPE_BREAK;
101
102	/*
103	 * The \z escape is supposed to output the following
104	 * character without advancing the cursor position.
105	 * Since we are mostly dealing with terminal mode,
106	 * let us just skip the next character.
107	 */
108	case 'z':
109		return ESCAPE_SKIPCHAR;
110
111	/*
112	 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
113	 * 'X' is the trigger.  These have opaque sub-strings.
114	 */
115	case 'F':
116	case 'g':
117	case 'k':
118	case 'M':
119	case 'm':
120	case 'n':
121	case 'V':
122	case 'Y':
123		gly = ESCAPE_IGNORE;
124		/* FALLTHROUGH */
125	case 'f':
126		if (ESCAPE_ERROR == gly)
127			gly = ESCAPE_FONT;
128		switch (**start) {
129		case '(':
130			*start = ++*end;
131			*sz = 2;
132			break;
133		case '[':
134			*start = ++*end;
135			term = ']';
136			break;
137		default:
138			*sz = 1;
139			break;
140		}
141		break;
142
143	/*
144	 * These escapes are of the form \X'Y', where 'X' is the trigger
145	 * and 'Y' is any string.  These have opaque sub-strings.
146	 * The \B and \w escapes are handled in roff.c, roff_res().
147	 */
148	case 'A':
149	case 'b':
150	case 'D':
151	case 'R':
152	case 'X':
153	case 'Z':
154		gly = ESCAPE_IGNORE;
155		/* FALLTHROUGH */
156	case 'o':
157		if (**start == '\0')
158			return ESCAPE_ERROR;
159		if (gly == ESCAPE_ERROR)
160			gly = ESCAPE_OVERSTRIKE;
161		term = **start;
162		*start = ++*end;
163		break;
164
165	/*
166	 * These escapes are of the form \X'N', where 'X' is the trigger
167	 * and 'N' resolves to a numerical expression.
168	 */
169	case 'h':
170	case 'H':
171	case 'L':
172	case 'l':
173	case 'S':
174	case 'v':
175	case 'x':
176		if (strchr(" %&()*+-./0123456789:<=>", **start)) {
177			if ('\0' != **start)
178				++*end;
179			return ESCAPE_ERROR;
180		}
181		switch ((*start)[-1]) {
182		case 'h':
183			gly = ESCAPE_HORIZ;
184			break;
185		case 'l':
186			gly = ESCAPE_HLINE;
187			break;
188		default:
189			gly = ESCAPE_IGNORE;
190			break;
191		}
192		term = **start;
193		*start = ++*end;
194		break;
195
196	/*
197	 * Special handling for the numbered character escape.
198	 * XXX Do any other escapes need similar handling?
199	 */
200	case 'N':
201		if ('\0' == **start)
202			return ESCAPE_ERROR;
203		(*end)++;
204		if (isdigit((unsigned char)**start)) {
205			*sz = 1;
206			return ESCAPE_IGNORE;
207		}
208		(*start)++;
209		while (isdigit((unsigned char)**end))
210			(*end)++;
211		*sz = *end - *start;
212		if ('\0' != **end)
213			(*end)++;
214		return ESCAPE_NUMBERED;
215
216	/*
217	 * Sizes get a special category of their own.
218	 */
219	case 's':
220		gly = ESCAPE_IGNORE;
221
222		/* See +/- counts as a sign. */
223		if ('+' == **end || '-' == **end || ASCII_HYPH == **end)
224			*start = ++*end;
225
226		switch (**end) {
227		case '(':
228			*start = ++*end;
229			*sz = 2;
230			break;
231		case '[':
232			*start = ++*end;
233			term = ']';
234			break;
235		case '\'':
236			*start = ++*end;
237			term = '\'';
238			break;
239		case '3':
240		case '2':
241		case '1':
242			*sz = (*end)[-1] == 's' &&
243			    isdigit((unsigned char)(*end)[1]) ? 2 : 1;
244			break;
245		default:
246			*sz = 1;
247			break;
248		}
249
250		break;
251
252	/*
253	 * Anything else is assumed to be a glyph.
254	 * In this case, pass back the character after the backslash.
255	 */
256	default:
257		gly = ESCAPE_SPECIAL;
258		*start = --*end;
259		*sz = 1;
260		break;
261	}
262
263	assert(ESCAPE_ERROR != gly);
264
265	/*
266	 * Read up to the terminating character,
267	 * paying attention to nested escapes.
268	 */
269
270	if ('\0' != term) {
271		while (**end != term) {
272			switch (**end) {
273			case '\0':
274				return ESCAPE_ERROR;
275			case '\\':
276				(*end)++;
277				if (ESCAPE_ERROR ==
278				    mandoc_escape(end, NULL, NULL))
279					return ESCAPE_ERROR;
280				break;
281			default:
282				(*end)++;
283				break;
284			}
285		}
286		*sz = (*end)++ - *start;
287	} else {
288		assert(*sz > 0);
289		if ((size_t)*sz > strlen(*start))
290			return ESCAPE_ERROR;
291		*end += *sz;
292	}
293
294	/* Run post-processors. */
295
296	switch (gly) {
297	case ESCAPE_FONT:
298		if (2 == *sz) {
299			if ('C' == **start) {
300				/*
301				 * Treat constant-width font modes
302				 * just like regular font modes.
303				 */
304				(*start)++;
305				(*sz)--;
306			} else {
307				if ('B' == (*start)[0] && 'I' == (*start)[1])
308					gly = ESCAPE_FONTBI;
309				break;
310			}
311		} else if (1 != *sz)
312			break;
313
314		switch (**start) {
315		case '3':
316		case 'B':
317			gly = ESCAPE_FONTBOLD;
318			break;
319		case '2':
320		case 'I':
321			gly = ESCAPE_FONTITALIC;
322			break;
323		case 'P':
324			gly = ESCAPE_FONTPREV;
325			break;
326		case '1':
327		case 'R':
328			gly = ESCAPE_FONTROMAN;
329			break;
330		}
331		break;
332	case ESCAPE_SPECIAL:
333		if (1 == *sz && 'c' == **start)
334			gly = ESCAPE_NOSPACE;
335		/*
336		 * Unicode escapes are defined in groff as \[u0000]
337		 * to \[u10FFFF], where the contained value must be
338		 * a valid Unicode codepoint.  Here, however, only
339		 * check the length and range.
340		 */
341		if (**start != 'u' || *sz < 5 || *sz > 7)
342			break;
343		if (*sz == 7 && ((*start)[1] != '1' || (*start)[2] != '0'))
344			break;
345		if (*sz == 6 && (*start)[1] == '0')
346			break;
347		if (*sz == 5 && (*start)[1] == 'D' &&
348		    strchr("89ABCDEF", (*start)[2]) != NULL)
349			break;
350		if ((int)strspn(*start + 1, "0123456789ABCDEFabcdef")
351		    + 1 == *sz)
352			gly = ESCAPE_UNICODE;
353		break;
354	default:
355		break;
356	}
357
358	return gly;
359}
360
361/*
362 * Parse a quoted or unquoted roff-style request or macro argument.
363 * Return a pointer to the parsed argument, which is either the original
364 * pointer or advanced by one byte in case the argument is quoted.
365 * NUL-terminate the argument in place.
366 * Collapse pairs of quotes inside quoted arguments.
367 * Advance the argument pointer to the next argument,
368 * or to the NUL byte terminating the argument line.
369 */
370char *
371mandoc_getarg(struct mparse *parse, char **cpp, int ln, int *pos)
372{
373	char	 *start, *cp;
374	int	  quoted, pairs, white;
375
376	/* Quoting can only start with a new word. */
377	start = *cpp;
378	quoted = 0;
379	if ('"' == *start) {
380		quoted = 1;
381		start++;
382	}
383
384	pairs = 0;
385	white = 0;
386	for (cp = start; '\0' != *cp; cp++) {
387
388		/*
389		 * Move the following text left
390		 * after quoted quotes and after "\\" and "\t".
391		 */
392		if (pairs)
393			cp[-pairs] = cp[0];
394
395		if ('\\' == cp[0]) {
396			/*
397			 * In copy mode, translate double to single
398			 * backslashes and backslash-t to literal tabs.
399			 */
400			switch (cp[1]) {
401			case 't':
402				cp[0] = '\t';
403				/* FALLTHROUGH */
404			case '\\':
405				pairs++;
406				cp++;
407				break;
408			case ' ':
409				/* Skip escaped blanks. */
410				if (0 == quoted)
411					cp++;
412				break;
413			default:
414				break;
415			}
416		} else if (0 == quoted) {
417			if (' ' == cp[0]) {
418				/* Unescaped blanks end unquoted args. */
419				white = 1;
420				break;
421			}
422		} else if ('"' == cp[0]) {
423			if ('"' == cp[1]) {
424				/* Quoted quotes collapse. */
425				pairs++;
426				cp++;
427			} else {
428				/* Unquoted quotes end quoted args. */
429				quoted = 2;
430				break;
431			}
432		}
433	}
434
435	/* Quoted argument without a closing quote. */
436	if (1 == quoted)
437		mandoc_msg(MANDOCERR_ARG_QUOTE, parse, ln, *pos, NULL);
438
439	/* NUL-terminate this argument and move to the next one. */
440	if (pairs)
441		cp[-pairs] = '\0';
442	if ('\0' != *cp) {
443		*cp++ = '\0';
444		while (' ' == *cp)
445			cp++;
446	}
447	*pos += (int)(cp - start) + (quoted ? 1 : 0);
448	*cpp = cp;
449
450	if ('\0' == *cp && (white || ' ' == cp[-1]))
451		mandoc_msg(MANDOCERR_SPACE_EOL, parse, ln, *pos, NULL);
452
453	return start;
454}
455
/*
 * Parse the string p according to the strptime(3) format fmt.
 * The whole string has to match the format.
 * On success, store the resulting time in *t and return 1.
 * Return 0 and leave *t untouched if the string does not match,
 * if the time cannot be represented, or if strptime(3) is not
 * available.
 */
static int
a2time(time_t *t, const char *fmt, const char *p)
{
	struct tm	 tm;
	char		*pp;
	time_t		 res;

	memset(&tm, 0, sizeof(tm));

	pp = NULL;
#if HAVE_STRPTIME
	pp = strptime(p, fmt, &tm);
#endif
	if (pp == NULL || *pp != '\0')
		return 0;

	/*
	 * mktime(3) reports failure as (time_t)-1.  The formats used
	 * in this file contain no time of day, so that value cannot
	 * result from a successfully parsed date.
	 */
	res = mktime(&tm);
	if (res == (time_t)-1)
		return 0;

	*t = res;
	return 1;
}
475
/*
 * Format the local date of t as "Month D, YYYY".
 * Return an allocated string that the caller has to free(3).
 * The callers in this file pass the result on without checking it,
 * so return an allocated empty string rather than NULL when the
 * date cannot be formatted.
 */
static char *
time2a(time_t t)
{
	struct tm	*tm;
	char		*buf, *p;
	size_t		 ssz;
	int		 isz;

	tm = localtime(&t);
	if (tm == NULL)
		return mandoc_strdup("");

	/*
	 * Reserve space:
	 * up to 9 characters for the month (September) + blank
	 * up to 2 characters for the day + comma + blank
	 * 4 characters for the year and a terminating '\0'
	 */

	p = buf = mandoc_malloc(10 + 4 + 4 + 1);

	if ((ssz = strftime(p, 10 + 1, "%B ", tm)) == 0)
		goto fail;
	p += ssz;

	/*
	 * The output format is just "%d" here, not "%2d" or "%02d".
	 * That's also the reason why we can't just format the
	 * date as a whole with "%B %e, %Y" or "%B %d, %Y".
	 * Besides, the present approach is less prone to buffer
	 * overflows, in case anybody should ever introduce the bug
	 * of looking at LC_TIME.
	 */

	isz = snprintf(p, 4 + 1, "%d, ", tm->tm_mday);
	if (isz < 0 || isz > 4)
		goto fail;	/* Output error or truncation. */
	p += isz;

	/* Years of more than four digits do not fit and fail. */
	if (strftime(p, 4 + 1, "%Y", tm) == 0)
		goto fail;
	return buf;

fail:
	free(buf);
	return mandoc_strdup("");
}
522
523char *
524mandoc_normdate(struct roff_man *man, char *in, int ln, int pos)
525{
526	char		*cp;
527	time_t		 t;
528
529	/* No date specified: use today's date. */
530
531	if (in == NULL || *in == '\0' || strcmp(in, "$" "Mdocdate$") == 0) {
532		mandoc_msg(MANDOCERR_DATE_MISSING, man->parse, ln, pos, NULL);
533		return time2a(time(NULL));
534	}
535
536	/* Valid mdoc(7) date format. */
537
538	if (a2time(&t, "$" "Mdocdate: %b %d %Y $", in) ||
539	    a2time(&t, "%b %d, %Y", in)) {
540		cp = time2a(t);
541		if (t > time(NULL) + 86400)
542			mandoc_msg(MANDOCERR_DATE_FUTURE, man->parse,
543			    ln, pos, cp);
544		return cp;
545	}
546
547	/* In man(7), do not warn about the legacy format. */
548
549	if (a2time(&t, "%Y-%m-%d", in) == 0)
550		mandoc_msg(MANDOCERR_DATE_BAD, man->parse, ln, pos, in);
551	else if (t > time(NULL) + 86400)
552		mandoc_msg(MANDOCERR_DATE_FUTURE, man->parse, ln, pos, in);
553	else if (man->macroset == MACROSET_MDOC)
554		mandoc_vmsg(MANDOCERR_DATE_LEGACY, man->parse,
555		    ln, pos, "Dd %s", in);
556
557	/* Use any non-mdoc(7) date verbatim. */
558
559	return mandoc_strdup(in);
560}
561
/*
 * Decide whether the sz bytes starting at p end a sentence.
 * Scanning backwards from the end, closing delimiters (quotes,
 * brackets, parentheses) and end-of-sentence punctuation (. ! ?)
 * are skipped until some other character is found.
 * Return 1 if punctuation was seen and either no closing delimiter
 * trailed it or the character in front of it is alphanumeric;
 * return 0 otherwise, in particular for empty input.
 */
int
mandoc_eos(const char *p, size_t sz)
{
	size_t		 i;
	int		 enclosed, found;

	/*
	 * End-of-sentence recognition must include situations where
	 * some symbols, such as `)', allow prior EOS punctuation to
	 * propagate outward.
	 *
	 * Walk backwards with an index rather than a pointer:  that
	 * never forms a pointer in front of p and does not squeeze
	 * sz into an int.  For sz == 0, the loop does not run.
	 */

	enclosed = found = 0;
	for (i = sz; i > 0; i--) {
		switch (p[i - 1]) {
		case '\"':
		case '\'':
		case ']':
		case ')':
			/* Only delimiters trailing the punctuation count. */
			if (0 == found)
				enclosed = 1;
			break;
		case '.':
		case '!':
		case '?':
			found = 1;
			break;
		default:
			return found && (!enclosed ||
			    isalnum((unsigned char)p[i - 1]));
		}
	}

	/* Nothing but punctuation and closing delimiters. */
	return found && !enclosed;
}
600
/*
 * Convert the first sz bytes of p to an int with strtol(3),
 * using the given base.
 * Return -1 if sz exceeds 31 bytes, if the input is empty, or if
 * it is not a number in its entirety.  Note that -1 is also the
 * result for the valid input "-1".
 * Values that do not fit are clamped to INT_MIN or INT_MAX.
 * As a side effect, errno is reset before the conversion.
 */
int
mandoc_strntoi(const char *p, size_t sz, int base)
{
	char		 digits[32];
	char		*stop;
	long		 val;

	/* One byte is needed for the terminating NUL. */
	if (sz >= sizeof(digits))
		return -1;

	/* The input need not be NUL-terminated, so work on a copy. */
	memcpy(digits, p, sz);
	digits[sz] = '\0';

	errno = 0;
	val = strtol(digits, &stop, base);

	/* Reject empty input and trailing garbage. */
	if (digits[0] == '\0' || *stop != '\0')
		return -1;

	if (val > INT_MAX)
		return INT_MAX;
	if (val < INT_MIN)
		return INT_MIN;
	return (int)val;
}
631