read.c revision 1.21
1/*	Id: read.c,v 1.211 2019/01/11 17:04:44 schwarze Exp  */
2/*
3 * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
4 * Copyright (c) 2010-2019 Ingo Schwarze <schwarze@openbsd.org>
5 * Copyright (c) 2010, 2012 Joerg Sonnenberger <joerg@netbsd.org>
6 *
7 * Permission to use, copy, modify, and distribute this software for any
8 * purpose with or without fee is hereby granted, provided that the above
9 * copyright notice and this permission notice appear in all copies.
10 *
11 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
12 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
14 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18 */
19#include "config.h"
20
21#include <sys/types.h>
22#include <sys/mman.h>
23#include <sys/stat.h>
24
25#include <assert.h>
26#include <ctype.h>
27#include <errno.h>
28#include <fcntl.h>
29#include <stdarg.h>
30#include <stdio.h>
31#include <stdlib.h>
32#include <string.h>
33#include <unistd.h>
34#include <zlib.h>
35
36#include "mandoc_aux.h"
37#include "mandoc.h"
38#include "roff.h"
39#include "mdoc.h"
40#include "man.h"
41#include "mandoc_parse.h"
42#include "libmandoc.h"
43#include "roff_int.h"
44
45#define	REPARSE_LIMIT	1000
46
/*
 * State of one parse session, shared by all functions in this file.
 */
struct mparse {
	/* The roff parser; never NULL once allocated. */
	struct roff	*roff;
	/* The mdoc/man parser state. */
	struct roff_man	*man;
	/* The buffer currently being parsed. */
	struct buf	*primary;
	/* Line-by-line copy of the top level input. */
	struct buf	*secondary;
	/* The line of the currently open .while request. */
	struct buf	*loop;
	/* Default operating system name. */
	const char	*os_s;
	/* Parser options (MPARSE_* bits). */
	int		 options;
	/* Non-zero while the current input file is gzipped. */
	int		 gzip;
	/* Encoding of the current file. */
	int		 filenc;
	/* Reparse counter, bounds the interpolation stack. */
	int		 reparse_count;
	/* Line number in the current file. */
	int		 line;
};
60
/* Select the mdoc or man macro set for the current document. */
static void	choose_parser(struct mparse *curp);
/* Free a linked list of line buffers. */
static void	free_buf_list(struct buf *buf);
/* Grow a buffer, starting from the given initial size. */
static void	resize_buf(struct buf *buf, size_t initial);
/* Parse one buffer, starting at byte offset i. */
static int	mparse_buf_r(struct mparse *curp, const struct buf blk,
		    size_t i, int start);
/* Load the complete content of a file descriptor into a buffer. */
static int	read_whole_file(struct mparse *curp, int fd,
		    struct buf *fb, int *with_mmap);
/* Finish the macro parser and the roff parser. */
static void	mparse_end(struct mparse *curp);
67
68
69static void
70resize_buf(struct buf *buf, size_t initial)
71{
72
73	buf->sz = buf->sz > initial/2 ? 2 * buf->sz : initial;
74	buf->buf = mandoc_realloc(buf->buf, buf->sz);
75}
76
77static void
78free_buf_list(struct buf *buf)
79{
80	struct buf *tmp;
81
82	while (buf != NULL) {
83		tmp = buf;
84		buf = tmp->next;
85		free(tmp->buf);
86		free(tmp);
87	}
88}
89
90static void
91choose_parser(struct mparse *curp)
92{
93	char		*cp, *ep;
94	int		 format;
95
96	/*
97	 * If neither command line arguments -mdoc or -man select
98	 * a parser nor the roff parser found a .Dd or .TH macro
99	 * yet, look ahead in the main input buffer.
100	 */
101
102	if ((format = roff_getformat(curp->roff)) == 0) {
103		cp = curp->primary->buf;
104		ep = cp + curp->primary->sz;
105		while (cp < ep) {
106			if (*cp == '.' || *cp == '\'') {
107				cp++;
108				if (cp[0] == 'D' && cp[1] == 'd') {
109					format = MPARSE_MDOC;
110					break;
111				}
112				if (cp[0] == 'T' && cp[1] == 'H') {
113					format = MPARSE_MAN;
114					break;
115				}
116			}
117			cp = memchr(cp, '\n', ep - cp);
118			if (cp == NULL)
119				break;
120			cp++;
121		}
122	}
123
124	if (format == MPARSE_MDOC) {
125		curp->man->meta.macroset = MACROSET_MDOC;
126		if (curp->man->mdocmac == NULL)
127			curp->man->mdocmac = roffhash_alloc(MDOC_Dd, MDOC_MAX);
128	} else {
129		curp->man->meta.macroset = MACROSET_MAN;
130		if (curp->man->manmac == NULL)
131			curp->man->manmac = roffhash_alloc(MAN_TH, MAN_MAX);
132	}
133	curp->man->meta.first->tok = TOKEN_NONE;
134}
135
136/*
137 * Main parse routine for a buffer.
138 * It assumes encoding and line numbering are already set up.
139 * It can recurse directly (for invocations of user-defined
140 * macros, inline equations, and input line traps)
141 * and indirectly (for .so file inclusion).
142 */
143static int
144mparse_buf_r(struct mparse *curp, const struct buf blk, size_t i, int start)
145{
146	struct buf	 ln;
147	struct buf	*firstln, *lastln, *thisln, *loop;
148	char		*cp;
149	size_t		 pos; /* byte number in the ln buffer */
150	int		 line_result, result;
151	int		 of;
152	int		 lnn; /* line number in the real file */
153	int		 fd;
154	int		 inloop; /* Saw .while on this level. */
155	unsigned char	 c;
156
157	ln.sz = 256;
158	ln.buf = mandoc_malloc(ln.sz);
159	ln.next = NULL;
160	firstln = loop = NULL;
161	lnn = curp->line;
162	pos = 0;
163	inloop = 0;
164	result = ROFF_CONT;
165
166	while (i < blk.sz && (blk.buf[i] != '\0' || pos != 0)) {
167		if (start) {
168			curp->line = lnn;
169			curp->reparse_count = 0;
170
171			if (lnn < 3 &&
172			    curp->filenc & MPARSE_UTF8 &&
173			    curp->filenc & MPARSE_LATIN1)
174				curp->filenc = preconv_cue(&blk, i);
175		}
176
177		while (i < blk.sz && (start || blk.buf[i] != '\0')) {
178
179			/*
180			 * When finding an unescaped newline character,
181			 * leave the character loop to process the line.
182			 * Skip a preceding carriage return, if any.
183			 */
184
185			if ('\r' == blk.buf[i] && i + 1 < blk.sz &&
186			    '\n' == blk.buf[i + 1])
187				++i;
188			if ('\n' == blk.buf[i]) {
189				++i;
190				++lnn;
191				break;
192			}
193
194			/*
195			 * Make sure we have space for the worst
196			 * case of 12 bytes: "\\[u10ffff]\n\0"
197			 */
198
199			if (pos + 12 > ln.sz)
200				resize_buf(&ln, 256);
201
202			/*
203			 * Encode 8-bit input.
204			 */
205
206			c = blk.buf[i];
207			if (c & 0x80) {
208				if ( ! (curp->filenc && preconv_encode(
209				    &blk, &i, &ln, &pos, &curp->filenc))) {
210					mandoc_msg(MANDOCERR_CHAR_BAD,
211					    curp->line, pos, "0x%x", c);
212					ln.buf[pos++] = '?';
213					i++;
214				}
215				continue;
216			}
217
218			/*
219			 * Exclude control characters.
220			 */
221
222			if (c == 0x7f || (c < 0x20 && c != 0x09)) {
223				mandoc_msg(c == 0x00 || c == 0x04 ||
224				    c > 0x0a ? MANDOCERR_CHAR_BAD :
225				    MANDOCERR_CHAR_UNSUPP,
226				    curp->line, pos, "0x%x", c);
227				i++;
228				if (c != '\r')
229					ln.buf[pos++] = '?';
230				continue;
231			}
232
233			ln.buf[pos++] = blk.buf[i++];
234		}
235		ln.buf[pos] = '\0';
236
237		/*
238		 * Maintain a lookaside buffer of all lines.
239		 * parsed from this input source.
240		 */
241
242		thisln = mandoc_malloc(sizeof(*thisln));
243		thisln->buf = mandoc_strdup(ln.buf);
244		thisln->sz = strlen(ln.buf) + 1;
245		thisln->next = NULL;
246		if (firstln == NULL) {
247			firstln = lastln = thisln;
248			if (curp->secondary == NULL)
249				curp->secondary = firstln;
250		} else {
251			lastln->next = thisln;
252			lastln = thisln;
253		}
254
255		/* XXX Ugly hack to mark the end of the input. */
256
257		if (i == blk.sz || blk.buf[i] == '\0') {
258			ln.buf[pos++] = '\n';
259			ln.buf[pos] = '\0';
260		}
261
262		/*
263		 * A significant amount of complexity is contained by
264		 * the roff preprocessor.  It's line-oriented but can be
265		 * expressed on one line, so we need at times to
266		 * readjust our starting point and re-run it.  The roff
267		 * preprocessor can also readjust the buffers with new
268		 * data, so we pass them in wholesale.
269		 */
270
271		of = 0;
272rerun:
273		line_result = roff_parseln(curp->roff, curp->line, &ln, &of);
274
275		/* Process options. */
276
277		if (line_result & ROFF_APPEND)
278			assert(line_result == (ROFF_IGN | ROFF_APPEND));
279
280		if (line_result & ROFF_USERCALL)
281			assert((line_result & ROFF_MASK) == ROFF_REPARSE);
282
283		if (line_result & ROFF_USERRET) {
284			assert(line_result == (ROFF_IGN | ROFF_USERRET));
285			if (start == 0) {
286				/* Return from the current macro. */
287				result = ROFF_USERRET;
288				goto out;
289			}
290		}
291
292		switch (line_result & ROFF_LOOPMASK) {
293		case ROFF_IGN:
294			break;
295		case ROFF_WHILE:
296			if (curp->loop != NULL) {
297				if (loop == curp->loop)
298					break;
299				mandoc_msg(MANDOCERR_WHILE_NEST,
300				    curp->line, pos, NULL);
301			}
302			curp->loop = thisln;
303			loop = NULL;
304			inloop = 1;
305			break;
306		case ROFF_LOOPCONT:
307		case ROFF_LOOPEXIT:
308			if (curp->loop == NULL) {
309				mandoc_msg(MANDOCERR_WHILE_FAIL,
310				    curp->line, pos, NULL);
311				break;
312			}
313			if (inloop == 0) {
314				mandoc_msg(MANDOCERR_WHILE_INTO,
315				    curp->line, pos, NULL);
316				curp->loop = loop = NULL;
317				break;
318			}
319			if (line_result & ROFF_LOOPCONT)
320				loop = curp->loop;
321			else {
322				curp->loop = loop = NULL;
323				inloop = 0;
324			}
325			break;
326		default:
327			abort();
328		}
329
330		/* Process the main instruction from the roff parser. */
331
332		switch (line_result & ROFF_MASK) {
333		case ROFF_IGN:
334			break;
335		case ROFF_CONT:
336			if (curp->man->meta.macroset == MACROSET_NONE)
337				choose_parser(curp);
338			if ((curp->man->meta.macroset == MACROSET_MDOC ?
339			     mdoc_parseln(curp->man, curp->line, ln.buf, of) :
340			     man_parseln(curp->man, curp->line, ln.buf, of)
341			    ) == 2)
342				goto out;
343			break;
344		case ROFF_RERUN:
345			goto rerun;
346		case ROFF_REPARSE:
347			if (++curp->reparse_count > REPARSE_LIMIT) {
348				/* Abort and return to the top level. */
349				result = ROFF_IGN;
350				mandoc_msg(MANDOCERR_ROFFLOOP,
351				    curp->line, pos, NULL);
352				goto out;
353			}
354			result = mparse_buf_r(curp, ln, of, 0);
355			if (line_result & ROFF_USERCALL) {
356				roff_userret(curp->roff);
357				/* Continue normally. */
358				if (result & ROFF_USERRET)
359					result = ROFF_CONT;
360			}
361			if (start == 0 && result != ROFF_CONT)
362				goto out;
363			break;
364		case ROFF_SO:
365			if ( ! (curp->options & MPARSE_SO) &&
366			    (i >= blk.sz || blk.buf[i] == '\0')) {
367				curp->man->meta.sodest =
368				    mandoc_strdup(ln.buf + of);
369				goto out;
370			}
371			if ((fd = mparse_open(curp, ln.buf + of)) != -1) {
372				mparse_readfd(curp, fd, ln.buf + of);
373				close(fd);
374			} else {
375				mandoc_msg(MANDOCERR_SO_FAIL,
376				    curp->line, of, ".so %s: %s",
377				    ln.buf + of, strerror(errno));
378				ln.sz = mandoc_asprintf(&cp,
379				    ".sp\nSee the file %s.\n.sp",
380				    ln.buf + of);
381				free(ln.buf);
382				ln.buf = cp;
383				of = 0;
384				mparse_buf_r(curp, ln, of, 0);
385			}
386			break;
387		default:
388			abort();
389		}
390
391		/* Start the next input line. */
392
393		if (loop != NULL &&
394		    (line_result & ROFF_LOOPMASK) == ROFF_IGN)
395			loop = loop->next;
396
397		if (loop != NULL) {
398			if ((line_result & ROFF_APPEND) == 0)
399				*ln.buf = '\0';
400			if (ln.sz < loop->sz)
401				resize_buf(&ln, loop->sz);
402			(void)strlcat(ln.buf, loop->buf, ln.sz);
403			of = 0;
404			goto rerun;
405		}
406
407		pos = (line_result & ROFF_APPEND) ? strlen(ln.buf) : 0;
408	}
409out:
410	if (inloop) {
411		if (result != ROFF_USERRET)
412			mandoc_msg(MANDOCERR_WHILE_OUTOF,
413			    curp->line, pos, NULL);
414		curp->loop = NULL;
415	}
416	free(ln.buf);
417	if (firstln != curp->secondary)
418		free_buf_list(firstln);
419	return result;
420}
421
/*
 * Load the complete content of the file descriptor fd into fb.
 * Regular files that are not gzipped are mapped with mmap(2) and
 * *with_mmap is set to 1; everything else is read into allocated
 * memory and *with_mmap is set to 0.
 * Returns 1 on success and 0 on failure, after reporting the problem
 * with mandoc_msg().  The descriptor fd itself is never closed here.
 */
static int
read_whole_file(struct mparse *curp, int fd, struct buf *fb, int *with_mmap)
{
	struct stat	 sb;
	gzFile		 gzin;
	const char	*why;
	size_t		 used;
	ssize_t		 nread;
	int		 gzrc, ok;

	if (fstat(fd, &sb) == -1) {
		mandoc_msg(MANDOCERR_FILE, 0, 0,
		    "fstat: %s", strerror(errno));
		return 0;
	}

	/*
	 * Plain regular files are mapped in one go.
	 * If the mapping fails, fall back to reading below.
	 */

	if (curp->gzip == 0 && S_ISREG(sb.st_mode)) {
		if (sb.st_size > 0x7fffffff) {
			mandoc_msg(MANDOCERR_TOOLARGE, 0, 0, NULL);
			return 0;
		}
		*with_mmap = 1;
		fb->sz = (size_t)sb.st_size;
		fb->buf = mmap(NULL, fb->sz, PROT_READ, MAP_SHARED, fd, 0);
		if (fb->buf != MAP_FAILED)
			return 1;
	}

	gzin = NULL;
	if (curp->gzip) {
		/*
		 * gzclose(3) is needed to release the memory zlib
		 * allocates, but it also closes the descriptor it
		 * was given.  Hand zlib a duplicate, such that the
		 * descriptor of the caller remains open.
		 */
		fd = dup(fd);
		if (fd == -1) {
			mandoc_msg(MANDOCERR_FILE, 0, 0,
			    "dup: %s", strerror(errno));
			return 0;
		}
		gzin = gzdopen(fd, "rb");
		if (gzin == NULL) {
			mandoc_msg(MANDOCERR_FILE, 0, 0,
			    "gzdopen: %s", strerror(errno));
			close(fd);
			return 0;
		}
	}

	/*
	 * Input that cannot be mapped (for example standard input
	 * or gzipped files) is read chunk by chunk into a buffer
	 * that grows as required.
	 */

	*with_mmap = 0;
	fb->buf = NULL;
	fb->sz = 0;
	used = 0;
	ok = 0;
	while (ok == 0) {
		if (used == fb->sz) {
			if (fb->sz == (1U << 31)) {
				mandoc_msg(MANDOCERR_TOOLARGE, 0, 0, NULL);
				break;
			}
			resize_buf(fb, 65536);
		}

		if (curp->gzip)
			nread = gzread(gzin, fb->buf + (int)used,
			    fb->sz - used);
		else
			nread = read(fd, fb->buf + (int)used,
			    fb->sz - used);

		if (nread == -1) {
			if (curp->gzip)
				(void)gzerror(gzin, &gzrc);
			if (curp->gzip && gzrc != Z_ERRNO)
				why = zError(gzrc);
			else
				why = strerror(errno);
			mandoc_msg(MANDOCERR_FILE, 0, 0, "read: %s", why);
			break;
		}

		if (nread == 0) {
			/* End of file: trim the size to what was read. */
			fb->sz = used;
			ok = 1;
		} else
			used += (size_t)nread;
	}

	if (curp->gzip) {
		gzrc = gzclose(gzin);
		if (gzrc != Z_OK)
			mandoc_msg(MANDOCERR_FILE, 0, 0, "gzclose: %s",
			    gzrc == Z_ERRNO ? strerror(errno) :
			    zError(gzrc));
	}
	if (ok == 0) {
		free(fb->buf);
		fb->buf = NULL;
	}
	return ok;
}
525
526static void
527mparse_end(struct mparse *curp)
528{
529	if (curp->man->meta.macroset == MACROSET_NONE)
530		curp->man->meta.macroset = MACROSET_MAN;
531	if (curp->man->meta.macroset == MACROSET_MDOC)
532		mdoc_endparse(curp->man);
533	else
534		man_endparse(curp->man);
535	roff_endparse(curp->roff);
536}
537
538/*
539 * Read the whole file into memory and call the parsers.
540 * Called recursively when an .so request is encountered.
541 */
542void
543mparse_readfd(struct mparse *curp, int fd, const char *filename)
544{
545	static int	 recursion_depth;
546
547	struct buf	 blk;
548	struct buf	*save_primary;
549	const char	*save_filename;
550	size_t		 offset;
551	int		 save_filenc, save_lineno;
552	int		 with_mmap;
553
554	if (recursion_depth > 64) {
555		mandoc_msg(MANDOCERR_ROFFLOOP, curp->line, 0, NULL);
556		return;
557	}
558	if (read_whole_file(curp, fd, &blk, &with_mmap) == 0)
559		return;
560
561	/*
562	 * Save some properties of the parent file.
563	 */
564
565	save_primary = curp->primary;
566	save_filenc = curp->filenc;
567	save_lineno = curp->line;
568	save_filename = mandoc_msg_getinfilename();
569
570	curp->primary = &blk;
571	curp->filenc = curp->options & (MPARSE_UTF8 | MPARSE_LATIN1);
572	curp->line = 1;
573	mandoc_msg_setinfilename(filename);
574
575	/* Skip an UTF-8 byte order mark. */
576	if (curp->filenc & MPARSE_UTF8 && blk.sz > 2 &&
577	    (unsigned char)blk.buf[0] == 0xef &&
578	    (unsigned char)blk.buf[1] == 0xbb &&
579	    (unsigned char)blk.buf[2] == 0xbf) {
580		offset = 3;
581		curp->filenc &= ~MPARSE_LATIN1;
582	} else
583		offset = 0;
584
585	recursion_depth++;
586	mparse_buf_r(curp, blk, offset, 1);
587	if (--recursion_depth == 0)
588		mparse_end(curp);
589
590	/*
591	 * Clean up and restore saved parent properties.
592	 */
593
594	if (with_mmap)
595		munmap(blk.buf, blk.sz);
596	else
597		free(blk.buf);
598
599	curp->primary = save_primary;
600	curp->filenc = save_filenc;
601	curp->line = save_lineno;
602	if (save_filename != NULL)
603		mandoc_msg_setinfilename(save_filename);
604}
605
606int
607mparse_open(struct mparse *curp, const char *file)
608{
609	char		 *cp;
610	int		  fd, save_errno;
611
612	cp = strrchr(file, '.');
613	curp->gzip = (cp != NULL && ! strcmp(cp + 1, "gz"));
614
615	/* First try to use the filename as it is. */
616
617	if ((fd = open(file, O_RDONLY)) != -1)
618		return fd;
619
620	/*
621	 * If that doesn't work and the filename doesn't
622	 * already  end in .gz, try appending .gz.
623	 */
624
625	if ( ! curp->gzip) {
626		save_errno = errno;
627		mandoc_asprintf(&cp, "%s.gz", file);
628		fd = open(cp, O_RDONLY);
629		free(cp);
630		errno = save_errno;
631		if (fd != -1) {
632			curp->gzip = 1;
633			return fd;
634		}
635	}
636
637	/* Neither worked, give up. */
638
639	return -1;
640}
641
642struct mparse *
643mparse_alloc(int options, enum mandoc_os os_e, const char *os_s)
644{
645	struct mparse	*curp;
646
647	curp = mandoc_calloc(1, sizeof(struct mparse));
648
649	curp->options = options;
650	curp->os_s = os_s;
651
652	curp->roff = roff_alloc(options);
653	curp->man = roff_man_alloc(curp->roff, curp->os_s,
654		curp->options & MPARSE_QUICK ? 1 : 0);
655	if (curp->options & MPARSE_MDOC) {
656		curp->man->meta.macroset = MACROSET_MDOC;
657		if (curp->man->mdocmac == NULL)
658			curp->man->mdocmac = roffhash_alloc(MDOC_Dd, MDOC_MAX);
659	} else if (curp->options & MPARSE_MAN) {
660		curp->man->meta.macroset = MACROSET_MAN;
661		if (curp->man->manmac == NULL)
662			curp->man->manmac = roffhash_alloc(MAN_TH, MAN_MAX);
663	}
664	curp->man->meta.first->tok = TOKEN_NONE;
665	curp->man->meta.os_e = os_e;
666	return curp;
667}
668
669void
670mparse_reset(struct mparse *curp)
671{
672	roff_reset(curp->roff);
673	roff_man_reset(curp->man);
674	free_buf_list(curp->secondary);
675	curp->secondary = NULL;
676	curp->gzip = 0;
677}
678
679void
680mparse_free(struct mparse *curp)
681{
682	roffhash_free(curp->man->mdocmac);
683	roffhash_free(curp->man->manmac);
684	roff_man_free(curp->man);
685	roff_free(curp->roff);
686	free_buf_list(curp->secondary);
687	free(curp);
688}
689
690struct roff_meta *
691mparse_result(struct mparse *curp)
692{
693	roff_state_reset(curp->man);
694	if (curp->options & MPARSE_VALIDATE) {
695		if (curp->man->meta.macroset == MACROSET_MDOC)
696			mdoc_validate(curp->man);
697		else
698			man_validate(curp->man);
699	}
700	return &curp->man->meta;
701}
702
703void
704mparse_copy(const struct mparse *p)
705{
706	struct buf	*buf;
707
708	for (buf = p->secondary; buf != NULL; buf = buf->next)
709		puts(buf->buf);
710}
711