read.c revision 1.20
1/*	Id: read.c,v 1.211 2019/01/11 17:04:44 schwarze Exp  */
2/*
3 * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
4 * Copyright (c) 2010-2019 Ingo Schwarze <schwarze@openbsd.org>
5 * Copyright (c) 2010, 2012 Joerg Sonnenberger <joerg@netbsd.org>
6 *
7 * Permission to use, copy, modify, and distribute this software for any
8 * purpose with or without fee is hereby granted, provided that the above
9 * copyright notice and this permission notice appear in all copies.
10 *
11 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
12 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
14 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18 */
19#include "config.h"
20
21#include <sys/types.h>
22#include <sys/mman.h>
23#include <sys/stat.h>
24
25#include <assert.h>
26#include <ctype.h>
27#include <errno.h>
28#include <fcntl.h>
29#include <stdarg.h>
30#include <stdio.h>
31#include <stdlib.h>
32#include <string.h>
33#include <unistd.h>
34#include <zlib.h>
35
36#include "mandoc_aux.h"
37#include "mandoc.h"
38#include "roff.h"
39#include "mdoc.h"
40#include "man.h"
41#include "mandoc_parse.h"
42#include "libmandoc.h"
43#include "roff_int.h"
44
/*
 * Upper bound for the per-line reparse counter kept in struct mparse;
 * exceeding it makes the buffer parser report MANDOCERR_ROFFLOOP.
 */
#define	REPARSE_LIMIT	1000

/*
 * State of one parser instance.
 */
struct	mparse {
	/* roff parser (!NULL) */
	struct roff	 *roff;
	/* man parser */
	struct roff_man	 *man;
	/* buffer currently being parsed */
	struct buf	 *primary;
	/* copy of top level input */
	struct buf	 *secondary;
	/* open .while request line */
	struct buf	 *loop;
	/* default operating system */
	const char	 *os_s;
	/* parser options */
	int		  options;
	/* current input file is gzipped */
	int		  gzip;
	/* encoding of the current file */
	int		  filenc;
	/* finite interp. stack */
	int		  reparse_count;
	/* line number in the file */
	int		  line;
};
60
/*
 * Forward declarations of the file-local helpers defined below.
 * mparse_buf_r() is declared exactly once, spelled the same way
 * as its definition (the buffer is passed by value and is const
 * inside the function).
 */
static	void	  choose_parser(struct mparse *);
static	void	  free_buf_list(struct buf *);
static	void	  resize_buf(struct buf *, size_t);
static	int	  mparse_buf_r(struct mparse *, const struct buf, size_t, int);
static	int	  read_whole_file(struct mparse *, int, struct buf *, int *);
static	void	  mparse_end(struct mparse *);
68
69
70static void
71resize_buf(struct buf *buf, size_t initial)
72{
73
74	buf->sz = buf->sz > initial/2 ? 2 * buf->sz : initial;
75	buf->buf = mandoc_realloc(buf->buf, buf->sz);
76}
77
78static void
79free_buf_list(struct buf *buf)
80{
81	struct buf *tmp;
82
83	while (buf != NULL) {
84		tmp = buf;
85		buf = tmp->next;
86		free(tmp->buf);
87		free(tmp);
88	}
89}
90
91static void
92choose_parser(struct mparse *curp)
93{
94	char		*cp, *ep;
95	int		 format;
96
97	/*
98	 * If neither command line arguments -mdoc or -man select
99	 * a parser nor the roff parser found a .Dd or .TH macro
100	 * yet, look ahead in the main input buffer.
101	 */
102
103	if ((format = roff_getformat(curp->roff)) == 0) {
104		cp = curp->primary->buf;
105		ep = cp + curp->primary->sz;
106		while (cp < ep) {
107			if (*cp == '.' || *cp == '\'') {
108				cp++;
109				if (cp[0] == 'D' && cp[1] == 'd') {
110					format = MPARSE_MDOC;
111					break;
112				}
113				if (cp[0] == 'T' && cp[1] == 'H') {
114					format = MPARSE_MAN;
115					break;
116				}
117			}
118			cp = memchr(cp, '\n', ep - cp);
119			if (cp == NULL)
120				break;
121			cp++;
122		}
123	}
124
125	if (format == MPARSE_MDOC) {
126		curp->man->meta.macroset = MACROSET_MDOC;
127		if (curp->man->mdocmac == NULL)
128			curp->man->mdocmac = roffhash_alloc(MDOC_Dd, MDOC_MAX);
129	} else {
130		curp->man->meta.macroset = MACROSET_MAN;
131		if (curp->man->manmac == NULL)
132			curp->man->manmac = roffhash_alloc(MAN_TH, MAN_MAX);
133	}
134	curp->man->meta.first->tok = TOKEN_NONE;
135}
136
137/*
138 * Main parse routine for a buffer.
139 * It assumes encoding and line numbering are already set up.
140 * It can recurse directly (for invocations of user-defined
141 * macros, inline equations, and input line traps)
142 * and indirectly (for .so file inclusion).
143 */
144static int
145mparse_buf_r(struct mparse *curp, const struct buf blk, size_t i, int start)
146{
147	struct buf	 ln;
148	struct buf	*firstln, *lastln, *thisln, *loop;
149	char		*cp;
150	size_t		 pos; /* byte number in the ln buffer */
151	int		 line_result, result;
152	int		 of;
153	int		 lnn; /* line number in the real file */
154	int		 fd;
155	int		 inloop; /* Saw .while on this level. */
156	unsigned char	 c;
157
158	ln.sz = 256;
159	ln.buf = mandoc_malloc(ln.sz);
160	ln.next = NULL;
161	firstln = loop = NULL;
162	lnn = curp->line;
163	pos = 0;
164	inloop = 0;
165	result = ROFF_CONT;
166
167	while (i < blk.sz && (blk.buf[i] != '\0' || pos != 0)) {
168		if (start) {
169			curp->line = lnn;
170			curp->reparse_count = 0;
171
172			if (lnn < 3 &&
173			    curp->filenc & MPARSE_UTF8 &&
174			    curp->filenc & MPARSE_LATIN1)
175				curp->filenc = preconv_cue(&blk, i);
176		}
177
178		while (i < blk.sz && (start || blk.buf[i] != '\0')) {
179
180			/*
181			 * When finding an unescaped newline character,
182			 * leave the character loop to process the line.
183			 * Skip a preceding carriage return, if any.
184			 */
185
186			if ('\r' == blk.buf[i] && i + 1 < blk.sz &&
187			    '\n' == blk.buf[i + 1])
188				++i;
189			if ('\n' == blk.buf[i]) {
190				++i;
191				++lnn;
192				break;
193			}
194
195			/*
196			 * Make sure we have space for the worst
197			 * case of 12 bytes: "\\[u10ffff]\n\0"
198			 */
199
200			if (pos + 12 > ln.sz)
201				resize_buf(&ln, 256);
202
203			/*
204			 * Encode 8-bit input.
205			 */
206
207			c = blk.buf[i];
208			if (c & 0x80) {
209				if ( ! (curp->filenc && preconv_encode(
210				    &blk, &i, &ln, &pos, &curp->filenc))) {
211					mandoc_msg(MANDOCERR_CHAR_BAD,
212					    curp->line, pos, "0x%x", c);
213					ln.buf[pos++] = '?';
214					i++;
215				}
216				continue;
217			}
218
219			/*
220			 * Exclude control characters.
221			 */
222
223			if (c == 0x7f || (c < 0x20 && c != 0x09)) {
224				mandoc_msg(c == 0x00 || c == 0x04 ||
225				    c > 0x0a ? MANDOCERR_CHAR_BAD :
226				    MANDOCERR_CHAR_UNSUPP,
227				    curp->line, pos, "0x%x", c);
228				i++;
229				if (c != '\r')
230					ln.buf[pos++] = '?';
231				continue;
232			}
233
234			ln.buf[pos++] = blk.buf[i++];
235		}
236		ln.buf[pos] = '\0';
237
238		/*
239		 * Maintain a lookaside buffer of all lines.
240		 * parsed from this input source.
241		 */
242
243		thisln = mandoc_malloc(sizeof(*thisln));
244		thisln->buf = mandoc_strdup(ln.buf);
245		thisln->sz = strlen(ln.buf) + 1;
246		thisln->next = NULL;
247		if (firstln == NULL) {
248			firstln = lastln = thisln;
249			if (curp->secondary == NULL)
250				curp->secondary = firstln;
251		} else {
252			lastln->next = thisln;
253			lastln = thisln;
254		}
255
256		/* XXX Ugly hack to mark the end of the input. */
257
258		if (i == blk.sz || blk.buf[i] == '\0') {
259			ln.buf[pos++] = '\n';
260			ln.buf[pos] = '\0';
261		}
262
263		/*
264		 * A significant amount of complexity is contained by
265		 * the roff preprocessor.  It's line-oriented but can be
266		 * expressed on one line, so we need at times to
267		 * readjust our starting point and re-run it.  The roff
268		 * preprocessor can also readjust the buffers with new
269		 * data, so we pass them in wholesale.
270		 */
271
272		of = 0;
273rerun:
274		line_result = roff_parseln(curp->roff, curp->line, &ln, &of);
275
276		/* Process options. */
277
278		if (line_result & ROFF_APPEND)
279			assert(line_result == (ROFF_IGN | ROFF_APPEND));
280
281		if (line_result & ROFF_USERCALL)
282			assert((line_result & ROFF_MASK) == ROFF_REPARSE);
283
284		if (line_result & ROFF_USERRET) {
285			assert(line_result == (ROFF_IGN | ROFF_USERRET));
286			if (start == 0) {
287				/* Return from the current macro. */
288				result = ROFF_USERRET;
289				goto out;
290			}
291		}
292
293		switch (line_result & ROFF_LOOPMASK) {
294		case ROFF_IGN:
295			break;
296		case ROFF_WHILE:
297			if (curp->loop != NULL) {
298				if (loop == curp->loop)
299					break;
300				mandoc_msg(MANDOCERR_WHILE_NEST,
301				    curp->line, pos, NULL);
302			}
303			curp->loop = thisln;
304			loop = NULL;
305			inloop = 1;
306			break;
307		case ROFF_LOOPCONT:
308		case ROFF_LOOPEXIT:
309			if (curp->loop == NULL) {
310				mandoc_msg(MANDOCERR_WHILE_FAIL,
311				    curp->line, pos, NULL);
312				break;
313			}
314			if (inloop == 0) {
315				mandoc_msg(MANDOCERR_WHILE_INTO,
316				    curp->line, pos, NULL);
317				curp->loop = loop = NULL;
318				break;
319			}
320			if (line_result & ROFF_LOOPCONT)
321				loop = curp->loop;
322			else {
323				curp->loop = loop = NULL;
324				inloop = 0;
325			}
326			break;
327		default:
328			abort();
329		}
330
331		/* Process the main instruction from the roff parser. */
332
333		switch (line_result & ROFF_MASK) {
334		case ROFF_IGN:
335			break;
336		case ROFF_CONT:
337			if (curp->man->meta.macroset == MACROSET_NONE)
338				choose_parser(curp);
339			if ((curp->man->meta.macroset == MACROSET_MDOC ?
340			     mdoc_parseln(curp->man, curp->line, ln.buf, of) :
341			     man_parseln(curp->man, curp->line, ln.buf, of)
342			    ) == 2)
343				goto out;
344			break;
345		case ROFF_RERUN:
346			goto rerun;
347		case ROFF_REPARSE:
348			if (++curp->reparse_count > REPARSE_LIMIT) {
349				/* Abort and return to the top level. */
350				result = ROFF_IGN;
351				mandoc_msg(MANDOCERR_ROFFLOOP,
352				    curp->line, pos, NULL);
353				goto out;
354			}
355			result = mparse_buf_r(curp, ln, of, 0);
356			if (line_result & ROFF_USERCALL) {
357				roff_userret(curp->roff);
358				/* Continue normally. */
359				if (result & ROFF_USERRET)
360					result = ROFF_CONT;
361			}
362			if (start == 0 && result != ROFF_CONT)
363				goto out;
364			break;
365		case ROFF_SO:
366			if ( ! (curp->options & MPARSE_SO) &&
367			    (i >= blk.sz || blk.buf[i] == '\0')) {
368				curp->man->meta.sodest =
369				    mandoc_strdup(ln.buf + of);
370				goto out;
371			}
372			if ((fd = mparse_open(curp, ln.buf + of)) != -1) {
373				mparse_readfd(curp, fd, ln.buf + of);
374				close(fd);
375			} else {
376				mandoc_msg(MANDOCERR_SO_FAIL,
377				    curp->line, of, ".so %s: %s",
378				    ln.buf + of, strerror(errno));
379				ln.sz = mandoc_asprintf(&cp,
380				    ".sp\nSee the file %s.\n.sp",
381				    ln.buf + of);
382				free(ln.buf);
383				ln.buf = cp;
384				of = 0;
385				mparse_buf_r(curp, ln, of, 0);
386			}
387			break;
388		default:
389			abort();
390		}
391
392		/* Start the next input line. */
393
394		if (loop != NULL &&
395		    (line_result & ROFF_LOOPMASK) == ROFF_IGN)
396			loop = loop->next;
397
398		if (loop != NULL) {
399			if ((line_result & ROFF_APPEND) == 0)
400				*ln.buf = '\0';
401			if (ln.sz < loop->sz)
402				resize_buf(&ln, loop->sz);
403			(void)strlcat(ln.buf, loop->buf, ln.sz);
404			of = 0;
405			goto rerun;
406		}
407
408		pos = (line_result & ROFF_APPEND) ? strlen(ln.buf) : 0;
409	}
410out:
411	if (inloop) {
412		if (result != ROFF_USERRET)
413			mandoc_msg(MANDOCERR_WHILE_OUTOF,
414			    curp->line, pos, NULL);
415		curp->loop = NULL;
416	}
417	free(ln.buf);
418	if (firstln != curp->secondary)
419		free_buf_list(firstln);
420	return result;
421}
422
423static int
424read_whole_file(struct mparse *curp, int fd, struct buf *fb, int *with_mmap)
425{
426	struct stat	 st;
427	gzFile		 gz;
428	size_t		 off;
429	ssize_t		 ssz;
430	int		 gzerrnum, retval;
431
432	if (fstat(fd, &st) == -1) {
433		mandoc_msg(MANDOCERR_FILE, 0, 0,
434		    "fstat: %s", strerror(errno));
435		return 0;
436	}
437
438	/*
439	 * If we're a regular file, try just reading in the whole entry
440	 * via mmap().  This is faster than reading it into blocks, and
441	 * since each file is only a few bytes to begin with, I'm not
442	 * concerned that this is going to tank any machines.
443	 */
444
445	if (curp->gzip == 0 && S_ISREG(st.st_mode)) {
446		if (st.st_size > 0x7fffffff) {
447			mandoc_msg(MANDOCERR_TOOLARGE, 0, 0, NULL);
448			return 0;
449		}
450		*with_mmap = 1;
451		fb->sz = (size_t)st.st_size;
452		fb->buf = mmap(NULL, fb->sz, PROT_READ, MAP_SHARED, fd, 0);
453		if (fb->buf != MAP_FAILED)
454			return 1;
455	}
456
457	if (curp->gzip) {
458		/*
459		 * Duplicating the file descriptor is required
460		 * because we will have to call gzclose(3)
461		 * to free memory used internally by zlib,
462		 * but that will also close the file descriptor,
463		 * which this function must not do.
464		 */
465		if ((fd = dup(fd)) == -1) {
466			mandoc_msg(MANDOCERR_FILE, 0, 0,
467			    "dup: %s", strerror(errno));
468			return 0;
469		}
470		if ((gz = gzdopen(fd, "rb")) == NULL) {
471			mandoc_msg(MANDOCERR_FILE, 0, 0,
472			    "gzdopen: %s", strerror(errno));
473			close(fd);
474			return 0;
475		}
476	} else
477		gz = NULL;
478
479	/*
480	 * If this isn't a regular file (like, say, stdin), then we must
481	 * go the old way and just read things in bit by bit.
482	 */
483
484	*with_mmap = 0;
485	off = 0;
486	retval = 0;
487	fb->sz = 0;
488	fb->buf = NULL;
489	for (;;) {
490		if (off == fb->sz) {
491			if (fb->sz == (1U << 31)) {
492				mandoc_msg(MANDOCERR_TOOLARGE, 0, 0, NULL);
493				break;
494			}
495			resize_buf(fb, 65536);
496		}
497		ssz = curp->gzip ?
498		    gzread(gz, fb->buf + (int)off, fb->sz - off) :
499		    read(fd, fb->buf + (int)off, fb->sz - off);
500		if (ssz == 0) {
501			fb->sz = off;
502			retval = 1;
503			break;
504		}
505		if (ssz == -1) {
506			if (curp->gzip)
507				(void)gzerror(gz, &gzerrnum);
508			mandoc_msg(MANDOCERR_FILE, 0, 0, "read: %s",
509			    curp->gzip && gzerrnum != Z_ERRNO ?
510			    zError(gzerrnum) : strerror(errno));
511			break;
512		}
513		off += (size_t)ssz;
514	}
515
516	if (curp->gzip && (gzerrnum = gzclose(gz)) != Z_OK)
517		mandoc_msg(MANDOCERR_FILE, 0, 0, "gzclose: %s",
518		    gzerrnum == Z_ERRNO ? strerror(errno) :
519		    zError(gzerrnum));
520	if (retval == 0) {
521		free(fb->buf);
522		fb->buf = NULL;
523	}
524	return retval;
525}
526
527static void
528mparse_end(struct mparse *curp)
529{
530	if (curp->man->meta.macroset == MACROSET_NONE)
531		curp->man->meta.macroset = MACROSET_MAN;
532	if (curp->man->meta.macroset == MACROSET_MDOC)
533		mdoc_endparse(curp->man);
534	else
535		man_endparse(curp->man);
536	roff_endparse(curp->roff);
537}
538
539/*
540 * Read the whole file into memory and call the parsers.
541 * Called recursively when an .so request is encountered.
542 */
543void
544mparse_readfd(struct mparse *curp, int fd, const char *filename)
545{
546	static int	 recursion_depth;
547
548	struct buf	 blk;
549	struct buf	*save_primary;
550	const char	*save_filename;
551	size_t		 offset;
552	int		 save_filenc, save_lineno;
553	int		 with_mmap;
554
555	if (recursion_depth > 64) {
556		mandoc_msg(MANDOCERR_ROFFLOOP, curp->line, 0, NULL);
557		return;
558	}
559	if (read_whole_file(curp, fd, &blk, &with_mmap) == 0)
560		return;
561
562	/*
563	 * Save some properties of the parent file.
564	 */
565
566	save_primary = curp->primary;
567	save_filenc = curp->filenc;
568	save_lineno = curp->line;
569	save_filename = mandoc_msg_getinfilename();
570
571	curp->primary = &blk;
572	curp->filenc = curp->options & (MPARSE_UTF8 | MPARSE_LATIN1);
573	curp->line = 1;
574	mandoc_msg_setinfilename(filename);
575
576	/* Skip an UTF-8 byte order mark. */
577	if (curp->filenc & MPARSE_UTF8 && blk.sz > 2 &&
578	    (unsigned char)blk.buf[0] == 0xef &&
579	    (unsigned char)blk.buf[1] == 0xbb &&
580	    (unsigned char)blk.buf[2] == 0xbf) {
581		offset = 3;
582		curp->filenc &= ~MPARSE_LATIN1;
583	} else
584		offset = 0;
585
586	recursion_depth++;
587	mparse_buf_r(curp, blk, offset, 1);
588	if (--recursion_depth == 0)
589		mparse_end(curp);
590
591	/*
592	 * Clean up and restore saved parent properties.
593	 */
594
595	if (with_mmap)
596		munmap(blk.buf, blk.sz);
597	else
598		free(blk.buf);
599
600	curp->primary = save_primary;
601	curp->filenc = save_filenc;
602	curp->line = save_lineno;
603	if (save_filename != NULL)
604		mandoc_msg_setinfilename(save_filename);
605}
606
607int
608mparse_open(struct mparse *curp, const char *file)
609{
610	char		 *cp;
611	int		  fd, save_errno;
612
613	cp = strrchr(file, '.');
614	curp->gzip = (cp != NULL && ! strcmp(cp + 1, "gz"));
615
616	/* First try to use the filename as it is. */
617
618	if ((fd = open(file, O_RDONLY)) != -1)
619		return fd;
620
621	/*
622	 * If that doesn't work and the filename doesn't
623	 * already  end in .gz, try appending .gz.
624	 */
625
626	if ( ! curp->gzip) {
627		save_errno = errno;
628		mandoc_asprintf(&cp, "%s.gz", file);
629		fd = open(cp, O_RDONLY);
630		free(cp);
631		errno = save_errno;
632		if (fd != -1) {
633			curp->gzip = 1;
634			return fd;
635		}
636	}
637
638	/* Neither worked, give up. */
639
640	return -1;
641}
642
643struct mparse *
644mparse_alloc(int options, enum mandoc_os os_e, const char *os_s)
645{
646	struct mparse	*curp;
647
648	curp = mandoc_calloc(1, sizeof(struct mparse));
649
650	curp->options = options;
651	curp->os_s = os_s;
652
653	curp->roff = roff_alloc(options);
654	curp->man = roff_man_alloc(curp->roff, curp->os_s,
655		curp->options & MPARSE_QUICK ? 1 : 0);
656	if (curp->options & MPARSE_MDOC) {
657		curp->man->meta.macroset = MACROSET_MDOC;
658		if (curp->man->mdocmac == NULL)
659			curp->man->mdocmac = roffhash_alloc(MDOC_Dd, MDOC_MAX);
660	} else if (curp->options & MPARSE_MAN) {
661		curp->man->meta.macroset = MACROSET_MAN;
662		if (curp->man->manmac == NULL)
663			curp->man->manmac = roffhash_alloc(MAN_TH, MAN_MAX);
664	}
665	curp->man->meta.first->tok = TOKEN_NONE;
666	curp->man->meta.os_e = os_e;
667	return curp;
668}
669
670void
671mparse_reset(struct mparse *curp)
672{
673	roff_reset(curp->roff);
674	roff_man_reset(curp->man);
675	free_buf_list(curp->secondary);
676	curp->secondary = NULL;
677	curp->gzip = 0;
678}
679
680void
681mparse_free(struct mparse *curp)
682{
683	roffhash_free(curp->man->mdocmac);
684	roffhash_free(curp->man->manmac);
685	roff_man_free(curp->man);
686	roff_free(curp->roff);
687	free_buf_list(curp->secondary);
688	free(curp);
689}
690
691struct roff_meta *
692mparse_result(struct mparse *curp)
693{
694	roff_state_reset(curp->man);
695	if (curp->options & MPARSE_VALIDATE) {
696		if (curp->man->meta.macroset == MACROSET_MDOC)
697			mdoc_validate(curp->man);
698		else
699			man_validate(curp->man);
700	}
701	return &curp->man->meta;
702}
703
704void
705mparse_copy(const struct mparse *p)
706{
707	struct buf	*buf;
708
709	for (buf = p->secondary; buf != NULL; buf = buf->next)
710		puts(buf->buf);
711}
712