read.c revision 1.20
1/* Id: read.c,v 1.211 2019/01/11 17:04:44 schwarze Exp */ 2/* 3 * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv> 4 * Copyright (c) 2010-2019 Ingo Schwarze <schwarze@openbsd.org> 5 * Copyright (c) 2010, 2012 Joerg Sonnenberger <joerg@netbsd.org> 6 * 7 * Permission to use, copy, modify, and distribute this software for any 8 * purpose with or without fee is hereby granted, provided that the above 9 * copyright notice and this permission notice appear in all copies. 10 * 11 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES 12 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 13 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR 14 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 15 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 16 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 17 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 18 */ 19#include "config.h" 20 21#include <sys/types.h> 22#include <sys/mman.h> 23#include <sys/stat.h> 24 25#include <assert.h> 26#include <ctype.h> 27#include <errno.h> 28#include <fcntl.h> 29#include <stdarg.h> 30#include <stdio.h> 31#include <stdlib.h> 32#include <string.h> 33#include <unistd.h> 34#include <zlib.h> 35 36#include "mandoc_aux.h" 37#include "mandoc.h" 38#include "roff.h" 39#include "mdoc.h" 40#include "man.h" 41#include "mandoc_parse.h" 42#include "libmandoc.h" 43#include "roff_int.h" 44 45#define REPARSE_LIMIT 1000 46 47struct mparse { 48 struct roff *roff; /* roff parser (!NULL) */ 49 struct roff_man *man; /* man parser */ 50 struct buf *primary; /* buffer currently being parsed */ 51 struct buf *secondary; /* copy of top level input */ 52 struct buf *loop; /* open .while request line */ 53 const char *os_s; /* default operating system */ 54 int options; /* parser options */ 55 int gzip; /* current input file is gzipped */ 56 int filenc; /* encoding of the current file */ 57 int reparse_count; /* finite interp. stack */ 58 int line; /* line number in the file */ 59}; 60 61static void choose_parser(struct mparse *); 62static void free_buf_list(struct buf *); 63static void resize_buf(struct buf *, size_t); 64static int mparse_buf_r(struct mparse *, const struct buf, size_t, int); 65static int mparse_buf_r(struct mparse *, struct buf, size_t, int); 66static int read_whole_file(struct mparse *, int, struct buf *, int *); 67static void mparse_end(struct mparse *); 68 69 70static void 71resize_buf(struct buf *buf, size_t initial) 72{ 73 74 buf->sz = buf->sz > initial/2 ? 2 * buf->sz : initial; 75 buf->buf = mandoc_realloc(buf->buf, buf->sz); 76} 77 78static void 79free_buf_list(struct buf *buf) 80{ 81 struct buf *tmp; 82 83 while (buf != NULL) { 84 tmp = buf; 85 buf = tmp->next; 86 free(tmp->buf); 87 free(tmp); 88 } 89} 90 91static void 92choose_parser(struct mparse *curp) 93{ 94 char *cp, *ep; 95 int format; 96 97 /* 98 * If neither command line arguments -mdoc or -man select 99 * a parser nor the roff parser found a .Dd or .TH macro 100 * yet, look ahead in the main input buffer. 101 */ 102 103 if ((format = roff_getformat(curp->roff)) == 0) { 104 cp = curp->primary->buf; 105 ep = cp + curp->primary->sz; 106 while (cp < ep) { 107 if (*cp == '.' || *cp == '\'') { 108 cp++; 109 if (cp[0] == 'D' && cp[1] == 'd') { 110 format = MPARSE_MDOC; 111 break; 112 } 113 if (cp[0] == 'T' && cp[1] == 'H') { 114 format = MPARSE_MAN; 115 break; 116 } 117 } 118 cp = memchr(cp, '\n', ep - cp); 119 if (cp == NULL) 120 break; 121 cp++; 122 } 123 } 124 125 if (format == MPARSE_MDOC) { 126 curp->man->meta.macroset = MACROSET_MDOC; 127 if (curp->man->mdocmac == NULL) 128 curp->man->mdocmac = roffhash_alloc(MDOC_Dd, MDOC_MAX); 129 } else { 130 curp->man->meta.macroset = MACROSET_MAN; 131 if (curp->man->manmac == NULL) 132 curp->man->manmac = roffhash_alloc(MAN_TH, MAN_MAX); 133 } 134 curp->man->meta.first->tok = TOKEN_NONE; 135} 136 137/* 138 * Main parse routine for a buffer. 139 * It assumes encoding and line numbering are already set up. 140 * It can recurse directly (for invocations of user-defined 141 * macros, inline equations, and input line traps) 142 * and indirectly (for .so file inclusion). 143 */ 144static int 145mparse_buf_r(struct mparse *curp, const struct buf blk, size_t i, int start) 146{ 147 struct buf ln; 148 struct buf *firstln, *lastln, *thisln, *loop; 149 char *cp; 150 size_t pos; /* byte number in the ln buffer */ 151 int line_result, result; 152 int of; 153 int lnn; /* line number in the real file */ 154 int fd; 155 int inloop; /* Saw .while on this level. */ 156 unsigned char c; 157 158 ln.sz = 256; 159 ln.buf = mandoc_malloc(ln.sz); 160 ln.next = NULL; 161 firstln = loop = NULL; 162 lnn = curp->line; 163 pos = 0; 164 inloop = 0; 165 result = ROFF_CONT; 166 167 while (i < blk.sz && (blk.buf[i] != '\0' || pos != 0)) { 168 if (start) { 169 curp->line = lnn; 170 curp->reparse_count = 0; 171 172 if (lnn < 3 && 173 curp->filenc & MPARSE_UTF8 && 174 curp->filenc & MPARSE_LATIN1) 175 curp->filenc = preconv_cue(&blk, i); 176 } 177 178 while (i < blk.sz && (start || blk.buf[i] != '\0')) { 179 180 /* 181 * When finding an unescaped newline character, 182 * leave the character loop to process the line. 183 * Skip a preceding carriage return, if any. 184 */ 185 186 if ('\r' == blk.buf[i] && i + 1 < blk.sz && 187 '\n' == blk.buf[i + 1]) 188 ++i; 189 if ('\n' == blk.buf[i]) { 190 ++i; 191 ++lnn; 192 break; 193 } 194 195 /* 196 * Make sure we have space for the worst 197 * case of 12 bytes: "\\[u10ffff]\n\0" 198 */ 199 200 if (pos + 12 > ln.sz) 201 resize_buf(&ln, 256); 202 203 /* 204 * Encode 8-bit input. 205 */ 206 207 c = blk.buf[i]; 208 if (c & 0x80) { 209 if ( ! (curp->filenc && preconv_encode( 210 &blk, &i, &ln, &pos, &curp->filenc))) { 211 mandoc_msg(MANDOCERR_CHAR_BAD, 212 curp->line, pos, "0x%x", c); 213 ln.buf[pos++] = '?'; 214 i++; 215 } 216 continue; 217 } 218 219 /* 220 * Exclude control characters. 221 */ 222 223 if (c == 0x7f || (c < 0x20 && c != 0x09)) { 224 mandoc_msg(c == 0x00 || c == 0x04 || 225 c > 0x0a ? MANDOCERR_CHAR_BAD : 226 MANDOCERR_CHAR_UNSUPP, 227 curp->line, pos, "0x%x", c); 228 i++; 229 if (c != '\r') 230 ln.buf[pos++] = '?'; 231 continue; 232 } 233 234 ln.buf[pos++] = blk.buf[i++]; 235 } 236 ln.buf[pos] = '\0'; 237 238 /* 239 * Maintain a lookaside buffer of all lines. 240 * parsed from this input source. 241 */ 242 243 thisln = mandoc_malloc(sizeof(*thisln)); 244 thisln->buf = mandoc_strdup(ln.buf); 245 thisln->sz = strlen(ln.buf) + 1; 246 thisln->next = NULL; 247 if (firstln == NULL) { 248 firstln = lastln = thisln; 249 if (curp->secondary == NULL) 250 curp->secondary = firstln; 251 } else { 252 lastln->next = thisln; 253 lastln = thisln; 254 } 255 256 /* XXX Ugly hack to mark the end of the input. */ 257 258 if (i == blk.sz || blk.buf[i] == '\0') { 259 ln.buf[pos++] = '\n'; 260 ln.buf[pos] = '\0'; 261 } 262 263 /* 264 * A significant amount of complexity is contained by 265 * the roff preprocessor. It's line-oriented but can be 266 * expressed on one line, so we need at times to 267 * readjust our starting point and re-run it. The roff 268 * preprocessor can also readjust the buffers with new 269 * data, so we pass them in wholesale. 270 */ 271 272 of = 0; 273rerun: 274 line_result = roff_parseln(curp->roff, curp->line, &ln, &of); 275 276 /* Process options. */ 277 278 if (line_result & ROFF_APPEND) 279 assert(line_result == (ROFF_IGN | ROFF_APPEND)); 280 281 if (line_result & ROFF_USERCALL) 282 assert((line_result & ROFF_MASK) == ROFF_REPARSE); 283 284 if (line_result & ROFF_USERRET) { 285 assert(line_result == (ROFF_IGN | ROFF_USERRET)); 286 if (start == 0) { 287 /* Return from the current macro. */ 288 result = ROFF_USERRET; 289 goto out; 290 } 291 } 292 293 switch (line_result & ROFF_LOOPMASK) { 294 case ROFF_IGN: 295 break; 296 case ROFF_WHILE: 297 if (curp->loop != NULL) { 298 if (loop == curp->loop) 299 break; 300 mandoc_msg(MANDOCERR_WHILE_NEST, 301 curp->line, pos, NULL); 302 } 303 curp->loop = thisln; 304 loop = NULL; 305 inloop = 1; 306 break; 307 case ROFF_LOOPCONT: 308 case ROFF_LOOPEXIT: 309 if (curp->loop == NULL) { 310 mandoc_msg(MANDOCERR_WHILE_FAIL, 311 curp->line, pos, NULL); 312 break; 313 } 314 if (inloop == 0) { 315 mandoc_msg(MANDOCERR_WHILE_INTO, 316 curp->line, pos, NULL); 317 curp->loop = loop = NULL; 318 break; 319 } 320 if (line_result & ROFF_LOOPCONT) 321 loop = curp->loop; 322 else { 323 curp->loop = loop = NULL; 324 inloop = 0; 325 } 326 break; 327 default: 328 abort(); 329 } 330 331 /* Process the main instruction from the roff parser. */ 332 333 switch (line_result & ROFF_MASK) { 334 case ROFF_IGN: 335 break; 336 case ROFF_CONT: 337 if (curp->man->meta.macroset == MACROSET_NONE) 338 choose_parser(curp); 339 if ((curp->man->meta.macroset == MACROSET_MDOC ? 340 mdoc_parseln(curp->man, curp->line, ln.buf, of) : 341 man_parseln(curp->man, curp->line, ln.buf, of) 342 ) == 2) 343 goto out; 344 break; 345 case ROFF_RERUN: 346 goto rerun; 347 case ROFF_REPARSE: 348 if (++curp->reparse_count > REPARSE_LIMIT) { 349 /* Abort and return to the top level. */ 350 result = ROFF_IGN; 351 mandoc_msg(MANDOCERR_ROFFLOOP, 352 curp->line, pos, NULL); 353 goto out; 354 } 355 result = mparse_buf_r(curp, ln, of, 0); 356 if (line_result & ROFF_USERCALL) { 357 roff_userret(curp->roff); 358 /* Continue normally. */ 359 if (result & ROFF_USERRET) 360 result = ROFF_CONT; 361 } 362 if (start == 0 && result != ROFF_CONT) 363 goto out; 364 break; 365 case ROFF_SO: 366 if ( ! (curp->options & MPARSE_SO) && 367 (i >= blk.sz || blk.buf[i] == '\0')) { 368 curp->man->meta.sodest = 369 mandoc_strdup(ln.buf + of); 370 goto out; 371 } 372 if ((fd = mparse_open(curp, ln.buf + of)) != -1) { 373 mparse_readfd(curp, fd, ln.buf + of); 374 close(fd); 375 } else { 376 mandoc_msg(MANDOCERR_SO_FAIL, 377 curp->line, of, ".so %s: %s", 378 ln.buf + of, strerror(errno)); 379 ln.sz = mandoc_asprintf(&cp, 380 ".sp\nSee the file %s.\n.sp", 381 ln.buf + of); 382 free(ln.buf); 383 ln.buf = cp; 384 of = 0; 385 mparse_buf_r(curp, ln, of, 0); 386 } 387 break; 388 default: 389 abort(); 390 } 391 392 /* Start the next input line. */ 393 394 if (loop != NULL && 395 (line_result & ROFF_LOOPMASK) == ROFF_IGN) 396 loop = loop->next; 397 398 if (loop != NULL) { 399 if ((line_result & ROFF_APPEND) == 0) 400 *ln.buf = '\0'; 401 if (ln.sz < loop->sz) 402 resize_buf(&ln, loop->sz); 403 (void)strlcat(ln.buf, loop->buf, ln.sz); 404 of = 0; 405 goto rerun; 406 } 407 408 pos = (line_result & ROFF_APPEND) ? strlen(ln.buf) : 0; 409 } 410out: 411 if (inloop) { 412 if (result != ROFF_USERRET) 413 mandoc_msg(MANDOCERR_WHILE_OUTOF, 414 curp->line, pos, NULL); 415 curp->loop = NULL; 416 } 417 free(ln.buf); 418 if (firstln != curp->secondary) 419 free_buf_list(firstln); 420 return result; 421} 422 423static int 424read_whole_file(struct mparse *curp, int fd, struct buf *fb, int *with_mmap) 425{ 426 struct stat st; 427 gzFile gz; 428 size_t off; 429 ssize_t ssz; 430 int gzerrnum, retval; 431 432 if (fstat(fd, &st) == -1) { 433 mandoc_msg(MANDOCERR_FILE, 0, 0, 434 "fstat: %s", strerror(errno)); 435 return 0; 436 } 437 438 /* 439 * If we're a regular file, try just reading in the whole entry 440 * via mmap(). This is faster than reading it into blocks, and 441 * since each file is only a few bytes to begin with, I'm not 442 * concerned that this is going to tank any machines. 443 */ 444 445 if (curp->gzip == 0 && S_ISREG(st.st_mode)) { 446 if (st.st_size > 0x7fffffff) { 447 mandoc_msg(MANDOCERR_TOOLARGE, 0, 0, NULL); 448 return 0; 449 } 450 *with_mmap = 1; 451 fb->sz = (size_t)st.st_size; 452 fb->buf = mmap(NULL, fb->sz, PROT_READ, MAP_SHARED, fd, 0); 453 if (fb->buf != MAP_FAILED) 454 return 1; 455 } 456 457 if (curp->gzip) { 458 /* 459 * Duplicating the file descriptor is required 460 * because we will have to call gzclose(3) 461 * to free memory used internally by zlib, 462 * but that will also close the file descriptor, 463 * which this function must not do. 464 */ 465 if ((fd = dup(fd)) == -1) { 466 mandoc_msg(MANDOCERR_FILE, 0, 0, 467 "dup: %s", strerror(errno)); 468 return 0; 469 } 470 if ((gz = gzdopen(fd, "rb")) == NULL) { 471 mandoc_msg(MANDOCERR_FILE, 0, 0, 472 "gzdopen: %s", strerror(errno)); 473 close(fd); 474 return 0; 475 } 476 } else 477 gz = NULL; 478 479 /* 480 * If this isn't a regular file (like, say, stdin), then we must 481 * go the old way and just read things in bit by bit. 482 */ 483 484 *with_mmap = 0; 485 off = 0; 486 retval = 0; 487 fb->sz = 0; 488 fb->buf = NULL; 489 for (;;) { 490 if (off == fb->sz) { 491 if (fb->sz == (1U << 31)) { 492 mandoc_msg(MANDOCERR_TOOLARGE, 0, 0, NULL); 493 break; 494 } 495 resize_buf(fb, 65536); 496 } 497 ssz = curp->gzip ? 498 gzread(gz, fb->buf + (int)off, fb->sz - off) : 499 read(fd, fb->buf + (int)off, fb->sz - off); 500 if (ssz == 0) { 501 fb->sz = off; 502 retval = 1; 503 break; 504 } 505 if (ssz == -1) { 506 if (curp->gzip) 507 (void)gzerror(gz, &gzerrnum); 508 mandoc_msg(MANDOCERR_FILE, 0, 0, "read: %s", 509 curp->gzip && gzerrnum != Z_ERRNO ? 510 zError(gzerrnum) : strerror(errno)); 511 break; 512 } 513 off += (size_t)ssz; 514 } 515 516 if (curp->gzip && (gzerrnum = gzclose(gz)) != Z_OK) 517 mandoc_msg(MANDOCERR_FILE, 0, 0, "gzclose: %s", 518 gzerrnum == Z_ERRNO ? strerror(errno) : 519 zError(gzerrnum)); 520 if (retval == 0) { 521 free(fb->buf); 522 fb->buf = NULL; 523 } 524 return retval; 525} 526 527static void 528mparse_end(struct mparse *curp) 529{ 530 if (curp->man->meta.macroset == MACROSET_NONE) 531 curp->man->meta.macroset = MACROSET_MAN; 532 if (curp->man->meta.macroset == MACROSET_MDOC) 533 mdoc_endparse(curp->man); 534 else 535 man_endparse(curp->man); 536 roff_endparse(curp->roff); 537} 538 539/* 540 * Read the whole file into memory and call the parsers. 541 * Called recursively when an .so request is encountered. 542 */ 543void 544mparse_readfd(struct mparse *curp, int fd, const char *filename) 545{ 546 static int recursion_depth; 547 548 struct buf blk; 549 struct buf *save_primary; 550 const char *save_filename; 551 size_t offset; 552 int save_filenc, save_lineno; 553 int with_mmap; 554 555 if (recursion_depth > 64) { 556 mandoc_msg(MANDOCERR_ROFFLOOP, curp->line, 0, NULL); 557 return; 558 } 559 if (read_whole_file(curp, fd, &blk, &with_mmap) == 0) 560 return; 561 562 /* 563 * Save some properties of the parent file. 564 */ 565 566 save_primary = curp->primary; 567 save_filenc = curp->filenc; 568 save_lineno = curp->line; 569 save_filename = mandoc_msg_getinfilename(); 570 571 curp->primary = &blk; 572 curp->filenc = curp->options & (MPARSE_UTF8 | MPARSE_LATIN1); 573 curp->line = 1; 574 mandoc_msg_setinfilename(filename); 575 576 /* Skip an UTF-8 byte order mark. */ 577 if (curp->filenc & MPARSE_UTF8 && blk.sz > 2 && 578 (unsigned char)blk.buf[0] == 0xef && 579 (unsigned char)blk.buf[1] == 0xbb && 580 (unsigned char)blk.buf[2] == 0xbf) { 581 offset = 3; 582 curp->filenc &= ~MPARSE_LATIN1; 583 } else 584 offset = 0; 585 586 recursion_depth++; 587 mparse_buf_r(curp, blk, offset, 1); 588 if (--recursion_depth == 0) 589 mparse_end(curp); 590 591 /* 592 * Clean up and restore saved parent properties. 593 */ 594 595 if (with_mmap) 596 munmap(blk.buf, blk.sz); 597 else 598 free(blk.buf); 599 600 curp->primary = save_primary; 601 curp->filenc = save_filenc; 602 curp->line = save_lineno; 603 if (save_filename != NULL) 604 mandoc_msg_setinfilename(save_filename); 605} 606 607int 608mparse_open(struct mparse *curp, const char *file) 609{ 610 char *cp; 611 int fd, save_errno; 612 613 cp = strrchr(file, '.'); 614 curp->gzip = (cp != NULL && ! strcmp(cp + 1, "gz")); 615 616 /* First try to use the filename as it is. */ 617 618 if ((fd = open(file, O_RDONLY)) != -1) 619 return fd; 620 621 /* 622 * If that doesn't work and the filename doesn't 623 * already end in .gz, try appending .gz. 624 */ 625 626 if ( ! curp->gzip) { 627 save_errno = errno; 628 mandoc_asprintf(&cp, "%s.gz", file); 629 fd = open(cp, O_RDONLY); 630 free(cp); 631 errno = save_errno; 632 if (fd != -1) { 633 curp->gzip = 1; 634 return fd; 635 } 636 } 637 638 /* Neither worked, give up. */ 639 640 return -1; 641} 642 643struct mparse * 644mparse_alloc(int options, enum mandoc_os os_e, const char *os_s) 645{ 646 struct mparse *curp; 647 648 curp = mandoc_calloc(1, sizeof(struct mparse)); 649 650 curp->options = options; 651 curp->os_s = os_s; 652 653 curp->roff = roff_alloc(options); 654 curp->man = roff_man_alloc(curp->roff, curp->os_s, 655 curp->options & MPARSE_QUICK ? 1 : 0); 656 if (curp->options & MPARSE_MDOC) { 657 curp->man->meta.macroset = MACROSET_MDOC; 658 if (curp->man->mdocmac == NULL) 659 curp->man->mdocmac = roffhash_alloc(MDOC_Dd, MDOC_MAX); 660 } else if (curp->options & MPARSE_MAN) { 661 curp->man->meta.macroset = MACROSET_MAN; 662 if (curp->man->manmac == NULL) 663 curp->man->manmac = roffhash_alloc(MAN_TH, MAN_MAX); 664 } 665 curp->man->meta.first->tok = TOKEN_NONE; 666 curp->man->meta.os_e = os_e; 667 return curp; 668} 669 670void 671mparse_reset(struct mparse *curp) 672{ 673 roff_reset(curp->roff); 674 roff_man_reset(curp->man); 675 free_buf_list(curp->secondary); 676 curp->secondary = NULL; 677 curp->gzip = 0; 678} 679 680void 681mparse_free(struct mparse *curp) 682{ 683 roffhash_free(curp->man->mdocmac); 684 roffhash_free(curp->man->manmac); 685 roff_man_free(curp->man); 686 roff_free(curp->roff); 687 free_buf_list(curp->secondary); 688 free(curp); 689} 690 691struct roff_meta * 692mparse_result(struct mparse *curp) 693{ 694 roff_state_reset(curp->man); 695 if (curp->options & MPARSE_VALIDATE) { 696 if (curp->man->meta.macroset == MACROSET_MDOC) 697 mdoc_validate(curp->man); 698 else 699 man_validate(curp->man); 700 } 701 return &curp->man->meta; 702} 703 704void 705mparse_copy(const struct mparse *p) 706{ 707 struct buf *buf; 708 709 for (buf = p->secondary; buf != NULL; buf = buf->next) 710 puts(buf->buf); 711} 712