1294113Sbapt/* $Id: mdoc.c,v 1.256 2015/10/30 19:04:16 schwarze Exp $ */ 2241675Suqs/* 3241675Suqs * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv> 4279527Sbapt * Copyright (c) 2010, 2012-2015 Ingo Schwarze <schwarze@openbsd.org> 5241675Suqs * 6241675Suqs * Permission to use, copy, modify, and distribute this software for any 7241675Suqs * purpose with or without fee is hereby granted, provided that the above 8241675Suqs * copyright notice and this permission notice appear in all copies. 9241675Suqs * 10294113Sbapt * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES 11241675Suqs * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12294113Sbapt * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR 13241675Suqs * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14241675Suqs * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15241675Suqs * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16241675Suqs * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17241675Suqs */ 18241675Suqs#include "config.h" 19241675Suqs 20241675Suqs#include <sys/types.h> 21241675Suqs 22241675Suqs#include <assert.h> 23274880Sbapt#include <ctype.h> 24241675Suqs#include <stdarg.h> 25241675Suqs#include <stdio.h> 26241675Suqs#include <stdlib.h> 27241675Suqs#include <string.h> 28241675Suqs#include <time.h> 29241675Suqs 30294113Sbapt#include "mandoc_aux.h" 31294113Sbapt#include "mandoc.h" 32294113Sbapt#include "roff.h" 33241675Suqs#include "mdoc.h" 34294113Sbapt#include "libmandoc.h" 35294113Sbapt#include "roff_int.h" 36241675Suqs#include "libmdoc.h" 37241675Suqs 38274880Sbaptconst char *const __mdoc_macronames[MDOC_MAX + 1] = { 39294113Sbapt "text", 40241675Suqs "Ap", "Dd", "Dt", "Os", 41241675Suqs "Sh", "Ss", "Pp", "D1", 42241675Suqs "Dl", "Bd", "Ed", "Bl", 43241675Suqs "El", "It", "Ad", "An", 44241675Suqs "Ar", "Cd", "Cm", "Dv", 45241675Suqs "Er", "Ev", "Ex", "Fa", 46241675Suqs "Fd", "Fl", "Fn", "Ft", 47241675Suqs "Ic", "In", "Li", "Nd", 48241675Suqs "Nm", "Op", "Ot", "Pa", 49241675Suqs "Rv", "St", "Va", "Vt", 50241675Suqs "Xr", "%A", "%B", "%D", 51241675Suqs "%I", "%J", "%N", "%O", 52241675Suqs "%P", "%R", "%T", "%V", 53241675Suqs "Ac", "Ao", "Aq", "At", 54241675Suqs "Bc", "Bf", "Bo", "Bq", 55241675Suqs "Bsx", "Bx", "Db", "Dc", 56241675Suqs "Do", "Dq", "Ec", "Ef", 57241675Suqs "Em", "Eo", "Fx", "Ms", 58241675Suqs "No", "Ns", "Nx", "Ox", 59241675Suqs "Pc", "Pf", "Po", "Pq", 60241675Suqs "Qc", "Ql", "Qo", "Qq", 61241675Suqs "Re", "Rs", "Sc", "So", 62241675Suqs "Sq", "Sm", "Sx", "Sy", 63241675Suqs "Tn", "Ux", "Xc", "Xo", 64241675Suqs "Fo", "Fc", "Oo", "Oc", 65241675Suqs "Bk", "Ek", "Bt", "Hf", 66241675Suqs "Fr", "Ud", "Lb", "Lp", 67241675Suqs "Lk", "Mt", "Brq", "Bro", 68241675Suqs "Brc", "%C", "Es", "En", 69241675Suqs "Dx", "%Q", "br", "sp", 70294113Sbapt "%U", "Ta", "ll", 71294113Sbapt}; 72241675Suqs 73274880Sbaptconst char *const __mdoc_argnames[MDOC_ARG_MAX] = { 74241675Suqs "split", "nosplit", "ragged", 75274880Sbapt "unfilled", "literal", "file", 76274880Sbapt "offset", "bullet", "dash", 77274880Sbapt "hyphen", "item", "enum", 78274880Sbapt "tag", "diag", "hang", 79274880Sbapt "ohang", "inset", "column", 80274880Sbapt "width", "compact", "std", 81241675Suqs "filled", "words", "emphasis", 82241675Suqs "symbolic", "nested", "centered" 83241675Suqs }; 84241675Suqs 85294113Sbaptconst char * const *mdoc_macronames = __mdoc_macronames + 1; 86241675Suqsconst char * const *mdoc_argnames = __mdoc_argnames; 87241675Suqs 88294113Sbaptstatic int mdoc_ptext(struct roff_man *, int, char *, int); 89294113Sbaptstatic int mdoc_pmacro(struct roff_man *, int, char *, int); 90241675Suqs 91274880Sbapt 92241675Suqs/* 93241675Suqs * Main parse routine. Parses a single line -- really just hands off to 94241675Suqs * the macro (mdoc_pmacro()) or text parser (mdoc_ptext()). 95241675Suqs */ 96241675Suqsint 97294113Sbaptmdoc_parseln(struct roff_man *mdoc, int ln, char *buf, int offs) 98241675Suqs{ 99241675Suqs 100294113Sbapt if (mdoc->last->type != ROFFT_EQN || ln > mdoc->last->line) 101275432Sbapt mdoc->flags |= MDOC_NEWLINE; 102241675Suqs 103241675Suqs /* 104241675Suqs * Let the roff nS register switch SYNOPSIS mode early, 105241675Suqs * such that the parser knows at all times 106241675Suqs * whether this mode is on or off. 107241675Suqs * Note that this mode is also switched by the Sh macro. 108241675Suqs */ 109261344Suqs if (roff_getreg(mdoc->roff, "nS")) 110261344Suqs mdoc->flags |= MDOC_SYNOPSIS; 111261344Suqs else 112261344Suqs mdoc->flags &= ~MDOC_SYNOPSIS; 113241675Suqs 114294113Sbapt return roff_getcontrol(mdoc->roff, buf, &offs) ? 115274880Sbapt mdoc_pmacro(mdoc, ln, buf, offs) : 116294113Sbapt mdoc_ptext(mdoc, ln, buf, offs); 117241675Suqs} 118241675Suqs 119275432Sbaptvoid 120241675Suqsmdoc_macro(MACRO_PROT_ARGS) 121241675Suqs{ 122294113Sbapt assert(tok > TOKEN_NONE && tok < MDOC_MAX); 123241675Suqs 124275432Sbapt (*mdoc_macros[tok].fp)(mdoc, tok, line, ppos, pos, buf); 125241675Suqs} 126241675Suqs 127275432Sbaptvoid 128294113Sbaptmdoc_tail_alloc(struct roff_man *mdoc, int line, int pos, int tok) 129241675Suqs{ 130294113Sbapt struct roff_node *p; 131241675Suqs 132294113Sbapt p = roff_node_alloc(mdoc, line, pos, ROFFT_TAIL, tok); 133294113Sbapt roff_node_append(mdoc, p); 134294113Sbapt mdoc->next = ROFF_NEXT_CHILD; 135241675Suqs} 136241675Suqs 137294113Sbaptstruct roff_node * 138294113Sbaptmdoc_endbody_alloc(struct roff_man *mdoc, int line, int pos, int tok, 139294113Sbapt struct roff_node *body, enum mdoc_endbody end) 140241675Suqs{ 141294113Sbapt struct roff_node *p; 142241675Suqs 143279527Sbapt body->flags |= MDOC_ENDED; 144279527Sbapt body->parent->flags |= MDOC_ENDED; 145294113Sbapt p = roff_node_alloc(mdoc, line, pos, ROFFT_BODY, tok); 146279527Sbapt p->body = body; 147261344Suqs p->norm = body->norm; 148241675Suqs p->end = end; 149294113Sbapt roff_node_append(mdoc, p); 150294113Sbapt mdoc->next = ROFF_NEXT_SIBLING; 151294113Sbapt return p; 152241675Suqs} 153241675Suqs 154294113Sbaptstruct roff_node * 155294113Sbaptmdoc_block_alloc(struct roff_man *mdoc, int line, int pos, 156294113Sbapt int tok, struct mdoc_arg *args) 157241675Suqs{ 158294113Sbapt struct roff_node *p; 159241675Suqs 160294113Sbapt p = roff_node_alloc(mdoc, line, pos, ROFFT_BLOCK, tok); 161241675Suqs p->args = args; 162241675Suqs if (p->args) 163241675Suqs (args->refcnt)++; 164241675Suqs 165241675Suqs switch (tok) { 166274880Sbapt case MDOC_Bd: 167274880Sbapt case MDOC_Bf: 168274880Sbapt case MDOC_Bl: 169274880Sbapt case MDOC_En: 170274880Sbapt case MDOC_Rs: 171241675Suqs p->norm = mandoc_calloc(1, sizeof(union mdoc_data)); 172241675Suqs break; 173241675Suqs default: 174241675Suqs break; 175241675Suqs } 176294113Sbapt roff_node_append(mdoc, p); 177294113Sbapt mdoc->next = ROFF_NEXT_CHILD; 178294113Sbapt return p; 179241675Suqs} 180241675Suqs 181275432Sbaptvoid 182294113Sbaptmdoc_elem_alloc(struct roff_man *mdoc, int line, int pos, 183294113Sbapt int tok, struct mdoc_arg *args) 184241675Suqs{ 185294113Sbapt struct roff_node *p; 186241675Suqs 187294113Sbapt p = roff_node_alloc(mdoc, line, pos, ROFFT_ELEM, tok); 188241675Suqs p->args = args; 189241675Suqs if (p->args) 190241675Suqs (args->refcnt)++; 191241675Suqs 192241675Suqs switch (tok) { 193274880Sbapt case MDOC_An: 194241675Suqs p->norm = mandoc_calloc(1, sizeof(union mdoc_data)); 195241675Suqs break; 196241675Suqs default: 197241675Suqs break; 198241675Suqs } 199294113Sbapt roff_node_append(mdoc, p); 200294113Sbapt mdoc->next = ROFF_NEXT_CHILD; 201241675Suqs} 202241675Suqs 203275432Sbaptvoid 204294113Sbaptmdoc_node_relink(struct roff_man *mdoc, struct roff_node *p) 205241675Suqs{ 206241675Suqs 207294113Sbapt roff_node_unlink(mdoc, p); 208294113Sbapt p->prev = p->next = NULL; 209294113Sbapt roff_node_append(mdoc, p); 210241675Suqs} 211241675Suqs 212241675Suqs/* 213241675Suqs * Parse free-form text, that is, a line that does not begin with the 214241675Suqs * control character. 215241675Suqs */ 216241675Suqsstatic int 217294113Sbaptmdoc_ptext(struct roff_man *mdoc, int line, char *buf, int offs) 218241675Suqs{ 219294113Sbapt struct roff_node *n; 220241675Suqs char *c, *ws, *end; 221241675Suqs 222261344Suqs assert(mdoc->last); 223261344Suqs n = mdoc->last; 224241675Suqs 225241675Suqs /* 226241675Suqs * Divert directly to list processing if we're encountering a 227294113Sbapt * columnar ROFFT_BLOCK with or without a prior ROFFT_BLOCK entry 228294113Sbapt * (a ROFFT_BODY means it's already open, in which case we should 229241675Suqs * process within its context in the normal way). 230241675Suqs */ 231241675Suqs 232294113Sbapt if (n->tok == MDOC_Bl && n->type == ROFFT_BODY && 233279527Sbapt n->end == ENDBODY_NOT && n->norm->Bl.type == LIST_column) { 234241675Suqs /* `Bl' is open without any children. */ 235261344Suqs mdoc->flags |= MDOC_FREECOL; 236275432Sbapt mdoc_macro(mdoc, MDOC_It, line, offs, &offs, buf); 237294113Sbapt return 1; 238241675Suqs } 239241675Suqs 240294113Sbapt if (n->tok == MDOC_It && n->type == ROFFT_BLOCK && 241274880Sbapt NULL != n->parent && 242274880Sbapt MDOC_Bl == n->parent->tok && 243274880Sbapt LIST_column == n->parent->norm->Bl.type) { 244241675Suqs /* `Bl' has block-level `It' children. */ 245261344Suqs mdoc->flags |= MDOC_FREECOL; 246275432Sbapt mdoc_macro(mdoc, MDOC_It, line, offs, &offs, buf); 247294113Sbapt return 1; 248241675Suqs } 249241675Suqs 250241675Suqs /* 251241675Suqs * Search for the beginning of unescaped trailing whitespace (ws) 252241675Suqs * and for the first character not to be output (end). 253241675Suqs */ 254241675Suqs 255241675Suqs /* FIXME: replace with strcspn(). */ 256241675Suqs ws = NULL; 257241675Suqs for (c = end = buf + offs; *c; c++) { 258241675Suqs switch (*c) { 259241675Suqs case ' ': 260241675Suqs if (NULL == ws) 261241675Suqs ws = c; 262241675Suqs continue; 263241675Suqs case '\t': 264241675Suqs /* 265241675Suqs * Always warn about trailing tabs, 266241675Suqs * even outside literal context, 267241675Suqs * where they should be put on the next line. 268241675Suqs */ 269241675Suqs if (NULL == ws) 270241675Suqs ws = c; 271241675Suqs /* 272241675Suqs * Strip trailing tabs in literal context only; 273241675Suqs * outside, they affect the next line. 274241675Suqs */ 275261344Suqs if (MDOC_LITERAL & mdoc->flags) 276241675Suqs continue; 277241675Suqs break; 278241675Suqs case '\\': 279241675Suqs /* Skip the escaped character, too, if any. */ 280241675Suqs if (c[1]) 281241675Suqs c++; 282241675Suqs /* FALLTHROUGH */ 283241675Suqs default: 284241675Suqs ws = NULL; 285241675Suqs break; 286241675Suqs } 287241675Suqs end = c + 1; 288241675Suqs } 289241675Suqs *end = '\0'; 290241675Suqs 291241675Suqs if (ws) 292274880Sbapt mandoc_msg(MANDOCERR_SPACE_EOL, mdoc->parse, 293274880Sbapt line, (int)(ws-buf), NULL); 294241675Suqs 295275432Sbapt if (buf[offs] == '\0' && ! (mdoc->flags & MDOC_LITERAL)) { 296274880Sbapt mandoc_msg(MANDOCERR_FI_BLANK, mdoc->parse, 297274880Sbapt line, (int)(c - buf), NULL); 298241675Suqs 299241675Suqs /* 300241675Suqs * Insert a `sp' in the case of a blank line. Technically, 301241675Suqs * blank lines aren't allowed, but enough manuals assume this 302241675Suqs * behaviour that we want to work around it. 303241675Suqs */ 304294113Sbapt roff_elem_alloc(mdoc, line, offs, MDOC_sp); 305294113Sbapt mdoc->last->flags |= MDOC_VALID | MDOC_ENDED; 306294113Sbapt mdoc->next = ROFF_NEXT_SIBLING; 307294113Sbapt return 1; 308241675Suqs } 309241675Suqs 310294113Sbapt roff_word_alloc(mdoc, line, offs, buf+offs); 311241675Suqs 312275432Sbapt if (mdoc->flags & MDOC_LITERAL) 313294113Sbapt return 1; 314241675Suqs 315241675Suqs /* 316241675Suqs * End-of-sentence check. If the last character is an unescaped 317241675Suqs * EOS character, then flag the node as being the end of a 318241675Suqs * sentence. The front-end will know how to interpret this. 319241675Suqs */ 320241675Suqs 321241675Suqs assert(buf < end); 322241675Suqs 323274880Sbapt if (mandoc_eos(buf+offs, (size_t)(end-buf-offs))) 324261344Suqs mdoc->last->flags |= MDOC_EOS; 325294113Sbapt return 1; 326241675Suqs} 327241675Suqs 328241675Suqs/* 329241675Suqs * Parse a macro line, that is, a line beginning with the control 330241675Suqs * character. 331241675Suqs */ 332241675Suqsstatic int 333294113Sbaptmdoc_pmacro(struct roff_man *mdoc, int ln, char *buf, int offs) 334241675Suqs{ 335294113Sbapt struct roff_node *n; 336275432Sbapt const char *cp; 337294113Sbapt int tok; 338241675Suqs int i, sv; 339241675Suqs char mac[5]; 340241675Suqs 341241675Suqs sv = offs; 342241675Suqs 343274880Sbapt /* 344241675Suqs * Copy the first word into a nil-terminated buffer. 345275432Sbapt * Stop when a space, tab, escape, or eoln is encountered. 346241675Suqs */ 347241675Suqs 348241675Suqs i = 0; 349275432Sbapt while (i < 4 && strchr(" \t\\", buf[offs]) == NULL) 350241675Suqs mac[i++] = buf[offs++]; 351241675Suqs 352241675Suqs mac[i] = '\0'; 353241675Suqs 354294113Sbapt tok = (i > 1 && i < 4) ? mdoc_hash_find(mac) : TOKEN_NONE; 355241675Suqs 356294113Sbapt if (tok == TOKEN_NONE) { 357274880Sbapt mandoc_msg(MANDOCERR_MACRO, mdoc->parse, 358274880Sbapt ln, sv, buf + sv - 1); 359294113Sbapt return 1; 360241675Suqs } 361241675Suqs 362275432Sbapt /* Skip a leading escape sequence or tab. */ 363241675Suqs 364275432Sbapt switch (buf[offs]) { 365275432Sbapt case '\\': 366275432Sbapt cp = buf + offs + 1; 367275432Sbapt mandoc_escape(&cp, NULL, NULL); 368275432Sbapt offs = cp - buf; 369275432Sbapt break; 370275432Sbapt case '\t': 371241675Suqs offs++; 372275432Sbapt break; 373275432Sbapt default: 374275432Sbapt break; 375275432Sbapt } 376241675Suqs 377241675Suqs /* Jump to the next non-whitespace word. */ 378241675Suqs 379241675Suqs while (buf[offs] && ' ' == buf[offs]) 380241675Suqs offs++; 381241675Suqs 382274880Sbapt /* 383241675Suqs * Trailing whitespace. Note that tabs are allowed to be passed 384241675Suqs * into the parser as "text", so we only warn about spaces here. 385241675Suqs */ 386241675Suqs 387241675Suqs if ('\0' == buf[offs] && ' ' == buf[offs - 1]) 388274880Sbapt mandoc_msg(MANDOCERR_SPACE_EOL, mdoc->parse, 389274880Sbapt ln, offs - 1, NULL); 390241675Suqs 391241675Suqs /* 392241675Suqs * If an initial macro or a list invocation, divert directly 393241675Suqs * into macro processing. 394241675Suqs */ 395241675Suqs 396275432Sbapt if (NULL == mdoc->last || MDOC_It == tok || MDOC_El == tok) { 397275432Sbapt mdoc_macro(mdoc, tok, ln, sv, &offs, buf); 398294113Sbapt return 1; 399275432Sbapt } 400241675Suqs 401261344Suqs n = mdoc->last; 402261344Suqs assert(mdoc->last); 403241675Suqs 404241675Suqs /* 405241675Suqs * If the first macro of a `Bl -column', open an `It' block 406241675Suqs * context around the parsed macro. 407241675Suqs */ 408241675Suqs 409294113Sbapt if (n->tok == MDOC_Bl && n->type == ROFFT_BODY && 410279527Sbapt n->end == ENDBODY_NOT && n->norm->Bl.type == LIST_column) { 411261344Suqs mdoc->flags |= MDOC_FREECOL; 412275432Sbapt mdoc_macro(mdoc, MDOC_It, ln, sv, &sv, buf); 413294113Sbapt return 1; 414241675Suqs } 415241675Suqs 416241675Suqs /* 417241675Suqs * If we're following a block-level `It' within a `Bl -column' 418241675Suqs * context (perhaps opened in the above block or in ptext()), 419241675Suqs * then open an `It' block context around the parsed macro. 420241675Suqs */ 421241675Suqs 422294113Sbapt if (n->tok == MDOC_It && n->type == ROFFT_BLOCK && 423274880Sbapt NULL != n->parent && 424274880Sbapt MDOC_Bl == n->parent->tok && 425274880Sbapt LIST_column == n->parent->norm->Bl.type) { 426261344Suqs mdoc->flags |= MDOC_FREECOL; 427275432Sbapt mdoc_macro(mdoc, MDOC_It, ln, sv, &sv, buf); 428294113Sbapt return 1; 429241675Suqs } 430241675Suqs 431241675Suqs /* Normal processing of a macro. */ 432241675Suqs 433275432Sbapt mdoc_macro(mdoc, tok, ln, sv, &offs, buf); 434241675Suqs 435274880Sbapt /* In quick mode (for mandocdb), abort after the NAME section. */ 436241675Suqs 437274880Sbapt if (mdoc->quick && MDOC_Sh == tok && 438274880Sbapt SEC_NAME != mdoc->last->sec) 439294113Sbapt return 2; 440241675Suqs 441294113Sbapt return 1; 442241675Suqs} 443241675Suqs 444241675Suqsenum mdelim 445241675Suqsmdoc_isdelim(const char *p) 446241675Suqs{ 447241675Suqs 448241675Suqs if ('\0' == p[0]) 449294113Sbapt return DELIM_NONE; 450241675Suqs 451241675Suqs if ('\0' == p[1]) 452241675Suqs switch (p[0]) { 453274880Sbapt case '(': 454274880Sbapt case '[': 455294113Sbapt return DELIM_OPEN; 456274880Sbapt case '|': 457294113Sbapt return DELIM_MIDDLE; 458274880Sbapt case '.': 459274880Sbapt case ',': 460274880Sbapt case ';': 461274880Sbapt case ':': 462274880Sbapt case '?': 463274880Sbapt case '!': 464274880Sbapt case ')': 465274880Sbapt case ']': 466294113Sbapt return DELIM_CLOSE; 467241675Suqs default: 468294113Sbapt return DELIM_NONE; 469241675Suqs } 470241675Suqs 471241675Suqs if ('\\' != p[0]) 472294113Sbapt return DELIM_NONE; 473241675Suqs 474241675Suqs if (0 == strcmp(p + 1, ".")) 475294113Sbapt return DELIM_CLOSE; 476261344Suqs if (0 == strcmp(p + 1, "fR|\\fP")) 477294113Sbapt return DELIM_MIDDLE; 478241675Suqs 479294113Sbapt return DELIM_NONE; 480241675Suqs} 481274880Sbapt 482274880Sbaptvoid 483294113Sbaptmdoc_validate(struct roff_man *mdoc) 484274880Sbapt{ 485274880Sbapt 486294113Sbapt mdoc->last = mdoc->first; 487294113Sbapt mdoc_node_validate(mdoc); 488294113Sbapt mdoc_state_reset(mdoc); 489274880Sbapt} 490