Cross Reference: /freebsd-11-stable/contrib/mdocml/preconv.c

Deleted Added

sdiff udiff text old ( 274888 ) new ( 275432 )

full compact

preconv.c (274888)	preconv.c (275432)
1/* $Id: preconv.c,v 1.6 2013/06/02 03:52:21 schwarze Exp $ */	1/* $Id: preconv.c,v 1.12 2014/11/14 04:24:04 schwarze Exp $ */
2/* 3 * Copyright (c) 2011 Kristaps Dzonsons <kristaps@bsd.lv>	2/* 3 * Copyright (c) 2011 Kristaps Dzonsons <kristaps@bsd.lv>
	4 * Copyright (c) 2014 Ingo Schwarze <schwarze@openbsd.org>
4 * 5 * Permission to use, copy, modify, and distribute this software for any 6 * purpose with or without fee is hereby granted, provided that the above 7 * copyright notice and this permission notice appear in all copies. 8 * 9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 16 */	5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */
17#ifdef HAVE_CONFIG_H
18#include "config.h"	18#include "config.h"
19#endif
20	19
21#ifdef HAVE_MMAP 22#include <sys/stat.h> 23#include <sys/mman.h> 24#endif	20#include <sys/types.h>
25	21
26#include <assert.h> 27#include <fcntl.h>
28#include <stdio.h>	22#include <stdio.h>
29#include <stdlib.h>
30#include <string.h>	23#include <string.h>
31#include <unistd.h>	24#include "mandoc.h" 25#include "libmandoc.h"
32	26
33/* 34 * The read_whole_file() and resize_buf() functions are copied from 35 * read.c, including all dependency code. 36 / 37 38enum enc { 39 ENC_UTF_8, / UTF-8 / 40 ENC_US_ASCII, / US-ASCII / 41 ENC_LATIN_1, / Latin-1 / 42 ENC__MAX 43}; 44 45struct buf { 46 char buf; /* binary input buffer / 47 size_t sz; / size of binary buffer / 48 size_t offs; / starting buffer offset / 49}; 50 51struct encode { 52 const char name; 53 int (conv)(const struct buf ); 54}; 55 56static int cue_enc(const struct buf , size_t , enum enc ); 57static int conv_latin_1(const struct buf ); 58static int conv_us_ascii(const struct buf ); 59static int conv_utf_8(const struct buf ); 60static int read_whole_file(const char , int, 61 struct buf , int ); 62static void resize_buf(struct buf , size_t); 63static void usage(void); 64 65static const struct encode encs[ENC__MAX] = { 66 { "utf-8", conv_utf_8 }, /* ENC_UTF_8 / 67 { "us-ascii", conv_us_ascii }, / ENC_US_ASCII / 68 { "latin-1", conv_latin_1 }, / ENC_LATIN_1 / 69}; 70 71static const char progname; 72 73static void 74usage(void)	27int 28preconv_encode(struct buf ib, size_t ii, struct buf ob, size_t oi, 29 int *filenc)
75{	30{
76 77 fprintf(stderr, "usage: %s " 78 "[-D enc] " 79 "[-e ENC] " 80 "[file]\n", progname); 81} 82 83static int 84conv_latin_1(const struct buf *b) 85{
86 size_t i;	31 size_t i;
87 unsigned char cu; 88 const char cp; 89 90 cp = b->buf + (int)b->offs; 91 92 / 93 * Latin-1 falls into the first 256 code-points of Unicode, so 94 * there's no need for any sort of translation. Just make the 95 * 8-bit characters use the Unicode escape. 96 * Note that binary values 128 < v < 160 are passed through 97 * unmodified to mandoc. 98 / 99 100* for (i = b->offs; i < b->sz; i++) { 101 cu = (unsigned char)cp++; 102* cu < 160U ? putchar(cu) : printf("\\[u%.4X]", cu); 103 } 104 105 return(1); 106} 107 108static int 109conv_us_ascii(const struct buf b) 110{ 111* 112 /* 113 * US-ASCII has no conversion since it falls into the first 128 114 * bytes of Unicode. 115 / 116* 117 fwrite(b->buf, 1, b->sz, stdout); 118 return(1); 119} 120 121static int 122conv_utf_8(const struct buf b) 123{ 124* int state, be;	32 int state;
125 unsigned int accum;	33 unsigned int accum;
126 size_t i;
127 unsigned char cu;	34 unsigned char cu;
128 const char cp; 129* const long one = 1L;
130	35
131 cp = b->buf + (int)b->offs;	36 if ( ! (*filenc & MPARSE_UTF8)) 37 goto latin; 38
132 state = 0; 133 accum = 0U;	39 state = 0; 40 accum = 0U;
134 be = 0;
135	41
136 /* Quick test for big-endian value. / 137* 138 if ( ! (((const char )(&one)))) 139 be = 1; 140 141 for (i = b->offs; i < b->sz; i++) { 142 cu = (unsigned char)*cp++;	42 for (i = *ii; i < ib->sz; i++) { 43 cu = ib->buf[i];
143 if (state) { 144 if ( ! (cu & 128) \|\| (cu & 64)) { 145 /* Bad sequence header. */	44 if (state) { 45 if ( ! (cu & 128) \|\| (cu & 64)) { 46 /* Bad sequence header. */
146 return(0);	47 break;
147 } 148 149 /* Accept only legitimate bit patterns. / 150* 151 if (cu > 191 \|\| cu < 128) { 152 /* Bad in-sequence bits. */	48 } 49 50 /* Accept only legitimate bit patterns. / 51 52 if (cu > 191 \|\| cu < 128) { 53 / Bad in-sequence bits. */
153 return(0);	54 break;
154 } 155 156 accum \|= (cu & 63) << --state * 6; 157	55 } 56 57 accum \|= (cu & 63) << --state * 6; 58
158 /* 159 * Accum is held in little-endian order as 160 * stipulated by the UTF-8 sequence coding. We 161 * need to convert to a native big-endian if our 162 * architecture requires it. 163 */	59 if (state) 60 continue;
164	61
165 if (0 == state && be) 166 accum = (accum >> 24) \| 167 ((accum << 8) & 0x00FF0000) \| 168 ((accum >> 8) & 0x0000FF00) \| 169 (accum << 24); 170 171 if (0 == state) { 172 accum < 128U ? putchar(accum) : 173 printf("\\[u%.4X]", accum); 174 accum = 0U; 175 } 176 } else if (cu & (1 << 7)) {	62 if (accum < 0x80) 63 ob->buf[(oi)++] = accum; 64 else 65 oi += snprintf(ob->buf + oi, 66 11, "\\[u%.4X]", accum); 67 ii = i + 1; 68 *filenc &= ~MPARSE_LATIN1; 69 return(1); 70 } else {
177 /* 178 * Entering a UTF-8 state: if we encounter a 179 * UTF-8 bitmask, calculate the expected UTF-8 180 * state from it. 181 */	71 /* 72 * Entering a UTF-8 state: if we encounter a 73 * UTF-8 bitmask, calculate the expected UTF-8 74 * state from it. 75 */
182 for (state = 0; state < 7; state++)	76 for (state = 0; state < 7; state++)
183 if ( ! (cu & (1 << (7 - state)))) 184 break; 185 186 /* Accept only legitimate bit patterns. / 187*	77 if ( ! (cu & (1 << (7 - state)))) 78 break; 79 80 /* Accept only legitimate bit patterns. */ 81
188 switch (state) {	82 switch (state--) {
189 case (4): 190 if (cu <= 244 && cu >= 240) { 191 accum = (cu & 7) << 18;	83 case (4): 84 if (cu <= 244 && cu >= 240) { 85 accum = (cu & 7) << 18;
192 break;	86 continue;
193 } 194 /* Bad 4-sequence start bits. */	87 } 88 /* Bad 4-sequence start bits. */
195 return(0);	89 break;
196 case (3): 197 if (cu <= 239 && cu >= 224) { 198 accum = (cu & 15) << 12;	90 case (3): 91 if (cu <= 239 && cu >= 224) { 92 accum = (cu & 15) << 12;
199 break;	93 continue;
200 } 201 /* Bad 3-sequence start bits. */	94 } 95 /* Bad 3-sequence start bits. */
202 return(0);	96 break;
203 case (2): 204 if (cu <= 223 && cu >= 194) { 205 accum = (cu & 31) << 6;	97 case (2): 98 if (cu <= 223 && cu >= 194) { 99 accum = (cu & 31) << 6;
206 break;	100 continue;
207 } 208 /* Bad 2-sequence start bits. */	101 } 102 /* Bad 2-sequence start bits. */
209 return(0);	103 break;
210 default: 211 /* Bad sequence bit mask. */	104 default: 105 /* Bad sequence bit mask. */
212 return(0);	106 break;
213 }	107 }
214 state--; 215 } else 216 putchar(cu);	108 break; 109 }
217 } 218	110 } 111
219 if (0 != state) { 220 /* Bad trailing bits. / 221* return(0); 222 }	112 /* FALLTHROUGH: Invalid or incomplete UTF-8 sequence. */
223	113
224 return(1); 225} 226 227static void 228resize_buf(struct buf buf, size_t initial) 229{ 230* 231 buf->sz = buf->sz > initial / 2 ? 232 2 * buf->sz : initial; 233 234 buf->buf = realloc(buf->buf, buf->sz); 235 if (NULL == buf->buf) { 236 perror(NULL); 237 exit(EXIT_FAILURE); 238 } 239} 240 241static int 242read_whole_file(const char f, int fd, 243* struct buf fb, int with_mmap) 244{ 245 size_t off; 246 ssize_t ssz; 247 248#ifdef HAVE_MMAP 249 struct stat st; 250 if (-1 == fstat(fd, &st)) { 251 perror(f);	114latin: 115 if ( ! (*filenc & MPARSE_LATIN1))
252 return(0);	116 return(0);
253 }
254	117
255 /* 256 * If we're a regular file, try just reading in the whole entry 257 * via mmap(). This is faster than reading it into blocks, and 258 * since each file is only a few bytes to begin with, I'm not 259 * concerned that this is going to tank any machines. 260 */	118 oi += snprintf(ob->buf + oi, 11, 119 "\\[u%.4X]", (unsigned char)ib->buf[(*ii)++]);
261	120
262 if (S_ISREG(st.st_mode) && st.st_size >= (1U << 31)) { 263 fprintf(stderr, "%s: input too large\n", f); 264 return(0); 265 } 266 267 if (S_ISREG(st.st_mode)) { 268 with_mmap = 1; 269* fb->sz = (size_t)st.st_size; 270 fb->buf = mmap(NULL, fb->sz, PROT_READ, MAP_SHARED, fd, 0); 271 if (fb->buf != MAP_FAILED) 272 return(1); 273 } 274#endif 275 276 /* 277 * If this isn't a regular file (like, say, stdin), then we must 278 * go the old way and just read things in bit by bit. 279 / 280* 281 with_mmap = 0; 282* off = 0; 283 fb->sz = 0; 284 fb->buf = NULL; 285 for (;;) { 286 if (off == fb->sz && fb->sz == (1U << 31)) { 287 fprintf(stderr, "%s: input too large\n", f); 288 break; 289 } 290 291 if (off == fb->sz) 292 resize_buf(fb, 65536); 293 294 ssz = read(fd, fb->buf + (int)off, fb->sz - off); 295 if (ssz == 0) { 296 fb->sz = off; 297 return(1); 298 } 299 if (ssz == -1) { 300 perror(f); 301 break; 302 } 303 off += (size_t)ssz; 304 } 305 306 free(fb->buf); 307 fb->buf = NULL; 308 return(0);	121 filenc &= ~MPARSE_UTF8; 122* return(1);
309} 310	123} 124
311static int 312cue_enc(const struct buf b, size_t offs, enum enc *enc)	125int 126preconv_cue(const struct buf *b, size_t offset)
313{ 314 const char ln, eoln, *eoph;	127{ 128 const char ln, eoln, *eoph;
315 size_t sz, phsz, nsz; 316 int i;	129 size_t sz, phsz;
317	130
318 ln = b->buf + (int)offs; 319* sz = b->sz - *offs;	131 ln = b->buf + offset; 132 sz = b->sz - offset;
320 321 /* Look for the end-of-line. / 322* 323 if (NULL == (eoln = memchr(ln, '\n', sz)))	133 134 /* Look for the end-of-line. / 135* 136 if (NULL == (eoln = memchr(ln, '\n', sz)))
324 return(-1);	137 eoln = ln + sz;
325	138
326 /* Set next-line marker. / 327* 328 offs = (size_t)((eoln + 1) - b->buf); 329*
330 /* Check if we have the correct header/trailer. / 331*	139 /* Check if we have the correct header/trailer. / 140*
332 if ((sz = (size_t)(eoln - ln)) < 10 \|\| 333 memcmp(ln, ".\\\" --", 7) \|\| 334* memcmp(eoln - 3, "--", 3)) 335* return(0);	141 if ((sz = (size_t)(eoln - ln)) < 10 \|\| 142 memcmp(ln, ".\\\" --", 7) \|\| memcmp(eoln - 3, "--", 3)) 143 return(MPARSE_UTF8 \| MPARSE_LATIN1);
336 337 /* Move after the header and adjust for the trailer. / 338* 339 ln += 7; 340 sz -= 10; 341 342 while (sz > 0) { 343 while (sz > 0 && ' ' == ln) { --- 7 unchanged lines hidden* (view full) --- 351 352 if (NULL == (eoph = memchr(ln, ';', sz))) 353 eoph = eoln - 3; 354 else 355 eoph++; 356 357 /* Only account for the "coding" phrase. / 358*	144 145 /* Move after the header and adjust for the trailer. / 146* 147 ln += 7; 148 sz -= 10; 149 150 while (sz > 0) { 151 while (sz > 0 && ' ' == ln) { --- 7 unchanged lines hidden* (view full) --- 159 160 if (NULL == (eoph = memchr(ln, ';', sz))) 161 eoph = eoln - 3; 162 else 163 eoph++; 164 165 /* Only account for the "coding" phrase. / 166*
359 if ((phsz = (size_t)(eoph - ln)) < 7 \|\| 360 strncasecmp(ln, "coding:", 7)) {	167 if ((phsz = eoph - ln) < 7 \|\| 168 strncasecmp(ln, "coding:", 7)) {
361 sz -= phsz; 362 ln += phsz; 363 continue;	169 sz -= phsz; 170 ln += phsz; 171 continue;
364 }	172 }
365 366 sz -= 7; 367 ln += 7; 368 369 while (sz > 0 && ' ' == ln) { 370* ln++; 371 sz--; 372 } 373 if (0 == sz)	173 174 sz -= 7; 175 ln += 7; 176 177 while (sz > 0 && ' ' == ln) { 178* ln++; 179 sz--; 180 } 181 if (0 == sz)
374 break;	182 return(0);
375 376 /* Check us against known encodings. / 377*	183 184 /* Check us against known encodings. / 185*
378 for (i = 0; i < (int)ENC__MAX; i++) { 379 nsz = strlen(encs[i].name); 380 if (phsz < nsz) 381 continue; 382 if (strncasecmp(ln, encs[i].name, nsz)) 383 continue; 384 385 enc = (enum enc)i; 386* return(1); 387 } 388 389 /* Unknown encoding. / 390* 391 enc = ENC__MAX; 392* return(1);	186 if (phsz > 4 && !strncasecmp(ln, "utf-8", 5)) 187 return(MPARSE_UTF8); 188 if (phsz > 10 && !strncasecmp(ln, "iso-latin-1", 11)) 189 return(MPARSE_LATIN1); 190 return(0);
393 }	191 }
394 395 return(0);	192 return(MPARSE_UTF8 \| MPARSE_LATIN1);
396}	193}
397 398int 399main(int argc, char argv[]) 400{ 401* int i, ch, map, fd, rc; 402 struct buf b; 403 const char fn; 404* enum enc enc, def; 405 unsigned char bom[3] = { 0xEF, 0xBB, 0xBF }; 406 size_t offs; 407 extern int optind; 408 extern char optarg; 409* 410 progname = strrchr(argv[0], '/'); 411 if (progname == NULL) 412 progname = argv[0]; 413 else 414 ++progname; 415 416 fn = "<stdin>"; 417 fd = STDIN_FILENO; 418 rc = EXIT_FAILURE; 419 enc = def = ENC__MAX; 420 map = 0; 421 422 memset(&b, 0, sizeof(struct buf)); 423 424 while (-1 != (ch = getopt(argc, argv, "D:e:rdvh"))) 425 switch (ch) { 426 case ('D'): 427 /* FALLTHROUGH / 428* case ('e'): 429 for (i = 0; i < (int)ENC__MAX; i++) { 430 if (strcasecmp(optarg, encs[i].name)) 431 continue; 432 break; 433 } 434 if (i < (int)ENC__MAX) { 435 if ('D' == ch) 436 def = (enum enc)i; 437 else 438 enc = (enum enc)i; 439 break; 440 } 441 442 fprintf(stderr, "%s: Bad encoding\n", optarg); 443 return(EXIT_FAILURE); 444 case ('r'): 445 /* FALLTHROUGH / 446* case ('d'): 447 /* FALLTHROUGH / 448* case ('v'): 449 /* Compatibility with GNU preconv. / 450* break; 451 case ('h'): 452 /* Compatibility with GNU preconv. / 453* /* FALLTHROUGH / 454* default: 455 usage(); 456 return(EXIT_FAILURE); 457 } 458 459 argc -= optind; 460 argv += optind; 461 462 /* 463 * Open and read the first argument on the command-line. 464 * If we don't have one, we default to stdin. 465 / 466* 467 if (argc > 0) { 468 fn = argv; 469* fd = open(fn, O_RDONLY, 0); 470 if (-1 == fd) { 471 perror(fn); 472 return(EXIT_FAILURE); 473 } 474 } 475 476 if ( ! read_whole_file(fn, fd, &b, &map)) 477 goto out; 478 479 /* Try to read the UTF-8 BOM. / 480* 481 if (ENC__MAX == enc) 482 if (b.sz > 3 && 0 == memcmp(b.buf, bom, 3)) { 483 b.offs = 3; 484 enc = ENC_UTF_8; 485 } 486 487 /* Try reading from the "--" cue. / 488 489 if (ENC__MAX == enc) { 490 offs = b.offs; 491 ch = cue_enc(&b, &offs, &enc); 492 if (0 == ch) 493 ch = cue_enc(&b, &offs, &enc); 494 } 495 496 /* 497 * No encoding has been detected. 498 * Thus, we either fall into our default encoder, if specified, 499 * or use Latin-1 if all else fails. 500 / 501* 502 if (ENC__MAX == enc) 503 enc = ENC__MAX == def ? ENC_LATIN_1 : def; 504 505 if ( ! (encs[(int)enc].conv)(&b)) { 506* fprintf(stderr, "%s: Bad encoding\n", fn); 507 goto out; 508 } 509 510 rc = EXIT_SUCCESS; 511out: 512#ifdef HAVE_MMAP 513 if (map) 514 munmap(b.buf, b.sz); 515 else 516#endif 517 free(b.buf); 518 519 if (fd > STDIN_FILENO) 520 close(fd); 521 522 return(rc); 523}