preconv.c (274888) | preconv.c (275432) |
---|---|
1/* $Id: preconv.c,v 1.6 2013/06/02 03:52:21 schwarze Exp $ */ | 1/* $Id: preconv.c,v 1.12 2014/11/14 04:24:04 schwarze Exp $ */ |
2/* 3 * Copyright (c) 2011 Kristaps Dzonsons <kristaps@bsd.lv> | 2/* 3 * Copyright (c) 2011 Kristaps Dzonsons <kristaps@bsd.lv> |
4 * Copyright (c) 2014 Ingo Schwarze <schwarze@openbsd.org> |
|
4 * 5 * Permission to use, copy, modify, and distribute this software for any 6 * purpose with or without fee is hereby granted, provided that the above 7 * copyright notice and this permission notice appear in all copies. 8 * 9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 16 */ | 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ |
17#ifdef HAVE_CONFIG_H | |
18#include "config.h" | 18#include "config.h" |
19#endif | |
20 | 19 |
21#ifdef HAVE_MMAP 22#include <sys/stat.h> 23#include <sys/mman.h> 24#endif | 20#include <sys/types.h> |
25 | 21 |
26#include <assert.h> 27#include <fcntl.h> | |
28#include <stdio.h> | 22#include <stdio.h> |
29#include <stdlib.h> | |
30#include <string.h> | 23#include <string.h> |
31#include <unistd.h> | 24#include "mandoc.h" 25#include "libmandoc.h" |
32 | 26 |
33/* 34 * The read_whole_file() and resize_buf() functions are copied from 35 * read.c, including all dependency code. 36 */ 37 38enum enc { 39 ENC_UTF_8, /* UTF-8 */ 40 ENC_US_ASCII, /* US-ASCII */ 41 ENC_LATIN_1, /* Latin-1 */ 42 ENC__MAX 43}; 44 45struct buf { 46 char *buf; /* binary input buffer */ 47 size_t sz; /* size of binary buffer */ 48 size_t offs; /* starting buffer offset */ 49}; 50 51struct encode { 52 const char *name; 53 int (*conv)(const struct buf *); 54}; 55 56static int cue_enc(const struct buf *, size_t *, enum enc *); 57static int conv_latin_1(const struct buf *); 58static int conv_us_ascii(const struct buf *); 59static int conv_utf_8(const struct buf *); 60static int read_whole_file(const char *, int, 61 struct buf *, int *); 62static void resize_buf(struct buf *, size_t); 63static void usage(void); 64 65static const struct encode encs[ENC__MAX] = { 66 { "utf-8", conv_utf_8 }, /* ENC_UTF_8 */ 67 { "us-ascii", conv_us_ascii }, /* ENC_US_ASCII */ 68 { "latin-1", conv_latin_1 }, /* ENC_LATIN_1 */ 69}; 70 71static const char *progname; 72 73static void 74usage(void) | 27int 28preconv_encode(struct buf *ib, size_t *ii, struct buf *ob, size_t *oi, 29 int *filenc) |
75{ | 30{ |
76 77 fprintf(stderr, "usage: %s " 78 "[-D enc] " 79 "[-e ENC] " 80 "[file]\n", progname); 81} 82 83static int 84conv_latin_1(const struct buf *b) 85{ | |
86 size_t i; | 31 size_t i; |
87 unsigned char cu; 88 const char *cp; 89 90 cp = b->buf + (int)b->offs; 91 92 /* 93 * Latin-1 falls into the first 256 code-points of Unicode, so 94 * there's no need for any sort of translation. Just make the 95 * 8-bit characters use the Unicode escape. 96 * Note that binary values 128 < v < 160 are passed through 97 * unmodified to mandoc. 98 */ 99 100 for (i = b->offs; i < b->sz; i++) { 101 cu = (unsigned char)*cp++; 102 cu < 160U ? putchar(cu) : printf("\\[u%.4X]", cu); 103 } 104 105 return(1); 106} 107 108static int 109conv_us_ascii(const struct buf *b) 110{ 111 112 /* 113 * US-ASCII has no conversion since it falls into the first 128 114 * bytes of Unicode. 115 */ 116 117 fwrite(b->buf, 1, b->sz, stdout); 118 return(1); 119} 120 121static int 122conv_utf_8(const struct buf *b) 123{ 124 int state, be; | 32 int state; |
125 unsigned int accum; | 33 unsigned int accum; |
126 size_t i; | |
127 unsigned char cu; | 34 unsigned char cu; |
128 const char *cp; 129 const long one = 1L; | |
130 | 35 |
131 cp = b->buf + (int)b->offs; | 36 if ( ! (*filenc & MPARSE_UTF8)) 37 goto latin; 38 |
132 state = 0; 133 accum = 0U; | 39 state = 0; 40 accum = 0U; |
134 be = 0; | |
135 | 41 |
136 /* Quick test for big-endian value. */ 137 138 if ( ! (*((const char *)(&one)))) 139 be = 1; 140 141 for (i = b->offs; i < b->sz; i++) { 142 cu = (unsigned char)*cp++; | 42 for (i = *ii; i < ib->sz; i++) { 43 cu = ib->buf[i]; |
143 if (state) { 144 if ( ! (cu & 128) || (cu & 64)) { 145 /* Bad sequence header. */ | 44 if (state) { 45 if ( ! (cu & 128) || (cu & 64)) { 46 /* Bad sequence header. */ |
146 return(0); | 47 break; |
147 } 148 149 /* Accept only legitimate bit patterns. */ 150 151 if (cu > 191 || cu < 128) { 152 /* Bad in-sequence bits. */ | 48 } 49 50 /* Accept only legitimate bit patterns. */ 51 52 if (cu > 191 || cu < 128) { 53 /* Bad in-sequence bits. */ |
153 return(0); | 54 break; |
154 } 155 156 accum |= (cu & 63) << --state * 6; 157 | 55 } 56 57 accum |= (cu & 63) << --state * 6; 58 |
158 /* 159 * Accum is held in little-endian order as 160 * stipulated by the UTF-8 sequence coding. We 161 * need to convert to a native big-endian if our 162 * architecture requires it. 163 */ | 59 if (state) 60 continue; |
164 | 61 |
165 if (0 == state && be) 166 accum = (accum >> 24) | 167 ((accum << 8) & 0x00FF0000) | 168 ((accum >> 8) & 0x0000FF00) | 169 (accum << 24); 170 171 if (0 == state) { 172 accum < 128U ? putchar(accum) : 173 printf("\\[u%.4X]", accum); 174 accum = 0U; 175 } 176 } else if (cu & (1 << 7)) { | 62 if (accum < 0x80) 63 ob->buf[(*oi)++] = accum; 64 else 65 *oi += snprintf(ob->buf + *oi, 66 11, "\\[u%.4X]", accum); 67 *ii = i + 1; 68 *filenc &= ~MPARSE_LATIN1; 69 return(1); 70 } else { |
177 /* 178 * Entering a UTF-8 state: if we encounter a 179 * UTF-8 bitmask, calculate the expected UTF-8 180 * state from it. 181 */ | 71 /* 72 * Entering a UTF-8 state: if we encounter a 73 * UTF-8 bitmask, calculate the expected UTF-8 74 * state from it. 75 */ |
182 for (state = 0; state < 7; state++) | 76 for (state = 0; state < 7; state++) |
183 if ( ! (cu & (1 << (7 - state)))) 184 break; 185 186 /* Accept only legitimate bit patterns. */ 187 | 77 if ( ! (cu & (1 << (7 - state)))) 78 break; 79 80 /* Accept only legitimate bit patterns. */ 81 |
188 switch (state) { | 82 switch (state--) { |
189 case (4): 190 if (cu <= 244 && cu >= 240) { 191 accum = (cu & 7) << 18; | 83 case (4): 84 if (cu <= 244 && cu >= 240) { 85 accum = (cu & 7) << 18; |
192 break; | 86 continue; |
193 } 194 /* Bad 4-sequence start bits. */ | 87 } 88 /* Bad 4-sequence start bits. */ |
195 return(0); | 89 break; |
196 case (3): 197 if (cu <= 239 && cu >= 224) { 198 accum = (cu & 15) << 12; | 90 case (3): 91 if (cu <= 239 && cu >= 224) { 92 accum = (cu & 15) << 12; |
199 break; | 93 continue; |
200 } 201 /* Bad 3-sequence start bits. */ | 94 } 95 /* Bad 3-sequence start bits. */ |
202 return(0); | 96 break; |
203 case (2): 204 if (cu <= 223 && cu >= 194) { 205 accum = (cu & 31) << 6; | 97 case (2): 98 if (cu <= 223 && cu >= 194) { 99 accum = (cu & 31) << 6; |
206 break; | 100 continue; |
207 } 208 /* Bad 2-sequence start bits. */ | 101 } 102 /* Bad 2-sequence start bits. */ |
209 return(0); | 103 break; |
210 default: 211 /* Bad sequence bit mask. */ | 104 default: 105 /* Bad sequence bit mask. */ |
212 return(0); | 106 break; |
213 } | 107 } |
214 state--; 215 } else 216 putchar(cu); | 108 break; 109 } |
217 } 218 | 110 } 111 |
219 if (0 != state) { 220 /* Bad trailing bits. */ 221 return(0); 222 } | 112 /* FALLTHROUGH: Invalid or incomplete UTF-8 sequence. */ |
223 | 113 |
224 return(1); 225} 226 227static void 228resize_buf(struct buf *buf, size_t initial) 229{ 230 231 buf->sz = buf->sz > initial / 2 ? 232 2 * buf->sz : initial; 233 234 buf->buf = realloc(buf->buf, buf->sz); 235 if (NULL == buf->buf) { 236 perror(NULL); 237 exit(EXIT_FAILURE); 238 } 239} 240 241static int 242read_whole_file(const char *f, int fd, 243 struct buf *fb, int *with_mmap) 244{ 245 size_t off; 246 ssize_t ssz; 247 248#ifdef HAVE_MMAP 249 struct stat st; 250 if (-1 == fstat(fd, &st)) { 251 perror(f); | 114latin: 115 if ( ! (*filenc & MPARSE_LATIN1)) |
252 return(0); | 116 return(0); |
253 } | |
254 | 117 |
255 /* 256 * If we're a regular file, try just reading in the whole entry 257 * via mmap(). This is faster than reading it into blocks, and 258 * since each file is only a few bytes to begin with, I'm not 259 * concerned that this is going to tank any machines. 260 */ | 118 *oi += snprintf(ob->buf + *oi, 11, 119 "\\[u%.4X]", (unsigned char)ib->buf[(*ii)++]); |
261 | 120 |
262 if (S_ISREG(st.st_mode) && st.st_size >= (1U << 31)) { 263 fprintf(stderr, "%s: input too large\n", f); 264 return(0); 265 } 266 267 if (S_ISREG(st.st_mode)) { 268 *with_mmap = 1; 269 fb->sz = (size_t)st.st_size; 270 fb->buf = mmap(NULL, fb->sz, PROT_READ, MAP_SHARED, fd, 0); 271 if (fb->buf != MAP_FAILED) 272 return(1); 273 } 274#endif 275 276 /* 277 * If this isn't a regular file (like, say, stdin), then we must 278 * go the old way and just read things in bit by bit. 279 */ 280 281 *with_mmap = 0; 282 off = 0; 283 fb->sz = 0; 284 fb->buf = NULL; 285 for (;;) { 286 if (off == fb->sz && fb->sz == (1U << 31)) { 287 fprintf(stderr, "%s: input too large\n", f); 288 break; 289 } 290 291 if (off == fb->sz) 292 resize_buf(fb, 65536); 293 294 ssz = read(fd, fb->buf + (int)off, fb->sz - off); 295 if (ssz == 0) { 296 fb->sz = off; 297 return(1); 298 } 299 if (ssz == -1) { 300 perror(f); 301 break; 302 } 303 off += (size_t)ssz; 304 } 305 306 free(fb->buf); 307 fb->buf = NULL; 308 return(0); | 121 *filenc &= ~MPARSE_UTF8; 122 return(1); |
309} 310 | 123} 124 |
311static int 312cue_enc(const struct buf *b, size_t *offs, enum enc *enc) | 125int 126preconv_cue(const struct buf *b, size_t offset) |
313{ 314 const char *ln, *eoln, *eoph; | 127{ 128 const char *ln, *eoln, *eoph; |
315 size_t sz, phsz, nsz; 316 int i; | 129 size_t sz, phsz; |
317 | 130 |
318 ln = b->buf + (int)*offs; 319 sz = b->sz - *offs; | 131 ln = b->buf + offset; 132 sz = b->sz - offset; |
320 321 /* Look for the end-of-line. */ 322 323 if (NULL == (eoln = memchr(ln, '\n', sz))) | 133 134 /* Look for the end-of-line. */ 135 136 if (NULL == (eoln = memchr(ln, '\n', sz))) |
324 return(-1); | 137 eoln = ln + sz; |
325 | 138 |
326 /* Set next-line marker. */ 327 328 *offs = (size_t)((eoln + 1) - b->buf); 329 | |
330 /* Check if we have the correct header/trailer. */ 331 | 139 /* Check if we have the correct header/trailer. */ 140 |
332 if ((sz = (size_t)(eoln - ln)) < 10 || 333 memcmp(ln, ".\\\" -*-", 7) || 334 memcmp(eoln - 3, "-*-", 3)) 335 return(0); | 141 if ((sz = (size_t)(eoln - ln)) < 10 || 142 memcmp(ln, ".\\\" -*-", 7) || memcmp(eoln - 3, "-*-", 3)) 143 return(MPARSE_UTF8 | MPARSE_LATIN1); |
336 337 /* Move after the header and adjust for the trailer. */ 338 339 ln += 7; 340 sz -= 10; 341 342 while (sz > 0) { 343 while (sz > 0 && ' ' == *ln) { --- 7 unchanged lines hidden (view full) --- 351 352 if (NULL == (eoph = memchr(ln, ';', sz))) 353 eoph = eoln - 3; 354 else 355 eoph++; 356 357 /* Only account for the "coding" phrase. */ 358 | 144 145 /* Move after the header and adjust for the trailer. */ 146 147 ln += 7; 148 sz -= 10; 149 150 while (sz > 0) { 151 while (sz > 0 && ' ' == *ln) { --- 7 unchanged lines hidden (view full) --- 159 160 if (NULL == (eoph = memchr(ln, ';', sz))) 161 eoph = eoln - 3; 162 else 163 eoph++; 164 165 /* Only account for the "coding" phrase. */ 166 |
359 if ((phsz = (size_t)(eoph - ln)) < 7 || 360 strncasecmp(ln, "coding:", 7)) { | 167 if ((phsz = eoph - ln) < 7 || 168 strncasecmp(ln, "coding:", 7)) { |
361 sz -= phsz; 362 ln += phsz; 363 continue; | 169 sz -= phsz; 170 ln += phsz; 171 continue; |
364 } | 172 } |
365 366 sz -= 7; 367 ln += 7; 368 369 while (sz > 0 && ' ' == *ln) { 370 ln++; 371 sz--; 372 } 373 if (0 == sz) | 173 174 sz -= 7; 175 ln += 7; 176 177 while (sz > 0 && ' ' == *ln) { 178 ln++; 179 sz--; 180 } 181 if (0 == sz) |
374 break; | 182 return(0); |
375 376 /* Check us against known encodings. */ 377 | 183 184 /* Check us against known encodings. */ 185 |
378 for (i = 0; i < (int)ENC__MAX; i++) { 379 nsz = strlen(encs[i].name); 380 if (phsz < nsz) 381 continue; 382 if (strncasecmp(ln, encs[i].name, nsz)) 383 continue; 384 385 *enc = (enum enc)i; 386 return(1); 387 } 388 389 /* Unknown encoding. */ 390 391 *enc = ENC__MAX; 392 return(1); | 186 if (phsz > 4 && !strncasecmp(ln, "utf-8", 5)) 187 return(MPARSE_UTF8); 188 if (phsz > 10 && !strncasecmp(ln, "iso-latin-1", 11)) 189 return(MPARSE_LATIN1); 190 return(0); |
393 } | 191 } |
394 395 return(0); | 192 return(MPARSE_UTF8 | MPARSE_LATIN1); |
396} | 193} |
397 398int 399main(int argc, char *argv[]) 400{ 401 int i, ch, map, fd, rc; 402 struct buf b; 403 const char *fn; 404 enum enc enc, def; 405 unsigned char bom[3] = { 0xEF, 0xBB, 0xBF }; 406 size_t offs; 407 extern int optind; 408 extern char *optarg; 409 410 progname = strrchr(argv[0], '/'); 411 if (progname == NULL) 412 progname = argv[0]; 413 else 414 ++progname; 415 416 fn = "<stdin>"; 417 fd = STDIN_FILENO; 418 rc = EXIT_FAILURE; 419 enc = def = ENC__MAX; 420 map = 0; 421 422 memset(&b, 0, sizeof(struct buf)); 423 424 while (-1 != (ch = getopt(argc, argv, "D:e:rdvh"))) 425 switch (ch) { 426 case ('D'): 427 /* FALLTHROUGH */ 428 case ('e'): 429 for (i = 0; i < (int)ENC__MAX; i++) { 430 if (strcasecmp(optarg, encs[i].name)) 431 continue; 432 break; 433 } 434 if (i < (int)ENC__MAX) { 435 if ('D' == ch) 436 def = (enum enc)i; 437 else 438 enc = (enum enc)i; 439 break; 440 } 441 442 fprintf(stderr, "%s: Bad encoding\n", optarg); 443 return(EXIT_FAILURE); 444 case ('r'): 445 /* FALLTHROUGH */ 446 case ('d'): 447 /* FALLTHROUGH */ 448 case ('v'): 449 /* Compatibility with GNU preconv. */ 450 break; 451 case ('h'): 452 /* Compatibility with GNU preconv. */ 453 /* FALLTHROUGH */ 454 default: 455 usage(); 456 return(EXIT_FAILURE); 457 } 458 459 argc -= optind; 460 argv += optind; 461 462 /* 463 * Open and read the first argument on the command-line. 464 * If we don't have one, we default to stdin. 465 */ 466 467 if (argc > 0) { 468 fn = *argv; 469 fd = open(fn, O_RDONLY, 0); 470 if (-1 == fd) { 471 perror(fn); 472 return(EXIT_FAILURE); 473 } 474 } 475 476 if ( ! read_whole_file(fn, fd, &b, &map)) 477 goto out; 478 479 /* Try to read the UTF-8 BOM. */ 480 481 if (ENC__MAX == enc) 482 if (b.sz > 3 && 0 == memcmp(b.buf, bom, 3)) { 483 b.offs = 3; 484 enc = ENC_UTF_8; 485 } 486 487 /* Try reading from the "-*-" cue. */ 488 489 if (ENC__MAX == enc) { 490 offs = b.offs; 491 ch = cue_enc(&b, &offs, &enc); 492 if (0 == ch) 493 ch = cue_enc(&b, &offs, &enc); 494 } 495 496 /* 497 * No encoding has been detected. 498 * Thus, we either fall into our default encoder, if specified, 499 * or use Latin-1 if all else fails. 500 */ 501 502 if (ENC__MAX == enc) 503 enc = ENC__MAX == def ? ENC_LATIN_1 : def; 504 505 if ( ! (*encs[(int)enc].conv)(&b)) { 506 fprintf(stderr, "%s: Bad encoding\n", fn); 507 goto out; 508 } 509 510 rc = EXIT_SUCCESS; 511out: 512#ifdef HAVE_MMAP 513 if (map) 514 munmap(b.buf, b.sz); 515 else 516#endif 517 free(b.buf); 518 519 if (fd > STDIN_FILENO) 520 close(fd); 521 522 return(rc); 523} | |