1/*	$Vendor-Id: preconv.c,v 1.5 2011/07/24 18:15:14 kristaps Exp $ */
2/*
3 * Copyright (c) 2011 Kristaps Dzonsons <kristaps@bsd.lv>
4 *
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 */
17#ifdef HAVE_CONFIG_H
18#include "config.h"
19#endif
20
21#ifdef HAVE_MMAP
22#include <sys/stat.h>
23#include <sys/mman.h>
24#endif
25
26#include <assert.h>
27#include <fcntl.h>
28#include <stdio.h>
29#include <stdlib.h>
30#include <string.h>
31#include <unistd.h>
32
33/*
34 * The read_whole_file() and resize_buf() functions are copied from
35 * read.c, including all dependency code (MAP_FILE, etc.).
36 */
37
38#ifndef MAP_FILE
39#define	MAP_FILE	0
40#endif
41
/*
 * Input encodings known to this program.  The enumerators are used as
 * indices into the encs[] table, so their order must match that table.
 * ENC__MAX is the number of real encodings; it is also used elsewhere
 * in this file as the "not (yet) determined" marker.
 */
enum	enc {
	ENC_UTF_8 = 0,	/* UTF-8 */
	ENC_US_ASCII,	/* US-ASCII */
	ENC_LATIN_1,	/* Latin-1 */
	ENC__MAX	/* number of encodings; not an encoding itself */
};
48
/*
 * The complete input held in memory, plus the position at which the
 * converters start reading.
 */
struct	buf {
	char	*buf;	/* binary input buffer */
	size_t	 sz;	/* size of binary buffer, in bytes */
	size_t	 offs;	/* starting buffer offset (bytes before it are skipped) */
};
54
/*
 * One entry of the encoding table: the name an encoding is selected by
 * and the routine that converts a buffer in that encoding.
 */
struct	encode {
	/* Encoding name, compared case-insensitively by the callers. */
	const char	 *name;
	/* Converter: writes to stdout, returns non-zero on success. */
	int		(*conv)(const struct buf *b);
};
59
/* Forward declarations of the file-local functions. */
static	int	 cue_enc(const struct buf *b, size_t *offs, enum enc *enc);
static	int	 conv_latin_1(const struct buf *b);
static	int	 conv_us_ascii(const struct buf *b);
static	int	 conv_utf_8(const struct buf *b);
static	int	 read_whole_file(const char *f, int fd,
			struct buf *fb, int *with_mmap);
static	void	 resize_buf(struct buf *buf, size_t initial);
static	void	 usage(void);
68
69static	const struct encode encs[ENC__MAX] = {
70	{ "utf-8", conv_utf_8 }, /* ENC_UTF_8 */
71	{ "us-ascii", conv_us_ascii }, /* ENC_US_ASCII */
72	{ "latin-1", conv_latin_1 }, /* ENC_LATIN_1 */
73};
74
/* Basename of argv[0]; set at the start of main(), printed by usage(). */
static	const char	 *progname = NULL;
76
77static void
78usage(void)
79{
80
81	fprintf(stderr, "usage: %s "
82			"[-D enc] "
83			"[-e ENC] "
84			"[file]\n", progname);
85}
86
87static int
88conv_latin_1(const struct buf *b)
89{
90	size_t		 i;
91	unsigned char	 cu;
92	const char	*cp;
93
94	cp = b->buf + (int)b->offs;
95
96	/*
97	 * Latin-1 falls into the first 256 code-points of Unicode, so
98	 * there's no need for any sort of translation.  Just make the
99	 * 8-bit characters use the Unicode escape.
100	 * Note that binary values 128 < v < 160 are passed through
101	 * unmodified to mandoc.
102	 */
103
104	for (i = b->offs; i < b->sz; i++) {
105		cu = (unsigned char)*cp++;
106		cu < 160U ? putchar(cu) : printf("\\[u%.4X]", cu);
107	}
108
109	return(1);
110}
111
112static int
113conv_us_ascii(const struct buf *b)
114{
115
116	/*
117	 * US-ASCII has no conversion since it falls into the first 128
118	 * bytes of Unicode.
119	 */
120
121	fwrite(b->buf, 1, b->sz, stdout);
122	return(1);
123}
124
125static int
126conv_utf_8(const struct buf *b)
127{
128	int		 state, be;
129	unsigned int	 accum;
130	size_t		 i;
131	unsigned char	 cu;
132	const char	*cp;
133	const long	 one = 1L;
134
135	cp = b->buf + (int)b->offs;
136	state = 0;
137	accum = 0U;
138	be = 0;
139
140	/* Quick test for big-endian value. */
141
142	if ( ! (*((const char *)(&one))))
143		be = 1;
144
145	for (i = b->offs; i < b->sz; i++) {
146		cu = (unsigned char)*cp++;
147		if (state) {
148			if ( ! (cu & 128) || (cu & 64)) {
149				/* Bad sequence header. */
150				return(0);
151			}
152
153			/* Accept only legitimate bit patterns. */
154
155			if (cu > 191 || cu < 128) {
156				/* Bad in-sequence bits. */
157				return(0);
158			}
159
160			accum |= (cu & 63) << --state * 6;
161
162			/*
163			 * Accum is held in little-endian order as
164			 * stipulated by the UTF-8 sequence coding.  We
165			 * need to convert to a native big-endian if our
166			 * architecture requires it.
167			 */
168
169			if (0 == state && be)
170				accum = (accum >> 24) |
171					((accum << 8) & 0x00FF0000) |
172					((accum >> 8) & 0x0000FF00) |
173					(accum << 24);
174
175			if (0 == state) {
176				accum < 128U ? putchar(accum) :
177					printf("\\[u%.4X]", accum);
178				accum = 0U;
179			}
180		} else if (cu & (1 << 7)) {
181			/*
182			 * Entering a UTF-8 state:  if we encounter a
183			 * UTF-8 bitmask, calculate the expected UTF-8
184			 * state from it.
185			 */
186			for (state = 0; state < 7; state++)
187				if ( ! (cu & (1 << (7 - state))))
188					break;
189
190			/* Accept only legitimate bit patterns. */
191
192			switch (state) {
193			case (4):
194				if (cu <= 244 && cu >= 240) {
195					accum = (cu & 7) << 18;
196					break;
197				}
198				/* Bad 4-sequence start bits. */
199				return(0);
200			case (3):
201				if (cu <= 239 && cu >= 224) {
202					accum = (cu & 15) << 12;
203					break;
204				}
205				/* Bad 3-sequence start bits. */
206				return(0);
207			case (2):
208				if (cu <= 223 && cu >= 194) {
209					accum = (cu & 31) << 6;
210					break;
211				}
212				/* Bad 2-sequence start bits. */
213				return(0);
214			default:
215				/* Bad sequence bit mask. */
216				return(0);
217			}
218			state--;
219		} else
220			putchar(cu);
221	}
222
223	if (0 != state) {
224		/* Bad trailing bits. */
225		return(0);
226	}
227
228	return(1);
229}
230
231static void
232resize_buf(struct buf *buf, size_t initial)
233{
234
235	buf->sz = buf->sz > initial / 2 ?
236		2 * buf->sz : initial;
237
238	buf->buf = realloc(buf->buf, buf->sz);
239	if (NULL == buf->buf) {
240		perror(NULL);
241		exit(EXIT_FAILURE);
242	}
243}
244
245static int
246read_whole_file(const char *f, int fd,
247		struct buf *fb, int *with_mmap)
248{
249	size_t		 off;
250	ssize_t		 ssz;
251
252#ifdef	HAVE_MMAP
253	struct stat	 st;
254	if (-1 == fstat(fd, &st)) {
255		perror(f);
256		return(0);
257	}
258
259	/*
260	 * If we're a regular file, try just reading in the whole entry
261	 * via mmap().  This is faster than reading it into blocks, and
262	 * since each file is only a few bytes to begin with, I'm not
263	 * concerned that this is going to tank any machines.
264	 */
265
266	if (S_ISREG(st.st_mode) && st.st_size >= (1U << 31)) {
267		fprintf(stderr, "%s: input too large\n", f);
268		return(0);
269	}
270
271	if (S_ISREG(st.st_mode)) {
272		*with_mmap = 1;
273		fb->sz = (size_t)st.st_size;
274		fb->buf = mmap(NULL, fb->sz, PROT_READ,
275				MAP_FILE|MAP_SHARED, fd, 0);
276		if (fb->buf != MAP_FAILED)
277			return(1);
278	}
279#endif
280
281	/*
282	 * If this isn't a regular file (like, say, stdin), then we must
283	 * go the old way and just read things in bit by bit.
284	 */
285
286	*with_mmap = 0;
287	off = 0;
288	fb->sz = 0;
289	fb->buf = NULL;
290	for (;;) {
291		if (off == fb->sz && fb->sz == (1U << 31)) {
292			fprintf(stderr, "%s: input too large\n", f);
293			break;
294		}
295
296		if (off == fb->sz)
297			resize_buf(fb, 65536);
298
299		ssz = read(fd, fb->buf + (int)off, fb->sz - off);
300		if (ssz == 0) {
301			fb->sz = off;
302			return(1);
303		}
304		if (ssz == -1) {
305			perror(f);
306			break;
307		}
308		off += (size_t)ssz;
309	}
310
311	free(fb->buf);
312	fb->buf = NULL;
313	return(0);
314}
315
316static int
317cue_enc(const struct buf *b, size_t *offs, enum enc *enc)
318{
319	const char	*ln, *eoln, *eoph;
320	size_t		 sz, phsz, nsz;
321	int		 i;
322
323	ln = b->buf + (int)*offs;
324	sz = b->sz - *offs;
325
326	/* Look for the end-of-line. */
327
328	if (NULL == (eoln = memchr(ln, '\n', sz)))
329		return(-1);
330
331	/* Set next-line marker. */
332
333	*offs = (size_t)((eoln + 1) - b->buf);
334
335	/* Check if we have the correct header/trailer. */
336
337	if ((sz = (size_t)(eoln - ln)) < 10 ||
338			memcmp(ln, ".\\\" -*-", 7) ||
339			memcmp(eoln - 3, "-*-", 3))
340		return(0);
341
342	/* Move after the header and adjust for the trailer. */
343
344	ln += 7;
345	sz -= 10;
346
347	while (sz > 0) {
348		while (sz > 0 && ' ' == *ln) {
349			ln++;
350			sz--;
351		}
352		if (0 == sz)
353			break;
354
355		/* Find the end-of-phrase marker (or eoln). */
356
357		if (NULL == (eoph = memchr(ln, ';', sz)))
358			eoph = eoln - 3;
359		else
360			eoph++;
361
362		/* Only account for the "coding" phrase. */
363
364		if ((phsz = (size_t)(eoph - ln)) < 7 ||
365				strncasecmp(ln, "coding:", 7)) {
366			sz -= phsz;
367			ln += phsz;
368			continue;
369		}
370
371		sz -= 7;
372		ln += 7;
373
374		while (sz > 0 && ' ' == *ln) {
375			ln++;
376			sz--;
377		}
378		if (0 == sz)
379			break;
380
381		/* Check us against known encodings. */
382
383		for (i = 0; i < (int)ENC__MAX; i++) {
384			nsz = strlen(encs[i].name);
385			if (phsz < nsz)
386				continue;
387			if (strncasecmp(ln, encs[i].name, nsz))
388				continue;
389
390			*enc = (enum enc)i;
391			return(1);
392		}
393
394		/* Unknown encoding. */
395
396		*enc = ENC__MAX;
397		return(1);
398	}
399
400	return(0);
401}
402
403int
404main(int argc, char *argv[])
405{
406	int	 	 i, ch, map, fd, rc;
407	struct buf	 b;
408	const char	*fn;
409	enum enc	 enc, def;
410	unsigned char 	 bom[3] = { 0xEF, 0xBB, 0xBF };
411	size_t		 offs;
412	extern int	 optind;
413	extern char	*optarg;
414
415	progname = strrchr(argv[0], '/');
416	if (progname == NULL)
417		progname = argv[0];
418	else
419		++progname;
420
421	fn = "<stdin>";
422	fd = STDIN_FILENO;
423	rc = EXIT_FAILURE;
424	enc = def = ENC__MAX;
425	map = 0;
426
427	memset(&b, 0, sizeof(struct buf));
428
429	while (-1 != (ch = getopt(argc, argv, "D:e:rdvh")))
430		switch (ch) {
431		case ('D'):
432			/* FALLTHROUGH */
433		case ('e'):
434			for (i = 0; i < (int)ENC__MAX; i++) {
435				if (strcasecmp(optarg, encs[i].name))
436					continue;
437				break;
438			}
439			if (i < (int)ENC__MAX) {
440				if ('D' == ch)
441					def = (enum enc)i;
442				else
443					enc = (enum enc)i;
444				break;
445			}
446
447			fprintf(stderr, "%s: Bad encoding\n", optarg);
448			return(EXIT_FAILURE);
449		case ('r'):
450			/* FALLTHROUGH */
451		case ('d'):
452			/* FALLTHROUGH */
453		case ('v'):
454			/* Compatibility with GNU preconv. */
455			break;
456		case ('h'):
457			/* Compatibility with GNU preconv. */
458			/* FALLTHROUGH */
459		default:
460			usage();
461			return(EXIT_FAILURE);
462		}
463
464	argc -= optind;
465	argv += optind;
466
467	/*
468	 * Open and read the first argument on the command-line.
469	 * If we don't have one, we default to stdin.
470	 */
471
472	if (argc > 0) {
473		fn = *argv;
474		fd = open(fn, O_RDONLY, 0);
475		if (-1 == fd) {
476			perror(fn);
477			return(EXIT_FAILURE);
478		}
479	}
480
481	if ( ! read_whole_file(fn, fd, &b, &map))
482		goto out;
483
484	/* Try to read the UTF-8 BOM. */
485
486	if (ENC__MAX == enc)
487		if (b.sz > 3 && 0 == memcmp(b.buf, bom, 3)) {
488			b.offs = 3;
489			enc = ENC_UTF_8;
490		}
491
492	/* Try reading from the "-*-" cue. */
493
494	if (ENC__MAX == enc) {
495		offs = b.offs;
496		ch = cue_enc(&b, &offs, &enc);
497		if (0 == ch)
498			ch = cue_enc(&b, &offs, &enc);
499	}
500
501	/*
502	 * No encoding has been detected.
503	 * Thus, we either fall into our default encoder, if specified,
504	 * or use Latin-1 if all else fails.
505	 */
506
507	if (ENC__MAX == enc)
508		enc = ENC__MAX == def ? ENC_LATIN_1 : def;
509
510	if ( ! (*encs[(int)enc].conv)(&b)) {
511		fprintf(stderr, "%s: Bad encoding\n", fn);
512		goto out;
513	}
514
515	rc = EXIT_SUCCESS;
516out:
517#ifdef	HAVE_MMAP
518	if (map)
519		munmap(b.buf, b.sz);
520	else
521#endif
522		free(b.buf);
523
524	if (fd > STDIN_FILENO)
525		close(fd);
526
527	return(rc);
528}
529