1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License").  You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22/*
23 * Copyright 2001, 2002 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#pragma ident	"%Z%%M%	%I%	%E% SMI"
28
29#include "gnu_msgfmt.h"
30#include "gnu_lex.h"
31#include "y.tab.h"
32
/*
 * Pushback store shared by po_ungetc() and po_getc(): backlen is the
 * number of valid bytes held in backbuf, and 0 means that nothing is
 * currently pushed back.
 */
static int	backlen = 0;
static char	backbuf[MB_LEN_MAX];

/*
 * Line number used in diagnostics.  It starts at 1 and is advanced by
 * po_getc() each time a newline is read.
 */
int	cur_line = 1;
37
38/*
39 * get_mb() returns one multibyte character.
40 *
41 * This function uses the iconv() function to find out one
42 * multibyte character from a sequence of bytes in the file stream.
43 * The conversion from the codeset specified in the PO file to UTF-8
44 * is performed.  The funcition reads another byte and calls iconv(),
45 * until iconv() successfully returns as a valid UTF-8 character has
46 * been converted or returns EILSEQ.  If iconv() successfully returned,
47 * the function returns the read bytes as one character.  Otherwise,
48 * returns error.  The string converted to UTF-8 in outbuf won't be
49 * used at all.
50 */
51static size_t
52get_mb(unsigned char *tmpbuf, unsigned char fc)
53{
54	int	c;
55	char	outbuf[8];			/* max size of a UTF-8 char */
56	const char	*inptr;
57	char	*outptr;
58	size_t	insize = 0, inlen, outlen, ret;
59
60	tmpbuf[insize++] = fc;		/* size of tmpbuf is MB_LEN_MAX+1 */
61
62	if (cd == (iconv_t)-1) {
63		/* no conversion */
64		tmpbuf[insize] = '\0';
65		return (insize);
66	}
67
68	for (; ; ) {
69		inptr = (const char *)tmpbuf;
70		outptr = &outbuf[0];
71		inlen = insize;
72		outlen = sizeof (outbuf);
73
74		errno = 0;
75		ret = iconv(cd, &inptr, &inlen, &outptr, &outlen);
76		if (ret == (size_t)-1) {
77			/* iconv failed */
78			switch (errno) {
79			case EILSEQ:
80				/* invalid character found */
81				error(gettext(ERR_INVALID_CHAR),
82					cur_line, cur_po);
83				/* NOTREACHED */
84			case EINVAL:
85				/* not enough input */
86				if (insize == MB_LEN_MAX) {
87					/* invalid character found */
88					error(gettext(ERR_INVALID_CHAR),
89						cur_line, cur_po);
90					/* NOTREACHED */
91				}
92				c = getc(fp);
93				if (c == EOF) {
94					error(gettext(ERR_UNEXP_EOF),
95						cur_line, cur_po);
96					/* NOTREACHED */
97				}
98				tmpbuf[insize++] = (unsigned char)c;
99
100				/* initialize the conversion */
101				outptr = &outbuf[0];
102				outlen = sizeof (outbuf);
103				(void) iconv(cd, NULL, NULL, &outptr, &outlen);
104
105				continue;
106				/* NOTREACHED */
107			default:
108				/* should never happen */
109				error(ERR_INTERNAL,
110					cur_line, cur_po);
111				/* NOTREACHED */
112			}
113			/* NOTREACHED */
114		}
115		tmpbuf[insize] = '\0';
116		return (insize);
117		/* NOTRECHED */
118	}
119}
120
121static void
122po_uninput(int c)
123{
124	(void) ungetc(c, fp);
125	if (c == '\n')
126		cur_line--;
127}
128
129static void
130po_ungetc(struct ch *pch)
131{
132	if (backlen) {
133		error(gettext(ERR_INTERNAL), cur_line, cur_po);
134		/* NOTREACHED */
135	}
136	if (!pch->eof) {
137		backlen = pch->len;
138		(void) memcpy(backbuf, pch->buf, backlen);
139	}
140}
141
142static struct ch *
143po_getc(void)
144{
145	static struct ch	och;
146	int	c;
147
148	if (backlen) {
149		och.len = backlen;
150		(void) memcpy(och.buf, backbuf, backlen);
151		backlen = 0;
152		return (&och);
153	}
154
155	for (; ; ) {
156		c = getc(fp);
157		if (c == EOF) {
158			if (ferror(fp)) {
159				/* error happend */
160				error(gettext(ERR_READ_FAILED), cur_po);
161				/* NOTREACHED */
162			}
163			och.len = 0;
164			och.eof = 1;
165			return (&och);
166		}
167		if (c == '\\') {
168			c = getc(fp);
169			if (c == '\n') {
170				/* this newline should be escaped */
171				cur_line++;
172				continue;
173			} else {
174				po_uninput(c);
175				och.len = 1;
176				och.eof = 0;
177				och.buf[0] = '\\';
178				return (&och);
179			}
180			/* NOTREACHED */
181		}
182		if (c == '\n') {
183			cur_line++;
184			och.len = 1;
185			och.eof = 0;
186			och.buf[0] = '\n';
187			return (&och);
188		}
189		if (isascii((unsigned char)c)) {
190			/* single byte ascii */
191			och.len = 1;
192			och.eof = 0;
193			och.buf[0] = (unsigned char)c;
194			return (&och);
195		}
196
197		och.len = get_mb(&och.buf[0], (unsigned char)c);
198		och.eof = 0;
199		return (&och);
200	}
201	/* NOTREACHED */
202}
203
/*
 * extend_buf() grows the buffer *buf, currently *size bytes long, by
 * add bytes.  Both *buf and *size are updated to describe the
 * reallocated buffer.  Allocation is delegated to Xrealloc().
 */
static void
extend_buf(char **buf, size_t *size, size_t add)
{
	size_t	grown = *size + add;

	*size = grown;
	*buf = (char *)Xrealloc(*buf, grown);
}
213
214static struct ch	*
215expand_es(void)
216{
217	int	c, n, loop;
218	static struct ch	och;
219	struct ch	*pch;
220
221	pch = po_getc();
222	if (pch->eof) {
223		error(gettext(ERR_UNEXP_EOF),
224			cur_line, cur_po);
225		/* NOTREACHED */
226	}
227	if (pch->len > 1) {
228		/* not a valid escape sequence */
229		return (pch);
230	}
231
232	och.len = 1;
233	och.eof = 0;
234	switch (pch->buf[0]) {
235	case '"':
236	case '\\':
237		och.buf[0] = pch->buf[0];
238		break;
239	case 'b':
240		och.buf[0] = '\b';
241		break;
242	case 'f':
243		och.buf[0] = '\f';
244		break;
245	case 'n':
246		och.buf[0] = '\n';
247		break;
248	case 'r':
249		och.buf[0] = '\r';
250		break;
251	case 't':
252		och.buf[0] = '\t';
253		break;
254	case 'v':
255		och.buf[0] = '\v';
256		break;
257	case 'a':
258		och.buf[0] = '\a';
259		break;
260	case '0':
261	case '1':
262	case '2':
263	case '3':
264	case '4':
265	case '5':
266	case '6':
267	case '7':
268		/* octal */
269		c = pch->buf[0];
270		for (n = 0, loop = 0; ; ) {
271			n = n * 8 + c - '0';
272			loop++;
273			if (loop >= 3)
274				break;
275			pch = po_getc();
276			if (pch->eof) {
277				error(gettext(ERR_UNEXP_EOF),
278					cur_line, cur_po);
279				/* NOTREACHED */
280			}
281			if ((pch->len > 1) || (pch->buf[0] < '0') ||
282				(pch->buf[0] > '7'))
283				break;
284			c = pch->buf[0];
285		}
286		po_ungetc(pch);
287		och.buf[0] = (unsigned char)n;
288		break;
289	case 'x':
290		/* hex */
291		pch = po_getc();
292		if (pch->eof) {
293			error(gettext(ERR_UNEXP_EOF),
294				cur_line, cur_po);
295			/* NOTREACHED */
296		}
297		if (pch->len > 1) {
298			po_ungetc(pch);
299			och.buf[0] = 'x';
300			break;
301		}
302		c = pch->buf[0];
303		if (!isxdigit((unsigned char)c)) {
304			po_ungetc(pch);
305			och.buf[0] = 'x';
306			break;
307		}
308		if (isdigit((unsigned char)c)) {
309			n = c - '0';
310		} else if (isupper((unsigned char)c)) {
311			n = c - 'A' + 10;
312		} else {
313			n = c - 'a' + 10;
314		}
315
316		pch = po_getc();
317		if (pch->eof) {
318			error(gettext(ERR_UNEXP_EOF),
319				cur_line, cur_po);
320			/* NOTREACHED */
321		}
322		if (pch->len > 1) {
323			po_ungetc(pch);
324			och.buf[0] = (unsigned char)n;
325			break;
326		}
327		c = pch->buf[0];
328		if (!isxdigit((unsigned char)c)) {
329			po_ungetc(pch);
330			och.buf[0] = (unsigned char)n;
331			break;
332		}
333		n *= 16;
334		if (isdigit((unsigned char)c)) {
335			n += c - '0';
336		} else if (isupper((unsigned char)c)) {
337			n += c - 'A' + 10;
338		} else {
339			n += c - 'a' + 10;
340		}
341		och.buf[0] = (unsigned char)n;
342		break;
343
344	default:
345		och.buf[0] = pch->buf[0];
346		break;
347	}
348	return (&och);
349}
350
351int
352yylex(void)
353{
354	unsigned int	uc;
355	struct ch	*pch;
356	char	*buf;
357	size_t	buf_size, buf_pos;
358
359	for (; ; ) {
360		pch = po_getc();
361
362		if (pch->eof) {
363			/* EOF */
364			return (0);
365		}
366
367		if (pch->len > 1) {
368			/* multi byte */
369			yylval.c.len = pch->len;
370			(void) memcpy(yylval.c.buf, pch->buf, pch->len);
371			return (CHR);
372		}
373		/* single byte */
374		switch (pch->buf[0]) {
375		case ' ':
376		case '\t':
377		case '\n':
378			break;
379
380		case '#':
381			/* comment start */
382			buf_size = CBUFSIZE;
383			buf = (char *)Xmalloc(buf_size);
384			buf_pos = 0;
385			pch = po_getc();
386			while (!pch->eof &&
387				((pch->len != 1) || (pch->buf[0] != '\n'))) {
388				if (buf_pos + pch->len + 1 > buf_size)
389					extend_buf(&buf, &buf_size, CBUFSIZE);
390				(void) memcpy(buf + buf_pos,
391					pch->buf, pch->len);
392				buf_pos += pch->len;
393				pch = po_getc();
394			}
395			buf[buf_pos] = '\0';
396			yylval.str = buf;
397			return (COMMENT);
398			/* NOTREACHED */
399
400		case '[':
401		case ']':
402			return (pch->buf[0]);
403			/* NOTREACHED */
404
405		case '"':
406			buf_size = MBUFSIZE;
407			buf = (char *)Xmalloc(buf_size);
408			buf_pos = 0;
409			for (; ; ) {
410				pch = po_getc();
411
412				if (pch->eof) {
413					/* EOF */
414					error(gettext(ERR_UNEXP_EOF),
415						cur_line, cur_po);
416					/* NOTREACHED */
417				}
418
419				if (pch->len == 1) {
420					uc = pch->buf[0];
421
422					if (uc == '\n') {
423						error(gettext(ERR_UNEXP_EOL),
424							cur_line, cur_po);
425						/* NOTREACHED */
426					}
427					if (uc == '"')
428						break;
429					if (uc == '\\')
430						pch = expand_es();
431				}
432				if (buf_pos + pch->len + 1 > buf_size)
433					extend_buf(&buf, &buf_size,
434						MBUFSIZE);
435				(void) memcpy(buf + buf_pos,
436					pch->buf, pch->len);
437				buf_pos += pch->len;
438			}
439
440			buf[buf_pos] = '\0';
441			yylval.str = buf;
442			return (STR);
443			/* NOTREACHED */
444
445		default:
446			uc = pch->buf[0];
447
448			if (isalpha(uc) || (uc == '_')) {
449				buf_size = KBUFSIZE;
450				buf = (char *)Xmalloc(buf_size);
451				buf_pos = 0;
452				buf[buf_pos++] = (char)uc;
453				pch = po_getc();
454				while (!pch->eof &&
455					(pch->len == 1) &&
456					(isalpha(uc = pch->buf[0]) ||
457					isdigit(uc) || (uc == '_'))) {
458					if (buf_pos + 1 + 1 > buf_size)
459						extend_buf(&buf, &buf_size,
460							KBUFSIZE);
461					buf[buf_pos++] = (char)uc;
462					pch = po_getc();
463				}
464				/* push back the last char */
465				po_ungetc(pch);
466				buf[buf_pos] = '\0';
467				yylval.str = buf;
468				if (buf_pos > MAX_KW_LEN) {
469					/* kbuf is longer than any keywords */
470					return (SYMBOL);
471				}
472				yylval.num = cur_line;
473				if (strcmp(buf, KW_DOMAIN) == 0) {
474					free(buf);
475					return (DOMAIN);
476				} else if (strcmp(buf, KW_MSGID) == 0) {
477					free(buf);
478					return (MSGID);
479				} else if (strcmp(buf, KW_MSGID_PLURAL) == 0) {
480					free(buf);
481					return (MSGID_PLURAL);
482				} else if (strcmp(buf, KW_MSGSTR) == 0) {
483					free(buf);
484					return (MSGSTR);
485				} else {
486					free(buf);
487					return (SYMBOL);
488				}
489				/* NOTREACHED */
490			}
491			if (isdigit(uc)) {
492				buf_size = NBUFSIZE;
493				buf = (char *)Xmalloc(buf_size);
494				buf_pos = 0;
495				buf[buf_pos++] = (char)uc;
496				pch = po_getc();
497				while (!pch->eof &&
498					(pch->len == 1) &&
499					isdigit(uc = pch->buf[0])) {
500					if (buf_pos + 1 + 1 > buf_size)
501						extend_buf(&buf, &buf_size,
502							NBUFSIZE);
503					buf[buf_pos++] = (char)uc;
504					pch = po_getc();
505				}
506				/* push back the last char */
507				po_ungetc(pch);
508				buf[buf_pos] = '\0';
509				yylval.num = atoi(buf);
510				free(buf);
511				return (NUM);
512			}
513			/* just a char */
514			yylval.c.len = 1;
515			yylval.c.buf[0] = uc;
516			return (CHR);
517			/* NOTREACHED */
518		}
519	}
520}
521