1#ifndef _LINUX_UTF_H
2#define _LINUX_UTF_H
3
4#include <asm/unaligned.h>
5
6static inline int utf8_to_utf16le(const char *s, __le16 *cp, unsigned len)
7{
8	int	count = 0;
9	u8	c;
10	u16	uchar;
11
12	/*
13	 * this insists on correct encodings, though not minimal ones.
14	 * BUT it currently rejects legit 4-byte UTF-8 code points,
15	 * which need surrogate pairs.  (Unicode 3.1 can use them.)
16	 */
17	while (len != 0 && (c = (u8) *s++) != 0) {
18		if ((c & 0x80)) {
19			/*
20			 * 2-byte sequence:
21			 * 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
22			 */
23			if ((c & 0xe0) == 0xc0) {
24				uchar = (c & 0x1f) << 6;
25
26				c = (u8) *s++;
27				if ((c & 0xc0) != 0x80)
28					goto fail;
29				c &= 0x3f;
30				uchar |= c;
31
32			/*
33			 * 3-byte sequence (most CJKV characters):
34			 * zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
35			 */
36			} else if ((c & 0xf0) == 0xe0) {
37				uchar = (c & 0x0f) << 12;
38
39				c = (u8) *s++;
40				if ((c & 0xc0) != 0x80)
41					goto fail;
42				c &= 0x3f;
43				uchar |= c << 6;
44
45				c = (u8) *s++;
46				if ((c & 0xc0) != 0x80)
47					goto fail;
48				c &= 0x3f;
49				uchar |= c;
50
51				/* no bogus surrogates */
52				if (0xd800 <= uchar && uchar <= 0xdfff)
53					goto fail;
54
55			/*
56			 * 4-byte sequence (surrogate pairs, currently rare):
57			 * 11101110wwwwzzzzyy + 110111yyyyxxxxxx
58			 *     = 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
59			 * (uuuuu = wwww + 1)
60			 * FIXME accept the surrogate code points (only)
61			 */
62			} else
63				goto fail;
64		} else
65			uchar = c;
66		put_unaligned_le16(uchar, cp++);
67		count++;
68		len--;
69	}
70	return count;
71fail:
72	return -1;
73}
74
75#endif /* _LINUX_UTF_H */
76