1/*
2 * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 /*
30 	Includes Unicode 3.2 decomposition code derived from Core Foundation
31 */
32
33#include <sys/param.h>
34#include <sys/utfconv.h>
35#include <sys/errno.h>
36#include <sys/malloc.h>
37#include <libkern/OSByteOrder.h>
38
39/*
40 * UTF-8 (Unicode Transformation Format)
41 *
42 * UTF-8 is the Unicode Transformation Format that serializes a Unicode
43 * character as a sequence of one to four bytes. Only the shortest form
44 * required to represent the significant Unicode bits is legal.
45 *
46 * UTF-8 Multibyte Codes
47 *
48 * Bytes   Bits   Unicode Min  Unicode Max   UTF-8 Byte Sequence (binary)
49 * -----------------------------------------------------------------------------
50 *   1       7       0x0000        0x007F    0xxxxxxx
51 *   2      11       0x0080        0x07FF    110xxxxx 10xxxxxx
52 *   3      16       0x0800        0xFFFF    1110xxxx 10xxxxxx 10xxxxxx
53 *   4      21      0x10000      0x10FFFF    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
54 * -----------------------------------------------------------------------------
55 */
56
57
58#define UNICODE_TO_UTF8_LEN(c)  \
59	((c) < 0x0080 ? 1 : ((c) < 0x0800 ? 2 : (((c) & 0xf800) == 0xd800 ? 2 : 3)))
60
61#define UCS_ALT_NULL	0x2400
62
63/* Surrogate Pair Constants */
64#define SP_HALF_SHIFT	10
65#define SP_HALF_BASE	0x0010000UL
66#define SP_HALF_MASK	0x3FFUL
67
68#define SP_HIGH_FIRST	0xD800UL
69#define SP_HIGH_LAST	0xDBFFUL
70#define SP_LOW_FIRST	0xDC00UL
71#define SP_LOW_LAST	0xDFFFUL
72
73
74#include "vfs_utfconvdata.h"
75
76
77/*
78 * Test for a combining character.
79 *
80 * Similar to __CFUniCharIsNonBaseCharacter except that
81 * unicode_combinable also includes Hangul Jamo characters.
82 */
83int
84unicode_combinable(u_int16_t character)
85{
86	const u_int8_t *bitmap = __CFUniCharCombiningBitmap;
87	u_int8_t value;
88
89	if (character < 0x0300)
90		return (0);
91
92	value = bitmap[(character >> 8) & 0xFF];
93
94	if (value == 0xFF) {
95		return (1);
96	} else if (value) {
97		bitmap = bitmap + ((value - 1) * 32) + 256;
98		return (bitmap[(character & 0xFF) / 8] & (1 << (character % 8)) ? 1 : 0);
99	}
100	return (0);
101}
102
103/*
104 * Test for a precomposed character.
105 *
106 * Similar to __CFUniCharIsDecomposableCharacter.
107 */
108int
109unicode_decomposeable(u_int16_t character) {
110	const u_int8_t *bitmap = __CFUniCharDecomposableBitmap;
111	u_int8_t value;
112
113	if (character < 0x00C0)
114		return (0);
115
116	value = bitmap[(character >> 8) & 0xFF];
117
118	if (value == 0xFF) {
119		return (1);
120	} else if (value) {
121		bitmap = bitmap + ((value - 1) * 32) + 256;
122		return (bitmap[(character & 0xFF) / 8] & (1 << (character % 8)) ? 1 : 0);
123	}
124    	return (0);
125}
126
127
128/*
129 * Get the combing class.
130 *
131 * Similar to CFUniCharGetCombiningPropertyForCharacter.
132 */
133static inline u_int8_t
134get_combining_class(u_int16_t character) {
135	const u_int8_t *bitmap = __CFUniCharCombiningPropertyBitmap;
136
137	u_int8_t value = bitmap[(character >> 8)];
138
139	if (value) {
140		bitmap = bitmap + (value * 256);
141		return bitmap[character % 256];
142	}
143	return (0);
144}
145
146
147static int unicode_decompose(u_int16_t character, u_int16_t *convertedChars);
148
149static u_int16_t unicode_combine(u_int16_t base, u_int16_t combining);
150
151static void priortysort(u_int16_t* characters, int count);
152
153static u_int16_t  ucs_to_sfm(u_int16_t ucs_ch, int lastchar);
154
155static u_int16_t  sfm_to_ucs(u_int16_t ucs_ch);
156
157
158char utf_extrabytes[32] = {
159	 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
160	-1, -1, -1, -1, -1, -1, -1, -1,  1,  1,  1,  1,  2,  2,  3, -1
161};
162
163const char hexdigits[16] = {
164	 '0',  '1',  '2',  '3',  '4',  '5',  '6', '7',
165	 '8',  '9',  'A',  'B',  'C',  'D',  'E', 'F'
166};
167
168/*
169 * utf8_encodelen - Calculate the UTF-8 encoding length
170 *
171 * This function takes a Unicode input string, ucsp, of ucslen bytes
172 * and calculates the size of the UTF-8 output in bytes (not including
173 * a NULL termination byte). The string must reside in kernel memory.
174 *
175 * If '/' chars are possible in the Unicode input then an alternate
176 * (replacement) char should be provided in altslash.
177 *
178 * FLAGS
179 *    UTF_REVERSE_ENDIAN:  Unicode byte order is opposite current runtime
180 *
181 *    UTF_BIG_ENDIAN:  Unicode byte order is always big endian
182 *
183 *    UTF_LITTLE_ENDIAN:  Unicode byte order is always little endian
184 *
185 *    UTF_DECOMPOSED:  generate fully decomposed output
186 *
187 *    UTF_PRECOMPOSED is ignored since utf8_encodestr doesn't support it
188 *
189 * ERRORS
190 *    None
191 */
192size_t
193utf8_encodelen(const u_int16_t * ucsp, size_t ucslen, u_int16_t altslash, int flags)
194{
195	u_int16_t ucs_ch;
196	u_int16_t * chp = NULL;
197	u_int16_t sequence[8];
198	int extra = 0;
199	int charcnt;
200	int swapbytes = (flags & UTF_REVERSE_ENDIAN);
201	int decompose = (flags & UTF_DECOMPOSED);
202	size_t len;
203
204	charcnt = ucslen / 2;
205	len = 0;
206
207	while (charcnt-- > 0) {
208		if (extra > 0) {
209			--extra;
210			ucs_ch = *chp++;
211		} else {
212			ucs_ch = *ucsp++;
213			if (swapbytes) {
214				ucs_ch = OSSwapInt16(ucs_ch);
215			}
216			if (ucs_ch == '/') {
217				ucs_ch = altslash ? altslash : '_';
218			} else if (ucs_ch == '\0') {
219				ucs_ch = UCS_ALT_NULL;
220			} else if (decompose && unicode_decomposeable(ucs_ch)) {
221				extra = unicode_decompose(ucs_ch, sequence) - 1;
222				charcnt += extra;
223				ucs_ch = sequence[0];
224				chp = &sequence[1];
225			}
226		}
227		len += UNICODE_TO_UTF8_LEN(ucs_ch);
228	}
229
230	return (len);
231}
232
233
234/*
235 * utf8_encodestr - Encodes a Unicode string to UTF-8
236 *
237 * NOTES:
238 *    The resulting UTF-8 string is NULL terminated.
239 *
240 *    If '/' chars are allowed on disk then an alternate
241 *    (replacement) char must be provided in altslash.
242 *
243 * input flags:
244 *    UTF_REVERSE_ENDIAN: Unicode byteorder is opposite current runtime
245 *
246 *    UTF_BIG_ENDIAN:  Unicode byte order is always big endian
247 *
248 *    UTF_LITTLE_ENDIAN:  Unicode byte order is always little endian
249 *
250 *    UTF_DECOMPOSED:  generate fully decomposed output
251 *
252 *    UTF_NO_NULL_TERM:  don't add NULL termination to UTF-8 output
253 *
254 * result:
255 *    ENAMETOOLONG: Name didn't fit; only buflen bytes were encoded
256 *
257 *    EINVAL: Illegal char found; char was replaced by an '_'.
258 */
259int
260utf8_encodestr(const u_int16_t * ucsp, size_t ucslen, u_int8_t * utf8p,
261               size_t * utf8len, size_t buflen, u_int16_t altslash, int flags)
262{
263	u_int8_t * bufstart;
264	u_int8_t * bufend;
265	u_int16_t ucs_ch;
266	u_int16_t * chp = NULL;
267	u_int16_t sequence[8];
268	int extra = 0;
269	int charcnt;
270	int swapbytes = (flags & UTF_REVERSE_ENDIAN);
271	int nullterm  = ((flags & UTF_NO_NULL_TERM) == 0);
272	int decompose = (flags & UTF_DECOMPOSED);
273	int sfmconv = (flags & UTF_SFM_CONVERSIONS);
274	int result = 0;
275
276	bufstart = utf8p;
277	bufend = bufstart + buflen;
278	if (nullterm)
279		--bufend;
280	charcnt = ucslen / 2;
281
282	while (charcnt-- > 0) {
283		if (extra > 0) {
284			--extra;
285			ucs_ch = *chp++;
286		} else {
287			ucs_ch = swapbytes ? OSSwapInt16(*ucsp++) : *ucsp++;
288
289			if (decompose && unicode_decomposeable(ucs_ch)) {
290				extra = unicode_decompose(ucs_ch, sequence) - 1;
291				charcnt += extra;
292				ucs_ch = sequence[0];
293				chp = &sequence[1];
294			}
295		}
296
297		/* Slash and NULL are not permitted */
298		if (ucs_ch == '/') {
299			if (altslash)
300				ucs_ch = altslash;
301			else {
302				ucs_ch = '_';
303				result = EINVAL;
304			}
305		} else if (ucs_ch == '\0') {
306			ucs_ch = UCS_ALT_NULL;
307		}
308
309		if (ucs_ch < 0x0080) {
310			if (utf8p >= bufend) {
311				result = ENAMETOOLONG;
312				break;
313			}
314			*utf8p++ = ucs_ch;
315
316		} else if (ucs_ch < 0x800) {
317			if ((utf8p + 1) >= bufend) {
318				result = ENAMETOOLONG;
319				break;
320			}
321			*utf8p++ = 0xc0 | (ucs_ch >> 6);
322			*utf8p++ = 0x80 | (0x3f & ucs_ch);
323
324		} else {
325			/* These chars never valid Unicode. */
326			if (ucs_ch == 0xFFFE || ucs_ch == 0xFFFF) {
327				result = EINVAL;
328				break;
329			}
330
331			/* Combine valid surrogate pairs */
332			if (ucs_ch >= SP_HIGH_FIRST && ucs_ch <= SP_HIGH_LAST
333				&& charcnt > 0) {
334				u_int16_t ch2;
335				u_int32_t pair;
336
337				ch2 = swapbytes ? OSSwapInt16(*ucsp) : *ucsp;
338				if (ch2 >= SP_LOW_FIRST && ch2 <= SP_LOW_LAST) {
339					pair = ((ucs_ch - SP_HIGH_FIRST) << SP_HALF_SHIFT)
340						+ (ch2 - SP_LOW_FIRST) + SP_HALF_BASE;
341					if ((utf8p + 3) >= bufend) {
342						result = ENAMETOOLONG;
343						break;
344					}
345					--charcnt;
346					++ucsp;
347					*utf8p++ = 0xf0 | (pair >> 18);
348					*utf8p++ = 0x80 | (0x3f & (pair >> 12));
349					*utf8p++ = 0x80 | (0x3f & (pair >> 6));
350					*utf8p++ = 0x80 | (0x3f & pair);
351					continue;
352				}
353			} else if (sfmconv) {
354				ucs_ch = sfm_to_ucs(ucs_ch);
355				if (ucs_ch < 0x0080) {
356					if (utf8p >= bufend) {
357						result = ENAMETOOLONG;
358						break;
359					}
360					*utf8p++ = ucs_ch;
361					continue;
362				}
363			}
364			if ((utf8p + 2) >= bufend) {
365				result = ENAMETOOLONG;
366				break;
367			}
368			*utf8p++ = 0xe0 | (ucs_ch >> 12);
369			*utf8p++ = 0x80 | (0x3f & (ucs_ch >> 6));
370			*utf8p++ = 0x80 | (0x3f & ucs_ch);
371		}
372	}
373
374	*utf8len = utf8p - bufstart;
375	if (nullterm)
376		*utf8p++ = '\0';
377
378	return (result);
379}
380
381
382/*
383 * utf8_decodestr - Decodes a UTF-8 string back to Unicode
384 *
385 * NOTES:
386 *    The input UTF-8 string does not need to be null terminated
387 *    if utf8len is set.
388 *
389 *    If '/' chars are allowed on disk then an alternate
390 *    (replacement) char must be provided in altslash.
391 *
392 * input flags:
393 *    UTF_REV_ENDIAN:  Unicode byte order is opposite current runtime
394 *
395 *    UTF_BIG_ENDIAN:  Unicode byte order is always big endian
396 *
397 *    UTF_LITTLE_ENDIAN:  Unicode byte order is always little endian
398 *
399 *    UTF_DECOMPOSED:  generate fully decomposed output (NFD)
400 *
401 *    UTF_PRECOMPOSED:  generate precomposed output (NFC)
402 *
403 *    UTF_ESCAPE_ILLEGAL:  percent escape any illegal UTF-8 input
404 *
405 * result:
406 *    ENAMETOOLONG: Name didn't fit; only ucslen chars were decoded.
407 *
408 *    EINVAL: Illegal UTF-8 sequence found.
409 */
410int
411utf8_decodestr(const u_int8_t* utf8p, size_t utf8len, u_int16_t* ucsp,
412               size_t *ucslen, size_t buflen, u_int16_t altslash, int flags)
413{
414	u_int16_t* bufstart;
415	u_int16_t* bufend;
416	unsigned int ucs_ch;
417	unsigned int byte;
418	int combcharcnt = 0;
419	int result = 0;
420	int decompose, precompose, swapbytes, escaping;
421	int sfmconv;
422	int extrabytes;
423
424	decompose  = (flags & UTF_DECOMPOSED);
425	precompose = (flags & UTF_PRECOMPOSED);
426	swapbytes  = (flags & UTF_REVERSE_ENDIAN);
427	escaping   = (flags & UTF_ESCAPE_ILLEGAL);
428	sfmconv    = (flags & UTF_SFM_CONVERSIONS);
429
430	bufstart = ucsp;
431	bufend = (u_int16_t *)((u_int8_t *)ucsp + buflen);
432
433	while (utf8len-- > 0 && (byte = *utf8p++) != '\0') {
434		if (ucsp >= bufend)
435			goto toolong;
436
437		/* check for ascii */
438		if (byte < 0x80) {
439			ucs_ch = sfmconv ? ucs_to_sfm(byte, utf8len == 0) : byte;
440		} else {
441			u_int32_t ch;
442
443			extrabytes = utf_extrabytes[byte >> 3];
444			if ((extrabytes < 0) || ((int)utf8len < extrabytes)) {
445				goto escape;
446			}
447			utf8len -= extrabytes;
448
449			switch (extrabytes) {
450			case 1:
451				ch = byte; ch <<= 6;   /* 1st byte */
452				byte = *utf8p++;       /* 2nd byte */
453				if ((byte >> 6) != 2)
454					goto escape2;
455				ch += byte;
456				ch -= 0x00003080UL;
457				if (ch < 0x0080)
458					goto escape2;
459				ucs_ch = ch;
460			        break;
461			case 2:
462				ch = byte; ch <<= 6;   /* 1st byte */
463				byte = *utf8p++;       /* 2nd byte */
464				if ((byte >> 6) != 2)
465					goto escape2;
466				ch += byte; ch <<= 6;
467				byte = *utf8p++;       /* 3rd byte */
468				if ((byte >> 6) != 2)
469					goto escape3;
470				ch += byte;
471				ch -= 0x000E2080UL;
472				if (ch < 0x0800)
473					goto escape3;
474				if (ch >= 0xD800) {
475					if (ch <= 0xDFFF)
476						goto escape3;
477					if (ch == 0xFFFE || ch == 0xFFFF)
478						goto escape3;
479				}
480				ucs_ch = ch;
481				break;
482			case 3:
483				ch = byte; ch <<= 6;   /* 1st byte */
484				byte = *utf8p++;       /* 2nd byte */
485				if ((byte >> 6) != 2)
486					goto escape2;
487				ch += byte; ch <<= 6;
488				byte = *utf8p++;       /* 3rd byte */
489				if ((byte >> 6) != 2)
490					goto escape3;
491				ch += byte; ch <<= 6;
492				byte = *utf8p++;       /* 4th byte */
493				if ((byte >> 6) != 2)
494					goto escape4;
495			        ch += byte;
496				ch -= 0x03C82080UL + SP_HALF_BASE;
497				ucs_ch = (ch >> SP_HALF_SHIFT) + SP_HIGH_FIRST;
498				if (ucs_ch < SP_HIGH_FIRST || ucs_ch > SP_HIGH_LAST)
499					goto escape4;
500				*ucsp++ = swapbytes ? OSSwapInt16(ucs_ch) : (u_int16_t)ucs_ch;
501				if (ucsp >= bufend)
502					goto toolong;
503				ucs_ch = (ch & SP_HALF_MASK) + SP_LOW_FIRST;
504				if (ucs_ch < SP_LOW_FIRST || ucs_ch > SP_LOW_LAST) {
505					--ucsp;
506					goto escape4;
507				}
508				*ucsp++ = swapbytes ? OSSwapInt16(ucs_ch) : (u_int16_t)ucs_ch;
509			        continue;
510			default:
511				result = EINVAL;
512				goto exit;
513			}
514			if (decompose) {
515				if (unicode_decomposeable(ucs_ch)) {
516					u_int16_t sequence[8];
517					int count, i;
518
519					/* Before decomposing a new unicode character, sort
520					 * previous combining characters, if any, and reset
521					 * the counter.
522					 */
523					if (combcharcnt > 1) {
524						priortysort(ucsp - combcharcnt, combcharcnt);
525					}
526					combcharcnt = 0;
527
528					count = unicode_decompose(ucs_ch, sequence);
529					for (i = 0; i < count; ++i) {
530						ucs_ch = sequence[i];
531						*ucsp++ = swapbytes ? OSSwapInt16(ucs_ch) : (u_int16_t)ucs_ch;
532						if (ucsp >= bufend)
533							goto toolong;
534					}
535					combcharcnt += count - 1;
536					continue;
537				}
538			} else if (precompose && (ucsp != bufstart)) {
539				u_int16_t composite, base;
540
541				if (unicode_combinable(ucs_ch)) {
542					base = swapbytes ? OSSwapInt16(*(ucsp - 1)) : *(ucsp - 1);
543					composite = unicode_combine(base, ucs_ch);
544					if (composite) {
545						--ucsp;
546						ucs_ch = composite;
547					}
548				}
549			}
550			if (ucs_ch == UCS_ALT_NULL)
551				ucs_ch = '\0';
552		}
553		if (ucs_ch == altslash)
554			ucs_ch = '/';
555
556		/*
557		 * Make multiple combining character sequences canonical
558		 */
559		if (unicode_combinable(ucs_ch)) {
560			++combcharcnt;   /* start tracking a run */
561		} else if (combcharcnt) {
562			if (combcharcnt > 1) {
563				priortysort(ucsp - combcharcnt, combcharcnt);
564			}
565			combcharcnt = 0;  /* start over */
566		}
567
568		*ucsp++ = swapbytes ? OSSwapInt16(ucs_ch) : (u_int16_t)ucs_ch;
569		continue;
570
571		/*
572		 * Escape illegal UTF-8 into something legal.
573		 */
574escape4:
575		utf8p -= 3;
576		goto escape;
577escape3:
578		utf8p -= 2;
579		goto escape;
580escape2:
581		utf8p -= 1;
582escape:
583		if (!escaping) {
584			result = EINVAL;
585			goto exit;
586		}
587		if (extrabytes > 0)
588			utf8len += extrabytes;
589		byte = *(utf8p - 1);
590
591		if ((ucsp + 2) >= bufend)
592			goto toolong;
593
594		/* Make a previous combining sequence canonical. */
595		if (combcharcnt > 1) {
596			priortysort(ucsp - combcharcnt, combcharcnt);
597		}
598		combcharcnt = 0;
599
600		ucs_ch = '%';
601		*ucsp++ = swapbytes ? OSSwapInt16(ucs_ch) : (u_int16_t)ucs_ch;
602		ucs_ch =  hexdigits[byte >> 4];
603		*ucsp++ = swapbytes ? OSSwapInt16(ucs_ch) : (u_int16_t)ucs_ch;
604		ucs_ch =  hexdigits[byte & 0x0F];
605		*ucsp++ = swapbytes ? OSSwapInt16(ucs_ch) : (u_int16_t)ucs_ch;
606	}
607	/*
608	 * Make a previous combining sequence canonical
609	 */
610	if (combcharcnt > 1) {
611		priortysort(ucsp - combcharcnt, combcharcnt);
612	}
613exit:
614	*ucslen = (u_int8_t*)ucsp - (u_int8_t*)bufstart;
615
616	return (result);
617
618toolong:
619	result = ENAMETOOLONG;
620	goto exit;
621}
622
623
624/*
625 * utf8_validatestr - Check for a valid UTF-8 string.
626 */
627int
628utf8_validatestr(const u_int8_t* utf8p, size_t utf8len)
629{
630	unsigned int byte;
631	u_int32_t ch;
632	unsigned int ucs_ch;
633	size_t extrabytes;
634
635	while (utf8len-- > 0 && (byte = *utf8p++) != '\0') {
636		if (byte < 0x80)
637			continue;  /* plain ascii */
638
639		extrabytes = utf_extrabytes[byte >> 3];
640
641		if (utf8len < extrabytes)
642			goto invalid;
643		utf8len -= extrabytes;
644
645		switch (extrabytes) {
646		case 1:
647			ch = byte; ch <<= 6;   /* 1st byte */
648			byte = *utf8p++;       /* 2nd byte */
649			if ((byte >> 6) != 2)
650				goto invalid;
651			ch += byte;
652			ch -= 0x00003080UL;
653			if (ch < 0x0080)
654				goto invalid;
655			break;
656		case 2:
657			ch = byte; ch <<= 6;   /* 1st byte */
658			byte = *utf8p++;       /* 2nd byte */
659			if ((byte >> 6) != 2)
660				goto invalid;
661			ch += byte; ch <<= 6;
662			byte = *utf8p++;       /* 3rd byte */
663			if ((byte >> 6) != 2)
664				goto invalid;
665			ch += byte;
666			ch -= 0x000E2080UL;
667			if (ch < 0x0800)
668				goto invalid;
669			if (ch >= 0xD800) {
670				if (ch <= 0xDFFF)
671					goto invalid;
672				if (ch == 0xFFFE || ch == 0xFFFF)
673					goto invalid;
674			}
675			break;
676		case 3:
677			ch = byte; ch <<= 6;   /* 1st byte */
678			byte = *utf8p++;       /* 2nd byte */
679			if ((byte >> 6) != 2)
680				goto invalid;
681			ch += byte; ch <<= 6;
682			byte = *utf8p++;       /* 3rd byte */
683			if ((byte >> 6) != 2)
684				goto invalid;
685			ch += byte; ch <<= 6;
686			byte = *utf8p++;       /* 4th byte */
687			if ((byte >> 6) != 2)
688				goto invalid;
689			ch += byte;
690			ch -= 0x03C82080UL + SP_HALF_BASE;
691			ucs_ch = (ch >> SP_HALF_SHIFT) + SP_HIGH_FIRST;
692			if (ucs_ch < SP_HIGH_FIRST || ucs_ch > SP_HIGH_LAST)
693				goto invalid;
694			ucs_ch = (ch & SP_HALF_MASK) + SP_LOW_FIRST;
695			if (ucs_ch < SP_LOW_FIRST || ucs_ch > SP_LOW_LAST)
696				goto invalid;
697			break;
698		default:
699			goto invalid;
700		}
701
702	}
703	return (0);
704invalid:
705	return (EINVAL);
706}
707
708/*
709 * utf8_normalizestr - Normalize a UTF-8 string (NFC or NFD)
710 *
711 * This function takes an UTF-8 input string, instr, of inlen bytes
712 * and produces normalized UTF-8 output into a buffer of buflen bytes
713 * pointed to by outstr. The size of the output in bytes (not including
714 * a NULL termination byte) is returned in outlen. In-place conversions
715 * are not supported (i.e. instr != outstr).]
716
717 * FLAGS
718 *    UTF_DECOMPOSED:  output string will be fully decomposed (NFD)
719 *
720 *    UTF_PRECOMPOSED:  output string will be precomposed (NFC)
721 *
722 *    UTF_NO_NULL_TERM:  do not add null termination to output string
723 *
724 *    UTF_ESCAPE_ILLEGAL:  percent escape any illegal UTF-8 input
725 *
726 * ERRORS
727 *    ENAMETOOLONG:  output did not fit or input exceeded MAXPATHLEN bytes
728 *
729 *    EINVAL:  illegal UTF-8 sequence encountered or invalid flags
730 */
731int
732utf8_normalizestr(const u_int8_t* instr, size_t inlen, u_int8_t* outstr,
733                  size_t *outlen, size_t buflen, int flags)
734{
735	u_int16_t unicodebuf[32];
736	u_int16_t* unistr = NULL;
737	size_t unicode_bytes;
738	size_t uft8_bytes;
739	size_t inbuflen;
740	u_int8_t *outbufstart, *outbufend;
741	const u_int8_t *inbufstart;
742	unsigned int byte;
743	int decompose, precompose;
744	int result = 0;
745
746	if (flags & ~(UTF_DECOMPOSED | UTF_PRECOMPOSED | UTF_NO_NULL_TERM | UTF_ESCAPE_ILLEGAL)) {
747		return (EINVAL);
748	}
749	decompose = (flags & UTF_DECOMPOSED);
750	precompose = (flags & UTF_PRECOMPOSED);
751	if ((decompose && precompose) || (!decompose && !precompose)) {
752		return (EINVAL);
753	}
754	outbufstart = outstr;
755	outbufend = outbufstart + buflen;
756	inbufstart = instr;
757	inbuflen = inlen;
758
759	while (inlen-- > 0 && (byte = *instr++) != '\0') {
760		if (outstr >= outbufend) {
761			result = ENAMETOOLONG;
762			goto exit;
763		}
764		if (byte >= 0x80) {
765			goto nonASCII;
766		}
767		/* ASCII is already normalized. */
768		*outstr++ = byte;
769	}
770exit:
771	*outlen = outstr - outbufstart;
772	if (((flags & UTF_NO_NULL_TERM) == 0)) {
773		if (outstr < outbufend)
774			*outstr++ = '\0';
775		else
776			result = ENAMETOOLONG;
777	}
778	return (result);
779
780
781	/*
782	 * Non-ASCII uses the existing utf8_encodestr/utf8_decodestr
783	 * functions to perform the normalization.  Since this will
784	 * presumably be used to normalize filenames in the back-end
785	 * (on disk or over-the-wire), it should be fast enough.
786	 */
787nonASCII:
788
789	/* Make sure the input size is reasonable. */
790	if (inbuflen > MAXPATHLEN) {
791		result = ENAMETOOLONG;
792		goto exit;
793	}
794	/*
795	 * Compute worst case Unicode buffer size.
796	 *
797	 * For pre-composed output, every UTF-8 input byte will be at
798	 * most 2 Unicode bytes.  For decomposed output, 2 UTF-8 bytes
799	 * (smallest composite char sequence) may yield 6 Unicode bytes
800	 * (1 base char + 2 combining chars).
801	 */
802	unicode_bytes = precompose ? (inbuflen * 2) : (inbuflen * 3);
803
804	if (unicode_bytes <= sizeof(unicodebuf))
805		unistr = &unicodebuf[0];
806	else
807		MALLOC(unistr, u_int16_t *, unicode_bytes, M_TEMP, M_WAITOK);
808
809	/* Normalize the string. */
810	result = utf8_decodestr(inbufstart, inbuflen, unistr, &unicode_bytes,
811	                        unicode_bytes, 0, flags & ~UTF_NO_NULL_TERM);
812	if (result == 0) {
813		/* Put results back into UTF-8. */
814		result = utf8_encodestr(unistr, unicode_bytes, outbufstart,
815		                        &uft8_bytes, buflen, 0, UTF_NO_NULL_TERM);
816		outstr = outbufstart + uft8_bytes;
817	}
818	if (unistr && unistr != &unicodebuf[0]) {
819		FREE(unistr, M_TEMP);
820	}
821	goto exit;
822}
823
824
825 /*
826  * Unicode 3.2 decomposition code (derived from Core Foundation)
827  */
828
829typedef struct {
830	u_int32_t _key;
831	u_int32_t _value;
832} unicode_mappings32;
833
834static inline u_int32_t
835getmappedvalue32(const unicode_mappings32 *theTable, u_int32_t numElem,
836		u_int16_t character)
837{
838	const unicode_mappings32 *p, *q, *divider;
839
840	if ((character < theTable[0]._key) || (character > theTable[numElem-1]._key))
841		return (0);
842
843	p = theTable;
844	q = p + (numElem-1);
845	while (p <= q) {
846		divider = p + ((q - p) >> 1);	/* divide by 2 */
847		if (character < divider->_key) { q = divider - 1; }
848		else if (character > divider->_key) { p = divider + 1; }
849		else { return (divider->_value); }
850	}
851	return (0);
852}
853
854#define RECURSIVE_DECOMPOSITION	(1 << 15)
855#define EXTRACT_COUNT(value)	(((value) >> 12) & 0x0007)
856
857typedef struct {
858	u_int16_t _key;
859	u_int16_t _value;
860} unicode_mappings16;
861
862static inline u_int16_t
863getmappedvalue16(const unicode_mappings16 *theTable, u_int32_t numElem,
864		u_int16_t character)
865{
866	const unicode_mappings16 *p, *q, *divider;
867
868	if ((character < theTable[0]._key) || (character > theTable[numElem-1]._key))
869		return (0);
870
871	p = theTable;
872	q = p + (numElem-1);
873	while (p <= q) {
874		divider = p + ((q - p) >> 1);	/* divide by 2 */
875		if (character < divider->_key)
876			q = divider - 1;
877		else if (character > divider->_key)
878			p = divider + 1;
879		else
880			return (divider->_value);
881	}
882	return (0);
883}
884
885
886static u_int32_t
887unicode_recursive_decompose(u_int16_t character, u_int16_t *convertedChars)
888{
889	u_int16_t value;
890	u_int32_t length;
891	u_int16_t firstChar;
892	u_int16_t theChar;
893	const u_int16_t *bmpMappings;
894	u_int32_t usedLength;
895
896	value = getmappedvalue16(
897		(const unicode_mappings16 *)__CFUniCharDecompositionTable,
898		__UniCharDecompositionTableLength, character);
899	length = EXTRACT_COUNT(value);
900	firstChar = value & 0x0FFF;
901	theChar = firstChar;
902	bmpMappings = (length == 1 ? &theChar : __CFUniCharMultipleDecompositionTable + firstChar);
903	usedLength = 0;
904
905	if (value & RECURSIVE_DECOMPOSITION) {
906	    usedLength = unicode_recursive_decompose((u_int16_t)*bmpMappings, convertedChars);
907
908	    --length;	/* Decrement for the first char */
909	    if (!usedLength)
910	    	return 0;
911	    ++bmpMappings;
912	    convertedChars += usedLength;
913	}
914
915	usedLength += length;
916
917	while (length--)
918		*(convertedChars++) = *(bmpMappings++);
919
920	return (usedLength);
921}
922
923#define HANGUL_SBASE 0xAC00
924#define HANGUL_LBASE 0x1100
925#define HANGUL_VBASE 0x1161
926#define HANGUL_TBASE 0x11A7
927
928#define HANGUL_SCOUNT 11172
929#define HANGUL_LCOUNT 19
930#define HANGUL_VCOUNT 21
931#define HANGUL_TCOUNT 28
932#define HANGUL_NCOUNT (HANGUL_VCOUNT * HANGUL_TCOUNT)
933
934/*
935 * unicode_decompose - decompose a composed Unicode char
936 *
937 * Composed Unicode characters are forbidden on
938 * HFS Plus volumes. ucs_decompose will convert a
939 * composed character into its correct decomposed
940 * sequence.
941 *
942 * Similar to CFUniCharDecomposeCharacter
943 */
944static int
945unicode_decompose(u_int16_t character, u_int16_t *convertedChars)
946{
947	if ((character >= HANGUL_SBASE) &&
948	    (character <= (HANGUL_SBASE + HANGUL_SCOUNT))) {
949		u_int32_t length;
950
951		character -= HANGUL_SBASE;
952		length = (character % HANGUL_TCOUNT ? 3 : 2);
953
954		*(convertedChars++) =
955			character / HANGUL_NCOUNT + HANGUL_LBASE;
956		*(convertedChars++) =
957			(character % HANGUL_NCOUNT) / HANGUL_TCOUNT + HANGUL_VBASE;
958		if (length > 2)
959			*convertedChars = (character % HANGUL_TCOUNT) + HANGUL_TBASE;
960		return (length);
961	} else {
962		return (unicode_recursive_decompose(character, convertedChars));
963	}
964}
965
966/*
967 * unicode_combine - generate a precomposed Unicode char
968 *
969 * Precomposed Unicode characters are required for some volume
970 * formats and network protocols.  unicode_combine will combine
971 * a decomposed character sequence into a single precomposed
972 * (composite) character.
973 *
974 * Similar toCFUniCharPrecomposeCharacter but unicode_combine
975 * also handles Hangul Jamo characters.
976 */
977static u_int16_t
978unicode_combine(u_int16_t base, u_int16_t combining)
979{
980	u_int32_t value;
981
982	/* Check HANGUL */
983	if ((combining >= HANGUL_VBASE) && (combining < (HANGUL_TBASE + HANGUL_TCOUNT))) {
984		/* 2 char Hangul sequences */
985		if ((combining < (HANGUL_VBASE + HANGUL_VCOUNT)) &&
986		    (base >= HANGUL_LBASE && base < (HANGUL_LBASE + HANGUL_LCOUNT))) {
987		    return (HANGUL_SBASE +
988		            ((base - HANGUL_LBASE)*(HANGUL_VCOUNT*HANGUL_TCOUNT)) +
989		            ((combining  - HANGUL_VBASE)*HANGUL_TCOUNT));
990		}
991
992		/* 3 char Hangul sequences */
993		if ((combining > HANGUL_TBASE) &&
994		    (base >= HANGUL_SBASE && base < (HANGUL_SBASE + HANGUL_SCOUNT))) {
995			if ((base - HANGUL_SBASE) % HANGUL_TCOUNT)
996				return (0);
997			else
998				return (base + (combining - HANGUL_TBASE));
999		}
1000	}
1001
1002	value = getmappedvalue32(
1003		(const unicode_mappings32 *)__CFUniCharPrecompSourceTable,
1004		__CFUniCharPrecompositionTableLength, combining);
1005
1006	if (value) {
1007		value = getmappedvalue16(
1008			(const unicode_mappings16 *)
1009			((const u_int32_t *)__CFUniCharBMPPrecompDestinationTable + (value & 0xFFFF)),
1010			(value >> 16), base);
1011	}
1012	return (value);
1013}
1014
1015
1016/*
1017 * priortysort - order combining chars into canonical order
1018 *
1019 * Similar to CFUniCharPrioritySort
1020 */
1021static void
1022priortysort(u_int16_t* characters, int count)
1023{
1024	u_int32_t p1, p2;
1025	u_int16_t *ch1, *ch2;
1026	u_int16_t *end;
1027	int changes = 0;
1028
1029	end = characters + count;
1030	do {
1031		changes = 0;
1032		ch1 = characters;
1033		ch2 = characters + 1;
1034		p2 = get_combining_class(*ch1);
1035		while (ch2 < end) {
1036			p1 = p2;
1037			p2 = get_combining_class(*ch2);
1038			if (p1 > p2 && p2 != 0) {
1039				u_int32_t tmp;
1040
1041				tmp = *ch1;
1042				*ch1 = *ch2;
1043				*ch2 = tmp;
1044				changes = 1;
1045
1046				/*
1047				 * Make sure that p2 contains the combining class for the
1048				 * character now stored at *ch2.  This isn't required for
1049				 * correctness, but it will be more efficient if a character
1050				 * with a large combining class has to "bubble past" several
1051				 * characters with lower combining classes.
1052				 */
1053				p2 = p1;
1054			}
1055			++ch1;
1056			++ch2;
1057		}
1058	} while (changes);
1059}
1060
1061
1062/*
1063 * Invalid NTFS filename characters are encodeded using the
1064 * SFM (Services for Macintosh) private use Unicode characters.
1065 *
1066 * These should only be used for SMB, MSDOS or NTFS.
1067 *
1068 *    Illegal NTFS Char   SFM Unicode Char
1069 *  ----------------------------------------
1070 *    0x01-0x1f           0xf001-0xf01f
1071 *    '"'                 0xf020
1072 *    '*'                 0xf021
1073 *    '/'                 0xf022
1074 *    '<'                 0xf023
1075 *    '>'                 0xf024
1076 *    '?'                 0xf025
1077 *    '\'                 0xf026
1078 *    '|'                 0xf027
1079 *    ' '                 0xf028  (Only if last char of the name)
1080 *    '.'                 0xf029  (Only if last char of the name)
1081 *  ----------------------------------------
1082 *
1083 *  Reference: http://support.microsoft.com/kb/q117258/
1084 */
1085
1086#define MAX_SFM2MAC           0x29
1087#define SFMCODE_PREFIX_MASK   0xf000
1088
1089/*
1090 * In the Mac OS 9 days the colon was illegal in a file name. For that reason
1091 * SFM had no conversion for the colon. There is a conversion for the
1092 * slash. In Mac OS X the slash is illegal in a file name. So for us the colon
1093 * is a slash and a slash is a colon. So we can just replace the slash with the
1094 * colon in our tables and everything will just work.
1095 */
1096static u_int8_t
1097sfm2mac[42] = {
1098	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* 00 - 07 */
1099	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,   /* 08 - 0F */
1100	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,   /* 10 - 17 */
1101	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,   /* 18 - 1F */
1102	0x22, 0x2a, 0x3a, 0x3c, 0x3e, 0x3f, 0x5c, 0x7c,   /* 20 - 27 */
1103	0x20, 0x2e                                        /* 28 - 29 */
1104};
1105
1106static u_int8_t
1107mac2sfm[112] = {
1108	0x20, 0x21, 0x20, 0x23, 0x24, 0x25, 0x26, 0x27,	  /* 20 - 27 */
1109	0x28, 0x29, 0x21, 0x2b, 0x2c, 0x2d, 0x2e, 0x22,   /* 28 - 2f */
1110	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,   /* 30 - 37 */
1111	0x38, 0x39, 0x22, 0x3b, 0x23, 0x3d, 0x24, 0x25,   /* 38 - 3f */
1112	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,   /* 40 - 47 */
1113	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,   /* 48 - 4f */
1114	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,   /* 50 - 57 */
1115	0x58, 0x59, 0x5a, 0x5b, 0x26, 0x5d, 0x5e, 0x5f,   /* 58 - 5f */
1116	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,   /* 60 - 67 */
1117	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,   /* 68 - 6f */
1118	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,   /* 70 - 77 */
1119	0x78, 0x79, 0x7a, 0x7b, 0x27, 0x7d, 0x7e, 0x7f    /* 78 - 7f */
1120};
1121
1122
1123/*
1124 * Encode illegal NTFS filename characters into SFM Private Unicode characters
1125 *
1126 * Assumes non-zero ASCII input.
1127 */
1128static u_int16_t
1129ucs_to_sfm(u_int16_t ucs_ch, int lastchar)
1130{
1131	/* The last character of filename cannot be a space or period. */
1132	if (lastchar) {
1133		if (ucs_ch == 0x20)
1134			return (0xf028);
1135		else if (ucs_ch == 0x2e)
1136			return (0xf029);
1137	}
1138	/* 0x01 - 0x1f is simple transformation. */
1139	if (ucs_ch <= 0x1f) {
1140		return (ucs_ch | 0xf000);
1141	} else /* 0x20 - 0x7f */ {
1142		u_int16_t lsb;
1143
1144		lsb = mac2sfm[ucs_ch - 0x0020];
1145		if (lsb != ucs_ch)
1146			return(0xf000 | lsb);
1147	}
1148	return (ucs_ch);
1149}
1150
1151/*
1152 * Decode any SFM Private Unicode characters
1153 */
1154static u_int16_t
1155sfm_to_ucs(u_int16_t ucs_ch)
1156{
1157	if (((ucs_ch & 0xffC0) == SFMCODE_PREFIX_MASK) &&
1158	    ((ucs_ch & 0x003f) <= MAX_SFM2MAC)) {
1159		ucs_ch = sfm2mac[ucs_ch & 0x003f];
1160	}
1161	return (ucs_ch);
1162}
1163
1164
1165