1/**
2 * unistr.c - Unicode string handling. Originated from the Linux-NTFS project.
3 *
4 * Copyright (c) 2000-2004 Anton Altaparmakov
5 * Copyright (c) 2002-2009 Szabolcs Szakacsits
6 * Copyright (c) 2008-2015 Jean-Pierre Andre
7 * Copyright (c) 2008      Bernhard Kaindl
8 *
9 * This program/include file is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License as published
11 * by the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program/include file is distributed in the hope that it will be
15 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
16 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program (in the main directory of the NTFS-3G
21 * distribution in the file COPYING); if not, write to the Free Software
22 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
23 */
24
25#ifdef HAVE_CONFIG_H
26#include "config.h"
27#endif
28
29#ifdef HAVE_STDIO_H
30#include <stdio.h>
31#endif
32#ifdef HAVE_STDLIB_H
33#include <stdlib.h>
34#endif
35#ifdef HAVE_WCHAR_H
36#include <wchar.h>
37#endif
38#ifdef HAVE_STRING_H
39#include <string.h>
40#endif
41#ifdef HAVE_ERRNO_H
42#include <errno.h>
43#endif
44#ifdef HAVE_LOCALE_H
45#include <locale.h>
46#endif
47
48#if defined(__APPLE__) || defined(__DARWIN__)
49#ifdef ENABLE_NFCONV
50#include <CoreFoundation/CoreFoundation.h>
51#endif /* ENABLE_NFCONV */
52#endif /* defined(__APPLE__) || defined(__DARWIN__) */
53
54#include "compat.h"
55#include "attrib.h"
56#include "types.h"
57#include "unistr.h"
58#include "debug.h"
59#include "logging.h"
60#include "misc.h"
61
/*
 * Policy switch for malformed UTF-16 input.  When non-zero, unpaired
 * surrogates and the code units U+FFFE and U+FFFF are converted as individual
 * three-byte UTF-8 sequences instead of being rejected with EILSEQ.  The
 * permissive behaviour is the default (this choice is open to debate); it can
 * be overridden by defining ALLOW_BROKEN_UNICODE before this point.
 */
#if !defined(ALLOW_BROKEN_UNICODE)
#define ALLOW_BROKEN_UNICODE 1
#endif /* !defined(ALLOW_BROKEN_UNICODE) */
67
68/*
69 * IMPORTANT
70 * =========
71 *
72 * All these routines assume that the Unicode characters are in little endian
73 * encoding inside the strings!!!
74 */
75
/*
 * Non-zero selects the built-in UTF-8 <-> UTF-16LE converters for file names;
 * zero makes ntfs_ucstombs() and ntfs_mbstoucs() use the multibyte encoding
 * of the current locale instead.
 */
static int use_utf8 = 1;
77
#if (defined(__APPLE__) || defined(__DARWIN__)) && defined(ENABLE_NFCONV)
/**
 * Switch for automatic normalization form conversion when NTFS Unicode file
 * names are translated to and from UTF-8.  It is enabled by default and can
 * be changed from outside this file through
 *   int ntfs_macosx_normalize_filenames(int normalize);
 */
static int nfconvert_utf8 = 1;
#endif /* (__APPLE__ || __DARWIN__) && ENABLE_NFCONV */
89
/*
 * Table of per-character flags intended to let the name collation functions
 * quickly determine which characters are (in)valid.  It is currently compiled
 * out (#if 0) and kept for reference only.
 */
94#if 0
95static const u8 legal_ansi_char_array[0x40] = {
96	0x00, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
97	0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
98
99	0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
100	0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
101
102	0x17, 0x07, 0x18, 0x17, 0x17, 0x17, 0x17, 0x17,
103	0x17, 0x17, 0x18, 0x16, 0x16, 0x17, 0x07, 0x00,
104
105	0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17,
106	0x17, 0x17, 0x04, 0x16, 0x18, 0x16, 0x18, 0x18,
107};
108#endif
109
110/**
111 * ntfs_names_are_equal - compare two Unicode names for equality
112 * @s1:			name to compare to @s2
113 * @s1_len:		length in Unicode characters of @s1
114 * @s2:			name to compare to @s1
115 * @s2_len:		length in Unicode characters of @s2
116 * @ic:			ignore case bool
117 * @upcase:		upcase table (only if @ic == IGNORE_CASE)
118 * @upcase_size:	length in Unicode characters of @upcase (if present)
119 *
120 * Compare the names @s1 and @s2 and return TRUE (1) if the names are
121 * identical, or FALSE (0) if they are not identical. If @ic is IGNORE_CASE,
122 * the @upcase table is used to perform a case insensitive comparison.
123 */
124BOOL ntfs_names_are_equal(const ntfschar *s1, size_t s1_len,
125		const ntfschar *s2, size_t s2_len,
126		const IGNORE_CASE_BOOL ic,
127		const ntfschar *upcase, const u32 upcase_size)
128{
129	if (s1_len != s2_len)
130		return FALSE;
131	if (!s1_len)
132		return TRUE;
133	if (ic == CASE_SENSITIVE)
134		return ntfs_ucsncmp(s1, s2, s1_len) ? FALSE: TRUE;
135	return ntfs_ucsncasecmp(s1, s2, s1_len, upcase, upcase_size) ? FALSE:
136								       TRUE;
137}
138
139/*
140 * ntfs_names_full_collate() fully collate two Unicode names
141 *
142 * @name1:	first Unicode name to compare
143 * @name1_len:	length of first Unicode name to compare
144 * @name2:	second Unicode name to compare
145 * @name2_len:	length of second Unicode name to compare
146 * @ic:		either CASE_SENSITIVE or IGNORE_CASE (see below)
147 * @upcase:	upcase table
148 * @upcase_len:	upcase table size
149 *
150 * If @ic is CASE_SENSITIVE, then the names are compared primarily ignoring
151 * case, but if the names are equal ignoring case, then they are compared
152 * case-sensitively.  As an example, "abc" would collate before "BCD" (since
153 * "abc" and "BCD" differ ignoring case and 'A' < 'B') but after "ABC" (since
154 * "ABC" and "abc" are equal ignoring case and 'A' < 'a').  This matches the
155 * collation order of filenames as indexed in NTFS directories.
156 *
157 * If @ic is IGNORE_CASE, then the names are only compared case-insensitively
158 * and are considered to match if and only if they are equal ignoring case.
159 *
160 * Returns:
161 *  -1 if the first name collates before the second one,
162 *   0 if the names match, or
163 *   1 if the second name collates before the first one
164 */
165int ntfs_names_full_collate(const ntfschar *name1, const u32 name1_len,
166		const ntfschar *name2, const u32 name2_len,
167		const IGNORE_CASE_BOOL ic, const ntfschar *upcase,
168		const u32 upcase_len)
169{
170	u32 cnt;
171	u16 c1, c2;
172	u16 u1, u2;
173
174#ifdef DEBUG
175	if (!name1 || !name2 || !upcase || !upcase_len) {
176		ntfs_log_debug("ntfs_names_collate received NULL pointer!\n");
177		exit(1);
178	}
179#endif
180	cnt = min(name1_len, name2_len);
181	if (cnt > 0) {
182		if (ic == CASE_SENSITIVE) {
183			while (--cnt && (*name1 == *name2)) {
184				name1++;
185				name2++;
186			}
187			u1 = c1 = le16_to_cpu(*name1);
188			u2 = c2 = le16_to_cpu(*name2);
189			if (u1 < upcase_len)
190				u1 = le16_to_cpu(upcase[u1]);
191			if (u2 < upcase_len)
192				u2 = le16_to_cpu(upcase[u2]);
193			if ((u1 == u2) && cnt)
194				do {
195					name1++;
196					u1 = le16_to_cpu(*name1);
197					name2++;
198					u2 = le16_to_cpu(*name2);
199					if (u1 < upcase_len)
200						u1 = le16_to_cpu(upcase[u1]);
201					if (u2 < upcase_len)
202						u2 = le16_to_cpu(upcase[u2]);
203				} while ((u1 == u2) && --cnt);
204			if (u1 < u2)
205				return -1;
206			if (u1 > u2)
207				return 1;
208			if (name1_len < name2_len)
209				return -1;
210			if (name1_len > name2_len)
211				return 1;
212			if (c1 < c2)
213				return -1;
214			if (c1 > c2)
215				return 1;
216		} else {
217			do {
218				u1 = le16_to_cpu(*name1);
219				name1++;
220				u2 = le16_to_cpu(*name2);
221				name2++;
222				if (u1 < upcase_len)
223					u1 = le16_to_cpu(upcase[u1]);
224				if (u2 < upcase_len)
225					u2 = le16_to_cpu(upcase[u2]);
226			} while ((u1 == u2) && --cnt);
227			if (u1 < u2)
228				return -1;
229			if (u1 > u2)
230				return 1;
231			if (name1_len < name2_len)
232				return -1;
233			if (name1_len > name2_len)
234				return 1;
235		}
236	} else {
237		if (name1_len < name2_len)
238			return -1;
239		if (name1_len > name2_len)
240			return 1;
241	}
242	return 0;
243}
244
245/**
246 * ntfs_ucsncmp - compare two little endian Unicode strings
247 * @s1:		first string
248 * @s2:		second string
249 * @n:		maximum unicode characters to compare
250 *
251 * Compare the first @n characters of the Unicode strings @s1 and @s2,
252 * The strings in little endian format and appropriate le16_to_cpu()
253 * conversion is performed on non-little endian machines.
254 *
255 * The function returns an integer less than, equal to, or greater than zero
256 * if @s1 (or the first @n Unicode characters thereof) is found, respectively,
257 * to be less than, to match, or be greater than @s2.
258 */
259int ntfs_ucsncmp(const ntfschar *s1, const ntfschar *s2, size_t n)
260{
261	u16 c1, c2;
262	size_t i;
263
264#ifdef DEBUG
265	if (!s1 || !s2) {
266		ntfs_log_debug("ntfs_wcsncmp() received NULL pointer!\n");
267		exit(1);
268	}
269#endif
270	for (i = 0; i < n; ++i) {
271		c1 = le16_to_cpu(s1[i]);
272		c2 = le16_to_cpu(s2[i]);
273		if (c1 < c2)
274			return -1;
275		if (c1 > c2)
276			return 1;
277		if (!c1)
278			break;
279	}
280	return 0;
281}
282
283/**
284 * ntfs_ucsncasecmp - compare two little endian Unicode strings, ignoring case
285 * @s1:			first string
286 * @s2:			second string
287 * @n:			maximum unicode characters to compare
288 * @upcase:		upcase table
289 * @upcase_size:	upcase table size in Unicode characters
290 *
291 * Compare the first @n characters of the Unicode strings @s1 and @s2,
292 * ignoring case. The strings in little endian format and appropriate
293 * le16_to_cpu() conversion is performed on non-little endian machines.
294 *
295 * Each character is uppercased using the @upcase table before the comparison.
296 *
297 * The function returns an integer less than, equal to, or greater than zero
298 * if @s1 (or the first @n Unicode characters thereof) is found, respectively,
299 * to be less than, to match, or be greater than @s2.
300 */
301int ntfs_ucsncasecmp(const ntfschar *s1, const ntfschar *s2, size_t n,
302		const ntfschar *upcase, const u32 upcase_size)
303{
304	u16 c1, c2;
305	size_t i;
306
307#ifdef DEBUG
308	if (!s1 || !s2 || !upcase) {
309		ntfs_log_debug("ntfs_wcsncasecmp() received NULL pointer!\n");
310		exit(1);
311	}
312#endif
313	for (i = 0; i < n; ++i) {
314		if ((c1 = le16_to_cpu(s1[i])) < upcase_size)
315			c1 = le16_to_cpu(upcase[c1]);
316		if ((c2 = le16_to_cpu(s2[i])) < upcase_size)
317			c2 = le16_to_cpu(upcase[c2]);
318		if (c1 < c2)
319			return -1;
320		if (c1 > c2)
321			return 1;
322		if (!c1)
323			break;
324	}
325	return 0;
326}
327
328/**
329 * ntfs_ucsnlen - determine the length of a little endian Unicode string
330 * @s:		pointer to Unicode string
331 * @maxlen:	maximum length of string @s
332 *
333 * Return the number of Unicode characters in the little endian Unicode
334 * string @s up to a maximum of maxlen Unicode characters, not including
335 * the terminating (ntfschar)'\0'. If there is no (ntfschar)'\0' between @s
336 * and @s + @maxlen, @maxlen is returned.
337 *
338 * This function never looks beyond @s + @maxlen.
339 */
340u32 ntfs_ucsnlen(const ntfschar *s, u32 maxlen)
341{
342	u32 i;
343
344	for (i = 0; i < maxlen; i++) {
345		if (!le16_to_cpu(s[i]))
346			break;
347	}
348	return i;
349}
350
351/**
352 * ntfs_ucsndup - duplicate little endian Unicode string
353 * @s:		pointer to Unicode string
354 * @maxlen:	maximum length of string @s
355 *
356 * Return a pointer to a new little endian Unicode string which is a duplicate
357 * of the string s.  Memory for the new string is obtained with ntfs_malloc(3),
358 * and can be freed with free(3).
359 *
360 * A maximum of @maxlen Unicode characters are copied and a terminating
361 * (ntfschar)'\0' little endian Unicode character is added.
362 *
363 * This function never looks beyond @s + @maxlen.
364 *
365 * Return a pointer to the new little endian Unicode string on success and NULL
366 * on failure with errno set to the error code.
367 */
368ntfschar *ntfs_ucsndup(const ntfschar *s, u32 maxlen)
369{
370	ntfschar *dst;
371	u32 len;
372
373	len = ntfs_ucsnlen(s, maxlen);
374	dst = ntfs_malloc((len + 1) * sizeof(ntfschar));
375	if (dst) {
376		memcpy(dst, s, len * sizeof(ntfschar));
377		dst[len] = const_cpu_to_le16(L'\0');
378	}
379	return dst;
380}
381
382/**
383 * ntfs_name_upcase - Map an Unicode name to its uppercase equivalent
384 * @name:
385 * @name_len:
386 * @upcase:
387 * @upcase_len:
388 *
389 * Description...
390 *
391 * Returns:
392 */
393void ntfs_name_upcase(ntfschar *name, u32 name_len, const ntfschar *upcase,
394		const u32 upcase_len)
395{
396	u32 i;
397	u16 u;
398
399	for (i = 0; i < name_len; i++)
400		if ((u = le16_to_cpu(name[i])) < upcase_len)
401			name[i] = upcase[u];
402}
403
404/**
405 * ntfs_name_locase - Map a Unicode name to its lowercase equivalent
406 */
407void ntfs_name_locase(ntfschar *name, u32 name_len, const ntfschar *locase,
408		const u32 locase_len)
409{
410	u32 i;
411	u16 u;
412
413	if (locase)
414		for (i = 0; i < name_len; i++)
415			if ((u = le16_to_cpu(name[i])) < locase_len)
416				name[i] = locase[u];
417}
418
419/**
420 * ntfs_file_value_upcase - Convert a filename to upper case
421 * @file_name_attr:
422 * @upcase:
423 * @upcase_len:
424 *
425 * Description...
426 *
427 * Returns:
428 */
429void ntfs_file_value_upcase(FILE_NAME_ATTR *file_name_attr,
430		const ntfschar *upcase, const u32 upcase_len)
431{
432	ntfs_name_upcase((ntfschar*)&file_name_attr->file_name,
433			file_name_attr->file_name_length, upcase, upcase_len);
434}
435
/*
   NTFS uses Unicode (UTF-16LE [NTFS-3G uses UCS-2LE, which is enough
   for now]) for path names, but the Unicode code points need to be
   converted before a path can be accessed under NTFS. For 7 bit ASCII/ANSI,
   glibc does this even without a locale in a hard-coded fashion, as that
   appears to be easy because the low 7-bit ASCII range appears to be
   available in all charsets. However, it does not convert anything else if
   there was some error with the locale setup, or if no locale was set up,
   as when mount is called during early boot, where (by policy) locales are
   not used (and may not be available if /usr is not yet mounted).
   This patch fixes the resulting issues for systems which use UTF-8; for
   others, specifying the locale in fstab brings them the encoding which
   they want.

   If no locale is defined or there was a problem with setting one
   up, and whenever nl_langinfo(CODESET) returns a string starting with
   "ANSI", use an internal UCS-2LE <-> UTF-8 codeset converter to fix
   the bug where NTFS-3G does not show any path names which include
   international characters (and also fails on creating them).

   Author: Bernhard Kaindl <bk@suse.de>
   Jean-Pierre Andre made it compliant with RFC3629/RFC2781.
*/
459
460/*
461 * Return the number of bytes in UTF-8 needed (without the terminating null) to
462 * store the given UTF-16LE string.
463 *
464 * On error, -1 is returned, and errno is set to the error code. The following
465 * error codes can be expected:
466 *	EILSEQ		The input string is not valid UTF-16LE (only possible
467 *			if compiled without ALLOW_BROKEN_UNICODE).
468 *	ENAMETOOLONG	The length of the UTF-8 string in bytes (without the
469 *			terminating null) would exceed @outs_len.
470 */
471static int utf16_to_utf8_size(const ntfschar *ins, const int ins_len, int outs_len)
472{
473	int i, ret = -1;
474	int count = 0;
475	BOOL surrog;
476
477	surrog = FALSE;
478	for (i = 0; i < ins_len && ins[i] && count <= outs_len; i++) {
479		unsigned short c = le16_to_cpu(ins[i]);
480		if (surrog) {
481			if ((c >= 0xdc00) && (c < 0xe000)) {
482				surrog = FALSE;
483				count += 4;
484			} else {
485#if ALLOW_BROKEN_UNICODE
486				/* The first UTF-16 unit of a surrogate pair has
487				 * a value between 0xd800 and 0xdc00. It can be
488				 * encoded as an individual UTF-8 sequence if we
489				 * cannot combine it with the next UTF-16 unit
490				 * unit as a surrogate pair. */
491				surrog = FALSE;
492				count += 3;
493
494				--i;
495				continue;
496#else
497				goto fail;
498#endif /* ALLOW_BROKEN_UNICODE */
499			}
500		} else
501			if (c < 0x80)
502				count++;
503			else if (c < 0x800)
504				count += 2;
505			else if (c < 0xd800)
506				count += 3;
507			else if (c < 0xdc00)
508				surrog = TRUE;
509#if ALLOW_BROKEN_UNICODE
510			else if (c < 0xe000)
511				count += 3;
512			else if (c >= 0xe000)
513#else
514			else if ((c >= 0xe000) && (c < 0xfffe))
515#endif /* ALLOW_BROKEN_UNICODE */
516				count += 3;
517			else
518				goto fail;
519	}
520
521	if (surrog && count <= outs_len) {
522#if ALLOW_BROKEN_UNICODE
523		count += 3; /* ending with a single surrogate */
524#else
525		goto fail;
526#endif /* ALLOW_BROKEN_UNICODE */
527	}
528
529	if (count > outs_len) {
530		errno = ENAMETOOLONG;
531		goto out;
532	}
533
534	ret = count;
535out:
536	return ret;
537fail:
538	errno = EILSEQ;
539	goto out;
540}
541
542/*
543 * ntfs_utf16_to_utf8 - convert a little endian UTF16LE string to an UTF-8 string
544 * @ins:	input utf16 string buffer
545 * @ins_len:	length of input string in utf16 characters
546 * @outs:	on return contains the (allocated) output multibyte string
547 * @outs_len:	length of output buffer in bytes (ignored if *@outs is NULL)
548 *
549 * Return -1 with errno set if string has invalid byte sequence or too long.
550 */
551static int ntfs_utf16_to_utf8(const ntfschar *ins, const int ins_len,
552			      char **outs, int outs_len)
553{
554#if defined(__APPLE__) || defined(__DARWIN__)
555#ifdef ENABLE_NFCONV
556	char *original_outs_value = *outs;
557	int original_outs_len = outs_len;
558#endif /* ENABLE_NFCONV */
559#endif /* defined(__APPLE__) || defined(__DARWIN__) */
560
561	char *t;
562	int i, size, ret = -1;
563	int halfpair;
564
565	halfpair = 0;
566	if (!*outs) {
567		/* If no output buffer was provided, we will allocate one and
568		 * limit its length to PATH_MAX.  Note: we follow the standard
569		 * convention of PATH_MAX including the terminating null. */
570		outs_len = PATH_MAX;
571	}
572
573	/* The size *with* the terminating null is limited to @outs_len,
574	 * so the size *without* the terminating null is limited to one less. */
575	size = utf16_to_utf8_size(ins, ins_len, outs_len - 1);
576
577	if (size < 0)
578		goto out;
579
580	if (!*outs) {
581		outs_len = size + 1;
582		*outs = ntfs_malloc(outs_len);
583		if (!*outs)
584			goto out;
585	}
586
587	t = *outs;
588
589	for (i = 0; i < ins_len && ins[i]; i++) {
590	    unsigned short c = le16_to_cpu(ins[i]);
591			/* size not double-checked */
592		if (halfpair) {
593			if ((c >= 0xdc00) && (c < 0xe000)) {
594				*t++ = 0xf0 + (((halfpair + 64) >> 8) & 7);
595				*t++ = 0x80 + (((halfpair + 64) >> 2) & 63);
596				*t++ = 0x80 + ((c >> 6) & 15) + ((halfpair & 3) << 4);
597				*t++ = 0x80 + (c & 63);
598				halfpair = 0;
599			} else {
600#if ALLOW_BROKEN_UNICODE
601				/* The first UTF-16 unit of a surrogate pair has
602				 * a value between 0xd800 and 0xdc00. It can be
603				 * encoded as an individual UTF-8 sequence if we
604				 * cannot combine it with the next UTF-16 unit
605				 * unit as a surrogate pair. */
606				*t++ = 0xe0 | (halfpair >> 12);
607				*t++ = 0x80 | ((halfpair >> 6) & 0x3f);
608				*t++ = 0x80 | (halfpair & 0x3f);
609				halfpair = 0;
610
611				--i;
612				continue;
613#else
614				goto fail;
615#endif /* ALLOW_BROKEN_UNICODE */
616			}
617		} else if (c < 0x80) {
618			*t++ = c;
619	    	} else {
620			if (c < 0x800) {
621			   	*t++ = (0xc0 | ((c >> 6) & 0x3f));
622			        *t++ = 0x80 | (c & 0x3f);
623			} else if (c < 0xd800) {
624			   	*t++ = 0xe0 | (c >> 12);
625			   	*t++ = 0x80 | ((c >> 6) & 0x3f);
626		        	*t++ = 0x80 | (c & 0x3f);
627			} else if (c < 0xdc00)
628				halfpair = c;
629#if ALLOW_BROKEN_UNICODE
630			else if (c < 0xe000) {
631				*t++ = 0xe0 | (c >> 12);
632				*t++ = 0x80 | ((c >> 6) & 0x3f);
633				*t++ = 0x80 | (c & 0x3f);
634			}
635#endif /* ALLOW_BROKEN_UNICODE */
636			else if (c >= 0xe000) {
637				*t++ = 0xe0 | (c >> 12);
638				*t++ = 0x80 | ((c >> 6) & 0x3f);
639			        *t++ = 0x80 | (c & 0x3f);
640			} else
641				goto fail;
642	        }
643	}
644#if ALLOW_BROKEN_UNICODE
645	if (halfpair) { /* ending with a single surrogate */
646		*t++ = 0xe0 | (halfpair >> 12);
647		*t++ = 0x80 | ((halfpair >> 6) & 0x3f);
648		*t++ = 0x80 | (halfpair & 0x3f);
649	}
650#endif /* ALLOW_BROKEN_UNICODE */
651	*t = '\0';
652
653#if defined(__APPLE__) || defined(__DARWIN__)
654#ifdef ENABLE_NFCONV
655	if(nfconvert_utf8 && (t - *outs) > 0) {
656		char *new_outs = NULL;
657		int new_outs_len = ntfs_macosx_normalize_utf8(*outs, &new_outs, 0); // Normalize to decomposed form
658		if(new_outs_len >= 0 && new_outs != NULL) {
659			if(original_outs_value != *outs) {
660				// We have allocated outs ourselves.
661				free(*outs);
662				*outs = new_outs;
663				t = *outs + new_outs_len;
664			}
665			else {
666				// We need to copy new_outs into the fixed outs buffer.
667				memset(*outs, 0, original_outs_len);
668				strncpy(*outs, new_outs, original_outs_len-1);
669				t = *outs + original_outs_len;
670				free(new_outs);
671			}
672		}
673		else {
674			ntfs_log_error("Failed to normalize NTFS string to UTF-8 NFD: %s\n", *outs);
675			ntfs_log_error("  new_outs=0x%p\n", new_outs);
676			ntfs_log_error("  new_outs_len=%d\n", new_outs_len);
677		}
678	}
679#endif /* ENABLE_NFCONV */
680#endif /* defined(__APPLE__) || defined(__DARWIN__) */
681
682	ret = t - *outs;
683out:
684	return ret;
685fail:
686	errno = EILSEQ;
687	goto out;
688}
689
/*
 * Compute how many 16-bit UTF-16LE elements (terminating null not included)
 * are needed to store the UTF-8 string @s.
 *
 * Return -1 with errno set to ENAMETOOLONG if the count reaches PATH_MAX, or
 * to EILSEQ if a byte of value 0xF5 or above is met where a sequence starts.
 *
 * Note: This does not check whether the input sequence is a valid utf8 string,
 *	 and should be used only in context where such check is made!
 */
static int utf8_to_utf16_size(const char *s)
{
	const unsigned char *p = (const unsigned char *)s;
	size_t units = 0;
	unsigned int lead;

	for (;;) {
		lead = *p++;
		if (!lead)
			break;
		/* Every sequence yields at least one UTF-16 element. */
		if (++units >= PATH_MAX) {
			errno = ENAMETOOLONG;
			return -1;
		}
		/* Single bytes (and stray trailing bytes) stand alone. */
		if (lead < 0xc0)
			continue;
		if (lead >= 0xF5) {
			errno = EILSEQ;
			return -1;
		}
		/*
		 * Step over the trailing bytes, stopping the count for good
		 * if the string ends in the middle of a sequence.
		 */
		if (!*p)
			break;
		p++;			/* first trailing byte */
		if (!*p)
			break;
		if (lead < 0xE0)
			continue;
		p++;			/* second trailing byte */
		if (!*p)
			break;
		if (lead < 0xF0)
			continue;
		p++;			/* third trailing byte */
		/* Four-byte sequences need a surrogate pair. */
		if (++units >= PATH_MAX) {
			errno = ENAMETOOLONG;
			return -1;
		}
	}
	return units;
}
737/*
738 * This converts one UTF-8 sequence to cpu-endian Unicode value
739 * within range U+0 .. U+10ffff and excluding U+D800 .. U+DFFF
740 *
741 * Return the number of used utf8 bytes or -1 with errno set
742 * if sequence is invalid.
743 */
744static int utf8_to_unicode(u32 *wc, const char *s)
745{
746    	unsigned int byte = *((const unsigned char *)s);
747
748					/* single byte */
749	if (byte == 0) {
750		*wc = (u32) 0;
751		return 0;
752	} else if (byte < 0x80) {
753		*wc = (u32) byte;
754		return 1;
755					/* double byte */
756	} else if (byte < 0xc2) {
757		goto fail;
758	} else if (byte < 0xE0) {
759		if ((s[1] & 0xC0) == 0x80) {
760			*wc = ((u32)(byte & 0x1F) << 6)
761			    | ((u32)(s[1] & 0x3F));
762			return 2;
763		} else
764			goto fail;
765					/* three-byte */
766	} else if (byte < 0xF0) {
767		if (((s[1] & 0xC0) == 0x80) && ((s[2] & 0xC0) == 0x80)) {
768			*wc = ((u32)(byte & 0x0F) << 12)
769			    | ((u32)(s[1] & 0x3F) << 6)
770			    | ((u32)(s[2] & 0x3F));
771			/* Check valid ranges */
772#if ALLOW_BROKEN_UNICODE
773			if (((*wc >= 0x800) && (*wc <= 0xD7FF))
774			  || ((*wc >= 0xD800) && (*wc <= 0xDFFF))
775			  || ((*wc >= 0xe000) && (*wc <= 0xFFFF)))
776				return 3;
777#else
778			if (((*wc >= 0x800) && (*wc <= 0xD7FF))
779			  || ((*wc >= 0xe000) && (*wc <= 0xFFFD)))
780				return 3;
781#endif /* ALLOW_BROKEN_UNICODE */
782		}
783		goto fail;
784					/* four-byte */
785	} else if (byte < 0xF5) {
786		if (((s[1] & 0xC0) == 0x80) && ((s[2] & 0xC0) == 0x80)
787		  && ((s[3] & 0xC0) == 0x80)) {
788			*wc = ((u32)(byte & 0x07) << 18)
789			    | ((u32)(s[1] & 0x3F) << 12)
790			    | ((u32)(s[2] & 0x3F) << 6)
791			    | ((u32)(s[3] & 0x3F));
792			/* Check valid ranges */
793			if ((*wc <= 0x10ffff) && (*wc >= 0x10000))
794				return 4;
795		}
796		goto fail;
797	}
798fail:
799	errno = EILSEQ;
800	return -1;
801}
802
803/**
804 * ntfs_utf8_to_utf16 - convert a UTF-8 string to a UTF-16LE string
805 * @ins:	input multibyte string buffer
806 * @outs:	on return contains the (allocated) output utf16 string
807 * @outs_len:	length of output buffer in utf16 characters
808 *
809 * Return -1 with errno set.
810 */
811static int ntfs_utf8_to_utf16(const char *ins, ntfschar **outs)
812{
813#if defined(__APPLE__) || defined(__DARWIN__)
814#ifdef ENABLE_NFCONV
815	char *new_ins = NULL;
816	if(nfconvert_utf8) {
817		int new_ins_len;
818		new_ins_len = ntfs_macosx_normalize_utf8(ins, &new_ins, 1); // Normalize to composed form
819		if(new_ins_len >= 0)
820			ins = new_ins;
821		else
822			ntfs_log_error("Failed to normalize NTFS string to UTF-8 NFC: %s\n", ins);
823	}
824#endif /* ENABLE_NFCONV */
825#endif /* defined(__APPLE__) || defined(__DARWIN__) */
826	const char *t = ins;
827	u32 wc;
828	BOOL allocated;
829	ntfschar *outpos;
830	int shorts, ret = -1;
831
832	shorts = utf8_to_utf16_size(ins);
833	if (shorts < 0)
834		goto fail;
835
836	allocated = FALSE;
837	if (!*outs) {
838		*outs = ntfs_malloc((shorts + 1) * sizeof(ntfschar));
839		if (!*outs)
840			goto fail;
841		allocated = TRUE;
842	}
843
844	outpos = *outs;
845
846	while(1) {
847		int m  = utf8_to_unicode(&wc, t);
848		if (m <= 0) {
849			if (m < 0) {
850				/* do not leave space allocated if failed */
851				if (allocated) {
852					free(*outs);
853					*outs = (ntfschar*)NULL;
854				}
855				goto fail;
856			}
857			*outpos++ = const_cpu_to_le16(0);
858			break;
859		}
860		if (wc < 0x10000)
861			*outpos++ = cpu_to_le16(wc);
862		else {
863			wc -= 0x10000;
864			*outpos++ = cpu_to_le16((wc >> 10) + 0xd800);
865			*outpos++ = cpu_to_le16((wc & 0x3ff) + 0xdc00);
866		}
867		t += m;
868	}
869
870	ret = --outpos - *outs;
871fail:
872#if defined(__APPLE__) || defined(__DARWIN__)
873#ifdef ENABLE_NFCONV
874	if(new_ins != NULL)
875		free(new_ins);
876#endif /* ENABLE_NFCONV */
877#endif /* defined(__APPLE__) || defined(__DARWIN__) */
878	return ret;
879}
880
881/**
882 * ntfs_ucstombs - convert a little endian Unicode string to a multibyte string
883 * @ins:	input Unicode string buffer
884 * @ins_len:	length of input string in Unicode characters
885 * @outs:	on return contains the (allocated) output multibyte string
886 * @outs_len:	length of output buffer in bytes (ignored if *@outs is NULL)
887 *
888 * Convert the input little endian, 2-byte Unicode string @ins, of length
889 * @ins_len into the multibyte string format dictated by the current locale.
890 *
891 * If *@outs is NULL, the function allocates the string and the caller is
892 * responsible for calling free(*@outs); when finished with it.
893 *
894 * On success the function returns the number of bytes written to the output
895 * string *@outs (>= 0), not counting the terminating NULL byte. If the output
896 * string buffer was allocated, *@outs is set to it.
897 *
898 * On error, -1 is returned, and errno is set to the error code. The following
899 * error codes can be expected:
900 *	EINVAL		Invalid arguments (e.g. @ins or @outs is NULL).
901 *	EILSEQ		The input string cannot be represented as a multibyte
902 *			sequence according to the current locale.
903 *	ENAMETOOLONG	Destination buffer is too small for input string.
904 *	ENOMEM		Not enough memory to allocate destination buffer.
905 */
906int ntfs_ucstombs(const ntfschar *ins, const int ins_len, char **outs,
907		int outs_len)
908{
909	char *mbs;
910	int mbs_len;
911#ifdef MB_CUR_MAX
912	wchar_t wc;
913	int i, o;
914	int cnt = 0;
915#ifdef HAVE_MBSINIT
916	mbstate_t mbstate;
917#endif
918#endif /* MB_CUR_MAX */
919
920	if (!ins || !outs) {
921		errno = EINVAL;
922		return -1;
923	}
924	mbs = *outs;
925	mbs_len = outs_len;
926	if (mbs && !mbs_len) {
927		errno = ENAMETOOLONG;
928		return -1;
929	}
930	if (use_utf8)
931		return ntfs_utf16_to_utf8(ins, ins_len, outs, outs_len);
932#ifdef MB_CUR_MAX
933	if (!mbs) {
934		mbs_len = (ins_len + 1) * MB_CUR_MAX;
935		mbs = ntfs_malloc(mbs_len);
936		if (!mbs)
937			return -1;
938	}
939#ifdef HAVE_MBSINIT
940	memset(&mbstate, 0, sizeof(mbstate));
941#else
942#ifndef __HAIKU__
943	wctomb(NULL, 0);
944#endif
945#endif
946	for (i = o = 0; i < ins_len; i++) {
947		/* Reallocate memory if necessary or abort. */
948		if ((int)(o + MB_CUR_MAX) > mbs_len) {
949			char *tc;
950			if (mbs == *outs) {
951				errno = ENAMETOOLONG;
952				return -1;
953			}
954			tc = ntfs_malloc((mbs_len + 64) & ~63);
955			if (!tc)
956				goto err_out;
957			memcpy(tc, mbs, mbs_len);
958			mbs_len = (mbs_len + 64) & ~63;
959			free(mbs);
960			mbs = tc;
961		}
962		/* Convert the LE Unicode character to a CPU wide character. */
963		wc = (wchar_t)le16_to_cpu(ins[i]);
964		if (!wc)
965			break;
966		/* Convert the CPU endian wide character to multibyte. */
967#ifdef HAVE_MBSINIT
968		cnt = wcrtomb(mbs + o, wc, &mbstate);
969#elif defined(__HAIKU__)
970		cnt = -1;
971#else
972		cnt = wctomb(mbs + o, wc);
973#endif
974		if (cnt == -1)
975			goto err_out;
976		if (cnt <= 0) {
977			ntfs_log_debug("Eeek. cnt <= 0, cnt = %i\n", cnt);
978			errno = EINVAL;
979			goto err_out;
980		}
981		o += cnt;
982	}
983#ifdef HAVE_MBSINIT
984	/* Make sure we are back in the initial state. */
985	if (!mbsinit(&mbstate)) {
986		ntfs_log_debug("Eeek. mbstate not in initial state!\n");
987		errno = EILSEQ;
988		goto err_out;
989	}
990#endif
991	/* Now write the NULL character. */
992	mbs[o] = '\0';
993	if (*outs != mbs)
994		*outs = mbs;
995	return o;
996err_out:
997	if (mbs != *outs) {
998		int eo = errno;
999		free(mbs);
1000		errno = eo;
1001	}
1002#else /* MB_CUR_MAX */
1003	errno = EILSEQ;
1004#endif /* MB_CUR_MAX */
1005	return -1;
1006}
1007
1008/**
1009 * ntfs_mbstoucs - convert a multibyte string to a little endian Unicode string
1010 * @ins:	input multibyte string buffer
1011 * @outs:	on return contains the (allocated) output Unicode string
1012 *
1013 * Convert the input multibyte string @ins, from the current locale into the
1014 * corresponding little endian, 2-byte Unicode string.
1015 *
1016 * The function allocates the string and the caller is responsible for calling
1017 * free(*@outs); when finished with it.
1018 *
1019 * On success the function returns the number of Unicode characters written to
1020 * the output string *@outs (>= 0), not counting the terminating Unicode NULL
1021 * character.
1022 *
1023 * On error, -1 is returned, and errno is set to the error code. The following
1024 * error codes can be expected:
1025 *	EINVAL		Invalid arguments (e.g. @ins or @outs is NULL).
1026 *	EILSEQ		The input string cannot be represented as a Unicode
1027 *			string according to the current locale.
1028 *	ENAMETOOLONG	Destination buffer is too small for input string.
1029 *	ENOMEM		Not enough memory to allocate destination buffer.
1030 */
1031int ntfs_mbstoucs(const char *ins, ntfschar **outs)
1032{
1033#ifdef MB_CUR_MAX
1034	ntfschar *ucs;
1035	const char *s;
1036	wchar_t wc;
1037	int i, o, cnt, ins_len, ucs_len, ins_size;
1038#ifdef HAVE_MBSINIT
1039	mbstate_t mbstate;
1040#endif
1041#endif /* MB_CUR_MAX */
1042
1043	if (!ins || !outs) {
1044		errno = EINVAL;
1045		return -1;
1046	}
1047
1048	if (use_utf8)
1049		return ntfs_utf8_to_utf16(ins, outs);
1050
1051#ifdef MB_CUR_MAX
1052	/* Determine the size of the multi-byte string in bytes. */
1053	ins_size = strlen(ins);
1054	/* Determine the length of the multi-byte string. */
1055	s = ins;
1056#if defined(HAVE_MBSINIT)
1057	memset(&mbstate, 0, sizeof(mbstate));
1058	ins_len = mbsrtowcs(NULL, (const char **)&s, 0, &mbstate);
1059#ifdef __CYGWIN32__
1060	if (!ins_len && *ins) {
1061		/* Older Cygwin had broken mbsrtowcs() implementation. */
1062		ins_len = strlen(ins);
1063	}
1064#endif
1065#elif !defined(DJGPP) && !defined(__HAIKU__)
1066	ins_len = mbstowcs(NULL, s, 0);
1067#else
1068	/* Eeek!!! DJGPP has broken mbstowcs() implementation!!! */
1069	ins_len = strlen(ins);
1070#endif
1071	if (ins_len == -1)
1072		return ins_len;
1073#ifdef HAVE_MBSINIT
1074	if ((s != ins) || !mbsinit(&mbstate)) {
1075#else
1076	if (s != ins) {
1077#endif
1078		errno = EILSEQ;
1079		return -1;
1080	}
1081	/* Add the NULL terminator. */
1082	ins_len++;
1083	ucs_len = ins_len;
1084	ucs = ntfs_malloc(ucs_len * sizeof(ntfschar));
1085	if (!ucs)
1086		return -1;
1087#ifdef HAVE_MBSINIT
1088	memset(&mbstate, 0, sizeof(mbstate));
1089#else
1090#ifndef __HAIKU__
1091	mbtowc(NULL, NULL, 0);
1092#endif
1093#endif
1094	for (i = o = cnt = 0; i < ins_size; i += cnt, o++) {
1095		/* Reallocate memory if necessary. */
1096		if (o >= ucs_len) {
1097			ntfschar *tc;
1098			ucs_len = (ucs_len * sizeof(ntfschar) + 64) & ~63;
1099			tc = realloc(ucs, ucs_len);
1100			if (!tc)
1101				goto err_out;
1102			ucs = tc;
1103			ucs_len /= sizeof(ntfschar);
1104		}
1105		/* Convert the multibyte character to a wide character. */
1106#ifdef HAVE_MBSINIT
1107		cnt = mbrtowc(&wc, ins + i, ins_size - i, &mbstate);
1108#elif defined(__HAIKU__)
1109		cnt = -1;
1110#else
1111		cnt = mbtowc(&wc, ins + i, ins_size - i);
1112#endif
1113		if (!cnt)
1114			break;
1115		if (cnt == -1)
1116			goto err_out;
1117		if (cnt < -1) {
1118			ntfs_log_trace("Eeek. cnt = %i\n", cnt);
1119			errno = EINVAL;
1120			goto err_out;
1121		}
1122		/* Make sure we are not overflowing the NTFS Unicode set. */
1123		if ((unsigned long)wc >= (unsigned long)(1 <<
1124				(8 * sizeof(ntfschar)))) {
1125			errno = EILSEQ;
1126			goto err_out;
1127		}
1128		/* Convert the CPU wide character to a LE Unicode character. */
1129		ucs[o] = cpu_to_le16(wc);
1130	}
1131#ifdef HAVE_MBSINIT
1132	/* Make sure we are back in the initial state. */
1133	if (!mbsinit(&mbstate)) {
1134		ntfs_log_trace("Eeek. mbstate not in initial state!\n");
1135		errno = EILSEQ;
1136		goto err_out;
1137	}
1138#endif
1139	/* Now write the NULL character. */
1140	ucs[o] = const_cpu_to_le16(L'\0');
1141	*outs = ucs;
1142	return o;
1143err_out:
1144	free(ucs);
1145#else /* MB_CUR_MAX */
1146	errno = EILSEQ;
1147#endif /* MB_CUR_MAX */
1148	return -1;
1149}
1150
1151/*
1152 *		Turn a UTF8 name uppercase
1153 *
1154 *	Returns an allocated uppercase name which has to be freed by caller
1155 *	or NULL if there is an error (described by errno)
1156 */
1157
1158char *ntfs_uppercase_mbs(const char *low,
1159			const ntfschar *upcase, u32 upcase_size)
1160{
1161	int size;
1162	char *upp;
1163	u32 wc;
1164	int n;
1165	const char *s;
1166	char *t;
1167
1168	size = strlen(low);
1169	upp = (char*)ntfs_malloc(3*size + 1);
1170	if (upp) {
1171		s = low;
1172		t = upp;
1173		do {
1174			n = utf8_to_unicode(&wc, s);
1175			if (n > 0) {
1176				if (wc < upcase_size)
1177					wc = le16_to_cpu(upcase[wc]);
1178				if (wc < 0x80)
1179					*t++ = wc;
1180				else if (wc < 0x800) {
1181					*t++ = (0xc0 | ((wc >> 6) & 0x3f));
1182					*t++ = 0x80 | (wc & 0x3f);
1183				} else if (wc < 0x10000) {
1184					*t++ = 0xe0 | (wc >> 12);
1185					*t++ = 0x80 | ((wc >> 6) & 0x3f);
1186					*t++ = 0x80 | (wc & 0x3f);
1187				} else {
1188					*t++ = 0xf0 | ((wc >> 18) & 7);
1189					*t++ = 0x80 | ((wc >> 12) & 63);
1190					*t++ = 0x80 | ((wc >> 6) & 0x3f);
1191					*t++ = 0x80 | (wc & 0x3f);
1192				}
1193			s += n;
1194			}
1195		} while (n > 0);
1196		if (n < 0) {
1197			free(upp);
1198			upp = (char*)NULL;
1199			errno = EILSEQ;
1200		}
1201		*t = 0;
1202	}
1203	return (upp);
1204}
1205
1206/**
1207 * ntfs_upcase_table_build - build the default upcase table for NTFS
1208 * @uc:		destination buffer where to store the built table
1209 * @uc_len:	size of destination buffer in bytes
1210 *
1211 * ntfs_upcase_table_build() builds the default upcase table for NTFS and
1212 * stores it in the caller supplied buffer @uc of size @uc_len.
1213 *
1214 * Note, @uc_len must be at least 128kiB in size or bad things will happen!
1215 */
1216void ntfs_upcase_table_build(ntfschar *uc, u32 uc_len)
1217{
1218	struct NEWUPPERCASE {
1219		unsigned short first;
1220		unsigned short last;
1221		short diff;
1222		unsigned char step;
1223		unsigned char osmajor;
1224		unsigned char osminor;
1225	} ;
1226
1227	/*
1228	 *	This is the table as defined by Windows XP
1229	 */
1230	static int uc_run_table[][3] = { /* Start, End, Add */
1231	{0x0061, 0x007B,  -32}, {0x0451, 0x045D, -80}, {0x1F70, 0x1F72,  74},
1232	{0x00E0, 0x00F7,  -32}, {0x045E, 0x0460, -80}, {0x1F72, 0x1F76,  86},
1233	{0x00F8, 0x00FF,  -32}, {0x0561, 0x0587, -48}, {0x1F76, 0x1F78, 100},
1234	{0x0256, 0x0258, -205}, {0x1F00, 0x1F08,   8}, {0x1F78, 0x1F7A, 128},
1235	{0x028A, 0x028C, -217}, {0x1F10, 0x1F16,   8}, {0x1F7A, 0x1F7C, 112},
1236	{0x03AC, 0x03AD,  -38}, {0x1F20, 0x1F28,   8}, {0x1F7C, 0x1F7E, 126},
1237	{0x03AD, 0x03B0,  -37}, {0x1F30, 0x1F38,   8}, {0x1FB0, 0x1FB2,   8},
1238	{0x03B1, 0x03C2,  -32}, {0x1F40, 0x1F46,   8}, {0x1FD0, 0x1FD2,   8},
1239	{0x03C2, 0x03C3,  -31}, {0x1F51, 0x1F52,   8}, {0x1FE0, 0x1FE2,   8},
1240	{0x03C3, 0x03CC,  -32}, {0x1F53, 0x1F54,   8}, {0x1FE5, 0x1FE6,   7},
1241	{0x03CC, 0x03CD,  -64}, {0x1F55, 0x1F56,   8}, {0x2170, 0x2180, -16},
1242	{0x03CD, 0x03CF,  -63}, {0x1F57, 0x1F58,   8}, {0x24D0, 0x24EA, -26},
1243	{0x0430, 0x0450,  -32}, {0x1F60, 0x1F68,   8}, {0xFF41, 0xFF5B, -32},
1244	{0}
1245	};
1246	static int uc_dup_table[][2] = { /* Start, End */
1247	{0x0100, 0x012F}, {0x01A0, 0x01A6}, {0x03E2, 0x03EF}, {0x04CB, 0x04CC},
1248	{0x0132, 0x0137}, {0x01B3, 0x01B7}, {0x0460, 0x0481}, {0x04D0, 0x04EB},
1249	{0x0139, 0x0149}, {0x01CD, 0x01DD}, {0x0490, 0x04BF}, {0x04EE, 0x04F5},
1250	{0x014A, 0x0178}, {0x01DE, 0x01EF}, {0x04BF, 0x04BF}, {0x04F8, 0x04F9},
1251	{0x0179, 0x017E}, {0x01F4, 0x01F5}, {0x04C1, 0x04C4}, {0x1E00, 0x1E95},
1252	{0x018B, 0x018B}, {0x01FA, 0x0218}, {0x04C7, 0x04C8}, {0x1EA0, 0x1EF9},
1253	{0}
1254	};
1255	static int uc_byte_table[][2] = { /* Offset, Value */
1256	{0x00FF, 0x0178}, {0x01AD, 0x01AC}, {0x01F3, 0x01F1}, {0x0269, 0x0196},
1257	{0x0183, 0x0182}, {0x01B0, 0x01AF}, {0x0253, 0x0181}, {0x026F, 0x019C},
1258	{0x0185, 0x0184}, {0x01B9, 0x01B8}, {0x0254, 0x0186}, {0x0272, 0x019D},
1259	{0x0188, 0x0187}, {0x01BD, 0x01BC}, {0x0259, 0x018F}, {0x0275, 0x019F},
1260	{0x018C, 0x018B}, {0x01C6, 0x01C4}, {0x025B, 0x0190}, {0x0283, 0x01A9},
1261	{0x0192, 0x0191}, {0x01C9, 0x01C7}, {0x0260, 0x0193}, {0x0288, 0x01AE},
1262	{0x0199, 0x0198}, {0x01CC, 0x01CA}, {0x0263, 0x0194}, {0x0292, 0x01B7},
1263	{0x01A8, 0x01A7}, {0x01DD, 0x018E}, {0x0268, 0x0197},
1264	{0}
1265	};
1266
1267/*
1268 *		Changes which were applied to later Windows versions
1269 *
1270 *   md5 for $UpCase from Winxp : 6fa3db2468275286210751e869d36373
1271 *                        Vista : 2f03b5a69d486ff3864cecbd07f24440
1272 *                        Win8 :  7ff498a44e45e77374cc7c962b1b92f2
1273 */
1274	static const struct NEWUPPERCASE newuppercase[] = {
1275						/* from Windows 6.0 (Vista) */
1276		{ 0x37b, 0x37d, 0x82, 1, 6, 0 },
1277		{ 0x1f80, 0x1f87, 0x8, 1, 6, 0 },
1278		{ 0x1f90, 0x1f97, 0x8, 1, 6, 0 },
1279		{ 0x1fa0, 0x1fa7, 0x8, 1, 6, 0 },
1280		{ 0x2c30, 0x2c5e, -0x30, 1, 6, 0 },
1281		{ 0x2d00, 0x2d25, -0x1c60, 1, 6, 0 },
1282		{ 0x2c68, 0x2c6c, -0x1, 2, 6, 0 },
1283		{ 0x219, 0x21f, -0x1, 2, 6, 0 },
1284		{ 0x223, 0x233, -0x1, 2, 6, 0 },
1285		{ 0x247, 0x24f, -0x1, 2, 6, 0 },
1286		{ 0x3d9, 0x3e1, -0x1, 2, 6, 0 },
1287		{ 0x48b, 0x48f, -0x1, 2, 6, 0 },
1288		{ 0x4fb, 0x513, -0x1, 2, 6, 0 },
1289		{ 0x2c81, 0x2ce3, -0x1, 2, 6, 0 },
1290		{ 0x3f8, 0x3fb, -0x1, 3, 6, 0 },
1291		{ 0x4c6, 0x4ce, -0x1, 4, 6, 0 },
1292		{ 0x23c, 0x242, -0x1, 6, 6, 0 },
1293		{ 0x4ed, 0x4f7, -0x1, 10, 6, 0 },
1294		{ 0x450, 0x45d, -0x50, 13, 6, 0 },
1295		{ 0x2c61, 0x2c76, -0x1, 21, 6, 0 },
1296		{ 0x1fcc, 0x1ffc, -0x9, 48, 6, 0 },
1297		{ 0x180, 0x180, 0xc3, 1, 6, 0 },
1298		{ 0x195, 0x195, 0x61, 1, 6, 0 },
1299		{ 0x19a, 0x19a, 0xa3, 1, 6, 0 },
1300		{ 0x19e, 0x19e, 0x82, 1, 6, 0 },
1301		{ 0x1bf, 0x1bf, 0x38, 1, 6, 0 },
1302		{ 0x1f9, 0x1f9, -0x1, 1, 6, 0 },
1303		{ 0x23a, 0x23a, 0x2a2b, 1, 6, 0 },
1304		{ 0x23e, 0x23e, 0x2a28, 1, 6, 0 },
1305		{ 0x26b, 0x26b, 0x29f7, 1, 6, 0 },
1306		{ 0x27d, 0x27d, 0x29e7, 1, 6, 0 },
1307		{ 0x280, 0x280, -0xda, 1, 6, 0 },
1308		{ 0x289, 0x289, -0x45, 1, 6, 0 },
1309		{ 0x28c, 0x28c, -0x47, 1, 6, 0 },
1310		{ 0x3f2, 0x3f2, 0x7, 1, 6, 0 },
1311		{ 0x4cf, 0x4cf, -0xf, 1, 6, 0 },
1312		{ 0x1d7d, 0x1d7d, 0xee6, 1, 6, 0 },
1313		{ 0x1fb3, 0x1fb3, 0x9, 1, 6, 0 },
1314		{ 0x214e, 0x214e, -0x1c, 1, 6, 0 },
1315		{ 0x2184, 0x2184, -0x1, 1, 6, 0 },
1316						/* from Windows 6.1 (Win7) */
1317		{ 0x23a, 0x23e,  0x0, 4, 6, 1 },
1318		{ 0x250, 0x250,  0x2a1f, 2, 6, 1 },
1319		{ 0x251, 0x251,  0x2a1c, 2, 6, 1 },
1320		{ 0x271, 0x271,  0x29fd, 2, 6, 1 },
1321		{ 0x371, 0x373, -0x1, 2, 6, 1 },
1322		{ 0x377, 0x377, -0x1, 2, 6, 1 },
1323		{ 0x3c2, 0x3c2,  0x0, 2, 6, 1 },
1324		{ 0x3d7, 0x3d7, -0x8, 2, 6, 1 },
1325		{ 0x515, 0x523, -0x1, 2, 6, 1 },
1326			/* below, -0x75fc stands for 0x8a04 and truncation */
1327		{ 0x1d79, 0x1d79, -0x75fc, 2, 6, 1 },
1328		{ 0x1efb, 0x1eff, -0x1, 2, 6, 1 },
1329		{ 0x1fc3, 0x1ff3,  0x9, 48, 6, 1 },
1330		{ 0x1fcc, 0x1ffc,  0x0, 48, 6, 1 },
1331		{ 0x2c65, 0x2c65, -0x2a2b, 2, 6, 1 },
1332		{ 0x2c66, 0x2c66, -0x2a28, 2, 6, 1 },
1333		{ 0x2c73, 0x2c73, -0x1, 2, 6, 1 },
1334		{ 0xa641, 0xa65f, -0x1, 2, 6, 1 },
1335		{ 0xa663, 0xa66d, -0x1, 2, 6, 1 },
1336		{ 0xa681, 0xa697, -0x1, 2, 6, 1 },
1337		{ 0xa723, 0xa72f, -0x1, 2, 6, 1 },
1338		{ 0xa733, 0xa76f, -0x1, 2, 6, 1 },
1339		{ 0xa77a, 0xa77c, -0x1, 2, 6, 1 },
1340		{ 0xa77f, 0xa787, -0x1, 2, 6, 1 },
1341		{ 0xa78c, 0xa78c, -0x1, 2, 6, 1 },
1342							/* end mark */
1343		{ 0 }
1344	} ;
1345
1346	int i, r;
1347	int k, off;
1348	const struct NEWUPPERCASE *puc;
1349
1350	memset((char*)uc, 0, uc_len);
1351	uc_len >>= 1;
1352	if (uc_len > 65536)
1353		uc_len = 65536;
1354	for (i = 0; (u32)i < uc_len; i++)
1355		uc[i] = cpu_to_le16(i);
1356	for (r = 0; uc_run_table[r][0]; r++) {
1357		off = uc_run_table[r][2];
1358		for (i = uc_run_table[r][0]; i < uc_run_table[r][1]; i++)
1359			uc[i] = cpu_to_le16(i + off);
1360	}
1361	for (r = 0; uc_dup_table[r][0]; r++)
1362		for (i = uc_dup_table[r][0]; i < uc_dup_table[r][1]; i += 2)
1363			uc[i + 1] = cpu_to_le16(i);
1364	for (r = 0; uc_byte_table[r][0]; r++) {
1365		k = uc_byte_table[r][1];
1366		uc[uc_byte_table[r][0]] = cpu_to_le16(k);
1367	}
1368	for (r=0; newuppercase[r].first; r++) {
1369		puc = &newuppercase[r];
1370		if ((puc->osmajor < UPCASE_MAJOR)
1371		  || ((puc->osmajor == UPCASE_MAJOR)
1372		     && (puc->osminor <= UPCASE_MINOR))) {
1373			off = puc->diff;
1374			for (i = puc->first; i <= puc->last; i += puc->step)
1375				uc[i] = cpu_to_le16(i + off);
1376		}
1377	}
1378}
1379
1380/*
1381 *		Allocate and build the default upcase table
1382 *
1383 *	Returns the number of entries
1384 *		0 if failed
1385 */
1386
1387#define UPCASE_LEN 65536 /* default number of entries in upcase */
1388
1389u32 ntfs_upcase_build_default(ntfschar **upcase)
1390{
1391	u32 upcase_len = 0;
1392
1393	*upcase = (ntfschar*)ntfs_malloc(UPCASE_LEN*2);
1394	if (*upcase) {
1395		ntfs_upcase_table_build(*upcase, UPCASE_LEN*2);
1396		upcase_len = UPCASE_LEN;
1397	}
1398	return (upcase_len);
1399}
1400
1401/*
1402 *		Build a table for converting to lower case
1403 *
1404 *	This is only meaningful when there is a single lower case
1405 *	character leading to an upper case one, and currently the
1406 *	only exception is the greek letter sigma which has a single
1407 *	upper case glyph (code U+03A3), but two lower case glyphs
1408 *	(code U+03C3 and U+03C2, the latter to be used at the end
1409 *	of a word). In the following implementation the upper case
1410 *	sigma will be lowercased as U+03C3.
1411 */
1412
1413ntfschar *ntfs_locase_table_build(const ntfschar *uc, u32 uc_cnt)
1414{
1415	ntfschar *lc;
1416	u32 upp;
1417	u32 i;
1418
1419	lc = (ntfschar*)ntfs_malloc(uc_cnt*sizeof(ntfschar));
1420	if (lc) {
1421		for (i=0; i<uc_cnt; i++)
1422			lc[i] = cpu_to_le16(i);
1423		for (i=0; i<uc_cnt; i++) {
1424			upp = le16_to_cpu(uc[i]);
1425			if ((upp != i) && (upp < uc_cnt))
1426				lc[upp] = cpu_to_le16(i);
1427		}
1428	} else
1429		ntfs_log_error("Could not build the locase table\n");
1430	return (lc);
1431}
1432
1433/**
1434 * ntfs_str2ucs - convert a string to a valid NTFS file name
1435 * @s:		input string
1436 * @len:	length of output buffer in Unicode characters
1437 *
1438 * Convert the input @s string into the corresponding little endian,
1439 * 2-byte Unicode string. The length of the converted string is less
1440 * or equal to the maximum length allowed by the NTFS format (255).
1441 *
1442 * If @s is NULL then return AT_UNNAMED.
1443 *
1444 * On success the function returns the Unicode string in an allocated
1445 * buffer and the caller is responsible to free it when it's not needed
1446 * anymore.
1447 *
1448 * On error NULL is returned and errno is set to the error code.
1449 */
1450ntfschar *ntfs_str2ucs(const char *s, int *len)
1451{
1452	ntfschar *ucs = NULL;
1453
1454	if (s && ((*len = ntfs_mbstoucs(s, &ucs)) == -1)) {
1455		ntfs_log_perror("Couldn't convert '%s' to Unicode", s);
1456		return NULL;
1457	}
1458	if (*len > NTFS_MAX_NAME_LEN) {
1459		free(ucs);
1460		errno = ENAMETOOLONG;
1461		return NULL;
1462	}
1463	if (!ucs || !*len) {
1464		ucs  = AT_UNNAMED;
1465		*len = 0;
1466	}
1467	return ucs;
1468}
1469
1470/**
1471 * ntfs_ucsfree - free memory allocated by ntfs_str2ucs()
1472 * @ucs		input string to be freed
1473 *
1474 * Free memory at @ucs and which was allocated by ntfs_str2ucs.
1475 *
1476 * Return value: none.
1477 */
1478void ntfs_ucsfree(ntfschar *ucs)
1479{
1480	if (ucs && (ucs != AT_UNNAMED))
1481		free(ucs);
1482}
1483
1484/*
1485 *		Check whether a name contains no chars forbidden
1486 *	for DOS or Win32 use
1487 *
1488 *	If @strict is TRUE, then trailing dots and spaces are forbidden.
1489 *	These names are technically allowed in the Win32 namespace, but
1490 *	they can be problematic.  See comment for FILE_NAME_WIN32.
1491 *
1492 *	If there is a bad char, errno is set to EINVAL
1493 */
1494
1495BOOL ntfs_forbidden_chars(const ntfschar *name, int len, BOOL strict)
1496{
1497	BOOL forbidden;
1498	int ch;
1499	int i;
1500	static const u32 mainset = (1L << ('\"' - 0x20))
1501			| (1L << ('*' - 0x20))
1502			| (1L << ('/' - 0x20))
1503			| (1L << (':' - 0x20))
1504			| (1L << ('<' - 0x20))
1505			| (1L << ('>' - 0x20))
1506			| (1L << ('?' - 0x20));
1507
1508	forbidden = (len == 0) ||
1509		    (strict && (name[len-1] == const_cpu_to_le16(' ') ||
1510				name[len-1] == const_cpu_to_le16('.')));
1511	for (i=0; i<len; i++) {
1512		ch = le16_to_cpu(name[i]);
1513		if ((ch < 0x20)
1514		    || ((ch < 0x40)
1515			&& ((1L << (ch - 0x20)) & mainset))
1516		    || (ch == '\\')
1517		    || (ch == '|'))
1518			forbidden = TRUE;
1519	}
1520	if (forbidden)
1521		errno = EINVAL;
1522	return (forbidden);
1523}
1524
1525/*
1526 *		Check whether a name contains no forbidden chars and
1527 *	is not a reserved name for DOS or Win32 use
1528 *
1529 *	The reserved names are CON, PRN, AUX, NUL, COM1..COM9, LPT1..LPT9
1530 *	with no suffix or any suffix.
1531 *
1532 *	If @strict is TRUE, then trailing dots and spaces are forbidden.
1533 *	These names are technically allowed in the Win32 namespace, but
1534 *	they can be problematic.  See comment for FILE_NAME_WIN32.
1535 *
1536 *	If the name is forbidden, errno is set to EINVAL
1537 */
1538
1539BOOL ntfs_forbidden_names(ntfs_volume *vol, const ntfschar *name, int len,
1540			  BOOL strict)
1541{
1542	BOOL forbidden;
1543	int h;
1544	static const ntfschar dot = const_cpu_to_le16('.');
1545	static const ntfschar con[] = { const_cpu_to_le16('c'),
1546			const_cpu_to_le16('o'), const_cpu_to_le16('n') };
1547	static const ntfschar prn[] = { const_cpu_to_le16('p'),
1548			const_cpu_to_le16('r'), const_cpu_to_le16('n') };
1549	static const ntfschar aux[] = { const_cpu_to_le16('a'),
1550			const_cpu_to_le16('u'), const_cpu_to_le16('x') };
1551	static const ntfschar nul[] = { const_cpu_to_le16('n'),
1552			const_cpu_to_le16('u'), const_cpu_to_le16('l') };
1553	static const ntfschar com[] = { const_cpu_to_le16('c'),
1554			const_cpu_to_le16('o'), const_cpu_to_le16('m') };
1555	static const ntfschar lpt[] = { const_cpu_to_le16('l'),
1556			const_cpu_to_le16('p'), const_cpu_to_le16('t') };
1557
1558	forbidden = ntfs_forbidden_chars(name, len, strict);
1559	if (!forbidden && (len >= 3)) {
1560		/*
1561		 * Rough hash check to tell whether the first couple of chars
1562		 * may be one of CO PR AU NU LP or lowercase variants.
1563		 */
1564		h = ((le16_to_cpu(name[0]) & 31)*48)
1565				^ ((le16_to_cpu(name[1]) & 31)*165);
1566		if ((h % 23) == 17) {
1567			/* do a full check, depending on the third char */
1568			switch (le16_to_cpu(name[2]) & ~0x20) {
1569			case 'N' :
1570				if (((len == 3) || (name[3] == dot))
1571				    && (!ntfs_ucsncasecmp(name, con, 3,
1572						vol->upcase, vol->upcase_len)
1573					|| !ntfs_ucsncasecmp(name, prn, 3,
1574						vol->upcase, vol->upcase_len)))
1575					forbidden = TRUE;
1576				break;
1577			case 'X' :
1578				if (((len == 3) || (name[3] == dot))
1579				    && !ntfs_ucsncasecmp(name, aux, 3,
1580						vol->upcase, vol->upcase_len))
1581					forbidden = TRUE;
1582				break;
1583			case 'L' :
1584				if (((len == 3) || (name[3] == dot))
1585				    && !ntfs_ucsncasecmp(name, nul, 3,
1586						vol->upcase, vol->upcase_len))
1587					forbidden = TRUE;
1588				break;
1589			case 'M' :
1590				if ((len > 3)
1591				    && (le16_to_cpu(name[3]) >= '1')
1592				    && (le16_to_cpu(name[3]) <= '9')
1593				    && ((len == 4) || (name[4] == dot))
1594				    && !ntfs_ucsncasecmp(name, com, 3,
1595						vol->upcase, vol->upcase_len))
1596					forbidden = TRUE;
1597				break;
1598			case 'T' :
1599				if ((len > 3)
1600				    && (le16_to_cpu(name[3]) >= '1')
1601				    && (le16_to_cpu(name[3]) <= '9')
1602				    && ((len == 4) || (name[4] == dot))
1603				    && !ntfs_ucsncasecmp(name, lpt, 3,
1604						vol->upcase, vol->upcase_len))
1605					forbidden = TRUE;
1606				break;
1607			}
1608		}
1609	}
1610
1611	if (forbidden)
1612		errno = EINVAL;
1613	return (forbidden);
1614}
1615
1616/*
1617 *		Check whether the same name can be used as a DOS and
1618 *	a Win32 name
1619 *
1620 *	The names must be the same, or the short name the uppercase
1621 *	variant of the long name
1622 */
1623
1624BOOL ntfs_collapsible_chars(ntfs_volume *vol,
1625			const ntfschar *shortname, int shortlen,
1626			const ntfschar *longname, int longlen)
1627{
1628	BOOL collapsible;
1629	unsigned int ch;
1630	unsigned int cs;
1631	int i;
1632
1633	collapsible = shortlen == longlen;
1634	for (i=0; collapsible && (i<shortlen); i++) {
1635		ch = le16_to_cpu(longname[i]);
1636		cs = le16_to_cpu(shortname[i]);
1637		if ((cs != ch)
1638		    && ((ch >= vol->upcase_len)
1639			|| (cs >= vol->upcase_len)
1640			|| (vol->upcase[cs] != vol->upcase[ch])))
1641				collapsible = FALSE;
1642	}
1643	return (collapsible);
1644}
1645
1646/*
1647 * Define the character encoding to be used.
1648 * Use UTF-8 unless specified otherwise.
1649 */
1650
1651int ntfs_set_char_encoding(const char *locale)
1652{
1653	use_utf8 = 0;
1654	if (!locale || strstr(locale,"utf8") || strstr(locale,"UTF8")
1655	    || strstr(locale,"utf-8") || strstr(locale,"UTF-8"))
1656		use_utf8 = 1;
1657	else
1658#ifndef __HAIKU__
1659		if (setlocale(LC_ALL, locale))
1660			use_utf8 = 0;
1661		else
1662#endif
1663		{
1664			ntfs_log_error("Invalid locale, encoding to UTF-8\n");
1665			use_utf8 = 1;
1666	 	}
1667	return 0; /* always successful */
1668}
1669
1670#if defined(__APPLE__) || defined(__DARWIN__)
1671
/*
 *		Enable or disable the normalization of file names
 *
 *	@normalize must be 0 or 1, it is recorded into nfconvert_utf8.
 *
 *	Returns 0 if the setting was recorded
 *		-1 if @normalize is neither 0 nor 1, or if the
 *		normalization support (ENABLE_NFCONV) is not built in
 */
int ntfs_macosx_normalize_filenames(int normalize) {
#ifdef ENABLE_NFCONV
	if ((normalize != 0) && (normalize != 1))
		return -1;
	nfconvert_utf8 = normalize;
	return 0;
#else
	return -1;
#endif /* ENABLE_NFCONV */
}
1685
/*
 *		Normalize a UTF-8 string through CoreFoundation
 *
 *	The string is normalized to form C if @composed is not zero,
 *	and to form D otherwise. On success the result is returned in
 *	*target as an allocated '\0'-terminated UTF-8 string, which
 *	has to be freed by the caller.
 *
 *	Returns the length in bytes of the result, terminator excluded
 *		-2 if the input could not be converted to a CFString
 *		-3 if the CFString could not be copied
 *		-1 for any other failure, or if the normalization
 *		support (ENABLE_NFCONV) is not built in
 */
int ntfs_macosx_normalize_utf8(const char *utf8_string, char **target,
		int composed)
{
#ifdef ENABLE_NFCONV
	/* For this code to compile, the CoreFoundation framework must be fed to
	 * the linker. */
	CFStringRef source;
	CFMutableStringRef work;
	CFRange whole;
	CFIndex needed;
	char *buf = NULL;
	int buflen = -1;

	/* Get a CFString from the UTF-8 input. */
	source = CFStringCreateWithCString(kCFAllocatorDefault,
		utf8_string, kCFStringEncodingUTF8);
	if (source == NULL) {
		ntfs_log_error("CFStringCreateWithCString failed!\n");
		return -2;
	}

	/* Work on a mutable copy, the source is not needed any more. */
	work = CFStringCreateMutableCopy(kCFAllocatorDefault, 0, source);
	CFRelease(source);
	if (work == NULL) {
		ntfs_log_error("CFStringCreateMutableCopy failed!\n");
		return -3;
	}

	/* Normalize in place to the requested form. */
	CFStringNormalize(work, (composed != 0 ?
		kCFStringNormalizationFormC : kCFStringNormalizationFormD));

	/* First call without a buffer, only to get the needed size. */
	whole = CFRangeMake(0, CFStringGetLength(work));
	if (CFStringGetBytes(work, whole, kCFStringEncodingUTF8, 0, false,
		NULL, 0, &needed) <= 0)
	{
		ntfs_log_error("Could not perform check for required length of "
			"UTF-8 conversion of normalized CFMutableString.\n");
		goto done;
	}

	/* One more byte than needed, for the '\0' terminator. */
	buflen = sizeof(char) * (needed + 1);
	buf = ntfs_calloc(buflen);
	if (buf == NULL) {
		ntfs_log_error("Could not perform a ntfs_calloc of %d "
			"bytes for char *result.\n", buflen);
		goto done;
	}

	/* Second call, doing the conversion into the buffer. */
	if (CFStringGetBytes(work, whole, kCFStringEncodingUTF8, 0, false,
		(UInt8*) buf, buflen - 1, &needed) <= 0)
	{
		ntfs_log_error("Could not perform UTF-8 "
			"conversion of normalized "
			"CFMutableString.\n");
		free(buf);
		buf = NULL;
	}

done:
	CFRelease(work);

	if (buf == NULL)
		return -1;
	*target = buf;
	return buflen - 1;
#else
	return -1;
#endif /* ENABLE_NFCONV */
}
1767#endif /* defined(__APPLE__) || defined(__DARWIN__) */
1768