1/**
2 * unistr.c - Unicode string handling. Originated from the Linux-NTFS project.
3 *
4 * Copyright (c) 2000-2004 Anton Altaparmakov
5 * Copyright (c) 2002-2009 Szabolcs Szakacsits
6 * Copyright (c) 2008-2009 Jean-Pierre Andre
7 * Copyright (c) 2008      Bernhard Kaindl
8 *
9 * This program/include file is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License as published
11 * by the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program/include file is distributed in the hope that it will be
15 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
16 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program (in the main directory of the NTFS-3G
21 * distribution in the file COPYING); if not, write to the Free Software
22 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
23 */
24
25#ifdef HAVE_CONFIG_H
26#include "config.h"
27#endif
28
29#ifdef HAVE_STDIO_H
30#include <stdio.h>
31#endif
32#ifdef HAVE_STDLIB_H
33#include <stdlib.h>
34#endif
35#ifdef HAVE_WCHAR_H
36#include <wchar.h>
37#endif
38#ifdef HAVE_STRING_H
39#include <string.h>
40#endif
41#ifdef HAVE_ERRNO_H
42#include <errno.h>
43#endif
44#ifdef HAVE_LOCALE_H
45#include <locale.h>
46#endif
47
48#include "compat.h"
49#include "attrib.h"
50#include "types.h"
51#include "unistr.h"
52#include "debug.h"
53#include "logging.h"
54#include "misc.h"
55
#define NOREVBOM 0  /* JPA rejecting U+FFFE and U+FFFF, open to debate */

/*
 * Foxconn added pling 03/30/2009, for uclibc: pin the maximum multibyte
 * character length to one byte.
 *
 * <stdlib.h> may already provide MB_CUR_MAX, and redefining a macro with a
 * different replacement list is not allowed, so any previous definition is
 * dropped first.
 */
#ifdef MB_CUR_MAX
#undef MB_CUR_MAX
#endif
#define MB_CUR_MAX 1
59
60/*
61 * IMPORTANT
62 * =========
63 *
64 * All these routines assume that the Unicode characters are in little endian
65 * encoding inside the strings!!!
66 */
67
/*
 * Encoding selector for file names: when non-zero, ntfs_ucstombs() and
 * ntfs_mbstoucs() use the built-in UTF-8 <-> UTF-16LE converters; when zero
 * they fall back to the locale based multibyte routines.
 */
static int use_utf8 = 1; /* use UTF-8 encoding for file names */
69
/*
 * This is used by the name collation functions to quickly determine what
 * characters are (in)valid.
 *
 * The table holds one flag byte for each of the first 0x40 code points.
 * It is currently compiled out, as are the checks in ntfs_names_collate()
 * that would test bit 0x08 of an entry.
 */
#if 0
static const u8 legal_ansi_char_array[0x40] = {
	0x00, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
	0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,

	0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
	0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,

	0x17, 0x07, 0x18, 0x17, 0x17, 0x17, 0x17, 0x17,
	0x17, 0x17, 0x18, 0x16, 0x16, 0x17, 0x07, 0x00,

	0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17,
	0x17, 0x17, 0x04, 0x16, 0x18, 0x16, 0x18, 0x18,
};
#endif
89
90/**
91 * ntfs_names_are_equal - compare two Unicode names for equality
92 * @s1:			name to compare to @s2
93 * @s1_len:		length in Unicode characters of @s1
94 * @s2:			name to compare to @s1
95 * @s2_len:		length in Unicode characters of @s2
96 * @ic:			ignore case bool
97 * @upcase:		upcase table (only if @ic == IGNORE_CASE)
98 * @upcase_size:	length in Unicode characters of @upcase (if present)
99 *
100 * Compare the names @s1 and @s2 and return TRUE (1) if the names are
101 * identical, or FALSE (0) if they are not identical. If @ic is IGNORE_CASE,
102 * the @upcase table is used to perform a case insensitive comparison.
103 */
104BOOL ntfs_names_are_equal(const ntfschar *s1, size_t s1_len,
105		const ntfschar *s2, size_t s2_len,
106		const IGNORE_CASE_BOOL ic,
107		const ntfschar *upcase, const u32 upcase_size)
108{
109	if (s1_len != s2_len)
110		return FALSE;
111	if (!s1_len)
112		return TRUE;
113	if (ic == CASE_SENSITIVE)
114		return ntfs_ucsncmp(s1, s2, s1_len) ? FALSE: TRUE;
115	return ntfs_ucsncasecmp(s1, s2, s1_len, upcase, upcase_size) ? FALSE:
116								       TRUE;
117}
118
119/**
120 * ntfs_names_collate - collate two Unicode names
121 * @name1:	first Unicode name to compare
122 * @name1_len:	length of first Unicode name to compare
123 * @name2:	second Unicode name to compare
124 * @name2_len:	length of second Unicode name to compare
125 * @err_val:	if @name1 contains an invalid character return this value
126 * @ic:		either CASE_SENSITIVE or IGNORE_CASE
127 * @upcase:	upcase table (ignored if @ic is CASE_SENSITIVE)
128 * @upcase_len:	upcase table size (ignored if @ic is CASE_SENSITIVE)
129 *
130 * ntfs_names_collate() collates two Unicode names and returns:
131 *
132 *  -1 if the first name collates before the second one,
133 *   0 if the names match,
134 *   1 if the second name collates before the first one, or
135 * @err_val if an invalid character is found in @name1 during the comparison.
136 *
137 * The following characters are considered invalid: '"', '*', '<', '>' and '?'.
138 */
139int ntfs_names_collate(const ntfschar *name1, const u32 name1_len,
140		const ntfschar *name2, const u32 name2_len,
141		const int err_val __attribute__((unused)),
142		const IGNORE_CASE_BOOL ic, const ntfschar *upcase,
143		const u32 upcase_len)
144{
145	u32 cnt;
146	ntfschar c1, c2;
147
148#ifdef DEBUG
149	if (!name1 || !name2 || (ic && (!upcase || !upcase_len))) {
150		ntfs_log_debug("ntfs_names_collate received NULL pointer!\n");
151		exit(1);
152	}
153#endif
154	for (cnt = 0; cnt < min(name1_len, name2_len); ++cnt) {
155		c1 = le16_to_cpu(*name1);
156		name1++;
157		c2 = le16_to_cpu(*name2);
158		name2++;
159		if (ic) {
160			if (c1 < upcase_len)
161				c1 = le16_to_cpu(upcase[c1]);
162			if (c2 < upcase_len)
163				c2 = le16_to_cpu(upcase[c2]);
164		}
165#if 0
166		if (c1 < 64 && legal_ansi_char_array[c1] & 8)
167			return err_val;
168#endif
169		if (c1 < c2)
170			return -1;
171		if (c1 > c2)
172			return 1;
173	}
174	if (name1_len < name2_len)
175		return -1;
176	if (name1_len == name2_len)
177		return 0;
178	/* name1_len > name2_len */
179#if 0
180	c1 = le16_to_cpu(*name1);
181	if (c1 < 64 && legal_ansi_char_array[c1] & 8)
182		return err_val;
183#endif
184	return 1;
185}
186
187/**
188 * ntfs_ucsncmp - compare two little endian Unicode strings
189 * @s1:		first string
190 * @s2:		second string
191 * @n:		maximum unicode characters to compare
192 *
193 * Compare the first @n characters of the Unicode strings @s1 and @s2,
194 * The strings in little endian format and appropriate le16_to_cpu()
195 * conversion is performed on non-little endian machines.
196 *
197 * The function returns an integer less than, equal to, or greater than zero
198 * if @s1 (or the first @n Unicode characters thereof) is found, respectively,
199 * to be less than, to match, or be greater than @s2.
200 */
201int ntfs_ucsncmp(const ntfschar *s1, const ntfschar *s2, size_t n)
202{
203	ntfschar c1, c2;
204	size_t i;
205
206#ifdef DEBUG
207	if (!s1 || !s2) {
208		ntfs_log_debug("ntfs_wcsncmp() received NULL pointer!\n");
209		exit(1);
210	}
211#endif
212	for (i = 0; i < n; ++i) {
213		c1 = le16_to_cpu(s1[i]);
214		c2 = le16_to_cpu(s2[i]);
215		if (c1 < c2)
216			return -1;
217		if (c1 > c2)
218			return 1;
219		if (!c1)
220			break;
221	}
222	return 0;
223}
224
225/**
226 * ntfs_ucsncasecmp - compare two little endian Unicode strings, ignoring case
227 * @s1:			first string
228 * @s2:			second string
229 * @n:			maximum unicode characters to compare
230 * @upcase:		upcase table
231 * @upcase_size:	upcase table size in Unicode characters
232 *
233 * Compare the first @n characters of the Unicode strings @s1 and @s2,
234 * ignoring case. The strings in little endian format and appropriate
235 * le16_to_cpu() conversion is performed on non-little endian machines.
236 *
237 * Each character is uppercased using the @upcase table before the comparison.
238 *
239 * The function returns an integer less than, equal to, or greater than zero
240 * if @s1 (or the first @n Unicode characters thereof) is found, respectively,
241 * to be less than, to match, or be greater than @s2.
242 */
243int ntfs_ucsncasecmp(const ntfschar *s1, const ntfschar *s2, size_t n,
244		const ntfschar *upcase, const u32 upcase_size)
245{
246	ntfschar c1, c2;
247	size_t i;
248
249#ifdef DEBUG
250	if (!s1 || !s2 || !upcase) {
251		ntfs_log_debug("ntfs_wcsncasecmp() received NULL pointer!\n");
252		exit(1);
253	}
254#endif
255	for (i = 0; i < n; ++i) {
256		if ((c1 = le16_to_cpu(s1[i])) < upcase_size)
257			c1 = le16_to_cpu(upcase[c1]);
258		if ((c2 = le16_to_cpu(s2[i])) < upcase_size)
259			c2 = le16_to_cpu(upcase[c2]);
260		if (c1 < c2)
261			return -1;
262		if (c1 > c2)
263			return 1;
264		if (!c1)
265			break;
266	}
267	return 0;
268}
269
270/**
271 * ntfs_ucsnlen - determine the length of a little endian Unicode string
272 * @s:		pointer to Unicode string
273 * @maxlen:	maximum length of string @s
274 *
275 * Return the number of Unicode characters in the little endian Unicode
276 * string @s up to a maximum of maxlen Unicode characters, not including
277 * the terminating (ntfschar)'\0'. If there is no (ntfschar)'\0' between @s
278 * and @s + @maxlen, @maxlen is returned.
279 *
280 * This function never looks beyond @s + @maxlen.
281 */
282u32 ntfs_ucsnlen(const ntfschar *s, u32 maxlen)
283{
284	u32 i;
285
286	for (i = 0; i < maxlen; i++) {
287		if (!le16_to_cpu(s[i]))
288			break;
289	}
290	return i;
291}
292
293/**
294 * ntfs_ucsndup - duplicate little endian Unicode string
295 * @s:		pointer to Unicode string
296 * @maxlen:	maximum length of string @s
297 *
298 * Return a pointer to a new little endian Unicode string which is a duplicate
299 * of the string s.  Memory for the new string is obtained with ntfs_malloc(3),
300 * and can be freed with free(3).
301 *
302 * A maximum of @maxlen Unicode characters are copied and a terminating
303 * (ntfschar)'\0' little endian Unicode character is added.
304 *
305 * This function never looks beyond @s + @maxlen.
306 *
307 * Return a pointer to the new little endian Unicode string on success and NULL
308 * on failure with errno set to the error code.
309 */
310ntfschar *ntfs_ucsndup(const ntfschar *s, u32 maxlen)
311{
312	ntfschar *dst;
313	u32 len;
314
315	len = ntfs_ucsnlen(s, maxlen);
316	dst = ntfs_malloc((len + 1) * sizeof(ntfschar));
317	if (dst) {
318		memcpy(dst, s, len * sizeof(ntfschar));
319		dst[len] = cpu_to_le16(L'\0');
320	}
321	return dst;
322}
323
324/**
325 * ntfs_name_upcase - Map an Unicode name to its uppercase equivalent
326 * @name:
327 * @name_len:
328 * @upcase:
329 * @upcase_len:
330 *
331 * Description...
332 *
333 * Returns:
334 */
335void ntfs_name_upcase(ntfschar *name, u32 name_len, const ntfschar *upcase,
336		const u32 upcase_len)
337{
338	u32 i;
339	ntfschar u;
340
341	for (i = 0; i < name_len; i++)
342		if ((u = le16_to_cpu(name[i])) < upcase_len)
343			name[i] = upcase[u];
344}
345
346/**
347 * ntfs_file_value_upcase - Convert a filename to upper case
348 * @file_name_attr:
349 * @upcase:
350 * @upcase_len:
351 *
352 * Description...
353 *
354 * Returns:
355 */
356void ntfs_file_value_upcase(FILE_NAME_ATTR *file_name_attr,
357		const ntfschar *upcase, const u32 upcase_len)
358{
359	ntfs_name_upcase((ntfschar*)&file_name_attr->file_name,
360			file_name_attr->file_name_length, upcase, upcase_len);
361}
362
363/**
364 * ntfs_file_values_compare - Which of two filenames should be listed first
365 * @file_name_attr1:
366 * @file_name_attr2:
367 * @err_val:
368 * @ic:
369 * @upcase:
370 * @upcase_len:
371 *
372 * Description...
373 *
374 * Returns:
375 */
376int ntfs_file_values_compare(const FILE_NAME_ATTR *file_name_attr1,
377		const FILE_NAME_ATTR *file_name_attr2,
378		const int err_val, const IGNORE_CASE_BOOL ic,
379		const ntfschar *upcase, const u32 upcase_len)
380{
381	return ntfs_names_collate((ntfschar*)&file_name_attr1->file_name,
382			file_name_attr1->file_name_length,
383			(ntfschar*)&file_name_attr2->file_name,
384			file_name_attr2->file_name_length,
385			err_val, ic, upcase, upcase_len);
386}
387
388/*
   NTFS uses Unicode (UTF-16LE [NTFS-3G uses UCS-2LE, which is enough
   for now]) for path names, but the Unicode code points need to be
   converted before a path can be accessed under NTFS. For 7 bit ASCII/ANSI,
   glibc does this even without a locale in a hard-coded fashion, as that
   appears to be easy because the low 7-bit ASCII range appears to be
   available in all charsets. However, it does not convert anything if
   there was some error with the locale setup, or if no locale was set up,
   as when mount is called during early boot, where it (by policy) does
   not use locales (which may not be available if /usr is not yet mounted).
   This patch fixes the resulting issues for systems which use
   UTF-8; for others, specifying the locale in fstab brings them
   the encoding which they want.
401
   If no locale is defined or there was a problem with setting one
   up, and whenever nl_langinfo(CODESET) returns a string starting with
   "ANSI", use an internal UCS-2LE <-> UTF-8 codeset converter to fix
   the bug where NTFS-3G does not show any path names which include
   international characters (and also fails on creating them) as a result.
407
408   Author: Bernhard Kaindl <bk@suse.de>
409   Jean-Pierre Andre made it compliant with RFC3629/RFC2781.
410*/
411
412/*
413 * Return the amount of 8-bit elements in UTF-8 needed (without the terminating
414 * null) to store a given UTF-16LE string.
415 *
416 * Return -1 with errno set if string has invalid byte sequence or too long.
417 */
418static int utf16_to_utf8_size(const ntfschar *ins, const int ins_len, int outs_len)
419{
420	int i, ret = -1;
421	int count = 0;
422	BOOL surrog;
423
424	surrog = FALSE;
425	for (i = 0; i < ins_len && ins[i]; i++) {
426		unsigned short c = le16_to_cpu(ins[i]);
427		if (surrog) {
428			if ((c >= 0xdc00) && (c < 0xe000)) {
429				surrog = FALSE;
430				count += 4;
431			} else
432				goto fail;
433		} else
434			if (c < 0x80)
435				count++;
436			else if (c < 0x800)
437				count += 2;
438			else if (c < 0xd800)
439				count += 3;
440			else if (c < 0xdc00)
441				surrog = TRUE;
442#if NOREVBOM
443			else if ((c >= 0xe000) && (c < 0xfffe))
444#else
445			else if (c >= 0xe000)
446#endif
447				count += 3;
448			else
449				goto fail;
450		if (count > outs_len) {
451			errno = ENAMETOOLONG;
452			goto out;
453		}
454	}
455	if (surrog)
456		goto fail;
457
458	ret = count;
459out:
460	return ret;
461fail:
462	errno = EILSEQ;
463	goto out;
464}
465
466/*
467 * ntfs_utf16_to_utf8 - convert a little endian UTF16LE string to an UTF-8 string
468 * @ins:	input utf16 string buffer
469 * @ins_len:	length of input string in utf16 characters
470 * @outs:	on return contains the (allocated) output multibyte string
471 * @outs_len:	length of output buffer in bytes
472 *
473 * Return -1 with errno set if string has invalid byte sequence or too long.
474 */
475static int ntfs_utf16_to_utf8(const ntfschar *ins, const int ins_len,
476			      char **outs, int outs_len)
477{
478	char *t;
479	int i, size, ret = -1;
480	ntfschar halfpair;
481
482	halfpair = 0;
483	if (!*outs)
484		outs_len = PATH_MAX;
485
486	size = utf16_to_utf8_size(ins, ins_len, outs_len);
487
488	if (size < 0)
489		goto out;
490
491	if (!*outs) {
492		outs_len = size + 1;
493		*outs = ntfs_malloc(outs_len);
494		if (!*outs)
495			goto out;
496	}
497
498	t = *outs;
499
500	for (i = 0; i < ins_len && ins[i]; i++) {
501	    unsigned short c = le16_to_cpu(ins[i]);
502			/* size not double-checked */
503		if (halfpair) {
504			if ((c >= 0xdc00) && (c < 0xe000)) {
505				*t++ = 0xf0 + (((halfpair + 64) >> 8) & 7);
506				*t++ = 0x80 + (((halfpair + 64) >> 2) & 63);
507				*t++ = 0x80 + ((c >> 6) & 15) + ((halfpair & 3) << 4);
508				*t++ = 0x80 + (c & 63);
509				halfpair = 0;
510			} else
511				goto fail;
512		} else if (c < 0x80) {
513			*t++ = c;
514	    	} else {
515			if (c < 0x800) {
516			   	*t++ = (0xc0 | ((c >> 6) & 0x3f));
517			        *t++ = 0x80 | (c & 0x3f);
518			} else if (c < 0xd800) {
519			   	*t++ = 0xe0 | (c >> 12);
520			   	*t++ = 0x80 | ((c >> 6) & 0x3f);
521		        	*t++ = 0x80 | (c & 0x3f);
522			} else if (c < 0xdc00)
523				halfpair = c;
524			else if (c >= 0xe000) {
525				*t++ = 0xe0 | (c >> 12);
526				*t++ = 0x80 | ((c >> 6) & 0x3f);
527			        *t++ = 0x80 | (c & 0x3f);
528			} else
529				goto fail;
530	        }
531	}
532	*t = '\0';
533	ret = t - *outs;
534out:
535	return ret;
536fail:
537	errno = EILSEQ;
538	goto out;
539}
540
/*
 * Count the 16-bit units (without the terminating null) which the UTF-16LE
 * form of a UTF-8 string occupies.
 *
 * Every lead byte counts as one unit and a lead byte of 0xF0 or above whose
 * three following bytes are all non-zero counts as two.  The bytes following
 * a lead byte are only skipped, they are not inspected.
 *
 * Return -1 with errno set to ENAMETOOLONG if the count reaches PATH_MAX, or
 * -1 with errno set to EILSEQ if a byte of 0xF5 or above is found in lead
 * position.
 *
 * Note: This does not check whether the input sequence is a valid utf8 string,
 *	 and should be used only in context where such check is made!
 */
static int utf8_to_utf16_size(const char *s)
{
	const unsigned char *p = (const unsigned char *)s;
	size_t units = 0;
	unsigned int lead;
	int extra, skipped;

	for (;;) {
		lead = *p++;
		if (!lead)
			break;
		if (++units >= PATH_MAX) {
			errno = ENAMETOOLONG;
			return -1;
		}
		if (lead >= 0xF5) {
			errno = EILSEQ;
			return -1;
		}
		/* Number of bytes announced by the lead byte. */
		extra = (lead >= 0xC0) + (lead >= 0xE0) + (lead >= 0xF0);
		/* Skip them, but never step over the end of the string. */
		for (skipped = 0; skipped < extra && *p; skipped++)
			p++;
		/* A complete four byte sequence yields a surrogate pair. */
		if (skipped == 3) {
			if (++units >= PATH_MAX) {
				errno = ENAMETOOLONG;
				return -1;
			}
		}
	}
	return (int)units;
}
586/*
587 * This converts one UTF-8 sequence to cpu-endian Unicode value
588 * within range U+0 .. U+10ffff and excluding U+D800 .. U+DFFF
589 *
590 * Return the number of used utf8 bytes or -1 with errno set
591 * if sequence is invalid.
592 */
593static int utf8_to_unicode(u32 *wc, const char *s)
594{
595    	unsigned int byte = *((const unsigned char *)s);
596
597					/* single byte */
598	if (byte == 0) {
599		*wc = (u32) 0;
600		return 0;
601	} else if (byte < 0x80) {
602		*wc = (u32) byte;
603		return 1;
604					/* double byte */
605	} else if (byte < 0xc2) {
606		goto fail;
607	} else if (byte < 0xE0) {
608		if (strlen(s) < 2)
609			goto fail;
610		if ((s[1] & 0xC0) == 0x80) {
611			*wc = ((u32)(byte & 0x1F) << 6)
612			    | ((u32)(s[1] & 0x3F));
613			return 2;
614		} else
615			goto fail;
616					/* three-byte */
617	} else if (byte < 0xF0) {
618		if (strlen(s) < 3)
619			goto fail;
620		if (((s[1] & 0xC0) == 0x80) && ((s[2] & 0xC0) == 0x80)) {
621			*wc = ((u32)(byte & 0x0F) << 12)
622			    | ((u32)(s[1] & 0x3F) << 6)
623			    | ((u32)(s[2] & 0x3F));
624			/* Check valid ranges */
625#if NOREVBOM
626			if (((*wc >= 0x800) && (*wc <= 0xD7FF))
627			  || ((*wc >= 0xe000) && (*wc <= 0xFFFD)))
628				return 3;
629#else
630			if (((*wc >= 0x800) && (*wc <= 0xD7FF))
631			  || ((*wc >= 0xe000) && (*wc <= 0xFFFF)))
632				return 3;
633#endif
634		}
635		goto fail;
636					/* four-byte */
637	} else if (byte < 0xF5) {
638		if (strlen(s) < 4)
639			goto fail;
640		if (((s[1] & 0xC0) == 0x80) && ((s[2] & 0xC0) == 0x80)
641		  && ((s[3] & 0xC0) == 0x80)) {
642			*wc = ((u32)(byte & 0x07) << 18)
643			    | ((u32)(s[1] & 0x3F) << 12)
644			    | ((u32)(s[2] & 0x3F) << 6)
645			    | ((u32)(s[3] & 0x3F));
646		/* Check valid ranges */
647		if ((*wc <= 0x10ffff) && (*wc >= 0x10000))
648			return 4;
649		}
650		goto fail;
651	}
652fail:
653	errno = EILSEQ;
654	return -1;
655}
656
657/**
658 * ntfs_utf8_to_utf16 - convert a UTF-8 string to a UTF-16LE string
659 * @ins:	input multibyte string buffer
660 * @outs:	on return contains the (allocated) output utf16 string
661 * @outs_len:	length of output buffer in utf16 characters
662 *
663 * Return -1 with errno set.
664 */
665static int ntfs_utf8_to_utf16(const char *ins, ntfschar **outs)
666{
667	const char *t = ins;
668	u32 wc;
669	ntfschar *outpos;
670	int shorts, ret = -1;
671
672	shorts = utf8_to_utf16_size(ins);
673	if (shorts < 0)
674		goto fail;
675
676	if (!*outs) {
677		*outs = ntfs_malloc((shorts + 1) * sizeof(ntfschar));
678		if (!*outs)
679			goto fail;
680	}
681
682	outpos = *outs;
683
684	while(1) {
685		int m  = utf8_to_unicode(&wc, t);
686		if (m < 0)
687			goto fail;
688		if (wc < 0x10000)
689			*outpos++ = cpu_to_le16(wc);
690		else {
691			wc -= 0x10000;
692			*outpos++ = cpu_to_le16((wc >> 10) + 0xd800);
693			*outpos++ = cpu_to_le16((wc & 0x3ff) + 0xdc00);
694		}
695		if (m == 0)
696			break;
697		t += m;
698	}
699
700	ret = --outpos - *outs;
701fail:
702	return ret;
703}
704
705/**
706 * ntfs_ucstombs - convert a little endian Unicode string to a multibyte string
707 * @ins:	input Unicode string buffer
708 * @ins_len:	length of input string in Unicode characters
709 * @outs:	on return contains the (allocated) output multibyte string
710 * @outs_len:	length of output buffer in bytes
711 *
712 * Convert the input little endian, 2-byte Unicode string @ins, of length
713 * @ins_len into the multibyte string format dictated by the current locale.
714 *
715 * If *@outs is NULL, the function allocates the string and the caller is
716 * responsible for calling free(*@outs); when finished with it.
717 *
718 * On success the function returns the number of bytes written to the output
719 * string *@outs (>= 0), not counting the terminating NULL byte. If the output
720 * string buffer was allocated, *@outs is set to it.
721 *
722 * On error, -1 is returned, and errno is set to the error code. The following
723 * error codes can be expected:
724 *	EINVAL		Invalid arguments (e.g. @ins or @outs is NULL).
725 *	EILSEQ		The input string cannot be represented as a multibyte
726 *			sequence according to the current locale.
727 *	ENAMETOOLONG	Destination buffer is too small for input string.
728 *	ENOMEM		Not enough memory to allocate destination buffer.
729 */
730int ntfs_ucstombs(const ntfschar *ins, const int ins_len, char **outs,
731		int outs_len)
732{
733	char *mbs;
734	wchar_t wc;
735	int i, o, mbs_len;
736	int cnt = 0;
737#ifdef HAVE_MBSINIT
738	mbstate_t mbstate;
739#endif
740
741	if (!ins || !outs) {
742		errno = EINVAL;
743		return -1;
744	}
745	mbs = *outs;
746	mbs_len = outs_len;
747	if (mbs && !mbs_len) {
748		errno = ENAMETOOLONG;
749		return -1;
750	}
751	if (use_utf8)
752		return ntfs_utf16_to_utf8(ins, ins_len, outs, outs_len);
753	if (!mbs) {
754		mbs_len = (ins_len + 1) * MB_CUR_MAX;
755		mbs = ntfs_malloc(mbs_len);
756		if (!mbs)
757			return -1;
758	}
759#ifdef HAVE_MBSINIT
760	memset(&mbstate, 0, sizeof(mbstate));
761#else
762    printf("%s: sizeof(wchar_t) = %d\n", __FUNCTION__, sizeof(wchar_t));
763	//wctomb(NULL, 0);
764#endif
765	for (i = o = 0; i < ins_len; i++) {
766		/* Reallocate memory if necessary or abort. */
767		if ((int)(o + MB_CUR_MAX) > mbs_len) {
768			char *tc;
769			if (mbs == *outs) {
770				errno = ENAMETOOLONG;
771				return -1;
772			}
773			tc = ntfs_malloc((mbs_len + 64) & ~63);
774			if (!tc)
775				goto err_out;
776			memcpy(tc, mbs, mbs_len);
777			mbs_len = (mbs_len + 64) & ~63;
778			free(mbs);
779			mbs = tc;
780		}
781		/* Convert the LE Unicode character to a CPU wide character. */
782		wc = (wchar_t)le16_to_cpu(ins[i]);
783		if (!wc)
784			break;
785		/* Convert the CPU endian wide character to multibyte. */
786#ifdef HAVE_MBSINIT
787		cnt = wcrtomb(mbs + o, wc, &mbstate);
788#else
789//		cnt = wctomb(mbs + o, wc);
790        printf("********* wc = 0x%04x\n", wc);
791#endif
792		if (cnt == -1)
793			goto err_out;
794		if (cnt <= 0) {
795			ntfs_log_debug("Eeek. cnt <= 0, cnt = %i\n", cnt);
796			errno = EINVAL;
797			goto err_out;
798		}
799		o += cnt;
800	}
801#ifdef HAVE_MBSINIT
802	/* Make sure we are back in the initial state. */
803	if (!mbsinit(&mbstate)) {
804		ntfs_log_debug("Eeek. mbstate not in initial state!\n");
805		errno = EILSEQ;
806		goto err_out;
807	}
808#endif
809	/* Now write the NULL character. */
810	mbs[o] = '\0';
811	if (*outs != mbs)
812		*outs = mbs;
813	return o;
814err_out:
815	if (mbs != *outs) {
816		int eo = errno;
817		free(mbs);
818		errno = eo;
819	}
820	return -1;
821}
822
823/**
824 * ntfs_mbstoucs - convert a multibyte string to a little endian Unicode string
825 * @ins:	input multibyte string buffer
826 * @outs:	on return contains the (allocated) output Unicode string
827 *
828 * Convert the input multibyte string @ins, from the current locale into the
829 * corresponding little endian, 2-byte Unicode string.
830 *
831 * The function allocates the string and the caller is responsible for calling
832 * free(*@outs); when finished with it.
833 *
834 * On success the function returns the number of Unicode characters written to
835 * the output string *@outs (>= 0), not counting the terminating Unicode NULL
836 * character.
837 *
838 * On error, -1 is returned, and errno is set to the error code. The following
839 * error codes can be expected:
840 *	EINVAL		Invalid arguments (e.g. @ins or @outs is NULL).
841 *	EILSEQ		The input string cannot be represented as a Unicode
842 *			string according to the current locale.
843 *	ENAMETOOLONG	Destination buffer is too small for input string.
844 *	ENOMEM		Not enough memory to allocate destination buffer.
845 */
846int ntfs_mbstoucs(const char *ins, ntfschar **outs)
847{
848	ntfschar *ucs;
849	const char *s;
850	wchar_t wc;
851	int i, o, cnt, ins_len, ucs_len, ins_size;
852#ifdef HAVE_MBSINIT
853	mbstate_t mbstate;
854#endif
855
856	if (!ins || !outs) {
857		errno = EINVAL;
858		return -1;
859	}
860
861	if (use_utf8)
862		return ntfs_utf8_to_utf16(ins, outs);
863
864	/* Determine the size of the multi-byte string in bytes. */
865	ins_size = strlen(ins);
866	/* Determine the length of the multi-byte string. */
867	s = ins;
868#if defined(HAVE_MBSINIT)
869	memset(&mbstate, 0, sizeof(mbstate));
870	ins_len = mbsrtowcs(NULL, (const char **)&s, 0, &mbstate);
871#ifdef __CYGWIN32__
872	if (!ins_len && *ins) {
873		/* Older Cygwin had broken mbsrtowcs() implementation. */
874		ins_len = strlen(ins);
875	}
876#endif
877#elif !defined(DJGPP)
878	//ins_len = mbstowcs(NULL, s, 0);
879#else
880	/* Eeek!!! DJGPP has broken mbstowcs() implementation!!! */
881	ins_len = strlen(ins);
882#endif
883	if (ins_len == -1)
884		return ins_len;
885#ifdef HAVE_MBSINIT
886	if ((s != ins) || !mbsinit(&mbstate)) {
887#else
888	if (s != ins) {
889#endif
890		errno = EILSEQ;
891		return -1;
892	}
893	/* Add the NULL terminator. */
894	ins_len++;
895	ucs_len = ins_len;
896	ucs = ntfs_malloc(ucs_len * sizeof(ntfschar));
897	if (!ucs)
898		return -1;
899#ifdef HAVE_MBSINIT
900	memset(&mbstate, 0, sizeof(mbstate));
901#else
902	//mbtowc(NULL, NULL, 0);
903#endif
904	for (i = o = cnt = 0; i < ins_size; i += cnt, o++) {
905		/* Reallocate memory if necessary. */
906		if (o >= ucs_len) {
907			ntfschar *tc;
908			ucs_len = (ucs_len * sizeof(ntfschar) + 64) & ~63;
909			tc = realloc(ucs, ucs_len);
910			if (!tc)
911				goto err_out;
912			ucs = tc;
913			ucs_len /= sizeof(ntfschar);
914		}
915		/* Convert the multibyte character to a wide character. */
916#ifdef HAVE_MBSINIT
917		cnt = mbrtowc(&wc, ins + i, ins_size - i, &mbstate);
918#else
919	//	cnt = mbtowc(&wc, ins + i, ins_size - i);
920    printf("%s: wc = 0x%lx\n", __FUNCTION__, wc);
921#endif
922		if (!cnt)
923			break;
924		if (cnt == -1)
925			goto err_out;
926		if (cnt < -1) {
927			ntfs_log_trace("Eeek. cnt = %i\n", cnt);
928			errno = EINVAL;
929			goto err_out;
930		}
931		/* Make sure we are not overflowing the NTFS Unicode set. */
932		if ((unsigned long)wc >= (unsigned long)(1 <<
933				(8 * sizeof(ntfschar)))) {
934			errno = EILSEQ;
935			goto err_out;
936		}
937		/* Convert the CPU wide character to a LE Unicode character. */
938		ucs[o] = cpu_to_le16(wc);
939	}
940#ifdef HAVE_MBSINIT
941	/* Make sure we are back in the initial state. */
942	if (!mbsinit(&mbstate)) {
943		ntfs_log_trace("Eeek. mbstate not in initial state!\n");
944		errno = EILSEQ;
945		goto err_out;
946	}
947#endif
948	/* Now write the NULL character. */
949	ucs[o] = cpu_to_le16(L'\0');
950	*outs = ucs;
951	return o;
952err_out:
953	free(ucs);
954	return -1;
955}
956
957/**
958 * ntfs_upcase_table_build - build the default upcase table for NTFS
959 * @uc:		destination buffer where to store the built table
960 * @uc_len:	size of destination buffer in bytes
961 *
962 * ntfs_upcase_table_build() builds the default upcase table for NTFS and
963 * stores it in the caller supplied buffer @uc of size @uc_len.
964 *
965 * Note, @uc_len must be at least 128kiB in size or bad things will happen!
966 */
967void ntfs_upcase_table_build(ntfschar *uc, u32 uc_len)
968{
969	static int uc_run_table[][3] = { /* Start, End, Add */
970	{0x0061, 0x007B,  -32}, {0x0451, 0x045D, -80}, {0x1F70, 0x1F72,  74},
971	{0x00E0, 0x00F7,  -32}, {0x045E, 0x0460, -80}, {0x1F72, 0x1F76,  86},
972	{0x00F8, 0x00FF,  -32}, {0x0561, 0x0587, -48}, {0x1F76, 0x1F78, 100},
973	{0x0256, 0x0258, -205}, {0x1F00, 0x1F08,   8}, {0x1F78, 0x1F7A, 128},
974	{0x028A, 0x028C, -217}, {0x1F10, 0x1F16,   8}, {0x1F7A, 0x1F7C, 112},
975	{0x03AC, 0x03AD,  -38}, {0x1F20, 0x1F28,   8}, {0x1F7C, 0x1F7E, 126},
976	{0x03AD, 0x03B0,  -37}, {0x1F30, 0x1F38,   8}, {0x1FB0, 0x1FB2,   8},
977	{0x03B1, 0x03C2,  -32}, {0x1F40, 0x1F46,   8}, {0x1FD0, 0x1FD2,   8},
978	{0x03C2, 0x03C3,  -31}, {0x1F51, 0x1F52,   8}, {0x1FE0, 0x1FE2,   8},
979	{0x03C3, 0x03CC,  -32}, {0x1F53, 0x1F54,   8}, {0x1FE5, 0x1FE6,   7},
980	{0x03CC, 0x03CD,  -64}, {0x1F55, 0x1F56,   8}, {0x2170, 0x2180, -16},
981	{0x03CD, 0x03CF,  -63}, {0x1F57, 0x1F58,   8}, {0x24D0, 0x24EA, -26},
982	{0x0430, 0x0450,  -32}, {0x1F60, 0x1F68,   8}, {0xFF41, 0xFF5B, -32},
983	{0}
984	};
985	static int uc_dup_table[][2] = { /* Start, End */
986	{0x0100, 0x012F}, {0x01A0, 0x01A6}, {0x03E2, 0x03EF}, {0x04CB, 0x04CC},
987	{0x0132, 0x0137}, {0x01B3, 0x01B7}, {0x0460, 0x0481}, {0x04D0, 0x04EB},
988	{0x0139, 0x0149}, {0x01CD, 0x01DD}, {0x0490, 0x04BF}, {0x04EE, 0x04F5},
989	{0x014A, 0x0178}, {0x01DE, 0x01EF}, {0x04BF, 0x04BF}, {0x04F8, 0x04F9},
990	{0x0179, 0x017E}, {0x01F4, 0x01F5}, {0x04C1, 0x04C4}, {0x1E00, 0x1E95},
991	{0x018B, 0x018B}, {0x01FA, 0x0218}, {0x04C7, 0x04C8}, {0x1EA0, 0x1EF9},
992	{0}
993	};
994	static int uc_byte_table[][2] = { /* Offset, Value */
995	{0x00FF, 0x0178}, {0x01AD, 0x01AC}, {0x01F3, 0x01F1}, {0x0269, 0x0196},
996	{0x0183, 0x0182}, {0x01B0, 0x01AF}, {0x0253, 0x0181}, {0x026F, 0x019C},
997	{0x0185, 0x0184}, {0x01B9, 0x01B8}, {0x0254, 0x0186}, {0x0272, 0x019D},
998	{0x0188, 0x0187}, {0x01BD, 0x01BC}, {0x0259, 0x018F}, {0x0275, 0x019F},
999	{0x018C, 0x018B}, {0x01C6, 0x01C4}, {0x025B, 0x0190}, {0x0283, 0x01A9},
1000	{0x0192, 0x0191}, {0x01C9, 0x01C7}, {0x0260, 0x0193}, {0x0288, 0x01AE},
1001	{0x0199, 0x0198}, {0x01CC, 0x01CA}, {0x0263, 0x0194}, {0x0292, 0x01B7},
1002	{0x01A8, 0x01A7}, {0x01DD, 0x018E}, {0x0268, 0x0197},
1003	{0}
1004	};
1005	int i, r;
1006
1007	memset((char*)uc, 0, uc_len);
1008	uc_len >>= 1;
1009	if (uc_len > 65536)
1010		uc_len = 65536;
1011	for (i = 0; (u32)i < uc_len; i++)
1012		uc[i] = i;
1013	for (r = 0; uc_run_table[r][0]; r++)
1014		for (i = uc_run_table[r][0]; i < uc_run_table[r][1]; i++)
1015			uc[i] += uc_run_table[r][2];
1016	for (r = 0; uc_dup_table[r][0]; r++)
1017		for (i = uc_dup_table[r][0]; i < uc_dup_table[r][1]; i += 2)
1018			uc[i + 1]--;
1019	for (r = 0; uc_byte_table[r][0]; r++)
1020		uc[uc_byte_table[r][0]] = uc_byte_table[r][1];
1021}
1022
1023/**
1024 * ntfs_str2ucs - convert a string to a valid NTFS file name
1025 * @s:		input string
1026 * @len:	length of output buffer in Unicode characters
1027 *
1028 * Convert the input @s string into the corresponding little endian,
1029 * 2-byte Unicode string. The length of the converted string is less
1030 * or equal to the maximum length allowed by the NTFS format (255).
1031 *
1032 * If @s is NULL then return AT_UNNAMED.
1033 *
1034 * On success the function returns the Unicode string in an allocated
1035 * buffer and the caller is responsible to free it when it's not needed
1036 * anymore.
1037 *
1038 * On error NULL is returned and errno is set to the error code.
1039 */
1040ntfschar *ntfs_str2ucs(const char *s, int *len)
1041{
1042	ntfschar *ucs = NULL;
1043
1044	if (s && ((*len = ntfs_mbstoucs(s, &ucs)) == -1)) {
1045		ntfs_log_perror("Couldn't convert '%s' to Unicode", s);
1046		return NULL;
1047	}
1048	if (*len > NTFS_MAX_NAME_LEN) {
1049		free(ucs);
1050		errno = ENAMETOOLONG;
1051		return NULL;
1052	}
1053	if (!ucs || !*len) {
1054		ucs  = AT_UNNAMED;
1055		*len = 0;
1056	}
1057	return ucs;
1058}
1059
1060/**
1061 * ntfs_ucsfree - free memory allocated by ntfs_str2ucs()
1062 * @ucs		input string to be freed
1063 *
1064 * Free memory at @ucs and which was allocated by ntfs_str2ucs.
1065 *
1066 * Return value: none.
1067 */
1068void ntfs_ucsfree(ntfschar *ucs)
1069{
1070	if (ucs && (ucs != AT_UNNAMED))
1071		free(ucs);
1072}
1073
/*
 * Define the character encoding to be used.
 * Use UTF-8 unless specified otherwise.
 *
 * @locale:	locale name, or NULL
 *
 * UTF-8 is selected when @locale is NULL or contains "utf8", "UTF8", "utf-8"
 * or "UTF-8".  Any other name is handed to setlocale(); if that succeeds the
 * locale based conversion is selected, otherwise an error is logged and
 * UTF-8 is used.  The function always returns 0.
 *
 * This function is currently compiled out, which leaves use_utf8 at its
 * initial value as far as this file is concerned.
 */
#if 0       /* Foxconn removed pling 04/01/2009, not used but have compiler errors */
int ntfs_set_char_encoding(const char *locale)
{
	use_utf8 = 0;
	if (!locale || strstr(locale,"utf8") || strstr(locale,"UTF8")
	    || strstr(locale,"utf-8") || strstr(locale,"UTF-8"))
		use_utf8 = 1;
	else
		/* Not a UTF-8 name: try to activate the requested locale. */
		if (setlocale(LC_ALL, locale))
			use_utf8 = 0;
		else {
			ntfs_log_error("Invalid locale, encoding to UTF-8\n");
			use_utf8 = 1;
	 	}
	return 0; /* always successful */
}
#endif
1095