1/*
2   Unix SMB/CIFS implementation.
3   Samba charset module for Mac OS X/Darwin
4   Copyright (C) Benjamin Riefenstahl 2003
5
6   This program is free software; you can redistribute it and/or modify
7   it under the terms of the GNU General Public License as published by
8   the Free Software Foundation; either version 2 of the License, or
9   (at your option) any later version.
10
11   This program is distributed in the hope that it will be useful,
12   but WITHOUT ANY WARRANTY; without even the implied warranty of
13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14   GNU General Public License for more details.
15
16   You should have received a copy of the GNU General Public License
17   along with this program; if not, write to the Free Software
18   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19*/
20
21/*
22 * modules/charset_macosxfs.c
23 *
24 * A Samba charset module to use on Mac OS X/Darwin as the filesystem
25 * and display encoding.
26 *
27 * Actually two implementations are provided here.  The default
28 * implementation is based on the official CFString API.  The other is
29 * based on internal CFString APIs as defined in the OpenDarwin
30 * source.
31 */
32
33#include "includes.h"
34
35/*
36 * Include OS frameworks.  These are only needed in this module.
37 */
38#include <CoreFoundation/CFString.h>
39
40/*
41 * See if autoconf has found us the internal headers in some form.
42 */
43#if HAVE_COREFOUNDATION_CFSTRINGENCODINGCONVERTER_H
44#	include <Corefoundation/CFStringEncodingConverter.h>
45#	include <Corefoundation/CFUnicodePrecomposition.h>
46#	define USE_INTERNAL_API 1
47#elif HAVE_CFSTRINGENCODINGCONVERTER_H
48#	include <CFStringEncodingConverter.h>
49#	include <CFUnicodePrecomposition.h>
50#	define USE_INTERNAL_API 1
51#endif
52
53/*
54 * Compile time configuration: Do we want debug output?
55 */
56/* #define DEBUG_STRINGS 1 */
57
58/*
59 * A simple, but efficient memory provider for our buffers.
60 */
/*
 * Grow a heap buffer so that it can hold at least `newsize` bytes.
 *
 *   buffer   current allocation (may be NULL when *size is 0)
 *   size     in/out: current capacity in bytes, updated on growth
 *   newsize  required capacity in bytes
 *
 * The buffer is only ever grown, never shrunk, and 128 bytes of slack
 * are added on each growth to reduce the number of reallocations.
 *
 * Returns the (possibly moved) buffer.  If the allocation fails, or
 * if the padded size would not fit into a size_t, the old buffer is
 * freed, *size is reset to 0 and NULL is returned.  A caller that
 * stores the result back into its buffer pointer is thereby left with
 * a consistent empty state instead of a leaked block and a capacity
 * that was never actually allocated.
 */
static inline void *resize_buffer (void *buffer, size_t *size, size_t newsize)
{
	const size_t slack = 128;
	size_t padded = 0;
	void *grown = NULL;

	if (newsize <= *size) {
		return buffer;
	}

	/* Only attempt the allocation if newsize + slack cannot wrap. */
	if (newsize <= (size_t)-1 - slack) {
		padded = newsize + slack;
		grown = realloc(buffer, padded);
	}

	if (NULL == grown) {
		free(buffer);
		*size = 0;
		return NULL;
	}

	*size = padded;
	return grown;
}
69
70/*
71 * While there is a version of OpenDarwin for intel, the usual case is
72 * big-endian PPC.  So we need byte swapping to handle the
73 * little-endian byte order of the network protocol.  We also need an
74 * additional dynamic buffer to do this work for incoming data blocks,
75 * because we have to consider the original data as constant.
76 *
77 * We abstract the differences away by providing a simple facade with
78 * these functions/macros:
79 *
80 *	le_to_native(dst,src,len)
81 *	native_to_le(cp,len)
82 *	set_ucbuffer_with_le(buffer,bufsize,data,size)
83 *	set_ucbuffer_with_le_copy(buffer,bufsize,data,size,reserve)
84 */
85#ifdef WORDS_BIGENDIAN
86
/*
 * Copy `len` bytes from src to dst, exchanging the two bytes of each
 * 16-bit unit on the way.  The two regions must not overlap.
 *
 * Only complete byte pairs are swapped.  If `len` is odd, the final
 * byte is copied unchanged, so exactly `len` bytes are read and
 * written and neither buffer is accessed past its end.
 */
static inline void swap_bytes (char * dst, const char * src, size_t len)
{
	size_t pairs = len / 2;
	size_t i;

	for (i = 0; i < pairs; ++i) {
		dst[2*i]     = src[2*i + 1];
		dst[2*i + 1] = src[2*i];
	}
	if (len & 1) {
		dst[len - 1] = src[len - 1];
	}
}
/*
 * Exchange the two bytes of each 16-bit unit in the `len` bytes at cp.
 *
 * Only complete byte pairs are swapped.  A trailing odd byte is left
 * untouched, so nothing beyond cp[len-1] is ever read or written.
 */
static inline void swap_bytes_inplace (char * cp, size_t len)
{
	char *stop = cp + (len & ~(size_t)1);

	for (; cp < stop; cp += 2) {
		char first = cp[0];
		cp[0] = cp[1];
		cp[1] = first;
	}
}
108
/* Big-endian host: every conversion has to swap the bytes of each unit. */

/* Copy len bytes of little-endian data from src to dst in host order. */
#define le_to_native(dst,src,len) \
	swap_bytes((dst),(src),(len))

/* Turn len bytes of host-order data at cp into little-endian, in place. */
#define native_to_le(cp,len) \
	swap_bytes_inplace((cp),(len))

/* Host-order copy of the data in the dynamic buffer, no extra reserve. */
#define set_ucbuffer_with_le(buffer,bufsize,data,size) \
	set_ucbuffer_with_le_copy((buffer),(bufsize),(data),(size),0)
113
114#else	/* ! WORDS_BIGENDIAN */
115
/* Little-endian host: the wire format already is the host format. */

/* Copy len bytes from src to dst; no byte swapping is required. */
#define le_to_native(dst,src,len) \
	memcpy((dst),(src),(len))

/* Host order is little-endian already, so this does nothing. */
#define native_to_le(cp,len) \
	((void)0)

/*
 * No copy is made: the result is the input data itself, viewed as
 * UniChar.  The buffer and size arguments are not used.
 */
#define set_ucbuffer_with_le(buffer,bufsize,data,size) \
	(((void)(bufsize)),(UniChar*)(data))
120
121#endif
122
123static inline UniChar *set_ucbuffer_with_le_copy (
124	UniChar *buffer, size_t *bufsize,
125	const void *data, size_t size, size_t reserve)
126{
127	buffer = resize_buffer(buffer, bufsize, size+reserve);
128	le_to_native((char*)buffer,data,size);
129	return buffer;
130}
131
132
133/*
134 * A simple hexdump function for debugging error conditions.
135 */
/*
 * Emit a string at debug level 0.
 *
 * The text is handed to DEBUG() as the argument of a "%s" format
 * rather than as the format itself.  Some callers pass text that is
 * built from arbitrary data (the label and the dump lines of
 * hexdump()), and any '%' in it must be printed literally instead of
 * being interpreted as a conversion specifier.
 */
#define	debug_out(s)	DEBUG(0,("%s",(s)))
137
138#ifdef DEBUG_STRINGS
139
/*
 * Write a labelled hex/ASCII dump of the `len` bytes at `s` through
 * debug_out().
 *
 * The dump is framed by "<<<<<<<" and ">>>>>>>" lines.  Each dump
 * line shows the offset, up to 16 bytes in hex (in two groups of 8,
 * short lines are padded with blanks) and the same bytes as text,
 * with '.' standing in for anything that is not printable ASCII.
 */
static void hexdump( const char * label, const char * s, size_t len )
{
	size_t offset = 0;

	debug_out("<<<<<<<\n");
	debug_out(label);
	debug_out("\n");

	while (offset < len) {
		char line[100];
		char *out = line;
		size_t remaining = len - offset;
		size_t chunk = (remaining < 16) ? remaining : 16;
		size_t col;
#undef sprintf
		/* Offset column, followed by an extra separator blank. */
		out += sprintf(out, "%04X ", (unsigned)offset);
		*out++ = ' ';

		/* 16 hex columns with one extra blank between the halves. */
		for (col = 0; col < 16; ++col) {
			if (8 == col) {
				*out++ = ' ';
			}
			if (col < chunk) {
				out += sprintf(out, "%02X ",
					       ((unsigned)s[col]) & 0xFF);
			} else {
				out += sprintf(out, "   ");
			}
		}
		*out++ = ' ';

		/* Text column. */
		for (col = 0; col < chunk; ++col) {
			if (s[col] >= ' ' && s[col] < 0x7F && isprint(s[col]))
				*out++ = s[col];
			else
				*out++ = '.';
		}
		*out++ = '\n';
		*out = 0;

		debug_out(line);

		s += chunk;
		offset += chunk;
	}

	debug_out(">>>>>>>\n");
}
181
182#else	/* !DEBUG_STRINGS */
183
/* Debug output is disabled: a dump request does nothing. */
#define hexdump(label,s,len)	((void)0)
185
186#endif
187
188
189#if !USE_INTERNAL_API
190
191/*
192 * An implementation based on documented Mac OS X APIs.
193 *
194 * This does a certain amount of memory management, creating and
195 * manipulating CFString objects.  We try to minimize the impact by
196 * keeping those objects around and re-using them.  We also use
197 * external backing store for the CFStrings where this is possible and
 * beneficial.
 *
 * The Unicode normalization forms available at this level are
201 * generic, not specifically for the file system.  So they may not be
202 * perfect fits.
203 */
204static size_t macosxfs_encoding_pull(
205	void *cd,				/* Encoder handle */
206	char **inbuf, size_t *inbytesleft,	/* Script string */
207	char **outbuf, size_t *outbytesleft)	/* UTF-16-LE string */
208{
209	static const int script_code = kCFStringEncodingUTF8;
210	static CFMutableStringRef cfstring = NULL;
211	size_t outsize;
212	CFRange range;
213
214	(void) cd; /* UNUSED */
215
216	if (0 == *inbytesleft) {
217		return 0;
218	}
219
220	if (NULL == cfstring) {
221		/*
222		 * A version with an external backing store as in the
223		 * push function should have been more efficient, but
224		 * testing shows, that it is actually slower (!).
225		 * Maybe kCFAllocatorDefault gets shortcut evaluation
226		 * internally, while kCFAllocatorNull doesn't.
227		 */
228		cfstring = CFStringCreateMutable(kCFAllocatorDefault,0);
229	}
230
231	/*
232	 * Three methods of appending to a CFString, choose the most
233	 * efficient.
234	 */
235	if (0 == (*inbuf)[*inbytesleft-1]) {
236		CFStringAppendCString(cfstring, *inbuf, script_code);
237	} else if (*inbytesleft <= 255) {
238		Str255 buffer;
239		buffer[0] = *inbytesleft;
240		memcpy(buffer+1, *inbuf, buffer[0]);
241		CFStringAppendPascalString(cfstring, buffer, script_code);
242	} else {
243		/*
244		 * We would like to use a fixed buffer and a loop
245		 * here, but than we can't garantee that the input is
246		 * well-formed UTF-8, as we are supposed to do.
247		 */
248		static char *buffer = NULL;
249		static size_t buflen = 0;
250		buffer = resize_buffer(buffer, &buflen, *inbytesleft+1);
251		memcpy(buffer, *inbuf, *inbytesleft);
252		buffer[*inbytesleft] = 0;
253		CFStringAppendCString(cfstring, *inbuf, script_code);
254	}
255
256	/*
257	 * Compose characters, using the non-canonical composition
258	 * form.
259	 */
260	CFStringNormalize(cfstring, kCFStringNormalizationFormC);
261
262	outsize = CFStringGetLength(cfstring);
263	range = CFRangeMake(0,outsize);
264
265	if (outsize == 0) {
266		/*
267		 * HACK: smbd/mangle_hash2.c:is_legal_name() expects
268		 * errors here.  That function will always pass 2
269		 * characters.  smbd/open.c:check_for_pipe() cuts a
270		 * patchname to 10 characters blindly.  Suppress the
271		 * debug output in those cases.
272		 */
273		if(2 != *inbytesleft && 10 != *inbytesleft) {
274			debug_out("String conversion: "
275				  "An unknown error occurred\n");
276			hexdump("UTF8->UTF16LE (old) input",
277				*inbuf, *inbytesleft);
278		}
279		errno = EILSEQ; /* Not sure, but this is what we have
280				 * actually seen. */
281		return -1;
282	}
283	if (outsize*2 > *outbytesleft) {
284		CFStringDelete(cfstring, range);
285		debug_out("String conversion: "
286			  "Output buffer too small\n");
287		hexdump("UTF8->UTF16LE (old) input",
288			*inbuf, *inbytesleft);
289		errno = E2BIG;
290		return -1;
291	}
292
293        CFStringGetCharacters(cfstring, range, (UniChar*)*outbuf);
294	CFStringDelete(cfstring, range);
295
296	native_to_le(*outbuf, outsize*2);
297
298	/*
299	 * Add a converted null byte, if the CFString conversions
300	 * prevented that until now.
301	 */
302	if (0 == (*inbuf)[*inbytesleft-1] &&
303	    (0 != (*outbuf)[outsize*2-1] || 0 != (*outbuf)[outsize*2-2])) {
304
305		if ((outsize*2+2) > *outbytesleft) {
306			debug_out("String conversion: "
307				  "Output buffer too small\n");
308			hexdump("UTF8->UTF16LE (old) input",
309				*inbuf, *inbytesleft);
310			errno = E2BIG;
311			return -1;
312		}
313
314		(*outbuf)[outsize*2] = (*outbuf)[outsize*2+1] = 0;
315		outsize += 2;
316	}
317
318	*inbuf += *inbytesleft;
319	*inbytesleft = 0;
320	*outbuf += outsize*2;
321	*outbytesleft -= outsize*2;
322
323	return 0;
324}
325
326static size_t macosxfs_encoding_push(
327	void *cd,				/* Encoder handle */
328	char **inbuf, size_t *inbytesleft,	/* UTF-16-LE string */
329	char **outbuf, size_t *outbytesleft)	/* Script string */
330{
331	static const int script_code = kCFStringEncodingUTF8;
332	static CFMutableStringRef cfstring = NULL;
333	static UniChar *buffer = NULL;
334	static size_t buflen = 0;
335	CFIndex outsize, cfsize, charsconverted;
336
337	(void) cd; /* UNUSED */
338
339	if (0 == *inbytesleft) {
340		return 0;
341	}
342
343	/*
344	 * We need a buffer that can hold 4 times the original data,
345	 * because that is the theoretical maximum that decomposition
346	 * can create currently (in Unicode 4.0).
347	 */
348	buffer = set_ucbuffer_with_le_copy(
349		buffer, &buflen, *inbuf, *inbytesleft, 3 * *inbytesleft);
350
351	if (NULL == cfstring) {
352		cfstring = CFStringCreateMutableWithExternalCharactersNoCopy(
353			kCFAllocatorDefault,
354			buffer, *inbytesleft/2, buflen/2,
355			kCFAllocatorNull);
356	} else {
357		CFStringSetExternalCharactersNoCopy(
358			cfstring,
359			buffer, *inbytesleft/2, buflen/2);
360	}
361
362	/*
363	 * Decompose characters, using the non-canonical decomposition
364	 * form.
365	 *
366	 * NB: This isn't exactly what HFS+ wants (see note on
367	 * kCFStringEncodingUseHFSPlusCanonical in
368	 * CFStringEncodingConverter.h), but AFAIK it's the best that
369	 * the official API can do.
370	 */
371	CFStringNormalize(cfstring, kCFStringNormalizationFormD);
372
373	cfsize = CFStringGetLength(cfstring);
374	charsconverted = CFStringGetBytes(
375		cfstring, CFRangeMake(0,cfsize),
376		script_code, 0, False,
377		*outbuf, *outbytesleft, &outsize);
378
379	if (0 == charsconverted) {
380		debug_out("String conversion: "
381			  "Buffer too small or not convertable\n");
382		hexdump("UTF16LE->UTF8 (old) input",
383			*inbuf, *inbytesleft);
384		errno = EILSEQ; /* Probably more likely. */
385		return -1;
386	}
387
388	/*
389	 * Add a converted null byte, if the CFString conversions
390	 * prevented that until now.
391	 */
392	if (0 == (*inbuf)[*inbytesleft-1] && 0 == (*inbuf)[*inbytesleft-2] &&
393	    (0 != (*outbuf)[outsize-1])) {
394
395		if (((size_t)outsize+1) > *outbytesleft) {
396			debug_out("String conversion: "
397				  "Output buffer too small\n");
398			hexdump("UTF16LE->UTF8 (old) input",
399				*inbuf, *inbytesleft);
400			errno = E2BIG;
401			return -1;
402		}
403
404		(*outbuf)[outsize] = 0;
405		++outsize;
406	}
407
408	*inbuf += *inbytesleft;
409	*inbytesleft = 0;
410	*outbuf += outsize;
411	*outbytesleft -= outsize;
412
413	return 0;
414}
415
416#else /* USE_INTERNAL_API */
417
418/*
419 * An implementation based on internal code as known from the
420 * OpenDarwin CVS.
421 *
422 * This code doesn't need much memory management because it uses
423 * functions that operate on the raw memory directly.
424 *
425 * The push routine here is faster and more compatible with HFS+ than
426 * the other implementation above.  The pull routine is only faster
427 * for some strings, slightly slower for others.  The pull routine
 * loses because it has to iterate over the data twice, once to
 * decode UTF-8 and then to do the character composition required by
430 * Windows.
431 */
432static size_t macosxfs_encoding_pull(
433	void *cd,				/* Encoder handle */
434	char **inbuf, size_t *inbytesleft,	/* Script string */
435	char **outbuf, size_t *outbytesleft)	/* UTF-16-LE string */
436{
437	static const int script_code = kCFStringEncodingUTF8;
438	UInt32 srcCharsUsed = 0;
439	UInt32 dstCharsUsed = 0;
440	UInt32 result;
441	uint32_t dstDecomposedUsed = 0;
442	uint32_t dstPrecomposedUsed = 0;
443
444	(void) cd; /* UNUSED */
445
446	if (0 == *inbytesleft) {
447		return 0;
448	}
449
450        result = CFStringEncodingBytesToUnicode(
451		script_code, kCFStringEncodingComposeCombinings,
452		*inbuf, *inbytesleft, &srcCharsUsed,
453		(UniChar*)*outbuf, *outbytesleft, &dstCharsUsed);
454
455	switch(result) {
456	case kCFStringEncodingConversionSuccess:
457		if (*inbytesleft == srcCharsUsed)
458			break;
459		else
460			; /*fall through*/
461	case kCFStringEncodingInsufficientOutputBufferLength:
462		debug_out("String conversion: "
463			  "Output buffer too small\n");
464		hexdump("UTF8->UTF16LE (new) input",
465			*inbuf, *inbytesleft);
466		errno = E2BIG;
467		return -1;
468	case kCFStringEncodingInvalidInputStream:
469		/*
470		 * HACK: smbd/mangle_hash2.c:is_legal_name() expects
471		 * errors here.  That function will always pass 2
472		 * characters.  smbd/open.c:check_for_pipe() cuts a
473		 * patchname to 10 characters blindly.  Suppress the
474		 * debug output in those cases.
475		 */
476		if(2 != *inbytesleft && 10 != *inbytesleft) {
477			debug_out("String conversion: "
478				  "Invalid input sequence\n");
479			hexdump("UTF8->UTF16LE (new) input",
480				*inbuf, *inbytesleft);
481		}
482		errno = EILSEQ;
483		return -1;
484	case kCFStringEncodingConverterUnavailable:
485		debug_out("String conversion: "
486			  "Unknown encoding\n");
487		hexdump("UTF8->UTF16LE (new) input",
488			*inbuf, *inbytesleft);
489		errno = EINVAL;
490		return -1;
491	}
492
493	/*
494	 * It doesn't look like CFStringEncodingBytesToUnicode() can
495	 * produce precomposed characters (flags=ComposeCombinings
496	 * doesn't do it), so we need another pass over the data here.
497	 * We can do this in-place, as the string can only get
498	 * shorter.
499	 *
500	 * (Actually in theory there should be an internal
501	 * decomposition and reordering before the actual composition
502	 * step.  But we should be able to rely on that we always get
503	 * fully decomposed strings for input, so this can't create
504	 * problems in reality.)
505	 */
506	CFUniCharPrecompose(
507		(const UTF16Char *)*outbuf, dstCharsUsed, &dstDecomposedUsed,
508		(UTF16Char *)*outbuf, dstCharsUsed, &dstPrecomposedUsed);
509
510	native_to_le(*outbuf, dstPrecomposedUsed*2);
511
512	*inbuf += srcCharsUsed;
513	*inbytesleft -= srcCharsUsed;
514	*outbuf += dstPrecomposedUsed*2;
515	*outbytesleft -= dstPrecomposedUsed*2;
516
517	return 0;
518}
519
520static size_t macosxfs_encoding_push(
521	void *cd,				/* Encoder handle */
522	char **inbuf, size_t *inbytesleft,	/* UTF-16-LE string */
523	char **outbuf, size_t *outbytesleft)	/* Script string */
524{
525	static const int script_code = kCFStringEncodingUTF8;
526	static UniChar *buffer = NULL;
527	static size_t buflen = 0;
528	UInt32 srcCharsUsed=0, dstCharsUsed=0, result;
529
530	(void) cd; /* UNUSED */
531
532	if (0 == *inbytesleft) {
533		return 0;
534	}
535
536	buffer = set_ucbuffer_with_le(
537		buffer, &buflen, *inbuf, *inbytesleft);
538
539	result = CFStringEncodingUnicodeToBytes(
540		script_code, kCFStringEncodingUseHFSPlusCanonical,
541		buffer, *inbytesleft/2, &srcCharsUsed,
542		*outbuf, *outbytesleft, &dstCharsUsed);
543
544	switch(result) {
545	case kCFStringEncodingConversionSuccess:
546		if (*inbytesleft/2 == srcCharsUsed)
547			break;
548		else
549			; /*fall through*/
550	case kCFStringEncodingInsufficientOutputBufferLength:
551		debug_out("String conversion: "
552			  "Output buffer too small\n");
553		hexdump("UTF16LE->UTF8 (new) input",
554			*inbuf, *inbytesleft);
555		errno = E2BIG;
556		return -1;
557	case kCFStringEncodingInvalidInputStream:
558		/*
559		 * HACK: smbd/open.c:check_for_pipe():is_legal_name()
560		 * cuts a pathname to 10 characters blindly.  Suppress
561		 * the debug output in those cases.
562		 */
563		if(10 != *inbytesleft) {
564			debug_out("String conversion: "
565				  "Invalid input sequence\n");
566			hexdump("UTF16LE->UTF8 (new) input",
567				*inbuf, *inbytesleft);
568		}
569		errno = EILSEQ;
570		return -1;
571	case kCFStringEncodingConverterUnavailable:
572		debug_out("String conversion: "
573			  "Unknown encoding\n");
574		hexdump("UTF16LE->UTF8 (new) input",
575			*inbuf, *inbytesleft);
576		errno = EINVAL;
577		return -1;
578	}
579
580	*inbuf += srcCharsUsed*2;
581	*inbytesleft -= srcCharsUsed*2;
582	*outbuf += dstCharsUsed;
583	*outbytesleft -= dstCharsUsed;
584
585	return 0;
586}
587
588#endif /* USE_INTERNAL_API */
589
590/*
591 * For initialization, actually install the encoding as "macosxfs".
592 */
593static struct charset_functions macosxfs_encoding_functions = {
594	"MACOSXFS", macosxfs_encoding_pull, macosxfs_encoding_push
595};
596
597NTSTATUS init_module(void)
598{
599	return smb_register_charset(&macosxfs_encoding_functions);
600}
601
602/* eof */
603