1/* 2 * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28 29#ifndef _SYS_UTFCONV_H_ 30#define _SYS_UTFCONV_H_ 31 32#include <sys/appleapiopts.h> 33#include <sys/cdefs.h> 34 35#ifdef KERNEL 36#ifdef __APPLE_API_UNSTABLE 37 38/* 39 * UTF-8 encode/decode flags 40 */ 41#define UTF_REVERSE_ENDIAN 0x0001 /* reverse UCS-2 byte order */ 42#define UTF_NO_NULL_TERM 0x0002 /* do not add null termination */ 43#define UTF_DECOMPOSED 0x0004 /* generate fully decomposed UCS-2 */ 44#define UTF_PRECOMPOSED 0x0008 /* generate precomposed UCS-2 */ 45#define UTF_ESCAPE_ILLEGAL 0x0010 /* escape illegal UTF-8 */ 46#define UTF_SFM_CONVERSIONS 0x0020 /* Use SFM mappings for illegal NTFS chars */ 47 48#define UTF_BIG_ENDIAN \ 49 ((BYTE_ORDER == BIG_ENDIAN) ? 0 : UTF_REVERSE_ENDIAN) 50 51#define UTF_LITTLE_ENDIAN \ 52 ((BYTE_ORDER == LITTLE_ENDIAN) ? 0 : UTF_REVERSE_ENDIAN) 53 54__BEGIN_DECLS 55 56 57/* 58 * unicode_combinable - Test for a combining unicode character. 59 * 60 * This function is similar to __CFUniCharIsNonBaseCharacter except 61 * that it also includes Hangul Jamo characters. 62 */ 63 64int unicode_combinable(u_int16_t character); 65 66/* 67 * Test for a precomposed character. 68 * 69 * Similar to __CFUniCharIsDecomposableCharacter. 70 */ 71 72int unicode_decomposeable(u_int16_t character); 73 74 75/* 76 * utf8_encodelen - Calculate the UTF-8 encoding length 77 * 78 * This function takes an Unicode input string, ucsp, of ucslen bytes 79 * and calculates the size of the UTF-8 output in bytes (not including 80 * a NULL termination byte). The string must reside in kernel memory. 81 * 82 * FLAGS 83 * UTF_REVERSE_ENDIAN: Unicode byte order is opposite current runtime 84 * 85 * UTF_BIG_ENDIAN: Unicode byte order is always big endian 86 * 87 * UTF_LITTLE_ENDIAN: Unicode byte order is always little endian 88 * 89 * UTF_DECOMPOSED: assume fully decomposed output 90 * 91 * ERRORS 92 * None 93 */ 94size_t 95utf8_encodelen(const u_int16_t * ucsp, size_t ucslen, u_int16_t altslash, 96 int flags); 97 98 99/* 100 * utf8_encodestr - Encodes a Unicode string into UTF-8 101 * 102 * This function takes an Unicode input string, ucsp, of ucslen bytes 103 * and produces the UTF-8 output into a buffer of buflen bytes pointed 104 * to by utf8p. The size of the output in bytes (not including a NULL 105 * termination byte) is returned in utf8len. The UTF-8 string output 106 * is NULL terminated. Both buffers must reside in kernel memory. 107 * 108 * If '/' chars are possible in the Unicode input then an alternate 109 * (replacement) char must be provided in altslash. 110 * 111 * FLAGS 112 * UTF_REVERSE_ENDIAN: Unicode byte order is opposite current runtime 113 * 114 * UTF_BIG_ENDIAN: Unicode byte order is always big endian 115 * 116 * UTF_LITTLE_ENDIAN: Unicode byte order is always little endian 117 * 118 * UTF_NO_NULL_TERM: do not add null termination to output string 119 * 120 * UTF_DECOMPOSED: generate fully decomposed output 121 * 122 * ERRORS 123 * ENAMETOOLONG: output did not fit; only utf8len bytes were encoded 124 * 125 * EINVAL: illegal Unicode char encountered 126 */ 127int 128utf8_encodestr(const u_int16_t * ucsp, size_t ucslen, u_int8_t * utf8p, 129 size_t * utf8len, size_t buflen, u_int16_t altslash, int flags); 130 131 132/* 133 * utf8_decodestr - Decodes a UTF-8 string into Unicode 134 * 135 * This function takes an UTF-8 input string, utf8p, of utf8len bytes 136 * and produces the Unicode output into a buffer of buflen bytes pointed 137 * to by ucsp. The size of the output in bytes (not including a NULL 138 * termination byte) is returned in ucslen. Both buffers must reside 139 * in kernel memory. 140 * 141 * If '/' chars are allowed in the Unicode output then an alternate 142 * (replacement) char must be provided in altslash. 143 * 144 * FLAGS 145 * UTF_REV_ENDIAN: Unicode byte order is opposite current runtime 146 * 147 * UTF_BIG_ENDIAN: Unicode byte order is always big endian 148 * 149 * UTF_LITTLE_ENDIAN: Unicode byte order is always little endian 150 * 151 * UTF_DECOMPOSED: generate fully decomposed output (NFD) 152 * 153 * UTF_PRECOMPOSED: generate precomposed output (NFC) 154 * 155 * UTF_ESCAPE_ILLEGAL: percent escape any illegal UTF-8 input 156 * 157 * ERRORS 158 * ENAMETOOLONG: output did not fit; only ucslen bytes were decoded. 159 * 160 * EINVAL: illegal UTF-8 sequence encountered. 161 */ 162int 163utf8_decodestr(const u_int8_t* utf8p, size_t utf8len, u_int16_t* ucsp, 164 size_t *ucslen, size_t buflen, u_int16_t altslash, int flags); 165 166 167/* 168 * utf8_normalizestr - Normalize a UTF-8 string (NFC or NFD) 169 * 170 * This function takes an UTF-8 input string, instr, of inlen bytes 171 * and produces normalized UTF-8 output into a buffer of buflen bytes 172 * pointed to by outstr. The size of the output in bytes (not including 173 * a NULL termination byte) is returned in outlen. In-place conversions 174 * are not supported (i.e. instr != outstr). Both buffers must reside 175 * in kernel memory. 176 * 177 * FLAGS 178 * UTF_DECOMPOSED: output string will be fully decomposed (NFD) 179 * 180 * UTF_PRECOMPOSED: output string will be precomposed (NFC) 181 * 182 * UTF_NO_NULL_TERM: do not add null termination to output string 183 * 184 * UTF_ESCAPE_ILLEGAL: percent escape any illegal UTF-8 input 185 * 186 * ERRORS 187 * ENAMETOOLONG: output did not fit or input exceeded MAXPATHLEN bytes 188 * 189 * EINVAL: illegal UTF-8 sequence encountered or invalid flags 190 */ 191int 192utf8_normalizestr(const u_int8_t* instr, size_t inlen, u_int8_t* outstr, 193 size_t *outlen, size_t buflen, int flags); 194 195 196/* 197 * utf8_validatestr - validates a UTF-8 string 198 * 199 * This function takes an UTF-8 input string, utf8p, of utf8len bytes 200 * and determines if its valid UTF-8. The string must reside in kernel 201 * memory. 202 * 203 * ERRORS 204 * EINVAL: illegal UTF-8 sequence encountered. 205 */ 206int 207utf8_validatestr(const u_int8_t* utf8p, size_t utf8len); 208 209 210__END_DECLS 211 212#endif /* __APPLE_API_UNSTABLE */ 213#endif /* KERNEL */ 214 215#endif /* !_SYS_UTFCONV_H_ */ 216