/*===--- ConvertUTF.h - Universal Character Names conversions ---------------===
 *
 *                     The LLVM Compiler Infrastructure
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *
 *==------------------------------------------------------------------------==*/
/*
 * Copyright 2001-2004 Unicode, Inc.
 *
 * Disclaimer
 *
 * This source code is provided as is by Unicode, Inc. No claims are
 * made as to fitness for any particular purpose. No warranties of any
 * kind are expressed or implied. The recipient agrees to determine
 * applicability of information provided. If this file has been
 * purchased on magnetic or optical media from Unicode, Inc., the
 * sole remedy for any claim will be exchange of defective media
 * within 90 days of receipt.
 *
 * Limitations on Rights to Redistribute This Code
 *
 * Unicode, Inc. hereby grants the right to freely use the information
 * supplied in this file in the creation of products supporting the
 * Unicode Standard, and to make copies of this file in any form
 * for internal or external distribution as long as this notice
 * remains attached.
 */

/* ---------------------------------------------------------------------

    Conversions between UTF32, UTF-16, and UTF-8. Header file.

    Several functions are included here, forming a complete set of
    conversions between the three formats. UTF-7 is not included
    here, but is handled in a separate source file.

    Each of these routines takes pointers to input buffers and output
    buffers. The input buffers are const.

    Each routine converts the text between *sourceStart and sourceEnd,
    putting the result into the buffer between *targetStart and
    targetEnd. Note: the end pointers are *after* the last item: e.g.
    *(sourceEnd - 1) is the last item.

    The return result indicates whether the conversion was successful,
    and if not, whether the problem was in the source or target buffers.
    (Only the first encountered problem is indicated.)

    After the conversion, *sourceStart and *targetStart are both
    updated to point to the end of last text successfully converted in
    the respective buffers.

    Input parameters:
        sourceStart - pointer to a pointer to the source buffer.
                The contents of this are modified on return so that
                it points at the next thing to be converted.
        targetStart - similarly, pointer to pointer to the target buffer.
        sourceEnd, targetEnd - respectively pointers to the ends of the
                two buffers, for overflow checking only.

    These conversion functions take a ConversionFlags argument. When this
    flag is set to strict, both irregular sequences and isolated surrogates
    will cause an error. When the flag is set to lenient, both irregular
    sequences and isolated surrogates are converted.

    Whether the flag is strict or lenient, all illegal sequences will cause
    an error return. This includes sequences such as: <F4 90 80 80>, <C0 80>,
    or <A0> in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code
    must check for illegal sequences.

    When the flag is set to lenient, characters over 0x10FFFF are converted
    to the replacement character; otherwise (when the flag is set to strict)
    they constitute an error.

    Output parameters:
        The value "sourceIllegal" is returned from some routines if the input
        sequence is malformed. When "sourceIllegal" is returned, the source
        value will point to the illegal value that caused the problem. E.g.,
        in UTF-8 when a sequence is malformed, it points to the start of the
        malformed sequence.

    Author: Mark E. Davis, 1994.
    Rev History: Rick McGowan, fixes & updates May 2001.
                 Fixes & updates, Sept 2001.

------------------------------------------------------------------------ */

#ifndef LLVM_SUPPORT_CONVERTUTF_H
#define LLVM_SUPPORT_CONVERTUTF_H

/* ---------------------------------------------------------------------
    The following 4 definitions are compiler-specific.
    The C standard does not guarantee that wchar_t has at least
    16 bits, so wchar_t is no less portable than unsigned short!
    All should be unsigned values to avoid sign extension during
    bit mask & shift operations.
------------------------------------------------------------------------ */

typedef unsigned int    UTF32;   /* at least 32 bits */
typedef unsigned short  UTF16;   /* at least 16 bits */
typedef unsigned char   UTF8;    /* typically 8 bits */
typedef unsigned char   Boolean; /* 0 or 1 */

/* Some fundamental constants.
 *
 * Each expansion is fully parenthesized so that the cast cannot be split
 * apart by an adjacent operator at the point of use (for example
 * `sizeof UNI_MAX_BMP`). */
#define UNI_REPLACEMENT_CHAR ((UTF32)0x0000FFFD)
#define UNI_MAX_BMP          ((UTF32)0x0000FFFF)
#define UNI_MAX_UTF16        ((UTF32)0x0010FFFF)
#define UNI_MAX_UTF32        ((UTF32)0x7FFFFFFF)
#define UNI_MAX_LEGAL_UTF32  ((UTF32)0x0010FFFF)

#define UNI_MAX_UTF8_BYTES_PER_CODE_POINT 4

#define UNI_UTF16_BYTE_ORDER_MARK_NATIVE  0xFEFF
#define UNI_UTF16_BYTE_ORDER_MARK_SWAPPED 0xFFFE

/* Status reported by every conversion routine declared below. */
typedef enum {
  conversionOK,    /* conversion successful */
  sourceExhausted, /* partial character in source, but hit end */
  targetExhausted, /* insuff. room in target for conversion */
  sourceIllegal    /* source sequence is illegal/malformed */
} ConversionResult;

/* Strictness selector for the conversion routines; see the description of
 * the ConversionFlags argument in the comment at the top of this file. */
typedef enum {
  strictConversion = 0,
  lenientConversion
} ConversionFlags;

/* This is for C++ and does no harm in C */
#ifdef __cplusplus
extern "C" {
#endif

/* The six conversion routines below all follow the buffer and return-value
 * contract described in the comment at the top of this file. */

ConversionResult ConvertUTF8toUTF16 (
  const UTF8** sourceStart, const UTF8* sourceEnd,
  UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags);

ConversionResult ConvertUTF8toUTF32 (
  const UTF8** sourceStart, const UTF8* sourceEnd,
  UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);

ConversionResult ConvertUTF16toUTF8 (
  const UTF16** sourceStart, const UTF16* sourceEnd,
  UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags);

ConversionResult ConvertUTF32toUTF8 (
  const UTF32** sourceStart, const UTF32* sourceEnd,
  UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags);

ConversionResult ConvertUTF16toUTF32 (
  const UTF16** sourceStart, const UTF16* sourceEnd,
  UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);

ConversionResult ConvertUTF32toUTF16 (
  const UTF32** sourceStart, const UTF32* sourceEnd,
  UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags);

Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd);

Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd);

unsigned getNumBytesForUTF8(UTF8 firstByte);

#ifdef __cplusplus
}

/*************************************************************************/
/* Below are LLVM-specific wrappers of the functions above. */

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringRef.h"

/* Included explicitly rather than relied upon transitively:
 * std::ptrdiff_t is used by convertUTF8Sequence and std::string by
 * convertUTF16ToUTF8String. */
#include <cstddef>
#include <string>

namespace llvm {

/**
 * Convert a UTF8 StringRef to UTF8, UTF16, or UTF32 depending on
 * WideCharWidth. The converted data is written to ResultPtr, which needs to
 * point to at least WideCharWidth * (Source.size() + 1) bytes. On success,
 * ResultPtr will point one after the end of the copied string. On failure,
 * ResultPtr will not be changed, and ErrorPtr will be set to the location of
 * the first character which could not be converted.
 * \return true on success.
 */
bool ConvertUTF8toWide(unsigned WideCharWidth, llvm::StringRef Source,
                       char *&ResultPtr, const UTF8 *&ErrorPtr);

/**
 * Convert a Unicode code point to a UTF8 sequence.
 *
 * \param Source a Unicode code point.
 * \param [in,out] ResultPtr pointer to the output buffer, needs to be at least
 * \c UNI_MAX_UTF8_BYTES_PER_CODE_POINT bytes. On success \c ResultPtr is
 * updated one past end of the converted sequence.
 *
 * \returns true on success.
 */
bool ConvertCodePointToUTF8(unsigned Source, char *&ResultPtr);

/**
 * Convert the first UTF8 sequence in the given source buffer to a UTF32
 * code point.
 *
 * \param [in,out] source A pointer to the source buffer. If the conversion
 * succeeds, this pointer will be updated to point to the byte just past the
 * end of the converted sequence.
 * \param sourceEnd A pointer just past the end of the source buffer.
 * \param [out] target The converted code
 * \param flags Whether the conversion is strict or lenient.
 *
 * \returns conversionOK on success. Returns sourceExhausted without calling
 * the converter when the buffer is empty, or when the sequence length
 * reported by getNumBytesForUTF8 for the first byte exceeds the number of
 * bytes remaining. Otherwise returns whatever ConvertUTF8toUTF32 returns.
 *
 * \sa ConvertUTF8toUTF32
 */
static inline ConversionResult convertUTF8Sequence(const UTF8 **source,
                                                   const UTF8 *sourceEnd,
                                                   UTF32 *target,
                                                   ConversionFlags flags) {
  if (*source == sourceEnd)
    return sourceExhausted;
  unsigned size = getNumBytesForUTF8(**source);
  /* Compare as signed pointer differences so the bound check does not mix
   * signed and unsigned operands. */
  if (static_cast<std::ptrdiff_t>(size) > sourceEnd - *source)
    return sourceExhausted;
  /* Restrict the converter to exactly one sequence in and one code point
   * out. */
  return ConvertUTF8toUTF32(source, *source + size, &target, target + 1, flags);
}

/**
 * Returns true if a blob of text starts with a UTF-16 big or little endian byte
 * order mark.
 */
bool hasUTF16ByteOrderMark(ArrayRef<char> SrcBytes);

/**
 * Converts a stream of raw bytes assumed to be UTF16 into a UTF8 std::string.
 *
 * \param [in] SrcBytes A buffer of what is assumed to be UTF-16 encoded text.
 * \param [out] Out Converted UTF-8 is stored here on success.
 * \returns true on success
 */
bool convertUTF16ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out);

} /* end namespace llvm */

#endif /* __cplusplus */

/* --------------------------------------------------------------------- */

#endif /* LLVM_SUPPORT_CONVERTUTF_H */