1249259Sdim/*===--- ConvertUTF.h - Universal Character Names conversions ---------------===
2249259Sdim *
3249259Sdim *                     The LLVM Compiler Infrastructure
4249259Sdim *
5249259Sdim * This file is distributed under the University of Illinois Open Source
6249259Sdim * License. See LICENSE.TXT for details.
7249259Sdim *
8249259Sdim *==------------------------------------------------------------------------==*/
9249259Sdim/*
10249259Sdim * Copyright 2001-2004 Unicode, Inc.
11249259Sdim *
12249259Sdim * Disclaimer
13249259Sdim *
14249259Sdim * This source code is provided as is by Unicode, Inc. No claims are
15249259Sdim * made as to fitness for any particular purpose. No warranties of any
16249259Sdim * kind are expressed or implied. The recipient agrees to determine
17249259Sdim * applicability of information provided. If this file has been
18249259Sdim * purchased on magnetic or optical media from Unicode, Inc., the
19249259Sdim * sole remedy for any claim will be exchange of defective media
20249259Sdim * within 90 days of receipt.
21249259Sdim *
22249259Sdim * Limitations on Rights to Redistribute This Code
23249259Sdim *
24249259Sdim * Unicode, Inc. hereby grants the right to freely use the information
25249259Sdim * supplied in this file in the creation of products supporting the
26249259Sdim * Unicode Standard, and to make copies of this file in any form
27249259Sdim * for internal or external distribution as long as this notice
28249259Sdim * remains attached.
29249259Sdim */
30249259Sdim
31249259Sdim/* ---------------------------------------------------------------------
32249259Sdim
    Conversions between UTF-32, UTF-16, and UTF-8.  Header file.

    Several functions are included here, forming a complete set of
36249259Sdim    conversions between the three formats.  UTF-7 is not included
37249259Sdim    here, but is handled in a separate source file.
38249259Sdim
39249259Sdim    Each of these routines takes pointers to input buffers and output
40249259Sdim    buffers.  The input buffers are const.
41249259Sdim
42249259Sdim    Each routine converts the text between *sourceStart and sourceEnd,
43249259Sdim    putting the result into the buffer between *targetStart and
44249259Sdim    targetEnd. Note: the end pointers are *after* the last item: e.g.
45249259Sdim    *(sourceEnd - 1) is the last item.
46249259Sdim
47249259Sdim    The return result indicates whether the conversion was successful,
48249259Sdim    and if not, whether the problem was in the source or target buffers.
49249259Sdim    (Only the first encountered problem is indicated.)
50249259Sdim
51249259Sdim    After the conversion, *sourceStart and *targetStart are both
52249259Sdim    updated to point to the end of last text successfully converted in
53249259Sdim    the respective buffers.
54249259Sdim
55249259Sdim    Input parameters:
56249259Sdim        sourceStart - pointer to a pointer to the source buffer.
57249259Sdim                The contents of this are modified on return so that
58249259Sdim                it points at the next thing to be converted.
59249259Sdim        targetStart - similarly, pointer to pointer to the target buffer.
60249259Sdim        sourceEnd, targetEnd - respectively pointers to the ends of the
61249259Sdim                two buffers, for overflow checking only.
62249259Sdim
63249259Sdim    These conversion functions take a ConversionFlags argument. When this
64249259Sdim    flag is set to strict, both irregular sequences and isolated surrogates
65249259Sdim    will cause an error.  When the flag is set to lenient, both irregular
66249259Sdim    sequences and isolated surrogates are converted.
67249259Sdim
68249259Sdim    Whether the flag is strict or lenient, all illegal sequences will cause
69249259Sdim    an error return. This includes sequences such as: <F4 90 80 80>, <C0 80>,
70249259Sdim    or <A0> in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code
71249259Sdim    must check for illegal sequences.
72249259Sdim
73249259Sdim    When the flag is set to lenient, characters over 0x10FFFF are converted
74249259Sdim    to the replacement character; otherwise (when the flag is set to strict)
75249259Sdim    they constitute an error.
76249259Sdim
77249259Sdim    Output parameters:
78249259Sdim        The value "sourceIllegal" is returned from some routines if the input
79249259Sdim        sequence is malformed.  When "sourceIllegal" is returned, the source
80249259Sdim        value will point to the illegal value that caused the problem. E.g.,
81249259Sdim        in UTF-8 when a sequence is malformed, it points to the start of the
82249259Sdim        malformed sequence.
83249259Sdim
84249259Sdim    Author: Mark E. Davis, 1994.
85249259Sdim    Rev History: Rick McGowan, fixes & updates May 2001.
86249259Sdim         Fixes & updates, Sept 2001.
87249259Sdim
88249259Sdim------------------------------------------------------------------------ */
89249259Sdim
90263508Sdim#ifndef LLVM_SUPPORT_CONVERTUTF_H
91263508Sdim#define LLVM_SUPPORT_CONVERTUTF_H
92249259Sdim
93249259Sdim/* ---------------------------------------------------------------------
94249259Sdim    The following 4 definitions are compiler-specific.
95249259Sdim    The C standard does not guarantee that wchar_t has at least
96249259Sdim    16 bits, so wchar_t is no less portable than unsigned short!
97249259Sdim    All should be unsigned values to avoid sign extension during
98249259Sdim    bit mask & shift operations.
99249259Sdim------------------------------------------------------------------------ */
100249259Sdim
typedef unsigned int    UTF32;   /* at least 32 bits */
typedef unsigned short  UTF16;   /* at least 16 bits */
typedef unsigned char   UTF8;    /* typically 8 bits */
typedef unsigned char   Boolean; /* 0 or 1 */

/*
 * Some fundamental constants.
 *
 * Every cast-valued expansion is wrapped in its own parentheses so that the
 * macro behaves as a single primary expression wherever it is used; without
 * them, e.g. `sizeof UNI_MAX_BMP` would parse as `sizeof(UTF32)` followed by
 * a stray literal.
 */
#define UNI_REPLACEMENT_CHAR ((UTF32)0x0000FFFD)
#define UNI_MAX_BMP ((UTF32)0x0000FFFF)
#define UNI_MAX_UTF16 ((UTF32)0x0010FFFF)
#define UNI_MAX_UTF32 ((UTF32)0x7FFFFFFF)
#define UNI_MAX_LEGAL_UTF32 ((UTF32)0x0010FFFF)

/* Size, in bytes, of the buffer needed to hold one code point as UTF-8. */
#define UNI_MAX_UTF8_BYTES_PER_CODE_POINT 4

/* UTF-16 byte order mark as read in native order, and as read byte-swapped. */
#define UNI_UTF16_BYTE_ORDER_MARK_NATIVE  0xFEFF
#define UNI_UTF16_BYTE_ORDER_MARK_SWAPPED 0xFFFE
117263508Sdim
/* Status code returned by the conversion routines. */
typedef enum {
  conversionOK    = 0,    /* the conversion completed successfully */
  sourceExhausted = 1,    /* source ended in the middle of a character */
  targetExhausted = 2,    /* target buffer too small for the conversion */
  sourceIllegal   = 3     /* source contains an illegal/malformed sequence */
} ConversionResult;
124249259Sdim
/* Selects how the conversion routines treat irregular input. */
typedef enum {
  strictConversion  = 0,  /* irregular sequences and isolated surrogates are errors */
  lenientConversion = 1   /* irregular sequences and isolated surrogates are converted */
} ConversionFlags;
129249259Sdim
130249259Sdim/* This is for C++ and does no harm in C */
131249259Sdim#ifdef __cplusplus
132249259Sdimextern "C" {
133249259Sdim#endif
134249259Sdim
135249259SdimConversionResult ConvertUTF8toUTF16 (
136249259Sdim  const UTF8** sourceStart, const UTF8* sourceEnd,
137249259Sdim  UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags);
138249259Sdim
139249259SdimConversionResult ConvertUTF8toUTF32 (
140249259Sdim  const UTF8** sourceStart, const UTF8* sourceEnd,
141249259Sdim  UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);
142249259Sdim
143249259SdimConversionResult ConvertUTF16toUTF8 (
144249259Sdim  const UTF16** sourceStart, const UTF16* sourceEnd,
145249259Sdim  UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags);
146249259Sdim
147249259SdimConversionResult ConvertUTF32toUTF8 (
148249259Sdim  const UTF32** sourceStart, const UTF32* sourceEnd,
149249259Sdim  UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags);
150249259Sdim
151249259SdimConversionResult ConvertUTF16toUTF32 (
152249259Sdim  const UTF16** sourceStart, const UTF16* sourceEnd,
153249259Sdim  UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);
154249259Sdim
155249259SdimConversionResult ConvertUTF32toUTF16 (
156249259Sdim  const UTF32** sourceStart, const UTF32* sourceEnd,
157249259Sdim  UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags);
158249259Sdim
159249259SdimBoolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd);
160249259Sdim
161249259SdimBoolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd);
162249259Sdim
163249259Sdimunsigned getNumBytesForUTF8(UTF8 firstByte);
164249259Sdim
165249259Sdim#ifdef __cplusplus
166249259Sdim}
167249259Sdim
168249259Sdim/*************************************************************************/
169249259Sdim/* Below are LLVM-specific wrappers of the functions above. */
170249259Sdim
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringRef.h"
#include <cstddef>
#include <string>
173249259Sdim
174249259Sdimnamespace llvm {
175249259Sdim
176249259Sdim/**
177249259Sdim * Convert an UTF8 StringRef to UTF8, UTF16, or UTF32 depending on
178249259Sdim * WideCharWidth. The converted data is written to ResultPtr, which needs to
179249259Sdim * point to at least WideCharWidth * (Source.Size() + 1) bytes. On success,
180249259Sdim * ResultPtr will point one after the end of the copied string. On failure,
181249259Sdim * ResultPtr will not be changed, and ErrorPtr will be set to the location of
182249259Sdim * the first character which could not be converted.
183249259Sdim * \return true on success.
184249259Sdim */
185249259Sdimbool ConvertUTF8toWide(unsigned WideCharWidth, llvm::StringRef Source,
186249259Sdim                       char *&ResultPtr, const UTF8 *&ErrorPtr);
187249259Sdim
/**
 * Encode a single Unicode code point as a UTF8 sequence.
 *
 * \param Source The Unicode code point to encode.
 * \param [in,out] ResultPtr Output buffer with room for at least
 * \c UNI_MAX_UTF8_BYTES_PER_CODE_POINT bytes. On success \c ResultPtr is
 * moved to one past the end of the sequence that was written.
 *
 * \returns true on success.
 */
bool ConvertCodePointToUTF8(unsigned Source, char *&ResultPtr);
199249259Sdim
200249259Sdim/**
201249259Sdim * Convert the first UTF8 sequence in the given source buffer to a UTF32
202249259Sdim * code point.
203249259Sdim *
204249259Sdim * \param [in,out] source A pointer to the source buffer. If the conversion
205249259Sdim * succeeds, this pointer will be updated to point to the byte just past the
206249259Sdim * end of the converted sequence.
207249259Sdim * \param sourceEnd A pointer just past the end of the source buffer.
208249259Sdim * \param [out] target The converted code
209249259Sdim * \param flags Whether the conversion is strict or lenient.
210249259Sdim *
211249259Sdim * \returns conversionOK on success
212249259Sdim *
213249259Sdim * \sa ConvertUTF8toUTF32
214249259Sdim */
215249259Sdimstatic inline ConversionResult convertUTF8Sequence(const UTF8 **source,
216249259Sdim                                                   const UTF8 *sourceEnd,
217249259Sdim                                                   UTF32 *target,
218249259Sdim                                                   ConversionFlags flags) {
219249259Sdim  if (*source == sourceEnd)
220249259Sdim    return sourceExhausted;
221249259Sdim  unsigned size = getNumBytesForUTF8(**source);
222249259Sdim  if ((ptrdiff_t)size > sourceEnd - *source)
223249259Sdim    return sourceExhausted;
224249259Sdim  return ConvertUTF8toUTF32(source, *source + size, &target, target + 1, flags);
225249259Sdim}
226263508Sdim
227263508Sdim/**
228263508Sdim * Returns true if a blob of text starts with a UTF-16 big or little endian byte
229263508Sdim * order mark.
230263508Sdim */
231263508Sdimbool hasUTF16ByteOrderMark(ArrayRef<char> SrcBytes);
232263508Sdim
233263508Sdim/**
234263508Sdim * Converts a stream of raw bytes assumed to be UTF16 into a UTF8 std::string.
235263508Sdim *
236263508Sdim * \param [in] SrcBytes A buffer of what is assumed to be UTF-16 encoded text.
237263508Sdim * \param [out] Out Converted UTF-8 is stored here on success.
238263508Sdim * \returns true on success
239263508Sdim */
240263508Sdimbool convertUTF16ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out);
241263508Sdim
242249259Sdim} /* end namespace llvm */
243249259Sdim
244249259Sdim#endif
245249259Sdim
246249259Sdim/* --------------------------------------------------------------------- */
247249259Sdim
248249259Sdim#endif
249