llvm/Support/ConvertUTF.h

249259Sdim/*===--- ConvertUTF.h - Universal Character Names conversions ---------------===
249259Sdim *
249259Sdim *                     The LLVM Compiler Infrastructure
249259Sdim *
249259Sdim * This file is distributed under the University of Illinois Open Source
249259Sdim * License. See LICENSE.TXT for details.
249259Sdim *
249259Sdim *==------------------------------------------------------------------------==*/
249259Sdim/*
249259Sdim * Copyright 2001-2004 Unicode, Inc.
249259Sdim *
249259Sdim * Disclaimer
249259Sdim *
249259Sdim * This source code is provided as is by Unicode, Inc. No claims are
249259Sdim * made as to fitness for any particular purpose. No warranties of any
249259Sdim * kind are expressed or implied. The recipient agrees to determine
249259Sdim * applicability of information provided. If this file has been
249259Sdim * purchased on magnetic or optical media from Unicode, Inc., the
249259Sdim * sole remedy for any claim will be exchange of defective media
249259Sdim * within 90 days of receipt.
249259Sdim *
249259Sdim * Limitations on Rights to Redistribute This Code
249259Sdim *
249259Sdim * Unicode, Inc. hereby grants the right to freely use the information
249259Sdim * supplied in this file in the creation of products supporting the
249259Sdim * Unicode Standard, and to make copies of this file in any form
249259Sdim * for internal or external distribution as long as this notice
249259Sdim * remains attached.
249259Sdim */
249259Sdim
249259Sdim/* ---------------------------------------------------------------------
249259Sdim
249259Sdim    Conversions between UTF32, UTF-16, and UTF-8.  Header file.
249259Sdim
249259Sdim    Several functions are included here, forming a complete set of
249259Sdim    conversions between the three formats.  UTF-7 is not included
249259Sdim    here, but is handled in a separate source file.
249259Sdim
249259Sdim    Each of these routines takes pointers to input buffers and output
249259Sdim    buffers.  The input buffers are const.
249259Sdim
249259Sdim    Each routine converts the text between *sourceStart and sourceEnd,
249259Sdim    putting the result into the buffer between *targetStart and
249259Sdim    targetEnd. Note: the end pointers are *after* the last item: e.g.
249259Sdim    *(sourceEnd - 1) is the last item.
249259Sdim
249259Sdim    The return result indicates whether the conversion was successful,
249259Sdim    and if not, whether the problem was in the source or target buffers.
249259Sdim    (Only the first encountered problem is indicated.)
249259Sdim
249259Sdim    After the conversion, *sourceStart and *targetStart are both
249259Sdim    updated to point to the end of last text successfully converted in
249259Sdim    the respective buffers.
249259Sdim
249259Sdim    Input parameters:
249259Sdim        sourceStart - pointer to a pointer to the source buffer.
249259Sdim                The contents of this are modified on return so that
249259Sdim                it points at the next thing to be converted.
249259Sdim        targetStart - similarly, pointer to pointer to the target buffer.
249259Sdim        sourceEnd, targetEnd - respectively pointers to the ends of the
249259Sdim                two buffers, for overflow checking only.
249259Sdim
249259Sdim    These conversion functions take a ConversionFlags argument. When this
249259Sdim    flag is set to strict, both irregular sequences and isolated surrogates
249259Sdim    will cause an error.  When the flag is set to lenient, both irregular
249259Sdim    sequences and isolated surrogates are converted.
249259Sdim
249259Sdim    Whether the flag is strict or lenient, all illegal sequences will cause
249259Sdim    an error return. This includes sequences such as: <F4 90 80 80>, <C0 80>,
249259Sdim    or <A0> in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code
249259Sdim    must check for illegal sequences.
249259Sdim
249259Sdim    When the flag is set to lenient, characters over 0x10FFFF are converted
249259Sdim    to the replacement character; otherwise (when the flag is set to strict)
249259Sdim    they constitute an error.
249259Sdim
249259Sdim    Output parameters:
249259Sdim        The value "sourceIllegal" is returned from some routines if the input
249259Sdim        sequence is malformed.  When "sourceIllegal" is returned, the source
249259Sdim        value will point to the illegal value that caused the problem. E.g.,
249259Sdim        in UTF-8 when a sequence is malformed, it points to the start of the
249259Sdim        malformed sequence.
249259Sdim
249259Sdim    Author: Mark E. Davis, 1994.
249259Sdim    Rev History: Rick McGowan, fixes & updates May 2001.
249259Sdim         Fixes & updates, Sept 2001.
249259Sdim
249259Sdim------------------------------------------------------------------------ */
249259Sdim
263508Sdim#ifndef LLVM_SUPPORT_CONVERTUTF_H
263508Sdim#define LLVM_SUPPORT_CONVERTUTF_H
249259Sdim
249259Sdim/* ---------------------------------------------------------------------
249259Sdim    The following 4 definitions are compiler-specific.
249259Sdim    The C standard does not guarantee that wchar_t has at least
249259Sdim    16 bits, so wchar_t is no less portable than unsigned short!
249259Sdim    All should be unsigned values to avoid sign extension during
249259Sdim    bit mask & shift operations.
249259Sdim------------------------------------------------------------------------ */
249259Sdim
typedef unsigned int    UTF32;   /* at least 32 bits */
typedef unsigned short  UTF16;   /* at least 16 bits */
typedef unsigned char   UTF8;    /* typically 8 bits */
typedef unsigned char   Boolean; /* 0 or 1 */

/*
 * Some fundamental constants.
 *
 * Each cast-bearing expansion is wrapped in its own pair of parentheses so
 * that the macro always acts as one primary expression, no matter which
 * operator the caller writes next to it (e.g. `sizeof UNI_MAX_BMP`, or a
 * postfix operator, which would otherwise bind tighter than the cast).
 */
#define UNI_REPLACEMENT_CHAR ((UTF32)0x0000FFFD)
#define UNI_MAX_BMP ((UTF32)0x0000FFFF)
#define UNI_MAX_UTF16 ((UTF32)0x0010FFFF)
#define UNI_MAX_UTF32 ((UTF32)0x7FFFFFFF)
#define UNI_MAX_LEGAL_UTF32 ((UTF32)0x0010FFFF)

/* Upper bound on the number of UTF-8 bytes one code point can occupy. */
#define UNI_MAX_UTF8_BYTES_PER_CODE_POINT 4

/* UTF-16 byte order mark: as-is, and with its two bytes exchanged. */
#define UNI_UTF16_BYTE_ORDER_MARK_NATIVE  0xFEFF
#define UNI_UTF16_BYTE_ORDER_MARK_SWAPPED 0xFFFE
263508Sdim
/* Status code returned by the conversion routines. */
typedef enum {
  conversionOK    = 0,  /* everything was converted */
  sourceExhausted = 1,  /* source ended part-way through a character */
  targetExhausted = 2,  /* target buffer has too little room left */
  sourceIllegal   = 3   /* source holds an illegal/malformed sequence */
} ConversionResult;
249259Sdim
/*
 * Strictness selector accepted by the conversion routines.  Under strict
 * conversion, irregular sequences and isolated surrogates are errors; under
 * lenient conversion they are converted.
 */
typedef enum {
  strictConversion  = 0,
  lenientConversion = 1
} ConversionFlags;
249259Sdim
249259Sdim/* This is for C++ and does no harm in C */
249259Sdim#ifdef __cplusplus
249259Sdimextern "C" {
249259Sdim#endif
249259Sdim
249259SdimConversionResult ConvertUTF8toUTF16 (
249259Sdim  const UTF8** sourceStart, const UTF8* sourceEnd,
249259Sdim  UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags);
249259Sdim
249259SdimConversionResult ConvertUTF8toUTF32 (
249259Sdim  const UTF8** sourceStart, const UTF8* sourceEnd,
249259Sdim  UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);
249259Sdim
249259SdimConversionResult ConvertUTF16toUTF8 (
249259Sdim  const UTF16** sourceStart, const UTF16* sourceEnd,
249259Sdim  UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags);
249259Sdim
249259SdimConversionResult ConvertUTF32toUTF8 (
249259Sdim  const UTF32** sourceStart, const UTF32* sourceEnd,
249259Sdim  UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags);
249259Sdim
249259SdimConversionResult ConvertUTF16toUTF32 (
249259Sdim  const UTF16** sourceStart, const UTF16* sourceEnd,
249259Sdim  UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);
249259Sdim
249259SdimConversionResult ConvertUTF32toUTF16 (
249259Sdim  const UTF32** sourceStart, const UTF32* sourceEnd,
249259Sdim  UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags);
249259Sdim
249259SdimBoolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd);
249259Sdim
249259SdimBoolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd);
249259Sdim
249259Sdimunsigned getNumBytesForUTF8(UTF8 firstByte);
249259Sdim
249259Sdim#ifdef __cplusplus
249259Sdim}
249259Sdim
249259Sdim/*************************************************************************/
249259Sdim/* Below are LLVM-specific wrappers of the functions above. */
249259Sdim
263508Sdim#include "llvm/ADT/ArrayRef.h"
249259Sdim#include "llvm/ADT/StringRef.h"
249259Sdim
249259Sdimnamespace llvm {
249259Sdim
249259Sdim/**
249259Sdim * Convert an UTF8 StringRef to UTF8, UTF16, or UTF32 depending on
249259Sdim * WideCharWidth. The converted data is written to ResultPtr, which needs to
249259Sdim * point to at least WideCharWidth * (Source.Size() + 1) bytes. On success,
249259Sdim * ResultPtr will point one after the end of the copied string. On failure,
249259Sdim * ResultPtr will not be changed, and ErrorPtr will be set to the location of
249259Sdim * the first character which could not be converted.
249259Sdim * \return true on success.
249259Sdim */
249259Sdimbool ConvertUTF8toWide(unsigned WideCharWidth, llvm::StringRef Source,
249259Sdim                       char *&ResultPtr, const UTF8 *&ErrorPtr);
249259Sdim
249259Sdim/**
249259Sdim * Convert an Unicode code point to UTF8 sequence.
249259Sdim *
249259Sdim * \param Source a Unicode code point.
249259Sdim * \param [in,out] ResultPtr pointer to the output buffer, needs to be at least
249259Sdim * \c UNI_MAX_UTF8_BYTES_PER_CODE_POINT bytes.  On success \c ResultPtr is
249259Sdim * updated one past end of the converted sequence.
249259Sdim *
249259Sdim * \returns true on success.
249259Sdim */
249259Sdimbool ConvertCodePointToUTF8(unsigned Source, char *&ResultPtr);
249259Sdim
249259Sdim/**
249259Sdim * Convert the first UTF8 sequence in the given source buffer to a UTF32
249259Sdim * code point.
249259Sdim *
249259Sdim * \param [in,out] source A pointer to the source buffer. If the conversion
249259Sdim * succeeds, this pointer will be updated to point to the byte just past the
249259Sdim * end of the converted sequence.
249259Sdim * \param sourceEnd A pointer just past the end of the source buffer.
249259Sdim * \param [out] target The converted code
249259Sdim * \param flags Whether the conversion is strict or lenient.
249259Sdim *
249259Sdim * \returns conversionOK on success
249259Sdim *
249259Sdim * \sa ConvertUTF8toUTF32
249259Sdim */
249259Sdimstatic inline ConversionResult convertUTF8Sequence(const UTF8 **source,
249259Sdim                                                   const UTF8 *sourceEnd,
249259Sdim                                                   UTF32 *target,
249259Sdim                                                   ConversionFlags flags) {
249259Sdim  if (*source == sourceEnd)
249259Sdim    return sourceExhausted;
249259Sdim  unsigned size = getNumBytesForUTF8(**source);
249259Sdim  if ((ptrdiff_t)size > sourceEnd - *source)
249259Sdim    return sourceExhausted;
249259Sdim  return ConvertUTF8toUTF32(source, *source + size, &target, target + 1, flags);
249259Sdim}
263508Sdim
263508Sdim/**
263508Sdim * Returns true if a blob of text starts with a UTF-16 big or little endian byte
263508Sdim * order mark.
263508Sdim */
263508Sdimbool hasUTF16ByteOrderMark(ArrayRef<char> SrcBytes);
263508Sdim
263508Sdim/**
263508Sdim * Converts a stream of raw bytes assumed to be UTF16 into a UTF8 std::string.
263508Sdim *
263508Sdim * \param [in] SrcBytes A buffer of what is assumed to be UTF-16 encoded text.
263508Sdim * \param [out] Out Converted UTF-8 is stored here on success.
263508Sdim * \returns true on success
263508Sdim */
263508Sdimbool convertUTF16ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out);
263508Sdim
249259Sdim} /* end namespace llvm */
249259Sdim
249259Sdim#endif
249259Sdim
249259Sdim/* --------------------------------------------------------------------- */
249259Sdim
249259Sdim#endif