Encoding.h revision 309124
//===--- Encoding.h - Format C++ code -------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
///
/// \file
/// \brief Contains functions for text encoding manipulation. Supports UTF-8,
/// 8-bit encodings and escape sequences in C++ string literals.
///
//===----------------------------------------------------------------------===//

#ifndef LLVM_CLANG_LIB_FORMAT_ENCODING_H
#define LLVM_CLANG_LIB_FORMAT_ENCODING_H

#include "clang/Basic/LLVM.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/Unicode.h"

namespace clang {
namespace format {
namespace encoding {

/// \brief The text encodings told apart by the helpers in this namespace.
///
/// Only two cases are distinguished: text that decodes as UTF-8, and
/// everything else, which is handled as if each byte were one character.
enum Encoding {
  Encoding_UTF8,
  Encoding_Unknown // We treat all other encodings as 8-bit encodings.
};

/// \brief Detects encoding of the Text. If the Text can be decoded using UTF-8,
/// it is considered UTF8, otherwise we treat it as some 8-bit encoding.
35inline Encoding detectEncoding(StringRef Text) { 36 const UTF8 *Ptr = reinterpret_cast<const UTF8 *>(Text.begin()); 37 const UTF8 *BufEnd = reinterpret_cast<const UTF8 *>(Text.end()); 38 if (::isLegalUTF8String(&Ptr, BufEnd)) 39 return Encoding_UTF8; 40 return Encoding_Unknown; 41} 42 43inline unsigned getCodePointCountUTF8(StringRef Text) { 44 unsigned CodePoints = 0; 45 for (size_t i = 0, e = Text.size(); i < e; i += getNumBytesForUTF8(Text[i])) { 46 ++CodePoints; 47 } 48 return CodePoints; 49} 50 51/// \brief Gets the number of code points in the Text using the specified 52/// Encoding. 53inline unsigned getCodePointCount(StringRef Text, Encoding Encoding) { 54 switch (Encoding) { 55 case Encoding_UTF8: 56 return getCodePointCountUTF8(Text); 57 default: 58 return Text.size(); 59 } 60} 61 62/// \brief Returns the number of columns required to display the \p Text on a 63/// generic Unicode-capable terminal. Text is assumed to use the specified 64/// \p Encoding. 65inline unsigned columnWidth(StringRef Text, Encoding Encoding) { 66 if (Encoding == Encoding_UTF8) { 67 int ContentWidth = llvm::sys::unicode::columnWidthUTF8(Text); 68 // FIXME: Figure out the correct way to handle this in the presence of both 69 // printable and unprintable multi-byte UTF-8 characters. Falling back to 70 // returning the number of bytes may cause problems, as columnWidth suddenly 71 // becomes non-additive. 72 if (ContentWidth >= 0) 73 return ContentWidth; 74 } 75 return Text.size(); 76} 77 78/// \brief Returns the number of columns required to display the \p Text, 79/// starting from the \p StartColumn on a terminal with the \p TabWidth. The 80/// text is assumed to use the specified \p Encoding. 
81inline unsigned columnWidthWithTabs(StringRef Text, unsigned StartColumn, 82 unsigned TabWidth, Encoding Encoding) { 83 unsigned TotalWidth = 0; 84 StringRef Tail = Text; 85 for (;;) { 86 StringRef::size_type TabPos = Tail.find('\t'); 87 if (TabPos == StringRef::npos) 88 return TotalWidth + columnWidth(Tail, Encoding); 89 TotalWidth += columnWidth(Tail.substr(0, TabPos), Encoding); 90 TotalWidth += TabWidth - (TotalWidth + StartColumn) % TabWidth; 91 Tail = Tail.substr(TabPos + 1); 92 } 93} 94 95/// \brief Gets the number of bytes in a sequence representing a single 96/// codepoint and starting with FirstChar in the specified Encoding. 97inline unsigned getCodePointNumBytes(char FirstChar, Encoding Encoding) { 98 switch (Encoding) { 99 case Encoding_UTF8: 100 return getNumBytesForUTF8(FirstChar); 101 default: 102 return 1; 103 } 104} 105 106inline bool isOctDigit(char c) { return '0' <= c && c <= '7'; } 107 108inline bool isHexDigit(char c) { 109 return ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || 110 ('A' <= c && c <= 'F'); 111} 112 113/// \brief Gets the length of an escape sequence inside a C++ string literal. 114/// Text should span from the beginning of the escape sequence (starting with a 115/// backslash) to the end of the string literal. 116inline unsigned getEscapeSequenceLength(StringRef Text) { 117 assert(Text[0] == '\\'); 118 if (Text.size() < 2) 119 return 1; 120 121 switch (Text[1]) { 122 case 'u': 123 return 6; 124 case 'U': 125 return 10; 126 case 'x': { 127 unsigned I = 2; // Point after '\x'. 128 while (I < Text.size() && isHexDigit(Text[I])) 129 ++I; 130 return I; 131 } 132 default: 133 if (isOctDigit(Text[1])) { 134 unsigned I = 1; 135 while (I < Text.size() && I < 4 && isOctDigit(Text[I])) 136 ++I; 137 return I; 138 } 139 return 1 + getNumBytesForUTF8(Text[1]); 140 } 141} 142 143} // namespace encoding 144} // namespace format 145} // namespace clang 146 147#endif 148