1198090Srdivacky//===-- StringRef.cpp - Lightweight String References ---------------------===// 2198090Srdivacky// 3198090Srdivacky// The LLVM Compiler Infrastructure 4198090Srdivacky// 5198090Srdivacky// This file is distributed under the University of Illinois Open Source 6198090Srdivacky// License. See LICENSE.TXT for details. 7198090Srdivacky// 8198090Srdivacky//===----------------------------------------------------------------------===// 9198090Srdivacky 10198090Srdivacky#include "llvm/ADT/StringRef.h" 11204642Srdivacky#include "llvm/ADT/APInt.h" 12252723Sdim#include "llvm/ADT/Hashing.h" 13218893Sdim#include "llvm/ADT/OwningPtr.h" 14235633Sdim#include "llvm/ADT/edit_distance.h" 15212904Sdim#include <bitset> 16202375Srdivacky 17198090Srdivackyusing namespace llvm; 18198090Srdivacky 19198090Srdivacky// MSVC emits references to this into the translation units which reference it. 20198090Srdivacky#ifndef _MSC_VER 21198090Srdivackyconst size_t StringRef::npos; 22198090Srdivacky#endif 23198090Srdivacky 24199481Srdivackystatic char ascii_tolower(char x) { 25199481Srdivacky if (x >= 'A' && x <= 'Z') 26199481Srdivacky return x - 'A' + 'a'; 27199481Srdivacky return x; 28199481Srdivacky} 29199481Srdivacky 30235633Sdimstatic char ascii_toupper(char x) { 31235633Sdim if (x >= 'a' && x <= 'z') 32235633Sdim return x - 'a' + 'A'; 33235633Sdim return x; 34235633Sdim} 35235633Sdim 36208599Srdivackystatic bool ascii_isdigit(char x) { 37208599Srdivacky return x >= '0' && x <= '9'; 38208599Srdivacky} 39208599Srdivacky 40263509Sdim// strncasecmp() is not available on non-POSIX systems, so define an 41263509Sdim// alternative function here. 42263509Sdimstatic int ascii_strncasecmp(const char *LHS, const char *RHS, size_t Length) { 43263509Sdim for (size_t I = 0; I < Length; ++I) { 44263509Sdim unsigned char LHC = ascii_tolower(LHS[I]); 45263509Sdim unsigned char RHC = ascii_tolower(RHS[I]); 46199481Srdivacky if (LHC != RHC) 47199481Srdivacky return LHC < RHC ? -1 : 1; 48199481Srdivacky } 49263509Sdim return 0; 50263509Sdim} 51199481Srdivacky 52263509Sdim/// compare_lower - Compare strings, ignoring case. 53263509Sdimint StringRef::compare_lower(StringRef RHS) const { 54263509Sdim if (int Res = ascii_strncasecmp(Data, RHS.Data, min(Length, RHS.Length))) 55263509Sdim return Res; 56199481Srdivacky if (Length == RHS.Length) 57212904Sdim return 0; 58199481Srdivacky return Length < RHS.Length ? -1 : 1; 59199481Srdivacky} 60199481Srdivacky 61263509Sdim/// Check if this string starts with the given \p Prefix, ignoring case. 62263509Sdimbool StringRef::startswith_lower(StringRef Prefix) const { 63263509Sdim return Length >= Prefix.Length && 64263509Sdim ascii_strncasecmp(Data, Prefix.Data, Prefix.Length) == 0; 65263509Sdim} 66263509Sdim 67263509Sdim/// Check if this string ends with the given \p Suffix, ignoring case. 68263509Sdimbool StringRef::endswith_lower(StringRef Suffix) const { 69263509Sdim return Length >= Suffix.Length && 70263509Sdim ascii_strncasecmp(end() - Suffix.Length, Suffix.Data, Suffix.Length) == 0; 71263509Sdim} 72263509Sdim 73208599Srdivacky/// compare_numeric - Compare strings, handle embedded numbers. 74208599Srdivackyint StringRef::compare_numeric(StringRef RHS) const { 75208599Srdivacky for (size_t I = 0, E = min(Length, RHS.Length); I != E; ++I) { 76226890Sdim // Check for sequences of digits. 77208599Srdivacky if (ascii_isdigit(Data[I]) && ascii_isdigit(RHS.Data[I])) { 78226890Sdim // The longer sequence of numbers is considered larger. 79226890Sdim // This doesn't really handle prefixed zeros well. 80226890Sdim size_t J; 81226890Sdim for (J = I + 1; J != E + 1; ++J) { 82208599Srdivacky bool ld = J < Length && ascii_isdigit(Data[J]); 83208599Srdivacky bool rd = J < RHS.Length && ascii_isdigit(RHS.Data[J]); 84208599Srdivacky if (ld != rd) 85208599Srdivacky return rd ? -1 : 1; 86208599Srdivacky if (!rd) 87208599Srdivacky break; 88208599Srdivacky } 89226890Sdim // The two number sequences have the same length (J-I), just memcmp them. 90226890Sdim if (int Res = compareMemory(Data + I, RHS.Data + I, J - I)) 91226890Sdim return Res < 0 ? -1 : 1; 92226890Sdim // Identical number sequences, continue search after the numbers. 93226890Sdim I = J - 1; 94226890Sdim continue; 95208599Srdivacky } 96226890Sdim if (Data[I] != RHS.Data[I]) 97226890Sdim return (unsigned char)Data[I] < (unsigned char)RHS.Data[I] ? -1 : 1; 98208599Srdivacky } 99208599Srdivacky if (Length == RHS.Length) 100212904Sdim return 0; 101208599Srdivacky return Length < RHS.Length ? -1 : 1; 102208599Srdivacky} 103208599Srdivacky 104201360Srdivacky// Compute the edit distance between the two given strings. 105218893Sdimunsigned StringRef::edit_distance(llvm::StringRef Other, 106218893Sdim bool AllowReplacements, 107263509Sdim unsigned MaxEditDistance) const { 108235633Sdim return llvm::ComputeEditDistance( 109235633Sdim llvm::ArrayRef<char>(data(), size()), 110235633Sdim llvm::ArrayRef<char>(Other.data(), Other.size()), 111235633Sdim AllowReplacements, MaxEditDistance); 112235633Sdim} 113201360Srdivacky 114235633Sdim//===----------------------------------------------------------------------===// 115235633Sdim// String Operations 116235633Sdim//===----------------------------------------------------------------------===// 117235633Sdim 118235633Sdimstd::string StringRef::lower() const { 119235633Sdim std::string Result(size(), char()); 120235633Sdim for (size_type i = 0, e = size(); i != e; ++i) { 121235633Sdim Result[i] = ascii_tolower(Data[i]); 122218893Sdim } 123235633Sdim return Result; 124235633Sdim} 125218893Sdim 126235633Sdimstd::string StringRef::upper() const { 127235633Sdim std::string Result(size(), char()); 128235633Sdim for (size_type i = 0, e = size(); i != e; ++i) { 129235633Sdim Result[i] = ascii_toupper(Data[i]); 130201360Srdivacky } 131202375Srdivacky return Result; 132201360Srdivacky} 133201360Srdivacky 134198090Srdivacky//===----------------------------------------------------------------------===// 135198090Srdivacky// String Searching 136198090Srdivacky//===----------------------------------------------------------------------===// 137198090Srdivacky 138198090Srdivacky 139198090Srdivacky/// find - Search for the first string \arg Str in the string. 140198090Srdivacky/// 141221345Sdim/// \return - The index of the first occurrence of \arg Str, or npos if not 142198090Srdivacky/// found. 143199481Srdivackysize_t StringRef::find(StringRef Str, size_t From) const { 144198090Srdivacky size_t N = Str.size(); 145198090Srdivacky if (N > Length) 146198090Srdivacky return npos; 147235633Sdim 148235633Sdim // For short haystacks or unsupported needles fall back to the naive algorithm 149235633Sdim if (Length < 16 || N > 255 || N == 0) { 150235633Sdim for (size_t e = Length - N + 1, i = min(From, e); i != e; ++i) 151235633Sdim if (substr(i, N).equals(Str)) 152235633Sdim return i; 153235633Sdim return npos; 154235633Sdim } 155235633Sdim 156235633Sdim if (From >= Length) 157235633Sdim return npos; 158235633Sdim 159235633Sdim // Build the bad char heuristic table, with uint8_t to reduce cache thrashing. 160235633Sdim uint8_t BadCharSkip[256]; 161235633Sdim std::memset(BadCharSkip, N, 256); 162235633Sdim for (unsigned i = 0; i != N-1; ++i) 163235633Sdim BadCharSkip[(uint8_t)Str[i]] = N-1-i; 164235633Sdim 165235633Sdim unsigned Len = Length-From, Pos = From; 166235633Sdim while (Len >= N) { 167235633Sdim if (substr(Pos, N).equals(Str)) // See if this is the correct substring. 168235633Sdim return Pos; 169235633Sdim 170235633Sdim // Otherwise skip the appropriate number of bytes. 171235633Sdim uint8_t Skip = BadCharSkip[(uint8_t)(*this)[Pos+N-1]]; 172235633Sdim Len -= Skip; 173235633Sdim Pos += Skip; 174235633Sdim } 175235633Sdim 176198090Srdivacky return npos; 177198090Srdivacky} 178198090Srdivacky 179198090Srdivacky/// rfind - Search for the last string \arg Str in the string. 180198090Srdivacky/// 181221345Sdim/// \return - The index of the last occurrence of \arg Str, or npos if not 182198090Srdivacky/// found. 183199481Srdivackysize_t StringRef::rfind(StringRef Str) const { 184198090Srdivacky size_t N = Str.size(); 185198090Srdivacky if (N > Length) 186198090Srdivacky return npos; 187198090Srdivacky for (size_t i = Length - N + 1, e = 0; i != e;) { 188198090Srdivacky --i; 189198090Srdivacky if (substr(i, N).equals(Str)) 190198090Srdivacky return i; 191198090Srdivacky } 192198090Srdivacky return npos; 193198090Srdivacky} 194198090Srdivacky 195199481Srdivacky/// find_first_of - Find the first character in the string that is in \arg 196199481Srdivacky/// Chars, or npos if not found. 197199481Srdivacky/// 198212904Sdim/// Note: O(size() + Chars.size()) 199199481SrdivackyStringRef::size_type StringRef::find_first_of(StringRef Chars, 200199481Srdivacky size_t From) const { 201212904Sdim std::bitset<1 << CHAR_BIT> CharBits; 202212904Sdim for (size_type i = 0; i != Chars.size(); ++i) 203212904Sdim CharBits.set((unsigned char)Chars[i]); 204212904Sdim 205199989Srdivacky for (size_type i = min(From, Length), e = Length; i != e; ++i) 206212904Sdim if (CharBits.test((unsigned char)Data[i])) 207198090Srdivacky return i; 208198090Srdivacky return npos; 209198090Srdivacky} 210198090Srdivacky 211198090Srdivacky/// find_first_not_of - Find the first character in the string that is not 212199481Srdivacky/// \arg C or npos if not found. 213199481SrdivackyStringRef::size_type StringRef::find_first_not_of(char C, size_t From) const { 214199989Srdivacky for (size_type i = min(From, Length), e = Length; i != e; ++i) 215199481Srdivacky if (Data[i] != C) 216199481Srdivacky return i; 217199481Srdivacky return npos; 218199481Srdivacky} 219199481Srdivacky 220199481Srdivacky/// find_first_not_of - Find the first character in the string that is not 221199481Srdivacky/// in the string \arg Chars, or npos if not found. 222199481Srdivacky/// 223212904Sdim/// Note: O(size() + Chars.size()) 224199481SrdivackyStringRef::size_type StringRef::find_first_not_of(StringRef Chars, 225199481Srdivacky size_t From) const { 226212904Sdim std::bitset<1 << CHAR_BIT> CharBits; 227212904Sdim for (size_type i = 0; i != Chars.size(); ++i) 228212904Sdim CharBits.set((unsigned char)Chars[i]); 229212904Sdim 230199989Srdivacky for (size_type i = min(From, Length), e = Length; i != e; ++i) 231212904Sdim if (!CharBits.test((unsigned char)Data[i])) 232198090Srdivacky return i; 233198090Srdivacky return npos; 234198090Srdivacky} 235198090Srdivacky 236218893Sdim/// find_last_of - Find the last character in the string that is in \arg C, 237218893Sdim/// or npos if not found. 238218893Sdim/// 239218893Sdim/// Note: O(size() + Chars.size()) 240218893SdimStringRef::size_type StringRef::find_last_of(StringRef Chars, 241218893Sdim size_t From) const { 242218893Sdim std::bitset<1 << CHAR_BIT> CharBits; 243218893Sdim for (size_type i = 0; i != Chars.size(); ++i) 244218893Sdim CharBits.set((unsigned char)Chars[i]); 245198090Srdivacky 246218893Sdim for (size_type i = min(From, Length) - 1, e = -1; i != e; --i) 247218893Sdim if (CharBits.test((unsigned char)Data[i])) 248218893Sdim return i; 249218893Sdim return npos; 250218893Sdim} 251218893Sdim 252245431Sdim/// find_last_not_of - Find the last character in the string that is not 253245431Sdim/// \arg C, or npos if not found. 254245431SdimStringRef::size_type StringRef::find_last_not_of(char C, size_t From) const { 255245431Sdim for (size_type i = min(From, Length) - 1, e = -1; i != e; --i) 256245431Sdim if (Data[i] != C) 257245431Sdim return i; 258245431Sdim return npos; 259245431Sdim} 260245431Sdim 261245431Sdim/// find_last_not_of - Find the last character in the string that is not in 262245431Sdim/// \arg Chars, or npos if not found. 263245431Sdim/// 264245431Sdim/// Note: O(size() + Chars.size()) 265245431SdimStringRef::size_type StringRef::find_last_not_of(StringRef Chars, 266245431Sdim size_t From) const { 267245431Sdim std::bitset<1 << CHAR_BIT> CharBits; 268245431Sdim for (size_type i = 0, e = Chars.size(); i != e; ++i) 269245431Sdim CharBits.set((unsigned char)Chars[i]); 270245431Sdim 271245431Sdim for (size_type i = min(From, Length) - 1, e = -1; i != e; --i) 272245431Sdim if (!CharBits.test((unsigned char)Data[i])) 273245431Sdim return i; 274245431Sdim return npos; 275245431Sdim} 276245431Sdim 277235633Sdimvoid StringRef::split(SmallVectorImpl<StringRef> &A, 278235633Sdim StringRef Separators, int MaxSplit, 279235633Sdim bool KeepEmpty) const { 280235633Sdim StringRef rest = *this; 281235633Sdim 282235633Sdim // rest.data() is used to distinguish cases like "a," that splits into 283235633Sdim // "a" + "" and "a" that splits into "a" + 0. 284235633Sdim for (int splits = 0; 285235633Sdim rest.data() != NULL && (MaxSplit < 0 || splits < MaxSplit); 286235633Sdim ++splits) { 287235633Sdim std::pair<StringRef, StringRef> p = rest.split(Separators); 288235633Sdim 289235633Sdim if (KeepEmpty || p.first.size() != 0) 290235633Sdim A.push_back(p.first); 291235633Sdim rest = p.second; 292235633Sdim } 293235633Sdim // If we have a tail left, add it. 294235633Sdim if (rest.data() != NULL && (rest.size() != 0 || KeepEmpty)) 295235633Sdim A.push_back(rest); 296235633Sdim} 297235633Sdim 298198090Srdivacky//===----------------------------------------------------------------------===// 299198090Srdivacky// Helpful Algorithms 300198090Srdivacky//===----------------------------------------------------------------------===// 301198090Srdivacky 302198090Srdivacky/// count - Return the number of non-overlapped occurrences of \arg Str in 303198090Srdivacky/// the string. 304199481Srdivackysize_t StringRef::count(StringRef Str) const { 305198090Srdivacky size_t Count = 0; 306198090Srdivacky size_t N = Str.size(); 307198090Srdivacky if (N > Length) 308198090Srdivacky return 0; 309198090Srdivacky for (size_t i = 0, e = Length - N + 1; i != e; ++i) 310198090Srdivacky if (substr(i, N).equals(Str)) 311198090Srdivacky ++Count; 312198090Srdivacky return Count; 313198090Srdivacky} 314198090Srdivacky 315204642Srdivackystatic unsigned GetAutoSenseRadix(StringRef &Str) { 316204642Srdivacky if (Str.startswith("0x")) { 317204642Srdivacky Str = Str.substr(2); 318204642Srdivacky return 16; 319245431Sdim } 320245431Sdim 321245431Sdim if (Str.startswith("0b")) { 322204642Srdivacky Str = Str.substr(2); 323204642Srdivacky return 2; 324245431Sdim } 325245431Sdim 326245431Sdim if (Str.startswith("0o")) { 327245431Sdim Str = Str.substr(2); 328204642Srdivacky return 8; 329204642Srdivacky } 330245431Sdim 331245431Sdim if (Str.startswith("0")) 332245431Sdim return 8; 333245431Sdim 334245431Sdim return 10; 335204642Srdivacky} 336204642Srdivacky 337204642Srdivacky 338198090Srdivacky/// GetAsUnsignedInteger - Workhorse method that converts a integer character 339198090Srdivacky/// sequence of radix up to 36 to an unsigned long long value. 340235633Sdimbool llvm::getAsUnsignedInteger(StringRef Str, unsigned Radix, 341235633Sdim unsigned long long &Result) { 342198090Srdivacky // Autosense radix if not specified. 343204642Srdivacky if (Radix == 0) 344204642Srdivacky Radix = GetAutoSenseRadix(Str); 345218893Sdim 346198090Srdivacky // Empty strings (after the radix autosense) are invalid. 347198090Srdivacky if (Str.empty()) return true; 348218893Sdim 349198090Srdivacky // Parse all the bytes of the string given this radix. Watch for overflow. 350198090Srdivacky Result = 0; 351198090Srdivacky while (!Str.empty()) { 352198090Srdivacky unsigned CharVal; 353198090Srdivacky if (Str[0] >= '0' && Str[0] <= '9') 354198090Srdivacky CharVal = Str[0]-'0'; 355198090Srdivacky else if (Str[0] >= 'a' && Str[0] <= 'z') 356198090Srdivacky CharVal = Str[0]-'a'+10; 357198090Srdivacky else if (Str[0] >= 'A' && Str[0] <= 'Z') 358198090Srdivacky CharVal = Str[0]-'A'+10; 359198090Srdivacky else 360198090Srdivacky return true; 361218893Sdim 362198090Srdivacky // If the parsed value is larger than the integer radix, the string is 363198090Srdivacky // invalid. 364198090Srdivacky if (CharVal >= Radix) 365198090Srdivacky return true; 366218893Sdim 367198090Srdivacky // Add in this character. 368198090Srdivacky unsigned long long PrevResult = Result; 369198090Srdivacky Result = Result*Radix+CharVal; 370218893Sdim 371245431Sdim // Check for overflow by shifting back and seeing if bits were lost. 372245431Sdim if (Result/Radix < PrevResult) 373198090Srdivacky return true; 374198090Srdivacky 375198090Srdivacky Str = Str.substr(1); 376198090Srdivacky } 377218893Sdim 378198090Srdivacky return false; 379198090Srdivacky} 380198090Srdivacky 381235633Sdimbool llvm::getAsSignedInteger(StringRef Str, unsigned Radix, 382235633Sdim long long &Result) { 383198090Srdivacky unsigned long long ULLVal; 384218893Sdim 385198090Srdivacky // Handle positive strings first. 386235633Sdim if (Str.empty() || Str.front() != '-') { 387235633Sdim if (getAsUnsignedInteger(Str, Radix, ULLVal) || 388198090Srdivacky // Check for value so large it overflows a signed value. 389198090Srdivacky (long long)ULLVal < 0) 390198090Srdivacky return true; 391198090Srdivacky Result = ULLVal; 392198090Srdivacky return false; 393198090Srdivacky } 394218893Sdim 395198090Srdivacky // Get the positive part of the value. 396235633Sdim if (getAsUnsignedInteger(Str.substr(1), Radix, ULLVal) || 397198090Srdivacky // Reject values so large they'd overflow as negative signed, but allow 398198090Srdivacky // "-0". This negates the unsigned so that the negative isn't undefined 399198090Srdivacky // on signed overflow. 400198090Srdivacky (long long)-ULLVal > 0) 401198090Srdivacky return true; 402218893Sdim 403198090Srdivacky Result = -ULLVal; 404198090Srdivacky return false; 405198090Srdivacky} 406198090Srdivacky 407204642Srdivackybool StringRef::getAsInteger(unsigned Radix, APInt &Result) const { 408204642Srdivacky StringRef Str = *this; 409204642Srdivacky 410204642Srdivacky // Autosense radix if not specified. 411204642Srdivacky if (Radix == 0) 412204642Srdivacky Radix = GetAutoSenseRadix(Str); 413204642Srdivacky 414204642Srdivacky assert(Radix > 1 && Radix <= 36); 415218893Sdim 416204642Srdivacky // Empty strings (after the radix autosense) are invalid. 417204642Srdivacky if (Str.empty()) return true; 418204642Srdivacky 419204642Srdivacky // Skip leading zeroes. This can be a significant improvement if 420204642Srdivacky // it means we don't need > 64 bits. 421204642Srdivacky while (!Str.empty() && Str.front() == '0') 422204642Srdivacky Str = Str.substr(1); 423204642Srdivacky 424204642Srdivacky // If it was nothing but zeroes.... 425204642Srdivacky if (Str.empty()) { 426204642Srdivacky Result = APInt(64, 0); 427204642Srdivacky return false; 428204642Srdivacky } 429204642Srdivacky 430204642Srdivacky // (Over-)estimate the required number of bits. 431204642Srdivacky unsigned Log2Radix = 0; 432204642Srdivacky while ((1U << Log2Radix) < Radix) Log2Radix++; 433204642Srdivacky bool IsPowerOf2Radix = ((1U << Log2Radix) == Radix); 434204642Srdivacky 435204642Srdivacky unsigned BitWidth = Log2Radix * Str.size(); 436204642Srdivacky if (BitWidth < Result.getBitWidth()) 437204642Srdivacky BitWidth = Result.getBitWidth(); // don't shrink the result 438245431Sdim else if (BitWidth > Result.getBitWidth()) 439218893Sdim Result = Result.zext(BitWidth); 440204642Srdivacky 441204642Srdivacky APInt RadixAP, CharAP; // unused unless !IsPowerOf2Radix 442204642Srdivacky if (!IsPowerOf2Radix) { 443204642Srdivacky // These must have the same bit-width as Result. 444204642Srdivacky RadixAP = APInt(BitWidth, Radix); 445204642Srdivacky CharAP = APInt(BitWidth, 0); 446204642Srdivacky } 447204642Srdivacky 448204642Srdivacky // Parse all the bytes of the string given this radix. 449204642Srdivacky Result = 0; 450204642Srdivacky while (!Str.empty()) { 451204642Srdivacky unsigned CharVal; 452204642Srdivacky if (Str[0] >= '0' && Str[0] <= '9') 453204642Srdivacky CharVal = Str[0]-'0'; 454204642Srdivacky else if (Str[0] >= 'a' && Str[0] <= 'z') 455204642Srdivacky CharVal = Str[0]-'a'+10; 456204642Srdivacky else if (Str[0] >= 'A' && Str[0] <= 'Z') 457204642Srdivacky CharVal = Str[0]-'A'+10; 458204642Srdivacky else 459204642Srdivacky return true; 460218893Sdim 461204642Srdivacky // If the parsed value is larger than the integer radix, the string is 462204642Srdivacky // invalid. 463204642Srdivacky if (CharVal >= Radix) 464204642Srdivacky return true; 465218893Sdim 466204642Srdivacky // Add in this character. 467204642Srdivacky if (IsPowerOf2Radix) { 468204642Srdivacky Result <<= Log2Radix; 469204642Srdivacky Result |= CharVal; 470204642Srdivacky } else { 471204642Srdivacky Result *= RadixAP; 472204642Srdivacky CharAP = CharVal; 473204642Srdivacky Result += CharAP; 474204642Srdivacky } 475204642Srdivacky 476204642Srdivacky Str = Str.substr(1); 477204642Srdivacky } 478218893Sdim 479204642Srdivacky return false; 480204642Srdivacky} 481235633Sdim 482235633Sdim 483235633Sdim// Implementation of StringRef hashing. 484235633Sdimhash_code llvm::hash_value(StringRef S) { 485235633Sdim return hash_combine_range(S.begin(), S.end()); 486235633Sdim} 487