1//===-- StringRef.cpp - Lightweight String References ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "llvm/ADT/StringRef.h"
10#include "llvm/ADT/APFloat.h"
11#include "llvm/ADT/APInt.h"
12#include "llvm/ADT/Hashing.h"
13#include "llvm/ADT/StringExtras.h"
14#include "llvm/ADT/edit_distance.h"
15#include "llvm/Support/Error.h"
16#include <bitset>
17
18using namespace llvm;
19
20// MSVC emits references to this into the translation units which reference it.
21#ifndef _MSC_VER
22const size_t StringRef::npos;
23#endif
24
25// strncasecmp() is not available on non-POSIX systems, so define an
26// alternative function here.
27static int ascii_strncasecmp(const char *LHS, const char *RHS, size_t Length) {
28  for (size_t I = 0; I < Length; ++I) {
29    unsigned char LHC = toLower(LHS[I]);
30    unsigned char RHC = toLower(RHS[I]);
31    if (LHC != RHC)
32      return LHC < RHC ? -1 : 1;
33  }
34  return 0;
35}
36
37/// compare_lower - Compare strings, ignoring case.
38int StringRef::compare_lower(StringRef RHS) const {
39  if (int Res = ascii_strncasecmp(Data, RHS.Data, std::min(Length, RHS.Length)))
40    return Res;
41  if (Length == RHS.Length)
42    return 0;
43  return Length < RHS.Length ? -1 : 1;
44}
45
46/// Check if this string starts with the given \p Prefix, ignoring case.
47bool StringRef::startswith_lower(StringRef Prefix) const {
48  return Length >= Prefix.Length &&
49      ascii_strncasecmp(Data, Prefix.Data, Prefix.Length) == 0;
50}
51
52/// Check if this string ends with the given \p Suffix, ignoring case.
53bool StringRef::endswith_lower(StringRef Suffix) const {
54  return Length >= Suffix.Length &&
55      ascii_strncasecmp(end() - Suffix.Length, Suffix.Data, Suffix.Length) == 0;
56}
57
58size_t StringRef::find_lower(char C, size_t From) const {
59  char L = toLower(C);
60  return find_if([L](char D) { return toLower(D) == L; }, From);
61}
62
63/// compare_numeric - Compare strings, handle embedded numbers.
64int StringRef::compare_numeric(StringRef RHS) const {
65  for (size_t I = 0, E = std::min(Length, RHS.Length); I != E; ++I) {
66    // Check for sequences of digits.
67    if (isDigit(Data[I]) && isDigit(RHS.Data[I])) {
68      // The longer sequence of numbers is considered larger.
69      // This doesn't really handle prefixed zeros well.
70      size_t J;
71      for (J = I + 1; J != E + 1; ++J) {
72        bool ld = J < Length && isDigit(Data[J]);
73        bool rd = J < RHS.Length && isDigit(RHS.Data[J]);
74        if (ld != rd)
75          return rd ? -1 : 1;
76        if (!rd)
77          break;
78      }
79      // The two number sequences have the same length (J-I), just memcmp them.
80      if (int Res = compareMemory(Data + I, RHS.Data + I, J - I))
81        return Res < 0 ? -1 : 1;
82      // Identical number sequences, continue search after the numbers.
83      I = J - 1;
84      continue;
85    }
86    if (Data[I] != RHS.Data[I])
87      return (unsigned char)Data[I] < (unsigned char)RHS.Data[I] ? -1 : 1;
88  }
89  if (Length == RHS.Length)
90    return 0;
91  return Length < RHS.Length ? -1 : 1;
92}
93
94// Compute the edit distance between the two given strings.
95unsigned StringRef::edit_distance(llvm::StringRef Other,
96                                  bool AllowReplacements,
97                                  unsigned MaxEditDistance) const {
98  return llvm::ComputeEditDistance(
99      makeArrayRef(data(), size()),
100      makeArrayRef(Other.data(), Other.size()),
101      AllowReplacements, MaxEditDistance);
102}
103
104//===----------------------------------------------------------------------===//
105// String Operations
106//===----------------------------------------------------------------------===//
107
108std::string StringRef::lower() const {
109  std::string Result(size(), char());
110  for (size_type i = 0, e = size(); i != e; ++i) {
111    Result[i] = toLower(Data[i]);
112  }
113  return Result;
114}
115
116std::string StringRef::upper() const {
117  std::string Result(size(), char());
118  for (size_type i = 0, e = size(); i != e; ++i) {
119    Result[i] = toUpper(Data[i]);
120  }
121  return Result;
122}
123
124//===----------------------------------------------------------------------===//
125// String Searching
126//===----------------------------------------------------------------------===//
127
128
129/// find - Search for the first string \arg Str in the string.
130///
131/// \return - The index of the first occurrence of \arg Str, or npos if not
132/// found.
133size_t StringRef::find(StringRef Str, size_t From) const {
134  if (From > Length)
135    return npos;
136
137  const char *Start = Data + From;
138  size_t Size = Length - From;
139
140  const char *Needle = Str.data();
141  size_t N = Str.size();
142  if (N == 0)
143    return From;
144  if (Size < N)
145    return npos;
146  if (N == 1) {
147    const char *Ptr = (const char *)::memchr(Start, Needle[0], Size);
148    return Ptr == nullptr ? npos : Ptr - Data;
149  }
150
151  const char *Stop = Start + (Size - N + 1);
152
153  // For short haystacks or unsupported needles fall back to the naive algorithm
154  if (Size < 16 || N > 255) {
155    do {
156      if (std::memcmp(Start, Needle, N) == 0)
157        return Start - Data;
158      ++Start;
159    } while (Start < Stop);
160    return npos;
161  }
162
163  // Build the bad char heuristic table, with uint8_t to reduce cache thrashing.
164  uint8_t BadCharSkip[256];
165  std::memset(BadCharSkip, N, 256);
166  for (unsigned i = 0; i != N-1; ++i)
167    BadCharSkip[(uint8_t)Str[i]] = N-1-i;
168
169  do {
170    uint8_t Last = Start[N - 1];
171    if (LLVM_UNLIKELY(Last == (uint8_t)Needle[N - 1]))
172      if (std::memcmp(Start, Needle, N - 1) == 0)
173        return Start - Data;
174
175    // Otherwise skip the appropriate number of bytes.
176    Start += BadCharSkip[Last];
177  } while (Start < Stop);
178
179  return npos;
180}
181
182size_t StringRef::find_lower(StringRef Str, size_t From) const {
183  StringRef This = substr(From);
184  while (This.size() >= Str.size()) {
185    if (This.startswith_lower(Str))
186      return From;
187    This = This.drop_front();
188    ++From;
189  }
190  return npos;
191}
192
193size_t StringRef::rfind_lower(char C, size_t From) const {
194  From = std::min(From, Length);
195  size_t i = From;
196  while (i != 0) {
197    --i;
198    if (toLower(Data[i]) == toLower(C))
199      return i;
200  }
201  return npos;
202}
203
204/// rfind - Search for the last string \arg Str in the string.
205///
206/// \return - The index of the last occurrence of \arg Str, or npos if not
207/// found.
208size_t StringRef::rfind(StringRef Str) const {
209  size_t N = Str.size();
210  if (N > Length)
211    return npos;
212  for (size_t i = Length - N + 1, e = 0; i != e;) {
213    --i;
214    if (substr(i, N).equals(Str))
215      return i;
216  }
217  return npos;
218}
219
220size_t StringRef::rfind_lower(StringRef Str) const {
221  size_t N = Str.size();
222  if (N > Length)
223    return npos;
224  for (size_t i = Length - N + 1, e = 0; i != e;) {
225    --i;
226    if (substr(i, N).equals_lower(Str))
227      return i;
228  }
229  return npos;
230}
231
232/// find_first_of - Find the first character in the string that is in \arg
233/// Chars, or npos if not found.
234///
235/// Note: O(size() + Chars.size())
236StringRef::size_type StringRef::find_first_of(StringRef Chars,
237                                              size_t From) const {
238  std::bitset<1 << CHAR_BIT> CharBits;
239  for (size_type i = 0; i != Chars.size(); ++i)
240    CharBits.set((unsigned char)Chars[i]);
241
242  for (size_type i = std::min(From, Length), e = Length; i != e; ++i)
243    if (CharBits.test((unsigned char)Data[i]))
244      return i;
245  return npos;
246}
247
248/// find_first_not_of - Find the first character in the string that is not
249/// \arg C or npos if not found.
250StringRef::size_type StringRef::find_first_not_of(char C, size_t From) const {
251  for (size_type i = std::min(From, Length), e = Length; i != e; ++i)
252    if (Data[i] != C)
253      return i;
254  return npos;
255}
256
257/// find_first_not_of - Find the first character in the string that is not
258/// in the string \arg Chars, or npos if not found.
259///
260/// Note: O(size() + Chars.size())
261StringRef::size_type StringRef::find_first_not_of(StringRef Chars,
262                                                  size_t From) const {
263  std::bitset<1 << CHAR_BIT> CharBits;
264  for (size_type i = 0; i != Chars.size(); ++i)
265    CharBits.set((unsigned char)Chars[i]);
266
267  for (size_type i = std::min(From, Length), e = Length; i != e; ++i)
268    if (!CharBits.test((unsigned char)Data[i]))
269      return i;
270  return npos;
271}
272
273/// find_last_of - Find the last character in the string that is in \arg C,
274/// or npos if not found.
275///
276/// Note: O(size() + Chars.size())
277StringRef::size_type StringRef::find_last_of(StringRef Chars,
278                                             size_t From) const {
279  std::bitset<1 << CHAR_BIT> CharBits;
280  for (size_type i = 0; i != Chars.size(); ++i)
281    CharBits.set((unsigned char)Chars[i]);
282
283  for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i)
284    if (CharBits.test((unsigned char)Data[i]))
285      return i;
286  return npos;
287}
288
289/// find_last_not_of - Find the last character in the string that is not
290/// \arg C, or npos if not found.
291StringRef::size_type StringRef::find_last_not_of(char C, size_t From) const {
292  for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i)
293    if (Data[i] != C)
294      return i;
295  return npos;
296}
297
298/// find_last_not_of - Find the last character in the string that is not in
299/// \arg Chars, or npos if not found.
300///
301/// Note: O(size() + Chars.size())
302StringRef::size_type StringRef::find_last_not_of(StringRef Chars,
303                                                 size_t From) const {
304  std::bitset<1 << CHAR_BIT> CharBits;
305  for (size_type i = 0, e = Chars.size(); i != e; ++i)
306    CharBits.set((unsigned char)Chars[i]);
307
308  for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i)
309    if (!CharBits.test((unsigned char)Data[i]))
310      return i;
311  return npos;
312}
313
314void StringRef::split(SmallVectorImpl<StringRef> &A,
315                      StringRef Separator, int MaxSplit,
316                      bool KeepEmpty) const {
317  StringRef S = *this;
318
319  // Count down from MaxSplit. When MaxSplit is -1, this will just split
320  // "forever". This doesn't support splitting more than 2^31 times
321  // intentionally; if we ever want that we can make MaxSplit a 64-bit integer
322  // but that seems unlikely to be useful.
323  while (MaxSplit-- != 0) {
324    size_t Idx = S.find(Separator);
325    if (Idx == npos)
326      break;
327
328    // Push this split.
329    if (KeepEmpty || Idx > 0)
330      A.push_back(S.slice(0, Idx));
331
332    // Jump forward.
333    S = S.slice(Idx + Separator.size(), npos);
334  }
335
336  // Push the tail.
337  if (KeepEmpty || !S.empty())
338    A.push_back(S);
339}
340
341void StringRef::split(SmallVectorImpl<StringRef> &A, char Separator,
342                      int MaxSplit, bool KeepEmpty) const {
343  StringRef S = *this;
344
345  // Count down from MaxSplit. When MaxSplit is -1, this will just split
346  // "forever". This doesn't support splitting more than 2^31 times
347  // intentionally; if we ever want that we can make MaxSplit a 64-bit integer
348  // but that seems unlikely to be useful.
349  while (MaxSplit-- != 0) {
350    size_t Idx = S.find(Separator);
351    if (Idx == npos)
352      break;
353
354    // Push this split.
355    if (KeepEmpty || Idx > 0)
356      A.push_back(S.slice(0, Idx));
357
358    // Jump forward.
359    S = S.slice(Idx + 1, npos);
360  }
361
362  // Push the tail.
363  if (KeepEmpty || !S.empty())
364    A.push_back(S);
365}
366
367//===----------------------------------------------------------------------===//
368// Helpful Algorithms
369//===----------------------------------------------------------------------===//
370
371/// count - Return the number of non-overlapped occurrences of \arg Str in
372/// the string.
373size_t StringRef::count(StringRef Str) const {
374  size_t Count = 0;
375  size_t N = Str.size();
376  if (!N || N > Length)
377    return 0;
378  for (size_t i = 0, e = Length - N + 1; i < e;) {
379    if (substr(i, N).equals(Str)) {
380      ++Count;
381      i += N;
382    }
383    else
384      ++i;
385  }
386  return Count;
387}
388
389static unsigned GetAutoSenseRadix(StringRef &Str) {
390  if (Str.empty())
391    return 10;
392
393  if (Str.startswith("0x") || Str.startswith("0X")) {
394    Str = Str.substr(2);
395    return 16;
396  }
397
398  if (Str.startswith("0b") || Str.startswith("0B")) {
399    Str = Str.substr(2);
400    return 2;
401  }
402
403  if (Str.startswith("0o")) {
404    Str = Str.substr(2);
405    return 8;
406  }
407
408  if (Str[0] == '0' && Str.size() > 1 && isDigit(Str[1])) {
409    Str = Str.substr(1);
410    return 8;
411  }
412
413  return 10;
414}
415
416bool llvm::consumeUnsignedInteger(StringRef &Str, unsigned Radix,
417                                  unsigned long long &Result) {
418  // Autosense radix if not specified.
419  if (Radix == 0)
420    Radix = GetAutoSenseRadix(Str);
421
422  // Empty strings (after the radix autosense) are invalid.
423  if (Str.empty()) return true;
424
425  // Parse all the bytes of the string given this radix.  Watch for overflow.
426  StringRef Str2 = Str;
427  Result = 0;
428  while (!Str2.empty()) {
429    unsigned CharVal;
430    if (Str2[0] >= '0' && Str2[0] <= '9')
431      CharVal = Str2[0] - '0';
432    else if (Str2[0] >= 'a' && Str2[0] <= 'z')
433      CharVal = Str2[0] - 'a' + 10;
434    else if (Str2[0] >= 'A' && Str2[0] <= 'Z')
435      CharVal = Str2[0] - 'A' + 10;
436    else
437      break;
438
439    // If the parsed value is larger than the integer radix, we cannot
440    // consume any more characters.
441    if (CharVal >= Radix)
442      break;
443
444    // Add in this character.
445    unsigned long long PrevResult = Result;
446    Result = Result * Radix + CharVal;
447
448    // Check for overflow by shifting back and seeing if bits were lost.
449    if (Result / Radix < PrevResult)
450      return true;
451
452    Str2 = Str2.substr(1);
453  }
454
455  // We consider the operation a failure if no characters were consumed
456  // successfully.
457  if (Str.size() == Str2.size())
458    return true;
459
460  Str = Str2;
461  return false;
462}
463
464bool llvm::consumeSignedInteger(StringRef &Str, unsigned Radix,
465                                long long &Result) {
466  unsigned long long ULLVal;
467
468  // Handle positive strings first.
469  if (Str.empty() || Str.front() != '-') {
470    if (consumeUnsignedInteger(Str, Radix, ULLVal) ||
471        // Check for value so large it overflows a signed value.
472        (long long)ULLVal < 0)
473      return true;
474    Result = ULLVal;
475    return false;
476  }
477
478  // Get the positive part of the value.
479  StringRef Str2 = Str.drop_front(1);
480  if (consumeUnsignedInteger(Str2, Radix, ULLVal) ||
481      // Reject values so large they'd overflow as negative signed, but allow
482      // "-0".  This negates the unsigned so that the negative isn't undefined
483      // on signed overflow.
484      (long long)-ULLVal > 0)
485    return true;
486
487  Str = Str2;
488  Result = -ULLVal;
489  return false;
490}
491
492/// GetAsUnsignedInteger - Workhorse method that converts a integer character
493/// sequence of radix up to 36 to an unsigned long long value.
494bool llvm::getAsUnsignedInteger(StringRef Str, unsigned Radix,
495                                unsigned long long &Result) {
496  if (consumeUnsignedInteger(Str, Radix, Result))
497    return true;
498
499  // For getAsUnsignedInteger, we require the whole string to be consumed or
500  // else we consider it a failure.
501  return !Str.empty();
502}
503
504bool llvm::getAsSignedInteger(StringRef Str, unsigned Radix,
505                              long long &Result) {
506  if (consumeSignedInteger(Str, Radix, Result))
507    return true;
508
509  // For getAsSignedInteger, we require the whole string to be consumed or else
510  // we consider it a failure.
511  return !Str.empty();
512}
513
514bool StringRef::getAsInteger(unsigned Radix, APInt &Result) const {
515  StringRef Str = *this;
516
517  // Autosense radix if not specified.
518  if (Radix == 0)
519    Radix = GetAutoSenseRadix(Str);
520
521  assert(Radix > 1 && Radix <= 36);
522
523  // Empty strings (after the radix autosense) are invalid.
524  if (Str.empty()) return true;
525
526  // Skip leading zeroes.  This can be a significant improvement if
527  // it means we don't need > 64 bits.
528  while (!Str.empty() && Str.front() == '0')
529    Str = Str.substr(1);
530
531  // If it was nothing but zeroes....
532  if (Str.empty()) {
533    Result = APInt(64, 0);
534    return false;
535  }
536
537  // (Over-)estimate the required number of bits.
538  unsigned Log2Radix = 0;
539  while ((1U << Log2Radix) < Radix) Log2Radix++;
540  bool IsPowerOf2Radix = ((1U << Log2Radix) == Radix);
541
542  unsigned BitWidth = Log2Radix * Str.size();
543  if (BitWidth < Result.getBitWidth())
544    BitWidth = Result.getBitWidth(); // don't shrink the result
545  else if (BitWidth > Result.getBitWidth())
546    Result = Result.zext(BitWidth);
547
548  APInt RadixAP, CharAP; // unused unless !IsPowerOf2Radix
549  if (!IsPowerOf2Radix) {
550    // These must have the same bit-width as Result.
551    RadixAP = APInt(BitWidth, Radix);
552    CharAP = APInt(BitWidth, 0);
553  }
554
555  // Parse all the bytes of the string given this radix.
556  Result = 0;
557  while (!Str.empty()) {
558    unsigned CharVal;
559    if (Str[0] >= '0' && Str[0] <= '9')
560      CharVal = Str[0]-'0';
561    else if (Str[0] >= 'a' && Str[0] <= 'z')
562      CharVal = Str[0]-'a'+10;
563    else if (Str[0] >= 'A' && Str[0] <= 'Z')
564      CharVal = Str[0]-'A'+10;
565    else
566      return true;
567
568    // If the parsed value is larger than the integer radix, the string is
569    // invalid.
570    if (CharVal >= Radix)
571      return true;
572
573    // Add in this character.
574    if (IsPowerOf2Radix) {
575      Result <<= Log2Radix;
576      Result |= CharVal;
577    } else {
578      Result *= RadixAP;
579      CharAP = CharVal;
580      Result += CharAP;
581    }
582
583    Str = Str.substr(1);
584  }
585
586  return false;
587}
588
589bool StringRef::getAsDouble(double &Result, bool AllowInexact) const {
590  APFloat F(0.0);
591  auto StatusOrErr = F.convertFromString(*this, APFloat::rmNearestTiesToEven);
592  if (errorToBool(StatusOrErr.takeError()))
593    return true;
594
595  APFloat::opStatus Status = *StatusOrErr;
596  if (Status != APFloat::opOK) {
597    if (!AllowInexact || !(Status & APFloat::opInexact))
598      return true;
599  }
600
601  Result = F.convertToDouble();
602  return false;
603}
604
605// Implementation of StringRef hashing.
606hash_code llvm::hash_value(StringRef S) {
607  return hash_combine_range(S.begin(), S.end());
608}
609