1 //===-- StringRef.cpp - Lightweight String References ---------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "llvm/ADT/StringRef.h" 10 #include "llvm/ADT/APFloat.h" 11 #include "llvm/ADT/APInt.h" 12 #include "llvm/ADT/Hashing.h" 13 #include "llvm/ADT/StringExtras.h" 14 #include "llvm/ADT/edit_distance.h" 15 #include "llvm/Support/Error.h" 16 #include <bitset> 17 18 using namespace llvm; 19 20 // MSVC emits references to this into the translation units which reference it. 21 #ifndef _MSC_VER 22 constexpr size_t StringRef::npos; 23 #endif 24 25 // strncasecmp() is not available on non-POSIX systems, so define an 26 // alternative function here. 27 static int ascii_strncasecmp(const char *LHS, const char *RHS, size_t Length) { 28 for (size_t I = 0; I < Length; ++I) { 29 unsigned char LHC = toLower(LHS[I]); 30 unsigned char RHC = toLower(RHS[I]); 31 if (LHC != RHC) 32 return LHC < RHC ? -1 : 1; 33 } 34 return 0; 35 } 36 37 int StringRef::compare_insensitive(StringRef RHS) const { 38 if (int Res = ascii_strncasecmp(Data, RHS.Data, std::min(Length, RHS.Length))) 39 return Res; 40 if (Length == RHS.Length) 41 return 0; 42 return Length < RHS.Length ? -1 : 1; 43 } 44 45 bool StringRef::starts_with_insensitive(StringRef Prefix) const { 46 return Length >= Prefix.Length && 47 ascii_strncasecmp(Data, Prefix.Data, Prefix.Length) == 0; 48 } 49 50 bool StringRef::ends_with_insensitive(StringRef Suffix) const { 51 return Length >= Suffix.Length && 52 ascii_strncasecmp(end() - Suffix.Length, Suffix.Data, Suffix.Length) == 0; 53 } 54 55 size_t StringRef::find_insensitive(char C, size_t From) const { 56 char L = toLower(C); 57 return find_if([L](char D) { return toLower(D) == L; }, From); 58 } 59 60 /// compare_numeric - Compare strings, handle embedded numbers. 61 int StringRef::compare_numeric(StringRef RHS) const { 62 for (size_t I = 0, E = std::min(Length, RHS.Length); I != E; ++I) { 63 // Check for sequences of digits. 64 if (isDigit(Data[I]) && isDigit(RHS.Data[I])) { 65 // The longer sequence of numbers is considered larger. 66 // This doesn't really handle prefixed zeros well. 67 size_t J; 68 for (J = I + 1; J != E + 1; ++J) { 69 bool ld = J < Length && isDigit(Data[J]); 70 bool rd = J < RHS.Length && isDigit(RHS.Data[J]); 71 if (ld != rd) 72 return rd ? -1 : 1; 73 if (!rd) 74 break; 75 } 76 // The two number sequences have the same length (J-I), just memcmp them. 77 if (int Res = compareMemory(Data + I, RHS.Data + I, J - I)) 78 return Res < 0 ? -1 : 1; 79 // Identical number sequences, continue search after the numbers. 80 I = J - 1; 81 continue; 82 } 83 if (Data[I] != RHS.Data[I]) 84 return (unsigned char)Data[I] < (unsigned char)RHS.Data[I] ? -1 : 1; 85 } 86 if (Length == RHS.Length) 87 return 0; 88 return Length < RHS.Length ? -1 : 1; 89 } 90 91 // Compute the edit distance between the two given strings. 92 unsigned StringRef::edit_distance(llvm::StringRef Other, 93 bool AllowReplacements, 94 unsigned MaxEditDistance) const { 95 return llvm::ComputeEditDistance(ArrayRef(data(), size()), 96 ArrayRef(Other.data(), Other.size()), 97 AllowReplacements, MaxEditDistance); 98 } 99 100 unsigned llvm::StringRef::edit_distance_insensitive( 101 StringRef Other, bool AllowReplacements, unsigned MaxEditDistance) const { 102 return llvm::ComputeMappedEditDistance( 103 ArrayRef(data(), size()), ArrayRef(Other.data(), Other.size()), 104 llvm::toLower, AllowReplacements, MaxEditDistance); 105 } 106 107 //===----------------------------------------------------------------------===// 108 // String Operations 109 //===----------------------------------------------------------------------===// 110 111 std::string StringRef::lower() const { 112 return std::string(map_iterator(begin(), toLower), 113 map_iterator(end(), toLower)); 114 } 115 116 std::string StringRef::upper() const { 117 return std::string(map_iterator(begin(), toUpper), 118 map_iterator(end(), toUpper)); 119 } 120 121 //===----------------------------------------------------------------------===// 122 // String Searching 123 //===----------------------------------------------------------------------===// 124 125 126 /// find - Search for the first string \arg Str in the string. 127 /// 128 /// \return - The index of the first occurrence of \arg Str, or npos if not 129 /// found. 130 size_t StringRef::find(StringRef Str, size_t From) const { 131 if (From > Length) 132 return npos; 133 134 const char *Start = Data + From; 135 size_t Size = Length - From; 136 137 const char *Needle = Str.data(); 138 size_t N = Str.size(); 139 if (N == 0) 140 return From; 141 if (Size < N) 142 return npos; 143 if (N == 1) { 144 const char *Ptr = (const char *)::memchr(Start, Needle[0], Size); 145 return Ptr == nullptr ? npos : Ptr - Data; 146 } 147 148 const char *Stop = Start + (Size - N + 1); 149 150 if (N == 2) { 151 // Provide a fast path for newline finding (CRLF case) in InclusionRewriter. 152 // Not the most optimized strategy, but getting memcmp inlined should be 153 // good enough. 154 do { 155 if (std::memcmp(Start, Needle, 2) == 0) 156 return Start - Data; 157 ++Start; 158 } while (Start < Stop); 159 return npos; 160 } 161 162 // For short haystacks or unsupported needles fall back to the naive algorithm 163 if (Size < 16 || N > 255) { 164 do { 165 if (std::memcmp(Start, Needle, N) == 0) 166 return Start - Data; 167 ++Start; 168 } while (Start < Stop); 169 return npos; 170 } 171 172 // Build the bad char heuristic table, with uint8_t to reduce cache thrashing. 173 uint8_t BadCharSkip[256]; 174 std::memset(BadCharSkip, N, 256); 175 for (unsigned i = 0; i != N-1; ++i) 176 BadCharSkip[(uint8_t)Str[i]] = N-1-i; 177 178 do { 179 uint8_t Last = Start[N - 1]; 180 if (LLVM_UNLIKELY(Last == (uint8_t)Needle[N - 1])) 181 if (std::memcmp(Start, Needle, N - 1) == 0) 182 return Start - Data; 183 184 // Otherwise skip the appropriate number of bytes. 185 Start += BadCharSkip[Last]; 186 } while (Start < Stop); 187 188 return npos; 189 } 190 191 size_t StringRef::find_insensitive(StringRef Str, size_t From) const { 192 StringRef This = substr(From); 193 while (This.size() >= Str.size()) { 194 if (This.startswith_insensitive(Str)) 195 return From; 196 This = This.drop_front(); 197 ++From; 198 } 199 return npos; 200 } 201 202 size_t StringRef::rfind_insensitive(char C, size_t From) const { 203 From = std::min(From, Length); 204 size_t i = From; 205 while (i != 0) { 206 --i; 207 if (toLower(Data[i]) == toLower(C)) 208 return i; 209 } 210 return npos; 211 } 212 213 /// rfind - Search for the last string \arg Str in the string. 214 /// 215 /// \return - The index of the last occurrence of \arg Str, or npos if not 216 /// found. 217 size_t StringRef::rfind(StringRef Str) const { 218 size_t N = Str.size(); 219 if (N > Length) 220 return npos; 221 for (size_t i = Length - N + 1, e = 0; i != e;) { 222 --i; 223 if (substr(i, N).equals(Str)) 224 return i; 225 } 226 return npos; 227 } 228 229 size_t StringRef::rfind_insensitive(StringRef Str) const { 230 size_t N = Str.size(); 231 if (N > Length) 232 return npos; 233 for (size_t i = Length - N + 1, e = 0; i != e;) { 234 --i; 235 if (substr(i, N).equals_insensitive(Str)) 236 return i; 237 } 238 return npos; 239 } 240 241 /// find_first_of - Find the first character in the string that is in \arg 242 /// Chars, or npos if not found. 243 /// 244 /// Note: O(size() + Chars.size()) 245 StringRef::size_type StringRef::find_first_of(StringRef Chars, 246 size_t From) const { 247 std::bitset<1 << CHAR_BIT> CharBits; 248 for (char C : Chars) 249 CharBits.set((unsigned char)C); 250 251 for (size_type i = std::min(From, Length), e = Length; i != e; ++i) 252 if (CharBits.test((unsigned char)Data[i])) 253 return i; 254 return npos; 255 } 256 257 /// find_first_not_of - Find the first character in the string that is not 258 /// \arg C or npos if not found. 259 StringRef::size_type StringRef::find_first_not_of(char C, size_t From) const { 260 for (size_type i = std::min(From, Length), e = Length; i != e; ++i) 261 if (Data[i] != C) 262 return i; 263 return npos; 264 } 265 266 /// find_first_not_of - Find the first character in the string that is not 267 /// in the string \arg Chars, or npos if not found. 268 /// 269 /// Note: O(size() + Chars.size()) 270 StringRef::size_type StringRef::find_first_not_of(StringRef Chars, 271 size_t From) const { 272 std::bitset<1 << CHAR_BIT> CharBits; 273 for (char C : Chars) 274 CharBits.set((unsigned char)C); 275 276 for (size_type i = std::min(From, Length), e = Length; i != e; ++i) 277 if (!CharBits.test((unsigned char)Data[i])) 278 return i; 279 return npos; 280 } 281 282 /// find_last_of - Find the last character in the string that is in \arg C, 283 /// or npos if not found. 284 /// 285 /// Note: O(size() + Chars.size()) 286 StringRef::size_type StringRef::find_last_of(StringRef Chars, 287 size_t From) const { 288 std::bitset<1 << CHAR_BIT> CharBits; 289 for (char C : Chars) 290 CharBits.set((unsigned char)C); 291 292 for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i) 293 if (CharBits.test((unsigned char)Data[i])) 294 return i; 295 return npos; 296 } 297 298 /// find_last_not_of - Find the last character in the string that is not 299 /// \arg C, or npos if not found. 300 StringRef::size_type StringRef::find_last_not_of(char C, size_t From) const { 301 for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i) 302 if (Data[i] != C) 303 return i; 304 return npos; 305 } 306 307 /// find_last_not_of - Find the last character in the string that is not in 308 /// \arg Chars, or npos if not found. 309 /// 310 /// Note: O(size() + Chars.size()) 311 StringRef::size_type StringRef::find_last_not_of(StringRef Chars, 312 size_t From) const { 313 std::bitset<1 << CHAR_BIT> CharBits; 314 for (char C : Chars) 315 CharBits.set((unsigned char)C); 316 317 for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i) 318 if (!CharBits.test((unsigned char)Data[i])) 319 return i; 320 return npos; 321 } 322 323 void StringRef::split(SmallVectorImpl<StringRef> &A, 324 StringRef Separator, int MaxSplit, 325 bool KeepEmpty) const { 326 StringRef S = *this; 327 328 // Count down from MaxSplit. When MaxSplit is -1, this will just split 329 // "forever". This doesn't support splitting more than 2^31 times 330 // intentionally; if we ever want that we can make MaxSplit a 64-bit integer 331 // but that seems unlikely to be useful. 332 while (MaxSplit-- != 0) { 333 size_t Idx = S.find(Separator); 334 if (Idx == npos) 335 break; 336 337 // Push this split. 338 if (KeepEmpty || Idx > 0) 339 A.push_back(S.slice(0, Idx)); 340 341 // Jump forward. 342 S = S.slice(Idx + Separator.size(), npos); 343 } 344 345 // Push the tail. 346 if (KeepEmpty || !S.empty()) 347 A.push_back(S); 348 } 349 350 void StringRef::split(SmallVectorImpl<StringRef> &A, char Separator, 351 int MaxSplit, bool KeepEmpty) const { 352 StringRef S = *this; 353 354 // Count down from MaxSplit. When MaxSplit is -1, this will just split 355 // "forever". This doesn't support splitting more than 2^31 times 356 // intentionally; if we ever want that we can make MaxSplit a 64-bit integer 357 // but that seems unlikely to be useful. 358 while (MaxSplit-- != 0) { 359 size_t Idx = S.find(Separator); 360 if (Idx == npos) 361 break; 362 363 // Push this split. 364 if (KeepEmpty || Idx > 0) 365 A.push_back(S.slice(0, Idx)); 366 367 // Jump forward. 368 S = S.slice(Idx + 1, npos); 369 } 370 371 // Push the tail. 372 if (KeepEmpty || !S.empty()) 373 A.push_back(S); 374 } 375 376 //===----------------------------------------------------------------------===// 377 // Helpful Algorithms 378 //===----------------------------------------------------------------------===// 379 380 /// count - Return the number of non-overlapped occurrences of \arg Str in 381 /// the string. 382 size_t StringRef::count(StringRef Str) const { 383 size_t Count = 0; 384 size_t Pos = 0; 385 size_t N = Str.size(); 386 // TODO: For an empty `Str` we return 0 for legacy reasons. Consider changing 387 // this to `Length + 1` which is more in-line with the function 388 // description. 389 if (!N) 390 return 0; 391 while ((Pos = find(Str, Pos)) != npos) { 392 ++Count; 393 Pos += N; 394 } 395 return Count; 396 } 397 398 static unsigned GetAutoSenseRadix(StringRef &Str) { 399 if (Str.empty()) 400 return 10; 401 402 if (Str.startswith("0x") || Str.startswith("0X")) { 403 Str = Str.substr(2); 404 return 16; 405 } 406 407 if (Str.startswith("0b") || Str.startswith("0B")) { 408 Str = Str.substr(2); 409 return 2; 410 } 411 412 if (Str.startswith("0o")) { 413 Str = Str.substr(2); 414 return 8; 415 } 416 417 if (Str[0] == '0' && Str.size() > 1 && isDigit(Str[1])) { 418 Str = Str.substr(1); 419 return 8; 420 } 421 422 return 10; 423 } 424 425 bool llvm::consumeUnsignedInteger(StringRef &Str, unsigned Radix, 426 unsigned long long &Result) { 427 // Autosense radix if not specified. 428 if (Radix == 0) 429 Radix = GetAutoSenseRadix(Str); 430 431 // Empty strings (after the radix autosense) are invalid. 432 if (Str.empty()) return true; 433 434 // Parse all the bytes of the string given this radix. Watch for overflow. 435 StringRef Str2 = Str; 436 Result = 0; 437 while (!Str2.empty()) { 438 unsigned CharVal; 439 if (Str2[0] >= '0' && Str2[0] <= '9') 440 CharVal = Str2[0] - '0'; 441 else if (Str2[0] >= 'a' && Str2[0] <= 'z') 442 CharVal = Str2[0] - 'a' + 10; 443 else if (Str2[0] >= 'A' && Str2[0] <= 'Z') 444 CharVal = Str2[0] - 'A' + 10; 445 else 446 break; 447 448 // If the parsed value is larger than the integer radix, we cannot 449 // consume any more characters. 450 if (CharVal >= Radix) 451 break; 452 453 // Add in this character. 454 unsigned long long PrevResult = Result; 455 Result = Result * Radix + CharVal; 456 457 // Check for overflow by shifting back and seeing if bits were lost. 458 if (Result / Radix < PrevResult) 459 return true; 460 461 Str2 = Str2.substr(1); 462 } 463 464 // We consider the operation a failure if no characters were consumed 465 // successfully. 466 if (Str.size() == Str2.size()) 467 return true; 468 469 Str = Str2; 470 return false; 471 } 472 473 bool llvm::consumeSignedInteger(StringRef &Str, unsigned Radix, 474 long long &Result) { 475 unsigned long long ULLVal; 476 477 // Handle positive strings first. 478 if (Str.empty() || Str.front() != '-') { 479 if (consumeUnsignedInteger(Str, Radix, ULLVal) || 480 // Check for value so large it overflows a signed value. 481 (long long)ULLVal < 0) 482 return true; 483 Result = ULLVal; 484 return false; 485 } 486 487 // Get the positive part of the value. 488 StringRef Str2 = Str.drop_front(1); 489 if (consumeUnsignedInteger(Str2, Radix, ULLVal) || 490 // Reject values so large they'd overflow as negative signed, but allow 491 // "-0". This negates the unsigned so that the negative isn't undefined 492 // on signed overflow. 493 (long long)-ULLVal > 0) 494 return true; 495 496 Str = Str2; 497 Result = -ULLVal; 498 return false; 499 } 500 501 /// GetAsUnsignedInteger - Workhorse method that converts a integer character 502 /// sequence of radix up to 36 to an unsigned long long value. 503 bool llvm::getAsUnsignedInteger(StringRef Str, unsigned Radix, 504 unsigned long long &Result) { 505 if (consumeUnsignedInteger(Str, Radix, Result)) 506 return true; 507 508 // For getAsUnsignedInteger, we require the whole string to be consumed or 509 // else we consider it a failure. 510 return !Str.empty(); 511 } 512 513 bool llvm::getAsSignedInteger(StringRef Str, unsigned Radix, 514 long long &Result) { 515 if (consumeSignedInteger(Str, Radix, Result)) 516 return true; 517 518 // For getAsSignedInteger, we require the whole string to be consumed or else 519 // we consider it a failure. 520 return !Str.empty(); 521 } 522 523 bool StringRef::getAsInteger(unsigned Radix, APInt &Result) const { 524 StringRef Str = *this; 525 526 // Autosense radix if not specified. 527 if (Radix == 0) 528 Radix = GetAutoSenseRadix(Str); 529 530 assert(Radix > 1 && Radix <= 36); 531 532 // Empty strings (after the radix autosense) are invalid. 533 if (Str.empty()) return true; 534 535 // Skip leading zeroes. This can be a significant improvement if 536 // it means we don't need > 64 bits. 537 while (!Str.empty() && Str.front() == '0') 538 Str = Str.substr(1); 539 540 // If it was nothing but zeroes.... 541 if (Str.empty()) { 542 Result = APInt(64, 0); 543 return false; 544 } 545 546 // (Over-)estimate the required number of bits. 547 unsigned Log2Radix = 0; 548 while ((1U << Log2Radix) < Radix) Log2Radix++; 549 bool IsPowerOf2Radix = ((1U << Log2Radix) == Radix); 550 551 unsigned BitWidth = Log2Radix * Str.size(); 552 if (BitWidth < Result.getBitWidth()) 553 BitWidth = Result.getBitWidth(); // don't shrink the result 554 else if (BitWidth > Result.getBitWidth()) 555 Result = Result.zext(BitWidth); 556 557 APInt RadixAP, CharAP; // unused unless !IsPowerOf2Radix 558 if (!IsPowerOf2Radix) { 559 // These must have the same bit-width as Result. 560 RadixAP = APInt(BitWidth, Radix); 561 CharAP = APInt(BitWidth, 0); 562 } 563 564 // Parse all the bytes of the string given this radix. 565 Result = 0; 566 while (!Str.empty()) { 567 unsigned CharVal; 568 if (Str[0] >= '0' && Str[0] <= '9') 569 CharVal = Str[0]-'0'; 570 else if (Str[0] >= 'a' && Str[0] <= 'z') 571 CharVal = Str[0]-'a'+10; 572 else if (Str[0] >= 'A' && Str[0] <= 'Z') 573 CharVal = Str[0]-'A'+10; 574 else 575 return true; 576 577 // If the parsed value is larger than the integer radix, the string is 578 // invalid. 579 if (CharVal >= Radix) 580 return true; 581 582 // Add in this character. 583 if (IsPowerOf2Radix) { 584 Result <<= Log2Radix; 585 Result |= CharVal; 586 } else { 587 Result *= RadixAP; 588 CharAP = CharVal; 589 Result += CharAP; 590 } 591 592 Str = Str.substr(1); 593 } 594 595 return false; 596 } 597 598 bool StringRef::getAsDouble(double &Result, bool AllowInexact) const { 599 APFloat F(0.0); 600 auto StatusOrErr = F.convertFromString(*this, APFloat::rmNearestTiesToEven); 601 if (errorToBool(StatusOrErr.takeError())) 602 return true; 603 604 APFloat::opStatus Status = *StatusOrErr; 605 if (Status != APFloat::opOK) { 606 if (!AllowInexact || !(Status & APFloat::opInexact)) 607 return true; 608 } 609 610 Result = F.convertToDouble(); 611 return false; 612 } 613 614 // Implementation of StringRef hashing. 615 hash_code llvm::hash_value(StringRef S) { 616 return hash_combine_range(S.begin(), S.end()); 617 } 618 619 unsigned DenseMapInfo<StringRef, void>::getHashValue(StringRef Val) { 620 assert(Val.data() != getEmptyKey().data() && 621 "Cannot hash the empty key!"); 622 assert(Val.data() != getTombstoneKey().data() && 623 "Cannot hash the tombstone key!"); 624 return (unsigned)(hash_value(Val)); 625 } 626