1 //===-- StringRef.cpp - Lightweight String References ---------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "llvm/ADT/StringRef.h" 10 #include "llvm/ADT/APFloat.h" 11 #include "llvm/ADT/APInt.h" 12 #include "llvm/ADT/Hashing.h" 13 #include "llvm/ADT/StringExtras.h" 14 #include "llvm/ADT/edit_distance.h" 15 #include "llvm/Support/Error.h" 16 #include <bitset> 17 18 using namespace llvm; 19 20 // MSVC emits references to this into the translation units which reference it. 21 #ifndef _MSC_VER 22 constexpr size_t StringRef::npos; 23 #endif 24 25 // strncasecmp() is not available on non-POSIX systems, so define an 26 // alternative function here. 27 static int ascii_strncasecmp(const char *LHS, const char *RHS, size_t Length) { 28 for (size_t I = 0; I < Length; ++I) { 29 unsigned char LHC = toLower(LHS[I]); 30 unsigned char RHC = toLower(RHS[I]); 31 if (LHC != RHC) 32 return LHC < RHC ? -1 : 1; 33 } 34 return 0; 35 } 36 37 int StringRef::compare_insensitive(StringRef RHS) const { 38 if (int Res = ascii_strncasecmp(Data, RHS.Data, std::min(Length, RHS.Length))) 39 return Res; 40 if (Length == RHS.Length) 41 return 0; 42 return Length < RHS.Length ? -1 : 1; 43 } 44 45 bool StringRef::starts_with_insensitive(StringRef Prefix) const { 46 return Length >= Prefix.Length && 47 ascii_strncasecmp(Data, Prefix.Data, Prefix.Length) == 0; 48 } 49 50 bool StringRef::ends_with_insensitive(StringRef Suffix) const { 51 return Length >= Suffix.Length && 52 ascii_strncasecmp(end() - Suffix.Length, Suffix.Data, Suffix.Length) == 0; 53 } 54 55 size_t StringRef::find_insensitive(char C, size_t From) const { 56 char L = toLower(C); 57 return find_if([L](char D) { return toLower(D) == L; }, From); 58 } 59 60 /// compare_numeric - Compare strings, handle embedded numbers. 61 int StringRef::compare_numeric(StringRef RHS) const { 62 for (size_t I = 0, E = std::min(Length, RHS.Length); I != E; ++I) { 63 // Check for sequences of digits. 64 if (isDigit(Data[I]) && isDigit(RHS.Data[I])) { 65 // The longer sequence of numbers is considered larger. 66 // This doesn't really handle prefixed zeros well. 67 size_t J; 68 for (J = I + 1; J != E + 1; ++J) { 69 bool ld = J < Length && isDigit(Data[J]); 70 bool rd = J < RHS.Length && isDigit(RHS.Data[J]); 71 if (ld != rd) 72 return rd ? -1 : 1; 73 if (!rd) 74 break; 75 } 76 // The two number sequences have the same length (J-I), just memcmp them. 77 if (int Res = compareMemory(Data + I, RHS.Data + I, J - I)) 78 return Res < 0 ? -1 : 1; 79 // Identical number sequences, continue search after the numbers. 80 I = J - 1; 81 continue; 82 } 83 if (Data[I] != RHS.Data[I]) 84 return (unsigned char)Data[I] < (unsigned char)RHS.Data[I] ? -1 : 1; 85 } 86 if (Length == RHS.Length) 87 return 0; 88 return Length < RHS.Length ? -1 : 1; 89 } 90 91 // Compute the edit distance between the two given strings. 92 unsigned StringRef::edit_distance(llvm::StringRef Other, 93 bool AllowReplacements, 94 unsigned MaxEditDistance) const { 95 return llvm::ComputeEditDistance( 96 makeArrayRef(data(), size()), 97 makeArrayRef(Other.data(), Other.size()), 98 AllowReplacements, MaxEditDistance); 99 } 100 101 unsigned llvm::StringRef::edit_distance_insensitive( 102 StringRef Other, bool AllowReplacements, unsigned MaxEditDistance) const { 103 return llvm::ComputeMappedEditDistance( 104 makeArrayRef(data(), size()), makeArrayRef(Other.data(), Other.size()), 105 llvm::toLower, AllowReplacements, MaxEditDistance); 106 } 107 108 //===----------------------------------------------------------------------===// 109 // String Operations 110 //===----------------------------------------------------------------------===// 111 112 std::string StringRef::lower() const { 113 return std::string(map_iterator(begin(), toLower), 114 map_iterator(end(), toLower)); 115 } 116 117 std::string StringRef::upper() const { 118 return std::string(map_iterator(begin(), toUpper), 119 map_iterator(end(), toUpper)); 120 } 121 122 //===----------------------------------------------------------------------===// 123 // String Searching 124 //===----------------------------------------------------------------------===// 125 126 127 /// find - Search for the first string \arg Str in the string. 128 /// 129 /// \return - The index of the first occurrence of \arg Str, or npos if not 130 /// found. 131 size_t StringRef::find(StringRef Str, size_t From) const { 132 if (From > Length) 133 return npos; 134 135 const char *Start = Data + From; 136 size_t Size = Length - From; 137 138 const char *Needle = Str.data(); 139 size_t N = Str.size(); 140 if (N == 0) 141 return From; 142 if (Size < N) 143 return npos; 144 if (N == 1) { 145 const char *Ptr = (const char *)::memchr(Start, Needle[0], Size); 146 return Ptr == nullptr ? npos : Ptr - Data; 147 } 148 149 const char *Stop = Start + (Size - N + 1); 150 151 if (N == 2) { 152 // Provide a fast path for newline finding (CRLF case) in InclusionRewriter. 153 // Not the most optimized strategy, but getting memcmp inlined should be 154 // good enough. 155 do { 156 if (std::memcmp(Start, Needle, 2) == 0) 157 return Start - Data; 158 ++Start; 159 } while (Start < Stop); 160 return npos; 161 } 162 163 // For short haystacks or unsupported needles fall back to the naive algorithm 164 if (Size < 16 || N > 255) { 165 do { 166 if (std::memcmp(Start, Needle, N) == 0) 167 return Start - Data; 168 ++Start; 169 } while (Start < Stop); 170 return npos; 171 } 172 173 // Build the bad char heuristic table, with uint8_t to reduce cache thrashing. 174 uint8_t BadCharSkip[256]; 175 std::memset(BadCharSkip, N, 256); 176 for (unsigned i = 0; i != N-1; ++i) 177 BadCharSkip[(uint8_t)Str[i]] = N-1-i; 178 179 do { 180 uint8_t Last = Start[N - 1]; 181 if (LLVM_UNLIKELY(Last == (uint8_t)Needle[N - 1])) 182 if (std::memcmp(Start, Needle, N - 1) == 0) 183 return Start - Data; 184 185 // Otherwise skip the appropriate number of bytes. 186 Start += BadCharSkip[Last]; 187 } while (Start < Stop); 188 189 return npos; 190 } 191 192 size_t StringRef::find_insensitive(StringRef Str, size_t From) const { 193 StringRef This = substr(From); 194 while (This.size() >= Str.size()) { 195 if (This.startswith_insensitive(Str)) 196 return From; 197 This = This.drop_front(); 198 ++From; 199 } 200 return npos; 201 } 202 203 size_t StringRef::rfind_insensitive(char C, size_t From) const { 204 From = std::min(From, Length); 205 size_t i = From; 206 while (i != 0) { 207 --i; 208 if (toLower(Data[i]) == toLower(C)) 209 return i; 210 } 211 return npos; 212 } 213 214 /// rfind - Search for the last string \arg Str in the string. 215 /// 216 /// \return - The index of the last occurrence of \arg Str, or npos if not 217 /// found. 218 size_t StringRef::rfind(StringRef Str) const { 219 size_t N = Str.size(); 220 if (N > Length) 221 return npos; 222 for (size_t i = Length - N + 1, e = 0; i != e;) { 223 --i; 224 if (substr(i, N).equals(Str)) 225 return i; 226 } 227 return npos; 228 } 229 230 size_t StringRef::rfind_insensitive(StringRef Str) const { 231 size_t N = Str.size(); 232 if (N > Length) 233 return npos; 234 for (size_t i = Length - N + 1, e = 0; i != e;) { 235 --i; 236 if (substr(i, N).equals_insensitive(Str)) 237 return i; 238 } 239 return npos; 240 } 241 242 /// find_first_of - Find the first character in the string that is in \arg 243 /// Chars, or npos if not found. 244 /// 245 /// Note: O(size() + Chars.size()) 246 StringRef::size_type StringRef::find_first_of(StringRef Chars, 247 size_t From) const { 248 std::bitset<1 << CHAR_BIT> CharBits; 249 for (char C : Chars) 250 CharBits.set((unsigned char)C); 251 252 for (size_type i = std::min(From, Length), e = Length; i != e; ++i) 253 if (CharBits.test((unsigned char)Data[i])) 254 return i; 255 return npos; 256 } 257 258 /// find_first_not_of - Find the first character in the string that is not 259 /// \arg C or npos if not found. 260 StringRef::size_type StringRef::find_first_not_of(char C, size_t From) const { 261 for (size_type i = std::min(From, Length), e = Length; i != e; ++i) 262 if (Data[i] != C) 263 return i; 264 return npos; 265 } 266 267 /// find_first_not_of - Find the first character in the string that is not 268 /// in the string \arg Chars, or npos if not found. 269 /// 270 /// Note: O(size() + Chars.size()) 271 StringRef::size_type StringRef::find_first_not_of(StringRef Chars, 272 size_t From) const { 273 std::bitset<1 << CHAR_BIT> CharBits; 274 for (char C : Chars) 275 CharBits.set((unsigned char)C); 276 277 for (size_type i = std::min(From, Length), e = Length; i != e; ++i) 278 if (!CharBits.test((unsigned char)Data[i])) 279 return i; 280 return npos; 281 } 282 283 /// find_last_of - Find the last character in the string that is in \arg C, 284 /// or npos if not found. 285 /// 286 /// Note: O(size() + Chars.size()) 287 StringRef::size_type StringRef::find_last_of(StringRef Chars, 288 size_t From) const { 289 std::bitset<1 << CHAR_BIT> CharBits; 290 for (char C : Chars) 291 CharBits.set((unsigned char)C); 292 293 for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i) 294 if (CharBits.test((unsigned char)Data[i])) 295 return i; 296 return npos; 297 } 298 299 /// find_last_not_of - Find the last character in the string that is not 300 /// \arg C, or npos if not found. 301 StringRef::size_type StringRef::find_last_not_of(char C, size_t From) const { 302 for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i) 303 if (Data[i] != C) 304 return i; 305 return npos; 306 } 307 308 /// find_last_not_of - Find the last character in the string that is not in 309 /// \arg Chars, or npos if not found. 310 /// 311 /// Note: O(size() + Chars.size()) 312 StringRef::size_type StringRef::find_last_not_of(StringRef Chars, 313 size_t From) const { 314 std::bitset<1 << CHAR_BIT> CharBits; 315 for (char C : Chars) 316 CharBits.set((unsigned char)C); 317 318 for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i) 319 if (!CharBits.test((unsigned char)Data[i])) 320 return i; 321 return npos; 322 } 323 324 void StringRef::split(SmallVectorImpl<StringRef> &A, 325 StringRef Separator, int MaxSplit, 326 bool KeepEmpty) const { 327 StringRef S = *this; 328 329 // Count down from MaxSplit. When MaxSplit is -1, this will just split 330 // "forever". This doesn't support splitting more than 2^31 times 331 // intentionally; if we ever want that we can make MaxSplit a 64-bit integer 332 // but that seems unlikely to be useful. 333 while (MaxSplit-- != 0) { 334 size_t Idx = S.find(Separator); 335 if (Idx == npos) 336 break; 337 338 // Push this split. 339 if (KeepEmpty || Idx > 0) 340 A.push_back(S.slice(0, Idx)); 341 342 // Jump forward. 343 S = S.slice(Idx + Separator.size(), npos); 344 } 345 346 // Push the tail. 347 if (KeepEmpty || !S.empty()) 348 A.push_back(S); 349 } 350 351 void StringRef::split(SmallVectorImpl<StringRef> &A, char Separator, 352 int MaxSplit, bool KeepEmpty) const { 353 StringRef S = *this; 354 355 // Count down from MaxSplit. When MaxSplit is -1, this will just split 356 // "forever". This doesn't support splitting more than 2^31 times 357 // intentionally; if we ever want that we can make MaxSplit a 64-bit integer 358 // but that seems unlikely to be useful. 359 while (MaxSplit-- != 0) { 360 size_t Idx = S.find(Separator); 361 if (Idx == npos) 362 break; 363 364 // Push this split. 365 if (KeepEmpty || Idx > 0) 366 A.push_back(S.slice(0, Idx)); 367 368 // Jump forward. 369 S = S.slice(Idx + 1, npos); 370 } 371 372 // Push the tail. 373 if (KeepEmpty || !S.empty()) 374 A.push_back(S); 375 } 376 377 //===----------------------------------------------------------------------===// 378 // Helpful Algorithms 379 //===----------------------------------------------------------------------===// 380 381 /// count - Return the number of non-overlapped occurrences of \arg Str in 382 /// the string. 383 size_t StringRef::count(StringRef Str) const { 384 size_t Count = 0; 385 size_t N = Str.size(); 386 if (!N || N > Length) 387 return 0; 388 for (size_t i = 0, e = Length - N + 1; i < e;) { 389 if (substr(i, N).equals(Str)) { 390 ++Count; 391 i += N; 392 } 393 else 394 ++i; 395 } 396 return Count; 397 } 398 399 static unsigned GetAutoSenseRadix(StringRef &Str) { 400 if (Str.empty()) 401 return 10; 402 403 if (Str.startswith("0x") || Str.startswith("0X")) { 404 Str = Str.substr(2); 405 return 16; 406 } 407 408 if (Str.startswith("0b") || Str.startswith("0B")) { 409 Str = Str.substr(2); 410 return 2; 411 } 412 413 if (Str.startswith("0o")) { 414 Str = Str.substr(2); 415 return 8; 416 } 417 418 if (Str[0] == '0' && Str.size() > 1 && isDigit(Str[1])) { 419 Str = Str.substr(1); 420 return 8; 421 } 422 423 return 10; 424 } 425 426 bool llvm::consumeUnsignedInteger(StringRef &Str, unsigned Radix, 427 unsigned long long &Result) { 428 // Autosense radix if not specified. 429 if (Radix == 0) 430 Radix = GetAutoSenseRadix(Str); 431 432 // Empty strings (after the radix autosense) are invalid. 433 if (Str.empty()) return true; 434 435 // Parse all the bytes of the string given this radix. Watch for overflow. 436 StringRef Str2 = Str; 437 Result = 0; 438 while (!Str2.empty()) { 439 unsigned CharVal; 440 if (Str2[0] >= '0' && Str2[0] <= '9') 441 CharVal = Str2[0] - '0'; 442 else if (Str2[0] >= 'a' && Str2[0] <= 'z') 443 CharVal = Str2[0] - 'a' + 10; 444 else if (Str2[0] >= 'A' && Str2[0] <= 'Z') 445 CharVal = Str2[0] - 'A' + 10; 446 else 447 break; 448 449 // If the parsed value is larger than the integer radix, we cannot 450 // consume any more characters. 451 if (CharVal >= Radix) 452 break; 453 454 // Add in this character. 455 unsigned long long PrevResult = Result; 456 Result = Result * Radix + CharVal; 457 458 // Check for overflow by shifting back and seeing if bits were lost. 459 if (Result / Radix < PrevResult) 460 return true; 461 462 Str2 = Str2.substr(1); 463 } 464 465 // We consider the operation a failure if no characters were consumed 466 // successfully. 467 if (Str.size() == Str2.size()) 468 return true; 469 470 Str = Str2; 471 return false; 472 } 473 474 bool llvm::consumeSignedInteger(StringRef &Str, unsigned Radix, 475 long long &Result) { 476 unsigned long long ULLVal; 477 478 // Handle positive strings first. 479 if (Str.empty() || Str.front() != '-') { 480 if (consumeUnsignedInteger(Str, Radix, ULLVal) || 481 // Check for value so large it overflows a signed value. 482 (long long)ULLVal < 0) 483 return true; 484 Result = ULLVal; 485 return false; 486 } 487 488 // Get the positive part of the value. 489 StringRef Str2 = Str.drop_front(1); 490 if (consumeUnsignedInteger(Str2, Radix, ULLVal) || 491 // Reject values so large they'd overflow as negative signed, but allow 492 // "-0". This negates the unsigned so that the negative isn't undefined 493 // on signed overflow. 494 (long long)-ULLVal > 0) 495 return true; 496 497 Str = Str2; 498 Result = -ULLVal; 499 return false; 500 } 501 502 /// GetAsUnsignedInteger - Workhorse method that converts a integer character 503 /// sequence of radix up to 36 to an unsigned long long value. 504 bool llvm::getAsUnsignedInteger(StringRef Str, unsigned Radix, 505 unsigned long long &Result) { 506 if (consumeUnsignedInteger(Str, Radix, Result)) 507 return true; 508 509 // For getAsUnsignedInteger, we require the whole string to be consumed or 510 // else we consider it a failure. 511 return !Str.empty(); 512 } 513 514 bool llvm::getAsSignedInteger(StringRef Str, unsigned Radix, 515 long long &Result) { 516 if (consumeSignedInteger(Str, Radix, Result)) 517 return true; 518 519 // For getAsSignedInteger, we require the whole string to be consumed or else 520 // we consider it a failure. 521 return !Str.empty(); 522 } 523 524 bool StringRef::getAsInteger(unsigned Radix, APInt &Result) const { 525 StringRef Str = *this; 526 527 // Autosense radix if not specified. 528 if (Radix == 0) 529 Radix = GetAutoSenseRadix(Str); 530 531 assert(Radix > 1 && Radix <= 36); 532 533 // Empty strings (after the radix autosense) are invalid. 534 if (Str.empty()) return true; 535 536 // Skip leading zeroes. This can be a significant improvement if 537 // it means we don't need > 64 bits. 538 while (!Str.empty() && Str.front() == '0') 539 Str = Str.substr(1); 540 541 // If it was nothing but zeroes.... 542 if (Str.empty()) { 543 Result = APInt(64, 0); 544 return false; 545 } 546 547 // (Over-)estimate the required number of bits. 548 unsigned Log2Radix = 0; 549 while ((1U << Log2Radix) < Radix) Log2Radix++; 550 bool IsPowerOf2Radix = ((1U << Log2Radix) == Radix); 551 552 unsigned BitWidth = Log2Radix * Str.size(); 553 if (BitWidth < Result.getBitWidth()) 554 BitWidth = Result.getBitWidth(); // don't shrink the result 555 else if (BitWidth > Result.getBitWidth()) 556 Result = Result.zext(BitWidth); 557 558 APInt RadixAP, CharAP; // unused unless !IsPowerOf2Radix 559 if (!IsPowerOf2Radix) { 560 // These must have the same bit-width as Result. 561 RadixAP = APInt(BitWidth, Radix); 562 CharAP = APInt(BitWidth, 0); 563 } 564 565 // Parse all the bytes of the string given this radix. 566 Result = 0; 567 while (!Str.empty()) { 568 unsigned CharVal; 569 if (Str[0] >= '0' && Str[0] <= '9') 570 CharVal = Str[0]-'0'; 571 else if (Str[0] >= 'a' && Str[0] <= 'z') 572 CharVal = Str[0]-'a'+10; 573 else if (Str[0] >= 'A' && Str[0] <= 'Z') 574 CharVal = Str[0]-'A'+10; 575 else 576 return true; 577 578 // If the parsed value is larger than the integer radix, the string is 579 // invalid. 580 if (CharVal >= Radix) 581 return true; 582 583 // Add in this character. 584 if (IsPowerOf2Radix) { 585 Result <<= Log2Radix; 586 Result |= CharVal; 587 } else { 588 Result *= RadixAP; 589 CharAP = CharVal; 590 Result += CharAP; 591 } 592 593 Str = Str.substr(1); 594 } 595 596 return false; 597 } 598 599 bool StringRef::getAsDouble(double &Result, bool AllowInexact) const { 600 APFloat F(0.0); 601 auto StatusOrErr = F.convertFromString(*this, APFloat::rmNearestTiesToEven); 602 if (errorToBool(StatusOrErr.takeError())) 603 return true; 604 605 APFloat::opStatus Status = *StatusOrErr; 606 if (Status != APFloat::opOK) { 607 if (!AllowInexact || !(Status & APFloat::opInexact)) 608 return true; 609 } 610 611 Result = F.convertToDouble(); 612 return false; 613 } 614 615 // Implementation of StringRef hashing. 616 hash_code llvm::hash_value(StringRef S) { 617 return hash_combine_range(S.begin(), S.end()); 618 } 619 620 unsigned DenseMapInfo<StringRef, void>::getHashValue(StringRef Val) { 621 assert(Val.data() != getEmptyKey().data() && 622 "Cannot hash the empty key!"); 623 assert(Val.data() != getTombstoneKey().data() && 624 "Cannot hash the tombstone key!"); 625 return (unsigned)(hash_value(Val)); 626 } 627