1 //===-- StringRef.cpp - Lightweight String References ---------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 10 #include "llvm/ADT/StringRef.h" 11 #include "llvm/ADT/APInt.h" 12 #include "llvm/ADT/Hashing.h" 13 #include "llvm/ADT/edit_distance.h" 14 #include <bitset> 15 16 using namespace llvm; 17 18 // MSVC emits references to this into the translation units which reference it. 19 #ifndef _MSC_VER 20 const size_t StringRef::npos; 21 #endif 22 23 static char ascii_tolower(char x) { 24 if (x >= 'A' && x <= 'Z') 25 return x - 'A' + 'a'; 26 return x; 27 } 28 29 static char ascii_toupper(char x) { 30 if (x >= 'a' && x <= 'z') 31 return x - 'a' + 'A'; 32 return x; 33 } 34 35 static bool ascii_isdigit(char x) { 36 return x >= '0' && x <= '9'; 37 } 38 39 // strncasecmp() is not available on non-POSIX systems, so define an 40 // alternative function here. 41 static int ascii_strncasecmp(const char *LHS, const char *RHS, size_t Length) { 42 for (size_t I = 0; I < Length; ++I) { 43 unsigned char LHC = ascii_tolower(LHS[I]); 44 unsigned char RHC = ascii_tolower(RHS[I]); 45 if (LHC != RHC) 46 return LHC < RHC ? -1 : 1; 47 } 48 return 0; 49 } 50 51 /// compare_lower - Compare strings, ignoring case. 52 int StringRef::compare_lower(StringRef RHS) const { 53 if (int Res = ascii_strncasecmp(Data, RHS.Data, std::min(Length, RHS.Length))) 54 return Res; 55 if (Length == RHS.Length) 56 return 0; 57 return Length < RHS.Length ? -1 : 1; 58 } 59 60 /// Check if this string starts with the given \p Prefix, ignoring case. 61 bool StringRef::startswith_lower(StringRef Prefix) const { 62 return Length >= Prefix.Length && 63 ascii_strncasecmp(Data, Prefix.Data, Prefix.Length) == 0; 64 } 65 66 /// Check if this string ends with the given \p Suffix, ignoring case. 67 bool StringRef::endswith_lower(StringRef Suffix) const { 68 return Length >= Suffix.Length && 69 ascii_strncasecmp(end() - Suffix.Length, Suffix.Data, Suffix.Length) == 0; 70 } 71 72 size_t StringRef::find_lower(char C, size_t From) const { 73 char L = ascii_tolower(C); 74 return find_if([L](char D) { return ascii_tolower(D) == L; }, From); 75 } 76 77 /// compare_numeric - Compare strings, handle embedded numbers. 78 int StringRef::compare_numeric(StringRef RHS) const { 79 for (size_t I = 0, E = std::min(Length, RHS.Length); I != E; ++I) { 80 // Check for sequences of digits. 81 if (ascii_isdigit(Data[I]) && ascii_isdigit(RHS.Data[I])) { 82 // The longer sequence of numbers is considered larger. 83 // This doesn't really handle prefixed zeros well. 84 size_t J; 85 for (J = I + 1; J != E + 1; ++J) { 86 bool ld = J < Length && ascii_isdigit(Data[J]); 87 bool rd = J < RHS.Length && ascii_isdigit(RHS.Data[J]); 88 if (ld != rd) 89 return rd ? -1 : 1; 90 if (!rd) 91 break; 92 } 93 // The two number sequences have the same length (J-I), just memcmp them. 94 if (int Res = compareMemory(Data + I, RHS.Data + I, J - I)) 95 return Res < 0 ? -1 : 1; 96 // Identical number sequences, continue search after the numbers. 97 I = J - 1; 98 continue; 99 } 100 if (Data[I] != RHS.Data[I]) 101 return (unsigned char)Data[I] < (unsigned char)RHS.Data[I] ? -1 : 1; 102 } 103 if (Length == RHS.Length) 104 return 0; 105 return Length < RHS.Length ? -1 : 1; 106 } 107 108 // Compute the edit distance between the two given strings. 109 unsigned StringRef::edit_distance(llvm::StringRef Other, 110 bool AllowReplacements, 111 unsigned MaxEditDistance) const { 112 return llvm::ComputeEditDistance( 113 makeArrayRef(data(), size()), 114 makeArrayRef(Other.data(), Other.size()), 115 AllowReplacements, MaxEditDistance); 116 } 117 118 //===----------------------------------------------------------------------===// 119 // String Operations 120 //===----------------------------------------------------------------------===// 121 122 std::string StringRef::lower() const { 123 std::string Result(size(), char()); 124 for (size_type i = 0, e = size(); i != e; ++i) { 125 Result[i] = ascii_tolower(Data[i]); 126 } 127 return Result; 128 } 129 130 std::string StringRef::upper() const { 131 std::string Result(size(), char()); 132 for (size_type i = 0, e = size(); i != e; ++i) { 133 Result[i] = ascii_toupper(Data[i]); 134 } 135 return Result; 136 } 137 138 //===----------------------------------------------------------------------===// 139 // String Searching 140 //===----------------------------------------------------------------------===// 141 142 143 /// find - Search for the first string \arg Str in the string. 144 /// 145 /// \return - The index of the first occurrence of \arg Str, or npos if not 146 /// found. 147 size_t StringRef::find(StringRef Str, size_t From) const { 148 if (From > Length) 149 return npos; 150 151 const char *Needle = Str.data(); 152 size_t N = Str.size(); 153 if (N == 0) 154 return From; 155 156 size_t Size = Length - From; 157 if (Size < N) 158 return npos; 159 160 const char *Start = Data + From; 161 const char *Stop = Start + (Size - N + 1); 162 163 // For short haystacks or unsupported needles fall back to the naive algorithm 164 if (Size < 16 || N > 255) { 165 do { 166 if (std::memcmp(Start, Needle, N) == 0) 167 return Start - Data; 168 ++Start; 169 } while (Start < Stop); 170 return npos; 171 } 172 173 // Build the bad char heuristic table, with uint8_t to reduce cache thrashing. 174 uint8_t BadCharSkip[256]; 175 std::memset(BadCharSkip, N, 256); 176 for (unsigned i = 0; i != N-1; ++i) 177 BadCharSkip[(uint8_t)Str[i]] = N-1-i; 178 179 do { 180 if (std::memcmp(Start, Needle, N) == 0) 181 return Start - Data; 182 183 // Otherwise skip the appropriate number of bytes. 184 Start += BadCharSkip[(uint8_t)Start[N-1]]; 185 } while (Start < Stop); 186 187 return npos; 188 } 189 190 size_t StringRef::find_lower(StringRef Str, size_t From) const { 191 StringRef This = substr(From); 192 while (This.size() >= Str.size()) { 193 if (This.startswith_lower(Str)) 194 return From; 195 This = This.drop_front(); 196 ++From; 197 } 198 return npos; 199 } 200 201 size_t StringRef::rfind_lower(char C, size_t From) const { 202 From = std::min(From, Length); 203 size_t i = From; 204 while (i != 0) { 205 --i; 206 if (ascii_tolower(Data[i]) == ascii_tolower(C)) 207 return i; 208 } 209 return npos; 210 } 211 212 /// rfind - Search for the last string \arg Str in the string. 213 /// 214 /// \return - The index of the last occurrence of \arg Str, or npos if not 215 /// found. 216 size_t StringRef::rfind(StringRef Str) const { 217 size_t N = Str.size(); 218 if (N > Length) 219 return npos; 220 for (size_t i = Length - N + 1, e = 0; i != e;) { 221 --i; 222 if (substr(i, N).equals(Str)) 223 return i; 224 } 225 return npos; 226 } 227 228 size_t StringRef::rfind_lower(StringRef Str) const { 229 size_t N = Str.size(); 230 if (N > Length) 231 return npos; 232 for (size_t i = Length - N + 1, e = 0; i != e;) { 233 --i; 234 if (substr(i, N).equals_lower(Str)) 235 return i; 236 } 237 return npos; 238 } 239 240 /// find_first_of - Find the first character in the string that is in \arg 241 /// Chars, or npos if not found. 242 /// 243 /// Note: O(size() + Chars.size()) 244 StringRef::size_type StringRef::find_first_of(StringRef Chars, 245 size_t From) const { 246 std::bitset<1 << CHAR_BIT> CharBits; 247 for (size_type i = 0; i != Chars.size(); ++i) 248 CharBits.set((unsigned char)Chars[i]); 249 250 for (size_type i = std::min(From, Length), e = Length; i != e; ++i) 251 if (CharBits.test((unsigned char)Data[i])) 252 return i; 253 return npos; 254 } 255 256 /// find_first_not_of - Find the first character in the string that is not 257 /// \arg C or npos if not found. 258 StringRef::size_type StringRef::find_first_not_of(char C, size_t From) const { 259 for (size_type i = std::min(From, Length), e = Length; i != e; ++i) 260 if (Data[i] != C) 261 return i; 262 return npos; 263 } 264 265 /// find_first_not_of - Find the first character in the string that is not 266 /// in the string \arg Chars, or npos if not found. 267 /// 268 /// Note: O(size() + Chars.size()) 269 StringRef::size_type StringRef::find_first_not_of(StringRef Chars, 270 size_t From) const { 271 std::bitset<1 << CHAR_BIT> CharBits; 272 for (size_type i = 0; i != Chars.size(); ++i) 273 CharBits.set((unsigned char)Chars[i]); 274 275 for (size_type i = std::min(From, Length), e = Length; i != e; ++i) 276 if (!CharBits.test((unsigned char)Data[i])) 277 return i; 278 return npos; 279 } 280 281 /// find_last_of - Find the last character in the string that is in \arg C, 282 /// or npos if not found. 283 /// 284 /// Note: O(size() + Chars.size()) 285 StringRef::size_type StringRef::find_last_of(StringRef Chars, 286 size_t From) const { 287 std::bitset<1 << CHAR_BIT> CharBits; 288 for (size_type i = 0; i != Chars.size(); ++i) 289 CharBits.set((unsigned char)Chars[i]); 290 291 for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i) 292 if (CharBits.test((unsigned char)Data[i])) 293 return i; 294 return npos; 295 } 296 297 /// find_last_not_of - Find the last character in the string that is not 298 /// \arg C, or npos if not found. 299 StringRef::size_type StringRef::find_last_not_of(char C, size_t From) const { 300 for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i) 301 if (Data[i] != C) 302 return i; 303 return npos; 304 } 305 306 /// find_last_not_of - Find the last character in the string that is not in 307 /// \arg Chars, or npos if not found. 308 /// 309 /// Note: O(size() + Chars.size()) 310 StringRef::size_type StringRef::find_last_not_of(StringRef Chars, 311 size_t From) const { 312 std::bitset<1 << CHAR_BIT> CharBits; 313 for (size_type i = 0, e = Chars.size(); i != e; ++i) 314 CharBits.set((unsigned char)Chars[i]); 315 316 for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i) 317 if (!CharBits.test((unsigned char)Data[i])) 318 return i; 319 return npos; 320 } 321 322 void StringRef::split(SmallVectorImpl<StringRef> &A, 323 StringRef Separator, int MaxSplit, 324 bool KeepEmpty) const { 325 StringRef S = *this; 326 327 // Count down from MaxSplit. When MaxSplit is -1, this will just split 328 // "forever". This doesn't support splitting more than 2^31 times 329 // intentionally; if we ever want that we can make MaxSplit a 64-bit integer 330 // but that seems unlikely to be useful. 331 while (MaxSplit-- != 0) { 332 size_t Idx = S.find(Separator); 333 if (Idx == npos) 334 break; 335 336 // Push this split. 337 if (KeepEmpty || Idx > 0) 338 A.push_back(S.slice(0, Idx)); 339 340 // Jump forward. 341 S = S.slice(Idx + Separator.size(), npos); 342 } 343 344 // Push the tail. 345 if (KeepEmpty || !S.empty()) 346 A.push_back(S); 347 } 348 349 void StringRef::split(SmallVectorImpl<StringRef> &A, char Separator, 350 int MaxSplit, bool KeepEmpty) const { 351 StringRef S = *this; 352 353 // Count down from MaxSplit. When MaxSplit is -1, this will just split 354 // "forever". This doesn't support splitting more than 2^31 times 355 // intentionally; if we ever want that we can make MaxSplit a 64-bit integer 356 // but that seems unlikely to be useful. 357 while (MaxSplit-- != 0) { 358 size_t Idx = S.find(Separator); 359 if (Idx == npos) 360 break; 361 362 // Push this split. 363 if (KeepEmpty || Idx > 0) 364 A.push_back(S.slice(0, Idx)); 365 366 // Jump forward. 367 S = S.slice(Idx + 1, npos); 368 } 369 370 // Push the tail. 371 if (KeepEmpty || !S.empty()) 372 A.push_back(S); 373 } 374 375 //===----------------------------------------------------------------------===// 376 // Helpful Algorithms 377 //===----------------------------------------------------------------------===// 378 379 /// count - Return the number of non-overlapped occurrences of \arg Str in 380 /// the string. 381 size_t StringRef::count(StringRef Str) const { 382 size_t Count = 0; 383 size_t N = Str.size(); 384 if (N > Length) 385 return 0; 386 for (size_t i = 0, e = Length - N + 1; i != e; ++i) 387 if (substr(i, N).equals(Str)) 388 ++Count; 389 return Count; 390 } 391 392 static unsigned GetAutoSenseRadix(StringRef &Str) { 393 if (Str.empty()) 394 return 10; 395 396 if (Str.startswith("0x") || Str.startswith("0X")) { 397 Str = Str.substr(2); 398 return 16; 399 } 400 401 if (Str.startswith("0b") || Str.startswith("0B")) { 402 Str = Str.substr(2); 403 return 2; 404 } 405 406 if (Str.startswith("0o")) { 407 Str = Str.substr(2); 408 return 8; 409 } 410 411 if (Str[0] == '0' && Str.size() > 1 && ascii_isdigit(Str[1])) { 412 Str = Str.substr(1); 413 return 8; 414 } 415 416 return 10; 417 } 418 419 bool llvm::consumeUnsignedInteger(StringRef &Str, unsigned Radix, 420 unsigned long long &Result) { 421 // Autosense radix if not specified. 422 if (Radix == 0) 423 Radix = GetAutoSenseRadix(Str); 424 425 // Empty strings (after the radix autosense) are invalid. 426 if (Str.empty()) return true; 427 428 // Parse all the bytes of the string given this radix. Watch for overflow. 429 StringRef Str2 = Str; 430 Result = 0; 431 while (!Str2.empty()) { 432 unsigned CharVal; 433 if (Str2[0] >= '0' && Str2[0] <= '9') 434 CharVal = Str2[0] - '0'; 435 else if (Str2[0] >= 'a' && Str2[0] <= 'z') 436 CharVal = Str2[0] - 'a' + 10; 437 else if (Str2[0] >= 'A' && Str2[0] <= 'Z') 438 CharVal = Str2[0] - 'A' + 10; 439 else 440 break; 441 442 // If the parsed value is larger than the integer radix, we cannot 443 // consume any more characters. 444 if (CharVal >= Radix) 445 break; 446 447 // Add in this character. 448 unsigned long long PrevResult = Result; 449 Result = Result * Radix + CharVal; 450 451 // Check for overflow by shifting back and seeing if bits were lost. 452 if (Result / Radix < PrevResult) 453 return true; 454 455 Str2 = Str2.substr(1); 456 } 457 458 // We consider the operation a failure if no characters were consumed 459 // successfully. 460 if (Str.size() == Str2.size()) 461 return true; 462 463 Str = Str2; 464 return false; 465 } 466 467 bool llvm::consumeSignedInteger(StringRef &Str, unsigned Radix, 468 long long &Result) { 469 unsigned long long ULLVal; 470 471 // Handle positive strings first. 472 if (Str.empty() || Str.front() != '-') { 473 if (consumeUnsignedInteger(Str, Radix, ULLVal) || 474 // Check for value so large it overflows a signed value. 475 (long long)ULLVal < 0) 476 return true; 477 Result = ULLVal; 478 return false; 479 } 480 481 // Get the positive part of the value. 482 StringRef Str2 = Str.drop_front(1); 483 if (consumeUnsignedInteger(Str2, Radix, ULLVal) || 484 // Reject values so large they'd overflow as negative signed, but allow 485 // "-0". This negates the unsigned so that the negative isn't undefined 486 // on signed overflow. 487 (long long)-ULLVal > 0) 488 return true; 489 490 Str = Str2; 491 Result = -ULLVal; 492 return false; 493 } 494 495 /// GetAsUnsignedInteger - Workhorse method that converts a integer character 496 /// sequence of radix up to 36 to an unsigned long long value. 497 bool llvm::getAsUnsignedInteger(StringRef Str, unsigned Radix, 498 unsigned long long &Result) { 499 if (consumeUnsignedInteger(Str, Radix, Result)) 500 return true; 501 502 // For getAsUnsignedInteger, we require the whole string to be consumed or 503 // else we consider it a failure. 504 return !Str.empty(); 505 } 506 507 bool llvm::getAsSignedInteger(StringRef Str, unsigned Radix, 508 long long &Result) { 509 if (consumeSignedInteger(Str, Radix, Result)) 510 return true; 511 512 // For getAsSignedInteger, we require the whole string to be consumed or else 513 // we consider it a failure. 514 return !Str.empty(); 515 } 516 517 bool StringRef::getAsInteger(unsigned Radix, APInt &Result) const { 518 StringRef Str = *this; 519 520 // Autosense radix if not specified. 521 if (Radix == 0) 522 Radix = GetAutoSenseRadix(Str); 523 524 assert(Radix > 1 && Radix <= 36); 525 526 // Empty strings (after the radix autosense) are invalid. 527 if (Str.empty()) return true; 528 529 // Skip leading zeroes. This can be a significant improvement if 530 // it means we don't need > 64 bits. 531 while (!Str.empty() && Str.front() == '0') 532 Str = Str.substr(1); 533 534 // If it was nothing but zeroes.... 535 if (Str.empty()) { 536 Result = APInt(64, 0); 537 return false; 538 } 539 540 // (Over-)estimate the required number of bits. 541 unsigned Log2Radix = 0; 542 while ((1U << Log2Radix) < Radix) Log2Radix++; 543 bool IsPowerOf2Radix = ((1U << Log2Radix) == Radix); 544 545 unsigned BitWidth = Log2Radix * Str.size(); 546 if (BitWidth < Result.getBitWidth()) 547 BitWidth = Result.getBitWidth(); // don't shrink the result 548 else if (BitWidth > Result.getBitWidth()) 549 Result = Result.zext(BitWidth); 550 551 APInt RadixAP, CharAP; // unused unless !IsPowerOf2Radix 552 if (!IsPowerOf2Radix) { 553 // These must have the same bit-width as Result. 554 RadixAP = APInt(BitWidth, Radix); 555 CharAP = APInt(BitWidth, 0); 556 } 557 558 // Parse all the bytes of the string given this radix. 559 Result = 0; 560 while (!Str.empty()) { 561 unsigned CharVal; 562 if (Str[0] >= '0' && Str[0] <= '9') 563 CharVal = Str[0]-'0'; 564 else if (Str[0] >= 'a' && Str[0] <= 'z') 565 CharVal = Str[0]-'a'+10; 566 else if (Str[0] >= 'A' && Str[0] <= 'Z') 567 CharVal = Str[0]-'A'+10; 568 else 569 return true; 570 571 // If the parsed value is larger than the integer radix, the string is 572 // invalid. 573 if (CharVal >= Radix) 574 return true; 575 576 // Add in this character. 577 if (IsPowerOf2Radix) { 578 Result <<= Log2Radix; 579 Result |= CharVal; 580 } else { 581 Result *= RadixAP; 582 CharAP = CharVal; 583 Result += CharAP; 584 } 585 586 Str = Str.substr(1); 587 } 588 589 return false; 590 } 591 592 593 // Implementation of StringRef hashing. 594 hash_code llvm::hash_value(StringRef S) { 595 return hash_combine_range(S.begin(), S.end()); 596 } 597