1 //===-- StringRef.cpp - Lightweight String References ---------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 10 #include "llvm/ADT/StringRef.h" 11 #include "llvm/ADT/APInt.h" 12 #include "llvm/ADT/Hashing.h" 13 #include "llvm/ADT/edit_distance.h" 14 #include <bitset> 15 16 using namespace llvm; 17 18 // MSVC emits references to this into the translation units which reference it. 19 #ifndef _MSC_VER 20 const size_t StringRef::npos; 21 #endif 22 23 static char ascii_tolower(char x) { 24 if (x >= 'A' && x <= 'Z') 25 return x - 'A' + 'a'; 26 return x; 27 } 28 29 static char ascii_toupper(char x) { 30 if (x >= 'a' && x <= 'z') 31 return x - 'a' + 'A'; 32 return x; 33 } 34 35 static bool ascii_isdigit(char x) { 36 return x >= '0' && x <= '9'; 37 } 38 39 // strncasecmp() is not available on non-POSIX systems, so define an 40 // alternative function here. 41 static int ascii_strncasecmp(const char *LHS, const char *RHS, size_t Length) { 42 for (size_t I = 0; I < Length; ++I) { 43 unsigned char LHC = ascii_tolower(LHS[I]); 44 unsigned char RHC = ascii_tolower(RHS[I]); 45 if (LHC != RHC) 46 return LHC < RHC ? -1 : 1; 47 } 48 return 0; 49 } 50 51 /// compare_lower - Compare strings, ignoring case. 52 int StringRef::compare_lower(StringRef RHS) const { 53 if (int Res = ascii_strncasecmp(Data, RHS.Data, std::min(Length, RHS.Length))) 54 return Res; 55 if (Length == RHS.Length) 56 return 0; 57 return Length < RHS.Length ? -1 : 1; 58 } 59 60 /// Check if this string starts with the given \p Prefix, ignoring case. 61 bool StringRef::startswith_lower(StringRef Prefix) const { 62 return Length >= Prefix.Length && 63 ascii_strncasecmp(Data, Prefix.Data, Prefix.Length) == 0; 64 } 65 66 /// Check if this string ends with the given \p Suffix, ignoring case. 67 bool StringRef::endswith_lower(StringRef Suffix) const { 68 return Length >= Suffix.Length && 69 ascii_strncasecmp(end() - Suffix.Length, Suffix.Data, Suffix.Length) == 0; 70 } 71 72 /// compare_numeric - Compare strings, handle embedded numbers. 73 int StringRef::compare_numeric(StringRef RHS) const { 74 for (size_t I = 0, E = std::min(Length, RHS.Length); I != E; ++I) { 75 // Check for sequences of digits. 76 if (ascii_isdigit(Data[I]) && ascii_isdigit(RHS.Data[I])) { 77 // The longer sequence of numbers is considered larger. 78 // This doesn't really handle prefixed zeros well. 79 size_t J; 80 for (J = I + 1; J != E + 1; ++J) { 81 bool ld = J < Length && ascii_isdigit(Data[J]); 82 bool rd = J < RHS.Length && ascii_isdigit(RHS.Data[J]); 83 if (ld != rd) 84 return rd ? -1 : 1; 85 if (!rd) 86 break; 87 } 88 // The two number sequences have the same length (J-I), just memcmp them. 89 if (int Res = compareMemory(Data + I, RHS.Data + I, J - I)) 90 return Res < 0 ? -1 : 1; 91 // Identical number sequences, continue search after the numbers. 92 I = J - 1; 93 continue; 94 } 95 if (Data[I] != RHS.Data[I]) 96 return (unsigned char)Data[I] < (unsigned char)RHS.Data[I] ? -1 : 1; 97 } 98 if (Length == RHS.Length) 99 return 0; 100 return Length < RHS.Length ? -1 : 1; 101 } 102 103 // Compute the edit distance between the two given strings. 104 unsigned StringRef::edit_distance(llvm::StringRef Other, 105 bool AllowReplacements, 106 unsigned MaxEditDistance) const { 107 return llvm::ComputeEditDistance( 108 makeArrayRef(data(), size()), 109 makeArrayRef(Other.data(), Other.size()), 110 AllowReplacements, MaxEditDistance); 111 } 112 113 //===----------------------------------------------------------------------===// 114 // String Operations 115 //===----------------------------------------------------------------------===// 116 117 std::string StringRef::lower() const { 118 std::string Result(size(), char()); 119 for (size_type i = 0, e = size(); i != e; ++i) { 120 Result[i] = ascii_tolower(Data[i]); 121 } 122 return Result; 123 } 124 125 std::string StringRef::upper() const { 126 std::string Result(size(), char()); 127 for (size_type i = 0, e = size(); i != e; ++i) { 128 Result[i] = ascii_toupper(Data[i]); 129 } 130 return Result; 131 } 132 133 //===----------------------------------------------------------------------===// 134 // String Searching 135 //===----------------------------------------------------------------------===// 136 137 138 /// find - Search for the first string \arg Str in the string. 139 /// 140 /// \return - The index of the first occurrence of \arg Str, or npos if not 141 /// found. 142 size_t StringRef::find(StringRef Str, size_t From) const { 143 if (From > Length) 144 return npos; 145 146 const char *Needle = Str.data(); 147 size_t N = Str.size(); 148 if (N == 0) 149 return From; 150 151 size_t Size = Length - From; 152 if (Size < N) 153 return npos; 154 155 const char *Start = Data + From; 156 const char *Stop = Start + (Size - N + 1); 157 158 // For short haystacks or unsupported needles fall back to the naive algorithm 159 if (Size < 16 || N > 255) { 160 do { 161 if (std::memcmp(Start, Needle, N) == 0) 162 return Start - Data; 163 ++Start; 164 } while (Start < Stop); 165 return npos; 166 } 167 168 // Build the bad char heuristic table, with uint8_t to reduce cache thrashing. 169 uint8_t BadCharSkip[256]; 170 std::memset(BadCharSkip, N, 256); 171 for (unsigned i = 0; i != N-1; ++i) 172 BadCharSkip[(uint8_t)Str[i]] = N-1-i; 173 174 do { 175 if (std::memcmp(Start, Needle, N) == 0) 176 return Start - Data; 177 178 // Otherwise skip the appropriate number of bytes. 179 Start += BadCharSkip[(uint8_t)Start[N-1]]; 180 } while (Start < Stop); 181 182 return npos; 183 } 184 185 /// rfind - Search for the last string \arg Str in the string. 186 /// 187 /// \return - The index of the last occurrence of \arg Str, or npos if not 188 /// found. 189 size_t StringRef::rfind(StringRef Str) const { 190 size_t N = Str.size(); 191 if (N > Length) 192 return npos; 193 for (size_t i = Length - N + 1, e = 0; i != e;) { 194 --i; 195 if (substr(i, N).equals(Str)) 196 return i; 197 } 198 return npos; 199 } 200 201 /// find_first_of - Find the first character in the string that is in \arg 202 /// Chars, or npos if not found. 203 /// 204 /// Note: O(size() + Chars.size()) 205 StringRef::size_type StringRef::find_first_of(StringRef Chars, 206 size_t From) const { 207 std::bitset<1 << CHAR_BIT> CharBits; 208 for (size_type i = 0; i != Chars.size(); ++i) 209 CharBits.set((unsigned char)Chars[i]); 210 211 for (size_type i = std::min(From, Length), e = Length; i != e; ++i) 212 if (CharBits.test((unsigned char)Data[i])) 213 return i; 214 return npos; 215 } 216 217 /// find_first_not_of - Find the first character in the string that is not 218 /// \arg C or npos if not found. 219 StringRef::size_type StringRef::find_first_not_of(char C, size_t From) const { 220 for (size_type i = std::min(From, Length), e = Length; i != e; ++i) 221 if (Data[i] != C) 222 return i; 223 return npos; 224 } 225 226 /// find_first_not_of - Find the first character in the string that is not 227 /// in the string \arg Chars, or npos if not found. 228 /// 229 /// Note: O(size() + Chars.size()) 230 StringRef::size_type StringRef::find_first_not_of(StringRef Chars, 231 size_t From) const { 232 std::bitset<1 << CHAR_BIT> CharBits; 233 for (size_type i = 0; i != Chars.size(); ++i) 234 CharBits.set((unsigned char)Chars[i]); 235 236 for (size_type i = std::min(From, Length), e = Length; i != e; ++i) 237 if (!CharBits.test((unsigned char)Data[i])) 238 return i; 239 return npos; 240 } 241 242 /// find_last_of - Find the last character in the string that is in \arg C, 243 /// or npos if not found. 244 /// 245 /// Note: O(size() + Chars.size()) 246 StringRef::size_type StringRef::find_last_of(StringRef Chars, 247 size_t From) const { 248 std::bitset<1 << CHAR_BIT> CharBits; 249 for (size_type i = 0; i != Chars.size(); ++i) 250 CharBits.set((unsigned char)Chars[i]); 251 252 for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i) 253 if (CharBits.test((unsigned char)Data[i])) 254 return i; 255 return npos; 256 } 257 258 /// find_last_not_of - Find the last character in the string that is not 259 /// \arg C, or npos if not found. 260 StringRef::size_type StringRef::find_last_not_of(char C, size_t From) const { 261 for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i) 262 if (Data[i] != C) 263 return i; 264 return npos; 265 } 266 267 /// find_last_not_of - Find the last character in the string that is not in 268 /// \arg Chars, or npos if not found. 269 /// 270 /// Note: O(size() + Chars.size()) 271 StringRef::size_type StringRef::find_last_not_of(StringRef Chars, 272 size_t From) const { 273 std::bitset<1 << CHAR_BIT> CharBits; 274 for (size_type i = 0, e = Chars.size(); i != e; ++i) 275 CharBits.set((unsigned char)Chars[i]); 276 277 for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i) 278 if (!CharBits.test((unsigned char)Data[i])) 279 return i; 280 return npos; 281 } 282 283 void StringRef::split(SmallVectorImpl<StringRef> &A, 284 StringRef Separator, int MaxSplit, 285 bool KeepEmpty) const { 286 StringRef S = *this; 287 288 // Count down from MaxSplit. When MaxSplit is -1, this will just split 289 // "forever". This doesn't support splitting more than 2^31 times 290 // intentionally; if we ever want that we can make MaxSplit a 64-bit integer 291 // but that seems unlikely to be useful. 292 while (MaxSplit-- != 0) { 293 size_t Idx = S.find(Separator); 294 if (Idx == npos) 295 break; 296 297 // Push this split. 298 if (KeepEmpty || Idx > 0) 299 A.push_back(S.slice(0, Idx)); 300 301 // Jump forward. 302 S = S.slice(Idx + Separator.size(), npos); 303 } 304 305 // Push the tail. 306 if (KeepEmpty || !S.empty()) 307 A.push_back(S); 308 } 309 310 void StringRef::split(SmallVectorImpl<StringRef> &A, char Separator, 311 int MaxSplit, bool KeepEmpty) const { 312 StringRef S = *this; 313 314 // Count down from MaxSplit. When MaxSplit is -1, this will just split 315 // "forever". This doesn't support splitting more than 2^31 times 316 // intentionally; if we ever want that we can make MaxSplit a 64-bit integer 317 // but that seems unlikely to be useful. 318 while (MaxSplit-- != 0) { 319 size_t Idx = S.find(Separator); 320 if (Idx == npos) 321 break; 322 323 // Push this split. 324 if (KeepEmpty || Idx > 0) 325 A.push_back(S.slice(0, Idx)); 326 327 // Jump forward. 328 S = S.slice(Idx + 1, npos); 329 } 330 331 // Push the tail. 332 if (KeepEmpty || !S.empty()) 333 A.push_back(S); 334 } 335 336 //===----------------------------------------------------------------------===// 337 // Helpful Algorithms 338 //===----------------------------------------------------------------------===// 339 340 /// count - Return the number of non-overlapped occurrences of \arg Str in 341 /// the string. 342 size_t StringRef::count(StringRef Str) const { 343 size_t Count = 0; 344 size_t N = Str.size(); 345 if (N > Length) 346 return 0; 347 for (size_t i = 0, e = Length - N + 1; i != e; ++i) 348 if (substr(i, N).equals(Str)) 349 ++Count; 350 return Count; 351 } 352 353 static unsigned GetAutoSenseRadix(StringRef &Str) { 354 if (Str.empty()) 355 return 10; 356 357 if (Str.startswith("0x") || Str.startswith("0X")) { 358 Str = Str.substr(2); 359 return 16; 360 } 361 362 if (Str.startswith("0b") || Str.startswith("0B")) { 363 Str = Str.substr(2); 364 return 2; 365 } 366 367 if (Str.startswith("0o")) { 368 Str = Str.substr(2); 369 return 8; 370 } 371 372 if (Str[0] == '0' && Str.size() > 1 && ascii_isdigit(Str[1])) { 373 Str = Str.substr(1); 374 return 8; 375 } 376 377 return 10; 378 } 379 380 bool llvm::consumeUnsignedInteger(StringRef &Str, unsigned Radix, 381 unsigned long long &Result) { 382 // Autosense radix if not specified. 383 if (Radix == 0) 384 Radix = GetAutoSenseRadix(Str); 385 386 // Empty strings (after the radix autosense) are invalid. 387 if (Str.empty()) return true; 388 389 // Parse all the bytes of the string given this radix. Watch for overflow. 390 StringRef Str2 = Str; 391 Result = 0; 392 while (!Str2.empty()) { 393 unsigned CharVal; 394 if (Str2[0] >= '0' && Str2[0] <= '9') 395 CharVal = Str2[0] - '0'; 396 else if (Str2[0] >= 'a' && Str2[0] <= 'z') 397 CharVal = Str2[0] - 'a' + 10; 398 else if (Str2[0] >= 'A' && Str2[0] <= 'Z') 399 CharVal = Str2[0] - 'A' + 10; 400 else 401 break; 402 403 // If the parsed value is larger than the integer radix, we cannot 404 // consume any more characters. 405 if (CharVal >= Radix) 406 break; 407 408 // Add in this character. 409 unsigned long long PrevResult = Result; 410 Result = Result * Radix + CharVal; 411 412 // Check for overflow by shifting back and seeing if bits were lost. 413 if (Result / Radix < PrevResult) 414 return true; 415 416 Str2 = Str2.substr(1); 417 } 418 419 // We consider the operation a failure if no characters were consumed 420 // successfully. 421 if (Str.size() == Str2.size()) 422 return true; 423 424 Str = Str2; 425 return false; 426 } 427 428 bool llvm::consumeSignedInteger(StringRef &Str, unsigned Radix, 429 long long &Result) { 430 unsigned long long ULLVal; 431 432 // Handle positive strings first. 433 if (Str.empty() || Str.front() != '-') { 434 if (consumeUnsignedInteger(Str, Radix, ULLVal) || 435 // Check for value so large it overflows a signed value. 436 (long long)ULLVal < 0) 437 return true; 438 Result = ULLVal; 439 return false; 440 } 441 442 // Get the positive part of the value. 443 StringRef Str2 = Str.drop_front(1); 444 if (consumeUnsignedInteger(Str2, Radix, ULLVal) || 445 // Reject values so large they'd overflow as negative signed, but allow 446 // "-0". This negates the unsigned so that the negative isn't undefined 447 // on signed overflow. 448 (long long)-ULLVal > 0) 449 return true; 450 451 Str = Str2; 452 Result = -ULLVal; 453 return false; 454 } 455 456 /// GetAsUnsignedInteger - Workhorse method that converts a integer character 457 /// sequence of radix up to 36 to an unsigned long long value. 458 bool llvm::getAsUnsignedInteger(StringRef Str, unsigned Radix, 459 unsigned long long &Result) { 460 if (consumeUnsignedInteger(Str, Radix, Result)) 461 return true; 462 463 // For getAsUnsignedInteger, we require the whole string to be consumed or 464 // else we consider it a failure. 465 return !Str.empty(); 466 } 467 468 bool llvm::getAsSignedInteger(StringRef Str, unsigned Radix, 469 long long &Result) { 470 if (consumeSignedInteger(Str, Radix, Result)) 471 return true; 472 473 // For getAsSignedInteger, we require the whole string to be consumed or else 474 // we consider it a failure. 475 return !Str.empty(); 476 } 477 478 bool StringRef::getAsInteger(unsigned Radix, APInt &Result) const { 479 StringRef Str = *this; 480 481 // Autosense radix if not specified. 482 if (Radix == 0) 483 Radix = GetAutoSenseRadix(Str); 484 485 assert(Radix > 1 && Radix <= 36); 486 487 // Empty strings (after the radix autosense) are invalid. 488 if (Str.empty()) return true; 489 490 // Skip leading zeroes. This can be a significant improvement if 491 // it means we don't need > 64 bits. 492 while (!Str.empty() && Str.front() == '0') 493 Str = Str.substr(1); 494 495 // If it was nothing but zeroes.... 496 if (Str.empty()) { 497 Result = APInt(64, 0); 498 return false; 499 } 500 501 // (Over-)estimate the required number of bits. 502 unsigned Log2Radix = 0; 503 while ((1U << Log2Radix) < Radix) Log2Radix++; 504 bool IsPowerOf2Radix = ((1U << Log2Radix) == Radix); 505 506 unsigned BitWidth = Log2Radix * Str.size(); 507 if (BitWidth < Result.getBitWidth()) 508 BitWidth = Result.getBitWidth(); // don't shrink the result 509 else if (BitWidth > Result.getBitWidth()) 510 Result = Result.zext(BitWidth); 511 512 APInt RadixAP, CharAP; // unused unless !IsPowerOf2Radix 513 if (!IsPowerOf2Radix) { 514 // These must have the same bit-width as Result. 515 RadixAP = APInt(BitWidth, Radix); 516 CharAP = APInt(BitWidth, 0); 517 } 518 519 // Parse all the bytes of the string given this radix. 520 Result = 0; 521 while (!Str.empty()) { 522 unsigned CharVal; 523 if (Str[0] >= '0' && Str[0] <= '9') 524 CharVal = Str[0]-'0'; 525 else if (Str[0] >= 'a' && Str[0] <= 'z') 526 CharVal = Str[0]-'a'+10; 527 else if (Str[0] >= 'A' && Str[0] <= 'Z') 528 CharVal = Str[0]-'A'+10; 529 else 530 return true; 531 532 // If the parsed value is larger than the integer radix, the string is 533 // invalid. 534 if (CharVal >= Radix) 535 return true; 536 537 // Add in this character. 538 if (IsPowerOf2Radix) { 539 Result <<= Log2Radix; 540 Result |= CharVal; 541 } else { 542 Result *= RadixAP; 543 CharAP = CharVal; 544 Result += CharAP; 545 } 546 547 Str = Str.substr(1); 548 } 549 550 return false; 551 } 552 553 554 // Implementation of StringRef hashing. 555 hash_code llvm::hash_value(StringRef S) { 556 return hash_combine_range(S.begin(), S.end()); 557 } 558