1dc13a9a7Scgyurgyik //===-- String utils --------------------------------------------*- C++ -*-===// 2dc13a9a7Scgyurgyik // 3dc13a9a7Scgyurgyik // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4dc13a9a7Scgyurgyik // See https://llvm.org/LICENSE.txt for license information. 5dc13a9a7Scgyurgyik // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6dc13a9a7Scgyurgyik // 7dc13a9a7Scgyurgyik //===----------------------------------------------------------------------===// 8d85699ebSJoseph Huber // 9d85699ebSJoseph Huber // Standalone string utility functions. Utilities requiring memory allocations 10d6fc7d3aSJay Foad // should be placed in allocating_string_utils.h instead. 11d85699ebSJoseph Huber // 12d85699ebSJoseph Huber //===----------------------------------------------------------------------===// 13dc13a9a7Scgyurgyik 14270547f3SGuillaume Chatelet #ifndef LLVM_LIBC_SRC_STRING_STRING_UTILS_H 15270547f3SGuillaume Chatelet #define LLVM_LIBC_SRC_STRING_STRING_UTILS_H 16dc13a9a7Scgyurgyik 17*631a6e00SNick Desaulniers #include "hdr/types/size_t.h" 18e2d79758SGuillaume Chatelet #include "src/__support/CPP/bitset.h" 19*631a6e00SNick Desaulniers #include "src/__support/CPP/type_traits.h" // cpp::is_same_v 205ff3ff33SPetr Hosek #include "src/__support/macros/config.h" 21737e1cd1SGuillaume Chatelet #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY 221f578347SGuillaume Chatelet #include "src/string/memory_utils/inline_bzero.h" 231f578347SGuillaume Chatelet #include "src/string/memory_utils/inline_memcpy.h" 24dc13a9a7Scgyurgyik 255ff3ff33SPetr Hosek namespace LIBC_NAMESPACE_DECL { 26dc13a9a7Scgyurgyik namespace internal { 27dc13a9a7Scgyurgyik 28019a477cSRoland McGrath template <typename Word> LIBC_INLINE constexpr Word repeat_byte(Word byte) { 29a3b74581SMichael Jones constexpr size_t BITS_IN_BYTE = 8; 30a3b74581SMichael Jones constexpr size_t BYTE_MASK = 0xff; 31a3b74581SMichael Jones Word result = 0; 32a3b74581SMichael Jones byte = byte & BYTE_MASK; 33a3b74581SMichael Jones for (size_t i = 0; i < sizeof(Word); ++i) 34a3b74581SMichael Jones result = (result << BITS_IN_BYTE) | byte; 35a3b74581SMichael Jones return result; 36a3b74581SMichael Jones } 37a3b74581SMichael Jones 38a3b74581SMichael Jones // The goal of this function is to take in a block of arbitrary size and return 39a3b74581SMichael Jones // if it has any bytes equal to zero without branching. This is done by 40a3b74581SMichael Jones // transforming the block such that zero bytes become non-zero and non-zero 41a3b74581SMichael Jones // bytes become zero. 42a3b74581SMichael Jones // The first transformation relies on the properties of carrying in arithmetic 43a3b74581SMichael Jones // subtraction. Specifically, if 0x01 is subtracted from a byte that is 0x00, 44a3b74581SMichael Jones // then the result for that byte must be equal to 0xff (or 0xfe if the next byte 45a3b74581SMichael Jones // needs a carry as well). 46a3b74581SMichael Jones // The next transformation is a simple mask. All zero bytes will have the high 47a3b74581SMichael Jones // bit set after the subtraction, so each byte is masked with 0x80. This narrows 48a3b74581SMichael Jones // the set of bytes that result in a non-zero value to only zero bytes and bytes 49a3b74581SMichael Jones // with the high bit and any other bit set. 50a3b74581SMichael Jones // The final transformation masks the result of the previous transformations 51a3b74581SMichael Jones // with the inverse of the original byte. This means that any byte that had the 52a3b74581SMichael Jones // high bit set will no longer have it set, narrowing the list of bytes which 53a3b74581SMichael Jones // result in non-zero values to just the zero byte. 54019a477cSRoland McGrath template <typename Word> LIBC_INLINE constexpr bool has_zeroes(Word block) { 55a3b74581SMichael Jones constexpr Word LOW_BITS = repeat_byte<Word>(0x01); 56a3b74581SMichael Jones constexpr Word HIGH_BITS = repeat_byte<Word>(0x80); 57a3b74581SMichael Jones Word subtracted = block - LOW_BITS; 58a3b74581SMichael Jones Word inverted = ~block; 59a3b74581SMichael Jones return (subtracted & inverted & HIGH_BITS) != 0; 60a3b74581SMichael Jones } 61a3b74581SMichael Jones 62a3b74581SMichael Jones template <typename Word> 636363320bSSiva Chandra Reddy LIBC_INLINE size_t string_length_wide_read(const char *src) { 64a3b74581SMichael Jones const char *char_ptr = src; 65a3b74581SMichael Jones // Step 1: read 1 byte at a time to align to block size 66a3b74581SMichael Jones for (; reinterpret_cast<uintptr_t>(char_ptr) % sizeof(Word) != 0; 67a3b74581SMichael Jones ++char_ptr) { 68a3b74581SMichael Jones if (*char_ptr == '\0') 69a3b74581SMichael Jones return char_ptr - src; 70a3b74581SMichael Jones } 71a3b74581SMichael Jones // Step 2: read blocks 72a3b74581SMichael Jones for (const Word *block_ptr = reinterpret_cast<const Word *>(char_ptr); 73a3b74581SMichael Jones !has_zeroes<Word>(*block_ptr); ++block_ptr) { 74a3b74581SMichael Jones char_ptr = reinterpret_cast<const char *>(block_ptr); 75a3b74581SMichael Jones } 76a3b74581SMichael Jones // Step 3: find the zero in the block 77a3b74581SMichael Jones for (; *char_ptr != '\0'; ++char_ptr) { 78a3b74581SMichael Jones ; 79a3b74581SMichael Jones } 80a3b74581SMichael Jones return char_ptr - src; 81a3b74581SMichael Jones } 82a3b74581SMichael Jones 83a3b74581SMichael Jones // Returns the length of a string, denoted by the first occurrence 84a3b74581SMichael Jones // of a null terminator. 85*631a6e00SNick Desaulniers template <typename T> LIBC_INLINE size_t string_length(const T *src) { 8617114f8bSSiva Chandra #ifdef LIBC_COPT_STRING_UNSAFE_WIDE_READ 87a3b74581SMichael Jones // Unsigned int is the default size for most processors, and on x86-64 it 88a3b74581SMichael Jones // performs better than larger sizes when the src pointer can't be assumed to 89a3b74581SMichael Jones // be aligned to a word boundary, so it's the size we use for reading the 90a3b74581SMichael Jones // string a block at a time. 91*631a6e00SNick Desaulniers if constexpr (cpp::is_same_v<T, char>) 92a3b74581SMichael Jones return string_length_wide_read<unsigned int>(src); 93a3b74581SMichael Jones #else 94*631a6e00SNick Desaulniers size_t length; 95*631a6e00SNick Desaulniers for (length = 0; *src; ++src, ++length) 96*631a6e00SNick Desaulniers ; 97*631a6e00SNick Desaulniers return length; 98a3b74581SMichael Jones #endif 99a3b74581SMichael Jones } 100a3b74581SMichael Jones 101a3b74581SMichael Jones template <typename Word> 1026363320bSSiva Chandra Reddy LIBC_INLINE void *find_first_character_wide_read(const unsigned char *src, 103a3b74581SMichael Jones unsigned char ch, size_t n) { 104a3b74581SMichael Jones const unsigned char *char_ptr = src; 105a3b74581SMichael Jones size_t cur = 0; 106a3b74581SMichael Jones 107a3b74581SMichael Jones // Step 1: read 1 byte at a time to align to block size 108a3b74581SMichael Jones for (; reinterpret_cast<uintptr_t>(char_ptr) % sizeof(Word) != 0 && cur < n; 109a3b74581SMichael Jones ++char_ptr, ++cur) { 110a3b74581SMichael Jones if (*char_ptr == ch) 111a3b74581SMichael Jones return const_cast<unsigned char *>(char_ptr); 112a3b74581SMichael Jones } 113a3b74581SMichael Jones 114a3b74581SMichael Jones const Word ch_mask = repeat_byte<Word>(ch); 115a3b74581SMichael Jones 116a3b74581SMichael Jones // Step 2: read blocks 117a3b74581SMichael Jones for (const Word *block_ptr = reinterpret_cast<const Word *>(char_ptr); 118a3b74581SMichael Jones !has_zeroes<Word>((*block_ptr) ^ ch_mask) && cur < n; 119a3b74581SMichael Jones ++block_ptr, cur += sizeof(Word)) { 120a3b74581SMichael Jones char_ptr = reinterpret_cast<const unsigned char *>(block_ptr); 121a3b74581SMichael Jones } 122a3b74581SMichael Jones 123a3b74581SMichael Jones // Step 3: find the match in the block 124a3b74581SMichael Jones for (; *char_ptr != ch && cur < n; ++char_ptr, ++cur) { 125a3b74581SMichael Jones ; 126a3b74581SMichael Jones } 127a3b74581SMichael Jones 128a3b74581SMichael Jones if (*char_ptr != ch || cur >= n) 129a3b74581SMichael Jones return static_cast<void *>(nullptr); 130a3b74581SMichael Jones 131a3b74581SMichael Jones return const_cast<unsigned char *>(char_ptr); 132a3b74581SMichael Jones } 133a3b74581SMichael Jones 1346363320bSSiva Chandra Reddy LIBC_INLINE void *find_first_character_byte_read(const unsigned char *src, 135c92d1aa4Scgyurgyik unsigned char ch, size_t n) { 136c92d1aa4Scgyurgyik for (; n && *src != ch; --n, ++src) 137c92d1aa4Scgyurgyik ; 138c92d1aa4Scgyurgyik return n ? const_cast<unsigned char *>(src) : nullptr; 139c92d1aa4Scgyurgyik } 140c92d1aa4Scgyurgyik 141a3b74581SMichael Jones // Returns the first occurrence of 'ch' within the first 'n' characters of 142a3b74581SMichael Jones // 'src'. If 'ch' is not found, returns nullptr. 1436363320bSSiva Chandra Reddy LIBC_INLINE void *find_first_character(const unsigned char *src, 144a3b74581SMichael Jones unsigned char ch, size_t max_strlen) { 14517114f8bSSiva Chandra #ifdef LIBC_COPT_STRING_UNSAFE_WIDE_READ 146a3b74581SMichael Jones // If the maximum size of the string is small, the overhead of aligning to a 147a3b74581SMichael Jones // word boundary and generating a bitmask of the appropriate size may be 148a3b74581SMichael Jones // greater than the gains from reading larger chunks. Based on some testing, 149a3b74581SMichael Jones // the crossover point between when it's faster to just read bytewise and read 150a3b74581SMichael Jones // blocks is somewhere between 16 and 32, so 4 times the size of the block 151a3b74581SMichael Jones // should be in that range. 152a3b74581SMichael Jones // Unsigned int is used for the same reason as in strlen. 153a3b74581SMichael Jones using BlockType = unsigned int; 154a3b74581SMichael Jones if (max_strlen > (sizeof(BlockType) * 4)) { 155a3b74581SMichael Jones return find_first_character_wide_read<BlockType>(src, ch, max_strlen); 156a3b74581SMichael Jones } 157a3b74581SMichael Jones #endif 158a3b74581SMichael Jones return find_first_character_byte_read(src, ch, max_strlen); 159a3b74581SMichael Jones } 160a3b74581SMichael Jones 161dc13a9a7Scgyurgyik // Returns the maximum length span that contains only characters not found in 162dc13a9a7Scgyurgyik // 'segment'. If no characters are found, returns the length of 'src'. 1636363320bSSiva Chandra Reddy LIBC_INLINE size_t complementary_span(const char *src, const char *segment) { 164dc13a9a7Scgyurgyik const char *initial = src; 165e2d79758SGuillaume Chatelet cpp::bitset<256> bitset; 166dc13a9a7Scgyurgyik 167dc13a9a7Scgyurgyik for (; *segment; ++segment) 168c891ef6cSAlex Brachet bitset.set(*reinterpret_cast<const unsigned char *>(segment)); 169c891ef6cSAlex Brachet for (; *src && !bitset.test(*reinterpret_cast<const unsigned char *>(src)); 170c891ef6cSAlex Brachet ++src) 171dc13a9a7Scgyurgyik ; 172dc13a9a7Scgyurgyik return src - initial; 173dc13a9a7Scgyurgyik } 174dc13a9a7Scgyurgyik 175bc45bab7Sparallels // Given the similarities between strtok and strtok_r, we can implement both 176bc45bab7Sparallels // using a utility function. On the first call, 'src' is scanned for the 177bc45bab7Sparallels // first character not found in 'delimiter_string'. Once found, it scans until 178bc45bab7Sparallels // the first character in the 'delimiter_string' or the null terminator is 179bc45bab7Sparallels // found. We define this span as a token. The end of the token is appended with 180bc45bab7Sparallels // a null terminator, and the token is returned. The point where the last token 181bc45bab7Sparallels // is found is then stored within 'context' for subsequent calls. Subsequent 182bc45bab7Sparallels // calls will use 'context' when a nullptr is passed in for 'src'. Once the null 183bc45bab7Sparallels // terminating character is reached, returns a nullptr. 184b4ab398cSAlex Brachet template <bool SkipDelim = true> 1856363320bSSiva Chandra Reddy LIBC_INLINE char *string_token(char *__restrict src, 18679ce64eaScgyurgyik const char *__restrict delimiter_string, 18779ce64eaScgyurgyik char **__restrict saveptr) { 1880784e62cSAlfonso Gregory // Return nullptr immediately if both src AND saveptr are nullptr 18929f8e076SGuillaume Chatelet if (LIBC_UNLIKELY(src == nullptr && ((src = *saveptr) == nullptr))) 1900784e62cSAlfonso Gregory return nullptr; 1910784e62cSAlfonso Gregory 192e2d79758SGuillaume Chatelet cpp::bitset<256> delimiter_set; 1930784e62cSAlfonso Gregory for (; *delimiter_string != '\0'; ++delimiter_string) 194bc45bab7Sparallels delimiter_set.set(*delimiter_string); 195bc45bab7Sparallels 196b4ab398cSAlex Brachet if constexpr (SkipDelim) 1970784e62cSAlfonso Gregory for (; *src != '\0' && delimiter_set.test(*src); ++src) 198bc45bab7Sparallels ; 1990784e62cSAlfonso Gregory if (*src == '\0') { 200bc45bab7Sparallels *saveptr = src; 201bc45bab7Sparallels return nullptr; 202bc45bab7Sparallels } 203bc45bab7Sparallels char *token = src; 2040784e62cSAlfonso Gregory for (; *src != '\0'; ++src) { 2050784e62cSAlfonso Gregory if (delimiter_set.test(*src)) { 206bc45bab7Sparallels *src = '\0'; 207bc45bab7Sparallels ++src; 2080784e62cSAlfonso Gregory break; 2090784e62cSAlfonso Gregory } 210bc45bab7Sparallels } 211bc45bab7Sparallels *saveptr = src; 212bc45bab7Sparallels return token; 213bc45bab7Sparallels } 214bc45bab7Sparallels 2156363320bSSiva Chandra Reddy LIBC_INLINE size_t strlcpy(char *__restrict dst, const char *__restrict src, 216b1183305SAlex Brachet size_t size) { 217b1183305SAlex Brachet size_t len = internal::string_length(src); 218b1183305SAlex Brachet if (!size) 219b1183305SAlex Brachet return len; 220b1183305SAlex Brachet size_t n = len < size - 1 ? len : size - 1; 221b1183305SAlex Brachet inline_memcpy(dst, src, n); 222b03c8c4fSGeorge Burgess IV dst[n] = '\0'; 223b1183305SAlex Brachet return len; 224b1183305SAlex Brachet } 225b1183305SAlex Brachet 22682ca29ceSAlex Brachet template <bool ReturnNull = true> 2275bf8efd2SRoland McGrath LIBC_INLINE constexpr static char *strchr_implementation(const char *src, 2285bf8efd2SRoland McGrath int c) { 22982ca29ceSAlex Brachet char ch = static_cast<char>(c); 23082ca29ceSAlex Brachet for (; *src && *src != ch; ++src) 23182ca29ceSAlex Brachet ; 23282ca29ceSAlex Brachet char *ret = ReturnNull ? nullptr : const_cast<char *>(src); 23382ca29ceSAlex Brachet return *src == ch ? const_cast<char *>(src) : ret; 23482ca29ceSAlex Brachet } 23582ca29ceSAlex Brachet 2365bf8efd2SRoland McGrath LIBC_INLINE constexpr static char *strrchr_implementation(const char *src, 2375bf8efd2SRoland McGrath int c) { 23882ca29ceSAlex Brachet char ch = static_cast<char>(c); 23982ca29ceSAlex Brachet char *last_occurrence = nullptr; 24050c44478SGeorge Burgess IV while (true) { 24182ca29ceSAlex Brachet if (*src == ch) 24282ca29ceSAlex Brachet last_occurrence = const_cast<char *>(src); 24350c44478SGeorge Burgess IV if (!*src) 24482ca29ceSAlex Brachet return last_occurrence; 24550c44478SGeorge Burgess IV ++src; 24650c44478SGeorge Burgess IV } 24782ca29ceSAlex Brachet } 24882ca29ceSAlex Brachet 249dc13a9a7Scgyurgyik } // namespace internal 2505ff3ff33SPetr Hosek } // namespace LIBC_NAMESPACE_DECL 251dc13a9a7Scgyurgyik 252270547f3SGuillaume Chatelet #endif // LLVM_LIBC_SRC_STRING_STRING_UTILS_H 253