1 //===-- String utils --------------------------------------------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // Standalone string utility functions. Utilities requiring memory allocations 10 // should be placed in allocating_string_utils.h instead. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #ifndef LLVM_LIBC_SRC_STRING_STRING_UTILS_H 15 #define LLVM_LIBC_SRC_STRING_STRING_UTILS_H 16 17 #include "hdr/types/size_t.h" 18 #include "src/__support/CPP/bitset.h" 19 #include "src/__support/CPP/type_traits.h" // cpp::is_same_v 20 #include "src/__support/macros/config.h" 21 #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY 22 #include "src/string/memory_utils/inline_bzero.h" 23 #include "src/string/memory_utils/inline_memcpy.h" 24 25 namespace LIBC_NAMESPACE_DECL { 26 namespace internal { 27 28 template <typename Word> LIBC_INLINE constexpr Word repeat_byte(Word byte) { 29 constexpr size_t BITS_IN_BYTE = 8; 30 constexpr size_t BYTE_MASK = 0xff; 31 Word result = 0; 32 byte = byte & BYTE_MASK; 33 for (size_t i = 0; i < sizeof(Word); ++i) 34 result = (result << BITS_IN_BYTE) | byte; 35 return result; 36 } 37 38 // The goal of this function is to take in a block of arbitrary size and return 39 // if it has any bytes equal to zero without branching. This is done by 40 // transforming the block such that zero bytes become non-zero and non-zero 41 // bytes become zero. 42 // The first transformation relies on the properties of carrying in arithmetic 43 // subtraction. Specifically, if 0x01 is subtracted from a byte that is 0x00, 44 // then the result for that byte must be equal to 0xff (or 0xfe if the next byte 45 // needs a carry as well). 46 // The next transformation is a simple mask. All zero bytes will have the high 47 // bit set after the subtraction, so each byte is masked with 0x80. This narrows 48 // the set of bytes that result in a non-zero value to only zero bytes and bytes 49 // with the high bit and any other bit set. 50 // The final transformation masks the result of the previous transformations 51 // with the inverse of the original byte. This means that any byte that had the 52 // high bit set will no longer have it set, narrowing the list of bytes which 53 // result in non-zero values to just the zero byte. 54 template <typename Word> LIBC_INLINE constexpr bool has_zeroes(Word block) { 55 constexpr Word LOW_BITS = repeat_byte<Word>(0x01); 56 constexpr Word HIGH_BITS = repeat_byte<Word>(0x80); 57 Word subtracted = block - LOW_BITS; 58 Word inverted = ~block; 59 return (subtracted & inverted & HIGH_BITS) != 0; 60 } 61 62 template <typename Word> 63 LIBC_INLINE size_t string_length_wide_read(const char *src) { 64 const char *char_ptr = src; 65 // Step 1: read 1 byte at a time to align to block size 66 for (; reinterpret_cast<uintptr_t>(char_ptr) % sizeof(Word) != 0; 67 ++char_ptr) { 68 if (*char_ptr == '\0') 69 return char_ptr - src; 70 } 71 // Step 2: read blocks 72 for (const Word *block_ptr = reinterpret_cast<const Word *>(char_ptr); 73 !has_zeroes<Word>(*block_ptr); ++block_ptr) { 74 char_ptr = reinterpret_cast<const char *>(block_ptr); 75 } 76 // Step 3: find the zero in the block 77 for (; *char_ptr != '\0'; ++char_ptr) { 78 ; 79 } 80 return char_ptr - src; 81 } 82 83 // Returns the length of a string, denoted by the first occurrence 84 // of a null terminator. 85 template <typename T> LIBC_INLINE size_t string_length(const T *src) { 86 #ifdef LIBC_COPT_STRING_UNSAFE_WIDE_READ 87 // Unsigned int is the default size for most processors, and on x86-64 it 88 // performs better than larger sizes when the src pointer can't be assumed to 89 // be aligned to a word boundary, so it's the size we use for reading the 90 // string a block at a time. 91 if constexpr (cpp::is_same_v<T, char>) 92 return string_length_wide_read<unsigned int>(src); 93 #else 94 size_t length; 95 for (length = 0; *src; ++src, ++length) 96 ; 97 return length; 98 #endif 99 } 100 101 template <typename Word> 102 LIBC_INLINE void *find_first_character_wide_read(const unsigned char *src, 103 unsigned char ch, size_t n) { 104 const unsigned char *char_ptr = src; 105 size_t cur = 0; 106 107 // Step 1: read 1 byte at a time to align to block size 108 for (; reinterpret_cast<uintptr_t>(char_ptr) % sizeof(Word) != 0 && cur < n; 109 ++char_ptr, ++cur) { 110 if (*char_ptr == ch) 111 return const_cast<unsigned char *>(char_ptr); 112 } 113 114 const Word ch_mask = repeat_byte<Word>(ch); 115 116 // Step 2: read blocks 117 for (const Word *block_ptr = reinterpret_cast<const Word *>(char_ptr); 118 !has_zeroes<Word>((*block_ptr) ^ ch_mask) && cur < n; 119 ++block_ptr, cur += sizeof(Word)) { 120 char_ptr = reinterpret_cast<const unsigned char *>(block_ptr); 121 } 122 123 // Step 3: find the match in the block 124 for (; *char_ptr != ch && cur < n; ++char_ptr, ++cur) { 125 ; 126 } 127 128 if (*char_ptr != ch || cur >= n) 129 return static_cast<void *>(nullptr); 130 131 return const_cast<unsigned char *>(char_ptr); 132 } 133 134 LIBC_INLINE void *find_first_character_byte_read(const unsigned char *src, 135 unsigned char ch, size_t n) { 136 for (; n && *src != ch; --n, ++src) 137 ; 138 return n ? const_cast<unsigned char *>(src) : nullptr; 139 } 140 141 // Returns the first occurrence of 'ch' within the first 'n' characters of 142 // 'src'. If 'ch' is not found, returns nullptr. 143 LIBC_INLINE void *find_first_character(const unsigned char *src, 144 unsigned char ch, size_t max_strlen) { 145 #ifdef LIBC_COPT_STRING_UNSAFE_WIDE_READ 146 // If the maximum size of the string is small, the overhead of aligning to a 147 // word boundary and generating a bitmask of the appropriate size may be 148 // greater than the gains from reading larger chunks. Based on some testing, 149 // the crossover point between when it's faster to just read bytewise and read 150 // blocks is somewhere between 16 and 32, so 4 times the size of the block 151 // should be in that range. 152 // Unsigned int is used for the same reason as in strlen. 153 using BlockType = unsigned int; 154 if (max_strlen > (sizeof(BlockType) * 4)) { 155 return find_first_character_wide_read<BlockType>(src, ch, max_strlen); 156 } 157 #endif 158 return find_first_character_byte_read(src, ch, max_strlen); 159 } 160 161 // Returns the maximum length span that contains only characters not found in 162 // 'segment'. If no characters are found, returns the length of 'src'. 163 LIBC_INLINE size_t complementary_span(const char *src, const char *segment) { 164 const char *initial = src; 165 cpp::bitset<256> bitset; 166 167 for (; *segment; ++segment) 168 bitset.set(*reinterpret_cast<const unsigned char *>(segment)); 169 for (; *src && !bitset.test(*reinterpret_cast<const unsigned char *>(src)); 170 ++src) 171 ; 172 return src - initial; 173 } 174 175 // Given the similarities between strtok and strtok_r, we can implement both 176 // using a utility function. On the first call, 'src' is scanned for the 177 // first character not found in 'delimiter_string'. Once found, it scans until 178 // the first character in the 'delimiter_string' or the null terminator is 179 // found. We define this span as a token. The end of the token is appended with 180 // a null terminator, and the token is returned. The point where the last token 181 // is found is then stored within 'context' for subsequent calls. Subsequent 182 // calls will use 'context' when a nullptr is passed in for 'src'. Once the null 183 // terminating character is reached, returns a nullptr. 184 template <bool SkipDelim = true> 185 LIBC_INLINE char *string_token(char *__restrict src, 186 const char *__restrict delimiter_string, 187 char **__restrict saveptr) { 188 // Return nullptr immediately if both src AND saveptr are nullptr 189 if (LIBC_UNLIKELY(src == nullptr && ((src = *saveptr) == nullptr))) 190 return nullptr; 191 192 cpp::bitset<256> delimiter_set; 193 for (; *delimiter_string != '\0'; ++delimiter_string) 194 delimiter_set.set(*delimiter_string); 195 196 if constexpr (SkipDelim) 197 for (; *src != '\0' && delimiter_set.test(*src); ++src) 198 ; 199 if (*src == '\0') { 200 *saveptr = src; 201 return nullptr; 202 } 203 char *token = src; 204 for (; *src != '\0'; ++src) { 205 if (delimiter_set.test(*src)) { 206 *src = '\0'; 207 ++src; 208 break; 209 } 210 } 211 *saveptr = src; 212 return token; 213 } 214 215 LIBC_INLINE size_t strlcpy(char *__restrict dst, const char *__restrict src, 216 size_t size) { 217 size_t len = internal::string_length(src); 218 if (!size) 219 return len; 220 size_t n = len < size - 1 ? len : size - 1; 221 inline_memcpy(dst, src, n); 222 dst[n] = '\0'; 223 return len; 224 } 225 226 template <bool ReturnNull = true> 227 LIBC_INLINE constexpr static char *strchr_implementation(const char *src, 228 int c) { 229 char ch = static_cast<char>(c); 230 for (; *src && *src != ch; ++src) 231 ; 232 char *ret = ReturnNull ? nullptr : const_cast<char *>(src); 233 return *src == ch ? const_cast<char *>(src) : ret; 234 } 235 236 LIBC_INLINE constexpr static char *strrchr_implementation(const char *src, 237 int c) { 238 char ch = static_cast<char>(c); 239 char *last_occurrence = nullptr; 240 while (true) { 241 if (*src == ch) 242 last_occurrence = const_cast<char *>(src); 243 if (!*src) 244 return last_occurrence; 245 ++src; 246 } 247 } 248 249 } // namespace internal 250 } // namespace LIBC_NAMESPACE_DECL 251 252 #endif // LLVM_LIBC_SRC_STRING_STRING_UTILS_H 253