xref: /llvm-project/libc/src/string/string_utils.h (revision 631a6e0004e57ca85569b99ea411418627925697)
1 //===-- String utils --------------------------------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // Standalone string utility functions. Utilities requiring memory allocations
10 // should be placed in allocating_string_utils.h instead.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #ifndef LLVM_LIBC_SRC_STRING_STRING_UTILS_H
15 #define LLVM_LIBC_SRC_STRING_STRING_UTILS_H
16 
17 #include "hdr/types/size_t.h"
18 #include "src/__support/CPP/bitset.h"
19 #include "src/__support/CPP/type_traits.h" // cpp::is_same_v
20 #include "src/__support/macros/config.h"
21 #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
22 #include "src/string/memory_utils/inline_bzero.h"
23 #include "src/string/memory_utils/inline_memcpy.h"
24 
25 namespace LIBC_NAMESPACE_DECL {
26 namespace internal {
27 
28 template <typename Word> LIBC_INLINE constexpr Word repeat_byte(Word byte) {
29   constexpr size_t BITS_IN_BYTE = 8;
30   constexpr size_t BYTE_MASK = 0xff;
31   Word result = 0;
32   byte = byte & BYTE_MASK;
33   for (size_t i = 0; i < sizeof(Word); ++i)
34     result = (result << BITS_IN_BYTE) | byte;
35   return result;
36 }
37 
38 // The goal of this function is to take in a block of arbitrary size and return
39 // if it has any bytes equal to zero without branching. This is done by
40 // transforming the block such that zero bytes become non-zero and non-zero
41 // bytes become zero.
42 // The first transformation relies on the properties of carrying in arithmetic
43 // subtraction. Specifically, if 0x01 is subtracted from a byte that is 0x00,
44 // then the result for that byte must be equal to 0xff (or 0xfe if the next byte
45 // needs a carry as well).
46 // The next transformation is a simple mask. All zero bytes will have the high
47 // bit set after the subtraction, so each byte is masked with 0x80. This narrows
48 // the set of bytes that result in a non-zero value to only zero bytes and bytes
49 // with the high bit and any other bit set.
50 // The final transformation masks the result of the previous transformations
51 // with the inverse of the original byte. This means that any byte that had the
52 // high bit set will no longer have it set, narrowing the list of bytes which
53 // result in non-zero values to just the zero byte.
54 template <typename Word> LIBC_INLINE constexpr bool has_zeroes(Word block) {
55   constexpr Word LOW_BITS = repeat_byte<Word>(0x01);
56   constexpr Word HIGH_BITS = repeat_byte<Word>(0x80);
57   Word subtracted = block - LOW_BITS;
58   Word inverted = ~block;
59   return (subtracted & inverted & HIGH_BITS) != 0;
60 }
61 
62 template <typename Word>
63 LIBC_INLINE size_t string_length_wide_read(const char *src) {
64   const char *char_ptr = src;
65   // Step 1: read 1 byte at a time to align to block size
66   for (; reinterpret_cast<uintptr_t>(char_ptr) % sizeof(Word) != 0;
67        ++char_ptr) {
68     if (*char_ptr == '\0')
69       return char_ptr - src;
70   }
71   // Step 2: read blocks
72   for (const Word *block_ptr = reinterpret_cast<const Word *>(char_ptr);
73        !has_zeroes<Word>(*block_ptr); ++block_ptr) {
74     char_ptr = reinterpret_cast<const char *>(block_ptr);
75   }
76   // Step 3: find the zero in the block
77   for (; *char_ptr != '\0'; ++char_ptr) {
78     ;
79   }
80   return char_ptr - src;
81 }
82 
83 // Returns the length of a string, denoted by the first occurrence
84 // of a null terminator.
85 template <typename T> LIBC_INLINE size_t string_length(const T *src) {
86 #ifdef LIBC_COPT_STRING_UNSAFE_WIDE_READ
87   // Unsigned int is the default size for most processors, and on x86-64 it
88   // performs better than larger sizes when the src pointer can't be assumed to
89   // be aligned to a word boundary, so it's the size we use for reading the
90   // string a block at a time.
91   if constexpr (cpp::is_same_v<T, char>)
92     return string_length_wide_read<unsigned int>(src);
93 #else
94   size_t length;
95   for (length = 0; *src; ++src, ++length)
96     ;
97   return length;
98 #endif
99 }
100 
101 template <typename Word>
102 LIBC_INLINE void *find_first_character_wide_read(const unsigned char *src,
103                                                  unsigned char ch, size_t n) {
104   const unsigned char *char_ptr = src;
105   size_t cur = 0;
106 
107   // Step 1: read 1 byte at a time to align to block size
108   for (; reinterpret_cast<uintptr_t>(char_ptr) % sizeof(Word) != 0 && cur < n;
109        ++char_ptr, ++cur) {
110     if (*char_ptr == ch)
111       return const_cast<unsigned char *>(char_ptr);
112   }
113 
114   const Word ch_mask = repeat_byte<Word>(ch);
115 
116   // Step 2: read blocks
117   for (const Word *block_ptr = reinterpret_cast<const Word *>(char_ptr);
118        !has_zeroes<Word>((*block_ptr) ^ ch_mask) && cur < n;
119        ++block_ptr, cur += sizeof(Word)) {
120     char_ptr = reinterpret_cast<const unsigned char *>(block_ptr);
121   }
122 
123   // Step 3: find the match in the block
124   for (; *char_ptr != ch && cur < n; ++char_ptr, ++cur) {
125     ;
126   }
127 
128   if (*char_ptr != ch || cur >= n)
129     return static_cast<void *>(nullptr);
130 
131   return const_cast<unsigned char *>(char_ptr);
132 }
133 
134 LIBC_INLINE void *find_first_character_byte_read(const unsigned char *src,
135                                                  unsigned char ch, size_t n) {
136   for (; n && *src != ch; --n, ++src)
137     ;
138   return n ? const_cast<unsigned char *>(src) : nullptr;
139 }
140 
141 // Returns the first occurrence of 'ch' within the first 'n' characters of
142 // 'src'. If 'ch' is not found, returns nullptr.
143 LIBC_INLINE void *find_first_character(const unsigned char *src,
144                                        unsigned char ch, size_t max_strlen) {
145 #ifdef LIBC_COPT_STRING_UNSAFE_WIDE_READ
146   // If the maximum size of the string is small, the overhead of aligning to a
147   // word boundary and generating a bitmask of the appropriate size may be
148   // greater than the gains from reading larger chunks. Based on some testing,
149   // the crossover point between when it's faster to just read bytewise and read
150   // blocks is somewhere between 16 and 32, so 4 times the size of the block
151   // should be in that range.
152   // Unsigned int is used for the same reason as in strlen.
153   using BlockType = unsigned int;
154   if (max_strlen > (sizeof(BlockType) * 4)) {
155     return find_first_character_wide_read<BlockType>(src, ch, max_strlen);
156   }
157 #endif
158   return find_first_character_byte_read(src, ch, max_strlen);
159 }
160 
161 // Returns the maximum length span that contains only characters not found in
162 // 'segment'. If no characters are found, returns the length of 'src'.
163 LIBC_INLINE size_t complementary_span(const char *src, const char *segment) {
164   const char *initial = src;
165   cpp::bitset<256> bitset;
166 
167   for (; *segment; ++segment)
168     bitset.set(*reinterpret_cast<const unsigned char *>(segment));
169   for (; *src && !bitset.test(*reinterpret_cast<const unsigned char *>(src));
170        ++src)
171     ;
172   return src - initial;
173 }
174 
175 // Given the similarities between strtok and strtok_r, we can implement both
176 // using a utility function. On the first call, 'src' is scanned for the
177 // first character not found in 'delimiter_string'. Once found, it scans until
178 // the first character in the 'delimiter_string' or the null terminator is
179 // found. We define this span as a token. The end of the token is appended with
180 // a null terminator, and the token is returned. The point where the last token
181 // is found is then stored within 'context' for subsequent calls. Subsequent
182 // calls will use 'context' when a nullptr is passed in for 'src'. Once the null
183 // terminating character is reached, returns a nullptr.
184 template <bool SkipDelim = true>
185 LIBC_INLINE char *string_token(char *__restrict src,
186                                const char *__restrict delimiter_string,
187                                char **__restrict saveptr) {
188   // Return nullptr immediately if both src AND saveptr are nullptr
189   if (LIBC_UNLIKELY(src == nullptr && ((src = *saveptr) == nullptr)))
190     return nullptr;
191 
192   cpp::bitset<256> delimiter_set;
193   for (; *delimiter_string != '\0'; ++delimiter_string)
194     delimiter_set.set(*delimiter_string);
195 
196   if constexpr (SkipDelim)
197     for (; *src != '\0' && delimiter_set.test(*src); ++src)
198       ;
199   if (*src == '\0') {
200     *saveptr = src;
201     return nullptr;
202   }
203   char *token = src;
204   for (; *src != '\0'; ++src) {
205     if (delimiter_set.test(*src)) {
206       *src = '\0';
207       ++src;
208       break;
209     }
210   }
211   *saveptr = src;
212   return token;
213 }
214 
215 LIBC_INLINE size_t strlcpy(char *__restrict dst, const char *__restrict src,
216                            size_t size) {
217   size_t len = internal::string_length(src);
218   if (!size)
219     return len;
220   size_t n = len < size - 1 ? len : size - 1;
221   inline_memcpy(dst, src, n);
222   dst[n] = '\0';
223   return len;
224 }
225 
226 template <bool ReturnNull = true>
227 LIBC_INLINE constexpr static char *strchr_implementation(const char *src,
228                                                          int c) {
229   char ch = static_cast<char>(c);
230   for (; *src && *src != ch; ++src)
231     ;
232   char *ret = ReturnNull ? nullptr : const_cast<char *>(src);
233   return *src == ch ? const_cast<char *>(src) : ret;
234 }
235 
236 LIBC_INLINE constexpr static char *strrchr_implementation(const char *src,
237                                                           int c) {
238   char ch = static_cast<char>(c);
239   char *last_occurrence = nullptr;
240   while (true) {
241     if (*src == ch)
242       last_occurrence = const_cast<char *>(src);
243     if (!*src)
244       return last_occurrence;
245     ++src;
246   }
247 }
248 
249 } // namespace internal
250 } // namespace LIBC_NAMESPACE_DECL
251 
252 #endif //  LLVM_LIBC_SRC_STRING_STRING_UTILS_H
253