xref: /llvm-project/libc/src/string/string_utils.h (revision 631a6e0004e57ca85569b99ea411418627925697)
1dc13a9a7Scgyurgyik //===-- String utils --------------------------------------------*- C++ -*-===//
2dc13a9a7Scgyurgyik //
3dc13a9a7Scgyurgyik // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4dc13a9a7Scgyurgyik // See https://llvm.org/LICENSE.txt for license information.
5dc13a9a7Scgyurgyik // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6dc13a9a7Scgyurgyik //
7dc13a9a7Scgyurgyik //===----------------------------------------------------------------------===//
8d85699ebSJoseph Huber //
9d85699ebSJoseph Huber // Standalone string utility functions. Utilities requiring memory allocations
10d6fc7d3aSJay Foad // should be placed in allocating_string_utils.h instead.
11d85699ebSJoseph Huber //
12d85699ebSJoseph Huber //===----------------------------------------------------------------------===//
13dc13a9a7Scgyurgyik 
14270547f3SGuillaume Chatelet #ifndef LLVM_LIBC_SRC_STRING_STRING_UTILS_H
15270547f3SGuillaume Chatelet #define LLVM_LIBC_SRC_STRING_STRING_UTILS_H
16dc13a9a7Scgyurgyik 
17*631a6e00SNick Desaulniers #include "hdr/types/size_t.h"
18e2d79758SGuillaume Chatelet #include "src/__support/CPP/bitset.h"
19*631a6e00SNick Desaulniers #include "src/__support/CPP/type_traits.h" // cpp::is_same_v
205ff3ff33SPetr Hosek #include "src/__support/macros/config.h"
21737e1cd1SGuillaume Chatelet #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
221f578347SGuillaume Chatelet #include "src/string/memory_utils/inline_bzero.h"
231f578347SGuillaume Chatelet #include "src/string/memory_utils/inline_memcpy.h"
24dc13a9a7Scgyurgyik 
255ff3ff33SPetr Hosek namespace LIBC_NAMESPACE_DECL {
26dc13a9a7Scgyurgyik namespace internal {
27dc13a9a7Scgyurgyik 
28019a477cSRoland McGrath template <typename Word> LIBC_INLINE constexpr Word repeat_byte(Word byte) {
29a3b74581SMichael Jones   constexpr size_t BITS_IN_BYTE = 8;
30a3b74581SMichael Jones   constexpr size_t BYTE_MASK = 0xff;
31a3b74581SMichael Jones   Word result = 0;
32a3b74581SMichael Jones   byte = byte & BYTE_MASK;
33a3b74581SMichael Jones   for (size_t i = 0; i < sizeof(Word); ++i)
34a3b74581SMichael Jones     result = (result << BITS_IN_BYTE) | byte;
35a3b74581SMichael Jones   return result;
36a3b74581SMichael Jones }
37a3b74581SMichael Jones 
38a3b74581SMichael Jones // The goal of this function is to take in a block of arbitrary size and return
39a3b74581SMichael Jones // if it has any bytes equal to zero without branching. This is done by
40a3b74581SMichael Jones // transforming the block such that zero bytes become non-zero and non-zero
41a3b74581SMichael Jones // bytes become zero.
42a3b74581SMichael Jones // The first transformation relies on the properties of carrying in arithmetic
43a3b74581SMichael Jones // subtraction. Specifically, if 0x01 is subtracted from a byte that is 0x00,
44a3b74581SMichael Jones // then the result for that byte must be equal to 0xff (or 0xfe if the next byte
45a3b74581SMichael Jones // needs a carry as well).
46a3b74581SMichael Jones // The next transformation is a simple mask. All zero bytes will have the high
47a3b74581SMichael Jones // bit set after the subtraction, so each byte is masked with 0x80. This narrows
48a3b74581SMichael Jones // the set of bytes that result in a non-zero value to only zero bytes and bytes
49a3b74581SMichael Jones // with the high bit and any other bit set.
50a3b74581SMichael Jones // The final transformation masks the result of the previous transformations
51a3b74581SMichael Jones // with the inverse of the original byte. This means that any byte that had the
52a3b74581SMichael Jones // high bit set will no longer have it set, narrowing the list of bytes which
53a3b74581SMichael Jones // result in non-zero values to just the zero byte.
54019a477cSRoland McGrath template <typename Word> LIBC_INLINE constexpr bool has_zeroes(Word block) {
55a3b74581SMichael Jones   constexpr Word LOW_BITS = repeat_byte<Word>(0x01);
56a3b74581SMichael Jones   constexpr Word HIGH_BITS = repeat_byte<Word>(0x80);
57a3b74581SMichael Jones   Word subtracted = block - LOW_BITS;
58a3b74581SMichael Jones   Word inverted = ~block;
59a3b74581SMichael Jones   return (subtracted & inverted & HIGH_BITS) != 0;
60a3b74581SMichael Jones }
61a3b74581SMichael Jones 
62a3b74581SMichael Jones template <typename Word>
636363320bSSiva Chandra Reddy LIBC_INLINE size_t string_length_wide_read(const char *src) {
64a3b74581SMichael Jones   const char *char_ptr = src;
65a3b74581SMichael Jones   // Step 1: read 1 byte at a time to align to block size
66a3b74581SMichael Jones   for (; reinterpret_cast<uintptr_t>(char_ptr) % sizeof(Word) != 0;
67a3b74581SMichael Jones        ++char_ptr) {
68a3b74581SMichael Jones     if (*char_ptr == '\0')
69a3b74581SMichael Jones       return char_ptr - src;
70a3b74581SMichael Jones   }
71a3b74581SMichael Jones   // Step 2: read blocks
72a3b74581SMichael Jones   for (const Word *block_ptr = reinterpret_cast<const Word *>(char_ptr);
73a3b74581SMichael Jones        !has_zeroes<Word>(*block_ptr); ++block_ptr) {
74a3b74581SMichael Jones     char_ptr = reinterpret_cast<const char *>(block_ptr);
75a3b74581SMichael Jones   }
76a3b74581SMichael Jones   // Step 3: find the zero in the block
77a3b74581SMichael Jones   for (; *char_ptr != '\0'; ++char_ptr) {
78a3b74581SMichael Jones     ;
79a3b74581SMichael Jones   }
80a3b74581SMichael Jones   return char_ptr - src;
81a3b74581SMichael Jones }
82a3b74581SMichael Jones 
83a3b74581SMichael Jones // Returns the length of a string, denoted by the first occurrence
84a3b74581SMichael Jones // of a null terminator.
85*631a6e00SNick Desaulniers template <typename T> LIBC_INLINE size_t string_length(const T *src) {
8617114f8bSSiva Chandra #ifdef LIBC_COPT_STRING_UNSAFE_WIDE_READ
87a3b74581SMichael Jones   // Unsigned int is the default size for most processors, and on x86-64 it
88a3b74581SMichael Jones   // performs better than larger sizes when the src pointer can't be assumed to
89a3b74581SMichael Jones   // be aligned to a word boundary, so it's the size we use for reading the
90a3b74581SMichael Jones   // string a block at a time.
91*631a6e00SNick Desaulniers   if constexpr (cpp::is_same_v<T, char>)
92a3b74581SMichael Jones     return string_length_wide_read<unsigned int>(src);
93a3b74581SMichael Jones #else
94*631a6e00SNick Desaulniers   size_t length;
95*631a6e00SNick Desaulniers   for (length = 0; *src; ++src, ++length)
96*631a6e00SNick Desaulniers     ;
97*631a6e00SNick Desaulniers   return length;
98a3b74581SMichael Jones #endif
99a3b74581SMichael Jones }
100a3b74581SMichael Jones 
101a3b74581SMichael Jones template <typename Word>
1026363320bSSiva Chandra Reddy LIBC_INLINE void *find_first_character_wide_read(const unsigned char *src,
103a3b74581SMichael Jones                                                  unsigned char ch, size_t n) {
104a3b74581SMichael Jones   const unsigned char *char_ptr = src;
105a3b74581SMichael Jones   size_t cur = 0;
106a3b74581SMichael Jones 
107a3b74581SMichael Jones   // Step 1: read 1 byte at a time to align to block size
108a3b74581SMichael Jones   for (; reinterpret_cast<uintptr_t>(char_ptr) % sizeof(Word) != 0 && cur < n;
109a3b74581SMichael Jones        ++char_ptr, ++cur) {
110a3b74581SMichael Jones     if (*char_ptr == ch)
111a3b74581SMichael Jones       return const_cast<unsigned char *>(char_ptr);
112a3b74581SMichael Jones   }
113a3b74581SMichael Jones 
114a3b74581SMichael Jones   const Word ch_mask = repeat_byte<Word>(ch);
115a3b74581SMichael Jones 
116a3b74581SMichael Jones   // Step 2: read blocks
117a3b74581SMichael Jones   for (const Word *block_ptr = reinterpret_cast<const Word *>(char_ptr);
118a3b74581SMichael Jones        !has_zeroes<Word>((*block_ptr) ^ ch_mask) && cur < n;
119a3b74581SMichael Jones        ++block_ptr, cur += sizeof(Word)) {
120a3b74581SMichael Jones     char_ptr = reinterpret_cast<const unsigned char *>(block_ptr);
121a3b74581SMichael Jones   }
122a3b74581SMichael Jones 
123a3b74581SMichael Jones   // Step 3: find the match in the block
124a3b74581SMichael Jones   for (; *char_ptr != ch && cur < n; ++char_ptr, ++cur) {
125a3b74581SMichael Jones     ;
126a3b74581SMichael Jones   }
127a3b74581SMichael Jones 
128a3b74581SMichael Jones   if (*char_ptr != ch || cur >= n)
129a3b74581SMichael Jones     return static_cast<void *>(nullptr);
130a3b74581SMichael Jones 
131a3b74581SMichael Jones   return const_cast<unsigned char *>(char_ptr);
132a3b74581SMichael Jones }
133a3b74581SMichael Jones 
1346363320bSSiva Chandra Reddy LIBC_INLINE void *find_first_character_byte_read(const unsigned char *src,
135c92d1aa4Scgyurgyik                                                  unsigned char ch, size_t n) {
136c92d1aa4Scgyurgyik   for (; n && *src != ch; --n, ++src)
137c92d1aa4Scgyurgyik     ;
138c92d1aa4Scgyurgyik   return n ? const_cast<unsigned char *>(src) : nullptr;
139c92d1aa4Scgyurgyik }
140c92d1aa4Scgyurgyik 
141a3b74581SMichael Jones // Returns the first occurrence of 'ch' within the first 'n' characters of
142a3b74581SMichael Jones // 'src'. If 'ch' is not found, returns nullptr.
1436363320bSSiva Chandra Reddy LIBC_INLINE void *find_first_character(const unsigned char *src,
144a3b74581SMichael Jones                                        unsigned char ch, size_t max_strlen) {
14517114f8bSSiva Chandra #ifdef LIBC_COPT_STRING_UNSAFE_WIDE_READ
146a3b74581SMichael Jones   // If the maximum size of the string is small, the overhead of aligning to a
147a3b74581SMichael Jones   // word boundary and generating a bitmask of the appropriate size may be
148a3b74581SMichael Jones   // greater than the gains from reading larger chunks. Based on some testing,
149a3b74581SMichael Jones   // the crossover point between when it's faster to just read bytewise and read
150a3b74581SMichael Jones   // blocks is somewhere between 16 and 32, so 4 times the size of the block
151a3b74581SMichael Jones   // should be in that range.
152a3b74581SMichael Jones   // Unsigned int is used for the same reason as in strlen.
153a3b74581SMichael Jones   using BlockType = unsigned int;
154a3b74581SMichael Jones   if (max_strlen > (sizeof(BlockType) * 4)) {
155a3b74581SMichael Jones     return find_first_character_wide_read<BlockType>(src, ch, max_strlen);
156a3b74581SMichael Jones   }
157a3b74581SMichael Jones #endif
158a3b74581SMichael Jones   return find_first_character_byte_read(src, ch, max_strlen);
159a3b74581SMichael Jones }
160a3b74581SMichael Jones 
161dc13a9a7Scgyurgyik // Returns the maximum length span that contains only characters not found in
162dc13a9a7Scgyurgyik // 'segment'. If no characters are found, returns the length of 'src'.
1636363320bSSiva Chandra Reddy LIBC_INLINE size_t complementary_span(const char *src, const char *segment) {
164dc13a9a7Scgyurgyik   const char *initial = src;
165e2d79758SGuillaume Chatelet   cpp::bitset<256> bitset;
166dc13a9a7Scgyurgyik 
167dc13a9a7Scgyurgyik   for (; *segment; ++segment)
168c891ef6cSAlex Brachet     bitset.set(*reinterpret_cast<const unsigned char *>(segment));
169c891ef6cSAlex Brachet   for (; *src && !bitset.test(*reinterpret_cast<const unsigned char *>(src));
170c891ef6cSAlex Brachet        ++src)
171dc13a9a7Scgyurgyik     ;
172dc13a9a7Scgyurgyik   return src - initial;
173dc13a9a7Scgyurgyik }
174dc13a9a7Scgyurgyik 
175bc45bab7Sparallels // Given the similarities between strtok and strtok_r, we can implement both
176bc45bab7Sparallels // using a utility function. On the first call, 'src' is scanned for the
177bc45bab7Sparallels // first character not found in 'delimiter_string'. Once found, it scans until
178bc45bab7Sparallels // the first character in the 'delimiter_string' or the null terminator is
179bc45bab7Sparallels // found. We define this span as a token. The end of the token is appended with
180bc45bab7Sparallels // a null terminator, and the token is returned. The point where the last token
181bc45bab7Sparallels // is found is then stored within 'context' for subsequent calls. Subsequent
182bc45bab7Sparallels // calls will use 'context' when a nullptr is passed in for 'src'. Once the null
183bc45bab7Sparallels // terminating character is reached, returns a nullptr.
184b4ab398cSAlex Brachet template <bool SkipDelim = true>
1856363320bSSiva Chandra Reddy LIBC_INLINE char *string_token(char *__restrict src,
18679ce64eaScgyurgyik                                const char *__restrict delimiter_string,
18779ce64eaScgyurgyik                                char **__restrict saveptr) {
1880784e62cSAlfonso Gregory   // Return nullptr immediately if both src AND saveptr are nullptr
18929f8e076SGuillaume Chatelet   if (LIBC_UNLIKELY(src == nullptr && ((src = *saveptr) == nullptr)))
1900784e62cSAlfonso Gregory     return nullptr;
1910784e62cSAlfonso Gregory 
192e2d79758SGuillaume Chatelet   cpp::bitset<256> delimiter_set;
1930784e62cSAlfonso Gregory   for (; *delimiter_string != '\0'; ++delimiter_string)
194bc45bab7Sparallels     delimiter_set.set(*delimiter_string);
195bc45bab7Sparallels 
196b4ab398cSAlex Brachet   if constexpr (SkipDelim)
1970784e62cSAlfonso Gregory     for (; *src != '\0' && delimiter_set.test(*src); ++src)
198bc45bab7Sparallels       ;
1990784e62cSAlfonso Gregory   if (*src == '\0') {
200bc45bab7Sparallels     *saveptr = src;
201bc45bab7Sparallels     return nullptr;
202bc45bab7Sparallels   }
203bc45bab7Sparallels   char *token = src;
2040784e62cSAlfonso Gregory   for (; *src != '\0'; ++src) {
2050784e62cSAlfonso Gregory     if (delimiter_set.test(*src)) {
206bc45bab7Sparallels       *src = '\0';
207bc45bab7Sparallels       ++src;
2080784e62cSAlfonso Gregory       break;
2090784e62cSAlfonso Gregory     }
210bc45bab7Sparallels   }
211bc45bab7Sparallels   *saveptr = src;
212bc45bab7Sparallels   return token;
213bc45bab7Sparallels }
214bc45bab7Sparallels 
2156363320bSSiva Chandra Reddy LIBC_INLINE size_t strlcpy(char *__restrict dst, const char *__restrict src,
216b1183305SAlex Brachet                            size_t size) {
217b1183305SAlex Brachet   size_t len = internal::string_length(src);
218b1183305SAlex Brachet   if (!size)
219b1183305SAlex Brachet     return len;
220b1183305SAlex Brachet   size_t n = len < size - 1 ? len : size - 1;
221b1183305SAlex Brachet   inline_memcpy(dst, src, n);
222b03c8c4fSGeorge Burgess IV   dst[n] = '\0';
223b1183305SAlex Brachet   return len;
224b1183305SAlex Brachet }
225b1183305SAlex Brachet 
22682ca29ceSAlex Brachet template <bool ReturnNull = true>
2275bf8efd2SRoland McGrath LIBC_INLINE constexpr static char *strchr_implementation(const char *src,
2285bf8efd2SRoland McGrath                                                          int c) {
22982ca29ceSAlex Brachet   char ch = static_cast<char>(c);
23082ca29ceSAlex Brachet   for (; *src && *src != ch; ++src)
23182ca29ceSAlex Brachet     ;
23282ca29ceSAlex Brachet   char *ret = ReturnNull ? nullptr : const_cast<char *>(src);
23382ca29ceSAlex Brachet   return *src == ch ? const_cast<char *>(src) : ret;
23482ca29ceSAlex Brachet }
23582ca29ceSAlex Brachet 
2365bf8efd2SRoland McGrath LIBC_INLINE constexpr static char *strrchr_implementation(const char *src,
2375bf8efd2SRoland McGrath                                                           int c) {
23882ca29ceSAlex Brachet   char ch = static_cast<char>(c);
23982ca29ceSAlex Brachet   char *last_occurrence = nullptr;
24050c44478SGeorge Burgess IV   while (true) {
24182ca29ceSAlex Brachet     if (*src == ch)
24282ca29ceSAlex Brachet       last_occurrence = const_cast<char *>(src);
24350c44478SGeorge Burgess IV     if (!*src)
24482ca29ceSAlex Brachet       return last_occurrence;
24550c44478SGeorge Burgess IV     ++src;
24650c44478SGeorge Burgess IV   }
24782ca29ceSAlex Brachet }
24882ca29ceSAlex Brachet 
249dc13a9a7Scgyurgyik } // namespace internal
2505ff3ff33SPetr Hosek } // namespace LIBC_NAMESPACE_DECL
251dc13a9a7Scgyurgyik 
252270547f3SGuillaume Chatelet #endif //  LLVM_LIBC_SRC_STRING_STRING_UTILS_H
253