1*09d4459fSDaniel Fojt /* Copyright (C) 1991, 1993, 1996-1997, 1999-2000, 2003-2004, 2006, 2008-2020
2680a9cb8SJohn Marino Free Software Foundation, Inc.
3680a9cb8SJohn Marino
4680a9cb8SJohn Marino Based on strlen implementation by Torbjorn Granlund (tege@sics.se),
5680a9cb8SJohn Marino with help from Dan Sahlin (dan@sics.se) and
6680a9cb8SJohn Marino commentary by Jim Blandy (jimb@ai.mit.edu);
7680a9cb8SJohn Marino adaptation to memchr suggested by Dick Karpinski (dick@cca.ucsf.edu),
8680a9cb8SJohn Marino and implemented in glibc by Roland McGrath (roland@ai.mit.edu).
9680a9cb8SJohn Marino Extension to memchr2 implemented by Eric Blake (ebb9@byu.net).
10680a9cb8SJohn Marino
11680a9cb8SJohn Marino This program is free software: you can redistribute it and/or modify it
12680a9cb8SJohn Marino under the terms of the GNU General Public License as published by the
13680a9cb8SJohn Marino Free Software Foundation; either version 3 of the License, or any
14680a9cb8SJohn Marino later version.
15680a9cb8SJohn Marino
16680a9cb8SJohn Marino This program is distributed in the hope that it will be useful,
17680a9cb8SJohn Marino but WITHOUT ANY WARRANTY; without even the implied warranty of
18680a9cb8SJohn Marino MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19680a9cb8SJohn Marino GNU General Public License for more details.
20680a9cb8SJohn Marino
21680a9cb8SJohn Marino You should have received a copy of the GNU General Public License
22*09d4459fSDaniel Fojt along with this program. If not, see <https://www.gnu.org/licenses/>. */
23680a9cb8SJohn Marino
24680a9cb8SJohn Marino #include <config.h>
25680a9cb8SJohn Marino
26680a9cb8SJohn Marino #include "memchr2.h"
27680a9cb8SJohn Marino
28680a9cb8SJohn Marino #include <limits.h>
29680a9cb8SJohn Marino #include <stdint.h>
30680a9cb8SJohn Marino #include <string.h>
31680a9cb8SJohn Marino
32680a9cb8SJohn Marino /* Return the first address of either C1 or C2 (treated as unsigned
33680a9cb8SJohn Marino char) that occurs within N bytes of the memory region S. If
34680a9cb8SJohn Marino neither byte appears, return NULL. */
35680a9cb8SJohn Marino void *
memchr2(void const * s,int c1_in,int c2_in,size_t n)36680a9cb8SJohn Marino memchr2 (void const *s, int c1_in, int c2_in, size_t n)
37680a9cb8SJohn Marino {
38680a9cb8SJohn Marino /* On 32-bit hardware, choosing longword to be a 32-bit unsigned
39680a9cb8SJohn Marino long instead of a 64-bit uintmax_t tends to give better
40680a9cb8SJohn Marino performance. On 64-bit hardware, unsigned long is generally 64
41680a9cb8SJohn Marino bits already. Change this typedef to experiment with
42680a9cb8SJohn Marino performance. */
43680a9cb8SJohn Marino typedef unsigned long int longword;
44680a9cb8SJohn Marino
45680a9cb8SJohn Marino const unsigned char *char_ptr;
46680a9cb8SJohn Marino void const *void_ptr;
47680a9cb8SJohn Marino const longword *longword_ptr;
48680a9cb8SJohn Marino longword repeated_one;
49680a9cb8SJohn Marino longword repeated_c1;
50680a9cb8SJohn Marino longword repeated_c2;
51680a9cb8SJohn Marino unsigned char c1;
52680a9cb8SJohn Marino unsigned char c2;
53680a9cb8SJohn Marino
54680a9cb8SJohn Marino c1 = (unsigned char) c1_in;
55680a9cb8SJohn Marino c2 = (unsigned char) c2_in;
56680a9cb8SJohn Marino
57680a9cb8SJohn Marino if (c1 == c2)
58680a9cb8SJohn Marino return memchr (s, c1, n);
59680a9cb8SJohn Marino
60680a9cb8SJohn Marino /* Handle the first few bytes by reading one byte at a time.
61680a9cb8SJohn Marino Do this until VOID_PTR is aligned on a longword boundary. */
62680a9cb8SJohn Marino for (void_ptr = s;
63680a9cb8SJohn Marino n > 0 && (uintptr_t) void_ptr % sizeof (longword) != 0;
64680a9cb8SJohn Marino --n)
65680a9cb8SJohn Marino {
66680a9cb8SJohn Marino char_ptr = void_ptr;
67680a9cb8SJohn Marino if (*char_ptr == c1 || *char_ptr == c2)
68680a9cb8SJohn Marino return (void *) void_ptr;
69680a9cb8SJohn Marino void_ptr = char_ptr + 1;
70680a9cb8SJohn Marino }
71680a9cb8SJohn Marino
72680a9cb8SJohn Marino longword_ptr = void_ptr;
73680a9cb8SJohn Marino
74680a9cb8SJohn Marino /* All these elucidatory comments refer to 4-byte longwords,
75680a9cb8SJohn Marino but the theory applies equally well to any size longwords. */
76680a9cb8SJohn Marino
77680a9cb8SJohn Marino /* Compute auxiliary longword values:
78680a9cb8SJohn Marino repeated_one is a value which has a 1 in every byte.
79680a9cb8SJohn Marino repeated_c1 has c1 in every byte.
80680a9cb8SJohn Marino repeated_c2 has c2 in every byte. */
81680a9cb8SJohn Marino repeated_one = 0x01010101;
82680a9cb8SJohn Marino repeated_c1 = c1 | (c1 << 8);
83680a9cb8SJohn Marino repeated_c2 = c2 | (c2 << 8);
84680a9cb8SJohn Marino repeated_c1 |= repeated_c1 << 16;
85680a9cb8SJohn Marino repeated_c2 |= repeated_c2 << 16;
86680a9cb8SJohn Marino if (0xffffffffU < (longword) -1)
87680a9cb8SJohn Marino {
88680a9cb8SJohn Marino repeated_one |= repeated_one << 31 << 1;
89680a9cb8SJohn Marino repeated_c1 |= repeated_c1 << 31 << 1;
90680a9cb8SJohn Marino repeated_c2 |= repeated_c2 << 31 << 1;
91680a9cb8SJohn Marino if (8 < sizeof (longword))
92680a9cb8SJohn Marino {
93680a9cb8SJohn Marino size_t i;
94680a9cb8SJohn Marino
95680a9cb8SJohn Marino for (i = 64; i < sizeof (longword) * 8; i *= 2)
96680a9cb8SJohn Marino {
97680a9cb8SJohn Marino repeated_one |= repeated_one << i;
98680a9cb8SJohn Marino repeated_c1 |= repeated_c1 << i;
99680a9cb8SJohn Marino repeated_c2 |= repeated_c2 << i;
100680a9cb8SJohn Marino }
101680a9cb8SJohn Marino }
102680a9cb8SJohn Marino }
103680a9cb8SJohn Marino
104680a9cb8SJohn Marino /* Instead of the traditional loop which tests each byte, we will test a
105680a9cb8SJohn Marino longword at a time. The tricky part is testing if *any of the four*
106680a9cb8SJohn Marino bytes in the longword in question are equal to c1 or c2. We first use
107680a9cb8SJohn Marino an xor with repeated_c1 and repeated_c2, respectively. This reduces
108680a9cb8SJohn Marino the task to testing whether *any of the four* bytes in longword1 or
109680a9cb8SJohn Marino longword2 is zero.
110680a9cb8SJohn Marino
111680a9cb8SJohn Marino Let's consider longword1. We compute tmp1 =
112680a9cb8SJohn Marino ((longword1 - repeated_one) & ~longword1) & (repeated_one << 7).
113680a9cb8SJohn Marino That is, we perform the following operations:
114680a9cb8SJohn Marino 1. Subtract repeated_one.
115680a9cb8SJohn Marino 2. & ~longword1.
116680a9cb8SJohn Marino 3. & a mask consisting of 0x80 in every byte.
117680a9cb8SJohn Marino Consider what happens in each byte:
118680a9cb8SJohn Marino - If a byte of longword1 is zero, step 1 and 2 transform it into 0xff,
119680a9cb8SJohn Marino and step 3 transforms it into 0x80. A carry can also be propagated
120680a9cb8SJohn Marino to more significant bytes.
121680a9cb8SJohn Marino - If a byte of longword1 is nonzero, let its lowest 1 bit be at
122680a9cb8SJohn Marino position k (0 <= k <= 7); so the lowest k bits are 0. After step 1,
123680a9cb8SJohn Marino the byte ends in a single bit of value 0 and k bits of value 1.
124680a9cb8SJohn Marino After step 2, the result is just k bits of value 1: 2^k - 1. After
125680a9cb8SJohn Marino step 3, the result is 0. And no carry is produced.
126680a9cb8SJohn Marino So, if longword1 has only non-zero bytes, tmp1 is zero.
127680a9cb8SJohn Marino Whereas if longword1 has a zero byte, call j the position of the least
128680a9cb8SJohn Marino significant zero byte. Then the result has a zero at positions 0, ...,
129680a9cb8SJohn Marino j-1 and a 0x80 at position j. We cannot predict the result at the more
130680a9cb8SJohn Marino significant bytes (positions j+1..3), but it does not matter since we
131680a9cb8SJohn Marino already have a non-zero bit at position 8*j+7.
132680a9cb8SJohn Marino
133680a9cb8SJohn Marino Similarly, we compute tmp2 =
134680a9cb8SJohn Marino ((longword2 - repeated_one) & ~longword2) & (repeated_one << 7).
135680a9cb8SJohn Marino
136680a9cb8SJohn Marino The test whether any byte in longword1 or longword2 is zero is equivalent
137680a9cb8SJohn Marino to testing whether tmp1 is nonzero or tmp2 is nonzero. We can combine
138680a9cb8SJohn Marino this into a single test, whether (tmp1 | tmp2) is nonzero. */
139680a9cb8SJohn Marino
140680a9cb8SJohn Marino while (n >= sizeof (longword))
141680a9cb8SJohn Marino {
142680a9cb8SJohn Marino longword longword1 = *longword_ptr ^ repeated_c1;
143680a9cb8SJohn Marino longword longword2 = *longword_ptr ^ repeated_c2;
144680a9cb8SJohn Marino
145680a9cb8SJohn Marino if (((((longword1 - repeated_one) & ~longword1)
146680a9cb8SJohn Marino | ((longword2 - repeated_one) & ~longword2))
147680a9cb8SJohn Marino & (repeated_one << 7)) != 0)
148680a9cb8SJohn Marino break;
149680a9cb8SJohn Marino longword_ptr++;
150680a9cb8SJohn Marino n -= sizeof (longword);
151680a9cb8SJohn Marino }
152680a9cb8SJohn Marino
153680a9cb8SJohn Marino char_ptr = (const unsigned char *) longword_ptr;
154680a9cb8SJohn Marino
155680a9cb8SJohn Marino /* At this point, we know that either n < sizeof (longword), or one of the
156680a9cb8SJohn Marino sizeof (longword) bytes starting at char_ptr is == c1 or == c2. On
157680a9cb8SJohn Marino little-endian machines, we could determine the first such byte without
158680a9cb8SJohn Marino any further memory accesses, just by looking at the (tmp1 | tmp2) result
159680a9cb8SJohn Marino from the last loop iteration. But this does not work on big-endian
160680a9cb8SJohn Marino machines. Choose code that works in both cases. */
161680a9cb8SJohn Marino
162680a9cb8SJohn Marino for (; n > 0; --n, ++char_ptr)
163680a9cb8SJohn Marino {
164680a9cb8SJohn Marino if (*char_ptr == c1 || *char_ptr == c2)
165680a9cb8SJohn Marino return (void *) char_ptr;
166680a9cb8SJohn Marino }
167680a9cb8SJohn Marino
168680a9cb8SJohn Marino return NULL;
169680a9cb8SJohn Marino }
170