1 /**********************************************************************
2 Copyright(c) 2011-2016 Intel Corporation All rights reserved.
3
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions
6 are met:
7 * Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 * Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the
12 distribution.
13 * Neither the name of Intel Corporation nor the names of its
14 contributors may be used to endorse or promote products derived
15 from this software without specific prior written permission.
16
17 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 **********************************************************************/
29
30 /**
31 * @file memcpy_inline.h
32 * @brief Defines intrinsic memcpy functions used by the new hashing API
33 *
34 */
35
36 #ifndef _MEMCPY_H_
37 #define _MEMCPY_H_
38
39 #if defined(__i386__) || defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86)
40 #include "intrinreg.h"
41 #endif
42 #include <string.h>
43 #include <assert.h>
44
45 #ifdef __cplusplus
46 extern "C" {
47 #endif
48
49 #if defined(__i386__) || defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86)
50
/* On x86 the generic entry points are aliases for the SSE implementations. */
#define memcpy_varlen   memcpy_sse_varlen
#define memcpy_fixedlen memcpy_sse_fixedlen
#define memclr_varlen   memclr_sse_varlen
#define memclr_fixedlen memclr_sse_fixedlen

/* Forward declarations: copy helpers, *_fixedlen variants. */
static inline void memcpy_lte32_sse_fixedlen(void *dst, const void *src, size_t nbytes);
static inline void memcpy_gte16_sse_fixedlen(void *dst, const void *src, size_t nbytes);
static inline void memcpy_sse_fixedlen(void *dst, const void *src, size_t nbytes);

/* Forward declarations: copy helpers, *_varlen variants. */
static inline void memcpy_lte32_sse_varlen(void *dst, const void *src, size_t nbytes);
static inline void memcpy_gte16_sse_varlen(void *dst, const void *src, size_t nbytes);
static inline void memcpy_sse_varlen(void *dst, const void *src, size_t nbytes);

/* Forward declarations: clear (zero-fill) helpers, *_fixedlen variants. */
static inline void memclr_lte32_sse_fixedlen(void *dst, size_t nbytes);
static inline void memclr_gte16_sse_fixedlen(void *dst, size_t nbytes);
static inline void memclr_sse_fixedlen(void *dst, size_t nbytes);

/* Forward declarations: clear (zero-fill) helpers, *_varlen variants. */
static inline void memclr_lte32_sse_varlen(void *dst, size_t nbytes);
static inline void memclr_gte16_sse_varlen(void *dst, size_t nbytes);
static inline void memclr_sse_varlen(void *dst, size_t nbytes);
84
/**
 * Copy nbytes bytes from src to dst, where N <= nbytes <= 2 * N, using
 * N-byte register moves.
 *
 * A "head" move covers bytes [0, N) and a "tail" move covers the last N bytes,
 * [nbytes - N, nbytes); the two moves overlap whenever nbytes < 2 * N. Both
 * loads are issued before either store.
 *
 * Only the head move is performed when N == 1 (regardless of nbytes), or when
 * fixedwidth is non-zero and nbytes == N.
 *
 * N must be a literal for which intrinreg##N, load_intrinreg##N and
 * store_intrinreg##N exist. Every parameter is parenthesized in the expansion
 * so that expression arguments (e.g. `p + 1`, `c ? a : b`) keep their meaning.
 * Arguments are still evaluated more than once, so they must be free of side
 * effects.
 */
#define MEMCPY_BETWEEN_N_AND_2N_BYTES(N, fixedwidth, dst, src, nbytes)                         \
        do {                                                                                   \
                intrinreg##N head;                                                             \
                intrinreg##N tail;                                                             \
                assert((N) <= (nbytes) && (nbytes) <= 2 * (N));                                \
                if ((N) == 1 || ((fixedwidth) && (nbytes) == (N))) {                           \
                        head = load_intrinreg##N((src));                                       \
                        store_intrinreg##N((dst), head);                                       \
                } else {                                                                       \
                        head = load_intrinreg##N((src));                                       \
                        tail = load_intrinreg##N(                                              \
                                (const void *) ((const char *) (src) + ((nbytes) - (N))));     \
                        store_intrinreg##N((dst), head);                                       \
                        store_intrinreg##N((void *) ((char *) (dst) + ((nbytes) - (N))),       \
                                           tail);                                              \
                }                                                                              \
        } while (0)
101
/**
 * Zero nbytes bytes at dst, where N <= nbytes <= 2 * N, using N-byte
 * register stores.
 *
 * One store covers bytes [0, N) and a second covers the last N bytes,
 * [nbytes - N, nbytes); the two overlap whenever nbytes < 2 * N.
 *
 * Only the first store is performed when N == 1 (regardless of nbytes), or
 * when fixedwidth is non-zero and nbytes == N.
 *
 * N must be a literal for which intrinreg##N and store_intrinreg##N exist.
 * Every parameter is parenthesized in the expansion so that expression
 * arguments keep their meaning. Arguments are still evaluated more than once,
 * so they must be free of side effects.
 */
#define MEMCLR_BETWEEN_N_AND_2N_BYTES(N, fixedwidth, dst, nbytes)                              \
        do {                                                                                   \
                const intrinreg##N zero = { 0 };                                               \
                assert((N) <= (nbytes) && (nbytes) <= 2 * (N));                                \
                if ((N) == 1 || ((fixedwidth) && (nbytes) == (N))) {                           \
                        store_intrinreg##N((dst), zero);                                       \
                } else {                                                                       \
                        store_intrinreg##N((dst), zero);                                       \
                        store_intrinreg##N((void *) ((char *) (dst) + ((nbytes) - (N))),       \
                                           zero);                                              \
                }                                                                              \
        } while (0)
113
// Define load/store functions uniformly.

// 16-byte moves map onto the unaligned SSE load/store intrinsics. The macro
// arguments are parenthesized so that a pointer expression such as `p + 4` is
// cast as a whole instead of having only its first operand cast to float *.
#define load_intrinreg16(src)       _mm_loadu_ps((const float *) (src))
#define store_intrinreg16(dst, val) _mm_storeu_ps((float *) (dst), (val))
118
119 static inline intrinreg8
load_intrinreg8(const void * src)120 load_intrinreg8(const void *src)
121 {
122 return *(intrinreg8 *) src;
123 }
124
125 static inline void
store_intrinreg8(void * dst,intrinreg8 val)126 store_intrinreg8(void *dst, intrinreg8 val)
127 {
128 *(intrinreg8 *) dst = val;
129 }
130
131 static inline intrinreg4
load_intrinreg4(const void * src)132 load_intrinreg4(const void *src)
133 {
134 return *(intrinreg4 *) src;
135 }
136
137 static inline void
store_intrinreg4(void * dst,intrinreg4 val)138 store_intrinreg4(void *dst, intrinreg4 val)
139 {
140 *(intrinreg4 *) dst = val;
141 }
142
143 static inline intrinreg2
load_intrinreg2(const void * src)144 load_intrinreg2(const void *src)
145 {
146 return *(intrinreg2 *) src;
147 }
148
149 static inline void
store_intrinreg2(void * dst,intrinreg2 val)150 store_intrinreg2(void *dst, intrinreg2 val)
151 {
152 *(intrinreg2 *) dst = val;
153 }
154
155 static inline intrinreg1
load_intrinreg1(const void * src)156 load_intrinreg1(const void *src)
157 {
158 return *(intrinreg1 *) src;
159 }
160
161 static inline void
store_intrinreg1(void * dst,intrinreg1 val)162 store_intrinreg1(void *dst, intrinreg1 val)
163 {
164 *(intrinreg1 *) dst = val;
165 }
166
167 static inline void
memcpy_gte16_sse_fixedlen(void * dst,const void * src,size_t nbytes)168 memcpy_gte16_sse_fixedlen(void *dst, const void *src, size_t nbytes)
169 {
170 size_t i;
171 size_t j;
172 intrinreg16 pool[4];
173 size_t remaining_moves;
174 size_t tail_offset;
175 int do_tail;
176 assert(nbytes >= 16);
177
178 for (i = 0; i + 16 * 4 <= nbytes; i += 16 * 4) {
179 for (j = 0; j < 4; j++)
180 pool[j] =
181 load_intrinreg16((const void *) ((const char *) src + i + 16 * j));
182 for (j = 0; j < 4; j++)
183 store_intrinreg16((void *) ((char *) dst + i + 16 * j), pool[j]);
184 }
185
186 remaining_moves = (nbytes - i) / 16;
187 tail_offset = nbytes - 16;
188 do_tail = (tail_offset & (16 - 1));
189
190 for (j = 0; j < remaining_moves; j++)
191 pool[j] = load_intrinreg16((const void *) ((const char *) src + i + 16 * j));
192
193 if (do_tail)
194 pool[j] = load_intrinreg16((const void *) ((const char *) src + tail_offset));
195
196 for (j = 0; j < remaining_moves; j++)
197 store_intrinreg16((void *) ((char *) dst + i + 16 * j), pool[j]);
198
199 if (do_tail)
200 store_intrinreg16((void *) ((char *) dst + tail_offset), pool[j]);
201 }
202
203 static inline void
memclr_gte16_sse_fixedlen(void * dst,size_t nbytes)204 memclr_gte16_sse_fixedlen(void *dst, size_t nbytes)
205 {
206 size_t i;
207 size_t j;
208 const intrinreg16 zero = { 0 };
209 size_t remaining_moves;
210 size_t tail_offset;
211 int do_tail;
212 assert(nbytes >= 16);
213
214 for (i = 0; i + 16 * 4 <= nbytes; i += 16 * 4)
215 for (j = 0; j < 4; j++)
216 store_intrinreg16((void *) ((char *) dst + i + 16 * j), zero);
217
218 remaining_moves = (nbytes - i) / 16;
219 tail_offset = nbytes - 16;
220 do_tail = (tail_offset & (16 - 1));
221
222 for (j = 0; j < remaining_moves; j++)
223 store_intrinreg16((void *) ((char *) dst + i + 16 * j), zero);
224
225 if (do_tail)
226 store_intrinreg16((void *) ((char *) dst + tail_offset), zero);
227 }
228
/**
 * Copy at most 32 bytes from src to dst.
 *
 * Picks the register width N (1, 2, 4, 8 or 16) with N <= nbytes < 2 * N
 * (N = 16 for anything from 16 up) and delegates to
 * MEMCPY_BETWEEN_N_AND_2N_BYTES with fixedwidth = 1. A length of 0 copies
 * nothing.
 *
 * @param dst     destination buffer
 * @param src     source buffer
 * @param nbytes  number of bytes to copy; asserted to be at most 32
 */
static inline void
memcpy_lte32_sse_fixedlen(void *dst, const void *src, size_t nbytes)
{
        assert(nbytes <= 32);

        if (nbytes < 1) {
                return;
        }

        if (nbytes < 2) {
                MEMCPY_BETWEEN_N_AND_2N_BYTES(1, 1, dst, src, nbytes);
        } else if (nbytes < 4) {
                MEMCPY_BETWEEN_N_AND_2N_BYTES(2, 1, dst, src, nbytes);
        } else if (nbytes < 8) {
                MEMCPY_BETWEEN_N_AND_2N_BYTES(4, 1, dst, src, nbytes);
        } else if (nbytes < 16) {
                MEMCPY_BETWEEN_N_AND_2N_BYTES(8, 1, dst, src, nbytes);
        } else {
                MEMCPY_BETWEEN_N_AND_2N_BYTES(16, 1, dst, src, nbytes);
        }
}
244
/**
 * Zero at most 32 bytes at dst.
 *
 * Picks the register width N (1, 2, 4, 8 or 16) with N <= nbytes < 2 * N
 * (N = 16 for anything from 16 up) and delegates to
 * MEMCLR_BETWEEN_N_AND_2N_BYTES with fixedwidth = 1. A length of 0 clears
 * nothing.
 *
 * @param dst     buffer to clear
 * @param nbytes  number of bytes to clear; asserted to be at most 32
 */
static inline void
memclr_lte32_sse_fixedlen(void *dst, size_t nbytes)
{
        assert(nbytes <= 32);

        if (nbytes < 1) {
                return;
        }

        if (nbytes < 2) {
                MEMCLR_BETWEEN_N_AND_2N_BYTES(1, 1, dst, nbytes);
        } else if (nbytes < 4) {
                MEMCLR_BETWEEN_N_AND_2N_BYTES(2, 1, dst, nbytes);
        } else if (nbytes < 8) {
                MEMCLR_BETWEEN_N_AND_2N_BYTES(4, 1, dst, nbytes);
        } else if (nbytes < 16) {
                MEMCLR_BETWEEN_N_AND_2N_BYTES(8, 1, dst, nbytes);
        } else {
                MEMCLR_BETWEEN_N_AND_2N_BYTES(16, 1, dst, nbytes);
        }
}
260
/**
 * Copy at most 32 bytes from src to dst.
 *
 * Picks the register width N (1, 2, 4, 8 or 16) with N <= nbytes < 2 * N
 * (N = 16 for anything from 16 up) and delegates to
 * MEMCPY_BETWEEN_N_AND_2N_BYTES with fixedwidth = 0, so for N > 1 both the
 * head and the tail move are always issued. A length of 0 copies nothing.
 *
 * @param dst     destination buffer
 * @param src     source buffer
 * @param nbytes  number of bytes to copy; asserted to be at most 32
 */
static inline void
memcpy_lte32_sse_varlen(void *dst, const void *src, size_t nbytes)
{
        assert(nbytes <= 32);

        if (nbytes < 1) {
                return;
        }

        if (nbytes < 2) {
                MEMCPY_BETWEEN_N_AND_2N_BYTES(1, 0, dst, src, nbytes);
        } else if (nbytes < 4) {
                MEMCPY_BETWEEN_N_AND_2N_BYTES(2, 0, dst, src, nbytes);
        } else if (nbytes < 8) {
                MEMCPY_BETWEEN_N_AND_2N_BYTES(4, 0, dst, src, nbytes);
        } else if (nbytes < 16) {
                MEMCPY_BETWEEN_N_AND_2N_BYTES(8, 0, dst, src, nbytes);
        } else {
                MEMCPY_BETWEEN_N_AND_2N_BYTES(16, 0, dst, src, nbytes);
        }
}
276
/**
 * Zero at most 32 bytes at dst.
 *
 * Picks the register width N (1, 2, 4, 8 or 16) with N <= nbytes < 2 * N
 * (N = 16 for anything from 16 up) and delegates to
 * MEMCLR_BETWEEN_N_AND_2N_BYTES with fixedwidth = 0, so for N > 1 both
 * stores are always issued. A length of 0 clears nothing.
 *
 * @param dst     buffer to clear
 * @param nbytes  number of bytes to clear; asserted to be at most 32
 */
static inline void
memclr_lte32_sse_varlen(void *dst, size_t nbytes)
{
        assert(nbytes <= 32);

        if (nbytes < 1) {
                return;
        }

        if (nbytes < 2) {
                MEMCLR_BETWEEN_N_AND_2N_BYTES(1, 0, dst, nbytes);
        } else if (nbytes < 4) {
                MEMCLR_BETWEEN_N_AND_2N_BYTES(2, 0, dst, nbytes);
        } else if (nbytes < 8) {
                MEMCLR_BETWEEN_N_AND_2N_BYTES(4, 0, dst, nbytes);
        } else if (nbytes < 16) {
                MEMCLR_BETWEEN_N_AND_2N_BYTES(8, 0, dst, nbytes);
        } else {
                MEMCLR_BETWEEN_N_AND_2N_BYTES(16, 0, dst, nbytes);
        }
}
292
293 static inline void
memcpy_gte16_sse_varlen(void * dst,const void * src,size_t nbytes)294 memcpy_gte16_sse_varlen(void *dst, const void *src, size_t nbytes)
295 {
296 size_t i = 0;
297 intrinreg16 tail;
298
299 assert(nbytes >= 16);
300
301 while (i + 128 <= nbytes) {
302 memcpy_gte16_sse_fixedlen((void *) ((char *) dst + i),
303 (const void *) ((const char *) src + i), 128);
304 i += 128;
305 }
306 if (i + 64 <= nbytes) {
307 memcpy_gte16_sse_fixedlen((void *) ((char *) dst + i),
308 (const void *) ((const char *) src + i), 64);
309 i += 64;
310 }
311 if (i + 32 <= nbytes) {
312 memcpy_gte16_sse_fixedlen((void *) ((char *) dst + i),
313 (const void *) ((const char *) src + i), 32);
314 i += 32;
315 }
316 if (i + 16 <= nbytes) {
317 memcpy_gte16_sse_fixedlen((void *) ((char *) dst + i),
318 (const void *) ((const char *) src + i), 16);
319 }
320
321 i = nbytes - 16;
322 tail = load_intrinreg16((const void *) ((const char *) src + i));
323 store_intrinreg16((void *) ((char *) dst + i), tail);
324 }
325
326 static inline void
memclr_gte16_sse_varlen(void * dst,size_t nbytes)327 memclr_gte16_sse_varlen(void *dst, size_t nbytes)
328 {
329 size_t i = 0;
330 const intrinreg16 zero = { 0 };
331
332 assert(nbytes >= 16);
333
334 while (i + 128 <= nbytes) {
335 memclr_gte16_sse_fixedlen((void *) ((char *) dst + i), 128);
336 i += 128;
337 }
338 if (i + 64 <= nbytes) {
339 memclr_gte16_sse_fixedlen((void *) ((char *) dst + i), 64);
340 i += 64;
341 }
342 if (i + 32 <= nbytes) {
343 memclr_gte16_sse_fixedlen((void *) ((char *) dst + i), 32);
344 i += 32;
345 }
346 if (i + 16 <= nbytes) {
347 memclr_gte16_sse_fixedlen((void *) ((char *) dst + i), 16);
348 }
349
350 i = nbytes - 16;
351 store_intrinreg16((void *) ((char *) dst + i), zero);
352 }
353
/**
 * Copy nbytes bytes from src to dst (the *_fixedlen entry point).
 *
 * Lengths below 16 go to memcpy_lte32_sse_fixedlen; everything else goes to
 * memcpy_gte16_sse_fixedlen.
 *
 * @param dst     destination buffer
 * @param src     source buffer
 * @param nbytes  number of bytes to copy
 */
static inline void
memcpy_sse_fixedlen(void *dst, const void *src, size_t nbytes)
{
        if (nbytes < 16) {
                memcpy_lte32_sse_fixedlen(dst, src, nbytes);
                return;
        }

        memcpy_gte16_sse_fixedlen(dst, src, nbytes);
}
362
/**
 * Zero nbytes bytes at dst (the *_fixedlen entry point).
 *
 * Lengths below 16 go to memclr_lte32_sse_fixedlen; everything else goes to
 * memclr_gte16_sse_fixedlen.
 *
 * @param dst     buffer to clear
 * @param nbytes  number of bytes to clear
 */
static inline void
memclr_sse_fixedlen(void *dst, size_t nbytes)
{
        if (nbytes < 16) {
                memclr_lte32_sse_fixedlen(dst, nbytes);
                return;
        }

        memclr_gte16_sse_fixedlen(dst, nbytes);
}
371
/**
 * Copy nbytes bytes from src to dst (the *_varlen entry point).
 *
 * Lengths below 16 go to memcpy_lte32_sse_varlen; everything else goes to
 * memcpy_gte16_sse_varlen.
 *
 * @param dst     destination buffer
 * @param src     source buffer
 * @param nbytes  number of bytes to copy
 */
static inline void
memcpy_sse_varlen(void *dst, const void *src, size_t nbytes)
{
        if (nbytes < 16) {
                memcpy_lte32_sse_varlen(dst, src, nbytes);
                return;
        }

        memcpy_gte16_sse_varlen(dst, src, nbytes);
}
380
/**
 * Zero nbytes bytes at dst (the *_varlen entry point).
 *
 * Lengths below 16 go to memclr_lte32_sse_varlen; everything else goes to
 * memclr_gte16_sse_varlen.
 *
 * @param dst     buffer to clear
 * @param nbytes  number of bytes to clear
 */
static inline void
memclr_sse_varlen(void *dst, size_t nbytes)
{
        if (nbytes < 16) {
                memclr_lte32_sse_varlen(dst, nbytes);
                return;
        }

        memclr_gte16_sse_varlen(dst, nbytes);
}
389 #else
/* Non-x86 builds: fall back to the C library for copying ... */
#define memcpy_fixedlen memcpy
#define memcpy_varlen   memcpy

/* ... and to memset() with a zero fill byte for clearing. */
#define memclr_fixedlen(buf, len) memset((buf), 0, (len))
#define memclr_varlen(buf, len)   memset((buf), 0, (len))
395
396 #endif
397
398 #ifdef __cplusplus
399 }
400 #endif
401
#endif // _MEMCPY_H_
403