// xref: /isa-l_crypto/include/memcpy_inline.h (revision 1de5344d2e55af6b245565bde0dec2868130cf14)
/**********************************************************************
  Copyright(c) 2011-2016 Intel Corporation All rights reserved.

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in
      the documentation and/or other materials provided with the
      distribution.
    * Neither the name of Intel Corporation nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
29 
/**
 *  @file  memcpy_inline.h
 *  @brief Defines inline memcpy and memclr (zero-fill) helpers used by the new hashing API
 *
 */
35 
36 #ifndef _MEMCPY_H_
37 #define _MEMCPY_H_
38 
39 #if defined(__i386__) || defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86)
40 #include "intrinreg.h"
41 #endif
42 #include <string.h>
43 #include <assert.h>
44 
45 #ifdef __cplusplus
46 extern "C" {
47 #endif
48 
49 #if defined(__i386__) || defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86)
50 
// On x86 the generic copy/clear names map onto the SSE implementations.
#define memcpy_varlen   memcpy_sse_varlen
#define memcpy_fixedlen memcpy_sse_fixedlen

#define memclr_varlen   memclr_sse_varlen
#define memclr_fixedlen memclr_sse_fixedlen

// Forward declarations of the SSE helpers defined later in this header.

// Copy helpers whose names end in _fixedlen.
static inline void
memcpy_lte32_sse_fixedlen(void *dst, const void *src, size_t nbytes);
static inline void
memcpy_gte16_sse_fixedlen(void *dst, const void *src, size_t nbytes);
static inline void
memcpy_sse_fixedlen(void *dst, const void *src, size_t nbytes);

// Copy helpers whose names end in _varlen.
static inline void
memcpy_lte32_sse_varlen(void *dst, const void *src, size_t nbytes);
static inline void
memcpy_gte16_sse_varlen(void *dst, const void *src, size_t nbytes);
static inline void
memcpy_sse_varlen(void *dst, const void *src, size_t nbytes);

// Zero-fill helpers whose names end in _fixedlen.
static inline void
memclr_lte32_sse_fixedlen(void *dst, size_t nbytes);
static inline void
memclr_gte16_sse_fixedlen(void *dst, size_t nbytes);
static inline void
memclr_sse_fixedlen(void *dst, size_t nbytes);

// Zero-fill helpers whose names end in _varlen.
static inline void
memclr_lte32_sse_varlen(void *dst, size_t nbytes);
static inline void
memclr_gte16_sse_varlen(void *dst, size_t nbytes);
static inline void
memclr_sse_varlen(void *dst, size_t nbytes);
84 
/*
 * MEMCPY_BETWEEN_N_AND_2N_BYTES(N, fixedwidth, dst, src, nbytes)
 *
 * Copies nbytes bytes from src to dst, for N <= nbytes <= 2 * N, using
 * intrinreg<N> moves: one anchored at the start of the range and one anchored
 * at its end (offset nbytes - N). The two moves may overlap; both loads are
 * performed before either store.
 *
 * N          - literal 1, 2, 4, 8 or 16; it is token-pasted into the
 *              intrinreg<N> type and load_/store_intrinreg<N> names, so it
 *              must be a plain literal. The offset arithmetic assumes
 *              sizeof(intrinreg<N>) == N.
 * fixedwidth - when non-zero and nbytes == N, the second (fully overlapping)
 *              move is skipped.
 *
 * With N == 1 exactly one byte is moved, so callers must only use N == 1
 * when nbytes == 1.
 *
 * dst, src and nbytes are expanded more than once; pass side-effect-free
 * expressions.
 */
#define MEMCPY_BETWEEN_N_AND_2N_BYTES(N, fixedwidth, dst, src, nbytes) \
        do { \
                intrinreg##N head; \
                intrinreg##N tail; \
                assert(N <= (nbytes) && (nbytes) <= 2 * N); \
                if (N == 1 || ((fixedwidth) && (nbytes) == N)) { \
                        head = load_intrinreg##N((src)); \
                        store_intrinreg##N((dst), head); \
                } else { \
                        head = load_intrinreg##N((src)); \
                        tail = load_intrinreg##N( \
                                (const void *) ((const char *) (src) + ((nbytes) - N))); \
                        store_intrinreg##N((dst), head); \
                        store_intrinreg##N((void *) ((char *) (dst) + ((nbytes) - N)), tail); \
                } \
        } while (0)
101 
/*
 * MEMCLR_BETWEEN_N_AND_2N_BYTES(N, fixedwidth, dst, nbytes)
 *
 * Zero-fills nbytes bytes at dst, for N <= nbytes <= 2 * N, using
 * intrinreg<N> stores of a zero value: one anchored at the start of the range
 * and one anchored at its end (offset nbytes - N). The two stores may overlap.
 *
 * N          - literal 1, 2, 4, 8 or 16; it is token-pasted into the
 *              intrinreg<N> type and store_intrinreg<N> name, so it must be a
 *              plain literal. The offset arithmetic assumes
 *              sizeof(intrinreg<N>) == N.
 * fixedwidth - when non-zero and nbytes == N, the second (fully overlapping)
 *              store is skipped.
 *
 * With N == 1 exactly one byte is cleared, so callers must only use N == 1
 * when nbytes == 1.
 *
 * dst and nbytes are expanded more than once; pass side-effect-free
 * expressions.
 */
#define MEMCLR_BETWEEN_N_AND_2N_BYTES(N, fixedwidth, dst, nbytes) \
        do { \
                const intrinreg##N zero = { 0 }; \
                assert(N <= (nbytes) && (nbytes) <= 2 * N); \
                if (N == 1 || ((fixedwidth) && (nbytes) == N)) { \
                        store_intrinreg##N((dst), zero); \
                } else { \
                        store_intrinreg##N((dst), zero); \
                        store_intrinreg##N((void *) ((char *) (dst) + ((nbytes) - N)), zero); \
                } \
        } while (0)
113 
// Define load/store functions uniformly.

// 16-byte moves go through the SSE _mm_loadu_ps / _mm_storeu_ps intrinsics.
// Arguments are parenthesized so that expression arguments bind to the cast
// as a whole.
#define load_intrinreg16(src)       _mm_loadu_ps((const float *) (src))
#define store_intrinreg16(dst, val) _mm_storeu_ps((float *) (dst), (val))
118 
119 static inline intrinreg8
load_intrinreg8(const void * src)120 load_intrinreg8(const void *src)
121 {
122         return *(intrinreg8 *) src;
123 }
124 
125 static inline void
store_intrinreg8(void * dst,intrinreg8 val)126 store_intrinreg8(void *dst, intrinreg8 val)
127 {
128         *(intrinreg8 *) dst = val;
129 }
130 
131 static inline intrinreg4
load_intrinreg4(const void * src)132 load_intrinreg4(const void *src)
133 {
134         return *(intrinreg4 *) src;
135 }
136 
137 static inline void
store_intrinreg4(void * dst,intrinreg4 val)138 store_intrinreg4(void *dst, intrinreg4 val)
139 {
140         *(intrinreg4 *) dst = val;
141 }
142 
143 static inline intrinreg2
load_intrinreg2(const void * src)144 load_intrinreg2(const void *src)
145 {
146         return *(intrinreg2 *) src;
147 }
148 
149 static inline void
store_intrinreg2(void * dst,intrinreg2 val)150 store_intrinreg2(void *dst, intrinreg2 val)
151 {
152         *(intrinreg2 *) dst = val;
153 }
154 
155 static inline intrinreg1
load_intrinreg1(const void * src)156 load_intrinreg1(const void *src)
157 {
158         return *(intrinreg1 *) src;
159 }
160 
161 static inline void
store_intrinreg1(void * dst,intrinreg1 val)162 store_intrinreg1(void *dst, intrinreg1 val)
163 {
164         *(intrinreg1 *) dst = val;
165 }
166 
167 static inline void
memcpy_gte16_sse_fixedlen(void * dst,const void * src,size_t nbytes)168 memcpy_gte16_sse_fixedlen(void *dst, const void *src, size_t nbytes)
169 {
170         size_t i;
171         size_t j;
172         intrinreg16 pool[4];
173         size_t remaining_moves;
174         size_t tail_offset;
175         int do_tail;
176         assert(nbytes >= 16);
177 
178         for (i = 0; i + 16 * 4 <= nbytes; i += 16 * 4) {
179                 for (j = 0; j < 4; j++)
180                         pool[j] =
181                                 load_intrinreg16((const void *) ((const char *) src + i + 16 * j));
182                 for (j = 0; j < 4; j++)
183                         store_intrinreg16((void *) ((char *) dst + i + 16 * j), pool[j]);
184         }
185 
186         remaining_moves = (nbytes - i) / 16;
187         tail_offset = nbytes - 16;
188         do_tail = (tail_offset & (16 - 1));
189 
190         for (j = 0; j < remaining_moves; j++)
191                 pool[j] = load_intrinreg16((const void *) ((const char *) src + i + 16 * j));
192 
193         if (do_tail)
194                 pool[j] = load_intrinreg16((const void *) ((const char *) src + tail_offset));
195 
196         for (j = 0; j < remaining_moves; j++)
197                 store_intrinreg16((void *) ((char *) dst + i + 16 * j), pool[j]);
198 
199         if (do_tail)
200                 store_intrinreg16((void *) ((char *) dst + tail_offset), pool[j]);
201 }
202 
203 static inline void
memclr_gte16_sse_fixedlen(void * dst,size_t nbytes)204 memclr_gte16_sse_fixedlen(void *dst, size_t nbytes)
205 {
206         size_t i;
207         size_t j;
208         const intrinreg16 zero = { 0 };
209         size_t remaining_moves;
210         size_t tail_offset;
211         int do_tail;
212         assert(nbytes >= 16);
213 
214         for (i = 0; i + 16 * 4 <= nbytes; i += 16 * 4)
215                 for (j = 0; j < 4; j++)
216                         store_intrinreg16((void *) ((char *) dst + i + 16 * j), zero);
217 
218         remaining_moves = (nbytes - i) / 16;
219         tail_offset = nbytes - 16;
220         do_tail = (tail_offset & (16 - 1));
221 
222         for (j = 0; j < remaining_moves; j++)
223                 store_intrinreg16((void *) ((char *) dst + i + 16 * j), zero);
224 
225         if (do_tail)
226                 store_intrinreg16((void *) ((char *) dst + tail_offset), zero);
227 }
228 
/**
 * @brief Copy at most 32 bytes from src to dst.
 *
 * Picks the largest move width N in {16, 8, 4, 2, 1} with N <= nbytes and
 * hands the copy to MEMCPY_BETWEEN_N_AND_2N_BYTES with fixedwidth = 1, so an
 * exact-width copy (nbytes == N) uses a single move. nbytes == 0 copies
 * nothing.
 *
 * @param dst    destination buffer, at least nbytes bytes
 * @param src    source buffer, at least nbytes bytes
 * @param nbytes number of bytes to copy; asserted to be <= 32
 */
static inline void
memcpy_lte32_sse_fixedlen(void *dst, const void *src, size_t nbytes)
{
        assert(nbytes <= 32);
        if (nbytes >= 16) {
                MEMCPY_BETWEEN_N_AND_2N_BYTES(16, 1, dst, src, nbytes);
        } else if (nbytes >= 8) {
                MEMCPY_BETWEEN_N_AND_2N_BYTES(8, 1, dst, src, nbytes);
        } else if (nbytes >= 4) {
                MEMCPY_BETWEEN_N_AND_2N_BYTES(4, 1, dst, src, nbytes);
        } else if (nbytes >= 2) {
                MEMCPY_BETWEEN_N_AND_2N_BYTES(2, 1, dst, src, nbytes);
        } else if (nbytes >= 1) {
                MEMCPY_BETWEEN_N_AND_2N_BYTES(1, 1, dst, src, nbytes);
        }
}
244 
/**
 * @brief Zero-fill at most 32 bytes at dst.
 *
 * Picks the largest store width N in {16, 8, 4, 2, 1} with N <= nbytes and
 * hands the clear to MEMCLR_BETWEEN_N_AND_2N_BYTES with fixedwidth = 1, so an
 * exact-width clear (nbytes == N) uses a single store. nbytes == 0 clears
 * nothing.
 *
 * @param dst    buffer to clear, at least nbytes bytes
 * @param nbytes number of bytes to clear; asserted to be <= 32
 */
static inline void
memclr_lte32_sse_fixedlen(void *dst, size_t nbytes)
{
        assert(nbytes <= 32);
        if (nbytes >= 16) {
                MEMCLR_BETWEEN_N_AND_2N_BYTES(16, 1, dst, nbytes);
        } else if (nbytes >= 8) {
                MEMCLR_BETWEEN_N_AND_2N_BYTES(8, 1, dst, nbytes);
        } else if (nbytes >= 4) {
                MEMCLR_BETWEEN_N_AND_2N_BYTES(4, 1, dst, nbytes);
        } else if (nbytes >= 2) {
                MEMCLR_BETWEEN_N_AND_2N_BYTES(2, 1, dst, nbytes);
        } else if (nbytes >= 1) {
                MEMCLR_BETWEEN_N_AND_2N_BYTES(1, 1, dst, nbytes);
        }
}
260 
/**
 * @brief Copy at most 32 bytes from src to dst.
 *
 * Picks the largest move width N in {16, 8, 4, 2, 1} with N <= nbytes and
 * hands the copy to MEMCPY_BETWEEN_N_AND_2N_BYTES with fixedwidth = 0, so the
 * head and tail moves are both issued without testing for nbytes == N (only
 * N == 1 uses a single move). nbytes == 0 copies nothing.
 *
 * @param dst    destination buffer, at least nbytes bytes
 * @param src    source buffer, at least nbytes bytes
 * @param nbytes number of bytes to copy; asserted to be <= 32
 */
static inline void
memcpy_lte32_sse_varlen(void *dst, const void *src, size_t nbytes)
{
        assert(nbytes <= 32);
        if (nbytes >= 16) {
                MEMCPY_BETWEEN_N_AND_2N_BYTES(16, 0, dst, src, nbytes);
        } else if (nbytes >= 8) {
                MEMCPY_BETWEEN_N_AND_2N_BYTES(8, 0, dst, src, nbytes);
        } else if (nbytes >= 4) {
                MEMCPY_BETWEEN_N_AND_2N_BYTES(4, 0, dst, src, nbytes);
        } else if (nbytes >= 2) {
                MEMCPY_BETWEEN_N_AND_2N_BYTES(2, 0, dst, src, nbytes);
        } else if (nbytes >= 1) {
                MEMCPY_BETWEEN_N_AND_2N_BYTES(1, 0, dst, src, nbytes);
        }
}
276 
/**
 * @brief Zero-fill at most 32 bytes at dst.
 *
 * Picks the largest store width N in {16, 8, 4, 2, 1} with N <= nbytes and
 * hands the clear to MEMCLR_BETWEEN_N_AND_2N_BYTES with fixedwidth = 0, so the
 * head and tail stores are both issued without testing for nbytes == N (only
 * N == 1 uses a single store). nbytes == 0 clears nothing.
 *
 * @param dst    buffer to clear, at least nbytes bytes
 * @param nbytes number of bytes to clear; asserted to be <= 32
 */
static inline void
memclr_lte32_sse_varlen(void *dst, size_t nbytes)
{
        assert(nbytes <= 32);
        if (nbytes >= 16) {
                MEMCLR_BETWEEN_N_AND_2N_BYTES(16, 0, dst, nbytes);
        } else if (nbytes >= 8) {
                MEMCLR_BETWEEN_N_AND_2N_BYTES(8, 0, dst, nbytes);
        } else if (nbytes >= 4) {
                MEMCLR_BETWEEN_N_AND_2N_BYTES(4, 0, dst, nbytes);
        } else if (nbytes >= 2) {
                MEMCLR_BETWEEN_N_AND_2N_BYTES(2, 0, dst, nbytes);
        } else if (nbytes >= 1) {
                MEMCLR_BETWEEN_N_AND_2N_BYTES(1, 0, dst, nbytes);
        }
}
292 
293 static inline void
memcpy_gte16_sse_varlen(void * dst,const void * src,size_t nbytes)294 memcpy_gte16_sse_varlen(void *dst, const void *src, size_t nbytes)
295 {
296         size_t i = 0;
297         intrinreg16 tail;
298 
299         assert(nbytes >= 16);
300 
301         while (i + 128 <= nbytes) {
302                 memcpy_gte16_sse_fixedlen((void *) ((char *) dst + i),
303                                           (const void *) ((const char *) src + i), 128);
304                 i += 128;
305         }
306         if (i + 64 <= nbytes) {
307                 memcpy_gte16_sse_fixedlen((void *) ((char *) dst + i),
308                                           (const void *) ((const char *) src + i), 64);
309                 i += 64;
310         }
311         if (i + 32 <= nbytes) {
312                 memcpy_gte16_sse_fixedlen((void *) ((char *) dst + i),
313                                           (const void *) ((const char *) src + i), 32);
314                 i += 32;
315         }
316         if (i + 16 <= nbytes) {
317                 memcpy_gte16_sse_fixedlen((void *) ((char *) dst + i),
318                                           (const void *) ((const char *) src + i), 16);
319         }
320 
321         i = nbytes - 16;
322         tail = load_intrinreg16((const void *) ((const char *) src + i));
323         store_intrinreg16((void *) ((char *) dst + i), tail);
324 }
325 
326 static inline void
memclr_gte16_sse_varlen(void * dst,size_t nbytes)327 memclr_gte16_sse_varlen(void *dst, size_t nbytes)
328 {
329         size_t i = 0;
330         const intrinreg16 zero = { 0 };
331 
332         assert(nbytes >= 16);
333 
334         while (i + 128 <= nbytes) {
335                 memclr_gte16_sse_fixedlen((void *) ((char *) dst + i), 128);
336                 i += 128;
337         }
338         if (i + 64 <= nbytes) {
339                 memclr_gte16_sse_fixedlen((void *) ((char *) dst + i), 64);
340                 i += 64;
341         }
342         if (i + 32 <= nbytes) {
343                 memclr_gte16_sse_fixedlen((void *) ((char *) dst + i), 32);
344                 i += 32;
345         }
346         if (i + 16 <= nbytes) {
347                 memclr_gte16_sse_fixedlen((void *) ((char *) dst + i), 16);
348         }
349 
350         i = nbytes - 16;
351         store_intrinreg16((void *) ((char *) dst + i), zero);
352 }
353 
/**
 * @brief Copy nbytes bytes from src to dst, dispatching on the length.
 *
 * Lengths of 16 or more go to memcpy_gte16_sse_fixedlen; shorter lengths
 * (including 0) go to memcpy_lte32_sse_fixedlen.
 *
 * @param dst    destination buffer, at least nbytes bytes
 * @param src    source buffer, at least nbytes bytes
 * @param nbytes number of bytes to copy
 */
static inline void
memcpy_sse_fixedlen(void *dst, const void *src, size_t nbytes)
{
        if (nbytes >= 16)
                memcpy_gte16_sse_fixedlen(dst, src, nbytes);
        else
                memcpy_lte32_sse_fixedlen(dst, src, nbytes);
}
362 
/**
 * @brief Zero-fill nbytes bytes at dst, dispatching on the length.
 *
 * Lengths of 16 or more go to memclr_gte16_sse_fixedlen; shorter lengths
 * (including 0) go to memclr_lte32_sse_fixedlen.
 *
 * @param dst    buffer to clear, at least nbytes bytes
 * @param nbytes number of bytes to clear
 */
static inline void
memclr_sse_fixedlen(void *dst, size_t nbytes)
{
        if (nbytes >= 16)
                memclr_gte16_sse_fixedlen(dst, nbytes);
        else
                memclr_lte32_sse_fixedlen(dst, nbytes);
}
371 
/**
 * @brief Copy nbytes bytes from src to dst, dispatching on the length.
 *
 * Lengths of 16 or more go to memcpy_gte16_sse_varlen; shorter lengths
 * (including 0) go to memcpy_lte32_sse_varlen.
 *
 * @param dst    destination buffer, at least nbytes bytes
 * @param src    source buffer, at least nbytes bytes
 * @param nbytes number of bytes to copy
 */
static inline void
memcpy_sse_varlen(void *dst, const void *src, size_t nbytes)
{
        if (nbytes >= 16)
                memcpy_gte16_sse_varlen(dst, src, nbytes);
        else
                memcpy_lte32_sse_varlen(dst, src, nbytes);
}
380 
/**
 * @brief Zero-fill nbytes bytes at dst, dispatching on the length.
 *
 * Lengths of 16 or more go to memclr_gte16_sse_varlen; shorter lengths
 * (including 0) go to memclr_lte32_sse_varlen.
 *
 * @param dst    buffer to clear, at least nbytes bytes
 * @param nbytes number of bytes to clear
 */
static inline void
memclr_sse_varlen(void *dst, size_t nbytes)
{
        if (nbytes >= 16)
                memclr_gte16_sse_varlen(dst, nbytes);
        else
                memclr_lte32_sse_varlen(dst, nbytes);
}
389 #else
390 #define memcpy_varlen   memcpy
391 #define memcpy_fixedlen memcpy
392 
393 #define memclr_varlen(dst, n)   memset(dst, 0, n)
394 #define memclr_fixedlen(dst, n) memset(dst, 0, n)
395 
396 #endif
397 
398 #ifdef __cplusplus
399 }
400 #endif
401 
402 #endif // __MEMCPY_H
403