xref: /isa-l_crypto/include/memcpy_inline.h (revision 2a268c597b745b55509ea38d00753e5caa85e9ac)
1 /**********************************************************************
2   Copyright(c) 2011-2016 Intel Corporation All rights reserved.
3 
4   Redistribution and use in source and binary forms, with or without
5   modification, are permitted provided that the following conditions
6   are met:
7     * Redistributions of source code must retain the above copyright
8       notice, this list of conditions and the following disclaimer.
9     * Redistributions in binary form must reproduce the above copyright
10       notice, this list of conditions and the following disclaimer in
11       the documentation and/or other materials provided with the
12       distribution.
13     * Neither the name of Intel Corporation nor the names of its
14       contributors may be used to endorse or promote products derived
15       from this software without specific prior written permission.
16 
17   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 **********************************************************************/
29 
30 
31 /**
32  *  @file  memcpy_inline.h
33  *  @brief Defines intrinsic memcpy functions used by the new hashing API
34  *
35  */
36 
37 #ifndef _MEMCPY_H_
38 #define _MEMCPY_H_
39 
40 #if defined(__i386__) || defined(__x86_64__) || defined( _M_X64) \
41 	|| defined(_M_IX86)
42 #include "intrinreg.h"
43 #endif
44 #include <string.h>
45 #include <assert.h>
46 
47 #ifdef __cplusplus
48 extern "C" {
49 #endif
50 
51 #if defined(__i386__) || defined(__x86_64__) || defined( _M_X64) \
52 	|| defined(_M_IX86)
53 
/*
 * On x86 builds the generic copy/clear entry points resolve to the
 * SSE-register based implementations defined further down in this header.
 */
#define memcpy_varlen memcpy_sse_varlen
#define memcpy_fixedlen memcpy_sse_fixedlen
#define memclr_varlen memclr_sse_varlen
#define memclr_fixedlen memclr_sse_fixedlen
59 
/* Copy helpers: "fixedlen" flavour. */
static inline void memcpy_lte32_sse_fixedlen(void *dst, const void *src, size_t nbytes);
static inline void memcpy_gte16_sse_fixedlen(void *dst, const void *src, size_t nbytes);
static inline void memcpy_sse_fixedlen(void *dst, const void *src, size_t nbytes);

/* Copy helpers: "varlen" flavour. */
static inline void memcpy_lte32_sse_varlen(void *dst, const void *src, size_t nbytes);
static inline void memcpy_gte16_sse_varlen(void *dst, const void *src, size_t nbytes);
static inline void memcpy_sse_varlen(void *dst, const void *src, size_t nbytes);

/* Clear (zero-fill) helpers: "fixedlen" flavour. */
static inline void memclr_lte32_sse_fixedlen(void *dst, size_t nbytes);
static inline void memclr_gte16_sse_fixedlen(void *dst, size_t nbytes);
static inline void memclr_sse_fixedlen(void *dst, size_t nbytes);

/* Clear (zero-fill) helpers: "varlen" flavour. */
static inline void memclr_lte32_sse_varlen(void *dst, size_t nbytes);
static inline void memclr_gte16_sse_varlen(void *dst, size_t nbytes);
static inline void memclr_sse_varlen(void *dst, size_t nbytes);
76 
/*
 * Copy nbytes bytes from src to dst, where N <= nbytes <= 2*N (asserted),
 * using at most two N-byte moves: one over the first N bytes and one over
 * the last N bytes. The two moves may overlap in the middle.
 *
 * N          - move width in bytes. Must be a literal token (e.g. 16)
 *              because it is pasted into the intrinreg<N> type name and
 *              the load_intrinreg<N> / store_intrinreg<N> helper names.
 * fixedwidth - when non-zero and nbytes == N, only a single move is issued.
 * dst, src   - destination and source pointers.
 * nbytes     - number of bytes to copy.
 *
 * When N is 1 a single one-byte move is always issued.
 * Every argument is evaluated exactly once and may be an arbitrary
 * expression (e.g. `p + 1`); the head and tail loads both happen before
 * any store.
 */
#define MEMCPY_BETWEEN_N_AND_2N_BYTES(N, fixedwidth, dst, src, nbytes) \
	do { \
		char *const memcpy_n2n_dst_ = (char *)(dst); \
		const char *const memcpy_n2n_src_ = (const char *)(src); \
		const size_t memcpy_n2n_len_ = (size_t)(nbytes); \
		intrinreg##N memcpy_n2n_head_; \
		intrinreg##N memcpy_n2n_tail_; \
		assert((N) <= memcpy_n2n_len_ && memcpy_n2n_len_ <= 2 * (N)); \
		memcpy_n2n_head_ = load_intrinreg##N((const void *)memcpy_n2n_src_); \
		if ((N) == 1 || ((fixedwidth) && memcpy_n2n_len_ == (N))) { \
			store_intrinreg##N((void *)memcpy_n2n_dst_, memcpy_n2n_head_); \
		} else { \
			memcpy_n2n_tail_ = load_intrinreg##N( \
				(const void *)(memcpy_n2n_src_ + (memcpy_n2n_len_ - (N)))); \
			store_intrinreg##N((void *)memcpy_n2n_dst_, memcpy_n2n_head_); \
			store_intrinreg##N( \
				(void *)(memcpy_n2n_dst_ + (memcpy_n2n_len_ - (N))), \
				memcpy_n2n_tail_); \
		} \
	} while (0)
93 
/*
 * Zero nbytes bytes at dst, where N <= nbytes <= 2*N (asserted), using at
 * most two N-byte stores: one over the first N bytes and one over the last
 * N bytes. The two stores may overlap in the middle.
 *
 * N          - store width in bytes. Must be a literal token (e.g. 16)
 *              because it is pasted into the intrinreg<N> type name and
 *              the store_intrinreg<N> helper name.
 * fixedwidth - when non-zero and nbytes == N, only a single store is issued.
 * dst        - destination pointer.
 * nbytes     - number of bytes to clear.
 *
 * When N is 1 a single one-byte store is always issued.
 * Every argument is evaluated exactly once and may be an arbitrary
 * expression (e.g. `p + 1`).
 */
#define MEMCLR_BETWEEN_N_AND_2N_BYTES(N, fixedwidth, dst, nbytes) \
	do { \
		char *const memclr_n2n_dst_ = (char *)(dst); \
		const size_t memclr_n2n_len_ = (size_t)(nbytes); \
		const intrinreg##N memclr_n2n_zero_ = {0}; \
		assert((N) <= memclr_n2n_len_ && memclr_n2n_len_ <= 2 * (N)); \
		store_intrinreg##N((void *)memclr_n2n_dst_, memclr_n2n_zero_); \
		if (!((N) == 1 || ((fixedwidth) && memclr_n2n_len_ == (N)))) { \
			store_intrinreg##N( \
				(void *)(memclr_n2n_dst_ + (memclr_n2n_len_ - (N))), \
				memclr_n2n_zero_); \
		} \
	} while (0)
106 
107 // Define load/store functions uniformly.
108 
/*
 * 16-byte load/store through _mm_loadu_ps / _mm_storeu_ps.
 * The arguments are parenthesized so that an expression argument such as
 * `p + off` is cast as a whole rather than having the cast bind to `p` alone.
 */
#define load_intrinreg16(src) _mm_loadu_ps((const float *)(src))
#define store_intrinreg16(dst, val) _mm_storeu_ps((float *)(dst), (val))
111 
112 static inline intrinreg8 load_intrinreg8(const void *src)
113 {
114 	return *(intrinreg8 *) src;
115 }
116 
117 static inline void store_intrinreg8(void *dst, intrinreg8 val)
118 {
119 	*(intrinreg8 *) dst = val;
120 }
121 
122 static inline intrinreg4 load_intrinreg4(const void *src)
123 {
124 	return *(intrinreg4 *) src;
125 }
126 
127 static inline void store_intrinreg4(void *dst, intrinreg4 val)
128 {
129 	*(intrinreg4 *) dst = val;
130 }
131 
132 static inline intrinreg2 load_intrinreg2(const void *src)
133 {
134 	return *(intrinreg2 *) src;
135 }
136 
137 static inline void store_intrinreg2(void *dst, intrinreg2 val)
138 {
139 	*(intrinreg2 *) dst = val;
140 }
141 
142 static inline intrinreg1 load_intrinreg1(const void *src)
143 {
144 	return *(intrinreg1 *) src;
145 }
146 
147 static inline void store_intrinreg1(void *dst, intrinreg1 val)
148 {
149 	*(intrinreg1 *) dst = val;
150 }
151 
152 static inline void memcpy_gte16_sse_fixedlen(void *dst, const void *src, size_t nbytes)
153 {
154 	size_t i;
155 	size_t j;
156 	intrinreg16 pool[4];
157 	size_t remaining_moves;
158 	size_t tail_offset;
159 	int do_tail;
160 	assert(nbytes >= 16);
161 
162 	for (i = 0; i + 16 * 4 <= nbytes; i += 16 * 4) {
163 		for (j = 0; j < 4; j++)
164 			pool[j] =
165 			    load_intrinreg16((const void *)((const char *)src + i + 16 * j));
166 		for (j = 0; j < 4; j++)
167 			store_intrinreg16((void *)((char *)dst + i + 16 * j), pool[j]);
168 	}
169 
170 	remaining_moves = (nbytes - i) / 16;
171 	tail_offset = nbytes - 16;
172 	do_tail = (tail_offset & (16 - 1));
173 
174 	for (j = 0; j < remaining_moves; j++)
175 		pool[j] = load_intrinreg16((const void *)((const char *)src + i + 16 * j));
176 
177 	if (do_tail)
178 		pool[j] = load_intrinreg16((const void *)((const char *)src + tail_offset));
179 
180 	for (j = 0; j < remaining_moves; j++)
181 		store_intrinreg16((void *)((char *)dst + i + 16 * j), pool[j]);
182 
183 	if (do_tail)
184 		store_intrinreg16((void *)((char *)dst + tail_offset), pool[j]);
185 }
186 
187 static inline void memclr_gte16_sse_fixedlen(void *dst, size_t nbytes)
188 {
189 	size_t i;
190 	size_t j;
191 	const intrinreg16 zero = { 0 };
192 	size_t remaining_moves;
193 	size_t tail_offset;
194 	int do_tail;
195 	assert(nbytes >= 16);
196 
197 	for (i = 0; i + 16 * 4 <= nbytes; i += 16 * 4)
198 		for (j = 0; j < 4; j++)
199 			store_intrinreg16((void *)((char *)dst + i + 16 * j), zero);
200 
201 	remaining_moves = (nbytes - i) / 16;
202 	tail_offset = nbytes - 16;
203 	do_tail = (tail_offset & (16 - 1));
204 
205 	for (j = 0; j < remaining_moves; j++)
206 		store_intrinreg16((void *)((char *)dst + i + 16 * j), zero);
207 
208 	if (do_tail)
209 		store_intrinreg16((void *)((char *)dst + tail_offset), zero);
210 }
211 
/*
 * Copy nbytes (asserted <= 32) bytes from src to dst.
 *
 * The length selects the move width N (1, 2, 4, 8 or 16) such that
 * N <= nbytes, and the copy is delegated to MEMCPY_BETWEEN_N_AND_2N_BYTES
 * with the fixedwidth flag set. A length of zero copies nothing.
 */
static inline void memcpy_lte32_sse_fixedlen(void *dst, const void *src, size_t nbytes)
{
	assert(nbytes <= 32);

	if (nbytes < 1) {
		return;
	} else if (nbytes < 2) {
		MEMCPY_BETWEEN_N_AND_2N_BYTES(1, 1, dst, src, nbytes);
	} else if (nbytes < 4) {
		MEMCPY_BETWEEN_N_AND_2N_BYTES(2, 1, dst, src, nbytes);
	} else if (nbytes < 8) {
		MEMCPY_BETWEEN_N_AND_2N_BYTES(4, 1, dst, src, nbytes);
	} else if (nbytes < 16) {
		MEMCPY_BETWEEN_N_AND_2N_BYTES(8, 1, dst, src, nbytes);
	} else {
		MEMCPY_BETWEEN_N_AND_2N_BYTES(16, 1, dst, src, nbytes);
	}
}
226 
/*
 * Zero nbytes (asserted <= 32) bytes at dst.
 *
 * The length selects the store width N (1, 2, 4, 8 or 16) such that
 * N <= nbytes, and the clear is delegated to MEMCLR_BETWEEN_N_AND_2N_BYTES
 * with the fixedwidth flag set. A length of zero clears nothing.
 */
static inline void memclr_lte32_sse_fixedlen(void *dst, size_t nbytes)
{
	assert(nbytes <= 32);

	if (nbytes < 1) {
		return;
	} else if (nbytes < 2) {
		MEMCLR_BETWEEN_N_AND_2N_BYTES(1, 1, dst, nbytes);
	} else if (nbytes < 4) {
		MEMCLR_BETWEEN_N_AND_2N_BYTES(2, 1, dst, nbytes);
	} else if (nbytes < 8) {
		MEMCLR_BETWEEN_N_AND_2N_BYTES(4, 1, dst, nbytes);
	} else if (nbytes < 16) {
		MEMCLR_BETWEEN_N_AND_2N_BYTES(8, 1, dst, nbytes);
	} else {
		MEMCLR_BETWEEN_N_AND_2N_BYTES(16, 1, dst, nbytes);
	}
}
241 
/*
 * Copy nbytes (asserted <= 32) bytes from src to dst.
 *
 * The length selects the move width N (1, 2, 4, 8 or 16) such that
 * N <= nbytes, and the copy is delegated to MEMCPY_BETWEEN_N_AND_2N_BYTES
 * with the fixedwidth flag cleared. A length of zero copies nothing.
 */
static inline void memcpy_lte32_sse_varlen(void *dst, const void *src, size_t nbytes)
{
	assert(nbytes <= 32);

	if (nbytes < 1) {
		return;
	} else if (nbytes < 2) {
		MEMCPY_BETWEEN_N_AND_2N_BYTES(1, 0, dst, src, nbytes);
	} else if (nbytes < 4) {
		MEMCPY_BETWEEN_N_AND_2N_BYTES(2, 0, dst, src, nbytes);
	} else if (nbytes < 8) {
		MEMCPY_BETWEEN_N_AND_2N_BYTES(4, 0, dst, src, nbytes);
	} else if (nbytes < 16) {
		MEMCPY_BETWEEN_N_AND_2N_BYTES(8, 0, dst, src, nbytes);
	} else {
		MEMCPY_BETWEEN_N_AND_2N_BYTES(16, 0, dst, src, nbytes);
	}
}
256 
/*
 * Zero nbytes (asserted <= 32) bytes at dst.
 *
 * The length selects the store width N (1, 2, 4, 8 or 16) such that
 * N <= nbytes, and the clear is delegated to MEMCLR_BETWEEN_N_AND_2N_BYTES
 * with the fixedwidth flag cleared. A length of zero clears nothing.
 */
static inline void memclr_lte32_sse_varlen(void *dst, size_t nbytes)
{
	assert(nbytes <= 32);

	if (nbytes < 1) {
		return;
	} else if (nbytes < 2) {
		MEMCLR_BETWEEN_N_AND_2N_BYTES(1, 0, dst, nbytes);
	} else if (nbytes < 4) {
		MEMCLR_BETWEEN_N_AND_2N_BYTES(2, 0, dst, nbytes);
	} else if (nbytes < 8) {
		MEMCLR_BETWEEN_N_AND_2N_BYTES(4, 0, dst, nbytes);
	} else if (nbytes < 16) {
		MEMCLR_BETWEEN_N_AND_2N_BYTES(8, 0, dst, nbytes);
	} else {
		MEMCLR_BETWEEN_N_AND_2N_BYTES(16, 0, dst, nbytes);
	}
}
271 
272 static inline void memcpy_gte16_sse_varlen(void *dst, const void *src, size_t nbytes)
273 {
274 	size_t i = 0;
275 	intrinreg16 tail;
276 
277 	assert(nbytes >= 16);
278 
279 	while (i + 128 <= nbytes) {
280 		memcpy_gte16_sse_fixedlen((void *)((char *)dst + i),
281 					  (const void *)((const char *)src + i), 128);
282 		i += 128;
283 	}
284 	if (i + 64 <= nbytes) {
285 		memcpy_gte16_sse_fixedlen((void *)((char *)dst + i),
286 					  (const void *)((const char *)src + i), 64);
287 		i += 64;
288 	}
289 	if (i + 32 <= nbytes) {
290 		memcpy_gte16_sse_fixedlen((void *)((char *)dst + i),
291 					  (const void *)((const char *)src + i), 32);
292 		i += 32;
293 	}
294 	if (i + 16 <= nbytes) {
295 		memcpy_gte16_sse_fixedlen((void *)((char *)dst + i),
296 					  (const void *)((const char *)src + i), 16);
297 	}
298 
299 	i = nbytes - 16;
300 	tail = load_intrinreg16((const void *)((const char *)src + i));
301 	store_intrinreg16((void *)((char *)dst + i), tail);
302 }
303 
304 static inline void memclr_gte16_sse_varlen(void *dst, size_t nbytes)
305 {
306 	size_t i = 0;
307 	const intrinreg16 zero = { 0 };
308 
309 	assert(nbytes >= 16);
310 
311 	while (i + 128 <= nbytes) {
312 		memclr_gte16_sse_fixedlen((void *)((char *)dst + i), 128);
313 		i += 128;
314 	}
315 	if (i + 64 <= nbytes) {
316 		memclr_gte16_sse_fixedlen((void *)((char *)dst + i), 64);
317 		i += 64;
318 	}
319 	if (i + 32 <= nbytes) {
320 		memclr_gte16_sse_fixedlen((void *)((char *)dst + i), 32);
321 		i += 32;
322 	}
323 	if (i + 16 <= nbytes) {
324 		memclr_gte16_sse_fixedlen((void *)((char *)dst + i), 16);
325 	}
326 
327 	i = nbytes - 16;
328 	store_intrinreg16((void *)((char *)dst + i), zero);
329 }
330 
/*
 * Copy nbytes bytes from src to dst ("fixedlen" flavour).
 * Lengths below 16 go to the small-copy helper, everything else to the
 * 16-byte-register helper.
 */
static inline void memcpy_sse_fixedlen(void *dst, const void *src, size_t nbytes)
{
	if (nbytes < 16) {
		memcpy_lte32_sse_fixedlen(dst, src, nbytes);
		return;
	}
	memcpy_gte16_sse_fixedlen(dst, src, nbytes);
}
338 
/*
 * Zero nbytes bytes at dst ("fixedlen" flavour).
 * Lengths below 16 go to the small-clear helper, everything else to the
 * 16-byte-register helper.
 */
static inline void memclr_sse_fixedlen(void *dst, size_t nbytes)
{
	if (nbytes < 16) {
		memclr_lte32_sse_fixedlen(dst, nbytes);
		return;
	}
	memclr_gte16_sse_fixedlen(dst, nbytes);
}
346 
/*
 * Copy nbytes bytes from src to dst ("varlen" flavour).
 * Lengths below 16 go to the small-copy helper, everything else to the
 * 16-byte-register helper.
 */
static inline void memcpy_sse_varlen(void *dst, const void *src, size_t nbytes)
{
	if (nbytes < 16) {
		memcpy_lte32_sse_varlen(dst, src, nbytes);
		return;
	}
	memcpy_gte16_sse_varlen(dst, src, nbytes);
}
354 
/*
 * Zero nbytes bytes at dst ("varlen" flavour).
 * Lengths below 16 go to the small-clear helper, everything else to the
 * 16-byte-register helper.
 */
static inline void memclr_sse_varlen(void *dst, size_t nbytes)
{
	if (nbytes < 16) {
		memclr_lte32_sse_varlen(dst, nbytes);
		return;
	}
	memclr_gte16_sse_varlen(dst, nbytes);
}
362 #else
/*
 * Non-x86 fallback: the copy helpers are plain memcpy and the clear
 * helpers are memset with a zero fill byte.
 */
#define memcpy_varlen memcpy
#define memcpy_fixedlen memcpy

#define memclr_varlen(dst, n) memset((dst), 0, (n))
#define memclr_fixedlen(dst, n) memset((dst), 0, (n))
368 
369 #endif
370 
371 #ifdef __cplusplus
372 }
373 #endif
374 
#endif // _MEMCPY_H_
376