/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation
 */

#ifndef _RTE_MEMCPY_X86_64_H_
#define _RTE_MEMCPY_X86_64_H_

/**
 * @file
 *
 * Functions for SSE/AVX/AVX2/AVX512 implementation of memcpy().
 */

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <rte_vect.h>
#include <rte_common.h>
#include <rte_config.h>

#ifdef __cplusplus
extern "C" {
#endif

#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstringop-overflow"
#endif

/**
 * Copy bytes from one location to another. The locations must not overlap.
 *
 * @note This is implemented as a macro, so its address should not be taken,
 * and care is needed as parameter expressions may be evaluated multiple times.
 *
 * @param dst
 *   Pointer to the destination of the data.
 * @param src
 *   Pointer to the source data.
 * @param n
 *   Number of bytes to copy.
 * @return
 *   Pointer to the destination data.
 */
static __rte_always_inline void *
rte_memcpy(void *dst, const void *src, size_t n);

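/*
 * Usage sketch (illustrative only; the buffer names below are hypothetical
 * and not part of this API). rte_memcpy() is a drop-in replacement for
 * memcpy() when the regions do not overlap:
 *
 *     uint8_t src_buf[64] = { 0 };
 *     uint8_t dst_buf[64];
 *
 *     rte_memcpy(dst_buf, src_buf, sizeof(src_buf));
 */
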
/**
 * Copy bytes from one location to another,
 * locations should not overlap.
 * Use with n <= 15.
 */
static __rte_always_inline void *
rte_mov15_or_less(void *dst, const void *src, size_t n)
{
	/**
	 * Use the following structs to avoid violating C standard
	 * alignment requirements and to avoid strict aliasing bugs
	 */
	struct rte_uint64_alias {
		uint64_t val;
	} __rte_packed __rte_may_alias;
	struct rte_uint32_alias {
		uint32_t val;
	} __rte_packed __rte_may_alias;
	struct rte_uint16_alias {
		uint16_t val;
	} __rte_packed __rte_may_alias;

	void *ret = dst;
	if (n & 8) {
		((struct rte_uint64_alias *)dst)->val =
			((const struct rte_uint64_alias *)src)->val;
		src = (const uint64_t *)src + 1;
		dst = (uint64_t *)dst + 1;
	}
	if (n & 4) {
		((struct rte_uint32_alias *)dst)->val =
			((const struct rte_uint32_alias *)src)->val;
		src = (const uint32_t *)src + 1;
		dst = (uint32_t *)dst + 1;
	}
	if (n & 2) {
		((struct rte_uint16_alias *)dst)->val =
			((const struct rte_uint16_alias *)src)->val;
		src = (const uint16_t *)src + 1;
		dst = (uint16_t *)dst + 1;
	}
	if (n & 1)
		*(uint8_t *)dst = *(const uint8_t *)src;
	return ret;
}

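/*
 * Worked example for rte_mov15_or_less() (illustrative): for n = 11
 * (binary 1011) the tests above select one 8-byte, one 2-byte and one
 * 1-byte move, each advancing src and dst, so 8 + 2 + 1 = 11 bytes are
 * copied without a loop; n = 15 takes all four branches (8 + 4 + 2 + 1).
 */
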
#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512

#define ALIGNMENT_MASK 0x3F

/**
 * AVX512 implementation below
 */

/**
 * Copy 16 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
	__m128i xmm0;

	xmm0 = _mm_loadu_si128((const __m128i *)src);
	_mm_storeu_si128((__m128i *)dst, xmm0);
}

/**
 * Copy 32 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
	__m256i ymm0;

	ymm0 = _mm256_loadu_si256((const __m256i *)src);
	_mm256_storeu_si256((__m256i *)dst, ymm0);
}

/**
 * Copy 64 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
	__m512i zmm0;

	zmm0 = _mm512_loadu_si512((const void *)src);
	_mm512_storeu_si512((void *)dst, zmm0);
}

/**
 * Copy 128 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
	rte_mov64(dst + 0 * 64, src + 0 * 64);
	rte_mov64(dst + 1 * 64, src + 1 * 64);
}

/**
 * Copy 256 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov256(uint8_t *dst, const uint8_t *src)
{
	rte_mov64(dst + 0 * 64, src + 0 * 64);
	rte_mov64(dst + 1 * 64, src + 1 * 64);
	rte_mov64(dst + 2 * 64, src + 2 * 64);
	rte_mov64(dst + 3 * 64, src + 3 * 64);
}

/**
 * Copy 128-byte blocks from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
	__m512i zmm0, zmm1;

	while (n >= 128) {
		zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
		n -= 128;
		zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
		src = src + 128;
		_mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
		_mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
		dst = dst + 128;
	}
}

/**
 * Copy 512-byte blocks from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
	__m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;

	while (n >= 512) {
		zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
		n -= 512;
		zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
		zmm2 = _mm512_loadu_si512((const void *)(src + 2 * 64));
		zmm3 = _mm512_loadu_si512((const void *)(src + 3 * 64));
		zmm4 = _mm512_loadu_si512((const void *)(src + 4 * 64));
		zmm5 = _mm512_loadu_si512((const void *)(src + 5 * 64));
		zmm6 = _mm512_loadu_si512((const void *)(src + 6 * 64));
		zmm7 = _mm512_loadu_si512((const void *)(src + 7 * 64));
		src = src + 512;
		_mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
		_mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
		_mm512_storeu_si512((void *)(dst + 2 * 64), zmm2);
		_mm512_storeu_si512((void *)(dst + 3 * 64), zmm3);
		_mm512_storeu_si512((void *)(dst + 4 * 64), zmm4);
		_mm512_storeu_si512((void *)(dst + 5 * 64), zmm5);
		_mm512_storeu_si512((void *)(dst + 6 * 64), zmm6);
		_mm512_storeu_si512((void *)(dst + 7 * 64), zmm7);
		dst = dst + 512;
	}
}

static __rte_always_inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
	void *ret = dst;
	size_t dstofss;
	size_t bits;

	/**
	 * Copy less than 16 bytes
	 */
	if (n < 16) {
		return rte_mov15_or_less(dst, src, n);
	}

	/**
	 * Fast way when copy size doesn't exceed 512 bytes
	 */
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst - 16 + n,
				  (const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov32((uint8_t *)dst - 32 + n,
				  (const uint8_t *)src - 32 + n);
		return ret;
	}
	if (n <= 512) {
		if (n >= 256) {
			n -= 256;
			rte_mov256((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 256;
			dst = (uint8_t *)dst + 256;
		}
		if (n >= 128) {
			n -= 128;
			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 128;
			dst = (uint8_t *)dst + 128;
		}
COPY_BLOCK_128_BACK63:
		if (n > 64) {
			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
			rte_mov64((uint8_t *)dst - 64 + n,
					  (const uint8_t *)src - 64 + n);
			return ret;
		}
		if (n > 0)
			rte_mov64((uint8_t *)dst - 64 + n,
					  (const uint8_t *)src - 64 + n);
		return ret;
	}

	/**
	 * Make store aligned when copy size exceeds 512 bytes
	 */
	dstofss = ((uintptr_t)dst & 0x3F);
	if (dstofss > 0) {
		dstofss = 64 - dstofss;
		n -= dstofss;
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		src = (const uint8_t *)src + dstofss;
		dst = (uint8_t *)dst + dstofss;
	}

	/**
	 * Copy 512-byte blocks.
	 * Use copy block function for better instruction order control,
	 * which is important when load is unaligned.
	 */
	rte_mov512blocks((uint8_t *)dst, (const uint8_t *)src, n);
	bits = n;
	n = n & 511;
	bits -= n;
	src = (const uint8_t *)src + bits;
	dst = (uint8_t *)dst + bits;

	/**
	 * Copy 128-byte blocks.
	 * Use copy block function for better instruction order control,
	 * which is important when load is unaligned.
	 */
	if (n >= 128) {
		rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
		bits = n;
		n = n & 127;
		bits -= n;
		src = (const uint8_t *)src + bits;
		dst = (uint8_t *)dst + bits;
	}

	/**
	 * Copy whatever is left
	 */
	goto COPY_BLOCK_128_BACK63;
}

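/*
 * Worked example for the AVX512 > 512 byte path above (illustrative
 * numbers): with dst ending in 0x07 and n = 1500, dstofss = 64 - 7 = 57,
 * so one 64-byte move aligns dst and leaves n = 1443. rte_mov512blocks()
 * then copies two 512-byte blocks (n becomes 419), rte_mov128blocks()
 * copies three 128-byte blocks (n becomes 35), and COPY_BLOCK_128_BACK63
 * finishes with a single 64-byte move ending exactly at dst + n, which may
 * rewrite bytes already copied.
 */
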
#elif defined __AVX2__

#define ALIGNMENT_MASK 0x1F

/**
 * AVX2 implementation below
 */

/**
 * Copy 16 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
	__m128i xmm0;

	xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src);
	_mm_storeu_si128((__m128i *)(void *)dst, xmm0);
}

/**
 * Copy 32 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
	__m256i ymm0;

	ymm0 = _mm256_loadu_si256((const __m256i *)(const void *)src);
	_mm256_storeu_si256((__m256i *)(void *)dst, ymm0);
}

/**
 * Copy 64 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
}

/**
 * Copy 128 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
	rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
	rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
}

/**
 * Copy 256 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov256(uint8_t *dst, const uint8_t *src)
{
	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
	rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
	rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
	rte_mov32((uint8_t *)dst + 4 * 32, (const uint8_t *)src + 4 * 32);
	rte_mov32((uint8_t *)dst + 5 * 32, (const uint8_t *)src + 5 * 32);
	rte_mov32((uint8_t *)dst + 6 * 32, (const uint8_t *)src + 6 * 32);
	rte_mov32((uint8_t *)dst + 7 * 32, (const uint8_t *)src + 7 * 32);
}

/**
 * Copy 128-byte blocks from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
	__m256i ymm0, ymm1, ymm2, ymm3;

	while (n >= 128) {
		ymm0 = _mm256_loadu_si256((const __m256i *)(const void *)
					  ((const uint8_t *)src + 0 * 32));
		n -= 128;
		ymm1 = _mm256_loadu_si256((const __m256i *)(const void *)
					  ((const uint8_t *)src + 1 * 32));
		ymm2 = _mm256_loadu_si256((const __m256i *)(const void *)
					  ((const uint8_t *)src + 2 * 32));
		ymm3 = _mm256_loadu_si256((const __m256i *)(const void *)
					  ((const uint8_t *)src + 3 * 32));
		src = (const uint8_t *)src + 128;
		_mm256_storeu_si256((__m256i *)(void *)
				    ((uint8_t *)dst + 0 * 32), ymm0);
		_mm256_storeu_si256((__m256i *)(void *)
				    ((uint8_t *)dst + 1 * 32), ymm1);
		_mm256_storeu_si256((__m256i *)(void *)
				    ((uint8_t *)dst + 2 * 32), ymm2);
		_mm256_storeu_si256((__m256i *)(void *)
				    ((uint8_t *)dst + 3 * 32), ymm3);
		dst = (uint8_t *)dst + 128;
	}
}

static __rte_always_inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
	void *ret = dst;
	size_t dstofss;
	size_t bits;

	/**
	 * Copy less than 16 bytes
	 */
	if (n < 16) {
		return rte_mov15_or_less(dst, src, n);
	}

	/**
	 * Fast way when copy size doesn't exceed 256 bytes
	 */
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst - 16 + n,
				(const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 48) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16);
		rte_mov16((uint8_t *)dst - 16 + n,
				(const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov32((uint8_t *)dst - 32 + n,
				(const uint8_t *)src - 32 + n);
		return ret;
	}
	if (n <= 256) {
		if (n >= 128) {
			n -= 128;
			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 128;
			dst = (uint8_t *)dst + 128;
		}
COPY_BLOCK_128_BACK31:
		if (n >= 64) {
			n -= 64;
			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 64;
			dst = (uint8_t *)dst + 64;
		}
		if (n > 32) {
			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
			rte_mov32((uint8_t *)dst - 32 + n,
					(const uint8_t *)src - 32 + n);
			return ret;
		}
		if (n > 0) {
			rte_mov32((uint8_t *)dst - 32 + n,
					(const uint8_t *)src - 32 + n);
		}
		return ret;
	}

	/**
	 * Make store aligned when copy size exceeds 256 bytes
	 */
	dstofss = (uintptr_t)dst & 0x1F;
	if (dstofss > 0) {
		dstofss = 32 - dstofss;
		n -= dstofss;
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		src = (const uint8_t *)src + dstofss;
		dst = (uint8_t *)dst + dstofss;
	}

	/**
	 * Copy 128-byte blocks
	 */
	rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
	bits = n;
	n = n & 127;
	bits -= n;
	src = (const uint8_t *)src + bits;
	dst = (uint8_t *)dst + bits;

	/**
	 * Copy whatever is left
	 */
	goto COPY_BLOCK_128_BACK31;
}

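/*
 * Note on the AVX2 alignment step above (illustrative numbers): if dst
 * ends in 0x13, dstofss becomes 32 - 0x13 = 13, yet a full 32-byte move is
 * issued before advancing src and dst by only 13 bytes. The trailing 19
 * bytes are simply written again, with the same source data, by the first
 * aligned store of the 128-byte block loop that follows.
 */
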
#else /* __AVX512F__ */

#define ALIGNMENT_MASK 0x0F

/**
 * SSE & AVX implementation below
 */

/**
 * Copy 16 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
	__m128i xmm0;

	xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src);
	_mm_storeu_si128((__m128i *)(void *)dst, xmm0);
}

/**
 * Copy 32 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
}

/**
 * Copy 64 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
}

/**
 * Copy 128 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
}

/**
 * Copy 256 bytes from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov256(uint8_t *dst, const uint8_t *src)
{
	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
	rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
	rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
	rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
	rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
	rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
	rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
	rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
	rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
}

/**
 * Macro for copying an unaligned block from one location to another with a constant load offset,
 * a maximum of 47 bytes is left over,
 * locations should not overlap.
 * Requirements:
 * - Store is aligned
 * - Load offset is <offset>, which must be an immediate value within [1, 15]
 * - For <src>, make sure <offset> bytes backwards & <16 - offset> bytes forwards are available for loading
 * - <dst>, <src>, <len> must be variables
 * - __m128i <xmm0> ~ <xmm8> must be pre-defined
 */
#define MOVEUNALIGNED_LEFT47_IMM(dst, src, len, offset)                                                      \
{                                                                                                            \
    size_t tmp;                                                                                              \
    while (len >= 128 + 16 - offset) {                                                                       \
        xmm0 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 0 * 16));    \
        len -= 128;                                                                                          \
        xmm1 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 1 * 16));    \
        xmm2 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 2 * 16));    \
        xmm3 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 3 * 16));    \
        xmm4 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 4 * 16));    \
        xmm5 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 5 * 16));    \
        xmm6 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 6 * 16));    \
        xmm7 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 7 * 16));    \
        xmm8 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 8 * 16));    \
        src = (const uint8_t *)src + 128;                                                                    \
        _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \
        _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \
        _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset)); \
        _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset)); \
        _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset)); \
        _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset)); \
        _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset)); \
        _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset)); \
        dst = (uint8_t *)dst + 128;                                                                          \
    }                                                                                                        \
    tmp = len;                                                                                               \
    len = ((len - 16 + offset) & 127) + 16 - offset;                                                         \
    tmp -= len;                                                                                              \
    src = (const uint8_t *)src + tmp;                                                                        \
    dst = (uint8_t *)dst + tmp;                                                                              \
    if (len >= 32 + 16 - offset) {                                                                           \
        while (len >= 32 + 16 - offset) {                                                                    \
            xmm0 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 0 * 16)); \
            len -= 32;                                                                                       \
            xmm1 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 1 * 16)); \
            xmm2 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 2 * 16)); \
            src = (const uint8_t *)src + 32;                                                                 \
            _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \
            _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \
            dst = (uint8_t *)dst + 32;                                                                       \
        }                                                                                                    \
        tmp = len;                                                                                           \
        len = ((len - 16 + offset) & 31) + 16 - offset;                                                      \
        tmp -= len;                                                                                          \
        src = (const uint8_t *)src + tmp;                                                                    \
        dst = (uint8_t *)dst + tmp;                                                                          \
    }                                                                                                        \
}

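/*
 * Worked example for the macro above (illustrative): with offset = 5, xmm0
 * holds src[-5..10] and xmm1 holds src[11..26]; _mm_alignr_epi8(xmm1, xmm0, 5)
 * concatenates them and shifts right by 5 bytes, yielding exactly the 16
 * bytes starting at src. This is why <offset> bytes before src and
 * <16 - offset> bytes beyond each loaded group must be readable.
 */
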
/**
 * Macro for copying an unaligned block from one location to another,
 * a maximum of 47 bytes is left over,
 * locations should not overlap.
 * Use a switch here because the aligning instruction requires an immediate value for the shift count.
 * Requirements:
 * - Store is aligned
 * - Load offset is <offset>, which must be within [1, 15]
 * - For <src>, make sure <offset> bytes backwards & <16 - offset> bytes forwards are available for loading
 * - <dst>, <src>, <len> must be variables
 * - __m128i <xmm0> ~ <xmm8> used in MOVEUNALIGNED_LEFT47_IMM must be pre-defined
 */
#define MOVEUNALIGNED_LEFT47(dst, src, len, offset)                     \
{                                                                       \
    switch (offset) {                                                   \
    case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x01); break;    \
    case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x02); break;    \
    case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x03); break;    \
    case 0x04: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x04); break;    \
    case 0x05: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x05); break;    \
    case 0x06: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x06); break;    \
    case 0x07: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x07); break;    \
    case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x08); break;    \
    case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x09); break;    \
    case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0A); break;    \
    case 0x0B: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0B); break;    \
    case 0x0C: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0C); break;    \
    case 0x0D: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0D); break;    \
    case 0x0E: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0E); break;    \
    case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0F); break;    \
    default:;                                                           \
    }                                                                   \
}

static __rte_always_inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
	void *ret = dst;
	size_t dstofss;
	size_t srcofs;

	/**
	 * Copy less than 16 bytes
	 */
	if (n < 16) {
		return rte_mov15_or_less(dst, src, n);
	}

	/**
	 * Fast way when copy size doesn't exceed 512 bytes
	 */
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 48) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 128) {
		goto COPY_BLOCK_128_BACK15;
	}
	if (n <= 512) {
		if (n >= 256) {
			n -= 256;
			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
			rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128);
			src = (const uint8_t *)src + 256;
			dst = (uint8_t *)dst + 256;
		}
COPY_BLOCK_255_BACK15:
		if (n >= 128) {
			n -= 128;
			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 128;
			dst = (uint8_t *)dst + 128;
		}
COPY_BLOCK_128_BACK15:
		if (n >= 64) {
			n -= 64;
			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 64;
			dst = (uint8_t *)dst + 64;
		}
COPY_BLOCK_64_BACK15:
		if (n >= 32) {
			n -= 32;
			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 32;
			dst = (uint8_t *)dst + 32;
		}
		if (n > 16) {
			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
			rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
			return ret;
		}
		if (n > 0) {
			rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
		}
		return ret;
	}

	/**
	 * Make store aligned when copy size exceeds 512 bytes,
	 * and make sure the first 15 bytes are copied, because
	 * unaligned copy functions require up to 15 bytes
	 * backwards access.
	 */
	dstofss = (uintptr_t)dst & 0x0F;
	if (dstofss > 0) {
		dstofss = 16 - dstofss + 16;
		n -= dstofss;
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		src = (const uint8_t *)src + dstofss;
		dst = (uint8_t *)dst + dstofss;
	}
	srcofs = ((uintptr_t)src & 0x0F);

	/**
	 * For aligned copy
	 */
	if (srcofs == 0) {
		/**
		 * Copy 256-byte blocks
		 */
		for (; n >= 256; n -= 256) {
			rte_mov256((uint8_t *)dst, (const uint8_t *)src);
			dst = (uint8_t *)dst + 256;
			src = (const uint8_t *)src + 256;
		}

		/**
		 * Copy whatever is left
		 */
		goto COPY_BLOCK_255_BACK15;
	}

	/**
	 * For copy with unaligned load
	 */
	MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);

	/**
	 * Copy whatever is left
	 */
	goto COPY_BLOCK_64_BACK15;
}

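/*
 * Note on the alignment step above (illustrative): dstofss = 16 -
 * ((uintptr_t)dst & 0x0F) + 16 lies in [17, 31], so the 32-byte head copy
 * both aligns dst to 16 bytes and moves src forward by at least 17 bytes.
 * The unaligned-load macros above may therefore read up to 15 bytes
 * backwards from src without leaving the caller's source buffer.
 */
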
#endif /* __AVX512F__ */

static __rte_always_inline void *
rte_memcpy_aligned(void *dst, const void *src, size_t n)
{
	void *ret = dst;

	/* Copy size < 16 bytes */
	if (n < 16) {
		return rte_mov15_or_less(dst, src, n);
	}

	/* Copy 16 <= size <= 32 bytes */
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst - 16 + n,
				(const uint8_t *)src - 16 + n);

		return ret;
	}

	/* Copy 32 < size <= 64 bytes */
	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov32((uint8_t *)dst - 32 + n,
				(const uint8_t *)src - 32 + n);

		return ret;
	}

	/* Copy 64-byte blocks */
	for (; n > 64; n -= 64) {
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		dst = (uint8_t *)dst + 64;
		src = (const uint8_t *)src + 64;
	}

	/* Copy whatever is left */
	rte_mov64((uint8_t *)dst - 64 + n,
			(const uint8_t *)src - 64 + n);

	return ret;
}

static __rte_always_inline void *
rte_memcpy(void *dst, const void *src, size_t n)
{
	if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
		return rte_memcpy_aligned(dst, src, n);
	else
		return rte_memcpy_generic(dst, src, n);
}

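/*
 * Dispatch sketch (illustrative): ALIGNMENT_MASK is 0x3F, 0x1F or 0x0F for
 * the AVX512, AVX2 and SSE builds respectively, so e.g. in an AVX2 build
 * two pointers that are both 32-byte aligned take the rte_memcpy_aligned()
 * path, while any other combination falls back to rte_memcpy_generic().
 */
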
#undef ALIGNMENT_MASK

#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
#pragma GCC diagnostic pop
#endif

#ifdef __cplusplus
}
#endif

#endif /* _RTE_MEMCPY_X86_64_H_ */