/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation
 */

#ifndef _RTE_MEMCPY_X86_64_H_
#define _RTE_MEMCPY_X86_64_H_

/**
 * @file
 *
 * Functions for SSE/AVX/AVX2/AVX512 implementation of memcpy().
 */

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <rte_vect.h>
#include <rte_common.h>
#include <rte_config.h>

#ifdef __cplusplus
extern "C" {
#endif

#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstringop-overflow"
#endif

/**
 * Copy bytes from one location to another. The locations must not overlap.
 *
 * @note This is implemented as a macro, so its address should not be taken,
 * and care is needed as parameter expressions may be evaluated multiple times.
 *
 * @param dst
 *   Pointer to the destination of the data.
 * @param src
 *   Pointer to the source data.
 * @param n
 *   Number of bytes to copy.
 * @return
 *   Pointer to the destination data.
 */
static __rte_always_inline void *
rte_memcpy(void *dst, const void *src, size_t n);
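
/*
 * Usage sketch (illustrative only; the buffer and the "pkt_data"/"pkt_len"
 * names are assumptions, not DPDK symbols): a caller copying a payload of
 * known, bounded length into a local buffer.
 *
 *	uint8_t local_buf[2048];
 *
 *	if (pkt_len <= sizeof(local_buf))
 *		rte_memcpy(local_buf, pkt_data, pkt_len);
 */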

/**
 * Copy bytes from one location to another,
 * locations should not overlap.
 * Use with n <= 15.
 */
static __rte_always_inline void *
rte_mov15_or_less(void *dst, const void *src, size_t n)
{
	/**
	 * Use the following structs to avoid violating C standard
	 * alignment requirements and to avoid strict aliasing bugs
	 */
	struct rte_uint64_alias {
		uint64_t val;
	} __rte_packed __rte_may_alias;
	struct rte_uint32_alias {
		uint32_t val;
	} __rte_packed __rte_may_alias;
	struct rte_uint16_alias {
		uint16_t val;
	} __rte_packed __rte_may_alias;

	void *ret = dst;
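	/*
	 * Example (arithmetic only): for n = 13 (binary 1101) the 8-, 4- and
	 * 1-byte branches below each fire once, copying 8 + 4 + 1 = 13 bytes.
	 */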
	if (n & 8) {
		((struct rte_uint64_alias *)dst)->val =
			((const struct rte_uint64_alias *)src)->val;
		src = (const uint64_t *)src + 1;
		dst = (uint64_t *)dst + 1;
	}
	if (n & 4) {
		((struct rte_uint32_alias *)dst)->val =
			((const struct rte_uint32_alias *)src)->val;
		src = (const uint32_t *)src + 1;
		dst = (uint32_t *)dst + 1;
	}
	if (n & 2) {
		((struct rte_uint16_alias *)dst)->val =
			((const struct rte_uint16_alias *)src)->val;
		src = (const uint16_t *)src + 1;
		dst = (uint16_t *)dst + 1;
	}
	if (n & 1)
		*(uint8_t *)dst = *(const uint8_t *)src;
	return ret;
}

#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512

#define ALIGNMENT_MASK 0x3F

/**
 * AVX512 implementation below
 */

/**
 * Copy 16 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
	__m128i xmm0;

	xmm0 = _mm_loadu_si128((const __m128i *)src);
	_mm_storeu_si128((__m128i *)dst, xmm0);
}

/**
 * Copy 32 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
	__m256i ymm0;

	ymm0 = _mm256_loadu_si256((const __m256i *)src);
	_mm256_storeu_si256((__m256i *)dst, ymm0);
}

/**
 * Copy 64 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
	__m512i zmm0;

	zmm0 = _mm512_loadu_si512((const void *)src);
	_mm512_storeu_si512((void *)dst, zmm0);
}

/**
 * Copy 128 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
	rte_mov64(dst + 0 * 64, src + 0 * 64);
	rte_mov64(dst + 1 * 64, src + 1 * 64);
}

/**
 * Copy 256 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov256(uint8_t *dst, const uint8_t *src)
{
	rte_mov64(dst + 0 * 64, src + 0 * 64);
	rte_mov64(dst + 1 * 64, src + 1 * 64);
	rte_mov64(dst + 2 * 64, src + 2 * 64);
	rte_mov64(dst + 3 * 64, src + 3 * 64);
}

/**
 * Copy 128-byte blocks from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
	__m512i zmm0, zmm1;

	while (n >= 128) {
		zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
		n -= 128;
		zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
		src = src + 128;
		_mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
		_mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
		dst = dst + 128;
	}
}

/**
 * Copy 512-byte blocks from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
	__m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;

	while (n >= 512) {
		zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
		n -= 512;
		zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
		zmm2 = _mm512_loadu_si512((const void *)(src + 2 * 64));
		zmm3 = _mm512_loadu_si512((const void *)(src + 3 * 64));
		zmm4 = _mm512_loadu_si512((const void *)(src + 4 * 64));
		zmm5 = _mm512_loadu_si512((const void *)(src + 5 * 64));
		zmm6 = _mm512_loadu_si512((const void *)(src + 6 * 64));
		zmm7 = _mm512_loadu_si512((const void *)(src + 7 * 64));
		src = src + 512;
		_mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
		_mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
		_mm512_storeu_si512((void *)(dst + 2 * 64), zmm2);
		_mm512_storeu_si512((void *)(dst + 3 * 64), zmm3);
		_mm512_storeu_si512((void *)(dst + 4 * 64), zmm4);
		_mm512_storeu_si512((void *)(dst + 5 * 64), zmm5);
		_mm512_storeu_si512((void *)(dst + 6 * 64), zmm6);
		_mm512_storeu_si512((void *)(dst + 7 * 64), zmm7);
		dst = dst + 512;
	}
}

static __rte_always_inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
	void *ret = dst;
	size_t dstofss;
	size_t bits;

	/**
	 * Copy less than 16 bytes
	 */
	if (n < 16) {
		return rte_mov15_or_less(dst, src, n);
	}

	/**
	 * Fast way when copy size doesn't exceed 512 bytes
	 */
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst - 16 + n,
				  (const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov32((uint8_t *)dst - 32 + n,
				  (const uint8_t *)src - 32 + n);
		return ret;
	}
	if (n <= 512) {
		if (n >= 256) {
			n -= 256;
			rte_mov256((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 256;
			dst = (uint8_t *)dst + 256;
		}
		if (n >= 128) {
			n -= 128;
			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 128;
			dst = (uint8_t *)dst + 128;
		}
COPY_BLOCK_128_BACK63:
		if (n > 64) {
			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
			rte_mov64((uint8_t *)dst - 64 + n,
					  (const uint8_t *)src - 64 + n);
			return ret;
		}
		if (n > 0)
			rte_mov64((uint8_t *)dst - 64 + n,
					  (const uint8_t *)src - 64 + n);
		return ret;
	}

	/**
	 * Make store aligned when copy size exceeds 512 bytes
	 */
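	/*
	 * Worked example (illustrative): if dst == 0x1007, dstofss is first
	 * 0x1007 & 0x3F = 0x07, then 64 - 7 = 57, so 57 head bytes are copied
	 * and dst advances to the next 64-byte boundary (0x1040).
	 */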
	dstofss = ((uintptr_t)dst & 0x3F);
	if (dstofss > 0) {
		dstofss = 64 - dstofss;
		n -= dstofss;
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		src = (const uint8_t *)src + dstofss;
		dst = (uint8_t *)dst + dstofss;
	}

	/**
	 * Copy 512-byte blocks.
	 * Use copy block function for better instruction order control,
	 * which is important when load is unaligned.
	 */
	rte_mov512blocks((uint8_t *)dst, (const uint8_t *)src, n);
	bits = n;
	n = n & 511;
	bits -= n;
	src = (const uint8_t *)src + bits;
	dst = (uint8_t *)dst + bits;

	/**
	 * Copy 128-byte blocks.
	 * Use copy block function for better instruction order control,
	 * which is important when load is unaligned.
	 */
	if (n >= 128) {
		rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
		bits = n;
		n = n & 127;
		bits -= n;
		src = (const uint8_t *)src + bits;
		dst = (uint8_t *)dst + bits;
	}

	/**
	 * Copy whatever is left
	 */
	goto COPY_BLOCK_128_BACK63;
}

#elif defined __AVX2__

#define ALIGNMENT_MASK 0x1F

/**
 * AVX2 implementation below
 */

/**
 * Copy 16 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
	__m128i xmm0;

	xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src);
	_mm_storeu_si128((__m128i *)(void *)dst, xmm0);
}

/**
 * Copy 32 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
	__m256i ymm0;

	ymm0 = _mm256_loadu_si256((const __m256i *)(const void *)src);
	_mm256_storeu_si256((__m256i *)(void *)dst, ymm0);
}

/**
 * Copy 64 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
}

/**
 * Copy 128 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
	rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
	rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
}

/**
 * Copy 128-byte blocks from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
	__m256i ymm0, ymm1, ymm2, ymm3;

	while (n >= 128) {
		ymm0 = _mm256_loadu_si256((const __m256i *)(const void *)
					  ((const uint8_t *)src + 0 * 32));
		n -= 128;
		ymm1 = _mm256_loadu_si256((const __m256i *)(const void *)
					  ((const uint8_t *)src + 1 * 32));
		ymm2 = _mm256_loadu_si256((const __m256i *)(const void *)
					  ((const uint8_t *)src + 2 * 32));
		ymm3 = _mm256_loadu_si256((const __m256i *)(const void *)
					  ((const uint8_t *)src + 3 * 32));
		src = (const uint8_t *)src + 128;
		_mm256_storeu_si256((__m256i *)(void *)
				    ((uint8_t *)dst + 0 * 32), ymm0);
		_mm256_storeu_si256((__m256i *)(void *)
				    ((uint8_t *)dst + 1 * 32), ymm1);
		_mm256_storeu_si256((__m256i *)(void *)
				    ((uint8_t *)dst + 2 * 32), ymm2);
		_mm256_storeu_si256((__m256i *)(void *)
				    ((uint8_t *)dst + 3 * 32), ymm3);
		dst = (uint8_t *)dst + 128;
	}
}

static __rte_always_inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
	void *ret = dst;
	size_t dstofss;
	size_t bits;

	/**
	 * Copy less than 16 bytes
	 */
	if (n < 16) {
		return rte_mov15_or_less(dst, src, n);
	}

	/**
	 * Fast way when copy size doesn't exceed 256 bytes
	 */
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst - 16 + n,
				(const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 48) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16);
		rte_mov16((uint8_t *)dst - 16 + n,
				(const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov32((uint8_t *)dst - 32 + n,
				(const uint8_t *)src - 32 + n);
		return ret;
	}
	if (n <= 256) {
		if (n >= 128) {
			n -= 128;
			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 128;
			dst = (uint8_t *)dst + 128;
		}
COPY_BLOCK_128_BACK31:
		if (n >= 64) {
			n -= 64;
			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 64;
			dst = (uint8_t *)dst + 64;
		}
		if (n > 32) {
			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
			rte_mov32((uint8_t *)dst - 32 + n,
					(const uint8_t *)src - 32 + n);
			return ret;
		}
		if (n > 0) {
			rte_mov32((uint8_t *)dst - 32 + n,
					(const uint8_t *)src - 32 + n);
		}
		return ret;
	}

	/**
	 * Make store aligned when copy size exceeds 256 bytes
	 */
	dstofss = (uintptr_t)dst & 0x1F;
	if (dstofss > 0) {
		dstofss = 32 - dstofss;
		n -= dstofss;
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		src = (const uint8_t *)src + dstofss;
		dst = (uint8_t *)dst + dstofss;
	}

	/**
	 * Copy 128-byte blocks
	 */
	rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
	bits = n;
	n = n & 127;
	bits -= n;
	src = (const uint8_t *)src + bits;
	dst = (uint8_t *)dst + bits;

	/**
	 * Copy whatever is left
	 */
	goto COPY_BLOCK_128_BACK31;
}

#else /* __AVX512F__ */

#define ALIGNMENT_MASK 0x0F

/**
 * SSE & AVX implementation below
 */

/**
 * Copy 16 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
	__m128i xmm0;

	xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src);
	_mm_storeu_si128((__m128i *)(void *)dst, xmm0);
}

/**
 * Copy 32 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
}

/**
 * Copy 64 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
}

/**
 * Copy 128 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
}

/**
 * Copy 256 bytes from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov256(uint8_t *dst, const uint8_t *src)
{
	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
	rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
	rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
	rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
	rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
	rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
	rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
	rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
	rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
}

/**
 * Macro for copying an unaligned block from one location to another with a constant load offset,
 * 47 bytes leftover maximum,
 * locations should not overlap.
 * Requirements:
 * - Store is aligned
 * - Load offset is <offset>, which must be an immediate value within [1, 15]
 * - For <src>, make sure <offset> bytes backwards & <16 - offset> bytes forwards are available for loading
 * - <dst>, <src>, <len> must be variables
 * - __m128i <xmm0> ~ <xmm8> must be pre-defined
 */
#define MOVEUNALIGNED_LEFT47_IMM(dst, src, len, offset) \
__extension__ ({ \
    size_t tmp; \
    while (len >= 128 + 16 - offset) { \
        xmm0 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 0 * 16)); \
        len -= 128; \
        xmm1 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 1 * 16)); \
        xmm2 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 2 * 16)); \
        xmm3 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 3 * 16)); \
        xmm4 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 4 * 16)); \
        xmm5 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 5 * 16)); \
        xmm6 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 6 * 16)); \
        xmm7 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 7 * 16)); \
        xmm8 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 8 * 16)); \
        src = (const uint8_t *)src + 128; \
        _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \
        _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \
        _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset)); \
        _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset)); \
        _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset)); \
        _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset)); \
        _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset)); \
        _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset)); \
        dst = (uint8_t *)dst + 128; \
    } \
    tmp = len; \
    len = ((len - 16 + offset) & 127) + 16 - offset; \
    tmp -= len; \
    src = (const uint8_t *)src + tmp; \
    dst = (uint8_t *)dst + tmp; \
    if (len >= 32 + 16 - offset) { \
        while (len >= 32 + 16 - offset) { \
            xmm0 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 0 * 16)); \
            len -= 32; \
            xmm1 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 1 * 16)); \
            xmm2 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 2 * 16)); \
            src = (const uint8_t *)src + 32; \
            _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \
            _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \
            dst = (uint8_t *)dst + 32; \
        } \
        tmp = len; \
        len = ((len - 16 + offset) & 31) + 16 - offset; \
        tmp -= len; \
        src = (const uint8_t *)src + tmp; \
        dst = (uint8_t *)dst + tmp; \
    } \
})
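
/*
 * How the stitch works (illustrative, e.g. offset == 5): each 16-byte load
 * starts at src - 5, so consecutive loads cover src[-5..10], src[11..26], ...
 * _mm_alignr_epi8(xmm1, xmm0, 5) concatenates xmm1:xmm0, shifts right by 5
 * bytes and keeps the low 16 bytes, i.e. exactly src[0..15], which is then
 * written to a dst address the caller has already 16-byte aligned.
 */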

/**
 * Macro for copying an unaligned block from one location to another,
 * 47 bytes leftover maximum,
 * locations should not overlap.
 * Use switch here because the aligning instruction requires an immediate value for the shift count.
 * Requirements:
 * - Store is aligned
 * - Load offset is <offset>, which must be within [1, 15]
 * - For <src>, make sure <offset> bytes backwards & <16 - offset> bytes forwards are available for loading
 * - <dst>, <src>, <len> must be variables
 * - __m128i <xmm0> ~ <xmm8> used in MOVEUNALIGNED_LEFT47_IMM must be pre-defined
 */
#define MOVEUNALIGNED_LEFT47(dst, src, len, offset)                     \
__extension__ ({                                                        \
    switch (offset) {                                                   \
    case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x01); break;    \
    case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x02); break;    \
    case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x03); break;    \
    case 0x04: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x04); break;    \
    case 0x05: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x05); break;    \
    case 0x06: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x06); break;    \
    case 0x07: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x07); break;    \
    case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x08); break;    \
    case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x09); break;    \
    case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0A); break;    \
    case 0x0B: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0B); break;    \
    case 0x0C: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0C); break;    \
    case 0x0D: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0D); break;    \
    case 0x0E: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0E); break;    \
    case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0F); break;    \
    default:;                                                           \
    }                                                                   \
})

static __rte_always_inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
	void *ret = dst;
	size_t dstofss;
	size_t srcofs;

	/**
	 * Copy less than 16 bytes
	 */
	if (n < 16) {
		return rte_mov15_or_less(dst, src, n);
	}

	/**
	 * Fast way when copy size doesn't exceed 512 bytes
	 */
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 48) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 128) {
		goto COPY_BLOCK_128_BACK15;
	}
	if (n <= 512) {
		if (n >= 256) {
			n -= 256;
			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
			rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128);
			src = (const uint8_t *)src + 256;
			dst = (uint8_t *)dst + 256;
		}
COPY_BLOCK_255_BACK15:
		if (n >= 128) {
			n -= 128;
			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 128;
			dst = (uint8_t *)dst + 128;
		}
COPY_BLOCK_128_BACK15:
		if (n >= 64) {
			n -= 64;
			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 64;
			dst = (uint8_t *)dst + 64;
		}
COPY_BLOCK_64_BACK15:
		if (n >= 32) {
			n -= 32;
			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 32;
			dst = (uint8_t *)dst + 32;
		}
		if (n > 16) {
			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
			rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
			return ret;
		}
		if (n > 0) {
			rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
		}
		return ret;
	}

	/**
	 * Make store aligned when copy size exceeds 512 bytes,
	 * and make sure the first 15 bytes are copied, because
	 * unaligned copy functions require up to 15 bytes
	 * backwards access.
	 */
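	/*
	 * Worked example (illustrative): if dst & 0x0F == 9, then
	 * dstofss = 16 - 9 + 16 = 23; the 32-byte head copy below covers
	 * those 23 bytes, dst advances to a 16-byte boundary, and src
	 * advances by at least 17 bytes, so the unaligned path's reads at
	 * src - offset (offset <= 15) never fall before the source buffer.
	 */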
	dstofss = (uintptr_t)dst & 0x0F;
	if (dstofss > 0) {
		dstofss = 16 - dstofss + 16;
		n -= dstofss;
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		src = (const uint8_t *)src + dstofss;
		dst = (uint8_t *)dst + dstofss;
	}
	srcofs = ((uintptr_t)src & 0x0F);

	/**
	 * For aligned copy
	 */
	if (srcofs == 0) {
		/**
		 * Copy 256-byte blocks
		 */
		for (; n >= 256; n -= 256) {
			rte_mov256((uint8_t *)dst, (const uint8_t *)src);
			dst = (uint8_t *)dst + 256;
			src = (const uint8_t *)src + 256;
		}

		/**
		 * Copy whatever is left
		 */
		goto COPY_BLOCK_255_BACK15;
	}

	/**
	 * For copy with unaligned load
	 */
	MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);

	/**
	 * Copy whatever is left
	 */
	goto COPY_BLOCK_64_BACK15;
}

#endif /* __AVX512F__ */

static __rte_always_inline void *
rte_memcpy_aligned(void *dst, const void *src, size_t n)
{
	void *ret = dst;

	/* Copy size < 16 bytes */
	if (n < 16) {
		return rte_mov15_or_less(dst, src, n);
	}

	/* Copy 16 <= size <= 32 bytes */
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst - 16 + n,
				(const uint8_t *)src - 16 + n);

		return ret;
	}

	/* Copy 32 < size <= 64 bytes */
	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov32((uint8_t *)dst - 32 + n,
				(const uint8_t *)src - 32 + n);

		return ret;
	}

	/* Copy 64-byte blocks */
	for (; n >= 64; n -= 64) {
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		dst = (uint8_t *)dst + 64;
		src = (const uint8_t *)src + 64;
	}

	/* Copy whatever is left */
	rte_mov64((uint8_t *)dst - 64 + n,
			(const uint8_t *)src - 64 + n);

	return ret;
}

static __rte_always_inline void *
rte_memcpy(void *dst, const void *src, size_t n)
{
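	/*
	 * ORing the two addresses and masking tests both pointers at once:
	 * the result is zero only when dst and src are each aligned to
	 * (ALIGNMENT_MASK + 1) bytes, i.e. to the vector width of the
	 * implementation selected above (64, 32 or 16 bytes).
	 */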
	if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
		return rte_memcpy_aligned(dst, src, n);
	else
		return rte_memcpy_generic(dst, src, n);
}

#undef ALIGNMENT_MASK

#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
#pragma GCC diagnostic pop
#endif

#ifdef __cplusplus
}
#endif

#endif /* _RTE_MEMCPY_X86_64_H_ */