/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation
 */

#ifndef _RTE_MEMCPY_X86_64_H_
#define _RTE_MEMCPY_X86_64_H_

/**
 * @file
 *
 * Functions for SSE/AVX/AVX2/AVX512 implementations of memcpy().
 */

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <rte_vect.h>
#include <rte_common.h>
#include <rte_config.h>

#ifdef __cplusplus
extern "C" {
#endif

#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstringop-overflow"
#endif

/**
 * Copy bytes from one location to another. The locations must not overlap.
 *
 * @note Depending on the platform, this may be implemented as a macro, so its
 * address should not be taken and care is needed as parameter expressions may
 * be evaluated multiple times.
 *
 * @param dst
 *   Pointer to the destination of the data.
 * @param src
 *   Pointer to the source data.
 * @param n
 *   Number of bytes to copy.
 * @return
 *   Pointer to the destination data.
 */
static __rte_always_inline void *
rte_memcpy(void *dst, const void *src, size_t n);

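/*
 * Illustrative usage (a minimal sketch; the buffer names below are
 * placeholders, the only requirement is that the regions do not overlap):
 *
 *	uint8_t local_copy[128];
 *	rte_memcpy(local_copy, rx_buf, sizeof(local_copy));
 */
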
#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512

#define ALIGNMENT_MASK 0x3F

/**
 * AVX512 implementation below
 */

/**
 * Copy 16 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
	__m128i xmm0;

	xmm0 = _mm_loadu_si128((const __m128i *)src);
	_mm_storeu_si128((__m128i *)dst, xmm0);
}

/**
 * Copy 32 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
	__m256i ymm0;

	ymm0 = _mm256_loadu_si256((const __m256i *)src);
	_mm256_storeu_si256((__m256i *)dst, ymm0);
}

/**
 * Copy 64 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
	__m512i zmm0;

	zmm0 = _mm512_loadu_si512((const void *)src);
	_mm512_storeu_si512((void *)dst, zmm0);
}

/**
 * Copy 128 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
	rte_mov64(dst + 0 * 64, src + 0 * 64);
	rte_mov64(dst + 1 * 64, src + 1 * 64);
}

/**
 * Copy 256 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov256(uint8_t *dst, const uint8_t *src)
{
	rte_mov64(dst + 0 * 64, src + 0 * 64);
	rte_mov64(dst + 1 * 64, src + 1 * 64);
	rte_mov64(dst + 2 * 64, src + 2 * 64);
	rte_mov64(dst + 3 * 64, src + 3 * 64);
}

/**
 * Copy 128-byte blocks from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
	__m512i zmm0, zmm1;

	while (n >= 128) {
		zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
		n -= 128;
		zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
		src = src + 128;
		_mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
		_mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
		dst = dst + 128;
	}
}

/**
 * Copy 512-byte blocks from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
	__m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;

	while (n >= 512) {
		zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
		n -= 512;
		zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
		zmm2 = _mm512_loadu_si512((const void *)(src + 2 * 64));
		zmm3 = _mm512_loadu_si512((const void *)(src + 3 * 64));
		zmm4 = _mm512_loadu_si512((const void *)(src + 4 * 64));
		zmm5 = _mm512_loadu_si512((const void *)(src + 5 * 64));
		zmm6 = _mm512_loadu_si512((const void *)(src + 6 * 64));
		zmm7 = _mm512_loadu_si512((const void *)(src + 7 * 64));
		src = src + 512;
		_mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
		_mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
		_mm512_storeu_si512((void *)(dst + 2 * 64), zmm2);
		_mm512_storeu_si512((void *)(dst + 3 * 64), zmm3);
		_mm512_storeu_si512((void *)(dst + 4 * 64), zmm4);
		_mm512_storeu_si512((void *)(dst + 5 * 64), zmm5);
		_mm512_storeu_si512((void *)(dst + 6 * 64), zmm6);
		_mm512_storeu_si512((void *)(dst + 7 * 64), zmm7);
		dst = dst + 512;
	}
}

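/*
 * Strategy of rte_memcpy_generic(), in short: copies below 16 bytes use a
 * few scalar stores, copies up to 512 bytes use a handful of possibly
 * overlapping vector moves, and larger copies first align the destination
 * to 64 bytes and then stream 512-byte and 128-byte blocks.
 */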
static __rte_always_inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
	uintptr_t dstu = (uintptr_t)dst;
	uintptr_t srcu = (uintptr_t)src;
	void *ret = dst;
	size_t dstofss;
	size_t bits;

	/**
	 * Copy less than 16 bytes
	 */
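	/*
	 * n < 16 is decomposed by its set bits: one 1-, 2-, 4- and/or 8-byte
	 * store per bit, so at most four scalar moves and no per-byte loop.
	 */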
	if (n < 16) {
		if (n & 0x01) {
			*(uint8_t *)dstu = *(const uint8_t *)srcu;
			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
			dstu = (uintptr_t)((uint8_t *)dstu + 1);
		}
		if (n & 0x02) {
			*(uint16_t *)dstu = *(const uint16_t *)srcu;
			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
			dstu = (uintptr_t)((uint16_t *)dstu + 1);
		}
		if (n & 0x04) {
			*(uint32_t *)dstu = *(const uint32_t *)srcu;
			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
			dstu = (uintptr_t)((uint32_t *)dstu + 1);
		}
		if (n & 0x08)
			*(uint64_t *)dstu = *(const uint64_t *)srcu;
		return ret;
	}

	/**
	 * Fast way when copy size doesn't exceed 512 bytes
	 */
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst - 16 + n,
				  (const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov32((uint8_t *)dst - 32 + n,
				  (const uint8_t *)src - 32 + n);
		return ret;
	}
	if (n <= 512) {
		if (n >= 256) {
			n -= 256;
			rte_mov256((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 256;
			dst = (uint8_t *)dst + 256;
		}
		if (n >= 128) {
			n -= 128;
			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 128;
			dst = (uint8_t *)dst + 128;
		}
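		/*
		 * The tail below copies the last 64 bytes ending exactly at
		 * dst + n; it may overlap stores already done above, which is
		 * harmless because src and dst themselves do not overlap.
		 */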
COPY_BLOCK_128_BACK63:
		if (n > 64) {
			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
			rte_mov64((uint8_t *)dst - 64 + n,
					  (const uint8_t *)src - 64 + n);
			return ret;
		}
		if (n > 0)
			rte_mov64((uint8_t *)dst - 64 + n,
					  (const uint8_t *)src - 64 + n);
		return ret;
	}

	/**
	 * Make store aligned when copy size exceeds 512 bytes
	 */
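	/*
	 * dstofss = 64 - (dst & 0x3F): that many leading bytes are covered
	 * by one unaligned 64-byte move so that every following store is
	 * 64-byte aligned.
	 */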
	dstofss = ((uintptr_t)dst & 0x3F);
	if (dstofss > 0) {
		dstofss = 64 - dstofss;
		n -= dstofss;
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		src = (const uint8_t *)src + dstofss;
		dst = (uint8_t *)dst + dstofss;
	}

	/**
	 * Copy 512-byte blocks.
	 * Use copy block function for better instruction order control,
	 * which is important when the load is unaligned.
	 */
	rte_mov512blocks((uint8_t *)dst, (const uint8_t *)src, n);
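	/*
	 * bits ends up holding the bytes consumed by the block loop
	 * (n rounded down to a multiple of 512); n keeps the remainder.
	 */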
	bits = n;
	n = n & 511;
	bits -= n;
	src = (const uint8_t *)src + bits;
	dst = (uint8_t *)dst + bits;

	/**
	 * Copy 128-byte blocks.
	 * Use copy block function for better instruction order control,
	 * which is important when the load is unaligned.
	 */
	if (n >= 128) {
		rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
		bits = n;
		n = n & 127;
		bits -= n;
		src = (const uint8_t *)src + bits;
		dst = (uint8_t *)dst + bits;
	}

	/**
	 * Copy whatever is left
	 */
	goto COPY_BLOCK_128_BACK63;
}

#elif defined __AVX2__

#define ALIGNMENT_MASK 0x1F

/**
 * AVX2 implementation below
 */

/**
 * Copy 16 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
	__m128i xmm0;

	xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src);
	_mm_storeu_si128((__m128i *)(void *)dst, xmm0);
}

/**
 * Copy 32 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
	__m256i ymm0;

	ymm0 = _mm256_loadu_si256((const __m256i *)(const void *)src);
	_mm256_storeu_si256((__m256i *)(void *)dst, ymm0);
}

/**
 * Copy 64 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
}

/**
 * Copy 128 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
	rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
	rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
}

/**
 * Copy 128-byte blocks from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
	__m256i ymm0, ymm1, ymm2, ymm3;

	while (n >= 128) {
		ymm0 = _mm256_loadu_si256((const __m256i *)(const void *)
					  ((const uint8_t *)src + 0 * 32));
		n -= 128;
		ymm1 = _mm256_loadu_si256((const __m256i *)(const void *)
					  ((const uint8_t *)src + 1 * 32));
		ymm2 = _mm256_loadu_si256((const __m256i *)(const void *)
					  ((const uint8_t *)src + 2 * 32));
		ymm3 = _mm256_loadu_si256((const __m256i *)(const void *)
					  ((const uint8_t *)src + 3 * 32));
		src = (const uint8_t *)src + 128;
		_mm256_storeu_si256((__m256i *)(void *)
				    ((uint8_t *)dst + 0 * 32), ymm0);
		_mm256_storeu_si256((__m256i *)(void *)
				    ((uint8_t *)dst + 1 * 32), ymm1);
		_mm256_storeu_si256((__m256i *)(void *)
				    ((uint8_t *)dst + 2 * 32), ymm2);
		_mm256_storeu_si256((__m256i *)(void *)
				    ((uint8_t *)dst + 3 * 32), ymm3);
		dst = (uint8_t *)dst + 128;
	}
}

static __rte_always_inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
	uintptr_t dstu = (uintptr_t)dst;
	uintptr_t srcu = (uintptr_t)src;
	void *ret = dst;
	size_t dstofss;
	size_t bits;

	/**
	 * Copy less than 16 bytes
	 */
	if (n < 16) {
		if (n & 0x01) {
			*(uint8_t *)dstu = *(const uint8_t *)srcu;
			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
			dstu = (uintptr_t)((uint8_t *)dstu + 1);
		}
		if (n & 0x02) {
			*(uint16_t *)dstu = *(const uint16_t *)srcu;
			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
			dstu = (uintptr_t)((uint16_t *)dstu + 1);
		}
		if (n & 0x04) {
			*(uint32_t *)dstu = *(const uint32_t *)srcu;
			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
			dstu = (uintptr_t)((uint32_t *)dstu + 1);
		}
		if (n & 0x08) {
			*(uint64_t *)dstu = *(const uint64_t *)srcu;
		}
		return ret;
	}

	/**
	 * Fast way when copy size doesn't exceed 256 bytes
	 */
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst - 16 + n,
				(const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 48) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16);
		rte_mov16((uint8_t *)dst - 16 + n,
				(const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov32((uint8_t *)dst - 32 + n,
				(const uint8_t *)src - 32 + n);
		return ret;
	}
	if (n <= 256) {
		if (n >= 128) {
			n -= 128;
			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 128;
			dst = (uint8_t *)dst + 128;
		}
COPY_BLOCK_128_BACK31:
		if (n >= 64) {
			n -= 64;
			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 64;
			dst = (uint8_t *)dst + 64;
		}
		if (n > 32) {
			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
			rte_mov32((uint8_t *)dst - 32 + n,
					(const uint8_t *)src - 32 + n);
			return ret;
		}
		if (n > 0) {
			rte_mov32((uint8_t *)dst - 32 + n,
					(const uint8_t *)src - 32 + n);
		}
		return ret;
	}

	/**
	 * Make store aligned when copy size exceeds 256 bytes
	 */
	dstofss = (uintptr_t)dst & 0x1F;
	if (dstofss > 0) {
		dstofss = 32 - dstofss;
		n -= dstofss;
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		src = (const uint8_t *)src + dstofss;
		dst = (uint8_t *)dst + dstofss;
	}

	/**
	 * Copy 128-byte blocks
	 */
	rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
	bits = n;
	n = n & 127;
	bits -= n;
	src = (const uint8_t *)src + bits;
	dst = (uint8_t *)dst + bits;

	/**
	 * Copy whatever is left
	 */
	goto COPY_BLOCK_128_BACK31;
}

#else /* neither AVX512 nor AVX2 */

#define ALIGNMENT_MASK 0x0F

/**
 * SSE & AVX implementation below
 */

/**
 * Copy 16 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
	__m128i xmm0;

	xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src);
	_mm_storeu_si128((__m128i *)(void *)dst, xmm0);
}

/**
 * Copy 32 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
}

/**
 * Copy 64 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
}

/**
 * Copy 128 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
}

/**
 * Copy 256 bytes from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov256(uint8_t *dst, const uint8_t *src)
{
	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
	rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
	rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
	rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
	rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
	rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
	rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
	rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
	rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
}

/**
 * Macro for copying an unaligned block from one location to another with a
 * constant load offset; at most 47 bytes are left over for the caller to
 * copy, and the locations should not overlap.
 * Requirements:
 * - Store is aligned
 * - Load offset is <offset>, which must be an immediate value within [1, 15]
 * - For <src>, make sure <offset> bytes backwards & <16 - offset> bytes forwards are available for loading
 * - <dst>, <src>, <len> must be variables
 * - __m128i <xmm0> ~ <xmm8> must be pre-defined
 */
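/*
 * How the copy below works: since (src & 0x0F) == offset, every load at
 * (src - offset + k * 16) hits a 16-byte boundary; _mm_alignr_epi8() then
 * shifts each pair of neighbouring registers by <offset> bytes so that the
 * aligned 16-byte stores receive the data starting at the real source
 * position.
 */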
#define MOVEUNALIGNED_LEFT47_IMM(dst, src, len, offset) \
__extension__ ({ \
    size_t tmp; \
    while (len >= 128 + 16 - offset) { \
        xmm0 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 0 * 16)); \
        len -= 128; \
        xmm1 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 1 * 16)); \
        xmm2 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 2 * 16)); \
        xmm3 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 3 * 16)); \
        xmm4 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 4 * 16)); \
        xmm5 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 5 * 16)); \
        xmm6 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 6 * 16)); \
        xmm7 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 7 * 16)); \
        xmm8 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 8 * 16)); \
        src = (const uint8_t *)src + 128; \
        _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \
        _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \
        _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset)); \
        _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset)); \
        _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset)); \
        _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset)); \
        _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset)); \
        _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset)); \
        dst = (uint8_t *)dst + 128; \
    } \
    tmp = len; \
    len = ((len - 16 + offset) & 127) + 16 - offset; \
    tmp -= len; \
    src = (const uint8_t *)src + tmp; \
    dst = (uint8_t *)dst + tmp; \
    if (len >= 32 + 16 - offset) { \
        while (len >= 32 + 16 - offset) { \
            xmm0 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 0 * 16)); \
            len -= 32; \
            xmm1 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 1 * 16)); \
            xmm2 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 2 * 16)); \
            src = (const uint8_t *)src + 32; \
            _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \
            _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \
            dst = (uint8_t *)dst + 32; \
        } \
        tmp = len; \
        len = ((len - 16 + offset) & 31) + 16 - offset; \
        tmp -= len; \
        src = (const uint8_t *)src + tmp; \
        dst = (uint8_t *)dst + tmp; \
    } \
})

/**
 * Macro for copying an unaligned block from one location to another;
 * at most 47 bytes are left over for the caller to copy, and the locations
 * should not overlap.
 * Use a switch here because the aligning instruction requires an immediate
 * value for the shift count.
 * Requirements:
 * - Store is aligned
 * - Load offset is <offset>, which must be within [1, 15]
 * - For <src>, make sure <offset> bytes backwards & <16 - offset> bytes forwards are available for loading
 * - <dst>, <src>, <len> must be variables
 * - __m128i <xmm0> ~ <xmm8> used in MOVEUNALIGNED_LEFT47_IMM must be pre-defined
 */
#define MOVEUNALIGNED_LEFT47(dst, src, len, offset)                   \
__extension__ ({                                                      \
    switch (offset) {                                                 \
    case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x01); break;  \
    case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x02); break;  \
    case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x03); break;  \
    case 0x04: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x04); break;  \
    case 0x05: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x05); break;  \
    case 0x06: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x06); break;  \
    case 0x07: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x07); break;  \
    case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x08); break;  \
    case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x09); break;  \
    case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0A); break;  \
    case 0x0B: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0B); break;  \
    case 0x0C: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0C); break;  \
    case 0x0D: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0D); break;  \
    case 0x0E: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0E); break;  \
    case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0F); break;  \
    default:;                                                         \
    }                                                                 \
})

static __rte_always_inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
	uintptr_t dstu = (uintptr_t)dst;
	uintptr_t srcu = (uintptr_t)src;
	void *ret = dst;
	size_t dstofss;
	size_t srcofs;

	/**
	 * Copy less than 16 bytes
	 */
	if (n < 16) {
		if (n & 0x01) {
			*(uint8_t *)dstu = *(const uint8_t *)srcu;
			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
			dstu = (uintptr_t)((uint8_t *)dstu + 1);
		}
		if (n & 0x02) {
			*(uint16_t *)dstu = *(const uint16_t *)srcu;
			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
			dstu = (uintptr_t)((uint16_t *)dstu + 1);
		}
		if (n & 0x04) {
			*(uint32_t *)dstu = *(const uint32_t *)srcu;
			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
			dstu = (uintptr_t)((uint32_t *)dstu + 1);
		}
		if (n & 0x08) {
			*(uint64_t *)dstu = *(const uint64_t *)srcu;
		}
		return ret;
	}

	/**
	 * Fast way when copy size doesn't exceed 512 bytes
	 */
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 48) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 128) {
		goto COPY_BLOCK_128_BACK15;
	}
	if (n <= 512) {
		if (n >= 256) {
			n -= 256;
			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
			rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128);
			src = (const uint8_t *)src + 256;
			dst = (uint8_t *)dst + 256;
		}
COPY_BLOCK_255_BACK15:
		if (n >= 128) {
			n -= 128;
			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 128;
			dst = (uint8_t *)dst + 128;
		}
COPY_BLOCK_128_BACK15:
		if (n >= 64) {
			n -= 64;
			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 64;
			dst = (uint8_t *)dst + 64;
		}
COPY_BLOCK_64_BACK15:
		if (n >= 32) {
			n -= 32;
			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 32;
			dst = (uint8_t *)dst + 32;
		}
		if (n > 16) {
			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
			rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
			return ret;
		}
		if (n > 0) {
			rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
		}
		return ret;
	}

	/**
	 * Make store aligned when copy size exceeds 512 bytes,
	 * and make sure the first 15 bytes are copied, because
	 * the unaligned copy functions require up to 15 bytes of
	 * backwards access.
	 */
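	/*
	 * When dst is not 16-byte aligned, dstofss becomes 32 - (dst & 0x0F),
	 * i.e. between 17 and 31: a single 32-byte move then both aligns dst
	 * and leaves more than the 15 bytes of backwards slack that
	 * MOVEUNALIGNED_LEFT47() may read before the advanced src.
	 */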
	dstofss = (uintptr_t)dst & 0x0F;
	if (dstofss > 0) {
		dstofss = 16 - dstofss + 16;
		n -= dstofss;
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		src = (const uint8_t *)src + dstofss;
		dst = (uint8_t *)dst + dstofss;
	}
	srcofs = ((uintptr_t)src & 0x0F);

	/**
	 * For aligned copy
	 */
	if (srcofs == 0) {
		/**
		 * Copy 256-byte blocks
		 */
		for (; n >= 256; n -= 256) {
			rte_mov256((uint8_t *)dst, (const uint8_t *)src);
			dst = (uint8_t *)dst + 256;
			src = (const uint8_t *)src + 256;
		}

		/**
		 * Copy whatever is left
		 */
		goto COPY_BLOCK_255_BACK15;
	}

	/**
	 * For copy with unaligned load
	 */
	MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);

	/**
	 * Copy whatever is left
	 */
	goto COPY_BLOCK_64_BACK15;
}

#endif /* __AVX512F__ */

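/*
 * Fast path taken when both pointers already satisfy ALIGNMENT_MASK: small
 * sizes reuse the scalar and overlapping-move tricks above, everything else
 * is copied in plain 64-byte blocks plus one overlapping tail move.
 */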
static __rte_always_inline void *
rte_memcpy_aligned(void *dst, const void *src, size_t n)
{
	void *ret = dst;

	/* Copy size < 16 bytes */
	if (n < 16) {
		if (n & 0x01) {
			*(uint8_t *)dst = *(const uint8_t *)src;
			src = (const uint8_t *)src + 1;
			dst = (uint8_t *)dst + 1;
		}
		if (n & 0x02) {
			*(uint16_t *)dst = *(const uint16_t *)src;
			src = (const uint16_t *)src + 1;
			dst = (uint16_t *)dst + 1;
		}
		if (n & 0x04) {
			*(uint32_t *)dst = *(const uint32_t *)src;
			src = (const uint32_t *)src + 1;
			dst = (uint32_t *)dst + 1;
		}
		if (n & 0x08)
			*(uint64_t *)dst = *(const uint64_t *)src;

		return ret;
	}

	/* Copy 16 <= size <= 32 bytes */
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst - 16 + n,
				(const uint8_t *)src - 16 + n);

		return ret;
	}

	/* Copy 32 < size <= 64 bytes */
	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov32((uint8_t *)dst - 32 + n,
				(const uint8_t *)src - 32 + n);

		return ret;
	}

	/* Copy 64-byte blocks */
	for (; n >= 64; n -= 64) {
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		dst = (uint8_t *)dst + 64;
		src = (const uint8_t *)src + 64;
	}

	/* Copy whatever is left */
	rte_mov64((uint8_t *)dst - 64 + n,
			(const uint8_t *)src - 64 + n);

	return ret;
}

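/*
 * Dispatcher: when dst and src are both aligned to the vector width selected
 * at build time (ALIGNMENT_MASK + 1 bytes), take the simpler aligned path;
 * otherwise fall back to rte_memcpy_generic() above.
 */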
static __rte_always_inline void *
rte_memcpy(void *dst, const void *src, size_t n)
{
	if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
		return rte_memcpy_aligned(dst, src, n);
	else
		return rte_memcpy_generic(dst, src, n);
}

#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
#pragma GCC diagnostic pop
#endif

#ifdef __cplusplus
}
#endif

#endif /* _RTE_MEMCPY_X86_64_H_ */