/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation
 */

#ifndef _RTE_MEMCPY_X86_64_H_
#define _RTE_MEMCPY_X86_64_H_

/**
 * @file
 *
 * Functions for the SSE/AVX/AVX2/AVX512 implementations of memcpy().
 */

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <rte_vect.h>
#include <rte_common.h>
#include <rte_config.h>

#ifdef __cplusplus
extern "C" {
#endif

#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstringop-overflow"
#endif

/*
 * GCC versions older than 11 do not compile the AVX code path properly,
 * so use SSE instead. There are no such problems with AVX2.
 */
#if defined __AVX2__
#define RTE_MEMCPY_AVX
#elif defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 110000))
#define RTE_MEMCPY_AVX
#endif

/**
 * Copy bytes from one location to another. The locations must not overlap.
 *
 * @note This is implemented as a macro, so its address should not be taken,
 * and care is needed because parameter expressions may be evaluated multiple times.
 *
 * @param dst
 *   Pointer to the destination of the data.
 * @param src
 *   Pointer to the source data.
 * @param n
 *   Number of bytes to copy.
 * @return
 *   Pointer to the destination data.
 */
static __rte_always_inline void *
rte_memcpy(void *dst, const void *src, size_t n);
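/*
 * Illustrative usage sketch; src_buf and dst_buf are hypothetical local
 * buffers used only to show the call:
 *
 *	uint8_t src_buf[256], dst_buf[256];
 *
 *	memset(src_buf, 0xab, sizeof(src_buf));
 *	rte_memcpy(dst_buf, src_buf, sizeof(src_buf));	// returns dst_buf
 *
 * As with memcpy(), the source and destination ranges must not overlap.
 */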

/**
 * Copy bytes from one location to another;
 * the locations must not overlap.
 * Use with n <= 15.
 */
static __rte_always_inline void *
rte_mov15_or_less(void *dst, const void *src, size_t n)
{
	/**
	 * Use the following structs to avoid violating C standard
	 * alignment requirements and to avoid strict aliasing bugs.
	 */
	struct __rte_packed_begin rte_uint64_alias {
		uint64_t val;
	} __rte_packed_end __rte_may_alias;
	struct __rte_packed_begin rte_uint32_alias {
		uint32_t val;
	} __rte_packed_end __rte_may_alias;
	struct __rte_packed_begin rte_uint16_alias {
		uint16_t val;
	} __rte_packed_end __rte_may_alias;

	void *ret = dst;
	if (n & 8) {
		((struct rte_uint64_alias *)dst)->val =
			((const struct rte_uint64_alias *)src)->val;
		src = (const uint64_t *)src + 1;
		dst = (uint64_t *)dst + 1;
	}
	if (n & 4) {
		((struct rte_uint32_alias *)dst)->val =
			((const struct rte_uint32_alias *)src)->val;
		src = (const uint32_t *)src + 1;
		dst = (uint32_t *)dst + 1;
	}
	if (n & 2) {
		((struct rte_uint16_alias *)dst)->val =
			((const struct rte_uint16_alias *)src)->val;
		src = (const uint16_t *)src + 1;
		dst = (uint16_t *)dst + 1;
	}
	if (n & 1)
		*(uint8_t *)dst = *(const uint8_t *)src;
	return ret;
}
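/*
 * Note: because n <= 15, its low four bits fully describe the length, so the
 * n & 8 / n & 4 / n & 2 / n & 1 tests above copy exactly n bytes with at most
 * four loads and stores. For example, n = 13 (binary 1101) copies
 * 8 + 4 + 1 bytes.
 */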

/**
 * Copy 16 bytes from one location to another;
 * the locations must not overlap.
 */
static __rte_always_inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
	__m128i xmm0;

	xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src);
	_mm_storeu_si128((__m128i *)(void *)dst, xmm0);
}

/**
 * Copy 32 bytes from one location to another;
 * the locations must not overlap.
 */
static __rte_always_inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
#if defined RTE_MEMCPY_AVX
	__m256i ymm0;

	ymm0 = _mm256_loadu_si256((const __m256i *)(const void *)src);
	_mm256_storeu_si256((__m256i *)(void *)dst, ymm0);
#else /* SSE implementation */
	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
#endif
}

/**
 * Copy 64 bytes from one location to another;
 * the locations must not overlap.
 */
static __rte_always_inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
	__m512i zmm0;

	zmm0 = _mm512_loadu_si512((const void *)src);
	_mm512_storeu_si512((void *)dst, zmm0);
#else /* AVX2, AVX & SSE implementation */
	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
#endif
}

/**
 * Copy 128 bytes from one location to another;
 * the locations must not overlap.
 */
static __rte_always_inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
	rte_mov64(dst + 0 * 64, src + 0 * 64);
	rte_mov64(dst + 1 * 64, src + 1 * 64);
}

/**
 * Copy 256 bytes from one location to another;
 * the locations must not overlap.
 */
static __rte_always_inline void
rte_mov256(uint8_t *dst, const uint8_t *src)
{
	rte_mov128(dst + 0 * 128, src + 0 * 128);
	rte_mov128(dst + 1 * 128, src + 1 * 128);
}

#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512

/**
 * AVX512 implementation below
 */

#define ALIGNMENT_MASK 0x3F
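/*
 * Note: rte_memcpy() below tests ((uintptr_t)dst | (uintptr_t)src) against
 * ALIGNMENT_MASK, so 0x3F selects the aligned path only when both pointers
 * are 64-byte (zmm-register width) aligned.
 */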

/**
 * Copy 128-byte blocks from one location to another;
 * the locations must not overlap.
 */
static __rte_always_inline void
rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
	__m512i zmm0, zmm1;

	while (n >= 128) {
		zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
		n -= 128;
		zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
		src = src + 128;
		_mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
		_mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
		dst = dst + 128;
	}
}

/**
 * Copy 512-byte blocks from one location to another;
 * the locations must not overlap.
 */
static inline void
rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
	__m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;

	while (n >= 512) {
		zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
		n -= 512;
		zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
		zmm2 = _mm512_loadu_si512((const void *)(src + 2 * 64));
		zmm3 = _mm512_loadu_si512((const void *)(src + 3 * 64));
		zmm4 = _mm512_loadu_si512((const void *)(src + 4 * 64));
		zmm5 = _mm512_loadu_si512((const void *)(src + 5 * 64));
		zmm6 = _mm512_loadu_si512((const void *)(src + 6 * 64));
		zmm7 = _mm512_loadu_si512((const void *)(src + 7 * 64));
		src = src + 512;
		_mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
		_mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
		_mm512_storeu_si512((void *)(dst + 2 * 64), zmm2);
		_mm512_storeu_si512((void *)(dst + 3 * 64), zmm3);
		_mm512_storeu_si512((void *)(dst + 4 * 64), zmm4);
		_mm512_storeu_si512((void *)(dst + 5 * 64), zmm5);
		_mm512_storeu_si512((void *)(dst + 6 * 64), zmm6);
		_mm512_storeu_si512((void *)(dst + 7 * 64), zmm7);
		dst = dst + 512;
	}
}

static __rte_always_inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
	void *ret = dst;
	size_t dstofss;
	size_t bits;

	/**
	 * Copy less than 16 bytes
	 */
	if (n < 16) {
		return rte_mov15_or_less(dst, src, n);
	}

	/**
	 * Fast way when copy size doesn't exceed 512 bytes
	 */
	if (__rte_constant(n) && n == 32) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		return ret;
	}
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		if (__rte_constant(n) && n == 16)
			return ret; /* avoid (harmless) duplicate copy */
		rte_mov16((uint8_t *)dst - 16 + n,
				  (const uint8_t *)src - 16 + n);
		return ret;
	}
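	/*
	 * Note on the branch above: for 16 < n <= 32 the two 16-byte copies
	 * cover the head ([0, 16)) and the tail ([n - 16, n)), which overlap
	 * in the middle; e.g. n = 20 writes bytes 0-15 and then bytes 4-19,
	 * so all 20 bytes are copied without a scalar loop.
	 */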
	if (__rte_constant(n) && n == 64) {
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		return ret;
	}
	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov32((uint8_t *)dst - 32 + n,
				  (const uint8_t *)src - 32 + n);
		return ret;
	}
	if (n <= 512) {
		if (n >= 256) {
			n -= 256;
			rte_mov256((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 256;
			dst = (uint8_t *)dst + 256;
		}
		if (n >= 128) {
			n -= 128;
			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 128;
			dst = (uint8_t *)dst + 128;
		}
COPY_BLOCK_128_BACK63:
		if (n > 64) {
			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
			rte_mov64((uint8_t *)dst - 64 + n,
					  (const uint8_t *)src - 64 + n);
			return ret;
		}
		if (n > 0)
			rte_mov64((uint8_t *)dst - 64 + n,
					  (const uint8_t *)src - 64 + n);
		return ret;
	}

	/**
	 * Make the store aligned when the copy size exceeds 512 bytes
	 */
	dstofss = ((uintptr_t)dst & 0x3F);
	if (dstofss > 0) {
		dstofss = 64 - dstofss;
		n -= dstofss;
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		src = (const uint8_t *)src + dstofss;
		dst = (uint8_t *)dst + dstofss;
	}
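	/*
	 * Note on the alignment step above: if dst is, say, 0x...27, then
	 * dstofss = 64 - (0x27 & 0x3F) = 25, so 64 bytes are copied but the
	 * pointers advance by only 25 bytes, which leaves dst 64-byte
	 * aligned; the overlapping bytes are simply rewritten by the aligned
	 * copies that follow.
	 */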

	/**
	 * Copy 512-byte blocks.
	 * Use the copy block function for better instruction order control,
	 * which is important when the load is unaligned.
	 */
	rte_mov512blocks((uint8_t *)dst, (const uint8_t *)src, n);
	bits = n;
	n = n & 511;
	bits -= n;
	src = (const uint8_t *)src + bits;
	dst = (uint8_t *)dst + bits;
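	/*
	 * Note: after rte_mov512blocks() the remainder still to be copied is
	 * n & 511, and "bits" (despite its name, a byte count) holds how much
	 * the block loop consumed, so the pointers are advanced past the
	 * region already copied.
	 */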

	/**
	 * Copy 128-byte blocks.
	 * Use the copy block function for better instruction order control,
	 * which is important when the load is unaligned.
	 */
	if (n >= 128) {
		rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
		bits = n;
		n = n & 127;
		bits -= n;
		src = (const uint8_t *)src + bits;
		dst = (uint8_t *)dst + bits;
	}

	/**
	 * Copy whatever is left
	 */
	goto COPY_BLOCK_128_BACK63;
}

#elif defined RTE_MEMCPY_AVX

/**
 * AVX implementation below
 */

#define ALIGNMENT_MASK 0x1F

/**
 * Copy 128-byte blocks from one location to another;
 * the locations must not overlap.
 */
static __rte_always_inline void
rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
	__m256i ymm0, ymm1, ymm2, ymm3;

	while (n >= 128) {
		ymm0 = _mm256_loadu_si256((const __m256i *)(const void *)
					  ((const uint8_t *)src + 0 * 32));
		n -= 128;
		ymm1 = _mm256_loadu_si256((const __m256i *)(const void *)
					  ((const uint8_t *)src + 1 * 32));
		ymm2 = _mm256_loadu_si256((const __m256i *)(const void *)
					  ((const uint8_t *)src + 2 * 32));
		ymm3 = _mm256_loadu_si256((const __m256i *)(const void *)
					  ((const uint8_t *)src + 3 * 32));
		src = (const uint8_t *)src + 128;
		_mm256_storeu_si256((__m256i *)(void *)
				    ((uint8_t *)dst + 0 * 32), ymm0);
		_mm256_storeu_si256((__m256i *)(void *)
				    ((uint8_t *)dst + 1 * 32), ymm1);
		_mm256_storeu_si256((__m256i *)(void *)
				    ((uint8_t *)dst + 2 * 32), ymm2);
		_mm256_storeu_si256((__m256i *)(void *)
				    ((uint8_t *)dst + 3 * 32), ymm3);
		dst = (uint8_t *)dst + 128;
	}
}

static __rte_always_inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
	void *ret = dst;
	size_t dstofss;
	size_t bits;

	/**
	 * Copy less than 16 bytes
	 */
	if (n < 16) {
		return rte_mov15_or_less(dst, src, n);
	}

	/**
	 * Fast way when copy size doesn't exceed 256 bytes
	 */
	if (__rte_constant(n) && n == 32) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		return ret;
	}
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		if (__rte_constant(n) && n == 16)
			return ret; /* avoid (harmless) duplicate copy */
		rte_mov16((uint8_t *)dst - 16 + n,
				(const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov32((uint8_t *)dst - 32 + n,
				(const uint8_t *)src - 32 + n);
		return ret;
	}
	if (n <= 256) {
		if (n >= 128) {
			n -= 128;
			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 128;
			dst = (uint8_t *)dst + 128;
		}
COPY_BLOCK_128_BACK31:
		if (n >= 64) {
			n -= 64;
			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 64;
			dst = (uint8_t *)dst + 64;
		}
		if (n > 32) {
			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
			rte_mov32((uint8_t *)dst - 32 + n,
					(const uint8_t *)src - 32 + n);
			return ret;
		}
		if (n > 0) {
			rte_mov32((uint8_t *)dst - 32 + n,
					(const uint8_t *)src - 32 + n);
		}
		return ret;
	}

	/**
	 * Make the store aligned when the copy size exceeds 256 bytes
	 */
	dstofss = (uintptr_t)dst & 0x1F;
	if (dstofss > 0) {
		dstofss = 32 - dstofss;
		n -= dstofss;
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		src = (const uint8_t *)src + dstofss;
		dst = (uint8_t *)dst + dstofss;
	}

	/**
	 * Copy 128-byte blocks
	 */
	rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
	bits = n;
	n = n & 127;
	bits -= n;
	src = (const uint8_t *)src + bits;
	dst = (uint8_t *)dst + bits;

	/**
	 * Copy whatever is left
	 */
	goto COPY_BLOCK_128_BACK31;
}

#else /* __AVX512F__ */

/**
 * SSE implementation below
 */

#define ALIGNMENT_MASK 0x0F

/**
 * Macro for copying an unaligned block from one location to another
 * with a constant load offset,
 * leaving at most 47 bytes to be copied afterwards;
 * the locations must not overlap.
 * Requirements:
 * - The store address is aligned
 * - The load offset is <offset>, which must be an immediate value within [1, 15]
 * - For <src>, make sure <offset> bytes backwards & <16 - offset> bytes forwards are available for loading
 * - <dst>, <src>, <len> must be variables
 * - __m128i <xmm0> ~ <xmm8> must be pre-defined
 */
#define MOVEUNALIGNED_LEFT47_IMM(dst, src, len, offset) \
{ \
    size_t tmp; \
    while (len >= 128 + 16 - offset) { \
        xmm0 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 0 * 16)); \
        len -= 128; \
        xmm1 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 1 * 16)); \
        xmm2 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 2 * 16)); \
        xmm3 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 3 * 16)); \
        xmm4 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 4 * 16)); \
        xmm5 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 5 * 16)); \
        xmm6 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 6 * 16)); \
        xmm7 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 7 * 16)); \
        xmm8 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 8 * 16)); \
        src = (const uint8_t *)src + 128; \
        _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \
        _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \
        _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset)); \
        _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset)); \
        _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset)); \
        _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset)); \
        _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset)); \
        _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset)); \
        dst = (uint8_t *)dst + 128; \
    } \
    tmp = len; \
    len = ((len - 16 + offset) & 127) + 16 - offset; \
    tmp -= len; \
    src = (const uint8_t *)src + tmp; \
    dst = (uint8_t *)dst + tmp; \
    if (len >= 32 + 16 - offset) { \
        while (len >= 32 + 16 - offset) { \
            xmm0 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 0 * 16)); \
            len -= 32; \
            xmm1 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 1 * 16)); \
            xmm2 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 2 * 16)); \
            src = (const uint8_t *)src + 32; \
            _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \
            _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \
            dst = (uint8_t *)dst + 32; \
        } \
        tmp = len; \
        len = ((len - 16 + offset) & 31) + 16 - offset; \
        tmp -= len; \
        src = (const uint8_t *)src + tmp; \
        dst = (uint8_t *)dst + tmp; \
    } \
}
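/*
 * Note on the macro above: the loads are taken from (src - offset) so that,
 * once the caller has aligned dst and set offset = src & 0x0F, every load
 * starts at a 16-byte boundary; _mm_alignr_epi8(hi, lo, offset) then shifts
 * each adjacent register pair right by <offset> bytes to reassemble the
 * original source data for the aligned stores. The expression
 * "((len - 16 + offset) & 127) + 16 - offset" computes the leftover length
 * consistently with the loop condition "len >= 128 + 16 - offset".
 */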

/**
 * Macro for copying an unaligned block from one location to another,
 * leaving at most 47 bytes to be copied afterwards;
 * the locations must not overlap.
 * A switch is used here because the aligning instruction requires an
 * immediate value for the shift count.
 * Requirements:
 * - The store address is aligned
 * - The load offset is <offset>, which must be within [1, 15]
 * - For <src>, make sure <offset> bytes backwards & <16 - offset> bytes forwards are available for loading
 * - <dst>, <src>, <len> must be variables
 * - __m128i <xmm0> ~ <xmm8> used in MOVEUNALIGNED_LEFT47_IMM must be pre-defined
 */
#define MOVEUNALIGNED_LEFT47(dst, src, len, offset) \
{ \
    switch (offset) { \
    case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x01); break; \
    case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x02); break; \
    case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x03); break; \
    case 0x04: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x04); break; \
    case 0x05: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x05); break; \
    case 0x06: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x06); break; \
    case 0x07: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x07); break; \
    case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x08); break; \
    case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x09); break; \
    case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0A); break; \
    case 0x0B: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0B); break; \
    case 0x0C: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0C); break; \
    case 0x0D: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0D); break; \
    case 0x0E: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0E); break; \
    case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0F); break; \
    default:; \
    } \
}

static __rte_always_inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
	void *ret = dst;
	size_t dstofss;
	size_t srcofs;

	/**
	 * Copy less than 16 bytes
	 */
	if (n < 16) {
		return rte_mov15_or_less(dst, src, n);
	}

	/**
	 * Fast way when copy size doesn't exceed 512 bytes
	 */
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		if (__rte_constant(n) && n == 16)
			return ret; /* avoid (harmless) duplicate copy */
		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		if (n > 48)
			rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 128) {
		goto COPY_BLOCK_128_BACK15;
	}
	if (n <= 512) {
		if (n >= 256) {
			n -= 256;
			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
			rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128);
			src = (const uint8_t *)src + 256;
			dst = (uint8_t *)dst + 256;
		}
COPY_BLOCK_255_BACK15:
		if (n >= 128) {
			n -= 128;
			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 128;
			dst = (uint8_t *)dst + 128;
		}
COPY_BLOCK_128_BACK15:
		if (n >= 64) {
			n -= 64;
			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 64;
			dst = (uint8_t *)dst + 64;
		}
COPY_BLOCK_64_BACK15:
		if (n >= 32) {
			n -= 32;
			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 32;
			dst = (uint8_t *)dst + 32;
		}
		if (n > 16) {
			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
			rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
			return ret;
		}
		if (n > 0) {
			rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
		}
		return ret;
	}

	/**
	 * Make the store aligned when the copy size exceeds 512 bytes,
	 * and make sure the first 15 bytes are copied, because
	 * the unaligned copy functions require up to 15 bytes
	 * of backwards access.
	 */
	dstofss = (uintptr_t)dst & 0x0F;
	if (dstofss > 0) {
		dstofss = 16 - dstofss + 16;
		n -= dstofss;
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		src = (const uint8_t *)src + dstofss;
		dst = (uint8_t *)dst + dstofss;
	}
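	/*
	 * Note: "16 - dstofss + 16" both aligns dst to 16 bytes and skips one
	 * extra 16-byte chunk, so the pointers advance by at least 17 bytes;
	 * this guarantees that the up-to-15-byte backward loads issued by
	 * MOVEUNALIGNED_LEFT47() from (src - srcofs) stay inside the source
	 * buffer. E.g. dst = 0x...09 gives dstofss = 7 + 16 = 23.
	 */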
	srcofs = ((uintptr_t)src & 0x0F);

	/**
	 * For aligned copy
	 */
	if (srcofs == 0) {
		/**
		 * Copy 256-byte blocks
		 */
		for (; n >= 256; n -= 256) {
			rte_mov256((uint8_t *)dst, (const uint8_t *)src);
			dst = (uint8_t *)dst + 256;
			src = (const uint8_t *)src + 256;
		}

		/**
		 * Copy whatever is left
		 */
		goto COPY_BLOCK_255_BACK15;
	}

	/**
	 * For copy with unaligned load
	 */
	MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);

	/**
	 * Copy whatever is left
	 */
	goto COPY_BLOCK_64_BACK15;
}

#endif /* __AVX512F__ */

static __rte_always_inline void *
rte_memcpy_aligned(void *dst, const void *src, size_t n)
{
	void *ret = dst;

	/* Copy size < 16 bytes */
	if (n < 16) {
		return rte_mov15_or_less(dst, src, n);
	}

	/* Copy 16 <= size <= 32 bytes */
	if (__rte_constant(n) && n == 32) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		return ret;
	}
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		if (__rte_constant(n) && n == 16)
			return ret; /* avoid (harmless) duplicate copy */
		rte_mov16((uint8_t *)dst - 16 + n,
				(const uint8_t *)src - 16 + n);

		return ret;
	}

	/* Copy 32 < size <= 64 bytes */
	if (__rte_constant(n) && n == 64) {
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		return ret;
	}
	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov32((uint8_t *)dst - 32 + n,
				(const uint8_t *)src - 32 + n);

		return ret;
	}

	/* Copy 64-byte blocks */
	for (; n > 64; n -= 64) {
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		dst = (uint8_t *)dst + 64;
		src = (const uint8_t *)src + 64;
	}

	/* Copy whatever is left */
	rte_mov64((uint8_t *)dst - 64 + n,
			(const uint8_t *)src - 64 + n);

	return ret;
}

static __rte_always_inline void *
rte_memcpy(void *dst, const void *src, size_t n)
{
	if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
		return rte_memcpy_aligned(dst, src, n);
	else
		return rte_memcpy_generic(dst, src, n);
}
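/*
 * Note: ALIGNMENT_MASK is 0x3F, 0x1F or 0x0F depending on which
 * implementation was selected above, so the aligned path is taken only when
 * both dst and src are aligned to the widest vector register in use (for
 * example, two 64-byte aligned buffers under AVX512); any other combination
 * falls through to rte_memcpy_generic().
 */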

#undef ALIGNMENT_MASK

#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
#pragma GCC diagnostic pop
#endif

#ifdef __cplusplus
}
#endif

#endif /* _RTE_MEMCPY_X86_64_H_ */