xref: /dpdk/lib/eal/ppc/include/rte_memcpy.h (revision 719834a6849e1daf4a70ff7742bbcc3ae7e25607)
199a2dd95SBruce Richardson /*
299a2dd95SBruce Richardson  * SPDX-License-Identifier: BSD-3-Clause
3c13e6177SDavid Christensen  * Copyright (C) IBM Corporation 2014,2021
499a2dd95SBruce Richardson  */
599a2dd95SBruce Richardson 
699a2dd95SBruce Richardson #ifndef _RTE_MEMCPY_PPC_64_H_
799a2dd95SBruce Richardson #define _RTE_MEMCPY_PPC_64_H_
899a2dd95SBruce Richardson 
999a2dd95SBruce Richardson #include <stdint.h>
1099a2dd95SBruce Richardson #include <string.h>
1199a2dd95SBruce Richardson 
1299a2dd95SBruce Richardson #include "rte_altivec.h"
1399a2dd95SBruce Richardson #include "rte_common.h"
1499a2dd95SBruce Richardson 
15*719834a6SMattias Rönnblom #include "generic/rte_memcpy.h"
16*719834a6SMattias Rönnblom 
1799a2dd95SBruce Richardson #ifdef __cplusplus
1899a2dd95SBruce Richardson extern "C" {
1999a2dd95SBruce Richardson #endif
2099a2dd95SBruce Richardson 
21c13e6177SDavid Christensen #if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 90000)
2299a2dd95SBruce Richardson #pragma GCC diagnostic push
2399a2dd95SBruce Richardson #pragma GCC diagnostic ignored "-Warray-bounds"
2499a2dd95SBruce Richardson #endif
2599a2dd95SBruce Richardson 
26c13e6177SDavid Christensen #if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
27c13e6177SDavid Christensen #pragma GCC diagnostic push
28c13e6177SDavid Christensen #pragma GCC diagnostic ignored "-Wstringop-overflow"
29c13e6177SDavid Christensen #endif
30c13e6177SDavid Christensen 
3199a2dd95SBruce Richardson static inline void
3299a2dd95SBruce Richardson rte_mov16(uint8_t *dst, const uint8_t *src)
3399a2dd95SBruce Richardson {
3499a2dd95SBruce Richardson 	vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
3599a2dd95SBruce Richardson }
3699a2dd95SBruce Richardson 
/**
 * Copy exactly 32 bytes from src to dst as two 16-byte VSX
 * copies. Regions must not overlap.
 */
static inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
	rte_mov16(dst, src);
	rte_mov16(dst + 16, src + 16);
}
4399a2dd95SBruce Richardson 
/**
 * Copy exactly 48 bytes from src to dst (32-byte + 16-byte VSX
 * copies). Regions must not overlap.
 */
static inline void
rte_mov48(uint8_t *dst, const uint8_t *src)
{
	rte_mov32(dst, src);
	rte_mov16(dst + 32, src + 32);
}
5199a2dd95SBruce Richardson 
/**
 * Copy exactly 64 bytes from src to dst as two 32-byte VSX
 * copies. Regions must not overlap.
 */
static inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
	rte_mov32(dst, src);
	rte_mov32(dst + 32, src + 32);
}
6099a2dd95SBruce Richardson 
/**
 * Copy exactly 128 bytes from src to dst as two 64-byte VSX
 * copies. Regions must not overlap.
 */
static inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
	rte_mov64(dst, src);
	rte_mov64(dst + 64, src + 64);
}
7399a2dd95SBruce Richardson 
/**
 * Copy exactly 256 bytes from src to dst in two 128-byte chunks.
 * Regions must not overlap.
 */
static inline void
rte_mov256(uint8_t *dst, const uint8_t *src)
{
	unsigned int i;

	for (i = 0; i < 2; i++)
		rte_mov128(dst + i * 128, src + i * 128);
}
8099a2dd95SBruce Richardson 
/*
 * Copy n bytes from src to dst; the expression evaluates to memcpy's
 * return value (dst). When n is a compile-time constant, fall through
 * to the compiler's memcpy, which it can expand inline optimally;
 * otherwise dispatch to the hand-tuned VSX routine below. Note that
 * __builtin_constant_p() does not evaluate its argument, so dst, src
 * and n are each evaluated exactly once at runtime.
 */
#define rte_memcpy(dst, src, n)              \
	__extension__ ({                     \
	(__builtin_constant_p(n)) ?          \
	memcpy((dst), (src), (n)) :          \
	rte_memcpy_func((dst), (src), (n)); })
8699a2dd95SBruce Richardson 
/*
 * Runtime-sized memcpy built on 16-byte VSX vector loads/stores.
 *
 * Strategy:
 *  - n < 16:    scalar stores selected by the bits of n.
 *  - n <= 128:  two possibly-overlapping vector copies (head + tail),
 *               so no branching on the exact remainder.
 *  - n > 128:   256-byte main loop, then 64-byte and 16-byte switch
 *               ladders, then one overlapping 16-byte tail copy.
 *
 * Returns the original dst (standard memcpy contract); like memcpy,
 * the regions must not overlap.
 */
static inline void *
rte_memcpy_func(void *dst, const void *src, size_t n)
{
	void *ret = dst;

	/*
	 * We can't copy < 16 bytes using a full vector register, so do it
	 * manually with scalar stores: each set bit of n (1, 2, 4, 8)
	 * selects one store, which together cover any length 0..15.
	 */
	if (n < 16) {
		if (n & 0x01) {
			*(uint8_t *)dst = *(const uint8_t *)src;
			dst = (uint8_t *)dst + 1;
			src = (const uint8_t *)src + 1;
		}
		if (n & 0x02) {
			*(uint16_t *)dst = *(const uint16_t *)src;
			dst = (uint16_t *)dst + 1;
			src = (const uint16_t *)src + 1;
		}
		if (n & 0x04) {
			*(uint32_t *)dst = *(const uint32_t *)src;
			dst = (uint32_t *)dst + 1;
			src = (const uint32_t *)src + 1;
		}
		if (n & 0x08)
			*(uint64_t *)dst = *(const uint64_t *)src;
		return ret;
	}

	/*
	 * Special fast cases for <= 128 bytes: copy a block from the start
	 * and an equally sized block ending exactly at byte n. The two may
	 * overlap in the middle; re-writing those bytes is harmless and
	 * avoids branching on the exact remainder.
	 */
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst - 16 + n,
			(const uint8_t *)src - 16 + n);
		return ret;
	}

	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov32((uint8_t *)dst - 32 + n,
			(const uint8_t *)src - 32 + n);
		return ret;
	}

	if (n <= 128) {
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		rte_mov64((uint8_t *)dst - 64 + n,
			(const uint8_t *)src - 64 + n);
		return ret;
	}

	/*
	 * For large copies > 128 bytes. This combination of 256, 64 and 16 byte
	 * copies was found to be faster than doing 128 and 32 byte copies as
	 * well.
	 */
	for ( ; n >= 256; n -= 256) {
		rte_mov256((uint8_t *)dst, (const uint8_t *)src);
		dst = (uint8_t *)dst + 256;
		src = (const uint8_t *)src + 256;
	}

	/*
	 * We split the remaining bytes (which will be less than 256) into
	 * 64byte (2^6) chunks.
	 * Using incrementing integers in the case labels of a switch statement
	 * encourages the compiler to use a jump table. To get incrementing
	 * integers, we shift the 2 relevant bits to the LSB position to first
	 * get decrementing integers, and then subtract.
	 * (n >> 6 is in 0..3 here, so the expression is in 0..3: case 0 runs
	 * three 64-byte copies, case 1 two, case 2 one, case 3 none.)
	 */
	switch (3 - (n >> 6)) {
	case 0x00:
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		n -= 64;
		dst = (uint8_t *)dst + 64;
		src = (const uint8_t *)src + 64;      /* fallthrough */
	case 0x01:
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		n -= 64;
		dst = (uint8_t *)dst + 64;
		src = (const uint8_t *)src + 64;      /* fallthrough */
	case 0x02:
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		n -= 64;
		dst = (uint8_t *)dst + 64;
		src = (const uint8_t *)src + 64;      /* fallthrough */
	default:
		;
	}

	/*
	 * We split the remaining bytes (which will be less than 64) into
	 * 16byte (2^4) chunks, using the same switch structure as above.
	 */
	switch (3 - (n >> 4)) {
	case 0x00:
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		n -= 16;
		dst = (uint8_t *)dst + 16;
		src = (const uint8_t *)src + 16;      /* fallthrough */
	case 0x01:
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		n -= 16;
		dst = (uint8_t *)dst + 16;
		src = (const uint8_t *)src + 16;      /* fallthrough */
	case 0x02:
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		n -= 16;
		dst = (uint8_t *)dst + 16;
		src = (const uint8_t *)src + 16;      /* fallthrough */
	default:
		;
	}

	/*
	 * Copy any remaining bytes, without going beyond end of buffers:
	 * the original n was >= 16, so dst - 16 + n never precedes the
	 * caller's buffer start; the overlap with already-copied bytes is
	 * harmless.
	 */
	if (n != 0)
		rte_mov16((uint8_t *)dst - 16 + n,
			(const uint8_t *)src - 16 + n);
	return ret;
}
20599a2dd95SBruce Richardson 
206c13e6177SDavid Christensen #if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
207c13e6177SDavid Christensen #pragma GCC diagnostic pop
208c13e6177SDavid Christensen #endif
209c13e6177SDavid Christensen 
210c13e6177SDavid Christensen #if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 90000)
21199a2dd95SBruce Richardson #pragma GCC diagnostic pop
21299a2dd95SBruce Richardson #endif
21399a2dd95SBruce Richardson 
21499a2dd95SBruce Richardson #ifdef __cplusplus
21599a2dd95SBruce Richardson }
21699a2dd95SBruce Richardson #endif
21799a2dd95SBruce Richardson 
21899a2dd95SBruce Richardson #endif /* _RTE_MEMCPY_PPC_64_H_ */
219