/*
 * SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) IBM Corporation 2014,2021
 */

#ifndef _RTE_MEMCPY_PPC_64_H_
#define _RTE_MEMCPY_PPC_64_H_

#include <stdint.h>
#include <string.h>

#include "rte_altivec.h"
#include "rte_common.h"

#include "generic/rte_memcpy.h"

#ifdef __cplusplus
extern "C" {
#endif

#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 90000)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Warray-bounds"
#endif

#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstringop-overflow"
#endif

static inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
	vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
}

static inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
	vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
	vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
}

static inline void
rte_mov48(uint8_t *dst, const uint8_t *src)
{
	vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
	vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
	vec_vsx_st(vec_vsx_ld(32, src), 32, dst);
}

static inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
	vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
	vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
	vec_vsx_st(vec_vsx_ld(32, src), 32, dst);
	vec_vsx_st(vec_vsx_ld(48, src), 48, dst);
}

static inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
	vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
	vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
	vec_vsx_st(vec_vsx_ld(32, src), 32, dst);
	vec_vsx_st(vec_vsx_ld(48, src), 48, dst);
	vec_vsx_st(vec_vsx_ld(64, src), 64, dst);
	vec_vsx_st(vec_vsx_ld(80, src), 80, dst);
	vec_vsx_st(vec_vsx_ld(96, src), 96, dst);
	vec_vsx_st(vec_vsx_ld(112, src), 112, dst);
}
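/*
 * Illustration (hypothetical buffers): each rte_movNN helper copies a
 * fixed-size block with unaligned 16-byte VSX loads and stores, and can
 * be called directly when the length is known, e.g. a 64-byte descriptor:
 *
 *	uint8_t desc[64];
 *	rte_mov64(desc, (const uint8_t *)src_desc);
 *
 * rte_memcpy_func() further below composes these helpers to handle
 * arbitrary lengths.
 */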
static inline void
rte_mov256(uint8_t *dst, const uint8_t *src)
{
	rte_mov128(dst, src);
	rte_mov128(dst + 128, src + 128);
}

#define rte_memcpy(dst, src, n) \
	__extension__ ({ \
	(__builtin_constant_p(n)) ? \
	memcpy((dst), (src), (n)) : \
	rte_memcpy_func((dst), (src), (n)); })
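/*
 * Usage sketch (illustrative; dst, src and pkt_len are hypothetical):
 * when n is a compile-time constant, the macro resolves to the regular
 * memcpy(), which the compiler can usually expand inline; otherwise the
 * copy is dispatched to rte_memcpy_func() below.
 *
 *	uint8_t hdr[64];
 *	rte_memcpy(hdr, src, sizeof(hdr));	(constant n: memcpy)
 *	rte_memcpy(dst, src, pkt_len);		(runtime n: rte_memcpy_func)
 */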
static inline void *
rte_memcpy_func(void *dst, const void *src, size_t n)
{
	void *ret = dst;

	/* We can't copy < 16 bytes using vector registers, so do it manually. */
	if (n < 16) {
		if (n & 0x01) {
			*(uint8_t *)dst = *(const uint8_t *)src;
			dst = (uint8_t *)dst + 1;
			src = (const uint8_t *)src + 1;
		}
		if (n & 0x02) {
			*(uint16_t *)dst = *(const uint16_t *)src;
			dst = (uint16_t *)dst + 1;
			src = (const uint16_t *)src + 1;
		}
		if (n & 0x04) {
			*(uint32_t *)dst = *(const uint32_t *)src;
			dst = (uint32_t *)dst + 1;
			src = (const uint32_t *)src + 1;
		}
		if (n & 0x08)
			*(uint64_t *)dst = *(const uint64_t *)src;
		return ret;
	}

	/* Special fast cases for <= 128 bytes */
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst - 16 + n,
			(const uint8_t *)src - 16 + n);
		return ret;
	}

	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov32((uint8_t *)dst - 32 + n,
			(const uint8_t *)src - 32 + n);
		return ret;
	}

	if (n <= 128) {
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		rte_mov64((uint8_t *)dst - 64 + n,
			(const uint8_t *)src - 64 + n);
		return ret;
	}

	/*
	 * For large copies > 128 bytes. This combination of 256, 64 and
	 * 16 byte copies was found to be faster than doing 128 and 32 byte
	 * copies as well.
	 */
	for ( ; n >= 256; n -= 256) {
		rte_mov256((uint8_t *)dst, (const uint8_t *)src);
		dst = (uint8_t *)dst + 256;
		src = (const uint8_t *)src + 256;
	}

	/*
	 * We split the remaining bytes (which will be less than 256) into
	 * 64-byte (2^6) chunks.
	 * Using incrementing integers in the case labels of a switch statement
	 * encourages the compiler to use a jump table. To get incrementing
	 * integers, we shift the 2 relevant bits to the LSB position to first
	 * get decrementing integers, and then subtract.
	 */
	switch (3 - (n >> 6)) {
	case 0x00:
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		n -= 64;
		dst = (uint8_t *)dst + 64;
		src = (const uint8_t *)src + 64;	/* fallthrough */
	case 0x01:
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		n -= 64;
		dst = (uint8_t *)dst + 64;
		src = (const uint8_t *)src + 64;	/* fallthrough */
	case 0x02:
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		n -= 64;
		dst = (uint8_t *)dst + 64;
		src = (const uint8_t *)src + 64;	/* fallthrough */
	default:
		;
	}
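	/*
	 * Worked example of the index arithmetic above (illustrative values):
	 * after the 256-byte loop n is below 256, so n >> 6 is 0..3 and
	 * 3 - (n >> 6) is 3..0. For n = 200, the switch enters at case 0x00
	 * and falls through all three 64-byte copies, leaving n = 8; for
	 * n = 40 it jumps straight to default and the 16-byte stage below
	 * handles the tail. The same arithmetic drives that second switch.
	 */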
	/*
	 * We split the remaining bytes (which will be less than 64) into
	 * 16-byte (2^4) chunks, using the same switch structure as above.
	 */
	switch (3 - (n >> 4)) {
	case 0x00:
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		n -= 16;
		dst = (uint8_t *)dst + 16;
		src = (const uint8_t *)src + 16;	/* fallthrough */
	case 0x01:
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		n -= 16;
		dst = (uint8_t *)dst + 16;
		src = (const uint8_t *)src + 16;	/* fallthrough */
	case 0x02:
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		n -= 16;
		dst = (uint8_t *)dst + 16;
		src = (const uint8_t *)src + 16;	/* fallthrough */
	default:
		;
	}

	/* Copy any remaining bytes, without going beyond end of buffers */
	if (n != 0)
		rte_mov16((uint8_t *)dst - 16 + n,
			(const uint8_t *)src - 16 + n);
	return ret;
}

#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
#pragma GCC diagnostic pop
#endif

#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 90000)
#pragma GCC diagnostic pop
#endif

#ifdef __cplusplus
}
#endif

#endif /* _RTE_MEMCPY_PPC_64_H_ */