/*
 * SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) IBM Corporation 2014,2021
 */

#ifndef _RTE_MEMCPY_PPC_64_H_
#define _RTE_MEMCPY_PPC_64_H_

#include <stdint.h>
#include <string.h>

#include "rte_altivec.h"
#include "rte_common.h"

#include "generic/rte_memcpy.h"

#ifdef __cplusplus
extern "C" {
#endif

#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 90000)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Warray-bounds"
#endif

#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstringop-overflow"
#endif

static inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
	vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
}

static inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
	vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
	vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
}

static inline void
rte_mov48(uint8_t *dst, const uint8_t *src)
{
	vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
	vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
	vec_vsx_st(vec_vsx_ld(32, src), 32, dst);
}

static inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
	vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
	vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
	vec_vsx_st(vec_vsx_ld(32, src), 32, dst);
	vec_vsx_st(vec_vsx_ld(48, src), 48, dst);
}

static inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
	vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
	vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
	vec_vsx_st(vec_vsx_ld(32, src), 32, dst);
	vec_vsx_st(vec_vsx_ld(48, src), 48, dst);
	vec_vsx_st(vec_vsx_ld(64, src), 64, dst);
	vec_vsx_st(vec_vsx_ld(80, src), 80, dst);
	vec_vsx_st(vec_vsx_ld(96, src), 96, dst);
	vec_vsx_st(vec_vsx_ld(112, src), 112, dst);
}

static inline void
rte_mov256(uint8_t *dst, const uint8_t *src)
{
	rte_mov128(dst, src);
	rte_mov128(dst + 128, src + 128);
}

#define rte_memcpy(dst, src, n)              \
	__extension__ ({                     \
	(__builtin_constant_p(n)) ?          \
	memcpy((dst), (src), (n)) :          \
	rte_memcpy_func((dst), (src), (n)); })

static inline void *
rte_memcpy_func(void *dst, const void *src, size_t n)
{
	void *ret = dst;

	/* We can't copy < 16 bytes using vector registers, so do it manually. */
	if (n < 16) {
		if (n & 0x01) {
			*(uint8_t *)dst = *(const uint8_t *)src;
			dst = (uint8_t *)dst + 1;
			src = (const uint8_t *)src + 1;
		}
		if (n & 0x02) {
			*(uint16_t *)dst = *(const uint16_t *)src;
			dst = (uint16_t *)dst + 1;
			src = (const uint16_t *)src + 1;
		}
		if (n & 0x04) {
			*(uint32_t *)dst = *(const uint32_t *)src;
			dst = (uint32_t *)dst + 1;
			src = (const uint32_t *)src + 1;
		}
		if (n & 0x08)
			*(uint64_t *)dst = *(const uint64_t *)src;
		return ret;
	}

	/* Special fast cases for <= 128 bytes */
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst - 16 + n,
			(const uint8_t *)src - 16 + n);
		return ret;
	}

	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov32((uint8_t *)dst - 32 + n,
			(const uint8_t *)src - 32 + n);
		return ret;
	}

	if (n <= 128) {
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		rte_mov64((uint8_t *)dst - 64 + n,
			(const uint8_t *)src - 64 + n);
		return ret;
	}

	/*
	 * For large copies > 128 bytes. This combination of 256, 64 and
	 * 16-byte copies was found to be faster than doing 128 and 32-byte
	 * copies as well.
	 */
	for ( ; n >= 256; n -= 256) {
		rte_mov256((uint8_t *)dst, (const uint8_t *)src);
		dst = (uint8_t *)dst + 256;
		src = (const uint8_t *)src + 256;
	}

	/*
	 * We split the remaining bytes (which will be less than 256) into
	 * 64-byte (2^6) chunks.
	 * Using incrementing integers in the case labels of a switch statement
	 * encourages the compiler to use a jump table. To get incrementing
	 * integers, we shift the 2 relevant bits to the LSB position to first
	 * get decrementing integers, and then subtract.
	 */
	switch (3 - (n >> 6)) {
	case 0x00:
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		n -= 64;
		dst = (uint8_t *)dst + 64;
		src = (const uint8_t *)src + 64;      /* fallthrough */
	case 0x01:
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		n -= 64;
		dst = (uint8_t *)dst + 64;
		src = (const uint8_t *)src + 64;      /* fallthrough */
	case 0x02:
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		n -= 64;
		dst = (uint8_t *)dst + 64;
		src = (const uint8_t *)src + 64;      /* fallthrough */
	default:
		;
	}

	/*
	 * We split the remaining bytes (which will be less than 64) into
	 * 16-byte (2^4) chunks, using the same switch structure as above.
	 */
	switch (3 - (n >> 4)) {
	case 0x00:
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		n -= 16;
		dst = (uint8_t *)dst + 16;
		src = (const uint8_t *)src + 16;      /* fallthrough */
	case 0x01:
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		n -= 16;
		dst = (uint8_t *)dst + 16;
		src = (const uint8_t *)src + 16;      /* fallthrough */
	case 0x02:
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		n -= 16;
		dst = (uint8_t *)dst + 16;
		src = (const uint8_t *)src + 16;      /* fallthrough */
	default:
		;
	}

	/* Copy any remaining bytes, without going beyond end of buffers */
	if (n != 0)
		rte_mov16((uint8_t *)dst - 16 + n,
			(const uint8_t *)src - 16 + n);
	return ret;
}

#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
#pragma GCC diagnostic pop
#endif

#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 90000)
#pragma GCC diagnostic pop
#endif

#ifdef __cplusplus
}
#endif

#endif /* _RTE_MEMCPY_PPC_64_H_ */
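
/*
 * Usage sketch: a minimal caller that exercises the non-constant-size path,
 * where the rte_memcpy() macro typically dispatches to rte_memcpy_func()
 * rather than to memcpy(). The buffer size, the 0xab fill pattern and the
 * helper name copy_example are illustrative assumptions only, not part of
 * this header's API.
 *
 *	#include <assert.h>
 *	#include <string.h>
 *	#include <rte_memcpy.h>
 *
 *	static void
 *	copy_example(size_t n)	// n is not a compile-time constant
 *	{
 *		uint8_t src[512], dst[512];
 *
 *		assert(n <= sizeof(src));
 *		memset(src, 0xab, n);
 *		rte_memcpy(dst, src, n);	// vector copy path
 *		assert(memcmp(dst, src, n) == 0);
 *	}
 */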