/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2024 Arm Limited
 */

#ifndef RTE_PTR_COMPRESS_H
#define RTE_PTR_COMPRESS_H

/**
 * @file
 * Pointer compression and decompression functions.
 *
 * When passing arrays full of pointers between threads, memory containing
 * the pointers is copied multiple times, which is especially costly between
 * cores. These functions allow us to compress the pointers.
 *
 * Compression takes advantage of the fact that pointers are usually located in
 * a limited memory region. We compress them by converting them to offsets from
 * a base memory address. Offsets can be stored in fewer bytes.
 *
 * The compression functions come in two varieties: 32-bit and 16-bit.
 *
 * To determine how many bits are needed to compress the pointer, calculate
 * the biggest offset possible (highest value pointer - base pointer)
 * and shift the value right according to alignment (shift by the exponent of
 * the power of 2 of the alignment: aligned by 4 - shift by 2, aligned by 8 -
 * shift by 3, etc.). The resulting value must fit in either 32 or 16 bits. You
 * may use the macros provided in this file to do it programmatically.
 *
 * For a usage example and further explanation, please see this library's
 * documentation in the programming guide.
 */

#include <limits.h>
#include <stdint.h>
#include <inttypes.h>

#include <rte_bitops.h>
#include <rte_branch_prediction.h>
#include <rte_common.h>
#include <rte_debug.h>
#include <rte_vect.h>

#ifdef __cplusplus
extern "C" {
#endif

/**
 * Calculate how many bits are required to store pointers within a given memory
 * region as offsets. This can help decide which pointer compression functions
 * can be used.
 *
 * @param mem_length
 *   Length of the memory region the pointers are constrained to.
 * @return
 *   Number of bits required to store an offset within the region.
 **/
#define RTE_PTR_COMPRESS_BITS_NEEDED_FOR_POINTER_WITHIN_RANGE(mem_length) \
	(((uint64_t)(mem_length)) < 2 ? 1 : \
		(sizeof(uint64_t) * CHAR_BIT - \
			rte_clz64((uint64_t)(mem_length) - 1)))

/**
 * Calculate how many bits in the address can be dropped without losing any
 * information thanks to the alignment of the address.
 *
 * @param alignment
 *   Memory alignment.
 * @return
 *   Size of shift allowed without dropping any information from the pointer.
 **/
#define RTE_PTR_COMPRESS_BIT_SHIFT_FROM_ALIGNMENT(alignment) \
	((alignment) == 0 ? 0 : rte_ctz64((uint64_t)(alignment)))

/**
 * Determine if rte_ptr_compress_16_shift can be used to compress pointers
 * that contain addresses of memory objects whose memory is aligned by
 * a given amount and contained in a given memory region.
 *
 * @param mem_length
 *   The length of the memory region that contains the objects pointed to.
 * @param obj_alignment
 *   The alignment of objects pointed to.
 * @return
 *   1 if the function can be used, 0 otherwise.
 **/
#define RTE_PTR_COMPRESS_CAN_COMPRESS_16_SHIFT(mem_length, obj_alignment) \
	((RTE_PTR_COMPRESS_BITS_NEEDED_FOR_POINTER_WITHIN_RANGE(mem_length) - \
	RTE_PTR_COMPRESS_BIT_SHIFT_FROM_ALIGNMENT(obj_alignment)) <= 16 ? 1 : 0)

/**
 * Determine if rte_ptr_compress_32_shift can be used to compress pointers
 * that contain addresses of memory objects whose memory is aligned by
 * a given amount and contained in a given memory region.
 *
 * @param mem_length
 *   The length of the memory region that contains the objects pointed to.
 * @param obj_alignment
 *   The alignment of objects pointed to.
 * @return
 *   1 if the function can be used, 0 otherwise.
 **/
#define RTE_PTR_COMPRESS_CAN_COMPRESS_32_SHIFT(mem_length, obj_alignment) \
	((RTE_PTR_COMPRESS_BITS_NEEDED_FOR_POINTER_WITHIN_RANGE(mem_length) - \
	RTE_PTR_COMPRESS_BIT_SHIFT_FROM_ALIGNMENT(obj_alignment)) <= 32 ? 1 : 0)
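
/*
 * Example (an illustrative sketch, not part of the API): deciding at setup
 * time which compression width can be used. The region length (1 MB) and the
 * object alignment (8 bytes) below are hypothetical values.
 *
 *	size_t mem_len = 1 << 20;
 *	size_t obj_align = 8;
 *	uint8_t bit_shift =
 *		RTE_PTR_COMPRESS_BIT_SHIFT_FROM_ALIGNMENT(obj_align);
 *	int can_use_16 =
 *		RTE_PTR_COMPRESS_CAN_COMPRESS_16_SHIFT(mem_len, obj_align);
 *	int can_use_32 =
 *		RTE_PTR_COMPRESS_CAN_COMPRESS_32_SHIFT(mem_len, obj_align);
 *
 * With these values the offsets need 20 - 3 = 17 bits, so 32-bit compression
 * can be used (can_use_32 == 1) but 16-bit cannot (can_use_16 == 0), and
 * bit_shift is 3.
 */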

/**
 * Compress pointers into 32-bit offsets from a base pointer.
 *
 * @note It is the programmer's responsibility to ensure the resulting offsets
 * fit into 32 bits. Alignment of the structures pointed to by the pointers
 * allows us to drop bits from the offsets. This is controlled by the bit_shift
 * parameter. This means that if structures are aligned by 8 bytes they must be
 * within 32GB of the base pointer. If there is no such alignment guarantee
 * they must be within 4GB.
 *
 * @param ptr_base
 *   A pointer used to calculate offsets of pointers in src_table.
 * @param src_table
 *   A pointer to an array of pointers.
 * @param dest_table
 *   A pointer to an array of compressed pointers returned by this function.
 * @param n
 *   The number of objects to compress, must be strictly positive.
 * @param bit_shift
 *   Byte alignment of the memory pointed to by the pointers allows bits to be
 *   dropped from the offsets and hence widens the memory region that can be
 *   covered. This controls how many bits the offsets are right shifted.
 **/
static __rte_always_inline void
rte_ptr_compress_32_shift(void *ptr_base, void * const *src_table,
		uint32_t *dest_table, size_t n, uint8_t bit_shift)
{
	size_t i = 0;
#if defined RTE_HAS_SVE_ACLE && !defined RTE_ARCH_ARMv8_AARCH32
	svuint64_t v_ptr_table;
	do {
		svbool_t pg = svwhilelt_b64(i, n);
		v_ptr_table = svld1_u64(pg, (uint64_t *)src_table + i);
		v_ptr_table = svsub_x(pg, v_ptr_table, (uint64_t)ptr_base);
		v_ptr_table = svlsr_x(pg, v_ptr_table, bit_shift);
		svst1w(pg, &dest_table[i], v_ptr_table);
		i += svcntd();
	} while (i < n);
#elif defined __ARM_NEON && !defined RTE_ARCH_ARMv8_AARCH32
	uintptr_t ptr_diff;
	uint64x2_t v_ptr_table;
	/* right shift is done by left shifting by a negative int */
	int64x2_t v_shift = vdupq_n_s64(-bit_shift);
	uint64x2_t v_ptr_base = vdupq_n_u64((uint64_t)ptr_base);
	const size_t n_even = n & ~0x1;
	for (; i < n_even; i += 2) {
		v_ptr_table = vld1q_u64((const uint64_t *)src_table + i);
		v_ptr_table = vsubq_u64(v_ptr_table, v_ptr_base);
		v_ptr_table = vshlq_u64(v_ptr_table, v_shift);
		vst1_u32(dest_table + i, vqmovn_u64(v_ptr_table));
	}
	/* process the leftover single item when n is odd */
	if (unlikely(n & 0x1)) {
		ptr_diff = RTE_PTR_DIFF(src_table[i], ptr_base);
		dest_table[i] = (uint32_t) (ptr_diff >> bit_shift);
	}
#else
	uintptr_t ptr_diff;
	for (; i < n; i++) {
		ptr_diff = RTE_PTR_DIFF(src_table[i], ptr_base);
		ptr_diff = ptr_diff >> bit_shift;
		RTE_ASSERT(ptr_diff <= UINT32_MAX);
		dest_table[i] = (uint32_t) ptr_diff;
	}
#endif
}
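
/*
 * Example (an illustrative sketch): a producer thread compressing a burst of
 * object pointers into 32-bit offsets before passing them to another thread.
 * The names mem_base, objs, offs and BURST_SIZE are hypothetical; mem_base is
 * assumed to be the start of the memory region containing all the objects and
 * the objects are assumed to be 8-byte aligned (hence a bit_shift of 3).
 *
 *	void *objs[BURST_SIZE];
 *	uint32_t offs[BURST_SIZE];
 *
 *	// fill objs[] with pointers into the region starting at mem_base
 *	rte_ptr_compress_32_shift(mem_base, objs, offs, BURST_SIZE, 3);
 *
 * Pass 0 as bit_shift if no alignment of the objects can be guaranteed.
 */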

/**
 * Decompress pointers from 32-bit offsets from a base pointer.
 *
 * @param ptr_base
 *   A pointer which was used to calculate the offsets in src_table.
 * @param src_table
 *   A pointer to an array of compressed pointers.
 * @param dest_table
 *   A pointer to an array of decompressed pointers returned by this function.
 * @param n
 *   The number of objects to decompress, must be strictly positive.
 * @param bit_shift
 *   Byte alignment of the memory pointed to by the pointers allows bits to be
 *   dropped from the offsets and hence widens the memory region that can be
 *   covered. This controls how many bits the offsets are left shifted when
 *   pointers are recovered from them.
 **/
static __rte_always_inline void
rte_ptr_decompress_32_shift(void *ptr_base, uint32_t const *src_table,
		void **dest_table, size_t n, uint8_t bit_shift)
{
	size_t i = 0;
#if defined RTE_HAS_SVE_ACLE && !defined RTE_ARCH_ARMv8_AARCH32
	svuint64_t v_ptr_table;
	do {
		svbool_t pg = svwhilelt_b64(i, n);
		v_ptr_table = svld1uw_u64(pg, &src_table[i]);
		v_ptr_table = svlsl_x(pg, v_ptr_table, bit_shift);
		v_ptr_table = svadd_x(pg, v_ptr_table, (uint64_t)ptr_base);
		svst1(pg, (uint64_t *)dest_table + i, v_ptr_table);
		i += svcntd();
	} while (i < n);
#elif defined __ARM_NEON && !defined RTE_ARCH_ARMv8_AARCH32
	uintptr_t ptr_diff;
	uint64x2_t v_ptr_table;
	int64x2_t v_shift = vdupq_n_s64(bit_shift);
	uint64x2_t v_ptr_base = vdupq_n_u64((uint64_t)ptr_base);
	const size_t n_even = n & ~0x1;
	for (; i < n_even; i += 2) {
		v_ptr_table = vmovl_u32(vld1_u32(src_table + i));
		v_ptr_table = vshlq_u64(v_ptr_table, v_shift);
		v_ptr_table = vaddq_u64(v_ptr_table, v_ptr_base);
		vst1q_u64((uint64_t *)dest_table + i, v_ptr_table);
	}
	/* process the leftover single item when n is odd */
	if (unlikely(n & 0x1)) {
		ptr_diff = ((uintptr_t) src_table[i]) << bit_shift;
		dest_table[i] = RTE_PTR_ADD(ptr_base, ptr_diff);
	}
#else
	uintptr_t ptr_diff;
	for (; i < n; i++) {
		ptr_diff = ((uintptr_t) src_table[i]) << bit_shift;
		dest_table[i] = RTE_PTR_ADD(ptr_base, ptr_diff);
	}
#endif
}
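
/*
 * Example (an illustrative sketch): the consumer thread recovering the
 * original pointers from the 32-bit offsets produced above. The same base
 * pointer and bit_shift that were used for compression must be used for
 * decompression; mem_base, offs and BURST_SIZE are hypothetical names.
 *
 *	void *objs[BURST_SIZE];
 *
 *	rte_ptr_decompress_32_shift(mem_base, offs, objs, BURST_SIZE, 3);
 */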

/**
 * Compress pointers into 16-bit offsets from a base pointer.
 *
 * @note It is the programmer's responsibility to ensure the resulting offsets
 * fit into 16 bits. Alignment of the structures pointed to by the pointers
 * allows us to drop bits from the offsets. This is controlled by the bit_shift
 * parameter. This means that if structures are aligned by 8 bytes they must be
 * within 512KB of the base pointer. If there is no such alignment guarantee
 * they must be within 64KB.
 *
 * @param ptr_base
 *   A pointer used to calculate offsets of pointers in src_table.
 * @param src_table
 *   A pointer to an array of pointers.
 * @param dest_table
 *   A pointer to an array of compressed pointers returned by this function.
 * @param n
 *   The number of objects to compress, must be strictly positive.
 * @param bit_shift
 *   Byte alignment of the memory pointed to by the pointers allows bits to be
 *   dropped from the offsets and hence widens the memory region that can be
 *   covered. This controls how many bits the offsets are right shifted.
 **/
static __rte_always_inline void
rte_ptr_compress_16_shift(void *ptr_base, void * const *src_table,
		uint16_t *dest_table, size_t n, uint8_t bit_shift)
{
	size_t i = 0;
#if defined RTE_HAS_SVE_ACLE && !defined RTE_ARCH_ARMv8_AARCH32
	svuint64_t v_ptr_table;
	do {
		svbool_t pg = svwhilelt_b64(i, n);
		v_ptr_table = svld1_u64(pg, (uint64_t *)src_table + i);
		v_ptr_table = svsub_x(pg, v_ptr_table, (uint64_t)ptr_base);
		v_ptr_table = svlsr_x(pg, v_ptr_table, bit_shift);
		svst1h(pg, &dest_table[i], v_ptr_table);
		i += svcntd();
	} while (i < n);
#else
	uintptr_t ptr_diff;
	for (; i < n; i++) {
		ptr_diff = RTE_PTR_DIFF(src_table[i], ptr_base);
		ptr_diff = ptr_diff >> bit_shift;
		RTE_ASSERT(ptr_diff <= UINT16_MAX);
		dest_table[i] = (uint16_t) ptr_diff;
	}
#endif
}

/**
 * Decompress pointers from 16-bit offsets from a base pointer.
 *
 * @param ptr_base
 *   A pointer which was used to calculate the offsets in src_table.
 * @param src_table
 *   A pointer to an array of compressed pointers.
 * @param dest_table
 *   A pointer to an array of decompressed pointers returned by this function.
 * @param n
 *   The number of objects to decompress, must be strictly positive.
 * @param bit_shift
 *   Byte alignment of the memory pointed to by the pointers allows bits to be
 *   dropped from the offsets and hence widens the memory region that can be
 *   covered. This controls how many bits the offsets are left shifted when
 *   pointers are recovered from them.
 **/
static __rte_always_inline void
rte_ptr_decompress_16_shift(void *ptr_base, uint16_t const *src_table,
		void **dest_table, size_t n, uint8_t bit_shift)
{
	size_t i = 0;
#if defined RTE_HAS_SVE_ACLE && !defined RTE_ARCH_ARMv8_AARCH32
	svuint64_t v_ptr_table;
	do {
		svbool_t pg = svwhilelt_b64(i, n);
		v_ptr_table = svld1uh_u64(pg, &src_table[i]);
		v_ptr_table = svlsl_x(pg, v_ptr_table, bit_shift);
		v_ptr_table = svadd_x(pg, v_ptr_table, (uint64_t)ptr_base);
		svst1(pg, (uint64_t *)dest_table + i, v_ptr_table);
		i += svcntd();
	} while (i < n);
#else
	uintptr_t ptr_diff;
	for (; i < n; i++) {
		ptr_diff = ((uintptr_t) src_table[i]) << bit_shift;
		dest_table[i] = RTE_PTR_ADD(ptr_base, ptr_diff);
	}
#endif
}

#ifdef __cplusplus
}
#endif

#endif /* RTE_PTR_COMPRESS_H */