/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2024 Arm Limited
 */

#ifndef RTE_PTR_COMPRESS_H
#define RTE_PTR_COMPRESS_H

/**
 * @file
 * Pointer compression and decompression functions.
 *
 * When passing arrays full of pointers between threads, memory containing
 * the pointers is copied multiple times, which is especially costly between
 * cores. These functions allow us to compress the pointers.
 *
 * Compression takes advantage of the fact that pointers are usually located in
 * a limited memory region. We compress them by converting them to offsets from
 * a base memory address. Offsets can be stored in fewer bytes.
 *
 * The compression functions come in two varieties: 32-bit and 16-bit.
 *
 * To determine how many bits are needed to compress the pointer, calculate
 * the biggest offset possible (highest value pointer - base pointer)
 * and shift the value right according to alignment (shift by the exponent of
 * the power of 2 of alignment: aligned by 4 - shift by 2, aligned by 8 - shift
 * by 3, etc.). The resulting value must fit in either 32 or 16 bits. You may
 * use the macros provided in this file to do it programmatically.
 *
 * For usage examples and further explanation please see this library's
 * documentation in the programming guide.
 */
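
/*
 * Illustrative sketch of the offset arithmetic described above. The addresses
 * and alignment are assumptions chosen for the example, not part of the API:
 * with a base address of 0x100000 and objects aligned to 8 bytes, a pointer
 * 0x100440 compresses to (0x100440 - 0x100000) >> 3 = 0x88 and is recovered
 * as 0x100000 + (0x88 << 3) = 0x100440.
 */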

#include <stdint.h>
#include <inttypes.h>
#include <limits.h>

#include <rte_bitops.h>
#include <rte_branch_prediction.h>
#include <rte_common.h>
#include <rte_debug.h>
#include <rte_vect.h>

#ifdef __cplusplus
extern "C" {
#endif

/**
 * Calculate how many bits are required to store pointers within a given memory
 * region as offsets. This can help decide which pointer compression functions
 * can be used.
 *
 * @param mem_length
 *   Length of the memory region the pointers are constrained to.
 * @return
 *   Number of bits required to store a value.
 **/
#define RTE_PTR_COMPRESS_BITS_NEEDED_FOR_POINTER_WITHIN_RANGE(mem_length) \
	(((uint64_t)(mem_length)) < 2 ? 1 : \
		(sizeof(uint64_t) * CHAR_BIT - \
		rte_clz64((uint64_t)(mem_length) - 1)))

/**
 * Calculate how many bits in the address can be dropped without losing any
 * information thanks to the alignment of the address.
 *
 * @param alignment
 *   Memory alignment.
 * @return
 *   Size of shift allowed without dropping any information from the pointer.
 **/
#define RTE_PTR_COMPRESS_BIT_SHIFT_FROM_ALIGNMENT(alignment) \
	((alignment) == 0 ? 0 : rte_ctz64((uint64_t)(alignment)))
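
/*
 * A minimal sketch of using the two helper macros above. The region length
 * and alignment values are assumptions for the example only: a 4096 byte
 * region needs 64 - clz64(4095) = 12 bits per offset, and 8-byte alignment
 * allows dropping ctz64(8) = 3 of those bits.
 *
 * @code{.c}
 * unsigned int bits = RTE_PTR_COMPRESS_BITS_NEEDED_FOR_POINTER_WITHIN_RANGE(4096);
 * unsigned int shift = RTE_PTR_COMPRESS_BIT_SHIFT_FROM_ALIGNMENT(8);
 * // bits == 12, shift == 3, so even 16-bit compression is sufficient here
 * @endcode
 */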

/**
 * Determine if rte_ptr_compress_16_shift can be used to compress pointers
 * that contain addresses of memory objects whose memory is aligned by
 * a given amount and contained in a given memory region.
 *
 * @param mem_length
 *   The length of the memory region that contains the objects pointed to.
 * @param obj_alignment
 *   The alignment of objects pointed to.
 * @return
 *   1 if function can be used, 0 otherwise.
 **/
#define RTE_PTR_COMPRESS_CAN_COMPRESS_16_SHIFT(mem_length, obj_alignment) \
	((RTE_PTR_COMPRESS_BITS_NEEDED_FOR_POINTER_WITHIN_RANGE(mem_length) - \
	RTE_PTR_COMPRESS_BIT_SHIFT_FROM_ALIGNMENT(obj_alignment)) <= 16 ? 1 : 0)

/**
 * Determine if rte_ptr_compress_32_shift can be used to compress pointers
 * that contain addresses of memory objects whose memory is aligned by
 * a given amount and contained in a given memory region.
 *
 * @param mem_length
 *   The length of the memory region that contains the objects pointed to.
 * @param obj_alignment
 *   The alignment of objects pointed to.
 * @return
 *   1 if function can be used, 0 otherwise.
 **/
#define RTE_PTR_COMPRESS_CAN_COMPRESS_32_SHIFT(mem_length, obj_alignment) \
	((RTE_PTR_COMPRESS_BITS_NEEDED_FOR_POINTER_WITHIN_RANGE(mem_length) - \
	RTE_PTR_COMPRESS_BIT_SHIFT_FROM_ALIGNMENT(obj_alignment)) <= 32 ? 1 : 0)
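
/*
 * A minimal sketch of picking a compression width at setup time. The region
 * length and alignment below are assumptions for the example, not part of
 * this API:
 *
 * @code{.c}
 * size_t region_len = 512 * 1024;  // objects live in a 512 KB region
 * size_t obj_align = 8;            // and are aligned to 8 bytes
 * uint8_t shift = RTE_PTR_COMPRESS_BIT_SHIFT_FROM_ALIGNMENT(obj_align);
 * int use_16 = RTE_PTR_COMPRESS_CAN_COMPRESS_16_SHIFT(region_len, obj_align);
 * int use_32 = RTE_PTR_COMPRESS_CAN_COMPRESS_32_SHIFT(region_len, obj_align);
 * // here use_16 == 1 (19 offset bits - 3 dropped bits <= 16), so the
 * // 16-bit variants can be used with bit_shift == shift
 * @endcode
 */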

/**
 * Compress pointers into 32-bit offsets from base pointer.
 *
 * @note It is the programmer's responsibility to ensure the resulting offsets
 * fit into 32 bits. Alignment of the structures pointed to by the pointers
 * allows us to drop bits from the offsets. This is controlled by the bit_shift
 * parameter. This means that if structures are aligned by 8 bytes they must be
 * within 32GB of the base pointer. If there is no such alignment guarantee they
 * must be within 4GB.
 *
 * @param ptr_base
 *   A pointer used to calculate offsets of pointers in src_table.
 * @param src_table
 *   A pointer to an array of pointers.
 * @param dest_table
 *   A pointer to an array of compressed pointers returned by this function.
 * @param n
 *   The number of objects to compress, must be strictly positive.
 * @param bit_shift
 *   Byte alignment of memory pointed to by the pointers allows for
 *   bits to be dropped from the offset and hence widens the memory region that
 *   can be covered. This controls how many bits are right shifted.
 **/
static __rte_always_inline void
rte_ptr_compress_32_shift(void *ptr_base, void * const *src_table,
		uint32_t *dest_table, size_t n, uint8_t bit_shift)
{
	size_t i = 0;
#if defined RTE_HAS_SVE_ACLE && !defined RTE_ARCH_ARMv8_AARCH32
	svuint64_t v_ptr_table;
	do {
		svbool_t pg = svwhilelt_b64(i, n);
		v_ptr_table = svld1_u64(pg, (uint64_t *)src_table + i);
		v_ptr_table = svsub_x(pg, v_ptr_table, (uint64_t)ptr_base);
		v_ptr_table = svlsr_x(pg, v_ptr_table, bit_shift);
		svst1w(pg, &dest_table[i], v_ptr_table);
		i += svcntd();
	} while (i < n);
#elif defined __ARM_NEON && !defined RTE_ARCH_ARMv8_AARCH32
	uintptr_t ptr_diff;
	uint64x2_t v_ptr_table;
	/* right shift is done by left shifting by negative int */
	int64x2_t v_shift = vdupq_n_s64(-bit_shift);
	uint64x2_t v_ptr_base = vdupq_n_u64((uint64_t)ptr_base);
	const size_t n_even = n & ~0x1;
	for (; i < n_even; i += 2) {
		v_ptr_table = vld1q_u64((const uint64_t *)src_table + i);
		v_ptr_table = vsubq_u64(v_ptr_table, v_ptr_base);
		v_ptr_table = vshlq_u64(v_ptr_table, v_shift);
		vst1_u32(dest_table + i, vqmovn_u64(v_ptr_table));
	}
	/* process the leftover single item if n is odd */
	if (unlikely(n & 0x1)) {
		ptr_diff = RTE_PTR_DIFF(src_table[i], ptr_base);
		dest_table[i] = (uint32_t) (ptr_diff >> bit_shift);
	}
#else
	uintptr_t ptr_diff;
	for (; i < n; i++) {
		ptr_diff = RTE_PTR_DIFF(src_table[i], ptr_base);
		ptr_diff = ptr_diff >> bit_shift;
		RTE_ASSERT(ptr_diff <= UINT32_MAX);
		dest_table[i] = (uint32_t) ptr_diff;
	}
#endif
}
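
/*
 * A minimal sketch of compressing a burst of pointers. The array names and
 * sizes below are assumptions for the example, not part of this API. The
 * objects are assumed to be 8-byte aligned and within 32GB of obj_region,
 * so a bit_shift of 3 keeps every offset within 32 bits:
 *
 * @code{.c}
 * #define BURST_SIZE 64
 * void *obj_region;             // base of the memory region holding the objects
 * void *ptrs[BURST_SIZE];       // pointers gathered from e.g. an rte_ring
 * uint32_t offsets[BURST_SIZE]; // compressed 32-bit offsets
 *
 * rte_ptr_compress_32_shift(obj_region, ptrs, offsets, BURST_SIZE, 3);
 * @endcode
 */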

/**
 * Decompress pointers from 32-bit offsets from base pointer.
 *
 * @param ptr_base
 *   A pointer which was used to calculate offsets in src_table.
 * @param src_table
 *   A pointer to an array of compressed pointers.
 * @param dest_table
 *   A pointer to an array of decompressed pointers returned by this function.
 * @param n
 *   The number of objects to decompress, must be strictly positive.
 * @param bit_shift
 *   Byte alignment of memory pointed to by the pointers allows for
 *   bits to be dropped from the offset and hence widens the memory region that
 *   can be covered. This controls how many bits are left shifted when pointers
 *   are recovered from the offsets.
 **/
static __rte_always_inline void
rte_ptr_decompress_32_shift(void *ptr_base, uint32_t const *src_table,
		void **dest_table, size_t n, uint8_t bit_shift)
{
	size_t i = 0;
#if defined RTE_HAS_SVE_ACLE && !defined RTE_ARCH_ARMv8_AARCH32
	svuint64_t v_ptr_table;
	do {
		svbool_t pg = svwhilelt_b64(i, n);
		v_ptr_table = svld1uw_u64(pg, &src_table[i]);
		v_ptr_table = svlsl_x(pg, v_ptr_table, bit_shift);
		v_ptr_table = svadd_x(pg, v_ptr_table, (uint64_t)ptr_base);
		svst1(pg, (uint64_t *)dest_table + i, v_ptr_table);
		i += svcntd();
	} while (i < n);
#elif defined __ARM_NEON && !defined RTE_ARCH_ARMv8_AARCH32
	uintptr_t ptr_diff;
	uint64x2_t v_ptr_table;
	int64x2_t v_shift = vdupq_n_s64(bit_shift);
	uint64x2_t v_ptr_base = vdupq_n_u64((uint64_t)ptr_base);
	const size_t n_even = n & ~0x1;
	for (; i < n_even; i += 2) {
		v_ptr_table = vmovl_u32(vld1_u32(src_table + i));
		v_ptr_table = vshlq_u64(v_ptr_table, v_shift);
		v_ptr_table = vaddq_u64(v_ptr_table, v_ptr_base);
		vst1q_u64((uint64_t *)dest_table + i, v_ptr_table);
	}
	/* process the leftover single item if n is odd */
	if (unlikely(n & 0x1)) {
		ptr_diff = ((uintptr_t) src_table[i]) << bit_shift;
		dest_table[i] = RTE_PTR_ADD(ptr_base, ptr_diff);
	}
#else
	uintptr_t ptr_diff;
	for (; i < n; i++) {
		ptr_diff = ((uintptr_t) src_table[i]) << bit_shift;
		dest_table[i] = RTE_PTR_ADD(ptr_base, ptr_diff);
	}
#endif
}
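
/*
 * A minimal sketch of the matching decompression step, continuing the
 * assumed names from the compression example above (not part of this API).
 * The same base pointer and bit_shift used for compression must be used:
 *
 * @code{.c}
 * void *restored[BURST_SIZE];
 *
 * rte_ptr_decompress_32_shift(obj_region, offsets, restored, BURST_SIZE, 3);
 * // restored[i] == ptrs[i] for every i < BURST_SIZE
 * @endcode
 */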

/**
 * Compress pointers into 16-bit offsets from base pointer.
 *
 * @note It is the programmer's responsibility to ensure the resulting offsets
 * fit into 16 bits. Alignment of the structures pointed to by the pointers
 * allows us to drop bits from the offsets. This is controlled by the bit_shift
 * parameter. This means that if structures are aligned by 8 bytes they must be
 * within 512KB of the base pointer. If there is no such alignment guarantee
 * they must be within 64KB.
 *
 * @param ptr_base
 *   A pointer used to calculate offsets of pointers in src_table.
 * @param src_table
 *   A pointer to an array of pointers.
 * @param dest_table
 *   A pointer to an array of compressed pointers returned by this function.
 * @param n
 *   The number of objects to compress, must be strictly positive.
 * @param bit_shift
 *   Byte alignment of memory pointed to by the pointers allows for
 *   bits to be dropped from the offset and hence widens the memory region that
 *   can be covered. This controls how many bits are right shifted.
 **/
static __rte_always_inline void
rte_ptr_compress_16_shift(void *ptr_base, void * const *src_table,
		uint16_t *dest_table, size_t n, uint8_t bit_shift)
{
	size_t i = 0;
#if defined RTE_HAS_SVE_ACLE && !defined RTE_ARCH_ARMv8_AARCH32
	svuint64_t v_ptr_table;
	do {
		svbool_t pg = svwhilelt_b64(i, n);
		v_ptr_table = svld1_u64(pg, (uint64_t *)src_table + i);
		v_ptr_table = svsub_x(pg, v_ptr_table, (uint64_t)ptr_base);
		v_ptr_table = svlsr_x(pg, v_ptr_table, bit_shift);
		svst1h(pg, &dest_table[i], v_ptr_table);
		i += svcntd();
	} while (i < n);
#else
	uintptr_t ptr_diff;
	for (; i < n; i++) {
		ptr_diff = RTE_PTR_DIFF(src_table[i], ptr_base);
		ptr_diff = ptr_diff >> bit_shift;
		RTE_ASSERT(ptr_diff <= UINT16_MAX);
		dest_table[i] = (uint16_t) ptr_diff;
	}
#endif
}

/**
 * Decompress pointers from 16-bit offsets from base pointer.
 *
 * @param ptr_base
 *   A pointer which was used to calculate offsets in src_table.
 * @param src_table
 *   A pointer to an array of compressed pointers.
 * @param dest_table
 *   A pointer to an array of decompressed pointers returned by this function.
 * @param n
 *   The number of objects to decompress, must be strictly positive.
 * @param bit_shift
 *   Byte alignment of memory pointed to by the pointers allows for
 *   bits to be dropped from the offset and hence widens the memory region that
 *   can be covered. This controls how many bits are left shifted when pointers
 *   are recovered from the offsets.
 **/
static __rte_always_inline void
rte_ptr_decompress_16_shift(void *ptr_base, uint16_t const *src_table,
		void **dest_table, size_t n, uint8_t bit_shift)
{
	size_t i = 0;
#if defined RTE_HAS_SVE_ACLE && !defined RTE_ARCH_ARMv8_AARCH32
	svuint64_t v_ptr_table;
	do {
		svbool_t pg = svwhilelt_b64(i, n);
		v_ptr_table = svld1uh_u64(pg, &src_table[i]);
		v_ptr_table = svlsl_x(pg, v_ptr_table, bit_shift);
		v_ptr_table = svadd_x(pg, v_ptr_table, (uint64_t)ptr_base);
		svst1(pg, (uint64_t *)dest_table + i, v_ptr_table);
		i += svcntd();
	} while (i < n);
#else
	uintptr_t ptr_diff;
	for (; i < n; i++) {
		ptr_diff = ((uintptr_t) src_table[i]) << bit_shift;
		dest_table[i] = RTE_PTR_ADD(ptr_base, ptr_diff);
	}
#endif
}
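
/*
 * A minimal sketch of a 16-bit round trip. The array names and sizes are
 * assumptions for the example, not part of this API. Objects are assumed to
 * be 8-byte aligned and within 512KB of the base, so a bit_shift of 3 keeps
 * every offset within 16 bits:
 *
 * @code{.c}
 * #define BURST_SIZE 64
 * void *base;                    // base address of the object region
 * void *ptrs[BURST_SIZE];        // original pointers
 * uint16_t offsets[BURST_SIZE];  // compressed 16-bit offsets
 * void *restored[BURST_SIZE];    // decompressed pointers
 *
 * rte_ptr_compress_16_shift(base, ptrs, offsets, BURST_SIZE, 3);
 * rte_ptr_decompress_16_shift(base, offsets, restored, BURST_SIZE, 3);
 * // restored[i] == ptrs[i] for every i < BURST_SIZE
 * @endcode
 */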

#ifdef __cplusplus
}
#endif

#endif /* RTE_PTR_COMPRESS_H */