/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2024 Arm Limited
 */

#ifndef RTE_PTR_COMPRESS_H
#define RTE_PTR_COMPRESS_H

/**
 * @file
 * Pointer compression and decompression functions.
 *
 * When passing arrays full of pointers between threads, the memory containing
 * the pointers is copied multiple times, which is especially costly between
 * cores. These functions allow us to compress the pointers.
 *
 * Compression takes advantage of the fact that pointers are usually located in
 * a limited memory region. We compress them by converting them to offsets from
 * a base memory address. Offsets can be stored in fewer bytes.
 *
 * The compression functions come in two varieties: 32-bit and 16-bit.
 *
 * To determine how many bits are needed to compress a pointer, calculate
 * the biggest offset possible (highest value pointer - base pointer)
 * and shift the value right according to alignment (shift by the exponent of
 * the power of 2 of the alignment: aligned by 4 - shift by 2, aligned by 8 -
 * shift by 3, etc.). The resulting value must fit in either 32 or 16 bits. You
 * may use the macros provided in this file to do it programmatically.
 *
 * For a usage example and further explanation please see this library's
 * documentation in the programming guide.
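 *
 * A minimal illustrative sketch (the base pointer, array size and the 8-byte
 * object alignment below are assumptions made for this example, not
 * requirements of the API):
 * @code
 * void *base;         // assumed: start of the objects' memory region
 * void *ptrs[64];     // pointers to 8-byte aligned objects at or above base
 * uint32_t offs[64];  // compressed offsets
 * void *back[64];     // decompressed pointers
 * rte_ptr_compress_32_shift(base, ptrs, offs, 64, 3);
 * rte_ptr_decompress_32_shift(base, offs, back, 64, 3);
 * @endcode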
 */

#include <stdint.h>
#include <inttypes.h>

#include <rte_bitops.h>
#include <rte_branch_prediction.h>
#include <rte_common.h>
#include <rte_debug.h>
#include <rte_vect.h>

#ifdef __cplusplus
extern "C" {
#endif

/**
 * Calculate how many bits are required to store pointers within a given memory
 * region as offsets. This can help decide which pointer compression functions
 * can be used.
 *
 * @param mem_length
 *   Length of the memory region the pointers are constrained to.
 * @return
 *   Number of bits required to store an offset within the region.
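 *
 * For instance (the region length is assumed for illustration):
 * @code
 * // a 4 KiB region: offsets 0..4095 need 12 bits
 * unsigned int bits =
 *	RTE_PTR_COMPRESS_BITS_NEEDED_FOR_POINTER_WITHIN_RANGE(4096);
 * @endcode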
 **/
#define RTE_PTR_COMPRESS_BITS_NEEDED_FOR_POINTER_WITHIN_RANGE(mem_length) \
	(((uint64_t)mem_length) < 2 ? 1 : \
		(sizeof(uint64_t) * CHAR_BIT - \
		 rte_clz64((uint64_t)mem_length - 1)))

/**
 * Calculate how many bits in the address can be dropped without losing any
 * information thanks to the alignment of the address.
 *
 * @param alignment
 *   Memory alignment.
 * @return
 *   Number of bits that can be shifted away without dropping any information
 *   from the pointer.
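 *
 * For instance (the alignment value is assumed for illustration):
 * @code
 * // 8-byte aligned objects: the low 3 bits of every offset are zero
 * unsigned int shift = RTE_PTR_COMPRESS_BIT_SHIFT_FROM_ALIGNMENT(8);
 * @endcode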
 **/
#define RTE_PTR_COMPRESS_BIT_SHIFT_FROM_ALIGNMENT(alignment) \
	((alignment) == 0 ? 0 : rte_ctz64((uint64_t)alignment))

/**
 * Determine if rte_ptr_compress_16_shift can be used to compress pointers
 * that contain addresses of memory objects whose memory is aligned by
 * a given amount and contained in a given memory region.
 *
 * @param mem_length
 *   The length of the memory region that contains the objects pointed to.
 * @param obj_alignment
 *   The alignment of the objects pointed to.
 * @return
 *   1 if the function can be used, 0 otherwise.
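 *
 * For instance (region size and alignment assumed for illustration):
 * @code
 * // 1 MiB region (20 bits) of 64-byte aligned objects (6-bit shift):
 * // 20 - 6 = 14 <= 16, so 16-bit compression is usable
 * int ok = RTE_PTR_COMPRESS_CAN_COMPRESS_16_SHIFT(1 << 20, 64);
 * @endcode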
 **/
#define RTE_PTR_COMPRESS_CAN_COMPRESS_16_SHIFT(mem_length, obj_alignment) \
	((RTE_PTR_COMPRESS_BITS_NEEDED_FOR_POINTER_WITHIN_RANGE(mem_length) - \
	RTE_PTR_COMPRESS_BIT_SHIFT_FROM_ALIGNMENT(obj_alignment)) <= 16 ? 1 : 0)

/**
 * Determine if rte_ptr_compress_32_shift can be used to compress pointers
 * that contain addresses of memory objects whose memory is aligned by
 * a given amount and contained in a given memory region.
 *
 * @param mem_length
 *   The length of the memory region that contains the objects pointed to.
 * @param obj_alignment
 *   The alignment of the objects pointed to.
 * @return
 *   1 if the function can be used, 0 otherwise.
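 *
 * For instance (region size and alignment assumed for illustration):
 * @code
 * // 32 GiB region (35 bits) of 8-byte aligned objects (3-bit shift):
 * // 35 - 3 = 32 <= 32, so 32-bit compression is usable
 * int ok = RTE_PTR_COMPRESS_CAN_COMPRESS_32_SHIFT(1ULL << 35, 8);
 * @endcode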
 **/
#define RTE_PTR_COMPRESS_CAN_COMPRESS_32_SHIFT(mem_length, obj_alignment) \
	((RTE_PTR_COMPRESS_BITS_NEEDED_FOR_POINTER_WITHIN_RANGE(mem_length) - \
	RTE_PTR_COMPRESS_BIT_SHIFT_FROM_ALIGNMENT(obj_alignment)) <= 32 ? 1 : 0)

/**
 * Compress pointers into 32-bit offsets from a base pointer.
 *
 * @note It is the programmer's responsibility to ensure the resulting offsets
 * fit into 32 bits. Alignment of the structures pointed to by the pointers
 * allows us to drop bits from the offsets. This is controlled by the bit_shift
 * parameter. This means that if structures are aligned by 8 bytes they must be
 * within 32GB of the base pointer. If there is no such alignment guarantee
 * they must be within 4GB.
 *
 * @param ptr_base
 *   A pointer used to calculate offsets of the pointers in src_table.
 * @param src_table
 *   A pointer to an array of pointers.
 * @param dest_table
 *   A pointer to an array of compressed pointers returned by this function.
 * @param n
 *   The number of objects to compress, must be strictly positive.
 * @param bit_shift
 *   Byte alignment of the memory pointed to by the pointers allows bits
 *   to be dropped from the offset and hence widens the memory region that
 *   can be covered. This controls how many bits are right shifted.
 **/
static __rte_always_inline void
rte_ptr_compress_32_shift(void *ptr_base, void * const *src_table,
		uint32_t *dest_table, size_t n, uint8_t bit_shift)
{
	size_t i = 0;
#if defined RTE_HAS_SVE_ACLE && !defined RTE_ARCH_ARMv8_AARCH32
	svuint64_t v_ptr_table;
	do {
		svbool_t pg = svwhilelt_b64(i, n);
		v_ptr_table = svld1_u64(pg, (uint64_t *)src_table + i);
		v_ptr_table = svsub_x(pg, v_ptr_table, (uint64_t)ptr_base);
		v_ptr_table = svlsr_x(pg, v_ptr_table, bit_shift);
		svst1w(pg, &dest_table[i], v_ptr_table);
		i += svcntd();
	} while (i < n);
#elif defined __ARM_NEON && !defined RTE_ARCH_ARMv8_AARCH32
	uintptr_t ptr_diff;
	uint64x2_t v_ptr_table;
	/* right shift is done by left shifting by negative int */
	int64x2_t v_shift = vdupq_n_s64(-bit_shift);
	uint64x2_t v_ptr_base = vdupq_n_u64((uint64_t)ptr_base);
	const size_t n_even = n & ~0x1;
	for (; i < n_even; i += 2) {
		v_ptr_table = vld1q_u64((const uint64_t *)src_table + i);
		v_ptr_table = vsubq_u64(v_ptr_table, v_ptr_base);
		v_ptr_table = vshlq_u64(v_ptr_table, v_shift);
		vst1_u32(dest_table + i, vqmovn_u64(v_ptr_table));
	}
	/* process the leftover single item when n is odd */
	if (unlikely(n & 0x1)) {
		ptr_diff = RTE_PTR_DIFF(src_table[i], ptr_base);
		dest_table[i] = (uint32_t) (ptr_diff >> bit_shift);
	}
#else
	uintptr_t ptr_diff;
	for (; i < n; i++) {
		ptr_diff = RTE_PTR_DIFF(src_table[i], ptr_base);
		ptr_diff = ptr_diff >> bit_shift;
		RTE_ASSERT(ptr_diff <= UINT32_MAX);
		dest_table[i] = (uint32_t) ptr_diff;
	}
#endif
}

/**
 * Decompress pointers from 32-bit offsets from a base pointer.
 *
 * @param ptr_base
 *   The pointer which was used to calculate the offsets in src_table.
 * @param src_table
 *   A pointer to an array of compressed pointers.
 * @param dest_table
 *   A pointer to an array of decompressed pointers returned by this function.
 * @param n
 *   The number of objects to decompress, must be strictly positive.
 * @param bit_shift
 *   Byte alignment of the memory pointed to by the pointers allows bits
 *   to be dropped from the offset and hence widens the memory region that
 *   can be covered. This controls how many bits are left shifted when pointers
 *   are recovered from the offsets.
 **/
static __rte_always_inline void
rte_ptr_decompress_32_shift(void *ptr_base, uint32_t const *src_table,
		void **dest_table, size_t n, uint8_t bit_shift)
{
	size_t i = 0;
#if defined RTE_HAS_SVE_ACLE && !defined RTE_ARCH_ARMv8_AARCH32
	svuint64_t v_ptr_table;
	do {
		svbool_t pg = svwhilelt_b64(i, n);
		v_ptr_table = svld1uw_u64(pg, &src_table[i]);
		v_ptr_table = svlsl_x(pg, v_ptr_table, bit_shift);
		v_ptr_table = svadd_x(pg, v_ptr_table, (uint64_t)ptr_base);
		svst1(pg, (uint64_t *)dest_table + i, v_ptr_table);
		i += svcntd();
	} while (i < n);
#elif defined __ARM_NEON && !defined RTE_ARCH_ARMv8_AARCH32
	uintptr_t ptr_diff;
	uint64x2_t v_ptr_table;
	int64x2_t v_shift = vdupq_n_s64(bit_shift);
	uint64x2_t v_ptr_base = vdupq_n_u64((uint64_t)ptr_base);
	const size_t n_even = n & ~0x1;
	for (; i < n_even; i += 2) {
		v_ptr_table = vmovl_u32(vld1_u32(src_table + i));
		v_ptr_table = vshlq_u64(v_ptr_table, v_shift);
		v_ptr_table = vaddq_u64(v_ptr_table, v_ptr_base);
		vst1q_u64((uint64_t *)dest_table + i, v_ptr_table);
	}
	/* process the leftover single item when n is odd */
	if (unlikely(n & 0x1)) {
		ptr_diff = ((uintptr_t) src_table[i]) << bit_shift;
		dest_table[i] = RTE_PTR_ADD(ptr_base, ptr_diff);
	}
#else
	uintptr_t ptr_diff;
	for (; i < n; i++) {
		ptr_diff = ((uintptr_t) src_table[i]) << bit_shift;
		dest_table[i] = RTE_PTR_ADD(ptr_base, ptr_diff);
	}
#endif
}

/**
 * Compress pointers into 16-bit offsets from a base pointer.
 *
 * @note It is the programmer's responsibility to ensure the resulting offsets
 * fit into 16 bits. Alignment of the structures pointed to by the pointers
 * allows us to drop bits from the offsets. This is controlled by the bit_shift
 * parameter. This means that if structures are aligned by 8 bytes they must be
 * within 256KB of the base pointer. If there is no such alignment guarantee
 * they must be within 64KB.
 *
 * @param ptr_base
 *   A pointer used to calculate offsets of the pointers in src_table.
 * @param src_table
 *   A pointer to an array of pointers.
 * @param dest_table
 *   A pointer to an array of compressed pointers returned by this function.
 * @param n
 *   The number of objects to compress, must be strictly positive.
 * @param bit_shift
 *   Byte alignment of the memory pointed to by the pointers allows bits
 *   to be dropped from the offset and hence widens the memory region that
 *   can be covered. This controls how many bits are right shifted.
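 *
 * An illustrative call (base, the ptrs array and the 8-byte alignment are
 * assumptions made for this example):
 * @code
 * uint16_t offs[32];
 * // all objects within 256KB of base and 8-byte aligned
 * rte_ptr_compress_16_shift(base, ptrs, offs, 32, 3);
 * @endcode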
 **/
static __rte_always_inline void
rte_ptr_compress_16_shift(void *ptr_base, void * const *src_table,
		uint16_t *dest_table, size_t n, uint8_t bit_shift)
{
	size_t i = 0;
#if defined RTE_HAS_SVE_ACLE && !defined RTE_ARCH_ARMv8_AARCH32
	svuint64_t v_ptr_table;
	do {
		svbool_t pg = svwhilelt_b64(i, n);
		v_ptr_table = svld1_u64(pg, (uint64_t *)src_table + i);
		v_ptr_table = svsub_x(pg, v_ptr_table, (uint64_t)ptr_base);
		v_ptr_table = svlsr_x(pg, v_ptr_table, bit_shift);
		svst1h(pg, &dest_table[i], v_ptr_table);
		i += svcntd();
	} while (i < n);
#else
	uintptr_t ptr_diff;
	for (; i < n; i++) {
		ptr_diff = RTE_PTR_DIFF(src_table[i], ptr_base);
		ptr_diff = ptr_diff >> bit_shift;
		RTE_ASSERT(ptr_diff <= UINT16_MAX);
		dest_table[i] = (uint16_t) ptr_diff;
	}
#endif
}

/**
 * Decompress pointers from 16-bit offsets from a base pointer.
 *
 * @param ptr_base
 *   The pointer which was used to calculate the offsets in src_table.
 * @param src_table
 *   A pointer to an array of compressed pointers.
 * @param dest_table
 *   A pointer to an array of decompressed pointers returned by this function.
 * @param n
 *   The number of objects to decompress, must be strictly positive.
 * @param bit_shift
 *   Byte alignment of the memory pointed to by the pointers allows bits
 *   to be dropped from the offset and hence widens the memory region that
 *   can be covered. This controls how many bits are left shifted when pointers
 *   are recovered from the offsets.
 **/
static __rte_always_inline void
rte_ptr_decompress_16_shift(void *ptr_base, uint16_t const *src_table,
		void **dest_table, size_t n, uint8_t bit_shift)
{
	size_t i = 0;
#if defined RTE_HAS_SVE_ACLE && !defined RTE_ARCH_ARMv8_AARCH32
	svuint64_t v_ptr_table;
	do {
		svbool_t pg = svwhilelt_b64(i, n);
		v_ptr_table = svld1uh_u64(pg, &src_table[i]);
		v_ptr_table = svlsl_x(pg, v_ptr_table, bit_shift);
		v_ptr_table = svadd_x(pg, v_ptr_table, (uint64_t)ptr_base);
		svst1(pg, (uint64_t *)dest_table + i, v_ptr_table);
		i += svcntd();
	} while (i < n);
#else
	uintptr_t ptr_diff;
	for (; i < n; i++) {
		ptr_diff = ((uintptr_t) src_table[i]) << bit_shift;
		dest_table[i] = RTE_PTR_ADD(ptr_base, ptr_diff);
	}
#endif
}

#ifdef __cplusplus
}
#endif

#endif /* RTE_PTR_COMPRESS_H */