/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation
 */

#ifndef _RTE_ATOMIC_X86_H_
#define _RTE_ATOMIC_X86_H_

#include <stdint.h>
#include <rte_common.h>
#include <rte_config.h>
#include <emmintrin.h>
#include "generic/rte_atomic.h"

#if RTE_MAX_LCORE == 1
#define MPLOCKED                        /**< No need to insert MP lock prefix. */
#else
#define MPLOCKED        "lock ; "       /**< Insert MP lock prefix. */
#endif

#define	rte_mb() _mm_mfence()

#define	rte_wmb() _mm_sfence()

#define	rte_rmb() _mm_lfence()

#define rte_smp_wmb() rte_compiler_barrier()

#define rte_smp_rmb() rte_compiler_barrier()

#ifdef __cplusplus
extern "C" {
#endif

/*
 * From the Intel Software Developer's Manual, Vol. 3,
 * 8.2.2 Memory Ordering in P6 and More Recent Processor Families:
 * ...
 * . Reads are not reordered with other reads.
 * . Writes are not reordered with older reads.
 * . Writes to memory are not reordered with other writes,
 *   with the following exceptions:
 *   . streaming stores (writes) executed with the non-temporal move
 *     instructions (MOVNTI, MOVNTQ, MOVNTDQ, MOVNTPS, and MOVNTPD); and
 *   . string operations (see Section 8.2.4.1).
 * ...
 * . Reads may be reordered with older writes to different locations but not
 *   with older writes to the same location.
 * . Reads or writes cannot be reordered with I/O instructions,
 *   locked instructions, or serializing instructions.
 * . Reads cannot pass earlier LFENCE and MFENCE instructions.
 * . Writes ... cannot pass earlier LFENCE, SFENCE, and MFENCE instructions.
 * . LFENCE instructions cannot pass earlier reads.
 * . SFENCE instructions cannot pass earlier writes ...
 * . MFENCE instructions cannot pass earlier reads, writes ...
 *
 * As pointed out by the JVM folks, this makes it possible to use lock-prefixed
 * instructions to get the same effect as mfence, and on most modern hardware
 * that gives better performance than using mfence:
 * https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
 * The basic idea is to use a lock-prefixed add with some dummy memory location
 * as the destination. From their experiments, 128B (2 cache lines) below the
 * current stack pointer looks like a good candidate.
 * So below we use that technique for the rte_smp_mb() implementation.
 */

static __rte_always_inline void
rte_smp_mb(void)
{
#ifdef RTE_TOOLCHAIN_MSVC
	_mm_mfence();
#else
#ifdef RTE_ARCH_I686
	asm volatile("lock addl $0, -128(%%esp); " ::: "memory");
#else
	asm volatile("lock addl $0, -128(%%rsp); " ::: "memory");
#endif
#endif
}
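
/*
 * Illustrative sketch (not part of this header's API): the one reordering
 * x86 does allow above is a read passing an older write to a different
 * location, so a full barrier is needed in flag-based handshakes such as
 * the classic Dekker/Peterson pattern. The flag and function names below
 * are hypothetical.
 *
 *	// thread 0                        // thread 1
 *	my_flag = 1;                       peer_flag = 1;
 *	rte_smp_mb();                      rte_smp_mb();
 *	if (peer_flag == 0)                if (my_flag == 0)
 *		enter_critical_section();          enter_critical_section();
 *
 * Without the barrier, each thread's read of the peer flag could be
 * satisfied before its own write becomes visible, and both threads could
 * enter the critical section at once.
 */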

#define rte_io_mb() rte_mb()

#define rte_io_wmb() rte_compiler_barrier()

#define rte_io_rmb() rte_compiler_barrier()

/**
 * Synchronization fence between threads based on the specified memory order.
 *
 * On x86, __rte_atomic_thread_fence(rte_memory_order_seq_cst) generates a full
 * 'mfence', which is quite expensive. The optimized implementation of
 * rte_smp_mb() is used instead.
 */
static __rte_always_inline void
rte_atomic_thread_fence(rte_memory_order memorder)
{
	if (memorder == rte_memory_order_seq_cst)
		rte_smp_mb();
	else
		__rte_atomic_thread_fence(memorder);
}
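
/*
 * Illustrative sketch (not part of this header's API): what the fence
 * typically costs on x86 with GCC/Clang. Under the ordering rules quoted
 * above, acquire and release fences need no instruction, only a compiler
 * barrier; exact code generation is compiler-dependent.
 *
 *	rte_atomic_thread_fence(rte_memory_order_acquire); // compiler barrier only
 *	rte_atomic_thread_fence(rte_memory_order_release); // compiler barrier only
 *	rte_atomic_thread_fence(rte_memory_order_seq_cst); // lock-prefixed add (full barrier)
 */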

#ifdef __cplusplus
}
#endif

#ifndef RTE_TOOLCHAIN_MSVC

/*------------------------- 16 bit atomic operations -------------------------*/

#ifdef __cplusplus
extern "C" {
#endif

#ifndef RTE_FORCE_INTRINSICS
static inline int
rte_atomic16_cmpset(volatile uint16_t *dst, uint16_t exp, uint16_t src)
{
	uint8_t res;

	asm volatile(
			MPLOCKED
			"cmpxchgw %[src], %[dst];"
			"sete %[res];"
			: [res] "=a" (res),     /* output */
			  [dst] "=m" (*dst)
			: [src] "r" (src),      /* input */
			  "a" (exp),
			  "m" (*dst)
			: "memory");            /* clobber list */
	return res;
}
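
/*
 * Illustrative usage sketch (not part of this header's API): cmpset
 * returns non-zero when *dst still held 'exp' and was replaced by 'src',
 * so it is typically retried in a loop. The function and parameter names
 * below are hypothetical.
 *
 *	static inline void
 *	example_bounded_inc16(volatile uint16_t *cnt, uint16_t max)
 *	{
 *		uint16_t old;
 *
 *		do {
 *			old = *cnt;
 *			if (old == max)
 *				return;         // already saturated, give up
 *		} while (rte_atomic16_cmpset(cnt, old, old + 1) == 0);
 *	}
 */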

static inline uint16_t
rte_atomic16_exchange(volatile uint16_t *dst, uint16_t val)
{
	asm volatile(
			MPLOCKED
			"xchgw %0, %1;"
			: "=r" (val), "=m" (*dst)
			: "0" (val),  "m" (*dst)
			: "memory");         /* clobber list */
	return val;
}

static inline int rte_atomic16_test_and_set(rte_atomic16_t *v)
{
	return rte_atomic16_cmpset((volatile uint16_t *)&v->cnt, 0, 1);
}

static inline void
rte_atomic16_inc(rte_atomic16_t *v)
{
	asm volatile(
			MPLOCKED
			"incw %[cnt]"
			: [cnt] "=m" (v->cnt)   /* output */
			: "m" (v->cnt)          /* input */
			);
}

static inline void
rte_atomic16_dec(rte_atomic16_t *v)
{
	asm volatile(
			MPLOCKED
			"decw %[cnt]"
			: [cnt] "=m" (v->cnt)   /* output */
			: "m" (v->cnt)          /* input */
			);
}

static inline int rte_atomic16_inc_and_test(rte_atomic16_t *v)
{
	uint8_t ret;

	asm volatile(
			MPLOCKED
			"incw %[cnt] ; "
			"sete %[ret]"
			: [cnt] "+m" (v->cnt),  /* output */
			  [ret] "=qm" (ret)
			);
	return ret != 0;
}

static inline int rte_atomic16_dec_and_test(rte_atomic16_t *v)
{
	uint8_t ret;

	asm volatile(MPLOCKED
			"decw %[cnt] ; "
			"sete %[ret]"
			: [cnt] "+m" (v->cnt),  /* output */
			  [ret] "=qm" (ret)
			);
	return ret != 0;
}

/*------------------------- 32 bit atomic operations -------------------------*/

static inline int
rte_atomic32_cmpset(volatile uint32_t *dst, uint32_t exp, uint32_t src)
{
	uint8_t res;

	asm volatile(
			MPLOCKED
			"cmpxchgl %[src], %[dst];"
			"sete %[res];"
			: [res] "=a" (res),     /* output */
			  [dst] "=m" (*dst)
			: [src] "r" (src),      /* input */
			  "a" (exp),
			  "m" (*dst)
			: "memory");            /* clobber list */
	return res;
}

static inline uint32_t
rte_atomic32_exchange(volatile uint32_t *dst, uint32_t val)
{
	asm volatile(
			MPLOCKED
			"xchgl %0, %1;"
			: "=r" (val), "=m" (*dst)
			: "0" (val),  "m" (*dst)
			: "memory");         /* clobber list */
	return val;
}

static inline int rte_atomic32_test_and_set(rte_atomic32_t *v)
{
	return rte_atomic32_cmpset((volatile uint32_t *)&v->cnt, 0, 1);
}

static inline void
rte_atomic32_inc(rte_atomic32_t *v)
{
	asm volatile(
			MPLOCKED
			"incl %[cnt]"
			: [cnt] "=m" (v->cnt)   /* output */
			: "m" (v->cnt)          /* input */
			);
}

static inline void
rte_atomic32_dec(rte_atomic32_t *v)
{
	asm volatile(
			MPLOCKED
			"decl %[cnt]"
			: [cnt] "=m" (v->cnt)   /* output */
			: "m" (v->cnt)          /* input */
			);
}

static inline int rte_atomic32_inc_and_test(rte_atomic32_t *v)
{
	uint8_t ret;

	asm volatile(
			MPLOCKED
			"incl %[cnt] ; "
			"sete %[ret]"
			: [cnt] "+m" (v->cnt),  /* output */
			  [ret] "=qm" (ret)
			);
	return ret != 0;
}

static inline int rte_atomic32_dec_and_test(rte_atomic32_t *v)
{
	uint8_t ret;

	asm volatile(MPLOCKED
			"decl %[cnt] ; "
			"sete %[ret]"
			: [cnt] "+m" (v->cnt),  /* output */
			  [ret] "=qm" (ret)
			);
	return ret != 0;
}
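
/*
 * Illustrative usage sketch (not part of this header's API):
 * dec_and_test returns non-zero only for the caller that drops the counter
 * to zero, which makes it a natural fit for reference counting. The object
 * type and free routine below are hypothetical.
 *
 *	rte_atomic32_init(&obj->refcnt);   // counter starts at 0
 *	rte_atomic32_inc(&obj->refcnt);    // one inc per reference taken
 *	...
 *	if (rte_atomic32_dec_and_test(&obj->refcnt))
 *		free_object(obj);          // last reference released
 */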

#ifdef __cplusplus
}
#endif

#endif

#ifdef RTE_ARCH_I686
#include "rte_atomic_32.h"
#else
#include "rte_atomic_64.h"
#endif

#endif

#endif /* _RTE_ATOMIC_X86_H_ */