/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation
 */

#ifndef _RTE_ATOMIC_X86_H_
#define _RTE_ATOMIC_X86_H_

#include <stdint.h>
#include <rte_common.h>
#include <rte_config.h>
#include <emmintrin.h>
#include "generic/rte_atomic.h"

#if RTE_MAX_LCORE == 1
#define MPLOCKED                        /**< No need to insert MP lock prefix. */
#else
#define MPLOCKED        "lock ; "       /**< Insert MP lock prefix. */
#endif

#define rte_mb() _mm_mfence()

#define rte_wmb() _mm_sfence()

#define rte_rmb() _mm_lfence()

#define rte_smp_wmb() rte_compiler_barrier()

#define rte_smp_rmb() rte_compiler_barrier()

#ifdef __cplusplus
extern "C" {
#endif

/*
 * From the Intel Software Developer's Manual, Vol. 3,
 * 8.2.2 Memory Ordering in P6 and More Recent Processor Families:
 * ...
 * . Reads are not reordered with other reads.
 * . Writes are not reordered with older reads.
 * . Writes to memory are not reordered with other writes,
 *   with the following exceptions:
 *   . streaming stores (writes) executed with the non-temporal move
 *     instructions (MOVNTI, MOVNTQ, MOVNTDQ, MOVNTPS, and MOVNTPD); and
 *   . string operations (see Section 8.2.4.1).
 * ...
 * . Reads may be reordered with older writes to different locations but not
 *   with older writes to the same location.
 * . Reads or writes cannot be reordered with I/O instructions,
 *   locked instructions, or serializing instructions.
 * . Reads cannot pass earlier LFENCE and MFENCE instructions.
 * . Writes ... cannot pass earlier LFENCE, SFENCE, and MFENCE instructions.
 * . LFENCE instructions cannot pass earlier reads.
 * . SFENCE instructions cannot pass earlier writes ...
 * . MFENCE instructions cannot pass earlier reads, writes ...
 *
 * As pointed out by the JVM folks, this makes it possible to use
 * lock-prefixed instructions to get the same effect as mfence, and on most
 * modern hardware that gives better performance than using mfence:
 * https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
 * The basic idea is to use a lock-prefixed add with some dummy memory
 * location as the destination. From their experiments, 128B (2 cache lines)
 * below the current stack pointer looks like a good candidate.
 * So below we use that technique for the rte_smp_mb() implementation.
 */

static __rte_always_inline void
rte_smp_mb(void)
{
#ifdef RTE_TOOLCHAIN_MSVC
	_mm_mfence();
#else
#ifdef RTE_ARCH_I686
	asm volatile("lock addl $0, -128(%%esp); " ::: "memory");
#else
	asm volatile("lock addl $0, -128(%%rsp); " ::: "memory");
#endif
#endif
}

#define rte_io_mb() rte_mb()

#define rte_io_wmb() rte_compiler_barrier()

#define rte_io_rmb() rte_compiler_barrier()
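/*
 * Illustrative sketch, not part of this header (flag0, flag1 and
 * enter_critical_section() are hypothetical): aside from the exceptions
 * quoted above, the reordering that matters on x86 is a store followed by
 * a load from a different location. A full barrier such as rte_smp_mb()
 * is therefore needed in a Dekker-style flag handshake:
 *
 *	// Thread 0                        // Thread 1
 *	flag0 = 1;                         flag1 = 1;
 *	rte_smp_mb();                      rte_smp_mb();
 *	if (flag1 == 0)                    if (flag0 == 0)
 *		enter_critical_section();          enter_critical_section();
 *
 * Without the full barrier each load may be performed before the other
 * thread observes the preceding store, and both threads can enter the
 * critical section. rte_smp_wmb()/rte_smp_rmb(), being compiler barriers
 * only on x86, do not prevent this store-load reordering.
 */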
/**
 * Synchronization fence between threads based on the specified memory order.
 *
 * On x86, __rte_atomic_thread_fence(rte_memory_order_seq_cst) generates a
 * full 'mfence', which is quite expensive, so the optimized rte_smp_mb()
 * implementation is used instead.
 */
static __rte_always_inline void
rte_atomic_thread_fence(rte_memory_order memorder)
{
	if (memorder == rte_memory_order_seq_cst)
		rte_smp_mb();
	else
		__rte_atomic_thread_fence(memorder);
}

#ifdef __cplusplus
}
#endif

#ifndef RTE_TOOLCHAIN_MSVC

/*------------------------- 16 bit atomic operations -------------------------*/

#ifdef __cplusplus
extern "C" {
#endif

#ifndef RTE_FORCE_INTRINSICS
static inline int
rte_atomic16_cmpset(volatile uint16_t *dst, uint16_t exp, uint16_t src)
{
	uint8_t res;

	asm volatile(
			MPLOCKED
			"cmpxchgw %[src], %[dst];"
			"sete %[res];"
			: [res] "=a" (res),     /* output */
			  [dst] "=m" (*dst)
			: [src] "r" (src),      /* input */
			  "a" (exp),
			  "m" (*dst)
			: "memory");            /* clobber list */
	return res;
}

static inline uint16_t
rte_atomic16_exchange(volatile uint16_t *dst, uint16_t val)
{
	asm volatile(
			MPLOCKED
			"xchgw %0, %1;"
			: "=r" (val), "=m" (*dst)
			: "0" (val),  "m" (*dst)
			: "memory");            /* clobber list */
	return val;
}

static inline int rte_atomic16_test_and_set(rte_atomic16_t *v)
{
	return rte_atomic16_cmpset((volatile uint16_t *)&v->cnt, 0, 1);
}

static inline void
rte_atomic16_inc(rte_atomic16_t *v)
{
	asm volatile(
			MPLOCKED
			"incw %[cnt]"
			: [cnt] "=m" (v->cnt)   /* output */
			: "m" (v->cnt)          /* input */
			);
}

static inline void
rte_atomic16_dec(rte_atomic16_t *v)
{
	asm volatile(
			MPLOCKED
			"decw %[cnt]"
			: [cnt] "=m" (v->cnt)   /* output */
			: "m" (v->cnt)          /* input */
			);
}

static inline int rte_atomic16_inc_and_test(rte_atomic16_t *v)
{
	uint8_t ret;

	asm volatile(
			MPLOCKED
			"incw %[cnt] ; "
			"sete %[ret]"
			: [cnt] "+m" (v->cnt),  /* output */
			  [ret] "=qm" (ret)
			);
	return ret != 0;
}

static inline int rte_atomic16_dec_and_test(rte_atomic16_t *v)
{
	uint8_t ret;

	asm volatile(MPLOCKED
			"decw %[cnt] ; "
			"sete %[ret]"
			: [cnt] "+m" (v->cnt),  /* output */
			  [ret] "=qm" (ret)
			);
	return ret != 0;
}
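/*
 * Illustrative usage sketch, not part of this header (the helper name is
 * hypothetical): rte_atomic16_cmpset() returns non-zero only if *dst still
 * held 'exp' when the exchange was attempted, so read-modify-write updates
 * built on it are typically retried in a loop:
 *
 *	static inline void
 *	example_atomic16_store_max(volatile uint16_t *dst, uint16_t val)
 *	{
 *		uint16_t cur;
 *
 *		do {
 *			cur = *dst;
 *			if (cur >= val)
 *				return;	// stored value is already >= val
 *		} while (rte_atomic16_cmpset(dst, cur, val) == 0);
 *	}
 */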
/*------------------------- 32 bit atomic operations -------------------------*/

static inline int
rte_atomic32_cmpset(volatile uint32_t *dst, uint32_t exp, uint32_t src)
{
	uint8_t res;

	asm volatile(
			MPLOCKED
			"cmpxchgl %[src], %[dst];"
			"sete %[res];"
			: [res] "=a" (res),     /* output */
			  [dst] "=m" (*dst)
			: [src] "r" (src),      /* input */
			  "a" (exp),
			  "m" (*dst)
			: "memory");            /* clobber list */
	return res;
}

static inline uint32_t
rte_atomic32_exchange(volatile uint32_t *dst, uint32_t val)
{
	asm volatile(
			MPLOCKED
			"xchgl %0, %1;"
			: "=r" (val), "=m" (*dst)
			: "0" (val),  "m" (*dst)
			: "memory");            /* clobber list */
	return val;
}

static inline int rte_atomic32_test_and_set(rte_atomic32_t *v)
{
	return rte_atomic32_cmpset((volatile uint32_t *)&v->cnt, 0, 1);
}

static inline void
rte_atomic32_inc(rte_atomic32_t *v)
{
	asm volatile(
			MPLOCKED
			"incl %[cnt]"
			: [cnt] "=m" (v->cnt)   /* output */
			: "m" (v->cnt)          /* input */
			);
}

static inline void
rte_atomic32_dec(rte_atomic32_t *v)
{
	asm volatile(
			MPLOCKED
			"decl %[cnt]"
			: [cnt] "=m" (v->cnt)   /* output */
			: "m" (v->cnt)          /* input */
			);
}

static inline int rte_atomic32_inc_and_test(rte_atomic32_t *v)
{
	uint8_t ret;

	asm volatile(
			MPLOCKED
			"incl %[cnt] ; "
			"sete %[ret]"
			: [cnt] "+m" (v->cnt),  /* output */
			  [ret] "=qm" (ret)
			);
	return ret != 0;
}

static inline int rte_atomic32_dec_and_test(rte_atomic32_t *v)
{
	uint8_t ret;

	asm volatile(MPLOCKED
			"decl %[cnt] ; "
			"sete %[ret]"
			: [cnt] "+m" (v->cnt),  /* output */
			  [ret] "=qm" (ret)
			);
	return ret != 0;
}

#ifdef __cplusplus
}
#endif

#endif /* RTE_FORCE_INTRINSICS */

#ifdef RTE_ARCH_I686
#include "rte_atomic_32.h"
#else
#include "rte_atomic_64.h"
#endif

#endif /* RTE_TOOLCHAIN_MSVC */

#endif /* _RTE_ATOMIC_X86_H_ */