1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright(c) 2010-2014 Intel Corporation 3 */ 4 5 #ifndef _RTE_ATOMIC_X86_H_ 6 #define _RTE_ATOMIC_X86_H_ 7 8 #ifdef __cplusplus 9 extern "C" { 10 #endif 11 12 #include <stdint.h> 13 #include <rte_common.h> 14 #include <rte_config.h> 15 #include <emmintrin.h> 16 #include "generic/rte_atomic.h" 17 18 #if RTE_MAX_LCORE == 1 19 #define MPLOCKED /**< No need to insert MP lock prefix. */ 20 #else 21 #define MPLOCKED "lock ; " /**< Insert MP lock prefix. */ 22 #endif 23 24 #define rte_mb() _mm_mfence() 25 26 #define rte_wmb() _mm_sfence() 27 28 #define rte_rmb() _mm_lfence() 29 30 #define rte_smp_wmb() rte_compiler_barrier() 31 32 #define rte_smp_rmb() rte_compiler_barrier() 33 34 /* 35 * From Intel Software Development Manual; Vol 3; 36 * 8.2.2 Memory Ordering in P6 and More Recent Processor Families: 37 * ... 38 * . Reads are not reordered with other reads. 39 * . Writes are not reordered with older reads. 40 * . Writes to memory are not reordered with other writes, 41 * with the following exceptions: 42 * . streaming stores (writes) executed with the non-temporal move 43 * instructions (MOVNTI, MOVNTQ, MOVNTDQ, MOVNTPS, and MOVNTPD); and 44 * . string operations (see Section 8.2.4.1). 45 * ... 46 * . Reads may be reordered with older writes to different locations but not 47 * with older writes to the same location. 48 * . Reads or writes cannot be reordered with I/O instructions, 49 * locked instructions, or serializing instructions. 50 * . Reads cannot pass earlier LFENCE and MFENCE instructions. 51 * . Writes ... cannot pass earlier LFENCE, SFENCE, and MFENCE instructions. 52 * . LFENCE instructions cannot pass earlier reads. 53 * . SFENCE instructions cannot pass earlier writes ... 54 * . MFENCE instructions cannot pass earlier reads, writes ... 55 * 56 * As pointed by Java guys, that makes possible to use lock-prefixed 57 * instructions to get the same effect as mfence and on most modern HW 58 * that gives a better performance then using mfence: 59 * https://shipilev.net/blog/2014/on-the-fence-with-dependencies/ 60 * Basic idea is to use lock prefixed add with some dummy memory location 61 * as the destination. From their experiments 128B(2 cache lines) below 62 * current stack pointer looks like a good candidate. 63 * So below we use that techinque for rte_smp_mb() implementation. 64 */ 65 66 static __rte_always_inline void 67 rte_smp_mb(void) 68 { 69 #ifdef RTE_ARCH_I686 70 asm volatile("lock addl $0, -128(%%esp); " ::: "memory"); 71 #else 72 asm volatile("lock addl $0, -128(%%rsp); " ::: "memory"); 73 #endif 74 } 75 76 #define rte_io_mb() rte_mb() 77 78 #define rte_io_wmb() rte_compiler_barrier() 79 80 #define rte_io_rmb() rte_compiler_barrier() 81 82 /** 83 * Synchronization fence between threads based on the specified memory order. 84 * 85 * On x86 the __atomic_thread_fence(__ATOMIC_SEQ_CST) generates full 'mfence' 86 * which is quite expensive. The optimized implementation of rte_smp_mb is 87 * used instead. 88 */ 89 static __rte_always_inline void 90 rte_atomic_thread_fence(int memorder) 91 { 92 if (memorder == __ATOMIC_SEQ_CST) 93 rte_smp_mb(); 94 else 95 __atomic_thread_fence(memorder); 96 } 97 98 /*------------------------- 16 bit atomic operations -------------------------*/ 99 100 #ifndef RTE_FORCE_INTRINSICS 101 static inline int 102 rte_atomic16_cmpset(volatile uint16_t *dst, uint16_t exp, uint16_t src) 103 { 104 uint8_t res; 105 106 asm volatile( 107 MPLOCKED 108 "cmpxchgw %[src], %[dst];" 109 "sete %[res];" 110 : [res] "=a" (res), /* output */ 111 [dst] "=m" (*dst) 112 : [src] "r" (src), /* input */ 113 "a" (exp), 114 "m" (*dst) 115 : "memory"); /* no-clobber list */ 116 return res; 117 } 118 119 static inline uint16_t 120 rte_atomic16_exchange(volatile uint16_t *dst, uint16_t val) 121 { 122 asm volatile( 123 MPLOCKED 124 "xchgw %0, %1;" 125 : "=r" (val), "=m" (*dst) 126 : "0" (val), "m" (*dst) 127 : "memory"); /* no-clobber list */ 128 return val; 129 } 130 131 static inline int rte_atomic16_test_and_set(rte_atomic16_t *v) 132 { 133 return rte_atomic16_cmpset((volatile uint16_t *)&v->cnt, 0, 1); 134 } 135 136 static inline void 137 rte_atomic16_inc(rte_atomic16_t *v) 138 { 139 asm volatile( 140 MPLOCKED 141 "incw %[cnt]" 142 : [cnt] "=m" (v->cnt) /* output */ 143 : "m" (v->cnt) /* input */ 144 ); 145 } 146 147 static inline void 148 rte_atomic16_dec(rte_atomic16_t *v) 149 { 150 asm volatile( 151 MPLOCKED 152 "decw %[cnt]" 153 : [cnt] "=m" (v->cnt) /* output */ 154 : "m" (v->cnt) /* input */ 155 ); 156 } 157 158 static inline int rte_atomic16_inc_and_test(rte_atomic16_t *v) 159 { 160 uint8_t ret; 161 162 asm volatile( 163 MPLOCKED 164 "incw %[cnt] ; " 165 "sete %[ret]" 166 : [cnt] "+m" (v->cnt), /* output */ 167 [ret] "=qm" (ret) 168 ); 169 return ret != 0; 170 } 171 172 static inline int rte_atomic16_dec_and_test(rte_atomic16_t *v) 173 { 174 uint8_t ret; 175 176 asm volatile(MPLOCKED 177 "decw %[cnt] ; " 178 "sete %[ret]" 179 : [cnt] "+m" (v->cnt), /* output */ 180 [ret] "=qm" (ret) 181 ); 182 return ret != 0; 183 } 184 185 /*------------------------- 32 bit atomic operations -------------------------*/ 186 187 static inline int 188 rte_atomic32_cmpset(volatile uint32_t *dst, uint32_t exp, uint32_t src) 189 { 190 uint8_t res; 191 192 asm volatile( 193 MPLOCKED 194 "cmpxchgl %[src], %[dst];" 195 "sete %[res];" 196 : [res] "=a" (res), /* output */ 197 [dst] "=m" (*dst) 198 : [src] "r" (src), /* input */ 199 "a" (exp), 200 "m" (*dst) 201 : "memory"); /* no-clobber list */ 202 return res; 203 } 204 205 static inline uint32_t 206 rte_atomic32_exchange(volatile uint32_t *dst, uint32_t val) 207 { 208 asm volatile( 209 MPLOCKED 210 "xchgl %0, %1;" 211 : "=r" (val), "=m" (*dst) 212 : "0" (val), "m" (*dst) 213 : "memory"); /* no-clobber list */ 214 return val; 215 } 216 217 static inline int rte_atomic32_test_and_set(rte_atomic32_t *v) 218 { 219 return rte_atomic32_cmpset((volatile uint32_t *)&v->cnt, 0, 1); 220 } 221 222 static inline void 223 rte_atomic32_inc(rte_atomic32_t *v) 224 { 225 asm volatile( 226 MPLOCKED 227 "incl %[cnt]" 228 : [cnt] "=m" (v->cnt) /* output */ 229 : "m" (v->cnt) /* input */ 230 ); 231 } 232 233 static inline void 234 rte_atomic32_dec(rte_atomic32_t *v) 235 { 236 asm volatile( 237 MPLOCKED 238 "decl %[cnt]" 239 : [cnt] "=m" (v->cnt) /* output */ 240 : "m" (v->cnt) /* input */ 241 ); 242 } 243 244 static inline int rte_atomic32_inc_and_test(rte_atomic32_t *v) 245 { 246 uint8_t ret; 247 248 asm volatile( 249 MPLOCKED 250 "incl %[cnt] ; " 251 "sete %[ret]" 252 : [cnt] "+m" (v->cnt), /* output */ 253 [ret] "=qm" (ret) 254 ); 255 return ret != 0; 256 } 257 258 static inline int rte_atomic32_dec_and_test(rte_atomic32_t *v) 259 { 260 uint8_t ret; 261 262 asm volatile(MPLOCKED 263 "decl %[cnt] ; " 264 "sete %[ret]" 265 : [cnt] "+m" (v->cnt), /* output */ 266 [ret] "=qm" (ret) 267 ); 268 return ret != 0; 269 } 270 #endif 271 272 #ifdef RTE_ARCH_I686 273 #include "rte_atomic_32.h" 274 #else 275 #include "rte_atomic_64.h" 276 #endif 277 278 #ifdef __cplusplus 279 } 280 #endif 281 282 #endif /* _RTE_ATOMIC_X86_H_ */ 283