/* Macros for atomic functionality for tile.
   Copyright (C) 2011-2013 Free Software Foundation, Inc.
   Contributed by Walter Lee (walt@tilera.com)

   This file is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by the
   Free Software Foundation; either version 3, or (at your option) any
   later version.

   This file is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */


/* Provides macros for common atomic functionality.  */

#ifndef _ATOMIC_H_
#define _ATOMIC_H_

#ifdef __tilegx__
/* Atomic instruction macros

   The macros provided by atomic.h simplify access to the TILE-Gx
   architecture's atomic instructions.  The architecture provides a
   variety of atomic instructions, including "exchange", "compare and
   exchange", "fetch and ADD", "fetch and AND", "fetch and OR", and
   "fetch and ADD if greater than or equal to zero".

   No barrier or fence semantics are implied by any of the atomic
   instructions for manipulating memory; you must specify the barriers
   that you wish explicitly, using the provided macros.

   Any integral 32- or 64-bit value can be used as the argument to
   these macros, such as "int", "long long", "unsigned long", etc.
   The pointers must be aligned to 4 or 8 bytes for 32- or 64-bit
   data.  The "exchange" and "compare and exchange" macros may also
   take pointer values.  We use the pseudo-type "VAL" in the
   documentation to indicate the use of an appropriate type.  */
#else
/* Atomic instruction macros

   The macros provided by atomic.h simplify access to the Tile
   architecture's atomic instructions.  Since the architecture
   supports test-and-set as its only in-silicon atomic operation, many
   of the operations provided by this header are implemented as
   fast-path calls to Linux emulation routines.

   Using the kernel for atomic operations allows userspace to take
   advantage of the kernel's existing atomic-integer support (managed
   by a distributed array of locks).  The kernel provides proper
   ordering among simultaneous atomic operations on different cores,
   and guarantees that a process cannot be context-switched partway
   through an atomic operation.  By virtue of sharing the kernel
   atomic implementation, the userspace atomic operations are
   compatible with the atomic methods provided by the kernel's
   futex() syscall API.  Note that these operations never cause Linux
   kernel scheduling, and are in fact invisible to the kernel; they
   simply act as regular function calls, but with an elevated
   privilege level.

   Note that the kernel's distributed lock array is hashed by using
   only VA bits from the atomic value's address (to avoid the
   performance hit of page table locking and multiple page-table
   lookups to get the PA) and only the VA bits that are below page
   granularity (to properly lock simultaneous accesses to the same
   page mapped at different VAs).  As a result, simultaneous atomic
   operations on values whose addresses are at the same offset on a
   page will contend in the kernel for the same lock array element.

   No barrier or fence semantics are implied by any of the atomic
   instructions for manipulating memory; you must specify the barriers
   that you wish explicitly, using the provided macros.

   Any integral 32- or 64-bit value can be used as the argument to
   these macros, such as "int", "long long", "unsigned long", etc.
   The pointers must be aligned to 4 or 8 bytes for 32- or 64-bit
   data.  The "exchange" and "compare and exchange" macros may also
   take pointer values.  We use the pseudo-type "VAL" in the
   documentation to indicate the use of an appropriate type.

   The 32-bit routines are implemented using a single kernel fast
   syscall, as is the 64-bit compare-and-exchange.  The other 64-bit
   routines are implemented by looping over the 64-bit
   compare-and-exchange routine, so they may be less efficient.  */
#endif

#ifdef __tilegx__
#include <arch/spr_def.h>
#else
#include <asm/unistd.h>
#endif


/* 32-bit integer compare-and-exchange.  */
static __inline __attribute__ ((always_inline))
int arch_atomic_val_compare_and_exchange_4 (volatile int *mem,
                                            int oldval, int newval)
{
#ifdef __tilegx__
  __insn_mtspr (SPR_CMPEXCH_VALUE, oldval);
  return __insn_cmpexch4 (mem, newval);
#else
  int result;
  __asm__ __volatile__ ("swint1":"=R00" (result),
                        "=m" (*mem):"R10" (__NR_FAST_cmpxchg), "R00" (mem),
                        "R01" (oldval), "R02" (newval), "m" (*mem):"r20",
                        "r21", "r22", "r23", "r24", "r25", "r26", "r27",
                        "r28", "r29", "memory");
  return result;
#endif
}

/* 64-bit integer compare-and-exchange.  */
static __inline __attribute__ ((always_inline))
long long arch_atomic_val_compare_and_exchange_8 (volatile long long
                                                  *mem, long long oldval,
                                                  long long newval)
{
#ifdef __tilegx__
  __insn_mtspr (SPR_CMPEXCH_VALUE, oldval);
  return __insn_cmpexch (mem, newval);
#else
  unsigned int result_lo, result_hi;
  unsigned int oldval_lo = oldval & 0xffffffffu, oldval_hi = oldval >> 32;
  unsigned int newval_lo = newval & 0xffffffffu, newval_hi = newval >> 32;
  __asm__ __volatile__ ("swint1":"=R00" (result_lo), "=R01" (result_hi),
                        "=m" (*mem):"R10" (__NR_FAST_cmpxchg64), "R00" (mem),
                        "R02" (oldval_lo), "R03" (oldval_hi),
                        "R04" (newval_lo), "R05" (newval_hi),
                        "m" (*mem):"r20", "r21", "r22", "r23", "r24", "r25",
                        "r26", "r27", "r28", "r29", "memory");
  return ((long long) result_hi) << 32 | result_lo;
#endif
}

/* This non-existent symbol is called for sizes other than "4" and "8",
   indicating a bug in the caller.  */
extern int __arch_atomic_error_bad_argument_size (void)
  __attribute__ ((warning ("sizeof atomic argument not 4 or 8")));
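
/* As an illustration of how the compare-and-exchange routines above are
   typically used (a sketch only; "example_atomic_max" is a hypothetical
   helper, not something this header defines), here is a retry loop that
   atomically raises a 32-bit value so that it is at least "val":

     static int
     example_atomic_max (volatile int *mem, int val)
     {
       int old = *mem, seen;
       while (old < val)
         {
           seen = arch_atomic_val_compare_and_exchange_4 (mem, old, val);
           if (seen == old)
             break;
           old = seen;
         }
       return old;
     }

   The exchange succeeds only if the returned value matches the value
   last read, so the loop retries whenever another core updates *mem in
   between.  The size-generic macro defined just below selects between
   the 4- and 8-byte routines based on sizeof(*(mem)).  */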

#define arch_atomic_val_compare_and_exchange(mem, o, n)                 \
  __extension__ ({                                                      \
    (__typeof(*(mem)))(__typeof(*(mem)-*(mem)))                         \
      ((sizeof(*(mem)) == 8) ?                                          \
       arch_atomic_val_compare_and_exchange_8(                          \
         (volatile long long*)(mem), (__typeof((o)-(o)))(o),            \
         (__typeof((n)-(n)))(n)) :                                      \
       (sizeof(*(mem)) == 4) ?                                          \
       arch_atomic_val_compare_and_exchange_4(                          \
         (volatile int*)(mem), (__typeof((o)-(o)))(o),                  \
         (__typeof((n)-(n)))(n)) :                                      \
       __arch_atomic_error_bad_argument_size());                        \
  })

#define arch_atomic_bool_compare_and_exchange(mem, o, n)                \
  __extension__ ({                                                      \
    __typeof(o) __o = (o);                                              \
    __builtin_expect(                                                   \
      __o == arch_atomic_val_compare_and_exchange((mem), __o, (n)), 1); \
  })


/* Loop with compare_and_exchange until we guess the correct value.
   Normally "expr" will be an expression using __old and __value.  */
#define __arch_atomic_update_cmpxchg(mem, value, expr)                  \
  __extension__ ({                                                      \
    __typeof(value) __value = (value);                                  \
    __typeof(*(mem)) *__mem = (mem), __old = *__mem, __guess;           \
    do {                                                                \
      __guess = __old;                                                  \
      __old = arch_atomic_val_compare_and_exchange(__mem, __old, (expr)); \
    } while (__builtin_expect(__old != __guess, 0));                    \
    __old;                                                              \
  })

#ifdef __tilegx__

/* Generic atomic op with 8- or 4-byte variant.
   The _mask, _addend, and _expr arguments are ignored on tilegx.  */
#define __arch_atomic_update(mem, value, op, _mask, _addend, _expr)     \
  __extension__ ({                                                      \
    ((__typeof(*(mem)))                                                 \
     ((sizeof(*(mem)) == 8) ? (__typeof(*(mem)-*(mem)))__insn_##op(     \
        (volatile void *)(mem),                                         \
        (long long)(__typeof((value)-(value)))(value)) :                \
      (sizeof(*(mem)) == 4) ? (int)__insn_##op##4(                      \
        (volatile void *)(mem),                                         \
        (int)(__typeof((value)-(value)))(value)) :                      \
      __arch_atomic_error_bad_argument_size()));                        \
  })

#else

/* This uses TILEPro's fast syscall support to atomically compute:

     int old = *ptr;
     *ptr = (old & mask) + addend;
     return old;

   This primitive can be used to implement atomic exchange, add,
   bitwise or, and bitwise and.  Only 32-bit support is provided.  */
static __inline __attribute__ ((always_inline))
int
__arch_atomic_update_4 (volatile int *mem, int mask, int addend)
{
  int result;
  __asm__ __volatile__ ("swint1":"=R00" (result),
                        "=m" (*mem):"R10" (__NR_FAST_atomic_update),
                        "R00" (mem), "R01" (mask), "R02" (addend),
                        "m" (*mem):"r20", "r21", "r22", "r23", "r24", "r25",
                        "r26", "r27", "r28", "r29", "memory");
  return result;
}
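
/* A sketch of how the (mask, addend) pair encodes the common
   read-modify-write operations on top of this primitive; these are the
   argument pairs that the arch_atomic_* macros later in this file pass
   down for 32-bit operands on TILEPro:

     exchange(new):  *ptr = (old &  0)    + new    (= new)
     add(val):       *ptr = (old & -1)    + val    (= old + val)
     and(mask):      *ptr = (old & mask)  + 0      (= old & mask)
     or(mask):       *ptr = (old & ~mask) + mask   (= old | mask)

   The "or" case works because (old & ~mask) and mask have no set bits
   in common, so the addition cannot carry and behaves as a bitwise
   or.  */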

/* Generic atomic op with 8- or 4-byte variant.
   The _op argument is ignored on tilepro.  */
#define __arch_atomic_update(mem, value, _op, mask, addend, expr)       \
  __extension__ ({                                                      \
    (__typeof(*(mem)))(__typeof(*(mem)-*(mem)))                         \
      ((sizeof(*(mem)) == 8) ?                                          \
       __arch_atomic_update_cmpxchg((mem), (value), (expr)) :           \
       (sizeof(*(mem)) == 4) ?                                          \
       __arch_atomic_update_4((volatile int*)(mem),                     \
                              (__typeof((mask)-(mask)))(mask),          \
                              (__typeof((addend)-(addend)))(addend)) :  \
       __arch_atomic_error_bad_argument_size());                        \
  })

#endif /* __tilegx__ */


#define arch_atomic_exchange(mem, newvalue)                             \
  __arch_atomic_update(mem, newvalue, exch, 0, newvalue, __value)

#define arch_atomic_add(mem, value)                                     \
  __arch_atomic_update(mem, value, fetchadd, -1, value, __old + __value)

#define arch_atomic_sub(mem, value) arch_atomic_add((mem), -(value))

#define arch_atomic_increment(mem) arch_atomic_add((mem), 1)

#define arch_atomic_decrement(mem) arch_atomic_add((mem), -1)

#define arch_atomic_and(mem, mask)                                      \
  __arch_atomic_update(mem, mask, fetchand, mask, 0, __old & __value)

#define arch_atomic_or(mem, mask)                                       \
  __arch_atomic_update(mem, mask, fetchor, ~mask, mask, __old | __value)

#define arch_atomic_xor(mem, mask)                                      \
  __arch_atomic_update_cmpxchg(mem, mask, __old ^ __value)

#define arch_atomic_nand(mem, mask)                                     \
  __arch_atomic_update_cmpxchg(mem, mask, ~(__old & __value))

#define arch_atomic_bit_set(mem, bit)                                   \
  __extension__ ({                                                      \
    __typeof(*(mem)) __mask = (__typeof(*(mem)))1 << (bit);             \
    __mask & arch_atomic_or((mem), __mask);                             \
  })

#define arch_atomic_bit_clear(mem, bit)                                 \
  __extension__ ({                                                      \
    __typeof(*(mem)) __mask = (__typeof(*(mem)))1 << (bit);             \
    __mask & arch_atomic_and((mem), ~__mask);                           \
  })

#ifdef __tilegx__
/* Atomically store a new value to memory.
   Note that you can freely use types of any size here, unlike the
   other atomic routines, which require 32- or 64-bit types.
   This accessor is provided for compatibility with TILEPro, which
   required an explicit atomic operation for stores that needed
   to be atomic with respect to other atomic methods in this header.  */
#define arch_atomic_write(mem, value) ((void) (*(mem) = (value)))
#else
#define arch_atomic_write(mem, value)                                   \
  do {                                                                  \
    __typeof(mem) __aw_mem = (mem);                                     \
    __typeof(value) __aw_val = (value);                                 \
    unsigned int *__aw_mem32, __aw_intval, __aw_val32, __aw_off, __aw_mask; \
    __aw_intval = (__typeof((value) - (value)))__aw_val;                \
    switch (sizeof(*__aw_mem)) {                                        \
    case 8:                                                             \
      __arch_atomic_update_cmpxchg(__aw_mem, __aw_val, __value);        \
      break;                                                            \
    case 4:                                                             \
      __arch_atomic_update_4((int *)__aw_mem, 0, __aw_intval);          \
      break;                                                            \
    case 2:                                                             \
      __aw_off = 8 * ((long)__aw_mem & 0x2);                            \
      __aw_mask = 0xffffU << __aw_off;                                  \
      __aw_mem32 = (unsigned int *)((long)__aw_mem & ~0x2);             \
      __aw_val32 = (__aw_intval << __aw_off) & __aw_mask;               \
      __arch_atomic_update_cmpxchg(__aw_mem32, __aw_val32,              \
                                   (__old & ~__aw_mask) | __value);     \
      break;                                                            \
    case 1:                                                             \
      __aw_off = 8 * ((long)__aw_mem & 0x3);                            \
      __aw_mask = 0xffU << __aw_off;                                    \
      __aw_mem32 = (unsigned int *)((long)__aw_mem & ~0x3);             \
      __aw_val32 = (__aw_intval << __aw_off) & __aw_mask;               \
      __arch_atomic_update_cmpxchg(__aw_mem32, __aw_val32,              \
                                   (__old & ~__aw_mask) | __value);     \
      break;                                                            \
    }                                                                   \
  } while (0)
#endif
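
/* A worked illustration of the sub-word cases above (TILEPro only),
   assuming a hypothetical 16-bit store to an address whose low two
   bits are 0x2:

     __aw_off   = 8 * 2         = 16
     __aw_mask  = 0xffff << 16  = 0xffff0000
     __aw_mem32 = the enclosing 4-byte-aligned word
     __aw_val32 = the new 16-bit value shifted into bits 16..31

   The compare-and-exchange loop then rewrites only the masked bits,
   (__old & ~__aw_mask) | __value, leaving the rest of the word
   unchanged.  */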

/* Compiler barrier.

   This macro prevents loads or stores from being moved by the compiler
   across the macro.  Any value that was loaded before this macro must
   then be reloaded by the compiler.  */
#define arch_atomic_compiler_barrier() __asm__ __volatile__("" ::: "memory")

/* Full memory barrier.

   This macro has the semantics of arch_atomic_compiler_barrier(), but
   also ensures that previous stores are visible to other cores, and
   that all previously loaded values have been placed into their target
   registers on this core.  */
#define arch_atomic_full_barrier() __insn_mf()

/* Read memory barrier.

   Ensure that all reads by this processor that occurred prior to the
   read memory barrier have completed, and that no reads that occur
   after the read memory barrier on this processor are initiated
   before the barrier.

   On current TILE chips a read barrier is implemented as a full barrier,
   but this may not be true in later versions of the architecture.

   See also arch_atomic_acquire_barrier() for the appropriate idiom to use
   to ensure no reads are lifted above an atomic lock instruction.  */
#define arch_atomic_read_barrier() arch_atomic_full_barrier()

/* Write memory barrier.

   Ensure that all writes by this processor that occurred prior to the
   write memory barrier have completed, and that no writes that occur
   after the write memory barrier on this processor are initiated
   before the barrier.

   On current TILE chips a write barrier is implemented as a full barrier,
   but this may not be true in later versions of the architecture.

   See also arch_atomic_release_barrier() for the appropriate idiom to use
   to ensure all writes are complete prior to an atomic unlock instruction.  */
#define arch_atomic_write_barrier() arch_atomic_full_barrier()

/* Lock acquisition barrier.

   Ensure that no load operations that follow this macro in the
   program can issue prior to the barrier.  Without such a barrier,
   the compiler can reorder them to issue earlier, or the hardware can
   issue them speculatively.  The latter is not currently done in the
   Tile microarchitecture, but using this operation improves
   portability to future implementations.

   This operation is intended to be used as part of the "acquire"
   path for locking, that is, when entering a critical section.
   This should be done after the atomic operation that actually
   acquires the lock, and in conjunction with a "control dependency"
   that checks the atomic operation result to see if the lock was
   in fact acquired.  See the arch_atomic_read_barrier() macro
   for a heavier-weight barrier to use in certain unusual constructs,
   or arch_atomic_acquire_barrier_value() if no control dependency exists.  */
#define arch_atomic_acquire_barrier() arch_atomic_compiler_barrier()

/* Lock release barrier.

   Ensure that no store operations that precede this macro in the
   program complete subsequent to the barrier.  Without such a
   barrier, the compiler can reorder stores to issue later, or stores
   can still be outstanding in the memory network.

   This operation is intended to be used as part of the "release" path
   for locking, that is, when leaving a critical section.  This should
   be done before the operation (such as a store of zero) that
   actually releases the lock.  */
#define arch_atomic_release_barrier() arch_atomic_write_barrier()
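
/* An illustrative sketch of the acquire/release idiom described above,
   using a hypothetical lock word and helpers ("example_lock",
   "example_spin_lock", and "example_spin_unlock" are made-up names,
   not part of this header):

     static volatile int example_lock;

     static void
     example_spin_lock (void)
     {
       while (arch_atomic_exchange (&example_lock, 1) != 0)
         ;
       arch_atomic_acquire_barrier ();
     }

     static void
     example_spin_unlock (void)
     {
       arch_atomic_release_barrier ();
       arch_atomic_write (&example_lock, 0);
     }

   The "!= 0" test on the value returned by the exchange provides the
   control dependency; the acquire barrier then keeps loads in the
   critical section from issuing before the lock is seen to be held,
   and the release barrier ensures all stores in the critical section
   have completed before the lock word is cleared.  */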

/* Barrier until the read of a particular value is complete.

   This is occasionally useful when constructing certain locking
   scenarios.  For example, you might write a routine that issues an
   atomic instruction to enter a critical section, then reads one or
   more values within the critical section without checking to see if
   the critical section was in fact acquired, and only later checks
   the atomic instruction result to see if the lock was acquired.  If
   so, the routine could properly release the lock and know that the
   values that were read were valid.

   In this scenario, it is required to wait for the result of the
   atomic instruction, even if the value itself is not checked.  This
   guarantees that if the atomic instruction succeeded in taking the
   lock, the lock was held before any reads in the critical section
   issued.  */
#define arch_atomic_acquire_barrier_value(val) \
  __asm__ __volatile__("move %0, %0" :: "r"(val))

/* Access the given variable in memory exactly once.

   In some contexts, an algorithm may need to force access to memory,
   since otherwise the compiler may think it can optimize away a
   memory load or store; for example, in a loop when polling memory to
   see if another cpu has updated it yet.  Generally this is only
   required for certain very carefully hand-tuned algorithms; using it
   unnecessarily may result in performance losses.

   A related use of this macro is to ensure that the compiler does not
   rematerialize the value of "x" by reloading it from memory
   unexpectedly; the "volatile" marking will prevent the compiler from
   being able to rematerialize.  This is helpful if an algorithm needs
   to read a variable without locking, but needs it to have the same
   value if it ends up being used several times within the algorithm.

   Note that multiple uses of this macro are guaranteed to be ordered,
   i.e. the compiler will not reorder stores or loads that are wrapped
   in arch_atomic_access_once().  */
#define arch_atomic_access_once(x) (*(volatile __typeof(x) *)&(x))


#endif /* !_ATOMIC_H_ */
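
/* A usage sketch for arch_atomic_access_once() ("example_wait_for_flag"
   is a hypothetical helper, not part of this header): polling a shared
   flag so the compiler cannot hoist the load out of the loop, then
   issuing a read barrier before consuming data published by the other
   core:

     static void
     example_wait_for_flag (int *flag)
     {
       while (arch_atomic_access_once (*flag) == 0)
         ;
       arch_atomic_read_barrier ();
     }

   The read barrier orders the flag read ahead of any subsequent reads
   of the data that the flag publishes.  */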