/* Macros for atomic functionality for tile.
   Copyright (C) 2011-2013 Free Software Foundation, Inc.
   Contributed by Walter Lee (walt@tilera.com)

   This file is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by the
   Free Software Foundation; either version 3, or (at your option) any
   later version.

   This file is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */


/* Provides macros for common atomic functionality.  */

#ifndef _ATOMIC_H_
#define _ATOMIC_H_

#ifdef __tilegx__
/* Atomic instruction macros

   The macros provided by atomic.h simplify access to the TILE-Gx
   architecture's atomic instructions.  The architecture provides a
   variety of atomic instructions, including "exchange", "compare and
   exchange", "fetch and ADD", "fetch and AND", "fetch and OR", and
   "fetch and ADD if greater than or equal to zero".

   No barrier or fence semantics are implied by any of the atomic
   instructions for manipulating memory; you must specify the barriers
   that you wish explicitly, using the provided macros.

   Any integral 32- or 64-bit value can be used as the argument
   to these macros, such as "int", "long long", "unsigned long", etc.
   The pointers must be aligned to 4 or 8 bytes for 32- or 64-bit data.
   The "exchange" and "compare and exchange" macros may also take
   pointer values.  We use the pseudo-type "VAL" in the documentation
   to indicate the use of an appropriate type.  */
#else
/* Atomic instruction macros

   The macros provided by atomic.h simplify access to the Tile
   architecture's atomic instructions.  Since the architecture
   supports test-and-set as its only in-silicon atomic operation, many
   of the operations provided by this header are implemented as
   fast-path calls to Linux emulation routines.

   Using the kernel for atomic operations allows userspace to take
   advantage of the kernel's existing atomic-integer support (managed
   by a distributed array of locks).  The kernel provides proper
   ordering among simultaneous atomic operations on different cores,
   and guarantees a process cannot be context-switched partway
   through an atomic operation.  By virtue of sharing the kernel
   atomic implementation, the userspace atomic operations
   are compatible with the atomic methods provided by the kernel's
   futex() syscall API.  Note that these operations never cause Linux
   kernel scheduling, and are in fact invisible to the kernel; they
   simply act as regular function calls but with an elevated privilege
   level.  Note that the kernel's distributed lock array is hashed by
   using only VA bits from the atomic value's address (to avoid the
   performance hit of page table locking and multiple page-table
   lookups to get the PA) and only the VA bits that are below page
   granularity (to properly lock simultaneous accesses to the same
   page mapped at different VAs).  As a result, simultaneous atomic
   operations on values whose addresses are at the same offset on a
   page will contend in the kernel for the same lock array element.

   No barrier or fence semantics are implied by any of the atomic
   instructions for manipulating memory; you must specify the barriers
   that you wish explicitly, using the provided macros.

   Any integral 32- or 64-bit value can be used as the argument
   to these macros, such as "int", "long long", "unsigned long", etc.
   The pointers must be aligned to 4 or 8 bytes for 32- or 64-bit data.
   The "exchange" and "compare and exchange" macros may also take
   pointer values.  We use the pseudo-type "VAL" in the documentation
   to indicate the use of an appropriate type.

   The 32-bit routines are implemented using a single kernel fast
   syscall, as is the 64-bit compare-and-exchange.  The other 64-bit
   routines are implemented by looping over the 64-bit
   compare-and-exchange routine, so may be less efficient.  */
#endif

#ifdef __tilegx__
#include <arch/spr_def.h>
#else
#include <asm/unistd.h>
#endif


/* 32-bit integer compare-and-exchange.  */
static __inline __attribute__ ((always_inline))
     int arch_atomic_val_compare_and_exchange_4 (volatile int *mem,
						 int oldval, int newval)
{
#ifdef __tilegx__
  __insn_mtspr (SPR_CMPEXCH_VALUE, oldval);
  return __insn_cmpexch4 (mem, newval);
#else
  int result;
  /* Fast syscall to the kernel's cmpxchg routine: the syscall number
     goes in r10, the pointer and operands in r0-r2, and the old value
     comes back in r0; r20-r29 are clobbered.  */
  __asm__ __volatile__ ("swint1":"=R00" (result),
			"=m" (*mem):"R10" (__NR_FAST_cmpxchg), "R00" (mem),
			"R01" (oldval), "R02" (newval), "m" (*mem):"r20",
			"r21", "r22", "r23", "r24", "r25", "r26", "r27",
			"r28", "r29", "memory");
  return result;
#endif
}

/* 64-bit integer compare-and-exchange.  */
static __inline __attribute__ ((always_inline))
     long long arch_atomic_val_compare_and_exchange_8 (volatile long long
						       *mem, long long oldval,
						       long long newval)
{
#ifdef __tilegx__
  __insn_mtspr (SPR_CMPEXCH_VALUE, oldval);
  return __insn_cmpexch (mem, newval);
#else
  unsigned int result_lo, result_hi;
  unsigned int oldval_lo = oldval & 0xffffffffu, oldval_hi = oldval >> 32;
  unsigned int newval_lo = newval & 0xffffffffu, newval_hi = newval >> 32;
  /* The 64-bit operands are split into low/high register pairs for the
     fast syscall, and the 64-bit result is reassembled from r0/r1.  */
  __asm__ __volatile__ ("swint1":"=R00" (result_lo), "=R01" (result_hi),
			"=m" (*mem):"R10" (__NR_FAST_cmpxchg64), "R00" (mem),
			"R02" (oldval_lo), "R03" (oldval_hi),
			"R04" (newval_lo), "R05" (newval_hi),
			"m" (*mem):"r20", "r21", "r22", "r23", "r24", "r25",
			"r26", "r27", "r28", "r29", "memory");
  return ((long long) result_hi) << 32 | result_lo;
#endif
}

/* This non-existent symbol is called for sizes other than "4" and "8",
   indicating a bug in the caller.  */
extern int __arch_atomic_error_bad_argument_size (void)
  __attribute__ ((warning ("sizeof atomic argument not 4 or 8")));


#define arch_atomic_val_compare_and_exchange(mem, o, n)                 \
  __extension__ ({                                                      \
    (__typeof(*(mem)))(__typeof(*(mem)-*(mem)))                         \
      ((sizeof(*(mem)) == 8) ?                                          \
       arch_atomic_val_compare_and_exchange_8(                          \
         (volatile long long*)(mem), (__typeof((o)-(o)))(o),            \
         (__typeof((n)-(n)))(n)) :                                      \
       (sizeof(*(mem)) == 4) ?                                          \
       arch_atomic_val_compare_and_exchange_4(                          \
         (volatile int*)(mem), (__typeof((o)-(o)))(o),                  \
         (__typeof((n)-(n)))(n)) :                                      \
       __arch_atomic_error_bad_argument_size());                        \
  })
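
/* A minimal illustrative sketch, not part of the original header: a
   hypothetical helper (__example_counter_add is an invented name)
   showing the usual retry loop around
   arch_atomic_val_compare_and_exchange(): keep re-guessing the old
   value until no other core has intervened, then return it.  */
static __inline __attribute__ ((always_inline))
     int __example_counter_add (volatile int *counter, int delta)
{
  int old, seen;
  seen = *counter;
  do
    {
      old = seen;	/* Guess the current value (hypothetical example).  */
      seen = arch_atomic_val_compare_and_exchange (counter, old, old + delta);
    }
  while (seen != old);	/* Another core won the race; retry.  */
  return old;
}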

#define arch_atomic_bool_compare_and_exchange(mem, o, n)                \
  __extension__ ({                                                      \
    __typeof(o) __o = (o);                                              \
    __builtin_expect(                                                   \
      __o == arch_atomic_val_compare_and_exchange((mem), __o, (n)), 1); \
  })
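
/* A minimal illustrative sketch, not part of the original header: a
   hypothetical try-lock (__example_trylock is an invented name) built on
   arch_atomic_bool_compare_and_exchange(), treating 0 as "unlocked" and
   1 as "locked".  Returns nonzero if this caller took the lock.  */
static __inline __attribute__ ((always_inline))
     int __example_trylock (volatile int *lock)
{
  /* Hypothetical example; succeeds only if the lock word was still 0.  */
  return arch_atomic_bool_compare_and_exchange (lock, 0, 1);
}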


/* Loop with compare_and_exchange until we guess the correct value.
   Normally "expr" will be an expression using __old and __value.  */
#define __arch_atomic_update_cmpxchg(mem, value, expr)                  \
  __extension__ ({                                                      \
    __typeof(value) __value = (value);                                  \
    __typeof(*(mem)) *__mem = (mem), __old = *__mem, __guess;           \
    do {                                                                \
      __guess = __old;                                                  \
      __old = arch_atomic_val_compare_and_exchange(__mem, __old, (expr));    \
    } while (__builtin_expect(__old != __guess, 0));                    \
    __old;                                                              \
  })
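
/* A minimal illustrative sketch, not part of the original header: a
   hypothetical atomic maximum (__example_atomic_max is an invented name)
   showing how the "expr" argument of __arch_atomic_update_cmpxchg()
   combines the loop variables __old and __value.  */
static __inline __attribute__ ((always_inline))
     int __example_atomic_max (volatile int *mem, int value)
{
  /* Hypothetical example: store max(*mem, value), return the prior value.  */
  return __arch_atomic_update_cmpxchg (mem, value,
				       __old > __value ? __old : __value);
}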

#ifdef __tilegx__

/* Generic atomic op with 8- or 4-byte variant.
   The _mask, _addend, and _expr arguments are ignored on tilegx.  */
#define __arch_atomic_update(mem, value, op, _mask, _addend, _expr)     \
  __extension__ ({                                                      \
    ((__typeof(*(mem)))                                                 \
     ((sizeof(*(mem)) == 8) ? (__typeof(*(mem)-*(mem)))__insn_##op(     \
        (volatile void *)(mem),                                         \
        (long long)(__typeof((value)-(value)))(value)) :                \
      (sizeof(*(mem)) == 4) ? (int)__insn_##op##4(                      \
        (volatile void *)(mem),                                         \
        (int)(__typeof((value)-(value)))(value)) :                      \
      __arch_atomic_error_bad_argument_size()));                        \
  })

#else

/* This uses TILEPro's fast syscall support to atomically compute:

   int old = *ptr;
   *ptr = (old & mask) + addend;
   return old;

   This primitive can be used for atomic exchange, add, or, and.
   Only 32-bit support is provided.  */
static __inline __attribute__ ((always_inline))
     int
     __arch_atomic_update_4 (volatile int *mem, int mask, int addend)
{
  int result;
  __asm__ __volatile__ ("swint1":"=R00" (result),
			"=m" (*mem):"R10" (__NR_FAST_atomic_update),
			"R00" (mem), "R01" (mask), "R02" (addend),
			"m" (*mem):"r20", "r21", "r22", "r23", "r24", "r25",
			"r26", "r27", "r28", "r29", "memory");
  return result;
}
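
/* A minimal illustrative sketch, not part of the original header:
   hypothetical wrappers (the __example_* names are invented) showing how
   the (mask, addend) pair of __arch_atomic_update_4() encodes the basic
   operations used by the generic macros below:
     exchange v:  (old & 0)  + v  ==  v
     add v:       (old & -1) + v  ==  old + v
     and m:       (old & m)  + 0  ==  old & m
     or m:        (old & ~m) + m  ==  old | m  (no carries occur, since
                  the two addends have no one bits in common).  */
static __inline __attribute__ ((always_inline))
     int __example_exchange_4 (volatile int *mem, int newvalue)
{
  return __arch_atomic_update_4 (mem, 0, newvalue);
}

static __inline __attribute__ ((always_inline))
     int __example_fetch_or_4 (volatile int *mem, int bits)
{
  return __arch_atomic_update_4 (mem, ~bits, bits);
}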

/* Generic atomic op with 8- or 4-byte variant.
   The _op argument is ignored on tilepro.  */
#define __arch_atomic_update(mem, value, _op, mask, addend, expr)       \
  __extension__ ({                                                      \
    (__typeof(*(mem)))(__typeof(*(mem)-*(mem)))                         \
      ((sizeof(*(mem)) == 8) ?                                          \
       __arch_atomic_update_cmpxchg((mem), (value), (expr)) :           \
       (sizeof(*(mem)) == 4) ?                                          \
       __arch_atomic_update_4((volatile int*)(mem),                     \
                              (__typeof((mask)-(mask)))(mask),          \
                              (__typeof((addend)-(addend)))(addend)) :  \
       __arch_atomic_error_bad_argument_size());                        \
  })

#endif /* __tilegx__ */


#define arch_atomic_exchange(mem, newvalue) \
  __arch_atomic_update(mem, newvalue, exch, 0, newvalue, __value)

#define arch_atomic_add(mem, value) \
  __arch_atomic_update(mem, value, fetchadd, -1, value, __old + __value)

#define arch_atomic_sub(mem, value) arch_atomic_add((mem), -(value))

#define arch_atomic_increment(mem) arch_atomic_add((mem), 1)

#define arch_atomic_decrement(mem) arch_atomic_add((mem), -1)

#define arch_atomic_and(mem, mask) \
  __arch_atomic_update(mem, mask, fetchand, mask, 0, __old & __value)

#define arch_atomic_or(mem, mask) \
  __arch_atomic_update(mem, mask, fetchor, ~mask, mask, __old | __value)

#define arch_atomic_xor(mem, mask) \
  __arch_atomic_update_cmpxchg(mem, mask, __old ^ __value)

#define arch_atomic_nand(mem, mask) \
  __arch_atomic_update_cmpxchg(mem, mask, ~(__old & __value))
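
/* A minimal illustrative sketch, not part of the original header: a
   hypothetical reference-count release (__example_refcount_release is an
   invented name).  The fetch-style macros above return the value the
   memory held before the update, so the count reaches zero exactly when
   the returned (old) value was 1.  */
static __inline __attribute__ ((always_inline))
     int __example_refcount_release (volatile int *refcount)
{
  /* Hypothetical example: nonzero means the last reference just went away.  */
  return arch_atomic_add (refcount, -1) == 1;
}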

#define arch_atomic_bit_set(mem, bit)                                   \
  __extension__ ({                                                      \
    __typeof(*(mem)) __mask = (__typeof(*(mem)))1 << (bit);             \
    __mask & arch_atomic_or((mem), __mask);                             \
  })

#define arch_atomic_bit_clear(mem, bit)                                 \
  __extension__ ({                                                      \
    __typeof(*(mem)) __mask = (__typeof(*(mem)))1 << (bit);             \
    __mask & arch_atomic_and((mem), ~__mask);                           \
  })
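
/* A minimal illustrative sketch, not part of the original header: a
   hypothetical bit "claim" (__example_claim_bit is an invented name).
   arch_atomic_bit_set() returns the old state of the bit, so a zero
   result means this caller is the one that set it.  */
static __inline __attribute__ ((always_inline))
     int __example_claim_bit (volatile unsigned int *flags, int bit)
{
  return arch_atomic_bit_set (flags, bit) == 0;
}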

#ifdef __tilegx__
/* Atomically store a new value to memory.
   Note that you can freely use types of any size here, unlike the
   other atomic routines, which require 32- or 64-bit types.
   This accessor is provided for compatibility with TILEPro, which
   required an explicit atomic operation for stores that needed
   to be atomic with respect to other atomic methods in this header.  */
#define arch_atomic_write(mem, value) ((void) (*(mem) = (value)))
#else
#define arch_atomic_write(mem, value)                                   \
  do {                                                                  \
    __typeof(mem) __aw_mem = (mem);                                     \
    __typeof(value) __aw_val = (value);                                 \
    unsigned int *__aw_mem32, __aw_intval, __aw_val32, __aw_off, __aw_mask; \
    __aw_intval = (__typeof((value) - (value)))__aw_val;                \
    switch (sizeof(*__aw_mem)) {                                        \
    case 8:                                                             \
      __arch_atomic_update_cmpxchg(__aw_mem, __aw_val, __value);        \
      break;                                                            \
    case 4:                                                             \
      __arch_atomic_update_4((int *)__aw_mem, 0, __aw_intval);          \
      break;                                                            \
    case 2:                                                             \
      /* Read-modify-write the enclosing aligned 32-bit word,           \
         replacing only the 16 bits being stored.  */                   \
      __aw_off = 8 * ((long)__aw_mem & 0x2);                            \
      __aw_mask = 0xffffU << __aw_off;                                  \
      __aw_mem32 = (unsigned int *)((long)__aw_mem & ~0x2);             \
      __aw_val32 = (__aw_intval << __aw_off) & __aw_mask;               \
      __arch_atomic_update_cmpxchg(__aw_mem32, __aw_val32,              \
                                   (__old & ~__aw_mask) | __value);     \
      break;                                                            \
    case 1:                                                             \
      /* Likewise, but for the 8 bits of a byte store.  */              \
      __aw_off = 8 * ((long)__aw_mem & 0x3);                            \
      __aw_mask = 0xffU << __aw_off;                                    \
      __aw_mem32 = (unsigned int *)((long)__aw_mem & ~0x3);             \
      __aw_val32 = (__aw_intval << __aw_off) & __aw_mask;               \
      __arch_atomic_update_cmpxchg(__aw_mem32, __aw_val32,              \
                                   (__old & ~__aw_mask) | __value);     \
      break;                                                            \
    }                                                                   \
  } while (0)
#endif

/* Compiler barrier.

   This macro prevents loads or stores from being moved by the compiler
   across the macro.  Any value that was loaded before this macro
   must then be reloaded by the compiler.  */
#define arch_atomic_compiler_barrier() __asm__ __volatile__("" ::: "memory")

/* Full memory barrier.

   This macro has the semantics of arch_atomic_compiler_barrier(), but
   also ensures that previous stores are visible to other cores, and
   that all previously loaded values have been placed into their target
   registers on this core.  */
#define arch_atomic_full_barrier() __insn_mf()

/* Read memory barrier.

   Ensure that all reads by this processor that occurred prior to the
   read memory barrier have completed, and that no reads that occur
   after the read memory barrier on this processor are initiated
   before the barrier.

   On current TILE chips a read barrier is implemented as a full barrier,
   but this may not be true in later versions of the architecture.

   See also arch_atomic_acquire_barrier() for the appropriate idiom to use
   to ensure no reads are lifted above an atomic lock instruction.  */
#define arch_atomic_read_barrier() arch_atomic_full_barrier()

/* Write memory barrier.

   Ensure that all writes by this processor that occurred prior to the
   write memory barrier have completed, and that no writes that occur
   after the write memory barrier on this processor are initiated
   before the barrier.

   On current TILE chips a write barrier is implemented as a full barrier,
   but this may not be true in later versions of the architecture.

   See also arch_atomic_release_barrier() for the appropriate idiom to use
   to ensure all writes are complete prior to an atomic unlock instruction.  */
#define arch_atomic_write_barrier() arch_atomic_full_barrier()

/* Lock acquisition barrier.

   Ensure that no load operations that follow this macro in the
   program can issue prior to the barrier.  Without such a barrier,
   the compiler can reorder them to issue earlier, or the hardware can
   issue them speculatively.  The latter is not currently done in the
   Tile microarchitecture, but using this operation improves
   portability to future implementations.

   This operation is intended to be used as part of the "acquire"
   path for locking, that is, when entering a critical section.
   This should be done after the atomic operation that actually
   acquires the lock, and in conjunction with a "control dependency"
   that checks the atomic operation result to see if the lock was
   in fact acquired.  See the arch_atomic_read_barrier() macro
   for a heavier-weight barrier to use in certain unusual constructs,
   or arch_atomic_acquire_barrier_value() if no control dependency exists.  */
#define arch_atomic_acquire_barrier() arch_atomic_compiler_barrier()
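
/* A minimal illustrative sketch, not part of the original header: a
   hypothetical spin-lock acquire (__example_spin_lock is an invented
   name) following the idiom described above: the atomic operation that
   takes the lock, a control dependency on its result, and then
   arch_atomic_acquire_barrier() before entering the critical section.  */
static __inline __attribute__ ((always_inline))
     void __example_spin_lock (volatile int *lock)
{
  /* Hypothetical example; spin until the 0 -> 1 transition succeeds.  */
  while (!arch_atomic_bool_compare_and_exchange (lock, 0, 1))
    ;
  arch_atomic_acquire_barrier ();
}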

/* Lock release barrier.

   Ensure that no store operations that precede this macro in the
   program complete subsequent to the barrier.  Without such a
   barrier, the compiler can reorder stores to issue later, or stores
   can still be outstanding in the memory network.

   This operation is intended to be used as part of the "release" path
   for locking, that is, when leaving a critical section.  This should
   be done before the operation (such as a store of zero) that
   actually releases the lock.  */
#define arch_atomic_release_barrier() arch_atomic_write_barrier()
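
/* A minimal illustrative sketch, not part of the original header: the
   matching hypothetical release (__example_spin_unlock is an invented
   name): drain the critical section's stores with
   arch_atomic_release_barrier(), then store zero to drop the lock.  */
static __inline __attribute__ ((always_inline))
     void __example_spin_unlock (volatile int *lock)
{
  arch_atomic_release_barrier ();
  arch_atomic_write (lock, 0);
}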

/* Barrier until the read of a particular value is complete.

   This is occasionally useful when constructing certain locking
   scenarios.  For example, you might write a routine that issues an
   atomic instruction to enter a critical section, then reads one or
   more values within the critical section without checking to see if
   the critical section was in fact acquired, and only later checks
   the atomic instruction result to see if the lock was acquired.  If
   so, the routine can properly release the lock and know that the
   values that were read were valid.

   In this scenario, it is required to wait for the result of the
   atomic instruction, even if the value itself is not checked.  This
   guarantees that if the atomic instruction succeeded in taking the lock,
   the lock was held before any reads in the critical section issued.  */
#define arch_atomic_acquire_barrier_value(val) \
  __asm__ __volatile__("move %0, %0" :: "r"(val))
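
/* A minimal illustrative sketch, not part of the original header: a
   hypothetical reader (__example_try_read is an invented name) for the
   scenario described above.  The lock word is exchanged, the protected
   value is read before the exchange result is examined, and
   arch_atomic_acquire_barrier_value() keeps the read from issuing until
   the exchange has actually completed.  */
static __inline __attribute__ ((always_inline))
     int __example_try_read (volatile int *lock, volatile int *data,
			     int *out)
{
  int got, v;
  got = arch_atomic_exchange (lock, 1);
  arch_atomic_acquire_barrier_value (got);
  v = *data;		/* Read before checking whether we got the lock.  */
  if (got != 0)
    return 0;		/* Lock was already held; discard the read.  */
  *out = v;
  arch_atomic_release_barrier ();
  arch_atomic_write (lock, 0);
  return 1;
}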

/* Access the given variable in memory exactly once.

   In some contexts, an algorithm may need to force access to memory,
   since otherwise the compiler may think it can optimize away a
   memory load or store; for example, in a loop when polling memory to
   see if another cpu has updated it yet.  Generally this is only
   required for certain very carefully hand-tuned algorithms; using it
   unnecessarily may result in performance losses.

   A related use of this macro is to ensure that the compiler does not
   rematerialize the value of "x" by reloading it from memory
   unexpectedly; the "volatile" marking will prevent the compiler from
   being able to rematerialize.  This is helpful if an algorithm needs
   to read a variable without locking, but needs it to have the same
   value if it ends up being used several times within the algorithm.

   Note that multiple uses of this macro are guaranteed to be ordered,
   i.e. the compiler will not reorder stores or loads that are wrapped
   in arch_atomic_access_once().  */
#define arch_atomic_access_once(x) (*(volatile __typeof(x) *)&(x))
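
/* A minimal illustrative sketch, not part of the original header: a
   hypothetical polling loop (__example_wait_for_flag is an invented
   name) using arch_atomic_access_once() so that every iteration really
   reloads the flag from memory.  */
static __inline __attribute__ ((always_inline))
     void __example_wait_for_flag (int *flag)
{
  /* Hypothetical example: spin until another core sets *flag.  */
  while (arch_atomic_access_once (*flag) == 0)
    ;
}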



#endif /* !_ATOMIC_H_ */