; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s

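; The atomicrmw operates on a plain flat pointer, so no address space can be
; inferred and the operation is expected to stay a flat_atomic_add_f64.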
define protected amdgpu_kernel void @InferNothing(i32 %a, ptr %b, double %c) {
; CHECK-LABEL: InferNothing:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_load_dword s6, s[4:5], 0x24
; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_ashr_i32 s7, s6, 31
; CHECK-NEXT:    v_mov_b32_e32 v0, s2
; CHECK-NEXT:    v_mov_b32_e32 v1, s3
; CHECK-NEXT:    s_lshl_b64 s[2:3], s[6:7], 3
; CHECK-NEXT:    s_add_u32 s0, s2, s0
; CHECK-NEXT:    s_addc_u32 s1, s3, s1
; CHECK-NEXT:    v_mov_b32_e32 v3, s1
; CHECK-NEXT:    v_add_co_u32_e64 v2, vcc, -8, s0
; CHECK-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
; CHECK-NEXT:    flat_atomic_add_f64 v[2:3], v[0:1]
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    buffer_wbinvl1_vol
; CHECK-NEXT:    s_endpgm
entry:
  %i = add nsw i32 %a, -1
  %i.2 = sext i32 %i to i64
  %i.3 = getelementptr inbounds double, ptr %b, i64 %i.2
  %i.4 = atomicrmw fadd ptr %i.3, double %c syncscope("agent") seq_cst, align 8, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
  ret void
}

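; The flat pointer reaching the atomicrmw is an addrspacecast of a global
; (addrspace(1)) pointer, so the fadd is expected to select global_atomic_add_f64.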
define protected amdgpu_kernel void @InferFadd(i32 %a, ptr addrspace(1) %b, double %c) {
; CHECK-LABEL: InferFadd:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_mov_b64 s[0:1], exec
; CHECK-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
; CHECK-NEXT:    v_mbcnt_hi_u32_b32 v0, s1, v0
; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; CHECK-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; CHECK-NEXT:    s_cbranch_execz .LBB1_2
; CHECK-NEXT:  ; %bb.1:
; CHECK-NEXT:    s_load_dword s2, s[4:5], 0x24
; CHECK-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x2c
; CHECK-NEXT:    v_mov_b32_e32 v2, 0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_ashr_i32 s3, s2, 31
; CHECK-NEXT:    s_lshl_b64 s[2:3], s[2:3], 3
; CHECK-NEXT:    s_add_u32 s2, s8, s2
; CHECK-NEXT:    s_addc_u32 s3, s9, s3
; CHECK-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
; CHECK-NEXT:    v_cvt_f64_u32_e32 v[0:1], s0
; CHECK-NEXT:    v_mul_f64 v[0:1], s[10:11], v[0:1]
; CHECK-NEXT:    global_atomic_add_f64 v2, v[0:1], s[2:3] offset:-8
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    buffer_wbinvl1_vol
; CHECK-NEXT:  .LBB1_2:
; CHECK-NEXT:    s_endpgm
entry:
  %i = add nsw i32 %a, -1
  %i.2 = sext i32 %i to i64
  %i.3 = getelementptr inbounds double, ptr addrspace(1) %b, i64 %i.2
  %i.4 = addrspacecast ptr addrspace(1) %i.3 to ptr
  %0 = atomicrmw fadd ptr %i.4, double %c syncscope("agent") seq_cst, align 8, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
  ret void
}

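; Mixed case: the first atomicrmw uses the flat pointer %d and stays a
; flat_atomic_add_f64, while the second traces back through ptrtoint/inttoptr
; to the addrspace(1) argument and is expected to become global_atomic_add_f64.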
define protected amdgpu_kernel void @InferMixed(i32 %a, ptr addrspace(1) %b, double %c, ptr %d) {
; CHECK-LABEL: InferMixed:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x3c
; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
; CHECK-NEXT:    s_mov_b64 s[6:7], exec
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
; CHECK-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; CHECK-NEXT:    flat_atomic_add_f64 v[0:1], v[2:3]
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    buffer_wbinvl1_vol
; CHECK-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
; CHECK-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; CHECK-NEXT:    s_and_saveexec_b64 s[8:9], vcc
; CHECK-NEXT:    s_cbranch_execz .LBB2_2
; CHECK-NEXT:  ; %bb.1:
; CHECK-NEXT:    s_load_dword s4, s[4:5], 0x24
; CHECK-NEXT:    v_mov_b32_e32 v2, 0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_ashr_i32 s5, s4, 31
; CHECK-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
; CHECK-NEXT:    s_add_u32 s0, s0, s4
; CHECK-NEXT:    s_addc_u32 s1, s1, s5
; CHECK-NEXT:    s_bcnt1_i32_b64 s4, s[6:7]
; CHECK-NEXT:    v_cvt_f64_u32_e32 v[0:1], s4
; CHECK-NEXT:    v_mul_f64 v[0:1], s[2:3], v[0:1]
; CHECK-NEXT:    global_atomic_add_f64 v2, v[0:1], s[0:1] offset:-7
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    buffer_wbinvl1_vol
; CHECK-NEXT:  .LBB2_2:
; CHECK-NEXT:    s_endpgm
entry:
  %i = add nsw i32 %a, -1
  %i.2 = sext i32 %i to i64
  %i.3 = getelementptr inbounds double, ptr addrspace(1) %b, i64 %i.2
  br label %bb1

bb1:                                              ; preds = %entry
  %i.7 = ptrtoint ptr addrspace(1) %i.3 to i64
  %i.8 = add nsw i64 %i.7, 1
  %i.9 = inttoptr i64 %i.8 to ptr addrspace(1)
  %0 = atomicrmw fadd ptr %d, double %c syncscope("agent") seq_cst, align 8, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
  %i.11 = addrspacecast ptr addrspace(1) %i.9 to ptr
  %1 = atomicrmw fadd ptr %i.11, double %c syncscope("agent") seq_cst, align 8, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
  ret void
}

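; The pointer feeding the atomicrmw flows through a PHI over addrspace(1)
; pointers in a loop; the atomic is still expected to lower to global_atomic_add_f64.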
define protected amdgpu_kernel void @InferPHI(i32 %a, ptr addrspace(1) %b, double %c) {
; CHECK-LABEL: InferPHI:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_load_dword s6, s[4:5], 0x24
; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_ashr_i32 s7, s6, 31
; CHECK-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
; CHECK-NEXT:    s_add_u32 s0, s0, s4
; CHECK-NEXT:    s_addc_u32 s1, s1, s5
; CHECK-NEXT:    s_add_u32 s4, s0, -8
; CHECK-NEXT:    s_addc_u32 s5, s1, -1
; CHECK-NEXT:    s_cmp_eq_u64 s[0:1], 9
; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT:    v_cmp_ne_u32_e64 s[0:1], 1, v0
; CHECK-NEXT:  .LBB3_1: ; %bb0
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    s_and_b64 vcc, exec, s[0:1]
; CHECK-NEXT:    s_cbranch_vccnz .LBB3_1
; CHECK-NEXT:  ; %bb.2: ; %bb1
; CHECK-NEXT:    s_mov_b64 s[0:1], exec
; CHECK-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
; CHECK-NEXT:    v_mbcnt_hi_u32_b32 v0, s1, v0
; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; CHECK-NEXT:    s_and_saveexec_b64 s[6:7], vcc
; CHECK-NEXT:    s_cbranch_execz .LBB3_4
; CHECK-NEXT:  ; %bb.3:
; CHECK-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
; CHECK-NEXT:    v_cvt_f64_u32_e32 v[0:1], s0
; CHECK-NEXT:    v_mul_f64 v[0:1], s[2:3], v[0:1]
; CHECK-NEXT:    v_mov_b32_e32 v2, 0
; CHECK-NEXT:    global_atomic_add_f64 v2, v[0:1], s[4:5]
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    buffer_wbinvl1_vol
; CHECK-NEXT:  .LBB3_4:
; CHECK-NEXT:    s_endpgm
entry:
  %i = add nsw i32 %a, -1
  %i.2 = sext i32 %i to i64
  %i.3 = getelementptr inbounds double, ptr addrspace(1) %b, i64 %i.2
  %i.4 = ptrtoint ptr addrspace(1) %i.3 to i64
  br label %bb0

bb0:                                              ; preds = %bb0, %entry
  %phi = phi ptr addrspace(1) [ %i.3, %entry ], [ %i.9, %bb0 ]
  %i.7 = ptrtoint ptr addrspace(1) %phi to i64
  %i.8 = sub nsw i64 %i.7, 1
  %cmp2 = icmp eq i64 %i.8, 0
  %i.9 = inttoptr i64 %i.7 to ptr addrspace(1)
  br i1 %cmp2, label %bb1, label %bb0

bb1:                                              ; preds = %bb0
  %i.10 = addrspacecast ptr addrspace(1) %i.9 to ptr
  %0 = atomicrmw fadd ptr %i.10, double %c syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0
  ret void
}

attributes #0 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
attributes #1 = { mustprogress nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" }

!0 = !{}
!1 = !{i32 5, i32 6}