; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s

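; Check that chains of bitwise ops feeding a final NOT fuse into combined
; instructions where the target has them (v_or3_b32, v_xnor, s_nor, s_nand,
; s_xnor), for both divergent (VALU) and uniform (SALU) operands.

; Divergent or/or/not: the two ORs fuse into v_or3_b32; the NOT stays a
; separate v_not_b32.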
define amdgpu_kernel void @divergent_or3_b32(ptr addrspace(1) %arg) {
; GCN-LABEL: divergent_or3_b32:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT:    v_lshlrev_b32_e32 v3, 4, v0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    global_load_dwordx3 v[0:2], v3, s[0:1]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_or3_b32 v0, v1, v0, v2
; GCN-NEXT:    v_not_b32_e32 v0, v0
; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
; GCN-NEXT:    s_endpgm
bb:
  %i = tail call i32 @llvm.amdgcn.workitem.id.x()
  %i1 = zext i32 %i to i64
  %i2 = getelementptr inbounds <3 x i32>, ptr addrspace(1) %arg, i64 %i1
  %i3 = load <3 x i32>, ptr addrspace(1) %i2, align 16
  %i4 = extractelement <3 x i32> %i3, i64 0
  %i5 = extractelement <3 x i32> %i3, i64 1
  %i6 = extractelement <3 x i32> %i3, i64 2
  %i7 = or i32 %i5, %i4
  %i8 = or i32 %i7, %i6
  %i9 = xor i32 %i8, -1
  store i32 %i9, ptr addrspace(1) %i2, align 16
  ret void
}

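; Same pattern on i64: each 32-bit half gets a v_or3_b32 plus a v_not_b32.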
define amdgpu_kernel void @divergent_or3_b64(ptr addrspace(1) %arg) {
; GCN-LABEL: divergent_or3_b64:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT:    v_lshlrev_b32_e32 v6, 5, v0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    global_load_dwordx2 v[4:5], v6, s[0:1] offset:16
; GCN-NEXT:    global_load_dwordx4 v[0:3], v6, s[0:1]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_or3_b32 v1, v3, v1, v5
; GCN-NEXT:    v_or3_b32 v0, v2, v0, v4
; GCN-NEXT:    v_not_b32_e32 v1, v1
; GCN-NEXT:    v_not_b32_e32 v0, v0
; GCN-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1]
; GCN-NEXT:    s_endpgm
bb:
  %i = tail call i32 @llvm.amdgcn.workitem.id.x()
  %i1 = zext i32 %i to i64
  %i2 = getelementptr inbounds <3 x i64>, ptr addrspace(1) %arg, i64 %i1
  %i3 = load <3 x i64>, ptr addrspace(1) %i2, align 32
  %i4 = extractelement <3 x i64> %i3, i64 0
  %i5 = extractelement <3 x i64> %i3, i64 1
  %i6 = extractelement <3 x i64> %i3, i64 2
  %i7 = or i64 %i5, %i4
  %i8 = or i64 %i7, %i6
  %i9 = xor i64 %i8, -1
  store i64 %i9, ptr addrspace(1) %i2, align 32
  ret void
}

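; Divergent and/and/not: the ANDs are not combined into a single three-input
; op here; they remain two v_and_b32 followed by a v_not_b32.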
define amdgpu_kernel void @divergent_and3_b32(ptr addrspace(1) %arg) {
; GCN-LABEL: divergent_and3_b32:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT:    v_lshlrev_b32_e32 v3, 4, v0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    global_load_dwordx3 v[0:2], v3, s[0:1]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_and_b32_e32 v0, v1, v0
; GCN-NEXT:    v_and_b32_e32 v0, v0, v2
; GCN-NEXT:    v_not_b32_e32 v0, v0
; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
; GCN-NEXT:    s_endpgm
bb:
  %i = tail call i32 @llvm.amdgcn.workitem.id.x()
  %i1 = zext i32 %i to i64
  %i2 = getelementptr inbounds <3 x i32>, ptr addrspace(1) %arg, i64 %i1
  %i3 = load <3 x i32>, ptr addrspace(1) %i2, align 16
  %i4 = extractelement <3 x i32> %i3, i64 0
  %i5 = extractelement <3 x i32> %i3, i64 1
  %i6 = extractelement <3 x i32> %i3, i64 2
  %i7 = and i32 %i5, %i4
  %i8 = and i32 %i7, %i6
  %i9 = xor i32 %i8, -1
  store i32 %i9, ptr addrspace(1) %i2, align 16
  ret void
}

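; 64-bit divergent and/and/not: per-half v_and_b32 pairs followed by v_not_b32.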
define amdgpu_kernel void @divergent_and3_b64(ptr addrspace(1) %arg) {
; GCN-LABEL: divergent_and3_b64:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT:    v_lshlrev_b32_e32 v6, 5, v0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    global_load_dwordx4 v[0:3], v6, s[0:1]
; GCN-NEXT:    global_load_dwordx2 v[4:5], v6, s[0:1] offset:16
; GCN-NEXT:    s_waitcnt vmcnt(1)
; GCN-NEXT:    v_and_b32_e32 v1, v3, v1
; GCN-NEXT:    v_and_b32_e32 v0, v2, v0
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_and_b32_e32 v1, v1, v5
; GCN-NEXT:    v_and_b32_e32 v0, v0, v4
; GCN-NEXT:    v_not_b32_e32 v1, v1
; GCN-NEXT:    v_not_b32_e32 v0, v0
; GCN-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1]
; GCN-NEXT:    s_endpgm
bb:
  %i = tail call i32 @llvm.amdgcn.workitem.id.x()
  %i1 = zext i32 %i to i64
  %i2 = getelementptr inbounds <3 x i64>, ptr addrspace(1) %arg, i64 %i1
  %i3 = load <3 x i64>, ptr addrspace(1) %i2, align 32
  %i4 = extractelement <3 x i64> %i3, i64 0
  %i5 = extractelement <3 x i64> %i3, i64 1
  %i6 = extractelement <3 x i64> %i3, i64 2
  %i7 = and i64 %i5, %i4
  %i8 = and i64 %i7, %i6
  %i9 = xor i64 %i8, -1
  store i64 %i9, ptr addrspace(1) %i2, align 32
  ret void
}

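; Divergent xor/xor/not: the trailing xor+not pair fuses into v_xnor_b32.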
define amdgpu_kernel void @divergent_xor3_b32(ptr addrspace(1) %arg) {
; GCN-LABEL: divergent_xor3_b32:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT:    v_lshlrev_b32_e32 v3, 4, v0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    global_load_dwordx3 v[0:2], v3, s[0:1]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_xor_b32_e32 v0, v1, v0
; GCN-NEXT:    v_xnor_b32_e32 v0, v0, v2
; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
; GCN-NEXT:    s_endpgm
bb:
  %i = tail call i32 @llvm.amdgcn.workitem.id.x()
  %i1 = zext i32 %i to i64
  %i2 = getelementptr inbounds <3 x i32>, ptr addrspace(1) %arg, i64 %i1
  %i3 = load <3 x i32>, ptr addrspace(1) %i2, align 16
  %i4 = extractelement <3 x i32> %i3, i64 0
  %i5 = extractelement <3 x i32> %i3, i64 1
  %i6 = extractelement <3 x i32> %i3, i64 2
  %i7 = xor i32 %i5, %i4
  %i8 = xor i32 %i7, %i6
  %i9 = xor i32 %i8, -1
  store i32 %i9, ptr addrspace(1) %i2, align 16
  ret void
}

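; 64-bit divergent xor/xor/not: per-half v_xor_b32 followed by v_xnor_b32.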
define amdgpu_kernel void @divergent_xor3_b64(ptr addrspace(1) %arg) {
; GCN-LABEL: divergent_xor3_b64:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT:    v_lshlrev_b32_e32 v6, 5, v0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    global_load_dwordx4 v[0:3], v6, s[0:1]
; GCN-NEXT:    global_load_dwordx2 v[4:5], v6, s[0:1] offset:16
; GCN-NEXT:    s_waitcnt vmcnt(1)
; GCN-NEXT:    v_xor_b32_e32 v1, v3, v1
; GCN-NEXT:    v_xor_b32_e32 v0, v2, v0
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_xnor_b32_e32 v1, v1, v5
; GCN-NEXT:    v_xnor_b32_e32 v0, v0, v4
; GCN-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1]
; GCN-NEXT:    s_endpgm
bb:
  %i = tail call i32 @llvm.amdgcn.workitem.id.x()
  %i1 = zext i32 %i to i64
  %i2 = getelementptr inbounds <3 x i64>, ptr addrspace(1) %arg, i64 %i1
  %i3 = load <3 x i64>, ptr addrspace(1) %i2, align 32
  %i4 = extractelement <3 x i64> %i3, i64 0
  %i5 = extractelement <3 x i64> %i3, i64 1
  %i6 = extractelement <3 x i64> %i3, i64 2
  %i7 = xor i64 %i5, %i4
  %i8 = xor i64 %i7, %i6
  %i9 = xor i64 %i8, -1
  store i64 %i9, ptr addrspace(1) %i2, align 32
  ret void
}

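; Uniform or/or/not: on the SALU the or+not tail fuses into s_nor_b32.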
define amdgpu_kernel void @uniform_or3_b32(ptr addrspace(1) %arg) {
; GCN-LABEL: uniform_or3_b32:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_or_b32 s0, s1, s0
; GCN-NEXT:    s_nor_b32 s0, s0, s2
; GCN-NEXT:    v_mov_b32_e32 v1, s0
; GCN-NEXT:    global_store_dword v0, v1, s[6:7]
; GCN-NEXT:    s_endpgm
bb:
  %i3 = load <3 x i32>, ptr addrspace(1) %arg, align 16
  %i4 = extractelement <3 x i32> %i3, i64 0
  %i5 = extractelement <3 x i32> %i3, i64 1
  %i6 = extractelement <3 x i32> %i3, i64 2
  %i7 = or i32 %i5, %i4
  %i8 = or i32 %i7, %i6
  %i9 = xor i32 %i8, -1
  store i32 %i9, ptr addrspace(1) %arg, align 16
  ret void
}

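; Uniform 64-bit or/or/not: s_or_b64 followed by a fused s_nor_b64.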
define amdgpu_kernel void @uniform_or3_b64(ptr addrspace(1) %arg) {
; GCN-LABEL: uniform_or3_b64:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
; GCN-NEXT:    v_mov_b32_e32 v2, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
; GCN-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
; GCN-NEXT:    s_nor_b64 s[0:1], s[0:1], s[4:5]
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    v_mov_b32_e32 v1, s1
; GCN-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
; GCN-NEXT:    s_endpgm
bb:
  %i3 = load <3 x i64>, ptr addrspace(1) %arg, align 32
  %i4 = extractelement <3 x i64> %i3, i64 0
  %i5 = extractelement <3 x i64> %i3, i64 1
  %i6 = extractelement <3 x i64> %i3, i64 2
  %i7 = or i64 %i5, %i4
  %i8 = or i64 %i7, %i6
  %i9 = xor i64 %i8, -1
  store i64 %i9, ptr addrspace(1) %arg, align 32
  ret void
}

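; Uniform and/and/not: the and+not tail fuses into s_nand_b32.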
define amdgpu_kernel void @uniform_and3_b32(ptr addrspace(1) %arg) {
; GCN-LABEL: uniform_and3_b32:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_and_b32 s0, s1, s0
; GCN-NEXT:    s_nand_b32 s0, s0, s2
; GCN-NEXT:    v_mov_b32_e32 v1, s0
; GCN-NEXT:    global_store_dword v0, v1, s[6:7]
; GCN-NEXT:    s_endpgm
bb:
  %i3 = load <3 x i32>, ptr addrspace(1) %arg, align 16
  %i4 = extractelement <3 x i32> %i3, i64 0
  %i5 = extractelement <3 x i32> %i3, i64 1
  %i6 = extractelement <3 x i32> %i3, i64 2
  %i7 = and i32 %i5, %i4
  %i8 = and i32 %i7, %i6
  %i9 = xor i32 %i8, -1
  store i32 %i9, ptr addrspace(1) %arg, align 16
  ret void
}

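; Uniform 64-bit and/and/not: s_and_b64 followed by a fused s_nand_b64.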
define amdgpu_kernel void @uniform_and3_b64(ptr addrspace(1) %arg) {
; GCN-LABEL: uniform_and3_b64:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
; GCN-NEXT:    v_mov_b32_e32 v2, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
; GCN-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
; GCN-NEXT:    s_nand_b64 s[0:1], s[0:1], s[4:5]
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    v_mov_b32_e32 v1, s1
; GCN-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
; GCN-NEXT:    s_endpgm
bb:
  %i3 = load <3 x i64>, ptr addrspace(1) %arg, align 32
  %i4 = extractelement <3 x i64> %i3, i64 0
  %i5 = extractelement <3 x i64> %i3, i64 1
  %i6 = extractelement <3 x i64> %i3, i64 2
  %i7 = and i64 %i5, %i4
  %i8 = and i64 %i7, %i6
  %i9 = xor i64 %i8, -1
  store i64 %i9, ptr addrspace(1) %arg, align 32
  ret void
}

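; Uniform xor/xor/not: the xor+not tail fuses into s_xnor_b32.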
define amdgpu_kernel void @uniform_xor3_b32(ptr addrspace(1) %arg) {
; GCN-LABEL: uniform_xor3_b32:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_xor_b32 s0, s1, s0
; GCN-NEXT:    s_xnor_b32 s0, s0, s2
; GCN-NEXT:    v_mov_b32_e32 v1, s0
; GCN-NEXT:    global_store_dword v0, v1, s[6:7]
; GCN-NEXT:    s_endpgm
bb:
  %i3 = load <3 x i32>, ptr addrspace(1) %arg, align 16
  %i4 = extractelement <3 x i32> %i3, i64 0
  %i5 = extractelement <3 x i32> %i3, i64 1
  %i6 = extractelement <3 x i32> %i3, i64 2
  %i7 = xor i32 %i5, %i4
  %i8 = xor i32 %i7, %i6
  %i9 = xor i32 %i8, -1
  store i32 %i9, ptr addrspace(1) %arg, align 16
  ret void
}

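; Uniform 64-bit xor/xor/not: s_xor_b64 followed by a fused s_xnor_b64.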
define amdgpu_kernel void @uniform_xor3_b64(ptr addrspace(1) %arg) {
; GCN-LABEL: uniform_xor3_b64:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
; GCN-NEXT:    v_mov_b32_e32 v2, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
; GCN-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
; GCN-NEXT:    s_xnor_b64 s[0:1], s[0:1], s[4:5]
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    v_mov_b32_e32 v1, s1
; GCN-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
; GCN-NEXT:    s_endpgm
bb:
  %i3 = load <3 x i64>, ptr addrspace(1) %arg, align 32
  %i4 = extractelement <3 x i64> %i3, i64 0
  %i5 = extractelement <3 x i64> %i3, i64 1
  %i6 = extractelement <3 x i64> %i3, i64 2
  %i7 = xor i64 %i5, %i4
  %i8 = xor i64 %i7, %i6
  %i9 = xor i64 %i8, -1
  store i64 %i9, ptr addrspace(1) %arg, align 32
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()