; xref: /llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll (revision 5a81a559d69fb84e1e8ef623ac4b642081c14c51)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
3
; i32 case: under WWM (s_or_saveexec_b64 with -1 forces all lanes on),
; v_cndmask selects the inactive value 42 for lanes that were off and
; %in for lanes that were on, then exec is restored before the store.
4define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) {
5; GCN-LABEL: set_inactive:
6; GCN:       ; %bb.0:
7; GCN-NEXT:    s_load_dword s3, s[4:5], 0x2c
8; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
9; GCN-NEXT:    s_mov_b32 s2, -1
10; GCN-NEXT:    s_waitcnt lgkmcnt(0)
11; GCN-NEXT:    v_mov_b32_e32 v1, s3
12; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
13; GCN-NEXT:    v_cndmask_b32_e64 v0, 42, v1, s[4:5]
14; GCN-NEXT:    s_mov_b64 exec, s[4:5]
15; GCN-NEXT:    v_mov_b32_e32 v1, v0
16; GCN-NEXT:    s_mov_b32 s3, 0xf000
17; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
18; GCN-NEXT:    s_endpgm
19  %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0
20  %tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0)
21  store i32 %tmp, ptr addrspace(1) %out
22  ret void
23}
24
; Poison inactive value, i32: no exec save/restore or v_cndmask is emitted;
; the immediate active value 1 is used directly for every lane.
25define amdgpu_kernel void @set_inactive_imm_poison(ptr addrspace(1) %out) {
26; GCN-LABEL: set_inactive_imm_poison:
27; GCN:       ; %bb.0:
28; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
29; GCN-NEXT:    v_mov_b32_e32 v0, 1
30; GCN-NEXT:    v_mov_b32_e32 v0, v0
31; GCN-NEXT:    s_mov_b32 s2, -1
32; GCN-NEXT:    s_mov_b32 s3, 0xf000
33; GCN-NEXT:    s_waitcnt lgkmcnt(0)
34; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
35; GCN-NEXT:    s_endpgm
36  %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 1, i32 poison) #0
37  %tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0)
38  store i32 %tmp, ptr addrspace(1) %out
39  ret void
40}
41
; i64 case: the 64-bit select is split into two 32-bit v_cndmask selects
; (low and high halves), both against inactive value 0, inside one WWM region.
42define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) {
43; GCN-LABEL: set_inactive_64:
44; GCN:       ; %bb.0:
45; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
46; GCN-NEXT:    s_waitcnt lgkmcnt(0)
47; GCN-NEXT:    v_mov_b32_e32 v2, s2
48; GCN-NEXT:    v_mov_b32_e32 v3, s3
49; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
50; GCN-NEXT:    s_mov_b32 s2, -1
51; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, v2, s[4:5]
52; GCN-NEXT:    v_cndmask_b32_e64 v1, 0, v3, s[4:5]
53; GCN-NEXT:    s_mov_b64 exec, s[4:5]
54; GCN-NEXT:    v_mov_b32_e32 v2, v0
55; GCN-NEXT:    v_mov_b32_e32 v3, v1
56; GCN-NEXT:    s_mov_b32 s3, 0xf000
57; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[0:3], 0
58; GCN-NEXT:    s_endpgm
59  %tmp.0 = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0) #0
60  %tmp = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp.0)
61  store i64 %tmp, ptr addrspace(1) %out
62  ret void
63}
64
; Poison inactive value, i64: as in the i32 poison case, no exec manipulation
; or selects are needed; the constant 1 is materialized as a lo/hi pair (1, 0).
65define amdgpu_kernel void @set_inactive_imm_poison_64(ptr addrspace(1) %out) {
66; GCN-LABEL: set_inactive_imm_poison_64:
67; GCN:       ; %bb.0:
68; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
69; GCN-NEXT:    v_mov_b32_e32 v0, 1
70; GCN-NEXT:    v_mov_b32_e32 v1, 0
71; GCN-NEXT:    v_mov_b32_e32 v0, v0
72; GCN-NEXT:    v_mov_b32_e32 v1, v1
73; GCN-NEXT:    s_mov_b32 s2, -1
74; GCN-NEXT:    s_mov_b32 s3, 0xf000
75; GCN-NEXT:    s_waitcnt lgkmcnt(0)
76; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
77; GCN-NEXT:    s_endpgm
78  %tmp.0 = call i64 @llvm.amdgcn.set.inactive.i64(i64 1, i64 poison) #0
79  %tmp = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp.0)
80  store i64 %tmp, ptr addrspace(1) %out
81  ret void
82}
83
; SCC interaction: an s_buffer_load result is compared (s_cmp_lg_u32) around
; the set.inactive/WWM sequence; checks the exec save/restore does not break
; the control flow that branches on SCC into the .one/.zero blocks.
84define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x i32> inreg %desc) {
85; GCN-LABEL: set_inactive_scc:
86; GCN:       ; %bb.0:
87; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
88; GCN-NEXT:    s_load_dword s6, s[4:5], 0x2c
89; GCN-NEXT:    s_waitcnt lgkmcnt(0)
90; GCN-NEXT:    s_buffer_load_dword s7, s[0:3], 0x0
91; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
92; GCN-NEXT:    v_mov_b32_e32 v1, s6
93; GCN-NEXT:    s_or_saveexec_b64 s[2:3], -1
94; GCN-NEXT:    v_cndmask_b32_e64 v0, 42, v1, s[2:3]
95; GCN-NEXT:    s_mov_b64 exec, s[2:3]
96; GCN-NEXT:    s_waitcnt lgkmcnt(0)
97; GCN-NEXT:    s_cmp_lg_u32 s7, 56
98; GCN-NEXT:    v_mov_b32_e32 v1, v0
99; GCN-NEXT:    s_mov_b32 s2, 1
100; GCN-NEXT:    s_cbranch_scc0 .LBB4_2
101; GCN-NEXT:  ; %bb.1: ; %.one
102; GCN-NEXT:    v_add_u32_e32 v2, vcc, 1, v1
103; GCN-NEXT:    s_mov_b32 s2, -1
104; GCN-NEXT:    s_mov_b32 s3, 0xf000
105; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], 0
106; GCN-NEXT:    s_mov_b32 s2, 0
107; GCN-NEXT:  .LBB4_2: ; %Flow
108; GCN-NEXT:    s_xor_b32 s2, s2, 1
109; GCN-NEXT:    s_and_b32 s2, s2, 1
110; GCN-NEXT:    s_cmp_lg_u32 s2, 0
111; GCN-NEXT:    s_cbranch_scc1 .LBB4_4
112; GCN-NEXT:  ; %bb.3: ; %.zero
113; GCN-NEXT:    s_mov_b32 s2, -1
114; GCN-NEXT:    s_mov_b32 s3, 0xf000
115; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
116; GCN-NEXT:  .LBB4_4: ; %.exit
117; GCN-NEXT:    s_endpgm
118  %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 0, i32 0)
119  %cmp = icmp eq i32 %val, 56
120  %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0
121  %tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0)
122  br i1 %cmp, label %.zero, label %.one
123
124.zero:
125  store i32 %tmp, ptr addrspace(1) %out
126  br label %.exit
127
128.one:
129  %tmp.1 = add i32 %tmp, 1
130  store i32 %tmp.1, ptr addrspace(1) %out
131  br label %.exit
132
133.exit:
134  ret void
135}
136
; f32 case: the non-inline inactive value 3.0 (0x40400000) is materialized
; into a VGPR inside its own WWM region before the v_cndmask select.
137define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) {
138; GCN-LABEL: set_inactive_f32:
139; GCN:       ; %bb.0:
140; GCN-NEXT:    s_load_dword s6, s[4:5], 0x2c
141; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
142; GCN-NEXT:    s_or_saveexec_b64 s[2:3], -1
143; GCN-NEXT:    v_mov_b32_e32 v0, 0x40400000
144; GCN-NEXT:    s_mov_b64 exec, s[2:3]
145; GCN-NEXT:    s_mov_b32 s2, -1
146; GCN-NEXT:    s_waitcnt lgkmcnt(0)
147; GCN-NEXT:    v_mov_b32_e32 v1, s6
148; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
149; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
150; GCN-NEXT:    s_mov_b64 exec, s[4:5]
151; GCN-NEXT:    v_mov_b32_e32 v1, v0
152; GCN-NEXT:    s_mov_b32 s3, 0xf000
153; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
154; GCN-NEXT:    s_endpgm
155  %tmp.0 = call float @llvm.amdgcn.set.inactive.f32(float %in, float 3.0) #0
156  %tmp = call float @llvm.amdgcn.strict.wwm.f32(float %tmp.0)
157  store float %tmp, ptr addrspace(1) %out
158  ret void
159}
160
; f64 case: the inactive value 4.2 is split into 32-bit halves
; (hi 0x4010cccc, lo 0xcccccccd), each materialized in WWM, then two
; v_cndmask selects combine them with the active lo/hi of %in.
161define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
162; GCN-LABEL: set_inactive_f64:
163; GCN:       ; %bb.0:
164; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
165; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
166; GCN-NEXT:    v_mov_b32_e32 v0, 0xcccccccd
167; GCN-NEXT:    v_mov_b32_e32 v1, 0x4010cccc
168; GCN-NEXT:    s_mov_b64 exec, s[4:5]
169; GCN-NEXT:    s_waitcnt lgkmcnt(0)
170; GCN-NEXT:    v_mov_b32_e32 v2, s2
171; GCN-NEXT:    v_mov_b32_e32 v3, s3
172; GCN-NEXT:    s_or_saveexec_b64 s[2:3], -1
173; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[2:3]
174; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[2:3]
175; GCN-NEXT:    s_mov_b64 exec, s[2:3]
176; GCN-NEXT:    v_mov_b32_e32 v2, v0
177; GCN-NEXT:    v_mov_b32_e32 v3, v1
178; GCN-NEXT:    s_mov_b32 s2, -1
179; GCN-NEXT:    s_mov_b32 s3, 0xf000
180; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[0:3], 0
181; GCN-NEXT:    s_endpgm
182  %tmp.0 = call double @llvm.amdgcn.set.inactive.f64(double %in, double 4.2) #0
183  %tmp = call double @llvm.amdgcn.strict.wwm.f64(double %tmp.0)
184  store double %tmp, ptr addrspace(1) %out
185  ret void
186}
187
; <2 x i16> case: the inactive value <1, 1> is the single packed 32-bit
; constant 0x10001, so one v_cndmask handles the whole vector.
188define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) {
189; GCN-LABEL: set_inactive_v2i16:
190; GCN:       ; %bb.0:
191; GCN-NEXT:    s_load_dword s6, s[4:5], 0x2c
192; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
193; GCN-NEXT:    s_or_saveexec_b64 s[2:3], -1
194; GCN-NEXT:    v_mov_b32_e32 v0, 0x10001
195; GCN-NEXT:    s_mov_b64 exec, s[2:3]
196; GCN-NEXT:    s_mov_b32 s2, -1
197; GCN-NEXT:    s_waitcnt lgkmcnt(0)
198; GCN-NEXT:    v_mov_b32_e32 v1, s6
199; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
200; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
201; GCN-NEXT:    s_mov_b64 exec, s[4:5]
202; GCN-NEXT:    v_mov_b32_e32 v1, v0
203; GCN-NEXT:    s_mov_b32 s3, 0xf000
204; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
205; GCN-NEXT:    s_endpgm
206  %tmp.0 = call <2 x i16> @llvm.amdgcn.set.inactive.v2i16(<2 x i16> %in, <2 x i16> <i16 1, i16 1>) #0
207  %tmp = call <2 x i16> @llvm.amdgcn.strict.wwm.v2i16(<2 x i16> %tmp.0)
208  store <2 x i16> %tmp, ptr addrspace(1) %out
209  ret void
210}
211
; <2 x half> case: the inactive value <1.0, 1.0> is the packed 32-bit
; constant 0x3c003c00, materialized in WWM before a single v_cndmask.
212define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
213; GCN-LABEL: set_inactive_v2f16:
214; GCN:       ; %bb.0:
215; GCN-NEXT:    s_load_dword s6, s[4:5], 0x2c
216; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
217; GCN-NEXT:    s_or_saveexec_b64 s[2:3], -1
218; GCN-NEXT:    v_mov_b32_e32 v0, 0x3c003c00
219; GCN-NEXT:    s_mov_b64 exec, s[2:3]
220; GCN-NEXT:    s_mov_b32 s2, -1
221; GCN-NEXT:    s_waitcnt lgkmcnt(0)
222; GCN-NEXT:    v_mov_b32_e32 v1, s6
223; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
224; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
225; GCN-NEXT:    s_mov_b64 exec, s[4:5]
226; GCN-NEXT:    v_mov_b32_e32 v1, v0
227; GCN-NEXT:    s_mov_b32 s3, 0xf000
228; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
229; GCN-NEXT:    s_endpgm
230  %tmp.0 = call <2 x half> @llvm.amdgcn.set.inactive.v2f16(<2 x half> %in, <2 x half> <half 1.0, half 1.0>) #0
; Fix: use the correctly-mangled overload name for <2 x half> (.v2f16, not
; .v2i16); the old name only parsed because AutoUpgrade remangles it.
231  %tmp = call <2 x half> @llvm.amdgcn.strict.wwm.v2f16(<2 x half> %tmp.0)
232  store <2 x half> %tmp, ptr addrspace(1) %out
233  ret void
234}
235
; <2 x i32> case: inactive value <1, 1> — each element is selectable with
; the inline constant 1, so two v_cndmask selects with immediate operands.
236define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> %in) {
237; GCN-LABEL: set_inactive_v2i32:
238; GCN:       ; %bb.0:
239; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
240; GCN-NEXT:    s_waitcnt lgkmcnt(0)
241; GCN-NEXT:    v_mov_b32_e32 v2, s2
242; GCN-NEXT:    v_mov_b32_e32 v3, s3
243; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
244; GCN-NEXT:    s_mov_b32 s2, -1
245; GCN-NEXT:    v_cndmask_b32_e64 v0, 1, v2, s[4:5]
246; GCN-NEXT:    v_cndmask_b32_e64 v1, 1, v3, s[4:5]
247; GCN-NEXT:    s_mov_b64 exec, s[4:5]
248; GCN-NEXT:    v_mov_b32_e32 v2, v0
249; GCN-NEXT:    v_mov_b32_e32 v3, v1
250; GCN-NEXT:    s_mov_b32 s3, 0xf000
251; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[0:3], 0
252; GCN-NEXT:    s_endpgm
253  %tmp.0 = call <2 x i32> @llvm.amdgcn.set.inactive.v2i32(<2 x i32> %in, <2 x i32> <i32 1, i32 1>) #0
254  %tmp = call <2 x i32> @llvm.amdgcn.strict.wwm.v2i32(<2 x i32> %tmp.0)
255  store <2 x i32> %tmp, ptr addrspace(1) %out
256  ret void
257}
258
; <2 x float> case: inactive value <1.0, 1.0> uses the FP inline constant
; 1.0 directly in both per-element v_cndmask selects.
259define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
260; GCN-LABEL: set_inactive_v2f32:
261; GCN:       ; %bb.0:
262; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
263; GCN-NEXT:    s_waitcnt lgkmcnt(0)
264; GCN-NEXT:    v_mov_b32_e32 v2, s2
265; GCN-NEXT:    v_mov_b32_e32 v3, s3
266; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
267; GCN-NEXT:    s_mov_b32 s2, -1
268; GCN-NEXT:    v_cndmask_b32_e64 v0, 1.0, v2, s[4:5]
269; GCN-NEXT:    v_cndmask_b32_e64 v1, 1.0, v3, s[4:5]
270; GCN-NEXT:    s_mov_b64 exec, s[4:5]
271; GCN-NEXT:    v_mov_b32_e32 v2, v0
272; GCN-NEXT:    v_mov_b32_e32 v3, v1
273; GCN-NEXT:    s_mov_b32 s3, 0xf000
274; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[0:3], 0
275; GCN-NEXT:    s_endpgm
276  %tmp.0 = call <2 x float> @llvm.amdgcn.set.inactive.v2f32(<2 x float> %in, <2 x float> <float 1.0, float 1.0>) #0
277  %tmp = call <2 x float> @llvm.amdgcn.strict.wwm.v2f32(<2 x float> %tmp.0)
278  store <2 x float> %tmp, ptr addrspace(1) %out
279  ret void
280}
281
; <2 x bfloat> case: inactive value <1.0, 1.0> is the packed constant
; 0x3f803f80; here it is materialized inside the same WWM region as the select.
282define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in) {
283; GCN-LABEL: set_inactive_v2bf16:
284; GCN:       ; %bb.0:
285; GCN-NEXT:    s_load_dword s6, s[4:5], 0x2c
286; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
287; GCN-NEXT:    s_mov_b32 s3, 0xf000
288; GCN-NEXT:    s_mov_b32 s2, -1
289; GCN-NEXT:    s_waitcnt lgkmcnt(0)
290; GCN-NEXT:    v_mov_b32_e32 v1, s6
291; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
292; GCN-NEXT:    v_mov_b32_e32 v0, 0x3f803f80
293; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
294; GCN-NEXT:    s_mov_b64 exec, s[4:5]
295; GCN-NEXT:    v_mov_b32_e32 v1, v0
296; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
297; GCN-NEXT:    s_endpgm
298  %tmp.0 = call <2 x bfloat> @llvm.amdgcn.set.inactive.v2bf16(<2 x bfloat> %in, <2 x bfloat> <bfloat 1.0, bfloat 1.0>) #0
299  %tmp = call <2 x bfloat> @llvm.amdgcn.strict.wwm.v2bf16(<2 x bfloat> %tmp.0)
300  store <2 x bfloat> %tmp, ptr addrspace(1) %out
301  ret void
302}
303
; <4 x i16> case: the packed constant 0x10001 is materialized once and
; reused as the inactive operand for both 32-bit halves' selects.
304define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> %in) {
305; GCN-LABEL: set_inactive_v4i16:
306; GCN:       ; %bb.0:
307; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
308; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
309; GCN-NEXT:    v_mov_b32_e32 v0, 0x10001
310; GCN-NEXT:    s_mov_b64 exec, s[4:5]
311; GCN-NEXT:    s_waitcnt lgkmcnt(0)
312; GCN-NEXT:    v_mov_b32_e32 v3, s2
313; GCN-NEXT:    v_mov_b32_e32 v4, s3
314; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
315; GCN-NEXT:    s_mov_b32 s2, -1
316; GCN-NEXT:    v_cndmask_b32_e64 v1, v0, v3, s[4:5]
317; GCN-NEXT:    v_cndmask_b32_e64 v2, v0, v4, s[4:5]
318; GCN-NEXT:    s_mov_b64 exec, s[4:5]
319; GCN-NEXT:    v_mov_b32_e32 v3, v1
320; GCN-NEXT:    v_mov_b32_e32 v4, v2
321; GCN-NEXT:    s_mov_b32 s3, 0xf000
322; GCN-NEXT:    buffer_store_dwordx2 v[3:4], off, s[0:3], 0
323; GCN-NEXT:    s_endpgm
324  %tmp.0 = call <4 x i16> @llvm.amdgcn.set.inactive.v4i16(<4 x i16> %in, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) #0
325  %tmp = call <4 x i16> @llvm.amdgcn.strict.wwm.v4i16(<4 x i16> %tmp.0)
326  store <4 x i16> %tmp, ptr addrspace(1) %out
327  ret void
328}
329
; <4 x half> case: same shape as v4i16 but with the packed half 1.0
; constant 0x3c003c00 shared by both halves' selects.
330define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
331; GCN-LABEL: set_inactive_v4f16:
332; GCN:       ; %bb.0:
333; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
334; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
335; GCN-NEXT:    v_mov_b32_e32 v0, 0x3c003c00
336; GCN-NEXT:    s_mov_b64 exec, s[4:5]
337; GCN-NEXT:    s_waitcnt lgkmcnt(0)
338; GCN-NEXT:    v_mov_b32_e32 v3, s2
339; GCN-NEXT:    v_mov_b32_e32 v4, s3
340; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
341; GCN-NEXT:    s_mov_b32 s2, -1
342; GCN-NEXT:    v_cndmask_b32_e64 v1, v0, v3, s[4:5]
343; GCN-NEXT:    v_cndmask_b32_e64 v2, v0, v4, s[4:5]
344; GCN-NEXT:    s_mov_b64 exec, s[4:5]
345; GCN-NEXT:    v_mov_b32_e32 v3, v1
346; GCN-NEXT:    v_mov_b32_e32 v4, v2
347; GCN-NEXT:    s_mov_b32 s3, 0xf000
348; GCN-NEXT:    buffer_store_dwordx2 v[3:4], off, s[0:3], 0
349; GCN-NEXT:    s_endpgm
350  %tmp.0 = call <4 x half> @llvm.amdgcn.set.inactive.v4f16(<4 x half> %in, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>) #0
351  %tmp = call <4 x half> @llvm.amdgcn.strict.wwm.v4f16(<4 x half> %tmp.0)
352  store <4 x half> %tmp, ptr addrspace(1) %out
353  ret void
354}
355
; <4 x bfloat> case: the packed constant 0x3f803f80 is selected against each
; 32-bit half; note the codegen uses two separate WWM regions here.
356define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in) {
357; GCN-LABEL: set_inactive_v4bf16:
358; GCN:       ; %bb.0:
359; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
360; GCN-NEXT:    s_mov_b32 s7, 0xf000
361; GCN-NEXT:    s_mov_b32 s6, -1
362; GCN-NEXT:    s_waitcnt lgkmcnt(0)
363; GCN-NEXT:    s_mov_b32 s4, s0
364; GCN-NEXT:    s_mov_b32 s5, s1
365; GCN-NEXT:    v_mov_b32_e32 v2, s3
366; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
367; GCN-NEXT:    v_mov_b32_e32 v0, 0x3f803f80
368; GCN-NEXT:    v_cndmask_b32_e64 v1, v0, v2, s[0:1]
369; GCN-NEXT:    s_mov_b64 exec, s[0:1]
370; GCN-NEXT:    v_mov_b32_e32 v2, s2
371; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
372; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
373; GCN-NEXT:    s_mov_b64 exec, s[0:1]
374; GCN-NEXT:    v_mov_b32_e32 v2, v0
375; GCN-NEXT:    v_mov_b32_e32 v3, v1
376; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[4:7], 0
377; GCN-NEXT:    s_endpgm
378  %tmp.0 = call <4 x bfloat> @llvm.amdgcn.set.inactive.v4bf16(<4 x bfloat> %in, <4 x bfloat> <bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0>) #0
379  %tmp = call <4 x bfloat> @llvm.amdgcn.strict.wwm.v4bf16(<4 x bfloat> %tmp.0)
380  store <4 x bfloat> %tmp, ptr addrspace(1) %out
381  ret void
382}
383
; Flat pointer (p0, 64-bit): null inactive value lowers like the i64 case —
; two 32-bit v_cndmask selects against the immediate 0.
384define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) {
385; GCN-LABEL: set_inactive_p0:
386; GCN:       ; %bb.0:
387; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
388; GCN-NEXT:    s_waitcnt lgkmcnt(0)
389; GCN-NEXT:    v_mov_b32_e32 v2, s2
390; GCN-NEXT:    v_mov_b32_e32 v3, s3
391; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
392; GCN-NEXT:    s_mov_b32 s2, -1
393; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, v2, s[4:5]
394; GCN-NEXT:    v_cndmask_b32_e64 v1, 0, v3, s[4:5]
395; GCN-NEXT:    s_mov_b64 exec, s[4:5]
396; GCN-NEXT:    v_mov_b32_e32 v2, v0
397; GCN-NEXT:    v_mov_b32_e32 v3, v1
398; GCN-NEXT:    s_mov_b32 s3, 0xf000
399; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[0:3], 0
400; GCN-NEXT:    s_endpgm
401  %tmp.0 = call ptr @llvm.amdgcn.set.inactive.p0(ptr %in, ptr null) #0
402  %tmp = call ptr @llvm.amdgcn.strict.wwm.p0(ptr %tmp.0)
403  store ptr %tmp, ptr addrspace(1) %out
404  ret void
405}
406
; 32-bit pointer (addrspace 2): null inactive value lowers like the i32
; case — a single v_cndmask against the immediate 0.
407define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(2) %in) {
408; GCN-LABEL: set_inactive_p2:
409; GCN:       ; %bb.0:
410; GCN-NEXT:    s_load_dword s3, s[4:5], 0x2c
411; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
412; GCN-NEXT:    s_mov_b32 s2, -1
413; GCN-NEXT:    s_waitcnt lgkmcnt(0)
414; GCN-NEXT:    v_mov_b32_e32 v1, s3
415; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
416; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, v1, s[4:5]
417; GCN-NEXT:    s_mov_b64 exec, s[4:5]
418; GCN-NEXT:    v_mov_b32_e32 v1, v0
419; GCN-NEXT:    s_mov_b32 s3, 0xf000
420; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
421; GCN-NEXT:    s_endpgm
422  %tmp.0 = call ptr addrspace(2) @llvm.amdgcn.set.inactive.p2(ptr addrspace(2) %in, ptr addrspace(2) null) #0
423  %tmp = call ptr addrspace(2) @llvm.amdgcn.strict.wwm.p2(ptr addrspace(2) %tmp.0)
424  store ptr addrspace(2) %tmp, ptr addrspace(1) %out
425  ret void
426}
427
; 32-bit LDS pointer (addrspace 3): same single-select lowering as p2.
428define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(3) %in) {
429; GCN-LABEL: set_inactive_p3:
430; GCN:       ; %bb.0:
431; GCN-NEXT:    s_load_dword s3, s[4:5], 0x2c
432; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
433; GCN-NEXT:    s_mov_b32 s2, -1
434; GCN-NEXT:    s_waitcnt lgkmcnt(0)
435; GCN-NEXT:    v_mov_b32_e32 v1, s3
436; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
437; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, v1, s[4:5]
438; GCN-NEXT:    s_mov_b64 exec, s[4:5]
439; GCN-NEXT:    v_mov_b32_e32 v1, v0
440; GCN-NEXT:    s_mov_b32 s3, 0xf000
441; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
442; GCN-NEXT:    s_endpgm
443  %tmp.0 = call ptr addrspace(3) @llvm.amdgcn.set.inactive.p3(ptr addrspace(3) %in, ptr addrspace(3) null) #0
444  %tmp = call ptr addrspace(3) @llvm.amdgcn.strict.wwm.p3(ptr addrspace(3) %tmp.0)
445  store ptr addrspace(3) %tmp, ptr addrspace(1) %out
446  ret void
447}
448
; 32-bit private pointer (addrspace 5): same single-select lowering as p2.
449define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(5) %in) {
450; GCN-LABEL: set_inactive_p5:
451; GCN:       ; %bb.0:
452; GCN-NEXT:    s_load_dword s3, s[4:5], 0x2c
453; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
454; GCN-NEXT:    s_mov_b32 s2, -1
455; GCN-NEXT:    s_waitcnt lgkmcnt(0)
456; GCN-NEXT:    v_mov_b32_e32 v1, s3
457; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
458; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, v1, s[4:5]
459; GCN-NEXT:    s_mov_b64 exec, s[4:5]
460; GCN-NEXT:    v_mov_b32_e32 v1, v0
461; GCN-NEXT:    s_mov_b32 s3, 0xf000
462; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
463; GCN-NEXT:    s_endpgm
464  %tmp.0 = call ptr addrspace(5) @llvm.amdgcn.set.inactive.p5(ptr addrspace(5) %in, ptr addrspace(5) null) #0
465  %tmp = call ptr addrspace(5) @llvm.amdgcn.strict.wwm.p5(ptr addrspace(5) %tmp.0)
466  store ptr addrspace(5) %tmp, ptr addrspace(1) %out
467  ret void
468}
469
; 32-bit constant pointer (addrspace 6): same single-select lowering as p2.
470define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace(6) %in) {
471; GCN-LABEL: set_inactive_p6:
472; GCN:       ; %bb.0:
473; GCN-NEXT:    s_load_dword s3, s[4:5], 0x2c
474; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
475; GCN-NEXT:    s_mov_b32 s2, -1
476; GCN-NEXT:    s_waitcnt lgkmcnt(0)
477; GCN-NEXT:    v_mov_b32_e32 v1, s3
478; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
479; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, v1, s[4:5]
480; GCN-NEXT:    s_mov_b64 exec, s[4:5]
481; GCN-NEXT:    v_mov_b32_e32 v1, v0
482; GCN-NEXT:    s_mov_b32 s3, 0xf000
483; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
484; GCN-NEXT:    s_endpgm
485  %tmp.0 = call ptr addrspace(6) @llvm.amdgcn.set.inactive.p6(ptr addrspace(6) %in, ptr addrspace(6) null) #0
486  %tmp = call ptr addrspace(6) @llvm.amdgcn.strict.wwm.p6(ptr addrspace(6) %tmp.0)
487  store ptr addrspace(6) %tmp, ptr addrspace(1) %out
488  ret void
489}
490
; Explicit declarations for a subset of the intrinsics used above; the
; remaining overloads are referenced by their mangled names only.
491declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #0
492declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64) #0
493declare i32 @llvm.amdgcn.strict.wwm.i32(i32) #1
494declare i64 @llvm.amdgcn.strict.wwm.i64(i64) #1
495declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32)
496
; #0: set.inactive must be convergent and side-effect free.
497attributes #0 = { convergent readnone }
; #1: strict.wwm is convergent, speculatable, and has no side effects.
498attributes #1 = { convergent nounwind readnone speculatable willreturn }
499