xref: /llvm-project/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
3; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -early-live-intervals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
4
; i32 case: inactive lanes are set to the inline-immediate 42 (v_cndmask with
; the saved all-ones exec mask), then the WWM result is stored to %out.
5define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) {
6; GCN-LABEL: set_inactive:
7; GCN:       ; %bb.0:
8; GCN-NEXT:    s_load_dword s6, s[4:5], 0x2c
9; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
10; GCN-NEXT:    s_mov_b32 s3, 0xf000
11; GCN-NEXT:    s_mov_b32 s2, -1
12; GCN-NEXT:    s_waitcnt lgkmcnt(0)
13; GCN-NEXT:    v_mov_b32_e32 v1, s6
14; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
15; GCN-NEXT:    v_cndmask_b32_e64 v0, 42, v1, s[4:5]
16; GCN-NEXT:    s_mov_b64 exec, s[4:5]
17; GCN-NEXT:    v_mov_b32_e32 v1, v0
18; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
19; GCN-NEXT:    s_endpgm
20  %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0
21  %tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0)
22  store i32 %tmp, ptr addrspace(1) %out
23  ret void
24}
25
; Inactive value is poison: the checks show no exec-mask save/restore or
; v_cndmask is emitted; the active value 1 is materialized directly.
26define amdgpu_kernel void @set_inactive_imm_poison(ptr addrspace(1) %out) {
27; GCN-LABEL: set_inactive_imm_poison:
28; GCN:       ; %bb.0:
29; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
30; GCN-NEXT:    v_mov_b32_e32 v0, 1
31; GCN-NEXT:    s_mov_b32 s3, 0xf000
32; GCN-NEXT:    s_mov_b32 s2, -1
33; GCN-NEXT:    v_mov_b32_e32 v1, v0
34; GCN-NEXT:    s_waitcnt lgkmcnt(0)
35; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
36; GCN-NEXT:    s_endpgm
37  %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 1, i32 poison) #0
38  %tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0)
39  store i32 %tmp, ptr addrspace(1) %out
40  ret void
41}
42
; i64 case: the 64-bit value is handled as two 32-bit halves, each with its
; own exec save / v_cndmask / exec restore sequence (inactive value 0).
43define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) {
44; GCN-LABEL: set_inactive_64:
45; GCN:       ; %bb.0:
46; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
47; GCN-NEXT:    s_mov_b32 s7, 0xf000
48; GCN-NEXT:    s_mov_b32 s6, -1
49; GCN-NEXT:    s_waitcnt lgkmcnt(0)
50; GCN-NEXT:    s_mov_b32 s4, s0
51; GCN-NEXT:    s_mov_b32 s5, s1
52; GCN-NEXT:    v_mov_b32_e32 v2, s3
53; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
54; GCN-NEXT:    v_cndmask_b32_e64 v1, 0, v2, s[0:1]
55; GCN-NEXT:    s_mov_b64 exec, s[0:1]
56; GCN-NEXT:    v_mov_b32_e32 v2, s2
57; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
58; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, v2, s[0:1]
59; GCN-NEXT:    s_mov_b64 exec, s[0:1]
60; GCN-NEXT:    v_mov_b32_e32 v2, v0
61; GCN-NEXT:    v_mov_b32_e32 v3, v1
62; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[4:7], 0
63; GCN-NEXT:    s_endpgm
64  %tmp.0 = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0) #0
65  %tmp = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp.0)
66  store i64 %tmp, ptr addrspace(1) %out
67  ret void
68}
69
; 64-bit poison-inactive case: no exec manipulation in the checks; the
; constant 1 is built as a 32-bit pair (lo=1, hi=0) and stored directly.
70define amdgpu_kernel void @set_inactive_imm_poison_64(ptr addrspace(1) %out) {
71; GCN-LABEL: set_inactive_imm_poison_64:
72; GCN:       ; %bb.0:
73; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
74; GCN-NEXT:    v_mov_b32_e32 v0, 1
75; GCN-NEXT:    v_mov_b32_e32 v1, 0
76; GCN-NEXT:    v_mov_b32_e32 v2, v0
77; GCN-NEXT:    s_mov_b32 s3, 0xf000
78; GCN-NEXT:    s_mov_b32 s2, -1
79; GCN-NEXT:    v_mov_b32_e32 v3, v1
80; GCN-NEXT:    s_waitcnt lgkmcnt(0)
81; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[0:3], 0
82; GCN-NEXT:    s_endpgm
83  %tmp.0 = call i64 @llvm.amdgcn.set.inactive.i64(i64 1, i64 poison) #0
84  %tmp = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp.0)
85  store i64 %tmp, ptr addrspace(1) %out
86  ret void
87}
88
; Checks interaction of the set.inactive expansion with SCC: an s_buffer_load
; result feeds an s_cmp/s_cbranch, and the WWM sequence is emitted in between.
; The IR branches on %cmp to either store %tmp (.zero) or %tmp+1 (.one).
89define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x i32> inreg %desc) {
90; GCN-LABEL: set_inactive_scc:
91; GCN:       ; %bb.0:
92; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
93; GCN-NEXT:    s_load_dword s6, s[4:5], 0x2c
94; GCN-NEXT:    s_waitcnt lgkmcnt(0)
95; GCN-NEXT:    s_buffer_load_dword s7, s[0:3], 0x0
96; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
97; GCN-NEXT:    v_mov_b32_e32 v1, s6
98; GCN-NEXT:    s_or_saveexec_b64 s[2:3], -1
99; GCN-NEXT:    v_cndmask_b32_e64 v0, 42, v1, s[2:3]
100; GCN-NEXT:    s_mov_b64 exec, s[2:3]
101; GCN-NEXT:    s_waitcnt lgkmcnt(0)
102; GCN-NEXT:    s_cmp_lg_u32 s7, 56
103; GCN-NEXT:    v_mov_b32_e32 v1, v0
104; GCN-NEXT:    s_mov_b64 s[2:3], -1
105; GCN-NEXT:    s_cbranch_scc1 .LBB4_3
106; GCN-NEXT:  ; %bb.1: ; %Flow
107; GCN-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
108; GCN-NEXT:    s_cbranch_vccz .LBB4_4
109; GCN-NEXT:  .LBB4_2: ; %.exit
110; GCN-NEXT:    s_endpgm
111; GCN-NEXT:  .LBB4_3: ; %.one
112; GCN-NEXT:    v_add_u32_e32 v2, vcc, 1, v1
113; GCN-NEXT:    s_mov_b32 s3, 0xf000
114; GCN-NEXT:    s_mov_b32 s2, -1
115; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], 0
116; GCN-NEXT:    s_cbranch_execnz .LBB4_2
117; GCN-NEXT:  .LBB4_4: ; %.zero
118; GCN-NEXT:    s_mov_b32 s3, 0xf000
119; GCN-NEXT:    s_mov_b32 s2, -1
120; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
121; GCN-NEXT:    s_endpgm
122  %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 0, i32 0)
123  %cmp = icmp eq i32 %val, 56
124  %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0
125  %tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0)
126  br i1 %cmp, label %.zero, label %.one
127
128.zero:
129  store i32 %tmp, ptr addrspace(1) %out
130  br label %.exit
131
132.one:
133  %tmp.1 = add i32 %tmp, 1
134  store i32 %tmp.1, ptr addrspace(1) %out
135  br label %.exit
136
137.exit:
138  ret void
139}
140
; f32 case: inactive value 3.0 is not an inline constant here, so it is
; materialized into a VGPR (0x40400000) before the v_cndmask.
141define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) {
142; GCN-LABEL: set_inactive_f32:
143; GCN:       ; %bb.0:
144; GCN-NEXT:    s_load_dword s6, s[4:5], 0x2c
145; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
146; GCN-NEXT:    s_mov_b32 s3, 0xf000
147; GCN-NEXT:    s_mov_b32 s2, -1
148; GCN-NEXT:    s_waitcnt lgkmcnt(0)
149; GCN-NEXT:    v_mov_b32_e32 v1, s6
150; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
151; GCN-NEXT:    v_mov_b32_e32 v0, 0x40400000
152; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
153; GCN-NEXT:    s_mov_b64 exec, s[4:5]
154; GCN-NEXT:    v_mov_b32_e32 v1, v0
155; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
156; GCN-NEXT:    s_endpgm
157  %tmp.0 = call float @llvm.amdgcn.set.inactive.f32(float %in, float 3.0) #0
158  %tmp = call float @llvm.amdgcn.strict.wwm.f32(float %tmp.0)
159  store float %tmp, ptr addrspace(1) %out
160  ret void
161}
162
; f64 case: double 4.2 is split into its two 32-bit halves
; (hi=0x4010cccc, lo=0xcccccccd), each selected with its own WWM sequence.
163define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
164; GCN-LABEL: set_inactive_f64:
165; GCN:       ; %bb.0:
166; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
167; GCN-NEXT:    s_mov_b32 s7, 0xf000
168; GCN-NEXT:    s_mov_b32 s6, -1
169; GCN-NEXT:    s_waitcnt lgkmcnt(0)
170; GCN-NEXT:    s_mov_b32 s4, s0
171; GCN-NEXT:    s_mov_b32 s5, s1
172; GCN-NEXT:    v_mov_b32_e32 v2, s3
173; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
174; GCN-NEXT:    v_mov_b32_e32 v0, 0x4010cccc
175; GCN-NEXT:    v_cndmask_b32_e64 v1, v0, v2, s[0:1]
176; GCN-NEXT:    s_mov_b64 exec, s[0:1]
177; GCN-NEXT:    v_mov_b32_e32 v2, s2
178; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
179; GCN-NEXT:    v_mov_b32_e32 v0, 0xcccccccd
180; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
181; GCN-NEXT:    s_mov_b64 exec, s[0:1]
182; GCN-NEXT:    v_mov_b32_e32 v2, v0
183; GCN-NEXT:    v_mov_b32_e32 v3, v1
184; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[4:7], 0
185; GCN-NEXT:    s_endpgm
186  %tmp.0 = call double @llvm.amdgcn.set.inactive.f64(double %in, double 4.2) #0
187  %tmp = call double @llvm.amdgcn.strict.wwm.f64(double %tmp.0)
188  store double %tmp, ptr addrspace(1) %out
189  ret void
190}
191
; <2 x i16>: inactive value <1, 1> is the packed constant 0x10001,
; materialized in a VGPR for the v_cndmask.
192define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) {
193; GCN-LABEL: set_inactive_v2i16:
194; GCN:       ; %bb.0:
195; GCN-NEXT:    s_load_dword s6, s[4:5], 0x2c
196; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
197; GCN-NEXT:    s_mov_b32 s3, 0xf000
198; GCN-NEXT:    s_mov_b32 s2, -1
199; GCN-NEXT:    s_waitcnt lgkmcnt(0)
200; GCN-NEXT:    v_mov_b32_e32 v1, s6
201; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
202; GCN-NEXT:    v_mov_b32_e32 v0, 0x10001
203; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
204; GCN-NEXT:    s_mov_b64 exec, s[4:5]
205; GCN-NEXT:    v_mov_b32_e32 v1, v0
206; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
207; GCN-NEXT:    s_endpgm
208  %tmp.0 = call <2 x i16> @llvm.amdgcn.set.inactive.v2i16(<2 x i16> %in, <2 x i16> <i16 1, i16 1>) #0
209  %tmp = call <2 x i16> @llvm.amdgcn.strict.wwm.v2i16(<2 x i16> %tmp.0)
210  store <2 x i16> %tmp, ptr addrspace(1) %out
211  ret void
212}
213
; <2 x half>: inactive value <1.0, 1.0> is the packed constant 0x3c003c00.
; Fixed the strict.wwm intrinsic overload suffix: the call takes <2 x half>,
; so the mangled name must be .v2f16 (was .v2i16, inconsistent with the
; sibling tests such as set_inactive_v4f16 below). Codegen is unchanged.
214define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
215; GCN-LABEL: set_inactive_v2f16:
216; GCN:       ; %bb.0:
217; GCN-NEXT:    s_load_dword s6, s[4:5], 0x2c
218; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
219; GCN-NEXT:    s_mov_b32 s3, 0xf000
220; GCN-NEXT:    s_mov_b32 s2, -1
221; GCN-NEXT:    s_waitcnt lgkmcnt(0)
222; GCN-NEXT:    v_mov_b32_e32 v1, s6
223; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
224; GCN-NEXT:    v_mov_b32_e32 v0, 0x3c003c00
225; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
226; GCN-NEXT:    s_mov_b64 exec, s[4:5]
227; GCN-NEXT:    v_mov_b32_e32 v1, v0
228; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
229; GCN-NEXT:    s_endpgm
230  %tmp.0 = call <2 x half> @llvm.amdgcn.set.inactive.v2f16(<2 x half> %in, <2 x half> <half 1.0, half 1.0>) #0
231  %tmp = call <2 x half> @llvm.amdgcn.strict.wwm.v2f16(<2 x half> %tmp.0)
232  store <2 x half> %tmp, ptr addrspace(1) %out
233  ret void
234}
235
; <2 x i32>: per-element inactive value 1 is an inline constant, so each
; 32-bit half uses v_cndmask with immediate 1 directly.
236define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> %in) {
237; GCN-LABEL: set_inactive_v2i32:
238; GCN:       ; %bb.0:
239; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
240; GCN-NEXT:    s_mov_b32 s7, 0xf000
241; GCN-NEXT:    s_mov_b32 s6, -1
242; GCN-NEXT:    s_waitcnt lgkmcnt(0)
243; GCN-NEXT:    s_mov_b32 s4, s0
244; GCN-NEXT:    s_mov_b32 s5, s1
245; GCN-NEXT:    v_mov_b32_e32 v2, s3
246; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
247; GCN-NEXT:    v_cndmask_b32_e64 v1, 1, v2, s[0:1]
248; GCN-NEXT:    s_mov_b64 exec, s[0:1]
249; GCN-NEXT:    v_mov_b32_e32 v2, s2
250; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
251; GCN-NEXT:    v_cndmask_b32_e64 v0, 1, v2, s[0:1]
252; GCN-NEXT:    s_mov_b64 exec, s[0:1]
253; GCN-NEXT:    v_mov_b32_e32 v2, v0
254; GCN-NEXT:    v_mov_b32_e32 v3, v1
255; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[4:7], 0
256; GCN-NEXT:    s_endpgm
257  %tmp.0 = call <2 x i32> @llvm.amdgcn.set.inactive.v2i32(<2 x i32> %in, <2 x i32> <i32 1, i32 1>) #0
258  %tmp = call <2 x i32> @llvm.amdgcn.strict.wwm.v2i32(<2 x i32> %tmp.0)
259  store <2 x i32> %tmp, ptr addrspace(1) %out
260  ret void
261}
262
; <2 x float>: per-element inactive value 1.0 is an inline FP constant, so
; each half uses v_cndmask with immediate 1.0 directly.
263define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
264; GCN-LABEL: set_inactive_v2f32:
265; GCN:       ; %bb.0:
266; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
267; GCN-NEXT:    s_mov_b32 s7, 0xf000
268; GCN-NEXT:    s_mov_b32 s6, -1
269; GCN-NEXT:    s_waitcnt lgkmcnt(0)
270; GCN-NEXT:    s_mov_b32 s4, s0
271; GCN-NEXT:    s_mov_b32 s5, s1
272; GCN-NEXT:    v_mov_b32_e32 v2, s3
273; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
274; GCN-NEXT:    v_cndmask_b32_e64 v1, 1.0, v2, s[0:1]
275; GCN-NEXT:    s_mov_b64 exec, s[0:1]
276; GCN-NEXT:    v_mov_b32_e32 v2, s2
277; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
278; GCN-NEXT:    v_cndmask_b32_e64 v0, 1.0, v2, s[0:1]
279; GCN-NEXT:    s_mov_b64 exec, s[0:1]
280; GCN-NEXT:    v_mov_b32_e32 v2, v0
281; GCN-NEXT:    v_mov_b32_e32 v3, v1
282; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[4:7], 0
283; GCN-NEXT:    s_endpgm
284  %tmp.0 = call <2 x float> @llvm.amdgcn.set.inactive.v2f32(<2 x float> %in, <2 x float> <float 1.0, float 1.0>) #0
285  %tmp = call <2 x float> @llvm.amdgcn.strict.wwm.v2f32(<2 x float> %tmp.0)
286  store <2 x float> %tmp, ptr addrspace(1) %out
287  ret void
288}
289
; <2 x bfloat>: inactive value <1.0, 1.0> is the packed constant 0x3f803f80,
; materialized in a VGPR for the v_cndmask.
290define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in) {
291; GCN-LABEL: set_inactive_v2bf16:
292; GCN:       ; %bb.0:
293; GCN-NEXT:    s_load_dword s6, s[4:5], 0x2c
294; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
295; GCN-NEXT:    s_mov_b32 s3, 0xf000
296; GCN-NEXT:    s_mov_b32 s2, -1
297; GCN-NEXT:    s_waitcnt lgkmcnt(0)
298; GCN-NEXT:    v_mov_b32_e32 v1, s6
299; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
300; GCN-NEXT:    v_mov_b32_e32 v0, 0x3f803f80
301; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
302; GCN-NEXT:    s_mov_b64 exec, s[4:5]
303; GCN-NEXT:    v_mov_b32_e32 v1, v0
304; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
305; GCN-NEXT:    s_endpgm
306  %tmp.0 = call <2 x bfloat> @llvm.amdgcn.set.inactive.v2bf16(<2 x bfloat> %in, <2 x bfloat> <bfloat 1.0, bfloat 1.0>) #0
307  %tmp = call <2 x bfloat> @llvm.amdgcn.strict.wwm.v2bf16(<2 x bfloat> %tmp.0)
308  store <2 x bfloat> %tmp, ptr addrspace(1) %out
309  ret void
310}
311
; <4 x i16>: two 32-bit halves, both selecting against the shared packed
; constant 0x10001 (reused from v0 for the second v_cndmask).
312define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> %in) {
313; GCN-LABEL: set_inactive_v4i16:
314; GCN:       ; %bb.0:
315; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
316; GCN-NEXT:    s_mov_b32 s7, 0xf000
317; GCN-NEXT:    s_mov_b32 s6, -1
318; GCN-NEXT:    s_waitcnt lgkmcnt(0)
319; GCN-NEXT:    s_mov_b32 s4, s0
320; GCN-NEXT:    s_mov_b32 s5, s1
321; GCN-NEXT:    v_mov_b32_e32 v2, s3
322; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
323; GCN-NEXT:    v_mov_b32_e32 v0, 0x10001
324; GCN-NEXT:    v_cndmask_b32_e64 v1, v0, v2, s[0:1]
325; GCN-NEXT:    s_mov_b64 exec, s[0:1]
326; GCN-NEXT:    v_mov_b32_e32 v2, s2
327; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
328; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
329; GCN-NEXT:    s_mov_b64 exec, s[0:1]
330; GCN-NEXT:    v_mov_b32_e32 v2, v0
331; GCN-NEXT:    v_mov_b32_e32 v3, v1
332; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[4:7], 0
333; GCN-NEXT:    s_endpgm
334  %tmp.0 = call <4 x i16> @llvm.amdgcn.set.inactive.v4i16(<4 x i16> %in, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) #0
335  %tmp = call <4 x i16> @llvm.amdgcn.strict.wwm.v4i16(<4 x i16> %tmp.0)
336  store <4 x i16> %tmp, ptr addrspace(1) %out
337  ret void
338}
339
; <4 x half>: two 32-bit halves, both selecting against the shared packed
; constant 0x3c003c00 (<1.0, 1.0> per half).
340define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
341; GCN-LABEL: set_inactive_v4f16:
342; GCN:       ; %bb.0:
343; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
344; GCN-NEXT:    s_mov_b32 s7, 0xf000
345; GCN-NEXT:    s_mov_b32 s6, -1
346; GCN-NEXT:    s_waitcnt lgkmcnt(0)
347; GCN-NEXT:    s_mov_b32 s4, s0
348; GCN-NEXT:    s_mov_b32 s5, s1
349; GCN-NEXT:    v_mov_b32_e32 v2, s3
350; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
351; GCN-NEXT:    v_mov_b32_e32 v0, 0x3c003c00
352; GCN-NEXT:    v_cndmask_b32_e64 v1, v0, v2, s[0:1]
353; GCN-NEXT:    s_mov_b64 exec, s[0:1]
354; GCN-NEXT:    v_mov_b32_e32 v2, s2
355; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
356; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
357; GCN-NEXT:    s_mov_b64 exec, s[0:1]
358; GCN-NEXT:    v_mov_b32_e32 v2, v0
359; GCN-NEXT:    v_mov_b32_e32 v3, v1
360; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[4:7], 0
361; GCN-NEXT:    s_endpgm
362  %tmp.0 = call <4 x half> @llvm.amdgcn.set.inactive.v4f16(<4 x half> %in, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>) #0
363  %tmp = call <4 x half> @llvm.amdgcn.strict.wwm.v4f16(<4 x half> %tmp.0)
364  store <4 x half> %tmp, ptr addrspace(1) %out
365  ret void
366}
367
; <4 x bfloat>: two 32-bit halves, both selecting against the shared packed
; constant 0x3f803f80 (<1.0, 1.0> per half).
368define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in) {
369; GCN-LABEL: set_inactive_v4bf16:
370; GCN:       ; %bb.0:
371; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
372; GCN-NEXT:    s_mov_b32 s7, 0xf000
373; GCN-NEXT:    s_mov_b32 s6, -1
374; GCN-NEXT:    s_waitcnt lgkmcnt(0)
375; GCN-NEXT:    s_mov_b32 s4, s0
376; GCN-NEXT:    s_mov_b32 s5, s1
377; GCN-NEXT:    v_mov_b32_e32 v2, s3
378; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
379; GCN-NEXT:    v_mov_b32_e32 v0, 0x3f803f80
380; GCN-NEXT:    v_cndmask_b32_e64 v1, v0, v2, s[0:1]
381; GCN-NEXT:    s_mov_b64 exec, s[0:1]
382; GCN-NEXT:    v_mov_b32_e32 v2, s2
383; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
384; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
385; GCN-NEXT:    s_mov_b64 exec, s[0:1]
386; GCN-NEXT:    v_mov_b32_e32 v2, v0
387; GCN-NEXT:    v_mov_b32_e32 v3, v1
388; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[4:7], 0
389; GCN-NEXT:    s_endpgm
390  %tmp.0 = call <4 x bfloat> @llvm.amdgcn.set.inactive.v4bf16(<4 x bfloat> %in, <4 x bfloat> <bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0>) #0
391  %tmp = call <4 x bfloat> @llvm.amdgcn.strict.wwm.v4bf16(<4 x bfloat> %tmp.0)
392  store <4 x bfloat> %tmp, ptr addrspace(1) %out
393  ret void
394}
395
; 64-bit flat pointer (p0): handled like i64 — two 32-bit halves selecting
; against 0 (null) with separate WWM sequences.
396define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) {
397; GCN-LABEL: set_inactive_p0:
398; GCN:       ; %bb.0:
399; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
400; GCN-NEXT:    s_mov_b32 s7, 0xf000
401; GCN-NEXT:    s_mov_b32 s6, -1
402; GCN-NEXT:    s_waitcnt lgkmcnt(0)
403; GCN-NEXT:    s_mov_b32 s4, s0
404; GCN-NEXT:    s_mov_b32 s5, s1
405; GCN-NEXT:    v_mov_b32_e32 v2, s3
406; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
407; GCN-NEXT:    v_cndmask_b32_e64 v1, 0, v2, s[0:1]
408; GCN-NEXT:    s_mov_b64 exec, s[0:1]
409; GCN-NEXT:    v_mov_b32_e32 v2, s2
410; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
411; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, v2, s[0:1]
412; GCN-NEXT:    s_mov_b64 exec, s[0:1]
413; GCN-NEXT:    v_mov_b32_e32 v2, v0
414; GCN-NEXT:    v_mov_b32_e32 v3, v1
415; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[4:7], 0
416; GCN-NEXT:    s_endpgm
417  %tmp.0 = call ptr @llvm.amdgcn.set.inactive.p0(ptr %in, ptr null) #0
418  %tmp = call ptr @llvm.amdgcn.strict.wwm.p0(ptr %tmp.0)
419  store ptr %tmp, ptr addrspace(1) %out
420  ret void
421}
422
; 32-bit pointer in addrspace(2): single v_cndmask against 0 (null).
423define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(2) %in) {
424; GCN-LABEL: set_inactive_p2:
425; GCN:       ; %bb.0:
426; GCN-NEXT:    s_load_dword s6, s[4:5], 0x2c
427; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
428; GCN-NEXT:    s_mov_b32 s3, 0xf000
429; GCN-NEXT:    s_mov_b32 s2, -1
430; GCN-NEXT:    s_waitcnt lgkmcnt(0)
431; GCN-NEXT:    v_mov_b32_e32 v1, s6
432; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
433; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, v1, s[4:5]
434; GCN-NEXT:    s_mov_b64 exec, s[4:5]
435; GCN-NEXT:    v_mov_b32_e32 v1, v0
436; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
437; GCN-NEXT:    s_endpgm
438  %tmp.0 = call ptr addrspace(2) @llvm.amdgcn.set.inactive.p2(ptr addrspace(2) %in, ptr addrspace(2) null) #0
439  %tmp = call ptr addrspace(2) @llvm.amdgcn.strict.wwm.p2(ptr addrspace(2) %tmp.0)
440  store ptr addrspace(2) %tmp, ptr addrspace(1) %out
441  ret void
442}
443
; 32-bit pointer in addrspace(3) (LDS): single v_cndmask against 0 (null).
444define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(3) %in) {
445; GCN-LABEL: set_inactive_p3:
446; GCN:       ; %bb.0:
447; GCN-NEXT:    s_load_dword s6, s[4:5], 0x2c
448; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
449; GCN-NEXT:    s_mov_b32 s3, 0xf000
450; GCN-NEXT:    s_mov_b32 s2, -1
451; GCN-NEXT:    s_waitcnt lgkmcnt(0)
452; GCN-NEXT:    v_mov_b32_e32 v1, s6
453; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
454; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, v1, s[4:5]
455; GCN-NEXT:    s_mov_b64 exec, s[4:5]
456; GCN-NEXT:    v_mov_b32_e32 v1, v0
457; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
458; GCN-NEXT:    s_endpgm
459  %tmp.0 = call ptr addrspace(3) @llvm.amdgcn.set.inactive.p3(ptr addrspace(3) %in, ptr addrspace(3) null) #0
460  %tmp = call ptr addrspace(3) @llvm.amdgcn.strict.wwm.p3(ptr addrspace(3) %tmp.0)
461  store ptr addrspace(3) %tmp, ptr addrspace(1) %out
462  ret void
463}
464
; 32-bit pointer in addrspace(5) (private): single v_cndmask against 0 (null).
465define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(5) %in) {
466; GCN-LABEL: set_inactive_p5:
467; GCN:       ; %bb.0:
468; GCN-NEXT:    s_load_dword s6, s[4:5], 0x2c
469; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
470; GCN-NEXT:    s_mov_b32 s3, 0xf000
471; GCN-NEXT:    s_mov_b32 s2, -1
472; GCN-NEXT:    s_waitcnt lgkmcnt(0)
473; GCN-NEXT:    v_mov_b32_e32 v1, s6
474; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
475; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, v1, s[4:5]
476; GCN-NEXT:    s_mov_b64 exec, s[4:5]
477; GCN-NEXT:    v_mov_b32_e32 v1, v0
478; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
479; GCN-NEXT:    s_endpgm
480  %tmp.0 = call ptr addrspace(5) @llvm.amdgcn.set.inactive.p5(ptr addrspace(5) %in, ptr addrspace(5) null) #0
481  %tmp = call ptr addrspace(5) @llvm.amdgcn.strict.wwm.p5(ptr addrspace(5) %tmp.0)
482  store ptr addrspace(5) %tmp, ptr addrspace(1) %out
483  ret void
484}
485
; 32-bit pointer in addrspace(6): single v_cndmask against 0 (null).
486define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace(6) %in) {
487; GCN-LABEL: set_inactive_p6:
488; GCN:       ; %bb.0:
489; GCN-NEXT:    s_load_dword s6, s[4:5], 0x2c
490; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
491; GCN-NEXT:    s_mov_b32 s3, 0xf000
492; GCN-NEXT:    s_mov_b32 s2, -1
493; GCN-NEXT:    s_waitcnt lgkmcnt(0)
494; GCN-NEXT:    v_mov_b32_e32 v1, s6
495; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
496; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, v1, s[4:5]
497; GCN-NEXT:    s_mov_b64 exec, s[4:5]
498; GCN-NEXT:    v_mov_b32_e32 v1, v0
499; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
500; GCN-NEXT:    s_endpgm
501  %tmp.0 = call ptr addrspace(6) @llvm.amdgcn.set.inactive.p6(ptr addrspace(6) %in, ptr addrspace(6) null) #0
502  %tmp = call ptr addrspace(6) @llvm.amdgcn.strict.wwm.p6(ptr addrspace(6) %tmp.0)
503  store ptr addrspace(6) %tmp, ptr addrspace(1) %out
504  ret void
505}
506
; Explicit declarations for the i32/i64 overloads and s.buffer.load.
; NOTE(review): the other overloads used above (f32, f64, v2i16, v2f16, p0,
; etc.) are not declared here — presumably relying on the parser to provide
; intrinsic declarations implicitly; confirm if adding new cases.
; #0 marks set.inactive; #1 marks strict.wwm.
507declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #0
508declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64) #0
509declare i32 @llvm.amdgcn.strict.wwm.i32(i32) #1
510declare i64 @llvm.amdgcn.strict.wwm.i64(i64) #1
511declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32)
512
513attributes #0 = { convergent readnone }
514attributes #1 = { convergent nounwind readnone speculatable willreturn }
515