xref: /llvm-project/llvm/test/CodeGen/AMDGPU/wqm.ll (revision f811482a744454c442456dd4275929b1eb1871b6)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-W64 %s
3; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs -mattr=+wavefrontsize32 < %s | FileCheck -check-prefixes=GFX10-W32 %s
4
5; Check that WQM isn't triggered by image load/store intrinsics.
6define amdgpu_ps <4 x float> @test1(<8 x i32> inreg %rsrc, i32 %c) {
7; GFX9-W64-LABEL: test1:
8; GFX9-W64:       ; %bb.0: ; %main_body
9; GFX9-W64-NEXT:    v_mov_b32_e32 v4, v0
10; GFX9-W64-NEXT:    image_load v[0:3], v0, s[0:7] dmask:0xf unorm
11; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
12; GFX9-W64-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf unorm
13; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
14; GFX9-W64-NEXT:    ; return to shader part epilog
15;
16; GFX10-W32-LABEL: test1:
17; GFX10-W32:       ; %bb.0: ; %main_body
18; GFX10-W32-NEXT:    v_mov_b32_e32 v4, v0
19; GFX10-W32-NEXT:    image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
20; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
21; GFX10-W32-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
22; GFX10-W32-NEXT:    ; return to shader part epilog
23main_body:
  ; Load a texel at coordinate %c and store it straight back to the same
  ; coordinate. Neither call is a sample, and the expected output above
  ; contains no s_wqm instruction.
24  %tex = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %c, <8 x i32> %rsrc, i32 0, i32 0)
25  call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %tex, i32 15, i32 %c, <8 x i32> %rsrc, i32 0, i32 0)
26  ret <4 x float> %tex
27}
28
29; Check that WQM is triggered by code calculating inputs to image samples and is disabled as soon as possible
30define amdgpu_ps <4 x float> @test2(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <2 x float> %pos) #6 {
31; GFX9-W64-LABEL: test2:
32; GFX9-W64:       ; %bb.0: ; %main_body
33; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
34; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
35; GFX9-W64-NEXT:    s_mov_b32 m0, s3
36; GFX9-W64-NEXT:    s_nop 0
37; GFX9-W64-NEXT:    v_interp_p1_f32_e32 v2, v0, attr0.x
38; GFX9-W64-NEXT:    v_interp_p1_f32_e32 v3, v0, attr0.y
39; GFX9-W64-NEXT:    v_interp_p2_f32_e32 v2, v1, attr0.x
40; GFX9-W64-NEXT:    v_interp_p2_f32_e32 v3, v1, attr0.y
41; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[0:1]
42; GFX9-W64-NEXT:    image_sample v[0:3], v[2:3], s[4:11], s[12:15] dmask:0xf
43; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
44; GFX9-W64-NEXT:    ; return to shader part epilog
45;
46; GFX10-W32-LABEL: test2:
47; GFX10-W32:       ; %bb.0: ; %main_body
48; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
49; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
50; GFX10-W32-NEXT:    s_mov_b32 m0, s3
51; GFX10-W32-NEXT:    v_interp_p1_f32_e32 v2, v0, attr0.x
52; GFX10-W32-NEXT:    v_interp_p1_f32_e32 v3, v0, attr0.y
53; GFX10-W32-NEXT:    v_interp_p2_f32_e32 v2, v1, attr0.x
54; GFX10-W32-NEXT:    v_interp_p2_f32_e32 v3, v1, attr0.y
55; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
56; GFX10-W32-NEXT:    image_sample v[0:3], v[2:3], s[4:11], s[12:15] dmask:0xf dim:SQ_RSRC_IMG_2D
57; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
58; GFX10-W32-NEXT:    ; return to shader part epilog
59main_body:
60  %inst23 = extractelement <2 x float> %pos, i32 0
61  %inst24 = extractelement <2 x float> %pos, i32 1
  ; Two p1/p2 interpolation pairs produce the two sample coordinates. %m0 is the
  ; last operand of every interp call; the expected output copies it from s3
  ; into m0 and places all four interp instructions between s_wqm and the
  ; s_and that restores exec.
62  %inst25 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 0, i32 0, i32 %m0)
63  %inst26 = tail call float @llvm.amdgcn.interp.p2(float %inst25, float %inst24, i32 0, i32 0, i32 %m0)
64  %inst28 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 1, i32 0, i32 %m0)
65  %inst29 = tail call float @llvm.amdgcn.interp.p2(float %inst28, float %inst24, i32 1, i32 0, i32 %m0)
  ; The sample itself is expected after exec has been restored.
66  %tex = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %inst26, float %inst29, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
67  ret <4 x float> %tex
68}
69
70; ... but disabled for stores (and, in this simple case, not re-enabled) ...
71define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %c) {
72; GFX9-W64-LABEL: test3:
73; GFX9-W64:       ; %bb.0: ; %main_body
74; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
75; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
76; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
77; GFX9-W64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
78; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
79; GFX9-W64-NEXT:    buffer_store_dwordx4 v[0:3], v0, s[0:3], 0 idxen
80; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
81; GFX9-W64-NEXT:    ; return to shader part epilog
82;
83; GFX10-W32-LABEL: test3:
84; GFX10-W32:       ; %bb.0: ; %main_body
85; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
86; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
87; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
88; GFX10-W32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
89; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
90; GFX10-W32-NEXT:    buffer_store_dwordx4 v[0:3], v0, s[0:3], 0 idxen
91; GFX10-W32-NEXT:    ; return to shader part epilog
92main_body:
93  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  ; Reinterpret the first sampled component as i32 so it can be used as the
  ; index operand of the store below.
94  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
95  %tex.2 = extractelement <4 x i32> %tex.1, i32 0
96
  ; Store the whole sample result; the buffer resource operand is undef. In the
  ; expected output the store follows the s_and that restores exec.
97  call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %tex, <4 x i32> undef, i32 %tex.2, i32 0, i32 0, i32 0)
98
99  ret <4 x float> %tex
100}
101
; Same IR as test3, except that the store uses the struct.ptr.buffer intrinsic,
; whose resource operand is a ptr addrspace(8) instead of a <4 x i32>. The
; expected instructions are the same as for test3.
102define amdgpu_ps <4 x float> @test3_ptr_buf(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %c) {
103; GFX9-W64-LABEL: test3_ptr_buf:
104; GFX9-W64:       ; %bb.0: ; %main_body
105; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
106; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
107; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
108; GFX9-W64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
109; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
110; GFX9-W64-NEXT:    buffer_store_dwordx4 v[0:3], v0, s[0:3], 0 idxen
111; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
112; GFX9-W64-NEXT:    ; return to shader part epilog
113;
114; GFX10-W32-LABEL: test3_ptr_buf:
115; GFX10-W32:       ; %bb.0: ; %main_body
116; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
117; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
118; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
119; GFX10-W32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
120; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
121; GFX10-W32-NEXT:    buffer_store_dwordx4 v[0:3], v0, s[0:3], 0 idxen
122; GFX10-W32-NEXT:    ; return to shader part epilog
123main_body:
124  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  ; First sampled component, reinterpreted as i32, is the store's index operand.
125  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
126  %tex.2 = extractelement <4 x i32> %tex.1, i32 0
127
128  call void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float> %tex, ptr addrspace(8) undef, i32 %tex.2, i32 0, i32 0, i32 0)
129
130  ret <4 x float> %tex
131}
132
133; ... and disabled for export.
134define amdgpu_ps void @test3x(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <2 x float> %pos) #6 {
135; GFX9-W64-LABEL: test3x:
136; GFX9-W64:       ; %bb.0: ; %main_body
137; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
138; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
139; GFX9-W64-NEXT:    s_mov_b32 m0, s3
140; GFX9-W64-NEXT:    s_nop 0
141; GFX9-W64-NEXT:    v_interp_p1_f32_e32 v2, v0, attr0.x
142; GFX9-W64-NEXT:    v_interp_p1_f32_e32 v3, v0, attr0.y
143; GFX9-W64-NEXT:    v_interp_p2_f32_e32 v2, v1, attr0.x
144; GFX9-W64-NEXT:    v_interp_p2_f32_e32 v3, v1, attr0.y
145; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[0:1]
146; GFX9-W64-NEXT:    image_sample v[0:3], v[2:3], s[4:11], s[12:15] dmask:0xf
147; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
148; GFX9-W64-NEXT:    exp mrt0 v0, v1, v2, v3 done vm
149; GFX9-W64-NEXT:    s_endpgm
150;
151; GFX10-W32-LABEL: test3x:
152; GFX10-W32:       ; %bb.0: ; %main_body
153; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
154; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
155; GFX10-W32-NEXT:    s_mov_b32 m0, s3
156; GFX10-W32-NEXT:    v_interp_p1_f32_e32 v2, v0, attr0.x
157; GFX10-W32-NEXT:    v_interp_p1_f32_e32 v3, v0, attr0.y
158; GFX10-W32-NEXT:    v_interp_p2_f32_e32 v2, v1, attr0.x
159; GFX10-W32-NEXT:    v_interp_p2_f32_e32 v3, v1, attr0.y
160; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
161; GFX10-W32-NEXT:    image_sample v[0:3], v[2:3], s[4:11], s[12:15] dmask:0xf dim:SQ_RSRC_IMG_2D
162; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
163; GFX10-W32-NEXT:    exp mrt0 v0, v1, v2, v3 done vm
164; GFX10-W32-NEXT:    s_endpgm
165main_body:
166  %inst23 = extractelement <2 x float> %pos, i32 0
167  %inst24 = extractelement <2 x float> %pos, i32 1
  ; Interpolate the two sample coordinates; the expected output keeps these
  ; four instructions between s_wqm and the s_and that restores exec.
168  %inst25 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 0, i32 0, i32 %m0)
169  %inst26 = tail call float @llvm.amdgcn.interp.p2(float %inst25, float %inst24, i32 0, i32 0, i32 %m0)
170  %inst28 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 1, i32 0, i32 %m0)
171  %inst29 = tail call float @llvm.amdgcn.interp.p2(float %inst28, float %inst24, i32 1, i32 0, i32 %m0)
172  %tex = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %inst26, float %inst29, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
173  %tex.0 = extractelement <4 x float> %tex, i32 0
174  %tex.1 = extractelement <4 x float> %tex, i32 1
175  %tex.2 = extractelement <4 x float> %tex, i32 2
176  %tex.3 = extractelement <4 x float> %tex, i32 3
  ; Export all four sampled components (target 0, enable mask 15) with both
  ; trailing i1 flags set; the expected output prints this as "exp mrt0 ... done vm"
  ; after exec has been restored.
177  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tex.0, float %tex.1, float %tex.2, float %tex.3, i1 true, i1 true)
178  ret void
179}
180
181; Check that WQM is re-enabled when required.
182define amdgpu_ps <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, ptr addrspace(1) inreg %ptr, i32 %c, i32 %d, float %data) {
183; GFX9-W64-LABEL: test4:
184; GFX9-W64:       ; %bb.0: ; %main_body
185; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
186; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
187; GFX9-W64-NEXT:    v_mul_lo_u32 v4, v0, v1
188; GFX9-W64-NEXT:    image_sample v0, v4, s[0:7], s[8:11] dmask:0x1
189; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
190; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
191; GFX9-W64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
192; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
193; GFX9-W64-NEXT:    buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
194; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
195; GFX9-W64-NEXT:    ; return to shader part epilog
196;
197; GFX10-W32-LABEL: test4:
198; GFX10-W32:       ; %bb.0: ; %main_body
199; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
200; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
201; GFX10-W32-NEXT:    v_mul_lo_u32 v4, v0, v1
202; GFX10-W32-NEXT:    image_sample v0, v4, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
203; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
204; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
205; GFX10-W32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
206; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
207; GFX10-W32-NEXT:    buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
208; GFX10-W32-NEXT:    ; return to shader part epilog
209main_body:
  ; NOTE: the %ptr and %data arguments are not referenced in this body.
  ; %c.1 is used twice: as the index operand of the store and, bitcast to
  ; float, as the coordinate of the first sample.
210  %c.1 = mul i32 %c, %d
211
  ; The stored data and the buffer resource are both undef.
212  call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> undef, <4 x i32> undef, i32 %c.1, i32 0, i32 0, i32 0)
213  %c.1.bc = bitcast i32 %c.1 to float
214  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.1.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  ; The second sample takes its coordinate from the first sample's result. In
  ; the expected output the first sample is issued before the s_and that
  ; restores exec and the second one after it.
215  %tex0 = extractelement <4 x float> %tex, i32 0
216  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
217  ret <4 x float> %dtex
218}
219
; Same IR as test4, except that the store uses the struct.ptr.buffer intrinsic
; with a ptr addrspace(8) resource operand. The expected instructions are the
; same as for test4.
220define amdgpu_ps <4 x float> @test4_ptr_buf(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, ptr addrspace(1) inreg %ptr, i32 %c, i32 %d, float %data) {
221; GFX9-W64-LABEL: test4_ptr_buf:
222; GFX9-W64:       ; %bb.0: ; %main_body
223; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
224; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
225; GFX9-W64-NEXT:    v_mul_lo_u32 v4, v0, v1
226; GFX9-W64-NEXT:    image_sample v0, v4, s[0:7], s[8:11] dmask:0x1
227; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
228; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
229; GFX9-W64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
230; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
231; GFX9-W64-NEXT:    buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
232; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
233; GFX9-W64-NEXT:    ; return to shader part epilog
234;
235; GFX10-W32-LABEL: test4_ptr_buf:
236; GFX10-W32:       ; %bb.0: ; %main_body
237; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
238; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
239; GFX10-W32-NEXT:    v_mul_lo_u32 v4, v0, v1
240; GFX10-W32-NEXT:    image_sample v0, v4, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
241; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
242; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
243; GFX10-W32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
244; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
245; GFX10-W32-NEXT:    buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
246; GFX10-W32-NEXT:    ; return to shader part epilog
247main_body:
  ; NOTE: the %ptr and %data arguments are not referenced in this body.
248  %c.1 = mul i32 %c, %d
249
250  call void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float> undef, ptr addrspace(8) undef, i32 %c.1, i32 0, i32 0, i32 0)
251  %c.1.bc = bitcast i32 %c.1 to float
252  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.1.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  ; The second sample's coordinate is the first sample's first component.
253  %tex0 = extractelement <4 x float> %tex, i32 0
254  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
255  ret <4 x float> %dtex
256}
257
258; Check that WQM is triggered by the wqm intrinsic.
259; WQM was inserting an unnecessary v_mov to self after the v_add. Make sure this
260; does not happen - the v_add should write the return reg directly.
261define amdgpu_ps float @test5(i32 inreg %idx0, i32 inreg %idx1) {
262; GFX9-W64-LABEL: test5:
263; GFX9-W64:       ; %bb.0: ; %main_body
264; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
265; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
266; GFX9-W64-NEXT:    v_mov_b32_e32 v0, s0
267; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s1
268; GFX9-W64-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 idxen
269; GFX9-W64-NEXT:    s_nop 0
270; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
271; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
272; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v1
273; GFX9-W64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
274; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[2:3]
275; GFX9-W64-NEXT:    ; return to shader part epilog
276;
277; GFX10-W32-LABEL: test5:
278; GFX10-W32:       ; %bb.0: ; %main_body
279; GFX10-W32-NEXT:    s_mov_b32 s2, exec_lo
280; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
281; GFX10-W32-NEXT:    v_mov_b32_e32 v0, s0
282; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s1
283; GFX10-W32-NEXT:    s_clause 0x1
284; GFX10-W32-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 idxen
285; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
286; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
287; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v1
288; GFX10-W32-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
289; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s2
290; GFX10-W32-NEXT:    ; return to shader part epilog
291main_body:
  ; Two indexed loads from an undef buffer resource, one per inreg index.
292  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
293  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
294  %out = fadd float %src0, %src1
  ; The sum is passed through the f32 wqm intrinsic. In the expected output the
  ; loads and the add sit between s_wqm and the final s_and on exec, and the
  ; add writes v0 with no following v_mov.
295  %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
296  ret float %out.0
297}
298
; Same IR as test5, except that the loads use the struct.ptr.buffer intrinsic
; with a ptr addrspace(8) resource operand. The expected instructions are the
; same as for test5.
299define amdgpu_ps float @test5_ptr_buf(i32 inreg %idx0, i32 inreg %idx1) {
300; GFX9-W64-LABEL: test5_ptr_buf:
301; GFX9-W64:       ; %bb.0: ; %main_body
302; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
303; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
304; GFX9-W64-NEXT:    v_mov_b32_e32 v0, s0
305; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s1
306; GFX9-W64-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 idxen
307; GFX9-W64-NEXT:    s_nop 0
308; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
309; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
310; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v1
311; GFX9-W64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
312; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[2:3]
313; GFX9-W64-NEXT:    ; return to shader part epilog
314;
315; GFX10-W32-LABEL: test5_ptr_buf:
316; GFX10-W32:       ; %bb.0: ; %main_body
317; GFX10-W32-NEXT:    s_mov_b32 s2, exec_lo
318; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
319; GFX10-W32-NEXT:    v_mov_b32_e32 v0, s0
320; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s1
321; GFX10-W32-NEXT:    s_clause 0x1
322; GFX10-W32-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 idxen
323; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
324; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
325; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v1
326; GFX10-W32-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
327; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s2
328; GFX10-W32-NEXT:    ; return to shader part epilog
329main_body:
330  %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
331  %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
332  %out = fadd float %src0, %src1
  ; The sum of the two loads is the operand of the f32 wqm intrinsic.
333  %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
334  ret float %out.0
335}
336
337; Check that the wqm intrinsic works correctly for integers.
338define amdgpu_ps float @test6(i32 inreg %idx0, i32 inreg %idx1) {
339; GFX9-W64-LABEL: test6:
340; GFX9-W64:       ; %bb.0: ; %main_body
341; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
342; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
343; GFX9-W64-NEXT:    v_mov_b32_e32 v0, s0
344; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s1
345; GFX9-W64-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 idxen
346; GFX9-W64-NEXT:    s_nop 0
347; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
348; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
349; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v1
350; GFX9-W64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
351; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[2:3]
352; GFX9-W64-NEXT:    ; return to shader part epilog
353;
354; GFX10-W32-LABEL: test6:
355; GFX10-W32:       ; %bb.0: ; %main_body
356; GFX10-W32-NEXT:    s_mov_b32 s2, exec_lo
357; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
358; GFX10-W32-NEXT:    v_mov_b32_e32 v0, s0
359; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s1
360; GFX10-W32-NEXT:    s_clause 0x1
361; GFX10-W32-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 idxen
362; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
363; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
364; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v1
365; GFX10-W32-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
366; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s2
367; GFX10-W32-NEXT:    ; return to shader part epilog
368main_body:
369  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
370  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
371  %out = fadd float %src0, %src1
  ; The addition is still a float add; only the value handed to the wqm
  ; intrinsic is reinterpreted as i32 (and bitcast back for the float return),
  ; so the i32 overload of the intrinsic is the one being called.
372  %out.0 = bitcast float %out to i32
373  %out.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %out.0)
374  %out.2 = bitcast i32 %out.1 to float
375  ret float %out.2
376}
377
; Same IR as test6, except that the loads use the struct.ptr.buffer intrinsic
; with a ptr addrspace(8) resource operand. The expected instructions are the
; same as for test6.
377define amdgpu_ps float @test6_ptr_buf(i32 inreg %idx0, i32 inreg %idx1) {
378; GFX9-W64-LABEL: test6_ptr_buf:
379; GFX9-W64:       ; %bb.0: ; %main_body
380; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
381; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
382; GFX9-W64-NEXT:    v_mov_b32_e32 v0, s0
383; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s1
384; GFX9-W64-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 idxen
385; GFX9-W64-NEXT:    s_nop 0
386; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
387; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
388; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v1
389; GFX9-W64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
390; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[2:3]
391; GFX9-W64-NEXT:    ; return to shader part epilog
392;
393; GFX10-W32-LABEL: test6_ptr_buf:
394; GFX10-W32:       ; %bb.0: ; %main_body
395; GFX10-W32-NEXT:    s_mov_b32 s2, exec_lo
396; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
397; GFX10-W32-NEXT:    v_mov_b32_e32 v0, s0
398; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s1
399; GFX10-W32-NEXT:    s_clause 0x1
400; GFX10-W32-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 idxen
401; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
402; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
403; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v1
404; GFX10-W32-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
405; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s2
406; GFX10-W32-NEXT:    ; return to shader part epilog
407main_body:
408  %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
409  %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
410  %out = fadd float %src0, %src1
  ; Reinterpret the float sum as i32 for the i32 wqm overload, then back.
411  %out.0 = bitcast float %out to i32
412  %out.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %out.0)
413  %out.2 = bitcast i32 %out.1 to float
414  ret float %out.2
415}
417
418; NOTE: llvm.amdgcn.wwm is deprecated, use llvm.amdgcn.strict.wwm instead.
419
420; Check that WWM is triggered by the wwm intrinsic.
421define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
422; GFX9-W64-LABEL: test_wwm1:
423; GFX9-W64:       ; %bb.0: ; %main_body
424; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
425; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
426; GFX9-W64-NEXT:    v_mov_b32_e32 v2, s1
427; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
428; GFX9-W64-NEXT:    s_nop 0
429; GFX9-W64-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
430; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
431; GFX9-W64-NEXT:    v_add_f32_e32 v1, v1, v2
432; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
433; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
434; GFX9-W64-NEXT:    ; return to shader part epilog
435;
436; GFX10-W32-LABEL: test_wwm1:
437; GFX10-W32:       ; %bb.0: ; %main_body
438; GFX10-W32-NEXT:    s_or_saveexec_b32 s2, -1
439; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
440; GFX10-W32-NEXT:    v_mov_b32_e32 v2, s1
441; GFX10-W32-NEXT:    s_clause 0x1
442; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
443; GFX10-W32-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
444; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
445; GFX10-W32-NEXT:    v_add_f32_e32 v1, v1, v2
446; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s2
447; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
448; GFX10-W32-NEXT:    ; return to shader part epilog
449main_body:
450  %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
451  %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
452  %out = fadd float %src0, %src1
  ; The sum is the operand of the f32 wwm intrinsic. In the expected output the
  ; loads and the add sit between s_or_saveexec with -1 and the s_mov that
  ; restores exec; the result is copied into v0 only after the restore.
453  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
454  ret float %out.0
455}
456
457; Same as above, but with an integer type.
458define amdgpu_ps float @test_wwm2(i32 inreg %idx0, i32 inreg %idx1) {
459; GFX9-W64-LABEL: test_wwm2:
460; GFX9-W64:       ; %bb.0: ; %main_body
461; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
462; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
463; GFX9-W64-NEXT:    v_mov_b32_e32 v2, s1
464; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
465; GFX9-W64-NEXT:    s_nop 0
466; GFX9-W64-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
467; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
468; GFX9-W64-NEXT:    v_add_u32_e32 v1, v1, v2
469; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
470; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
471; GFX9-W64-NEXT:    ; return to shader part epilog
472;
473; GFX10-W32-LABEL: test_wwm2:
474; GFX10-W32:       ; %bb.0: ; %main_body
475; GFX10-W32-NEXT:    s_or_saveexec_b32 s2, -1
476; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
477; GFX10-W32-NEXT:    v_mov_b32_e32 v2, s1
478; GFX10-W32-NEXT:    s_clause 0x1
479; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
480; GFX10-W32-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
481; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
482; GFX10-W32-NEXT:    v_add_nc_u32_e32 v1, v1, v2
483; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s2
484; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
485; GFX10-W32-NEXT:    ; return to shader part epilog
486main_body:
487  %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
488  %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
  ; Reinterpret both loaded floats as i32 so that the add and the wwm call are
  ; integer typed; the expected output accordingly shows an integer add.
489  %src0.0 = bitcast float %src0 to i32
490  %src1.0 = bitcast float %src1 to i32
491  %out = add i32 %src0.0, %src1.0
492  %out.0 = call i32 @llvm.amdgcn.wwm.i32(i32 %out)
  ; Bitcast back to float only to match the function's return type.
493  %out.1 = bitcast i32 %out.0 to float
494  ret float %out.1
495}
496
497; Check that we don't leave WWM on for computations that don't require WWM,
498; since that will lead to clobbering things that aren't supposed to be clobbered
499; in cases like this.
500; We enforce this by checking that v_add gets emitted in the same block as
501; WWM computations.
502define amdgpu_ps float @test_wwm3(i32 inreg %idx) {
503; GFX9-W64-LABEL: test_wwm3:
504; GFX9-W64:       ; %bb.0: ; %main_body
505; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
506; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
507; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 16, v0
508; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
509; GFX9-W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
510; GFX9-W64-NEXT:    s_cbranch_execz .LBB13_2
511; GFX9-W64-NEXT:  ; %bb.1: ; %if
512; GFX9-W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
513; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
514; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
515; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
516; GFX9-W64-NEXT:    v_add_f32_e32 v2, v1, v1
517; GFX9-W64-NEXT:    s_mov_b64 exec, s[4:5]
518; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
519; GFX9-W64-NEXT:    v_add_f32_e32 v0, v1, v0
520; GFX9-W64-NEXT:  .LBB13_2: ; %endif
521; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[2:3]
522; GFX9-W64-NEXT:    ; return to shader part epilog
523;
524; GFX10-W32-LABEL: test_wwm3:
525; GFX10-W32:       ; %bb.0: ; %main_body
526; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
527; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
528; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 16, v0
529; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
530; GFX10-W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
531; GFX10-W32-NEXT:    s_cbranch_execz .LBB13_2
532; GFX10-W32-NEXT:  ; %bb.1: ; %if
533; GFX10-W32-NEXT:    s_or_saveexec_b32 s2, -1
534; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
535; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
536; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
537; GFX10-W32-NEXT:    v_add_f32_e32 v2, v1, v1
538; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s2
539; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
540; GFX10-W32-NEXT:    v_add_f32_e32 v0, v1, v0
541; GFX10-W32-NEXT:  .LBB13_2: ; %endif
542; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
543; GFX10-W32-NEXT:    ; return to shader part epilog
544main_body:
545  ; use mbcnt to make sure the branch is divergent
546  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
547  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
  ; Lanes whose mbcnt value is >= 16 go straight to %endif; the rest run %if.
548  %cc = icmp uge i32 %hi, 16
549  br i1 %cc, label %endif, label %if
550
551if:
552  %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
553  %out = fadd float %src, %src
554  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
  ; This add consumes the wwm result but is not itself an operand of a wwm
  ; call. In the expected output it is the second v_add in %if and comes after
  ; the s_mov that restores exec.
555  %out.1 = fadd float %src, %out.0
556  br label %endif
557
558endif:
  ; Lanes that skipped %if return 0.0.
559  %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ]
560  ret float %out.2
561}
562
563; Check that WWM writes aren't coalesced with non-WWM writes, since the WWM
564; write could clobber disabled channels in the non-WWM one.
565; We enforce this by checking that v_mov gets emitted in the same block as
566; WWM computations.
567define amdgpu_ps float @test_wwm4(i32 inreg %idx) {
568; GFX9-W64-LABEL: test_wwm4:
569; GFX9-W64:       ; %bb.0: ; %main_body
570; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
571; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
572; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 16, v0
573; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
574; GFX9-W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
575; GFX9-W64-NEXT:    s_cbranch_execz .LBB14_2
576; GFX9-W64-NEXT:  ; %bb.1: ; %if
577; GFX9-W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
578; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
579; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
580; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
581; GFX9-W64-NEXT:    v_add_f32_e32 v1, v1, v1
582; GFX9-W64-NEXT:    s_mov_b64 exec, s[4:5]
583; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
584; GFX9-W64-NEXT:  .LBB14_2: ; %endif
585; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[2:3]
586; GFX9-W64-NEXT:    ; return to shader part epilog
587;
588; GFX10-W32-LABEL: test_wwm4:
589; GFX10-W32:       ; %bb.0: ; %main_body
590; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
591; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
592; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 16, v0
593; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
594; GFX10-W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
595; GFX10-W32-NEXT:    s_cbranch_execz .LBB14_2
596; GFX10-W32-NEXT:  ; %bb.1: ; %if
597; GFX10-W32-NEXT:    s_or_saveexec_b32 s2, -1
598; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
599; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
600; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
601; GFX10-W32-NEXT:    v_add_f32_e32 v1, v1, v1
602; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s2
603; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
604; GFX10-W32-NEXT:  .LBB14_2: ; %endif
605; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
606; GFX10-W32-NEXT:    ; return to shader part epilog
607main_body:
608  ; use mbcnt to make sure the branch is divergent
609  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
610  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
611  %cc = icmp uge i32 %hi, 16
612  br i1 %cc, label %endif, label %if
613
614if:
615  %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
616  %out = fadd float %src, %src
  ; %out.0 has no user inside %if; it is only consumed by the phi in %endif.
  ; The expected output still copies it into v0 inside %if, after the s_mov
  ; that restores exec.
617  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
618  br label %endif
619
620endif:
  ; Merges the wwm result with the constant 0.0 coming from %main_body.
621  %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
622  ret float %out.1
623}
624
625; Make sure the transition from Exact to WWM then WQM works properly.
626define amdgpu_ps float @test_wwm5(i32 inreg %idx0, i32 inreg %idx1) {
627; GFX9-W64-LABEL: test_wwm5:
628; GFX9-W64:       ; %bb.0: ; %main_body
629; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
630; GFX9-W64-NEXT:    v_mov_b32_e32 v0, s0
631; GFX9-W64-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 idxen
632; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
633; GFX9-W64-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
634; GFX9-W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
635; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s1
636; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
637; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
638; GFX9-W64-NEXT:    v_add_f32_e32 v1, v1, v1
639; GFX9-W64-NEXT:    s_mov_b64 exec, s[4:5]
640; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
641; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
642; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v0
643; GFX9-W64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
644; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[2:3]
645; GFX9-W64-NEXT:    ; return to shader part epilog
646;
647; GFX10-W32-LABEL: test_wwm5:
648; GFX10-W32:       ; %bb.0: ; %main_body
649; GFX10-W32-NEXT:    v_mov_b32_e32 v0, s0
650; GFX10-W32-NEXT:    s_mov_b32 s2, exec_lo
651; GFX10-W32-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 idxen
652; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
653; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s1
654; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
655; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
656; GFX10-W32-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
657; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
658; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
659; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
660; GFX10-W32-NEXT:    v_add_f32_e32 v1, v1, v1
661; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
662; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
663; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
664; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v0
665; GFX10-W32-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
666; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s2
667; GFX10-W32-NEXT:    ; return to shader part epilog
668main_body:
  ; Step 1: load at %idx0 and store the loaded value back at %idx0. Neither
  ; value is an operand of a wwm or wqm call.
669  %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
670  call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %src0, ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
  ; Step 2: a second load at %idx1, doubled, is the operand of the wwm call.
671  %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
672  %temp = fadd float %src1, %src1
673  %temp.0 = call float @llvm.amdgcn.wwm.f32(float %temp)
  ; Step 3: the wwm result, doubled again, is the operand of the wqm call. In
  ; the expected output s_wqm follows directly after the exec restore that
  ; ends the wwm region.
674  %out = fadd float %temp.0, %temp.0
675  %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
676  ret float %out.0
677}
678
679; Check that WWM is turned on correctly across basic block boundaries.
680; if..then..endif version
681;SI-CHECK: buffer_load_dword
682;VI-CHECK: flat_load_dword
683;SI-CHECK: buffer_load_dword
684;VI-CHECK: flat_load_dword
685define amdgpu_ps float @test_wwm6_then() {
686; GFX9-W64-LABEL: test_wwm6_then:
687; GFX9-W64:       ; %bb.0: ; %main_body
688; GFX9-W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
689; GFX9-W64-NEXT:    global_load_dword v1, v[3:4], off glc
690; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
691; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
692; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
693; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
694; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 16, v0
695; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
696; GFX9-W64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
697; GFX9-W64-NEXT:    s_cbranch_execz .LBB16_2
698; GFX9-W64-NEXT:  ; %bb.1: ; %if
699; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
700; GFX9-W64-NEXT:    global_load_dword v2, v[3:4], off glc
701; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
702; GFX9-W64-NEXT:    v_add_f32_e32 v1, v1, v2
703; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
704; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
705; GFX9-W64-NEXT:  .LBB16_2: ; %endif
706; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[0:1]
707; GFX9-W64-NEXT:    ; return to shader part epilog
708;
709; GFX10-W32-LABEL: test_wwm6_then:
710; GFX10-W32:       ; %bb.0: ; %main_body
711; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
712; GFX10-W32-NEXT:    global_load_dword v1, v[3:4], off glc dlc
713; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
714; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
715; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
716; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
717; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 16, v0
718; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
719; GFX10-W32-NEXT:    s_and_saveexec_b32 s0, vcc_lo
720; GFX10-W32-NEXT:    s_cbranch_execz .LBB16_2
721; GFX10-W32-NEXT:  ; %bb.1: ; %if
722; GFX10-W32-NEXT:    s_or_saveexec_b32 s1, -1
723; GFX10-W32-NEXT:    global_load_dword v2, v[3:4], off glc dlc
724; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
725; GFX10-W32-NEXT:    v_add_f32_e32 v1, v1, v2
726; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s1
727; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
728; GFX10-W32-NEXT:  .LBB16_2: ; %endif
729; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
730; GFX10-W32-NEXT:    ; return to shader part epilog
731main_body:
  ; %src0 is defined in the entry block but its only use is the fadd in %if
  ; that feeds llvm.amdgcn.wwm. The checks expect this load to already sit
  ; inside an "s_or_saveexec ..., -1" region of its own.
732  %src0 = load volatile float, ptr addrspace(1) undef
733  ; use mbcnt to make sure the branch is divergent
734  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
735  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
736  %cc = icmp uge i32 %hi, 16
737  br i1 %cc, label %endif, label %if
738
739if:
  ; Second volatile load plus the fadd that combines it with %src0; both are
  ; expected inside a second s_or_saveexec region in this block.
740  %src1 = load volatile float, ptr addrspace(1) undef
741  %out = fadd float %src0, %src1
742  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
743  br label %endif
744
745endif:
  ; Result is the wwm value when coming from %if, 0.0 when coming directly
  ; from %main_body.
746  %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
747  ret float %out.1
748}
749
750; Check that WWM is turned on correctly across basic block boundaries.
751; loop version
752;SI-CHECK: buffer_load_dword
753;VI-CHECK: flat_load_dword
754;SI-CHECK: buffer_load_dword
755;VI-CHECK: flat_load_dword
756define amdgpu_ps float @test_wwm6_loop() {
757; GFX9-W64-LABEL: test_wwm6_loop:
758; GFX9-W64:       ; %bb.0: ; %main_body
759; GFX9-W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
760; GFX9-W64-NEXT:    global_load_dword v1, v[3:4], off glc
761; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
762; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
763; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
764; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v0
765; GFX9-W64-NEXT:    s_mov_b64 s[0:1], 0
766; GFX9-W64-NEXT:  .LBB17_1: ; %loop
767; GFX9-W64-NEXT:    ; =>This Inner Loop Header: Depth=1
768; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
769; GFX9-W64-NEXT:    global_load_dword v2, v[3:4], off glc
770; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
771; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
772; GFX9-W64-NEXT:    v_add_u32_e32 v3, -1, v3
773; GFX9-W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
774; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
775; GFX9-W64-NEXT:    v_add_f32_e32 v2, v1, v2
776; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
777; GFX9-W64-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
778; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
779; GFX9-W64-NEXT:    s_andn2_b64 exec, exec, s[0:1]
780; GFX9-W64-NEXT:    s_cbranch_execnz .LBB17_1
781; GFX9-W64-NEXT:  ; %bb.2: ; %endloop
782; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[0:1]
783; GFX9-W64-NEXT:    ; return to shader part epilog
784;
785; GFX10-W32-LABEL: test_wwm6_loop:
786; GFX10-W32:       ; %bb.0: ; %main_body
787; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
788; GFX10-W32-NEXT:    global_load_dword v1, v[3:4], off glc dlc
789; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
790; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
791; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
792; GFX10-W32-NEXT:    s_mov_b32 s0, 0
793; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v0
794; GFX10-W32-NEXT:  .LBB17_1: ; %loop
795; GFX10-W32-NEXT:    ; =>This Inner Loop Header: Depth=1
796; GFX10-W32-NEXT:    s_or_saveexec_b32 s1, -1
797; GFX10-W32-NEXT:    global_load_dword v2, v[3:4], off glc dlc
798; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
799; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s1
800; GFX10-W32-NEXT:    v_add_nc_u32_e32 v3, -1, v3
801; GFX10-W32-NEXT:    s_or_saveexec_b32 s1, -1
802; GFX10-W32-NEXT:    v_add_f32_e32 v2, v1, v2
803; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s1
804; GFX10-W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
805; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
806; GFX10-W32-NEXT:    s_or_b32 s0, vcc_lo, s0
807; GFX10-W32-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
808; GFX10-W32-NEXT:    s_cbranch_execnz .LBB17_1
809; GFX10-W32-NEXT:  ; %bb.2: ; %endloop
810; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
811; GFX10-W32-NEXT:    ; return to shader part epilog
812main_body:
  ; %src0 is loaded once before the loop; every iteration's fadd (which feeds
  ; llvm.amdgcn.wwm) reads it. The checks expect this load inside an
  ; "s_or_saveexec ..., -1" region in the entry block.
813  %src0 = load volatile float, ptr addrspace(1) undef
814  ; use mbcnt to make sure the branch is divergent
815  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
816  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
817  br label %loop
818
819loop:
  ; %counter starts at the mbcnt result and is decremented once per
  ; iteration; the loop exits when it reaches zero.
820  %counter = phi i32 [ %hi, %main_body ], [ %counter.1, %loop ]
  ; Per-iteration load and fadd feeding wwm. In the expected output the
  ; counter update is emitted with exec restored, between two separate
  ; s_or_saveexec regions (one for the load, one for the v_add_f32).
821  %src1 = load volatile float, ptr addrspace(1) undef
822  %out = fadd float %src0, %src1
823  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
824  %counter.1 = sub i32 %counter, 1
825  %cc = icmp ne i32 %counter.1, 0
826  br i1 %cc, label %loop, label %endloop
827
828endloop:
  ; Returns the wwm value, which is defined inside the loop body.
829  ret float %out.0
830}
831
832; Check that @llvm.amdgcn.set.inactive disables WWM.
833define amdgpu_ps void @test_wwm_set_inactive1(i32 inreg %idx) {
834; GFX9-W64-LABEL: test_wwm_set_inactive1:
835; GFX9-W64:       ; %bb.0: ; %main_body
836; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
837; GFX9-W64-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 idxen
838; GFX9-W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
839; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
840; GFX9-W64-NEXT:    v_cndmask_b32_e64 v0, 0, v2, s[0:1]
841; GFX9-W64-NEXT:    v_add_u32_e32 v0, v0, v0
842; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
843; GFX9-W64-NEXT:    v_mov_b32_e32 v2, v0
844; GFX9-W64-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 idxen
845; GFX9-W64-NEXT:    s_endpgm
846;
847; GFX10-W32-LABEL: test_wwm_set_inactive1:
848; GFX10-W32:       ; %bb.0: ; %main_body
849; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
850; GFX10-W32-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 idxen
851; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
852; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
853; GFX10-W32-NEXT:    v_cndmask_b32_e64 v0, 0, v2, s0
854; GFX10-W32-NEXT:    v_add_nc_u32_e32 v0, v0, v0
855; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
856; GFX10-W32-NEXT:    v_mov_b32_e32 v2, v0
857; GFX10-W32-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 idxen
858; GFX10-W32-NEXT:    s_endpgm
859main_body:
  ; The load feeds set.inactive rather than the wwm add directly; the checks
  ; expect the buffer_load to be issued before s_or_saveexec.
860  %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
861  %src.0 = bitcast float %src to i32
  ; set.inactive with 0 as its second operand; in the expected output this
  ; becomes a v_cndmask choosing between 0 and the loaded value, keyed on the
  ; exec mask saved by s_or_saveexec.
862  %src.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src.0, i32 0)
  ; The add and the wwm marker: expected inside the s_or_saveexec region.
863  %out = add i32 %src.1, %src.1
864  %out.0 = call i32 @llvm.amdgcn.wwm.i32(i32 %out)
865  %out.1 = bitcast i32 %out.0 to float
  ; The store is expected after exec has been restored.
866  call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %out.1, ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
867  ret void
868}
869
870; Check that Strict WQM is triggered by the strict_wqm intrinsic.
871define amdgpu_ps float @test_strict_wqm1(i32 inreg %idx0, i32 inreg %idx1) {
872; GFX9-W64-LABEL: test_strict_wqm1:
873; GFX9-W64:       ; %bb.0: ; %main_body
874; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
875; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
876; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
877; GFX9-W64-NEXT:    v_mov_b32_e32 v2, s1
878; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
879; GFX9-W64-NEXT:    s_nop 0
880; GFX9-W64-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
881; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
882; GFX9-W64-NEXT:    v_add_f32_e32 v1, v1, v2
883; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
884; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
885; GFX9-W64-NEXT:    ; return to shader part epilog
886;
887; GFX10-W32-LABEL: test_strict_wqm1:
888; GFX10-W32:       ; %bb.0: ; %main_body
889; GFX10-W32-NEXT:    s_mov_b32 s2, exec_lo
890; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
891; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
892; GFX10-W32-NEXT:    v_mov_b32_e32 v2, s1
893; GFX10-W32-NEXT:    s_clause 0x1
894; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
895; GFX10-W32-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
896; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
897; GFX10-W32-NEXT:    v_add_f32_e32 v1, v1, v2
898; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s2
899; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
900; GFX10-W32-NEXT:    ; return to shader part epilog
901main_body:
  ; Both loads and the fadd feed llvm.amdgcn.strict.wqm. The checks expect
  ; all three between "s_wqm exec" and the restore of the saved exec mask,
  ; with only the final v_mov into the return register after the restore.
902  %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
903  %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
904  %out = fadd float %src0, %src1
905  %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
906  ret float %out.0
907}
908
909; Same as above, but with an integer type.
910define amdgpu_ps float @test_strict_wqm2(i32 inreg %idx0, i32 inreg %idx1) {
911; GFX9-W64-LABEL: test_strict_wqm2:
912; GFX9-W64:       ; %bb.0: ; %main_body
913; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
914; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
915; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
916; GFX9-W64-NEXT:    v_mov_b32_e32 v2, s1
917; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
918; GFX9-W64-NEXT:    s_nop 0
919; GFX9-W64-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
920; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
921; GFX9-W64-NEXT:    v_add_u32_e32 v1, v1, v2
922; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
923; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
924; GFX9-W64-NEXT:    ; return to shader part epilog
925;
926; GFX10-W32-LABEL: test_strict_wqm2:
927; GFX10-W32:       ; %bb.0: ; %main_body
928; GFX10-W32-NEXT:    s_mov_b32 s2, exec_lo
929; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
930; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
931; GFX10-W32-NEXT:    v_mov_b32_e32 v2, s1
932; GFX10-W32-NEXT:    s_clause 0x1
933; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
934; GFX10-W32-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
935; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
936; GFX10-W32-NEXT:    v_add_nc_u32_e32 v1, v1, v2
937; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s2
938; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
939; GFX10-W32-NEXT:    ; return to shader part epilog
940main_body:
  ; Two float loads, reinterpreted as i32 so the strict.wqm operand is an
  ; integer add. The expected output differs from the float variant only in
  ; the add opcode (v_add_u32 / v_add_nc_u32 instead of v_add_f32).
941  %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
942  %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
943  %src0.0 = bitcast float %src0 to i32
944  %src1.0 = bitcast float %src1 to i32
945  %out = add i32 %src0.0, %src1.0
946  %out.0 = call i32 @llvm.amdgcn.strict.wqm.i32(i32 %out)
  ; Cast back to float only to match the function's return type.
947  %out.1 = bitcast i32 %out.0 to float
948  ret float %out.1
949}
950
951; Check that we don't leave Strict WQM on for computations that don't require it,
952; since that will lead to clobbering things that aren't supposed to be clobbered
953; in cases like this.
954; We enforce this by checking that v_add gets emitted in the same block as
955; Strict WQM computations.
956define amdgpu_ps float @test_strict_wqm3(i32 inreg %idx) {
957; GFX9-W64-LABEL: test_strict_wqm3:
958; GFX9-W64:       ; %bb.0: ; %main_body
959; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
960; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
961; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 16, v0
962; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
963; GFX9-W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
964; GFX9-W64-NEXT:    s_cbranch_execz .LBB21_2
965; GFX9-W64-NEXT:  ; %bb.1: ; %if
966; GFX9-W64-NEXT:    s_mov_b64 s[4:5], exec
967; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
968; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
969; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
970; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
971; GFX9-W64-NEXT:    v_add_f32_e32 v2, v1, v1
972; GFX9-W64-NEXT:    s_mov_b64 exec, s[4:5]
973; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
974; GFX9-W64-NEXT:    v_add_f32_e32 v0, v1, v0
975; GFX9-W64-NEXT:  .LBB21_2: ; %endif
976; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[2:3]
977; GFX9-W64-NEXT:    ; return to shader part epilog
978;
979; GFX10-W32-LABEL: test_strict_wqm3:
980; GFX10-W32:       ; %bb.0: ; %main_body
981; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
982; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
983; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 16, v0
984; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
985; GFX10-W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
986; GFX10-W32-NEXT:    s_cbranch_execz .LBB21_2
987; GFX10-W32-NEXT:  ; %bb.1: ; %if
988; GFX10-W32-NEXT:    s_mov_b32 s2, exec_lo
989; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
990; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
991; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
992; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
993; GFX10-W32-NEXT:    v_add_f32_e32 v2, v1, v1
994; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s2
995; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
996; GFX10-W32-NEXT:    v_add_f32_e32 v0, v1, v0
997; GFX10-W32-NEXT:  .LBB21_2: ; %endif
998; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
999; GFX10-W32-NEXT:    ; return to shader part epilog
1000main_body:
1001  ; use mbcnt to make sure the branch is divergent
1002  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
1003  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
1004  %cc = icmp uge i32 %hi, 16
1005  br i1 %cc, label %endif, label %if
1006
1007if:
  ; The load and the first fadd feed llvm.amdgcn.strict.wqm; the checks
  ; expect them between "s_wqm exec" and the exec restore.
1008  %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
1009  %out = fadd float %src, %src
1010  %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
  ; This fadd only consumes the strict.wqm result. The checks expect its
  ; v_add_f32 after exec is restored, but still inside the %if block.
1011  %out.1 = fadd float %src, %out.0
1012  br label %endif
1013
1014endif:
  ; 0.0 is the value when %if was skipped.
1015  %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ]
1016  ret float %out.2
1017}
1018
1019; Check that Strict WQM writes aren't coalesced with non-strict writes, since
1020; the Strict WQM write could clobber disabled channels in the non-strict one.
1021; We enforce this by checking that v_mov gets emitted in the same block as
1022; Strict WQM computations.
1023define amdgpu_ps float @test_strict_wqm4(i32 inreg %idx) {
1024; GFX9-W64-LABEL: test_strict_wqm4:
1025; GFX9-W64:       ; %bb.0: ; %main_body
1026; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
1027; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
1028; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 16, v0
1029; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
1030; GFX9-W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1031; GFX9-W64-NEXT:    s_cbranch_execz .LBB22_2
1032; GFX9-W64-NEXT:  ; %bb.1: ; %if
1033; GFX9-W64-NEXT:    s_mov_b64 s[4:5], exec
1034; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1035; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
1036; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
1037; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1038; GFX9-W64-NEXT:    v_add_f32_e32 v1, v1, v1
1039; GFX9-W64-NEXT:    s_mov_b64 exec, s[4:5]
1040; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
1041; GFX9-W64-NEXT:  .LBB22_2: ; %endif
1042; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[2:3]
1043; GFX9-W64-NEXT:    ; return to shader part epilog
1044;
1045; GFX10-W32-LABEL: test_strict_wqm4:
1046; GFX10-W32:       ; %bb.0: ; %main_body
1047; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
1048; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
1049; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 16, v0
1050; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
1051; GFX10-W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
1052; GFX10-W32-NEXT:    s_cbranch_execz .LBB22_2
1053; GFX10-W32-NEXT:  ; %bb.1: ; %if
1054; GFX10-W32-NEXT:    s_mov_b32 s2, exec_lo
1055; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1056; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
1057; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
1058; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1059; GFX10-W32-NEXT:    v_add_f32_e32 v1, v1, v1
1060; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s2
1061; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
1062; GFX10-W32-NEXT:  .LBB22_2: ; %endif
1063; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
1064; GFX10-W32-NEXT:    ; return to shader part epilog
1065main_body:
1066  ; use mbcnt to make sure the branch is divergent
1067  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
1068  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
1069  %cc = icmp uge i32 %hi, 16
1070  br i1 %cc, label %endif, label %if
1071
1072if:
  ; The strict.wqm result flows straight into the phi in %endif with no
  ; further arithmetic. The checks expect the strict computation in v1 and a
  ; separate v_mov into v0 after exec is restored, rather than the strict
  ; region writing v0 directly.
1073  %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
1074  %out = fadd float %src, %src
1075  %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
1076  br label %endif
1077
1078endif:
  ; The other phi input is the constant 0.0, which the expected output
  ; materialises in v0 before the branch.
1079  %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
1080  ret float %out.1
1081}
1082
1083; Make sure the transition from Exact to Strict WQM then WQM works properly.
1084define amdgpu_ps float @test_strict_wqm5(i32 inreg %idx0, i32 inreg %idx1) {
1085; GFX9-W64-LABEL: test_strict_wqm5:
1086; GFX9-W64:       ; %bb.0: ; %main_body
1087; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
1088; GFX9-W64-NEXT:    v_mov_b32_e32 v0, s0
1089; GFX9-W64-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 idxen
1090; GFX9-W64-NEXT:    s_mov_b64 s[4:5], exec
1091; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1092; GFX9-W64-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
1093; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1094; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s1
1095; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
1096; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1097; GFX9-W64-NEXT:    v_add_f32_e32 v1, v1, v1
1098; GFX9-W64-NEXT:    s_mov_b64 exec, s[4:5]
1099; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1100; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
1101; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v0
1102; GFX9-W64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
1103; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[2:3]
1104; GFX9-W64-NEXT:    ; return to shader part epilog
1105;
1106; GFX10-W32-LABEL: test_strict_wqm5:
1107; GFX10-W32:       ; %bb.0: ; %main_body
1108; GFX10-W32-NEXT:    v_mov_b32_e32 v0, s0
1109; GFX10-W32-NEXT:    s_mov_b32 s2, exec_lo
1110; GFX10-W32-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 idxen
1111; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
1112; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1113; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s1
1114; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
1115; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1116; GFX10-W32-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
1117; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
1118; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1119; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
1120; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1121; GFX10-W32-NEXT:    v_add_f32_e32 v1, v1, v1
1122; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
1123; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1124; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
1125; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v0
1126; GFX10-W32-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
1127; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s2
1128; GFX10-W32-NEXT:    ; return to shader part epilog
1129main_body:
  ; Phase 1: load from %idx0 and store the value straight back. In the
  ; expected output the store is issued before the s_wqm that opens the
  ; strict region containing the %idx1 load (GFX10 additionally has a short
  ; earlier region that only holds a v_mov).
1130  %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
1131  call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %src0, ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
  ; Phase 2: the %idx1 load and the fadd feed llvm.amdgcn.strict.wqm; the
  ; checks expect them between an s_wqm and the restore of the exec copy
  ; saved just for that region.
1132  %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
1133  %temp = fadd float %src1, %src1
1134  %temp.0 = call float @llvm.amdgcn.strict.wqm.f32(float %temp)
  ; Phase 3: the strict result is doubled and passed to llvm.amdgcn.wqm; the
  ; checks expect a second s_wqm right after the restore and a final s_and
  ; with the exec mask saved at function entry.
1135  %out = fadd float %temp.0, %temp.0
1136  %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
1137  ret float %out.0
1138}
1139
1140; Check that Strict WQM is turned on correctly across basic block boundaries.
1141; if..then..endif version
1142;SI-CHECK: buffer_load_dword
1143;VI-CHECK: flat_load_dword
1144;SI-CHECK: buffer_load_dword
1145;VI-CHECK: flat_load_dword
1146define amdgpu_ps float @test_strict_wqm6_then() {
1147; GFX9-W64-LABEL: test_strict_wqm6_then:
1148; GFX9-W64:       ; %bb.0: ; %main_body
1149; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
1150; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1151; GFX9-W64-NEXT:    global_load_dword v1, v[3:4], off glc
1152; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1153; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
1154; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
1155; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
1156; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 16, v0
1157; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
1158; GFX9-W64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1159; GFX9-W64-NEXT:    s_cbranch_execz .LBB24_2
1160; GFX9-W64-NEXT:  ; %bb.1: ; %if
1161; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
1162; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1163; GFX9-W64-NEXT:    global_load_dword v2, v[3:4], off glc
1164; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1165; GFX9-W64-NEXT:    v_add_f32_e32 v1, v1, v2
1166; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
1167; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
1168; GFX9-W64-NEXT:  .LBB24_2: ; %endif
1169; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[0:1]
1170; GFX9-W64-NEXT:    ; return to shader part epilog
1171;
1172; GFX10-W32-LABEL: test_strict_wqm6_then:
1173; GFX10-W32:       ; %bb.0: ; %main_body
1174; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
1175; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1176; GFX10-W32-NEXT:    global_load_dword v1, v[3:4], off glc dlc
1177; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1178; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
1179; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
1180; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
1181; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 16, v0
1182; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
1183; GFX10-W32-NEXT:    s_and_saveexec_b32 s0, vcc_lo
1184; GFX10-W32-NEXT:    s_cbranch_execz .LBB24_2
1185; GFX10-W32-NEXT:  ; %bb.1: ; %if
1186; GFX10-W32-NEXT:    s_mov_b32 s1, exec_lo
1187; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1188; GFX10-W32-NEXT:    global_load_dword v2, v[3:4], off glc dlc
1189; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1190; GFX10-W32-NEXT:    v_add_f32_e32 v1, v1, v2
1191; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s1
1192; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
1193; GFX10-W32-NEXT:  .LBB24_2: ; %endif
1194; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1195; GFX10-W32-NEXT:    ; return to shader part epilog
1196main_body:
  ; %src0 is defined in the entry block but its only use is the fadd in %if
  ; that feeds llvm.amdgcn.strict.wqm. The checks expect this load inside its
  ; own save-exec / s_wqm / restore region.
1197  %src0 = load volatile float, ptr addrspace(1) undef
1198  ; use mbcnt to make sure the branch is divergent
1199  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
1200  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
1201  %cc = icmp uge i32 %hi, 16
1202  br i1 %cc, label %endif, label %if
1203
1204if:
  ; Second volatile load plus the fadd that combines it with %src0; both are
  ; expected inside a second s_wqm region in this block.
1205  %src1 = load volatile float, ptr addrspace(1) undef
1206  %out = fadd float %src0, %src1
1207  %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
1208  br label %endif
1209
1210endif:
  ; Result is the strict.wqm value when coming from %if, 0.0 when coming
  ; directly from %main_body.
1211  %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
1212  ret float %out.1
1213}
1214
1215; Check that Strict WQM is turned on correctly across basic block boundaries.
1216; loop version
1217;SI-CHECK: buffer_load_dword
1218;VI-CHECK: flat_load_dword
1219;SI-CHECK: buffer_load_dword
1220;VI-CHECK: flat_load_dword
1221define amdgpu_ps float @test_strict_wqm6_loop() {
1222; GFX9-W64-LABEL: test_strict_wqm6_loop:
1223; GFX9-W64:       ; %bb.0: ; %main_body
1224; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
1225; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1226; GFX9-W64-NEXT:    global_load_dword v1, v[3:4], off glc
1227; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1228; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
1229; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
1230; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v0
1231; GFX9-W64-NEXT:    s_mov_b64 s[0:1], 0
1232; GFX9-W64-NEXT:  .LBB25_1: ; %loop
1233; GFX9-W64-NEXT:    ; =>This Inner Loop Header: Depth=1
1234; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
1235; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1236; GFX9-W64-NEXT:    global_load_dword v2, v[3:4], off glc
1237; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1238; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
1239; GFX9-W64-NEXT:    v_add_u32_e32 v3, -1, v3
1240; GFX9-W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
1241; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
1242; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1243; GFX9-W64-NEXT:    v_add_f32_e32 v2, v1, v2
1244; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
1245; GFX9-W64-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
1246; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
1247; GFX9-W64-NEXT:    s_andn2_b64 exec, exec, s[0:1]
1248; GFX9-W64-NEXT:    s_cbranch_execnz .LBB25_1
1249; GFX9-W64-NEXT:  ; %bb.2: ; %endloop
1250; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[0:1]
1251; GFX9-W64-NEXT:    ; return to shader part epilog
1252;
1253; GFX10-W32-LABEL: test_strict_wqm6_loop:
1254; GFX10-W32:       ; %bb.0: ; %main_body
1255; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
1256; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1257; GFX10-W32-NEXT:    global_load_dword v1, v[3:4], off glc dlc
1258; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1259; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
1260; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
1261; GFX10-W32-NEXT:    s_mov_b32 s0, 0
1262; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v0
1263; GFX10-W32-NEXT:  .LBB25_1: ; %loop
1264; GFX10-W32-NEXT:    ; =>This Inner Loop Header: Depth=1
1265; GFX10-W32-NEXT:    s_mov_b32 s1, exec_lo
1266; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1267; GFX10-W32-NEXT:    global_load_dword v2, v[3:4], off glc dlc
1268; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1269; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s1
1270; GFX10-W32-NEXT:    v_add_nc_u32_e32 v3, -1, v3
1271; GFX10-W32-NEXT:    s_mov_b32 s1, exec_lo
1272; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1273; GFX10-W32-NEXT:    v_add_f32_e32 v2, v1, v2
1274; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s1
1275; GFX10-W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
1276; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
1277; GFX10-W32-NEXT:    s_or_b32 s0, vcc_lo, s0
1278; GFX10-W32-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
1279; GFX10-W32-NEXT:    s_cbranch_execnz .LBB25_1
1280; GFX10-W32-NEXT:  ; %bb.2: ; %endloop
1281; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1282; GFX10-W32-NEXT:    ; return to shader part epilog
1283main_body:
  ; %src0 is loaded once before the loop; every iteration's fadd (which feeds
  ; llvm.amdgcn.strict.wqm) reads it. The checks expect this load inside a
  ; save-exec / s_wqm / restore region in the entry block.
1284  %src0 = load volatile float, ptr addrspace(1) undef
1285  ; use mbcnt to make sure the branch is divergent
1286  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
1287  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
1288  br label %loop
1289
1290loop:
  ; %counter starts at the mbcnt result and is decremented once per
  ; iteration; the loop exits when it reaches zero.
1291  %counter = phi i32 [ %hi, %main_body ], [ %counter.1, %loop ]
  ; Per-iteration load and fadd feeding strict.wqm. In the expected output
  ; the counter update is emitted with exec restored, between two separate
  ; s_wqm regions (one for the load, one for the v_add_f32).
1292  %src1 = load volatile float, ptr addrspace(1) undef
1293  %out = fadd float %src0, %src1
1294  %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
1295  %counter.1 = sub i32 %counter, 1
1296  %cc = icmp ne i32 %counter.1, 0
1297  br i1 %cc, label %loop, label %endloop
1298
1299endloop:
  ; Returns the strict.wqm value, which is defined inside the loop body.
1300  ret float %out.0
1301}
1302
1303; Check that enabling WQM anywhere enables WQM for the set.inactive source.
1304define amdgpu_ps void @test_set_inactive2(i32 inreg %idx0, i32 inreg %idx1) {
1305; GFX9-W64-LABEL: test_set_inactive2:
1306; GFX9-W64:       ; %bb.0: ; %main_body
1307; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
1308; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1309; GFX9-W64-NEXT:    v_mov_b32_e32 v0, s1
1310; GFX9-W64-NEXT:    v_mov_b32_e32 v2, s0
1311; GFX9-W64-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 idxen
1312; GFX9-W64-NEXT:    s_nop 0
1313; GFX9-W64-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
1314; GFX9-W64-NEXT:    ; kill: def $vgpr1 killed $vgpr1 killed $exec
1315; GFX9-W64-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $exec
1316; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[2:3]
1317; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1318; GFX9-W64-NEXT:    v_add_u32_e32 v1, v2, v1
1319; GFX9-W64-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 idxen
1320; GFX9-W64-NEXT:    s_endpgm
1321;
1322; GFX10-W32-LABEL: test_set_inactive2:
1323; GFX10-W32:       ; %bb.0: ; %main_body
1324; GFX10-W32-NEXT:    s_mov_b32 s2, exec_lo
1325; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1326; GFX10-W32-NEXT:    v_mov_b32_e32 v0, s1
1327; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
1328; GFX10-W32-NEXT:    s_clause 0x1
1329; GFX10-W32-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 idxen
1330; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
1331; GFX10-W32-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $exec
1332; GFX10-W32-NEXT:    ; kill: def $vgpr1 killed $vgpr1 killed $exec
1333; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s2
1334; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1335; GFX10-W32-NEXT:    v_add_nc_u32_e32 v1, v1, v2
1336; GFX10-W32-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 idxen
1337; GFX10-W32-NEXT:    s_endpgm
1338main_body:
  ; %src1 feeds set.inactive (inactive value is undef) and never reaches a
  ; wqm intrinsic itself. The checks nevertheless expect its buffer_load
  ; after "s_wqm exec", together with the %src0 load.
1339  %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
1340  %src1.0 = bitcast float %src1 to i32
1341  %src1.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src1.0, i32 undef)
  ; %src0 is the only value passed to llvm.amdgcn.wqm in this function.
1342  %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
1343  %src0.0 = bitcast float %src0 to i32
1344  %src0.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %src0.0)
  ; The add and the store (to %idx1) are expected after exec has been and-ed
  ; back with the mask saved at function entry.
1345  %out = add i32 %src0.1, %src1.1
1346  %out.0 = bitcast i32 %out to float
1347  call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %out.0, ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
1348  ret void
1349}
1350
1351; Check a case of one branch of an if-else requiring WQM, the other requiring
1352; exact.
1353; Note: In this particular case, the save-and-restore could be avoided if the
1354; analysis understood that the two branches of the if-else are mutually
1355; exclusive.
; The IF block holds two dependent image samples and the ELSE block holds a
; buffer store. In the expected output the store in %ELSE is wrapped in
; s_and_saveexec / s_mov exec using the exec mask saved on entry, while the
; samples in %IF run without that wrapping; exec is ANDed with the saved mask
; again at %END.
1356define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
1357; GFX9-W64-LABEL: test_control_flow_0:
1358; GFX9-W64:       ; %bb.0: ; %main_body
1359; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
1360; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1361; GFX9-W64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
1362; GFX9-W64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
1363; GFX9-W64-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
1364; GFX9-W64-NEXT:    s_cbranch_execz .LBB27_2
1365; GFX9-W64-NEXT:  ; %bb.1: ; %ELSE
1366; GFX9-W64-NEXT:    s_and_saveexec_b64 s[16:17], s[12:13]
1367; GFX9-W64-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
1368; GFX9-W64-NEXT:    ; implicit-def: $vgpr0
1369; GFX9-W64-NEXT:    s_mov_b64 exec, s[16:17]
1370; GFX9-W64-NEXT:  .LBB27_2: ; %Flow
1371; GFX9-W64-NEXT:    s_andn2_saveexec_b64 s[14:15], s[14:15]
1372; GFX9-W64-NEXT:    s_cbranch_execz .LBB27_4
1373; GFX9-W64-NEXT:  ; %bb.3: ; %IF
1374; GFX9-W64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
1375; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1376; GFX9-W64-NEXT:    image_sample v2, v0, s[0:7], s[8:11] dmask:0x1
1377; GFX9-W64-NEXT:  .LBB27_4: ; %END
1378; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[14:15]
1379; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
1380; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1381; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
1382; GFX9-W64-NEXT:    ; return to shader part epilog
1383;
1384; GFX10-W32-LABEL: test_control_flow_0:
1385; GFX10-W32:       ; %bb.0: ; %main_body
1386; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
1387; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1388; GFX10-W32-NEXT:    s_mov_b32 s13, exec_lo
1389; GFX10-W32-NEXT:    v_cmpx_ne_u32_e32 0, v1
1390; GFX10-W32-NEXT:    s_xor_b32 s13, exec_lo, s13
1391; GFX10-W32-NEXT:    s_cbranch_execz .LBB27_2
1392; GFX10-W32-NEXT:  ; %bb.1: ; %ELSE
1393; GFX10-W32-NEXT:    s_and_saveexec_b32 s14, s12
1394; GFX10-W32-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
1395; GFX10-W32-NEXT:    ; implicit-def: $vgpr0
1396; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s14
1397; GFX10-W32-NEXT:  .LBB27_2: ; %Flow
1398; GFX10-W32-NEXT:    s_andn2_saveexec_b32 s13, s13
1399; GFX10-W32-NEXT:    s_cbranch_execz .LBB27_4
1400; GFX10-W32-NEXT:  ; %bb.3: ; %IF
1401; GFX10-W32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1402; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1403; GFX10-W32-NEXT:    image_sample v2, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1404; GFX10-W32-NEXT:  .LBB27_4: ; %END
1405; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
1406; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
1407; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1408; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
1409; GFX10-W32-NEXT:    ; return to shader part epilog
1410main_body:
1411  %cmp = icmp eq i32 %z, 0
1412  br i1 %cmp, label %IF, label %ELSE
1413
1414IF:
  ; Element 0 of the first sample result is fed into the second sample; only
  ; element 0 of the second result is used afterwards.
1415  %c.bc = bitcast i32 %c to float
1416  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1417  %tex0 = extractelement <4 x float> %tex, i32 0
1418  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1419  %data.if = extractelement <4 x float> %dtex, i32 0
1420  br label %END
1421
1422ELSE:
  ; The only operation on this path is a buffer store of %data at index %c.
1423  call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data, ptr addrspace(8) undef, i32 %c, i32 0, i32 0, i32 0)
1424  br label %END
1425
1426END:
  ; Returns the sampled value from %IF, or the unmodified %data from %ELSE.
1427  %r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
1428  ret float %r
1429}
1430
1431; Reverse branch order compared to the previous test.
; Here %IF (the two image samples) is emitted first and %ELSE (the buffer
; store) second. The expected output ANDs exec with the mask saved on entry in
; the %Flow block, before %ELSE, rather than saving and restoring exec around
; the store.
1432define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
1433; GFX9-W64-LABEL: test_control_flow_1:
1434; GFX9-W64:       ; %bb.0: ; %main_body
1435; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
1436; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1437; GFX9-W64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
1438; GFX9-W64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
1439; GFX9-W64-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
1440; GFX9-W64-NEXT:    s_cbranch_execz .LBB28_2
1441; GFX9-W64-NEXT:  ; %bb.1: ; %IF
1442; GFX9-W64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
1443; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1444; GFX9-W64-NEXT:    image_sample v2, v0, s[0:7], s[8:11] dmask:0x1
1445; GFX9-W64-NEXT:    ; implicit-def: $vgpr0
1446; GFX9-W64-NEXT:  .LBB28_2: ; %Flow
1447; GFX9-W64-NEXT:    s_or_saveexec_b64 s[0:1], s[14:15]
1448; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
1449; GFX9-W64-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
1450; GFX9-W64-NEXT:    s_xor_b64 exec, exec, s[0:1]
1451; GFX9-W64-NEXT:    s_cbranch_execz .LBB28_4
1452; GFX9-W64-NEXT:  ; %bb.3: ; %ELSE
1453; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1454; GFX9-W64-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
1455; GFX9-W64-NEXT:  .LBB28_4: ; %END
1456; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[0:1]
1457; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1458; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
1459; GFX9-W64-NEXT:    ; return to shader part epilog
1460;
1461; GFX10-W32-LABEL: test_control_flow_1:
1462; GFX10-W32:       ; %bb.0: ; %main_body
1463; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
1464; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1465; GFX10-W32-NEXT:    s_mov_b32 s13, exec_lo
1466; GFX10-W32-NEXT:    v_cmpx_ne_u32_e32 0, v1
1467; GFX10-W32-NEXT:    s_xor_b32 s13, exec_lo, s13
1468; GFX10-W32-NEXT:    s_cbranch_execz .LBB28_2
1469; GFX10-W32-NEXT:  ; %bb.1: ; %IF
1470; GFX10-W32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1471; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1472; GFX10-W32-NEXT:    image_sample v2, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1473; GFX10-W32-NEXT:    ; implicit-def: $vgpr0
1474; GFX10-W32-NEXT:  .LBB28_2: ; %Flow
1475; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, s13
1476; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
1477; GFX10-W32-NEXT:    s_and_b32 s0, exec_lo, s0
1478; GFX10-W32-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
1479; GFX10-W32-NEXT:    s_cbranch_execz .LBB28_4
1480; GFX10-W32-NEXT:  ; %bb.3: ; %ELSE
1481; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1482; GFX10-W32-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
1483; GFX10-W32-NEXT:  .LBB28_4: ; %END
1484; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1485; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1486; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
1487; GFX10-W32-NEXT:    ; return to shader part epilog
1488main_body:
  ; Same compare as test_control_flow_0, but with the branch targets swapped:
  ; %z == 0 now selects %ELSE.
1489  %cmp = icmp eq i32 %z, 0
1490  br i1 %cmp, label %ELSE, label %IF
1491
1492IF:
  ; Element 0 of the first sample result is fed into the second sample.
1493  %c.bc = bitcast i32 %c to float
1494  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1495  %tex0 = extractelement <4 x float> %tex, i32 0
1496  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1497  %data.if = extractelement <4 x float> %dtex, i32 0
1498  br label %END
1499
1500ELSE:
  ; The only operation on this path is a buffer store of %data at index %c.
1501  call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data, ptr addrspace(8) undef, i32 %c, i32 0, i32 0, i32 0)
1502  br label %END
1503
1504END:
1505  %r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
1506  ret float %r
1507}
1508
1509; Check that branch conditions are properly marked as needing WQM...
; The expected output toggles exec around each memory operation in %main_body:
; both buffer stores follow an s_and with the saved exec mask, while the buffer
; load whose result feeds the compare follows an s_wqm. The image sample in
; %END is issued after exec is ANDed with the saved mask.
1510define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <3 x i32> %idx, <2 x float> %data, i32 %coord) {
1511; GFX9-W64-LABEL: test_control_flow_2:
1512; GFX9-W64:       ; %bb.0: ; %main_body
1513; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
1514; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1515; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
1516; GFX9-W64-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 idxen
1517; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1518; GFX9-W64-NEXT:    buffer_load_dword v0, v1, s[0:3], 0 idxen
1519; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
1520; GFX9-W64-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 idxen
1521; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1522; GFX9-W64-NEXT:    s_waitcnt vmcnt(1)
1523; GFX9-W64-NEXT:    v_cmp_nlt_f32_e32 vcc, 0, v0
1524; GFX9-W64-NEXT:    ; implicit-def: $vgpr0
1525; GFX9-W64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
1526; GFX9-W64-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
1527; GFX9-W64-NEXT:  ; %bb.1: ; %ELSE
1528; GFX9-W64-NEXT:    v_lshlrev_b32_e32 v0, 2, v5
1529; GFX9-W64-NEXT:    ; implicit-def: $vgpr5
1530; GFX9-W64-NEXT:  ; %bb.2: ; %Flow
1531; GFX9-W64-NEXT:    s_andn2_saveexec_b64 s[14:15], s[14:15]
1532; GFX9-W64-NEXT:  ; %bb.3: ; %IF
1533; GFX9-W64-NEXT:    v_lshl_add_u32 v0, v5, 1, v5
1534; GFX9-W64-NEXT:  ; %bb.4: ; %END
1535; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[14:15]
1536; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
1537; GFX9-W64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
1538; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1539; GFX9-W64-NEXT:    ; return to shader part epilog
1540;
1541; GFX10-W32-LABEL: test_control_flow_2:
1542; GFX10-W32:       ; %bb.0: ; %main_body
1543; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
1544; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1545; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
1546; GFX10-W32-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 idxen
1547; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1548; GFX10-W32-NEXT:    buffer_load_dword v0, v1, s[0:3], 0 idxen
1549; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1550; GFX10-W32-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, 0, v0
1551; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
1552; GFX10-W32-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 idxen
1553; GFX10-W32-NEXT:    ; implicit-def: $vgpr0
1554; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1555; GFX10-W32-NEXT:    s_and_saveexec_b32 s13, vcc_lo
1556; GFX10-W32-NEXT:    s_xor_b32 s13, exec_lo, s13
1557; GFX10-W32-NEXT:  ; %bb.1: ; %ELSE
1558; GFX10-W32-NEXT:    v_lshlrev_b32_e32 v0, 2, v5
1559; GFX10-W32-NEXT:    ; implicit-def: $vgpr5
1560; GFX10-W32-NEXT:  ; %bb.2: ; %Flow
1561; GFX10-W32-NEXT:    s_andn2_saveexec_b32 s13, s13
1562; GFX10-W32-NEXT:  ; %bb.3: ; %IF
1563; GFX10-W32-NEXT:    v_lshl_add_u32 v0, v5, 1, v5
1564; GFX10-W32-NEXT:  ; %bb.4: ; %END
1565; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
1566; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
1567; GFX10-W32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
1568; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1569; GFX10-W32-NEXT:    ; return to shader part epilog
1570main_body:
  ; First store: element 0 of %data written at element 0 of %idx.
1571  %idx.1 = extractelement <3 x i32> %idx, i32 0
1572  %data.1 = extractelement <2 x float> %data, i32 0
1573  call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data.1, ptr addrspace(8) undef, i32 %idx.1, i32 0, i32 0, i32 0)
1574
1575  ; The load that determines the branch (and should therefore be WQM) is
1576  ; surrounded by stores that require disabled WQM.
1577  %idx.2 = extractelement <3 x i32> %idx, i32 1
1578  %z = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx.2, i32 0, i32 0, i32 0)
1579
  ; Second store: element 1 of %data written at element 2 of %idx.
1580  %idx.3 = extractelement <3 x i32> %idx, i32 2
1581  %data.3 = extractelement <2 x float> %data, i32 1
1582  call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data.3, ptr addrspace(8) undef, i32 %idx.3, i32 0, i32 0, i32 0)
1583
1584  %cc = fcmp ogt float %z, 0.0
1585  br i1 %cc, label %IF, label %ELSE
1586
1587IF:
1588  %coord.IF = mul i32 %coord, 3
1589  br label %END
1590
1591ELSE:
1592  %coord.ELSE = mul i32 %coord, 4
1593  br label %END
1594
1595END:
  ; The value chosen by the branch is bitcast to float and passed to the
  ; image sample, so the sample input depends on the loaded %z.
1596  %coord.END = phi i32 [ %coord.IF, %IF ], [ %coord.ELSE, %ELSE ]
1597  %coord.END.bc = bitcast i32 %coord.END to float
1598  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord.END.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1599  ret <4 x float> %tex
1600}
1601
1602; ... but only if they really do need it.
; The branch condition is computed from %dtex.1, the result of the second
; image sample, and the values selected by the branch are only returned. In
; the expected output the only s_wqm is at function entry; exec is ANDed with
; the saved mask after the first sample, and the compare and branch sequence
; follows with no further s_wqm.
1603define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %coord) {
1604; GFX9-W64-LABEL: test_control_flow_3:
1605; GFX9-W64:       ; %bb.0: ; %main_body
1606; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
1607; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1608; GFX9-W64-NEXT:    image_sample v1, v1, s[0:7], s[8:11] dmask:0x1
1609; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
1610; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1611; GFX9-W64-NEXT:    image_sample v1, v1, s[0:7], s[8:11] dmask:0x1
1612; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1613; GFX9-W64-NEXT:    v_cmp_nlt_f32_e32 vcc, 0, v1
1614; GFX9-W64-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 idxen
1615; GFX9-W64-NEXT:    ; implicit-def: $vgpr0
1616; GFX9-W64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1617; GFX9-W64-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
1618; GFX9-W64-NEXT:    s_cbranch_execnz .LBB30_3
1619; GFX9-W64-NEXT:  ; %bb.1: ; %Flow
1620; GFX9-W64-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
1621; GFX9-W64-NEXT:    s_cbranch_execnz .LBB30_4
1622; GFX9-W64-NEXT:  .LBB30_2: ; %END
1623; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[0:1]
1624; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1625; GFX9-W64-NEXT:    s_branch .LBB30_5
1626; GFX9-W64-NEXT:  .LBB30_3: ; %ELSE
1627; GFX9-W64-NEXT:    v_mul_f32_e32 v0, 4.0, v1
1628; GFX9-W64-NEXT:    ; implicit-def: $vgpr1
1629; GFX9-W64-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
1630; GFX9-W64-NEXT:    s_cbranch_execz .LBB30_2
1631; GFX9-W64-NEXT:  .LBB30_4: ; %IF
1632; GFX9-W64-NEXT:    v_mul_f32_e32 v0, 0x40400000, v1
1633; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[0:1]
1634; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1635; GFX9-W64-NEXT:    s_branch .LBB30_5
1636; GFX9-W64-NEXT:  .LBB30_5:
1637;
1638; GFX10-W32-LABEL: test_control_flow_3:
1639; GFX10-W32:       ; %bb.0: ; %main_body
1640; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
1641; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1642; GFX10-W32-NEXT:    image_sample v1, v1, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1643; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
1644; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1645; GFX10-W32-NEXT:    image_sample v1, v1, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1646; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1647; GFX10-W32-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 idxen
1648; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
1649; GFX10-W32-NEXT:    ; implicit-def: $vgpr0
1650; GFX10-W32-NEXT:    v_cmpx_nlt_f32_e32 0, v1
1651; GFX10-W32-NEXT:    s_xor_b32 s0, exec_lo, s0
1652; GFX10-W32-NEXT:    s_cbranch_execnz .LBB30_3
1653; GFX10-W32-NEXT:  ; %bb.1: ; %Flow
1654; GFX10-W32-NEXT:    s_andn2_saveexec_b32 s0, s0
1655; GFX10-W32-NEXT:    s_cbranch_execnz .LBB30_4
1656; GFX10-W32-NEXT:  .LBB30_2: ; %END
1657; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1658; GFX10-W32-NEXT:    s_branch .LBB30_5
1659; GFX10-W32-NEXT:  .LBB30_3: ; %ELSE
1660; GFX10-W32-NEXT:    v_mul_f32_e32 v0, 4.0, v1
1661; GFX10-W32-NEXT:    ; implicit-def: $vgpr1
1662; GFX10-W32-NEXT:    s_andn2_saveexec_b32 s0, s0
1663; GFX10-W32-NEXT:    s_cbranch_execz .LBB30_2
1664; GFX10-W32-NEXT:  .LBB30_4: ; %IF
1665; GFX10-W32-NEXT:    v_mul_f32_e32 v0, 0x40400000, v1
1666; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1667; GFX10-W32-NEXT:    s_branch .LBB30_5
1668; GFX10-W32-NEXT:  .LBB30_5:
1669main_body:
  ; Two dependent samples: element 0 of the first result is the input of the
  ; second. Element 0 of the second result is stored and then compared.
1670  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1671  %tex0 = extractelement <4 x float> %tex, i32 0
1672  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1673  %dtex.1 = extractelement <4 x float> %dtex, i32 0
1674  call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %dtex.1, ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
1675
1676  %cc = fcmp ogt float %dtex.1, 0.0
1677  br i1 %cc, label %IF, label %ELSE
1678
1679IF:
1680  %tex.IF = fmul float %dtex.1, 3.0
1681  br label %END
1682
1683ELSE:
1684  %tex.ELSE = fmul float %dtex.1, 4.0
1685  br label %END
1686
1687END:
  ; The selected product is returned directly; no sample consumes it.
1688  %tex.END = phi float [ %tex.IF, %IF ], [ %tex.ELSE, %ELSE ]
1689  ret float %tex.END
1690}
1691
1692; Another test that failed at some point because of terminator handling.
; %IF contains a raw buffer load feeding a struct buffer store; the expected
; output brackets that pair with s_and_saveexec (using the mask saved on
; entry) and s_mov exec. %END then performs two dependent image samples, with
; exec ANDed with the saved mask between them.
1693define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %coord, i32 %y, float %z) {
1694; GFX9-W64-LABEL: test_control_flow_4:
1695; GFX9-W64:       ; %bb.0: ; %main_body
1696; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
1697; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1698; GFX9-W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
1699; GFX9-W64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
1700; GFX9-W64-NEXT:    s_cbranch_execz .LBB31_2
1701; GFX9-W64-NEXT:  ; %bb.1: ; %IF
1702; GFX9-W64-NEXT:    s_and_saveexec_b64 s[16:17], s[12:13]
1703; GFX9-W64-NEXT:    buffer_load_dword v1, off, s[0:3], 0
1704; GFX9-W64-NEXT:    v_mov_b32_e32 v2, 1
1705; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1706; GFX9-W64-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 idxen
1707; GFX9-W64-NEXT:    s_mov_b64 exec, s[16:17]
1708; GFX9-W64-NEXT:  .LBB31_2: ; %END
1709; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[14:15]
1710; GFX9-W64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
1711; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
1712; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1713; GFX9-W64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
1714; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1715; GFX9-W64-NEXT:    ; return to shader part epilog
1716;
1717; GFX10-W32-LABEL: test_control_flow_4:
1718; GFX10-W32:       ; %bb.0: ; %main_body
1719; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
1720; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1721; GFX10-W32-NEXT:    s_mov_b32 s13, exec_lo
1722; GFX10-W32-NEXT:    v_cmpx_eq_u32_e32 0, v1
1723; GFX10-W32-NEXT:    s_cbranch_execz .LBB31_2
1724; GFX10-W32-NEXT:  ; %bb.1: ; %IF
1725; GFX10-W32-NEXT:    s_and_saveexec_b32 s14, s12
1726; GFX10-W32-NEXT:    buffer_load_dword v1, off, s[0:3], 0
1727; GFX10-W32-NEXT:    v_mov_b32_e32 v2, 1
1728; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1729; GFX10-W32-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 idxen
1730; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s14
1731; GFX10-W32-NEXT:  .LBB31_2: ; %END
1732; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
1733; GFX10-W32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1734; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
1735; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1736; GFX10-W32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
1737; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1738; GFX10-W32-NEXT:    ; return to shader part epilog
1739main_body:
  ; If-without-else: the conditional branch goes straight to %END when %y != 0.
1740  %cond = icmp eq i32 %y, 0
1741  br i1 %cond, label %IF, label %END
1742
1743IF:
  ; The loaded value is used only by the store; neither feeds the samples.
1744  %data = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 0, i32 0, i32 0)
1745  call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data, ptr addrspace(8) undef, i32 1, i32 0, i32 0, i32 0)
1746  br label %END
1747
1748END:
  ; Element 0 of the first sample result is fed into the second sample, whose
  ; full <4 x float> result is returned.
1749  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1750  %tex0 = extractelement <4 x float> %tex, i32 0
1751  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1752  ret <4 x float> %dtex
1753}
1754
1755; Kill is performed in WQM mode so that uniform kill behaves correctly ...
; llvm.amdgcn.kill sits between two buffer stores and is followed by further
; image samples. In the expected output the compare for the kill is issued
; after an s_wqm, and the kill is applied both to the saved exec mask (s_andn2
; on the saved register) and to the current exec (s_andn2 on exec). The
; s_cbranch_scc0 in between targets a block that zeroes exec, issues a null
; export and ends the program.
1756define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, ptr addrspace(1) inreg %ptr, <2 x i32> %idx, <2 x float> %data, float %coord, float %coord2, float %z) {
1757; GFX9-W64-LABEL: test_kill_0:
1758; GFX9-W64:       ; %bb.0: ; %main_body
1759; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
1760; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1761; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
1762; GFX9-W64-NEXT:    image_sample v[7:10], v4, s[0:7], s[8:11] dmask:0xf
1763; GFX9-W64-NEXT:    s_nop 0
1764; GFX9-W64-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
1765; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1766; GFX9-W64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v6
1767; GFX9-W64-NEXT:    s_andn2_b64 s[12:13], s[12:13], vcc
1768; GFX9-W64-NEXT:    s_cbranch_scc0 .LBB32_2
1769; GFX9-W64-NEXT:  ; %bb.1: ; %main_body
1770; GFX9-W64-NEXT:    s_andn2_b64 exec, exec, vcc
1771; GFX9-W64-NEXT:    image_sample v0, v5, s[0:7], s[8:11] dmask:0x1
1772; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
1773; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1774; GFX9-W64-NEXT:    image_sample v[11:14], v0, s[0:7], s[8:11] dmask:0xf
1775; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1776; GFX9-W64-NEXT:    v_add_f32_e32 v0, v7, v11
1777; GFX9-W64-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 idxen
1778; GFX9-W64-NEXT:    v_add_f32_e32 v1, v8, v12
1779; GFX9-W64-NEXT:    v_add_f32_e32 v2, v9, v13
1780; GFX9-W64-NEXT:    v_add_f32_e32 v3, v10, v14
1781; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1782; GFX9-W64-NEXT:    s_branch .LBB32_3
1783; GFX9-W64-NEXT:  .LBB32_2:
1784; GFX9-W64-NEXT:    s_mov_b64 exec, 0
1785; GFX9-W64-NEXT:    exp null off, off, off, off done vm
1786; GFX9-W64-NEXT:    s_endpgm
1787; GFX9-W64-NEXT:  .LBB32_3:
1788;
1789; GFX10-W32-LABEL: test_kill_0:
1790; GFX10-W32:       ; %bb.0: ; %main_body
1791; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
1792; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1793; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
1794; GFX10-W32-NEXT:    image_sample v[7:10], v4, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
1795; GFX10-W32-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
1796; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1797; GFX10-W32-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, 0, v6
1798; GFX10-W32-NEXT:    s_andn2_b32 s12, s12, vcc_lo
1799; GFX10-W32-NEXT:    s_cbranch_scc0 .LBB32_2
1800; GFX10-W32-NEXT:  ; %bb.1: ; %main_body
1801; GFX10-W32-NEXT:    s_andn2_b32 exec_lo, exec_lo, vcc_lo
1802; GFX10-W32-NEXT:    image_sample v0, v5, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1803; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
1804; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1805; GFX10-W32-NEXT:    image_sample v[11:14], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
1806; GFX10-W32-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 idxen
1807; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1808; GFX10-W32-NEXT:    v_add_f32_e32 v4, v8, v12
1809; GFX10-W32-NEXT:    v_add_f32_e32 v5, v10, v14
1810; GFX10-W32-NEXT:    v_add_f32_e32 v0, v7, v11
1811; GFX10-W32-NEXT:    v_add_f32_e32 v2, v9, v13
1812; GFX10-W32-NEXT:    v_mov_b32_e32 v1, v4
1813; GFX10-W32-NEXT:    v_mov_b32_e32 v3, v5
1814; GFX10-W32-NEXT:    s_branch .LBB32_3
1815; GFX10-W32-NEXT:  .LBB32_2:
1816; GFX10-W32-NEXT:    s_mov_b32 exec_lo, 0
1817; GFX10-W32-NEXT:    exp null off, off, off, off done vm
1818; GFX10-W32-NEXT:    s_endpgm
1819; GFX10-W32-NEXT:  .LBB32_3:
1820main_body:
  ; Before the kill: one image sample and the first buffer store.
1821  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1822  %idx.0 = extractelement <2 x i32> %idx, i32 0
1823  %data.0 = extractelement <2 x float> %data, i32 0
1824  call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data.0, ptr addrspace(8) undef, i32 %idx.0, i32 0, i32 0, i32 0)
1825
  ; The kill condition is derived from %z only.
1826  %z.cmp = fcmp olt float %z, 0.0
1827  call void @llvm.amdgcn.kill(i1 %z.cmp)
1828
  ; After the kill: the second buffer store and two dependent image samples.
1829  %idx.1 = extractelement <2 x i32> %idx, i32 1
1830  %data.1 = extractelement <2 x float> %data, i32 1
1831  call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data.1, ptr addrspace(8) undef, i32 %idx.1, i32 0, i32 0, i32 0)
1832  %tex2 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1833  %tex2.0 = extractelement <4 x float> %tex2, i32 0
1834  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex2.0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1835  %out = fadd <4 x float> %tex, %dtex
1836
1837  ret <4 x float> %out
1838}
1839
1840; ... but only if WQM is necessary.
; Here the kill comes after the last image sample and a raw buffer store. The
; expected output has a single s_wqm at function entry; the compare and the
; two s_andn2 instructions for the kill are issued after exec has been ANDed
; with the saved mask, with no second s_wqm.
1841define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {
1842; GFX9-W64-LABEL: test_kill_1:
1843; GFX9-W64:       ; %bb.0: ; %main_body
1844; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
1845; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1846; GFX9-W64-NEXT:    v_mov_b32_e32 v5, v0
1847; GFX9-W64-NEXT:    image_sample v0, v1, s[0:7], s[8:11] dmask:0x1
1848; GFX9-W64-NEXT:    v_mov_b32_e32 v4, v2
1849; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
1850; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1851; GFX9-W64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
1852; GFX9-W64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v4
1853; GFX9-W64-NEXT:    s_andn2_b64 s[12:13], s[12:13], vcc
1854; GFX9-W64-NEXT:    buffer_store_dword v5, off, s[0:3], 0
1855; GFX9-W64-NEXT:    s_cbranch_scc0 .LBB33_2
1856; GFX9-W64-NEXT:  ; %bb.1: ; %main_body
1857; GFX9-W64-NEXT:    s_andn2_b64 exec, exec, vcc
1858; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1859; GFX9-W64-NEXT:    s_branch .LBB33_3
1860; GFX9-W64-NEXT:  .LBB33_2:
1861; GFX9-W64-NEXT:    s_mov_b64 exec, 0
1862; GFX9-W64-NEXT:    exp null off, off, off, off done vm
1863; GFX9-W64-NEXT:    s_endpgm
1864; GFX9-W64-NEXT:  .LBB33_3:
1865;
1866; GFX10-W32-LABEL: test_kill_1:
1867; GFX10-W32:       ; %bb.0: ; %main_body
1868; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
1869; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1870; GFX10-W32-NEXT:    v_mov_b32_e32 v5, v0
1871; GFX10-W32-NEXT:    image_sample v0, v1, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1872; GFX10-W32-NEXT:    v_mov_b32_e32 v4, v2
1873; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
1874; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1875; GFX10-W32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
1876; GFX10-W32-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, 0, v4
1877; GFX10-W32-NEXT:    buffer_store_dword v5, off, s[0:3], 0
1878; GFX10-W32-NEXT:    s_andn2_b32 s12, s12, vcc_lo
1879; GFX10-W32-NEXT:    s_cbranch_scc0 .LBB33_2
1880; GFX10-W32-NEXT:  ; %bb.1: ; %main_body
1881; GFX10-W32-NEXT:    s_andn2_b32 exec_lo, exec_lo, vcc_lo
1882; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1883; GFX10-W32-NEXT:    s_branch .LBB33_3
1884; GFX10-W32-NEXT:  .LBB33_2:
1885; GFX10-W32-NEXT:    s_mov_b32 exec_lo, 0
1886; GFX10-W32-NEXT:    exp null off, off, off, off done vm
1887; GFX10-W32-NEXT:    s_endpgm
1888; GFX10-W32-NEXT:  .LBB33_3:
1889main_body:
  ; Both image samples come first; element 0 of the first result is the input
  ; of the second.
1890  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1891  %tex0 = extractelement <4 x float> %tex, i32 0
1892  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1893
1894  call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %data, ptr addrspace(8) undef, i32 0, i32 0, i32 0)
1895
  ; Nothing after the kill except the return of the already-computed %dtex.
1896  %z.cmp = fcmp olt float %z, 0.0
1897  call void @llvm.amdgcn.kill(i1 %z.cmp)
1898
1899  ret <4 x float> %dtex
1900}
1901
1902; Check prolog shaders.
; The body is a single fadd with no image or buffer intrinsics. The expected
; output still wraps the add between s_wqm and an s_and that restores the
; saved exec mask. Presumably this is driven by attribute group #5, which is
; defined outside this part of the file -- to be confirmed against that
; definition.
1903define amdgpu_ps float @test_prolog_1(float %a, float %b) #5 {
1904; GFX9-W64-LABEL: test_prolog_1:
1905; GFX9-W64:       ; %bb.0: ; %main_body
1906; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
1907; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1908; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v1
1909; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[0:1]
1910; GFX9-W64-NEXT:    ; return to shader part epilog
1911;
1912; GFX10-W32-LABEL: test_prolog_1:
1913; GFX10-W32:       ; %bb.0: ; %main_body
1914; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
1915; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1916; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v1
1917; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
1918; GFX10-W32-NEXT:    ; return to shader part epilog
1919main_body:
1920  %s = fadd float %a, %b
1921  ret float %s
1922}
1923
; Loop whose body is an image sample, preceded by an image store in %entry.
; In the expected output the image store is the only instruction issued
; between the s_and that restores the saved exec mask and the following
; s_wqm; the loop itself runs after that s_wqm, and exec is ANDed with the
; saved mask again in %break. The loop exit test is lowered to a compare into
; vcc followed by s_cbranch_vccz.
1924define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) nounwind {
1925; GFX9-W64-LABEL: test_loop_vcc:
1926; GFX9-W64:       ; %bb.0: ; %entry
1927; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
1928; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1929; GFX9-W64-NEXT:    v_mov_b32_e32 v7, v3
1930; GFX9-W64-NEXT:    v_mov_b32_e32 v6, v2
1931; GFX9-W64-NEXT:    v_mov_b32_e32 v5, v1
1932; GFX9-W64-NEXT:    v_mov_b32_e32 v4, v0
1933; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[0:1]
1934; GFX9-W64-NEXT:    image_store v[4:7], v0, s[0:7] dmask:0xf unorm
1935; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1936; GFX9-W64-NEXT:    v_mov_b32_e32 v8, 0
1937; GFX9-W64-NEXT:    s_mov_b32 s4, 0x40e00000
1938; GFX9-W64-NEXT:    s_branch .LBB35_2
1939; GFX9-W64-NEXT:  .LBB35_1: ; %body
1940; GFX9-W64-NEXT:    ; in Loop: Header=BB35_2 Depth=1
1941; GFX9-W64-NEXT:    image_sample v[4:7], v0, s[0:7], s[0:3] dmask:0xf
1942; GFX9-W64-NEXT:    v_add_f32_e32 v8, 2.0, v8
1943; GFX9-W64-NEXT:    s_cbranch_execz .LBB35_4
1944; GFX9-W64-NEXT:  .LBB35_2: ; %loop
1945; GFX9-W64-NEXT:    ; =>This Inner Loop Header: Depth=1
1946; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1947; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v4
1948; GFX9-W64-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v8
1949; GFX9-W64-NEXT:    v_mov_b32_e32 v1, v5
1950; GFX9-W64-NEXT:    v_mov_b32_e32 v2, v6
1951; GFX9-W64-NEXT:    v_mov_b32_e32 v3, v7
1952; GFX9-W64-NEXT:    s_cbranch_vccz .LBB35_1
1953; GFX9-W64-NEXT:  ; %bb.3:
1954; GFX9-W64-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
1955; GFX9-W64-NEXT:    ; implicit-def: $vgpr8
1956; GFX9-W64-NEXT:  .LBB35_4: ; %break
1957; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[0:1]
1958; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1959; GFX9-W64-NEXT:    ; return to shader part epilog
1960;
1961; GFX10-W32-LABEL: test_loop_vcc:
1962; GFX10-W32:       ; %bb.0: ; %entry
1963; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
1964; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1965; GFX10-W32-NEXT:    v_mov_b32_e32 v8, 0
1966; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
1967; GFX10-W32-NEXT:    image_store v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
1968; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1969; GFX10-W32-NEXT:    s_branch .LBB35_2
1970; GFX10-W32-NEXT:    .p2align 6
1971; GFX10-W32-NEXT:  .LBB35_1: ; %body
1972; GFX10-W32-NEXT:    ; in Loop: Header=BB35_2 Depth=1
1973; GFX10-W32-NEXT:    image_sample v[0:3], v4, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
1974; GFX10-W32-NEXT:    v_add_f32_e32 v8, 2.0, v8
1975; GFX10-W32-NEXT:    s_cbranch_execz .LBB35_4
1976; GFX10-W32-NEXT:  .LBB35_2: ; %loop
1977; GFX10-W32-NEXT:    ; =>This Inner Loop Header: Depth=1
1978; GFX10-W32-NEXT:    v_cmp_lt_f32_e32 vcc_lo, 0x40e00000, v8
1979; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1980; GFX10-W32-NEXT:    v_mov_b32_e32 v7, v3
1981; GFX10-W32-NEXT:    v_mov_b32_e32 v6, v2
1982; GFX10-W32-NEXT:    v_mov_b32_e32 v5, v1
1983; GFX10-W32-NEXT:    v_mov_b32_e32 v4, v0
1984; GFX10-W32-NEXT:    s_cbranch_vccz .LBB35_1
1985; GFX10-W32-NEXT:  ; %bb.3:
1986; GFX10-W32-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
1987; GFX10-W32-NEXT:    ; implicit-def: $vgpr8
1988; GFX10-W32-NEXT:  .LBB35_4: ; %break
1989; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
1990; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1991; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v4
1992; GFX10-W32-NEXT:    v_mov_b32_e32 v1, v5
1993; GFX10-W32-NEXT:    v_mov_b32_e32 v2, v6
1994; GFX10-W32-NEXT:    v_mov_b32_e32 v3, v7
1995; GFX10-W32-NEXT:    ; return to shader part epilog
1996entry:
  ; The store's coordinate and resource operands are undef; only %in is a
  ; real input.
1997  call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %in, i32 15, i32 undef, <8 x i32> undef, i32 0, i32 0)
1998  br label %loop
1999
2000loop:
  ; %ctr.iv starts at 0.0 and advances by 2.0 per iteration; the loop exits
  ; once it exceeds 7.0. %c.iv carries the previous sample result (initially
  ; %in).
2001  %ctr.iv = phi float [ 0.0, %entry ], [ %ctr.next, %body ]
2002  %c.iv = phi <4 x float> [ %in, %entry ], [ %c.next, %body ]
2003  %cc = fcmp ogt float %ctr.iv, 7.0
2004  br i1 %cc, label %break, label %body
2005
2006body:
  ; Element 0 of the carried vector is the input of the next sample.
2007  %c.iv0 = extractelement <4 x float> %c.iv, i32 0
2008  %c.next = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.iv0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
2009  %ctr.next = fadd float %ctr.iv, 2.0
2010  br label %loop
2011
2012break:
2013  ret <4 x float> %c.iv
2014}
2015
2016; Only intrinsic stores need exact execution -- other stores do not have
2017; externally visible effects and may require WQM for correctness.
; In the checked output the buffer-store intrinsics are emitted only after exec
; has been ANDed with the saved mask, while the volatile store to and the load
; from the private array are emitted after s_wqm has widened exec again.
2018define amdgpu_ps void @test_alloca(float %data, i32 %a, i32 %idx) nounwind {
2019; GFX9-W64-LABEL: test_alloca:
2020; GFX9-W64:       ; %bb.0: ; %entry
2021; GFX9-W64-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2022; GFX9-W64-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2023; GFX9-W64-NEXT:    s_mov_b32 s10, -1
2024; GFX9-W64-NEXT:    s_mov_b32 s11, 0xe00000
2025; GFX9-W64-NEXT:    s_add_u32 s8, s8, s0
2026; GFX9-W64-NEXT:    s_addc_u32 s9, s9, 0
2027; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
2028; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
2029; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[0:1]
2030; GFX9-W64-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2031; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
2032; GFX9-W64-NEXT:    buffer_store_dword v1, off, s[8:11], 0
2033; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2034; GFX9-W64-NEXT:    v_mov_b32_e32 v1, 0
2035; GFX9-W64-NEXT:    v_lshl_add_u32 v1, v2, 2, v1
2036; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[8:11], 0 offen
2037; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[0:1]
2038; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2039; GFX9-W64-NEXT:    image_sample v[1:4], v1, s[0:7], s[0:3] dmask:0xf
2040; GFX9-W64-NEXT:    v_mov_b32_e32 v5, 1
2041; GFX9-W64-NEXT:    buffer_store_dword v0, v5, s[0:3], 0 idxen
2042; GFX9-W64-NEXT:    s_waitcnt vmcnt(1)
2043; GFX9-W64-NEXT:    buffer_store_dwordx4 v[1:4], off, s[0:3], 0
2044; GFX9-W64-NEXT:    s_endpgm
2045;
2046; GFX10-W32-LABEL: test_alloca:
2047; GFX10-W32:       ; %bb.0: ; %entry
2048; GFX10-W32-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2049; GFX10-W32-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2050; GFX10-W32-NEXT:    s_mov_b32 s10, -1
2051; GFX10-W32-NEXT:    s_mov_b32 s11, 0x31c16000
2052; GFX10-W32-NEXT:    s_add_u32 s8, s8, s0
2053; GFX10-W32-NEXT:    s_addc_u32 s9, s9, 0
2054; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
2055; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
2056; GFX10-W32-NEXT:    v_lshl_add_u32 v2, v2, 2, 0
2057; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
2058; GFX10-W32-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2059; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
2060; GFX10-W32-NEXT:    buffer_store_dword v1, off, s[8:11], 0
2061; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
2062; GFX10-W32-NEXT:    buffer_load_dword v1, v2, s[8:11], 0 offen
2063; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
2064; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2065; GFX10-W32-NEXT:    image_sample v[1:4], v1, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
2066; GFX10-W32-NEXT:    v_mov_b32_e32 v5, 1
2067; GFX10-W32-NEXT:    buffer_store_dword v0, v5, s[0:3], 0 idxen
2068; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2069; GFX10-W32-NEXT:    buffer_store_dwordx4 v[1:4], off, s[0:3], 0
2070; GFX10-W32-NEXT:    s_endpgm
2071entry:
  ; 32-element i32 array in the private address space (addrspace(5)); the
  ; checked output accesses it through the scratch descriptor in s[8:11].
2072  %array = alloca [32 x i32], align 4, addrspace(5)
2073
  ; Intrinsic store #1 (raw buffer): checked to follow the first
  ; "s_and exec, exec, <saved exec>".
2074  call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %data, ptr addrspace(8) undef, i32 0, i32 0, i32 0)
2075
  ; Ordinary (non-intrinsic) store to element 0 of the array: checked to be
  ; emitted after exec has been widened again with s_wqm.
2076  store volatile i32 %a, ptr addrspace(5) %array, align 4
2077
  ; Intrinsic store #2 (struct buffer, index 1): in the checked output it is
  ; emitted after the final return to the saved exec mask.
2078  call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data, ptr addrspace(8) undef, i32 1, i32 0, i32 0, i32 0)
2079
  ; The value loaded from the array becomes the sample coordinate; the checked
  ; output issues this load before exec is narrowed back to the saved mask.
2080  %c.gep = getelementptr [32 x i32], ptr addrspace(5) %array, i32 0, i32 %idx
2081  %c = load i32, ptr addrspace(5) %c.gep, align 4
2082  %c.bc = bitcast i32 %c to float
  ; Attribute group #0 is defined outside this part of the file.
2083  %t = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
  ; Intrinsic store #3 writes the whole sample result.
2084  call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> %t, ptr addrspace(8) undef, i32 0, i32 0, i32 0)
2085
2086  ret void
2087}
2088
2089; Must return to exact at the end of a non-void returning shader,
2090; otherwise the EXEC mask exported by the epilog will be wrong. This is true
2091; even if the shader has no kills, because a kill could have happened in a
2092; previous shader fragment.
; The checked output saves exec, runs the first sample under s_wqm, and ANDs
; exec with the saved copy before the second sample and the return.
2093define amdgpu_ps <4 x float> @test_nonvoid_return() nounwind {
2094; GFX9-W64-LABEL: test_nonvoid_return:
2095; GFX9-W64:       ; %bb.0:
2096; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
2097; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
2098; GFX9-W64-NEXT:    image_sample v0, v0, s[0:7], s[0:3] dmask:0x1
2099; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[0:1]
2100; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2101; GFX9-W64-NEXT:    image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf
2102; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2103; GFX9-W64-NEXT:    ; return to shader part epilog
2104;
2105; GFX10-W32-LABEL: test_nonvoid_return:
2106; GFX10-W32:       ; %bb.0:
2107; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
2108; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
2109; GFX10-W32-NEXT:    image_sample v0, v0, s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_1D
2110; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
2111; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2112; GFX10-W32-NEXT:    image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
2113; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2114; GFX10-W32-NEXT:    ; return to shader part epilog
  ; Only element 0 of the first sample is used, as the coordinate of the second
  ; sample (the checked first image_sample accordingly has dmask:0x1).
  ; Coordinate, resource and sampler of the first sample are undef.
2115  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float undef, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
2116  %tex0 = extractelement <4 x float> %tex, i32 0
2117  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
  ; The second sample's full result is the shader return value.
2118  ret <4 x float> %dtex
2119}
2120
; Variant of test_nonvoid_return with a branch on the scalar (inreg) argument
; %c: the %if successor stores the sample result and ends in `unreachable`,
; the %else successor returns it.
; NOTE(review): the checked output ANDs exec with itself ("s_and exec, exec,
; exec") instead of with a saved copy, and no exec save is checked at entry --
; confirm this is the intended codegen for this case.
2121define amdgpu_ps <4 x float> @test_nonvoid_return_unreachable(i32 inreg %c) nounwind {
2122; GFX9-W64-LABEL: test_nonvoid_return_unreachable:
2123; GFX9-W64:       ; %bb.0: ; %entry
2124; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
2125; GFX9-W64-NEXT:    image_sample v0, v0, s[0:7], s[0:3] dmask:0x1
2126; GFX9-W64-NEXT:    s_and_b64 exec, exec, exec
2127; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2128; GFX9-W64-NEXT:    image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf
2129; GFX9-W64-NEXT:    s_cmp_lt_i32 s0, 1
2130; GFX9-W64-NEXT:    s_cbranch_scc0 .LBB38_2
2131; GFX9-W64-NEXT:  ; %bb.1: ; %else
2132; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2133; GFX9-W64-NEXT:    s_branch .LBB38_3
2134; GFX9-W64-NEXT:  .LBB38_2: ; %if
2135; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2136; GFX9-W64-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
2137; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2138; GFX9-W64-NEXT:  .LBB38_3:
2139;
2140; GFX10-W32-LABEL: test_nonvoid_return_unreachable:
2141; GFX10-W32:       ; %bb.0: ; %entry
2142; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
2143; GFX10-W32-NEXT:    image_sample v0, v0, s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_1D
2144; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, exec_lo
2145; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2146; GFX10-W32-NEXT:    image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
2147; GFX10-W32-NEXT:    s_cmp_lt_i32 s0, 1
2148; GFX10-W32-NEXT:    s_cbranch_scc0 .LBB38_2
2149; GFX10-W32-NEXT:  ; %bb.1: ; %else
2150; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2151; GFX10-W32-NEXT:    s_branch .LBB38_3
2152; GFX10-W32-NEXT:  .LBB38_2: ; %if
2153; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2154; GFX10-W32-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
2155; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
2156; GFX10-W32-NEXT:  .LBB38_3:
2157entry:
  ; Same dependent sample pair as test_nonvoid_return: element 0 of the first
  ; result is the coordinate of the second sample.
2158  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float undef, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
2159  %tex0 = extractelement <4 x float> %tex, i32 0
2160  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
  ; Branch on %c > 0; the checked output uses a scalar compare (s_cmp_lt_i32).
2161  %cc = icmp sgt i32 %c, 0
2162  br i1 %cc, label %if, label %else
2163
2164if:
  ; Volatile global store of the result to an undef pointer, then unreachable.
2165  store volatile <4 x float> %dtex, ptr addrspace(1) undef
2166  unreachable
2167
2168else:
2169  ret <4 x float> %dtex
2170}
2171
2172; Test awareness that s_wqm_b64 clobbers SCC.
; The branch condition is computed from the scalar (inreg) argument %sel; in
; the checked output the scalar compare (s_cmp_lt_i32) is placed after the
; s_wqm instruction and immediately before the s_cbranch_scc0 that reads SCC.
2173define amdgpu_ps <4 x float> @test_scc(i32 inreg %sel, i32 %idx) #1 {
2174; GFX9-W64-LABEL: test_scc:
2175; GFX9-W64:       ; %bb.0: ; %main_body
2176; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
2177; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
2178; GFX9-W64-NEXT:    v_mov_b32_e32 v4, v0
2179; GFX9-W64-NEXT:    s_cmp_lt_i32 s0, 1
2180; GFX9-W64-NEXT:    s_cbranch_scc0 .LBB39_2
2181; GFX9-W64-NEXT:  ; %bb.1: ; %else
2182; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
2183; GFX9-W64-NEXT:    v_mov_b32_e32 v1, 1
2184; GFX9-W64-NEXT:    image_sample v[0:3], v[0:1], s[0:7], s[0:3] dmask:0xf
2185; GFX9-W64-NEXT:    s_cbranch_execz .LBB39_3
2186; GFX9-W64-NEXT:    s_branch .LBB39_4
2187; GFX9-W64-NEXT:  .LBB39_2:
2188; GFX9-W64-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
2189; GFX9-W64-NEXT:  .LBB39_3: ; %if
2190; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2191; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
2192; GFX9-W64-NEXT:    image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf
2193; GFX9-W64-NEXT:  .LBB39_4: ; %end
2194; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[2:3]
2195; GFX9-W64-NEXT:    v_mov_b32_e32 v5, 1.0
2196; GFX9-W64-NEXT:    buffer_store_dword v5, v4, s[0:3], 0 idxen
2197; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2198; GFX9-W64-NEXT:    ; return to shader part epilog
2199;
2200; GFX10-W32-LABEL: test_scc:
2201; GFX10-W32:       ; %bb.0: ; %main_body
2202; GFX10-W32-NEXT:    s_mov_b32 s1, exec_lo
2203; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
2204; GFX10-W32-NEXT:    v_mov_b32_e32 v4, v0
2205; GFX10-W32-NEXT:    s_cmp_lt_i32 s0, 1
2206; GFX10-W32-NEXT:    s_cbranch_scc0 .LBB39_2
2207; GFX10-W32-NEXT:  ; %bb.1: ; %else
2208; GFX10-W32-NEXT:    v_mov_b32_e32 v1, 1
2209; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
2210; GFX10-W32-NEXT:    image_sample v[0:3], v[0:1], s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_2D
2211; GFX10-W32-NEXT:    s_cbranch_execz .LBB39_3
2212; GFX10-W32-NEXT:    s_branch .LBB39_4
2213; GFX10-W32-NEXT:  .LBB39_2:
2214; GFX10-W32-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
2215; GFX10-W32-NEXT:  .LBB39_3: ; %if
2216; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2217; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
2218; GFX10-W32-NEXT:    image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
2219; GFX10-W32-NEXT:  .LBB39_4: ; %end
2220; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s1
2221; GFX10-W32-NEXT:    v_mov_b32_e32 v5, 1.0
2222; GFX10-W32-NEXT:    buffer_store_dword v5, v4, s[0:3], 0 idxen
2223; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2224; GFX10-W32-NEXT:    ; return to shader part epilog
2225main_body:
  ; Uniform condition: %sel is an inreg argument.
2226  %cc = icmp sgt i32 %sel, 0
2227  br i1 %cc, label %if, label %else
2228
2229if:
  ; 1D sample at coordinate 0.0.
2230  %r.if = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float 0.0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
2231  br label %end
2232
2233else:
  ; 2D sample; the second coordinate is the bit pattern of integer 1
  ; reinterpreted as float (materialized as "v_mov_b32 v1, 1" in the checks).
2234  %r.else = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.0, float bitcast (i32 1 to float), <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
2235  br label %end
2236
2237end:
2238  %r = phi <4 x float> [ %r.if, %if ], [ %r.else, %else ]
  ; Intrinsic store indexed by %idx; the checked output emits it after exec has
  ; been ANDed with the mask saved at entry.
2239  call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float 1.0, ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
2240  ret <4 x float> %r
2241}
2242
2243; Check a case of a block being entirely WQM except for a bit of WWM.
2244; There was a bug where it forgot to enter and leave WWM.
; In the checked %IF block the set.inactive/ds_swizzle sequence is bracketed by
; "s_or_saveexec ..., -1" and a restoring "s_mov exec, ...", while the two
; samples before it run under the s_wqm mask set up at function entry.
; Note: the %data argument is not referenced by the body.
2245define amdgpu_ps float @test_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
2246; GFX9-W64-LABEL: test_wwm_within_wqm:
2247; GFX9-W64:       ; %bb.0: ; %main_body
2248; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
2249; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
2250; GFX9-W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
2251; GFX9-W64-NEXT:    v_mov_b32_e32 v1, 0
2252; GFX9-W64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
2253; GFX9-W64-NEXT:    s_cbranch_execz .LBB40_2
2254; GFX9-W64-NEXT:  ; %bb.1: ; %IF
2255; GFX9-W64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
2256; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2257; GFX9-W64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
2258; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2259; GFX9-W64-NEXT:    v_cvt_i32_f32_e32 v0, v0
2260; GFX9-W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
2261; GFX9-W64-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s[0:1]
2262; GFX9-W64-NEXT:    ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
2263; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
2264; GFX9-W64-NEXT:    s_waitcnt lgkmcnt(0)
2265; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
2266; GFX9-W64-NEXT:    v_cvt_f32_i32_e32 v1, v0
2267; GFX9-W64-NEXT:  .LBB40_2: ; %ENDIF
2268; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[14:15]
2269; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
2270; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
2271; GFX9-W64-NEXT:    ; return to shader part epilog
2272;
2273; GFX10-W32-LABEL: test_wwm_within_wqm:
2274; GFX10-W32:       ; %bb.0: ; %main_body
2275; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
2276; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
2277; GFX10-W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
2278; GFX10-W32-NEXT:    v_mov_b32_e32 v1, 0
2279; GFX10-W32-NEXT:    s_and_saveexec_b32 s13, vcc_lo
2280; GFX10-W32-NEXT:    s_cbranch_execz .LBB40_2
2281; GFX10-W32-NEXT:  ; %bb.1: ; %IF
2282; GFX10-W32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
2283; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2284; GFX10-W32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
2285; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2286; GFX10-W32-NEXT:    v_cvt_i32_f32_e32 v0, v0
2287; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
2288; GFX10-W32-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s0
2289; GFX10-W32-NEXT:    ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
2290; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
2291; GFX10-W32-NEXT:    s_waitcnt lgkmcnt(0)
2292; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
2293; GFX10-W32-NEXT:    v_cvt_f32_i32_e32 v1, v0
2294; GFX10-W32-NEXT:  .LBB40_2: ; %ENDIF
2295; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
2296; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
2297; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
2298; GFX10-W32-NEXT:    ; return to shader part epilog
2299main_body:
  ; Per-lane (VGPR) condition on %z; lowered to v_cmp + s_and_saveexec.
2300  %cmp = icmp eq i32 %z, 0
2301  br i1 %cmp, label %IF, label %ENDIF
2302
2303IF:
  ; Two dependent samples: element 0 of the first is the coordinate of the
  ; second.
2304  %c.bc = bitcast i32 %c to float
2305  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
2306  %tex0 = extractelement <4 x float> %tex, i32 0
2307  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
2308  %dataf = extractelement <4 x float> %dtex, i32 0
2309  %data1 = fptosi float %dataf to i32
  ; Give inactive lanes the value 0 (checked as v_cndmask_b32 with 0), then
  ; swizzle across lanes; pattern 2079 is printed as swizzle(SWAP,2) in the
  ; checked output.
2310  %data2 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %data1, i32 0)
2311  %data3 = call i32 @llvm.amdgcn.ds.swizzle(i32 %data2, i32 2079)
  ; Marks the value as a WWM result; the checks copy it out (v_mov v0, v2)
  ; after exec is restored.
2312  %data4 = call i32 @llvm.amdgcn.wwm.i32(i32 %data3)
2313  %data4f = sitofp i32 %data4 to float
2314  br label %ENDIF
2315
2316ENDIF:
  ; 0.0 when the %IF block was skipped.
2317  %r = phi float [ %data4f, %IF ], [ 0.0, %main_body ]
2318  ret float %r
2319}
2320
2321; Check that WWM is triggered by the strict_wwm intrinsic.
; Both loads and the fadd feed only the strict.wwm call; the checked output
; places all three between "s_or_saveexec ..., -1" and the restoring s_mov to
; exec, and copies the result to v0 only after the restore.
2322define amdgpu_ps float @test_strict_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
2323; GFX9-W64-LABEL: test_strict_wwm1:
2324; GFX9-W64:       ; %bb.0: ; %main_body
2325; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
2326; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
2327; GFX9-W64-NEXT:    v_mov_b32_e32 v2, s1
2328; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
2329; GFX9-W64-NEXT:    s_nop 0
2330; GFX9-W64-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
2331; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2332; GFX9-W64-NEXT:    v_add_f32_e32 v1, v1, v2
2333; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
2334; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
2335; GFX9-W64-NEXT:    ; return to shader part epilog
2336;
2337; GFX10-W32-LABEL: test_strict_wwm1:
2338; GFX10-W32:       ; %bb.0: ; %main_body
2339; GFX10-W32-NEXT:    s_or_saveexec_b32 s2, -1
2340; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
2341; GFX10-W32-NEXT:    v_mov_b32_e32 v2, s1
2342; GFX10-W32-NEXT:    s_clause 0x1
2343; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
2344; GFX10-W32-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
2345; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2346; GFX10-W32-NEXT:    v_add_f32_e32 v1, v1, v2
2347; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s2
2348; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
2349; GFX10-W32-NEXT:    ; return to shader part epilog
2350main_body:
  ; Two indexed buffer loads from an undef resource, indexed by the scalar
  ; arguments.
2351  %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
2352  %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
2353  %out = fadd float %src0, %src1
  ; The only use of the sum is the strict.wwm intrinsic, whose result is
  ; returned.
2354  %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
2355  ret float %out.0
2356}
2357
2358; Same as above, but with an integer type.
; Here the loaded floats are bitcast to i32 and added as integers before being
; passed to the i32 form of the strict.wwm intrinsic (checked as v_add_u32 /
; v_add_nc_u32 inside the s_or_saveexec region).
2359define amdgpu_ps float @test_strict_wwm2(i32 inreg %idx0, i32 inreg %idx1) {
2360; GFX9-W64-LABEL: test_strict_wwm2:
2361; GFX9-W64:       ; %bb.0: ; %main_body
2362; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
2363; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
2364; GFX9-W64-NEXT:    v_mov_b32_e32 v2, s1
2365; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
2366; GFX9-W64-NEXT:    s_nop 0
2367; GFX9-W64-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
2368; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2369; GFX9-W64-NEXT:    v_add_u32_e32 v1, v1, v2
2370; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
2371; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
2372; GFX9-W64-NEXT:    ; return to shader part epilog
2373;
2374; GFX10-W32-LABEL: test_strict_wwm2:
2375; GFX10-W32:       ; %bb.0: ; %main_body
2376; GFX10-W32-NEXT:    s_or_saveexec_b32 s2, -1
2377; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
2378; GFX10-W32-NEXT:    v_mov_b32_e32 v2, s1
2379; GFX10-W32-NEXT:    s_clause 0x1
2380; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
2381; GFX10-W32-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
2382; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2383; GFX10-W32-NEXT:    v_add_nc_u32_e32 v1, v1, v2
2384; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s2
2385; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
2386; GFX10-W32-NEXT:    ; return to shader part epilog
2387main_body:
2388  %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
2389  %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
  ; Reinterpret the loaded bits as integers so the add is an integer add.
2390  %src0.0 = bitcast float %src0 to i32
2391  %src1.0 = bitcast float %src1 to i32
2392  %out = add i32 %src0.0, %src1.0
2393  %out.0 = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %out)
  ; Cast back only to satisfy the float return type.
2394  %out.1 = bitcast i32 %out.0 to float
2395  ret float %out.1
2396}
2397
2398; Check that we don't leave WWM on for computations that don't require WWM,
2399; since that will lead to clobbering things that aren't supposed to be clobbered
2400; in cases like this.
2401; We enforce this by checking that v_add gets emitted in the same block as
2402; WWM computations.
; Concretely, the checked %if block contains two v_add_f32: one inside the
; "s_or_saveexec ..., -1" region (feeding strict.wwm) and one after exec has
; been restored (the fadd that consumes the strict.wwm result).
2403define amdgpu_ps float @test_strict_wwm3(i32 inreg %idx) {
2404; GFX9-W64-LABEL: test_strict_wwm3:
2405; GFX9-W64:       ; %bb.0: ; %main_body
2406; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
2407; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
2408; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 16, v0
2409; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
2410; GFX9-W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2411; GFX9-W64-NEXT:    s_cbranch_execz .LBB43_2
2412; GFX9-W64-NEXT:  ; %bb.1: ; %if
2413; GFX9-W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
2414; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
2415; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
2416; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2417; GFX9-W64-NEXT:    v_add_f32_e32 v2, v1, v1
2418; GFX9-W64-NEXT:    s_mov_b64 exec, s[4:5]
2419; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
2420; GFX9-W64-NEXT:    v_add_f32_e32 v0, v1, v0
2421; GFX9-W64-NEXT:  .LBB43_2: ; %endif
2422; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[2:3]
2423; GFX9-W64-NEXT:    ; return to shader part epilog
2424;
2425; GFX10-W32-LABEL: test_strict_wwm3:
2426; GFX10-W32:       ; %bb.0: ; %main_body
2427; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
2428; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
2429; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 16, v0
2430; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
2431; GFX10-W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
2432; GFX10-W32-NEXT:    s_cbranch_execz .LBB43_2
2433; GFX10-W32-NEXT:  ; %bb.1: ; %if
2434; GFX10-W32-NEXT:    s_or_saveexec_b32 s2, -1
2435; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
2436; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
2437; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2438; GFX10-W32-NEXT:    v_add_f32_e32 v2, v1, v1
2439; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s2
2440; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
2441; GFX10-W32-NEXT:    v_add_f32_e32 v0, v1, v0
2442; GFX10-W32-NEXT:  .LBB43_2: ; %endif
2443; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
2444; GFX10-W32-NEXT:    ; return to shader part epilog
2445main_body:
2446  ; use mbcnt to make sure the branch is divergent
2447  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
2448  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
  ; Lanes whose mbcnt value is >= 16 skip %if (checked as v_cmp_gt_u32 16, v0
  ; selecting the lanes that enter it).
2449  %cc = icmp uge i32 %hi, 16
2450  br i1 %cc, label %endif, label %if
2451
2452if:
2453  %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
  ; %out feeds strict.wwm ...
2454  %out = fadd float %src, %src
2455  %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
  ; ... while %out.1 consumes the strict.wwm result and is the add expected
  ; after the exec restore.
2456  %out.1 = fadd float %src, %out.0
2457  br label %endif
2458
2459endif:
  ; 0.0 for lanes that skipped %if.
2460  %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ]
2461  ret float %out.2
2462}
2463
2464; Check that WWM writes aren't coalesced with non-WWM writes, since the WWM
2465; write could clobber disabled channels in the non-WWM one.
2466; We enforce this by checking that v_mov gets emitted in the same block as
2467; WWM computations.
; In the checked %if block the WWM result is computed in v1 and copied to v0
; by a separate v_mov_b32 placed after exec has been restored.
2468define amdgpu_ps float @test_strict_wwm4(i32 inreg %idx) {
2469; GFX9-W64-LABEL: test_strict_wwm4:
2470; GFX9-W64:       ; %bb.0: ; %main_body
2471; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
2472; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
2473; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 16, v0
2474; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
2475; GFX9-W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2476; GFX9-W64-NEXT:    s_cbranch_execz .LBB44_2
2477; GFX9-W64-NEXT:  ; %bb.1: ; %if
2478; GFX9-W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
2479; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
2480; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
2481; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2482; GFX9-W64-NEXT:    v_add_f32_e32 v1, v1, v1
2483; GFX9-W64-NEXT:    s_mov_b64 exec, s[4:5]
2484; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
2485; GFX9-W64-NEXT:  .LBB44_2: ; %endif
2486; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[2:3]
2487; GFX9-W64-NEXT:    ; return to shader part epilog
2488;
2489; GFX10-W32-LABEL: test_strict_wwm4:
2490; GFX10-W32:       ; %bb.0: ; %main_body
2491; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
2492; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
2493; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 16, v0
2494; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
2495; GFX10-W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
2496; GFX10-W32-NEXT:    s_cbranch_execz .LBB44_2
2497; GFX10-W32-NEXT:  ; %bb.1: ; %if
2498; GFX10-W32-NEXT:    s_or_saveexec_b32 s2, -1
2499; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
2500; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
2501; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2502; GFX10-W32-NEXT:    v_add_f32_e32 v1, v1, v1
2503; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s2
2504; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
2505; GFX10-W32-NEXT:  .LBB44_2: ; %endif
2506; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
2507; GFX10-W32-NEXT:    ; return to shader part epilog
2508main_body:
2509  ; use mbcnt to make sure the branch is divergent
2510  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
2511  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
2512  %cc = icmp uge i32 %hi, 16
2513  br i1 %cc, label %endif, label %if
2514
2515if:
2516  %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
2517  %out = fadd float %src, %src
  ; The strict.wwm result flows straight into the phi below; that phi is the
  ; non-WWM write the header comment refers to.
2518  %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
2519  br label %endif
2520
2521endif:
  ; Merges the WWM result with the constant 0.0 from the skipping path.
2522  %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
2523  ret float %out.1
2524}
2525
2526; Make sure the transition from Exact to WWM then WQM works properly.
; Sequence in the IR: buffer load + intrinsic store, then a strict.wwm
; computation, then an fadd wrapped in the wqm intrinsic. The wave64 checks
; show the matching order: load/store under the entry exec, an
; "s_or_saveexec ..., -1" region, s_wqm, and a final AND with the saved exec.
2527define amdgpu_ps float @test_strict_wwm5(i32 inreg %idx0, i32 inreg %idx1) {
2528; GFX9-W64-LABEL: test_strict_wwm5:
2529; GFX9-W64:       ; %bb.0: ; %main_body
2530; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
2531; GFX9-W64-NEXT:    v_mov_b32_e32 v0, s0
2532; GFX9-W64-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 idxen
2533; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2534; GFX9-W64-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
2535; GFX9-W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
2536; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s1
2537; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
2538; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2539; GFX9-W64-NEXT:    v_add_f32_e32 v1, v1, v1
2540; GFX9-W64-NEXT:    s_mov_b64 exec, s[4:5]
2541; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
2542; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
2543; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v0
2544; GFX9-W64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
2545; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[2:3]
2546; GFX9-W64-NEXT:    ; return to shader part epilog
2547;
2548; GFX10-W32-LABEL: test_strict_wwm5:
2549; GFX10-W32:       ; %bb.0: ; %main_body
2550; GFX10-W32-NEXT:    v_mov_b32_e32 v0, s0
2551; GFX10-W32-NEXT:    s_mov_b32 s2, exec_lo
2552; GFX10-W32-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 idxen
2553; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
2554; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s1
2555; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
2556; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2557; GFX10-W32-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
2558; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
2559; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
2560; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2561; GFX10-W32-NEXT:    v_add_f32_e32 v1, v1, v1
2562; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
2563; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
2564; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
2565; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v0
2566; GFX10-W32-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
2567; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s2
2568; GFX10-W32-NEXT:    ; return to shader part epilog
2569main_body:
  ; Exact part: load a value and store it back at the same index.
2570  %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
2571  call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %src0, ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
  ; WWM part: the second load and its fadd feed strict.wwm.
2572  %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
2573  %temp = fadd float %src1, %src1
2574  %temp.0 = call float @llvm.amdgcn.strict.wwm.f32(float %temp)
  ; WQM part: the fadd on the strict.wwm result is wrapped in the wqm
  ; intrinsic.
2575  %out = fadd float %temp.0, %temp.0
2576  %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
2577  ret float %out.0
2578}
2579
2580; Check that WWM is turned on correctly across basic block boundaries.
2581; if..then..endif version
; %src0 is loaded in %main_body and %src1 in %if, and both feed the strict.wwm
; computation in %if; the checked output wraps each load in its own
; "s_or_saveexec ..., -1" ... exec-restore region.
; NOTE(review): the SI-CHECK / VI-CHECK lines below use prefixes that the RUN
; lines visible at the top of the file do not enable, so FileCheck presumably
; never evaluates them -- confirm and consider removing them.
2582;SI-CHECK: buffer_load_dword
2583;VI-CHECK: flat_load_dword
2584;SI-CHECK: buffer_load_dword
2585;VI-CHECK: flat_load_dword
2586define amdgpu_ps float @test_strict_wwm6_then() {
2587; GFX9-W64-LABEL: test_strict_wwm6_then:
2588; GFX9-W64:       ; %bb.0: ; %main_body
2589; GFX9-W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
2590; GFX9-W64-NEXT:    global_load_dword v1, v[3:4], off glc
2591; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2592; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
2593; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
2594; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
2595; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 16, v0
2596; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
2597; GFX9-W64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
2598; GFX9-W64-NEXT:    s_cbranch_execz .LBB46_2
2599; GFX9-W64-NEXT:  ; %bb.1: ; %if
2600; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
2601; GFX9-W64-NEXT:    global_load_dword v2, v[3:4], off glc
2602; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2603; GFX9-W64-NEXT:    v_add_f32_e32 v1, v1, v2
2604; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
2605; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
2606; GFX9-W64-NEXT:  .LBB46_2: ; %endif
2607; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[0:1]
2608; GFX9-W64-NEXT:    ; return to shader part epilog
2609;
2610; GFX10-W32-LABEL: test_strict_wwm6_then:
2611; GFX10-W32:       ; %bb.0: ; %main_body
2612; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
2613; GFX10-W32-NEXT:    global_load_dword v1, v[3:4], off glc dlc
2614; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2615; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
2616; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
2617; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
2618; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 16, v0
2619; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
2620; GFX10-W32-NEXT:    s_and_saveexec_b32 s0, vcc_lo
2621; GFX10-W32-NEXT:    s_cbranch_execz .LBB46_2
2622; GFX10-W32-NEXT:  ; %bb.1: ; %if
2623; GFX10-W32-NEXT:    s_or_saveexec_b32 s1, -1
2624; GFX10-W32-NEXT:    global_load_dword v2, v[3:4], off glc dlc
2625; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2626; GFX10-W32-NEXT:    v_add_f32_e32 v1, v1, v2
2627; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s1
2628; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
2629; GFX10-W32-NEXT:  .LBB46_2: ; %endif
2630; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
2631; GFX10-W32-NEXT:    ; return to shader part epilog
2632main_body:
  ; First operand of the WWM add, loaded (volatile, from an undef pointer) in
  ; the entry block, i.e. in a different block from the strict.wwm call.
2633  %src0 = load volatile float, ptr addrspace(1) undef
2634  ; use mbcnt to make sure the branch is divergent
2635  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
2636  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
2637  %cc = icmp uge i32 %hi, 16
2638  br i1 %cc, label %endif, label %if
2639
2640if:
  ; Second operand, loaded in the same block as the strict.wwm call.
2641  %src1 = load volatile float, ptr addrspace(1) undef
2642  %out = fadd float %src0, %src1
2643  %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
2644  br label %endif
2645
2646endif:
  ; 0.0 for lanes that skipped %if.
2647  %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
2648  ret float %out.1
2649}
2650
2651; Check that WWM is turned on correctly across basic block boundaries.
2652; loop version
; %src0 is loaded once in %main_body, while %src1 is reloaded on every
; iteration of %loop. Their sum is passed through llvm.amdgcn.strict.wwm.f32
; and the value produced by the last iteration is returned. In the expected
; output both loads and the add are each bracketed by an s_or_saveexec -1 /
; exec-restore pair.
2653define amdgpu_ps float @test_strict_wwm6_loop() {
2654; GFX9-W64-LABEL: test_strict_wwm6_loop:
2655; GFX9-W64:       ; %bb.0: ; %main_body
2656; GFX9-W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
2657; GFX9-W64-NEXT:    global_load_dword v1, v[3:4], off glc
2658; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2659; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
2660; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
2661; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v0
2662; GFX9-W64-NEXT:    s_mov_b64 s[0:1], 0
2663; GFX9-W64-NEXT:  .LBB47_1: ; %loop
2664; GFX9-W64-NEXT:    ; =>This Inner Loop Header: Depth=1
2665; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
2666; GFX9-W64-NEXT:    global_load_dword v2, v[3:4], off glc
2667; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2668; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
2669; GFX9-W64-NEXT:    v_add_u32_e32 v3, -1, v3
2670; GFX9-W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
2671; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
2672; GFX9-W64-NEXT:    v_add_f32_e32 v2, v1, v2
2673; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
2674; GFX9-W64-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
2675; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
2676; GFX9-W64-NEXT:    s_andn2_b64 exec, exec, s[0:1]
2677; GFX9-W64-NEXT:    s_cbranch_execnz .LBB47_1
2678; GFX9-W64-NEXT:  ; %bb.2: ; %endloop
2679; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[0:1]
2680; GFX9-W64-NEXT:    ; return to shader part epilog
2681;
2682; GFX10-W32-LABEL: test_strict_wwm6_loop:
2683; GFX10-W32:       ; %bb.0: ; %main_body
2684; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
2685; GFX10-W32-NEXT:    global_load_dword v1, v[3:4], off glc dlc
2686; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2687; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
2688; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
2689; GFX10-W32-NEXT:    s_mov_b32 s0, 0
2690; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v0
2691; GFX10-W32-NEXT:  .LBB47_1: ; %loop
2692; GFX10-W32-NEXT:    ; =>This Inner Loop Header: Depth=1
2693; GFX10-W32-NEXT:    s_or_saveexec_b32 s1, -1
2694; GFX10-W32-NEXT:    global_load_dword v2, v[3:4], off glc dlc
2695; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2696; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s1
2697; GFX10-W32-NEXT:    v_add_nc_u32_e32 v3, -1, v3
2698; GFX10-W32-NEXT:    s_or_saveexec_b32 s1, -1
2699; GFX10-W32-NEXT:    v_add_f32_e32 v2, v1, v2
2700; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s1
2701; GFX10-W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
2702; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
2703; GFX10-W32-NEXT:    s_or_b32 s0, vcc_lo, s0
2704; GFX10-W32-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
2705; GFX10-W32-NEXT:    s_cbranch_execnz .LBB47_1
2706; GFX10-W32-NEXT:  ; %bb.2: ; %endloop
2707; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
2708; GFX10-W32-NEXT:    ; return to shader part epilog
2709main_body:
  ; Loaded outside the loop but consumed by the strict.wwm computation inside it.
2710  %src0 = load volatile float, ptr addrspace(1) undef
2711  ; use mbcnt as the initial loop counter to make sure the loop branch is divergent
2712  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
2713  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
2714  br label %loop
2715
2716loop:
  ; %counter starts at the mbcnt result and is decremented once per iteration.
2717  %counter = phi i32 [ %hi, %main_body ], [ %counter.1, %loop ]
  ; Fresh volatile load each iteration; its sum with %src0 is the strict.wwm operand.
2718  %src1 = load volatile float, ptr addrspace(1) undef
2719  %out = fadd float %src0, %src1
2720  %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
  ; Keep looping until the decremented counter reaches 0.
2721  %counter.1 = sub i32 %counter, 1
2722  %cc = icmp ne i32 %counter.1, 0
2723  br i1 %cc, label %loop, label %endloop
2724
2725endloop:
  ; Returns the strict.wwm result computed by the final loop iteration.
2726  ret float %out.0
2727}
2728
2729; Check that @llvm.amdgcn.set.inactive disables WWM.
; The loaded value is routed through set.inactive, doubled, wrapped in
; strict.wwm and stored back at the same index.
; NOTE(review): in the expected output the v_cndmask that appears to implement
; set.inactive (selecting between 0 and the loaded value on the saved exec
; mask) sits inside the s_or_saveexec -1 region together with the add; confirm
; that the sentence above still describes the intended behavior.
2730define amdgpu_ps void @test_strict_wwm_set_inactive1(i32 inreg %idx) {
2731; GFX9-W64-LABEL: test_strict_wwm_set_inactive1:
2732; GFX9-W64:       ; %bb.0: ; %main_body
2733; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
2734; GFX9-W64-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 idxen
2735; GFX9-W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
2736; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2737; GFX9-W64-NEXT:    v_cndmask_b32_e64 v0, 0, v2, s[0:1]
2738; GFX9-W64-NEXT:    v_add_u32_e32 v0, v0, v0
2739; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
2740; GFX9-W64-NEXT:    v_mov_b32_e32 v2, v0
2741; GFX9-W64-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 idxen
2742; GFX9-W64-NEXT:    s_endpgm
2743;
2744; GFX10-W32-LABEL: test_strict_wwm_set_inactive1:
2745; GFX10-W32:       ; %bb.0: ; %main_body
2746; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
2747; GFX10-W32-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 idxen
2748; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
2749; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2750; GFX10-W32-NEXT:    v_cndmask_b32_e64 v0, 0, v2, s0
2751; GFX10-W32-NEXT:    v_add_nc_u32_e32 v0, v0, v0
2752; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
2753; GFX10-W32-NEXT:    v_mov_b32_e32 v2, v0
2754; GFX10-W32-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 idxen
2755; GFX10-W32-NEXT:    s_endpgm
2756main_body:
  ; The buffer resource is undef for both the load and the store; %idx is the only real input.
2757  %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
2758  %src.0 = bitcast float %src to i32
  ; 0 is the second set.inactive operand - presumably the value given to inactive lanes; verify against the intrinsic docs.
2759  %src.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src.0, i32 0)
  ; The add uses only the set.inactive result and is the value wrapped by strict.wwm.
2760  %out = add i32 %src.1, %src.1
2761  %out.0 = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %out)
2762  %out.1 = bitcast i32 %out.0 to float
2763  call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %out.1, ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
2764  ret void
2765}
2766
2767; Check a case of a block being entirely WQM except for a bit of WWM.
2768; There was a bug where it forgot to enter and leave WWM.
; The %IF block performs two chained image samples, then a set.inactive +
; ds.swizzle sequence whose result is wrapped in strict.wwm. In the expected
; output s_wqm is applied once at function entry, and only the v_cndmask and
; ds_swizzle are bracketed by an s_or_saveexec -1 / exec-restore pair.
; NOTE: the %data argument is not referenced in the body.
2769define amdgpu_ps float @test_strict_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
2770; GFX9-W64-LABEL: test_strict_wwm_within_wqm:
2771; GFX9-W64:       ; %bb.0: ; %main_body
2772; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
2773; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
2774; GFX9-W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
2775; GFX9-W64-NEXT:    v_mov_b32_e32 v1, 0
2776; GFX9-W64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
2777; GFX9-W64-NEXT:    s_cbranch_execz .LBB49_2
2778; GFX9-W64-NEXT:  ; %bb.1: ; %IF
2779; GFX9-W64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
2780; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2781; GFX9-W64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
2782; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2783; GFX9-W64-NEXT:    v_cvt_i32_f32_e32 v0, v0
2784; GFX9-W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
2785; GFX9-W64-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s[0:1]
2786; GFX9-W64-NEXT:    ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
2787; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
2788; GFX9-W64-NEXT:    s_waitcnt lgkmcnt(0)
2789; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
2790; GFX9-W64-NEXT:    v_cvt_f32_i32_e32 v1, v0
2791; GFX9-W64-NEXT:  .LBB49_2: ; %ENDIF
2792; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[14:15]
2793; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
2794; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
2795; GFX9-W64-NEXT:    ; return to shader part epilog
2796;
2797; GFX10-W32-LABEL: test_strict_wwm_within_wqm:
2798; GFX10-W32:       ; %bb.0: ; %main_body
2799; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
2800; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
2801; GFX10-W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
2802; GFX10-W32-NEXT:    v_mov_b32_e32 v1, 0
2803; GFX10-W32-NEXT:    s_and_saveexec_b32 s13, vcc_lo
2804; GFX10-W32-NEXT:    s_cbranch_execz .LBB49_2
2805; GFX10-W32-NEXT:  ; %bb.1: ; %IF
2806; GFX10-W32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
2807; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2808; GFX10-W32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
2809; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2810; GFX10-W32-NEXT:    v_cvt_i32_f32_e32 v0, v0
2811; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
2812; GFX10-W32-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s0
2813; GFX10-W32-NEXT:    ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
2814; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
2815; GFX10-W32-NEXT:    s_waitcnt lgkmcnt(0)
2816; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
2817; GFX10-W32-NEXT:    v_cvt_f32_i32_e32 v1, v0
2818; GFX10-W32-NEXT:  .LBB49_2: ; %ENDIF
2819; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
2820; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
2821; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
2822; GFX10-W32-NEXT:    ; return to shader part epilog
2823main_body:
  ; Only lanes with %z == 0 enter %IF; the others return 0.0 via the phi in %ENDIF.
2824  %cmp = icmp eq i32 %z, 0
2825  br i1 %cmp, label %IF, label %ENDIF
2826
2827IF:
2828  %c.bc = bitcast i32 %c to float
  ; Two chained samples: the first result is the coordinate of the second.
2829  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
2830  %tex0 = extractelement <4 x float> %tex, i32 0
2831  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
2832  %dataf = extractelement <4 x float> %dtex, i32 0
2833  %data1 = fptosi float %dataf to i32
  ; set.inactive -> ds.swizzle -> strict.wwm is the WWM portion of this block.
  ; The swizzle pattern 2079 prints as swizzle(SWAP,2) in the expected output.
2834  %data2 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %data1, i32 0)
2835  %data3 = call i32 @llvm.amdgcn.ds.swizzle(i32 %data2, i32 2079)
2836  %data4 = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %data3)
2837  %data4f = sitofp i32 %data4 to float
2838  br label %ENDIF
2839
2840ENDIF:
2841  %r = phi float [ %data4f, %IF ], [ 0.0, %main_body ]
2842  ret float %r
2843}
2844
2845; Check a case of a block being entirely WQM except for a bit of STRICT WQM.
; The %IF block performs two chained image samples followed by a ds.swizzle
; whose result is wrapped in strict.wqm (no set.inactive is involved). In the
; expected output the samples, the conversion and the ds_swizzle in %IF are
; bracketed by an exec save / s_wqm / exec restore sequence.
; NOTE: the %data argument is not referenced in the body.
2846define amdgpu_ps float @test_strict_wqm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
2847; GFX9-W64-LABEL: test_strict_wqm_within_wqm:
2848; GFX9-W64:       ; %bb.0: ; %main_body
2849; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
2850; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
2851; GFX9-W64-NEXT:    s_mov_b64 s[14:15], exec
2852; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
2853; GFX9-W64-NEXT:    v_mov_b32_e32 v2, v0
2854; GFX9-W64-NEXT:    s_mov_b64 exec, s[14:15]
2855; GFX9-W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
2856; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
2857; GFX9-W64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
2858; GFX9-W64-NEXT:    s_cbranch_execz .LBB50_2
2859; GFX9-W64-NEXT:  ; %bb.1: ; %IF
2860; GFX9-W64-NEXT:    s_mov_b64 s[16:17], exec
2861; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
2862; GFX9-W64-NEXT:    image_sample v2, v2, s[0:7], s[8:11] dmask:0x1
2863; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2864; GFX9-W64-NEXT:    image_sample v2, v2, s[0:7], s[8:11] dmask:0x1
2865; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2866; GFX9-W64-NEXT:    v_cvt_i32_f32_e32 v2, v2
2867; GFX9-W64-NEXT:    ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
2868; GFX9-W64-NEXT:    s_mov_b64 exec, s[16:17]
2869; GFX9-W64-NEXT:    s_waitcnt lgkmcnt(0)
2870; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
2871; GFX9-W64-NEXT:    v_cvt_f32_i32_e32 v0, v0
2872; GFX9-W64-NEXT:  .LBB50_2: ; %ENDIF
2873; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[14:15]
2874; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
2875; GFX9-W64-NEXT:    ; return to shader part epilog
2876;
2877; GFX10-W32-LABEL: test_strict_wqm_within_wqm:
2878; GFX10-W32:       ; %bb.0: ; %main_body
2879; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
2880; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
2881; GFX10-W32-NEXT:    s_mov_b32 s13, exec_lo
2882; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
2883; GFX10-W32-NEXT:    v_mov_b32_e32 v2, v0
2884; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s13
2885; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
2886; GFX10-W32-NEXT:    s_mov_b32 s13, exec_lo
2887; GFX10-W32-NEXT:    v_cmpx_eq_u32_e32 0, v1
2888; GFX10-W32-NEXT:    s_cbranch_execz .LBB50_2
2889; GFX10-W32-NEXT:  ; %bb.1: ; %IF
2890; GFX10-W32-NEXT:    s_mov_b32 s14, exec_lo
2891; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
2892; GFX10-W32-NEXT:    image_sample v2, v2, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
2893; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2894; GFX10-W32-NEXT:    image_sample v2, v2, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
2895; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2896; GFX10-W32-NEXT:    v_cvt_i32_f32_e32 v2, v2
2897; GFX10-W32-NEXT:    ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
2898; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s14
2899; GFX10-W32-NEXT:    s_waitcnt lgkmcnt(0)
2900; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
2901; GFX10-W32-NEXT:    v_cvt_f32_i32_e32 v0, v0
2902; GFX10-W32-NEXT:  .LBB50_2: ; %ENDIF
2903; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
2904; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
2905; GFX10-W32-NEXT:    ; return to shader part epilog
2906main_body:
  ; Only lanes with %z == 0 enter %IF; the others return 0.0 via the phi in %ENDIF.
2907  %cmp = icmp eq i32 %z, 0
2908  br i1 %cmp, label %IF, label %ENDIF
2909
2910IF:
2911  %c.bc = bitcast i32 %c to float
  ; Two chained samples: the first result is the coordinate of the second.
2912  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
2913  %tex0 = extractelement <4 x float> %tex, i32 0
2914  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
2915  %dataf = extractelement <4 x float> %dtex, i32 0
2916  %data1 = fptosi float %dataf to i32
  ; The swizzled sample result is the strict.wqm operand.
2917  %data2 = call i32 @llvm.amdgcn.ds.swizzle(i32 %data1, i32 2079)
2918  %data3 = call i32 @llvm.amdgcn.strict.wqm.i32(i32 %data2)
2919  %data3f = sitofp i32 %data3 to float
2920  br label %ENDIF
2921
2922ENDIF:
2923  %r = phi float [ %data3f, %IF ], [ 0.0, %main_body ]
2924  ret float %r
2925}
2926
2927; WQM -> StrictWQM transition must be preserved because kill breaks WQM mask
; Straight-line body: two chained image samples, an llvm.amdgcn.kill on
; (%z == 0), then a ds.swizzle of %wqm_data wrapped in strict.wqm. The final
; sum is wrapped in llvm.amdgcn.wqm. In the expected output exec is ANDed with
; vcc after the kill and a fresh exec save / s_wqm pair precedes the ds_swizzle.
; NOTE: the %data argument is not referenced in the body.
2928define amdgpu_ps float @test_strict_wqm_within_wqm_with_kill(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data, i32 %wqm_data) {
2929; GFX9-W64-LABEL: test_strict_wqm_within_wqm_with_kill:
2930; GFX9-W64:       ; %bb.0: ; %main_body
2931; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
2932; GFX9-W64-NEXT:    s_mov_b64 s[14:15], exec
2933; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
2934; GFX9-W64-NEXT:    v_mov_b32_e32 v3, v2
2935; GFX9-W64-NEXT:    s_mov_b64 exec, s[14:15]
2936; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
2937; GFX9-W64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
2938; GFX9-W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
2939; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2940; GFX9-W64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
2941; GFX9-W64-NEXT:    s_andn2_b64 s[0:1], exec, vcc
2942; GFX9-W64-NEXT:    s_andn2_b64 s[12:13], s[12:13], s[0:1]
2943; GFX9-W64-NEXT:    s_cbranch_scc0 .LBB51_2
2944; GFX9-W64-NEXT:  ; %bb.1: ; %main_body
2945; GFX9-W64-NEXT:    s_and_b64 exec, exec, vcc
2946; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
2947; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
2948; GFX9-W64-NEXT:    ds_swizzle_b32 v3, v3 offset:swizzle(SWAP,2)
2949; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
2950; GFX9-W64-NEXT:    s_waitcnt lgkmcnt(0)
2951; GFX9-W64-NEXT:    v_mov_b32_e32 v1, v3
2952; GFX9-W64-NEXT:    v_cvt_f32_i32_e32 v1, v1
2953; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2954; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v1
2955; GFX9-W64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
2956; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
2957; GFX9-W64-NEXT:    s_branch .LBB51_3
2958; GFX9-W64-NEXT:  .LBB51_2:
2959; GFX9-W64-NEXT:    s_mov_b64 exec, 0
2960; GFX9-W64-NEXT:    exp null off, off, off, off done vm
2961; GFX9-W64-NEXT:    s_endpgm
2962; GFX9-W64-NEXT:  .LBB51_3:
2963;
2964; GFX10-W32-LABEL: test_strict_wqm_within_wqm_with_kill:
2965; GFX10-W32:       ; %bb.0: ; %main_body
2966; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
2967; GFX10-W32-NEXT:    s_mov_b32 s13, exec_lo
2968; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
2969; GFX10-W32-NEXT:    v_mov_b32_e32 v3, v2
2970; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s13
2971; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
2972; GFX10-W32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
2973; GFX10-W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
2974; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2975; GFX10-W32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
2976; GFX10-W32-NEXT:    s_andn2_b32 s0, exec_lo, vcc_lo
2977; GFX10-W32-NEXT:    s_andn2_b32 s12, s12, s0
2978; GFX10-W32-NEXT:    s_cbranch_scc0 .LBB51_2
2979; GFX10-W32-NEXT:  ; %bb.1: ; %main_body
2980; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, vcc_lo
2981; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
2982; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
2983; GFX10-W32-NEXT:    ds_swizzle_b32 v3, v3 offset:swizzle(SWAP,2)
2984; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
2985; GFX10-W32-NEXT:    s_waitcnt lgkmcnt(0)
2986; GFX10-W32-NEXT:    v_mov_b32_e32 v1, v3
2987; GFX10-W32-NEXT:    v_cvt_f32_i32_e32 v1, v1
2988; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2989; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v1
2990; GFX10-W32-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
2991; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
2992; GFX10-W32-NEXT:    s_branch .LBB51_3
2993; GFX10-W32-NEXT:  .LBB51_2:
2994; GFX10-W32-NEXT:    s_mov_b32 exec_lo, 0
2995; GFX10-W32-NEXT:    exp null off, off, off, off done vm
2996; GFX10-W32-NEXT:    s_endpgm
2997; GFX10-W32-NEXT:  .LBB51_3:
2998main_body:
2999  %c.bc = bitcast i32 %c to float
  ; Two chained samples: the first result is the coordinate of the second.
3000  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
3001  %tex0 = extractelement <4 x float> %tex, i32 0
3002  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  ; The kill sits between the samples and the strict.wqm computation below.
3003  %cmp = icmp eq i32 %z, 0
3004  call void @llvm.amdgcn.kill(i1 %cmp)
3005  %dataf = extractelement <4 x float> %dtex, i32 0
  ; The strict.wqm operand comes from %wqm_data, not from the sampled value.
3006  %data2 = call i32 @llvm.amdgcn.ds.swizzle(i32 %wqm_data, i32 2079)
3007  %data3 = call i32 @llvm.amdgcn.strict.wqm.i32(i32 %data2)
3008  %data3f = sitofp i32 %data3 to float
  ; Combine the sample and strict.wqm results, then mark the sum with llvm.amdgcn.wqm.
3009  %result.f = fadd float %dataf, %data3f
3010  %result.i = bitcast float %result.f to i32
3011  %result.wqm = call i32 @llvm.amdgcn.wqm.i32(i32 %result.i)
3012  %result = bitcast i32 %result.wqm to float
3013  ret float %result
3014}
3015
3016;TODO: StrictWQM -> WQM transition could be improved. WQM could use the exec from the previous state instead of calling s_wqm again.
; Sequence under test, in IR order: a buffer store/reload whose doubled value
; goes through strict.wqm, a second buffer load (from %res2) that goes through
; strict.wwm, and finally an image sample on the sum of the two, followed by a
; store and reload of the sampled value.
3017define amdgpu_ps float @test_strict_wqm_strict_wwm_wqm(i32 inreg %idx0, i32 inreg %idx1, ptr addrspace(8) inreg %res, ptr addrspace(8) inreg %res2, float %inp, <8 x i32> inreg %res3) {
3018; GFX9-W64-LABEL: test_strict_wqm_strict_wwm_wqm:
3019; GFX9-W64:       ; %bb.0: ; %main_body
3020; GFX9-W64-NEXT:    s_mov_b64 s[28:29], exec
3021; GFX9-W64-NEXT:    s_mov_b32 s19, s17
3022; GFX9-W64-NEXT:    s_mov_b64 s[30:31], exec
3023; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
3024; GFX9-W64-NEXT:    s_mov_b32 s23, s5
3025; GFX9-W64-NEXT:    s_mov_b32 s22, s4
3026; GFX9-W64-NEXT:    s_mov_b32 s21, s3
3027; GFX9-W64-NEXT:    s_mov_b32 s20, s2
3028; GFX9-W64-NEXT:    s_mov_b32 s27, s9
3029; GFX9-W64-NEXT:    s_mov_b32 s26, s8
3030; GFX9-W64-NEXT:    s_mov_b32 s25, s7
3031; GFX9-W64-NEXT:    s_mov_b32 s24, s6
3032; GFX9-W64-NEXT:    s_mov_b32 s18, s16
3033; GFX9-W64-NEXT:    s_mov_b32 s17, s15
3034; GFX9-W64-NEXT:    s_mov_b32 s16, s14
3035; GFX9-W64-NEXT:    s_mov_b32 s15, s13
3036; GFX9-W64-NEXT:    s_mov_b32 s14, s12
3037; GFX9-W64-NEXT:    s_mov_b32 s13, s11
3038; GFX9-W64-NEXT:    s_mov_b32 s12, s10
3039; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s1
3040; GFX9-W64-NEXT:    s_mov_b64 exec, s[30:31]
3041; GFX9-W64-NEXT:    buffer_store_dword v0, v1, s[20:23], 0 idxen
3042; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
3043; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
3044; GFX9-W64-NEXT:    buffer_load_dword v2, v1, s[20:23], 0 idxen
3045; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
3046; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
3047; GFX9-W64-NEXT:    v_mov_b32_e32 v3, s0
3048; GFX9-W64-NEXT:    buffer_load_dword v3, v3, s[24:27], 0 idxen
3049; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
3050; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
3051; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
3052; GFX9-W64-NEXT:    s_waitcnt vmcnt(1)
3053; GFX9-W64-NEXT:    v_add_f32_e32 v2, v2, v2
3054; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
3055; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
3056; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
3057; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v0
3058; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
3059; GFX9-W64-NEXT:    v_mov_b32_e32 v4, v3
3060; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v4
3061; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[28:29]
3062; GFX9-W64-NEXT:    image_sample v0, v0, s[12:19], s[20:23] dmask:0x1
3063; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
3064; GFX9-W64-NEXT:    buffer_store_dword v0, v1, s[20:23], 0 idxen
3065; GFX9-W64-NEXT:    buffer_load_dword v0, v1, s[20:23], 0 idxen
3066; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
3067; GFX9-W64-NEXT:    ; return to shader part epilog
3068;
3069; GFX10-W32-LABEL: test_strict_wqm_strict_wwm_wqm:
3070; GFX10-W32:       ; %bb.0: ; %main_body
3071; GFX10-W32-NEXT:    s_mov_b32 s28, exec_lo
3072; GFX10-W32-NEXT:    s_mov_b32 s19, s17
3073; GFX10-W32-NEXT:    s_mov_b32 s29, exec_lo
3074; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
3075; GFX10-W32-NEXT:    s_mov_b32 s23, s5
3076; GFX10-W32-NEXT:    s_mov_b32 s22, s4
3077; GFX10-W32-NEXT:    s_mov_b32 s21, s3
3078; GFX10-W32-NEXT:    s_mov_b32 s20, s2
3079; GFX10-W32-NEXT:    s_mov_b32 s27, s9
3080; GFX10-W32-NEXT:    s_mov_b32 s26, s8
3081; GFX10-W32-NEXT:    s_mov_b32 s25, s7
3082; GFX10-W32-NEXT:    s_mov_b32 s24, s6
3083; GFX10-W32-NEXT:    s_mov_b32 s18, s16
3084; GFX10-W32-NEXT:    s_mov_b32 s17, s15
3085; GFX10-W32-NEXT:    s_mov_b32 s16, s14
3086; GFX10-W32-NEXT:    s_mov_b32 s15, s13
3087; GFX10-W32-NEXT:    s_mov_b32 s14, s12
3088; GFX10-W32-NEXT:    s_mov_b32 s13, s11
3089; GFX10-W32-NEXT:    s_mov_b32 s12, s10
3090; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s1
3091; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s29
3092; GFX10-W32-NEXT:    buffer_store_dword v0, v1, s[20:23], 0 idxen
3093; GFX10-W32-NEXT:    s_mov_b32 s1, exec_lo
3094; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
3095; GFX10-W32-NEXT:    buffer_load_dword v2, v1, s[20:23], 0 idxen
3096; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s1
3097; GFX10-W32-NEXT:    s_or_saveexec_b32 s1, -1
3098; GFX10-W32-NEXT:    v_mov_b32_e32 v3, s0
3099; GFX10-W32-NEXT:    buffer_load_dword v3, v3, s[24:27], 0 idxen
3100; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s1
3101; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
3102; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
3103; GFX10-W32-NEXT:    s_waitcnt vmcnt(1)
3104; GFX10-W32-NEXT:    v_add_f32_e32 v2, v2, v2
3105; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
3106; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
3107; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
3108; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
3109; GFX10-W32-NEXT:    v_mov_b32_e32 v4, v3
3110; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v0
3111; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v4
3112; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s28
3113; GFX10-W32-NEXT:    image_sample v0, v0, s[12:19], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_1D
3114; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
3115; GFX10-W32-NEXT:    buffer_store_dword v0, v1, s[20:23], 0 idxen
3116; GFX10-W32-NEXT:    buffer_load_dword v0, v1, s[20:23], 0 idxen
3117; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
3118; GFX10-W32-NEXT:    ; return to shader part epilog
3119main_body:
  ; Store %inp to %res at %idx1 and read it back; the doubled reload is the strict.wqm operand.
3120  call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %inp, ptr addrspace(8) %res, i32 %idx1, i32 0, i32 0, i32 0)
3121  %reload = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx1, i32 0, i32 0, i32 0)
3122  %temp = fadd float %reload, %reload
3123  %temp2 = call float @llvm.amdgcn.strict.wqm.f32(float %temp)
3124  %temp3 = fadd float %temp2, %temp2
  ; A second load, from %res2 at %idx0, is the strict.wwm operand.
3125  %reload_wwm = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res2, i32 %idx0, i32 0, i32 0, i32 0)
3126  %temp4 = call float @llvm.amdgcn.strict.wwm.f32(float %reload_wwm)
3127  %temp5 = fadd float %temp3, %temp4
  ; %res is reinterpreted as <4 x i32> and passed in the operand slot where other tests in this file pass %sampler.
3128  %res.int = ptrtoint ptr addrspace(8) %res to i128
3129  %res.vec = bitcast i128 %res.int to <4 x i32>
3130  %tex = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp5, <8 x i32> %res3, <4 x i32> %res.vec, i1 false, i32 0, i32 0)
  ; The sampled value is stored and read back; the reloaded value is returned.
3131  call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %tex, ptr addrspace(8) %res, i32 %idx1, i32 0, i32 0, i32 0)
3132  %out = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx1, i32 0, i32 0, i32 0)
3133  ret float %out
3134}
3135
; Sequence under test, in IR order: a buffer store followed by a load whose
; doubled value goes through strict.wwm, a second load that goes through
; strict.wqm, and finally an image sample on the sum of the two, followed by a
; store and reload of the sampled value. All buffer accesses use %res.
3136define amdgpu_ps float @test_strict_wwm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1, ptr addrspace(8) inreg %res, float %inp, <8 x i32> inreg %res2) {
3137; GFX9-W64-LABEL: test_strict_wwm_strict_wqm_wqm:
3138; GFX9-W64:       ; %bb.0: ; %main_body
3139; GFX9-W64-NEXT:    s_mov_b64 s[20:21], exec
3140; GFX9-W64-NEXT:    s_mov_b32 s15, s13
3141; GFX9-W64-NEXT:    s_mov_b64 s[22:23], exec
3142; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
3143; GFX9-W64-NEXT:    s_mov_b32 s19, s5
3144; GFX9-W64-NEXT:    s_mov_b32 s18, s4
3145; GFX9-W64-NEXT:    s_mov_b32 s17, s3
3146; GFX9-W64-NEXT:    s_mov_b32 s16, s2
3147; GFX9-W64-NEXT:    s_mov_b32 s14, s12
3148; GFX9-W64-NEXT:    s_mov_b32 s13, s11
3149; GFX9-W64-NEXT:    s_mov_b32 s12, s10
3150; GFX9-W64-NEXT:    s_mov_b32 s11, s9
3151; GFX9-W64-NEXT:    s_mov_b32 s10, s8
3152; GFX9-W64-NEXT:    s_mov_b32 s9, s7
3153; GFX9-W64-NEXT:    s_mov_b32 s8, s6
3154; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
3155; GFX9-W64-NEXT:    s_mov_b64 exec, s[22:23]
3156; GFX9-W64-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
3157; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
3158; GFX9-W64-NEXT:    v_mov_b32_e32 v2, s1
3159; GFX9-W64-NEXT:    buffer_load_dword v2, v2, s[16:19], 0 idxen
3160; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
3161; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
3162; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
3163; GFX9-W64-NEXT:    buffer_load_dword v3, v1, s[16:19], 0 idxen
3164; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
3165; GFX9-W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
3166; GFX9-W64-NEXT:    s_waitcnt vmcnt(1)
3167; GFX9-W64-NEXT:    v_add_f32_e32 v2, v2, v2
3168; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
3169; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
3170; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
3171; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v0
3172; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
3173; GFX9-W64-NEXT:    v_mov_b32_e32 v4, v3
3174; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v4
3175; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[20:21]
3176; GFX9-W64-NEXT:    image_sample v0, v0, s[8:15], s[16:19] dmask:0x1
3177; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
3178; GFX9-W64-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
3179; GFX9-W64-NEXT:    buffer_load_dword v0, v1, s[16:19], 0 idxen
3180; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
3181; GFX9-W64-NEXT:    ; return to shader part epilog
3182;
3183; GFX10-W32-LABEL: test_strict_wwm_strict_wqm_wqm:
3184; GFX10-W32:       ; %bb.0: ; %main_body
3185; GFX10-W32-NEXT:    s_mov_b32 s20, exec_lo
3186; GFX10-W32-NEXT:    s_mov_b32 s15, s13
3187; GFX10-W32-NEXT:    s_mov_b32 s21, exec_lo
3188; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
3189; GFX10-W32-NEXT:    s_mov_b32 s19, s5
3190; GFX10-W32-NEXT:    s_mov_b32 s18, s4
3191; GFX10-W32-NEXT:    s_mov_b32 s17, s3
3192; GFX10-W32-NEXT:    s_mov_b32 s16, s2
3193; GFX10-W32-NEXT:    s_mov_b32 s14, s12
3194; GFX10-W32-NEXT:    s_mov_b32 s13, s11
3195; GFX10-W32-NEXT:    s_mov_b32 s12, s10
3196; GFX10-W32-NEXT:    s_mov_b32 s11, s9
3197; GFX10-W32-NEXT:    s_mov_b32 s10, s8
3198; GFX10-W32-NEXT:    s_mov_b32 s9, s7
3199; GFX10-W32-NEXT:    s_mov_b32 s8, s6
3200; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
3201; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s21
3202; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
3203; GFX10-W32-NEXT:    v_mov_b32_e32 v2, s1
3204; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
3205; GFX10-W32-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
3206; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
3207; GFX10-W32-NEXT:    buffer_load_dword v2, v2, s[16:19], 0 idxen
3208; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
3209; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
3210; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
3211; GFX10-W32-NEXT:    buffer_load_dword v3, v1, s[16:19], 0 idxen
3212; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
3213; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
3214; GFX10-W32-NEXT:    s_waitcnt vmcnt(1)
3215; GFX10-W32-NEXT:    v_add_f32_e32 v2, v2, v2
3216; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
3217; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
3218; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
3219; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
3220; GFX10-W32-NEXT:    v_mov_b32_e32 v4, v3
3221; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v0
3222; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v4
3223; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s20
3224; GFX10-W32-NEXT:    image_sample v0, v0, s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_1D
3225; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
3226; GFX10-W32-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
3227; GFX10-W32-NEXT:    buffer_load_dword v0, v1, s[16:19], 0 idxen
3228; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
3229; GFX10-W32-NEXT:    ; return to shader part epilog
3230main_body:
  ; Store %inp at %idx0, then load from %idx1; the doubled load is the strict.wwm operand.
3231  call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %inp, ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0)
3232  %reload = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx1, i32 0, i32 0, i32 0)
3233  %temp = fadd float %reload, %reload
3234  %temp2 = call float @llvm.amdgcn.strict.wwm.f32(float %temp)
3235  %temp3 = fadd float %temp2, %temp2
  ; Despite its name, %reload_wwm is the strict.wqm operand in this test.
3236  %reload_wwm = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0)
3237  %temp4 = call float @llvm.amdgcn.strict.wqm.f32(float %reload_wwm)
3238  %temp5 = fadd float %temp3, %temp4
  ; %res is reinterpreted as <4 x i32> and passed in the operand slot where other tests in this file pass %sampler.
3239  %res.int = ptrtoint ptr addrspace(8) %res to i128
3240  %res.vec = bitcast i128 %res.int to <4 x i32>
3241  %tex = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp5, <8 x i32> %res2, <4 x i32> %res.vec, i1 false, i32 0, i32 0)
  ; The sampled value is stored and read back; the reloaded value is returned.
3242  call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %tex, ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0)
3243  %out = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0)
3244  ret float %out
3245}
3246
3247;TODO: WQM -> StrictWQM transition could be improved. StrictWQM could use the exec from the previous state instead of calling s_wqm again.
; The IR places a llvm.amdgcn.strict.wqm use between two image samples:
;   1. store %inp at %idx0, load from %idx1, double it and sample with it;
;   2. load from %idx0 and pass the value through llvm.amdgcn.strict.wqm;
;   3. add both results, sample again, then store and reload the sample.
; In the expected output the load feeding strict.wqm is bracketed by a save of
; exec, an s_wqm and a restore of exec, directly after an earlier s_wqm; this is
; presumably the repeated s_wqm the TODO above refers to.
3248define amdgpu_ps float @test_wqm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1, ptr addrspace(8) inreg %res, float %inp, <8 x i32> inreg %res2) {
3249; GFX9-W64-LABEL: test_wqm_strict_wqm_wqm:
3250; GFX9-W64:       ; %bb.0: ; %main_body
3251; GFX9-W64-NEXT:    s_mov_b64 s[20:21], exec
3252; GFX9-W64-NEXT:    s_mov_b32 s15, s13
3253; GFX9-W64-NEXT:    s_mov_b64 s[22:23], exec
3254; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
3255; GFX9-W64-NEXT:    s_mov_b32 s19, s5
3256; GFX9-W64-NEXT:    s_mov_b32 s18, s4
3257; GFX9-W64-NEXT:    s_mov_b32 s17, s3
3258; GFX9-W64-NEXT:    s_mov_b32 s16, s2
3259; GFX9-W64-NEXT:    s_mov_b32 s14, s12
3260; GFX9-W64-NEXT:    s_mov_b32 s13, s11
3261; GFX9-W64-NEXT:    s_mov_b32 s12, s10
3262; GFX9-W64-NEXT:    s_mov_b32 s11, s9
3263; GFX9-W64-NEXT:    s_mov_b32 s10, s8
3264; GFX9-W64-NEXT:    s_mov_b32 s9, s7
3265; GFX9-W64-NEXT:    s_mov_b32 s8, s6
3266; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
3267; GFX9-W64-NEXT:    s_mov_b64 exec, s[22:23]
3268; GFX9-W64-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
3269; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
3270; GFX9-W64-NEXT:    v_mov_b32_e32 v0, s1
3271; GFX9-W64-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 idxen
3272; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
3273; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
3274; GFX9-W64-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 idxen
3275; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
3276; GFX9-W64-NEXT:    s_waitcnt vmcnt(1)
3277; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v0
3278; GFX9-W64-NEXT:    image_sample v0, v0, s[8:15], s[16:19] dmask:0x1
3279; GFX9-W64-NEXT:    s_waitcnt vmcnt(1)
3280; GFX9-W64-NEXT:    v_mov_b32_e32 v3, v2
3281; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
3282; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v0
3283; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v3
3284; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[20:21]
3285; GFX9-W64-NEXT:    image_sample v0, v0, s[8:15], s[16:19] dmask:0x1
3286; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
3287; GFX9-W64-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
3288; GFX9-W64-NEXT:    buffer_load_dword v0, v1, s[16:19], 0 idxen
3289; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
3290; GFX9-W64-NEXT:    ; return to shader part epilog
3291;
3292; GFX10-W32-LABEL: test_wqm_strict_wqm_wqm:
3293; GFX10-W32:       ; %bb.0: ; %main_body
3294; GFX10-W32-NEXT:    s_mov_b32 s20, exec_lo
3295; GFX10-W32-NEXT:    s_mov_b32 s15, s13
3296; GFX10-W32-NEXT:    s_mov_b32 s21, exec_lo
3297; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
3298; GFX10-W32-NEXT:    s_mov_b32 s19, s5
3299; GFX10-W32-NEXT:    s_mov_b32 s18, s4
3300; GFX10-W32-NEXT:    s_mov_b32 s17, s3
3301; GFX10-W32-NEXT:    s_mov_b32 s16, s2
3302; GFX10-W32-NEXT:    s_mov_b32 s14, s12
3303; GFX10-W32-NEXT:    s_mov_b32 s13, s11
3304; GFX10-W32-NEXT:    s_mov_b32 s12, s10
3305; GFX10-W32-NEXT:    s_mov_b32 s11, s9
3306; GFX10-W32-NEXT:    s_mov_b32 s10, s8
3307; GFX10-W32-NEXT:    s_mov_b32 s9, s7
3308; GFX10-W32-NEXT:    s_mov_b32 s8, s6
3309; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
3310; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s21
3311; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
3312; GFX10-W32-NEXT:    v_mov_b32_e32 v3, s1
3313; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s20
3314; GFX10-W32-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
3315; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
3316; GFX10-W32-NEXT:    buffer_load_dword v0, v3, s[16:19], 0 idxen
3317; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
3318; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
3319; GFX10-W32-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 idxen
3320; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
3321; GFX10-W32-NEXT:    s_waitcnt vmcnt(1)
3322; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v0
3323; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
3324; GFX10-W32-NEXT:    v_mov_b32_e32 v3, v2
3325; GFX10-W32-NEXT:    image_sample v0, v0, s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_1D
3326; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
3327; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v0
3328; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v3
3329; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s20
3330; GFX10-W32-NEXT:    image_sample v0, v0, s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_1D
3331; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
3332; GFX10-W32-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
3333; GFX10-W32-NEXT:    buffer_load_dword v0, v1, s[16:19], 0 idxen
3334; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
3335; GFX10-W32-NEXT:    ; return to shader part epilog
3336main_body:
  ; Store the input, then compute the first sample coordinate from a load.
3337  call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %inp, ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0)
3338  %reload = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx1, i32 0, i32 0, i32 0)
3339  %temp = fadd float %reload, %reload
  ; The same four dwords of %res are reused as the sampler operand of both samples.
3340  %res.int = ptrtoint ptr addrspace(8) %res to i128
3341  %res.vec = bitcast i128 %res.int to <4 x i32>
3342  %tex = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp, <8 x i32> %res2, <4 x i32> %res.vec, i1 false, i32 0, i32 0)
3343  %temp2 = fadd float %tex, %tex
  ; Second load goes through strict.wqm and is combined with the first sample.
3344  %reload_wwm = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0)
3345  %temp3 = call float @llvm.amdgcn.strict.wqm.f32(float %reload_wwm)
3346  %temp4 = fadd float %temp2, %temp3
3347  %tex2 = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp4, <8 x i32> %res2, <4 x i32> %res.vec, i1 false, i32 0, i32 0)
  ; Store the second sample and return the value read back from the same index.
3348  call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %tex2, ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0)
3349  %out = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0)
3350  ret float %out
3351}
3352
3353; Check if the correct VCC register is selected. WQM pass incorrectly uses VCC for
3354; vector comparisons in Wave32 mode.
; The shader loads one float through llvm.amdgcn.s.buffer.load and passes the
; result of `fcmp ugt ..., 0.0` to llvm.amdgcn.kill. The expected output
; compares into vcc and combines it with exec via s_andn2_b64 in the 64-wide
; run, and uses vcc_lo / exec_lo with s_andn2_b32 in the 32-wide run.
3355define amdgpu_ps void @test_for_deactivating_lanes_in_wave32(ptr addrspace(6) inreg %0) {
3356; GFX9-W64-LABEL: test_for_deactivating_lanes_in_wave32:
3357; GFX9-W64:       ; %bb.0: ; %main_body
3358; GFX9-W64-NEXT:    s_mov_b32 s3, 0x31016fac
3359; GFX9-W64-NEXT:    s_mov_b32 s2, 32
3360; GFX9-W64-NEXT:    s_mov_b32 s1, 0x8000
3361; GFX9-W64-NEXT:    s_buffer_load_dword s0, s[0:3], 0x0
3362; GFX9-W64-NEXT:    s_waitcnt lgkmcnt(0)
3363; GFX9-W64-NEXT:    v_cmp_le_f32_e64 vcc, s0, 0
3364; GFX9-W64-NEXT:    s_andn2_b64 s[4:5], exec, vcc
3365; GFX9-W64-NEXT:    s_cbranch_scc0 .LBB55_1
3366; GFX9-W64-NEXT:    s_endpgm
3367; GFX9-W64-NEXT:  .LBB55_1:
3368; GFX9-W64-NEXT:    s_mov_b64 exec, 0
3369; GFX9-W64-NEXT:    exp null off, off, off, off done vm
3370; GFX9-W64-NEXT:    s_endpgm
3371;
3372; GFX10-W32-LABEL: test_for_deactivating_lanes_in_wave32:
3373; GFX10-W32:       ; %bb.0: ; %main_body
3374; GFX10-W32-NEXT:    s_mov_b32 s3, 0x31016fac
3375; GFX10-W32-NEXT:    s_mov_b32 s2, 32
3376; GFX10-W32-NEXT:    s_mov_b32 s1, 0x8000
3377; GFX10-W32-NEXT:    s_buffer_load_dword s0, s[0:3], 0x0
3378; GFX10-W32-NEXT:    s_waitcnt lgkmcnt(0)
3379; GFX10-W32-NEXT:    v_cmp_le_f32_e64 vcc_lo, s0, 0
3380; GFX10-W32-NEXT:    s_andn2_b32 s4, exec_lo, vcc_lo
3381; GFX10-W32-NEXT:    s_cbranch_scc0 .LBB55_1
3382; GFX10-W32-NEXT:    s_endpgm
3383; GFX10-W32-NEXT:  .LBB55_1:
3384; GFX10-W32-NEXT:    s_mov_b32 exec_lo, 0
3385; GFX10-W32-NEXT:    exp null off, off, off, off done vm
3386; GFX10-W32-NEXT:    s_endpgm
3387main_body:
  ; Build the <4 x i32> operand: element 0 comes from the pointer argument, the
  ; remaining elements are the constants 32768, 32 and 822177708 (0x8000, 32 and
  ; 0x31016fac in the expected s_mov_b32 instructions).
3388  %1 = ptrtoint ptr addrspace(6) %0 to i32
3389  %2 = insertelement <4 x i32> <i32 poison, i32 32768, i32 32, i32 822177708>, i32 %1, i32 0
3390  %3 = call nsz arcp float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %2, i32 0, i32 0) #3
  ; The loaded value is compared against zero and the i1 result is handed to kill.
3391  %4 = fcmp nsz arcp ugt float %3, 0.000000e+00
3392  call void @llvm.amdgcn.kill(i1 %4) #1
3393  ret void
3394}
3395
3396; Test the interaction between wqm and llvm.amdgcn.init.exec.
; The body sets exec with init.exec(-1), performs a raw buffer store of a zero
; vector, and then stores the result of llvm.amdgcn.wqm.i32(0) through a null
; addrspace(3) pointer (a ds_write_b32 in the expected output).
3397define amdgpu_gs void @wqm_init_exec() {
3398; GFX9-W64-LABEL: wqm_init_exec:
3399; GFX9-W64:       ; %bb.0: ; %bb
3400; GFX9-W64-NEXT:    s_mov_b64 exec, -1
3401; GFX9-W64-NEXT:    s_mov_b32 s0, 0
3402; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
3403; GFX9-W64-NEXT:    s_mov_b32 s1, s0
3404; GFX9-W64-NEXT:    s_mov_b32 s2, s0
3405; GFX9-W64-NEXT:    s_mov_b32 s3, s0
3406; GFX9-W64-NEXT:    v_mov_b32_e32 v1, v0
3407; GFX9-W64-NEXT:    v_mov_b32_e32 v2, v0
3408; GFX9-W64-NEXT:    v_mov_b32_e32 v3, v0
3409; GFX9-W64-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
3410; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
3411; GFX9-W64-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $exec
3412; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
3413; GFX9-W64-NEXT:    ds_write_b32 v0, v1
3414; GFX9-W64-NEXT:    s_endpgm
3415;
3416; GFX10-W32-LABEL: wqm_init_exec:
3417; GFX10-W32:       ; %bb.0: ; %bb
3418; GFX10-W32-NEXT:    s_mov_b32 exec_lo, -1
3419; GFX10-W32-NEXT:    s_mov_b32 s1, exec_lo
3420; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
3421; GFX10-W32-NEXT:    s_mov_b32 s0, 0
3422; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
3423; GFX10-W32-NEXT:    s_mov_b32 s2, s0
3424; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s1
3425; GFX10-W32-NEXT:    v_mov_b32_e32 v1, v0
3426; GFX10-W32-NEXT:    v_mov_b32_e32 v2, v0
3427; GFX10-W32-NEXT:    v_mov_b32_e32 v3, v0
3428; GFX10-W32-NEXT:    v_mov_b32_e32 v4, s0
3429; GFX10-W32-NEXT:    s_mov_b32 s1, s0
3430; GFX10-W32-NEXT:    s_mov_b32 s3, s0
3431; GFX10-W32-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
3432; GFX10-W32-NEXT:    ds_write_b32 v0, v4
3433; GFX10-W32-NEXT:    s_endpgm
3434bb:
  ; exec is set to all ones before any other instruction.
3435  call void @llvm.amdgcn.init.exec(i64 -1)
3436  call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> zeroinitializer, <4 x i32> zeroinitializer, i32 0, i32 0, i32 0)
  ; The wqm intrinsic result is stored so that it is not dead.
3437  %i = call i32 @llvm.amdgcn.wqm.i32(i32 0)
3438  store i32 %i, ptr addrspace(3) null, align 4
3439  ret void
3440}
3441
3442; Test a case that failed machine verification.
; The body is llvm.amdgcn.init.exec(0) followed by a switch on %arg whose three
; destinations (%bb1 default, %bb3 for 0, %bb2 for 1) all return immediately.
3443define amdgpu_gs void @wqm_init_exec_switch(i32 %arg) {
3444; GFX9-W64-LABEL: wqm_init_exec_switch:
3445; GFX9-W64:       ; %bb.0:
3446; GFX9-W64-NEXT:    s_mov_b64 exec, 0
3447; GFX9-W64-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v0
3448; GFX9-W64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
3449; GFX9-W64-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
3450; GFX9-W64-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
3451; GFX9-W64-NEXT:    s_endpgm
3452;
3453; GFX10-W32-LABEL: wqm_init_exec_switch:
3454; GFX10-W32:       ; %bb.0:
3455; GFX10-W32-NEXT:    s_mov_b32 exec_lo, 0
3456; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
3457; GFX10-W32-NEXT:    v_cmpx_lt_i32_e32 0, v0
3458; GFX10-W32-NEXT:    s_xor_b32 s0, exec_lo, s0
3459; GFX10-W32-NEXT:    s_andn2_saveexec_b32 s0, s0
3460; GFX10-W32-NEXT:    s_endpgm
3461  call void @llvm.amdgcn.init.exec(i64 0)
3462  switch i32 %arg, label %bb1 [
3463    i32 0, label %bb3
3464    i32 1, label %bb2
3465  ]
3466bb1:
3467  ret void
3468bb2:
3469  ret void
3470bb3:
3471  ret void
3472}
3473
; Combine llvm.amdgcn.init.exec(0) with a ballot and llvm.amdgcn.wwm.
; The ballot of `true` and the wwm result (widened to i64 through a <2 x i32>
; vector) are each compared against zero; the xor of the two comparisons is
; converted to float and exported.
3474define amdgpu_gs void @wqm_init_exec_wwm() {
3475; GFX9-W64-LABEL: wqm_init_exec_wwm:
3476; GFX9-W64:       ; %bb.0:
3477; GFX9-W64-NEXT:    s_mov_b64 exec, 0
3478; GFX9-W64-NEXT:    s_mov_b32 s1, 0
3479; GFX9-W64-NEXT:    s_mov_b32 s0, s1
3480; GFX9-W64-NEXT:    s_cmp_lg_u64 exec, 0
3481; GFX9-W64-NEXT:    s_cselect_b64 s[2:3], -1, 0
3482; GFX9-W64-NEXT:    s_cmp_lg_u64 s[0:1], 0
3483; GFX9-W64-NEXT:    s_cselect_b64 s[0:1], -1, 0
3484; GFX9-W64-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
3485; GFX9-W64-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, s[0:1]
3486; GFX9-W64-NEXT:    v_mov_b32_e32 v1, 0
3487; GFX9-W64-NEXT:    exp mrt0 off, off, off, off
3488; GFX9-W64-NEXT:    s_endpgm
3489;
3490; GFX10-W32-LABEL: wqm_init_exec_wwm:
3491; GFX10-W32:       ; %bb.0:
3492; GFX10-W32-NEXT:    s_mov_b32 exec_lo, 0
3493; GFX10-W32-NEXT:    s_mov_b32 s1, 0
3494; GFX10-W32-NEXT:    s_cmp_lg_u64 exec, 0
3495; GFX10-W32-NEXT:    s_mov_b32 s0, s1
3496; GFX10-W32-NEXT:    s_cselect_b32 s2, -1, 0
3497; GFX10-W32-NEXT:    s_cmp_lg_u64 s[0:1], 0
3498; GFX10-W32-NEXT:    v_mov_b32_e32 v1, 0
3499; GFX10-W32-NEXT:    s_cselect_b32 s0, -1, 0
3500; GFX10-W32-NEXT:    s_xor_b32 s0, s2, s0
3501; GFX10-W32-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, s0
3502; GFX10-W32-NEXT:    exp mrt0 off, off, off, off
3503; GFX10-W32-NEXT:    s_endpgm
3504  call void @llvm.amdgcn.init.exec(i64 0)
3505  %i = call i64 @llvm.amdgcn.ballot.i64(i1 true)
3506  %i1 = call i32 @llvm.amdgcn.wwm.i32(i32 0)
  ; Widen the i32 wwm result to i64: it becomes element 0 of a zero vector.
3507  %i2 = insertelement <2 x i32> zeroinitializer, i32 %i1, i64 0
3508  %i3 = bitcast <2 x i32> %i2 to i64
3509  %i4 = icmp ne i64 %i, 0
3510  %i5 = icmp ne i64 %i3, 0
3511  %i6 = xor i1 %i4, %i5
3512  %i7 = uitofp i1 %i6 to float
3513  call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float %i7, float 0.0, float 0.0, float 0.0, i1 false, i1 false)
3514  ret void
3515}
3516
3517; Check that exact regions with execz affected instructions are as short as possible
; Shape of the IR: an image sample in %main_body, a lane-index test built from
; mbcnt.lo/mbcnt.hi that skips %if when the index is >= 16, a buffer store of
; the first sample inside %if, and a second sample plus llvm.amdgcn.wqm in
; %endif. In the expected output only the buffer_store_dwordx4 is wrapped in
; s_and_saveexec / s_mov exec; the readfirstlane and s_buffer_load before it
; are emitted ahead of the s_and_saveexec.
3518define amdgpu_ps float @short_exact_regions(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %c, ptr addrspace(4) %p) {
3519; GFX9-W64-LABEL: short_exact_regions:
3520; GFX9-W64:       ; %bb.0: ; %main_body
3521; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
3522; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
3523; GFX9-W64-NEXT:    image_sample v[3:6], v0, s[0:7], s[8:11] dmask:0xf
3524; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
3525; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
3526; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 16, v0
3527; GFX9-W64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
3528; GFX9-W64-NEXT:    s_cbranch_execz .LBB59_2
3529; GFX9-W64-NEXT:  ; %bb.1: ; %if
3530; GFX9-W64-NEXT:    global_load_dword v0, v[1:2], off
3531; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
3532; GFX9-W64-NEXT:    v_readfirstlane_b32 s16, v0
3533; GFX9-W64-NEXT:    s_buffer_load_dword s16, s[8:11], s16 offset:0x0
3534; GFX9-W64-NEXT:    s_waitcnt lgkmcnt(0)
3535; GFX9-W64-NEXT:    v_mov_b32_e32 v0, s16
3536; GFX9-W64-NEXT:    s_and_saveexec_b64 s[16:17], s[12:13]
3537; GFX9-W64-NEXT:    buffer_store_dwordx4 v[3:6], v0, s[0:3], 0 idxen
3538; GFX9-W64-NEXT:    s_mov_b64 exec, s[16:17]
3539; GFX9-W64-NEXT:  .LBB59_2: ; %endif
3540; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[14:15]
3541; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
3542; GFX9-W64-NEXT:    image_sample v0, v3, s[0:7], s[8:11] dmask:0x4
3543; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
3544; GFX9-W64-NEXT:    v_add_f32_e32 v0, v4, v0
3545; GFX9-W64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
3546; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
3547; GFX9-W64-NEXT:    ; return to shader part epilog
3548;
3549; GFX10-W32-LABEL: short_exact_regions:
3550; GFX10-W32:       ; %bb.0: ; %main_body
3551; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
3552; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
3553; GFX10-W32-NEXT:    image_sample v[3:6], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
3554; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
3555; GFX10-W32-NEXT:    s_mov_b32 s13, exec_lo
3556; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
3557; GFX10-W32-NEXT:    v_cmpx_gt_u32_e32 16, v0
3558; GFX10-W32-NEXT:    s_cbranch_execz .LBB59_2
3559; GFX10-W32-NEXT:  ; %bb.1: ; %if
3560; GFX10-W32-NEXT:    global_load_dword v0, v[1:2], off
3561; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
3562; GFX10-W32-NEXT:    v_readfirstlane_b32 s14, v0
3563; GFX10-W32-NEXT:    s_buffer_load_dword s14, s[8:11], s14 offset:0x0
3564; GFX10-W32-NEXT:    s_waitcnt lgkmcnt(0)
3565; GFX10-W32-NEXT:    v_mov_b32_e32 v0, s14
3566; GFX10-W32-NEXT:    s_and_saveexec_b32 s14, s12
3567; GFX10-W32-NEXT:    buffer_store_dwordx4 v[3:6], v0, s[0:3], 0 idxen
3568; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s14
3569; GFX10-W32-NEXT:  .LBB59_2: ; %endif
3570; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
3571; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
3572; GFX10-W32-NEXT:    image_sample v0, v3, s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_1D
3573; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
3574; GFX10-W32-NEXT:    v_add_f32_e32 v0, v4, v0
3575; GFX10-W32-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
3576; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
3577; GFX10-W32-NEXT:    ; return to shader part epilog
3578main_body:
3579  %tex1 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
3580  %idx0 = load <4 x i32>, ptr addrspace(4) %p, align 4
  ; %hi is the mbcnt.hi(mbcnt.lo) result; %if is entered only when it is < 16.
3581  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
3582  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
3583  %cc = icmp uge i32 %hi, 16
3584  br i1 %cc, label %endif, label %if
3585
3586if:
  ; The store index is element 0 of the loaded vector, passed through
  ; readfirstlane and then used as the offset of an s.buffer.load.
3587  %idx1 = extractelement <4 x i32> %idx0, i64 0
3588  %idx2 = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %idx1)
3589  %idx3 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %sampler, i32 %idx2, i32 0)
3590
3591  call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %tex1, <4 x i32> undef, i32 %idx3, i32 0, i32 0, i32 0)
3592  br label %endif
3593
3594endif:
  ; Second sample uses element 0 of the first; the return value adds element 1
  ; of the first sample to element 2 of the second and goes through wqm.
3595  %d = extractelement <4 x float> %tex1, i64 0
3596  %tex2 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %d, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
3597  %r0 = extractelement <4 x float> %tex1, i64 1
3598  %r1 = extractelement <4 x float> %tex2, i64 2
3599  %r2 = fadd float %r0, %r1
3600  %out = call float @llvm.amdgcn.wqm.f32(float %r2)
3601
3602  ret float %out
3603}
3604
3605; Check that exact regions shortening doesn't prevent early WQM exit
; Straight-line variant: two image samples plus a readfirstlane and
; s.buffer.load, with no branch, no store and no llvm.amdgcn.wqm call. In the
; expected output the s_and with the saved exec directly follows the first
; image_sample, before the global load and the second sample.
3606define amdgpu_ps float @short_exact_regions_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %c, ptr addrspace(4) %p) {
3607; GFX9-W64-LABEL: short_exact_regions_2:
3608; GFX9-W64:       ; %bb.0: ; %main_body
3609; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
3610; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
3611; GFX9-W64-NEXT:    image_sample v[3:4], v0, s[0:7], s[8:11] dmask:0x3
3612; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
3613; GFX9-W64-NEXT:    global_load_dword v0, v[1:2], off
3614; GFX9-W64-NEXT:    s_waitcnt vmcnt(1)
3615; GFX9-W64-NEXT:    image_sample v5, v3, s[0:7], s[8:11] dmask:0x4
3616; GFX9-W64-NEXT:    ; kill: killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6 killed $sgpr7
3617; GFX9-W64-NEXT:    ; kill: killed $vgpr3
3618; GFX9-W64-NEXT:    ; kill: killed $vgpr1 killed $vgpr2
3619; GFX9-W64-NEXT:    s_waitcnt vmcnt(1)
3620; GFX9-W64-NEXT:    v_readfirstlane_b32 s0, v0
3621; GFX9-W64-NEXT:    s_buffer_load_dword s0, s[8:11], s0 offset:0x0
3622; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
3623; GFX9-W64-NEXT:    v_add_f32_e32 v0, v4, v5
3624; GFX9-W64-NEXT:    s_waitcnt lgkmcnt(0)
3625; GFX9-W64-NEXT:    v_add_f32_e32 v0, s0, v0
3626; GFX9-W64-NEXT:    ; return to shader part epilog
3627;
3628; GFX10-W32-LABEL: short_exact_regions_2:
3629; GFX10-W32:       ; %bb.0: ; %main_body
3630; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
3631; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
3632; GFX10-W32-NEXT:    image_sample v[3:4], v0, s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D
3633; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
3634; GFX10-W32-NEXT:    global_load_dword v0, v[1:2], off
3635; GFX10-W32-NEXT:    s_waitcnt vmcnt(1)
3636; GFX10-W32-NEXT:    image_sample v1, v3, s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_1D
3637; GFX10-W32-NEXT:    s_waitcnt vmcnt(1)
3638; GFX10-W32-NEXT:    v_readfirstlane_b32 s0, v0
3639; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
3640; GFX10-W32-NEXT:    v_add_f32_e32 v0, v4, v1
3641; GFX10-W32-NEXT:    s_buffer_load_dword s0, s[8:11], s0 offset:0x0
3642; GFX10-W32-NEXT:    s_waitcnt lgkmcnt(0)
3643; GFX10-W32-NEXT:    v_add_f32_e32 v0, s0, v0
3644; GFX10-W32-NEXT:    ; return to shader part epilog
3645main_body:
3646  %tex1 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
3647  %idx0 = load <4 x i32>, ptr addrspace(4) %p, align 4
  ; %lo and %hi are computed but have no users in this function.
3648  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
3649  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
3650  %idx1 = extractelement <4 x i32> %idx0, i64 0
3651  %d = extractelement <4 x float> %tex1, i64 0
3652
3653  %tex2 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %d, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
3654
3655  %idx2 = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %idx1)
3656  %idx3 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %sampler, i32 %idx2, i32 0)
3657
  ; Result: tex1[1] + tex2[2] + the scalar buffer load.
3658  %r0 = extractelement <4 x float> %tex1, i64 1
3659  %r1 = extractelement <4 x float> %tex2, i64 2
3660  %r2 = fadd float %r0, %r1
3661  %out = fadd float %r2, %idx3
3662
3663  ret float %out
3664}
3665
; Declarations of intrinsics called by the tests. A trailing #N names one of
; the attribute groups listed at the end of the file.
; NOTE(review): llvm.amdgcn.init.exec, llvm.amdgcn.ballot.i64 and
; llvm.amdgcn.s.buffer.load.i32 are called by the tests above but are not
; declared in this list; presumably the IR parser declares them implicitly.
; Confirm before relying on this list as complete.
3666declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
3667declare void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float>, i32, i32, <8 x i32>, i32, i32) #1
3668
; Buffer intrinsics taking the resource as <4 x i32>.
3669declare void @llvm.amdgcn.struct.buffer.store.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #2
3670declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32 immarg) #2
3671declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32 immarg) #2
3672declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32 immarg) #2
3673declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32) #3
3674declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32) #3
3675
; The same buffer intrinsics in their `ptr addrspace(8)` resource form.
3676declare void @llvm.amdgcn.struct.ptr.buffer.store.f32(float, ptr addrspace(8), i32, i32, i32, i32 immarg) #2
3677declare void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float>, ptr addrspace(8), i32, i32, i32, i32 immarg) #2
3678declare void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float>, ptr addrspace(8), i32, i32, i32 immarg) #2
3679declare void @llvm.amdgcn.raw.ptr.buffer.store.f32(float, ptr addrspace(8), i32, i32, i32 immarg) #2
3680declare float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8), i32, i32, i32) #3
3681declare float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8), i32, i32, i32, i32) #3
3682
; Image, kill, wqm/wwm and remaining helper intrinsics.
3683declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #3
3684declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
3685declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
3686declare float @llvm.amdgcn.image.sample.1d.f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
3687declare void @llvm.amdgcn.kill(i1) #1
3688declare float @llvm.amdgcn.wqm.f32(float) #3
3689declare i32 @llvm.amdgcn.wqm.i32(i32) #3
3690declare float @llvm.amdgcn.strict.wwm.f32(float) #3
3691declare i32 @llvm.amdgcn.strict.wwm.i32(i32) #3
3692declare float @llvm.amdgcn.wwm.f32(float) #3
3693declare i32 @llvm.amdgcn.wwm.i32(i32) #3
3694declare float @llvm.amdgcn.strict.wqm.f32(float) #3
3695declare i32 @llvm.amdgcn.strict.wqm.i32(i32) #3
3696declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #4
3697declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #3
3698declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #3
3699declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #3
3700declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #1
3701declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #2
3702declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #2
3703declare i32 @llvm.amdgcn.ds.swizzle(i32, i32)
3704declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32 immarg) #7
3705declare i32 @llvm.amdgcn.readfirstlane.i32(i32)
3706
; Attribute groups referenced as #N by the definitions, declarations and call
; sites in this file.
; NOTE(review): several image sample calls in the short_exact_regions tests
; reference #0, but no `attributes #0` appears in this list; confirm whether a
; definition exists elsewhere in the file or the reference is intentionally
; left undefined.
3707attributes #1 = { nounwind }
3708attributes #2 = { nounwind readonly }
3709attributes #3 = { nounwind readnone }
3710attributes #4 = { nounwind readnone convergent }
3711attributes #5 = { "amdgpu-ps-wqm-outputs" }
3712attributes #6 = { nounwind "InitialPSInputAddr"="2" }
3713attributes #7 = { nounwind readnone willreturn }
3714