xref: /llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll (revision ba52f06f9d92c7ca04b440f618f8d352ea121fcc)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - %s | FileCheck -check-prefix=GFX6 %s
3; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10NSA %s
4; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -amdgpu-enable-vopd=0 -o - %s | FileCheck -check-prefix=GFX10NSA %s
5; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -amdgpu-enable-delay-alu=0 -amdgpu-enable-vopd=0 -o - %s | FileCheck -check-prefix=GFX12 %s
6
; Baseline case: 2D gather4 taking two float coordinates (%s, %t).
; The resource (%rsrc) and sampler (%samp) descriptors are inreg arguments;
; in the expected output for every RUN configuration they are copied from
; s[2:13] down to s[0:11] ahead of the image instruction, and exec is put
; through s_wqm and then and-ed with its saved value.
; The leading `i32 1` operand of the intrinsic appears as dmask:0x1 in the
; expected instruction. The trailing `i1 false, i32 0, i32 0` operands are
; presumably unorm / texfailctrl / cachepolicy -- TODO confirm against the
; intrinsic definition.
; The assertion lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing by hand.
7define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
8; GFX6-LABEL: gather4_2d:
9; GFX6:       ; %bb.0: ; %main_body
10; GFX6-NEXT:    s_mov_b64 s[14:15], exec
11; GFX6-NEXT:    s_mov_b32 s0, s2
12; GFX6-NEXT:    s_mov_b32 s1, s3
13; GFX6-NEXT:    s_mov_b32 s2, s4
14; GFX6-NEXT:    s_mov_b32 s3, s5
15; GFX6-NEXT:    s_mov_b32 s4, s6
16; GFX6-NEXT:    s_mov_b32 s5, s7
17; GFX6-NEXT:    s_mov_b32 s6, s8
18; GFX6-NEXT:    s_mov_b32 s7, s9
19; GFX6-NEXT:    s_mov_b32 s8, s10
20; GFX6-NEXT:    s_mov_b32 s9, s11
21; GFX6-NEXT:    s_mov_b32 s10, s12
22; GFX6-NEXT:    s_mov_b32 s11, s13
23; GFX6-NEXT:    s_wqm_b64 exec, exec
24; GFX6-NEXT:    s_and_b64 exec, exec, s[14:15]
25; GFX6-NEXT:    image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1
26; GFX6-NEXT:    s_waitcnt vmcnt(0)
27; GFX6-NEXT:    ; return to shader part epilog
28;
29; GFX10NSA-LABEL: gather4_2d:
30; GFX10NSA:       ; %bb.0: ; %main_body
31; GFX10NSA-NEXT:    s_mov_b32 s1, exec_lo
32; GFX10NSA-NEXT:    s_mov_b32 s0, s2
33; GFX10NSA-NEXT:    s_wqm_b32 exec_lo, exec_lo
34; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s1
35; GFX10NSA-NEXT:    s_mov_b32 s1, s3
36; GFX10NSA-NEXT:    s_mov_b32 s2, s4
37; GFX10NSA-NEXT:    s_mov_b32 s3, s5
38; GFX10NSA-NEXT:    s_mov_b32 s4, s6
39; GFX10NSA-NEXT:    s_mov_b32 s5, s7
40; GFX10NSA-NEXT:    s_mov_b32 s6, s8
41; GFX10NSA-NEXT:    s_mov_b32 s7, s9
42; GFX10NSA-NEXT:    s_mov_b32 s8, s10
43; GFX10NSA-NEXT:    s_mov_b32 s9, s11
44; GFX10NSA-NEXT:    s_mov_b32 s10, s12
45; GFX10NSA-NEXT:    s_mov_b32 s11, s13
46; GFX10NSA-NEXT:    image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
47; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
48; GFX10NSA-NEXT:    ; return to shader part epilog
49;
50; GFX12-LABEL: gather4_2d:
51; GFX12:       ; %bb.0: ; %main_body
52; GFX12-NEXT:    s_mov_b32 s1, exec_lo
53; GFX12-NEXT:    s_mov_b32 s0, s2
54; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
55; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s1
56; GFX12-NEXT:    s_mov_b32 s1, s3
57; GFX12-NEXT:    s_mov_b32 s2, s4
58; GFX12-NEXT:    s_mov_b32 s3, s5
59; GFX12-NEXT:    s_mov_b32 s4, s6
60; GFX12-NEXT:    s_mov_b32 s5, s7
61; GFX12-NEXT:    s_mov_b32 s6, s8
62; GFX12-NEXT:    s_mov_b32 s7, s9
63; GFX12-NEXT:    s_mov_b32 s8, s10
64; GFX12-NEXT:    s_mov_b32 s9, s11
65; GFX12-NEXT:    s_mov_b32 s10, s12
66; GFX12-NEXT:    s_mov_b32 s11, s13
67; GFX12-NEXT:    image_gather4 v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
68; GFX12-NEXT:    s_wait_samplecnt 0x0
69; GFX12-NEXT:    ; return to shader part epilog
70main_body:
71  %v = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
72  ret <4 x float> %v
73}
74
; TFE variant of gather4_2d: the second-to-last i32 operand of the intrinsic
; is 1 (it is 0 in gather4_2d) and the intrinsic returns
; { <4 x float>, i32 } instead of a plain <4 x float>.
; In the expected output the image instruction carries the `tfe` modifier and
; a five-register destination v[0:4]. Those five registers are zero-filled
; first (v0 = 0, then v1..v4 copied from v0), so the incoming coordinates are
; moved out of the way into v5/v6 before being used as the address.
; The assertion lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing by hand.
75define amdgpu_ps <4 x float> @gather4_2d_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
76; GFX6-LABEL: gather4_2d_tfe:
77; GFX6:       ; %bb.0: ; %main_body
78; GFX6-NEXT:    s_mov_b64 s[14:15], exec
79; GFX6-NEXT:    s_mov_b32 s0, s2
80; GFX6-NEXT:    s_mov_b32 s1, s3
81; GFX6-NEXT:    s_mov_b32 s2, s4
82; GFX6-NEXT:    s_mov_b32 s3, s5
83; GFX6-NEXT:    s_mov_b32 s4, s6
84; GFX6-NEXT:    s_mov_b32 s5, s7
85; GFX6-NEXT:    s_mov_b32 s6, s8
86; GFX6-NEXT:    s_mov_b32 s7, s9
87; GFX6-NEXT:    s_mov_b32 s8, s10
88; GFX6-NEXT:    s_mov_b32 s9, s11
89; GFX6-NEXT:    s_mov_b32 s10, s12
90; GFX6-NEXT:    s_mov_b32 s11, s13
91; GFX6-NEXT:    s_wqm_b64 exec, exec
92; GFX6-NEXT:    v_mov_b32_e32 v5, v0
93; GFX6-NEXT:    v_mov_b32_e32 v0, 0
94; GFX6-NEXT:    v_mov_b32_e32 v6, v1
95; GFX6-NEXT:    v_mov_b32_e32 v1, v0
96; GFX6-NEXT:    v_mov_b32_e32 v2, v0
97; GFX6-NEXT:    v_mov_b32_e32 v3, v0
98; GFX6-NEXT:    v_mov_b32_e32 v4, v0
99; GFX6-NEXT:    s_and_b64 exec, exec, s[14:15]
100; GFX6-NEXT:    image_gather4 v[0:4], v[5:6], s[0:7], s[8:11] dmask:0x1 tfe
101; GFX6-NEXT:    s_waitcnt vmcnt(0)
102; GFX6-NEXT:    ; return to shader part epilog
103;
104; GFX10NSA-LABEL: gather4_2d_tfe:
105; GFX10NSA:       ; %bb.0: ; %main_body
106; GFX10NSA-NEXT:    s_mov_b32 s14, exec_lo
107; GFX10NSA-NEXT:    s_mov_b32 s0, s2
108; GFX10NSA-NEXT:    s_wqm_b32 exec_lo, exec_lo
109; GFX10NSA-NEXT:    v_mov_b32_e32 v5, v0
110; GFX10NSA-NEXT:    v_mov_b32_e32 v0, 0
111; GFX10NSA-NEXT:    v_mov_b32_e32 v6, v1
112; GFX10NSA-NEXT:    s_mov_b32 s1, s3
113; GFX10NSA-NEXT:    s_mov_b32 s2, s4
114; GFX10NSA-NEXT:    s_mov_b32 s3, s5
115; GFX10NSA-NEXT:    s_mov_b32 s4, s6
116; GFX10NSA-NEXT:    s_mov_b32 s5, s7
117; GFX10NSA-NEXT:    s_mov_b32 s6, s8
118; GFX10NSA-NEXT:    s_mov_b32 s7, s9
119; GFX10NSA-NEXT:    s_mov_b32 s8, s10
120; GFX10NSA-NEXT:    s_mov_b32 s9, s11
121; GFX10NSA-NEXT:    s_mov_b32 s10, s12
122; GFX10NSA-NEXT:    s_mov_b32 s11, s13
123; GFX10NSA-NEXT:    v_mov_b32_e32 v1, v0
124; GFX10NSA-NEXT:    v_mov_b32_e32 v2, v0
125; GFX10NSA-NEXT:    v_mov_b32_e32 v3, v0
126; GFX10NSA-NEXT:    v_mov_b32_e32 v4, v0
127; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s14
128; GFX10NSA-NEXT:    image_gather4 v[0:4], v[5:6], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe
129; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
130; GFX10NSA-NEXT:    ; return to shader part epilog
131;
132; GFX12-LABEL: gather4_2d_tfe:
133; GFX12:       ; %bb.0: ; %main_body
134; GFX12-NEXT:    s_mov_b32 s14, exec_lo
135; GFX12-NEXT:    s_mov_b32 s0, s2
136; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
137; GFX12-NEXT:    v_mov_b32_e32 v5, v0
138; GFX12-NEXT:    v_mov_b32_e32 v0, 0
139; GFX12-NEXT:    v_mov_b32_e32 v6, v1
140; GFX12-NEXT:    s_mov_b32 s1, s3
141; GFX12-NEXT:    s_mov_b32 s2, s4
142; GFX12-NEXT:    s_mov_b32 s3, s5
143; GFX12-NEXT:    s_mov_b32 s4, s6
144; GFX12-NEXT:    s_mov_b32 s5, s7
145; GFX12-NEXT:    s_mov_b32 s6, s8
146; GFX12-NEXT:    s_mov_b32 s7, s9
147; GFX12-NEXT:    s_mov_b32 s8, s10
148; GFX12-NEXT:    s_mov_b32 s9, s11
149; GFX12-NEXT:    s_mov_b32 s10, s12
150; GFX12-NEXT:    s_mov_b32 s11, s13
151; GFX12-NEXT:    v_mov_b32_e32 v1, v0
152; GFX12-NEXT:    v_mov_b32_e32 v2, v0
153; GFX12-NEXT:    v_mov_b32_e32 v3, v0
154; GFX12-NEXT:    v_mov_b32_e32 v4, v0
155; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s14
156; GFX12-NEXT:    image_gather4 v[0:4], [v5, v6], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe
157; GFX12-NEXT:    s_wait_samplecnt 0x0
158; GFX12-NEXT:    ; return to shader part epilog
159main_body:
160  %v = call { <4 x float>, i32 } @llvm.amdgcn.image.gather4.2d.sl_v4f32i32s.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0)
  ; Only struct element 0 (the <4 x float> texel data) is returned; the
  ; i32 element of the result is never extracted in this function.
161  %r = extractvalue { <4 x float>, i32 } %v, 0
162  ret <4 x float> %r
163}
164
; Cube-map gather4: three float address operands (%s, %t, %face).
; The expected instruction uses a three-register address (v[0:2], written as
; [v0, v1, v2] for the gfx1200 run). The tahiti run expects the `da`
; modifier; the other runs expect dim:SQ_RSRC_IMG_CUBE instead.
; Descriptor copies (s[2:13] -> s[0:11]) and the s_wqm / s_and handling of
; exec match the plain 2D case.
; The assertion lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing by hand.
165define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %face) {
166; GFX6-LABEL: gather4_cube:
167; GFX6:       ; %bb.0: ; %main_body
168; GFX6-NEXT:    s_mov_b64 s[14:15], exec
169; GFX6-NEXT:    s_mov_b32 s0, s2
170; GFX6-NEXT:    s_mov_b32 s1, s3
171; GFX6-NEXT:    s_mov_b32 s2, s4
172; GFX6-NEXT:    s_mov_b32 s3, s5
173; GFX6-NEXT:    s_mov_b32 s4, s6
174; GFX6-NEXT:    s_mov_b32 s5, s7
175; GFX6-NEXT:    s_mov_b32 s6, s8
176; GFX6-NEXT:    s_mov_b32 s7, s9
177; GFX6-NEXT:    s_mov_b32 s8, s10
178; GFX6-NEXT:    s_mov_b32 s9, s11
179; GFX6-NEXT:    s_mov_b32 s10, s12
180; GFX6-NEXT:    s_mov_b32 s11, s13
181; GFX6-NEXT:    s_wqm_b64 exec, exec
182; GFX6-NEXT:    s_and_b64 exec, exec, s[14:15]
183; GFX6-NEXT:    image_gather4 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 da
184; GFX6-NEXT:    s_waitcnt vmcnt(0)
185; GFX6-NEXT:    ; return to shader part epilog
186;
187; GFX10NSA-LABEL: gather4_cube:
188; GFX10NSA:       ; %bb.0: ; %main_body
189; GFX10NSA-NEXT:    s_mov_b32 s1, exec_lo
190; GFX10NSA-NEXT:    s_mov_b32 s0, s2
191; GFX10NSA-NEXT:    s_wqm_b32 exec_lo, exec_lo
192; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s1
193; GFX10NSA-NEXT:    s_mov_b32 s1, s3
194; GFX10NSA-NEXT:    s_mov_b32 s2, s4
195; GFX10NSA-NEXT:    s_mov_b32 s3, s5
196; GFX10NSA-NEXT:    s_mov_b32 s4, s6
197; GFX10NSA-NEXT:    s_mov_b32 s5, s7
198; GFX10NSA-NEXT:    s_mov_b32 s6, s8
199; GFX10NSA-NEXT:    s_mov_b32 s7, s9
200; GFX10NSA-NEXT:    s_mov_b32 s8, s10
201; GFX10NSA-NEXT:    s_mov_b32 s9, s11
202; GFX10NSA-NEXT:    s_mov_b32 s10, s12
203; GFX10NSA-NEXT:    s_mov_b32 s11, s13
204; GFX10NSA-NEXT:    image_gather4 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE
205; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
206; GFX10NSA-NEXT:    ; return to shader part epilog
207;
208; GFX12-LABEL: gather4_cube:
209; GFX12:       ; %bb.0: ; %main_body
210; GFX12-NEXT:    s_mov_b32 s1, exec_lo
211; GFX12-NEXT:    s_mov_b32 s0, s2
212; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
213; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s1
214; GFX12-NEXT:    s_mov_b32 s1, s3
215; GFX12-NEXT:    s_mov_b32 s2, s4
216; GFX12-NEXT:    s_mov_b32 s3, s5
217; GFX12-NEXT:    s_mov_b32 s4, s6
218; GFX12-NEXT:    s_mov_b32 s5, s7
219; GFX12-NEXT:    s_mov_b32 s6, s8
220; GFX12-NEXT:    s_mov_b32 s7, s9
221; GFX12-NEXT:    s_mov_b32 s8, s10
222; GFX12-NEXT:    s_mov_b32 s9, s11
223; GFX12-NEXT:    s_mov_b32 s10, s12
224; GFX12-NEXT:    s_mov_b32 s11, s13
225; GFX12-NEXT:    image_gather4 v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE
226; GFX12-NEXT:    s_wait_samplecnt 0x0
227; GFX12-NEXT:    ; return to shader part epilog
228main_body:
229  %v = call <4 x float> @llvm.amdgcn.image.gather4.cube.v4f32.f32(i32 1, float %s, float %t, float %face, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
230  ret <4 x float> %v
231}
232
; 2D-array gather4: three float address operands (%s, %t, %slice).
; The expected instruction uses a three-register address (v[0:2], written as
; [v0, v1, v2] for the gfx1200 run). The tahiti run expects the `da`
; modifier; the other runs expect dim:SQ_RSRC_IMG_2D_ARRAY instead.
; Descriptor copies (s[2:13] -> s[0:11]) and the s_wqm / s_and handling of
; exec match the plain 2D case.
; The assertion lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing by hand.
233define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %slice) {
234; GFX6-LABEL: gather4_2darray:
235; GFX6:       ; %bb.0: ; %main_body
236; GFX6-NEXT:    s_mov_b64 s[14:15], exec
237; GFX6-NEXT:    s_mov_b32 s0, s2
238; GFX6-NEXT:    s_mov_b32 s1, s3
239; GFX6-NEXT:    s_mov_b32 s2, s4
240; GFX6-NEXT:    s_mov_b32 s3, s5
241; GFX6-NEXT:    s_mov_b32 s4, s6
242; GFX6-NEXT:    s_mov_b32 s5, s7
243; GFX6-NEXT:    s_mov_b32 s6, s8
244; GFX6-NEXT:    s_mov_b32 s7, s9
245; GFX6-NEXT:    s_mov_b32 s8, s10
246; GFX6-NEXT:    s_mov_b32 s9, s11
247; GFX6-NEXT:    s_mov_b32 s10, s12
248; GFX6-NEXT:    s_mov_b32 s11, s13
249; GFX6-NEXT:    s_wqm_b64 exec, exec
250; GFX6-NEXT:    s_and_b64 exec, exec, s[14:15]
251; GFX6-NEXT:    image_gather4 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 da
252; GFX6-NEXT:    s_waitcnt vmcnt(0)
253; GFX6-NEXT:    ; return to shader part epilog
254;
255; GFX10NSA-LABEL: gather4_2darray:
256; GFX10NSA:       ; %bb.0: ; %main_body
257; GFX10NSA-NEXT:    s_mov_b32 s1, exec_lo
258; GFX10NSA-NEXT:    s_mov_b32 s0, s2
259; GFX10NSA-NEXT:    s_wqm_b32 exec_lo, exec_lo
260; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s1
261; GFX10NSA-NEXT:    s_mov_b32 s1, s3
262; GFX10NSA-NEXT:    s_mov_b32 s2, s4
263; GFX10NSA-NEXT:    s_mov_b32 s3, s5
264; GFX10NSA-NEXT:    s_mov_b32 s4, s6
265; GFX10NSA-NEXT:    s_mov_b32 s5, s7
266; GFX10NSA-NEXT:    s_mov_b32 s6, s8
267; GFX10NSA-NEXT:    s_mov_b32 s7, s9
268; GFX10NSA-NEXT:    s_mov_b32 s8, s10
269; GFX10NSA-NEXT:    s_mov_b32 s9, s11
270; GFX10NSA-NEXT:    s_mov_b32 s10, s12
271; GFX10NSA-NEXT:    s_mov_b32 s11, s13
272; GFX10NSA-NEXT:    image_gather4 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
273; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
274; GFX10NSA-NEXT:    ; return to shader part epilog
275;
276; GFX12-LABEL: gather4_2darray:
277; GFX12:       ; %bb.0: ; %main_body
278; GFX12-NEXT:    s_mov_b32 s1, exec_lo
279; GFX12-NEXT:    s_mov_b32 s0, s2
280; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
281; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s1
282; GFX12-NEXT:    s_mov_b32 s1, s3
283; GFX12-NEXT:    s_mov_b32 s2, s4
284; GFX12-NEXT:    s_mov_b32 s3, s5
285; GFX12-NEXT:    s_mov_b32 s4, s6
286; GFX12-NEXT:    s_mov_b32 s5, s7
287; GFX12-NEXT:    s_mov_b32 s6, s8
288; GFX12-NEXT:    s_mov_b32 s7, s9
289; GFX12-NEXT:    s_mov_b32 s8, s10
290; GFX12-NEXT:    s_mov_b32 s9, s11
291; GFX12-NEXT:    s_mov_b32 s10, s12
292; GFX12-NEXT:    s_mov_b32 s11, s13
293; GFX12-NEXT:    image_gather4 v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
294; GFX12-NEXT:    s_wait_samplecnt 0x0
295; GFX12-NEXT:    ; return to shader part epilog
296main_body:
297  %v = call <4 x float> @llvm.amdgcn.image.gather4.2darray.v4f32.f32(i32 1, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
298  ret <4 x float> %v
299}
300
; `.c` (compare) variant on a 2D image: %zcompare is passed ahead of the
; coordinates (%s, %t), giving three float address operands.
; The expected mnemonic is image_gather4_c with a three-register address
; (v[0:2], written as [v0, v1, v2] for the gfx1200 run). No `da` modifier is
; expected on the tahiti run; the other runs expect dim:SQ_RSRC_IMG_2D.
; The assertion lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing by hand.
301define amdgpu_ps <4 x float> @gather4_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t) {
302; GFX6-LABEL: gather4_c_2d:
303; GFX6:       ; %bb.0: ; %main_body
304; GFX6-NEXT:    s_mov_b64 s[14:15], exec
305; GFX6-NEXT:    s_mov_b32 s0, s2
306; GFX6-NEXT:    s_mov_b32 s1, s3
307; GFX6-NEXT:    s_mov_b32 s2, s4
308; GFX6-NEXT:    s_mov_b32 s3, s5
309; GFX6-NEXT:    s_mov_b32 s4, s6
310; GFX6-NEXT:    s_mov_b32 s5, s7
311; GFX6-NEXT:    s_mov_b32 s6, s8
312; GFX6-NEXT:    s_mov_b32 s7, s9
313; GFX6-NEXT:    s_mov_b32 s8, s10
314; GFX6-NEXT:    s_mov_b32 s9, s11
315; GFX6-NEXT:    s_mov_b32 s10, s12
316; GFX6-NEXT:    s_mov_b32 s11, s13
317; GFX6-NEXT:    s_wqm_b64 exec, exec
318; GFX6-NEXT:    s_and_b64 exec, exec, s[14:15]
319; GFX6-NEXT:    image_gather4_c v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1
320; GFX6-NEXT:    s_waitcnt vmcnt(0)
321; GFX6-NEXT:    ; return to shader part epilog
322;
323; GFX10NSA-LABEL: gather4_c_2d:
324; GFX10NSA:       ; %bb.0: ; %main_body
325; GFX10NSA-NEXT:    s_mov_b32 s1, exec_lo
326; GFX10NSA-NEXT:    s_mov_b32 s0, s2
327; GFX10NSA-NEXT:    s_wqm_b32 exec_lo, exec_lo
328; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s1
329; GFX10NSA-NEXT:    s_mov_b32 s1, s3
330; GFX10NSA-NEXT:    s_mov_b32 s2, s4
331; GFX10NSA-NEXT:    s_mov_b32 s3, s5
332; GFX10NSA-NEXT:    s_mov_b32 s4, s6
333; GFX10NSA-NEXT:    s_mov_b32 s5, s7
334; GFX10NSA-NEXT:    s_mov_b32 s6, s8
335; GFX10NSA-NEXT:    s_mov_b32 s7, s9
336; GFX10NSA-NEXT:    s_mov_b32 s8, s10
337; GFX10NSA-NEXT:    s_mov_b32 s9, s11
338; GFX10NSA-NEXT:    s_mov_b32 s10, s12
339; GFX10NSA-NEXT:    s_mov_b32 s11, s13
340; GFX10NSA-NEXT:    image_gather4_c v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
341; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
342; GFX10NSA-NEXT:    ; return to shader part epilog
343;
344; GFX12-LABEL: gather4_c_2d:
345; GFX12:       ; %bb.0: ; %main_body
346; GFX12-NEXT:    s_mov_b32 s1, exec_lo
347; GFX12-NEXT:    s_mov_b32 s0, s2
348; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
349; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s1
350; GFX12-NEXT:    s_mov_b32 s1, s3
351; GFX12-NEXT:    s_mov_b32 s2, s4
352; GFX12-NEXT:    s_mov_b32 s3, s5
353; GFX12-NEXT:    s_mov_b32 s4, s6
354; GFX12-NEXT:    s_mov_b32 s5, s7
355; GFX12-NEXT:    s_mov_b32 s6, s8
356; GFX12-NEXT:    s_mov_b32 s7, s9
357; GFX12-NEXT:    s_mov_b32 s8, s10
358; GFX12-NEXT:    s_mov_b32 s9, s11
359; GFX12-NEXT:    s_mov_b32 s10, s12
360; GFX12-NEXT:    s_mov_b32 s11, s13
361; GFX12-NEXT:    image_gather4_c v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
362; GFX12-NEXT:    s_wait_samplecnt 0x0
363; GFX12-NEXT:    ; return to shader part epilog
364main_body:
365  %v = call <4 x float> @llvm.amdgcn.image.gather4.c.2d.v4f32.f32(i32 1, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
366  ret <4 x float> %v
367}
368
; `.cl` variant on a 2D image: %clamp is passed after the coordinates
; (%s, %t), giving three float address operands.
; The expected mnemonic is image_gather4_cl with a three-register address
; (v[0:2], written as [v0, v1, v2] for the gfx1200 run). The gfx1010,
; gfx1100 and gfx1200 runs additionally expect dim:SQ_RSRC_IMG_2D.
; The assertion lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing by hand.
369define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %clamp) {
370; GFX6-LABEL: gather4_cl_2d:
371; GFX6:       ; %bb.0: ; %main_body
372; GFX6-NEXT:    s_mov_b64 s[14:15], exec
373; GFX6-NEXT:    s_mov_b32 s0, s2
374; GFX6-NEXT:    s_mov_b32 s1, s3
375; GFX6-NEXT:    s_mov_b32 s2, s4
376; GFX6-NEXT:    s_mov_b32 s3, s5
377; GFX6-NEXT:    s_mov_b32 s4, s6
378; GFX6-NEXT:    s_mov_b32 s5, s7
379; GFX6-NEXT:    s_mov_b32 s6, s8
380; GFX6-NEXT:    s_mov_b32 s7, s9
381; GFX6-NEXT:    s_mov_b32 s8, s10
382; GFX6-NEXT:    s_mov_b32 s9, s11
383; GFX6-NEXT:    s_mov_b32 s10, s12
384; GFX6-NEXT:    s_mov_b32 s11, s13
385; GFX6-NEXT:    s_wqm_b64 exec, exec
386; GFX6-NEXT:    s_and_b64 exec, exec, s[14:15]
387; GFX6-NEXT:    image_gather4_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1
388; GFX6-NEXT:    s_waitcnt vmcnt(0)
389; GFX6-NEXT:    ; return to shader part epilog
390;
391; GFX10NSA-LABEL: gather4_cl_2d:
392; GFX10NSA:       ; %bb.0: ; %main_body
393; GFX10NSA-NEXT:    s_mov_b32 s1, exec_lo
394; GFX10NSA-NEXT:    s_mov_b32 s0, s2
395; GFX10NSA-NEXT:    s_wqm_b32 exec_lo, exec_lo
396; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s1
397; GFX10NSA-NEXT:    s_mov_b32 s1, s3
398; GFX10NSA-NEXT:    s_mov_b32 s2, s4
399; GFX10NSA-NEXT:    s_mov_b32 s3, s5
400; GFX10NSA-NEXT:    s_mov_b32 s4, s6
401; GFX10NSA-NEXT:    s_mov_b32 s5, s7
402; GFX10NSA-NEXT:    s_mov_b32 s6, s8
403; GFX10NSA-NEXT:    s_mov_b32 s7, s9
404; GFX10NSA-NEXT:    s_mov_b32 s8, s10
405; GFX10NSA-NEXT:    s_mov_b32 s9, s11
406; GFX10NSA-NEXT:    s_mov_b32 s10, s12
407; GFX10NSA-NEXT:    s_mov_b32 s11, s13
408; GFX10NSA-NEXT:    image_gather4_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
409; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
410; GFX10NSA-NEXT:    ; return to shader part epilog
411;
412; GFX12-LABEL: gather4_cl_2d:
413; GFX12:       ; %bb.0: ; %main_body
414; GFX12-NEXT:    s_mov_b32 s1, exec_lo
415; GFX12-NEXT:    s_mov_b32 s0, s2
416; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
417; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s1
418; GFX12-NEXT:    s_mov_b32 s1, s3
419; GFX12-NEXT:    s_mov_b32 s2, s4
420; GFX12-NEXT:    s_mov_b32 s3, s5
421; GFX12-NEXT:    s_mov_b32 s4, s6
422; GFX12-NEXT:    s_mov_b32 s5, s7
423; GFX12-NEXT:    s_mov_b32 s6, s8
424; GFX12-NEXT:    s_mov_b32 s7, s9
425; GFX12-NEXT:    s_mov_b32 s8, s10
426; GFX12-NEXT:    s_mov_b32 s9, s11
427; GFX12-NEXT:    s_mov_b32 s10, s12
428; GFX12-NEXT:    s_mov_b32 s11, s13
429; GFX12-NEXT:    image_gather4_cl v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
430; GFX12-NEXT:    s_wait_samplecnt 0x0
431; GFX12-NEXT:    ; return to shader part epilog
432main_body:
433  %v = call <4 x float> @llvm.amdgcn.image.gather4.cl.2d.v4f32.f32(i32 1, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
434  ret <4 x float> %v
435}
436
; `.c.cl` variant on a 2D image: %zcompare precedes the coordinates
; (%s, %t) and %clamp follows them, giving four float address operands.
; The expected mnemonic is image_gather4_c_cl with a four-register address
; (v[0:3], written as [v0, v1, v2, v3] for the gfx1200 run). The gfx1010,
; gfx1100 and gfx1200 runs additionally expect dim:SQ_RSRC_IMG_2D.
; The assertion lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing by hand.
437define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t, float %clamp) {
438; GFX6-LABEL: gather4_c_cl_2d:
439; GFX6:       ; %bb.0: ; %main_body
440; GFX6-NEXT:    s_mov_b64 s[14:15], exec
441; GFX6-NEXT:    s_mov_b32 s0, s2
442; GFX6-NEXT:    s_mov_b32 s1, s3
443; GFX6-NEXT:    s_mov_b32 s2, s4
444; GFX6-NEXT:    s_mov_b32 s3, s5
445; GFX6-NEXT:    s_mov_b32 s4, s6
446; GFX6-NEXT:    s_mov_b32 s5, s7
447; GFX6-NEXT:    s_mov_b32 s6, s8
448; GFX6-NEXT:    s_mov_b32 s7, s9
449; GFX6-NEXT:    s_mov_b32 s8, s10
450; GFX6-NEXT:    s_mov_b32 s9, s11
451; GFX6-NEXT:    s_mov_b32 s10, s12
452; GFX6-NEXT:    s_mov_b32 s11, s13
453; GFX6-NEXT:    s_wqm_b64 exec, exec
454; GFX6-NEXT:    s_and_b64 exec, exec, s[14:15]
455; GFX6-NEXT:    image_gather4_c_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1
456; GFX6-NEXT:    s_waitcnt vmcnt(0)
457; GFX6-NEXT:    ; return to shader part epilog
458;
459; GFX10NSA-LABEL: gather4_c_cl_2d:
460; GFX10NSA:       ; %bb.0: ; %main_body
461; GFX10NSA-NEXT:    s_mov_b32 s1, exec_lo
462; GFX10NSA-NEXT:    s_mov_b32 s0, s2
463; GFX10NSA-NEXT:    s_wqm_b32 exec_lo, exec_lo
464; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s1
465; GFX10NSA-NEXT:    s_mov_b32 s1, s3
466; GFX10NSA-NEXT:    s_mov_b32 s2, s4
467; GFX10NSA-NEXT:    s_mov_b32 s3, s5
468; GFX10NSA-NEXT:    s_mov_b32 s4, s6
469; GFX10NSA-NEXT:    s_mov_b32 s5, s7
470; GFX10NSA-NEXT:    s_mov_b32 s6, s8
471; GFX10NSA-NEXT:    s_mov_b32 s7, s9
472; GFX10NSA-NEXT:    s_mov_b32 s8, s10
473; GFX10NSA-NEXT:    s_mov_b32 s9, s11
474; GFX10NSA-NEXT:    s_mov_b32 s10, s12
475; GFX10NSA-NEXT:    s_mov_b32 s11, s13
476; GFX10NSA-NEXT:    image_gather4_c_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
477; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
478; GFX10NSA-NEXT:    ; return to shader part epilog
479;
480; GFX12-LABEL: gather4_c_cl_2d:
481; GFX12:       ; %bb.0: ; %main_body
482; GFX12-NEXT:    s_mov_b32 s1, exec_lo
483; GFX12-NEXT:    s_mov_b32 s0, s2
484; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
485; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s1
486; GFX12-NEXT:    s_mov_b32 s1, s3
487; GFX12-NEXT:    s_mov_b32 s2, s4
488; GFX12-NEXT:    s_mov_b32 s3, s5
489; GFX12-NEXT:    s_mov_b32 s4, s6
490; GFX12-NEXT:    s_mov_b32 s5, s7
491; GFX12-NEXT:    s_mov_b32 s6, s8
492; GFX12-NEXT:    s_mov_b32 s7, s9
493; GFX12-NEXT:    s_mov_b32 s8, s10
494; GFX12-NEXT:    s_mov_b32 s9, s11
495; GFX12-NEXT:    s_mov_b32 s10, s12
496; GFX12-NEXT:    s_mov_b32 s11, s13
497; GFX12-NEXT:    image_gather4_c_cl v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
498; GFX12-NEXT:    s_wait_samplecnt 0x0
499; GFX12-NEXT:    ; return to shader part epilog
500main_body:
501  %v = call <4 x float> @llvm.amdgcn.image.gather4.c.cl.2d.v4f32.f32(i32 1, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
502  ret <4 x float> %v
503}
504
; `.b` (bias) variant on a 2D image: %bias is passed ahead of the
; coordinates (%s, %t), giving three float address operands.
; The intrinsic name carries two `.f32` suffixes here (one more than the
; non-bias variants); presumably the extra one is the overloaded bias type --
; TODO confirm against the intrinsic definition.
; The expected mnemonic is image_gather4_b with a three-register address
; (v[0:2], written as [v0, v1, v2] for the gfx1200 run). The gfx1010,
; gfx1100 and gfx1200 runs additionally expect dim:SQ_RSRC_IMG_2D.
; The assertion lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing by hand.
505define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) {
506; GFX6-LABEL: gather4_b_2d:
507; GFX6:       ; %bb.0: ; %main_body
508; GFX6-NEXT:    s_mov_b64 s[14:15], exec
509; GFX6-NEXT:    s_mov_b32 s0, s2
510; GFX6-NEXT:    s_mov_b32 s1, s3
511; GFX6-NEXT:    s_mov_b32 s2, s4
512; GFX6-NEXT:    s_mov_b32 s3, s5
513; GFX6-NEXT:    s_mov_b32 s4, s6
514; GFX6-NEXT:    s_mov_b32 s5, s7
515; GFX6-NEXT:    s_mov_b32 s6, s8
516; GFX6-NEXT:    s_mov_b32 s7, s9
517; GFX6-NEXT:    s_mov_b32 s8, s10
518; GFX6-NEXT:    s_mov_b32 s9, s11
519; GFX6-NEXT:    s_mov_b32 s10, s12
520; GFX6-NEXT:    s_mov_b32 s11, s13
521; GFX6-NEXT:    s_wqm_b64 exec, exec
522; GFX6-NEXT:    s_and_b64 exec, exec, s[14:15]
523; GFX6-NEXT:    image_gather4_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1
524; GFX6-NEXT:    s_waitcnt vmcnt(0)
525; GFX6-NEXT:    ; return to shader part epilog
526;
527; GFX10NSA-LABEL: gather4_b_2d:
528; GFX10NSA:       ; %bb.0: ; %main_body
529; GFX10NSA-NEXT:    s_mov_b32 s1, exec_lo
530; GFX10NSA-NEXT:    s_mov_b32 s0, s2
531; GFX10NSA-NEXT:    s_wqm_b32 exec_lo, exec_lo
532; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s1
533; GFX10NSA-NEXT:    s_mov_b32 s1, s3
534; GFX10NSA-NEXT:    s_mov_b32 s2, s4
535; GFX10NSA-NEXT:    s_mov_b32 s3, s5
536; GFX10NSA-NEXT:    s_mov_b32 s4, s6
537; GFX10NSA-NEXT:    s_mov_b32 s5, s7
538; GFX10NSA-NEXT:    s_mov_b32 s6, s8
539; GFX10NSA-NEXT:    s_mov_b32 s7, s9
540; GFX10NSA-NEXT:    s_mov_b32 s8, s10
541; GFX10NSA-NEXT:    s_mov_b32 s9, s11
542; GFX10NSA-NEXT:    s_mov_b32 s10, s12
543; GFX10NSA-NEXT:    s_mov_b32 s11, s13
544; GFX10NSA-NEXT:    image_gather4_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
545; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
546; GFX10NSA-NEXT:    ; return to shader part epilog
547;
548; GFX12-LABEL: gather4_b_2d:
549; GFX12:       ; %bb.0: ; %main_body
550; GFX12-NEXT:    s_mov_b32 s1, exec_lo
551; GFX12-NEXT:    s_mov_b32 s0, s2
552; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
553; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s1
554; GFX12-NEXT:    s_mov_b32 s1, s3
555; GFX12-NEXT:    s_mov_b32 s2, s4
556; GFX12-NEXT:    s_mov_b32 s3, s5
557; GFX12-NEXT:    s_mov_b32 s4, s6
558; GFX12-NEXT:    s_mov_b32 s5, s7
559; GFX12-NEXT:    s_mov_b32 s6, s8
560; GFX12-NEXT:    s_mov_b32 s7, s9
561; GFX12-NEXT:    s_mov_b32 s8, s10
562; GFX12-NEXT:    s_mov_b32 s9, s11
563; GFX12-NEXT:    s_mov_b32 s10, s12
564; GFX12-NEXT:    s_mov_b32 s11, s13
565; GFX12-NEXT:    image_gather4_b v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
566; GFX12-NEXT:    s_wait_samplecnt 0x0
567; GFX12-NEXT:    ; return to shader part epilog
568main_body:
569  %v = call <4 x float> @llvm.amdgcn.image.gather4.b.2d.v4f32.f32.f32(i32 1, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
570  ret <4 x float> %v
571}
572
; `.c.b` variant on a 2D image: %bias then %zcompare are passed ahead of the
; coordinates (%s, %t), giving four float address operands.
; The expected mnemonic is image_gather4_c_b with a four-register address
; (v[0:3], written as [v0, v1, v2, v3] for the gfx1200 run). The gfx1010,
; gfx1100 and gfx1200 runs additionally expect dim:SQ_RSRC_IMG_2D.
; The assertion lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing by hand.
573define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, float %s, float %t) {
574; GFX6-LABEL: gather4_c_b_2d:
575; GFX6:       ; %bb.0: ; %main_body
576; GFX6-NEXT:    s_mov_b64 s[14:15], exec
577; GFX6-NEXT:    s_mov_b32 s0, s2
578; GFX6-NEXT:    s_mov_b32 s1, s3
579; GFX6-NEXT:    s_mov_b32 s2, s4
580; GFX6-NEXT:    s_mov_b32 s3, s5
581; GFX6-NEXT:    s_mov_b32 s4, s6
582; GFX6-NEXT:    s_mov_b32 s5, s7
583; GFX6-NEXT:    s_mov_b32 s6, s8
584; GFX6-NEXT:    s_mov_b32 s7, s9
585; GFX6-NEXT:    s_mov_b32 s8, s10
586; GFX6-NEXT:    s_mov_b32 s9, s11
587; GFX6-NEXT:    s_mov_b32 s10, s12
588; GFX6-NEXT:    s_mov_b32 s11, s13
589; GFX6-NEXT:    s_wqm_b64 exec, exec
590; GFX6-NEXT:    s_and_b64 exec, exec, s[14:15]
591; GFX6-NEXT:    image_gather4_c_b v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1
592; GFX6-NEXT:    s_waitcnt vmcnt(0)
593; GFX6-NEXT:    ; return to shader part epilog
594;
595; GFX10NSA-LABEL: gather4_c_b_2d:
596; GFX10NSA:       ; %bb.0: ; %main_body
597; GFX10NSA-NEXT:    s_mov_b32 s1, exec_lo
598; GFX10NSA-NEXT:    s_mov_b32 s0, s2
599; GFX10NSA-NEXT:    s_wqm_b32 exec_lo, exec_lo
600; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s1
601; GFX10NSA-NEXT:    s_mov_b32 s1, s3
602; GFX10NSA-NEXT:    s_mov_b32 s2, s4
603; GFX10NSA-NEXT:    s_mov_b32 s3, s5
604; GFX10NSA-NEXT:    s_mov_b32 s4, s6
605; GFX10NSA-NEXT:    s_mov_b32 s5, s7
606; GFX10NSA-NEXT:    s_mov_b32 s6, s8
607; GFX10NSA-NEXT:    s_mov_b32 s7, s9
608; GFX10NSA-NEXT:    s_mov_b32 s8, s10
609; GFX10NSA-NEXT:    s_mov_b32 s9, s11
610; GFX10NSA-NEXT:    s_mov_b32 s10, s12
611; GFX10NSA-NEXT:    s_mov_b32 s11, s13
612; GFX10NSA-NEXT:    image_gather4_c_b v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
613; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
614; GFX10NSA-NEXT:    ; return to shader part epilog
615;
616; GFX12-LABEL: gather4_c_b_2d:
617; GFX12:       ; %bb.0: ; %main_body
618; GFX12-NEXT:    s_mov_b32 s1, exec_lo
619; GFX12-NEXT:    s_mov_b32 s0, s2
620; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
621; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s1
622; GFX12-NEXT:    s_mov_b32 s1, s3
623; GFX12-NEXT:    s_mov_b32 s2, s4
624; GFX12-NEXT:    s_mov_b32 s3, s5
625; GFX12-NEXT:    s_mov_b32 s4, s6
626; GFX12-NEXT:    s_mov_b32 s5, s7
627; GFX12-NEXT:    s_mov_b32 s6, s8
628; GFX12-NEXT:    s_mov_b32 s7, s9
629; GFX12-NEXT:    s_mov_b32 s8, s10
630; GFX12-NEXT:    s_mov_b32 s9, s11
631; GFX12-NEXT:    s_mov_b32 s10, s12
632; GFX12-NEXT:    s_mov_b32 s11, s13
633; GFX12-NEXT:    image_gather4_c_b v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
634; GFX12-NEXT:    s_wait_samplecnt 0x0
635; GFX12-NEXT:    ; return to shader part epilog
636main_body:
637  %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
638  ret <4 x float> %v
639}
640
; `.b.cl` variant on a 2D image: %bias precedes the coordinates (%s, %t) and
; %clamp follows them, giving four float address operands.
; The expected mnemonic is image_gather4_b_cl with a four-register address
; (v[0:3], written as [v0, v1, v2, v3] for the gfx1200 run). The gfx1010,
; gfx1100 and gfx1200 runs additionally expect dim:SQ_RSRC_IMG_2D.
; The assertion lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing by hand.
641define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t, float %clamp) {
642; GFX6-LABEL: gather4_b_cl_2d:
643; GFX6:       ; %bb.0: ; %main_body
644; GFX6-NEXT:    s_mov_b64 s[14:15], exec
645; GFX6-NEXT:    s_mov_b32 s0, s2
646; GFX6-NEXT:    s_mov_b32 s1, s3
647; GFX6-NEXT:    s_mov_b32 s2, s4
648; GFX6-NEXT:    s_mov_b32 s3, s5
649; GFX6-NEXT:    s_mov_b32 s4, s6
650; GFX6-NEXT:    s_mov_b32 s5, s7
651; GFX6-NEXT:    s_mov_b32 s6, s8
652; GFX6-NEXT:    s_mov_b32 s7, s9
653; GFX6-NEXT:    s_mov_b32 s8, s10
654; GFX6-NEXT:    s_mov_b32 s9, s11
655; GFX6-NEXT:    s_mov_b32 s10, s12
656; GFX6-NEXT:    s_mov_b32 s11, s13
657; GFX6-NEXT:    s_wqm_b64 exec, exec
658; GFX6-NEXT:    s_and_b64 exec, exec, s[14:15]
659; GFX6-NEXT:    image_gather4_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1
660; GFX6-NEXT:    s_waitcnt vmcnt(0)
661; GFX6-NEXT:    ; return to shader part epilog
662;
663; GFX10NSA-LABEL: gather4_b_cl_2d:
664; GFX10NSA:       ; %bb.0: ; %main_body
665; GFX10NSA-NEXT:    s_mov_b32 s1, exec_lo
666; GFX10NSA-NEXT:    s_mov_b32 s0, s2
667; GFX10NSA-NEXT:    s_wqm_b32 exec_lo, exec_lo
668; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s1
669; GFX10NSA-NEXT:    s_mov_b32 s1, s3
670; GFX10NSA-NEXT:    s_mov_b32 s2, s4
671; GFX10NSA-NEXT:    s_mov_b32 s3, s5
672; GFX10NSA-NEXT:    s_mov_b32 s4, s6
673; GFX10NSA-NEXT:    s_mov_b32 s5, s7
674; GFX10NSA-NEXT:    s_mov_b32 s6, s8
675; GFX10NSA-NEXT:    s_mov_b32 s7, s9
676; GFX10NSA-NEXT:    s_mov_b32 s8, s10
677; GFX10NSA-NEXT:    s_mov_b32 s9, s11
678; GFX10NSA-NEXT:    s_mov_b32 s10, s12
679; GFX10NSA-NEXT:    s_mov_b32 s11, s13
680; GFX10NSA-NEXT:    image_gather4_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
681; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
682; GFX10NSA-NEXT:    ; return to shader part epilog
683;
684; GFX12-LABEL: gather4_b_cl_2d:
685; GFX12:       ; %bb.0: ; %main_body
686; GFX12-NEXT:    s_mov_b32 s1, exec_lo
687; GFX12-NEXT:    s_mov_b32 s0, s2
688; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
689; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s1
690; GFX12-NEXT:    s_mov_b32 s1, s3
691; GFX12-NEXT:    s_mov_b32 s2, s4
692; GFX12-NEXT:    s_mov_b32 s3, s5
693; GFX12-NEXT:    s_mov_b32 s4, s6
694; GFX12-NEXT:    s_mov_b32 s5, s7
695; GFX12-NEXT:    s_mov_b32 s6, s8
696; GFX12-NEXT:    s_mov_b32 s7, s9
697; GFX12-NEXT:    s_mov_b32 s8, s10
698; GFX12-NEXT:    s_mov_b32 s9, s11
699; GFX12-NEXT:    s_mov_b32 s10, s12
700; GFX12-NEXT:    s_mov_b32 s11, s13
701; GFX12-NEXT:    image_gather4_b_cl v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
702; GFX12-NEXT:    s_wait_samplecnt 0x0
703; GFX12-NEXT:    ; return to shader part epilog
704main_body:
705  %v = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
706  ret <4 x float> %v
707}
708
; Test gather4_c_b_cl_2d -- 2D gather4 whose bias, zcompare, s, t and clamp
; operands are five separate float arguments, called with a dmask of 1.
; For every target the expected code saves exec, applies s_wqm to it, ANDs it
; with the saved copy, and copies s2-s13 down to s0-s11 ahead of the single
; image_gather4_c_b_cl. The tahiti and gfx1010/gfx1100 expectations take the
; address from the contiguous range v[0:4]; the gfx1200 expectations use a
; bracketed register list and wait with s_wait_samplecnt rather than
; s_waitcnt vmcnt(0).
709define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, float %s, float %t, float %clamp) {
710; GFX6-LABEL: gather4_c_b_cl_2d:
711; GFX6:       ; %bb.0: ; %main_body
712; GFX6-NEXT:    s_mov_b64 s[14:15], exec
713; GFX6-NEXT:    s_mov_b32 s0, s2
714; GFX6-NEXT:    s_mov_b32 s1, s3
715; GFX6-NEXT:    s_mov_b32 s2, s4
716; GFX6-NEXT:    s_mov_b32 s3, s5
717; GFX6-NEXT:    s_mov_b32 s4, s6
718; GFX6-NEXT:    s_mov_b32 s5, s7
719; GFX6-NEXT:    s_mov_b32 s6, s8
720; GFX6-NEXT:    s_mov_b32 s7, s9
721; GFX6-NEXT:    s_mov_b32 s8, s10
722; GFX6-NEXT:    s_mov_b32 s9, s11
723; GFX6-NEXT:    s_mov_b32 s10, s12
724; GFX6-NEXT:    s_mov_b32 s11, s13
725; GFX6-NEXT:    s_wqm_b64 exec, exec
726; GFX6-NEXT:    s_and_b64 exec, exec, s[14:15]
727; GFX6-NEXT:    image_gather4_c_b_cl v[0:3], v[0:4], s[0:7], s[8:11] dmask:0x1
728; GFX6-NEXT:    s_waitcnt vmcnt(0)
729; GFX6-NEXT:    ; return to shader part epilog
730;
731; GFX10NSA-LABEL: gather4_c_b_cl_2d:
732; GFX10NSA:       ; %bb.0: ; %main_body
733; GFX10NSA-NEXT:    s_mov_b32 s1, exec_lo
734; GFX10NSA-NEXT:    s_mov_b32 s0, s2
735; GFX10NSA-NEXT:    s_wqm_b32 exec_lo, exec_lo
736; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s1
737; GFX10NSA-NEXT:    s_mov_b32 s1, s3
738; GFX10NSA-NEXT:    s_mov_b32 s2, s4
739; GFX10NSA-NEXT:    s_mov_b32 s3, s5
740; GFX10NSA-NEXT:    s_mov_b32 s4, s6
741; GFX10NSA-NEXT:    s_mov_b32 s5, s7
742; GFX10NSA-NEXT:    s_mov_b32 s6, s8
743; GFX10NSA-NEXT:    s_mov_b32 s7, s9
744; GFX10NSA-NEXT:    s_mov_b32 s8, s10
745; GFX10NSA-NEXT:    s_mov_b32 s9, s11
746; GFX10NSA-NEXT:    s_mov_b32 s10, s12
747; GFX10NSA-NEXT:    s_mov_b32 s11, s13
748; GFX10NSA-NEXT:    image_gather4_c_b_cl v[0:3], v[0:4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
749; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
750; GFX10NSA-NEXT:    ; return to shader part epilog
751;
752; GFX12-LABEL: gather4_c_b_cl_2d:
753; GFX12:       ; %bb.0: ; %main_body
754; GFX12-NEXT:    s_mov_b32 s1, exec_lo
755; GFX12-NEXT:    s_mov_b32 s0, s2
756; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
757; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s1
758; GFX12-NEXT:    s_mov_b32 s1, s3
759; GFX12-NEXT:    s_mov_b32 s2, s4
760; GFX12-NEXT:    s_mov_b32 s3, s5
761; GFX12-NEXT:    s_mov_b32 s4, s6
762; GFX12-NEXT:    s_mov_b32 s5, s7
763; GFX12-NEXT:    s_mov_b32 s6, s8
764; GFX12-NEXT:    s_mov_b32 s7, s9
765; GFX12-NEXT:    s_mov_b32 s8, s10
766; GFX12-NEXT:    s_mov_b32 s9, s11
767; GFX12-NEXT:    s_mov_b32 s10, s12
768; GFX12-NEXT:    s_mov_b32 s11, s13
769; GFX12-NEXT:    image_gather4_c_b_cl v[0:3], [v0, v1, v2, v[3:4]], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
770; GFX12-NEXT:    s_wait_samplecnt 0x0
771; GFX12-NEXT:    ; return to shader part epilog
772main_body:
  ; The leading i32 1 is the dmask immediate (it shows up as dmask:0x1 in the
  ; expected output); the three trailing immediates are false/0/0.
773  %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
774  ret <4 x float> %v
775}
776
; Test gather4_l_2d -- 2D gather4 with an explicit lod operand (s, t, lod),
; called with a dmask of 1.
; Unlike the bias variants in this file, none of the expected sequences touch
; exec (no s_wqm / s_and); they only copy s2-s13 down to s0-s11 and then emit
; one image_gather4_l. The gfx1200 expectations use the bracketed address list
; [v0, v1, v2] and s_wait_samplecnt; the other targets use v[0:2] and
; s_waitcnt vmcnt(0).
777define amdgpu_ps <4 x float> @gather4_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %lod) {
778; GFX6-LABEL: gather4_l_2d:
779; GFX6:       ; %bb.0: ; %main_body
780; GFX6-NEXT:    s_mov_b32 s0, s2
781; GFX6-NEXT:    s_mov_b32 s1, s3
782; GFX6-NEXT:    s_mov_b32 s2, s4
783; GFX6-NEXT:    s_mov_b32 s3, s5
784; GFX6-NEXT:    s_mov_b32 s4, s6
785; GFX6-NEXT:    s_mov_b32 s5, s7
786; GFX6-NEXT:    s_mov_b32 s6, s8
787; GFX6-NEXT:    s_mov_b32 s7, s9
788; GFX6-NEXT:    s_mov_b32 s8, s10
789; GFX6-NEXT:    s_mov_b32 s9, s11
790; GFX6-NEXT:    s_mov_b32 s10, s12
791; GFX6-NEXT:    s_mov_b32 s11, s13
792; GFX6-NEXT:    image_gather4_l v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1
793; GFX6-NEXT:    s_waitcnt vmcnt(0)
794; GFX6-NEXT:    ; return to shader part epilog
795;
796; GFX10NSA-LABEL: gather4_l_2d:
797; GFX10NSA:       ; %bb.0: ; %main_body
798; GFX10NSA-NEXT:    s_mov_b32 s0, s2
799; GFX10NSA-NEXT:    s_mov_b32 s1, s3
800; GFX10NSA-NEXT:    s_mov_b32 s2, s4
801; GFX10NSA-NEXT:    s_mov_b32 s3, s5
802; GFX10NSA-NEXT:    s_mov_b32 s4, s6
803; GFX10NSA-NEXT:    s_mov_b32 s5, s7
804; GFX10NSA-NEXT:    s_mov_b32 s6, s8
805; GFX10NSA-NEXT:    s_mov_b32 s7, s9
806; GFX10NSA-NEXT:    s_mov_b32 s8, s10
807; GFX10NSA-NEXT:    s_mov_b32 s9, s11
808; GFX10NSA-NEXT:    s_mov_b32 s10, s12
809; GFX10NSA-NEXT:    s_mov_b32 s11, s13
810; GFX10NSA-NEXT:    image_gather4_l v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
811; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
812; GFX10NSA-NEXT:    ; return to shader part epilog
813;
814; GFX12-LABEL: gather4_l_2d:
815; GFX12:       ; %bb.0: ; %main_body
816; GFX12-NEXT:    s_mov_b32 s0, s2
817; GFX12-NEXT:    s_mov_b32 s1, s3
818; GFX12-NEXT:    s_mov_b32 s2, s4
819; GFX12-NEXT:    s_mov_b32 s3, s5
820; GFX12-NEXT:    s_mov_b32 s4, s6
821; GFX12-NEXT:    s_mov_b32 s5, s7
822; GFX12-NEXT:    s_mov_b32 s6, s8
823; GFX12-NEXT:    s_mov_b32 s7, s9
824; GFX12-NEXT:    s_mov_b32 s8, s10
825; GFX12-NEXT:    s_mov_b32 s9, s11
826; GFX12-NEXT:    s_mov_b32 s10, s12
827; GFX12-NEXT:    s_mov_b32 s11, s13
828; GFX12-NEXT:    image_gather4_l v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
829; GFX12-NEXT:    s_wait_samplecnt 0x0
830; GFX12-NEXT:    ; return to shader part epilog
831main_body:
  ; The leading i32 1 is the dmask immediate (dmask:0x1 in the expected
  ; output); the three trailing immediates are false/0/0.
832  %v = call <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f32(i32 1, float %s, float %t, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
833  ret <4 x float> %v
834}
835
; Test gather4_c_l_2d -- 2D gather4 with a zcompare operand and an explicit
; lod (zcompare, s, t, lod), called with a dmask of 1.
; The expected sequences contain no exec manipulation; they copy s2-s13 down
; to s0-s11 and emit one image_gather4_c_l. The four address registers appear
; as v[0:3] on tahiti and gfx1010/gfx1100 and as the bracketed list
; [v0, v1, v2, v3] on gfx1200, which also waits with s_wait_samplecnt.
836define amdgpu_ps <4 x float> @gather4_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t, float %lod) {
837; GFX6-LABEL: gather4_c_l_2d:
838; GFX6:       ; %bb.0: ; %main_body
839; GFX6-NEXT:    s_mov_b32 s0, s2
840; GFX6-NEXT:    s_mov_b32 s1, s3
841; GFX6-NEXT:    s_mov_b32 s2, s4
842; GFX6-NEXT:    s_mov_b32 s3, s5
843; GFX6-NEXT:    s_mov_b32 s4, s6
844; GFX6-NEXT:    s_mov_b32 s5, s7
845; GFX6-NEXT:    s_mov_b32 s6, s8
846; GFX6-NEXT:    s_mov_b32 s7, s9
847; GFX6-NEXT:    s_mov_b32 s8, s10
848; GFX6-NEXT:    s_mov_b32 s9, s11
849; GFX6-NEXT:    s_mov_b32 s10, s12
850; GFX6-NEXT:    s_mov_b32 s11, s13
851; GFX6-NEXT:    image_gather4_c_l v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1
852; GFX6-NEXT:    s_waitcnt vmcnt(0)
853; GFX6-NEXT:    ; return to shader part epilog
854;
855; GFX10NSA-LABEL: gather4_c_l_2d:
856; GFX10NSA:       ; %bb.0: ; %main_body
857; GFX10NSA-NEXT:    s_mov_b32 s0, s2
858; GFX10NSA-NEXT:    s_mov_b32 s1, s3
859; GFX10NSA-NEXT:    s_mov_b32 s2, s4
860; GFX10NSA-NEXT:    s_mov_b32 s3, s5
861; GFX10NSA-NEXT:    s_mov_b32 s4, s6
862; GFX10NSA-NEXT:    s_mov_b32 s5, s7
863; GFX10NSA-NEXT:    s_mov_b32 s6, s8
864; GFX10NSA-NEXT:    s_mov_b32 s7, s9
865; GFX10NSA-NEXT:    s_mov_b32 s8, s10
866; GFX10NSA-NEXT:    s_mov_b32 s9, s11
867; GFX10NSA-NEXT:    s_mov_b32 s10, s12
868; GFX10NSA-NEXT:    s_mov_b32 s11, s13
869; GFX10NSA-NEXT:    image_gather4_c_l v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
870; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
871; GFX10NSA-NEXT:    ; return to shader part epilog
872;
873; GFX12-LABEL: gather4_c_l_2d:
874; GFX12:       ; %bb.0: ; %main_body
875; GFX12-NEXT:    s_mov_b32 s0, s2
876; GFX12-NEXT:    s_mov_b32 s1, s3
877; GFX12-NEXT:    s_mov_b32 s2, s4
878; GFX12-NEXT:    s_mov_b32 s3, s5
879; GFX12-NEXT:    s_mov_b32 s4, s6
880; GFX12-NEXT:    s_mov_b32 s5, s7
881; GFX12-NEXT:    s_mov_b32 s6, s8
882; GFX12-NEXT:    s_mov_b32 s7, s9
883; GFX12-NEXT:    s_mov_b32 s8, s10
884; GFX12-NEXT:    s_mov_b32 s9, s11
885; GFX12-NEXT:    s_mov_b32 s10, s12
886; GFX12-NEXT:    s_mov_b32 s11, s13
887; GFX12-NEXT:    image_gather4_c_l v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
888; GFX12-NEXT:    s_wait_samplecnt 0x0
889; GFX12-NEXT:    ; return to shader part epilog
890main_body:
  ; The leading i32 1 is the dmask immediate (dmask:0x1 in the expected
  ; output); the three trailing immediates are false/0/0.
891  %v = call <4 x float> @llvm.amdgcn.image.gather4.c.l.2d.v4f32.f32(i32 1, float %zcompare, float %s, float %t, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
892  ret <4 x float> %v
893}
894
; Test gather4_lz_2d -- 2D gather4 "lz" variant taking only the s and t
; coordinates, called with a dmask of 1.
; The expected sequences contain no exec manipulation; they copy s2-s13 down
; to s0-s11 and emit one image_gather4_lz. The two coordinates appear as
; v[0:1] on tahiti and gfx1010/gfx1100 and as [v0, v1] on gfx1200, which also
; waits with s_wait_samplecnt instead of s_waitcnt vmcnt(0).
895define amdgpu_ps <4 x float> @gather4_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
896; GFX6-LABEL: gather4_lz_2d:
897; GFX6:       ; %bb.0: ; %main_body
898; GFX6-NEXT:    s_mov_b32 s0, s2
899; GFX6-NEXT:    s_mov_b32 s1, s3
900; GFX6-NEXT:    s_mov_b32 s2, s4
901; GFX6-NEXT:    s_mov_b32 s3, s5
902; GFX6-NEXT:    s_mov_b32 s4, s6
903; GFX6-NEXT:    s_mov_b32 s5, s7
904; GFX6-NEXT:    s_mov_b32 s6, s8
905; GFX6-NEXT:    s_mov_b32 s7, s9
906; GFX6-NEXT:    s_mov_b32 s8, s10
907; GFX6-NEXT:    s_mov_b32 s9, s11
908; GFX6-NEXT:    s_mov_b32 s10, s12
909; GFX6-NEXT:    s_mov_b32 s11, s13
910; GFX6-NEXT:    image_gather4_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1
911; GFX6-NEXT:    s_waitcnt vmcnt(0)
912; GFX6-NEXT:    ; return to shader part epilog
913;
914; GFX10NSA-LABEL: gather4_lz_2d:
915; GFX10NSA:       ; %bb.0: ; %main_body
916; GFX10NSA-NEXT:    s_mov_b32 s0, s2
917; GFX10NSA-NEXT:    s_mov_b32 s1, s3
918; GFX10NSA-NEXT:    s_mov_b32 s2, s4
919; GFX10NSA-NEXT:    s_mov_b32 s3, s5
920; GFX10NSA-NEXT:    s_mov_b32 s4, s6
921; GFX10NSA-NEXT:    s_mov_b32 s5, s7
922; GFX10NSA-NEXT:    s_mov_b32 s6, s8
923; GFX10NSA-NEXT:    s_mov_b32 s7, s9
924; GFX10NSA-NEXT:    s_mov_b32 s8, s10
925; GFX10NSA-NEXT:    s_mov_b32 s9, s11
926; GFX10NSA-NEXT:    s_mov_b32 s10, s12
927; GFX10NSA-NEXT:    s_mov_b32 s11, s13
928; GFX10NSA-NEXT:    image_gather4_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
929; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
930; GFX10NSA-NEXT:    ; return to shader part epilog
931;
932; GFX12-LABEL: gather4_lz_2d:
933; GFX12:       ; %bb.0: ; %main_body
934; GFX12-NEXT:    s_mov_b32 s0, s2
935; GFX12-NEXT:    s_mov_b32 s1, s3
936; GFX12-NEXT:    s_mov_b32 s2, s4
937; GFX12-NEXT:    s_mov_b32 s3, s5
938; GFX12-NEXT:    s_mov_b32 s4, s6
939; GFX12-NEXT:    s_mov_b32 s5, s7
940; GFX12-NEXT:    s_mov_b32 s6, s8
941; GFX12-NEXT:    s_mov_b32 s7, s9
942; GFX12-NEXT:    s_mov_b32 s8, s10
943; GFX12-NEXT:    s_mov_b32 s9, s11
944; GFX12-NEXT:    s_mov_b32 s10, s12
945; GFX12-NEXT:    s_mov_b32 s11, s13
946; GFX12-NEXT:    image_gather4_lz v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
947; GFX12-NEXT:    s_wait_samplecnt 0x0
948; GFX12-NEXT:    ; return to shader part epilog
949main_body:
  ; The leading i32 1 is the dmask immediate (dmask:0x1 in the expected
  ; output); the three trailing immediates are false/0/0.
950  %v = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
951  ret <4 x float> %v
952}
953
; Test gather4_c_lz_2d -- 2D gather4 "lz" variant with a zcompare operand
; (zcompare, s, t), called with a dmask of 1.
; The expected sequences contain no exec manipulation; they copy s2-s13 down
; to s0-s11 and emit one image_gather4_c_lz. The three address registers
; appear as v[0:2] on tahiti and gfx1010/gfx1100 and as [v0, v1, v2] on
; gfx1200, which also waits with s_wait_samplecnt instead of
; s_waitcnt vmcnt(0).
954define amdgpu_ps <4 x float> @gather4_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t) {
955; GFX6-LABEL: gather4_c_lz_2d:
956; GFX6:       ; %bb.0: ; %main_body
957; GFX6-NEXT:    s_mov_b32 s0, s2
958; GFX6-NEXT:    s_mov_b32 s1, s3
959; GFX6-NEXT:    s_mov_b32 s2, s4
960; GFX6-NEXT:    s_mov_b32 s3, s5
961; GFX6-NEXT:    s_mov_b32 s4, s6
962; GFX6-NEXT:    s_mov_b32 s5, s7
963; GFX6-NEXT:    s_mov_b32 s6, s8
964; GFX6-NEXT:    s_mov_b32 s7, s9
965; GFX6-NEXT:    s_mov_b32 s8, s10
966; GFX6-NEXT:    s_mov_b32 s9, s11
967; GFX6-NEXT:    s_mov_b32 s10, s12
968; GFX6-NEXT:    s_mov_b32 s11, s13
969; GFX6-NEXT:    image_gather4_c_lz v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1
970; GFX6-NEXT:    s_waitcnt vmcnt(0)
971; GFX6-NEXT:    ; return to shader part epilog
972;
973; GFX10NSA-LABEL: gather4_c_lz_2d:
974; GFX10NSA:       ; %bb.0: ; %main_body
975; GFX10NSA-NEXT:    s_mov_b32 s0, s2
976; GFX10NSA-NEXT:    s_mov_b32 s1, s3
977; GFX10NSA-NEXT:    s_mov_b32 s2, s4
978; GFX10NSA-NEXT:    s_mov_b32 s3, s5
979; GFX10NSA-NEXT:    s_mov_b32 s4, s6
980; GFX10NSA-NEXT:    s_mov_b32 s5, s7
981; GFX10NSA-NEXT:    s_mov_b32 s6, s8
982; GFX10NSA-NEXT:    s_mov_b32 s7, s9
983; GFX10NSA-NEXT:    s_mov_b32 s8, s10
984; GFX10NSA-NEXT:    s_mov_b32 s9, s11
985; GFX10NSA-NEXT:    s_mov_b32 s10, s12
986; GFX10NSA-NEXT:    s_mov_b32 s11, s13
987; GFX10NSA-NEXT:    image_gather4_c_lz v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
988; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
989; GFX10NSA-NEXT:    ; return to shader part epilog
990;
991; GFX12-LABEL: gather4_c_lz_2d:
992; GFX12:       ; %bb.0: ; %main_body
993; GFX12-NEXT:    s_mov_b32 s0, s2
994; GFX12-NEXT:    s_mov_b32 s1, s3
995; GFX12-NEXT:    s_mov_b32 s2, s4
996; GFX12-NEXT:    s_mov_b32 s3, s5
997; GFX12-NEXT:    s_mov_b32 s4, s6
998; GFX12-NEXT:    s_mov_b32 s5, s7
999; GFX12-NEXT:    s_mov_b32 s6, s8
1000; GFX12-NEXT:    s_mov_b32 s7, s9
1001; GFX12-NEXT:    s_mov_b32 s8, s10
1002; GFX12-NEXT:    s_mov_b32 s9, s11
1003; GFX12-NEXT:    s_mov_b32 s10, s12
1004; GFX12-NEXT:    s_mov_b32 s11, s13
1005; GFX12-NEXT:    image_gather4_c_lz v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
1006; GFX12-NEXT:    s_wait_samplecnt 0x0
1007; GFX12-NEXT:    ; return to shader part epilog
1008main_body:
  ; The leading i32 1 is the dmask immediate (dmask:0x1 in the expected
  ; output); the three trailing immediates are false/0/0.
1009  %v = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.2d.v4f32.f32(i32 1, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
1010  ret <4 x float> %v
1011}
1012
; Test gather4_2d_dmask_2 -- plain 2D gather4 (s, t) called with a dmask
; immediate of 2; the expectations require dmask:0x2 on the emitted
; image_gather4 for every target.
; The expected code on every target saves exec, applies s_wqm to it and ANDs
; it with the saved copy, and copies s2-s13 down to s0-s11. The gfx1200
; expectations use the bracketed address list [v0, v1] and s_wait_samplecnt;
; the other targets use v[0:1] and s_waitcnt vmcnt(0).
1013define amdgpu_ps <4 x float> @gather4_2d_dmask_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
1014; GFX6-LABEL: gather4_2d_dmask_2:
1015; GFX6:       ; %bb.0: ; %main_body
1016; GFX6-NEXT:    s_mov_b64 s[14:15], exec
1017; GFX6-NEXT:    s_mov_b32 s0, s2
1018; GFX6-NEXT:    s_mov_b32 s1, s3
1019; GFX6-NEXT:    s_mov_b32 s2, s4
1020; GFX6-NEXT:    s_mov_b32 s3, s5
1021; GFX6-NEXT:    s_mov_b32 s4, s6
1022; GFX6-NEXT:    s_mov_b32 s5, s7
1023; GFX6-NEXT:    s_mov_b32 s6, s8
1024; GFX6-NEXT:    s_mov_b32 s7, s9
1025; GFX6-NEXT:    s_mov_b32 s8, s10
1026; GFX6-NEXT:    s_mov_b32 s9, s11
1027; GFX6-NEXT:    s_mov_b32 s10, s12
1028; GFX6-NEXT:    s_mov_b32 s11, s13
1029; GFX6-NEXT:    s_wqm_b64 exec, exec
1030; GFX6-NEXT:    s_and_b64 exec, exec, s[14:15]
1031; GFX6-NEXT:    image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x2
1032; GFX6-NEXT:    s_waitcnt vmcnt(0)
1033; GFX6-NEXT:    ; return to shader part epilog
1034;
1035; GFX10NSA-LABEL: gather4_2d_dmask_2:
1036; GFX10NSA:       ; %bb.0: ; %main_body
1037; GFX10NSA-NEXT:    s_mov_b32 s1, exec_lo
1038; GFX10NSA-NEXT:    s_mov_b32 s0, s2
1039; GFX10NSA-NEXT:    s_wqm_b32 exec_lo, exec_lo
1040; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s1
1041; GFX10NSA-NEXT:    s_mov_b32 s1, s3
1042; GFX10NSA-NEXT:    s_mov_b32 s2, s4
1043; GFX10NSA-NEXT:    s_mov_b32 s3, s5
1044; GFX10NSA-NEXT:    s_mov_b32 s4, s6
1045; GFX10NSA-NEXT:    s_mov_b32 s5, s7
1046; GFX10NSA-NEXT:    s_mov_b32 s6, s8
1047; GFX10NSA-NEXT:    s_mov_b32 s7, s9
1048; GFX10NSA-NEXT:    s_mov_b32 s8, s10
1049; GFX10NSA-NEXT:    s_mov_b32 s9, s11
1050; GFX10NSA-NEXT:    s_mov_b32 s10, s12
1051; GFX10NSA-NEXT:    s_mov_b32 s11, s13
1052; GFX10NSA-NEXT:    image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x2 dim:SQ_RSRC_IMG_2D
1053; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
1054; GFX10NSA-NEXT:    ; return to shader part epilog
1055;
1056; GFX12-LABEL: gather4_2d_dmask_2:
1057; GFX12:       ; %bb.0: ; %main_body
1058; GFX12-NEXT:    s_mov_b32 s1, exec_lo
1059; GFX12-NEXT:    s_mov_b32 s0, s2
1060; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
1061; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s1
1062; GFX12-NEXT:    s_mov_b32 s1, s3
1063; GFX12-NEXT:    s_mov_b32 s2, s4
1064; GFX12-NEXT:    s_mov_b32 s3, s5
1065; GFX12-NEXT:    s_mov_b32 s4, s6
1066; GFX12-NEXT:    s_mov_b32 s5, s7
1067; GFX12-NEXT:    s_mov_b32 s6, s8
1068; GFX12-NEXT:    s_mov_b32 s7, s9
1069; GFX12-NEXT:    s_mov_b32 s8, s10
1070; GFX12-NEXT:    s_mov_b32 s9, s11
1071; GFX12-NEXT:    s_mov_b32 s10, s12
1072; GFX12-NEXT:    s_mov_b32 s11, s13
1073; GFX12-NEXT:    image_gather4 v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0x2 dim:SQ_RSRC_IMG_2D
1074; GFX12-NEXT:    s_wait_samplecnt 0x0
1075; GFX12-NEXT:    ; return to shader part epilog
1076main_body:
  ; The leading i32 2 is the dmask immediate under test; the three trailing
  ; immediates are false/0/0.
1077  %v = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 2, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
1078  ret <4 x float> %v
1079}
1080
; Test gather4_2d_dmask_4 -- plain 2D gather4 (s, t) called with a dmask
; immediate of 4; the expectations require dmask:0x4 on the emitted
; image_gather4 for every target.
; The expected code on every target saves exec, applies s_wqm to it and ANDs
; it with the saved copy, and copies s2-s13 down to s0-s11. The gfx1200
; expectations use the bracketed address list [v0, v1] and s_wait_samplecnt;
; the other targets use v[0:1] and s_waitcnt vmcnt(0).
1081define amdgpu_ps <4 x float> @gather4_2d_dmask_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
1082; GFX6-LABEL: gather4_2d_dmask_4:
1083; GFX6:       ; %bb.0: ; %main_body
1084; GFX6-NEXT:    s_mov_b64 s[14:15], exec
1085; GFX6-NEXT:    s_mov_b32 s0, s2
1086; GFX6-NEXT:    s_mov_b32 s1, s3
1087; GFX6-NEXT:    s_mov_b32 s2, s4
1088; GFX6-NEXT:    s_mov_b32 s3, s5
1089; GFX6-NEXT:    s_mov_b32 s4, s6
1090; GFX6-NEXT:    s_mov_b32 s5, s7
1091; GFX6-NEXT:    s_mov_b32 s6, s8
1092; GFX6-NEXT:    s_mov_b32 s7, s9
1093; GFX6-NEXT:    s_mov_b32 s8, s10
1094; GFX6-NEXT:    s_mov_b32 s9, s11
1095; GFX6-NEXT:    s_mov_b32 s10, s12
1096; GFX6-NEXT:    s_mov_b32 s11, s13
1097; GFX6-NEXT:    s_wqm_b64 exec, exec
1098; GFX6-NEXT:    s_and_b64 exec, exec, s[14:15]
1099; GFX6-NEXT:    image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x4
1100; GFX6-NEXT:    s_waitcnt vmcnt(0)
1101; GFX6-NEXT:    ; return to shader part epilog
1102;
1103; GFX10NSA-LABEL: gather4_2d_dmask_4:
1104; GFX10NSA:       ; %bb.0: ; %main_body
1105; GFX10NSA-NEXT:    s_mov_b32 s1, exec_lo
1106; GFX10NSA-NEXT:    s_mov_b32 s0, s2
1107; GFX10NSA-NEXT:    s_wqm_b32 exec_lo, exec_lo
1108; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s1
1109; GFX10NSA-NEXT:    s_mov_b32 s1, s3
1110; GFX10NSA-NEXT:    s_mov_b32 s2, s4
1111; GFX10NSA-NEXT:    s_mov_b32 s3, s5
1112; GFX10NSA-NEXT:    s_mov_b32 s4, s6
1113; GFX10NSA-NEXT:    s_mov_b32 s5, s7
1114; GFX10NSA-NEXT:    s_mov_b32 s6, s8
1115; GFX10NSA-NEXT:    s_mov_b32 s7, s9
1116; GFX10NSA-NEXT:    s_mov_b32 s8, s10
1117; GFX10NSA-NEXT:    s_mov_b32 s9, s11
1118; GFX10NSA-NEXT:    s_mov_b32 s10, s12
1119; GFX10NSA-NEXT:    s_mov_b32 s11, s13
1120; GFX10NSA-NEXT:    image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D
1121; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
1122; GFX10NSA-NEXT:    ; return to shader part epilog
1123;
1124; GFX12-LABEL: gather4_2d_dmask_4:
1125; GFX12:       ; %bb.0: ; %main_body
1126; GFX12-NEXT:    s_mov_b32 s1, exec_lo
1127; GFX12-NEXT:    s_mov_b32 s0, s2
1128; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
1129; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s1
1130; GFX12-NEXT:    s_mov_b32 s1, s3
1131; GFX12-NEXT:    s_mov_b32 s2, s4
1132; GFX12-NEXT:    s_mov_b32 s3, s5
1133; GFX12-NEXT:    s_mov_b32 s4, s6
1134; GFX12-NEXT:    s_mov_b32 s5, s7
1135; GFX12-NEXT:    s_mov_b32 s6, s8
1136; GFX12-NEXT:    s_mov_b32 s7, s9
1137; GFX12-NEXT:    s_mov_b32 s8, s10
1138; GFX12-NEXT:    s_mov_b32 s9, s11
1139; GFX12-NEXT:    s_mov_b32 s10, s12
1140; GFX12-NEXT:    s_mov_b32 s11, s13
1141; GFX12-NEXT:    image_gather4 v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D
1142; GFX12-NEXT:    s_wait_samplecnt 0x0
1143; GFX12-NEXT:    ; return to shader part epilog
1144main_body:
  ; The leading i32 4 is the dmask immediate under test; the three trailing
  ; immediates are false/0/0.
1145  %v = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 4, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
1146  ret <4 x float> %v
1147}
1148
; Test gather4_2d_dmask_8 -- plain 2D gather4 (s, t) called with a dmask
; immediate of 8; the expectations require dmask:0x8 on the emitted
; image_gather4 for every target.
; The expected code on every target saves exec, applies s_wqm to it and ANDs
; it with the saved copy, and copies s2-s13 down to s0-s11. The gfx1200
; expectations use the bracketed address list [v0, v1] and s_wait_samplecnt;
; the other targets use v[0:1] and s_waitcnt vmcnt(0).
1149define amdgpu_ps <4 x float> @gather4_2d_dmask_8(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
1150; GFX6-LABEL: gather4_2d_dmask_8:
1151; GFX6:       ; %bb.0: ; %main_body
1152; GFX6-NEXT:    s_mov_b64 s[14:15], exec
1153; GFX6-NEXT:    s_mov_b32 s0, s2
1154; GFX6-NEXT:    s_mov_b32 s1, s3
1155; GFX6-NEXT:    s_mov_b32 s2, s4
1156; GFX6-NEXT:    s_mov_b32 s3, s5
1157; GFX6-NEXT:    s_mov_b32 s4, s6
1158; GFX6-NEXT:    s_mov_b32 s5, s7
1159; GFX6-NEXT:    s_mov_b32 s6, s8
1160; GFX6-NEXT:    s_mov_b32 s7, s9
1161; GFX6-NEXT:    s_mov_b32 s8, s10
1162; GFX6-NEXT:    s_mov_b32 s9, s11
1163; GFX6-NEXT:    s_mov_b32 s10, s12
1164; GFX6-NEXT:    s_mov_b32 s11, s13
1165; GFX6-NEXT:    s_wqm_b64 exec, exec
1166; GFX6-NEXT:    s_and_b64 exec, exec, s[14:15]
1167; GFX6-NEXT:    image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x8
1168; GFX6-NEXT:    s_waitcnt vmcnt(0)
1169; GFX6-NEXT:    ; return to shader part epilog
1170;
1171; GFX10NSA-LABEL: gather4_2d_dmask_8:
1172; GFX10NSA:       ; %bb.0: ; %main_body
1173; GFX10NSA-NEXT:    s_mov_b32 s1, exec_lo
1174; GFX10NSA-NEXT:    s_mov_b32 s0, s2
1175; GFX10NSA-NEXT:    s_wqm_b32 exec_lo, exec_lo
1176; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s1
1177; GFX10NSA-NEXT:    s_mov_b32 s1, s3
1178; GFX10NSA-NEXT:    s_mov_b32 s2, s4
1179; GFX10NSA-NEXT:    s_mov_b32 s3, s5
1180; GFX10NSA-NEXT:    s_mov_b32 s4, s6
1181; GFX10NSA-NEXT:    s_mov_b32 s5, s7
1182; GFX10NSA-NEXT:    s_mov_b32 s6, s8
1183; GFX10NSA-NEXT:    s_mov_b32 s7, s9
1184; GFX10NSA-NEXT:    s_mov_b32 s8, s10
1185; GFX10NSA-NEXT:    s_mov_b32 s9, s11
1186; GFX10NSA-NEXT:    s_mov_b32 s10, s12
1187; GFX10NSA-NEXT:    s_mov_b32 s11, s13
1188; GFX10NSA-NEXT:    image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x8 dim:SQ_RSRC_IMG_2D
1189; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
1190; GFX10NSA-NEXT:    ; return to shader part epilog
1191;
1192; GFX12-LABEL: gather4_2d_dmask_8:
1193; GFX12:       ; %bb.0: ; %main_body
1194; GFX12-NEXT:    s_mov_b32 s1, exec_lo
1195; GFX12-NEXT:    s_mov_b32 s0, s2
1196; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
1197; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s1
1198; GFX12-NEXT:    s_mov_b32 s1, s3
1199; GFX12-NEXT:    s_mov_b32 s2, s4
1200; GFX12-NEXT:    s_mov_b32 s3, s5
1201; GFX12-NEXT:    s_mov_b32 s4, s6
1202; GFX12-NEXT:    s_mov_b32 s5, s7
1203; GFX12-NEXT:    s_mov_b32 s6, s8
1204; GFX12-NEXT:    s_mov_b32 s7, s9
1205; GFX12-NEXT:    s_mov_b32 s8, s10
1206; GFX12-NEXT:    s_mov_b32 s9, s11
1207; GFX12-NEXT:    s_mov_b32 s10, s12
1208; GFX12-NEXT:    s_mov_b32 s11, s13
1209; GFX12-NEXT:    image_gather4 v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0x8 dim:SQ_RSRC_IMG_2D
1210; GFX12-NEXT:    s_wait_samplecnt 0x0
1211; GFX12-NEXT:    ; return to shader part epilog
1212main_body:
  ; The leading i32 8 is the dmask immediate under test; the three trailing
  ; immediates are false/0/0.
1213  %v = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 8, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
1214  ret <4 x float> %v
1215}
1216
; Declarations of the image gather4 intrinsics referenced by this test.
; Every signature has the same layout -- a leading i32 immarg, the float
; address operands, the <8 x i32> and <4 x i32> descriptor vectors, then three
; trailing immargs (i1, i32, i32) -- and differs only in how many float
; operands it takes. All return <4 x float> except the sl_v4f32i32s form,
; which returns a { <4 x float>, i32 } struct.
; NOTE(review): several of these are not called in the last part of the file;
; presumably they are used by the tests earlier in the file -- confirm before
; removing any.
1217declare <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 immarg, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
1218declare { <4 x float>, i32 } @llvm.amdgcn.image.gather4.2d.sl_v4f32i32s.f32(i32 immarg, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
1219declare <4 x float> @llvm.amdgcn.image.gather4.cube.v4f32.f32(i32 immarg, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
1220declare <4 x float> @llvm.amdgcn.image.gather4.2darray.v4f32.f32(i32 immarg, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
1221declare <4 x float> @llvm.amdgcn.image.gather4.c.2d.v4f32.f32(i32 immarg, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
1222declare <4 x float> @llvm.amdgcn.image.gather4.cl.2d.v4f32.f32(i32 immarg, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
1223declare <4 x float> @llvm.amdgcn.image.gather4.c.cl.2d.v4f32.f32(i32 immarg, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
1224declare <4 x float> @llvm.amdgcn.image.gather4.b.2d.v4f32.f32.f32(i32 immarg, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
1225declare <4 x float> @llvm.amdgcn.image.gather4.c.b.2d.v4f32.f32.f32(i32 immarg, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
1226declare <4 x float> @llvm.amdgcn.image.gather4.b.cl.2d.v4f32.f32.f32(i32 immarg, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
1227declare <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 immarg, float, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
1228declare <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f32(i32 immarg, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
1229declare <4 x float> @llvm.amdgcn.image.gather4.c.l.2d.v4f32.f32(i32 immarg, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
1230declare <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 immarg, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
1231declare <4 x float> @llvm.amdgcn.image.gather4.c.lz.2d.v4f32.f32(i32 immarg, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
1232
; Attribute group #0 is attached to every intrinsic declaration above.
1233attributes #0 = { nounwind readonly }
1234