xref: /llvm-project/llvm/test/CodeGen/AMDGPU/min.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
2; RUN: llc -mtriple=r600-- -mcpu=cypress -verify-machineinstrs < %s | FileCheck --check-prefix=EG %s
3; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck --check-prefix=CI %s
4; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s
5; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
6; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
7; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11 %s
8
; Per-workitem signed minimum of two i32 values loaded from global memory.
; Each workitem uses its workitem.id.x as an element index into %a.ptr,
; %b.ptr and %out, loads a and b, picks a when a <= b (signed) and b
; otherwise, and stores the result. The expected output in every run shows
; the icmp sle + select pair folded into one signed-min instruction
; (MIN_INT on r600, v_min_i32 on the amdgcn targets).
9define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
10; EG-LABEL: v_test_imin_sle_i32:
11; EG:       ; %bb.0:
12; EG-NEXT:    ALU 3, @10, KC0[CB0:0-32], KC1[]
13; EG-NEXT:    TEX 1 @6
14; EG-NEXT:    ALU 3, @14, KC0[CB0:0-32], KC1[]
15; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
16; EG-NEXT:    CF_END
17; EG-NEXT:    PAD
18; EG-NEXT:    Fetch clause starting at 6:
19; EG-NEXT:     VTX_READ_32 T1.X, T1.X, 0, #1
20; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
21; EG-NEXT:    ALU clause starting at 10:
22; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
23; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
24; EG-NEXT:     ADD_INT T0.X, KC0[2].Z, PV.W,
25; EG-NEXT:     ADD_INT * T1.X, KC0[2].W, PV.W,
26; EG-NEXT:    ALU clause starting at 14:
27; EG-NEXT:     MIN_INT T0.X, T0.X, T1.X,
28; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, T0.W,
29; EG-NEXT:     LSHR * T1.X, PV.W, literal.x,
30; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
31;
32; CI-LABEL: v_test_imin_sle_i32:
33; CI:       ; %bb.0:
34; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
35; CI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x4
36; CI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
37; CI-NEXT:    s_waitcnt lgkmcnt(0)
38; CI-NEXT:    v_mov_b32_e32 v1, s3
39; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
40; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
41; CI-NEXT:    v_mov_b32_e32 v3, s5
42; CI-NEXT:    v_add_i32_e32 v2, vcc, s4, v4
43; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
44; CI-NEXT:    flat_load_dword v5, v[0:1]
45; CI-NEXT:    flat_load_dword v2, v[2:3]
46; CI-NEXT:    v_mov_b32_e32 v1, s1
47; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v4
48; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
49; CI-NEXT:    s_waitcnt vmcnt(0)
50; CI-NEXT:    v_min_i32_e32 v2, v5, v2
51; CI-NEXT:    flat_store_dword v[0:1], v2
52; CI-NEXT:    s_endpgm
53;
54; VI-LABEL: v_test_imin_sle_i32:
55; VI:       ; %bb.0:
56; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
57; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
58; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
59; VI-NEXT:    s_waitcnt lgkmcnt(0)
60; VI-NEXT:    v_mov_b32_e32 v1, s3
61; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
62; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
63; VI-NEXT:    v_mov_b32_e32 v3, s5
64; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
65; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
66; VI-NEXT:    flat_load_dword v5, v[0:1]
67; VI-NEXT:    flat_load_dword v2, v[2:3]
68; VI-NEXT:    v_mov_b32_e32 v1, s1
69; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v4
70; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
71; VI-NEXT:    s_waitcnt vmcnt(0)
72; VI-NEXT:    v_min_i32_e32 v2, v5, v2
73; VI-NEXT:    flat_store_dword v[0:1], v2
74; VI-NEXT:    s_endpgm
75;
76; GFX9-LABEL: v_test_imin_sle_i32:
77; GFX9:       ; %bb.0:
78; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
79; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
80; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
81; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
82; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
83; GFX9-NEXT:    global_load_dword v2, v0, s[4:5]
84; GFX9-NEXT:    s_waitcnt vmcnt(0)
85; GFX9-NEXT:    v_min_i32_e32 v1, v1, v2
86; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
87; GFX9-NEXT:    s_endpgm
88;
89; GFX10-LABEL: v_test_imin_sle_i32:
90; GFX10:       ; %bb.0:
91; GFX10-NEXT:    s_clause 0x1
92; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
93; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
94; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
95; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
96; GFX10-NEXT:    s_clause 0x1
97; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
98; GFX10-NEXT:    global_load_dword v2, v0, s[4:5]
99; GFX10-NEXT:    s_waitcnt vmcnt(0)
100; GFX10-NEXT:    v_min_i32_e32 v1, v1, v2
101; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
102; GFX10-NEXT:    s_endpgm
103;
104; GFX11-LABEL: v_test_imin_sle_i32:
105; GFX11:       ; %bb.0:
106; GFX11-NEXT:    s_clause 0x1
107; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
108; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x10
109; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
110; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
111; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
112; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
113; GFX11-NEXT:    s_clause 0x1
114; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
115; GFX11-NEXT:    global_load_b32 v2, v0, s[4:5]
116; GFX11-NEXT:    s_waitcnt vmcnt(0)
117; GFX11-NEXT:    v_min_i32_e32 v1, v1, v2
118; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
119; GFX11-NEXT:    s_endpgm
120  %tid = call i32 @llvm.amdgcn.workitem.id.x()
121  %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr, i32 %tid
122  %b.gep = getelementptr inbounds i32, ptr addrspace(1) %b.ptr, i32 %tid
123  %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %tid
124  %a = load i32, ptr addrspace(1) %a.gep, align 4
125  %b = load i32, ptr addrspace(1) %b.gep, align 4
126  %cmp = icmp sle i32 %a, %b
127  %val = select i1 %cmp, i32 %a, i32 %b
128  store i32 %val, ptr addrspace(1) %out.gep, align 4
129  ret void
130}
131
; Signed minimum of two i32 kernel arguments, stored to %out.
; Both operands come straight from the kernel argument list (no per-workitem
; address math), and the amdgcn expected output performs the min with the
; scalar s_min_i32 before moving the result into a VGPR for the store; the
; r600 output uses a single MIN_INT on the constant-buffer operands.
132define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
133; EG-LABEL: s_test_imin_sle_i32:
134; EG:       ; %bb.0:
135; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
136; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
137; EG-NEXT:    CF_END
138; EG-NEXT:    PAD
139; EG-NEXT:    ALU clause starting at 4:
140; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
141; EG-NEXT:     MIN_INT * T1.X, KC0[2].Z, KC0[2].W,
142; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
143;
144; CI-LABEL: s_test_imin_sle_i32:
145; CI:       ; %bb.0:
146; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
147; CI-NEXT:    s_waitcnt lgkmcnt(0)
148; CI-NEXT:    s_min_i32 s2, s2, s3
149; CI-NEXT:    v_mov_b32_e32 v0, s0
150; CI-NEXT:    v_mov_b32_e32 v1, s1
151; CI-NEXT:    v_mov_b32_e32 v2, s2
152; CI-NEXT:    flat_store_dword v[0:1], v2
153; CI-NEXT:    s_endpgm
154;
155; VI-LABEL: s_test_imin_sle_i32:
156; VI:       ; %bb.0:
157; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
158; VI-NEXT:    s_waitcnt lgkmcnt(0)
159; VI-NEXT:    s_min_i32 s2, s2, s3
160; VI-NEXT:    v_mov_b32_e32 v0, s0
161; VI-NEXT:    v_mov_b32_e32 v1, s1
162; VI-NEXT:    v_mov_b32_e32 v2, s2
163; VI-NEXT:    flat_store_dword v[0:1], v2
164; VI-NEXT:    s_endpgm
165;
166; GFX9-LABEL: s_test_imin_sle_i32:
167; GFX9:       ; %bb.0:
168; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
169; GFX9-NEXT:    v_mov_b32_e32 v0, 0
170; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
171; GFX9-NEXT:    s_min_i32 s2, s2, s3
172; GFX9-NEXT:    v_mov_b32_e32 v1, s2
173; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
174; GFX9-NEXT:    s_endpgm
175;
176; GFX10-LABEL: s_test_imin_sle_i32:
177; GFX10:       ; %bb.0:
178; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
179; GFX10-NEXT:    v_mov_b32_e32 v0, 0
180; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
181; GFX10-NEXT:    s_min_i32 s2, s2, s3
182; GFX10-NEXT:    v_mov_b32_e32 v1, s2
183; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
184; GFX10-NEXT:    s_endpgm
185;
186; GFX11-LABEL: s_test_imin_sle_i32:
187; GFX11:       ; %bb.0:
188; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
189; GFX11-NEXT:    v_mov_b32_e32 v0, 0
190; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
191; GFX11-NEXT:    s_min_i32 s2, s2, s3
192; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
193; GFX11-NEXT:    v_mov_b32_e32 v1, s2
194; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
195; GFX11-NEXT:    s_endpgm
196  %cmp = icmp sle i32 %a, %b
197  %val = select i1 %cmp, i32 %a, i32 %b
198  store i32 %val, ptr addrspace(1) %out, align 4
199  ret void
200}
201
; Same signed-min pattern as the scalar i32 kernel-argument case, but with
; the operands typed as single-element vectors (<1 x i32>). The expected
; output for every run has the same shape as the plain i32 test, i.e. one
; MIN_INT / s_min_i32 and a dword store.
202define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32> %a, <1 x i32> %b) #0 {
203; EG-LABEL: s_test_imin_sle_v1i32:
204; EG:       ; %bb.0:
205; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
206; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
207; EG-NEXT:    CF_END
208; EG-NEXT:    PAD
209; EG-NEXT:    ALU clause starting at 4:
210; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
211; EG-NEXT:     MIN_INT * T1.X, KC0[2].Z, KC0[2].W,
212; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
213;
214; CI-LABEL: s_test_imin_sle_v1i32:
215; CI:       ; %bb.0:
216; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
217; CI-NEXT:    s_waitcnt lgkmcnt(0)
218; CI-NEXT:    s_min_i32 s2, s2, s3
219; CI-NEXT:    v_mov_b32_e32 v0, s0
220; CI-NEXT:    v_mov_b32_e32 v1, s1
221; CI-NEXT:    v_mov_b32_e32 v2, s2
222; CI-NEXT:    flat_store_dword v[0:1], v2
223; CI-NEXT:    s_endpgm
224;
225; VI-LABEL: s_test_imin_sle_v1i32:
226; VI:       ; %bb.0:
227; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
228; VI-NEXT:    s_waitcnt lgkmcnt(0)
229; VI-NEXT:    s_min_i32 s2, s2, s3
230; VI-NEXT:    v_mov_b32_e32 v0, s0
231; VI-NEXT:    v_mov_b32_e32 v1, s1
232; VI-NEXT:    v_mov_b32_e32 v2, s2
233; VI-NEXT:    flat_store_dword v[0:1], v2
234; VI-NEXT:    s_endpgm
235;
236; GFX9-LABEL: s_test_imin_sle_v1i32:
237; GFX9:       ; %bb.0:
238; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
239; GFX9-NEXT:    v_mov_b32_e32 v0, 0
240; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
241; GFX9-NEXT:    s_min_i32 s2, s2, s3
242; GFX9-NEXT:    v_mov_b32_e32 v1, s2
243; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
244; GFX9-NEXT:    s_endpgm
245;
246; GFX10-LABEL: s_test_imin_sle_v1i32:
247; GFX10:       ; %bb.0:
248; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
249; GFX10-NEXT:    v_mov_b32_e32 v0, 0
250; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
251; GFX10-NEXT:    s_min_i32 s2, s2, s3
252; GFX10-NEXT:    v_mov_b32_e32 v1, s2
253; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
254; GFX10-NEXT:    s_endpgm
255;
256; GFX11-LABEL: s_test_imin_sle_v1i32:
257; GFX11:       ; %bb.0:
258; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
259; GFX11-NEXT:    v_mov_b32_e32 v0, 0
260; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
261; GFX11-NEXT:    s_min_i32 s2, s2, s3
262; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
263; GFX11-NEXT:    v_mov_b32_e32 v1, s2
264; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
265; GFX11-NEXT:    s_endpgm
266  %cmp = icmp sle <1 x i32> %a, %b
267  %val = select <1 x i1> %cmp, <1 x i32> %a, <1 x i32> %b
268  store <1 x i32> %val, ptr addrspace(1) %out
269  ret void
270}
271
; Element-wise signed minimum of two <4 x i32> kernel arguments, stored to
; %out as one 128-bit value. The expected output contains one signed-min
; instruction per lane (four MIN_INT on r600, four s_min_i32 on the amdgcn
; targets) followed by a single four-dword store.
272define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b) #0 {
273; EG-LABEL: s_test_imin_sle_v4i32:
274; EG:       ; %bb.0:
275; EG-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
276; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
277; EG-NEXT:    CF_END
278; EG-NEXT:    PAD
279; EG-NEXT:    ALU clause starting at 4:
280; EG-NEXT:     MIN_INT * T0.W, KC0[4].X, KC0[5].X,
281; EG-NEXT:     MIN_INT * T0.Z, KC0[3].W, KC0[4].W,
282; EG-NEXT:     MIN_INT * T0.Y, KC0[3].Z, KC0[4].Z,
283; EG-NEXT:     MIN_INT * T0.X, KC0[3].Y, KC0[4].Y,
284; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
285; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
286;
287; CI-LABEL: s_test_imin_sle_v4i32:
288; CI:       ; %bb.0:
289; CI-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x4
290; CI-NEXT:    s_load_dwordx2 s[8:9], s[8:9], 0x0
291; CI-NEXT:    s_waitcnt lgkmcnt(0)
292; CI-NEXT:    s_min_i32 s3, s3, s7
293; CI-NEXT:    s_min_i32 s2, s2, s6
294; CI-NEXT:    s_min_i32 s1, s1, s5
295; CI-NEXT:    s_min_i32 s0, s0, s4
296; CI-NEXT:    v_mov_b32_e32 v4, s8
297; CI-NEXT:    v_mov_b32_e32 v0, s0
298; CI-NEXT:    v_mov_b32_e32 v1, s1
299; CI-NEXT:    v_mov_b32_e32 v2, s2
300; CI-NEXT:    v_mov_b32_e32 v3, s3
301; CI-NEXT:    v_mov_b32_e32 v5, s9
302; CI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
303; CI-NEXT:    s_endpgm
304;
305; VI-LABEL: s_test_imin_sle_v4i32:
306; VI:       ; %bb.0:
307; VI-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x10
308; VI-NEXT:    s_load_dwordx2 s[8:9], s[8:9], 0x0
309; VI-NEXT:    s_waitcnt lgkmcnt(0)
310; VI-NEXT:    s_min_i32 s3, s3, s7
311; VI-NEXT:    s_min_i32 s2, s2, s6
312; VI-NEXT:    s_min_i32 s1, s1, s5
313; VI-NEXT:    s_min_i32 s0, s0, s4
314; VI-NEXT:    v_mov_b32_e32 v4, s8
315; VI-NEXT:    v_mov_b32_e32 v0, s0
316; VI-NEXT:    v_mov_b32_e32 v1, s1
317; VI-NEXT:    v_mov_b32_e32 v2, s2
318; VI-NEXT:    v_mov_b32_e32 v3, s3
319; VI-NEXT:    v_mov_b32_e32 v5, s9
320; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
321; VI-NEXT:    s_endpgm
322;
323; GFX9-LABEL: s_test_imin_sle_v4i32:
324; GFX9:       ; %bb.0:
325; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x10
326; GFX9-NEXT:    v_mov_b32_e32 v4, 0
327; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[8:9], 0x0
328; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
329; GFX9-NEXT:    s_min_i32 s3, s3, s7
330; GFX9-NEXT:    s_min_i32 s2, s2, s6
331; GFX9-NEXT:    s_min_i32 s1, s1, s5
332; GFX9-NEXT:    s_min_i32 s0, s0, s4
333; GFX9-NEXT:    v_mov_b32_e32 v0, s0
334; GFX9-NEXT:    v_mov_b32_e32 v1, s1
335; GFX9-NEXT:    v_mov_b32_e32 v2, s2
336; GFX9-NEXT:    v_mov_b32_e32 v3, s3
337; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[8:9]
338; GFX9-NEXT:    s_endpgm
339;
340; GFX10-LABEL: s_test_imin_sle_v4i32:
341; GFX10:       ; %bb.0:
342; GFX10-NEXT:    s_clause 0x1
343; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x10
344; GFX10-NEXT:    s_load_dwordx2 s[10:11], s[8:9], 0x0
345; GFX10-NEXT:    v_mov_b32_e32 v4, 0
346; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
347; GFX10-NEXT:    s_min_i32 s3, s3, s7
348; GFX10-NEXT:    s_min_i32 s2, s2, s6
349; GFX10-NEXT:    s_min_i32 s0, s0, s4
350; GFX10-NEXT:    s_min_i32 s1, s1, s5
351; GFX10-NEXT:    v_mov_b32_e32 v0, s0
352; GFX10-NEXT:    v_mov_b32_e32 v1, s1
353; GFX10-NEXT:    v_mov_b32_e32 v2, s2
354; GFX10-NEXT:    v_mov_b32_e32 v3, s3
355; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[10:11]
356; GFX10-NEXT:    s_endpgm
357;
358; GFX11-LABEL: s_test_imin_sle_v4i32:
359; GFX11:       ; %bb.0:
360; GFX11-NEXT:    s_clause 0x1
361; GFX11-NEXT:    s_load_b256 s[8:15], s[4:5], 0x10
362; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
363; GFX11-NEXT:    v_mov_b32_e32 v4, 0
364; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
365; GFX11-NEXT:    s_min_i32 s2, s11, s15
366; GFX11-NEXT:    s_min_i32 s3, s10, s14
367; GFX11-NEXT:    s_min_i32 s4, s8, s12
368; GFX11-NEXT:    s_min_i32 s5, s9, s13
369; GFX11-NEXT:    v_mov_b32_e32 v0, s4
370; GFX11-NEXT:    v_mov_b32_e32 v1, s5
371; GFX11-NEXT:    v_mov_b32_e32 v2, s3
372; GFX11-NEXT:    v_mov_b32_e32 v3, s2
373; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
374; GFX11-NEXT:    s_endpgm
375  %cmp = icmp sle <4 x i32> %a, %b
376  %val = select <4 x i1> %cmp, <4 x i32> %a, <4 x i32> %b
377  store <4 x i32> %val, ptr addrspace(1) %out
378  ret void
379}
380
; Signed minimum of two i8 kernel arguments, stored to %out as one byte.
; The two unnamed [8 x i32] parameters separate %a and %b in the argument
; list, so each operand is fetched with its own load in the expected output.
; The amdgcn runs sign-extend each byte to 32 bits (s_sext_i32_i8) and then
; use s_min_i32; the r600 run sign-extends with BFE_INT before MIN_INT and
; writes the byte through a masked MEM_RAT MSKOR store.
381define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], i8 %a, [8 x i32], i8 %b) #0 {
382; EG-LABEL: s_test_imin_sle_i8:
383; EG:       ; %bb.0:
384; EG-NEXT:    ALU 0, @10, KC0[], KC1[]
385; EG-NEXT:    TEX 1 @6
386; EG-NEXT:    ALU 14, @11, KC0[CB0:0-32], KC1[]
387; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
388; EG-NEXT:    CF_END
389; EG-NEXT:    PAD
390; EG-NEXT:    Fetch clause starting at 6:
391; EG-NEXT:     VTX_READ_8 T1.X, T0.X, 72, #3
392; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 108, #3
393; EG-NEXT:    ALU clause starting at 10:
394; EG-NEXT:     MOV * T0.X, 0.0,
395; EG-NEXT:    ALU clause starting at 11:
396; EG-NEXT:     BFE_INT T0.Z, T1.X, 0.0, literal.x,
397; EG-NEXT:     BFE_INT T0.W, T0.X, 0.0, literal.x, BS:VEC_120/SCL_212
398; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.y,
399; EG-NEXT:    8(1.121039e-44), 3(4.203895e-45)
400; EG-NEXT:     MIN_INT * T0.W, PV.Z, PV.W,
401; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
402; EG-NEXT:     LSHL * T1.W, T1.W, literal.y,
403; EG-NEXT:    255(3.573311e-43), 3(4.203895e-45)
404; EG-NEXT:     LSHL T0.X, PV.W, PS,
405; EG-NEXT:     LSHL * T0.W, literal.x, PS,
406; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
407; EG-NEXT:     MOV T0.Y, 0.0,
408; EG-NEXT:     MOV * T0.Z, 0.0,
409; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
410; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
411;
412; CI-LABEL: s_test_imin_sle_i8:
413; CI:       ; %bb.0:
414; CI-NEXT:    s_load_dword s2, s[8:9], 0xa
415; CI-NEXT:    s_load_dword s3, s[8:9], 0x13
416; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
417; CI-NEXT:    s_waitcnt lgkmcnt(0)
418; CI-NEXT:    s_sext_i32_i8 s2, s2
419; CI-NEXT:    s_sext_i32_i8 s3, s3
420; CI-NEXT:    s_min_i32 s2, s2, s3
421; CI-NEXT:    v_mov_b32_e32 v0, s0
422; CI-NEXT:    v_mov_b32_e32 v1, s1
423; CI-NEXT:    v_mov_b32_e32 v2, s2
424; CI-NEXT:    flat_store_byte v[0:1], v2
425; CI-NEXT:    s_endpgm
426;
427; VI-LABEL: s_test_imin_sle_i8:
428; VI:       ; %bb.0:
429; VI-NEXT:    s_load_dword s2, s[8:9], 0x28
430; VI-NEXT:    s_load_dword s3, s[8:9], 0x4c
431; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
432; VI-NEXT:    s_waitcnt lgkmcnt(0)
433; VI-NEXT:    s_sext_i32_i8 s2, s2
434; VI-NEXT:    s_sext_i32_i8 s3, s3
435; VI-NEXT:    s_min_i32 s2, s2, s3
436; VI-NEXT:    v_mov_b32_e32 v0, s0
437; VI-NEXT:    v_mov_b32_e32 v1, s1
438; VI-NEXT:    v_mov_b32_e32 v2, s2
439; VI-NEXT:    flat_store_byte v[0:1], v2
440; VI-NEXT:    s_endpgm
441;
442; GFX9-LABEL: s_test_imin_sle_i8:
443; GFX9:       ; %bb.0:
444; GFX9-NEXT:    s_load_dword s2, s[8:9], 0x28
445; GFX9-NEXT:    s_load_dword s3, s[8:9], 0x4c
446; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
447; GFX9-NEXT:    v_mov_b32_e32 v0, 0
448; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
449; GFX9-NEXT:    s_sext_i32_i8 s2, s2
450; GFX9-NEXT:    s_sext_i32_i8 s3, s3
451; GFX9-NEXT:    s_min_i32 s2, s2, s3
452; GFX9-NEXT:    v_mov_b32_e32 v1, s2
453; GFX9-NEXT:    global_store_byte v0, v1, s[0:1]
454; GFX9-NEXT:    s_endpgm
455;
456; GFX10-LABEL: s_test_imin_sle_i8:
457; GFX10:       ; %bb.0:
458; GFX10-NEXT:    s_clause 0x2
459; GFX10-NEXT:    s_load_dword s2, s[8:9], 0x28
460; GFX10-NEXT:    s_load_dword s3, s[8:9], 0x4c
461; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
462; GFX10-NEXT:    v_mov_b32_e32 v0, 0
463; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
464; GFX10-NEXT:    s_sext_i32_i8 s2, s2
465; GFX10-NEXT:    s_sext_i32_i8 s3, s3
466; GFX10-NEXT:    s_min_i32 s2, s2, s3
467; GFX10-NEXT:    v_mov_b32_e32 v1, s2
468; GFX10-NEXT:    global_store_byte v0, v1, s[0:1]
469; GFX10-NEXT:    s_endpgm
470;
471; GFX11-LABEL: s_test_imin_sle_i8:
472; GFX11:       ; %bb.0:
473; GFX11-NEXT:    s_clause 0x2
474; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x28
475; GFX11-NEXT:    s_load_b32 s3, s[4:5], 0x4c
476; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
477; GFX11-NEXT:    v_mov_b32_e32 v0, 0
478; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
479; GFX11-NEXT:    s_sext_i32_i8 s2, s2
480; GFX11-NEXT:    s_sext_i32_i8 s3, s3
481; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
482; GFX11-NEXT:    s_min_i32 s2, s2, s3
483; GFX11-NEXT:    v_mov_b32_e32 v1, s2
484; GFX11-NEXT:    global_store_b8 v0, v1, s[0:1]
485; GFX11-NEXT:    s_endpgm
486  %cmp = icmp sle i8 %a, %b
487  %val = select i1 %cmp, i8 %a, i8 %b
488  store i8 %val, ptr addrspace(1) %out
489  ret void
490}
491
492; FIXME: Why vector and sdwa for last element?
493
; Element-wise signed minimum of two <4 x i8> kernel arguments, stored to
; %out as one packed dword. As in the i8 test, unnamed [8 x i32] parameters
; sit between %a and %b. The expected output differs by target. The kaveri
; and tonga runs sign-extend every byte on the scalar unit and use four
; s_min_i32 before repacking with shifts, masks and ors. The gfx900 and
; newer runs instead perform the per-byte min with v_min_i16 on the vector
; unit and repack there (with SDWA ors on gfx900 and gfx1010). The r600 run
; reads the eight bytes individually and uses BFE_INT + MIN_INT per lane.
494define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32], <4 x i8> %a, [8 x i32], <4 x i8> %b) #0 {
495; EG-LABEL: s_test_imin_sle_v4i8:
496; EG:       ; %bb.0:
497; EG-NEXT:    ALU 0, @22, KC0[], KC1[]
498; EG-NEXT:    TEX 7 @6
499; EG-NEXT:    ALU 30, @23, KC0[CB0:0-32], KC1[]
500; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1
501; EG-NEXT:    CF_END
502; EG-NEXT:    PAD
503; EG-NEXT:    Fetch clause starting at 6:
504; EG-NEXT:     VTX_READ_8 T5.X, T4.X, 74, #3
505; EG-NEXT:     VTX_READ_8 T6.X, T4.X, 108, #3
506; EG-NEXT:     VTX_READ_8 T7.X, T4.X, 72, #3
507; EG-NEXT:     VTX_READ_8 T8.X, T4.X, 111, #3
508; EG-NEXT:     VTX_READ_8 T9.X, T4.X, 75, #3
509; EG-NEXT:     VTX_READ_8 T10.X, T4.X, 109, #3
510; EG-NEXT:     VTX_READ_8 T11.X, T4.X, 73, #3
511; EG-NEXT:     VTX_READ_8 T4.X, T4.X, 110, #3
512; EG-NEXT:    ALU clause starting at 22:
513; EG-NEXT:     MOV * T4.X, 0.0,
514; EG-NEXT:    ALU clause starting at 23:
515; EG-NEXT:     BFE_INT T0.Z, T5.X, 0.0, literal.x,
516; EG-NEXT:     BFE_INT * T0.W, T4.X, 0.0, literal.x, BS:VEC_120/SCL_212
517; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
518; EG-NEXT:     BFE_INT T4.X, T11.X, 0.0, literal.x,
519; EG-NEXT:     BFE_INT T0.Y, T10.X, 0.0, literal.x, BS:VEC_120/SCL_212
520; EG-NEXT:     BFE_INT * T1.Z, T9.X, 0.0, literal.x, BS:VEC_201
521; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
522; EG-NEXT:     BFE_INT T1.W, T8.X, 0.0, literal.x,
523; EG-NEXT:     MIN_INT * T0.W, T0.Z, T0.W,
524; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
525; EG-NEXT:     MIN_INT T0.Z, T1.Z, PV.W,
526; EG-NEXT:     AND_INT T0.W, PS, literal.x,
527; EG-NEXT:     MIN_INT * T1.W, T4.X, T0.Y,
528; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
529; EG-NEXT:     AND_INT T4.X, PS, literal.x,
530; EG-NEXT:     LSHL T0.Y, PV.W, literal.y,
531; EG-NEXT:     BFE_INT T1.Z, T7.X, 0.0, literal.z,
532; EG-NEXT:     BFE_INT T0.W, T6.X, 0.0, literal.z, BS:VEC_120/SCL_212
533; EG-NEXT:     LSHL * T1.W, PV.Z, literal.w,
534; EG-NEXT:    255(3.573311e-43), 16(2.242078e-44)
535; EG-NEXT:    8(1.121039e-44), 24(3.363116e-44)
536; EG-NEXT:     MIN_INT T0.Z, PV.Z, PV.W,
537; EG-NEXT:     OR_INT T0.W, PS, PV.Y,
538; EG-NEXT:     LSHL * T1.W, PV.X, literal.x,
539; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
540; EG-NEXT:     OR_INT T0.W, PV.W, PS,
541; EG-NEXT:     AND_INT * T1.W, PV.Z, literal.x,
542; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
543; EG-NEXT:     OR_INT T4.X, PV.W, PS,
544; EG-NEXT:     LSHR * T5.X, KC0[2].Y, literal.x,
545; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
546;
547; CI-LABEL: s_test_imin_sle_v4i8:
548; CI:       ; %bb.0:
549; CI-NEXT:    s_load_dword s2, s[8:9], 0xa
550; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
551; CI-NEXT:    s_load_dword s3, s[8:9], 0x13
552; CI-NEXT:    s_waitcnt lgkmcnt(0)
553; CI-NEXT:    s_ashr_i32 s4, s2, 24
554; CI-NEXT:    s_sext_i32_i8 s5, s2
555; CI-NEXT:    s_bfe_i32 s6, s2, 0x80008
556; CI-NEXT:    s_bfe_i32 s2, s2, 0x80010
557; CI-NEXT:    s_ashr_i32 s7, s3, 24
558; CI-NEXT:    s_sext_i32_i8 s8, s3
559; CI-NEXT:    s_bfe_i32 s9, s3, 0x80008
560; CI-NEXT:    s_bfe_i32 s3, s3, 0x80010
561; CI-NEXT:    s_min_i32 s2, s2, s3
562; CI-NEXT:    s_min_i32 s4, s4, s7
563; CI-NEXT:    s_and_b32 s2, s2, 0xff
564; CI-NEXT:    s_lshl_b32 s4, s4, 24
565; CI-NEXT:    s_lshl_b32 s2, s2, 16
566; CI-NEXT:    s_or_b32 s2, s4, s2
567; CI-NEXT:    s_min_i32 s3, s6, s9
568; CI-NEXT:    s_min_i32 s4, s5, s8
569; CI-NEXT:    s_lshl_b32 s3, s3, 8
570; CI-NEXT:    s_and_b32 s4, s4, 0xff
571; CI-NEXT:    s_or_b32 s3, s4, s3
572; CI-NEXT:    s_and_b32 s3, s3, 0xffff
573; CI-NEXT:    s_or_b32 s2, s3, s2
574; CI-NEXT:    v_mov_b32_e32 v0, s0
575; CI-NEXT:    v_mov_b32_e32 v1, s1
576; CI-NEXT:    v_mov_b32_e32 v2, s2
577; CI-NEXT:    flat_store_dword v[0:1], v2
578; CI-NEXT:    s_endpgm
579;
580; VI-LABEL: s_test_imin_sle_v4i8:
581; VI:       ; %bb.0:
582; VI-NEXT:    s_load_dword s2, s[8:9], 0x28
583; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
584; VI-NEXT:    s_load_dword s3, s[8:9], 0x4c
585; VI-NEXT:    s_waitcnt lgkmcnt(0)
586; VI-NEXT:    s_ashr_i32 s4, s2, 24
587; VI-NEXT:    s_bfe_i32 s5, s2, 0x80010
588; VI-NEXT:    s_bfe_i32 s6, s2, 0x80008
589; VI-NEXT:    s_sext_i32_i8 s2, s2
590; VI-NEXT:    s_ashr_i32 s7, s3, 24
591; VI-NEXT:    s_bfe_i32 s8, s3, 0x80010
592; VI-NEXT:    s_bfe_i32 s9, s3, 0x80008
593; VI-NEXT:    s_sext_i32_i8 s3, s3
594; VI-NEXT:    s_min_i32 s2, s2, s3
595; VI-NEXT:    s_min_i32 s3, s6, s9
596; VI-NEXT:    s_min_i32 s5, s5, s8
597; VI-NEXT:    s_min_i32 s4, s4, s7
598; VI-NEXT:    s_and_b32 s5, s5, 0xff
599; VI-NEXT:    s_lshl_b32 s3, s3, 8
600; VI-NEXT:    s_and_b32 s2, s2, 0xff
601; VI-NEXT:    s_lshl_b32 s4, s4, 24
602; VI-NEXT:    s_lshl_b32 s5, s5, 16
603; VI-NEXT:    s_or_b32 s2, s2, s3
604; VI-NEXT:    s_or_b32 s4, s4, s5
605; VI-NEXT:    s_and_b32 s2, s2, 0xffff
606; VI-NEXT:    s_or_b32 s2, s2, s4
607; VI-NEXT:    v_mov_b32_e32 v0, s0
608; VI-NEXT:    v_mov_b32_e32 v1, s1
609; VI-NEXT:    v_mov_b32_e32 v2, s2
610; VI-NEXT:    flat_store_dword v[0:1], v2
611; VI-NEXT:    s_endpgm
612;
613; GFX9-LABEL: s_test_imin_sle_v4i8:
614; GFX9:       ; %bb.0:
615; GFX9-NEXT:    s_load_dword s3, s[8:9], 0x4c
616; GFX9-NEXT:    s_load_dword s2, s[8:9], 0x28
617; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
618; GFX9-NEXT:    v_mov_b32_e32 v0, 0
619; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
620; GFX9-NEXT:    s_lshr_b32 s5, s2, 16
621; GFX9-NEXT:    s_lshr_b32 s8, s3, 16
622; GFX9-NEXT:    s_ashr_i32 s9, s3, 24
623; GFX9-NEXT:    s_ashr_i32 s6, s2, 24
624; GFX9-NEXT:    s_bfe_i32 s8, s8, 0x80000
625; GFX9-NEXT:    v_mov_b32_e32 v1, s9
626; GFX9-NEXT:    s_bfe_i32 s5, s5, 0x80000
627; GFX9-NEXT:    s_sext_i32_i16 s7, s3
628; GFX9-NEXT:    v_min_i16_e32 v1, s6, v1
629; GFX9-NEXT:    v_mov_b32_e32 v2, s8
630; GFX9-NEXT:    s_sext_i32_i16 s4, s2
631; GFX9-NEXT:    s_lshr_b32 s7, s7, 8
632; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
633; GFX9-NEXT:    v_min_i16_e32 v2, s5, v2
634; GFX9-NEXT:    s_lshr_b32 s4, s4, 8
635; GFX9-NEXT:    s_bfe_i32 s3, s3, 0x80000
636; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
637; GFX9-NEXT:    v_mov_b32_e32 v2, s7
638; GFX9-NEXT:    s_bfe_i32 s2, s2, 0x80000
639; GFX9-NEXT:    v_min_i16_e32 v2, s4, v2
640; GFX9-NEXT:    v_mov_b32_e32 v3, s3
641; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
642; GFX9-NEXT:    v_min_i16_e32 v3, s2, v3
643; GFX9-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
644; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
645; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
646; GFX9-NEXT:    s_endpgm
647;
648; GFX10-LABEL: s_test_imin_sle_v4i8:
649; GFX10:       ; %bb.0:
650; GFX10-NEXT:    s_clause 0x2
651; GFX10-NEXT:    s_load_dword s2, s[8:9], 0x28
652; GFX10-NEXT:    s_load_dword s3, s[8:9], 0x4c
653; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
654; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
655; GFX10-NEXT:    s_sext_i32_i16 s4, s2
656; GFX10-NEXT:    s_sext_i32_i16 s7, s3
657; GFX10-NEXT:    s_ashr_i32 s6, s2, 24
658; GFX10-NEXT:    s_ashr_i32 s9, s3, 24
659; GFX10-NEXT:    s_lshr_b32 s4, s4, 8
660; GFX10-NEXT:    s_lshr_b32 s7, s7, 8
661; GFX10-NEXT:    v_min_i16 v0, s6, s9
662; GFX10-NEXT:    v_min_i16 v1, s4, s7
663; GFX10-NEXT:    s_lshr_b32 s5, s2, 16
664; GFX10-NEXT:    s_lshr_b32 s8, s3, 16
665; GFX10-NEXT:    s_bfe_i32 s2, s2, 0x80000
666; GFX10-NEXT:    s_bfe_i32 s5, s5, 0x80000
667; GFX10-NEXT:    s_bfe_i32 s4, s8, 0x80000
668; GFX10-NEXT:    s_bfe_i32 s3, s3, 0x80000
669; GFX10-NEXT:    v_min_i16 v2, s5, s4
670; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
671; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
672; GFX10-NEXT:    v_min_i16 v3, s2, s3
673; GFX10-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
674; GFX10-NEXT:    v_mov_b32_e32 v2, 0
675; GFX10-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
676; GFX10-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
677; GFX10-NEXT:    global_store_dword v2, v0, s[0:1]
678; GFX10-NEXT:    s_endpgm
679;
680; GFX11-LABEL: s_test_imin_sle_v4i8:
681; GFX11:       ; %bb.0:
682; GFX11-NEXT:    s_clause 0x1
683; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x28
684; GFX11-NEXT:    s_load_b32 s1, s[4:5], 0x4c
685; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
686; GFX11-NEXT:    s_sext_i32_i16 s2, s0
687; GFX11-NEXT:    s_lshr_b32 s3, s0, 16
688; GFX11-NEXT:    s_sext_i32_i16 s7, s1
689; GFX11-NEXT:    s_lshr_b32 s8, s1, 16
690; GFX11-NEXT:    s_ashr_i32 s6, s0, 24
691; GFX11-NEXT:    s_bfe_i32 s0, s0, 0x80000
692; GFX11-NEXT:    s_ashr_i32 s9, s1, 24
693; GFX11-NEXT:    s_bfe_i32 s1, s1, 0x80000
694; GFX11-NEXT:    s_lshr_b32 s2, s2, 8
695; GFX11-NEXT:    s_bfe_i32 s3, s3, 0x80000
696; GFX11-NEXT:    s_lshr_b32 s7, s7, 8
697; GFX11-NEXT:    s_bfe_i32 s8, s8, 0x80000
698; GFX11-NEXT:    v_min_i16 v0, s6, s9
699; GFX11-NEXT:    v_min_i16 v1, s0, s1
700; GFX11-NEXT:    v_min_i16 v2, s3, s8
701; GFX11-NEXT:    v_min_i16 v3, s2, s7
702; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
703; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
704; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
705; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
706; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
707; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
708; GFX11-NEXT:    v_or_b32_e32 v0, v2, v0
709; GFX11-NEXT:    v_or_b32_e32 v1, v1, v3
710; GFX11-NEXT:    v_mov_b32_e32 v2, 0
711; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
712; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
713; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
714; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
715; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
716; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
717; GFX11-NEXT:    global_store_b32 v2, v0, s[0:1]
718; GFX11-NEXT:    s_endpgm
719  %cmp = icmp sle <4 x i8> %a, %b
720  %val = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b
721  store <4 x i8> %val, ptr addrspace(1) %out
722  ret void
723}
724
; Signed minimum of two <2 x i16> kernel arguments, written as icmp sle + select,
; with the result stored to %out.
; The CI and VI checks sign-extend each 16-bit half, apply s_min_i32 per half and
; repack the halves with shift/and/or; the GFX9, GFX10 and GFX11 checks expect a
; single v_pk_min_i16. The EG checks expect BFE_INT followed by MIN_INT per element.
725define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16> %a, <2 x i16> %b) #0 {
726; EG-LABEL: s_test_imin_sle_v2i16:
727; EG:       ; %bb.0:
728; EG-NEXT:    ALU 0, @14, KC0[], KC1[]
729; EG-NEXT:    TEX 3 @6
730; EG-NEXT:    ALU 13, @15, KC0[CB0:0-32], KC1[]
731; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1
732; EG-NEXT:    CF_END
733; EG-NEXT:    PAD
734; EG-NEXT:    Fetch clause starting at 6:
735; EG-NEXT:     VTX_READ_16 T5.X, T4.X, 42, #3
736; EG-NEXT:     VTX_READ_16 T6.X, T4.X, 44, #3
737; EG-NEXT:     VTX_READ_16 T7.X, T4.X, 40, #3
738; EG-NEXT:     VTX_READ_16 T4.X, T4.X, 46, #3
739; EG-NEXT:    ALU clause starting at 14:
740; EG-NEXT:     MOV * T4.X, 0.0,
741; EG-NEXT:    ALU clause starting at 15:
742; EG-NEXT:     BFE_INT T5.X, T5.X, 0.0, literal.x,
743; EG-NEXT:     BFE_INT T0.Y, T4.X, 0.0, literal.x, BS:VEC_120/SCL_212
744; EG-NEXT:     BFE_INT * T0.Z, T7.X, 0.0, literal.x, BS:VEC_201
745; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
746; EG-NEXT:     BFE_INT * T0.W, T6.X, 0.0, literal.x,
747; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
748; EG-NEXT:     MIN_INT T0.W, T0.Z, PV.W,
749; EG-NEXT:     MIN_INT * T1.W, T5.X, T0.Y,
750; EG-NEXT:     LSHL T1.W, PS, literal.x,
751; EG-NEXT:     AND_INT * T0.W, PV.W, literal.y,
752; EG-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
753; EG-NEXT:     OR_INT T4.X, PV.W, PS,
754; EG-NEXT:     LSHR * T5.X, KC0[2].Y, literal.x,
755; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
756;
757; CI-LABEL: s_test_imin_sle_v2i16:
758; CI:       ; %bb.0:
759; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
760; CI-NEXT:    s_waitcnt lgkmcnt(0)
761; CI-NEXT:    s_ashr_i32 s4, s2, 16
762; CI-NEXT:    s_sext_i32_i16 s2, s2
763; CI-NEXT:    s_ashr_i32 s5, s3, 16
764; CI-NEXT:    s_sext_i32_i16 s3, s3
765; CI-NEXT:    s_min_i32 s4, s4, s5
766; CI-NEXT:    s_min_i32 s2, s2, s3
767; CI-NEXT:    s_lshl_b32 s3, s4, 16
768; CI-NEXT:    s_and_b32 s2, s2, 0xffff
769; CI-NEXT:    s_or_b32 s2, s2, s3
770; CI-NEXT:    v_mov_b32_e32 v0, s0
771; CI-NEXT:    v_mov_b32_e32 v1, s1
772; CI-NEXT:    v_mov_b32_e32 v2, s2
773; CI-NEXT:    flat_store_dword v[0:1], v2
774; CI-NEXT:    s_endpgm
775;
776; VI-LABEL: s_test_imin_sle_v2i16:
777; VI:       ; %bb.0:
778; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
779; VI-NEXT:    s_waitcnt lgkmcnt(0)
780; VI-NEXT:    s_ashr_i32 s4, s2, 16
781; VI-NEXT:    s_sext_i32_i16 s2, s2
782; VI-NEXT:    s_ashr_i32 s5, s3, 16
783; VI-NEXT:    s_sext_i32_i16 s3, s3
784; VI-NEXT:    s_min_i32 s4, s4, s5
785; VI-NEXT:    s_min_i32 s2, s2, s3
786; VI-NEXT:    s_lshl_b32 s3, s4, 16
787; VI-NEXT:    s_and_b32 s2, s2, 0xffff
788; VI-NEXT:    s_or_b32 s2, s2, s3
789; VI-NEXT:    v_mov_b32_e32 v0, s0
790; VI-NEXT:    v_mov_b32_e32 v1, s1
791; VI-NEXT:    v_mov_b32_e32 v2, s2
792; VI-NEXT:    flat_store_dword v[0:1], v2
793; VI-NEXT:    s_endpgm
794;
795; GFX9-LABEL: s_test_imin_sle_v2i16:
796; GFX9:       ; %bb.0:
797; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
798; GFX9-NEXT:    v_mov_b32_e32 v0, 0
799; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
800; GFX9-NEXT:    v_mov_b32_e32 v1, s3
801; GFX9-NEXT:    v_pk_min_i16 v1, s2, v1
802; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
803; GFX9-NEXT:    s_endpgm
804;
805; GFX10-LABEL: s_test_imin_sle_v2i16:
806; GFX10:       ; %bb.0:
807; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
808; GFX10-NEXT:    v_mov_b32_e32 v0, 0
809; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
810; GFX10-NEXT:    v_pk_min_i16 v1, s2, s3
811; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
812; GFX10-NEXT:    s_endpgm
813;
814; GFX11-LABEL: s_test_imin_sle_v2i16:
815; GFX11:       ; %bb.0:
816; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
817; GFX11-NEXT:    v_mov_b32_e32 v0, 0
818; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
819; GFX11-NEXT:    v_pk_min_i16 v1, s2, s3
820; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
821; GFX11-NEXT:    s_endpgm
822  %cmp = icmp sle <2 x i16> %a, %b
823  %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
824  store <2 x i16> %val, ptr addrspace(1) %out
825  ret void
826}
827
; Signed minimum of two <4 x i16> kernel arguments, written as icmp sle + select,
; with the result stored to %out.
; The CI and VI checks expand to four s_min_i32 on sign-extended 16-bit halves and
; repack each pair into one dword; the GFX9, GFX10 and GFX11 checks expect two
; v_pk_min_i16, one per dword of the result.
828define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16> %a, <4 x i16> %b) #0 {
829; EG-LABEL: s_test_imin_sle_v4i16:
830; EG:       ; %bb.0:
831; EG-NEXT:    ALU 1, @28, KC0[], KC1[]
832; EG-NEXT:    TEX 1 @12
833; EG-NEXT:    ALU 9, @30, KC0[], KC1[]
834; EG-NEXT:    TEX 1 @16
835; EG-NEXT:    ALU 10, @40, KC0[], KC1[]
836; EG-NEXT:    TEX 1 @20
837; EG-NEXT:    ALU 10, @51, KC0[], KC1[]
838; EG-NEXT:    TEX 1 @24
839; EG-NEXT:    ALU 11, @62, KC0[CB0:0-32], KC1[]
840; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.XY, T5.X, 1
841; EG-NEXT:    CF_END
842; EG-NEXT:    PAD
843; EG-NEXT:    Fetch clause starting at 12:
844; EG-NEXT:     VTX_READ_16 T6.X, T5.X, 50, #3
845; EG-NEXT:     VTX_READ_16 T7.X, T5.X, 58, #3
846; EG-NEXT:    Fetch clause starting at 16:
847; EG-NEXT:     VTX_READ_16 T6.X, T5.X, 48, #3
848; EG-NEXT:     VTX_READ_16 T7.X, T5.X, 56, #3
849; EG-NEXT:    Fetch clause starting at 20:
850; EG-NEXT:     VTX_READ_16 T6.X, T5.X, 46, #3
851; EG-NEXT:     VTX_READ_16 T7.X, T5.X, 54, #3
852; EG-NEXT:    Fetch clause starting at 24:
853; EG-NEXT:     VTX_READ_16 T6.X, T5.X, 44, #3
854; EG-NEXT:     VTX_READ_16 T5.X, T5.X, 52, #3
855; EG-NEXT:    ALU clause starting at 28:
856; EG-NEXT:     MOV * T0.Y, T3.X,
857; EG-NEXT:     MOV * T5.X, 0.0,
858; EG-NEXT:    ALU clause starting at 30:
859; EG-NEXT:     BFE_INT T0.Z, T6.X, 0.0, literal.x,
860; EG-NEXT:     BFE_INT * T0.W, T7.X, 0.0, literal.x, BS:VEC_120/SCL_212
861; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
862; EG-NEXT:     MIN_INT * T0.W, PV.Z, PV.W,
863; EG-NEXT:     LSHL T0.W, PV.W, literal.x,
864; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
865; EG-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
866; EG-NEXT:     OR_INT * T0.W, PS, PV.W,
867; EG-NEXT:     MOV * T3.X, PV.W,
868; EG-NEXT:     MOV * T0.Y, PV.X,
869; EG-NEXT:    ALU clause starting at 40:
870; EG-NEXT:     BFE_INT T0.Z, T6.X, 0.0, literal.x,
871; EG-NEXT:     BFE_INT * T0.W, T7.X, 0.0, literal.x, BS:VEC_120/SCL_212
872; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
873; EG-NEXT:     MIN_INT T0.W, PV.Z, PV.W,
874; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.x,
875; EG-NEXT:    -65536(nan), 0(0.000000e+00)
876; EG-NEXT:     AND_INT * T0.W, PV.W, literal.x,
877; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
878; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
879; EG-NEXT:     MOV T3.X, PV.W,
880; EG-NEXT:     MOV * T0.Y, T2.X,
881; EG-NEXT:    ALU clause starting at 51:
882; EG-NEXT:     BFE_INT T0.Z, T6.X, 0.0, literal.x,
883; EG-NEXT:     BFE_INT * T0.W, T7.X, 0.0, literal.x, BS:VEC_120/SCL_212
884; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
885; EG-NEXT:     MIN_INT T0.W, PV.Z, PV.W,
886; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.x,
887; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
888; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
889; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
890; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
891; EG-NEXT:     MOV * T2.X, PV.W,
892; EG-NEXT:     MOV * T0.Y, PV.X,
893; EG-NEXT:    ALU clause starting at 62:
894; EG-NEXT:     BFE_INT T0.Z, T6.X, 0.0, literal.x,
895; EG-NEXT:     BFE_INT * T0.W, T5.X, 0.0, literal.x, BS:VEC_120/SCL_212
896; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
897; EG-NEXT:     MIN_INT * T0.W, PV.Z, PV.W,
898; EG-NEXT:     LSHR T5.X, KC0[2].Y, literal.x,
899; EG-NEXT:     AND_INT T1.W, T0.Y, literal.y,
900; EG-NEXT:     AND_INT * T0.W, PV.W, literal.z,
901; EG-NEXT:    2(2.802597e-45), -65536(nan)
902; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
903; EG-NEXT:     OR_INT * T6.X, PV.W, PS,
904; EG-NEXT:     MOV T2.X, PV.X,
905; EG-NEXT:     MOV * T6.Y, T3.X,
906;
907; CI-LABEL: s_test_imin_sle_v4i16:
908; CI:       ; %bb.0:
909; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x2
910; CI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
911; CI-NEXT:    s_waitcnt lgkmcnt(0)
912; CI-NEXT:    s_ashr_i32 s6, s0, 16
913; CI-NEXT:    s_ashr_i32 s7, s1, 16
914; CI-NEXT:    s_sext_i32_i16 s0, s0
915; CI-NEXT:    s_sext_i32_i16 s1, s1
916; CI-NEXT:    s_ashr_i32 s8, s2, 16
917; CI-NEXT:    s_ashr_i32 s9, s3, 16
918; CI-NEXT:    s_sext_i32_i16 s2, s2
919; CI-NEXT:    s_sext_i32_i16 s3, s3
920; CI-NEXT:    s_min_i32 s7, s7, s9
921; CI-NEXT:    s_min_i32 s1, s1, s3
922; CI-NEXT:    s_min_i32 s3, s6, s8
923; CI-NEXT:    s_min_i32 s0, s0, s2
924; CI-NEXT:    s_lshl_b32 s7, s7, 16
925; CI-NEXT:    s_and_b32 s1, s1, 0xffff
926; CI-NEXT:    s_lshl_b32 s3, s3, 16
927; CI-NEXT:    s_and_b32 s0, s0, 0xffff
928; CI-NEXT:    s_or_b32 s1, s1, s7
929; CI-NEXT:    s_or_b32 s0, s0, s3
930; CI-NEXT:    v_mov_b32_e32 v2, s4
931; CI-NEXT:    v_mov_b32_e32 v0, s0
932; CI-NEXT:    v_mov_b32_e32 v1, s1
933; CI-NEXT:    v_mov_b32_e32 v3, s5
934; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
935; CI-NEXT:    s_endpgm
936;
937; VI-LABEL: s_test_imin_sle_v4i16:
938; VI:       ; %bb.0:
939; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x8
940; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
941; VI-NEXT:    s_waitcnt lgkmcnt(0)
942; VI-NEXT:    s_ashr_i32 s6, s1, 16
943; VI-NEXT:    s_sext_i32_i16 s1, s1
944; VI-NEXT:    s_ashr_i32 s8, s3, 16
945; VI-NEXT:    s_sext_i32_i16 s3, s3
946; VI-NEXT:    s_ashr_i32 s7, s0, 16
947; VI-NEXT:    s_sext_i32_i16 s0, s0
948; VI-NEXT:    s_ashr_i32 s9, s2, 16
949; VI-NEXT:    s_sext_i32_i16 s2, s2
950; VI-NEXT:    s_min_i32 s6, s6, s8
951; VI-NEXT:    s_min_i32 s1, s1, s3
952; VI-NEXT:    s_min_i32 s7, s7, s9
953; VI-NEXT:    s_min_i32 s0, s0, s2
954; VI-NEXT:    s_lshl_b32 s2, s6, 16
955; VI-NEXT:    s_and_b32 s1, s1, 0xffff
956; VI-NEXT:    s_or_b32 s1, s1, s2
957; VI-NEXT:    s_lshl_b32 s2, s7, 16
958; VI-NEXT:    s_and_b32 s0, s0, 0xffff
959; VI-NEXT:    s_or_b32 s0, s0, s2
960; VI-NEXT:    v_mov_b32_e32 v2, s4
961; VI-NEXT:    v_mov_b32_e32 v0, s0
962; VI-NEXT:    v_mov_b32_e32 v1, s1
963; VI-NEXT:    v_mov_b32_e32 v3, s5
964; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
965; VI-NEXT:    s_endpgm
966;
967; GFX9-LABEL: s_test_imin_sle_v4i16:
968; GFX9:       ; %bb.0:
969; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x8
970; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
971; GFX9-NEXT:    v_mov_b32_e32 v2, 0
972; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
973; GFX9-NEXT:    v_mov_b32_e32 v0, s3
974; GFX9-NEXT:    v_mov_b32_e32 v3, s2
975; GFX9-NEXT:    v_pk_min_i16 v1, s1, v0
976; GFX9-NEXT:    v_pk_min_i16 v0, s0, v3
977; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
978; GFX9-NEXT:    s_endpgm
979;
980; GFX10-LABEL: s_test_imin_sle_v4i16:
981; GFX10:       ; %bb.0:
982; GFX10-NEXT:    s_clause 0x1
983; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x8
984; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
985; GFX10-NEXT:    v_mov_b32_e32 v2, 0
986; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
987; GFX10-NEXT:    v_pk_min_i16 v1, s1, s3
988; GFX10-NEXT:    v_pk_min_i16 v0, s0, s2
989; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
990; GFX10-NEXT:    s_endpgm
991;
992; GFX11-LABEL: s_test_imin_sle_v4i16:
993; GFX11:       ; %bb.0:
994; GFX11-NEXT:    s_clause 0x1
995; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x8
996; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
997; GFX11-NEXT:    v_mov_b32_e32 v2, 0
998; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
999; GFX11-NEXT:    v_pk_min_i16 v1, s1, s3
1000; GFX11-NEXT:    v_pk_min_i16 v0, s0, s2
1001; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
1002; GFX11-NEXT:    s_endpgm
1003  %cmp = icmp sle <4 x i16> %a, %b
1004  %val = select <4 x i1> %cmp, <4 x i16> %a, <4 x i16> %b
1005  store <4 x i16> %val, ptr addrspace(1) %out
1006  ret void
1007}
1008
; Signed minimum of two i32 values loaded from %aptr and %bptr at the element
; indexed by llvm.amdgcn.workitem.id.x, written as icmp slt + select; the result
; is stored to %out at the same index.
; All amdgcn checks expect one v_min_i32_e32; the EG checks expect MIN_INT.
1009define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) #0 {
1010; EG-LABEL: v_test_imin_slt_i32:
1011; EG:       ; %bb.0:
1012; EG-NEXT:    ALU 3, @10, KC0[CB0:0-32], KC1[]
1013; EG-NEXT:    TEX 1 @6
1014; EG-NEXT:    ALU 3, @14, KC0[CB0:0-32], KC1[]
1015; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1016; EG-NEXT:    CF_END
1017; EG-NEXT:    PAD
1018; EG-NEXT:    Fetch clause starting at 6:
1019; EG-NEXT:     VTX_READ_32 T1.X, T1.X, 0, #1
1020; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1021; EG-NEXT:    ALU clause starting at 10:
1022; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
1023; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1024; EG-NEXT:     ADD_INT T0.X, KC0[2].Z, PV.W,
1025; EG-NEXT:     ADD_INT * T1.X, KC0[2].W, PV.W,
1026; EG-NEXT:    ALU clause starting at 14:
1027; EG-NEXT:     MIN_INT T0.X, T0.X, T1.X,
1028; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, T0.W,
1029; EG-NEXT:     LSHR * T1.X, PV.W, literal.x,
1030; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1031;
1032; CI-LABEL: v_test_imin_slt_i32:
1033; CI:       ; %bb.0:
1034; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1035; CI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x4
1036; CI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
1037; CI-NEXT:    s_waitcnt lgkmcnt(0)
1038; CI-NEXT:    v_mov_b32_e32 v1, s3
1039; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
1040; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1041; CI-NEXT:    v_mov_b32_e32 v3, s5
1042; CI-NEXT:    v_add_i32_e32 v2, vcc, s4, v4
1043; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1044; CI-NEXT:    flat_load_dword v5, v[0:1]
1045; CI-NEXT:    flat_load_dword v2, v[2:3]
1046; CI-NEXT:    v_mov_b32_e32 v1, s1
1047; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v4
1048; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1049; CI-NEXT:    s_waitcnt vmcnt(0)
1050; CI-NEXT:    v_min_i32_e32 v2, v5, v2
1051; CI-NEXT:    flat_store_dword v[0:1], v2
1052; CI-NEXT:    s_endpgm
1053;
1054; VI-LABEL: v_test_imin_slt_i32:
1055; VI:       ; %bb.0:
1056; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1057; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
1058; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
1059; VI-NEXT:    s_waitcnt lgkmcnt(0)
1060; VI-NEXT:    v_mov_b32_e32 v1, s3
1061; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
1062; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1063; VI-NEXT:    v_mov_b32_e32 v3, s5
1064; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
1065; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1066; VI-NEXT:    flat_load_dword v5, v[0:1]
1067; VI-NEXT:    flat_load_dword v2, v[2:3]
1068; VI-NEXT:    v_mov_b32_e32 v1, s1
1069; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v4
1070; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1071; VI-NEXT:    s_waitcnt vmcnt(0)
1072; VI-NEXT:    v_min_i32_e32 v2, v5, v2
1073; VI-NEXT:    flat_store_dword v[0:1], v2
1074; VI-NEXT:    s_endpgm
1075;
1076; GFX9-LABEL: v_test_imin_slt_i32:
1077; GFX9:       ; %bb.0:
1078; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1079; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
1080; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1081; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1082; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1083; GFX9-NEXT:    global_load_dword v2, v0, s[4:5]
1084; GFX9-NEXT:    s_waitcnt vmcnt(0)
1085; GFX9-NEXT:    v_min_i32_e32 v1, v1, v2
1086; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1087; GFX9-NEXT:    s_endpgm
1088;
1089; GFX10-LABEL: v_test_imin_slt_i32:
1090; GFX10:       ; %bb.0:
1091; GFX10-NEXT:    s_clause 0x1
1092; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1093; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
1094; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1095; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1096; GFX10-NEXT:    s_clause 0x1
1097; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
1098; GFX10-NEXT:    global_load_dword v2, v0, s[4:5]
1099; GFX10-NEXT:    s_waitcnt vmcnt(0)
1100; GFX10-NEXT:    v_min_i32_e32 v1, v1, v2
1101; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
1102; GFX10-NEXT:    s_endpgm
1103;
1104; GFX11-LABEL: v_test_imin_slt_i32:
1105; GFX11:       ; %bb.0:
1106; GFX11-NEXT:    s_clause 0x1
1107; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
1108; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x10
1109; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1110; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1111; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1112; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1113; GFX11-NEXT:    s_clause 0x1
1114; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
1115; GFX11-NEXT:    global_load_b32 v2, v0, s[4:5]
1116; GFX11-NEXT:    s_waitcnt vmcnt(0)
1117; GFX11-NEXT:    v_min_i32_e32 v1, v1, v2
1118; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1119; GFX11-NEXT:    s_endpgm
1120  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1121  %a.gep = getelementptr inbounds i32, ptr addrspace(1) %aptr, i32 %tid
1122  %b.gep = getelementptr inbounds i32, ptr addrspace(1) %bptr, i32 %tid
1123  %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %tid
1124  %a = load i32, ptr addrspace(1) %a.gep, align 4
1125  %b = load i32, ptr addrspace(1) %b.gep, align 4
1126  %cmp = icmp slt i32 %a, %b
1127  %val = select i1 %cmp, i32 %a, i32 %b
1128  store i32 %val, ptr addrspace(1) %out.gep, align 4
1129  ret void
1130}
1131
; Signed minimum of two i16 values loaded from %aptr and %bptr at the element
; indexed by llvm.amdgcn.workitem.id.x, written as icmp slt + select; the result
; is stored to %out at the same index.
; The CI checks load with flat_load_sshort and use v_min_i32_e32; the VI and GFX9
; checks load with ushort loads and use v_min_i16_e32; the GFX10 and GFX11 checks
; use v_min_i16. The EG checks sign-extend with BFE_INT, use MIN_INT and store
; through MEM_RAT MSKOR.
1132define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) #0 {
1133; EG-LABEL: v_test_imin_slt_i16:
1134; EG:       ; %bb.0:
1135; EG-NEXT:    ALU 1, @12, KC0[CB0:0-32], KC1[]
1136; EG-NEXT:    TEX 0 @8
1137; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
1138; EG-NEXT:    TEX 0 @10
1139; EG-NEXT:    ALU 16, @15, KC0[CB0:0-32], KC1[]
1140; EG-NEXT:    MEM_RAT MSKOR T1.XW, T0.X
1141; EG-NEXT:    CF_END
1142; EG-NEXT:    PAD
1143; EG-NEXT:    Fetch clause starting at 8:
1144; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
1145; EG-NEXT:    Fetch clause starting at 10:
1146; EG-NEXT:     VTX_READ_16 T1.X, T1.X, 0, #1
1147; EG-NEXT:    ALU clause starting at 12:
1148; EG-NEXT:     LSHL * T0.W, T0.X, 1,
1149; EG-NEXT:     ADD_INT * T0.X, KC0[2].W, PV.W,
1150; EG-NEXT:    ALU clause starting at 14:
1151; EG-NEXT:     ADD_INT * T1.X, KC0[2].Z, T0.W,
1152; EG-NEXT:    ALU clause starting at 15:
1153; EG-NEXT:     BFE_INT T0.Z, T0.X, 0.0, literal.x,
1154; EG-NEXT:     BFE_INT T1.W, T1.X, 0.0, literal.x, BS:VEC_120/SCL_212
1155; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, T0.W,
1156; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1157; EG-NEXT:     AND_INT T2.W, PS, literal.x,
1158; EG-NEXT:     MIN_INT * T1.W, PV.W, PV.Z,
1159; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1160; EG-NEXT:     AND_INT T1.W, PS, literal.x,
1161; EG-NEXT:     LSHL * T2.W, PV.W, literal.y,
1162; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
1163; EG-NEXT:     LSHL T1.X, PV.W, PS,
1164; EG-NEXT:     LSHL * T1.W, literal.x, PS,
1165; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1166; EG-NEXT:     MOV T1.Y, 0.0,
1167; EG-NEXT:     MOV * T1.Z, 0.0,
1168; EG-NEXT:     LSHR * T0.X, T0.W, literal.x,
1169; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1170;
1171; CI-LABEL: v_test_imin_slt_i16:
1172; CI:       ; %bb.0:
1173; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1174; CI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x4
1175; CI-NEXT:    v_lshlrev_b32_e32 v4, 1, v0
1176; CI-NEXT:    s_waitcnt lgkmcnt(0)
1177; CI-NEXT:    v_mov_b32_e32 v1, s3
1178; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
1179; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1180; CI-NEXT:    v_mov_b32_e32 v3, s5
1181; CI-NEXT:    v_add_i32_e32 v2, vcc, s4, v4
1182; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1183; CI-NEXT:    flat_load_sshort v5, v[0:1]
1184; CI-NEXT:    flat_load_sshort v2, v[2:3]
1185; CI-NEXT:    v_mov_b32_e32 v1, s1
1186; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v4
1187; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1188; CI-NEXT:    s_waitcnt vmcnt(0)
1189; CI-NEXT:    v_min_i32_e32 v2, v5, v2
1190; CI-NEXT:    flat_store_short v[0:1], v2
1191; CI-NEXT:    s_endpgm
1192;
1193; VI-LABEL: v_test_imin_slt_i16:
1194; VI:       ; %bb.0:
1195; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1196; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
1197; VI-NEXT:    v_lshlrev_b32_e32 v4, 1, v0
1198; VI-NEXT:    s_waitcnt lgkmcnt(0)
1199; VI-NEXT:    v_mov_b32_e32 v1, s3
1200; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
1201; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1202; VI-NEXT:    v_mov_b32_e32 v3, s5
1203; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
1204; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1205; VI-NEXT:    flat_load_ushort v5, v[0:1]
1206; VI-NEXT:    flat_load_ushort v2, v[2:3]
1207; VI-NEXT:    v_mov_b32_e32 v1, s1
1208; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v4
1209; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1210; VI-NEXT:    s_waitcnt vmcnt(0)
1211; VI-NEXT:    v_min_i16_e32 v2, v5, v2
1212; VI-NEXT:    flat_store_short v[0:1], v2
1213; VI-NEXT:    s_endpgm
1214;
1215; GFX9-LABEL: v_test_imin_slt_i16:
1216; GFX9:       ; %bb.0:
1217; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1218; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
1219; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1220; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1221; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3]
1222; GFX9-NEXT:    global_load_ushort v2, v0, s[4:5]
1223; GFX9-NEXT:    s_waitcnt vmcnt(0)
1224; GFX9-NEXT:    v_min_i16_e32 v1, v1, v2
1225; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
1226; GFX9-NEXT:    s_endpgm
1227;
1228; GFX10-LABEL: v_test_imin_slt_i16:
1229; GFX10:       ; %bb.0:
1230; GFX10-NEXT:    s_clause 0x1
1231; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1232; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
1233; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1234; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1235; GFX10-NEXT:    s_clause 0x1
1236; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3]
1237; GFX10-NEXT:    global_load_ushort v2, v0, s[4:5]
1238; GFX10-NEXT:    s_waitcnt vmcnt(0)
1239; GFX10-NEXT:    v_min_i16 v1, v1, v2
1240; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
1241; GFX10-NEXT:    s_endpgm
1242;
1243; GFX11-LABEL: v_test_imin_slt_i16:
1244; GFX11:       ; %bb.0:
1245; GFX11-NEXT:    s_clause 0x1
1246; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
1247; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x10
1248; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1249; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1250; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1251; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1252; GFX11-NEXT:    s_clause 0x1
1253; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3]
1254; GFX11-NEXT:    global_load_u16 v2, v0, s[4:5]
1255; GFX11-NEXT:    s_waitcnt vmcnt(0)
1256; GFX11-NEXT:    v_min_i16 v1, v1, v2
1257; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
1258; GFX11-NEXT:    s_endpgm
1259  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1260  %a.gep = getelementptr inbounds i16, ptr addrspace(1) %aptr, i32 %tid
1261  %b.gep = getelementptr inbounds i16, ptr addrspace(1) %bptr, i32 %tid
1262  %out.gep = getelementptr inbounds i16, ptr addrspace(1) %out, i32 %tid
1263
1264  %a = load i16, ptr addrspace(1) %a.gep
1265  %b = load i16, ptr addrspace(1) %b.gep
1266  %cmp = icmp slt i16 %a, %b
1267  %val = select i1 %cmp, i16 %a, i16 %b
1268  store i16 %val, ptr addrspace(1) %out.gep
1269  ret void
1270}
1271
; Signed minimum of two i32 kernel arguments, written as icmp slt + select, with
; the result stored to %out.
; All amdgcn checks expect one s_min_i32 on the loaded argument registers; the
; EG checks expect MIN_INT on KC0[2].Z and KC0[2].W.
1272define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
1273; EG-LABEL: s_test_imin_slt_i32:
1274; EG:       ; %bb.0:
1275; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
1276; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
1277; EG-NEXT:    CF_END
1278; EG-NEXT:    PAD
1279; EG-NEXT:    ALU clause starting at 4:
1280; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
1281; EG-NEXT:     MIN_INT * T1.X, KC0[2].Z, KC0[2].W,
1282; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1283;
1284; CI-LABEL: s_test_imin_slt_i32:
1285; CI:       ; %bb.0:
1286; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1287; CI-NEXT:    s_waitcnt lgkmcnt(0)
1288; CI-NEXT:    s_min_i32 s2, s2, s3
1289; CI-NEXT:    v_mov_b32_e32 v0, s0
1290; CI-NEXT:    v_mov_b32_e32 v1, s1
1291; CI-NEXT:    v_mov_b32_e32 v2, s2
1292; CI-NEXT:    flat_store_dword v[0:1], v2
1293; CI-NEXT:    s_endpgm
1294;
1295; VI-LABEL: s_test_imin_slt_i32:
1296; VI:       ; %bb.0:
1297; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1298; VI-NEXT:    s_waitcnt lgkmcnt(0)
1299; VI-NEXT:    s_min_i32 s2, s2, s3
1300; VI-NEXT:    v_mov_b32_e32 v0, s0
1301; VI-NEXT:    v_mov_b32_e32 v1, s1
1302; VI-NEXT:    v_mov_b32_e32 v2, s2
1303; VI-NEXT:    flat_store_dword v[0:1], v2
1304; VI-NEXT:    s_endpgm
1305;
1306; GFX9-LABEL: s_test_imin_slt_i32:
1307; GFX9:       ; %bb.0:
1308; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1309; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1310; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1311; GFX9-NEXT:    s_min_i32 s2, s2, s3
1312; GFX9-NEXT:    v_mov_b32_e32 v1, s2
1313; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1314; GFX9-NEXT:    s_endpgm
1315;
1316; GFX10-LABEL: s_test_imin_slt_i32:
1317; GFX10:       ; %bb.0:
1318; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1319; GFX10-NEXT:    v_mov_b32_e32 v0, 0
1320; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1321; GFX10-NEXT:    s_min_i32 s2, s2, s3
1322; GFX10-NEXT:    v_mov_b32_e32 v1, s2
1323; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
1324; GFX10-NEXT:    s_endpgm
1325;
1326; GFX11-LABEL: s_test_imin_slt_i32:
1327; GFX11:       ; %bb.0:
1328; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
1329; GFX11-NEXT:    v_mov_b32_e32 v0, 0
1330; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1331; GFX11-NEXT:    s_min_i32 s2, s2, s3
1332; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1333; GFX11-NEXT:    v_mov_b32_e32 v1, s2
1334; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1335; GFX11-NEXT:    s_endpgm
1336  %cmp = icmp slt i32 %a, %b
1337  %val = select i1 %cmp, i32 %a, i32 %b
1338  store i32 %val, ptr addrspace(1) %out, align 4
1339  ret void
1340}
1341
; Signed minimum of two <2 x i32> kernel arguments, written as icmp slt + select,
; with the result stored to %out.
; All amdgcn checks expect two s_min_i32, one per element; the EG checks expect
; two MIN_INT.
1342define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32> %a, <2 x i32> %b) #0 {
1343; EG-LABEL: s_test_imin_slt_v2i32:
1344; EG:       ; %bb.0:
1345; EG-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
1346; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1347; EG-NEXT:    CF_END
1348; EG-NEXT:    PAD
1349; EG-NEXT:    ALU clause starting at 4:
1350; EG-NEXT:     MIN_INT * T0.Y, KC0[3].X, KC0[3].Z,
1351; EG-NEXT:     MIN_INT * T0.X, KC0[2].W, KC0[3].Y,
1352; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1353; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1354;
1355; CI-LABEL: s_test_imin_slt_v2i32:
1356; CI:       ; %bb.0:
1357; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x2
1358; CI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
1359; CI-NEXT:    s_waitcnt lgkmcnt(0)
1360; CI-NEXT:    s_min_i32 s1, s1, s3
1361; CI-NEXT:    s_min_i32 s0, s0, s2
1362; CI-NEXT:    v_mov_b32_e32 v2, s4
1363; CI-NEXT:    v_mov_b32_e32 v0, s0
1364; CI-NEXT:    v_mov_b32_e32 v1, s1
1365; CI-NEXT:    v_mov_b32_e32 v3, s5
1366; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1367; CI-NEXT:    s_endpgm
1368;
1369; VI-LABEL: s_test_imin_slt_v2i32:
1370; VI:       ; %bb.0:
1371; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x8
1372; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
1373; VI-NEXT:    s_waitcnt lgkmcnt(0)
1374; VI-NEXT:    s_min_i32 s1, s1, s3
1375; VI-NEXT:    s_min_i32 s0, s0, s2
1376; VI-NEXT:    v_mov_b32_e32 v2, s4
1377; VI-NEXT:    v_mov_b32_e32 v0, s0
1378; VI-NEXT:    v_mov_b32_e32 v1, s1
1379; VI-NEXT:    v_mov_b32_e32 v3, s5
1380; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1381; VI-NEXT:    s_endpgm
1382;
1383; GFX9-LABEL: s_test_imin_slt_v2i32:
1384; GFX9:       ; %bb.0:
1385; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x8
1386; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
1387; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1388; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1389; GFX9-NEXT:    s_min_i32 s1, s1, s3
1390; GFX9-NEXT:    s_min_i32 s0, s0, s2
1391; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1392; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1393; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
1394; GFX9-NEXT:    s_endpgm
1395;
1396; GFX10-LABEL: s_test_imin_slt_v2i32:
1397; GFX10:       ; %bb.0:
1398; GFX10-NEXT:    s_clause 0x1
1399; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x8
1400; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
1401; GFX10-NEXT:    v_mov_b32_e32 v2, 0
1402; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1403; GFX10-NEXT:    s_min_i32 s0, s0, s2
1404; GFX10-NEXT:    s_min_i32 s1, s1, s3
1405; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1406; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1407; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
1408; GFX10-NEXT:    s_endpgm
1409;
1410; GFX11-LABEL: s_test_imin_slt_v2i32:
1411; GFX11:       ; %bb.0:
1412; GFX11-NEXT:    s_clause 0x1
1413; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x8
1414; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
1415; GFX11-NEXT:    v_mov_b32_e32 v2, 0
1416; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1417; GFX11-NEXT:    s_min_i32 s0, s0, s2
1418; GFX11-NEXT:    s_min_i32 s1, s1, s3
1419; GFX11-NEXT:    v_mov_b32_e32 v0, s0
1420; GFX11-NEXT:    v_mov_b32_e32 v1, s1
1421; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
1422; GFX11-NEXT:    s_endpgm
1423  %cmp = icmp slt <2 x i32> %a, %b
1424  %val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> %b
1425  store <2 x i32> %val, ptr addrspace(1) %out
1426  ret void
1427}
1428
; Signed minimum of an i32 kernel argument and the constant 8, written as
; icmp slt + select, with the result stored to %out.
; All amdgcn checks expect s_min_i32 with 8 as an immediate operand; the EG
; checks expect MIN_INT against a literal 8.
1429define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a) #0 {
1430; EG-LABEL: s_test_imin_slt_imm_i32:
1431; EG:       ; %bb.0:
1432; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
1433; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
1434; EG-NEXT:    CF_END
1435; EG-NEXT:    PAD
1436; EG-NEXT:    ALU clause starting at 4:
1437; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
1438; EG-NEXT:     MIN_INT * T1.X, KC0[2].Z, literal.y,
1439; EG-NEXT:    2(2.802597e-45), 8(1.121039e-44)
1440;
1441; CI-LABEL: s_test_imin_slt_imm_i32:
1442; CI:       ; %bb.0:
1443; CI-NEXT:    s_load_dword s2, s[8:9], 0x2
1444; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1445; CI-NEXT:    s_waitcnt lgkmcnt(0)
1446; CI-NEXT:    s_min_i32 s2, s2, 8
1447; CI-NEXT:    v_mov_b32_e32 v0, s0
1448; CI-NEXT:    v_mov_b32_e32 v1, s1
1449; CI-NEXT:    v_mov_b32_e32 v2, s2
1450; CI-NEXT:    flat_store_dword v[0:1], v2
1451; CI-NEXT:    s_endpgm
1452;
1453; VI-LABEL: s_test_imin_slt_imm_i32:
1454; VI:       ; %bb.0:
1455; VI-NEXT:    s_load_dword s2, s[8:9], 0x8
1456; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1457; VI-NEXT:    s_waitcnt lgkmcnt(0)
1458; VI-NEXT:    s_min_i32 s2, s2, 8
1459; VI-NEXT:    v_mov_b32_e32 v0, s0
1460; VI-NEXT:    v_mov_b32_e32 v1, s1
1461; VI-NEXT:    v_mov_b32_e32 v2, s2
1462; VI-NEXT:    flat_store_dword v[0:1], v2
1463; VI-NEXT:    s_endpgm
1464;
1465; GFX9-LABEL: s_test_imin_slt_imm_i32:
1466; GFX9:       ; %bb.0:
1467; GFX9-NEXT:    s_load_dword s2, s[8:9], 0x8
1468; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1469; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1470; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1471; GFX9-NEXT:    s_min_i32 s2, s2, 8
1472; GFX9-NEXT:    v_mov_b32_e32 v1, s2
1473; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1474; GFX9-NEXT:    s_endpgm
1475;
1476; GFX10-LABEL: s_test_imin_slt_imm_i32:
1477; GFX10:       ; %bb.0:
1478; GFX10-NEXT:    s_clause 0x1
1479; GFX10-NEXT:    s_load_dword s2, s[8:9], 0x8
1480; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1481; GFX10-NEXT:    v_mov_b32_e32 v0, 0
1482; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1483; GFX10-NEXT:    s_min_i32 s2, s2, 8
1484; GFX10-NEXT:    v_mov_b32_e32 v1, s2
1485; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
1486; GFX10-NEXT:    s_endpgm
1487;
1488; GFX11-LABEL: s_test_imin_slt_imm_i32:
1489; GFX11:       ; %bb.0:
1490; GFX11-NEXT:    s_clause 0x1
1491; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x8
1492; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1493; GFX11-NEXT:    v_mov_b32_e32 v0, 0
1494; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1495; GFX11-NEXT:    s_min_i32 s2, s2, 8
1496; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1497; GFX11-NEXT:    v_mov_b32_e32 v1, s2
1498; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1499; GFX11-NEXT:    s_endpgm
1500  %cmp = icmp slt i32 %a, 8
1501  %val = select i1 %cmp, i32 %a, i32 8
1502  store i32 %val, ptr addrspace(1) %out, align 4
1503  ret void
1504}
1505
; Signed minimum of an i32 kernel argument and the constant 8, written as
; icmp sle + select, with the result stored to %out.
; All amdgcn checks expect s_min_i32 with 8 as an immediate operand; the EG
; checks expect MIN_INT against a literal 8.
1506define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a) #0 {
1507; EG-LABEL: s_test_imin_sle_imm_i32:
1508; EG:       ; %bb.0:
1509; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
1510; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
1511; EG-NEXT:    CF_END
1512; EG-NEXT:    PAD
1513; EG-NEXT:    ALU clause starting at 4:
1514; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
1515; EG-NEXT:     MIN_INT * T1.X, KC0[2].Z, literal.y,
1516; EG-NEXT:    2(2.802597e-45), 8(1.121039e-44)
1517;
1518; CI-LABEL: s_test_imin_sle_imm_i32:
1519; CI:       ; %bb.0:
1520; CI-NEXT:    s_load_dword s2, s[8:9], 0x2
1521; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1522; CI-NEXT:    s_waitcnt lgkmcnt(0)
1523; CI-NEXT:    s_min_i32 s2, s2, 8
1524; CI-NEXT:    v_mov_b32_e32 v0, s0
1525; CI-NEXT:    v_mov_b32_e32 v1, s1
1526; CI-NEXT:    v_mov_b32_e32 v2, s2
1527; CI-NEXT:    flat_store_dword v[0:1], v2
1528; CI-NEXT:    s_endpgm
1529;
1530; VI-LABEL: s_test_imin_sle_imm_i32:
1531; VI:       ; %bb.0:
1532; VI-NEXT:    s_load_dword s2, s[8:9], 0x8
1533; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1534; VI-NEXT:    s_waitcnt lgkmcnt(0)
1535; VI-NEXT:    s_min_i32 s2, s2, 8
1536; VI-NEXT:    v_mov_b32_e32 v0, s0
1537; VI-NEXT:    v_mov_b32_e32 v1, s1
1538; VI-NEXT:    v_mov_b32_e32 v2, s2
1539; VI-NEXT:    flat_store_dword v[0:1], v2
1540; VI-NEXT:    s_endpgm
1541;
1542; GFX9-LABEL: s_test_imin_sle_imm_i32:
1543; GFX9:       ; %bb.0:
1544; GFX9-NEXT:    s_load_dword s2, s[8:9], 0x8
1545; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1546; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1547; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1548; GFX9-NEXT:    s_min_i32 s2, s2, 8
1549; GFX9-NEXT:    v_mov_b32_e32 v1, s2
1550; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1551; GFX9-NEXT:    s_endpgm
1552;
1553; GFX10-LABEL: s_test_imin_sle_imm_i32:
1554; GFX10:       ; %bb.0:
1555; GFX10-NEXT:    s_clause 0x1
1556; GFX10-NEXT:    s_load_dword s2, s[8:9], 0x8
1557; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1558; GFX10-NEXT:    v_mov_b32_e32 v0, 0
1559; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1560; GFX10-NEXT:    s_min_i32 s2, s2, 8
1561; GFX10-NEXT:    v_mov_b32_e32 v1, s2
1562; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
1563; GFX10-NEXT:    s_endpgm
1564;
1565; GFX11-LABEL: s_test_imin_sle_imm_i32:
1566; GFX11:       ; %bb.0:
1567; GFX11-NEXT:    s_clause 0x1
1568; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x8
1569; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1570; GFX11-NEXT:    v_mov_b32_e32 v0, 0
1571; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1572; GFX11-NEXT:    s_min_i32 s2, s2, 8
1573; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1574; GFX11-NEXT:    v_mov_b32_e32 v1, s2
1575; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1576; GFX11-NEXT:    s_endpgm
1577  %cmp = icmp sle i32 %a, 8
1578  %val = select i1 %cmp, i32 %a, i32 8
1579  store i32 %val, ptr addrspace(1) %out, align 4
1580  ret void
1581}
1582
; Per-work-item unsigned min of two loaded i32 values: the index comes from
; llvm.amdgcn.workitem.id.x, one i32 is loaded from %a.ptr and one from
; %b.ptr at that index, `icmp ule` + select keeps the smaller, and the result
; is stored to %out at the same index. The assertions below expect a single
; unsigned-min instruction for the compare+select pair (MIN_UINT in the r600
; run, v_min_u32 in every amdgcn run).
1583define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
1584; EG-LABEL: v_test_umin_ule_i32:
1585; EG:       ; %bb.0:
1586; EG-NEXT:    ALU 3, @10, KC0[CB0:0-32], KC1[]
1587; EG-NEXT:    TEX 1 @6
1588; EG-NEXT:    ALU 3, @14, KC0[CB0:0-32], KC1[]
1589; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1590; EG-NEXT:    CF_END
1591; EG-NEXT:    PAD
1592; EG-NEXT:    Fetch clause starting at 6:
1593; EG-NEXT:     VTX_READ_32 T1.X, T1.X, 0, #1
1594; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1595; EG-NEXT:    ALU clause starting at 10:
1596; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
1597; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1598; EG-NEXT:     ADD_INT T0.X, KC0[2].Z, PV.W,
1599; EG-NEXT:     ADD_INT * T1.X, KC0[2].W, PV.W,
1600; EG-NEXT:    ALU clause starting at 14:
1601; EG-NEXT:     MIN_UINT T0.X, T0.X, T1.X,
1602; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, T0.W,
1603; EG-NEXT:     LSHR * T1.X, PV.W, literal.x,
1604; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1605;
1606; CI-LABEL: v_test_umin_ule_i32:
1607; CI:       ; %bb.0:
1608; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1609; CI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x4
1610; CI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
1611; CI-NEXT:    s_waitcnt lgkmcnt(0)
1612; CI-NEXT:    v_mov_b32_e32 v1, s3
1613; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
1614; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1615; CI-NEXT:    v_mov_b32_e32 v3, s5
1616; CI-NEXT:    v_add_i32_e32 v2, vcc, s4, v4
1617; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1618; CI-NEXT:    flat_load_dword v5, v[0:1]
1619; CI-NEXT:    flat_load_dword v2, v[2:3]
1620; CI-NEXT:    v_mov_b32_e32 v1, s1
1621; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v4
1622; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1623; CI-NEXT:    s_waitcnt vmcnt(0)
1624; CI-NEXT:    v_min_u32_e32 v2, v5, v2
1625; CI-NEXT:    flat_store_dword v[0:1], v2
1626; CI-NEXT:    s_endpgm
1627;
1628; VI-LABEL: v_test_umin_ule_i32:
1629; VI:       ; %bb.0:
1630; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1631; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
1632; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
1633; VI-NEXT:    s_waitcnt lgkmcnt(0)
1634; VI-NEXT:    v_mov_b32_e32 v1, s3
1635; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
1636; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1637; VI-NEXT:    v_mov_b32_e32 v3, s5
1638; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
1639; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1640; VI-NEXT:    flat_load_dword v5, v[0:1]
1641; VI-NEXT:    flat_load_dword v2, v[2:3]
1642; VI-NEXT:    v_mov_b32_e32 v1, s1
1643; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v4
1644; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1645; VI-NEXT:    s_waitcnt vmcnt(0)
1646; VI-NEXT:    v_min_u32_e32 v2, v5, v2
1647; VI-NEXT:    flat_store_dword v[0:1], v2
1648; VI-NEXT:    s_endpgm
1649;
1650; GFX9-LABEL: v_test_umin_ule_i32:
1651; GFX9:       ; %bb.0:
1652; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1653; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
1654; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1655; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1656; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1657; GFX9-NEXT:    global_load_dword v2, v0, s[4:5]
1658; GFX9-NEXT:    s_waitcnt vmcnt(0)
1659; GFX9-NEXT:    v_min_u32_e32 v1, v1, v2
1660; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1661; GFX9-NEXT:    s_endpgm
1662;
1663; GFX10-LABEL: v_test_umin_ule_i32:
1664; GFX10:       ; %bb.0:
1665; GFX10-NEXT:    s_clause 0x1
1666; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1667; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
1668; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1669; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1670; GFX10-NEXT:    s_clause 0x1
1671; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
1672; GFX10-NEXT:    global_load_dword v2, v0, s[4:5]
1673; GFX10-NEXT:    s_waitcnt vmcnt(0)
1674; GFX10-NEXT:    v_min_u32_e32 v1, v1, v2
1675; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
1676; GFX10-NEXT:    s_endpgm
1677;
1678; GFX11-LABEL: v_test_umin_ule_i32:
1679; GFX11:       ; %bb.0:
1680; GFX11-NEXT:    s_clause 0x1
1681; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
1682; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x10
1683; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1684; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1685; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1686; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1687; GFX11-NEXT:    s_clause 0x1
1688; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
1689; GFX11-NEXT:    global_load_b32 v2, v0, s[4:5]
1690; GFX11-NEXT:    s_waitcnt vmcnt(0)
1691; GFX11-NEXT:    v_min_u32_e32 v1, v1, v2
1692; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1693; GFX11-NEXT:    s_endpgm
1694  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1695  %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr, i32 %tid
1696  %b.gep = getelementptr inbounds i32, ptr addrspace(1) %b.ptr, i32 %tid
1697  %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %tid
1698  %a = load i32, ptr addrspace(1) %a.gep, align 4
1699  %b = load i32, ptr addrspace(1) %b.gep, align 4
1700  %cmp = icmp ule i32 %a, %b
1701  %val = select i1 %cmp, i32 %a, i32 %b
1702  store i32 %val, ptr addrspace(1) %out.gep, align 4
1703  ret void
1704}
1705
; Per-work-item unsigned min of two <3 x i32> vectors selected with a vector
; `icmp ule`. The loads and the store carry no explicit alignment. In the
; amdgcn runs the assertions expect the work-item index shifted left by 4,
; one 96-bit load per operand (dwordx3 / b96), three per-lane v_min_u32
; instructions and one 96-bit store. In the r600 run they expect two 128-bit
; vertex reads, three MIN_UINT instructions and the result written by two
; separate stores (T0.XY and T2.X).
1706define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
1707; EG-LABEL: v_test_umin_ule_v3i32:
1708; EG:       ; %bb.0:
1709; EG-NEXT:    ALU 3, @10, KC0[CB0:0-32], KC1[]
1710; EG-NEXT:    TEX 1 @6
1711; EG-NEXT:    ALU 9, @14, KC0[CB0:0-32], KC1[]
1712; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.X, T3.X, 0
1713; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1714; EG-NEXT:    CF_END
1715; EG-NEXT:    Fetch clause starting at 6:
1716; EG-NEXT:     VTX_READ_128 T1.XYZW, T1.X, 0, #1
1717; EG-NEXT:     VTX_READ_128 T2.XYZW, T0.X, 0, #1
1718; EG-NEXT:    ALU clause starting at 10:
1719; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
1720; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
1721; EG-NEXT:     ADD_INT T0.X, KC0[2].Z, PV.W,
1722; EG-NEXT:     ADD_INT * T1.X, KC0[2].W, PV.W,
1723; EG-NEXT:    ALU clause starting at 14:
1724; EG-NEXT:     MIN_UINT * T0.Y, T2.Y, T1.Y,
1725; EG-NEXT:     MIN_UINT T0.X, T2.X, T1.X,
1726; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, T0.W,
1727; EG-NEXT:     LSHR T1.X, PV.W, literal.x,
1728; EG-NEXT:     MIN_UINT * T2.X, T2.Z, T1.Z,
1729; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1730; EG-NEXT:     ADD_INT * T0.W, T0.W, literal.x,
1731; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
1732; EG-NEXT:     LSHR * T3.X, PV.W, literal.x,
1733; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1734;
1735; CI-LABEL: v_test_umin_ule_v3i32:
1736; CI:       ; %bb.0:
1737; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1738; CI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x4
1739; CI-NEXT:    v_lshlrev_b32_e32 v6, 4, v0
1740; CI-NEXT:    s_waitcnt lgkmcnt(0)
1741; CI-NEXT:    v_mov_b32_e32 v1, s3
1742; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v6
1743; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1744; CI-NEXT:    v_mov_b32_e32 v2, s5
1745; CI-NEXT:    v_add_i32_e32 v3, vcc, s4, v6
1746; CI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v2, vcc
1747; CI-NEXT:    flat_load_dwordx3 v[0:2], v[0:1]
1748; CI-NEXT:    flat_load_dwordx3 v[3:5], v[3:4]
1749; CI-NEXT:    v_mov_b32_e32 v7, s1
1750; CI-NEXT:    v_add_i32_e32 v6, vcc, s0, v6
1751; CI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
1752; CI-NEXT:    s_waitcnt vmcnt(0)
1753; CI-NEXT:    v_min_u32_e32 v2, v2, v5
1754; CI-NEXT:    v_min_u32_e32 v1, v1, v4
1755; CI-NEXT:    v_min_u32_e32 v0, v0, v3
1756; CI-NEXT:    flat_store_dwordx3 v[6:7], v[0:2]
1757; CI-NEXT:    s_endpgm
1758;
1759; VI-LABEL: v_test_umin_ule_v3i32:
1760; VI:       ; %bb.0:
1761; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1762; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
1763; VI-NEXT:    v_lshlrev_b32_e32 v6, 4, v0
1764; VI-NEXT:    s_waitcnt lgkmcnt(0)
1765; VI-NEXT:    v_mov_b32_e32 v1, s3
1766; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
1767; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1768; VI-NEXT:    v_mov_b32_e32 v2, s5
1769; VI-NEXT:    v_add_u32_e32 v3, vcc, s4, v6
1770; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v2, vcc
1771; VI-NEXT:    flat_load_dwordx3 v[0:2], v[0:1]
1772; VI-NEXT:    flat_load_dwordx3 v[3:5], v[3:4]
1773; VI-NEXT:    v_mov_b32_e32 v7, s1
1774; VI-NEXT:    v_add_u32_e32 v6, vcc, s0, v6
1775; VI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
1776; VI-NEXT:    s_waitcnt vmcnt(0)
1777; VI-NEXT:    v_min_u32_e32 v2, v2, v5
1778; VI-NEXT:    v_min_u32_e32 v1, v1, v4
1779; VI-NEXT:    v_min_u32_e32 v0, v0, v3
1780; VI-NEXT:    flat_store_dwordx3 v[6:7], v[0:2]
1781; VI-NEXT:    s_endpgm
1782;
1783; GFX9-LABEL: v_test_umin_ule_v3i32:
1784; GFX9:       ; %bb.0:
1785; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1786; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
1787; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 4, v0
1788; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1789; GFX9-NEXT:    global_load_dwordx3 v[0:2], v6, s[2:3]
1790; GFX9-NEXT:    global_load_dwordx3 v[3:5], v6, s[4:5]
1791; GFX9-NEXT:    s_waitcnt vmcnt(0)
1792; GFX9-NEXT:    v_min_u32_e32 v2, v2, v5
1793; GFX9-NEXT:    v_min_u32_e32 v1, v1, v4
1794; GFX9-NEXT:    v_min_u32_e32 v0, v0, v3
1795; GFX9-NEXT:    global_store_dwordx3 v6, v[0:2], s[0:1]
1796; GFX9-NEXT:    s_endpgm
1797;
1798; GFX10-LABEL: v_test_umin_ule_v3i32:
1799; GFX10:       ; %bb.0:
1800; GFX10-NEXT:    s_clause 0x1
1801; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1802; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
1803; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 4, v0
1804; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1805; GFX10-NEXT:    s_clause 0x1
1806; GFX10-NEXT:    global_load_dwordx3 v[0:2], v6, s[2:3]
1807; GFX10-NEXT:    global_load_dwordx3 v[3:5], v6, s[4:5]
1808; GFX10-NEXT:    s_waitcnt vmcnt(0)
1809; GFX10-NEXT:    v_min_u32_e32 v2, v2, v5
1810; GFX10-NEXT:    v_min_u32_e32 v1, v1, v4
1811; GFX10-NEXT:    v_min_u32_e32 v0, v0, v3
1812; GFX10-NEXT:    global_store_dwordx3 v6, v[0:2], s[0:1]
1813; GFX10-NEXT:    s_endpgm
1814;
1815; GFX11-LABEL: v_test_umin_ule_v3i32:
1816; GFX11:       ; %bb.0:
1817; GFX11-NEXT:    s_clause 0x1
1818; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
1819; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x10
1820; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1821; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1822; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 4, v0
1823; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1824; GFX11-NEXT:    s_clause 0x1
1825; GFX11-NEXT:    global_load_b96 v[0:2], v6, s[2:3]
1826; GFX11-NEXT:    global_load_b96 v[3:5], v6, s[4:5]
1827; GFX11-NEXT:    s_waitcnt vmcnt(0)
1828; GFX11-NEXT:    v_min_u32_e32 v2, v2, v5
1829; GFX11-NEXT:    v_min_u32_e32 v1, v1, v4
1830; GFX11-NEXT:    v_min_u32_e32 v0, v0, v3
1831; GFX11-NEXT:    global_store_b96 v6, v[0:2], s[0:1]
1832; GFX11-NEXT:    s_endpgm
1833  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1834  %a.gep = getelementptr inbounds <3 x i32>, ptr addrspace(1) %a.ptr, i32 %tid
1835  %b.gep = getelementptr inbounds <3 x i32>, ptr addrspace(1) %b.ptr, i32 %tid
1836  %out.gep = getelementptr inbounds <3 x i32>, ptr addrspace(1) %out, i32 %tid
1837
1838  %a = load <3 x i32>, ptr addrspace(1) %a.gep
1839  %b = load <3 x i32>, ptr addrspace(1) %b.gep
1840  %cmp = icmp ule <3 x i32> %a, %b
1841  %val = select <3 x i1> %cmp, <3 x i32> %a, <3 x i32> %b
1842  store <3 x i32> %val, ptr addrspace(1) %out.gep
1843  ret void
1844}
1845
1846; FIXME: Reduce unused packed component to scalar
1847
; Per-work-item unsigned min of two <3 x i16> vectors selected with a vector
; `icmp ule`. The expected lowering differs by subtarget:
;   - kaveri run: each 16-bit lane is widened to 32 bits (shift / and with
;     0xffff), three v_min_u32 are issued, and the low two lanes are
;     repacked with a shift and an or.
;   - tonga run: v_min_u16 for lanes 0 and 2, plus an SDWA v_min_u16 that
;     works on the upper 16-bit word for lane 1.
;   - gfx900 and newer: two v_pk_min_u16, one per 32-bit register.
; In every amdgcn run the result is written as one dword store plus one
; 16-bit store at byte offset 4.
; NOTE(review): the FIXME preceding this function presumably refers to the
; packed min on the register holding lane 2, of which only 16 bits are
; stored -- confirm.
1848define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
1849; EG-LABEL: v_test_umin_ule_v3i16:
1850; EG:       ; %bb.0:
1851; EG-NEXT:    ALU 3, @20, KC0[CB0:0-32], KC1[]
1852; EG-NEXT:    TEX 1 @8
1853; EG-NEXT:    ALU 11, @24, KC0[CB0:0-32], KC1[]
1854; EG-NEXT:    TEX 3 @12
1855; EG-NEXT:    ALU 8, @36, KC0[], KC1[]
1856; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.X, T8.X, 0
1857; EG-NEXT:    MEM_RAT MSKOR T7.XW, T0.X
1858; EG-NEXT:    CF_END
1859; EG-NEXT:    Fetch clause starting at 8:
1860; EG-NEXT:     VTX_READ_16 T7.X, T6.X, 4, #1
1861; EG-NEXT:     VTX_READ_16 T8.X, T0.X, 4, #1
1862; EG-NEXT:    Fetch clause starting at 12:
1863; EG-NEXT:     VTX_READ_16 T8.X, T6.X, 0, #1
1864; EG-NEXT:     VTX_READ_16 T9.X, T0.X, 0, #1
1865; EG-NEXT:     VTX_READ_16 T6.X, T6.X, 2, #1
1866; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 2, #1
1867; EG-NEXT:    ALU clause starting at 20:
1868; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
1869; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1870; EG-NEXT:     ADD_INT T0.X, KC0[2].Z, PV.W,
1871; EG-NEXT:     ADD_INT * T6.X, KC0[2].W, PV.W,
1872; EG-NEXT:    ALU clause starting at 24:
1873; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, T0.W,
1874; EG-NEXT:     ADD_INT * T1.W, PV.W, literal.x,
1875; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
1876; EG-NEXT:     AND_INT * T2.W, PV.W, literal.x,
1877; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1878; EG-NEXT:     LSHL T2.W, PV.W, literal.x,
1879; EG-NEXT:     MIN_UINT * T3.W, T8.X, T7.X,
1880; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1881; EG-NEXT:     LSHL T7.X, PS, PV.W,
1882; EG-NEXT:     LSHL * T7.W, literal.x, PV.W,
1883; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1884; EG-NEXT:     MOV * T7.Y, 0.0,
1885; EG-NEXT:    ALU clause starting at 36:
1886; EG-NEXT:     MOV T7.Z, 0.0,
1887; EG-NEXT:     MIN_UINT * T2.W, T0.X, T6.X,
1888; EG-NEXT:     LSHR T0.X, T1.W, literal.x,
1889; EG-NEXT:     LSHL T1.W, PV.W, literal.y,
1890; EG-NEXT:     MIN_UINT * T2.W, T9.X, T8.X,
1891; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
1892; EG-NEXT:     OR_INT T6.X, PV.W, PS,
1893; EG-NEXT:     LSHR * T8.X, T0.W, literal.x,
1894; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1895;
1896; CI-LABEL: v_test_umin_ule_v3i16:
1897; CI:       ; %bb.0:
1898; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1899; CI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x4
1900; CI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
1901; CI-NEXT:    s_waitcnt lgkmcnt(0)
1902; CI-NEXT:    v_mov_b32_e32 v1, s3
1903; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
1904; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1905; CI-NEXT:    v_mov_b32_e32 v3, s5
1906; CI-NEXT:    v_add_i32_e32 v2, vcc, s4, v4
1907; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1908; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1909; CI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
1910; CI-NEXT:    v_mov_b32_e32 v5, s1
1911; CI-NEXT:    v_add_i32_e32 v4, vcc, s0, v4
1912; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
1913; CI-NEXT:    v_add_i32_e32 v6, vcc, 4, v4
1914; CI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v5, vcc
1915; CI-NEXT:    s_waitcnt vmcnt(1)
1916; CI-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
1917; CI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1918; CI-NEXT:    s_waitcnt vmcnt(0)
1919; CI-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
1920; CI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
1921; CI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1922; CI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
1923; CI-NEXT:    v_min_u32_e32 v0, v0, v2
1924; CI-NEXT:    v_min_u32_e32 v2, v8, v9
1925; CI-NEXT:    v_min_u32_e32 v1, v1, v3
1926; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
1927; CI-NEXT:    v_or_b32_e32 v0, v0, v2
1928; CI-NEXT:    flat_store_short v[6:7], v1
1929; CI-NEXT:    flat_store_dword v[4:5], v0
1930; CI-NEXT:    s_endpgm
1931;
1932; VI-LABEL: v_test_umin_ule_v3i16:
1933; VI:       ; %bb.0:
1934; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1935; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
1936; VI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
1937; VI-NEXT:    s_waitcnt lgkmcnt(0)
1938; VI-NEXT:    v_mov_b32_e32 v1, s3
1939; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
1940; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1941; VI-NEXT:    v_mov_b32_e32 v3, s5
1942; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
1943; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1944; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1945; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
1946; VI-NEXT:    v_mov_b32_e32 v5, s1
1947; VI-NEXT:    v_add_u32_e32 v4, vcc, s0, v4
1948; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
1949; VI-NEXT:    v_add_u32_e32 v6, vcc, 4, v4
1950; VI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v5, vcc
1951; VI-NEXT:    s_waitcnt vmcnt(0)
1952; VI-NEXT:    v_min_u16_e32 v8, v0, v2
1953; VI-NEXT:    v_min_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1954; VI-NEXT:    v_min_u16_e32 v1, v1, v3
1955; VI-NEXT:    v_or_b32_e32 v0, v8, v0
1956; VI-NEXT:    flat_store_short v[6:7], v1
1957; VI-NEXT:    flat_store_dword v[4:5], v0
1958; VI-NEXT:    s_endpgm
1959;
1960; GFX9-LABEL: v_test_umin_ule_v3i16:
1961; GFX9:       ; %bb.0:
1962; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1963; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
1964; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
1965; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1966; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[2:3]
1967; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[4:5]
1968; GFX9-NEXT:    s_waitcnt vmcnt(0)
1969; GFX9-NEXT:    v_pk_min_u16 v1, v1, v3
1970; GFX9-NEXT:    v_pk_min_u16 v0, v0, v2
1971; GFX9-NEXT:    global_store_short v4, v1, s[0:1] offset:4
1972; GFX9-NEXT:    global_store_dword v4, v0, s[0:1]
1973; GFX9-NEXT:    s_endpgm
1974;
1975; GFX10-LABEL: v_test_umin_ule_v3i16:
1976; GFX10:       ; %bb.0:
1977; GFX10-NEXT:    s_clause 0x1
1978; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1979; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
1980; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
1981; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1982; GFX10-NEXT:    s_clause 0x1
1983; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[2:3]
1984; GFX10-NEXT:    global_load_dwordx2 v[2:3], v4, s[4:5]
1985; GFX10-NEXT:    s_waitcnt vmcnt(0)
1986; GFX10-NEXT:    v_pk_min_u16 v1, v1, v3
1987; GFX10-NEXT:    v_pk_min_u16 v0, v0, v2
1988; GFX10-NEXT:    global_store_short v4, v1, s[0:1] offset:4
1989; GFX10-NEXT:    global_store_dword v4, v0, s[0:1]
1990; GFX10-NEXT:    s_endpgm
1991;
1992; GFX11-LABEL: v_test_umin_ule_v3i16:
1993; GFX11:       ; %bb.0:
1994; GFX11-NEXT:    s_clause 0x1
1995; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
1996; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x10
1997; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1998; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1999; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
2000; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2001; GFX11-NEXT:    s_clause 0x1
2002; GFX11-NEXT:    global_load_b64 v[0:1], v4, s[2:3]
2003; GFX11-NEXT:    global_load_b64 v[2:3], v4, s[4:5]
2004; GFX11-NEXT:    s_waitcnt vmcnt(0)
2005; GFX11-NEXT:    v_pk_min_u16 v1, v1, v3
2006; GFX11-NEXT:    v_pk_min_u16 v0, v0, v2
2007; GFX11-NEXT:    s_clause 0x1
2008; GFX11-NEXT:    global_store_b16 v4, v1, s[0:1] offset:4
2009; GFX11-NEXT:    global_store_b32 v4, v0, s[0:1]
2010; GFX11-NEXT:    s_endpgm
2011  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2012  %a.gep = getelementptr inbounds <3 x i16>, ptr addrspace(1) %a.ptr, i32 %tid
2013  %b.gep = getelementptr inbounds <3 x i16>, ptr addrspace(1) %b.ptr, i32 %tid
2014  %out.gep = getelementptr inbounds <3 x i16>, ptr addrspace(1) %out, i32 %tid
2015
2016  %a = load <3 x i16>, ptr addrspace(1) %a.gep
2017  %b = load <3 x i16>, ptr addrspace(1) %b.gep
2018  %cmp = icmp ule <3 x i16> %a, %b
2019  %val = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
2020  store <3 x i16> %val, ptr addrspace(1) %out.gep
2021  ret void
2022}
2023
; Scalar unsigned min of two i32 kernel arguments: `icmp ule` + select on %a
; and %b, result stored to %out. No work-item id is involved. The assertions
; below expect the pair to become one s_min_u32 on scalar registers in every
; amdgcn run, and one MIN_UINT reading both operands straight from the
; constant buffer (KC0[2].Z and KC0[2].W) in the r600 run.
2024define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
2025; EG-LABEL: s_test_umin_ule_i32:
2026; EG:       ; %bb.0:
2027; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
2028; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
2029; EG-NEXT:    CF_END
2030; EG-NEXT:    PAD
2031; EG-NEXT:    ALU clause starting at 4:
2032; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
2033; EG-NEXT:     MIN_UINT * T1.X, KC0[2].Z, KC0[2].W,
2034; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2035;
2036; CI-LABEL: s_test_umin_ule_i32:
2037; CI:       ; %bb.0:
2038; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2039; CI-NEXT:    s_waitcnt lgkmcnt(0)
2040; CI-NEXT:    s_min_u32 s2, s2, s3
2041; CI-NEXT:    v_mov_b32_e32 v0, s0
2042; CI-NEXT:    v_mov_b32_e32 v1, s1
2043; CI-NEXT:    v_mov_b32_e32 v2, s2
2044; CI-NEXT:    flat_store_dword v[0:1], v2
2045; CI-NEXT:    s_endpgm
2046;
2047; VI-LABEL: s_test_umin_ule_i32:
2048; VI:       ; %bb.0:
2049; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2050; VI-NEXT:    s_waitcnt lgkmcnt(0)
2051; VI-NEXT:    s_min_u32 s2, s2, s3
2052; VI-NEXT:    v_mov_b32_e32 v0, s0
2053; VI-NEXT:    v_mov_b32_e32 v1, s1
2054; VI-NEXT:    v_mov_b32_e32 v2, s2
2055; VI-NEXT:    flat_store_dword v[0:1], v2
2056; VI-NEXT:    s_endpgm
2057;
2058; GFX9-LABEL: s_test_umin_ule_i32:
2059; GFX9:       ; %bb.0:
2060; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2061; GFX9-NEXT:    v_mov_b32_e32 v0, 0
2062; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2063; GFX9-NEXT:    s_min_u32 s2, s2, s3
2064; GFX9-NEXT:    v_mov_b32_e32 v1, s2
2065; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
2066; GFX9-NEXT:    s_endpgm
2067;
2068; GFX10-LABEL: s_test_umin_ule_i32:
2069; GFX10:       ; %bb.0:
2070; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2071; GFX10-NEXT:    v_mov_b32_e32 v0, 0
2072; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2073; GFX10-NEXT:    s_min_u32 s2, s2, s3
2074; GFX10-NEXT:    v_mov_b32_e32 v1, s2
2075; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
2076; GFX10-NEXT:    s_endpgm
2077;
2078; GFX11-LABEL: s_test_umin_ule_i32:
2079; GFX11:       ; %bb.0:
2080; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
2081; GFX11-NEXT:    v_mov_b32_e32 v0, 0
2082; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2083; GFX11-NEXT:    s_min_u32 s2, s2, s3
2084; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2085; GFX11-NEXT:    v_mov_b32_e32 v1, s2
2086; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
2087; GFX11-NEXT:    s_endpgm
2088  %cmp = icmp ule i32 %a, %b
2089  %val = select i1 %cmp, i32 %a, i32 %b
2090  store i32 %val, ptr addrspace(1) %out, align 4
2091  ret void
2092}
2093
; Per-work-item unsigned min of two loaded i32 values using the strict
; predicate: the index comes from llvm.amdgcn.workitem.id.x, one i32 is
; loaded from %a.ptr and one from %b.ptr, `icmp ult` + select keeps the
; smaller, and the result is stored to %out at the same index. The
; assertions below expect the strict compare to be matched as a single
; unsigned-min instruction as well (MIN_UINT in the r600 run, v_min_u32 in
; every amdgcn run).
2094define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
2095; EG-LABEL: v_test_umin_ult_i32:
2096; EG:       ; %bb.0:
2097; EG-NEXT:    ALU 3, @10, KC0[CB0:0-32], KC1[]
2098; EG-NEXT:    TEX 1 @6
2099; EG-NEXT:    ALU 3, @14, KC0[CB0:0-32], KC1[]
2100; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
2101; EG-NEXT:    CF_END
2102; EG-NEXT:    PAD
2103; EG-NEXT:    Fetch clause starting at 6:
2104; EG-NEXT:     VTX_READ_32 T1.X, T1.X, 0, #1
2105; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
2106; EG-NEXT:    ALU clause starting at 10:
2107; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
2108; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2109; EG-NEXT:     ADD_INT T0.X, KC0[2].Z, PV.W,
2110; EG-NEXT:     ADD_INT * T1.X, KC0[2].W, PV.W,
2111; EG-NEXT:    ALU clause starting at 14:
2112; EG-NEXT:     MIN_UINT T0.X, T0.X, T1.X,
2113; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, T0.W,
2114; EG-NEXT:     LSHR * T1.X, PV.W, literal.x,
2115; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2116;
2117; CI-LABEL: v_test_umin_ult_i32:
2118; CI:       ; %bb.0:
2119; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2120; CI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x4
2121; CI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
2122; CI-NEXT:    s_waitcnt lgkmcnt(0)
2123; CI-NEXT:    v_mov_b32_e32 v1, s3
2124; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
2125; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2126; CI-NEXT:    v_mov_b32_e32 v3, s5
2127; CI-NEXT:    v_add_i32_e32 v2, vcc, s4, v4
2128; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
2129; CI-NEXT:    flat_load_dword v5, v[0:1]
2130; CI-NEXT:    flat_load_dword v2, v[2:3]
2131; CI-NEXT:    v_mov_b32_e32 v1, s1
2132; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v4
2133; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2134; CI-NEXT:    s_waitcnt vmcnt(0)
2135; CI-NEXT:    v_min_u32_e32 v2, v5, v2
2136; CI-NEXT:    flat_store_dword v[0:1], v2
2137; CI-NEXT:    s_endpgm
2138;
2139; VI-LABEL: v_test_umin_ult_i32:
2140; VI:       ; %bb.0:
2141; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2142; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
2143; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
2144; VI-NEXT:    s_waitcnt lgkmcnt(0)
2145; VI-NEXT:    v_mov_b32_e32 v1, s3
2146; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
2147; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2148; VI-NEXT:    v_mov_b32_e32 v3, s5
2149; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
2150; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
2151; VI-NEXT:    flat_load_dword v5, v[0:1]
2152; VI-NEXT:    flat_load_dword v2, v[2:3]
2153; VI-NEXT:    v_mov_b32_e32 v1, s1
2154; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v4
2155; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2156; VI-NEXT:    s_waitcnt vmcnt(0)
2157; VI-NEXT:    v_min_u32_e32 v2, v5, v2
2158; VI-NEXT:    flat_store_dword v[0:1], v2
2159; VI-NEXT:    s_endpgm
2160;
2161; GFX9-LABEL: v_test_umin_ult_i32:
2162; GFX9:       ; %bb.0:
2163; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2164; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
2165; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2166; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2167; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
2168; GFX9-NEXT:    global_load_dword v2, v0, s[4:5]
2169; GFX9-NEXT:    s_waitcnt vmcnt(0)
2170; GFX9-NEXT:    v_min_u32_e32 v1, v1, v2
2171; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
2172; GFX9-NEXT:    s_endpgm
2173;
2174; GFX10-LABEL: v_test_umin_ult_i32:
2175; GFX10:       ; %bb.0:
2176; GFX10-NEXT:    s_clause 0x1
2177; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2178; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
2179; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2180; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2181; GFX10-NEXT:    s_clause 0x1
2182; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
2183; GFX10-NEXT:    global_load_dword v2, v0, s[4:5]
2184; GFX10-NEXT:    s_waitcnt vmcnt(0)
2185; GFX10-NEXT:    v_min_u32_e32 v1, v1, v2
2186; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
2187; GFX10-NEXT:    s_endpgm
2188;
2189; GFX11-LABEL: v_test_umin_ult_i32:
2190; GFX11:       ; %bb.0:
2191; GFX11-NEXT:    s_clause 0x1
2192; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
2193; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x10
2194; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2195; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2196; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2197; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2198; GFX11-NEXT:    s_clause 0x1
2199; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
2200; GFX11-NEXT:    global_load_b32 v2, v0, s[4:5]
2201; GFX11-NEXT:    s_waitcnt vmcnt(0)
2202; GFX11-NEXT:    v_min_u32_e32 v1, v1, v2
2203; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
2204; GFX11-NEXT:    s_endpgm
2205  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2206  %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr, i32 %tid
2207  %b.gep = getelementptr inbounds i32, ptr addrspace(1) %b.ptr, i32 %tid
2208  %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %tid
2209  %a = load i32, ptr addrspace(1) %a.gep, align 4
2210  %b = load i32, ptr addrspace(1) %b.gep, align 4
2211  %cmp = icmp ult i32 %a, %b
2212  %val = select i1 %cmp, i32 %a, i32 %b
2213  store i32 %val, ptr addrspace(1) %out.gep, align 4
2214  ret void
2215}
2216
2217define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
2218; EG-LABEL: v_test_umin_ult_i8:
2219; EG:       ; %bb.0:
2220; EG-NEXT:    ALU 1, @10, KC0[CB0:0-32], KC1[]
2221; EG-NEXT:    TEX 1 @6
2222; EG-NEXT:    ALU 12, @12, KC0[CB0:0-32], KC1[]
2223; EG-NEXT:    MEM_RAT MSKOR T1.XW, T0.X
2224; EG-NEXT:    CF_END
2225; EG-NEXT:    PAD
2226; EG-NEXT:    Fetch clause starting at 6:
2227; EG-NEXT:     VTX_READ_8 T2.X, T2.X, 0, #1
2228; EG-NEXT:     VTX_READ_8 T1.X, T1.X, 0, #1
2229; EG-NEXT:    ALU clause starting at 10:
2230; EG-NEXT:     ADD_INT T1.X, KC0[2].Z, T0.X,
2231; EG-NEXT:     ADD_INT * T2.X, KC0[2].W, T0.X,
2232; EG-NEXT:    ALU clause starting at 12:
2233; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, T0.X,
2234; EG-NEXT:     AND_INT T1.W, PV.W, literal.x,
2235; EG-NEXT:     MIN_UINT * T2.W, T1.X, T2.X,
2236; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
2237; EG-NEXT:     LSHL * T1.W, PV.W, literal.x,
2238; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
2239; EG-NEXT:     LSHL T1.X, T2.W, PV.W,
2240; EG-NEXT:     LSHL * T1.W, literal.x, PV.W,
2241; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
2242; EG-NEXT:     MOV T1.Y, 0.0,
2243; EG-NEXT:     MOV * T1.Z, 0.0,
2244; EG-NEXT:     LSHR * T0.X, T0.W, literal.x,
2245; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2246;
2247; CI-LABEL: v_test_umin_ult_i8:
2248; CI:       ; %bb.0:
2249; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2250; CI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x4
2251; CI-NEXT:    s_waitcnt lgkmcnt(0)
2252; CI-NEXT:    v_mov_b32_e32 v2, s3
2253; CI-NEXT:    v_add_i32_e32 v1, vcc, s2, v0
2254; CI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
2255; CI-NEXT:    v_mov_b32_e32 v4, s5
2256; CI-NEXT:    v_add_i32_e32 v3, vcc, s4, v0
2257; CI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
2258; CI-NEXT:    flat_load_ubyte v2, v[1:2]
2259; CI-NEXT:    flat_load_ubyte v3, v[3:4]
2260; CI-NEXT:    v_mov_b32_e32 v1, s1
2261; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
2262; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2263; CI-NEXT:    s_waitcnt vmcnt(0)
2264; CI-NEXT:    v_min_u32_e32 v2, v2, v3
2265; CI-NEXT:    flat_store_byte v[0:1], v2
2266; CI-NEXT:    s_endpgm
2267;
2268; VI-LABEL: v_test_umin_ult_i8:
2269; VI:       ; %bb.0:
2270; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2271; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
2272; VI-NEXT:    s_waitcnt lgkmcnt(0)
2273; VI-NEXT:    v_mov_b32_e32 v2, s3
2274; VI-NEXT:    v_add_u32_e32 v1, vcc, s2, v0
2275; VI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
2276; VI-NEXT:    v_mov_b32_e32 v4, s5
2277; VI-NEXT:    v_add_u32_e32 v3, vcc, s4, v0
2278; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
2279; VI-NEXT:    flat_load_ubyte v2, v[1:2]
2280; VI-NEXT:    flat_load_ubyte v3, v[3:4]
2281; VI-NEXT:    v_mov_b32_e32 v1, s1
2282; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
2283; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2284; VI-NEXT:    s_waitcnt vmcnt(0)
2285; VI-NEXT:    v_min_u16_e32 v2, v2, v3
2286; VI-NEXT:    flat_store_byte v[0:1], v2
2287; VI-NEXT:    s_endpgm
2288;
2289; GFX9-LABEL: v_test_umin_ult_i8:
2290; GFX9:       ; %bb.0:
2291; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2292; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
2293; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2294; GFX9-NEXT:    global_load_ubyte v1, v0, s[2:3]
2295; GFX9-NEXT:    global_load_ubyte v2, v0, s[4:5]
2296; GFX9-NEXT:    s_waitcnt vmcnt(0)
2297; GFX9-NEXT:    v_min_u16_e32 v1, v1, v2
2298; GFX9-NEXT:    global_store_byte v0, v1, s[0:1]
2299; GFX9-NEXT:    s_endpgm
2300;
2301; GFX10-LABEL: v_test_umin_ult_i8:
2302; GFX10:       ; %bb.0:
2303; GFX10-NEXT:    s_clause 0x1
2304; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2305; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
2306; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2307; GFX10-NEXT:    s_clause 0x1
2308; GFX10-NEXT:    global_load_ubyte v1, v0, s[2:3]
2309; GFX10-NEXT:    global_load_ubyte v2, v0, s[4:5]
2310; GFX10-NEXT:    s_waitcnt vmcnt(0)
2311; GFX10-NEXT:    v_min_u16 v1, v1, v2
2312; GFX10-NEXT:    global_store_byte v0, v1, s[0:1]
2313; GFX10-NEXT:    s_endpgm
2314;
2315; GFX11-LABEL: v_test_umin_ult_i8:
2316; GFX11:       ; %bb.0:
2317; GFX11-NEXT:    s_clause 0x1
2318; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
2319; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x10
2320; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2321; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2322; GFX11-NEXT:    s_clause 0x1
2323; GFX11-NEXT:    global_load_u8 v1, v0, s[2:3]
2324; GFX11-NEXT:    global_load_u8 v2, v0, s[4:5]
2325; GFX11-NEXT:    s_waitcnt vmcnt(0)
2326; GFX11-NEXT:    v_min_u16 v1, v1, v2
2327; GFX11-NEXT:    global_store_b8 v0, v1, s[0:1]
2328; GFX11-NEXT:    s_endpgm
2329  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2330  %a.gep = getelementptr inbounds i8, ptr addrspace(1) %a.ptr, i32 %tid
2331  %b.gep = getelementptr inbounds i8, ptr addrspace(1) %b.ptr, i32 %tid
2332  %out.gep = getelementptr inbounds i8, ptr addrspace(1) %out, i32 %tid
2333
2334  %a = load i8, ptr addrspace(1) %a.gep, align 1
2335  %b = load i8, ptr addrspace(1) %b.gep, align 1
2336  %cmp = icmp ult i8 %a, %b
2337  %val = select i1 %cmp, i8 %a, i8 %b
2338  store i8 %val, ptr addrspace(1) %out.gep, align 1
2339  ret void
2340}
2341
; Unsigned min of two i32 kernel arguments, written as `icmp ult` + `select`.
; The checks below expect the pair to collapse into a single min instruction:
; MIN_UINT for the EG run and s_min_u32 for every amdgcn run.
2342define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
2343; EG-LABEL: s_test_umin_ult_i32:
2344; EG:       ; %bb.0:
2345; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
2346; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
2347; EG-NEXT:    CF_END
2348; EG-NEXT:    PAD
2349; EG-NEXT:    ALU clause starting at 4:
2350; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
2351; EG-NEXT:     MIN_UINT * T1.X, KC0[2].Z, KC0[2].W,
2352; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2353;
2354; CI-LABEL: s_test_umin_ult_i32:
2355; CI:       ; %bb.0:
2356; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2357; CI-NEXT:    s_waitcnt lgkmcnt(0)
2358; CI-NEXT:    s_min_u32 s2, s2, s3
2359; CI-NEXT:    v_mov_b32_e32 v0, s0
2360; CI-NEXT:    v_mov_b32_e32 v1, s1
2361; CI-NEXT:    v_mov_b32_e32 v2, s2
2362; CI-NEXT:    flat_store_dword v[0:1], v2
2363; CI-NEXT:    s_endpgm
2364;
2365; VI-LABEL: s_test_umin_ult_i32:
2366; VI:       ; %bb.0:
2367; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2368; VI-NEXT:    s_waitcnt lgkmcnt(0)
2369; VI-NEXT:    s_min_u32 s2, s2, s3
2370; VI-NEXT:    v_mov_b32_e32 v0, s0
2371; VI-NEXT:    v_mov_b32_e32 v1, s1
2372; VI-NEXT:    v_mov_b32_e32 v2, s2
2373; VI-NEXT:    flat_store_dword v[0:1], v2
2374; VI-NEXT:    s_endpgm
2375;
2376; GFX9-LABEL: s_test_umin_ult_i32:
2377; GFX9:       ; %bb.0:
2378; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2379; GFX9-NEXT:    v_mov_b32_e32 v0, 0
2380; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2381; GFX9-NEXT:    s_min_u32 s2, s2, s3
2382; GFX9-NEXT:    v_mov_b32_e32 v1, s2
2383; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
2384; GFX9-NEXT:    s_endpgm
2385;
2386; GFX10-LABEL: s_test_umin_ult_i32:
2387; GFX10:       ; %bb.0:
2388; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2389; GFX10-NEXT:    v_mov_b32_e32 v0, 0
2390; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2391; GFX10-NEXT:    s_min_u32 s2, s2, s3
2392; GFX10-NEXT:    v_mov_b32_e32 v1, s2
2393; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
2394; GFX10-NEXT:    s_endpgm
2395;
2396; GFX11-LABEL: s_test_umin_ult_i32:
2397; GFX11:       ; %bb.0:
2398; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
2399; GFX11-NEXT:    v_mov_b32_e32 v0, 0
2400; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2401; GFX11-NEXT:    s_min_u32 s2, s2, s3
2402; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2403; GFX11-NEXT:    v_mov_b32_e32 v1, s2
2404; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
2405; GFX11-NEXT:    s_endpgm
  ; %cmp is true when %a is the smaller value under unsigned ordering, so the
  ; select below yields the unsigned minimum of %a and %b.
2406  %cmp = icmp ult i32 %a, %b
2407  %val = select i1 %cmp, i32 %a, i32 %b
2408  store i32 %val, ptr addrspace(1) %out, align 4
2409  ret void
2410}
2411
; Unsigned i32 min where the `icmp ult` result has a second use: besides
; feeding the select it is stored as an i1 to %out1. The checks below keep a
; separate compare and select (SETGE_UINT + CNDE_INT for the EG run,
; s_cmp_lt_u32 + s_cselect_b32 for the amdgcn runs) and contain no min
; instruction.
2412define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) #0 {
2413; EG-LABEL: v_test_umin_ult_i32_multi_use:
2414; EG:       ; %bb.0:
2415; EG-NEXT:    ALU 1, @10, KC0[CB0:0-32], KC1[]
2416; EG-NEXT:    TEX 1 @6
2417; EG-NEXT:    ALU 16, @12, KC0[CB0:0-32], KC1[]
2418; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 0
2419; EG-NEXT:    MEM_RAT MSKOR T2.XW, T3.X
2420; EG-NEXT:    CF_END
2421; EG-NEXT:    Fetch clause starting at 6:
2422; EG-NEXT:     VTX_READ_32 T1.X, T1.X, 0, #1
2423; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
2424; EG-NEXT:    ALU clause starting at 10:
2425; EG-NEXT:     MOV T0.X, KC0[2].W,
2426; EG-NEXT:     MOV * T1.X, KC0[3].X,
2427; EG-NEXT:    ALU clause starting at 12:
2428; EG-NEXT:     AND_INT T0.W, KC0[2].Z, literal.x,
2429; EG-NEXT:     SETGT_UINT * T1.W, T1.X, T0.X,
2430; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
2431; EG-NEXT:     AND_INT T1.W, PS, 1,
2432; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
2433; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
2434; EG-NEXT:     LSHL T2.X, PV.W, PS,
2435; EG-NEXT:     LSHL * T2.W, literal.x, PS,
2436; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
2437; EG-NEXT:     MOV T2.Y, 0.0,
2438; EG-NEXT:     MOV * T2.Z, 0.0,
2439; EG-NEXT:     LSHR T3.X, KC0[2].Z, literal.x,
2440; EG-NEXT:     SETGE_UINT * T0.W, T0.X, T1.X,
2441; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2442; EG-NEXT:     CNDE_INT T0.X, PV.W, T0.X, T1.X,
2443; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
2444; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2445;
2446; CI-LABEL: v_test_umin_ult_i32_multi_use:
2447; CI:       ; %bb.0:
2448; CI-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
2449; CI-NEXT:    s_waitcnt lgkmcnt(0)
2450; CI-NEXT:    s_load_dword s4, s[4:5], 0x0
2451; CI-NEXT:    s_load_dword s5, s[6:7], 0x0
2452; CI-NEXT:    v_mov_b32_e32 v0, s0
2453; CI-NEXT:    v_mov_b32_e32 v1, s1
2454; CI-NEXT:    v_mov_b32_e32 v2, s2
2455; CI-NEXT:    v_mov_b32_e32 v3, s3
2456; CI-NEXT:    s_waitcnt lgkmcnt(0)
2457; CI-NEXT:    s_cmp_lt_u32 s4, s5
2458; CI-NEXT:    s_cselect_b64 s[0:1], -1, 0
2459; CI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[0:1]
2460; CI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
2461; CI-NEXT:    s_cselect_b32 s0, s4, s5
2462; CI-NEXT:    v_mov_b32_e32 v5, s0
2463; CI-NEXT:    flat_store_dword v[0:1], v5
2464; CI-NEXT:    flat_store_byte v[2:3], v4
2465; CI-NEXT:    s_endpgm
2466;
2467; VI-LABEL: v_test_umin_ult_i32_multi_use:
2468; VI:       ; %bb.0:
2469; VI-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
2470; VI-NEXT:    s_waitcnt lgkmcnt(0)
2471; VI-NEXT:    s_load_dword s4, s[4:5], 0x0
2472; VI-NEXT:    s_load_dword s5, s[6:7], 0x0
2473; VI-NEXT:    v_mov_b32_e32 v0, s0
2474; VI-NEXT:    v_mov_b32_e32 v1, s1
2475; VI-NEXT:    v_mov_b32_e32 v2, s2
2476; VI-NEXT:    v_mov_b32_e32 v3, s3
2477; VI-NEXT:    s_waitcnt lgkmcnt(0)
2478; VI-NEXT:    s_cmp_lt_u32 s4, s5
2479; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
2480; VI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[0:1]
2481; VI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
2482; VI-NEXT:    s_cselect_b32 s0, s4, s5
2483; VI-NEXT:    v_mov_b32_e32 v5, s0
2484; VI-NEXT:    flat_store_dword v[0:1], v5
2485; VI-NEXT:    flat_store_byte v[2:3], v4
2486; VI-NEXT:    s_endpgm
2487;
2488; GFX9-LABEL: v_test_umin_ult_i32_multi_use:
2489; GFX9:       ; %bb.0:
2490; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
2491; GFX9-NEXT:    v_mov_b32_e32 v0, 0
2492; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2493; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x0
2494; GFX9-NEXT:    s_load_dword s9, s[6:7], 0x0
2495; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2496; GFX9-NEXT:    s_cmp_lt_u32 s8, s9
2497; GFX9-NEXT:    s_cselect_b64 s[4:5], -1, 0
2498; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
2499; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], exec
2500; GFX9-NEXT:    s_cselect_b32 s4, s8, s9
2501; GFX9-NEXT:    v_mov_b32_e32 v2, s4
2502; GFX9-NEXT:    global_store_dword v0, v2, s[0:1]
2503; GFX9-NEXT:    global_store_byte v0, v1, s[2:3]
2504; GFX9-NEXT:    s_endpgm
2505;
2506; GFX10-LABEL: v_test_umin_ult_i32_multi_use:
2507; GFX10:       ; %bb.0:
2508; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
2509; GFX10-NEXT:    v_mov_b32_e32 v1, 0
2510; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2511; GFX10-NEXT:    s_load_dword s8, s[4:5], 0x0
2512; GFX10-NEXT:    s_load_dword s9, s[6:7], 0x0
2513; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2514; GFX10-NEXT:    s_cmp_lt_u32 s8, s9
2515; GFX10-NEXT:    s_cselect_b32 s4, -1, 0
2516; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
2517; GFX10-NEXT:    s_and_b32 s4, s4, exec_lo
2518; GFX10-NEXT:    s_cselect_b32 s4, s8, s9
2519; GFX10-NEXT:    v_mov_b32_e32 v2, s4
2520; GFX10-NEXT:    global_store_dword v1, v2, s[0:1]
2521; GFX10-NEXT:    global_store_byte v1, v0, s[2:3]
2522; GFX10-NEXT:    s_endpgm
2523;
2524; GFX11-LABEL: v_test_umin_ult_i32_multi_use:
2525; GFX11:       ; %bb.0:
2526; GFX11-NEXT:    s_load_b256 s[0:7], s[4:5], 0x0
2527; GFX11-NEXT:    v_mov_b32_e32 v1, 0
2528; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2529; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x0
2530; GFX11-NEXT:    s_load_b32 s5, s[6:7], 0x0
2531; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2532; GFX11-NEXT:    s_cmp_lt_u32 s4, s5
2533; GFX11-NEXT:    s_cselect_b32 s6, -1, 0
2534; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
2535; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s6
2536; GFX11-NEXT:    s_and_b32 s6, s6, exec_lo
2537; GFX11-NEXT:    s_cselect_b32 s4, s4, s5
2538; GFX11-NEXT:    v_mov_b32_e32 v2, s4
2539; GFX11-NEXT:    s_clause 0x1
2540; GFX11-NEXT:    global_store_b32 v1, v2, s[0:1]
2541; GFX11-NEXT:    global_store_b8 v1, v0, s[2:3]
2542; GFX11-NEXT:    s_endpgm
  ; %cmp has two users: the select (min value, stored to %out0) and the i1
  ; store to %out1.
2543  %a = load i32, ptr addrspace(1) %aptr, align 4
2544  %b = load i32, ptr addrspace(1) %bptr, align 4
2545  %cmp = icmp ult i32 %a, %b
2546  %val = select i1 %cmp, i32 %a, i32 %b
2547  store i32 %val, ptr addrspace(1) %out0, align 4
2548  store i1 %cmp, ptr addrspace(1) %out1
2549  ret void
2550}
2551
; i16 variant of the multi-use case: the `icmp ult` result feeds both the
; select and an i1 store to %out1. The amdgcn checks below expect
; v_cmp_lt_u32 followed by two v_cndmask_b32 (one producing the selected
; value, one materializing the i1); the EG checks expect SETGE_UINT +
; CNDE_INT for the selected value. No min instruction appears in any run.
2552define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) #0 {
2553; EG-LABEL: v_test_umin_ult_i16_multi_use:
2554; EG:       ; %bb.0:
2555; EG-NEXT:    ALU 1, @10, KC0[CB0:0-32], KC1[]
2556; EG-NEXT:    TEX 1 @6
2557; EG-NEXT:    ALU 24, @12, KC0[CB0:0-32], KC1[]
2558; EG-NEXT:    MEM_RAT MSKOR T2.XW, T3.X
2559; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
2560; EG-NEXT:    CF_END
2561; EG-NEXT:    Fetch clause starting at 6:
2562; EG-NEXT:     VTX_READ_16 T1.X, T1.X, 0, #1
2563; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
2564; EG-NEXT:    ALU clause starting at 10:
2565; EG-NEXT:     MOV T0.X, KC0[2].W,
2566; EG-NEXT:     MOV * T1.X, KC0[3].X,
2567; EG-NEXT:    ALU clause starting at 12:
2568; EG-NEXT:     AND_INT T0.W, KC0[2].Y, literal.x,
2569; EG-NEXT:     SETGE_UINT * T1.W, T0.X, T1.X,
2570; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
2571; EG-NEXT:     CNDE_INT T1.W, PS, T0.X, T1.X,
2572; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
2573; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
2574; EG-NEXT:     LSHL T2.X, PV.W, PS,
2575; EG-NEXT:     LSHL * T2.W, literal.x, PS,
2576; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
2577; EG-NEXT:     MOV T2.Y, 0.0,
2578; EG-NEXT:     AND_INT T0.W, KC0[2].Z, literal.x,
2579; EG-NEXT:     SETGT_UINT * T1.W, T1.X, T0.X,
2580; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
2581; EG-NEXT:     AND_INT T1.W, PS, 1,
2582; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
2583; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
2584; EG-NEXT:     LSHL T0.X, PV.W, PS,
2585; EG-NEXT:     LSHL * T0.W, literal.x, PS,
2586; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
2587; EG-NEXT:     MOV T0.Y, 0.0,
2588; EG-NEXT:     MOV T2.Z, 0.0,
2589; EG-NEXT:     MOV * T0.Z, 0.0,
2590; EG-NEXT:     LSHR T1.X, KC0[2].Z, literal.x,
2591; EG-NEXT:     LSHR * T3.X, KC0[2].Y, literal.x,
2592; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2593;
2594; CI-LABEL: v_test_umin_ult_i16_multi_use:
2595; CI:       ; %bb.0:
2596; CI-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
2597; CI-NEXT:    s_waitcnt lgkmcnt(0)
2598; CI-NEXT:    v_mov_b32_e32 v0, s4
2599; CI-NEXT:    v_mov_b32_e32 v1, s5
2600; CI-NEXT:    v_mov_b32_e32 v2, s6
2601; CI-NEXT:    v_mov_b32_e32 v3, s7
2602; CI-NEXT:    flat_load_ushort v4, v[0:1]
2603; CI-NEXT:    flat_load_ushort v5, v[2:3]
2604; CI-NEXT:    v_mov_b32_e32 v0, s0
2605; CI-NEXT:    v_mov_b32_e32 v1, s1
2606; CI-NEXT:    v_mov_b32_e32 v2, s2
2607; CI-NEXT:    v_mov_b32_e32 v3, s3
2608; CI-NEXT:    s_waitcnt vmcnt(0)
2609; CI-NEXT:    v_cmp_lt_u32_e32 vcc, v4, v5
2610; CI-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
2611; CI-NEXT:    flat_store_short v[0:1], v4
2612; CI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
2613; CI-NEXT:    flat_store_byte v[2:3], v0
2614; CI-NEXT:    s_endpgm
2615;
2616; VI-LABEL: v_test_umin_ult_i16_multi_use:
2617; VI:       ; %bb.0:
2618; VI-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
2619; VI-NEXT:    s_waitcnt lgkmcnt(0)
2620; VI-NEXT:    v_mov_b32_e32 v0, s4
2621; VI-NEXT:    v_mov_b32_e32 v1, s5
2622; VI-NEXT:    v_mov_b32_e32 v2, s6
2623; VI-NEXT:    v_mov_b32_e32 v3, s7
2624; VI-NEXT:    flat_load_ushort v4, v[0:1]
2625; VI-NEXT:    flat_load_ushort v5, v[2:3]
2626; VI-NEXT:    v_mov_b32_e32 v0, s0
2627; VI-NEXT:    v_mov_b32_e32 v1, s1
2628; VI-NEXT:    v_mov_b32_e32 v2, s2
2629; VI-NEXT:    v_mov_b32_e32 v3, s3
2630; VI-NEXT:    s_waitcnt vmcnt(0)
2631; VI-NEXT:    v_cmp_lt_u32_e32 vcc, v4, v5
2632; VI-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
2633; VI-NEXT:    flat_store_short v[0:1], v4
2634; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
2635; VI-NEXT:    flat_store_byte v[2:3], v0
2636; VI-NEXT:    s_endpgm
2637;
2638; GFX9-LABEL: v_test_umin_ult_i16_multi_use:
2639; GFX9:       ; %bb.0:
2640; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
2641; GFX9-NEXT:    v_mov_b32_e32 v0, 0
2642; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2643; GFX9-NEXT:    global_load_ushort v1, v0, s[4:5]
2644; GFX9-NEXT:    global_load_ushort v2, v0, s[6:7]
2645; GFX9-NEXT:    s_waitcnt vmcnt(0)
2646; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, v1, v2
2647; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
2648; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
2649; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
2650; GFX9-NEXT:    global_store_byte v0, v1, s[2:3]
2651; GFX9-NEXT:    s_endpgm
2652;
2653; GFX10-LABEL: v_test_umin_ult_i16_multi_use:
2654; GFX10:       ; %bb.0:
2655; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
2656; GFX10-NEXT:    v_mov_b32_e32 v0, 0
2657; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2658; GFX10-NEXT:    s_clause 0x1
2659; GFX10-NEXT:    global_load_ushort v1, v0, s[4:5]
2660; GFX10-NEXT:    global_load_ushort v2, v0, s[6:7]
2661; GFX10-NEXT:    s_waitcnt vmcnt(0)
2662; GFX10-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v1, v2
2663; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc_lo
2664; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
2665; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
2666; GFX10-NEXT:    global_store_byte v0, v2, s[2:3]
2667; GFX10-NEXT:    s_endpgm
2668;
2669; GFX11-LABEL: v_test_umin_ult_i16_multi_use:
2670; GFX11:       ; %bb.0:
2671; GFX11-NEXT:    s_load_b256 s[0:7], s[4:5], 0x0
2672; GFX11-NEXT:    v_mov_b32_e32 v0, 0
2673; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2674; GFX11-NEXT:    s_clause 0x1
2675; GFX11-NEXT:    global_load_u16 v1, v0, s[4:5]
2676; GFX11-NEXT:    global_load_u16 v2, v0, s[6:7]
2677; GFX11-NEXT:    s_waitcnt vmcnt(0)
2678; GFX11-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v1, v2
2679; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc_lo
2680; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
2681; GFX11-NEXT:    s_clause 0x1
2682; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
2683; GFX11-NEXT:    global_store_b8 v0, v2, s[2:3]
2684; GFX11-NEXT:    s_endpgm
  ; %cmp has two users: the select (min value, stored to %out0) and the i1
  ; store to %out1.
2685  %a = load i16, ptr addrspace(1) %aptr, align 2
2686  %b = load i16, ptr addrspace(1) %bptr, align 2
2687  %cmp = icmp ult i16 %a, %b
2688  %val = select i1 %cmp, i16 %a, i16 %b
2689  store i16 %val, ptr addrspace(1) %out0, align 2
2690  store i1 %cmp, ptr addrspace(1) %out1
2691  ret void
2692}
2693
; Unsigned min of two <1 x i32> kernel arguments, written as a vector
; `icmp ult` + `select`. The checks below expect a single min instruction:
; MIN_UINT for the EG run and s_min_u32 for every amdgcn run.
2694define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32> %a, <1 x i32> %b) #0 {
2695; EG-LABEL: s_test_umin_ult_v1i32:
2696; EG:       ; %bb.0:
2697; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
2698; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
2699; EG-NEXT:    CF_END
2700; EG-NEXT:    PAD
2701; EG-NEXT:    ALU clause starting at 4:
2702; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
2703; EG-NEXT:     MIN_UINT * T1.X, KC0[2].Z, KC0[2].W,
2704; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2705;
2706; CI-LABEL: s_test_umin_ult_v1i32:
2707; CI:       ; %bb.0:
2708; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2709; CI-NEXT:    s_waitcnt lgkmcnt(0)
2710; CI-NEXT:    s_min_u32 s2, s2, s3
2711; CI-NEXT:    v_mov_b32_e32 v0, s0
2712; CI-NEXT:    v_mov_b32_e32 v1, s1
2713; CI-NEXT:    v_mov_b32_e32 v2, s2
2714; CI-NEXT:    flat_store_dword v[0:1], v2
2715; CI-NEXT:    s_endpgm
2716;
2717; VI-LABEL: s_test_umin_ult_v1i32:
2718; VI:       ; %bb.0:
2719; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2720; VI-NEXT:    s_waitcnt lgkmcnt(0)
2721; VI-NEXT:    s_min_u32 s2, s2, s3
2722; VI-NEXT:    v_mov_b32_e32 v0, s0
2723; VI-NEXT:    v_mov_b32_e32 v1, s1
2724; VI-NEXT:    v_mov_b32_e32 v2, s2
2725; VI-NEXT:    flat_store_dword v[0:1], v2
2726; VI-NEXT:    s_endpgm
2727;
2728; GFX9-LABEL: s_test_umin_ult_v1i32:
2729; GFX9:       ; %bb.0:
2730; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2731; GFX9-NEXT:    v_mov_b32_e32 v0, 0
2732; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2733; GFX9-NEXT:    s_min_u32 s2, s2, s3
2734; GFX9-NEXT:    v_mov_b32_e32 v1, s2
2735; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
2736; GFX9-NEXT:    s_endpgm
2737;
2738; GFX10-LABEL: s_test_umin_ult_v1i32:
2739; GFX10:       ; %bb.0:
2740; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2741; GFX10-NEXT:    v_mov_b32_e32 v0, 0
2742; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2743; GFX10-NEXT:    s_min_u32 s2, s2, s3
2744; GFX10-NEXT:    v_mov_b32_e32 v1, s2
2745; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
2746; GFX10-NEXT:    s_endpgm
2747;
2748; GFX11-LABEL: s_test_umin_ult_v1i32:
2749; GFX11:       ; %bb.0:
2750; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
2751; GFX11-NEXT:    v_mov_b32_e32 v0, 0
2752; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2753; GFX11-NEXT:    s_min_u32 s2, s2, s3
2754; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2755; GFX11-NEXT:    v_mov_b32_e32 v1, s2
2756; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
2757; GFX11-NEXT:    s_endpgm
  ; Single-element vector compare + select; the store has no explicit
  ; alignment.
2758  %cmp = icmp ult <1 x i32> %a, %b
2759  %val = select <1 x i1> %cmp, <1 x i32> %a, <1 x i32> %b
2760  store <1 x i32> %val, ptr addrspace(1) %out
2761  ret void
2762}
2763
; Unsigned min of two <8 x i32> kernel arguments, written as a vector
; `icmp ult` + `select`. The checks below expect eight per-element min
; instructions (MIN_UINT for the EG run, s_min_u32 for the amdgcn runs) and
; the result written with two four-dword stores, one at %out and one 16 bytes
; past it.
2764define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x i32> %b) #0 {
2765; EG-LABEL: s_test_umin_ult_v8i32:
2766; EG:       ; %bb.0:
2767; EG-NEXT:    ALU 13, @4, KC0[CB0:0-32], KC1[]
2768; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T3.X, 0
2769; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
2770; EG-NEXT:    CF_END
2771; EG-NEXT:    ALU clause starting at 4:
2772; EG-NEXT:     MIN_UINT * T0.W, KC0[5].X, KC0[7].X,
2773; EG-NEXT:     MIN_UINT * T0.Z, KC0[4].W, KC0[6].W,
2774; EG-NEXT:     MIN_UINT * T0.Y, KC0[4].Z, KC0[6].Z,
2775; EG-NEXT:     MIN_UINT * T0.X, KC0[4].Y, KC0[6].Y,
2776; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
2777; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2778; EG-NEXT:     MIN_UINT * T2.W, KC0[6].X, KC0[8].X,
2779; EG-NEXT:     MIN_UINT * T2.Z, KC0[5].W, KC0[7].W,
2780; EG-NEXT:     MIN_UINT * T2.Y, KC0[5].Z, KC0[7].Z,
2781; EG-NEXT:     MIN_UINT * T2.X, KC0[5].Y, KC0[7].Y,
2782; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.x,
2783; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2784; EG-NEXT:     LSHR * T3.X, PV.W, literal.x,
2785; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2786;
2787; CI-LABEL: s_test_umin_ult_v8i32:
2788; CI:       ; %bb.0:
2789; CI-NEXT:    s_load_dwordx16 s[12:27], s[8:9], 0x8
2790; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
2791; CI-NEXT:    s_waitcnt lgkmcnt(0)
2792; CI-NEXT:    s_min_u32 s4, s15, s23
2793; CI-NEXT:    s_min_u32 s5, s14, s22
2794; CI-NEXT:    s_min_u32 s6, s13, s21
2795; CI-NEXT:    s_min_u32 s7, s12, s20
2796; CI-NEXT:    s_min_u32 s2, s19, s27
2797; CI-NEXT:    s_min_u32 s3, s18, s26
2798; CI-NEXT:    s_min_u32 s8, s17, s25
2799; CI-NEXT:    s_min_u32 s9, s16, s24
2800; CI-NEXT:    v_mov_b32_e32 v3, s2
2801; CI-NEXT:    s_add_u32 s2, s0, 16
2802; CI-NEXT:    v_mov_b32_e32 v2, s3
2803; CI-NEXT:    s_addc_u32 s3, s1, 0
2804; CI-NEXT:    v_mov_b32_e32 v5, s3
2805; CI-NEXT:    v_mov_b32_e32 v0, s9
2806; CI-NEXT:    v_mov_b32_e32 v1, s8
2807; CI-NEXT:    v_mov_b32_e32 v4, s2
2808; CI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2809; CI-NEXT:    v_mov_b32_e32 v5, s1
2810; CI-NEXT:    v_mov_b32_e32 v0, s7
2811; CI-NEXT:    v_mov_b32_e32 v1, s6
2812; CI-NEXT:    v_mov_b32_e32 v2, s5
2813; CI-NEXT:    v_mov_b32_e32 v3, s4
2814; CI-NEXT:    v_mov_b32_e32 v4, s0
2815; CI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2816; CI-NEXT:    s_endpgm
2817;
2818; VI-LABEL: s_test_umin_ult_v8i32:
2819; VI:       ; %bb.0:
2820; VI-NEXT:    s_load_dwordx16 s[12:27], s[8:9], 0x20
2821; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
2822; VI-NEXT:    s_waitcnt lgkmcnt(0)
2823; VI-NEXT:    s_min_u32 s4, s15, s23
2824; VI-NEXT:    s_min_u32 s5, s14, s22
2825; VI-NEXT:    s_min_u32 s6, s13, s21
2826; VI-NEXT:    s_min_u32 s7, s12, s20
2827; VI-NEXT:    s_min_u32 s2, s19, s27
2828; VI-NEXT:    s_min_u32 s3, s18, s26
2829; VI-NEXT:    s_min_u32 s8, s17, s25
2830; VI-NEXT:    s_min_u32 s9, s16, s24
2831; VI-NEXT:    v_mov_b32_e32 v3, s2
2832; VI-NEXT:    s_add_u32 s2, s0, 16
2833; VI-NEXT:    v_mov_b32_e32 v2, s3
2834; VI-NEXT:    s_addc_u32 s3, s1, 0
2835; VI-NEXT:    v_mov_b32_e32 v5, s3
2836; VI-NEXT:    v_mov_b32_e32 v0, s9
2837; VI-NEXT:    v_mov_b32_e32 v1, s8
2838; VI-NEXT:    v_mov_b32_e32 v4, s2
2839; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2840; VI-NEXT:    v_mov_b32_e32 v5, s1
2841; VI-NEXT:    v_mov_b32_e32 v0, s7
2842; VI-NEXT:    v_mov_b32_e32 v1, s6
2843; VI-NEXT:    v_mov_b32_e32 v2, s5
2844; VI-NEXT:    v_mov_b32_e32 v3, s4
2845; VI-NEXT:    v_mov_b32_e32 v4, s0
2846; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2847; VI-NEXT:    s_endpgm
2848;
2849; GFX9-LABEL: s_test_umin_ult_v8i32:
2850; GFX9:       ; %bb.0:
2851; GFX9-NEXT:    s_load_dwordx16 s[12:27], s[8:9], 0x20
2852; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
2853; GFX9-NEXT:    v_mov_b32_e32 v4, 0
2854; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2855; GFX9-NEXT:    s_min_u32 s6, s19, s27
2856; GFX9-NEXT:    s_min_u32 s7, s18, s26
2857; GFX9-NEXT:    s_min_u32 s8, s17, s25
2858; GFX9-NEXT:    s_min_u32 s9, s16, s24
2859; GFX9-NEXT:    s_min_u32 s2, s15, s23
2860; GFX9-NEXT:    s_min_u32 s3, s14, s22
2861; GFX9-NEXT:    s_min_u32 s4, s13, s21
2862; GFX9-NEXT:    s_min_u32 s5, s12, s20
2863; GFX9-NEXT:    v_mov_b32_e32 v0, s9
2864; GFX9-NEXT:    v_mov_b32_e32 v1, s8
2865; GFX9-NEXT:    v_mov_b32_e32 v2, s7
2866; GFX9-NEXT:    v_mov_b32_e32 v3, s6
2867; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
2868; GFX9-NEXT:    s_nop 0
2869; GFX9-NEXT:    v_mov_b32_e32 v0, s5
2870; GFX9-NEXT:    v_mov_b32_e32 v1, s4
2871; GFX9-NEXT:    v_mov_b32_e32 v2, s3
2872; GFX9-NEXT:    v_mov_b32_e32 v3, s2
2873; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
2874; GFX9-NEXT:    s_endpgm
2875;
2876; GFX10-LABEL: s_test_umin_ult_v8i32:
2877; GFX10:       ; %bb.0:
2878; GFX10-NEXT:    s_clause 0x1
2879; GFX10-NEXT:    s_load_dwordx16 s[12:27], s[8:9], 0x20
2880; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
2881; GFX10-NEXT:    v_mov_b32_e32 v8, 0
2882; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2883; GFX10-NEXT:    s_min_u32 s6, s19, s27
2884; GFX10-NEXT:    s_min_u32 s7, s18, s26
2885; GFX10-NEXT:    s_min_u32 s8, s16, s24
2886; GFX10-NEXT:    s_min_u32 s9, s17, s25
2887; GFX10-NEXT:    s_min_u32 s2, s15, s23
2888; GFX10-NEXT:    s_min_u32 s3, s14, s22
2889; GFX10-NEXT:    s_min_u32 s4, s13, s21
2890; GFX10-NEXT:    s_min_u32 s5, s12, s20
2891; GFX10-NEXT:    v_mov_b32_e32 v0, s8
2892; GFX10-NEXT:    v_mov_b32_e32 v1, s9
2893; GFX10-NEXT:    v_mov_b32_e32 v2, s7
2894; GFX10-NEXT:    v_mov_b32_e32 v3, s6
2895; GFX10-NEXT:    v_mov_b32_e32 v4, s5
2896; GFX10-NEXT:    v_mov_b32_e32 v5, s4
2897; GFX10-NEXT:    v_mov_b32_e32 v6, s3
2898; GFX10-NEXT:    v_mov_b32_e32 v7, s2
2899; GFX10-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
2900; GFX10-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
2901; GFX10-NEXT:    s_endpgm
2902;
2903; GFX11-LABEL: s_test_umin_ult_v8i32:
2904; GFX11:       ; %bb.0:
2905; GFX11-NEXT:    s_clause 0x1
2906; GFX11-NEXT:    s_load_b512 s[8:23], s[4:5], 0x20
2907; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2908; GFX11-NEXT:    v_mov_b32_e32 v8, 0
2909; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2910; GFX11-NEXT:    s_min_u32 s4, s9, s17
2911; GFX11-NEXT:    s_min_u32 s5, s8, s16
2912; GFX11-NEXT:    s_min_u32 s6, s15, s23
2913; GFX11-NEXT:    s_min_u32 s7, s14, s22
2914; GFX11-NEXT:    s_min_u32 s8, s12, s20
2915; GFX11-NEXT:    s_min_u32 s9, s13, s21
2916; GFX11-NEXT:    s_min_u32 s2, s11, s19
2917; GFX11-NEXT:    s_min_u32 s3, s10, s18
2918; GFX11-NEXT:    v_mov_b32_e32 v0, s8
2919; GFX11-NEXT:    v_mov_b32_e32 v1, s9
2920; GFX11-NEXT:    v_mov_b32_e32 v2, s7
2921; GFX11-NEXT:    v_mov_b32_e32 v3, s6
2922; GFX11-NEXT:    v_mov_b32_e32 v4, s5
2923; GFX11-NEXT:    v_mov_b32_e32 v5, s4
2924; GFX11-NEXT:    v_mov_b32_e32 v6, s3
2925; GFX11-NEXT:    v_mov_b32_e32 v7, s2
2926; GFX11-NEXT:    s_clause 0x1
2927; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[0:1] offset:16
2928; GFX11-NEXT:    global_store_b128 v8, v[4:7], s[0:1]
2929; GFX11-NEXT:    s_endpgm
  ; Element-wise unsigned compare + select over all eight lanes; the store has
  ; no explicit alignment.
2930  %cmp = icmp ult <8 x i32> %a, %b
2931  %val = select <8 x i1> %cmp, <8 x i32> %a, <8 x i32> %b
2932  store <8 x i32> %val, ptr addrspace(1) %out
2933  ret void
2934}
2935
2936define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16> %a, <8 x i16> %b) #0 {
2937; EG-LABEL: s_test_umin_ult_v8i16:
2938; EG:       ; %bb.0:
2939; EG-NEXT:    ALU 1, @52, KC0[], KC1[]
2940; EG-NEXT:    TEX 1 @20
2941; EG-NEXT:    ALU 9, @54, KC0[], KC1[]
2942; EG-NEXT:    TEX 1 @24
2943; EG-NEXT:    ALU 8, @64, KC0[], KC1[]
2944; EG-NEXT:    TEX 1 @28
2945; EG-NEXT:    ALU 10, @73, KC0[], KC1[]
2946; EG-NEXT:    TEX 1 @32
2947; EG-NEXT:    ALU 8, @84, KC0[], KC1[]
2948; EG-NEXT:    TEX 1 @36
2949; EG-NEXT:    ALU 10, @93, KC0[], KC1[]
2950; EG-NEXT:    TEX 1 @40
2951; EG-NEXT:    ALU 8, @104, KC0[], KC1[]
2952; EG-NEXT:    TEX 1 @44
2953; EG-NEXT:    ALU 10, @113, KC0[], KC1[]
2954; EG-NEXT:    TEX 1 @48
2955; EG-NEXT:    ALU 10, @124, KC0[CB0:0-32], KC1[]
2956; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 1
2957; EG-NEXT:    CF_END
2958; EG-NEXT:    PAD
2959; EG-NEXT:    Fetch clause starting at 20:
2960; EG-NEXT:     VTX_READ_16 T8.X, T7.X, 66, #3
2961; EG-NEXT:     VTX_READ_16 T9.X, T7.X, 82, #3
2962; EG-NEXT:    Fetch clause starting at 24:
2963; EG-NEXT:     VTX_READ_16 T8.X, T7.X, 64, #3
2964; EG-NEXT:     VTX_READ_16 T9.X, T7.X, 80, #3
2965; EG-NEXT:    Fetch clause starting at 28:
2966; EG-NEXT:     VTX_READ_16 T8.X, T7.X, 62, #3
2967; EG-NEXT:     VTX_READ_16 T9.X, T7.X, 78, #3
2968; EG-NEXT:    Fetch clause starting at 32:
2969; EG-NEXT:     VTX_READ_16 T8.X, T7.X, 60, #3
2970; EG-NEXT:     VTX_READ_16 T9.X, T7.X, 76, #3
2971; EG-NEXT:    Fetch clause starting at 36:
2972; EG-NEXT:     VTX_READ_16 T8.X, T7.X, 58, #3
2973; EG-NEXT:     VTX_READ_16 T9.X, T7.X, 74, #3
2974; EG-NEXT:    Fetch clause starting at 40:
2975; EG-NEXT:     VTX_READ_16 T8.X, T7.X, 56, #3
2976; EG-NEXT:     VTX_READ_16 T9.X, T7.X, 72, #3
2977; EG-NEXT:    Fetch clause starting at 44:
2978; EG-NEXT:     VTX_READ_16 T8.X, T7.X, 54, #3
2979; EG-NEXT:     VTX_READ_16 T9.X, T7.X, 70, #3
2980; EG-NEXT:    Fetch clause starting at 48:
2981; EG-NEXT:     VTX_READ_16 T8.X, T7.X, 52, #3
2982; EG-NEXT:     VTX_READ_16 T7.X, T7.X, 68, #3
2983; EG-NEXT:    ALU clause starting at 52:
2984; EG-NEXT:     MOV * T0.Y, T3.X,
2985; EG-NEXT:     MOV * T7.X, 0.0,
2986; EG-NEXT:    ALU clause starting at 54:
2987; EG-NEXT:     AND_INT T0.W, T8.X, literal.x,
2988; EG-NEXT:     AND_INT * T1.W, T9.X, literal.x,
2989; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
2990; EG-NEXT:     MIN_UINT * T0.W, PV.W, PS,
2991; EG-NEXT:     LSHL T0.W, PV.W, literal.x,
2992; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
2993; EG-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
2994; EG-NEXT:     OR_INT * T0.W, PS, PV.W,
2995; EG-NEXT:     MOV * T3.X, PV.W,
2996; EG-NEXT:     MOV * T0.Y, PV.X,
2997; EG-NEXT:    ALU clause starting at 64:
2998; EG-NEXT:     AND_INT T0.W, T8.X, literal.x,
2999; EG-NEXT:     AND_INT * T1.W, T9.X, literal.x,
3000; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
3001; EG-NEXT:     AND_INT T2.W, T0.Y, literal.x,
3002; EG-NEXT:     MIN_UINT * T0.W, PV.W, PS,
3003; EG-NEXT:    -65536(nan), 0(0.000000e+00)
3004; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
3005; EG-NEXT:     MOV T3.X, PV.W,
3006; EG-NEXT:     MOV * T0.Y, T2.X,
3007; EG-NEXT:    ALU clause starting at 73:
3008; EG-NEXT:     AND_INT T0.W, T8.X, literal.x,
3009; EG-NEXT:     AND_INT * T1.W, T9.X, literal.x,
3010; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
3011; EG-NEXT:     MIN_UINT T0.W, PV.W, PS,
3012; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.x,
3013; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
3014; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
3015; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
3016; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
3017; EG-NEXT:     MOV * T2.X, PV.W,
3018; EG-NEXT:     MOV * T0.Y, PV.X,
3019; EG-NEXT:    ALU clause starting at 84:
3020; EG-NEXT:     AND_INT T0.W, T8.X, literal.x,
3021; EG-NEXT:     AND_INT * T1.W, T9.X, literal.x,
3022; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
3023; EG-NEXT:     AND_INT T2.W, T0.Y, literal.x,
3024; EG-NEXT:     MIN_UINT * T0.W, PV.W, PS,
3025; EG-NEXT:    -65536(nan), 0(0.000000e+00)
3026; EG-NEXT:     OR_INT * T7.Z, PV.W, PS,
3027; EG-NEXT:     MOV T2.X, PV.Z,
3028; EG-NEXT:     MOV * T0.Y, T5.X,
3029; EG-NEXT:    ALU clause starting at 93:
3030; EG-NEXT:     AND_INT T0.W, T8.X, literal.x,
3031; EG-NEXT:     AND_INT * T1.W, T9.X, literal.x,
3032; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
3033; EG-NEXT:     MIN_UINT T0.W, PV.W, PS,
3034; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.x,
3035; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
3036; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
3037; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
3038; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
3039; EG-NEXT:     MOV * T5.X, PV.W,
3040; EG-NEXT:     MOV * T0.Y, PV.X,
3041; EG-NEXT:    ALU clause starting at 104:
3042; EG-NEXT:     AND_INT T0.W, T8.X, literal.x,
3043; EG-NEXT:     AND_INT * T1.W, T9.X, literal.x,
3044; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
3045; EG-NEXT:     AND_INT T2.W, T0.Y, literal.x,
3046; EG-NEXT:     MIN_UINT * T0.W, PV.W, PS,
3047; EG-NEXT:    -65536(nan), 0(0.000000e+00)
3048; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
3049; EG-NEXT:     MOV T5.X, PV.W,
3050; EG-NEXT:     MOV * T0.Y, T4.X,
3051; EG-NEXT:    ALU clause starting at 113:
3052; EG-NEXT:     AND_INT T0.W, T8.X, literal.x,
3053; EG-NEXT:     AND_INT * T1.W, T9.X, literal.x,
3054; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
3055; EG-NEXT:     MIN_UINT T0.W, PV.W, PS,
3056; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.x,
3057; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
3058; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
3059; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
3060; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
3061; EG-NEXT:     MOV * T4.X, PV.W,
3062; EG-NEXT:     MOV * T0.Y, PV.X,
3063; EG-NEXT:    ALU clause starting at 124:
3064; EG-NEXT:     AND_INT T0.W, T8.X, literal.x,
3065; EG-NEXT:     AND_INT * T1.W, T7.X, literal.x,
3066; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
3067; EG-NEXT:     LSHR T8.X, KC0[2].Y, literal.x,
3068; EG-NEXT:     AND_INT T2.W, T0.Y, literal.y,
3069; EG-NEXT:     MIN_UINT * T0.W, PV.W, PS,
3070; EG-NEXT:    2(2.802597e-45), -65536(nan)
3071; EG-NEXT:     OR_INT * T7.X, PV.W, PS,
3072; EG-NEXT:     MOV T4.X, PV.X,
3073; EG-NEXT:     MOV * T7.W, T3.X,
3074; EG-NEXT:     MOV * T7.Y, T5.X,
3075;
3076; CI-LABEL: s_test_umin_ult_v8i16:
3077; CI:       ; %bb.0:
3078; CI-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x4
3079; CI-NEXT:    s_load_dwordx2 s[8:9], s[8:9], 0x0
3080; CI-NEXT:    s_waitcnt lgkmcnt(0)
3081; CI-NEXT:    s_lshr_b32 s10, s0, 16
3082; CI-NEXT:    s_and_b32 s0, s0, 0xffff
3083; CI-NEXT:    s_lshr_b32 s11, s1, 16
3084; CI-NEXT:    s_and_b32 s1, s1, 0xffff
3085; CI-NEXT:    s_lshr_b32 s12, s2, 16
3086; CI-NEXT:    s_and_b32 s2, s2, 0xffff
3087; CI-NEXT:    s_lshr_b32 s13, s3, 16
3088; CI-NEXT:    s_and_b32 s3, s3, 0xffff
3089; CI-NEXT:    s_lshr_b32 s14, s4, 16
3090; CI-NEXT:    s_and_b32 s4, s4, 0xffff
3091; CI-NEXT:    s_lshr_b32 s15, s5, 16
3092; CI-NEXT:    s_and_b32 s5, s5, 0xffff
3093; CI-NEXT:    s_lshr_b32 s16, s6, 16
3094; CI-NEXT:    s_and_b32 s6, s6, 0xffff
3095; CI-NEXT:    s_lshr_b32 s17, s7, 16
3096; CI-NEXT:    s_and_b32 s7, s7, 0xffff
3097; CI-NEXT:    s_min_u32 s3, s3, s7
3098; CI-NEXT:    s_min_u32 s7, s13, s17
3099; CI-NEXT:    s_min_u32 s2, s2, s6
3100; CI-NEXT:    s_min_u32 s6, s12, s16
3101; CI-NEXT:    s_min_u32 s1, s1, s5
3102; CI-NEXT:    s_min_u32 s5, s11, s15
3103; CI-NEXT:    s_min_u32 s0, s0, s4
3104; CI-NEXT:    s_min_u32 s4, s10, s14
3105; CI-NEXT:    s_lshl_b32 s7, s7, 16
3106; CI-NEXT:    s_lshl_b32 s6, s6, 16
3107; CI-NEXT:    s_lshl_b32 s5, s5, 16
3108; CI-NEXT:    s_lshl_b32 s4, s4, 16
3109; CI-NEXT:    s_or_b32 s3, s3, s7
3110; CI-NEXT:    s_or_b32 s2, s2, s6
3111; CI-NEXT:    s_or_b32 s1, s1, s5
3112; CI-NEXT:    s_or_b32 s0, s0, s4
3113; CI-NEXT:    v_mov_b32_e32 v4, s8
3114; CI-NEXT:    v_mov_b32_e32 v0, s0
3115; CI-NEXT:    v_mov_b32_e32 v1, s1
3116; CI-NEXT:    v_mov_b32_e32 v2, s2
3117; CI-NEXT:    v_mov_b32_e32 v3, s3
3118; CI-NEXT:    v_mov_b32_e32 v5, s9
3119; CI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
3120; CI-NEXT:    s_endpgm
3121;
3122; VI-LABEL: s_test_umin_ult_v8i16:
3123; VI:       ; %bb.0:
3124; VI-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x10
3125; VI-NEXT:    s_load_dwordx2 s[8:9], s[8:9], 0x0
3126; VI-NEXT:    s_waitcnt lgkmcnt(0)
3127; VI-NEXT:    s_lshr_b32 s10, s3, 16
3128; VI-NEXT:    s_and_b32 s3, s3, 0xffff
3129; VI-NEXT:    s_lshr_b32 s11, s2, 16
3130; VI-NEXT:    s_and_b32 s2, s2, 0xffff
3131; VI-NEXT:    s_lshr_b32 s12, s1, 16
3132; VI-NEXT:    s_and_b32 s1, s1, 0xffff
3133; VI-NEXT:    s_lshr_b32 s13, s0, 16
3134; VI-NEXT:    s_and_b32 s0, s0, 0xffff
3135; VI-NEXT:    s_lshr_b32 s14, s7, 16
3136; VI-NEXT:    s_and_b32 s7, s7, 0xffff
3137; VI-NEXT:    s_lshr_b32 s15, s6, 16
3138; VI-NEXT:    s_and_b32 s6, s6, 0xffff
3139; VI-NEXT:    s_lshr_b32 s16, s5, 16
3140; VI-NEXT:    s_and_b32 s5, s5, 0xffff
3141; VI-NEXT:    s_lshr_b32 s17, s4, 16
3142; VI-NEXT:    s_and_b32 s4, s4, 0xffff
3143; VI-NEXT:    s_min_u32 s0, s0, s4
3144; VI-NEXT:    s_min_u32 s4, s13, s17
3145; VI-NEXT:    s_min_u32 s1, s1, s5
3146; VI-NEXT:    s_min_u32 s5, s12, s16
3147; VI-NEXT:    s_min_u32 s2, s2, s6
3148; VI-NEXT:    s_min_u32 s6, s11, s15
3149; VI-NEXT:    s_min_u32 s3, s3, s7
3150; VI-NEXT:    s_min_u32 s7, s10, s14
3151; VI-NEXT:    s_lshl_b32 s7, s7, 16
3152; VI-NEXT:    s_lshl_b32 s6, s6, 16
3153; VI-NEXT:    s_lshl_b32 s5, s5, 16
3154; VI-NEXT:    s_lshl_b32 s4, s4, 16
3155; VI-NEXT:    s_or_b32 s3, s3, s7
3156; VI-NEXT:    s_or_b32 s2, s2, s6
3157; VI-NEXT:    s_or_b32 s1, s1, s5
3158; VI-NEXT:    s_or_b32 s0, s0, s4
3159; VI-NEXT:    v_mov_b32_e32 v4, s8
3160; VI-NEXT:    v_mov_b32_e32 v0, s0
3161; VI-NEXT:    v_mov_b32_e32 v1, s1
3162; VI-NEXT:    v_mov_b32_e32 v2, s2
3163; VI-NEXT:    v_mov_b32_e32 v3, s3
3164; VI-NEXT:    v_mov_b32_e32 v5, s9
3165; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
3166; VI-NEXT:    s_endpgm
3167;
3168; GFX9-LABEL: s_test_umin_ult_v8i16:
3169; GFX9:       ; %bb.0:
3170; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x10
3171; GFX9-NEXT:    v_mov_b32_e32 v4, 0
3172; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[8:9], 0x0
3173; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3174; GFX9-NEXT:    v_mov_b32_e32 v0, s7
3175; GFX9-NEXT:    v_mov_b32_e32 v1, s6
3176; GFX9-NEXT:    v_pk_min_u16 v3, s3, v0
3177; GFX9-NEXT:    v_mov_b32_e32 v0, s5
3178; GFX9-NEXT:    v_pk_min_u16 v2, s2, v1
3179; GFX9-NEXT:    v_pk_min_u16 v1, s1, v0
3180; GFX9-NEXT:    v_mov_b32_e32 v0, s4
3181; GFX9-NEXT:    v_pk_min_u16 v0, s0, v0
3182; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[8:9]
3183; GFX9-NEXT:    s_endpgm
3184;
3185; GFX10-LABEL: s_test_umin_ult_v8i16:
3186; GFX10:       ; %bb.0:
3187; GFX10-NEXT:    s_clause 0x1
3188; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x10
3189; GFX10-NEXT:    s_load_dwordx2 s[10:11], s[8:9], 0x0
3190; GFX10-NEXT:    v_mov_b32_e32 v4, 0
3191; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
3192; GFX10-NEXT:    v_pk_min_u16 v3, s3, s7
3193; GFX10-NEXT:    v_pk_min_u16 v2, s2, s6
3194; GFX10-NEXT:    v_pk_min_u16 v1, s1, s5
3195; GFX10-NEXT:    v_pk_min_u16 v0, s0, s4
3196; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[10:11]
3197; GFX10-NEXT:    s_endpgm
3198;
3199; GFX11-LABEL: s_test_umin_ult_v8i16:
3200; GFX11:       ; %bb.0:
3201; GFX11-NEXT:    s_clause 0x1
3202; GFX11-NEXT:    s_load_b256 s[8:15], s[4:5], 0x10
3203; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3204; GFX11-NEXT:    v_mov_b32_e32 v4, 0
3205; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3206; GFX11-NEXT:    v_pk_min_u16 v3, s11, s15
3207; GFX11-NEXT:    v_pk_min_u16 v2, s10, s14
3208; GFX11-NEXT:    v_pk_min_u16 v1, s9, s13
3209; GFX11-NEXT:    v_pk_min_u16 v0, s8, s12
3210; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
3211; GFX11-NEXT:    s_endpgm
3212  %cmp = icmp ult <8 x i16> %a, %b
3213  %val = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b
3214  store <8 x i16> %val, ptr addrspace(1) %out
3215  ret void
3216}
3217
3218; Make sure the redundant 'and' instruction (mask with 65535) is removed.
3219
; Unsigned min of two zeroext i16 kernel arguments, computed in i32 via
; icmp ult + select and then masked with 65535 before the i32 store.
; None of the expected sequences below contains a mask after the min.
; NOTE(review): the r600 expectations apply BFE_INT to both operands even
; though the arguments are zeroext; confirm this is intended.
3220define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspace(1) %out, [8 x i32], i16 zeroext %a, [8 x i32], i16 zeroext %b) #0 {
3221; EG-LABEL: simplify_demanded_bits_test_umin_ult_i16:
3222; EG:       ; %bb.0:
3223; EG-NEXT:    ALU 0, @10, KC0[], KC1[]
3224; EG-NEXT:    TEX 1 @6
3225; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
3226; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
3227; EG-NEXT:    CF_END
3228; EG-NEXT:    PAD
3229; EG-NEXT:    Fetch clause starting at 6:
3230; EG-NEXT:     VTX_READ_16 T1.X, T0.X, 72, #3
3231; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 108, #3
3232; EG-NEXT:    ALU clause starting at 10:
3233; EG-NEXT:     MOV * T0.X, 0.0,
3234; EG-NEXT:    ALU clause starting at 11:
3235; EG-NEXT:     BFE_INT T0.Z, T1.X, 0.0, literal.x,
3236; EG-NEXT:     BFE_INT * T0.W, T0.X, 0.0, literal.x, BS:VEC_120/SCL_212
3237; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
3238; EG-NEXT:     MIN_UINT T0.X, PV.Z, PV.W,
3239; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
3240; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
3241;
3242; CI-LABEL: simplify_demanded_bits_test_umin_ult_i16:
3243; CI:       ; %bb.0:
3244; CI-NEXT:    s_load_dword s2, s[8:9], 0xa
3245; CI-NEXT:    s_load_dword s3, s[8:9], 0x13
3246; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
3247; CI-NEXT:    s_waitcnt lgkmcnt(0)
3248; CI-NEXT:    s_and_b32 s2, s2, 0xffff
3249; CI-NEXT:    s_and_b32 s3, s3, 0xffff
3250; CI-NEXT:    s_min_u32 s2, s2, s3
3251; CI-NEXT:    v_mov_b32_e32 v0, s0
3252; CI-NEXT:    v_mov_b32_e32 v1, s1
3253; CI-NEXT:    v_mov_b32_e32 v2, s2
3254; CI-NEXT:    flat_store_dword v[0:1], v2
3255; CI-NEXT:    s_endpgm
3256;
3257; VI-LABEL: simplify_demanded_bits_test_umin_ult_i16:
3258; VI:       ; %bb.0:
3259; VI-NEXT:    s_load_dword s2, s[8:9], 0x28
3260; VI-NEXT:    s_load_dword s3, s[8:9], 0x4c
3261; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
3262; VI-NEXT:    s_waitcnt lgkmcnt(0)
3263; VI-NEXT:    s_and_b32 s2, s2, 0xffff
3264; VI-NEXT:    s_and_b32 s3, s3, 0xffff
3265; VI-NEXT:    s_min_u32 s2, s2, s3
3266; VI-NEXT:    v_mov_b32_e32 v0, s0
3267; VI-NEXT:    v_mov_b32_e32 v1, s1
3268; VI-NEXT:    v_mov_b32_e32 v2, s2
3269; VI-NEXT:    flat_store_dword v[0:1], v2
3270; VI-NEXT:    s_endpgm
3271;
3272; GFX9-LABEL: simplify_demanded_bits_test_umin_ult_i16:
3273; GFX9:       ; %bb.0:
3274; GFX9-NEXT:    s_load_dword s2, s[8:9], 0x28
3275; GFX9-NEXT:    s_load_dword s3, s[8:9], 0x4c
3276; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
3277; GFX9-NEXT:    v_mov_b32_e32 v0, 0
3278; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3279; GFX9-NEXT:    s_and_b32 s2, s2, 0xffff
3280; GFX9-NEXT:    s_and_b32 s3, s3, 0xffff
3281; GFX9-NEXT:    s_min_u32 s2, s2, s3
3282; GFX9-NEXT:    v_mov_b32_e32 v1, s2
3283; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
3284; GFX9-NEXT:    s_endpgm
3285;
3286; GFX10-LABEL: simplify_demanded_bits_test_umin_ult_i16:
3287; GFX10:       ; %bb.0:
3288; GFX10-NEXT:    s_clause 0x2
3289; GFX10-NEXT:    s_load_dword s2, s[8:9], 0x28
3290; GFX10-NEXT:    s_load_dword s3, s[8:9], 0x4c
3291; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
3292; GFX10-NEXT:    v_mov_b32_e32 v0, 0
3293; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
3294; GFX10-NEXT:    s_and_b32 s2, s2, 0xffff
3295; GFX10-NEXT:    s_and_b32 s3, s3, 0xffff
3296; GFX10-NEXT:    s_min_u32 s2, s2, s3
3297; GFX10-NEXT:    v_mov_b32_e32 v1, s2
3298; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
3299; GFX10-NEXT:    s_endpgm
3300;
3301; GFX11-LABEL: simplify_demanded_bits_test_umin_ult_i16:
3302; GFX11:       ; %bb.0:
3303; GFX11-NEXT:    s_clause 0x2
3304; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x28
3305; GFX11-NEXT:    s_load_b32 s3, s[4:5], 0x4c
3306; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3307; GFX11-NEXT:    v_mov_b32_e32 v0, 0
3308; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3309; GFX11-NEXT:    s_and_b32 s2, s2, 0xffff
3310; GFX11-NEXT:    s_and_b32 s3, s3, 0xffff
3311; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
3312; GFX11-NEXT:    s_min_u32 s2, s2, s3
3313; GFX11-NEXT:    v_mov_b32_e32 v1, s2
3314; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
3315; GFX11-NEXT:    s_endpgm
3316  %a.ext = zext i16 %a to i32 ; widen both operands to i32 with zero upper bits
3317  %b.ext = zext i16 %b to i32
3318  %cmp = icmp ult i32 %a.ext, %b.ext ; icmp ult + select is the unsigned-min idiom
3319  %val = select i1 %cmp, i32 %a.ext, i32 %b.ext
3320  %mask = and i32 %val, 65535 ; redundant: %val is one of the zero-extended inputs
3321  store i32 %mask, ptr addrspace(1) %out
3322  ret void
3323}
3324
3325; Make sure redundant sign_extend_inreg removed.
3326
; Signed min of two signext i16 kernel arguments, computed in i32 via
; icmp slt + select and then re-sign-extended from bit 15 with a shl/ashr
; pair before the i32 store. None of the expected sequences below contains
; a shift pair after the min.
3327define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace(1) %out, [8 x i32], i16 signext %a, [8 x i32], i16 signext %b) #0 {
3328; EG-LABEL: simplify_demanded_bits_test_min_slt_i16:
3329; EG:       ; %bb.0:
3330; EG-NEXT:    ALU 0, @10, KC0[], KC1[]
3331; EG-NEXT:    TEX 1 @6
3332; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
3333; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
3334; EG-NEXT:    CF_END
3335; EG-NEXT:    PAD
3336; EG-NEXT:    Fetch clause starting at 6:
3337; EG-NEXT:     VTX_READ_16 T1.X, T0.X, 72, #3
3338; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 108, #3
3339; EG-NEXT:    ALU clause starting at 10:
3340; EG-NEXT:     MOV * T0.X, 0.0,
3341; EG-NEXT:    ALU clause starting at 11:
3342; EG-NEXT:     BFE_INT T0.Z, T1.X, 0.0, literal.x,
3343; EG-NEXT:     BFE_INT * T0.W, T0.X, 0.0, literal.x, BS:VEC_120/SCL_212
3344; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
3345; EG-NEXT:     MIN_INT T0.X, PV.Z, PV.W,
3346; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
3347; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
3348;
3349; CI-LABEL: simplify_demanded_bits_test_min_slt_i16:
3350; CI:       ; %bb.0:
3351; CI-NEXT:    s_load_dword s2, s[8:9], 0xa
3352; CI-NEXT:    s_load_dword s3, s[8:9], 0x13
3353; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
3354; CI-NEXT:    s_waitcnt lgkmcnt(0)
3355; CI-NEXT:    s_sext_i32_i16 s2, s2
3356; CI-NEXT:    s_sext_i32_i16 s3, s3
3357; CI-NEXT:    s_min_i32 s2, s2, s3
3358; CI-NEXT:    v_mov_b32_e32 v0, s0
3359; CI-NEXT:    v_mov_b32_e32 v1, s1
3360; CI-NEXT:    v_mov_b32_e32 v2, s2
3361; CI-NEXT:    flat_store_dword v[0:1], v2
3362; CI-NEXT:    s_endpgm
3363;
3364; VI-LABEL: simplify_demanded_bits_test_min_slt_i16:
3365; VI:       ; %bb.0:
3366; VI-NEXT:    s_load_dword s2, s[8:9], 0x28
3367; VI-NEXT:    s_load_dword s3, s[8:9], 0x4c
3368; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
3369; VI-NEXT:    s_waitcnt lgkmcnt(0)
3370; VI-NEXT:    s_sext_i32_i16 s2, s2
3371; VI-NEXT:    s_sext_i32_i16 s3, s3
3372; VI-NEXT:    s_min_i32 s2, s2, s3
3373; VI-NEXT:    v_mov_b32_e32 v0, s0
3374; VI-NEXT:    v_mov_b32_e32 v1, s1
3375; VI-NEXT:    v_mov_b32_e32 v2, s2
3376; VI-NEXT:    flat_store_dword v[0:1], v2
3377; VI-NEXT:    s_endpgm
3378;
3379; GFX9-LABEL: simplify_demanded_bits_test_min_slt_i16:
3380; GFX9:       ; %bb.0:
3381; GFX9-NEXT:    s_load_dword s2, s[8:9], 0x28
3382; GFX9-NEXT:    s_load_dword s3, s[8:9], 0x4c
3383; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
3384; GFX9-NEXT:    v_mov_b32_e32 v0, 0
3385; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3386; GFX9-NEXT:    s_sext_i32_i16 s2, s2
3387; GFX9-NEXT:    s_sext_i32_i16 s3, s3
3388; GFX9-NEXT:    s_min_i32 s2, s2, s3
3389; GFX9-NEXT:    v_mov_b32_e32 v1, s2
3390; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
3391; GFX9-NEXT:    s_endpgm
3392;
3393; GFX10-LABEL: simplify_demanded_bits_test_min_slt_i16:
3394; GFX10:       ; %bb.0:
3395; GFX10-NEXT:    s_clause 0x2
3396; GFX10-NEXT:    s_load_dword s2, s[8:9], 0x28
3397; GFX10-NEXT:    s_load_dword s3, s[8:9], 0x4c
3398; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
3399; GFX10-NEXT:    v_mov_b32_e32 v0, 0
3400; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
3401; GFX10-NEXT:    s_sext_i32_i16 s2, s2
3402; GFX10-NEXT:    s_sext_i32_i16 s3, s3
3403; GFX10-NEXT:    s_min_i32 s2, s2, s3
3404; GFX10-NEXT:    v_mov_b32_e32 v1, s2
3405; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
3406; GFX10-NEXT:    s_endpgm
3407;
3408; GFX11-LABEL: simplify_demanded_bits_test_min_slt_i16:
3409; GFX11:       ; %bb.0:
3410; GFX11-NEXT:    s_clause 0x2
3411; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x28
3412; GFX11-NEXT:    s_load_b32 s3, s[4:5], 0x4c
3413; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3414; GFX11-NEXT:    v_mov_b32_e32 v0, 0
3415; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3416; GFX11-NEXT:    s_sext_i32_i16 s2, s2
3417; GFX11-NEXT:    s_sext_i32_i16 s3, s3
3418; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
3419; GFX11-NEXT:    s_min_i32 s2, s2, s3
3420; GFX11-NEXT:    v_mov_b32_e32 v1, s2
3421; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
3422; GFX11-NEXT:    s_endpgm
3423  %a.ext = sext i16 %a to i32 ; widen both operands to i32 with sign-filled upper bits
3424  %b.ext = sext i16 %b to i32
3425  %cmp = icmp slt i32 %a.ext, %b.ext ; icmp slt + select is the signed-min idiom
3426  %val = select i1 %cmp, i32 %a.ext, i32 %b.ext
3427  %shl = shl i32 %val, 16 ; shl 16 + ashr 16 re-sign-extends from bit 15
3428  %sextinreg = ashr i32 %shl, 16 ; redundant: %val is one of the sign-extended inputs
3429  store i32 %sextinreg, ptr addrspace(1) %out
3430  ret void
3431}
3432
; Signed min of two plain i16 kernel arguments (icmp sle + select) stored as
; an i16. In the amdgcn expectations below both arguments arrive in a single
; dword load: the low half is sign-extended with s_sext_i32_i16, the high half
; is extracted with an arithmetic shift right by 16, and the result of
; s_min_i32 is written with a 16-bit store.
3433define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i16 %b) #0 {
3434; EG-LABEL: s_test_imin_sle_i16:
3435; EG:       ; %bb.0:
3436; EG-NEXT:    ALU 0, @10, KC0[], KC1[]
3437; EG-NEXT:    TEX 1 @6
3438; EG-NEXT:    ALU 14, @11, KC0[CB0:0-32], KC1[]
3439; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
3440; EG-NEXT:    CF_END
3441; EG-NEXT:    PAD
3442; EG-NEXT:    Fetch clause starting at 6:
3443; EG-NEXT:     VTX_READ_16 T1.X, T0.X, 40, #3
3444; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 42, #3
3445; EG-NEXT:    ALU clause starting at 10:
3446; EG-NEXT:     MOV * T0.X, 0.0,
3447; EG-NEXT:    ALU clause starting at 11:
3448; EG-NEXT:     BFE_INT T0.Z, T1.X, 0.0, literal.x,
3449; EG-NEXT:     BFE_INT T0.W, T0.X, 0.0, literal.x, BS:VEC_120/SCL_212
3450; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.y,
3451; EG-NEXT:    16(2.242078e-44), 3(4.203895e-45)
3452; EG-NEXT:     MIN_INT * T0.W, PV.Z, PV.W,
3453; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
3454; EG-NEXT:     LSHL * T1.W, T1.W, literal.y,
3455; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
3456; EG-NEXT:     LSHL T0.X, PV.W, PS,
3457; EG-NEXT:     LSHL * T0.W, literal.x, PS,
3458; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
3459; EG-NEXT:     MOV T0.Y, 0.0,
3460; EG-NEXT:     MOV * T0.Z, 0.0,
3461; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
3462; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
3463;
3464; CI-LABEL: s_test_imin_sle_i16:
3465; CI:       ; %bb.0:
3466; CI-NEXT:    s_load_dword s2, s[8:9], 0x2
3467; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
3468; CI-NEXT:    s_waitcnt lgkmcnt(0)
3469; CI-NEXT:    s_sext_i32_i16 s3, s2
3470; CI-NEXT:    s_ashr_i32 s2, s2, 16
3471; CI-NEXT:    s_min_i32 s2, s3, s2
3472; CI-NEXT:    v_mov_b32_e32 v0, s0
3473; CI-NEXT:    v_mov_b32_e32 v1, s1
3474; CI-NEXT:    v_mov_b32_e32 v2, s2
3475; CI-NEXT:    flat_store_short v[0:1], v2
3476; CI-NEXT:    s_endpgm
3477;
3478; VI-LABEL: s_test_imin_sle_i16:
3479; VI:       ; %bb.0:
3480; VI-NEXT:    s_load_dword s2, s[8:9], 0x8
3481; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
3482; VI-NEXT:    s_waitcnt lgkmcnt(0)
3483; VI-NEXT:    s_sext_i32_i16 s3, s2
3484; VI-NEXT:    s_ashr_i32 s2, s2, 16
3485; VI-NEXT:    s_min_i32 s2, s3, s2
3486; VI-NEXT:    v_mov_b32_e32 v0, s0
3487; VI-NEXT:    v_mov_b32_e32 v1, s1
3488; VI-NEXT:    v_mov_b32_e32 v2, s2
3489; VI-NEXT:    flat_store_short v[0:1], v2
3490; VI-NEXT:    s_endpgm
3491;
3492; GFX9-LABEL: s_test_imin_sle_i16:
3493; GFX9:       ; %bb.0:
3494; GFX9-NEXT:    s_load_dword s2, s[8:9], 0x8
3495; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
3496; GFX9-NEXT:    v_mov_b32_e32 v0, 0
3497; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3498; GFX9-NEXT:    s_sext_i32_i16 s3, s2
3499; GFX9-NEXT:    s_ashr_i32 s2, s2, 16
3500; GFX9-NEXT:    s_min_i32 s2, s3, s2
3501; GFX9-NEXT:    v_mov_b32_e32 v1, s2
3502; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
3503; GFX9-NEXT:    s_endpgm
3504;
3505; GFX10-LABEL: s_test_imin_sle_i16:
3506; GFX10:       ; %bb.0:
3507; GFX10-NEXT:    s_clause 0x1
3508; GFX10-NEXT:    s_load_dword s2, s[8:9], 0x8
3509; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
3510; GFX10-NEXT:    v_mov_b32_e32 v0, 0
3511; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
3512; GFX10-NEXT:    s_sext_i32_i16 s3, s2
3513; GFX10-NEXT:    s_ashr_i32 s2, s2, 16
3514; GFX10-NEXT:    s_min_i32 s2, s3, s2
3515; GFX10-NEXT:    v_mov_b32_e32 v1, s2
3516; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
3517; GFX10-NEXT:    s_endpgm
3518;
3519; GFX11-LABEL: s_test_imin_sle_i16:
3520; GFX11:       ; %bb.0:
3521; GFX11-NEXT:    s_clause 0x1
3522; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x8
3523; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3524; GFX11-NEXT:    v_mov_b32_e32 v0, 0
3525; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3526; GFX11-NEXT:    s_sext_i32_i16 s3, s2
3527; GFX11-NEXT:    s_ashr_i32 s2, s2, 16
3528; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
3529; GFX11-NEXT:    s_min_i32 s2, s3, s2
3530; GFX11-NEXT:    v_mov_b32_e32 v1, s2
3531; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
3532; GFX11-NEXT:    s_endpgm
3533  %cmp = icmp sle i16 %a, %b ; icmp sle + select is the signed-min idiom
3534  %val = select i1 %cmp, i16 %a, i16 %b
3535  store i16 %val, ptr addrspace(1) %out
3536  ret void
3537}
3538
3539; 64 bit
3540
; Unsigned 64-bit min of two scalar kernel arguments (icmp ult + select),
; stored with 8-byte alignment. The amdgcn expectations below use a 64-bit
; unsigned less-than compare (v_cmp_lt_u64) followed by two s_cselect_b32,
; one per 32-bit half; the r600 expectations use SETE_INT / SETGT_UINT
; compares combined with CNDE_INT selects.
3541define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
3542; EG-LABEL: test_umin_ult_i64:
3543; EG:       ; %bb.0:
3544; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
3545; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
3546; EG-NEXT:    CF_END
3547; EG-NEXT:    PAD
3548; EG-NEXT:    ALU clause starting at 4:
3549; EG-NEXT:     SETE_INT T0.Z, KC0[3].X, KC0[3].Z,
3550; EG-NEXT:     SETGT_UINT * T0.W, KC0[3].Z, KC0[3].X,
3551; EG-NEXT:     SETGT_UINT * T1.W, KC0[3].Y, KC0[2].W,
3552; EG-NEXT:     CNDE_INT * T0.W, T0.Z, T0.W, PV.W,
3553; EG-NEXT:     CNDE_INT * T0.Y, PV.W, KC0[3].Z, KC0[3].X,
3554; EG-NEXT:     CNDE_INT * T0.X, T0.W, KC0[3].Y, KC0[2].W,
3555; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
3556; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
3557;
3558; CI-LABEL: test_umin_ult_i64:
3559; CI:       ; %bb.0:
3560; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
3561; CI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x4
3562; CI-NEXT:    s_waitcnt lgkmcnt(0)
3563; CI-NEXT:    v_mov_b32_e32 v0, s0
3564; CI-NEXT:    v_mov_b32_e32 v1, s4
3565; CI-NEXT:    v_mov_b32_e32 v2, s5
3566; CI-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[1:2]
3567; CI-NEXT:    v_mov_b32_e32 v1, s1
3568; CI-NEXT:    s_and_b64 s[0:1], vcc, exec
3569; CI-NEXT:    s_cselect_b32 s0, s3, s5
3570; CI-NEXT:    s_cselect_b32 s1, s2, s4
3571; CI-NEXT:    v_mov_b32_e32 v2, s1
3572; CI-NEXT:    v_mov_b32_e32 v3, s0
3573; CI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
3574; CI-NEXT:    s_endpgm
3575;
3576; VI-LABEL: test_umin_ult_i64:
3577; VI:       ; %bb.0:
3578; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
3579; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
3580; VI-NEXT:    s_waitcnt lgkmcnt(0)
3581; VI-NEXT:    v_mov_b32_e32 v0, s0
3582; VI-NEXT:    v_mov_b32_e32 v1, s4
3583; VI-NEXT:    v_mov_b32_e32 v2, s5
3584; VI-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[1:2]
3585; VI-NEXT:    v_mov_b32_e32 v1, s1
3586; VI-NEXT:    s_and_b64 s[0:1], vcc, exec
3587; VI-NEXT:    s_cselect_b32 s0, s3, s5
3588; VI-NEXT:    s_cselect_b32 s1, s2, s4
3589; VI-NEXT:    v_mov_b32_e32 v2, s1
3590; VI-NEXT:    v_mov_b32_e32 v3, s0
3591; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
3592; VI-NEXT:    s_endpgm
3593;
3594; GFX9-LABEL: test_umin_ult_i64:
3595; GFX9:       ; %bb.0:
3596; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
3597; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
3598; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3599; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3600; GFX9-NEXT:    v_mov_b32_e32 v0, s4
3601; GFX9-NEXT:    v_mov_b32_e32 v1, s5
3602; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
3603; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
3604; GFX9-NEXT:    s_cselect_b32 s3, s3, s5
3605; GFX9-NEXT:    s_cselect_b32 s2, s2, s4
3606; GFX9-NEXT:    v_mov_b32_e32 v0, s2
3607; GFX9-NEXT:    v_mov_b32_e32 v1, s3
3608; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
3609; GFX9-NEXT:    s_endpgm
3610;
3611; GFX10-LABEL: test_umin_ult_i64:
3612; GFX10:       ; %bb.0:
3613; GFX10-NEXT:    s_clause 0x1
3614; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
3615; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
3616; GFX10-NEXT:    v_mov_b32_e32 v2, 0
3617; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
3618; GFX10-NEXT:    v_cmp_lt_u64_e64 s6, s[2:3], s[4:5]
3619; GFX10-NEXT:    s_and_b32 s6, s6, exec_lo
3620; GFX10-NEXT:    s_cselect_b32 s2, s2, s4
3621; GFX10-NEXT:    s_cselect_b32 s3, s3, s5
3622; GFX10-NEXT:    v_mov_b32_e32 v0, s2
3623; GFX10-NEXT:    v_mov_b32_e32 v1, s3
3624; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
3625; GFX10-NEXT:    s_endpgm
3626;
3627; GFX11-LABEL: test_umin_ult_i64:
3628; GFX11:       ; %bb.0:
3629; GFX11-NEXT:    s_clause 0x1
3630; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
3631; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x10
3632; GFX11-NEXT:    v_mov_b32_e32 v2, 0
3633; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3634; GFX11-NEXT:    v_cmp_lt_u64_e64 s6, s[2:3], s[4:5]
3635; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3636; GFX11-NEXT:    s_and_b32 s6, s6, exec_lo
3637; GFX11-NEXT:    s_cselect_b32 s2, s2, s4
3638; GFX11-NEXT:    s_cselect_b32 s3, s3, s5
3639; GFX11-NEXT:    v_mov_b32_e32 v0, s2
3640; GFX11-NEXT:    v_mov_b32_e32 v1, s3
3641; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
3642; GFX11-NEXT:    s_endpgm
3643  %tmp = icmp ult i64 %a, %b ; icmp ult + select is the unsigned-min idiom
3644  %val = select i1 %tmp, i64 %a, i64 %b
3645  store i64 %val, ptr addrspace(1) %out, align 8
3646  ret void
3647}
3648
; Unsigned 64-bit min of two scalar kernel arguments written with a
; less-or-equal predicate (icmp ule + select), stored with 8-byte alignment.
; The amdgcn expectations below use v_cmp_le_u64 followed by two
; s_cselect_b32, one per 32-bit half; the r600 expectations use
; SETE_INT / SETGT_UINT compares combined with CNDE_INT selects.
3649define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
3650; EG-LABEL: test_umin_ule_i64:
3651; EG:       ; %bb.0:
3652; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
3653; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
3654; EG-NEXT:    CF_END
3655; EG-NEXT:    PAD
3656; EG-NEXT:    ALU clause starting at 4:
3657; EG-NEXT:     SETE_INT T0.Z, KC0[3].X, KC0[3].Z,
3658; EG-NEXT:     SETGT_UINT * T0.W, KC0[3].Z, KC0[3].X,
3659; EG-NEXT:     SETGT_UINT * T1.W, KC0[3].Y, KC0[2].W,
3660; EG-NEXT:     CNDE_INT * T0.W, T0.Z, T0.W, PV.W,
3661; EG-NEXT:     CNDE_INT * T0.Y, PV.W, KC0[3].Z, KC0[3].X,
3662; EG-NEXT:     CNDE_INT * T0.X, T0.W, KC0[3].Y, KC0[2].W,
3663; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
3664; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
3665;
3666; CI-LABEL: test_umin_ule_i64:
3667; CI:       ; %bb.0:
3668; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
3669; CI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x4
3670; CI-NEXT:    s_waitcnt lgkmcnt(0)
3671; CI-NEXT:    v_mov_b32_e32 v0, s0
3672; CI-NEXT:    v_mov_b32_e32 v1, s4
3673; CI-NEXT:    v_mov_b32_e32 v2, s5
3674; CI-NEXT:    v_cmp_le_u64_e32 vcc, s[2:3], v[1:2]
3675; CI-NEXT:    v_mov_b32_e32 v1, s1
3676; CI-NEXT:    s_and_b64 s[0:1], vcc, exec
3677; CI-NEXT:    s_cselect_b32 s0, s3, s5
3678; CI-NEXT:    s_cselect_b32 s1, s2, s4
3679; CI-NEXT:    v_mov_b32_e32 v2, s1
3680; CI-NEXT:    v_mov_b32_e32 v3, s0
3681; CI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
3682; CI-NEXT:    s_endpgm
3683;
3684; VI-LABEL: test_umin_ule_i64:
3685; VI:       ; %bb.0:
3686; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
3687; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
3688; VI-NEXT:    s_waitcnt lgkmcnt(0)
3689; VI-NEXT:    v_mov_b32_e32 v0, s0
3690; VI-NEXT:    v_mov_b32_e32 v1, s4
3691; VI-NEXT:    v_mov_b32_e32 v2, s5
3692; VI-NEXT:    v_cmp_le_u64_e32 vcc, s[2:3], v[1:2]
3693; VI-NEXT:    v_mov_b32_e32 v1, s1
3694; VI-NEXT:    s_and_b64 s[0:1], vcc, exec
3695; VI-NEXT:    s_cselect_b32 s0, s3, s5
3696; VI-NEXT:    s_cselect_b32 s1, s2, s4
3697; VI-NEXT:    v_mov_b32_e32 v2, s1
3698; VI-NEXT:    v_mov_b32_e32 v3, s0
3699; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
3700; VI-NEXT:    s_endpgm
3701;
3702; GFX9-LABEL: test_umin_ule_i64:
3703; GFX9:       ; %bb.0:
3704; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
3705; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
3706; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3707; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3708; GFX9-NEXT:    v_mov_b32_e32 v0, s4
3709; GFX9-NEXT:    v_mov_b32_e32 v1, s5
3710; GFX9-NEXT:    v_cmp_le_u64_e32 vcc, s[2:3], v[0:1]
3711; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
3712; GFX9-NEXT:    s_cselect_b32 s3, s3, s5
3713; GFX9-NEXT:    s_cselect_b32 s2, s2, s4
3714; GFX9-NEXT:    v_mov_b32_e32 v0, s2
3715; GFX9-NEXT:    v_mov_b32_e32 v1, s3
3716; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
3717; GFX9-NEXT:    s_endpgm
3718;
3719; GFX10-LABEL: test_umin_ule_i64:
3720; GFX10:       ; %bb.0:
3721; GFX10-NEXT:    s_clause 0x1
3722; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
3723; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
3724; GFX10-NEXT:    v_mov_b32_e32 v2, 0
3725; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
3726; GFX10-NEXT:    v_cmp_le_u64_e64 s6, s[2:3], s[4:5]
3727; GFX10-NEXT:    s_and_b32 s6, s6, exec_lo
3728; GFX10-NEXT:    s_cselect_b32 s2, s2, s4
3729; GFX10-NEXT:    s_cselect_b32 s3, s3, s5
3730; GFX10-NEXT:    v_mov_b32_e32 v0, s2
3731; GFX10-NEXT:    v_mov_b32_e32 v1, s3
3732; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
3733; GFX10-NEXT:    s_endpgm
3734;
3735; GFX11-LABEL: test_umin_ule_i64:
3736; GFX11:       ; %bb.0:
3737; GFX11-NEXT:    s_clause 0x1
3738; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
3739; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x10
3740; GFX11-NEXT:    v_mov_b32_e32 v2, 0
3741; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3742; GFX11-NEXT:    v_cmp_le_u64_e64 s6, s[2:3], s[4:5]
3743; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3744; GFX11-NEXT:    s_and_b32 s6, s6, exec_lo
3745; GFX11-NEXT:    s_cselect_b32 s2, s2, s4
3746; GFX11-NEXT:    s_cselect_b32 s3, s3, s5
3747; GFX11-NEXT:    v_mov_b32_e32 v0, s2
3748; GFX11-NEXT:    v_mov_b32_e32 v1, s3
3749; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
3750; GFX11-NEXT:    s_endpgm
3751  %tmp = icmp ule i64 %a, %b ; icmp ule + select also yields the unsigned min
3752  %val = select i1 %tmp, i64 %a, i64 %b
3753  store i64 %val, ptr addrspace(1) %out, align 8
3754  ret void
3755}
3756
; Signed 64-bit min of two scalar kernel arguments (icmp slt + select),
; stored with 8-byte alignment. The amdgcn expectations below use a 64-bit
; signed less-than compare (v_cmp_lt_i64) followed by two s_cselect_b32,
; one per 32-bit half; the r600 expectations mix one signed compare
; (SETGT_INT) with one unsigned compare (SETGT_UINT) and CNDE_INT selects.
3757define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
3758; EG-LABEL: test_imin_slt_i64:
3759; EG:       ; %bb.0:
3760; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
3761; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
3762; EG-NEXT:    CF_END
3763; EG-NEXT:    PAD
3764; EG-NEXT:    ALU clause starting at 4:
3765; EG-NEXT:     SETE_INT T0.Z, KC0[3].X, KC0[3].Z,
3766; EG-NEXT:     SETGT_INT * T0.W, KC0[3].Z, KC0[3].X,
3767; EG-NEXT:     SETGT_UINT * T1.W, KC0[3].Y, KC0[2].W,
3768; EG-NEXT:     CNDE_INT * T0.W, T0.Z, T0.W, PV.W,
3769; EG-NEXT:     CNDE_INT * T0.Y, PV.W, KC0[3].Z, KC0[3].X,
3770; EG-NEXT:     CNDE_INT * T0.X, T0.W, KC0[3].Y, KC0[2].W,
3771; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
3772; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
3773;
3774; CI-LABEL: test_imin_slt_i64:
3775; CI:       ; %bb.0:
3776; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
3777; CI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x4
3778; CI-NEXT:    s_waitcnt lgkmcnt(0)
3779; CI-NEXT:    v_mov_b32_e32 v0, s0
3780; CI-NEXT:    v_mov_b32_e32 v1, s4
3781; CI-NEXT:    v_mov_b32_e32 v2, s5
3782; CI-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[1:2]
3783; CI-NEXT:    v_mov_b32_e32 v1, s1
3784; CI-NEXT:    s_and_b64 s[0:1], vcc, exec
3785; CI-NEXT:    s_cselect_b32 s0, s3, s5
3786; CI-NEXT:    s_cselect_b32 s1, s2, s4
3787; CI-NEXT:    v_mov_b32_e32 v2, s1
3788; CI-NEXT:    v_mov_b32_e32 v3, s0
3789; CI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
3790; CI-NEXT:    s_endpgm
3791;
3792; VI-LABEL: test_imin_slt_i64:
3793; VI:       ; %bb.0:
3794; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
3795; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
3796; VI-NEXT:    s_waitcnt lgkmcnt(0)
3797; VI-NEXT:    v_mov_b32_e32 v0, s0
3798; VI-NEXT:    v_mov_b32_e32 v1, s4
3799; VI-NEXT:    v_mov_b32_e32 v2, s5
3800; VI-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[1:2]
3801; VI-NEXT:    v_mov_b32_e32 v1, s1
3802; VI-NEXT:    s_and_b64 s[0:1], vcc, exec
3803; VI-NEXT:    s_cselect_b32 s0, s3, s5
3804; VI-NEXT:    s_cselect_b32 s1, s2, s4
3805; VI-NEXT:    v_mov_b32_e32 v2, s1
3806; VI-NEXT:    v_mov_b32_e32 v3, s0
3807; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
3808; VI-NEXT:    s_endpgm
3809;
3810; GFX9-LABEL: test_imin_slt_i64:
3811; GFX9:       ; %bb.0:
3812; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
3813; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
3814; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3815; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3816; GFX9-NEXT:    v_mov_b32_e32 v0, s4
3817; GFX9-NEXT:    v_mov_b32_e32 v1, s5
3818; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
3819; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
3820; GFX9-NEXT:    s_cselect_b32 s3, s3, s5
3821; GFX9-NEXT:    s_cselect_b32 s2, s2, s4
3822; GFX9-NEXT:    v_mov_b32_e32 v0, s2
3823; GFX9-NEXT:    v_mov_b32_e32 v1, s3
3824; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
3825; GFX9-NEXT:    s_endpgm
3826;
3827; GFX10-LABEL: test_imin_slt_i64:
3828; GFX10:       ; %bb.0:
3829; GFX10-NEXT:    s_clause 0x1
3830; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
3831; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
3832; GFX10-NEXT:    v_mov_b32_e32 v2, 0
3833; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
3834; GFX10-NEXT:    v_cmp_lt_i64_e64 s6, s[2:3], s[4:5]
3835; GFX10-NEXT:    s_and_b32 s6, s6, exec_lo
3836; GFX10-NEXT:    s_cselect_b32 s2, s2, s4
3837; GFX10-NEXT:    s_cselect_b32 s3, s3, s5
3838; GFX10-NEXT:    v_mov_b32_e32 v0, s2
3839; GFX10-NEXT:    v_mov_b32_e32 v1, s3
3840; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
3841; GFX10-NEXT:    s_endpgm
3842;
3843; GFX11-LABEL: test_imin_slt_i64:
3844; GFX11:       ; %bb.0:
3845; GFX11-NEXT:    s_clause 0x1
3846; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
3847; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x10
3848; GFX11-NEXT:    v_mov_b32_e32 v2, 0
3849; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3850; GFX11-NEXT:    v_cmp_lt_i64_e64 s6, s[2:3], s[4:5]
3851; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3852; GFX11-NEXT:    s_and_b32 s6, s6, exec_lo
3853; GFX11-NEXT:    s_cselect_b32 s2, s2, s4
3854; GFX11-NEXT:    s_cselect_b32 s3, s3, s5
3855; GFX11-NEXT:    v_mov_b32_e32 v0, s2
3856; GFX11-NEXT:    v_mov_b32_e32 v1, s3
3857; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
3858; GFX11-NEXT:    s_endpgm
3859  %tmp = icmp slt i64 %a, %b ; icmp slt + select is the signed-min idiom
3860  %val = select i1 %tmp, i64 %a, i64 %b
3861  store i64 %val, ptr addrspace(1) %out, align 8
3862  ret void
3863}
3864
; Scalar signed 64-bit minimum: both i64 operands arrive as kernel arguments
; and the min is spelled as icmp sle + select, then stored to %out.
; The expectations below show a 64-bit compare feeding two 32-bit selects
; (v_cmp_le_i64 followed by a pair of s_cselect_b32 on the amdgcn targets,
; SETE/SETGT + CNDE_INT on the r600 target) rather than a single min opcode.
; The assertion lines are generated by utils/update_llc_test_checks.py (see
; the file header); regenerate them instead of editing them by hand.
3865define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
3866; EG-LABEL: test_imin_sle_i64:
3867; EG:       ; %bb.0:
3868; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
3869; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
3870; EG-NEXT:    CF_END
3871; EG-NEXT:    PAD
3872; EG-NEXT:    ALU clause starting at 4:
3873; EG-NEXT:     SETE_INT T0.Z, KC0[3].X, KC0[3].Z,
3874; EG-NEXT:     SETGT_INT * T0.W, KC0[3].Z, KC0[3].X,
3875; EG-NEXT:     SETGT_UINT * T1.W, KC0[3].Y, KC0[2].W,
3876; EG-NEXT:     CNDE_INT * T0.W, T0.Z, T0.W, PV.W,
3877; EG-NEXT:     CNDE_INT * T0.Y, PV.W, KC0[3].Z, KC0[3].X,
3878; EG-NEXT:     CNDE_INT * T0.X, T0.W, KC0[3].Y, KC0[2].W,
3879; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
3880; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
3881;
3882; CI-LABEL: test_imin_sle_i64:
3883; CI:       ; %bb.0:
3884; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
3885; CI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x4
3886; CI-NEXT:    s_waitcnt lgkmcnt(0)
3887; CI-NEXT:    v_mov_b32_e32 v0, s0
3888; CI-NEXT:    v_mov_b32_e32 v1, s4
3889; CI-NEXT:    v_mov_b32_e32 v2, s5
3890; CI-NEXT:    v_cmp_le_i64_e32 vcc, s[2:3], v[1:2]
3891; CI-NEXT:    v_mov_b32_e32 v1, s1
3892; CI-NEXT:    s_and_b64 s[0:1], vcc, exec
3893; CI-NEXT:    s_cselect_b32 s0, s3, s5
3894; CI-NEXT:    s_cselect_b32 s1, s2, s4
3895; CI-NEXT:    v_mov_b32_e32 v2, s1
3896; CI-NEXT:    v_mov_b32_e32 v3, s0
3897; CI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
3898; CI-NEXT:    s_endpgm
3899;
3900; VI-LABEL: test_imin_sle_i64:
3901; VI:       ; %bb.0:
3902; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
3903; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
3904; VI-NEXT:    s_waitcnt lgkmcnt(0)
3905; VI-NEXT:    v_mov_b32_e32 v0, s0
3906; VI-NEXT:    v_mov_b32_e32 v1, s4
3907; VI-NEXT:    v_mov_b32_e32 v2, s5
3908; VI-NEXT:    v_cmp_le_i64_e32 vcc, s[2:3], v[1:2]
3909; VI-NEXT:    v_mov_b32_e32 v1, s1
3910; VI-NEXT:    s_and_b64 s[0:1], vcc, exec
3911; VI-NEXT:    s_cselect_b32 s0, s3, s5
3912; VI-NEXT:    s_cselect_b32 s1, s2, s4
3913; VI-NEXT:    v_mov_b32_e32 v2, s1
3914; VI-NEXT:    v_mov_b32_e32 v3, s0
3915; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
3916; VI-NEXT:    s_endpgm
3917;
3918; GFX9-LABEL: test_imin_sle_i64:
3919; GFX9:       ; %bb.0:
3920; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
3921; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
3922; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3923; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3924; GFX9-NEXT:    v_mov_b32_e32 v0, s4
3925; GFX9-NEXT:    v_mov_b32_e32 v1, s5
3926; GFX9-NEXT:    v_cmp_le_i64_e32 vcc, s[2:3], v[0:1]
3927; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
3928; GFX9-NEXT:    s_cselect_b32 s3, s3, s5
3929; GFX9-NEXT:    s_cselect_b32 s2, s2, s4
3930; GFX9-NEXT:    v_mov_b32_e32 v0, s2
3931; GFX9-NEXT:    v_mov_b32_e32 v1, s3
3932; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
3933; GFX9-NEXT:    s_endpgm
3934;
3935; GFX10-LABEL: test_imin_sle_i64:
3936; GFX10:       ; %bb.0:
3937; GFX10-NEXT:    s_clause 0x1
3938; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
3939; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
3940; GFX10-NEXT:    v_mov_b32_e32 v2, 0
3941; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
3942; GFX10-NEXT:    v_cmp_le_i64_e64 s6, s[2:3], s[4:5]
3943; GFX10-NEXT:    s_and_b32 s6, s6, exec_lo
3944; GFX10-NEXT:    s_cselect_b32 s2, s2, s4
3945; GFX10-NEXT:    s_cselect_b32 s3, s3, s5
3946; GFX10-NEXT:    v_mov_b32_e32 v0, s2
3947; GFX10-NEXT:    v_mov_b32_e32 v1, s3
3948; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
3949; GFX10-NEXT:    s_endpgm
3950;
3951; GFX11-LABEL: test_imin_sle_i64:
3952; GFX11:       ; %bb.0:
3953; GFX11-NEXT:    s_clause 0x1
3954; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
3955; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x10
3956; GFX11-NEXT:    v_mov_b32_e32 v2, 0
3957; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3958; GFX11-NEXT:    v_cmp_le_i64_e64 s6, s[2:3], s[4:5]
3959; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3960; GFX11-NEXT:    s_and_b32 s6, s6, exec_lo
3961; GFX11-NEXT:    s_cselect_b32 s2, s2, s4
3962; GFX11-NEXT:    s_cselect_b32 s3, s3, s5
3963; GFX11-NEXT:    v_mov_b32_e32 v0, s2
3964; GFX11-NEXT:    v_mov_b32_e32 v1, s3
3965; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
3966; GFX11-NEXT:    s_endpgm
  ; min(a, b) written as compare-and-select; on a tie (a == b) sle picks %a.
3967  %tmp = icmp sle i64 %a, %b
3968  %val = select i1 %tmp, i64 %a, i64 %b
3969  store i64 %val, ptr addrspace(1) %out, align 8
3970  ret void
3971}
3972
; Per-workitem signed minimum on a <2 x i16> vector: each workitem loads one
; element from %a.ptr and %b.ptr (indexed by its x id), takes the lane-wise
; min via icmp sle + select, and stores it at the same index in %out.
; Expected lowering in the assertions below:
;   - gfx900 / gfx1010 / gfx1100 use a single packed v_pk_min_i16;
;   - tonga uses v_min_i16 for the low half plus an SDWA v_min_i16 for the
;     high half, merged with v_or_b32;
;   - kaveri sign-extends both halves to 32 bits and uses v_min_i32;
;   - cypress sign-extends with BFE_INT and uses MIN_INT.
; The assertion lines are generated by utils/update_llc_test_checks.py (see
; the file header); regenerate them instead of editing them by hand.
3973define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
3974; EG-LABEL: v_test_imin_sle_v2i16:
3975; EG:       ; %bb.0:
3976; EG-NEXT:    ALU 2, @12, KC0[CB0:0-32], KC1[]
3977; EG-NEXT:    TEX 0 @8
3978; EG-NEXT:    ALU 0, @15, KC0[CB0:0-32], KC1[]
3979; EG-NEXT:    TEX 0 @10
3980; EG-NEXT:    ALU 16, @16, KC0[CB0:0-32], KC1[]
3981; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T7.X, 1
3982; EG-NEXT:    CF_END
3983; EG-NEXT:    PAD
3984; EG-NEXT:    Fetch clause starting at 8:
3985; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
3986; EG-NEXT:    Fetch clause starting at 10:
3987; EG-NEXT:     VTX_READ_32 T7.X, T7.X, 0, #1
3988; EG-NEXT:    ALU clause starting at 12:
3989; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
3990; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
3991; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
3992; EG-NEXT:    ALU clause starting at 15:
3993; EG-NEXT:     ADD_INT * T7.X, KC0[2].W, T0.W,
3994; EG-NEXT:    ALU clause starting at 16:
3995; EG-NEXT:     LSHR T1.W, T0.X, literal.x,
3996; EG-NEXT:     LSHR * T2.W, T7.X, literal.x,
3997; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
3998; EG-NEXT:     BFE_INT T8.X, PS, 0.0, literal.x,
3999; EG-NEXT:     BFE_INT T0.Y, PV.W, 0.0, literal.x,
4000; EG-NEXT:     BFE_INT T0.Z, T7.X, 0.0, literal.x,
4001; EG-NEXT:     BFE_INT * T1.W, T0.X, 0.0, literal.x, BS:VEC_120/SCL_212
4002; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
4003; EG-NEXT:     MIN_INT T1.W, PV.W, PV.Z,
4004; EG-NEXT:     MIN_INT * T2.W, PV.Y, PV.X,
4005; EG-NEXT:     LSHL T2.W, PS, literal.x,
4006; EG-NEXT:     AND_INT * T1.W, PV.W, literal.y,
4007; EG-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
4008; EG-NEXT:     OR_INT T0.X, PS, PV.W,
4009; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, T0.W,
4010; EG-NEXT:     LSHR * T7.X, PV.W, literal.x,
4011; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4012;
4013; CI-LABEL: v_test_imin_sle_v2i16:
4014; CI:       ; %bb.0:
4015; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
4016; CI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x4
4017; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
4018; CI-NEXT:    s_waitcnt lgkmcnt(0)
4019; CI-NEXT:    v_mov_b32_e32 v1, s3
4020; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
4021; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
4022; CI-NEXT:    v_mov_b32_e32 v3, s5
4023; CI-NEXT:    flat_load_dword v4, v[0:1]
4024; CI-NEXT:    v_add_i32_e32 v0, vcc, s4, v2
4025; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
4026; CI-NEXT:    flat_load_dword v3, v[0:1]
4027; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
4028; CI-NEXT:    v_mov_b32_e32 v1, s1
4029; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
4030; CI-NEXT:    s_waitcnt vmcnt(1)
4031; CI-NEXT:    v_bfe_i32 v2, v4, 0, 16
4032; CI-NEXT:    v_ashrrev_i32_e32 v4, 16, v4
4033; CI-NEXT:    s_waitcnt vmcnt(0)
4034; CI-NEXT:    v_bfe_i32 v5, v3, 0, 16
4035; CI-NEXT:    v_ashrrev_i32_e32 v3, 16, v3
4036; CI-NEXT:    v_min_i32_e32 v3, v4, v3
4037; CI-NEXT:    v_min_i32_e32 v2, v2, v5
4038; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
4039; CI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
4040; CI-NEXT:    v_or_b32_e32 v2, v2, v3
4041; CI-NEXT:    flat_store_dword v[0:1], v2
4042; CI-NEXT:    s_endpgm
4043;
4044; VI-LABEL: v_test_imin_sle_v2i16:
4045; VI:       ; %bb.0:
4046; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
4047; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
4048; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
4049; VI-NEXT:    s_waitcnt lgkmcnt(0)
4050; VI-NEXT:    v_mov_b32_e32 v1, s3
4051; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
4052; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
4053; VI-NEXT:    v_mov_b32_e32 v3, s5
4054; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
4055; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
4056; VI-NEXT:    flat_load_dword v5, v[0:1]
4057; VI-NEXT:    flat_load_dword v2, v[2:3]
4058; VI-NEXT:    v_mov_b32_e32 v1, s1
4059; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v4
4060; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
4061; VI-NEXT:    s_waitcnt vmcnt(0)
4062; VI-NEXT:    v_min_i16_e32 v3, v5, v2
4063; VI-NEXT:    v_min_i16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
4064; VI-NEXT:    v_or_b32_e32 v2, v3, v2
4065; VI-NEXT:    flat_store_dword v[0:1], v2
4066; VI-NEXT:    s_endpgm
4067;
4068; GFX9-LABEL: v_test_imin_sle_v2i16:
4069; GFX9:       ; %bb.0:
4070; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
4071; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
4072; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
4073; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4074; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
4075; GFX9-NEXT:    global_load_dword v2, v0, s[4:5]
4076; GFX9-NEXT:    s_waitcnt vmcnt(0)
4077; GFX9-NEXT:    v_pk_min_i16 v1, v1, v2
4078; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
4079; GFX9-NEXT:    s_endpgm
4080;
4081; GFX10-LABEL: v_test_imin_sle_v2i16:
4082; GFX10:       ; %bb.0:
4083; GFX10-NEXT:    s_clause 0x1
4084; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
4085; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
4086; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
4087; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
4088; GFX10-NEXT:    s_clause 0x1
4089; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
4090; GFX10-NEXT:    global_load_dword v2, v0, s[4:5]
4091; GFX10-NEXT:    s_waitcnt vmcnt(0)
4092; GFX10-NEXT:    v_pk_min_i16 v1, v1, v2
4093; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
4094; GFX10-NEXT:    s_endpgm
4095;
4096; GFX11-LABEL: v_test_imin_sle_v2i16:
4097; GFX11:       ; %bb.0:
4098; GFX11-NEXT:    s_clause 0x1
4099; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
4100; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x10
4101; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
4102; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4103; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
4104; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
4105; GFX11-NEXT:    s_clause 0x1
4106; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
4107; GFX11-NEXT:    global_load_b32 v2, v0, s[4:5]
4108; GFX11-NEXT:    s_waitcnt vmcnt(0)
4109; GFX11-NEXT:    v_pk_min_i16 v1, v1, v2
4110; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
4111; GFX11-NEXT:    s_endpgm
  ; The workitem x id indexes all three buffers, so the loads are divergent
  ; (per-lane) and the min has to be done in vector registers.
4112  %tid = call i32 @llvm.amdgcn.workitem.id.x()
4113  %a.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %a.ptr, i32 %tid
4114  %b.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %b.ptr, i32 %tid
4115  %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
4116  %a = load <2 x i16>, ptr addrspace(1) %a.gep
4117  %b = load <2 x i16>, ptr addrspace(1) %b.gep
  ; Lane-wise signed min expressed as compare-and-select.
4118  %cmp = icmp sle <2 x i16> %a, %b
4119  %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
4120  store <2 x i16> %val, ptr addrspace(1) %out.gep
4121  ret void
4122}
4123
4124; FIXME: i16 min
4125
; Per-workitem UNSIGNED minimum on a <2 x i16> vector. Despite "imin" in the
; function name, the IR compares with icmp ule, and the assertions below
; expect the unsigned opcodes:
;   - gfx900 / gfx1010 / gfx1100 use a single packed v_pk_min_u16;
;   - tonga uses v_min_u16 for the low half plus an SDWA v_min_u16 for the
;     high half, merged with v_or_b32;
;   - kaveri zero-extends both halves (shift / and 0xffff) and uses v_min_u32;
;   - cypress zero-extends with LSHR / AND_INT and uses MIN_UINT.
; Each workitem loads one element from %a.ptr and %b.ptr (indexed by its x id)
; and stores the result at the same index in %out.
; The assertion lines are generated by utils/update_llc_test_checks.py (see
; the file header); regenerate them instead of editing them by hand.
4126define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
4127; EG-LABEL: v_test_imin_ule_v2i16:
4128; EG:       ; %bb.0:
4129; EG-NEXT:    ALU 2, @12, KC0[CB0:0-32], KC1[]
4130; EG-NEXT:    TEX 0 @8
4131; EG-NEXT:    ALU 0, @15, KC0[CB0:0-32], KC1[]
4132; EG-NEXT:    TEX 0 @10
4133; EG-NEXT:    ALU 13, @16, KC0[CB0:0-32], KC1[]
4134; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T7.X, 1
4135; EG-NEXT:    CF_END
4136; EG-NEXT:    PAD
4137; EG-NEXT:    Fetch clause starting at 8:
4138; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
4139; EG-NEXT:    Fetch clause starting at 10:
4140; EG-NEXT:     VTX_READ_32 T7.X, T7.X, 0, #1
4141; EG-NEXT:    ALU clause starting at 12:
4142; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
4143; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4144; EG-NEXT:     ADD_INT * T0.X, KC0[2].W, PV.W,
4145; EG-NEXT:    ALU clause starting at 15:
4146; EG-NEXT:     ADD_INT * T7.X, KC0[2].Z, T0.W,
4147; EG-NEXT:    ALU clause starting at 16:
4148; EG-NEXT:     LSHR T1.W, T0.X, literal.x,
4149; EG-NEXT:     LSHR * T2.W, T7.X, literal.x,
4150; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
4151; EG-NEXT:     AND_INT T0.Z, T0.X, literal.x,
4152; EG-NEXT:     AND_INT T3.W, T7.X, literal.x, BS:VEC_120/SCL_212
4153; EG-NEXT:     MIN_UINT * T1.W, PS, PV.W,
4154; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
4155; EG-NEXT:     LSHL T1.W, PS, literal.x,
4156; EG-NEXT:     MIN_UINT * T2.W, PV.W, PV.Z,
4157; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
4158; EG-NEXT:     OR_INT T0.X, PS, PV.W,
4159; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, T0.W,
4160; EG-NEXT:     LSHR * T7.X, PV.W, literal.x,
4161; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4162;
4163; CI-LABEL: v_test_imin_ule_v2i16:
4164; CI:       ; %bb.0:
4165; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
4166; CI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x4
4167; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
4168; CI-NEXT:    s_waitcnt lgkmcnt(0)
4169; CI-NEXT:    v_mov_b32_e32 v1, s3
4170; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
4171; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
4172; CI-NEXT:    v_mov_b32_e32 v3, s5
4173; CI-NEXT:    flat_load_dword v4, v[0:1]
4174; CI-NEXT:    v_add_i32_e32 v0, vcc, s4, v2
4175; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
4176; CI-NEXT:    flat_load_dword v3, v[0:1]
4177; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
4178; CI-NEXT:    v_mov_b32_e32 v1, s1
4179; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
4180; CI-NEXT:    s_waitcnt vmcnt(1)
4181; CI-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
4182; CI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
4183; CI-NEXT:    s_waitcnt vmcnt(0)
4184; CI-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
4185; CI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
4186; CI-NEXT:    v_min_u32_e32 v2, v2, v5
4187; CI-NEXT:    v_min_u32_e32 v3, v4, v3
4188; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
4189; CI-NEXT:    v_or_b32_e32 v2, v3, v2
4190; CI-NEXT:    flat_store_dword v[0:1], v2
4191; CI-NEXT:    s_endpgm
4192;
4193; VI-LABEL: v_test_imin_ule_v2i16:
4194; VI:       ; %bb.0:
4195; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
4196; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
4197; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
4198; VI-NEXT:    s_waitcnt lgkmcnt(0)
4199; VI-NEXT:    v_mov_b32_e32 v1, s3
4200; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
4201; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
4202; VI-NEXT:    v_mov_b32_e32 v3, s5
4203; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
4204; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
4205; VI-NEXT:    flat_load_dword v5, v[0:1]
4206; VI-NEXT:    flat_load_dword v2, v[2:3]
4207; VI-NEXT:    v_mov_b32_e32 v1, s1
4208; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v4
4209; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
4210; VI-NEXT:    s_waitcnt vmcnt(0)
4211; VI-NEXT:    v_min_u16_e32 v3, v5, v2
4212; VI-NEXT:    v_min_u16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
4213; VI-NEXT:    v_or_b32_e32 v2, v3, v2
4214; VI-NEXT:    flat_store_dword v[0:1], v2
4215; VI-NEXT:    s_endpgm
4216;
4217; GFX9-LABEL: v_test_imin_ule_v2i16:
4218; GFX9:       ; %bb.0:
4219; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
4220; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
4221; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
4222; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4223; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
4224; GFX9-NEXT:    global_load_dword v2, v0, s[4:5]
4225; GFX9-NEXT:    s_waitcnt vmcnt(0)
4226; GFX9-NEXT:    v_pk_min_u16 v1, v1, v2
4227; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
4228; GFX9-NEXT:    s_endpgm
4229;
4230; GFX10-LABEL: v_test_imin_ule_v2i16:
4231; GFX10:       ; %bb.0:
4232; GFX10-NEXT:    s_clause 0x1
4233; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
4234; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
4235; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
4236; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
4237; GFX10-NEXT:    s_clause 0x1
4238; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
4239; GFX10-NEXT:    global_load_dword v2, v0, s[4:5]
4240; GFX10-NEXT:    s_waitcnt vmcnt(0)
4241; GFX10-NEXT:    v_pk_min_u16 v1, v1, v2
4242; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
4243; GFX10-NEXT:    s_endpgm
4244;
4245; GFX11-LABEL: v_test_imin_ule_v2i16:
4246; GFX11:       ; %bb.0:
4247; GFX11-NEXT:    s_clause 0x1
4248; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
4249; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x10
4250; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
4251; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4252; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
4253; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
4254; GFX11-NEXT:    s_clause 0x1
4255; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
4256; GFX11-NEXT:    global_load_b32 v2, v0, s[4:5]
4257; GFX11-NEXT:    s_waitcnt vmcnt(0)
4258; GFX11-NEXT:    v_pk_min_u16 v1, v1, v2
4259; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
4260; GFX11-NEXT:    s_endpgm
  ; The workitem x id indexes all three buffers, so the loads are divergent
  ; (per-lane) and the min has to be done in vector registers.
4261  %tid = call i32 @llvm.amdgcn.workitem.id.x()
4262  %a.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %a.ptr, i32 %tid
4263  %b.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %b.ptr, i32 %tid
4264  %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
4265  %a = load <2 x i16>, ptr addrspace(1) %a.gep
4266  %b = load <2 x i16>, ptr addrspace(1) %b.gep
  ; Lane-wise unsigned min expressed as compare-and-select (ule, not sle).
4267  %cmp = icmp ule <2 x i16> %a, %b
4268  %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
4269  store <2 x i16> %val, ptr addrspace(1) %out.gep
4270  ret void
4271}
4272
; Intrinsic the v_test_* kernels call to get the i32 index they use to
; address their input and output buffers.
4273declare i32 @llvm.amdgcn.workitem.id.x() #1
4274
; #0 is attached to the kernel definitions in this file; #1 is attached to the
; intrinsic declaration above.
4275attributes #0 = { nounwind }
4276attributes #1 = { nounwind readnone }
4277