xref: /llvm-project/llvm/test/CodeGen/AMDGPU/max.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
; RUN: llc -mtriple=amdgcn -mcpu=pitcairn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI %s
; RUN: llc -mtriple=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=EG %s

; Signed i32 max written as `icmp sge` + `select`.
; %a is loaded directly from %aptr; %b is loaded from %bptr indexed by the
; work-item id. The SI checks expect a single v_max_i32 and the EG checks a
; single MAX_INT, i.e. the compare/select pair becomes one max instruction.
define amdgpu_kernel void @v_test_imax_sge_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
; SI-LABEL: v_test_imax_sge_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, s7
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dword s2, s[2:3], 0x0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; SI-NEXT:    v_max_i32_e32 v0, s2, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; EG-LABEL: v_test_imax_sge_i32:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 3, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 1 @6
; EG-NEXT:    ALU 2, @14, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT:     VTX_READ_32 T1.X, T1.X, 0, #1
; EG-NEXT:    ALU clause starting at 10:
; EG-NEXT:     MOV T1.X, KC0[2].Z,
; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:     ADD_INT * T0.X, KC0[2].W, PV.W,
; EG-NEXT:    ALU clause starting at 14:
; EG-NEXT:     MAX_INT T0.X, T1.X, T0.X,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.in = getelementptr inbounds i32, ptr addrspace(1) %bptr, i32 %tid
  %a = load i32, ptr addrspace(1) %aptr, align 4
  %b = load i32, ptr addrspace(1) %gep.in, align 4
  %cmp = icmp sge i32 %a, %b
  %val = select i1 %cmp, i32 %a, i32 %b
  store i32 %val, ptr addrspace(1) %out, align 4
  ret void
}

; These could be merged into one
; <4 x i32> form of the signed `icmp sge` + `select` max.
; %a is loaded from %aptr and %b from %bptr indexed by the work-item id.
; The checks expect one max per vector lane: four v_max_i32 in the SI checks
; and four MAX_INT in the EG checks.
define amdgpu_kernel void @v_test_imax_sge_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
; SI-LABEL: v_test_imax_sge_v4i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, s7
; SI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0x0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; SI-NEXT:    v_max_i32_e32 v3, s11, v3
; SI-NEXT:    v_max_i32_e32 v2, s10, v2
; SI-NEXT:    v_max_i32_e32 v1, s9, v1
; SI-NEXT:    v_max_i32_e32 v0, s8, v0
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; EG-LABEL: v_test_imax_sge_v4i32:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 3, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 1 @6
; EG-NEXT:    ALU 5, @14, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT:     VTX_READ_128 T1.XYZW, T1.X, 0, #1
; EG-NEXT:    ALU clause starting at 10:
; EG-NEXT:     MOV T1.X, KC0[2].Z,
; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
; EG-NEXT:     ADD_INT * T0.X, KC0[2].W, PV.W,
; EG-NEXT:    ALU clause starting at 14:
; EG-NEXT:     MAX_INT * T0.W, T1.W, T0.W,
; EG-NEXT:     MAX_INT * T0.Z, T1.Z, T0.Z,
; EG-NEXT:     MAX_INT * T0.Y, T1.Y, T0.Y,
; EG-NEXT:     MAX_INT T0.X, T1.X, T0.X,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.in = getelementptr inbounds <4 x i32>, ptr addrspace(1) %bptr, i32 %tid
  %a = load <4 x i32>, ptr addrspace(1) %aptr, align 4
  %b = load <4 x i32>, ptr addrspace(1) %gep.in, align 4
  %cmp = icmp sge <4 x i32> %a, %b
  %val = select <4 x i1> %cmp, <4 x i32> %a, <4 x i32> %b
  store <4 x i32> %val, ptr addrspace(1) %out, align 4
  ret void
}

; Signed i32 max (`icmp sge` + `select`) where both operands are kernel
; arguments rather than loaded values. The SI checks expect a scalar
; s_max_i32 whose result is copied to a VGPR for the store; the EG checks
; expect a MAX_INT reading both operands from KC0.
define amdgpu_kernel void @s_test_imax_sge_i32(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind {
; SI-LABEL: s_test_imax_sge_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_max_i32 s0, s2, s3
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; EG-LABEL: s_test_imax_sge_i32:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT:     MAX_INT * T1.X, KC0[2].Z, KC0[2].W,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %cmp = icmp sge i32 %a, %b
  %val = select i1 %cmp, i32 %a, i32 %b
  store i32 %val, ptr addrspace(1) %out, align 4
  ret void
}

; Signed i32 max of a kernel argument against the constant 9, written as
; `icmp sge` + `select`. The SI checks expect s_max_i32 with 9 as an
; immediate operand; the EG checks expect MAX_INT with 9 supplied as a
; literal.
define amdgpu_kernel void @s_test_imax_sge_imm_i32(ptr addrspace(1) %out, i32 %a) nounwind {
; SI-LABEL: s_test_imax_sge_imm_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_max_i32 s4, s6, 9
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; EG-LABEL: s_test_imax_sge_imm_i32:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT:     MAX_INT * T1.X, KC0[2].Z, literal.y,
; EG-NEXT:    2(2.802597e-45), 9(1.261169e-44)
  %cmp = icmp sge i32 %a, 9
  %val = select i1 %cmp, i32 %a, i32 9
  store i32 %val, ptr addrspace(1) %out, align 4
  ret void
}

; Signed i8 max (`icmp sge` + `select`) on two bytes loaded from %aptr and
; %bptr. The SI checks expect both loads to be buffer_load_sbyte, followed by
; a 32-bit v_max_i32 and a byte store. The EG checks expect BFE_INT on each
; loaded byte before MAX_INT, then a masked MEM_RAT MSKOR store.
define amdgpu_kernel void @v_test_imax_sge_i8(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
; SI-LABEL: v_test_imax_sge_i8:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s14, s6
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s15, s7
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    buffer_load_sbyte v0, off, s[12:15], 0
; SI-NEXT:    buffer_load_sbyte v1, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_max_i32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; EG-LABEL: v_test_imax_sge_i8:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @12, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @8
; EG-NEXT:    ALU 0, @13, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @10
; EG-NEXT:    ALU 14, @14, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 8:
; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
; EG-NEXT:    Fetch clause starting at 10:
; EG-NEXT:     VTX_READ_8 T1.X, T1.X, 0, #1
; EG-NEXT:    ALU clause starting at 12:
; EG-NEXT:     MOV * T0.X, KC0[2].W,
; EG-NEXT:    ALU clause starting at 13:
; EG-NEXT:     MOV * T1.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 14:
; EG-NEXT:     BFE_INT T0.Z, T0.X, 0.0, literal.x,
; EG-NEXT:     BFE_INT T0.W, T1.X, 0.0, literal.x, BS:VEC_120/SCL_212
; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT:    8(1.121039e-44), 3(4.203895e-45)
; EG-NEXT:     MAX_INT * T0.W, PV.W, PV.Z,
; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
; EG-NEXT:     LSHL * T1.W, T1.W, literal.y,
; EG-NEXT:    255(3.573311e-43), 3(4.203895e-45)
; EG-NEXT:     LSHL T0.X, PV.W, PS,
; EG-NEXT:     LSHL * T0.W, literal.x, PS,
; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT:     MOV T0.Y, 0.0,
; EG-NEXT:     MOV * T0.Z, 0.0,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %a = load i8, ptr addrspace(1) %aptr, align 1
  %b = load i8, ptr addrspace(1) %bptr, align 1
  %cmp = icmp sge i8 %a, %b
  %val = select i1 %cmp, i8 %a, i8 %b
  store i8 %val, ptr addrspace(1) %out, align 1
  ret void
}

; Signed i32 max of a kernel argument against the constant 9, using the
; strict predicate `icmp sgt`. The expected code matches the sge variant:
; s_max_i32 with immediate 9 in the SI checks, MAX_INT with literal 9 in the
; EG checks.
define amdgpu_kernel void @s_test_imax_sgt_imm_i32(ptr addrspace(1) %out, i32 %a) nounwind {
; SI-LABEL: s_test_imax_sgt_imm_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_max_i32 s4, s6, 9
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; EG-LABEL: s_test_imax_sgt_imm_i32:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT:     MAX_INT * T1.X, KC0[2].Z, literal.y,
; EG-NEXT:    2(2.802597e-45), 9(1.261169e-44)
  %cmp = icmp sgt i32 %a, 9
  %val = select i1 %cmp, i32 %a, i32 9
  store i32 %val, ptr addrspace(1) %out, align 4
  ret void
}

; <2 x i32> signed max of a kernel argument against the splat constant
; <9, 9>, written as `icmp sgt` + `select`. The checks expect one max per
; lane: two s_max_i32 with immediate 9 in the SI checks and two MAX_INT with
; literal 9 in the EG checks.
define amdgpu_kernel void @s_test_imax_sgt_imm_v2i32(ptr addrspace(1) %out, <2 x i32> %a) nounwind {
; SI-LABEL: s_test_imax_sgt_imm_v2i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_max_i32 s0, s3, 9
; SI-NEXT:    s_max_i32 s1, s2, 9
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    v_mov_b32_e32 v1, s0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; EG-LABEL: s_test_imax_sgt_imm_v2i32:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 4, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:     MAX_INT * T0.Y, KC0[3].X, literal.x,
; EG-NEXT:    9(1.261169e-44), 0(0.000000e+00)
; EG-NEXT:     MAX_INT T0.X, KC0[2].W, literal.x,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    9(1.261169e-44), 2(2.802597e-45)
  %cmp = icmp sgt <2 x i32> %a, <i32 9, i32 9>
  %val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> <i32 9, i32 9>
  store <2 x i32> %val, ptr addrspace(1) %out, align 4
  ret void
}

; Signed i32 max written as `icmp sgt` + `select`.
; %a is loaded directly from %aptr; %b is loaded from %bptr indexed by the
; work-item id. The expected code matches the sge variant: a single
; v_max_i32 in the SI checks and a single MAX_INT in the EG checks.
define amdgpu_kernel void @v_test_imax_sgt_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
; SI-LABEL: v_test_imax_sgt_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, s7
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dword s2, s[2:3], 0x0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; SI-NEXT:    v_max_i32_e32 v0, s2, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; EG-LABEL: v_test_imax_sgt_i32:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 3, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 1 @6
; EG-NEXT:    ALU 2, @14, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT:     VTX_READ_32 T1.X, T1.X, 0, #1
; EG-NEXT:    ALU clause starting at 10:
; EG-NEXT:     MOV T1.X, KC0[2].Z,
; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:     ADD_INT * T0.X, KC0[2].W, PV.W,
; EG-NEXT:    ALU clause starting at 14:
; EG-NEXT:     MAX_INT T0.X, T1.X, T0.X,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.in = getelementptr inbounds i32, ptr addrspace(1) %bptr, i32 %tid
  %a = load i32, ptr addrspace(1) %aptr, align 4
  %b = load i32, ptr addrspace(1) %gep.in, align 4
  %cmp = icmp sgt i32 %a, %b
  %val = select i1 %cmp, i32 %a, i32 %b
  store i32 %val, ptr addrspace(1) %out, align 4
  ret void
}

; Signed i32 max (`icmp sgt` + `select`) of two kernel arguments.
; The SI checks expect a scalar s_max_i32; the EG checks expect MAX_INT
; reading both operands from KC0.
define amdgpu_kernel void @s_test_imax_sgt_i32(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind {
; SI-LABEL: s_test_imax_sgt_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_max_i32 s0, s2, s3
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; EG-LABEL: s_test_imax_sgt_i32:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT:     MAX_INT * T1.X, KC0[2].Z, KC0[2].W,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %cmp = icmp sgt i32 %a, %b
  %val = select i1 %cmp, i32 %a, i32 %b
  store i32 %val, ptr addrspace(1) %out, align 4
  ret void
}

; Unsigned i32 max written as `icmp uge` + `select`.
; %a is loaded directly from %aptr; %b is loaded from %bptr indexed by the
; work-item id. The SI checks expect a single v_max_u32 and the EG checks a
; single MAX_UINT.
define amdgpu_kernel void @v_test_umax_uge_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
; SI-LABEL: v_test_umax_uge_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, s7
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dword s2, s[2:3], 0x0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; SI-NEXT:    v_max_u32_e32 v0, s2, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; EG-LABEL: v_test_umax_uge_i32:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 3, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 1 @6
; EG-NEXT:    ALU 2, @14, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT:     VTX_READ_32 T1.X, T1.X, 0, #1
; EG-NEXT:    ALU clause starting at 10:
; EG-NEXT:     MOV T1.X, KC0[2].Z,
; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:     ADD_INT * T0.X, KC0[2].W, PV.W,
; EG-NEXT:    ALU clause starting at 14:
; EG-NEXT:     MAX_UINT T0.X, T1.X, T0.X,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.in = getelementptr inbounds i32, ptr addrspace(1) %bptr, i32 %tid
  %a = load i32, ptr addrspace(1) %aptr, align 4
  %b = load i32, ptr addrspace(1) %gep.in, align 4
  %cmp = icmp uge i32 %a, %b
  %val = select i1 %cmp, i32 %a, i32 %b
  store i32 %val, ptr addrspace(1) %out, align 4
  ret void
}

; Unsigned i32 max (`icmp uge` + `select`) of two kernel arguments.
; The SI checks expect a scalar s_max_u32; the EG checks expect MAX_UINT
; reading both operands from KC0.
define amdgpu_kernel void @s_test_umax_uge_i32(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind {
; SI-LABEL: s_test_umax_uge_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_max_u32 s0, s2, s3
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; EG-LABEL: s_test_umax_uge_i32:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT:     MAX_UINT * T1.X, KC0[2].Z, KC0[2].W,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %cmp = icmp uge i32 %a, %b
  %val = select i1 %cmp, i32 %a, i32 %b
  store i32 %val, ptr addrspace(1) %out, align 4
  ret void
}

; <3 x i32> unsigned max (`icmp uge` + `select`) of two kernel arguments.
; The checks expect one max per lane: three s_max_u32 in the SI checks and
; three MAX_UINT in the EG checks. In both runs the three-element result is
; written with two stores (one single element, one two-element).
define amdgpu_kernel void @s_test_umax_uge_v3i32(ptr addrspace(1) %out, <3 x i32> %a, <3 x i32> %b) nounwind {
; SI-LABEL: s_test_umax_uge_v3i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0xd
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_max_u32 s6, s10, s14
; SI-NEXT:    s_max_u32 s4, s9, s13
; SI-NEXT:    s_max_u32 s5, s8, s12
; SI-NEXT:    v_mov_b32_e32 v0, s6
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
; SI-NEXT:    s_waitcnt expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s5
; SI-NEXT:    v_mov_b32_e32 v1, s4
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; EG-LABEL: s_test_umax_uge_v3i32:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:     MAX_UINT * T0.Y, KC0[3].Z, KC0[4].Z,
; EG-NEXT:     MAX_UINT * T0.X, KC0[3].Y, KC0[4].Y,
; EG-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT:    2(2.802597e-45), 8(1.121039e-44)
; EG-NEXT:     LSHR T2.X, PV.W, literal.x,
; EG-NEXT:     MAX_UINT * T3.X, KC0[3].W, KC0[4].W,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %cmp = icmp uge <3 x i32> %a, %b
  %val = select <3 x i1> %cmp, <3 x i32> %a, <3 x i32> %b
  store <3 x i32> %val, ptr addrspace(1) %out, align 4
  ret void
}

; Unsigned i8 max (`icmp uge` + `select`) on two bytes loaded from %aptr and
; %bptr. The SI checks expect both loads to be buffer_load_ubyte, followed by
; a 32-bit v_max_u32 and a byte store. The EG checks expect MAX_UINT applied
; directly to the two VTX_READ_8 results, then a masked MEM_RAT MSKOR store.
define amdgpu_kernel void @v_test_umax_uge_i8(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
; SI-LABEL: v_test_umax_uge_i8:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s14, s6
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s15, s7
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    buffer_load_ubyte v0, off, s[12:15], 0
; SI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_max_u32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; EG-LABEL: v_test_umax_uge_i8:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 1, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 1 @6
; EG-NEXT:    ALU 11, @12, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_8 T1.X, T1.X, 0, #1
; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 10:
; EG-NEXT:     MOV T0.X, KC0[2].Z,
; EG-NEXT:     MOV * T1.X, KC0[2].W,
; EG-NEXT:    ALU clause starting at 12:
; EG-NEXT:     AND_INT T0.W, KC0[2].Y, literal.x,
; EG-NEXT:     MAX_UINT * T1.W, T0.X, T1.X,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:     LSHL T0.X, T1.W, PV.W,
; EG-NEXT:     LSHL * T0.W, literal.x, PV.W,
; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT:     MOV T0.Y, 0.0,
; EG-NEXT:     MOV * T0.Z, 0.0,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %a = load i8, ptr addrspace(1) %aptr, align 1
  %b = load i8, ptr addrspace(1) %bptr, align 1
  %cmp = icmp uge i8 %a, %b
  %val = select i1 %cmp, i8 %a, i8 %b
  store i8 %val, ptr addrspace(1) %out, align 1
  ret void
}

; Unsigned i32 max written as `icmp ugt` + `select`.
; Unlike the other v_test_* kernels, both operands come from %bptr: %a is
; %bptr indexed by the work-item id and %b is the element at %bptr itself.
; The SI checks expect a single v_max_u32 and the EG checks a single MAX_UINT.
; NOTE(review): %aptr is never used in this kernel; confirm that reading both
; operands from %bptr is intentional before changing it, since the generated
; checks depend on it.
define amdgpu_kernel void @v_test_umax_ugt_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
; SI-LABEL: v_test_umax_ugt_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, s7
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_load_dword s0, s[0:1], 0x0
; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; SI-NEXT:    v_max_u32_e32 v0, s0, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; EG-LABEL: v_test_umax_ugt_i32:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 3, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 1 @6
; EG-NEXT:    ALU 2, @14, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT:     VTX_READ_32 T1.X, T1.X, 0, #1
; EG-NEXT:    ALU clause starting at 10:
; EG-NEXT:     MOV T1.X, KC0[2].W,
; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:     ADD_INT * T0.X, KC0[2].W, PV.W,
; EG-NEXT:    ALU clause starting at 14:
; EG-NEXT:     MAX_UINT T0.X, T0.X, T1.X,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.in = getelementptr inbounds i32, ptr addrspace(1) %bptr, i32 %tid
  %a = load i32, ptr addrspace(1) %gep.in, align 4
  %b = load i32, ptr addrspace(1) %bptr, align 4
  %cmp = icmp ugt i32 %a, %b
  %val = select i1 %cmp, i32 %a, i32 %b
  store i32 %val, ptr addrspace(1) %out, align 4
  ret void
}

; Unsigned i32 max (`icmp ugt` + `select`) of two kernel arguments.
; The SI checks expect a scalar s_max_u32; the EG checks expect MAX_UINT
; reading both operands from KC0.
define amdgpu_kernel void @s_test_umax_ugt_i32(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind {
; SI-LABEL: s_test_umax_ugt_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_max_u32 s0, s2, s3
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; EG-LABEL: s_test_umax_ugt_i32:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT:     MAX_UINT * T1.X, KC0[2].Z, KC0[2].W,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %cmp = icmp ugt i32 %a, %b
  %val = select i1 %cmp, i32 %a, i32 %b
  store i32 %val, ptr addrspace(1) %out, align 4
  ret void
}

; <2 x i32> unsigned max of a kernel argument against the non-splat constant
; <15, 23>, written as `icmp ugt` + `select`. The checks expect one max per
; lane, each with its own constant: s_max_u32 with 15 and with 23 in the SI
; checks, MAX_UINT with literals 15 and 23 in the EG checks.
define amdgpu_kernel void @s_test_umax_ugt_imm_v2i32(ptr addrspace(1) %out, <2 x i32> %a) nounwind {
; SI-LABEL: s_test_umax_ugt_imm_v2i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_max_u32 s0, s3, 23
; SI-NEXT:    s_max_u32 s1, s2, 15
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    v_mov_b32_e32 v1, s0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; EG-LABEL: s_test_umax_ugt_imm_v2i32:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 4, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:     MAX_UINT * T0.Y, KC0[3].X, literal.x,
; EG-NEXT:    23(3.222986e-44), 0(0.000000e+00)
; EG-NEXT:     MAX_UINT T0.X, KC0[2].W, literal.x,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    15(2.101948e-44), 2(2.802597e-45)
  %cmp = icmp ugt <2 x i32> %a, <i32 15, i32 23>
  %val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> <i32 15, i32 23>
  store <2 x i32> %val, ptr addrspace(1) %out, align 4
  ret void
}

; Make sure the redundant `and` is removed.

; Unsigned max of two zeroext i16 kernel arguments, computed in i32 after
; `zext`, with the result masked by `and ..., 65535` before the store.
; The mask is redundant because both inputs are already zero-extended. The SI
; checks expect s_and_b32 0xffff on each input, then s_max_u32, with no
; further masking of the max result.
; NOTE(review): the EG checks show BFE_INT (presumably a sign-extending
; extract) on the zeroext inputs ahead of MAX_UINT; confirm against freshly
; regenerated output that this is the intended lowering.
define amdgpu_kernel void @simplify_demanded_bits_test_umax_ugt_i16(ptr addrspace(1) %out, [8 x i32], i16 zeroext %a, [8 x i32], i16 zeroext %b) nounwind {
; SI-LABEL: simplify_demanded_bits_test_umax_ugt_i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s6, s[4:5], 0x13
; SI-NEXT:    s_load_dword s7, s[4:5], 0x1c
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_and_b32 s4, s6, 0xffff
; SI-NEXT:    s_and_b32 s5, s7, 0xffff
; SI-NEXT:    s_max_u32 s4, s4, s5
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; EG-LABEL: simplify_demanded_bits_test_umax_ugt_i16:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @10, KC0[], KC1[]
; EG-NEXT:    TEX 1 @6
; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_16 T1.X, T0.X, 72, #3
; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 108, #3
; EG-NEXT:    ALU clause starting at 10:
; EG-NEXT:     MOV * T0.X, 0.0,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     BFE_INT T0.Z, T1.X, 0.0, literal.x,
; EG-NEXT:     BFE_INT * T0.W, T0.X, 0.0, literal.x, BS:VEC_120/SCL_212
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:     MAX_UINT T0.X, PV.Z, PV.W,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %a.ext = zext i16 %a to i32
  %b.ext = zext i16 %b to i32
  %cmp = icmp ugt i32 %a.ext, %b.ext
  %val = select i1 %cmp, i32 %a.ext, i32 %b.ext
  %mask = and i32 %val, 65535
  store i32 %mask, ptr addrspace(1) %out
  ret void
}

; Make sure the redundant sign_extend_inreg is removed.

; Signed max of two signext i16 kernel arguments, computed in i32 after
; `sext`, with the result re-sign-extended from 16 bits by the shl 16 /
; ashr 16 pair. That pair is redundant because both inputs are already
; sign-extended. The SI checks expect s_sext_i32_i16 on each input, then
; s_max_i32, with no shift or sign-extension of the max result. The EG checks
; expect BFE_INT on each input followed by MAX_INT.
; Note: the function name says "slt" but the predicate used is `icmp sgt`.
define amdgpu_kernel void @simplify_demanded_bits_test_max_slt_i16(ptr addrspace(1) %out, [8 x i32], i16 signext %a, [8 x i32], i16 signext %b) nounwind {
; SI-LABEL: simplify_demanded_bits_test_max_slt_i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s6, s[4:5], 0x13
; SI-NEXT:    s_load_dword s7, s[4:5], 0x1c
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_sext_i32_i16 s4, s6
; SI-NEXT:    s_sext_i32_i16 s5, s7
; SI-NEXT:    s_max_i32 s4, s4, s5
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; EG-LABEL: simplify_demanded_bits_test_max_slt_i16:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @10, KC0[], KC1[]
; EG-NEXT:    TEX 1 @6
; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_16 T1.X, T0.X, 72, #3
; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 108, #3
; EG-NEXT:    ALU clause starting at 10:
; EG-NEXT:     MOV * T0.X, 0.0,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     BFE_INT T0.Z, T1.X, 0.0, literal.x,
; EG-NEXT:     BFE_INT * T0.W, T0.X, 0.0, literal.x, BS:VEC_120/SCL_212
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:     MAX_INT T0.X, PV.Z, PV.W,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %a.ext = sext i16 %a to i32
  %b.ext = sext i16 %b to i32
  %cmp = icmp sgt i32 %a.ext, %b.ext
  %val = select i1 %cmp, i32 %a.ext, i32 %b.ext
  %shl = shl i32 %val, 16
  %sextinreg = ashr i32 %shl, 16
  store i32 %sextinreg, ptr addrspace(1) %out
  ret void
}

; Signed i16 max (`icmp sge` + `select`) of two i16 kernel arguments that
; carry no signext/zeroext attribute. The SI checks expect each argument to
; be sign-extended with s_sext_i32_i16, a 32-bit s_max_i32, and a
; buffer_store_short of the result. The EG checks expect BFE_INT on each
; argument, MAX_INT, and a masked MEM_RAT MSKOR store.
define amdgpu_kernel void @s_test_imax_sge_i16(ptr addrspace(1) %out, [8 x i32], i16 %a, [8 x i32], i16 %b) nounwind {
; SI-LABEL: s_test_imax_sge_i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s6, s[4:5], 0x13
; SI-NEXT:    s_load_dword s7, s[4:5], 0x1c
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_sext_i32_i16 s4, s6
; SI-NEXT:    s_sext_i32_i16 s5, s7
; SI-NEXT:    s_max_i32 s4, s4, s5
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; EG-LABEL: s_test_imax_sge_i16:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @10, KC0[], KC1[]
; EG-NEXT:    TEX 1 @6
; EG-NEXT:    ALU 14, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_16 T1.X, T0.X, 72, #3
; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 108, #3
; EG-NEXT:    ALU clause starting at 10:
; EG-NEXT:     MOV * T0.X, 0.0,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     BFE_INT T0.Z, T1.X, 0.0, literal.x,
; EG-NEXT:     BFE_INT T0.W, T0.X, 0.0, literal.x, BS:VEC_120/SCL_212
; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT:    16(2.242078e-44), 3(4.203895e-45)
; EG-NEXT:     MAX_INT * T0.W, PV.Z, PV.W,
; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
; EG-NEXT:     LSHL * T1.W, T1.W, literal.y,
; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
; EG-NEXT:     LSHL T0.X, PV.W, PS,
; EG-NEXT:     LSHL * T0.W, literal.x, PS,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:     MOV T0.Y, 0.0,
; EG-NEXT:     MOV * T0.Z, 0.0,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %cmp = icmp sge i16 %a, %b
  %val = select i1 %cmp, i16 %a, i16 %b
  store i16 %val, ptr addrspace(1) %out
  ret void
}

824; 64 bit
825
; Scalar unsigned max on two i64 kernel arguments, written as `icmp ugt` +
; `select`, with the result stored as an 8-byte-aligned i64.
;
; What the checks pin down:
;  * SI checks: a single 64-bit compare (v_cmp_gt_u64) followed by one
;    s_cselect_b32 per 32-bit half; no max instruction appears in the output.
;  * EG checks: the compare is built from SETE_INT / SETGT_UINT on pairs of
;    32-bit kernel-constant operands and merged with CNDE_INT, which also
;    selects each 32-bit half of the result.
826define amdgpu_kernel void @test_umax_ugt_i64(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind {
827; SI-LABEL: test_umax_ugt_i64:
828; SI:       ; %bb.0:
829; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
830; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
831; SI-NEXT:    s_mov_b32 s7, 0xf000
832; SI-NEXT:    s_mov_b32 s6, -1
833; SI-NEXT:    s_waitcnt lgkmcnt(0)
834; SI-NEXT:    s_mov_b32 s4, s0
835; SI-NEXT:    v_mov_b32_e32 v0, s8
836; SI-NEXT:    v_mov_b32_e32 v1, s9
837; SI-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
838; SI-NEXT:    s_mov_b32 s5, s1
839; SI-NEXT:    s_and_b64 s[0:1], vcc, exec
840; SI-NEXT:    s_cselect_b32 s0, s3, s9
841; SI-NEXT:    s_cselect_b32 s1, s2, s8
842; SI-NEXT:    v_mov_b32_e32 v0, s1
843; SI-NEXT:    v_mov_b32_e32 v1, s0
844; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
845; SI-NEXT:    s_endpgm
846;
847; EG-LABEL: test_umax_ugt_i64:
848; EG:       ; %bb.0:
849; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
850; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
851; EG-NEXT:    CF_END
852; EG-NEXT:    PAD
853; EG-NEXT:    ALU clause starting at 4:
854; EG-NEXT:     SETE_INT T0.Z, KC0[3].X, KC0[3].Z,
855; EG-NEXT:     SETGT_UINT * T0.W, KC0[3].X, KC0[3].Z,
856; EG-NEXT:     SETGT_UINT * T1.W, KC0[2].W, KC0[3].Y,
857; EG-NEXT:     CNDE_INT * T0.W, T0.Z, T0.W, PV.W,
858; EG-NEXT:     CNDE_INT * T0.Y, PV.W, KC0[3].Z, KC0[3].X,
859; EG-NEXT:     CNDE_INT * T0.X, T0.W, KC0[3].Y, KC0[2].W,
860; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
861; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; Max pattern under test: (a > b) ? a : b, unsigned, at i64 width.
862  %tmp = icmp ugt i64 %a, %b
863  %val = select i1 %tmp, i64 %a, i64 %b
864  store i64 %val, ptr addrspace(1) %out, align 8
865  ret void
866}
867
; Scalar unsigned max on two i64 kernel arguments, written as `icmp uge` +
; `select`, with the result stored as an 8-byte-aligned i64.
;
; What the checks pin down:
;  * SI checks: a single 64-bit compare (v_cmp_ge_u64) followed by one
;    s_cselect_b32 per 32-bit half; no max instruction appears in the output.
;  * EG checks: the compare is built from SETE_INT / SETGT_UINT and merged
;    with CNDE_INT. Note that the expected opcodes are the "greater than"
;    forms even though the IR predicate is `uge`.
868define amdgpu_kernel void @test_umax_uge_i64(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind {
869; SI-LABEL: test_umax_uge_i64:
870; SI:       ; %bb.0:
871; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
872; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
873; SI-NEXT:    s_mov_b32 s7, 0xf000
874; SI-NEXT:    s_mov_b32 s6, -1
875; SI-NEXT:    s_waitcnt lgkmcnt(0)
876; SI-NEXT:    s_mov_b32 s4, s0
877; SI-NEXT:    v_mov_b32_e32 v0, s8
878; SI-NEXT:    v_mov_b32_e32 v1, s9
879; SI-NEXT:    v_cmp_ge_u64_e32 vcc, s[2:3], v[0:1]
880; SI-NEXT:    s_mov_b32 s5, s1
881; SI-NEXT:    s_and_b64 s[0:1], vcc, exec
882; SI-NEXT:    s_cselect_b32 s0, s3, s9
883; SI-NEXT:    s_cselect_b32 s1, s2, s8
884; SI-NEXT:    v_mov_b32_e32 v0, s1
885; SI-NEXT:    v_mov_b32_e32 v1, s0
886; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
887; SI-NEXT:    s_endpgm
888;
889; EG-LABEL: test_umax_uge_i64:
890; EG:       ; %bb.0:
891; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
892; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
893; EG-NEXT:    CF_END
894; EG-NEXT:    PAD
895; EG-NEXT:    ALU clause starting at 4:
896; EG-NEXT:     SETE_INT T0.Z, KC0[3].X, KC0[3].Z,
897; EG-NEXT:     SETGT_UINT * T0.W, KC0[3].X, KC0[3].Z,
898; EG-NEXT:     SETGT_UINT * T1.W, KC0[2].W, KC0[3].Y,
899; EG-NEXT:     CNDE_INT * T0.W, T0.Z, T0.W, PV.W,
900; EG-NEXT:     CNDE_INT * T0.Y, PV.W, KC0[3].Z, KC0[3].X,
901; EG-NEXT:     CNDE_INT * T0.X, T0.W, KC0[3].Y, KC0[2].W,
902; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
903; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; Max pattern under test: (a >= b) ? a : b, unsigned, at i64 width.
904  %tmp = icmp uge i64 %a, %b
905  %val = select i1 %tmp, i64 %a, i64 %b
906  store i64 %val, ptr addrspace(1) %out, align 8
907  ret void
908}
909
; Scalar signed max on two i64 kernel arguments, written as `icmp sgt` +
; `select`, with the result stored as an 8-byte-aligned i64.
;
; What the checks pin down:
;  * SI checks: a single 64-bit compare (v_cmp_gt_i64) followed by one
;    s_cselect_b32 per 32-bit half; no max instruction appears in the output.
;  * EG checks: one operand pair is compared with SETE_INT and the signed
;    SETGT_INT, the other pair with the unsigned SETGT_UINT, and the results
;    are merged with CNDE_INT, which also selects each 32-bit result half.
910define amdgpu_kernel void @test_imax_sgt_i64(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind {
911; SI-LABEL: test_imax_sgt_i64:
912; SI:       ; %bb.0:
913; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
914; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
915; SI-NEXT:    s_mov_b32 s7, 0xf000
916; SI-NEXT:    s_mov_b32 s6, -1
917; SI-NEXT:    s_waitcnt lgkmcnt(0)
918; SI-NEXT:    s_mov_b32 s4, s0
919; SI-NEXT:    v_mov_b32_e32 v0, s8
920; SI-NEXT:    v_mov_b32_e32 v1, s9
921; SI-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
922; SI-NEXT:    s_mov_b32 s5, s1
923; SI-NEXT:    s_and_b64 s[0:1], vcc, exec
924; SI-NEXT:    s_cselect_b32 s0, s3, s9
925; SI-NEXT:    s_cselect_b32 s1, s2, s8
926; SI-NEXT:    v_mov_b32_e32 v0, s1
927; SI-NEXT:    v_mov_b32_e32 v1, s0
928; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
929; SI-NEXT:    s_endpgm
930;
931; EG-LABEL: test_imax_sgt_i64:
932; EG:       ; %bb.0:
933; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
934; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
935; EG-NEXT:    CF_END
936; EG-NEXT:    PAD
937; EG-NEXT:    ALU clause starting at 4:
938; EG-NEXT:     SETE_INT T0.Z, KC0[3].X, KC0[3].Z,
939; EG-NEXT:     SETGT_INT * T0.W, KC0[3].X, KC0[3].Z,
940; EG-NEXT:     SETGT_UINT * T1.W, KC0[2].W, KC0[3].Y,
941; EG-NEXT:     CNDE_INT * T0.W, T0.Z, T0.W, PV.W,
942; EG-NEXT:     CNDE_INT * T0.Y, PV.W, KC0[3].Z, KC0[3].X,
943; EG-NEXT:     CNDE_INT * T0.X, T0.W, KC0[3].Y, KC0[2].W,
944; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
945; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; Max pattern under test: (a > b) ? a : b, signed, at i64 width.
946  %tmp = icmp sgt i64 %a, %b
947  %val = select i1 %tmp, i64 %a, i64 %b
948  store i64 %val, ptr addrspace(1) %out, align 8
949  ret void
950}
951
; Scalar signed max on two i64 kernel arguments, written as `icmp sge` +
; `select`, with the result stored as an 8-byte-aligned i64.
;
; What the checks pin down:
;  * SI checks: a single 64-bit compare (v_cmp_ge_i64) followed by one
;    s_cselect_b32 per 32-bit half; no max instruction appears in the output.
;  * EG checks: one operand pair is compared with SETE_INT and the signed
;    SETGT_INT, the other pair with the unsigned SETGT_UINT, merged with
;    CNDE_INT. Note that the expected opcodes are the "greater than" forms
;    even though the IR predicate is `sge`.
952define amdgpu_kernel void @test_imax_sge_i64(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind {
953; SI-LABEL: test_imax_sge_i64:
954; SI:       ; %bb.0:
955; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
956; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
957; SI-NEXT:    s_mov_b32 s7, 0xf000
958; SI-NEXT:    s_mov_b32 s6, -1
959; SI-NEXT:    s_waitcnt lgkmcnt(0)
960; SI-NEXT:    s_mov_b32 s4, s0
961; SI-NEXT:    v_mov_b32_e32 v0, s8
962; SI-NEXT:    v_mov_b32_e32 v1, s9
963; SI-NEXT:    v_cmp_ge_i64_e32 vcc, s[2:3], v[0:1]
964; SI-NEXT:    s_mov_b32 s5, s1
965; SI-NEXT:    s_and_b64 s[0:1], vcc, exec
966; SI-NEXT:    s_cselect_b32 s0, s3, s9
967; SI-NEXT:    s_cselect_b32 s1, s2, s8
968; SI-NEXT:    v_mov_b32_e32 v0, s1
969; SI-NEXT:    v_mov_b32_e32 v1, s0
970; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
971; SI-NEXT:    s_endpgm
972;
973; EG-LABEL: test_imax_sge_i64:
974; EG:       ; %bb.0:
975; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
976; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
977; EG-NEXT:    CF_END
978; EG-NEXT:    PAD
979; EG-NEXT:    ALU clause starting at 4:
980; EG-NEXT:     SETE_INT T0.Z, KC0[3].X, KC0[3].Z,
981; EG-NEXT:     SETGT_INT * T0.W, KC0[3].X, KC0[3].Z,
982; EG-NEXT:     SETGT_UINT * T1.W, KC0[2].W, KC0[3].Y,
983; EG-NEXT:     CNDE_INT * T0.W, T0.Z, T0.W, PV.W,
984; EG-NEXT:     CNDE_INT * T0.Y, PV.W, KC0[3].Z, KC0[3].X,
985; EG-NEXT:     CNDE_INT * T0.X, T0.W, KC0[3].Y, KC0[2].W,
986; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
987; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; Max pattern under test: (a >= b) ? a : b, signed, at i64 width.
988  %tmp = icmp sge i64 %a, %b
989  %val = select i1 %tmp, i64 %a, i64 %b
990  store i64 %val, ptr addrspace(1) %out, align 8
991  ret void
992}
993
; Declaration of the workitem-id intrinsic, tagged with attribute group #0.
; NOTE(review): the scalar i16/i64 tests directly above neither call this
; intrinsic nor reference #0/#1 (they spell `nounwind` inline); presumably
; the declaration and groups are used by tests earlier in the file -- confirm
; before removing anything here.
994declare i32 @llvm.amdgcn.workitem.id.x() #0
995
; Attribute groups: #0 = nounwind + readnone, #1 = nounwind only.
996attributes #0 = { nounwind readnone }
997attributes #1 = { nounwind }
998