xref: /llvm-project/llvm/test/CodeGen/AMDGPU/max.i16.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s --check-prefix=VI
3; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX9
4
5; FIXME: Need to handle non-uniform case for function below (load without gep).
; Signed max of two i16 values expressed as icmp sge + select.
; The kernel indexes %aptr, %bptr and %out with the i32 returned by
; @llvm.amdgcn.workitem.id.x, loads one i16 from each input, keeps %a when
; %a >= %b (signed) and %b otherwise, and stores the result to %out.
; The autogenerated checks expect a single v_max_i16_e32 on both the fiji and
; the gfx900 configuration.
; NOTE(review): the loads and the store are marked align 4 although consecutive
; i16 elements are only 2 bytes apart - confirm that alignment is intended.
6define amdgpu_kernel void @v_test_imax_sge_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
7; VI-LABEL: v_test_imax_sge_i16:
8; VI:       ; %bb.0:
9; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
10; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
11; VI-NEXT:    v_lshlrev_b32_e32 v4, 1, v0
12; VI-NEXT:    s_waitcnt lgkmcnt(0)
13; VI-NEXT:    v_mov_b32_e32 v1, s3
14; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
15; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
16; VI-NEXT:    v_mov_b32_e32 v3, s5
17; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
18; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
19; VI-NEXT:    flat_load_ushort v5, v[0:1]
20; VI-NEXT:    flat_load_ushort v2, v[2:3]
21; VI-NEXT:    v_mov_b32_e32 v1, s1
22; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v4
23; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
24; VI-NEXT:    s_waitcnt vmcnt(0)
25; VI-NEXT:    v_max_i16_e32 v2, v5, v2
26; VI-NEXT:    flat_store_short v[0:1], v2
27; VI-NEXT:    s_endpgm
28;
29; GFX9-LABEL: v_test_imax_sge_i16:
30; GFX9:       ; %bb.0:
31; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
32; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
33; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
34; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
35; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3]
36; GFX9-NEXT:    global_load_ushort v2, v0, s[6:7]
37; GFX9-NEXT:    s_waitcnt vmcnt(0)
38; GFX9-NEXT:    v_max_i16_e32 v1, v1, v2
39; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
40; GFX9-NEXT:    s_endpgm
  ; Element addresses for this invocation: base pointer + %tid i16 elements.
41  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
42  %gep0 = getelementptr i16, ptr addrspace(1) %aptr, i32 %tid
43  %gep1 = getelementptr i16, ptr addrspace(1) %bptr, i32 %tid
44  %outgep = getelementptr i16, ptr addrspace(1) %out, i32 %tid
45  %a = load i16, ptr addrspace(1) %gep0, align 4
46  %b = load i16, ptr addrspace(1) %gep1, align 4
  ; Compare + select pair that the checks above expect to become one max.
47  %cmp = icmp sge i16 %a, %b
48  %val = select i1 %cmp, i16 %a, i16 %b
49  store i16 %val, ptr addrspace(1) %outgep, align 4
50  ret void
51}
52
53; FIXME: Need to handle non-uniform case for function below (load without gep).
; Signed max of two <2 x i16> vectors expressed as icmp sge + select.
; The kernel indexes all three pointers with the i32 returned by
; @llvm.amdgcn.workitem.id.x, loads one <2 x i16> from each input, selects
; element-wise between %a and %b, and stores the resulting vector to %out.
; Expected code per the autogenerated checks:
;   fiji   - v_max_i16_e32 for the low half, v_max_i16_sdwa on WORD_1 for the
;            high half, then v_or_b32 to combine the two results.
;   gfx900 - a single packed v_pk_max_i16.
54define amdgpu_kernel void @v_test_imax_sge_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
55; VI-LABEL: v_test_imax_sge_v2i16:
56; VI:       ; %bb.0:
57; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
58; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
59; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
60; VI-NEXT:    s_waitcnt lgkmcnt(0)
61; VI-NEXT:    v_mov_b32_e32 v1, s3
62; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
63; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
64; VI-NEXT:    v_mov_b32_e32 v3, s5
65; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
66; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
67; VI-NEXT:    flat_load_dword v5, v[0:1]
68; VI-NEXT:    flat_load_dword v2, v[2:3]
69; VI-NEXT:    v_mov_b32_e32 v1, s1
70; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v4
71; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
72; VI-NEXT:    s_waitcnt vmcnt(0)
73; VI-NEXT:    v_max_i16_e32 v3, v5, v2
74; VI-NEXT:    v_max_i16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
75; VI-NEXT:    v_or_b32_e32 v2, v3, v2
76; VI-NEXT:    flat_store_dword v[0:1], v2
77; VI-NEXT:    s_endpgm
78;
79; GFX9-LABEL: v_test_imax_sge_v2i16:
80; GFX9:       ; %bb.0:
81; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
82; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
83; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
84; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
85; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
86; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
87; GFX9-NEXT:    s_waitcnt vmcnt(0)
88; GFX9-NEXT:    v_pk_max_i16 v1, v1, v2
89; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
90; GFX9-NEXT:    s_endpgm
  ; Element addresses for this invocation: base pointer + %tid vectors.
91  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
92  %gep0 = getelementptr <2 x i16>, ptr addrspace(1) %aptr, i32 %tid
93  %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %bptr, i32 %tid
94  %outgep = getelementptr <2 x i16>, ptr addrspace(1) %out, i32 %tid
95  %a = load <2 x i16>, ptr addrspace(1) %gep0, align 4
96  %b = load <2 x i16>, ptr addrspace(1) %gep1, align 4
  ; Element-wise compare + select that the checks expect to become max ops.
97  %cmp = icmp sge <2 x i16> %a, %b
98  %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
99  store <2 x i16> %val, ptr addrspace(1) %outgep, align 4
100  ret void
101}
102
103; FIXME: Need to handle non-uniform case for function below (load without gep).
; Signed max of two <3 x i16> vectors expressed as icmp sge + select.
; The kernel indexes all three pointers with the i32 returned by
; @llvm.amdgcn.workitem.id.x, loads one <3 x i16> from each input, selects
; element-wise between %a and %b, and stores the resulting vector to %out.
; Per the autogenerated checks the index is scaled by 8 bytes per vector and
; each operand is fetched as a dword plus a separate 16-bit load located
; 4 bytes further on; the result is written back as a short at byte offset 4
; and a dword at offset 0. Expected max instructions:
;   fiji   - v_max_i16_e32 + v_max_i16_sdwa + v_or_b32 for the dword part and
;            one more v_max_i16_e32 for the 16-bit part.
;   gfx900 - two v_pk_max_i16, one per loaded piece.
104define amdgpu_kernel void @v_test_imax_sge_v3i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
105; VI-LABEL: v_test_imax_sge_v3i16:
106; VI:       ; %bb.0:
107; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
108; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
109; VI-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
110; VI-NEXT:    s_waitcnt lgkmcnt(0)
111; VI-NEXT:    v_mov_b32_e32 v1, s3
112; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
113; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
114; VI-NEXT:    v_mov_b32_e32 v3, s5
115; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v6
116; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
117; VI-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
118; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
119; VI-NEXT:    flat_load_ushort v4, v[4:5]
120; VI-NEXT:    flat_load_dword v5, v[0:1]
121; VI-NEXT:    flat_load_dword v7, v[2:3]
122; VI-NEXT:    v_add_u32_e32 v0, vcc, 4, v2
123; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
124; VI-NEXT:    flat_load_ushort v8, v[0:1]
125; VI-NEXT:    v_mov_b32_e32 v1, s1
126; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
127; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
128; VI-NEXT:    v_add_u32_e32 v2, vcc, 4, v0
129; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
130; VI-NEXT:    s_waitcnt vmcnt(1)
131; VI-NEXT:    v_max_i16_e32 v6, v5, v7
132; VI-NEXT:    v_max_i16_sdwa v5, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
133; VI-NEXT:    v_or_b32_e32 v5, v6, v5
134; VI-NEXT:    s_waitcnt vmcnt(0)
135; VI-NEXT:    v_max_i16_e32 v4, v4, v8
136; VI-NEXT:    flat_store_short v[2:3], v4
137; VI-NEXT:    flat_store_dword v[0:1], v5
138; VI-NEXT:    s_endpgm
139;
140; GFX9-LABEL: v_test_imax_sge_v3i16:
141; GFX9:       ; %bb.0:
142; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
143; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
144; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
145; GFX9-NEXT:    v_mov_b32_e32 v1, 0
146; GFX9-NEXT:    v_mov_b32_e32 v2, 0
147; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
148; GFX9-NEXT:    global_load_dword v3, v0, s[6:7]
149; GFX9-NEXT:    s_nop 0
150; GFX9-NEXT:    global_load_short_d16 v2, v0, s[2:3] offset:4
151; GFX9-NEXT:    s_nop 0
152; GFX9-NEXT:    global_load_dword v4, v0, s[2:3]
153; GFX9-NEXT:    s_nop 0
154; GFX9-NEXT:    global_load_short_d16 v1, v0, s[6:7] offset:4
155; GFX9-NEXT:    s_waitcnt vmcnt(1)
156; GFX9-NEXT:    v_pk_max_i16 v3, v4, v3
157; GFX9-NEXT:    s_waitcnt vmcnt(0)
158; GFX9-NEXT:    v_pk_max_i16 v1, v2, v1
159; GFX9-NEXT:    global_store_short v0, v1, s[0:1] offset:4
160; GFX9-NEXT:    global_store_dword v0, v3, s[0:1]
161; GFX9-NEXT:    s_endpgm
  ; Element addresses for this invocation: base pointer + %tid vectors.
162  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
163  %gep0 = getelementptr <3 x i16>, ptr addrspace(1) %aptr, i32 %tid
164  %gep1 = getelementptr <3 x i16>, ptr addrspace(1) %bptr, i32 %tid
165  %outgep = getelementptr <3 x i16>, ptr addrspace(1) %out, i32 %tid
166  %a = load <3 x i16>, ptr addrspace(1) %gep0, align 4
167  %b = load <3 x i16>, ptr addrspace(1) %gep1, align 4
  ; Element-wise compare + select that the checks expect to become max ops.
168  %cmp = icmp sge <3 x i16> %a, %b
169  %val = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
170  store <3 x i16> %val, ptr addrspace(1) %outgep, align 4
171  ret void
172}
173
174; FIXME: Need to handle non-uniform case for function below (load without gep).
; Signed max of two <4 x i16> vectors expressed as icmp sge + select.
; The kernel indexes all three pointers with the i32 returned by
; @llvm.amdgcn.workitem.id.x, loads one <4 x i16> from each input, selects
; element-wise between %a and %b, and stores the resulting vector to %out.
; Per the autogenerated checks each operand is fetched with one dwordx2 load
; and the result is written with one dwordx2 store. Expected max instructions:
;   fiji   - for each of the two dwords: v_max_i16_e32 + v_max_i16_sdwa on
;            WORD_1, combined with v_or_b32.
;   gfx900 - two v_pk_max_i16, one per dword.
175define amdgpu_kernel void @v_test_imax_sge_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
176; VI-LABEL: v_test_imax_sge_v4i16:
177; VI:       ; %bb.0:
178; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
179; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
180; VI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
181; VI-NEXT:    s_waitcnt lgkmcnt(0)
182; VI-NEXT:    v_mov_b32_e32 v1, s3
183; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
184; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
185; VI-NEXT:    v_mov_b32_e32 v3, s5
186; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
187; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
188; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
189; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
190; VI-NEXT:    v_mov_b32_e32 v5, s1
191; VI-NEXT:    v_add_u32_e32 v4, vcc, s0, v4
192; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
193; VI-NEXT:    s_waitcnt vmcnt(0)
194; VI-NEXT:    v_max_i16_e32 v6, v1, v3
195; VI-NEXT:    v_max_i16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
196; VI-NEXT:    v_max_i16_e32 v3, v0, v2
197; VI-NEXT:    v_max_i16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
198; VI-NEXT:    v_or_b32_e32 v1, v6, v1
199; VI-NEXT:    v_or_b32_e32 v0, v3, v0
200; VI-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
201; VI-NEXT:    s_endpgm
202;
203; GFX9-LABEL: v_test_imax_sge_v4i16:
204; GFX9:       ; %bb.0:
205; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
206; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
207; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
208; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
209; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[2:3]
210; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[6:7]
211; GFX9-NEXT:    s_waitcnt vmcnt(0)
212; GFX9-NEXT:    v_pk_max_i16 v1, v1, v3
213; GFX9-NEXT:    v_pk_max_i16 v0, v0, v2
214; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
215; GFX9-NEXT:    s_endpgm
  ; Element addresses for this invocation: base pointer + %tid vectors.
216  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
217  %gep0 = getelementptr <4 x i16>, ptr addrspace(1) %aptr, i32 %tid
218  %gep1 = getelementptr <4 x i16>, ptr addrspace(1) %bptr, i32 %tid
219  %outgep = getelementptr <4 x i16>, ptr addrspace(1) %out, i32 %tid
220  %a = load <4 x i16>, ptr addrspace(1) %gep0, align 4
221  %b = load <4 x i16>, ptr addrspace(1) %gep1, align 4
  ; Element-wise compare + select that the checks expect to become max ops.
222  %cmp = icmp sge <4 x i16> %a, %b
223  %val = select <4 x i1> %cmp, <4 x i16> %a, <4 x i16> %b
224  store <4 x i16> %val, ptr addrspace(1) %outgep, align 4
225  ret void
226}
227
228; FIXME: Need to handle non-uniform case for function below (load without gep).
; Signed max of two i16 values expressed as icmp sgt + select (strict compare).
; The kernel indexes %aptr, %bptr and %out with the i32 returned by
; @llvm.amdgcn.workitem.id.x, loads one i16 from each input, keeps %a when
; %a > %b (signed) and %b otherwise, and stores the result to %out.
; The autogenerated checks expect a single v_max_i16_e32 on both the fiji and
; the gfx900 configuration.
; NOTE(review): the loads and the store are marked align 4 although consecutive
; i16 elements are only 2 bytes apart - confirm that alignment is intended.
229define amdgpu_kernel void @v_test_imax_sgt_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
230; VI-LABEL: v_test_imax_sgt_i16:
231; VI:       ; %bb.0:
232; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
233; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
234; VI-NEXT:    v_lshlrev_b32_e32 v4, 1, v0
235; VI-NEXT:    s_waitcnt lgkmcnt(0)
236; VI-NEXT:    v_mov_b32_e32 v1, s3
237; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
238; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
239; VI-NEXT:    v_mov_b32_e32 v3, s5
240; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
241; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
242; VI-NEXT:    flat_load_ushort v5, v[0:1]
243; VI-NEXT:    flat_load_ushort v2, v[2:3]
244; VI-NEXT:    v_mov_b32_e32 v1, s1
245; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v4
246; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
247; VI-NEXT:    s_waitcnt vmcnt(0)
248; VI-NEXT:    v_max_i16_e32 v2, v5, v2
249; VI-NEXT:    flat_store_short v[0:1], v2
250; VI-NEXT:    s_endpgm
251;
252; GFX9-LABEL: v_test_imax_sgt_i16:
253; GFX9:       ; %bb.0:
254; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
255; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
256; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
257; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
258; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3]
259; GFX9-NEXT:    global_load_ushort v2, v0, s[6:7]
260; GFX9-NEXT:    s_waitcnt vmcnt(0)
261; GFX9-NEXT:    v_max_i16_e32 v1, v1, v2
262; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
263; GFX9-NEXT:    s_endpgm
  ; Element addresses for this invocation: base pointer + %tid i16 elements.
264  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
265  %gep0 = getelementptr i16, ptr addrspace(1) %aptr, i32 %tid
266  %gep1 = getelementptr i16, ptr addrspace(1) %bptr, i32 %tid
267  %outgep = getelementptr i16, ptr addrspace(1) %out, i32 %tid
268  %a = load i16, ptr addrspace(1) %gep0, align 4
269  %b = load i16, ptr addrspace(1) %gep1, align 4
  ; Compare + select pair that the checks above expect to become one max.
270  %cmp = icmp sgt i16 %a, %b
271  %val = select i1 %cmp, i16 %a, i16 %b
272  store i16 %val, ptr addrspace(1) %outgep, align 4
273  ret void
274}
275
276; FIXME: Need to handle non-uniform case for function below (load without gep).
; Unsigned max of two i16 values expressed as icmp uge + select.
; The kernel indexes %aptr, %bptr and %out with the i32 returned by
; @llvm.amdgcn.workitem.id.x, loads one i16 from each input, keeps %a when
; %a >= %b (unsigned) and %b otherwise, and stores the result to %out.
; The autogenerated checks expect a single v_max_u16_e32 on both the fiji and
; the gfx900 configuration.
; NOTE(review): the loads and the store are marked align 4 although consecutive
; i16 elements are only 2 bytes apart - confirm that alignment is intended.
277define amdgpu_kernel void @v_test_umax_uge_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
278; VI-LABEL: v_test_umax_uge_i16:
279; VI:       ; %bb.0:
280; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
281; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
282; VI-NEXT:    v_lshlrev_b32_e32 v4, 1, v0
283; VI-NEXT:    s_waitcnt lgkmcnt(0)
284; VI-NEXT:    v_mov_b32_e32 v1, s3
285; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
286; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
287; VI-NEXT:    v_mov_b32_e32 v3, s5
288; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
289; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
290; VI-NEXT:    flat_load_ushort v5, v[0:1]
291; VI-NEXT:    flat_load_ushort v2, v[2:3]
292; VI-NEXT:    v_mov_b32_e32 v1, s1
293; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v4
294; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
295; VI-NEXT:    s_waitcnt vmcnt(0)
296; VI-NEXT:    v_max_u16_e32 v2, v5, v2
297; VI-NEXT:    flat_store_short v[0:1], v2
298; VI-NEXT:    s_endpgm
299;
300; GFX9-LABEL: v_test_umax_uge_i16:
301; GFX9:       ; %bb.0:
302; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
303; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
304; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
305; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
306; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3]
307; GFX9-NEXT:    global_load_ushort v2, v0, s[6:7]
308; GFX9-NEXT:    s_waitcnt vmcnt(0)
309; GFX9-NEXT:    v_max_u16_e32 v1, v1, v2
310; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
311; GFX9-NEXT:    s_endpgm
  ; Element addresses for this invocation: base pointer + %tid i16 elements.
312  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
313  %gep0 = getelementptr i16, ptr addrspace(1) %aptr, i32 %tid
314  %gep1 = getelementptr i16, ptr addrspace(1) %bptr, i32 %tid
315  %outgep = getelementptr i16, ptr addrspace(1) %out, i32 %tid
316  %a = load i16, ptr addrspace(1) %gep0, align 4
317  %b = load i16, ptr addrspace(1) %gep1, align 4
  ; Compare + select pair that the checks above expect to become one max.
318  %cmp = icmp uge i16 %a, %b
319  %val = select i1 %cmp, i16 %a, i16 %b
320  store i16 %val, ptr addrspace(1) %outgep, align 4
321  ret void
322}
323
324; FIXME: Need to handle non-uniform case for function below (load without gep).
; Unsigned max of two i16 values expressed as icmp ugt + select (strict
; compare).
; The kernel indexes %aptr, %bptr and %out with the i32 returned by
; @llvm.amdgcn.workitem.id.x, loads one i16 from each input, keeps %a when
; %a > %b (unsigned) and %b otherwise, and stores the result to %out.
; The autogenerated checks expect a single v_max_u16_e32 on both the fiji and
; the gfx900 configuration.
; NOTE(review): the loads and the store are marked align 4 although consecutive
; i16 elements are only 2 bytes apart - confirm that alignment is intended.
325define amdgpu_kernel void @v_test_umax_ugt_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
326; VI-LABEL: v_test_umax_ugt_i16:
327; VI:       ; %bb.0:
328; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
329; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
330; VI-NEXT:    v_lshlrev_b32_e32 v4, 1, v0
331; VI-NEXT:    s_waitcnt lgkmcnt(0)
332; VI-NEXT:    v_mov_b32_e32 v1, s3
333; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
334; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
335; VI-NEXT:    v_mov_b32_e32 v3, s5
336; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
337; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
338; VI-NEXT:    flat_load_ushort v5, v[0:1]
339; VI-NEXT:    flat_load_ushort v2, v[2:3]
340; VI-NEXT:    v_mov_b32_e32 v1, s1
341; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v4
342; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
343; VI-NEXT:    s_waitcnt vmcnt(0)
344; VI-NEXT:    v_max_u16_e32 v2, v5, v2
345; VI-NEXT:    flat_store_short v[0:1], v2
346; VI-NEXT:    s_endpgm
347;
348; GFX9-LABEL: v_test_umax_ugt_i16:
349; GFX9:       ; %bb.0:
350; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
351; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
352; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
353; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
354; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3]
355; GFX9-NEXT:    global_load_ushort v2, v0, s[6:7]
356; GFX9-NEXT:    s_waitcnt vmcnt(0)
357; GFX9-NEXT:    v_max_u16_e32 v1, v1, v2
358; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
359; GFX9-NEXT:    s_endpgm
  ; Element addresses for this invocation: base pointer + %tid i16 elements.
360  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
361  %gep0 = getelementptr i16, ptr addrspace(1) %aptr, i32 %tid
362  %gep1 = getelementptr i16, ptr addrspace(1) %bptr, i32 %tid
363  %outgep = getelementptr i16, ptr addrspace(1) %out, i32 %tid
364  %a = load i16, ptr addrspace(1) %gep0, align 4
365  %b = load i16, ptr addrspace(1) %gep1, align 4
  ; Compare + select pair that the checks above expect to become one max.
366  %cmp = icmp ugt i16 %a, %b
367  %val = select i1 %cmp, i16 %a, i16 %b
368  store i16 %val, ptr addrspace(1) %outgep, align 4
369  ret void
370}
371
; Unsigned max of two <2 x i16> vectors expressed as icmp ugt + select.
; The kernel indexes all three pointers with the i32 returned by
; @llvm.amdgcn.workitem.id.x, loads one <2 x i16> from each input, selects
; element-wise between %a and %b, and stores the resulting vector to %out.
; Expected code per the autogenerated checks:
;   fiji   - v_max_u16_e32 for the low half, v_max_u16_sdwa on WORD_1 for the
;            high half, then v_or_b32 to combine the two results.
;   gfx900 - a single packed v_pk_max_u16.
372define amdgpu_kernel void @v_test_umax_ugt_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
373; VI-LABEL: v_test_umax_ugt_v2i16:
374; VI:       ; %bb.0:
375; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
376; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
377; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
378; VI-NEXT:    s_waitcnt lgkmcnt(0)
379; VI-NEXT:    v_mov_b32_e32 v1, s3
380; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
381; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
382; VI-NEXT:    v_mov_b32_e32 v3, s5
383; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
384; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
385; VI-NEXT:    flat_load_dword v5, v[0:1]
386; VI-NEXT:    flat_load_dword v2, v[2:3]
387; VI-NEXT:    v_mov_b32_e32 v1, s1
388; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v4
389; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
390; VI-NEXT:    s_waitcnt vmcnt(0)
391; VI-NEXT:    v_max_u16_e32 v3, v5, v2
392; VI-NEXT:    v_max_u16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
393; VI-NEXT:    v_or_b32_e32 v2, v3, v2
394; VI-NEXT:    flat_store_dword v[0:1], v2
395; VI-NEXT:    s_endpgm
396;
397; GFX9-LABEL: v_test_umax_ugt_v2i16:
398; GFX9:       ; %bb.0:
399; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
400; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
401; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
402; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
403; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
404; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
405; GFX9-NEXT:    s_waitcnt vmcnt(0)
406; GFX9-NEXT:    v_pk_max_u16 v1, v1, v2
407; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
408; GFX9-NEXT:    s_endpgm
  ; Element addresses for this invocation: base pointer + %tid vectors.
409  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
410  %gep0 = getelementptr <2 x i16>, ptr addrspace(1) %aptr, i32 %tid
411  %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %bptr, i32 %tid
412  %outgep = getelementptr <2 x i16>, ptr addrspace(1) %out, i32 %tid
413  %a = load <2 x i16>, ptr addrspace(1) %gep0, align 4
414  %b = load <2 x i16>, ptr addrspace(1) %gep1, align 4
  ; Element-wise compare + select that the checks expect to become max ops.
415  %cmp = icmp ugt <2 x i16> %a, %b
416  %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
417  store <2 x i16> %val, ptr addrspace(1) %outgep, align 4
418  ret void
419}
420
; Declaration of the intrinsic that each kernel above calls to obtain %tid,
; the i32 index applied to the input and output pointers.
421declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
422