xref: /llvm-project/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,GFX9
3; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,VI
4; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GFX10
5; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GFX11
6
7; FIXME: Need to handle non-uniform case for function below (load without gep).
; Per-workitem operands: each workitem does a volatile load of one <2 x i16>
; from %in0 and one from %in1 (both indexed by its workitem id x) and subtracts
; the second from the first.
; The gfx900, gfx1010 and gfx1100 checks expect a single packed v_pk_sub_i16.
; The tonga checks expect the two halves to be subtracted separately
; (v_sub_u16 plus an SDWA v_sub_u16 on WORD_1) and recombined with v_or_b32.
8define amdgpu_kernel void @v_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
9; GFX9-LABEL: v_test_sub_v2i16:
10; GFX9:       ; %bb.0:
11; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
12; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
13; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
14; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
15; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
16; GFX9-NEXT:    s_waitcnt vmcnt(0)
17; GFX9-NEXT:    global_load_dword v2, v0, s[6:7] glc
18; GFX9-NEXT:    s_waitcnt vmcnt(0)
19; GFX9-NEXT:    s_mov_b32 s3, 0xf000
20; GFX9-NEXT:    s_mov_b32 s2, -1
21; GFX9-NEXT:    v_pk_sub_i16 v0, v1, v2
22; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
23; GFX9-NEXT:    s_endpgm
24;
25; VI-LABEL: v_test_sub_v2i16:
26; VI:       ; %bb.0:
27; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
28; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
29; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
30; VI-NEXT:    s_waitcnt lgkmcnt(0)
31; VI-NEXT:    v_mov_b32_e32 v1, s3
32; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
33; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
34; VI-NEXT:    v_mov_b32_e32 v3, s5
35; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v2
36; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
37; VI-NEXT:    flat_load_dword v0, v[0:1] glc
38; VI-NEXT:    s_waitcnt vmcnt(0)
39; VI-NEXT:    flat_load_dword v1, v[2:3] glc
40; VI-NEXT:    s_waitcnt vmcnt(0)
41; VI-NEXT:    s_mov_b32 s3, 0xf000
42; VI-NEXT:    s_mov_b32 s2, -1
43; VI-NEXT:    v_sub_u16_e32 v2, v0, v1
44; VI-NEXT:    v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
45; VI-NEXT:    v_or_b32_e32 v0, v2, v0
46; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
47; VI-NEXT:    s_endpgm
48;
49; GFX10-LABEL: v_test_sub_v2i16:
50; GFX10:       ; %bb.0:
51; GFX10-NEXT:    s_clause 0x1
52; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
53; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
54; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
55; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
56; GFX10-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
57; GFX10-NEXT:    s_waitcnt vmcnt(0)
58; GFX10-NEXT:    global_load_dword v2, v0, s[6:7] glc dlc
59; GFX10-NEXT:    s_waitcnt vmcnt(0)
60; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
61; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
62; GFX10-NEXT:    s_mov_b32 s2, -1
63; GFX10-NEXT:    v_pk_sub_i16 v0, v1, v2
64; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
65; GFX10-NEXT:    s_endpgm
66;
67; GFX11-LABEL: v_test_sub_v2i16:
68; GFX11:       ; %bb.0:
69; GFX11-NEXT:    s_clause 0x1
70; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
71; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
72; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
73; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
74; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
75; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
76; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3] glc dlc
77; GFX11-NEXT:    s_waitcnt vmcnt(0)
78; GFX11-NEXT:    global_load_b32 v0, v0, s[4:5] glc dlc
79; GFX11-NEXT:    s_waitcnt vmcnt(0)
80; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
81; GFX11-NEXT:    s_mov_b32 s2, -1
82; GFX11-NEXT:    v_pk_sub_i16 v0, v1, v0
83; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
84; GFX11-NEXT:    s_endpgm
85  %tid = call i32 @llvm.amdgcn.workitem.id.x()
; NOTE(review): %gep.out is computed but never used; the store at the end of
; this function writes to %out itself, so every workitem stores to one address.
86  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
87  %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
88  %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid
89  %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
90  %b = load volatile <2 x i16>, ptr addrspace(1) %gep.in1
; The result is named %add although the operation is a sub.
91  %add = sub <2 x i16> %a, %b
92  store <2 x i16> %add, ptr addrspace(1) %out
93  ret void
94}
95
; Operands loaded (non-volatile) straight through the addrspace(4) kernel
; argument pointers, with no per-workitem indexing. All four check sequences
; expect the operands to be fetched with scalar loads (s_load_dword / s_load_b32).
; Expected subtract per target:
;   gfx900  - one operand is copied to a VGPR, then v_pk_sub_i16 sgpr, vgpr
;   gfx1010 - v_pk_sub_i16 with both operands in SGPRs
;   gfx1100 - v_pk_sub_i16 with both operands in SGPRs
;   tonga   - scalar code only: s_lshr_b32 to extract the high halves, two
;             s_sub_i32, then s_and_b32 / s_lshl_b32 / s_or_b32 to repack
96define amdgpu_kernel void @s_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in0, ptr addrspace(4) %in1) #1 {
97; GFX9-LABEL: s_test_sub_v2i16:
98; GFX9:       ; %bb.0:
99; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
100; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
101; GFX9-NEXT:    s_mov_b32 s7, 0xf000
102; GFX9-NEXT:    s_mov_b32 s6, -1
103; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
104; GFX9-NEXT:    s_load_dword s10, s[8:9], 0x0
105; GFX9-NEXT:    s_load_dword s11, s[2:3], 0x0
106; GFX9-NEXT:    s_mov_b32 s4, s0
107; GFX9-NEXT:    s_mov_b32 s5, s1
108; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
109; GFX9-NEXT:    v_mov_b32_e32 v0, s10
110; GFX9-NEXT:    v_pk_sub_i16 v0, s11, v0
111; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
112; GFX9-NEXT:    s_endpgm
113;
114; VI-LABEL: s_test_sub_v2i16:
115; VI:       ; %bb.0:
116; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
117; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
118; VI-NEXT:    s_mov_b32 s7, 0xf000
119; VI-NEXT:    s_mov_b32 s6, -1
120; VI-NEXT:    s_waitcnt lgkmcnt(0)
121; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
122; VI-NEXT:    s_load_dword s3, s[4:5], 0x0
123; VI-NEXT:    s_mov_b32 s4, s0
124; VI-NEXT:    s_mov_b32 s5, s1
125; VI-NEXT:    s_waitcnt lgkmcnt(0)
126; VI-NEXT:    s_lshr_b32 s0, s2, 16
127; VI-NEXT:    s_lshr_b32 s1, s3, 16
128; VI-NEXT:    s_sub_i32 s2, s2, s3
129; VI-NEXT:    s_sub_i32 s0, s0, s1
130; VI-NEXT:    s_and_b32 s1, s2, 0xffff
131; VI-NEXT:    s_lshl_b32 s0, s0, 16
132; VI-NEXT:    s_or_b32 s0, s1, s0
133; VI-NEXT:    v_mov_b32_e32 v0, s0
134; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
135; VI-NEXT:    s_endpgm
136;
137; GFX10-LABEL: s_test_sub_v2i16:
138; GFX10:       ; %bb.0:
139; GFX10-NEXT:    s_clause 0x1
140; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
141; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
142; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
143; GFX10-NEXT:    s_load_dword s4, s[2:3], 0x0
144; GFX10-NEXT:    s_load_dword s5, s[6:7], 0x0
145; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
146; GFX10-NEXT:    s_mov_b32 s2, -1
147; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
148; GFX10-NEXT:    v_pk_sub_i16 v0, s4, s5
149; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
150; GFX10-NEXT:    s_endpgm
151;
152; GFX11-LABEL: s_test_sub_v2i16:
153; GFX11:       ; %bb.0:
154; GFX11-NEXT:    s_clause 0x1
155; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
156; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
157; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
158; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
159; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x0
160; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
161; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
162; GFX11-NEXT:    v_pk_sub_i16 v0, s2, s4
163; GFX11-NEXT:    s_mov_b32 s2, -1
164; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
165; GFX11-NEXT:    s_endpgm
166  %a = load <2 x i16>, ptr addrspace(4) %in0
167  %b = load <2 x i16>, ptr addrspace(4) %in1
; The result is named %add although the operation is a sub.
168  %add = sub <2 x i16> %a, %b
169  store <2 x i16> %add, ptr addrspace(1) %out
170  ret void
171}
172
; Subtracts a loaded <2 x i16> from itself. Every check sequence expects the
; whole computation to fold away: only the %out pointer is fetched from the
; kernel arguments, a constant 0 is moved into v0 and stored, and there is no
; load through %in0 and no subtract instruction.
; The gfx900 and tonga RUN lines share the GCN prefix here because their
; expected output is identical for this function.
173define amdgpu_kernel void @s_test_sub_self_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in0) #1 {
174; GCN-LABEL: s_test_sub_self_v2i16:
175; GCN:       ; %bb.0:
176; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
177; GCN-NEXT:    s_mov_b32 s3, 0xf000
178; GCN-NEXT:    s_mov_b32 s2, -1
179; GCN-NEXT:    v_mov_b32_e32 v0, 0
180; GCN-NEXT:    s_waitcnt lgkmcnt(0)
181; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
182; GCN-NEXT:    s_endpgm
183;
184; GFX10-LABEL: s_test_sub_self_v2i16:
185; GFX10:       ; %bb.0:
186; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
187; GFX10-NEXT:    v_mov_b32_e32 v0, 0
188; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
189; GFX10-NEXT:    s_mov_b32 s2, -1
190; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
191; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
192; GFX10-NEXT:    s_endpgm
193;
194; GFX11-LABEL: s_test_sub_self_v2i16:
195; GFX11:       ; %bb.0:
196; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
197; GFX11-NEXT:    v_mov_b32_e32 v0, 0
198; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
199; GFX11-NEXT:    s_mov_b32 s2, -1
200; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
201; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
202; GFX11-NEXT:    s_endpgm
203  %a = load <2 x i16>, ptr addrspace(4) %in0
; x - x: both operands are the same value, so the result is all zeros.
204  %add = sub <2 x i16> %a, %a
205  store <2 x i16> %add, ptr addrspace(1) %out
206  ret void
207}
208
209; FIXME: VI should not scalarize arg access.
; Both <2 x i16> operands are passed by value as kernel arguments. Every check
; sequence expects %out, %a and %b to arrive through one 128-bit scalar load at
; kernarg offset 0x24, with %a in s2 and %b in s3.
; Expected subtract per target:
;   gfx900  - %b is copied to a VGPR, then v_pk_sub_i16 v0, s2, v0
;   gfx1010 - v_pk_sub_i16 v0, s2, s3
;   gfx1100 - v_pk_sub_i16 v0, s2, s3
;   tonga   - scalar shift / s_sub_i32 / mask / or sequence on the two halves
210define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x i16> %a, <2 x i16> %b) #1 {
211; GFX9-LABEL: s_test_sub_v2i16_kernarg:
212; GFX9:       ; %bb.0:
213; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
214; GFX9-NEXT:    s_mov_b32 s7, 0xf000
215; GFX9-NEXT:    s_mov_b32 s6, -1
216; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
217; GFX9-NEXT:    v_mov_b32_e32 v0, s3
218; GFX9-NEXT:    s_mov_b32 s4, s0
219; GFX9-NEXT:    s_mov_b32 s5, s1
220; GFX9-NEXT:    v_pk_sub_i16 v0, s2, v0
221; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
222; GFX9-NEXT:    s_endpgm
223;
224; VI-LABEL: s_test_sub_v2i16_kernarg:
225; VI:       ; %bb.0:
226; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
227; VI-NEXT:    s_mov_b32 s7, 0xf000
228; VI-NEXT:    s_mov_b32 s6, -1
229; VI-NEXT:    s_waitcnt lgkmcnt(0)
230; VI-NEXT:    s_mov_b32 s4, s0
231; VI-NEXT:    s_mov_b32 s5, s1
232; VI-NEXT:    s_lshr_b32 s0, s2, 16
233; VI-NEXT:    s_lshr_b32 s1, s3, 16
234; VI-NEXT:    s_sub_i32 s0, s0, s1
235; VI-NEXT:    s_sub_i32 s1, s2, s3
236; VI-NEXT:    s_lshl_b32 s0, s0, 16
237; VI-NEXT:    s_and_b32 s1, s1, 0xffff
238; VI-NEXT:    s_or_b32 s0, s1, s0
239; VI-NEXT:    v_mov_b32_e32 v0, s0
240; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
241; VI-NEXT:    s_endpgm
242;
243; GFX10-LABEL: s_test_sub_v2i16_kernarg:
244; GFX10:       ; %bb.0:
245; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
246; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
247; GFX10-NEXT:    s_mov_b32 s6, -1
248; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
249; GFX10-NEXT:    v_pk_sub_i16 v0, s2, s3
250; GFX10-NEXT:    s_mov_b32 s4, s0
251; GFX10-NEXT:    s_mov_b32 s5, s1
252; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
253; GFX10-NEXT:    s_endpgm
254;
255; GFX11-LABEL: s_test_sub_v2i16_kernarg:
256; GFX11:       ; %bb.0:
257; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
258; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
259; GFX11-NEXT:    s_mov_b32 s6, -1
260; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
261; GFX11-NEXT:    v_pk_sub_i16 v0, s2, s3
262; GFX11-NEXT:    s_mov_b32 s4, s0
263; GFX11-NEXT:    s_mov_b32 s5, s1
264; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
265; GFX11-NEXT:    s_endpgm
; The result is named %add although the operation is a sub.
266  %add = sub <2 x i16> %a, %b
267  store <2 x i16> %add, ptr addrspace(1) %out
268  ret void
269}
270
; Subtracts the non-inline constant vector <123, 456> from a per-workitem
; volatile load. Packed into 32 bits the constant is 0x01c8007b
; (high half 0x01c8 = 456, low half 0x007b = 123).
; Expected handling of the constant per target:
;   gfx900  - materialized in s4 with s_mov_b32, then used by v_pk_sub_i16
;   gfx1010 - used directly as a literal operand of v_pk_sub_i16
;   gfx1100 - used directly as a literal operand of v_pk_sub_i16
;   tonga   - the sub is emitted as two 16-bit adds of the negated elements:
;             0xff85 is -123 as a 16-bit value, 0xfffffe38 is -456
271define amdgpu_kernel void @v_test_sub_v2i16_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
272; GFX9-LABEL: v_test_sub_v2i16_constant:
273; GFX9:       ; %bb.0:
274; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
275; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
276; GFX9-NEXT:    s_mov_b32 s4, 0x1c8007b
277; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
278; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
279; GFX9-NEXT:    s_waitcnt vmcnt(0)
280; GFX9-NEXT:    s_mov_b32 s3, 0xf000
281; GFX9-NEXT:    s_mov_b32 s2, -1
282; GFX9-NEXT:    v_pk_sub_i16 v0, v0, s4
283; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
284; GFX9-NEXT:    s_endpgm
285;
286; VI-LABEL: v_test_sub_v2i16_constant:
287; VI:       ; %bb.0:
288; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
289; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
290; VI-NEXT:    s_waitcnt lgkmcnt(0)
291; VI-NEXT:    v_mov_b32_e32 v1, s3
292; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
293; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
294; VI-NEXT:    flat_load_dword v0, v[0:1] glc
295; VI-NEXT:    s_waitcnt vmcnt(0)
296; VI-NEXT:    v_mov_b32_e32 v1, 0xfffffe38
297; VI-NEXT:    s_mov_b32 s3, 0xf000
298; VI-NEXT:    s_mov_b32 s2, -1
299; VI-NEXT:    v_add_u16_e32 v2, 0xff85, v0
300; VI-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
301; VI-NEXT:    v_or_b32_e32 v0, v2, v0
302; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
303; VI-NEXT:    s_endpgm
304;
305; GFX10-LABEL: v_test_sub_v2i16_constant:
306; GFX10:       ; %bb.0:
307; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
308; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
309; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
310; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
311; GFX10-NEXT:    s_waitcnt vmcnt(0)
312; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
313; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
314; GFX10-NEXT:    s_mov_b32 s2, -1
315; GFX10-NEXT:    v_pk_sub_i16 v0, v0, 0x1c8007b
316; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
317; GFX10-NEXT:    s_endpgm
318;
319; GFX11-LABEL: v_test_sub_v2i16_constant:
320; GFX11:       ; %bb.0:
321; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
322; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
323; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
324; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
325; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
326; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] glc dlc
327; GFX11-NEXT:    s_waitcnt vmcnt(0)
328; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
329; GFX11-NEXT:    s_mov_b32 s2, -1
330; GFX11-NEXT:    v_pk_sub_i16 v0, v0, 0x1c8007b
331; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
332; GFX11-NEXT:    s_endpgm
333  %tid = call i32 @llvm.amdgcn.workitem.id.x()
; NOTE(review): %gep.out is computed but never used; the store at the end of
; this function writes to %out itself.
334  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
335  %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
336  %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
337  %add = sub <2 x i16> %a, <i16 123, i16 456>
338  store <2 x i16> %add, ptr addrspace(1) %out
339  ret void
340}
341
342; FIXME: Need to handle non-uniform case for function below (load without gep).
; Same shape as the positive-constant test, but the subtrahend is the negative
; vector <-845, -991>. Packed into 32 bits it is 0xfc21fcb3
; (high half 0xfc21 = -991, low half 0xfcb3 = -845 as 16-bit values).
; Expected handling of the constant per target:
;   gfx900  - materialized in s4 with s_mov_b32, then used by v_pk_sub_i16
;   gfx1010 - used directly as a literal operand of v_pk_sub_i16
;   gfx1100 - used directly as a literal operand of v_pk_sub_i16
;   tonga   - the sub is emitted as two 16-bit adds of the positive
;             magnitudes: 0x34d = 845 for the low half, 0x3df = 991 for the high
343define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
344; GFX9-LABEL: v_test_sub_v2i16_neg_constant:
345; GFX9:       ; %bb.0:
346; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
347; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
348; GFX9-NEXT:    s_mov_b32 s4, 0xfc21fcb3
349; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
350; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
351; GFX9-NEXT:    s_waitcnt vmcnt(0)
352; GFX9-NEXT:    s_mov_b32 s3, 0xf000
353; GFX9-NEXT:    s_mov_b32 s2, -1
354; GFX9-NEXT:    v_pk_sub_i16 v0, v0, s4
355; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
356; GFX9-NEXT:    s_endpgm
357;
358; VI-LABEL: v_test_sub_v2i16_neg_constant:
359; VI:       ; %bb.0:
360; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
361; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
362; VI-NEXT:    s_waitcnt lgkmcnt(0)
363; VI-NEXT:    v_mov_b32_e32 v1, s3
364; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
365; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
366; VI-NEXT:    flat_load_dword v0, v[0:1] glc
367; VI-NEXT:    s_waitcnt vmcnt(0)
368; VI-NEXT:    v_mov_b32_e32 v1, 0x3df
369; VI-NEXT:    s_mov_b32 s3, 0xf000
370; VI-NEXT:    s_mov_b32 s2, -1
371; VI-NEXT:    v_add_u16_e32 v2, 0x34d, v0
372; VI-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
373; VI-NEXT:    v_or_b32_e32 v0, v2, v0
374; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
375; VI-NEXT:    s_endpgm
376;
377; GFX10-LABEL: v_test_sub_v2i16_neg_constant:
378; GFX10:       ; %bb.0:
379; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
380; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
381; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
382; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
383; GFX10-NEXT:    s_waitcnt vmcnt(0)
384; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
385; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
386; GFX10-NEXT:    s_mov_b32 s2, -1
387; GFX10-NEXT:    v_pk_sub_i16 v0, v0, 0xfc21fcb3
388; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
389; GFX10-NEXT:    s_endpgm
390;
391; GFX11-LABEL: v_test_sub_v2i16_neg_constant:
392; GFX11:       ; %bb.0:
393; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
394; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
395; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
396; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
397; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
398; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] glc dlc
399; GFX11-NEXT:    s_waitcnt vmcnt(0)
400; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
401; GFX11-NEXT:    s_mov_b32 s2, -1
402; GFX11-NEXT:    v_pk_sub_i16 v0, v0, 0xfc21fcb3
403; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
404; GFX11-NEXT:    s_endpgm
405  %tid = call i32 @llvm.amdgcn.workitem.id.x()
; NOTE(review): %gep.out is computed but never used; the store at the end of
; this function writes to %out itself.
406  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
407  %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
408  %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
409  %add = sub <2 x i16> %a, <i16 -845, i16 -991>
410  store <2 x i16> %add, ptr addrspace(1) %out
411  ret void
412}
413
; Subtracts the splat <-1, -1> from a per-workitem volatile load.
; The gfx900, gfx1010 and gfx1100 checks expect the constant to appear as the
; operand -1 of v_pk_sub_i16, with no separate instruction to materialize it.
; The tonga checks expect the sub of -1 to be emitted as an add of 1 on each
; half (v_add_u16 for the low half, SDWA v_add_u16 for the high half).
414define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
415; GFX9-LABEL: v_test_sub_v2i16_inline_neg1:
416; GFX9:       ; %bb.0:
417; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
418; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
419; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
420; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
421; GFX9-NEXT:    s_waitcnt vmcnt(0)
422; GFX9-NEXT:    s_mov_b32 s3, 0xf000
423; GFX9-NEXT:    s_mov_b32 s2, -1
424; GFX9-NEXT:    v_pk_sub_i16 v0, v0, -1
425; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
426; GFX9-NEXT:    s_endpgm
427;
428; VI-LABEL: v_test_sub_v2i16_inline_neg1:
429; VI:       ; %bb.0:
430; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
431; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
432; VI-NEXT:    s_waitcnt lgkmcnt(0)
433; VI-NEXT:    v_mov_b32_e32 v1, s3
434; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
435; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
436; VI-NEXT:    flat_load_dword v0, v[0:1] glc
437; VI-NEXT:    s_waitcnt vmcnt(0)
438; VI-NEXT:    v_mov_b32_e32 v1, 1
439; VI-NEXT:    s_mov_b32 s3, 0xf000
440; VI-NEXT:    s_mov_b32 s2, -1
441; VI-NEXT:    v_add_u16_e32 v2, 1, v0
442; VI-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
443; VI-NEXT:    v_or_b32_e32 v0, v2, v0
444; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
445; VI-NEXT:    s_endpgm
446;
447; GFX10-LABEL: v_test_sub_v2i16_inline_neg1:
448; GFX10:       ; %bb.0:
449; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
450; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
451; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
452; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
453; GFX10-NEXT:    s_waitcnt vmcnt(0)
454; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
455; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
456; GFX10-NEXT:    s_mov_b32 s2, -1
457; GFX10-NEXT:    v_pk_sub_i16 v0, v0, -1
458; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
459; GFX10-NEXT:    s_endpgm
460;
461; GFX11-LABEL: v_test_sub_v2i16_inline_neg1:
462; GFX11:       ; %bb.0:
463; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
464; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
465; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
466; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
467; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
468; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] glc dlc
469; GFX11-NEXT:    s_waitcnt vmcnt(0)
470; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
471; GFX11-NEXT:    s_mov_b32 s2, -1
472; GFX11-NEXT:    v_pk_sub_i16 v0, v0, -1
473; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
474; GFX11-NEXT:    s_endpgm
475  %tid = call i32 @llvm.amdgcn.workitem.id.x()
; NOTE(review): %gep.out is computed but never used; the store at the end of
; this function writes to %out itself.
476  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
477  %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
478  %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
479  %add = sub <2 x i16> %a, <i16 -1, i16 -1>
480  store <2 x i16> %add, ptr addrspace(1) %out
481  ret void
482}
483
; Subtracts <32, 0>: only the low element changes, the high element has 0
; subtracted from it.
; The gfx900, gfx1010 and gfx1100 checks expect the constant to appear as the
; operand 32 of v_pk_sub_i16, with no separate instruction to materialize it.
; The tonga checks expect no arithmetic on the high half at all: it is kept
; with v_and_b32 0xffff0000, the low half gets v_subrev_u16 32, and the two are
; merged with v_or_b32.
484define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
485; GFX9-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
486; GFX9:       ; %bb.0:
487; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
488; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
489; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
490; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
491; GFX9-NEXT:    s_waitcnt vmcnt(0)
492; GFX9-NEXT:    s_mov_b32 s3, 0xf000
493; GFX9-NEXT:    s_mov_b32 s2, -1
494; GFX9-NEXT:    v_pk_sub_i16 v0, v0, 32
495; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
496; GFX9-NEXT:    s_endpgm
497;
498; VI-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
499; VI:       ; %bb.0:
500; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
501; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
502; VI-NEXT:    s_waitcnt lgkmcnt(0)
503; VI-NEXT:    v_mov_b32_e32 v1, s3
504; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
505; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
506; VI-NEXT:    flat_load_dword v0, v[0:1] glc
507; VI-NEXT:    s_waitcnt vmcnt(0)
508; VI-NEXT:    s_mov_b32 s3, 0xf000
509; VI-NEXT:    s_mov_b32 s2, -1
510; VI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
511; VI-NEXT:    v_subrev_u16_e32 v0, 32, v0
512; VI-NEXT:    v_or_b32_e32 v0, v0, v1
513; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
514; VI-NEXT:    s_endpgm
515;
516; GFX10-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
517; GFX10:       ; %bb.0:
518; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
519; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
520; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
521; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
522; GFX10-NEXT:    s_waitcnt vmcnt(0)
523; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
524; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
525; GFX10-NEXT:    s_mov_b32 s2, -1
526; GFX10-NEXT:    v_pk_sub_i16 v0, v0, 32
527; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
528; GFX10-NEXT:    s_endpgm
529;
530; GFX11-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
531; GFX11:       ; %bb.0:
532; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
533; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
534; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
535; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
536; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
537; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] glc dlc
538; GFX11-NEXT:    s_waitcnt vmcnt(0)
539; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
540; GFX11-NEXT:    s_mov_b32 s2, -1
541; GFX11-NEXT:    v_pk_sub_i16 v0, v0, 32
542; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
543; GFX11-NEXT:    s_endpgm
544  %tid = call i32 @llvm.amdgcn.workitem.id.x()
; NOTE(review): %gep.out is computed but never used; the store at the end of
; this function writes to %out itself.
545  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
546  %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
547  %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
548  %add = sub <2 x i16> %a, <i16 32, i16 0>
549  store <2 x i16> %add, ptr addrspace(1) %out
550  ret void
551}
552
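553; The high element (16256 = 0x3f80) is the upper half of the f32 bit pattern of 1.0, so the packed constant <0, 16256> can be encoded as the fp inline constant 1.0.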
; Subtracts <0, 16256>. 16256 is 0x3f80, so the packed 32-bit constant is
; 0x3f800000, which is the bit pattern of the float 1.0.
; The gfx900, gfx1010 and gfx1100 checks expect the operand of v_pk_sub_i16 to
; be printed as 1.0, with no separate instruction to materialize it.
; The tonga checks expect only the high half to be touched: 0xffffc080 (-16256
; in its low 16 bits) is added to WORD_1 with an SDWA v_add_u16, and the
; untouched low half is merged in by an SDWA v_or_b32 selecting WORD_0.
554define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
555; GFX9-LABEL: v_test_sub_v2i16_inline_fp_split:
556; GFX9:       ; %bb.0:
557; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
558; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
559; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
560; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
561; GFX9-NEXT:    s_waitcnt vmcnt(0)
562; GFX9-NEXT:    s_mov_b32 s3, 0xf000
563; GFX9-NEXT:    s_mov_b32 s2, -1
564; GFX9-NEXT:    v_pk_sub_i16 v0, v0, 1.0
565; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
566; GFX9-NEXT:    s_endpgm
567;
568; VI-LABEL: v_test_sub_v2i16_inline_fp_split:
569; VI:       ; %bb.0:
570; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
571; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
572; VI-NEXT:    s_waitcnt lgkmcnt(0)
573; VI-NEXT:    v_mov_b32_e32 v1, s3
574; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
575; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
576; VI-NEXT:    flat_load_dword v0, v[0:1] glc
577; VI-NEXT:    s_waitcnt vmcnt(0)
578; VI-NEXT:    v_mov_b32_e32 v1, 0xffffc080
579; VI-NEXT:    s_mov_b32 s3, 0xf000
580; VI-NEXT:    s_mov_b32 s2, -1
581; VI-NEXT:    v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
582; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
583; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
584; VI-NEXT:    s_endpgm
585;
586; GFX10-LABEL: v_test_sub_v2i16_inline_fp_split:
587; GFX10:       ; %bb.0:
588; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
589; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
590; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
591; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
592; GFX10-NEXT:    s_waitcnt vmcnt(0)
593; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
594; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
595; GFX10-NEXT:    s_mov_b32 s2, -1
596; GFX10-NEXT:    v_pk_sub_i16 v0, v0, 1.0
597; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
598; GFX10-NEXT:    s_endpgm
599;
600; GFX11-LABEL: v_test_sub_v2i16_inline_fp_split:
601; GFX11:       ; %bb.0:
602; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
603; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
604; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
605; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
606; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
607; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] glc dlc
608; GFX11-NEXT:    s_waitcnt vmcnt(0)
609; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
610; GFX11-NEXT:    s_mov_b32 s2, -1
611; GFX11-NEXT:    v_pk_sub_i16 v0, v0, 1.0
612; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
613; GFX11-NEXT:    s_endpgm
614  %tid = call i32 @llvm.amdgcn.workitem.id.x()
; NOTE(review): %gep.out is computed but never used; the store at the end of
; this function writes to %out itself.
615  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
616  %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
617  %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
618  %add = sub <2 x i16> %a, <i16 0, i16 16256>
619  store <2 x i16> %add, ptr addrspace(1) %out
620  ret void
621}
622
623; FIXME: Need to handle non-uniform case for function below (load without gep).
; Subtracts two per-workitem volatile <2 x i16> loads, zero-extends the result
; to <2 x i32> and stores the 64-bit value.
; The gfx900, gfx1010 and gfx1100 checks expect a packed v_pk_sub_i16 followed
; by an unpack: v_lshrrev_b32 by 16 for the high element and v_and_b32 0xffff
; for the low element.
; The tonga checks expect no separate unpack: the low half comes from a plain
; v_sub_u16 and the high half from an SDWA v_sub_u16 that reads WORD_1 of both
; sources and writes with dst_sel:DWORD.
624define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
625; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i32:
626; GFX9:       ; %bb.0:
627; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
628; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
629; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
630; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
631; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
632; GFX9-NEXT:    s_waitcnt vmcnt(0)
633; GFX9-NEXT:    global_load_dword v2, v0, s[6:7] glc
634; GFX9-NEXT:    s_waitcnt vmcnt(0)
635; GFX9-NEXT:    s_mov_b32 s3, 0xf000
636; GFX9-NEXT:    s_mov_b32 s2, -1
637; GFX9-NEXT:    v_pk_sub_i16 v0, v1, v2
638; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
639; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
640; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
641; GFX9-NEXT:    s_endpgm
642;
643; VI-LABEL: v_test_sub_v2i16_zext_to_v2i32:
644; VI:       ; %bb.0:
645; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
646; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
647; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
648; VI-NEXT:    s_waitcnt lgkmcnt(0)
649; VI-NEXT:    v_mov_b32_e32 v1, s3
650; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
651; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
652; VI-NEXT:    v_mov_b32_e32 v3, s5
653; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v2
654; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
655; VI-NEXT:    flat_load_dword v1, v[0:1] glc
656; VI-NEXT:    s_waitcnt vmcnt(0)
657; VI-NEXT:    flat_load_dword v2, v[2:3] glc
658; VI-NEXT:    s_waitcnt vmcnt(0)
659; VI-NEXT:    s_mov_b32 s3, 0xf000
660; VI-NEXT:    s_mov_b32 s2, -1
661; VI-NEXT:    v_sub_u16_e32 v0, v1, v2
662; VI-NEXT:    v_sub_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
663; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
664; VI-NEXT:    s_endpgm
665;
666; GFX10-LABEL: v_test_sub_v2i16_zext_to_v2i32:
667; GFX10:       ; %bb.0:
668; GFX10-NEXT:    s_clause 0x1
669; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
670; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
671; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
672; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
673; GFX10-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
674; GFX10-NEXT:    s_waitcnt vmcnt(0)
675; GFX10-NEXT:    global_load_dword v2, v0, s[6:7] glc dlc
676; GFX10-NEXT:    s_waitcnt vmcnt(0)
677; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
678; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
679; GFX10-NEXT:    s_mov_b32 s2, -1
680; GFX10-NEXT:    v_pk_sub_i16 v0, v1, v2
681; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
682; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
683; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
684; GFX10-NEXT:    s_endpgm
685;
686; GFX11-LABEL: v_test_sub_v2i16_zext_to_v2i32:
687; GFX11:       ; %bb.0:
688; GFX11-NEXT:    s_clause 0x1
689; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
690; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
691; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
692; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
693; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
694; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
695; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3] glc dlc
696; GFX11-NEXT:    s_waitcnt vmcnt(0)
697; GFX11-NEXT:    global_load_b32 v0, v0, s[4:5] glc dlc
698; GFX11-NEXT:    s_waitcnt vmcnt(0)
699; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
700; GFX11-NEXT:    s_mov_b32 s2, -1
701; GFX11-NEXT:    v_pk_sub_i16 v0, v1, v0
702; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
703; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
704; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
705; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
706; GFX11-NEXT:    s_endpgm
707  %tid = call i32 @llvm.amdgcn.workitem.id.x()
; NOTE(review): %gep.out is computed (with a <2 x i32> element type) but never
; used; the store at the end of this function writes to %out itself.
708  %gep.out = getelementptr inbounds <2 x i32>, ptr addrspace(1) %out, i32 %tid
709  %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
710  %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid
711  %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
712  %b = load volatile <2 x i16>, ptr addrspace(1) %gep.in1
; The result is named %add although the operation is a sub.
713  %add = sub <2 x i16> %a, %b
714  %ext = zext <2 x i16> %add to <2 x i32>
715  store <2 x i32> %ext, ptr addrspace(1) %out
716  ret void
717}
718
719; FIXME: Need to handle non-uniform case for function below (load without gep).
; Kernel under test: subtract two <2 x i16> vectors and zero-extend the
; difference to <2 x i64>.
;   %out       - global (addrspace 1) pointer that receives the <2 x i64> store
;   %in0, %in1 - global pointers to <2 x i16> elements, indexed by workitem id
; Both input loads are volatile. The GFX9/GFX10/GFX11 check blocks expect a
; single v_pk_sub_i16, after which the low lane is masked with 0xffff, the high
; lane is extracted with v_alignbit_b32 by 16, and the upper dword of each i64
; is a zero register. The VI check block instead expects one v_sub_u16_e32 and
; one v_sub_u16_sdwa selecting WORD_1 of both sources.
720define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
721; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i64:
722; GFX9:       ; %bb.0:
723; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
724; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
725; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
726; GFX9-NEXT:    v_mov_b32_e32 v1, 0
727; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
728; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] glc
729; GFX9-NEXT:    s_waitcnt vmcnt(0)
730; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
731; GFX9-NEXT:    s_waitcnt vmcnt(0)
732; GFX9-NEXT:    s_mov_b32 s3, 0xf000
733; GFX9-NEXT:    s_mov_b32 s2, -1
734; GFX9-NEXT:    v_pk_sub_i16 v0, v2, v3
735; GFX9-NEXT:    v_alignbit_b32 v2, 0, v0, 16
736; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
737; GFX9-NEXT:    v_mov_b32_e32 v3, v1
738; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
739; GFX9-NEXT:    s_endpgm
740;
741; VI-LABEL: v_test_sub_v2i16_zext_to_v2i64:
742; VI:       ; %bb.0:
743; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
744; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
745; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
746; VI-NEXT:    s_waitcnt lgkmcnt(0)
747; VI-NEXT:    v_mov_b32_e32 v1, s3
748; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
749; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
750; VI-NEXT:    v_mov_b32_e32 v3, s5
751; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v2
752; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
753; VI-NEXT:    flat_load_dword v4, v[0:1] glc
754; VI-NEXT:    s_waitcnt vmcnt(0)
755; VI-NEXT:    flat_load_dword v2, v[2:3] glc
756; VI-NEXT:    s_waitcnt vmcnt(0)
757; VI-NEXT:    v_mov_b32_e32 v1, 0
758; VI-NEXT:    s_mov_b32 s3, 0xf000
759; VI-NEXT:    s_mov_b32 s2, -1
760; VI-NEXT:    v_mov_b32_e32 v3, v1
761; VI-NEXT:    v_sub_u16_e32 v0, v4, v2
762; VI-NEXT:    v_sub_u16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
763; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
764; VI-NEXT:    s_endpgm
765;
766; GFX10-LABEL: v_test_sub_v2i16_zext_to_v2i64:
767; GFX10:       ; %bb.0:
768; GFX10-NEXT:    s_clause 0x1
769; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
770; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
771; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
772; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
773; GFX10-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
774; GFX10-NEXT:    s_waitcnt vmcnt(0)
775; GFX10-NEXT:    global_load_dword v2, v0, s[6:7] glc dlc
776; GFX10-NEXT:    s_waitcnt vmcnt(0)
777; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
778; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
779; GFX10-NEXT:    s_mov_b32 s2, -1
780; GFX10-NEXT:    v_pk_sub_i16 v0, v1, v2
781; GFX10-NEXT:    v_mov_b32_e32 v1, 0
782; GFX10-NEXT:    v_alignbit_b32 v2, 0, v0, 16
783; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
784; GFX10-NEXT:    v_mov_b32_e32 v3, v1
785; GFX10-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
786; GFX10-NEXT:    s_endpgm
787;
788; GFX11-LABEL: v_test_sub_v2i16_zext_to_v2i64:
789; GFX11:       ; %bb.0:
790; GFX11-NEXT:    s_clause 0x1
791; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
792; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
793; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
794; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
795; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
796; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
797; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3] glc dlc
798; GFX11-NEXT:    s_waitcnt vmcnt(0)
799; GFX11-NEXT:    global_load_b32 v0, v0, s[4:5] glc dlc
800; GFX11-NEXT:    s_waitcnt vmcnt(0)
801; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
802; GFX11-NEXT:    s_mov_b32 s2, -1
803; GFX11-NEXT:    v_pk_sub_i16 v0, v1, v0
804; GFX11-NEXT:    v_mov_b32_e32 v1, 0
805; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
806; GFX11-NEXT:    v_alignbit_b32 v2, 0, v0, 16
807; GFX11-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xffff, v0
808; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
809; GFX11-NEXT:    s_endpgm
810  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; NOTE(review): %gep.out is computed but never used; the store at the end of
  ; this function writes to %out directly, so every workitem stores to the
  ; same address. Confirm whether that is intended before changing it, since
  ; the check lines above would have to be regenerated.
811  %gep.out = getelementptr inbounds <2 x i64>, ptr addrspace(1) %out, i32 %tid
812  %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
813  %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid
814  %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
815  %b = load volatile <2 x i16>, ptr addrspace(1) %gep.in1
  ; The value is named %add, but it holds the result of the sub under test.
816  %add = sub <2 x i16> %a, %b
817  %ext = zext <2 x i16> %add to <2 x i64>
818  store <2 x i64> %ext, ptr addrspace(1) %out
819  ret void
820}
821
822; FIXME: Need to handle non-uniform case for function below (load without gep).
; Kernel under test: subtract two <2 x i16> vectors and sign-extend the
; difference to <2 x i32>.
;   %out       - global (addrspace 1) pointer that receives the <2 x i32> store
;   %in0, %in1 - global pointers to <2 x i16> elements, indexed by workitem id
; Both input loads are volatile. The GFX9/GFX10/GFX11 check blocks expect a
; single v_pk_sub_i16 followed by v_ashrrev_i32 by 16 (high lane) and a 16-bit
; v_bfe_i32 (low lane). The VI check block expects a v_sub_u16_sdwa on WORD_1
; plus a v_sub_u16_e32, each followed by its own 16-bit v_bfe_i32.
823define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
824; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i32:
825; GFX9:       ; %bb.0:
826; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
827; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
828; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
829; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
830; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
831; GFX9-NEXT:    s_waitcnt vmcnt(0)
832; GFX9-NEXT:    global_load_dword v2, v0, s[6:7] glc
833; GFX9-NEXT:    s_waitcnt vmcnt(0)
834; GFX9-NEXT:    s_mov_b32 s3, 0xf000
835; GFX9-NEXT:    s_mov_b32 s2, -1
836; GFX9-NEXT:    v_pk_sub_i16 v0, v1, v2
837; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 16, v0
838; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 16
839; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
840; GFX9-NEXT:    s_endpgm
841;
842; VI-LABEL: v_test_sub_v2i16_sext_to_v2i32:
843; VI:       ; %bb.0:
844; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
845; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
846; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
847; VI-NEXT:    s_waitcnt lgkmcnt(0)
848; VI-NEXT:    v_mov_b32_e32 v1, s3
849; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
850; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
851; VI-NEXT:    v_mov_b32_e32 v3, s5
852; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v2
853; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
854; VI-NEXT:    flat_load_dword v0, v[0:1] glc
855; VI-NEXT:    s_waitcnt vmcnt(0)
856; VI-NEXT:    flat_load_dword v1, v[2:3] glc
857; VI-NEXT:    s_waitcnt vmcnt(0)
858; VI-NEXT:    s_mov_b32 s3, 0xf000
859; VI-NEXT:    s_mov_b32 s2, -1
860; VI-NEXT:    v_sub_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
861; VI-NEXT:    v_sub_u16_e32 v0, v0, v1
862; VI-NEXT:    v_bfe_i32 v0, v0, 0, 16
863; VI-NEXT:    v_bfe_i32 v1, v2, 0, 16
864; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
865; VI-NEXT:    s_endpgm
866;
867; GFX10-LABEL: v_test_sub_v2i16_sext_to_v2i32:
868; GFX10:       ; %bb.0:
869; GFX10-NEXT:    s_clause 0x1
870; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
871; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
872; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
873; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
874; GFX10-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
875; GFX10-NEXT:    s_waitcnt vmcnt(0)
876; GFX10-NEXT:    global_load_dword v2, v0, s[6:7] glc dlc
877; GFX10-NEXT:    s_waitcnt vmcnt(0)
878; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
879; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
880; GFX10-NEXT:    s_mov_b32 s2, -1
881; GFX10-NEXT:    v_pk_sub_i16 v0, v1, v2
882; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 16, v0
883; GFX10-NEXT:    v_bfe_i32 v0, v0, 0, 16
884; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
885; GFX10-NEXT:    s_endpgm
886;
887; GFX11-LABEL: v_test_sub_v2i16_sext_to_v2i32:
888; GFX11:       ; %bb.0:
889; GFX11-NEXT:    s_clause 0x1
890; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
891; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
892; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
893; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
894; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
895; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
896; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3] glc dlc
897; GFX11-NEXT:    s_waitcnt vmcnt(0)
898; GFX11-NEXT:    global_load_b32 v0, v0, s[4:5] glc dlc
899; GFX11-NEXT:    s_waitcnt vmcnt(0)
900; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
901; GFX11-NEXT:    s_mov_b32 s2, -1
902; GFX11-NEXT:    v_pk_sub_i16 v0, v1, v0
903; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
904; GFX11-NEXT:    v_ashrrev_i32_e32 v1, 16, v0
905; GFX11-NEXT:    v_bfe_i32 v0, v0, 0, 16
906; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
907; GFX11-NEXT:    s_endpgm
908  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; NOTE(review): %gep.out is computed but never used; the store at the end of
  ; this function writes to %out directly, so every workitem stores to the
  ; same address. Confirm whether that is intended before changing it, since
  ; the check lines above would have to be regenerated.
909  %gep.out = getelementptr inbounds <2 x i32>, ptr addrspace(1) %out, i32 %tid
910  %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
911  %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid
912  %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
913  %b = load volatile <2 x i16>, ptr addrspace(1) %gep.in1
  ; The value is named %add, but it holds the result of the sub under test.
914  %add = sub <2 x i16> %a, %b
915  %ext = sext <2 x i16> %add to <2 x i32>
916  store <2 x i32> %ext, ptr addrspace(1) %out
917  ret void
918}
919
920; FIXME: Need to handle non-uniform case for function below (load without gep).
; Kernel under test: subtract two <2 x i16> vectors and sign-extend the
; difference to <2 x i64>.
;   %out       - global (addrspace 1) pointer that receives the <2 x i64> store
;   %in0, %in1 - global pointers to <2 x i16> elements, indexed by workitem id
; The two input loads in this kernel are NOT volatile. Accordingly the check
; lines show both loads issued back to back without glc and followed by a
; single s_waitcnt vmcnt(0) (GFX10/GFX11 additionally wrap them in s_clause).
; Sign extension is expected as a 16-bit v_bfe_i32 per lane for the low dword
; and v_ashrrev_i32 by 31 for the high dword of each i64.
921define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
922; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i64:
923; GFX9:       ; %bb.0:
924; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
925; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
926; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
927; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
928; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
929; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
930; GFX9-NEXT:    s_mov_b32 s3, 0xf000
931; GFX9-NEXT:    s_mov_b32 s2, -1
932; GFX9-NEXT:    s_waitcnt vmcnt(0)
933; GFX9-NEXT:    v_pk_sub_i16 v1, v1, v2
934; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
935; GFX9-NEXT:    v_bfe_i32 v0, v1, 0, 16
936; GFX9-NEXT:    v_bfe_i32 v2, v2, 0, 16
937; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
938; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
939; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
940; GFX9-NEXT:    s_endpgm
941;
942; VI-LABEL: v_test_sub_v2i16_sext_to_v2i64:
943; VI:       ; %bb.0:
944; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
945; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
946; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
947; VI-NEXT:    s_waitcnt lgkmcnt(0)
948; VI-NEXT:    v_mov_b32_e32 v1, s3
949; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
950; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
951; VI-NEXT:    v_mov_b32_e32 v3, s5
952; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v2
953; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
954; VI-NEXT:    flat_load_dword v0, v[0:1]
955; VI-NEXT:    flat_load_dword v1, v[2:3]
956; VI-NEXT:    s_mov_b32 s3, 0xf000
957; VI-NEXT:    s_mov_b32 s2, -1
958; VI-NEXT:    s_waitcnt vmcnt(0)
959; VI-NEXT:    v_sub_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
960; VI-NEXT:    v_sub_u16_e32 v0, v0, v1
961; VI-NEXT:    v_bfe_i32 v0, v0, 0, 16
962; VI-NEXT:    v_bfe_i32 v2, v2, 0, 16
963; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
964; VI-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
965; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
966; VI-NEXT:    s_endpgm
967;
968; GFX10-LABEL: v_test_sub_v2i16_sext_to_v2i64:
969; GFX10:       ; %bb.0:
970; GFX10-NEXT:    s_clause 0x1
971; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
972; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
973; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
974; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
975; GFX10-NEXT:    s_clause 0x1
976; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
977; GFX10-NEXT:    global_load_dword v2, v0, s[6:7]
978; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
979; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
980; GFX10-NEXT:    s_mov_b32 s2, -1
981; GFX10-NEXT:    s_waitcnt vmcnt(0)
982; GFX10-NEXT:    v_pk_sub_i16 v0, v1, v2
983; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
984; GFX10-NEXT:    v_bfe_i32 v0, v0, 0, 16
985; GFX10-NEXT:    v_bfe_i32 v2, v1, 0, 16
986; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
987; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
988; GFX10-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
989; GFX10-NEXT:    s_endpgm
990;
991; GFX11-LABEL: v_test_sub_v2i16_sext_to_v2i64:
992; GFX11:       ; %bb.0:
993; GFX11-NEXT:    s_clause 0x1
994; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
995; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
996; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
997; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
998; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
999; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1000; GFX11-NEXT:    s_clause 0x1
1001; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
1002; GFX11-NEXT:    global_load_b32 v0, v0, s[4:5]
1003; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
1004; GFX11-NEXT:    s_mov_b32 s2, -1
1005; GFX11-NEXT:    s_waitcnt vmcnt(0)
1006; GFX11-NEXT:    v_pk_sub_i16 v0, v1, v0
1007; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1008; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
1009; GFX11-NEXT:    v_bfe_i32 v0, v0, 0, 16
1010; GFX11-NEXT:    v_bfe_i32 v2, v1, 0, 16
1011; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1012; GFX11-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
1013; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
1014; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
1015; GFX11-NEXT:    s_endpgm
1016  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; NOTE(review): %gep.out is computed but never used; the store at the end of
  ; this function writes to %out directly, so every workitem stores to the
  ; same address. Confirm whether that is intended before changing it, since
  ; the check lines above would have to be regenerated.
1017  %gep.out = getelementptr inbounds <2 x i64>, ptr addrspace(1) %out, i32 %tid
1018  %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
1019  %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid
  ; Plain (non-volatile) loads, unlike the other kernels visible in this part
  ; of the file.
1020  %a = load <2 x i16>, ptr addrspace(1) %gep.in0
1021  %b = load <2 x i16>, ptr addrspace(1) %gep.in1
  ; The value is named %add, but it holds the result of the sub under test.
1022  %add = sub <2 x i16> %a, %b
1023  %ext = sext <2 x i16> %add to <2 x i64>
1024  store <2 x i64> %ext, ptr addrspace(1) %out
1025  ret void
1026}
1027
; Intrinsic that each kernel above calls to obtain %tid, the i32 index used in
; its getelementptr instructions.
1028declare i32 @llvm.amdgcn.workitem.id.x() #0
1029
; Attribute group #0 is attached to the intrinsic declaration above; group #1
; is attached to the amdgpu_kernel definitions in this file.
1030attributes #0 = { nounwind readnone }
1031attributes #1 = { nounwind }
1032