xref: /llvm-project/llvm/test/CodeGen/AMDGPU/add.v2i16.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
3; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
5; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
6
7; FIXME: Need to handle non-uniform case for function below (load without gep).
8; FIXME: The v_or_b32 in the VI output should be unnecessary
; Per-workitem <2 x i16> add: each lane volatile-loads one vector from %in0 and
; one from %in1 (both indexed by workitem id x), adds them and stores the sum.
; Expected codegen per the checks below: tonga adds the two halves separately
; (v_add_u16 + v_add_u16_sdwa, merged with v_or_b32), while gfx900, gfx1010 and
; gfx1100 each use a single v_pk_add_u16.
9define amdgpu_kernel void @v_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
10; VI-LABEL: v_test_add_v2i16:
11; VI:       ; %bb.0:
12; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
13; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
14; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
15; VI-NEXT:    s_waitcnt lgkmcnt(0)
16; VI-NEXT:    v_mov_b32_e32 v1, s3
17; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
18; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
19; VI-NEXT:    v_mov_b32_e32 v3, s5
20; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v2
21; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
22; VI-NEXT:    flat_load_dword v4, v[0:1] glc
23; VI-NEXT:    s_waitcnt vmcnt(0)
24; VI-NEXT:    flat_load_dword v2, v[2:3] glc
25; VI-NEXT:    s_waitcnt vmcnt(0)
26; VI-NEXT:    v_mov_b32_e32 v0, s0
27; VI-NEXT:    v_mov_b32_e32 v1, s1
28; VI-NEXT:    v_add_u16_e32 v3, v4, v2
29; VI-NEXT:    v_add_u16_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
30; VI-NEXT:    v_or_b32_e32 v2, v3, v2
31; VI-NEXT:    flat_store_dword v[0:1], v2
32; VI-NEXT:    s_endpgm
33;
34; GFX9-LABEL: v_test_add_v2i16:
35; GFX9:       ; %bb.0:
36; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
37; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
38; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
39; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
40; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
41; GFX9-NEXT:    s_waitcnt vmcnt(0)
42; GFX9-NEXT:    global_load_dword v2, v0, s[6:7] glc
43; GFX9-NEXT:    s_waitcnt vmcnt(0)
44; GFX9-NEXT:    v_mov_b32_e32 v0, 0
45; GFX9-NEXT:    v_pk_add_u16 v1, v1, v2
46; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
47; GFX9-NEXT:    s_endpgm
48;
49; GFX10-LABEL: v_test_add_v2i16:
50; GFX10:       ; %bb.0:
51; GFX10-NEXT:    s_clause 0x1
52; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
53; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
54; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
55; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
56; GFX10-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
57; GFX10-NEXT:    s_waitcnt vmcnt(0)
58; GFX10-NEXT:    global_load_dword v2, v0, s[6:7] glc dlc
59; GFX10-NEXT:    s_waitcnt vmcnt(0)
60; GFX10-NEXT:    v_mov_b32_e32 v0, 0
61; GFX10-NEXT:    v_pk_add_u16 v1, v1, v2
62; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
63; GFX10-NEXT:    s_endpgm
64;
65; GFX11-LABEL: v_test_add_v2i16:
66; GFX11:       ; %bb.0:
67; GFX11-NEXT:    s_clause 0x1
68; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
69; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
70; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
71; GFX11-NEXT:    v_mov_b32_e32 v2, 0
72; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
73; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
74; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
75; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3] glc dlc
76; GFX11-NEXT:    s_waitcnt vmcnt(0)
77; GFX11-NEXT:    global_load_b32 v0, v0, s[4:5] glc dlc
78; GFX11-NEXT:    s_waitcnt vmcnt(0)
79; GFX11-NEXT:    v_pk_add_u16 v0, v1, v0
80; GFX11-NEXT:    global_store_b32 v2, v0, s[0:1]
81; GFX11-NEXT:    s_endpgm
82  %tid = call i32 @llvm.amdgcn.workitem.id.x()
; NOTE(review): %gep.out is computed but never used; the store below writes
; through %out directly. Presumably intentional for this test - confirm.
83  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
84  %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
85  %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid
86  %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
87  %b = load volatile <2 x i16>, ptr addrspace(1) %gep.in1
88  %add = add <2 x i16> %a, %b
89  store <2 x i16> %add, ptr addrspace(1) %out
90  ret void
91}
92
; <2 x i16> add of two values loaded through addrspace(4) pointers, with no
; workitem-id indexing. The checks below expect scalar loads for both operands.
; tonga performs the add with scalar shift/add/mask instructions; gfx900 copies
; one operand into a VGPR before v_pk_add_u16, whereas gfx1010 and gfx1100 pass
; both SGPRs to v_pk_add_u16 directly.
93define amdgpu_kernel void @s_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in0, ptr addrspace(4) %in1) #1 {
94; VI-LABEL: s_test_add_v2i16:
95; VI:       ; %bb.0:
96; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
97; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
98; VI-NEXT:    s_waitcnt lgkmcnt(0)
99; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
100; VI-NEXT:    s_load_dword s3, s[4:5], 0x0
101; VI-NEXT:    v_mov_b32_e32 v0, s0
102; VI-NEXT:    v_mov_b32_e32 v1, s1
103; VI-NEXT:    s_waitcnt lgkmcnt(0)
104; VI-NEXT:    s_lshr_b32 s0, s2, 16
105; VI-NEXT:    s_lshr_b32 s1, s3, 16
106; VI-NEXT:    s_add_i32 s2, s2, s3
107; VI-NEXT:    s_add_i32 s0, s0, s1
108; VI-NEXT:    s_and_b32 s1, s2, 0xffff
109; VI-NEXT:    s_lshl_b32 s0, s0, 16
110; VI-NEXT:    s_or_b32 s0, s1, s0
111; VI-NEXT:    v_mov_b32_e32 v2, s0
112; VI-NEXT:    flat_store_dword v[0:1], v2
113; VI-NEXT:    s_endpgm
114;
115; GFX9-LABEL: s_test_add_v2i16:
116; GFX9:       ; %bb.0:
117; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
118; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
119; GFX9-NEXT:    v_mov_b32_e32 v0, 0
120; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
121; GFX9-NEXT:    s_load_dword s4, s[6:7], 0x0
122; GFX9-NEXT:    s_load_dword s5, s[2:3], 0x0
123; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
124; GFX9-NEXT:    v_mov_b32_e32 v1, s4
125; GFX9-NEXT:    v_pk_add_u16 v1, s5, v1
126; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
127; GFX9-NEXT:    s_endpgm
128;
129; GFX10-LABEL: s_test_add_v2i16:
130; GFX10:       ; %bb.0:
131; GFX10-NEXT:    s_clause 0x1
132; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
133; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
134; GFX10-NEXT:    v_mov_b32_e32 v0, 0
135; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
136; GFX10-NEXT:    s_load_dword s4, s[2:3], 0x0
137; GFX10-NEXT:    s_load_dword s5, s[6:7], 0x0
138; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
139; GFX10-NEXT:    v_pk_add_u16 v1, s4, s5
140; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
141; GFX10-NEXT:    s_endpgm
142;
143; GFX11-LABEL: s_test_add_v2i16:
144; GFX11:       ; %bb.0:
145; GFX11-NEXT:    s_clause 0x1
146; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
147; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
148; GFX11-NEXT:    v_mov_b32_e32 v0, 0
149; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
150; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
151; GFX11-NEXT:    s_load_b32 s3, s[4:5], 0x0
152; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
153; GFX11-NEXT:    v_pk_add_u16 v1, s2, s3
154; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
155; GFX11-NEXT:    s_endpgm
156  %a = load <2 x i16>, ptr addrspace(4) %in0
157  %b = load <2 x i16>, ptr addrspace(4) %in1
158  %add = add <2 x i16> %a, %b
159  store <2 x i16> %add, ptr addrspace(1) %out
160  ret void
161}
162
; <2 x i16> add of a value with itself: one vector is loaded through an
; addrspace(4) pointer and used as both add operands. The checks below expect a
; single scalar load; tonga splits the value into halves and adds each half to
; itself with scalar instructions, while gfx900, gfx1010 and gfx1100 use
; v_pk_add_u16 with the same SGPR as both sources.
163define amdgpu_kernel void @s_test_add_self_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in0) #1 {
164; VI-LABEL: s_test_add_self_v2i16:
165; VI:       ; %bb.0:
166; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
167; VI-NEXT:    s_waitcnt lgkmcnt(0)
168; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
169; VI-NEXT:    v_mov_b32_e32 v0, s0
170; VI-NEXT:    v_mov_b32_e32 v1, s1
171; VI-NEXT:    s_waitcnt lgkmcnt(0)
172; VI-NEXT:    s_lshr_b32 s0, s2, 16
173; VI-NEXT:    s_and_b32 s1, s2, 0xffff
174; VI-NEXT:    s_add_i32 s1, s1, s1
175; VI-NEXT:    s_add_i32 s0, s0, s0
176; VI-NEXT:    s_lshl_b32 s0, s0, 16
177; VI-NEXT:    s_and_b32 s1, s1, 0xffff
178; VI-NEXT:    s_or_b32 s0, s1, s0
179; VI-NEXT:    v_mov_b32_e32 v2, s0
180; VI-NEXT:    flat_store_dword v[0:1], v2
181; VI-NEXT:    s_endpgm
182;
183; GFX9-LABEL: s_test_add_self_v2i16:
184; GFX9:       ; %bb.0:
185; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
186; GFX9-NEXT:    v_mov_b32_e32 v0, 0
187; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
188; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
189; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
190; GFX9-NEXT:    v_pk_add_u16 v1, s2, s2
191; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
192; GFX9-NEXT:    s_endpgm
193;
194; GFX10-LABEL: s_test_add_self_v2i16:
195; GFX10:       ; %bb.0:
196; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
197; GFX10-NEXT:    v_mov_b32_e32 v0, 0
198; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
199; GFX10-NEXT:    s_load_dword s2, s[2:3], 0x0
200; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
201; GFX10-NEXT:    v_pk_add_u16 v1, s2, s2
202; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
203; GFX10-NEXT:    s_endpgm
204;
205; GFX11-LABEL: s_test_add_self_v2i16:
206; GFX11:       ; %bb.0:
207; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
208; GFX11-NEXT:    v_mov_b32_e32 v0, 0
209; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
210; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
211; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
212; GFX11-NEXT:    v_pk_add_u16 v1, s2, s2
213; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
214; GFX11-NEXT:    s_endpgm
215  %a = load <2 x i16>, ptr addrspace(4) %in0
216  %add = add <2 x i16> %a, %a
217  store <2 x i16> %add, ptr addrspace(1) %out
218  ret void
219}
220
221; FIXME: VI should not scalarize arg access.
; <2 x i16> add where both operands are passed by value as kernel arguments,
; so the function body contains no loads. The checks below expect a single
; s_load of the kernarg segment covering %out, %a and %b. tonga adds the halves
; with scalar shift/add/mask instructions; gfx900 copies s3 into a VGPR before
; v_pk_add_u16, whereas gfx1010 and gfx1100 use s2 and s3 directly.
222define amdgpu_kernel void @s_test_add_v2i16_kernarg(ptr addrspace(1) %out, <2 x i16> %a, <2 x i16> %b) #1 {
223; VI-LABEL: s_test_add_v2i16_kernarg:
224; VI:       ; %bb.0:
225; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
226; VI-NEXT:    s_waitcnt lgkmcnt(0)
227; VI-NEXT:    s_lshr_b32 s4, s2, 16
228; VI-NEXT:    s_lshr_b32 s5, s3, 16
229; VI-NEXT:    s_add_i32 s2, s2, s3
230; VI-NEXT:    s_add_i32 s4, s4, s5
231; VI-NEXT:    s_and_b32 s2, s2, 0xffff
232; VI-NEXT:    s_lshl_b32 s3, s4, 16
233; VI-NEXT:    s_or_b32 s2, s2, s3
234; VI-NEXT:    v_mov_b32_e32 v0, s0
235; VI-NEXT:    v_mov_b32_e32 v1, s1
236; VI-NEXT:    v_mov_b32_e32 v2, s2
237; VI-NEXT:    flat_store_dword v[0:1], v2
238; VI-NEXT:    s_endpgm
239;
240; GFX9-LABEL: s_test_add_v2i16_kernarg:
241; GFX9:       ; %bb.0:
242; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
243; GFX9-NEXT:    v_mov_b32_e32 v0, 0
244; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
245; GFX9-NEXT:    v_mov_b32_e32 v1, s3
246; GFX9-NEXT:    v_pk_add_u16 v1, s2, v1
247; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
248; GFX9-NEXT:    s_endpgm
249;
250; GFX10-LABEL: s_test_add_v2i16_kernarg:
251; GFX10:       ; %bb.0:
252; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
253; GFX10-NEXT:    v_mov_b32_e32 v0, 0
254; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
255; GFX10-NEXT:    v_pk_add_u16 v1, s2, s3
256; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
257; GFX10-NEXT:    s_endpgm
258;
259; GFX11-LABEL: s_test_add_v2i16_kernarg:
260; GFX11:       ; %bb.0:
261; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
262; GFX11-NEXT:    v_mov_b32_e32 v0, 0
263; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
264; GFX11-NEXT:    v_pk_add_u16 v1, s2, s3
265; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
266; GFX11-NEXT:    s_endpgm
267  %add = add <2 x i16> %a, %b
268  store <2 x i16> %add, ptr addrspace(1) %out
269  ret void
270}
271
272; FIXME: Eliminate or with sdwa
; Per-workitem <2 x i16> add of a volatile-loaded vector and the constant
; <123, 456> (0x7b in the low half, 0x1c8 in the high half). In the checks
; below tonga adds the halves separately using 0x7b and 0x1c8; gfx900 first
; materializes the packed literal 0x1c8007b in an SGPR with s_mov_b32, while
; gfx1010 and gfx1100 encode the literal directly in v_pk_add_u16.
273define amdgpu_kernel void @v_test_add_v2i16_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
274; VI-LABEL: v_test_add_v2i16_constant:
275; VI:       ; %bb.0:
276; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
277; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
278; VI-NEXT:    v_mov_b32_e32 v3, 0x1c8
279; VI-NEXT:    s_waitcnt lgkmcnt(0)
280; VI-NEXT:    v_mov_b32_e32 v1, s3
281; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
282; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
283; VI-NEXT:    flat_load_dword v2, v[0:1] glc
284; VI-NEXT:    s_waitcnt vmcnt(0)
285; VI-NEXT:    v_mov_b32_e32 v0, s0
286; VI-NEXT:    v_mov_b32_e32 v1, s1
287; VI-NEXT:    v_add_u16_e32 v4, 0x7b, v2
288; VI-NEXT:    v_add_u16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
289; VI-NEXT:    v_or_b32_e32 v2, v4, v2
290; VI-NEXT:    flat_store_dword v[0:1], v2
291; VI-NEXT:    s_endpgm
292;
293; GFX9-LABEL: v_test_add_v2i16_constant:
294; GFX9:       ; %bb.0:
295; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
296; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
297; GFX9-NEXT:    v_mov_b32_e32 v1, 0
298; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
299; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
300; GFX9-NEXT:    s_waitcnt vmcnt(0)
301; GFX9-NEXT:    s_mov_b32 s2, 0x1c8007b
302; GFX9-NEXT:    v_pk_add_u16 v0, v0, s2
303; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
304; GFX9-NEXT:    s_endpgm
305;
306; GFX10-LABEL: v_test_add_v2i16_constant:
307; GFX10:       ; %bb.0:
308; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
309; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
310; GFX10-NEXT:    v_mov_b32_e32 v1, 0
311; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
312; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
313; GFX10-NEXT:    s_waitcnt vmcnt(0)
314; GFX10-NEXT:    v_pk_add_u16 v0, 0x1c8007b, v0
315; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
316; GFX10-NEXT:    s_endpgm
317;
318; GFX11-LABEL: v_test_add_v2i16_constant:
319; GFX11:       ; %bb.0:
320; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
321; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
322; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
323; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
324; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
325; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] glc dlc
326; GFX11-NEXT:    s_waitcnt vmcnt(0)
327; GFX11-NEXT:    v_pk_add_u16 v0, 0x1c8007b, v0
328; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
329; GFX11-NEXT:    s_endpgm
330  %tid = call i32 @llvm.amdgcn.workitem.id.x()
; NOTE(review): %gep.out is computed but never used; the store below writes
; through %out directly. Presumably intentional for this test - confirm.
331  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
332  %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
333  %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
334  %add = add <2 x i16> %a, <i16 123, i16 456>
335  store <2 x i16> %add, ptr addrspace(1) %out
336  ret void
337}
338
339; FIXME: Need to handle non-uniform case for function below (load without gep).
; Per-workitem <2 x i16> add of a volatile-loaded vector and the negative
; constant <-845, -991>. As 16-bit two's-complement values these are 0xfcb3
; (low half) and 0xfc21 (high half), giving the packed literal 0xfc21fcb3 seen
; in the gfx900, gfx1010 and gfx1100 checks. tonga adds the halves separately,
; using 0xfcb3 for the low half and a VGPR holding 0xfffffc21 for the high half.
340define amdgpu_kernel void @v_test_add_v2i16_neg_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
341; VI-LABEL: v_test_add_v2i16_neg_constant:
342; VI:       ; %bb.0:
343; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
344; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
345; VI-NEXT:    v_mov_b32_e32 v3, 0xfffffc21
346; VI-NEXT:    s_waitcnt lgkmcnt(0)
347; VI-NEXT:    v_mov_b32_e32 v1, s3
348; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
349; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
350; VI-NEXT:    flat_load_dword v2, v[0:1] glc
351; VI-NEXT:    s_waitcnt vmcnt(0)
352; VI-NEXT:    v_mov_b32_e32 v0, s0
353; VI-NEXT:    v_mov_b32_e32 v1, s1
354; VI-NEXT:    v_add_u16_e32 v4, 0xfcb3, v2
355; VI-NEXT:    v_add_u16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
356; VI-NEXT:    v_or_b32_e32 v2, v4, v2
357; VI-NEXT:    flat_store_dword v[0:1], v2
358; VI-NEXT:    s_endpgm
359;
360; GFX9-LABEL: v_test_add_v2i16_neg_constant:
361; GFX9:       ; %bb.0:
362; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
363; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
364; GFX9-NEXT:    v_mov_b32_e32 v1, 0
365; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
366; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
367; GFX9-NEXT:    s_waitcnt vmcnt(0)
368; GFX9-NEXT:    s_mov_b32 s2, 0xfc21fcb3
369; GFX9-NEXT:    v_pk_add_u16 v0, v0, s2
370; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
371; GFX9-NEXT:    s_endpgm
372;
373; GFX10-LABEL: v_test_add_v2i16_neg_constant:
374; GFX10:       ; %bb.0:
375; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
376; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
377; GFX10-NEXT:    v_mov_b32_e32 v1, 0
378; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
379; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
380; GFX10-NEXT:    s_waitcnt vmcnt(0)
381; GFX10-NEXT:    v_pk_add_u16 v0, 0xfc21fcb3, v0
382; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
383; GFX10-NEXT:    s_endpgm
384;
385; GFX11-LABEL: v_test_add_v2i16_neg_constant:
386; GFX11:       ; %bb.0:
387; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
388; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
389; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
390; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
391; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
392; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] glc dlc
393; GFX11-NEXT:    s_waitcnt vmcnt(0)
394; GFX11-NEXT:    v_pk_add_u16 v0, 0xfc21fcb3, v0
395; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
396; GFX11-NEXT:    s_endpgm
397  %tid = call i32 @llvm.amdgcn.workitem.id.x()
; NOTE(review): %gep.out is computed but never used; the store below writes
; through %out directly. Presumably intentional for this test - confirm.
398  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
399  %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
400  %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
401  %add = add <2 x i16> %a, <i16 -845, i16 -991>
402  store <2 x i16> %add, ptr addrspace(1) %out
403  ret void
404}
405
; Per-workitem <2 x i16> add of a volatile-loaded vector and the splat constant
; <-1, -1>. In the checks below gfx900, gfx1010 and gfx1100 encode the constant
; as the operand -1 of v_pk_add_u16; tonga adds the halves separately, using -1
; as an immediate for the low half and a VGPR holding -1 for the high half.
406define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
407; VI-LABEL: v_test_add_v2i16_inline_neg1:
408; VI:       ; %bb.0:
409; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
410; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
411; VI-NEXT:    v_mov_b32_e32 v3, -1
412; VI-NEXT:    s_waitcnt lgkmcnt(0)
413; VI-NEXT:    v_mov_b32_e32 v1, s3
414; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
415; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
416; VI-NEXT:    flat_load_dword v2, v[0:1] glc
417; VI-NEXT:    s_waitcnt vmcnt(0)
418; VI-NEXT:    v_mov_b32_e32 v0, s0
419; VI-NEXT:    v_mov_b32_e32 v1, s1
420; VI-NEXT:    v_add_u16_e32 v4, -1, v2
421; VI-NEXT:    v_add_u16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
422; VI-NEXT:    v_or_b32_e32 v2, v4, v2
423; VI-NEXT:    flat_store_dword v[0:1], v2
424; VI-NEXT:    s_endpgm
425;
426; GFX9-LABEL: v_test_add_v2i16_inline_neg1:
427; GFX9:       ; %bb.0:
428; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
429; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
430; GFX9-NEXT:    v_mov_b32_e32 v1, 0
431; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
432; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
433; GFX9-NEXT:    s_waitcnt vmcnt(0)
434; GFX9-NEXT:    v_pk_add_u16 v0, v0, -1
435; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
436; GFX9-NEXT:    s_endpgm
437;
438; GFX10-LABEL: v_test_add_v2i16_inline_neg1:
439; GFX10:       ; %bb.0:
440; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
441; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
442; GFX10-NEXT:    v_mov_b32_e32 v1, 0
443; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
444; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
445; GFX10-NEXT:    s_waitcnt vmcnt(0)
446; GFX10-NEXT:    v_pk_add_u16 v0, v0, -1
447; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
448; GFX10-NEXT:    s_endpgm
449;
450; GFX11-LABEL: v_test_add_v2i16_inline_neg1:
451; GFX11:       ; %bb.0:
452; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
453; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
454; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
455; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
456; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
457; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] glc dlc
458; GFX11-NEXT:    s_waitcnt vmcnt(0)
459; GFX11-NEXT:    v_pk_add_u16 v0, v0, -1
460; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
461; GFX11-NEXT:    s_endpgm
462  %tid = call i32 @llvm.amdgcn.workitem.id.x()
; NOTE(review): %gep.out is computed but never used; the store below writes
; through %out directly. Presumably intentional for this test - confirm.
463  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
464  %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
465  %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
466  %add = add <2 x i16> %a, <i16 -1, i16 -1>
467  store <2 x i16> %add, ptr addrspace(1) %out
468  ret void
469}
470
; Per-workitem <2 x i16> add of a volatile-loaded vector and the constant
; <32, 0>, i.e. only the low half changes. In the checks below gfx900, gfx1010
; and gfx1100 encode the constant as the operand 32 of v_pk_add_u16; tonga adds
; 32 to the low half only and keeps the high half by masking the loaded value
; with 0xffff0000 and OR-ing the two results together.
471define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
472; VI-LABEL: v_test_add_v2i16_inline_lo_zero_hi:
473; VI:       ; %bb.0:
474; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
475; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
476; VI-NEXT:    s_waitcnt lgkmcnt(0)
477; VI-NEXT:    v_mov_b32_e32 v1, s3
478; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
479; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
480; VI-NEXT:    flat_load_dword v2, v[0:1] glc
481; VI-NEXT:    s_waitcnt vmcnt(0)
482; VI-NEXT:    v_mov_b32_e32 v0, s0
483; VI-NEXT:    v_mov_b32_e32 v1, s1
484; VI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
485; VI-NEXT:    v_add_u16_e32 v2, 32, v2
486; VI-NEXT:    v_or_b32_e32 v2, v2, v3
487; VI-NEXT:    flat_store_dword v[0:1], v2
488; VI-NEXT:    s_endpgm
489;
490; GFX9-LABEL: v_test_add_v2i16_inline_lo_zero_hi:
491; GFX9:       ; %bb.0:
492; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
493; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
494; GFX9-NEXT:    v_mov_b32_e32 v1, 0
495; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
496; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
497; GFX9-NEXT:    s_waitcnt vmcnt(0)
498; GFX9-NEXT:    v_pk_add_u16 v0, v0, 32
499; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
500; GFX9-NEXT:    s_endpgm
501;
502; GFX10-LABEL: v_test_add_v2i16_inline_lo_zero_hi:
503; GFX10:       ; %bb.0:
504; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
505; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
506; GFX10-NEXT:    v_mov_b32_e32 v1, 0
507; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
508; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
509; GFX10-NEXT:    s_waitcnt vmcnt(0)
510; GFX10-NEXT:    v_pk_add_u16 v0, v0, 32
511; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
512; GFX10-NEXT:    s_endpgm
513;
514; GFX11-LABEL: v_test_add_v2i16_inline_lo_zero_hi:
515; GFX11:       ; %bb.0:
516; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
517; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
518; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
519; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
520; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
521; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] glc dlc
522; GFX11-NEXT:    s_waitcnt vmcnt(0)
523; GFX11-NEXT:    v_pk_add_u16 v0, v0, 32
524; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
525; GFX11-NEXT:    s_endpgm
526  %tid = call i32 @llvm.amdgcn.workitem.id.x()
; NOTE(review): %gep.out is computed but never used; the store below writes
; through %out directly. Presumably intentional for this test - confirm.
527  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
528  %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
529  %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
530  %add = add <2 x i16> %a, <i16 32, i16 0>
531  store <2 x i16> %add, ptr addrspace(1) %out
532  ret void
533}
534
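535; The high element (16256 = 0x3f80) makes the packed 32-bit constant 0x3f800000, which is the bit pattern of the inline fp constant 1.0.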
; Per-workitem <2 x i16> add of a volatile-loaded vector and the constant
; <0, 16256>, i.e. only the high half changes (16256 = 0x3f80). In the checks
; below gfx900, gfx1010 and gfx1100 print the packed constant as the operand
; 1.0 of v_pk_add_u16; tonga adds 0x3f80 to the high half with an SDWA add and
; merges in the untouched low half with an SDWA v_or_b32.
536define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
537; VI-LABEL: v_test_add_v2i16_inline_fp_split:
538; VI:       ; %bb.0:
539; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
540; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
541; VI-NEXT:    v_mov_b32_e32 v3, 0x3f80
542; VI-NEXT:    s_waitcnt lgkmcnt(0)
543; VI-NEXT:    v_mov_b32_e32 v1, s3
544; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
545; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
546; VI-NEXT:    flat_load_dword v2, v[0:1] glc
547; VI-NEXT:    s_waitcnt vmcnt(0)
548; VI-NEXT:    v_mov_b32_e32 v0, s0
549; VI-NEXT:    v_mov_b32_e32 v1, s1
550; VI-NEXT:    v_add_u16_sdwa v3, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
551; VI-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
552; VI-NEXT:    flat_store_dword v[0:1], v2
553; VI-NEXT:    s_endpgm
554;
555; GFX9-LABEL: v_test_add_v2i16_inline_fp_split:
556; GFX9:       ; %bb.0:
557; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
558; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
559; GFX9-NEXT:    v_mov_b32_e32 v1, 0
560; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
561; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
562; GFX9-NEXT:    s_waitcnt vmcnt(0)
563; GFX9-NEXT:    v_pk_add_u16 v0, v0, 1.0
564; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
565; GFX9-NEXT:    s_endpgm
566;
567; GFX10-LABEL: v_test_add_v2i16_inline_fp_split:
568; GFX10:       ; %bb.0:
569; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
570; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
571; GFX10-NEXT:    v_mov_b32_e32 v1, 0
572; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
573; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
574; GFX10-NEXT:    s_waitcnt vmcnt(0)
575; GFX10-NEXT:    v_pk_add_u16 v0, v0, 1.0
576; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
577; GFX10-NEXT:    s_endpgm
578;
579; GFX11-LABEL: v_test_add_v2i16_inline_fp_split:
580; GFX11:       ; %bb.0:
581; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
582; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
583; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
584; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
585; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
586; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] glc dlc
587; GFX11-NEXT:    s_waitcnt vmcnt(0)
588; GFX11-NEXT:    v_pk_add_u16 v0, v0, 1.0
589; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
590; GFX11-NEXT:    s_endpgm
591  %tid = call i32 @llvm.amdgcn.workitem.id.x()
; NOTE(review): %gep.out is computed but never used; the store below writes
; through %out directly. Presumably intentional for this test - confirm.
592  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
593  %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
594  %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
595  %add = add <2 x i16> %a, <i16 0, i16 16256>
596  store <2 x i16> %add, ptr addrspace(1) %out
597  ret void
598}
599
600; FIXME: Need to handle non-uniform case for function below (load without gep).
; Per-workitem <2 x i16> add whose result is zero-extended to <2 x i32> before
; being stored (a 64-bit store). In the checks below tonga produces the two
; 32-bit results directly from v_add_u16 and an SDWA v_add_u16 with
; dst_sel:DWORD; gfx900, gfx1010 and gfx1100 use v_pk_add_u16 and then split the
; packed result with a 16-bit right shift and an AND with 0xffff.
601define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
602; VI-LABEL: v_test_add_v2i16_zext_to_v2i32:
603; VI:       ; %bb.0:
604; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
605; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
606; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
607; VI-NEXT:    s_waitcnt lgkmcnt(0)
608; VI-NEXT:    v_mov_b32_e32 v1, s3
609; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
610; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
611; VI-NEXT:    v_mov_b32_e32 v3, s5
612; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v2
613; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
614; VI-NEXT:    flat_load_dword v4, v[0:1] glc
615; VI-NEXT:    s_waitcnt vmcnt(0)
616; VI-NEXT:    flat_load_dword v3, v[2:3] glc
617; VI-NEXT:    s_waitcnt vmcnt(0)
618; VI-NEXT:    v_mov_b32_e32 v0, s0
619; VI-NEXT:    v_mov_b32_e32 v1, s1
620; VI-NEXT:    v_add_u16_e32 v2, v4, v3
621; VI-NEXT:    v_add_u16_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
622; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
623; VI-NEXT:    s_endpgm
624;
625; GFX9-LABEL: v_test_add_v2i16_zext_to_v2i32:
626; GFX9:       ; %bb.0:
627; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
628; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
629; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
630; GFX9-NEXT:    v_mov_b32_e32 v3, 0
631; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
632; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
633; GFX9-NEXT:    s_waitcnt vmcnt(0)
634; GFX9-NEXT:    global_load_dword v2, v0, s[6:7] glc
635; GFX9-NEXT:    s_waitcnt vmcnt(0)
636; GFX9-NEXT:    v_pk_add_u16 v0, v1, v2
637; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
638; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
639; GFX9-NEXT:    global_store_dwordx2 v3, v[0:1], s[0:1]
640; GFX9-NEXT:    s_endpgm
641;
642; GFX10-LABEL: v_test_add_v2i16_zext_to_v2i32:
643; GFX10:       ; %bb.0:
644; GFX10-NEXT:    s_clause 0x1
645; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
646; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
647; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
648; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
649; GFX10-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
650; GFX10-NEXT:    s_waitcnt vmcnt(0)
651; GFX10-NEXT:    global_load_dword v2, v0, s[6:7] glc dlc
652; GFX10-NEXT:    s_waitcnt vmcnt(0)
653; GFX10-NEXT:    v_pk_add_u16 v0, v1, v2
654; GFX10-NEXT:    v_mov_b32_e32 v2, 0
655; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
656; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
657; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
658; GFX10-NEXT:    s_endpgm
659;
660; GFX11-LABEL: v_test_add_v2i16_zext_to_v2i32:
661; GFX11:       ; %bb.0:
662; GFX11-NEXT:    s_clause 0x1
663; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
664; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
665; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
666; GFX11-NEXT:    v_mov_b32_e32 v2, 0
667; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
668; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
669; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
670; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3] glc dlc
671; GFX11-NEXT:    s_waitcnt vmcnt(0)
672; GFX11-NEXT:    global_load_b32 v0, v0, s[4:5] glc dlc
673; GFX11-NEXT:    s_waitcnt vmcnt(0)
674; GFX11-NEXT:    v_pk_add_u16 v0, v1, v0
675; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
676; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
677; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
678; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
679; GFX11-NEXT:    s_endpgm
680  %tid = call i32 @llvm.amdgcn.workitem.id.x()
; NOTE(review): %gep.out is computed but never used; the store below writes
; through %out directly. Presumably intentional for this test - confirm.
681  %gep.out = getelementptr inbounds <2 x i32>, ptr addrspace(1) %out, i32 %tid
682  %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
683  %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid
684  %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
685  %b = load volatile <2 x i16>, ptr addrspace(1) %gep.in1
686  %add = add <2 x i16> %a, %b
687  %ext = zext <2 x i16> %add to <2 x i32>
688  store <2 x i32> %ext, ptr addrspace(1) %out
689  ret void
690}
691
692; FIXME: Need to handle non-uniform case for function below (load without gep).
; Kernel under test: loads one <2 x i16> from %in0 and one from %in1 (both
; volatile, both indexed by workitem.id.x), adds them, zero-extends the sum to
; <2 x i64> and stores the result.
;
; What the assertions below pin down:
;  - the VI checks perform the add as two 16-bit adds (v_add_u16 for the low
;    halves and an SDWA v_add_u16 on WORD_1 for the high halves) and zero the
;    upper dwords with v_mov_b32;
;  - the GFX9, GFX10 and GFX11 checks use a single v_pk_add_u16, then split the
;    packed result with v_alignbit_b32 (high half) and an AND with 0xffff (low
;    half).
;
; NOTE(review): %gep.out is computed but never used; the store writes to %out
; directly. Switching the store to %gep.out would change the generated code, so
; the assertions would have to be regenerated with update_llc_test_checks.py.
693define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
694; VI-LABEL: v_test_add_v2i16_zext_to_v2i64:
695; VI:       ; %bb.0:
696; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
697; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
698; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
699; VI-NEXT:    s_waitcnt lgkmcnt(0)
700; VI-NEXT:    v_mov_b32_e32 v1, s3
701; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
702; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
703; VI-NEXT:    v_mov_b32_e32 v3, s5
704; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v2
705; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
706; VI-NEXT:    flat_load_dword v6, v[0:1] glc
707; VI-NEXT:    s_waitcnt vmcnt(0)
708; VI-NEXT:    flat_load_dword v2, v[2:3] glc
709; VI-NEXT:    s_waitcnt vmcnt(0)
710; VI-NEXT:    v_mov_b32_e32 v1, 0
711; VI-NEXT:    v_mov_b32_e32 v4, s0
712; VI-NEXT:    v_mov_b32_e32 v5, s1
713; VI-NEXT:    v_mov_b32_e32 v3, v1
714; VI-NEXT:    v_add_u16_e32 v0, v6, v2
715; VI-NEXT:    v_add_u16_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
716; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
717; VI-NEXT:    s_endpgm
718;
719; GFX9-LABEL: v_test_add_v2i16_zext_to_v2i64:
720; GFX9:       ; %bb.0:
721; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
722; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
723; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
724; GFX9-NEXT:    v_mov_b32_e32 v1, 0
725; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
726; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] glc
727; GFX9-NEXT:    s_waitcnt vmcnt(0)
728; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
729; GFX9-NEXT:    s_waitcnt vmcnt(0)
730; GFX9-NEXT:    v_pk_add_u16 v0, v2, v3
731; GFX9-NEXT:    v_alignbit_b32 v2, 0, v0, 16
732; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
733; GFX9-NEXT:    v_mov_b32_e32 v3, v1
734; GFX9-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1]
735; GFX9-NEXT:    s_endpgm
736;
737; GFX10-LABEL: v_test_add_v2i16_zext_to_v2i64:
738; GFX10:       ; %bb.0:
739; GFX10-NEXT:    s_clause 0x1
740; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
741; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
742; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
743; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
744; GFX10-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
745; GFX10-NEXT:    s_waitcnt vmcnt(0)
746; GFX10-NEXT:    global_load_dword v2, v0, s[6:7] glc dlc
747; GFX10-NEXT:    s_waitcnt vmcnt(0)
748; GFX10-NEXT:    v_pk_add_u16 v0, v1, v2
749; GFX10-NEXT:    v_mov_b32_e32 v1, 0
750; GFX10-NEXT:    v_alignbit_b32 v2, 0, v0, 16
751; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
752; GFX10-NEXT:    v_mov_b32_e32 v3, v1
753; GFX10-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1]
754; GFX10-NEXT:    s_endpgm
755;
756; GFX11-LABEL: v_test_add_v2i16_zext_to_v2i64:
757; GFX11:       ; %bb.0:
758; GFX11-NEXT:    s_clause 0x1
759; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
760; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
761; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
762; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
763; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
764; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
765; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3] glc dlc
766; GFX11-NEXT:    s_waitcnt vmcnt(0)
767; GFX11-NEXT:    global_load_b32 v0, v0, s[4:5] glc dlc
768; GFX11-NEXT:    s_waitcnt vmcnt(0)
769; GFX11-NEXT:    v_pk_add_u16 v0, v1, v0
770; GFX11-NEXT:    v_mov_b32_e32 v1, 0
771; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
772; GFX11-NEXT:    v_alignbit_b32 v2, 0, v0, 16
773; GFX11-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xffff, v0
774; GFX11-NEXT:    global_store_b128 v1, v[0:3], s[0:1]
775; GFX11-NEXT:    s_endpgm
  ; Per-workitem index used by all three GEPs below.
776  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; NOTE(review): %gep.out has no users; the store below targets %out.
777  %gep.out = getelementptr inbounds <2 x i64>, ptr addrspace(1) %out, i32 %tid
778  %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
779  %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid
  ; Both inputs are loaded volatile.
780  %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
781  %b = load volatile <2 x i16>, ptr addrspace(1) %gep.in1
782  %add = add <2 x i16> %a, %b
  ; Zero-extend each 16-bit lane to 64 bits before storing.
783  %ext = zext <2 x i16> %add to <2 x i64>
784  store <2 x i64> %ext, ptr addrspace(1) %out
785  ret void
786}
787
788; FIXME: Need to handle non-uniform case for function below (load without gep).
; Kernel under test: loads one <2 x i16> from %in0 and one from %in1 (both
; volatile, both indexed by workitem.id.x), adds them, sign-extends the sum to
; <2 x i32> and stores the result.
;
; What the assertions below pin down:
;  - the VI checks add the halves separately (v_add_u16 and an SDWA v_add_u16
;    on WORD_1) and sign-extend each result with v_bfe_i32 ..., 0, 16;
;  - the GFX9, GFX10 and GFX11 checks use one v_pk_add_u16, then sign-extend
;    the high half with v_ashrrev_i32 by 16 and the low half with v_bfe_i32.
;
; NOTE(review): %gep.out is computed but never used; the store writes to %out
; directly. Switching the store to %gep.out would change the generated code, so
; the assertions would have to be regenerated with update_llc_test_checks.py.
789define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
790; VI-LABEL: v_test_add_v2i16_sext_to_v2i32:
791; VI:       ; %bb.0:
792; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
793; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
794; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
795; VI-NEXT:    s_waitcnt lgkmcnt(0)
796; VI-NEXT:    v_mov_b32_e32 v1, s3
797; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
798; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
799; VI-NEXT:    v_mov_b32_e32 v3, s5
800; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v2
801; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
802; VI-NEXT:    flat_load_dword v4, v[0:1] glc
803; VI-NEXT:    s_waitcnt vmcnt(0)
804; VI-NEXT:    flat_load_dword v2, v[2:3] glc
805; VI-NEXT:    s_waitcnt vmcnt(0)
806; VI-NEXT:    v_mov_b32_e32 v0, s0
807; VI-NEXT:    v_mov_b32_e32 v1, s1
808; VI-NEXT:    v_add_u16_sdwa v3, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
809; VI-NEXT:    v_add_u16_e32 v2, v4, v2
810; VI-NEXT:    v_bfe_i32 v2, v2, 0, 16
811; VI-NEXT:    v_bfe_i32 v3, v3, 0, 16
812; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
813; VI-NEXT:    s_endpgm
814;
815; GFX9-LABEL: v_test_add_v2i16_sext_to_v2i32:
816; GFX9:       ; %bb.0:
817; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
818; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
819; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
820; GFX9-NEXT:    v_mov_b32_e32 v3, 0
821; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
822; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
823; GFX9-NEXT:    s_waitcnt vmcnt(0)
824; GFX9-NEXT:    global_load_dword v2, v0, s[6:7] glc
825; GFX9-NEXT:    s_waitcnt vmcnt(0)
826; GFX9-NEXT:    v_pk_add_u16 v0, v1, v2
827; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 16, v0
828; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 16
829; GFX9-NEXT:    global_store_dwordx2 v3, v[0:1], s[0:1]
830; GFX9-NEXT:    s_endpgm
831;
832; GFX10-LABEL: v_test_add_v2i16_sext_to_v2i32:
833; GFX10:       ; %bb.0:
834; GFX10-NEXT:    s_clause 0x1
835; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
836; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
837; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
838; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
839; GFX10-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
840; GFX10-NEXT:    s_waitcnt vmcnt(0)
841; GFX10-NEXT:    global_load_dword v2, v0, s[6:7] glc dlc
842; GFX10-NEXT:    s_waitcnt vmcnt(0)
843; GFX10-NEXT:    v_pk_add_u16 v0, v1, v2
844; GFX10-NEXT:    v_mov_b32_e32 v2, 0
845; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 16, v0
846; GFX10-NEXT:    v_bfe_i32 v0, v0, 0, 16
847; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
848; GFX10-NEXT:    s_endpgm
849;
850; GFX11-LABEL: v_test_add_v2i16_sext_to_v2i32:
851; GFX11:       ; %bb.0:
852; GFX11-NEXT:    s_clause 0x1
853; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
854; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
855; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
856; GFX11-NEXT:    v_mov_b32_e32 v2, 0
857; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
858; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
859; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
860; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3] glc dlc
861; GFX11-NEXT:    s_waitcnt vmcnt(0)
862; GFX11-NEXT:    global_load_b32 v0, v0, s[4:5] glc dlc
863; GFX11-NEXT:    s_waitcnt vmcnt(0)
864; GFX11-NEXT:    v_pk_add_u16 v0, v1, v0
865; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
866; GFX11-NEXT:    v_ashrrev_i32_e32 v1, 16, v0
867; GFX11-NEXT:    v_bfe_i32 v0, v0, 0, 16
868; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
869; GFX11-NEXT:    s_endpgm
  ; Per-workitem index used by all three GEPs below.
870  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; NOTE(review): %gep.out has no users; the store below targets %out.
871  %gep.out = getelementptr inbounds <2 x i32>, ptr addrspace(1) %out, i32 %tid
872  %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
873  %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid
  ; Both inputs are loaded volatile.
874  %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
875  %b = load volatile <2 x i16>, ptr addrspace(1) %gep.in1
876  %add = add <2 x i16> %a, %b
  ; Sign-extend each 16-bit lane to 32 bits before storing.
877  %ext = sext <2 x i16> %add to <2 x i32>
878  store <2 x i32> %ext, ptr addrspace(1) %out
879  ret void
880}
881
882; FIXME: Need to handle non-uniform case for function below (load without gep).
; Kernel under test: loads one <2 x i16> from %in0 and one from %in1 (indexed
; by workitem.id.x), adds them, sign-extends the sum to <2 x i64> and stores
; the result.
;
; Unlike the zext_to_v2i64 and sext_to_v2i32 kernels in this file, the two
; loads here are NOT volatile. The checked loads accordingly carry no glc/dlc
; modifier and are followed by a single s_waitcnt vmcnt(0); the GFX10 and
; GFX11 checks additionally group them under s_clause 0x1.
;
; What the assertions below pin down for the extension:
;  - each 16-bit result is sign-extended to 32 bits with v_bfe_i32 ..., 0, 16;
;  - the upper dword of each i64 lane is produced with v_ashrrev_i32 by 31.
;
; NOTE(review): %gep.out is computed but never used; the store writes to %out
; directly. Switching the store to %gep.out would change the generated code, so
; the assertions would have to be regenerated with update_llc_test_checks.py.
883define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
884; VI-LABEL: v_test_add_v2i16_sext_to_v2i64:
885; VI:       ; %bb.0:
886; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
887; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
888; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
889; VI-NEXT:    s_waitcnt lgkmcnt(0)
890; VI-NEXT:    v_mov_b32_e32 v1, s3
891; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
892; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
893; VI-NEXT:    v_mov_b32_e32 v3, s5
894; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v2
895; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
896; VI-NEXT:    flat_load_dword v0, v[0:1]
897; VI-NEXT:    flat_load_dword v1, v[2:3]
898; VI-NEXT:    v_mov_b32_e32 v4, s0
899; VI-NEXT:    v_mov_b32_e32 v5, s1
900; VI-NEXT:    s_waitcnt vmcnt(0)
901; VI-NEXT:    v_add_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
902; VI-NEXT:    v_add_u16_e32 v0, v0, v1
903; VI-NEXT:    v_bfe_i32 v0, v0, 0, 16
904; VI-NEXT:    v_bfe_i32 v2, v2, 0, 16
905; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
906; VI-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
907; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
908; VI-NEXT:    s_endpgm
909;
910; GFX9-LABEL: v_test_add_v2i16_sext_to_v2i64:
911; GFX9:       ; %bb.0:
912; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
913; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
914; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
915; GFX9-NEXT:    v_mov_b32_e32 v4, 0
916; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
917; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
918; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
919; GFX9-NEXT:    s_waitcnt vmcnt(0)
920; GFX9-NEXT:    v_pk_add_u16 v1, v1, v2
921; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
922; GFX9-NEXT:    v_bfe_i32 v0, v1, 0, 16
923; GFX9-NEXT:    v_bfe_i32 v2, v2, 0, 16
924; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
925; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
926; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
927; GFX9-NEXT:    s_endpgm
928;
929; GFX10-LABEL: v_test_add_v2i16_sext_to_v2i64:
930; GFX10:       ; %bb.0:
931; GFX10-NEXT:    s_clause 0x1
932; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
933; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
934; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
935; GFX10-NEXT:    v_mov_b32_e32 v4, 0
936; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
937; GFX10-NEXT:    s_clause 0x1
938; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
939; GFX10-NEXT:    global_load_dword v2, v0, s[6:7]
940; GFX10-NEXT:    s_waitcnt vmcnt(0)
941; GFX10-NEXT:    v_pk_add_u16 v0, v1, v2
942; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
943; GFX10-NEXT:    v_bfe_i32 v0, v0, 0, 16
944; GFX10-NEXT:    v_bfe_i32 v2, v1, 0, 16
945; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
946; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
947; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
948; GFX10-NEXT:    s_endpgm
949;
950; GFX11-LABEL: v_test_add_v2i16_sext_to_v2i64:
951; GFX11:       ; %bb.0:
952; GFX11-NEXT:    s_clause 0x1
953; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
954; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
955; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
956; GFX11-NEXT:    v_mov_b32_e32 v4, 0
957; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
958; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
959; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
960; GFX11-NEXT:    s_clause 0x1
961; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
962; GFX11-NEXT:    global_load_b32 v0, v0, s[4:5]
963; GFX11-NEXT:    s_waitcnt vmcnt(0)
964; GFX11-NEXT:    v_pk_add_u16 v0, v1, v0
965; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
966; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
967; GFX11-NEXT:    v_bfe_i32 v0, v0, 0, 16
968; GFX11-NEXT:    v_bfe_i32 v2, v1, 0, 16
969; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
970; GFX11-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
971; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
972; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
973; GFX11-NEXT:    s_endpgm
  ; Per-workitem index used by all three GEPs below.
974  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; NOTE(review): %gep.out has no users; the store below targets %out.
975  %gep.out = getelementptr inbounds <2 x i64>, ptr addrspace(1) %out, i32 %tid
976  %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
977  %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid
  ; Plain (non-volatile) loads, in contrast to the two kernels above.
978  %a = load <2 x i16>, ptr addrspace(1) %gep.in0
979  %b = load <2 x i16>, ptr addrspace(1) %gep.in1
980  %add = add <2 x i16> %a, %b
  ; Sign-extend each 16-bit lane to 64 bits before storing.
981  %ext = sext <2 x i16> %add to <2 x i64>
982  store <2 x i64> %ext, ptr addrspace(1) %out
983  ret void
984}
985
; Non-kernel function: returns %x + <i16 -1, i16 0>, i.e. only the low lane is
; changed (decremented by one); the high lane has 0 added to it.
;
; What the assertions below pin down:
;  - the VI checks save the high half with an AND against 0xffff0000, add -1 to
;    the low half with v_add_u16, and OR the two back together;
;  - the GFX9, GFX10 and GFX11 checks expect a single "v_pk_sub_u16 v0, v0, 1",
;    that is, a packed subtract of 1 instead of a packed add of -1.
;    NOTE(review): presumably the inline constant 1 only populates the low
;    16-bit lane of the packed operand, which is what makes the subtract form
;    equivalent to adding <-1, 0> - confirm against the ISA documentation.
986define <2 x i16> @add_inline_imm_neg1_0(<2 x i16> %x) {
987; VI-LABEL: add_inline_imm_neg1_0:
988; VI:       ; %bb.0:
989; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
990; VI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
991; VI-NEXT:    v_add_u16_e32 v0, -1, v0
992; VI-NEXT:    v_or_b32_e32 v0, v0, v1
993; VI-NEXT:    s_setpc_b64 s[30:31]
994;
995; GFX9-LABEL: add_inline_imm_neg1_0:
996; GFX9:       ; %bb.0:
997; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
998; GFX9-NEXT:    v_pk_sub_u16 v0, v0, 1
999; GFX9-NEXT:    s_setpc_b64 s[30:31]
1000;
1001; GFX10-LABEL: add_inline_imm_neg1_0:
1002; GFX10:       ; %bb.0:
1003; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1004; GFX10-NEXT:    v_pk_sub_u16 v0, v0, 1
1005; GFX10-NEXT:    s_setpc_b64 s[30:31]
1006;
1007; GFX11-LABEL: add_inline_imm_neg1_0:
1008; GFX11:       ; %bb.0:
1009; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1010; GFX11-NEXT:    v_pk_sub_u16 v0, v0, 1
1011; GFX11-NEXT:    s_setpc_b64 s[30:31]
  ; Low lane gets -1 added, high lane gets 0 added.
1012  %y = add <2 x i16> %x, <i16 -1, i16 0>
1013  ret <2 x i16> %y
1014}
1015
; Non-kernel function: returns %x + <i16 1, i16 0>, i.e. only the low lane is
; changed (incremented by one); the high lane has 0 added to it.
;
; What the assertions below pin down:
;  - the VI checks save the high half with an AND against 0xffff0000, add 1 to
;    the low half with v_add_u16, and OR the two back together;
;  - the GFX9, GFX10 and GFX11 checks expect a single "v_pk_add_u16 v0, v0, 1".
;    NOTE(review): presumably the inline constant 1 only populates the low
;    16-bit lane of the packed operand, matching the <1, 0> vector constant -
;    confirm against the ISA documentation.
1016define <2 x i16> @add_inline_imm_1_0(<2 x i16> %x) {
1017; VI-LABEL: add_inline_imm_1_0:
1018; VI:       ; %bb.0:
1019; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1020; VI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
1021; VI-NEXT:    v_add_u16_e32 v0, 1, v0
1022; VI-NEXT:    v_or_b32_e32 v0, v0, v1
1023; VI-NEXT:    s_setpc_b64 s[30:31]
1024;
1025; GFX9-LABEL: add_inline_imm_1_0:
1026; GFX9:       ; %bb.0:
1027; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1028; GFX9-NEXT:    v_pk_add_u16 v0, v0, 1
1029; GFX9-NEXT:    s_setpc_b64 s[30:31]
1030;
1031; GFX10-LABEL: add_inline_imm_1_0:
1032; GFX10:       ; %bb.0:
1033; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1034; GFX10-NEXT:    v_pk_add_u16 v0, v0, 1
1035; GFX10-NEXT:    s_setpc_b64 s[30:31]
1036;
1037; GFX11-LABEL: add_inline_imm_1_0:
1038; GFX11:       ; %bb.0:
1039; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1040; GFX11-NEXT:    v_pk_add_u16 v0, v0, 1
1041; GFX11-NEXT:    s_setpc_b64 s[30:31]
  ; Low lane gets 1 added, high lane gets 0 added.
1042  %y = add <2 x i16> %x, <i16 1, i16 0>
1043  ret <2 x i16> %y
1044}
1045
; Intrinsic called by the amdgpu_kernel tests above to obtain the i32 index
; (%tid) that feeds their getelementptr instructions.
1046declare i32 @llvm.amdgcn.workitem.id.x() #0
1047
; Attribute group #0 is attached to the intrinsic declaration above; group #1
; is attached to the amdgpu_kernel definitions in this file.
1048attributes #0 = { nounwind readnone }
1049attributes #1 = { nounwind }
1050