xref: /llvm-project/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll (revision 7c58d6363a40fc6d1cdf6a147da8f3bb0d4f96ec)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel=0 -mtriple=amdgcn < %s | FileCheck --check-prefixes=SI,SI-SDAG %s
3; RUN: llc -global-isel=1 -mtriple=amdgcn < %s | FileCheck --check-prefixes=SI,SI-GISEL %s
4; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck --check-prefixes=VI,VI-SDAG %s
5; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck --check-prefixes=VI,VI-GISEL %s
6; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefixes=GFX9,GFX9-SDAG %s
7; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefixes=GFX9,GFX9-GISEL %s
8; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10,GFX10-SDAG %s
9; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10,GFX10-GISEL %s
10; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck --check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
11; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s
12; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck --check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
13; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
14
15; Test that add/sub with a constant is swapped to sub/add with negated
16; constant to minimize code size.
17
; Kernel computing out[tid] = in[tid] - 64 on i32, indexed by workitem id x.
; The SelectionDAG check lines expect the subtraction to stay a v_subrev with
; the immediate 64, while the GlobalISel check lines expect a v_add with the
; 32-bit literal 0xffffffc0 (i.e. -64).
; NOTE(review): presumably 64 is encodable as an inline constant and -64 is
; not, which would be the code-size saving this test targets; confirm against
; the ISA documentation.
18define amdgpu_kernel void @v_test_i32_x_sub_64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
19; SI-SDAG-LABEL: v_test_i32_x_sub_64:
20; SI-SDAG:       ; %bb.0:
21; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
22; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
23; SI-SDAG-NEXT:    s_mov_b32 s6, 0
24; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
25; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
26; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
27; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
28; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
29; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
30; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
31; SI-SDAG-NEXT:    v_subrev_i32_e32 v2, vcc, 64, v2
32; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
33; SI-SDAG-NEXT:    s_endpgm
34;
35; SI-GISEL-LABEL: v_test_i32_x_sub_64:
36; SI-GISEL:       ; %bb.0:
37; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
38; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
39; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
40; SI-GISEL-NEXT:    s_mov_b32 s6, 0
41; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
42; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
43; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
44; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
45; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
46; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0xffffffc0, v2
47; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
48; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
49; SI-GISEL-NEXT:    s_endpgm
50;
51; VI-SDAG-LABEL: v_test_i32_x_sub_64:
52; VI-SDAG:       ; %bb.0:
53; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
54; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
55; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
56; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
57; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
58; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
59; VI-SDAG-NEXT:    flat_load_dword v3, v[0:1]
60; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
61; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
62; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
63; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
64; VI-SDAG-NEXT:    v_subrev_u32_e32 v2, vcc, 64, v3
65; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
66; VI-SDAG-NEXT:    s_endpgm
67;
68; VI-GISEL-LABEL: v_test_i32_x_sub_64:
69; VI-GISEL:       ; %bb.0:
70; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
71; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
72; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
73; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
74; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
75; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
76; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
77; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
78; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
79; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
80; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
81; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
82; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
83; VI-GISEL-NEXT:    v_add_u32_e32 v2, vcc, 0xffffffc0, v3
84; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
85; VI-GISEL-NEXT:    s_endpgm
86;
87; GFX9-SDAG-LABEL: v_test_i32_x_sub_64:
88; GFX9-SDAG:       ; %bb.0:
89; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
90; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
91; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
92; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
93; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
94; GFX9-SDAG-NEXT:    v_subrev_u32_e32 v1, 64, v1
95; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
96; GFX9-SDAG-NEXT:    s_endpgm
97;
98; GFX9-GISEL-LABEL: v_test_i32_x_sub_64:
99; GFX9-GISEL:       ; %bb.0:
100; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
101; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
102; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
103; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
104; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
105; GFX9-GISEL-NEXT:    v_add_u32_e32 v1, 0xffffffc0, v1
106; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
107; GFX9-GISEL-NEXT:    s_endpgm
108;
109; GFX10-SDAG-LABEL: v_test_i32_x_sub_64:
110; GFX10-SDAG:       ; %bb.0:
111; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
112; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
113; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
114; GFX10-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
115; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
116; GFX10-SDAG-NEXT:    v_subrev_nc_u32_e32 v1, 64, v1
117; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
118; GFX10-SDAG-NEXT:    s_endpgm
119;
120; GFX10-GISEL-LABEL: v_test_i32_x_sub_64:
121; GFX10-GISEL:       ; %bb.0:
122; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
123; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
124; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
125; GFX10-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
126; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
127; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v1, 0xffffffc0, v1
128; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
129; GFX10-GISEL-NEXT:    s_endpgm
130;
131; GFX11-SDAG-LABEL: v_test_i32_x_sub_64:
132; GFX11-SDAG:       ; %bb.0:
133; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
134; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
135; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
136; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
137; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
138; GFX11-SDAG-NEXT:    global_load_b32 v1, v0, s[2:3]
139; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
140; GFX11-SDAG-NEXT:    v_subrev_nc_u32_e32 v1, 64, v1
141; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
142; GFX11-SDAG-NEXT:    s_endpgm
143;
144; GFX11-GISEL-LABEL: v_test_i32_x_sub_64:
145; GFX11-GISEL:       ; %bb.0:
146; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
147; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
148; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
149; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
150; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
151; GFX11-GISEL-NEXT:    global_load_b32 v1, v0, s[2:3]
152; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
153; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v1, 0xffffffc0, v1
154; GFX11-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
155; GFX11-GISEL-NEXT:    s_endpgm
156  %tid = call i32 @llvm.amdgcn.workitem.id.x()
157  %tid.ext = sext i32 %tid to i64
158  %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %tid.ext
159  %gep.out = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
160  %x = load i32, ptr addrspace(1) %gep
; Operation under test (x - 64, constant on the right-hand side).
161  %result = sub i32 %x, 64
162  store i32 %result, ptr addrspace(1) %gep.out
163  ret void
164}
165
; Kernel that reads in[tid] twice with volatile loads, subtracts 64 from each
; loaded value, and writes both results to out[tid] with volatile stores, so
; the constant 64 feeds two separate subtractions.
; The SelectionDAG check lines expect two v_subrev instructions with the
; immediate 64; the GlobalISel check lines expect two v_add instructions with
; the 32-bit literal 0xffffffc0 (i.e. -64).
166define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
167; SI-SDAG-LABEL: v_test_i32_x_sub_64_multi_use:
168; SI-SDAG:       ; %bb.0:
169; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
170; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
171; SI-SDAG-NEXT:    s_mov_b32 s6, 0
172; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
173; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
174; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
175; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
176; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
177; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
178; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
179; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
180; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
181; SI-SDAG-NEXT:    v_subrev_i32_e32 v2, vcc, 64, v2
182; SI-SDAG-NEXT:    v_subrev_i32_e32 v3, vcc, 64, v3
183; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
184; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
185; SI-SDAG-NEXT:    buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
186; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
187; SI-SDAG-NEXT:    s_endpgm
188;
189; SI-GISEL-LABEL: v_test_i32_x_sub_64_multi_use:
190; SI-GISEL:       ; %bb.0:
191; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
192; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
193; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
194; SI-GISEL-NEXT:    s_mov_b32 s6, 0
195; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
196; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
197; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
198; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
199; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
200; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
201; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
202; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
203; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0xffffffc0, v2
204; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0xffffffc0, v3
205; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
206; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
207; SI-GISEL-NEXT:    buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
208; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
209; SI-GISEL-NEXT:    s_endpgm
210;
211; VI-SDAG-LABEL: v_test_i32_x_sub_64_multi_use:
212; VI-SDAG:       ; %bb.0:
213; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
214; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
215; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
216; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
217; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
218; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
219; VI-SDAG-NEXT:    flat_load_dword v3, v[0:1] glc
220; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
221; VI-SDAG-NEXT:    flat_load_dword v4, v[0:1] glc
222; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
223; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
224; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
225; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
226; VI-SDAG-NEXT:    v_subrev_u32_e32 v2, vcc, 64, v3
227; VI-SDAG-NEXT:    v_subrev_u32_e32 v3, vcc, 64, v4
228; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
229; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
230; VI-SDAG-NEXT:    flat_store_dword v[0:1], v3
231; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
232; VI-SDAG-NEXT:    s_endpgm
233;
234; VI-GISEL-LABEL: v_test_i32_x_sub_64_multi_use:
235; VI-GISEL:       ; %bb.0:
236; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
237; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
238; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
239; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
240; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
241; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
242; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
243; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1] glc
244; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
245; VI-GISEL-NEXT:    flat_load_dword v4, v[0:1] glc
246; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
247; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
248; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
249; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
250; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
251; VI-GISEL-NEXT:    v_add_u32_e32 v2, vcc, 0xffffffc0, v3
252; VI-GISEL-NEXT:    v_add_u32_e32 v3, vcc, 0xffffffc0, v4
253; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
254; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
255; VI-GISEL-NEXT:    flat_store_dword v[0:1], v3
256; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
257; VI-GISEL-NEXT:    s_endpgm
258;
259; GFX9-SDAG-LABEL: v_test_i32_x_sub_64_multi_use:
260; GFX9-SDAG:       ; %bb.0:
261; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
262; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
263; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
264; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3] glc
265; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
266; GFX9-SDAG-NEXT:    global_load_dword v2, v0, s[2:3] glc
267; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
268; GFX9-SDAG-NEXT:    v_subrev_u32_e32 v1, 64, v1
269; GFX9-SDAG-NEXT:    v_subrev_u32_e32 v2, 64, v2
270; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
271; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
272; GFX9-SDAG-NEXT:    global_store_dword v0, v2, s[0:1]
273; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
274; GFX9-SDAG-NEXT:    s_endpgm
275;
276; GFX9-GISEL-LABEL: v_test_i32_x_sub_64_multi_use:
277; GFX9-GISEL:       ; %bb.0:
278; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
279; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
280; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
281; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3] glc
282; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
283; GFX9-GISEL-NEXT:    global_load_dword v2, v0, s[2:3] glc
284; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
285; GFX9-GISEL-NEXT:    v_add_u32_e32 v1, 0xffffffc0, v1
286; GFX9-GISEL-NEXT:    v_add_u32_e32 v2, 0xffffffc0, v2
287; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
288; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
289; GFX9-GISEL-NEXT:    global_store_dword v0, v2, s[0:1]
290; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
291; GFX9-GISEL-NEXT:    s_endpgm
292;
293; GFX10-SDAG-LABEL: v_test_i32_x_sub_64_multi_use:
294; GFX10-SDAG:       ; %bb.0:
295; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
296; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
297; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
298; GFX10-SDAG-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
299; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
300; GFX10-SDAG-NEXT:    global_load_dword v2, v0, s[2:3] glc dlc
301; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
302; GFX10-SDAG-NEXT:    v_subrev_nc_u32_e32 v1, 64, v1
303; GFX10-SDAG-NEXT:    v_subrev_nc_u32_e32 v2, 64, v2
304; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
305; GFX10-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
306; GFX10-SDAG-NEXT:    global_store_dword v0, v2, s[0:1]
307; GFX10-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
308; GFX10-SDAG-NEXT:    s_endpgm
309;
310; GFX10-GISEL-LABEL: v_test_i32_x_sub_64_multi_use:
311; GFX10-GISEL:       ; %bb.0:
312; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
313; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
314; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
315; GFX10-GISEL-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
316; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
317; GFX10-GISEL-NEXT:    global_load_dword v2, v0, s[2:3] glc dlc
318; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
319; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v1, 0xffffffc0, v1
320; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v2, 0xffffffc0, v2
321; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
322; GFX10-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
323; GFX10-GISEL-NEXT:    global_store_dword v0, v2, s[0:1]
324; GFX10-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
325; GFX10-GISEL-NEXT:    s_endpgm
326;
327; GFX11-SDAG-LABEL: v_test_i32_x_sub_64_multi_use:
328; GFX11-SDAG:       ; %bb.0:
329; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
330; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
331; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
332; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
333; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
334; GFX11-SDAG-NEXT:    global_load_b32 v1, v0, s[2:3] glc dlc
335; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
336; GFX11-SDAG-NEXT:    global_load_b32 v2, v0, s[2:3] glc dlc
337; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
338; GFX11-SDAG-NEXT:    v_subrev_nc_u32_e32 v1, 64, v1
339; GFX11-SDAG-NEXT:    v_subrev_nc_u32_e32 v2, 64, v2
340; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1] dlc
341; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
342; GFX11-SDAG-NEXT:    global_store_b32 v0, v2, s[0:1] dlc
343; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
344; GFX11-SDAG-NEXT:    s_endpgm
345;
346; GFX11-GISEL-LABEL: v_test_i32_x_sub_64_multi_use:
347; GFX11-GISEL:       ; %bb.0:
348; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
349; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
350; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
351; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
352; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
353; GFX11-GISEL-NEXT:    global_load_b32 v1, v0, s[2:3] glc dlc
354; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
355; GFX11-GISEL-NEXT:    global_load_b32 v2, v0, s[2:3] glc dlc
356; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
357; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v1, 0xffffffc0, v1
358; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v2, 0xffffffc0, v2
359; GFX11-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1] dlc
360; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
361; GFX11-GISEL-NEXT:    global_store_b32 v0, v2, s[0:1] dlc
362; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
363; GFX11-GISEL-NEXT:    s_endpgm
364  %tid = call i32 @llvm.amdgcn.workitem.id.x()
365  %tid.ext = sext i32 %tid to i64
366  %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %tid.ext
367  %gep.out = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
; Both loads are volatile, so they remain two distinct loads of the same
; address and produce two independent values to subtract from.
368  %x = load volatile i32, ptr addrspace(1) %gep
369  %y = load volatile i32, ptr addrspace(1) %gep
; Operation under test, applied twice (x - 64 and y - 64).
370  %result0 = sub i32 %x, 64
371  %result1 = sub i32 %y, 64
372  store volatile i32 %result0, ptr addrspace(1) %gep.out
373  store volatile i32 %result1, ptr addrspace(1) %gep.out
374  ret void
375}
376
; Kernel computing out[tid] = 64 - in[tid] on i32, indexed by workitem id x.
; Here the constant is the left-hand operand of the subtraction. Every check
; prefix, for both instruction selectors, expects a plain v_sub with 64 as the
; first source operand; no add or subrev form is expected.
377define amdgpu_kernel void @v_test_i32_64_sub_x(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
378; SI-SDAG-LABEL: v_test_i32_64_sub_x:
379; SI-SDAG:       ; %bb.0:
380; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
381; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
382; SI-SDAG-NEXT:    s_mov_b32 s6, 0
383; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
384; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
385; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
386; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
387; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
388; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
389; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
390; SI-SDAG-NEXT:    v_sub_i32_e32 v2, vcc, 64, v2
391; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
392; SI-SDAG-NEXT:    s_endpgm
393;
394; SI-GISEL-LABEL: v_test_i32_64_sub_x:
395; SI-GISEL:       ; %bb.0:
396; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
397; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
398; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
399; SI-GISEL-NEXT:    s_mov_b32 s6, 0
400; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
401; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
402; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
403; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
404; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
405; SI-GISEL-NEXT:    v_sub_i32_e32 v2, vcc, 64, v2
406; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
407; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
408; SI-GISEL-NEXT:    s_endpgm
409;
410; VI-SDAG-LABEL: v_test_i32_64_sub_x:
411; VI-SDAG:       ; %bb.0:
412; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
413; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
414; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
415; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
416; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
417; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
418; VI-SDAG-NEXT:    flat_load_dword v3, v[0:1]
419; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
420; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
421; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
422; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
423; VI-SDAG-NEXT:    v_sub_u32_e32 v2, vcc, 64, v3
424; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
425; VI-SDAG-NEXT:    s_endpgm
426;
427; VI-GISEL-LABEL: v_test_i32_64_sub_x:
428; VI-GISEL:       ; %bb.0:
429; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
430; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
431; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
432; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
433; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
434; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
435; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
436; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
437; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
438; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
439; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
440; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
441; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
442; VI-GISEL-NEXT:    v_sub_u32_e32 v2, vcc, 64, v3
443; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
444; VI-GISEL-NEXT:    s_endpgm
445;
446; GFX9-LABEL: v_test_i32_64_sub_x:
447; GFX9:       ; %bb.0:
448; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
449; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
450; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
451; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
452; GFX9-NEXT:    s_waitcnt vmcnt(0)
453; GFX9-NEXT:    v_sub_u32_e32 v1, 64, v1
454; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
455; GFX9-NEXT:    s_endpgm
456;
457; GFX10-LABEL: v_test_i32_64_sub_x:
458; GFX10:       ; %bb.0:
459; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
460; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
461; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
462; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
463; GFX10-NEXT:    s_waitcnt vmcnt(0)
464; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 64, v1
465; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
466; GFX10-NEXT:    s_endpgm
467;
468; GFX11-LABEL: v_test_i32_64_sub_x:
469; GFX11:       ; %bb.0:
470; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
471; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
472; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
473; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
474; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
475; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
476; GFX11-NEXT:    s_waitcnt vmcnt(0)
477; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 64, v1
478; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
479; GFX11-NEXT:    s_endpgm
480  %tid = call i32 @llvm.amdgcn.workitem.id.x()
481  %tid.ext = sext i32 %tid to i64
482  %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %tid.ext
483  %gep.out = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
484  %x = load i32, ptr addrspace(1) %gep
; Operation under test (64 - x, constant on the left-hand side).
485  %result = sub i32 64, %x
486  store i32 %result, ptr addrspace(1) %gep.out
487  ret void
488}
489
; Kernel computing out[tid] = in[tid] - 65 on i32, indexed by workitem id x.
; Every check prefix, for both instruction selectors, expects a v_add with the
; 32-bit literal 0xffffffbf (i.e. -65); no v_subrev form is expected here.
; NOTE(review): presumably neither 65 nor -65 is encodable as an inline
; constant, so swapping add/sub would not shrink the encoding; confirm against
; the ISA documentation.
490define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
491; SI-SDAG-LABEL: v_test_i32_x_sub_65:
492; SI-SDAG:       ; %bb.0:
493; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
494; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
495; SI-SDAG-NEXT:    s_mov_b32 s6, 0
496; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
497; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
498; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
499; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
500; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
501; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
502; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
503; SI-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 0xffffffbf, v2
504; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
505; SI-SDAG-NEXT:    s_endpgm
506;
507; SI-GISEL-LABEL: v_test_i32_x_sub_65:
508; SI-GISEL:       ; %bb.0:
509; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
510; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
511; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
512; SI-GISEL-NEXT:    s_mov_b32 s6, 0
513; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
514; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
515; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
516; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
517; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
518; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0xffffffbf, v2
519; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
520; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
521; SI-GISEL-NEXT:    s_endpgm
522;
523; VI-SDAG-LABEL: v_test_i32_x_sub_65:
524; VI-SDAG:       ; %bb.0:
525; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
526; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
527; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
528; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
529; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
530; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
531; VI-SDAG-NEXT:    flat_load_dword v3, v[0:1]
532; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
533; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
534; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
535; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
536; VI-SDAG-NEXT:    v_add_u32_e32 v2, vcc, 0xffffffbf, v3
537; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
538; VI-SDAG-NEXT:    s_endpgm
539;
540; VI-GISEL-LABEL: v_test_i32_x_sub_65:
541; VI-GISEL:       ; %bb.0:
542; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
543; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
544; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
545; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
546; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
547; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
548; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
549; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
550; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
551; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
552; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
553; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
554; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
555; VI-GISEL-NEXT:    v_add_u32_e32 v2, vcc, 0xffffffbf, v3
556; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
557; VI-GISEL-NEXT:    s_endpgm
558;
559; GFX9-LABEL: v_test_i32_x_sub_65:
560; GFX9:       ; %bb.0:
561; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
562; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
563; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
564; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
565; GFX9-NEXT:    s_waitcnt vmcnt(0)
566; GFX9-NEXT:    v_add_u32_e32 v1, 0xffffffbf, v1
567; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
568; GFX9-NEXT:    s_endpgm
569;
570; GFX10-LABEL: v_test_i32_x_sub_65:
571; GFX10:       ; %bb.0:
572; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
573; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
574; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
575; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
576; GFX10-NEXT:    s_waitcnt vmcnt(0)
577; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0xffffffbf, v1
578; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
579; GFX10-NEXT:    s_endpgm
580;
581; GFX11-LABEL: v_test_i32_x_sub_65:
582; GFX11:       ; %bb.0:
583; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
584; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
585; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
586; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
587; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
588; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
589; GFX11-NEXT:    s_waitcnt vmcnt(0)
590; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0xffffffbf, v1
591; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
592; GFX11-NEXT:    s_endpgm
593  %tid = call i32 @llvm.amdgcn.workitem.id.x()
594  %tid.ext = sext i32 %tid to i64
595  %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %tid.ext
596  %gep.out = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
597  %x = load i32, ptr addrspace(1) %gep
; Operation under test (x - 65, constant on the right-hand side).
598  %result = sub i32 %x, 65
599  store i32 %result, ptr addrspace(1) %gep.out
600  ret void
601}
602
; Kernel computing out[tid] = 65 - in[tid] on i32, indexed by workitem id x.
; Here the constant is the left-hand operand of the subtraction. Every check
; prefix, for both instruction selectors, expects a v_sub with the literal
; 0x41 (65) as the first source operand.
603define amdgpu_kernel void @v_test_i32_65_sub_x(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
604; SI-SDAG-LABEL: v_test_i32_65_sub_x:
605; SI-SDAG:       ; %bb.0:
606; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
607; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
608; SI-SDAG-NEXT:    s_mov_b32 s6, 0
609; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
610; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
611; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
612; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
613; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
614; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
615; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
616; SI-SDAG-NEXT:    v_sub_i32_e32 v2, vcc, 0x41, v2
617; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
618; SI-SDAG-NEXT:    s_endpgm
619;
620; SI-GISEL-LABEL: v_test_i32_65_sub_x:
621; SI-GISEL:       ; %bb.0:
622; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
623; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
624; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
625; SI-GISEL-NEXT:    s_mov_b32 s6, 0
626; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
627; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
628; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
629; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
630; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
631; SI-GISEL-NEXT:    v_sub_i32_e32 v2, vcc, 0x41, v2
632; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
633; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
634; SI-GISEL-NEXT:    s_endpgm
635;
636; VI-SDAG-LABEL: v_test_i32_65_sub_x:
637; VI-SDAG:       ; %bb.0:
638; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
639; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
640; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
641; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
642; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
643; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
644; VI-SDAG-NEXT:    flat_load_dword v3, v[0:1]
645; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
646; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
647; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
648; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
649; VI-SDAG-NEXT:    v_sub_u32_e32 v2, vcc, 0x41, v3
650; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
651; VI-SDAG-NEXT:    s_endpgm
652;
653; VI-GISEL-LABEL: v_test_i32_65_sub_x:
654; VI-GISEL:       ; %bb.0:
655; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
656; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
657; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
658; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
659; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
660; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
661; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
662; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
663; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
664; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
665; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
666; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
667; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
668; VI-GISEL-NEXT:    v_sub_u32_e32 v2, vcc, 0x41, v3
669; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
670; VI-GISEL-NEXT:    s_endpgm
671;
672; GFX9-LABEL: v_test_i32_65_sub_x:
673; GFX9:       ; %bb.0:
674; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
675; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
676; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
677; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
678; GFX9-NEXT:    s_waitcnt vmcnt(0)
679; GFX9-NEXT:    v_sub_u32_e32 v1, 0x41, v1
680; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
681; GFX9-NEXT:    s_endpgm
682;
683; GFX10-LABEL: v_test_i32_65_sub_x:
684; GFX10:       ; %bb.0:
685; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
686; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
687; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
688; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
689; GFX10-NEXT:    s_waitcnt vmcnt(0)
690; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 0x41, v1
691; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
692; GFX10-NEXT:    s_endpgm
693;
694; GFX11-LABEL: v_test_i32_65_sub_x:
695; GFX11:       ; %bb.0:
696; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
697; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
698; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
699; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
700; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
701; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
702; GFX11-NEXT:    s_waitcnt vmcnt(0)
703; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 0x41, v1
704; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
705; GFX11-NEXT:    s_endpgm
706  %tid = call i32 @llvm.amdgcn.workitem.id.x()
707  %tid.ext = sext i32 %tid to i64
708  %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %tid.ext
709  %gep.out = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
710  %x = load i32, ptr addrspace(1) %gep
; Operation under test (65 - x, constant on the left-hand side).
711  %result = sub i32 65, %x
712  store i32 %result, ptr addrspace(1) %gep.out
713  ret void
714}
715
; Per-workitem i32 case: loads x from %in at the workitem index and stores
; x - (-16) to the matching slot of %out. Every check prefix below expects the
; subtract of -16 to be emitted as an add of the constant 16.
716define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
717; SI-SDAG-LABEL: v_test_i32_x_sub_neg16:
718; SI-SDAG:       ; %bb.0:
719; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
720; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
721; SI-SDAG-NEXT:    s_mov_b32 s6, 0
722; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
723; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
724; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
725; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
726; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
727; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
728; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
729; SI-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 16, v2
730; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
731; SI-SDAG-NEXT:    s_endpgm
732;
733; SI-GISEL-LABEL: v_test_i32_x_sub_neg16:
734; SI-GISEL:       ; %bb.0:
735; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
736; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
737; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
738; SI-GISEL-NEXT:    s_mov_b32 s6, 0
739; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
740; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
741; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
742; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
743; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
744; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 16, v2
745; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
746; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
747; SI-GISEL-NEXT:    s_endpgm
748;
749; VI-SDAG-LABEL: v_test_i32_x_sub_neg16:
750; VI-SDAG:       ; %bb.0:
751; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
752; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
753; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
754; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
755; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
756; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
757; VI-SDAG-NEXT:    flat_load_dword v3, v[0:1]
758; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
759; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
760; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
761; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
762; VI-SDAG-NEXT:    v_add_u32_e32 v2, vcc, 16, v3
763; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
764; VI-SDAG-NEXT:    s_endpgm
765;
766; VI-GISEL-LABEL: v_test_i32_x_sub_neg16:
767; VI-GISEL:       ; %bb.0:
768; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
769; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
770; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
771; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
772; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
773; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
774; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
775; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
776; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
777; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
778; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
779; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
780; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
781; VI-GISEL-NEXT:    v_add_u32_e32 v2, vcc, 16, v3
782; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
783; VI-GISEL-NEXT:    s_endpgm
784;
785; GFX9-LABEL: v_test_i32_x_sub_neg16:
786; GFX9:       ; %bb.0:
787; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
788; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
789; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
790; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
791; GFX9-NEXT:    s_waitcnt vmcnt(0)
792; GFX9-NEXT:    v_add_u32_e32 v1, 16, v1
793; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
794; GFX9-NEXT:    s_endpgm
795;
796; GFX10-LABEL: v_test_i32_x_sub_neg16:
797; GFX10:       ; %bb.0:
798; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
799; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
800; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
801; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
802; GFX10-NEXT:    s_waitcnt vmcnt(0)
803; GFX10-NEXT:    v_add_nc_u32_e32 v1, 16, v1
804; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
805; GFX10-NEXT:    s_endpgm
806;
807; GFX11-LABEL: v_test_i32_x_sub_neg16:
808; GFX11:       ; %bb.0:
809; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
810; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
811; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
812; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
813; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
814; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
815; GFX11-NEXT:    s_waitcnt vmcnt(0)
816; GFX11-NEXT:    v_add_nc_u32_e32 v1, 16, v1
817; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
818; GFX11-NEXT:    s_endpgm
819  %tid = call i32 @llvm.amdgcn.workitem.id.x()
820  %tid.ext = sext i32 %tid to i64
821  %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %tid.ext
822  %gep.out = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
823  %x = load i32, ptr addrspace(1) %gep
; Operation under test: variable minus the constant -16.
824  %result = sub i32 %x, -16
825  store i32 %result, ptr addrspace(1) %gep.out
826  ret void
827}
828
; Constant on the left-hand side: loads x from %in at the workitem index and
; stores (-16) - x to %out. Every check prefix below expects a subtract with
; -16 kept as the first source operand (it is not rewritten into an add).
829define amdgpu_kernel void @v_test_i32_neg16_sub_x(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
830; SI-SDAG-LABEL: v_test_i32_neg16_sub_x:
831; SI-SDAG:       ; %bb.0:
832; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
833; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
834; SI-SDAG-NEXT:    s_mov_b32 s6, 0
835; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
836; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
837; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
838; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
839; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
840; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
841; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
842; SI-SDAG-NEXT:    v_sub_i32_e32 v2, vcc, -16, v2
843; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
844; SI-SDAG-NEXT:    s_endpgm
845;
846; SI-GISEL-LABEL: v_test_i32_neg16_sub_x:
847; SI-GISEL:       ; %bb.0:
848; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
849; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
850; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
851; SI-GISEL-NEXT:    s_mov_b32 s6, 0
852; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
853; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
854; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
855; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
856; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
857; SI-GISEL-NEXT:    v_sub_i32_e32 v2, vcc, -16, v2
858; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
859; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
860; SI-GISEL-NEXT:    s_endpgm
861;
862; VI-SDAG-LABEL: v_test_i32_neg16_sub_x:
863; VI-SDAG:       ; %bb.0:
864; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
865; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
866; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
867; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
868; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
869; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
870; VI-SDAG-NEXT:    flat_load_dword v3, v[0:1]
871; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
872; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
873; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
874; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
875; VI-SDAG-NEXT:    v_sub_u32_e32 v2, vcc, -16, v3
876; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
877; VI-SDAG-NEXT:    s_endpgm
878;
879; VI-GISEL-LABEL: v_test_i32_neg16_sub_x:
880; VI-GISEL:       ; %bb.0:
881; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
882; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
883; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
884; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
885; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
886; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
887; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
888; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
889; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
890; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
891; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
892; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
893; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
894; VI-GISEL-NEXT:    v_sub_u32_e32 v2, vcc, -16, v3
895; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
896; VI-GISEL-NEXT:    s_endpgm
897;
898; GFX9-LABEL: v_test_i32_neg16_sub_x:
899; GFX9:       ; %bb.0:
900; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
901; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
902; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
903; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
904; GFX9-NEXT:    s_waitcnt vmcnt(0)
905; GFX9-NEXT:    v_sub_u32_e32 v1, -16, v1
906; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
907; GFX9-NEXT:    s_endpgm
908;
909; GFX10-LABEL: v_test_i32_neg16_sub_x:
910; GFX10:       ; %bb.0:
911; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
912; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
913; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
914; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
915; GFX10-NEXT:    s_waitcnt vmcnt(0)
916; GFX10-NEXT:    v_sub_nc_u32_e32 v1, -16, v1
917; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
918; GFX10-NEXT:    s_endpgm
919;
920; GFX11-LABEL: v_test_i32_neg16_sub_x:
921; GFX11:       ; %bb.0:
922; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
923; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
924; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
925; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
926; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
927; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
928; GFX11-NEXT:    s_waitcnt vmcnt(0)
929; GFX11-NEXT:    v_sub_nc_u32_e32 v1, -16, v1
930; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
931; GFX11-NEXT:    s_endpgm
932  %tid = call i32 @llvm.amdgcn.workitem.id.x()
933  %tid.ext = sext i32 %tid to i64
934  %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %tid.ext
935  %gep.out = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
936  %x = load i32, ptr addrspace(1) %gep
; Operation under test: the constant -16 minus the variable.
937  %result = sub i32 -16, %x
938  store i32 %result, ptr addrspace(1) %gep.out
939  ret void
940}
941
; Per-workitem i32 case: loads x from %in at the workitem index and stores
; x - (-17) to %out. Every check prefix below expects the subtract of -17 to
; be emitted as an add of the constant 17.
942define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
943; SI-SDAG-LABEL: v_test_i32_x_sub_neg17:
944; SI-SDAG:       ; %bb.0:
945; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
946; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
947; SI-SDAG-NEXT:    s_mov_b32 s6, 0
948; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
949; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
950; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
951; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
952; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
953; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
954; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
955; SI-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 17, v2
956; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
957; SI-SDAG-NEXT:    s_endpgm
958;
959; SI-GISEL-LABEL: v_test_i32_x_sub_neg17:
960; SI-GISEL:       ; %bb.0:
961; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
962; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
963; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
964; SI-GISEL-NEXT:    s_mov_b32 s6, 0
965; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
966; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
967; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
968; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
969; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
970; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 17, v2
971; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
972; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
973; SI-GISEL-NEXT:    s_endpgm
974;
975; VI-SDAG-LABEL: v_test_i32_x_sub_neg17:
976; VI-SDAG:       ; %bb.0:
977; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
978; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
979; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
980; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
981; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
982; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
983; VI-SDAG-NEXT:    flat_load_dword v3, v[0:1]
984; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
985; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
986; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
987; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
988; VI-SDAG-NEXT:    v_add_u32_e32 v2, vcc, 17, v3
989; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
990; VI-SDAG-NEXT:    s_endpgm
991;
992; VI-GISEL-LABEL: v_test_i32_x_sub_neg17:
993; VI-GISEL:       ; %bb.0:
994; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
995; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
996; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
997; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
998; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
999; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
1000; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1001; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
1002; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
1003; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
1004; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
1005; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1006; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
1007; VI-GISEL-NEXT:    v_add_u32_e32 v2, vcc, 17, v3
1008; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
1009; VI-GISEL-NEXT:    s_endpgm
1010;
1011; GFX9-LABEL: v_test_i32_x_sub_neg17:
1012; GFX9:       ; %bb.0:
1013; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1014; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1015; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1016; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1017; GFX9-NEXT:    s_waitcnt vmcnt(0)
1018; GFX9-NEXT:    v_add_u32_e32 v1, 17, v1
1019; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1020; GFX9-NEXT:    s_endpgm
1021;
1022; GFX10-LABEL: v_test_i32_x_sub_neg17:
1023; GFX10:       ; %bb.0:
1024; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1025; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1026; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1027; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
1028; GFX10-NEXT:    s_waitcnt vmcnt(0)
1029; GFX10-NEXT:    v_add_nc_u32_e32 v1, 17, v1
1030; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
1031; GFX10-NEXT:    s_endpgm
1032;
1033; GFX11-LABEL: v_test_i32_x_sub_neg17:
1034; GFX11:       ; %bb.0:
1035; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1036; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1037; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1038; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1039; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1040; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
1041; GFX11-NEXT:    s_waitcnt vmcnt(0)
1042; GFX11-NEXT:    v_add_nc_u32_e32 v1, 17, v1
1043; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1044; GFX11-NEXT:    s_endpgm
1045  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1046  %tid.ext = sext i32 %tid to i64
1047  %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %tid.ext
1048  %gep.out = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
1049  %x = load i32, ptr addrspace(1) %gep
; Operation under test: variable minus the constant -17.
1050  %result = sub i32 %x, -17
1051  store i32 %result, ptr addrspace(1) %gep.out
1052  ret void
1053}
1054
; Constant on the left-hand side: loads x from %in at the workitem index and
; stores (-17) - x to %out. Every check prefix below expects a subtract whose
; first source operand is printed as the 32-bit literal 0xffffffef (i.e. -17).
1055define amdgpu_kernel void @v_test_i32_neg17_sub_x(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1056; SI-SDAG-LABEL: v_test_i32_neg17_sub_x:
1057; SI-SDAG:       ; %bb.0:
1058; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1059; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
1060; SI-SDAG-NEXT:    s_mov_b32 s6, 0
1061; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1062; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
1063; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
1064; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
1065; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1066; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
1067; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
1068; SI-SDAG-NEXT:    v_sub_i32_e32 v2, vcc, 0xffffffef, v2
1069; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1070; SI-SDAG-NEXT:    s_endpgm
1071;
1072; SI-GISEL-LABEL: v_test_i32_neg17_sub_x:
1073; SI-GISEL:       ; %bb.0:
1074; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1075; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1076; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
1077; SI-GISEL-NEXT:    s_mov_b32 s6, 0
1078; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
1079; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1080; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
1081; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1082; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
1083; SI-GISEL-NEXT:    v_sub_i32_e32 v2, vcc, 0xffffffef, v2
1084; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
1085; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1086; SI-GISEL-NEXT:    s_endpgm
1087;
1088; VI-SDAG-LABEL: v_test_i32_neg17_sub_x:
1089; VI-SDAG:       ; %bb.0:
1090; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1091; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1092; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
1093; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
1094; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1095; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1096; VI-SDAG-NEXT:    flat_load_dword v3, v[0:1]
1097; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
1098; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1099; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1100; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
1101; VI-SDAG-NEXT:    v_sub_u32_e32 v2, vcc, 0xffffffef, v3
1102; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
1103; VI-SDAG-NEXT:    s_endpgm
1104;
1105; VI-GISEL-LABEL: v_test_i32_neg17_sub_x:
1106; VI-GISEL:       ; %bb.0:
1107; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1108; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1109; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1110; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
1111; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
1112; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
1113; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1114; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
1115; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
1116; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
1117; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
1118; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1119; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
1120; VI-GISEL-NEXT:    v_sub_u32_e32 v2, vcc, 0xffffffef, v3
1121; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
1122; VI-GISEL-NEXT:    s_endpgm
1123;
1124; GFX9-LABEL: v_test_i32_neg17_sub_x:
1125; GFX9:       ; %bb.0:
1126; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1127; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1128; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1129; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1130; GFX9-NEXT:    s_waitcnt vmcnt(0)
1131; GFX9-NEXT:    v_sub_u32_e32 v1, 0xffffffef, v1
1132; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1133; GFX9-NEXT:    s_endpgm
1134;
1135; GFX10-LABEL: v_test_i32_neg17_sub_x:
1136; GFX10:       ; %bb.0:
1137; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1138; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1139; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1140; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
1141; GFX10-NEXT:    s_waitcnt vmcnt(0)
1142; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 0xffffffef, v1
1143; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
1144; GFX10-NEXT:    s_endpgm
1145;
1146; GFX11-LABEL: v_test_i32_neg17_sub_x:
1147; GFX11:       ; %bb.0:
1148; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1149; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1150; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1151; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1152; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1153; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
1154; GFX11-NEXT:    s_waitcnt vmcnt(0)
1155; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 0xffffffef, v1
1156; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1157; GFX11-NEXT:    s_endpgm
1158  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1159  %tid.ext = sext i32 %tid to i64
1160  %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %tid.ext
1161  %gep.out = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
1162  %x = load i32, ptr addrspace(1) %gep
; Operation under test: the constant -17 minus the variable.
1163  %result = sub i32 -17, %x
1164  store i32 %result, ptr addrspace(1) %gep.out
1165  ret void
1166}
1167
; Scalar case: %x is a kernel argument and x - 64 is consumed by an inline asm
; operand with the "s" constraint. Every target expects a single s_sub_i32 with
; the constant 64; the SelectionDAG and GlobalISel runs share the same check
; prefixes for this function.
1168define amdgpu_kernel void @s_test_i32_x_sub_64(i32 %x) #0 {
1169; SI-LABEL: s_test_i32_x_sub_64:
1170; SI:       ; %bb.0:
1171; SI-NEXT:    s_load_dword s0, s[4:5], 0x9
1172; SI-NEXT:    s_waitcnt lgkmcnt(0)
1173; SI-NEXT:    s_sub_i32 s0, s0, 64
1174; SI-NEXT:    ;;#ASMSTART
1175; SI-NEXT:    ; use s0
1176; SI-NEXT:    ;;#ASMEND
1177; SI-NEXT:    s_endpgm
1178;
1179; VI-LABEL: s_test_i32_x_sub_64:
1180; VI:       ; %bb.0:
1181; VI-NEXT:    s_load_dword s0, s[4:5], 0x24
1182; VI-NEXT:    s_waitcnt lgkmcnt(0)
1183; VI-NEXT:    s_sub_i32 s0, s0, 64
1184; VI-NEXT:    ;;#ASMSTART
1185; VI-NEXT:    ; use s0
1186; VI-NEXT:    ;;#ASMEND
1187; VI-NEXT:    s_endpgm
1188;
1189; GFX9-LABEL: s_test_i32_x_sub_64:
1190; GFX9:       ; %bb.0:
1191; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x24
1192; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1193; GFX9-NEXT:    s_sub_i32 s0, s0, 64
1194; GFX9-NEXT:    ;;#ASMSTART
1195; GFX9-NEXT:    ; use s0
1196; GFX9-NEXT:    ;;#ASMEND
1197; GFX9-NEXT:    s_endpgm
1198;
1199; GFX10-LABEL: s_test_i32_x_sub_64:
1200; GFX10:       ; %bb.0:
1201; GFX10-NEXT:    s_load_dword s0, s[4:5], 0x24
1202; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1203; GFX10-NEXT:    s_sub_i32 s0, s0, 64
1204; GFX10-NEXT:    ;;#ASMSTART
1205; GFX10-NEXT:    ; use s0
1206; GFX10-NEXT:    ;;#ASMEND
1207; GFX10-NEXT:    s_endpgm
1208;
1209; GFX11-LABEL: s_test_i32_x_sub_64:
1210; GFX11:       ; %bb.0:
1211; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x24
1212; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1213; GFX11-NEXT:    s_sub_i32 s0, s0, 64
1214; GFX11-NEXT:    ;;#ASMSTART
1215; GFX11-NEXT:    ; use s0
1216; GFX11-NEXT:    ;;#ASMEND
1217; GFX11-NEXT:    s_endpgm
1218  %result = sub i32 %x, 64
; The side-effecting asm keeps %result live so the subtract is not removed.
1219  call void asm sideeffect "; use $0", "s"(i32 %result)
1220  ret void
1221}
1222
; i16 variant of x - 64: loads a 16-bit x from %in at the workitem index and
; stores x - 64 to %out. The two instruction selectors are checked separately
; here. The -SDAG prefixes expect a subtract of 64 (v_subrev_i32, v_subrev_u16
; or v_sub_nc_u16 depending on target), while the -GISEL prefixes expect an add
; of the negated constant (0xffffffc0 in the SI run, 0xffc0 in the others).
1223define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1224; SI-SDAG-LABEL: v_test_i16_x_sub_64:
1225; SI-SDAG:       ; %bb.0:
1226; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1227; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
1228; SI-SDAG-NEXT:    s_mov_b32 s6, 0
1229; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1230; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
1231; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
1232; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
1233; SI-SDAG-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64
1234; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
1235; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
1236; SI-SDAG-NEXT:    v_subrev_i32_e32 v2, vcc, 64, v2
1237; SI-SDAG-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
1238; SI-SDAG-NEXT:    s_endpgm
1239;
1240; SI-GISEL-LABEL: v_test_i16_x_sub_64:
1241; SI-GISEL:       ; %bb.0:
1242; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1243; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1244; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
1245; SI-GISEL-NEXT:    s_mov_b32 s6, 0
1246; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
1247; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1248; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
1249; SI-GISEL-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64
1250; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
1251; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0xffffffc0, v2
1252; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
1253; SI-GISEL-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
1254; SI-GISEL-NEXT:    s_endpgm
1255;
1256; VI-SDAG-LABEL: v_test_i16_x_sub_64:
1257; VI-SDAG:       ; %bb.0:
1258; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1259; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
1260; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
1261; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
1262; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1263; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1264; VI-SDAG-NEXT:    flat_load_ushort v3, v[0:1]
1265; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
1266; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1267; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1268; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
1269; VI-SDAG-NEXT:    v_subrev_u16_e32 v2, 64, v3
1270; VI-SDAG-NEXT:    flat_store_short v[0:1], v2
1271; VI-SDAG-NEXT:    s_endpgm
1272;
1273; VI-GISEL-LABEL: v_test_i16_x_sub_64:
1274; VI-GISEL:       ; %bb.0:
1275; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1276; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
1277; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1278; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
1279; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
1280; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
1281; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1282; VI-GISEL-NEXT:    flat_load_ushort v3, v[0:1]
1283; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
1284; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
1285; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
1286; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1287; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
1288; VI-GISEL-NEXT:    v_add_u16_e32 v2, 0xffc0, v3
1289; VI-GISEL-NEXT:    flat_store_short v[0:1], v2
1290; VI-GISEL-NEXT:    s_endpgm
1291;
1292; GFX9-SDAG-LABEL: v_test_i16_x_sub_64:
1293; GFX9-SDAG:       ; %bb.0:
1294; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1295; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1296; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
1297; GFX9-SDAG-NEXT:    global_load_ushort v1, v0, s[2:3]
1298; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
1299; GFX9-SDAG-NEXT:    v_subrev_u16_e32 v1, 64, v1
1300; GFX9-SDAG-NEXT:    global_store_short v0, v1, s[0:1]
1301; GFX9-SDAG-NEXT:    s_endpgm
1302;
1303; GFX9-GISEL-LABEL: v_test_i16_x_sub_64:
1304; GFX9-GISEL:       ; %bb.0:
1305; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1306; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1307; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1308; GFX9-GISEL-NEXT:    global_load_ushort v1, v0, s[2:3]
1309; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
1310; GFX9-GISEL-NEXT:    v_add_u16_e32 v1, 0xffc0, v1
1311; GFX9-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
1312; GFX9-GISEL-NEXT:    s_endpgm
1313;
1314; GFX10-SDAG-LABEL: v_test_i16_x_sub_64:
1315; GFX10-SDAG:       ; %bb.0:
1316; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1317; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1318; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
1319; GFX10-SDAG-NEXT:    global_load_ushort v1, v0, s[2:3]
1320; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
1321; GFX10-SDAG-NEXT:    v_sub_nc_u16 v1, v1, 64
1322; GFX10-SDAG-NEXT:    global_store_short v0, v1, s[0:1]
1323; GFX10-SDAG-NEXT:    s_endpgm
1324;
1325; GFX10-GISEL-LABEL: v_test_i16_x_sub_64:
1326; GFX10-GISEL:       ; %bb.0:
1327; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1328; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1329; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1330; GFX10-GISEL-NEXT:    global_load_ushort v1, v0, s[2:3]
1331; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
1332; GFX10-GISEL-NEXT:    v_add_nc_u16 v1, 0xffc0, v1
1333; GFX10-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
1334; GFX10-GISEL-NEXT:    s_endpgm
1335;
1336; GFX11-SDAG-TRUE16-LABEL: v_test_i16_x_sub_64:
1337; GFX11-SDAG-TRUE16:       ; %bb.0:
1338; GFX11-SDAG-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1339; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1340; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1341; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
1342; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
1343; GFX11-SDAG-TRUE16-NEXT:    global_load_u16 v0, v1, s[2:3]
1344; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
1345; GFX11-SDAG-TRUE16-NEXT:    v_sub_nc_u16 v0.l, v0.l, 64
1346; GFX11-SDAG-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
1347; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
1348;
1349; GFX11-SDAG-FAKE16-LABEL: v_test_i16_x_sub_64:
1350; GFX11-SDAG-FAKE16:       ; %bb.0:
1351; GFX11-SDAG-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1352; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1353; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1354; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1355; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
1356; GFX11-SDAG-FAKE16-NEXT:    global_load_u16 v1, v0, s[2:3]
1357; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
1358; GFX11-SDAG-FAKE16-NEXT:    v_sub_nc_u16 v1, v1, 64
1359; GFX11-SDAG-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
1360; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
1361;
1362; GFX11-GISEL-TRUE16-LABEL: v_test_i16_x_sub_64:
1363; GFX11-GISEL-TRUE16:       ; %bb.0:
1364; GFX11-GISEL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1365; GFX11-GISEL-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1366; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1367; GFX11-GISEL-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1368; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
1369; GFX11-GISEL-TRUE16-NEXT:    global_load_u16 v1, v0, s[2:3]
1370; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0)
1371; GFX11-GISEL-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0xffc0, v1.l
1372; GFX11-GISEL-TRUE16-NEXT:    global_store_b16 v0, v1, s[0:1]
1373; GFX11-GISEL-TRUE16-NEXT:    s_endpgm
1374;
1375; GFX11-GISEL-FAKE16-LABEL: v_test_i16_x_sub_64:
1376; GFX11-GISEL-FAKE16:       ; %bb.0:
1377; GFX11-GISEL-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1378; GFX11-GISEL-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1379; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1380; GFX11-GISEL-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1381; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
1382; GFX11-GISEL-FAKE16-NEXT:    global_load_u16 v1, v0, s[2:3]
1383; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0)
1384; GFX11-GISEL-FAKE16-NEXT:    v_add_nc_u16 v1, 0xffc0, v1
1385; GFX11-GISEL-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
1386; GFX11-GISEL-FAKE16-NEXT:    s_endpgm
1387  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1388  %tid.ext = sext i32 %tid to i64
1389  %gep = getelementptr inbounds i16, ptr addrspace(1) %in, i64 %tid.ext
1390  %gep.out = getelementptr inbounds i16, ptr addrspace(1) %out, i64 %tid.ext
1391  %x = load i16, ptr addrspace(1) %gep
; Operation under test: 16-bit variable minus the constant 64.
1392  %result = sub i16 %x, 64
1393  store i16 %result, ptr addrspace(1) %gep.out
1394  ret void
1395}
1396
; Scalar i16 case with a widened result: load an i16 from %in at the workitem
; index, subtract 64, zero-extend the difference to i32 and store it to %out.
; In the expected output below, the SelectionDAG runs subtract the inline
; constant 64, while the GlobalISel runs add a negated literal instead (0xffc0,
; or 0xffffffc0 on SI). The SI, GFX10 and GFX11 output masks the result with
; 0xffff before the dword store; the VI and GFX9 output has no such mask.
1397define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1398; SI-SDAG-LABEL: v_test_i16_x_sub_64_zext_to_i32:
1399; SI-SDAG:       ; %bb.0:
1400; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1401; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
1402; SI-SDAG-NEXT:    s_mov_b32 s6, 0
1403; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
1404; SI-SDAG-NEXT:    v_mov_b32_e32 v2, 0
1405; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
1406; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
1407; SI-SDAG-NEXT:    buffer_load_ushort v3, v[1:2], s[4:7], 0 addr64
1408; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
1409; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
1410; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
1411; SI-SDAG-NEXT:    v_subrev_i32_e32 v0, vcc, 64, v3
1412; SI-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1413; SI-SDAG-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
1414; SI-SDAG-NEXT:    s_endpgm
1415;
1416; SI-GISEL-LABEL: v_test_i16_x_sub_64_zext_to_i32:
1417; SI-GISEL:       ; %bb.0:
1418; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1419; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
1420; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
1421; SI-GISEL-NEXT:    s_mov_b32 s6, 0
1422; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
1423; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1424; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
1425; SI-GISEL-NEXT:    buffer_load_ushort v3, v[1:2], s[4:7], 0 addr64
1426; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
1427; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
1428; SI-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 0xffffffc0, v3
1429; SI-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1430; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
1431; SI-GISEL-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
1432; SI-GISEL-NEXT:    s_endpgm
1433;
1434; VI-SDAG-LABEL: v_test_i16_x_sub_64_zext_to_i32:
1435; VI-SDAG:       ; %bb.0:
1436; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1437; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
1438; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1439; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
1440; VI-SDAG-NEXT:    v_mov_b32_e32 v2, s3
1441; VI-SDAG-NEXT:    v_add_u32_e32 v1, vcc, s2, v1
1442; VI-SDAG-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
1443; VI-SDAG-NEXT:    flat_load_ushort v2, v[1:2]
1444; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
1445; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1446; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1447; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
1448; VI-SDAG-NEXT:    v_subrev_u16_e32 v2, 64, v2
1449; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
1450; VI-SDAG-NEXT:    s_endpgm
1451;
1452; VI-GISEL-LABEL: v_test_i16_x_sub_64_zext_to_i32:
1453; VI-GISEL:       ; %bb.0:
1454; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1455; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 1, v0
1456; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1457; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s2
1458; VI-GISEL-NEXT:    v_mov_b32_e32 v2, s3
1459; VI-GISEL-NEXT:    v_add_u32_e32 v1, vcc, v1, v3
1460; VI-GISEL-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
1461; VI-GISEL-NEXT:    flat_load_ushort v2, v[1:2]
1462; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
1463; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
1464; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
1465; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v3
1466; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1467; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
1468; VI-GISEL-NEXT:    v_add_u16_e32 v2, 0xffc0, v2
1469; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
1470; VI-GISEL-NEXT:    s_endpgm
1471;
1472; GFX9-SDAG-LABEL: v_test_i16_x_sub_64_zext_to_i32:
1473; GFX9-SDAG:       ; %bb.0:
1474; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1475; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
1476; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1477; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
1478; GFX9-SDAG-NEXT:    global_load_ushort v1, v1, s[2:3]
1479; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
1480; GFX9-SDAG-NEXT:    v_subrev_u16_e32 v1, 64, v1
1481; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
1482; GFX9-SDAG-NEXT:    s_endpgm
1483;
1484; GFX9-GISEL-LABEL: v_test_i16_x_sub_64_zext_to_i32:
1485; GFX9-GISEL:       ; %bb.0:
1486; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1487; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
1488; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1489; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1490; GFX9-GISEL-NEXT:    global_load_ushort v1, v1, s[2:3]
1491; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
1492; GFX9-GISEL-NEXT:    v_add_u16_e32 v1, 0xffc0, v1
1493; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
1494; GFX9-GISEL-NEXT:    s_endpgm
1495;
1496; GFX10-SDAG-LABEL: v_test_i16_x_sub_64_zext_to_i32:
1497; GFX10-SDAG:       ; %bb.0:
1498; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1499; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
1500; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1501; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
1502; GFX10-SDAG-NEXT:    global_load_ushort v1, v1, s[2:3]
1503; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
1504; GFX10-SDAG-NEXT:    v_sub_nc_u16 v1, v1, 64
1505; GFX10-SDAG-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1506; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
1507; GFX10-SDAG-NEXT:    s_endpgm
1508;
1509; GFX10-GISEL-LABEL: v_test_i16_x_sub_64_zext_to_i32:
1510; GFX10-GISEL:       ; %bb.0:
1511; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1512; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
1513; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1514; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1515; GFX10-GISEL-NEXT:    global_load_ushort v1, v1, s[2:3]
1516; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
1517; GFX10-GISEL-NEXT:    v_add_nc_u16 v1, 0xffc0, v1
1518; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1519; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
1520; GFX10-GISEL-NEXT:    s_endpgm
1521;
1522; GFX11-SDAG-TRUE16-LABEL: v_test_i16_x_sub_64_zext_to_i32:
1523; GFX11-SDAG-TRUE16:       ; %bb.0:
1524; GFX11-SDAG-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1525; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
1526; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1527; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v1
1528; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
1529; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
1530; GFX11-SDAG-TRUE16-NEXT:    global_load_u16 v0, v0, s[2:3]
1531; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
1532; GFX11-SDAG-TRUE16-NEXT:    v_sub_nc_u16 v0.l, v0.l, 64
1533; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1534; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1535; GFX11-SDAG-TRUE16-NEXT:    global_store_b32 v1, v0, s[0:1]
1536; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
1537;
1538; GFX11-SDAG-FAKE16-LABEL: v_test_i16_x_sub_64_zext_to_i32:
1539; GFX11-SDAG-FAKE16:       ; %bb.0:
1540; GFX11-SDAG-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1541; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1542; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1543; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
1544; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1545; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
1546; GFX11-SDAG-FAKE16-NEXT:    global_load_u16 v1, v1, s[2:3]
1547; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
1548; GFX11-SDAG-FAKE16-NEXT:    v_sub_nc_u16 v1, v1, 64
1549; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1550; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1551; GFX11-SDAG-FAKE16-NEXT:    global_store_b32 v0, v1, s[0:1]
1552; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
1553;
1554; GFX11-GISEL-TRUE16-LABEL: v_test_i16_x_sub_64_zext_to_i32:
1555; GFX11-GISEL-TRUE16:       ; %bb.0:
1556; GFX11-GISEL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1557; GFX11-GISEL-TRUE16-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
1558; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1559; GFX11-GISEL-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v1
1560; GFX11-GISEL-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
1561; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
1562; GFX11-GISEL-TRUE16-NEXT:    global_load_u16 v0, v0, s[2:3]
1563; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0)
1564; GFX11-GISEL-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0xffc0, v0.l
1565; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1566; GFX11-GISEL-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1567; GFX11-GISEL-TRUE16-NEXT:    global_store_b32 v1, v0, s[0:1]
1568; GFX11-GISEL-TRUE16-NEXT:    s_endpgm
1569;
1570; GFX11-GISEL-FAKE16-LABEL: v_test_i16_x_sub_64_zext_to_i32:
1571; GFX11-GISEL-FAKE16:       ; %bb.0:
1572; GFX11-GISEL-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1573; GFX11-GISEL-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1574; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1575; GFX11-GISEL-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
1576; GFX11-GISEL-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1577; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
1578; GFX11-GISEL-FAKE16-NEXT:    global_load_u16 v1, v1, s[2:3]
1579; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0)
1580; GFX11-GISEL-FAKE16-NEXT:    v_add_nc_u16 v1, 0xffc0, v1
1581; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1582; GFX11-GISEL-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1583; GFX11-GISEL-FAKE16-NEXT:    global_store_b32 v0, v1, s[0:1]
1584; GFX11-GISEL-FAKE16-NEXT:    s_endpgm
1585  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1586  %tid.ext = sext i32 %tid to i64
  ; The input is indexed in i16 elements and the output in i32 elements, which
  ; is why the expected output shifts the workitem id by 1 and by 2.
1587  %gep = getelementptr inbounds i16, ptr addrspace(1) %in, i64 %tid.ext
1588  %gep.out = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
1589  %x = load i16, ptr addrspace(1) %gep
  ; The subtraction is done at i16 width; only the difference is widened.
1590  %result = sub i16 %x, 64
1591  %zext = zext i16 %result to i32
1592  store i32 %zext, ptr addrspace(1) %gep.out
1593  ret void
1594}
1595
; Two volatile i16 loads from the same input address each have 64 subtracted,
; and both differences are written with volatile stores to the same output
; address. The expected output below keeps two loads, two subtract/add
; instructions and two stores for every run line. The SelectionDAG runs
; subtract the inline constant 64; the GlobalISel runs add a negated literal
; (0xffc0, or 0xffffffc0 on SI).
; NOTE(review): despite the "multi_use" name, %result0 and %result1 each have a
; single use in this IR; only the constant 64 appears twice -- confirm that
; this is the intended meaning of the name.
1596define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1597; SI-SDAG-LABEL: v_test_i16_x_sub_64_multi_use:
1598; SI-SDAG:       ; %bb.0:
1599; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1600; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
1601; SI-SDAG-NEXT:    s_mov_b32 s6, 0
1602; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1603; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
1604; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
1605; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
1606; SI-SDAG-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
1607; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
1608; SI-SDAG-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc
1609; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
1610; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
1611; SI-SDAG-NEXT:    v_subrev_i32_e32 v2, vcc, 64, v2
1612; SI-SDAG-NEXT:    v_subrev_i32_e32 v3, vcc, 64, v3
1613; SI-SDAG-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
1614; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
1615; SI-SDAG-NEXT:    buffer_store_short v3, v[0:1], s[0:3], 0 addr64
1616; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
1617; SI-SDAG-NEXT:    s_endpgm
1618;
1619; SI-GISEL-LABEL: v_test_i16_x_sub_64_multi_use:
1620; SI-GISEL:       ; %bb.0:
1621; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1622; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1623; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
1624; SI-GISEL-NEXT:    s_mov_b32 s6, 0
1625; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
1626; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1627; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
1628; SI-GISEL-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
1629; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
1630; SI-GISEL-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc
1631; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
1632; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
1633; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0xffffffc0, v2
1634; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0xffffffc0, v3
1635; SI-GISEL-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
1636; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
1637; SI-GISEL-NEXT:    buffer_store_short v3, v[0:1], s[0:3], 0 addr64
1638; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
1639; SI-GISEL-NEXT:    s_endpgm
1640;
1641; VI-SDAG-LABEL: v_test_i16_x_sub_64_multi_use:
1642; VI-SDAG:       ; %bb.0:
1643; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1644; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
1645; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
1646; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
1647; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1648; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1649; VI-SDAG-NEXT:    flat_load_ushort v3, v[0:1] glc
1650; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
1651; VI-SDAG-NEXT:    flat_load_ushort v4, v[0:1] glc
1652; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
1653; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
1654; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1655; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1656; VI-SDAG-NEXT:    v_subrev_u16_e32 v2, 64, v3
1657; VI-SDAG-NEXT:    v_subrev_u16_e32 v3, 64, v4
1658; VI-SDAG-NEXT:    flat_store_short v[0:1], v2
1659; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
1660; VI-SDAG-NEXT:    flat_store_short v[0:1], v3
1661; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
1662; VI-SDAG-NEXT:    s_endpgm
1663;
1664; VI-GISEL-LABEL: v_test_i16_x_sub_64_multi_use:
1665; VI-GISEL:       ; %bb.0:
1666; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1667; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
1668; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1669; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
1670; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
1671; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
1672; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1673; VI-GISEL-NEXT:    flat_load_ushort v3, v[0:1] glc
1674; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
1675; VI-GISEL-NEXT:    flat_load_ushort v4, v[0:1] glc
1676; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
1677; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
1678; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
1679; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
1680; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1681; VI-GISEL-NEXT:    v_add_u16_e32 v2, 0xffc0, v3
1682; VI-GISEL-NEXT:    v_add_u16_e32 v3, 0xffc0, v4
1683; VI-GISEL-NEXT:    flat_store_short v[0:1], v2
1684; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
1685; VI-GISEL-NEXT:    flat_store_short v[0:1], v3
1686; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
1687; VI-GISEL-NEXT:    s_endpgm
1688;
1689; GFX9-SDAG-LABEL: v_test_i16_x_sub_64_multi_use:
1690; GFX9-SDAG:       ; %bb.0:
1691; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1692; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1693; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
1694; GFX9-SDAG-NEXT:    global_load_ushort v1, v0, s[2:3] glc
1695; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
1696; GFX9-SDAG-NEXT:    global_load_ushort v2, v0, s[2:3] glc
1697; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
1698; GFX9-SDAG-NEXT:    v_subrev_u16_e32 v1, 64, v1
1699; GFX9-SDAG-NEXT:    v_subrev_u16_e32 v2, 64, v2
1700; GFX9-SDAG-NEXT:    global_store_short v0, v1, s[0:1]
1701; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
1702; GFX9-SDAG-NEXT:    global_store_short v0, v2, s[0:1]
1703; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
1704; GFX9-SDAG-NEXT:    s_endpgm
1705;
1706; GFX9-GISEL-LABEL: v_test_i16_x_sub_64_multi_use:
1707; GFX9-GISEL:       ; %bb.0:
1708; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1709; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1710; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1711; GFX9-GISEL-NEXT:    global_load_ushort v1, v0, s[2:3] glc
1712; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
1713; GFX9-GISEL-NEXT:    global_load_ushort v2, v0, s[2:3] glc
1714; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
1715; GFX9-GISEL-NEXT:    v_add_u16_e32 v1, 0xffc0, v1
1716; GFX9-GISEL-NEXT:    v_add_u16_e32 v2, 0xffc0, v2
1717; GFX9-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
1718; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
1719; GFX9-GISEL-NEXT:    global_store_short v0, v2, s[0:1]
1720; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
1721; GFX9-GISEL-NEXT:    s_endpgm
1722;
1723; GFX10-SDAG-LABEL: v_test_i16_x_sub_64_multi_use:
1724; GFX10-SDAG:       ; %bb.0:
1725; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1726; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1727; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
1728; GFX10-SDAG-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
1729; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
1730; GFX10-SDAG-NEXT:    global_load_ushort v2, v0, s[2:3] glc dlc
1731; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
1732; GFX10-SDAG-NEXT:    v_sub_nc_u16 v1, v1, 64
1733; GFX10-SDAG-NEXT:    v_sub_nc_u16 v2, v2, 64
1734; GFX10-SDAG-NEXT:    global_store_short v0, v1, s[0:1]
1735; GFX10-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
1736; GFX10-SDAG-NEXT:    global_store_short v0, v2, s[0:1]
1737; GFX10-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
1738; GFX10-SDAG-NEXT:    s_endpgm
1739;
1740; GFX10-GISEL-LABEL: v_test_i16_x_sub_64_multi_use:
1741; GFX10-GISEL:       ; %bb.0:
1742; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1743; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1744; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1745; GFX10-GISEL-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
1746; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
1747; GFX10-GISEL-NEXT:    global_load_ushort v2, v0, s[2:3] glc dlc
1748; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
1749; GFX10-GISEL-NEXT:    v_add_nc_u16 v1, 0xffc0, v1
1750; GFX10-GISEL-NEXT:    v_add_nc_u16 v2, 0xffc0, v2
1751; GFX10-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
1752; GFX10-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
1753; GFX10-GISEL-NEXT:    global_store_short v0, v2, s[0:1]
1754; GFX10-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
1755; GFX10-GISEL-NEXT:    s_endpgm
1756;
1757; GFX11-SDAG-TRUE16-LABEL: v_test_i16_x_sub_64_multi_use:
1758; GFX11-SDAG-TRUE16:       ; %bb.0:
1759; GFX11-SDAG-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1760; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1761; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1762; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
1763; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
1764; GFX11-SDAG-TRUE16-NEXT:    global_load_u16 v0, v1, s[2:3] glc dlc
1765; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
1766; GFX11-SDAG-TRUE16-NEXT:    global_load_u16 v2, v1, s[2:3] glc dlc
1767; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
1768; GFX11-SDAG-TRUE16-NEXT:    v_sub_nc_u16 v0.l, v0.l, 64
1769; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v2.l
1770; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1771; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.l
1772; GFX11-SDAG-TRUE16-NEXT:    v_sub_nc_u16 v0.h, v0.h, 64
1773; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1774; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
1775; GFX11-SDAG-TRUE16-NEXT:    global_store_b16 v1, v2, s[0:1] dlc
1776; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
1777; GFX11-SDAG-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1] dlc
1778; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
1779; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
1780;
1781; GFX11-SDAG-FAKE16-LABEL: v_test_i16_x_sub_64_multi_use:
1782; GFX11-SDAG-FAKE16:       ; %bb.0:
1783; GFX11-SDAG-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1784; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1785; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1786; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1787; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
1788; GFX11-SDAG-FAKE16-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
1789; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
1790; GFX11-SDAG-FAKE16-NEXT:    global_load_u16 v2, v0, s[2:3] glc dlc
1791; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
1792; GFX11-SDAG-FAKE16-NEXT:    v_sub_nc_u16 v1, v1, 64
1793; GFX11-SDAG-FAKE16-NEXT:    v_sub_nc_u16 v2, v2, 64
1794; GFX11-SDAG-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1] dlc
1795; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
1796; GFX11-SDAG-FAKE16-NEXT:    global_store_b16 v0, v2, s[0:1] dlc
1797; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
1798; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
1799;
1800; GFX11-GISEL-TRUE16-LABEL: v_test_i16_x_sub_64_multi_use:
1801; GFX11-GISEL-TRUE16:       ; %bb.0:
1802; GFX11-GISEL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1803; GFX11-GISEL-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1804; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1805; GFX11-GISEL-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1806; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
1807; GFX11-GISEL-TRUE16-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
1808; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0)
1809; GFX11-GISEL-TRUE16-NEXT:    global_load_u16 v2, v0, s[2:3] glc dlc
1810; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0)
1811; GFX11-GISEL-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0xffc0, v1.l
1812; GFX11-GISEL-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0xffc0, v2.l
1813; GFX11-GISEL-TRUE16-NEXT:    global_store_b16 v0, v1, s[0:1] dlc
1814; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
1815; GFX11-GISEL-TRUE16-NEXT:    global_store_b16 v0, v2, s[0:1] dlc
1816; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
1817; GFX11-GISEL-TRUE16-NEXT:    s_endpgm
1818;
1819; GFX11-GISEL-FAKE16-LABEL: v_test_i16_x_sub_64_multi_use:
1820; GFX11-GISEL-FAKE16:       ; %bb.0:
1821; GFX11-GISEL-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1822; GFX11-GISEL-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1823; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1824; GFX11-GISEL-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1825; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
1826; GFX11-GISEL-FAKE16-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
1827; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0)
1828; GFX11-GISEL-FAKE16-NEXT:    global_load_u16 v2, v0, s[2:3] glc dlc
1829; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0)
1830; GFX11-GISEL-FAKE16-NEXT:    v_add_nc_u16 v1, 0xffc0, v1
1831; GFX11-GISEL-FAKE16-NEXT:    v_add_nc_u16 v2, 0xffc0, v2
1832; GFX11-GISEL-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1] dlc
1833; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
1834; GFX11-GISEL-FAKE16-NEXT:    global_store_b16 v0, v2, s[0:1] dlc
1835; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
1836; GFX11-GISEL-FAKE16-NEXT:    s_endpgm
1837  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1838  %tid.ext = sext i32 %tid to i64
1839  %gep = getelementptr inbounds i16, ptr addrspace(1) %in, i64 %tid.ext
1840  %gep.out = getelementptr inbounds i16, ptr addrspace(1) %out, i64 %tid.ext
  ; Both loads read the same address; they are volatile, and the expected
  ; output above contains two separate load instructions.
1841  %x = load volatile i16, ptr addrspace(1) %gep
1842  %y = load volatile i16, ptr addrspace(1) %gep
1843  %result0 = sub i16 %x, 64
1844  %result1 = sub i16 %y, 64
  ; Both volatile stores target the same output address.
1845  store volatile i16 %result0, ptr addrspace(1) %gep.out
1846  store volatile i16 %result1, ptr addrspace(1) %gep.out
1847  ret void
1848}
1849
; Packed case: load a <2 x i16> at the workitem index, subtract the splat
; constant <64, 64> and store the result.
; In the expected output below, GFX9, GFX10 and GFX11 each use one check prefix
; shared by both instruction selectors, with a single v_pk_sub_i16 taking the
; inline constant 64 (op_sel_hi:[1,0]). The SI and VI output handles the low
; and high halves with separate instructions, and differs between the
; SelectionDAG and GlobalISel runs.
1850define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1851; SI-SDAG-LABEL: v_test_v2i16_x_sub_64_64:
1852; SI-SDAG:       ; %bb.0:
1853; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1854; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
1855; SI-SDAG-NEXT:    s_mov_b32 s6, 0
1856; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1857; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
1858; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
1859; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
1860; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1861; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
1862; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
1863; SI-SDAG-NEXT:    v_subrev_i32_e32 v3, vcc, 64, v2
1864; SI-SDAG-NEXT:    s_mov_b32 s4, 0xffff0000
1865; SI-SDAG-NEXT:    v_bfi_b32 v2, s4, v2, v3
1866; SI-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 0xffc00000, v2
1867; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1868; SI-SDAG-NEXT:    s_endpgm
1869;
1870; SI-GISEL-LABEL: v_test_v2i16_x_sub_64_64:
1871; SI-GISEL:       ; %bb.0:
1872; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1873; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1874; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
1875; SI-GISEL-NEXT:    s_mov_b32 s6, 0
1876; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
1877; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1878; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
1879; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1880; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
1881; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
1882; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0xffffffc0, v2
1883; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0xffffffc0, v3
1884; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
1885; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
1886; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
1887; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
1888; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
1889; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1890; SI-GISEL-NEXT:    s_endpgm
1891;
1892; VI-SDAG-LABEL: v_test_v2i16_x_sub_64_64:
1893; VI-SDAG:       ; %bb.0:
1894; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1895; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1896; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 64
1897; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
1898; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
1899; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1900; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1901; VI-SDAG-NEXT:    flat_load_dword v3, v[0:1]
1902; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
1903; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1904; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1905; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
1906; VI-SDAG-NEXT:    v_sub_u16_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1907; VI-SDAG-NEXT:    v_subrev_u16_e32 v3, 64, v3
1908; VI-SDAG-NEXT:    v_or_b32_e32 v2, v3, v2
1909; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
1910; VI-SDAG-NEXT:    s_endpgm
1911;
1912; VI-GISEL-LABEL: v_test_v2i16_x_sub_64_64:
1913; VI-GISEL:       ; %bb.0:
1914; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1915; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1916; VI-GISEL-NEXT:    v_not_b32_e32 v4, 63
1917; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1918; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
1919; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
1920; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
1921; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1922; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
1923; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
1924; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
1925; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
1926; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1927; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
1928; VI-GISEL-NEXT:    v_add_u16_e32 v2, 0xffc0, v3
1929; VI-GISEL-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1930; VI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
1931; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
1932; VI-GISEL-NEXT:    s_endpgm
1933;
1934; GFX9-LABEL: v_test_v2i16_x_sub_64_64:
1935; GFX9:       ; %bb.0:
1936; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1937; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1938; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1939; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1940; GFX9-NEXT:    s_waitcnt vmcnt(0)
1941; GFX9-NEXT:    v_pk_sub_i16 v1, v1, 64 op_sel_hi:[1,0]
1942; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1943; GFX9-NEXT:    s_endpgm
1944;
1945; GFX10-LABEL: v_test_v2i16_x_sub_64_64:
1946; GFX10:       ; %bb.0:
1947; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1948; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1949; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1950; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
1951; GFX10-NEXT:    s_waitcnt vmcnt(0)
1952; GFX10-NEXT:    v_pk_sub_i16 v1, v1, 64 op_sel_hi:[1,0]
1953; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
1954; GFX10-NEXT:    s_endpgm
1955;
1956; GFX11-LABEL: v_test_v2i16_x_sub_64_64:
1957; GFX11:       ; %bb.0:
1958; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1959; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1960; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1961; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1962; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1963; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
1964; GFX11-NEXT:    s_waitcnt vmcnt(0)
1965; GFX11-NEXT:    v_pk_sub_i16 v1, v1, 64 op_sel_hi:[1,0]
1966; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1967; GFX11-NEXT:    s_endpgm
1968  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1969  %tid.ext = sext i32 %tid to i64
  ; Input and output are both indexed in <2 x i16> elements.
1970  %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
1971  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
1972  %x = load <2 x i16>, ptr addrspace(1) %gep
  ; The same constant 64 is subtracted from both lanes.
1973  %result = sub <2 x i16> %x, <i16 64, i16 64>
1974  store <2 x i16> %result, ptr addrspace(1) %gep.out
1975  ret void
1976}
1977
1978define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1979; SI-SDAG-LABEL: v_test_v2i16_x_sub_7_64:
1980; SI-SDAG:       ; %bb.0:
1981; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1982; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
1983; SI-SDAG-NEXT:    s_mov_b32 s6, 0
1984; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1985; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
1986; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
1987; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
1988; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1989; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
1990; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
1991; SI-SDAG-NEXT:    v_add_i32_e32 v3, vcc, -7, v2
1992; SI-SDAG-NEXT:    s_mov_b32 s4, 0xffff0000
1993; SI-SDAG-NEXT:    v_bfi_b32 v2, s4, v2, v3
1994; SI-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 0xffc00000, v2
1995; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1996; SI-SDAG-NEXT:    s_endpgm
1997;
1998; SI-GISEL-LABEL: v_test_v2i16_x_sub_7_64:
1999; SI-GISEL:       ; %bb.0:
2000; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2001; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2002; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
2003; SI-GISEL-NEXT:    s_mov_b32 s6, 0
2004; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
2005; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
2006; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
2007; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2008; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
2009; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
2010; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, -7, v2
2011; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0xffffffc0, v3
2012; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
2013; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
2014; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
2015; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
2016; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
2017; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2018; SI-GISEL-NEXT:    s_endpgm
2019;
2020; VI-SDAG-LABEL: v_test_v2i16_x_sub_7_64:
2021; VI-SDAG:       ; %bb.0:
2022; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2023; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2024; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 64
2025; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
2026; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
2027; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2028; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2029; VI-SDAG-NEXT:    flat_load_dword v3, v[0:1]
2030; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
2031; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2032; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2033; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
2034; VI-SDAG-NEXT:    v_add_u16_e32 v2, -7, v3
2035; VI-SDAG-NEXT:    v_sub_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2036; VI-SDAG-NEXT:    v_or_b32_e32 v2, v2, v3
2037; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
2038; VI-SDAG-NEXT:    s_endpgm
2039;
2040; VI-GISEL-LABEL: v_test_v2i16_x_sub_7_64:
2041; VI-GISEL:       ; %bb.0:
2042; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2043; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2044; VI-GISEL-NEXT:    v_not_b32_e32 v4, 63
2045; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
2046; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
2047; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
2048; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
2049; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2050; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
2051; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
2052; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
2053; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
2054; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2055; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
2056; VI-GISEL-NEXT:    v_add_u16_e32 v2, -7, v3
2057; VI-GISEL-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2058; VI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
2059; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
2060; VI-GISEL-NEXT:    s_endpgm
2061;
2062; GFX9-SDAG-LABEL: v_test_v2i16_x_sub_7_64:
2063; GFX9-SDAG:       ; %bb.0:
2064; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2065; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2066; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
2067; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
2068; GFX9-SDAG-NEXT:    s_mov_b32 s2, 0x400007
2069; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
2070; GFX9-SDAG-NEXT:    v_pk_sub_i16 v1, v1, s2
2071; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
2072; GFX9-SDAG-NEXT:    s_endpgm
2073;
2074; GFX9-GISEL-LABEL: v_test_v2i16_x_sub_7_64:
2075; GFX9-GISEL:       ; %bb.0:
2076; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2077; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2078; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0x400007
2079; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
2080; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
2081; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
2082; GFX9-GISEL-NEXT:    v_pk_sub_i16 v1, v1, v2
2083; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
2084; GFX9-GISEL-NEXT:    s_endpgm
2085;
2086; GFX10-LABEL: v_test_v2i16_x_sub_7_64:
2087; GFX10:       ; %bb.0:
2088; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2089; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2090; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2091; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
2092; GFX10-NEXT:    s_waitcnt vmcnt(0)
2093; GFX10-NEXT:    v_pk_sub_i16 v1, v1, 0x400007
2094; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
2095; GFX10-NEXT:    s_endpgm
2096;
2097; GFX11-LABEL: v_test_v2i16_x_sub_7_64:
2098; GFX11:       ; %bb.0:
2099; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2100; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2101; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2102; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2103; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2104; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
2105; GFX11-NEXT:    s_waitcnt vmcnt(0)
2106; GFX11-NEXT:    v_pk_sub_i16 v1, v1, 0x400007
2107; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
2108; GFX11-NEXT:    s_endpgm
2109  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2110  %tid.ext = sext i32 %tid to i64
2111  %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
2112  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
2113  %x = load <2 x i16>, ptr addrspace(1) %gep
2114  %result = sub <2 x i16> %x, <i16 7, i16 64>
2115  store <2 x i16> %result, ptr addrspace(1) %gep.out
2116  ret void
2117}
2118
; Subtracts the non-splat constant <i16 64, i16 123> from a <2 x i16> value.
; Each work-item (indexed by workitem.id.x) loads one vector from %in and
; stores the result to the same index of %out.
; The GFX9, GFX10 and GFX11 checks expect one v_pk_sub_i16 using the packed
; constant 0x7b0040 (123 in the high half, 64 in the low half); the SI and VI
; checks expect the two halves to be adjusted separately and recombined.
2119define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
2120; SI-SDAG-LABEL: v_test_v2i16_x_sub_64_123:
2121; SI-SDAG:       ; %bb.0:
2122; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2123; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
2124; SI-SDAG-NEXT:    s_mov_b32 s6, 0
2125; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2126; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
2127; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
2128; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
2129; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2130; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
2131; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
2132; SI-SDAG-NEXT:    v_subrev_i32_e32 v3, vcc, 64, v2
2133; SI-SDAG-NEXT:    s_mov_b32 s4, 0xffff0000
2134; SI-SDAG-NEXT:    v_bfi_b32 v2, s4, v2, v3
2135; SI-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 0xff850000, v2
2136; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2137; SI-SDAG-NEXT:    s_endpgm
2138;
2139; SI-GISEL-LABEL: v_test_v2i16_x_sub_64_123:
2140; SI-GISEL:       ; %bb.0:
2141; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2142; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2143; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
2144; SI-GISEL-NEXT:    s_mov_b32 s6, 0
2145; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
2146; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
2147; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
2148; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2149; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
2150; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
2151; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0xffffffc0, v2
2152; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0xffffff85, v3
2153; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
2154; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
2155; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
2156; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
2157; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
2158; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2159; SI-GISEL-NEXT:    s_endpgm
2160;
2161; VI-SDAG-LABEL: v_test_v2i16_x_sub_64_123:
2162; VI-SDAG:       ; %bb.0:
2163; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2164; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2165; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff85
2166; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
2167; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
2168; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2169; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2170; VI-SDAG-NEXT:    flat_load_dword v3, v[0:1]
2171; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
2172; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2173; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2174; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
2175; VI-SDAG-NEXT:    v_add_u16_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2176; VI-SDAG-NEXT:    v_subrev_u16_e32 v3, 64, v3
2177; VI-SDAG-NEXT:    v_or_b32_e32 v2, v3, v2
2178; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
2179; VI-SDAG-NEXT:    s_endpgm
2180;
2181; VI-GISEL-LABEL: v_test_v2i16_x_sub_64_123:
2182; VI-GISEL:       ; %bb.0:
2183; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2184; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2185; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff85
2186; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
2187; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
2188; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
2189; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
2190; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2191; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
2192; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
2193; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
2194; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
2195; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2196; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
2197; VI-GISEL-NEXT:    v_add_u16_e32 v2, 0xffc0, v3
2198; VI-GISEL-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2199; VI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
2200; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
2201; VI-GISEL-NEXT:    s_endpgm
2202;
2203; GFX9-SDAG-LABEL: v_test_v2i16_x_sub_64_123:
2204; GFX9-SDAG:       ; %bb.0:
2205; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2206; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2207; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
2208; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
2209; GFX9-SDAG-NEXT:    s_mov_b32 s2, 0x7b0040
2210; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
2211; GFX9-SDAG-NEXT:    v_pk_sub_i16 v1, v1, s2
2212; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
2213; GFX9-SDAG-NEXT:    s_endpgm
2214;
2215; GFX9-GISEL-LABEL: v_test_v2i16_x_sub_64_123:
2216; GFX9-GISEL:       ; %bb.0:
2217; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2218; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2219; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7b0040
2220; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
2221; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
2222; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
2223; GFX9-GISEL-NEXT:    v_pk_sub_i16 v1, v1, v2
2224; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
2225; GFX9-GISEL-NEXT:    s_endpgm
2226;
2227; GFX10-LABEL: v_test_v2i16_x_sub_64_123:
2228; GFX10:       ; %bb.0:
2229; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2230; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2231; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2232; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
2233; GFX10-NEXT:    s_waitcnt vmcnt(0)
2234; GFX10-NEXT:    v_pk_sub_i16 v1, v1, 0x7b0040
2235; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
2236; GFX10-NEXT:    s_endpgm
2237;
2238; GFX11-LABEL: v_test_v2i16_x_sub_64_123:
2239; GFX11:       ; %bb.0:
2240; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2241; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2242; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2243; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2244; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2245; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
2246; GFX11-NEXT:    s_waitcnt vmcnt(0)
2247; GFX11-NEXT:    v_pk_sub_i16 v1, v1, 0x7b0040
2248; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
2249; GFX11-NEXT:    s_endpgm
  ; Use the work-item id as the element index into both %in and %out.
2250  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2251  %tid.ext = sext i32 %tid to i64
2252  %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
2253  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
2254  %x = load <2 x i16>, ptr addrspace(1) %gep
  ; Operation under test - lane 0 subtracts 64, lane 1 subtracts 123.
2255  %result = sub <2 x i16> %x, <i16 64, i16 123>
2256  store <2 x i16> %result, ptr addrspace(1) %gep.out
2257  ret void
2258}
2259
2260; Can fold 0 and inline immediate in other half.
; Subtracts <i16 7, i16 0>, so only the low lane of the loaded <2 x i16>
; changes. The GFX9, GFX10 and GFX11 checks expect v_pk_sub_i16 with the
; constant 7 as a direct operand; the SI and VI checks expect the low half to
; be updated with an add of -7 while the high half of the loaded dword is kept.
2261define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
2262; SI-SDAG-LABEL: v_test_v2i16_x_sub_7_0:
2263; SI-SDAG:       ; %bb.0:
2264; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2265; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
2266; SI-SDAG-NEXT:    s_mov_b32 s6, 0
2267; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2268; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
2269; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
2270; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
2271; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2272; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
2273; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
2274; SI-SDAG-NEXT:    v_add_i32_e32 v3, vcc, -7, v2
2275; SI-SDAG-NEXT:    s_mov_b32 s4, 0xffff
2276; SI-SDAG-NEXT:    v_bfi_b32 v2, s4, v3, v2
2277; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2278; SI-SDAG-NEXT:    s_endpgm
2279;
2280; SI-GISEL-LABEL: v_test_v2i16_x_sub_7_0:
2281; SI-GISEL:       ; %bb.0:
2282; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2283; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2284; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
2285; SI-GISEL-NEXT:    s_mov_b32 s6, 0
2286; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
2287; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
2288; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
2289; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2290; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
2291; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
2292; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, -7, v2
2293; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
2294; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
2295; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
2296; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
2297; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2298; SI-GISEL-NEXT:    s_endpgm
2299;
2300; VI-SDAG-LABEL: v_test_v2i16_x_sub_7_0:
2301; VI-SDAG:       ; %bb.0:
2302; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2303; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2304; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
2305; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
2306; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2307; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2308; VI-SDAG-NEXT:    flat_load_dword v3, v[0:1]
2309; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
2310; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2311; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2312; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
2313; VI-SDAG-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
2314; VI-SDAG-NEXT:    v_add_u16_e32 v3, -7, v3
2315; VI-SDAG-NEXT:    v_or_b32_e32 v2, v3, v2
2316; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
2317; VI-SDAG-NEXT:    s_endpgm
2318;
2319; VI-GISEL-LABEL: v_test_v2i16_x_sub_7_0:
2320; VI-GISEL:       ; %bb.0:
2321; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2322; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2323; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
2324; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
2325; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
2326; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
2327; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2328; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
2329; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
2330; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
2331; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
2332; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2333; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
2334; VI-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
2335; VI-GISEL-NEXT:    v_add_u16_e32 v3, -7, v3
2336; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
2337; VI-GISEL-NEXT:    v_or_b32_e32 v2, v3, v2
2338; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
2339; VI-GISEL-NEXT:    s_endpgm
2340;
2341; GFX9-LABEL: v_test_v2i16_x_sub_7_0:
2342; GFX9:       ; %bb.0:
2343; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2344; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2345; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2346; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
2347; GFX9-NEXT:    s_waitcnt vmcnt(0)
2348; GFX9-NEXT:    v_pk_sub_i16 v1, v1, 7
2349; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
2350; GFX9-NEXT:    s_endpgm
2351;
2352; GFX10-LABEL: v_test_v2i16_x_sub_7_0:
2353; GFX10:       ; %bb.0:
2354; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2355; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2356; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2357; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
2358; GFX10-NEXT:    s_waitcnt vmcnt(0)
2359; GFX10-NEXT:    v_pk_sub_i16 v1, v1, 7
2360; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
2361; GFX10-NEXT:    s_endpgm
2362;
2363; GFX11-LABEL: v_test_v2i16_x_sub_7_0:
2364; GFX11:       ; %bb.0:
2365; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2366; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2367; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2368; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2369; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2370; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
2371; GFX11-NEXT:    s_waitcnt vmcnt(0)
2372; GFX11-NEXT:    v_pk_sub_i16 v1, v1, 7
2373; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
2374; GFX11-NEXT:    s_endpgm
  ; Use the work-item id as the element index into both %in and %out.
2375  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2376  %tid.ext = sext i32 %tid to i64
2377  %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
2378  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
2379  %x = load <2 x i16>, ptr addrspace(1) %gep
  ; Operation under test - lane 0 subtracts 7, lane 1 subtracts 0.
2380  %result = sub <2 x i16> %x, <i16 7, i16 0>
2381  store <2 x i16> %result, ptr addrspace(1) %gep.out
2382  ret void
2383}
2384
2385; Can fold 0 and inline immediate in other half.
; Subtracts <i16 0, i16 16>, so only the high lane of the loaded <2 x i16>
; changes. The GFX9, GFX10 and GFX11 checks expect v_pk_sub_i16 with the
; constant 16 plus op_sel/op_sel_hi modifiers instead of a 32-bit literal.
; The SI and VI checks expect an add of -16 applied to the high half; in the
; SI SelectionDAG checks this is a single 32-bit add of 0xfff00000.
2386define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
2387; SI-SDAG-LABEL: v_test_v2i16_x_sub_0_16:
2388; SI-SDAG:       ; %bb.0:
2389; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2390; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
2391; SI-SDAG-NEXT:    s_mov_b32 s6, 0
2392; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2393; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
2394; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
2395; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
2396; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2397; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
2398; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
2399; SI-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 0xfff00000, v2
2400; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2401; SI-SDAG-NEXT:    s_endpgm
2402;
2403; SI-GISEL-LABEL: v_test_v2i16_x_sub_0_16:
2404; SI-GISEL:       ; %bb.0:
2405; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2406; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2407; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
2408; SI-GISEL-NEXT:    s_mov_b32 s6, 0
2409; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
2410; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
2411; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
2412; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2413; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
2414; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
2415; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, -16, v3
2416; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
2417; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
2418; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
2419; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
2420; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
2421; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2422; SI-GISEL-NEXT:    s_endpgm
2423;
2424; VI-SDAG-LABEL: v_test_v2i16_x_sub_0_16:
2425; VI-SDAG:       ; %bb.0:
2426; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2427; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2428; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
2429; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
2430; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2431; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2432; VI-SDAG-NEXT:    flat_load_dword v3, v[0:1]
2433; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2434; VI-SDAG-NEXT:    v_mov_b32_e32 v2, -16
2435; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
2436; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2437; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
2438; VI-SDAG-NEXT:    v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2439; VI-SDAG-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2440; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
2441; VI-SDAG-NEXT:    s_endpgm
2442;
2443; VI-GISEL-LABEL: v_test_v2i16_x_sub_0_16:
2444; VI-GISEL:       ; %bb.0:
2445; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2446; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2447; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
2448; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
2449; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
2450; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
2451; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2452; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
2453; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
2454; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
2455; VI-GISEL-NEXT:    v_mov_b32_e32 v2, -16
2456; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
2457; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2458; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
2459; VI-GISEL-NEXT:    v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2460; VI-GISEL-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2461; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
2462; VI-GISEL-NEXT:    s_endpgm
2463;
2464; GFX9-LABEL: v_test_v2i16_x_sub_0_16:
2465; GFX9:       ; %bb.0:
2466; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2467; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2468; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2469; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
2470; GFX9-NEXT:    s_waitcnt vmcnt(0)
2471; GFX9-NEXT:    v_pk_sub_i16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0]
2472; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
2473; GFX9-NEXT:    s_endpgm
2474;
2475; GFX10-LABEL: v_test_v2i16_x_sub_0_16:
2476; GFX10:       ; %bb.0:
2477; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2478; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2479; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2480; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
2481; GFX10-NEXT:    s_waitcnt vmcnt(0)
2482; GFX10-NEXT:    v_pk_sub_i16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0]
2483; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
2484; GFX10-NEXT:    s_endpgm
2485;
2486; GFX11-LABEL: v_test_v2i16_x_sub_0_16:
2487; GFX11:       ; %bb.0:
2488; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2489; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2490; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2491; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2492; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2493; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
2494; GFX11-NEXT:    s_waitcnt vmcnt(0)
2495; GFX11-NEXT:    v_pk_sub_i16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0]
2496; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
2497; GFX11-NEXT:    s_endpgm
  ; Use the work-item id as the element index into both %in and %out.
2498  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2499  %tid.ext = sext i32 %tid to i64
2500  %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
2501  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
2502  %x = load <2 x i16>, ptr addrspace(1) %gep
  ; Operation under test - lane 0 subtracts 0, lane 1 subtracts 16.
2503  %result = sub <2 x i16> %x, <i16 0, i16 16>
2504  store <2 x i16> %result, ptr addrspace(1) %gep.out
2505  ret void
2506}
2507
; Subtracts <i16 0, i16 -15360>, so only the high lane of the loaded
; <2 x i16> changes. The negated high-lane constant is 0x3c00, which is what
; the SI and VI checks add to the high half (presumably the source of the
; "1_0" in the name, since 0x3c00 is the fp16 bit pattern of 1.0).
; The packed v_pk_sub_i16 checks keep the subtract form with the 32-bit value
; 0xc4000000; the GFX9 checks build it by bit-reversing 35, while the GFX10
; and GFX11 checks use it directly as a literal operand.
2508define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
2509; SI-SDAG-LABEL: v_test_v2i16_x_sub_0_1_0:
2510; SI-SDAG:       ; %bb.0:
2511; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2512; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
2513; SI-SDAG-NEXT:    s_mov_b32 s6, 0
2514; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2515; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
2516; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
2517; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
2518; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2519; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
2520; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
2521; SI-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 0x3c000000, v2
2522; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2523; SI-SDAG-NEXT:    s_endpgm
2524;
2525; SI-GISEL-LABEL: v_test_v2i16_x_sub_0_1_0:
2526; SI-GISEL:       ; %bb.0:
2527; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2528; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2529; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
2530; SI-GISEL-NEXT:    s_mov_b32 s6, 0
2531; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
2532; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
2533; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
2534; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2535; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
2536; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
2537; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0x3c00, v3
2538; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
2539; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
2540; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
2541; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
2542; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
2543; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2544; SI-GISEL-NEXT:    s_endpgm
2545;
2546; VI-SDAG-LABEL: v_test_v2i16_x_sub_0_1_0:
2547; VI-SDAG:       ; %bb.0:
2548; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2549; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2550; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
2551; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
2552; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2553; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2554; VI-SDAG-NEXT:    flat_load_dword v3, v[0:1]
2555; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2556; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x3c00
2557; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
2558; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2559; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
2560; VI-SDAG-NEXT:    v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2561; VI-SDAG-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2562; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
2563; VI-SDAG-NEXT:    s_endpgm
2564;
2565; VI-GISEL-LABEL: v_test_v2i16_x_sub_0_1_0:
2566; VI-GISEL:       ; %bb.0:
2567; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2568; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2569; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
2570; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
2571; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
2572; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
2573; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2574; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
2575; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
2576; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
2577; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3c00
2578; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
2579; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2580; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
2581; VI-GISEL-NEXT:    v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2582; VI-GISEL-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2583; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
2584; VI-GISEL-NEXT:    s_endpgm
2585;
2586; GFX9-SDAG-LABEL: v_test_v2i16_x_sub_0_1_0:
2587; GFX9-SDAG:       ; %bb.0:
2588; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2589; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2590; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
2591; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
2592; GFX9-SDAG-NEXT:    s_brev_b32 s2, 35
2593; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
2594; GFX9-SDAG-NEXT:    v_pk_sub_i16 v1, v1, s2
2595; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
2596; GFX9-SDAG-NEXT:    s_endpgm
2597;
2598; GFX9-GISEL-LABEL: v_test_v2i16_x_sub_0_1_0:
2599; GFX9-GISEL:       ; %bb.0:
2600; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2601; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2602; GFX9-GISEL-NEXT:    v_bfrev_b32_e32 v2, 35
2603; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
2604; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
2605; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
2606; GFX9-GISEL-NEXT:    v_pk_sub_i16 v1, v1, v2
2607; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
2608; GFX9-GISEL-NEXT:    s_endpgm
2609;
2610; GFX10-LABEL: v_test_v2i16_x_sub_0_1_0:
2611; GFX10:       ; %bb.0:
2612; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2613; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2614; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2615; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
2616; GFX10-NEXT:    s_waitcnt vmcnt(0)
2617; GFX10-NEXT:    v_pk_sub_i16 v1, v1, 0xc4000000
2618; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
2619; GFX10-NEXT:    s_endpgm
2620;
2621; GFX11-LABEL: v_test_v2i16_x_sub_0_1_0:
2622; GFX11:       ; %bb.0:
2623; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2624; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2625; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2626; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2627; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2628; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
2629; GFX11-NEXT:    s_waitcnt vmcnt(0)
2630; GFX11-NEXT:    v_pk_sub_i16 v1, v1, 0xc4000000
2631; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
2632; GFX11-NEXT:    s_endpgm
  ; Use the work-item id as the element index into both %in and %out.
2633  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2634  %tid.ext = sext i32 %tid to i64
2635  %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
2636  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
2637  %x = load <2 x i16>, ptr addrspace(1) %gep
  ; Operation under test - lane 0 subtracts 0, lane 1 subtracts -15360
  ; (i16 bit pattern 0xc400).
2638  %result = sub <2 x i16> %x, <i16 0, i16 -15360>
2639  store <2 x i16> %result, ptr addrspace(1) %gep.out
2640  ret void
2641}
2642
2643define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
2644; SI-SDAG-LABEL: v_test_v2i16_x_sub_0_neg1_0:
2645; SI-SDAG:       ; %bb.0:
2646; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2647; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
2648; SI-SDAG-NEXT:    s_mov_b32 s6, 0
2649; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2650; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
2651; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
2652; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
2653; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2654; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
2655; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
2656; SI-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 0xbc000000, v2
2657; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2658; SI-SDAG-NEXT:    s_endpgm
2659;
2660; SI-GISEL-LABEL: v_test_v2i16_x_sub_0_neg1_0:
2661; SI-GISEL:       ; %bb.0:
2662; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2663; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2664; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
2665; SI-GISEL-NEXT:    s_mov_b32 s6, 0
2666; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
2667; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
2668; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
2669; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2670; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
2671; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
2672; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0xffffbc00, v3
2673; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
2674; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
2675; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
2676; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
2677; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
2678; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2679; SI-GISEL-NEXT:    s_endpgm
2680;
2681; VI-SDAG-LABEL: v_test_v2i16_x_sub_0_neg1_0:
2682; VI-SDAG:       ; %bb.0:
2683; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2684; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2685; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
2686; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
2687; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2688; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2689; VI-SDAG-NEXT:    flat_load_dword v3, v[0:1]
2690; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2691; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0xffffbc00
2692; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
2693; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2694; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
2695; VI-SDAG-NEXT:    v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2696; VI-SDAG-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2697; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
2698; VI-SDAG-NEXT:    s_endpgm
2699;
2700; VI-GISEL-LABEL: v_test_v2i16_x_sub_0_neg1_0:
2701; VI-GISEL:       ; %bb.0:
2702; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2703; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2704; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
2705; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
2706; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
2707; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
2708; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2709; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
2710; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
2711; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
2712; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0xffffbc00
2713; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
2714; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2715; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
2716; VI-GISEL-NEXT:    v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2717; VI-GISEL-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2718; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
2719; VI-GISEL-NEXT:    s_endpgm
2720;
2721; GFX9-SDAG-LABEL: v_test_v2i16_x_sub_0_neg1_0:
2722; GFX9-SDAG:       ; %bb.0:
2723; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2724; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2725; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
2726; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
2727; GFX9-SDAG-NEXT:    s_brev_b32 s2, 34
2728; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
2729; GFX9-SDAG-NEXT:    v_pk_sub_i16 v1, v1, s2
2730; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
2731; GFX9-SDAG-NEXT:    s_endpgm
2732;
2733; GFX9-GISEL-LABEL: v_test_v2i16_x_sub_0_neg1_0:
2734; GFX9-GISEL:       ; %bb.0:
2735; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2736; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2737; GFX9-GISEL-NEXT:    v_bfrev_b32_e32 v2, 34
2738; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
2739; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
2740; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
2741; GFX9-GISEL-NEXT:    v_pk_sub_i16 v1, v1, v2
2742; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
2743; GFX9-GISEL-NEXT:    s_endpgm
2744;
2745; GFX10-LABEL: v_test_v2i16_x_sub_0_neg1_0:
2746; GFX10:       ; %bb.0:
2747; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2748; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2749; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2750; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
2751; GFX10-NEXT:    s_waitcnt vmcnt(0)
2752; GFX10-NEXT:    v_pk_sub_i16 v1, v1, 0x44000000
2753; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
2754; GFX10-NEXT:    s_endpgm
2755;
2756; GFX11-LABEL: v_test_v2i16_x_sub_0_neg1_0:
2757; GFX11:       ; %bb.0:
2758; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2759; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2760; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2761; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2762; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2763; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
2764; GFX11-NEXT:    s_waitcnt vmcnt(0)
2765; GFX11-NEXT:    v_pk_sub_i16 v1, v1, 0x44000000
2766; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
2767; GFX11-NEXT:    s_endpgm
2768  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2769  %tid.ext = sext i32 %tid to i64
2770  %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
2771  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
2772  %x = load <2 x i16>, ptr addrspace(1) %gep
2773  %result = sub <2 x i16> %x, <i16 0, i16 17408>
2774  store <2 x i16> %result, ptr addrspace(1) %gep.out
2775  ret void
2776}
2777
2778; -32 isn't an inline immediate, but 32 is
; Kernel under test. Each work item loads one <2 x i16> from %in, adds the
; constant <i16 -32, i16 -32> and stores the sum to the same index of %out.
; The checks below expect GFX9, GFX10 and GFX11 to emit one v_pk_sub_u16 with
; the operand 32 instead of an add of -32. The SI and VI SelectionDAG runs
; also use subtract-by-32 instructions (v_subrev_i32, v_sub_u16_sdwa,
; v_subrev_u16), with SI following up with v_bfi_b32 and an add of 0xffe00000.
; The SI and VI GlobalISel runs keep adds of 0xffffffe0 and 0xffe0.
; The check lines are autogenerated (see the NOTE at the top of the file), so
; regenerate them with the update script rather than editing them by hand.
2779define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
2780; SI-SDAG-LABEL: v_test_v2i16_x_add_neg32_neg32:
2781; SI-SDAG:       ; %bb.0:
2782; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2783; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
2784; SI-SDAG-NEXT:    s_mov_b32 s6, 0
2785; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2786; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
2787; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
2788; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
2789; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2790; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
2791; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
2792; SI-SDAG-NEXT:    v_subrev_i32_e32 v3, vcc, 32, v2
2793; SI-SDAG-NEXT:    s_mov_b32 s4, 0xffff0000
2794; SI-SDAG-NEXT:    v_bfi_b32 v2, s4, v2, v3
2795; SI-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 0xffe00000, v2
2796; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2797; SI-SDAG-NEXT:    s_endpgm
2798;
2799; SI-GISEL-LABEL: v_test_v2i16_x_add_neg32_neg32:
2800; SI-GISEL:       ; %bb.0:
2801; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2802; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2803; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
2804; SI-GISEL-NEXT:    s_mov_b32 s6, 0
2805; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
2806; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
2807; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
2808; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2809; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
2810; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
2811; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0xffffffe0, v2
2812; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0xffffffe0, v3
2813; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
2814; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
2815; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
2816; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
2817; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
2818; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2819; SI-GISEL-NEXT:    s_endpgm
2820;
2821; VI-SDAG-LABEL: v_test_v2i16_x_add_neg32_neg32:
2822; VI-SDAG:       ; %bb.0:
2823; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2824; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2825; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 32
2826; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
2827; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
2828; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2829; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2830; VI-SDAG-NEXT:    flat_load_dword v3, v[0:1]
2831; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
2832; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2833; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2834; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
2835; VI-SDAG-NEXT:    v_sub_u16_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2836; VI-SDAG-NEXT:    v_subrev_u16_e32 v3, 32, v3
2837; VI-SDAG-NEXT:    v_or_b32_e32 v2, v3, v2
2838; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
2839; VI-SDAG-NEXT:    s_endpgm
2840;
2841; VI-GISEL-LABEL: v_test_v2i16_x_add_neg32_neg32:
2842; VI-GISEL:       ; %bb.0:
2843; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2844; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2845; VI-GISEL-NEXT:    v_not_b32_e32 v4, 31
2846; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
2847; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
2848; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
2849; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
2850; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2851; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
2852; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
2853; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
2854; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
2855; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2856; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
2857; VI-GISEL-NEXT:    v_add_u16_e32 v2, 0xffe0, v3
2858; VI-GISEL-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2859; VI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
2860; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
2861; VI-GISEL-NEXT:    s_endpgm
2862;
2863; GFX9-LABEL: v_test_v2i16_x_add_neg32_neg32:
2864; GFX9:       ; %bb.0:
2865; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2866; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2867; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2868; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
2869; GFX9-NEXT:    s_waitcnt vmcnt(0)
2870; GFX9-NEXT:    v_pk_sub_u16 v1, v1, 32 op_sel_hi:[1,0]
2871; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
2872; GFX9-NEXT:    s_endpgm
2873;
2874; GFX10-LABEL: v_test_v2i16_x_add_neg32_neg32:
2875; GFX10:       ; %bb.0:
2876; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2877; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2878; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2879; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
2880; GFX10-NEXT:    s_waitcnt vmcnt(0)
2881; GFX10-NEXT:    v_pk_sub_u16 v1, v1, 32 op_sel_hi:[1,0]
2882; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
2883; GFX10-NEXT:    s_endpgm
2884;
2885; GFX11-LABEL: v_test_v2i16_x_add_neg32_neg32:
2886; GFX11:       ; %bb.0:
2887; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2888; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2889; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2890; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2891; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2892; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
2893; GFX11-NEXT:    s_waitcnt vmcnt(0)
2894; GFX11-NEXT:    v_pk_sub_u16 v1, v1, 32 op_sel_hi:[1,0]
2895; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
2896; GFX11-NEXT:    s_endpgm
; Both buffers are indexed by the x work-item id, sign-extended to i64.
2897  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2898  %tid.ext = sext i32 %tid to i64
2899  %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
2900  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
2901  %x = load <2 x i16>, ptr addrspace(1) %gep
; Operation under test, an add of -32 in both elements.
2902  %result = add <2 x i16> %x, <i16 -32, i16 -32>
2903  store <2 x i16> %result, ptr addrspace(1) %gep.out
2904  ret void
2905}
2906
; Kernel under test. Each work item loads one <2 x i16> from %in, adds the
; constant <i16 0, i16 -32> (only the second element is changed) and stores
; the sum to the same index of %out.
; The checks below expect GFX9, GFX10 and GFX11 to emit one v_pk_sub_u16 with
; the operand 32 plus op_sel and op_sel_hi modifiers. The SI SelectionDAG run
; expects a single 32-bit add of 0xffe00000, and the VI SelectionDAG run a
; v_sub_u16_sdwa by 32 on WORD_1 followed by v_or_b32_sdwa. The GlobalISel runs
; for SI and VI keep add instructions (0xffffffe0 literal on SI, a register
; produced by v_not_b32 of 31 on VI).
; The check lines are autogenerated (see the NOTE at the top of the file), so
; regenerate them with the update script rather than editing them by hand.
2907define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
2908; SI-SDAG-LABEL: v_test_v2i16_x_add_0_neg32:
2909; SI-SDAG:       ; %bb.0:
2910; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2911; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
2912; SI-SDAG-NEXT:    s_mov_b32 s6, 0
2913; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2914; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
2915; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
2916; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
2917; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2918; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
2919; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
2920; SI-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 0xffe00000, v2
2921; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2922; SI-SDAG-NEXT:    s_endpgm
2923;
2924; SI-GISEL-LABEL: v_test_v2i16_x_add_0_neg32:
2925; SI-GISEL:       ; %bb.0:
2926; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2927; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2928; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
2929; SI-GISEL-NEXT:    s_mov_b32 s6, 0
2930; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
2931; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
2932; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
2933; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2934; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
2935; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
2936; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0xffffffe0, v3
2937; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
2938; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
2939; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
2940; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
2941; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
2942; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2943; SI-GISEL-NEXT:    s_endpgm
2944;
2945; VI-SDAG-LABEL: v_test_v2i16_x_add_0_neg32:
2946; VI-SDAG:       ; %bb.0:
2947; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2948; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2949; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
2950; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
2951; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2952; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2953; VI-SDAG-NEXT:    flat_load_dword v3, v[0:1]
2954; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2955; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 32
2956; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
2957; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2958; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
2959; VI-SDAG-NEXT:    v_sub_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2960; VI-SDAG-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2961; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
2962; VI-SDAG-NEXT:    s_endpgm
2963;
2964; VI-GISEL-LABEL: v_test_v2i16_x_add_0_neg32:
2965; VI-GISEL:       ; %bb.0:
2966; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2967; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2968; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
2969; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
2970; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
2971; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
2972; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2973; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
2974; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
2975; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
2976; VI-GISEL-NEXT:    v_not_b32_e32 v2, 31
2977; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
2978; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2979; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
2980; VI-GISEL-NEXT:    v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2981; VI-GISEL-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2982; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
2983; VI-GISEL-NEXT:    s_endpgm
2984;
2985; GFX9-LABEL: v_test_v2i16_x_add_0_neg32:
2986; GFX9:       ; %bb.0:
2987; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2988; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2989; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2990; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
2991; GFX9-NEXT:    s_waitcnt vmcnt(0)
2992; GFX9-NEXT:    v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
2993; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
2994; GFX9-NEXT:    s_endpgm
2995;
2996; GFX10-LABEL: v_test_v2i16_x_add_0_neg32:
2997; GFX10:       ; %bb.0:
2998; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2999; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3000; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
3001; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
3002; GFX10-NEXT:    s_waitcnt vmcnt(0)
3003; GFX10-NEXT:    v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
3004; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
3005; GFX10-NEXT:    s_endpgm
3006;
3007; GFX11-LABEL: v_test_v2i16_x_add_0_neg32:
3008; GFX11:       ; %bb.0:
3009; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
3010; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
3011; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3012; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3013; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3014; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
3015; GFX11-NEXT:    s_waitcnt vmcnt(0)
3016; GFX11-NEXT:    v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
3017; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
3018; GFX11-NEXT:    s_endpgm
; Both buffers are indexed by the x work-item id, sign-extended to i64.
3019  %tid = call i32 @llvm.amdgcn.workitem.id.x()
3020  %tid.ext = sext i32 %tid to i64
3021  %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
3022  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
3023  %x = load <2 x i16>, ptr addrspace(1) %gep
; Operation under test, an add of 0 in element 0 and -32 in element 1.
3024  %result = add <2 x i16> %x, <i16 0, i16 -32>
3025  store <2 x i16> %result, ptr addrspace(1) %gep.out
3026  ret void
3027}
3028
; Kernel under test. Each work item loads one <2 x i16> from %in, adds the
; constant <i16 -32, i16 0> (only the first element is changed) and stores
; the sum to the same index of %out.
; The checks below expect GFX9, GFX10 and GFX11 to emit one v_pk_sub_u16 with
; the operand 32 and no op_sel modifiers. The SI SelectionDAG run expects
; v_subrev_i32 by 32 followed by v_bfi_b32 with the mask 0xffff, and the VI
; SelectionDAG run a v_subrev_u16 by 32 combined with the input masked by
; 0xffff0000. The GlobalISel runs for SI and VI keep adds of 0xffffffe0 and
; 0xffe0.
; The check lines are autogenerated (see the NOTE at the top of the file), so
; regenerate them with the update script rather than editing them by hand.
3029define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
3030; SI-SDAG-LABEL: v_test_v2i16_x_add_neg32_0:
3031; SI-SDAG:       ; %bb.0:
3032; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
3033; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
3034; SI-SDAG-NEXT:    s_mov_b32 s6, 0
3035; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3036; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
3037; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
3038; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
3039; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
3040; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
3041; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
3042; SI-SDAG-NEXT:    v_subrev_i32_e32 v3, vcc, 32, v2
3043; SI-SDAG-NEXT:    s_mov_b32 s4, 0xffff
3044; SI-SDAG-NEXT:    v_bfi_b32 v2, s4, v3, v2
3045; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
3046; SI-SDAG-NEXT:    s_endpgm
3047;
3048; SI-GISEL-LABEL: v_test_v2i16_x_add_neg32_0:
3049; SI-GISEL:       ; %bb.0:
3050; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
3051; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3052; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
3053; SI-GISEL-NEXT:    s_mov_b32 s6, 0
3054; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
3055; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
3056; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
3057; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
3058; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
3059; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
3060; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0xffffffe0, v2
3061; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
3062; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
3063; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
3064; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
3065; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
3066; SI-GISEL-NEXT:    s_endpgm
3067;
3068; VI-SDAG-LABEL: v_test_v2i16_x_add_neg32_0:
3069; VI-SDAG:       ; %bb.0:
3070; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3071; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
3072; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
3073; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
3074; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
3075; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3076; VI-SDAG-NEXT:    flat_load_dword v3, v[0:1]
3077; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
3078; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
3079; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3080; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
3081; VI-SDAG-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
3082; VI-SDAG-NEXT:    v_subrev_u16_e32 v3, 32, v3
3083; VI-SDAG-NEXT:    v_or_b32_e32 v2, v3, v2
3084; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
3085; VI-SDAG-NEXT:    s_endpgm
3086;
3087; VI-GISEL-LABEL: v_test_v2i16_x_add_neg32_0:
3088; VI-GISEL:       ; %bb.0:
3089; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3090; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
3091; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
3092; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
3093; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
3094; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
3095; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3096; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
3097; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
3098; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
3099; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
3100; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3101; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
3102; VI-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
3103; VI-GISEL-NEXT:    v_add_u16_e32 v3, 0xffe0, v3
3104; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
3105; VI-GISEL-NEXT:    v_or_b32_e32 v2, v3, v2
3106; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
3107; VI-GISEL-NEXT:    s_endpgm
3108;
3109; GFX9-LABEL: v_test_v2i16_x_add_neg32_0:
3110; GFX9:       ; %bb.0:
3111; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3112; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3113; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3114; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
3115; GFX9-NEXT:    s_waitcnt vmcnt(0)
3116; GFX9-NEXT:    v_pk_sub_u16 v1, v1, 32
3117; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
3118; GFX9-NEXT:    s_endpgm
3119;
3120; GFX10-LABEL: v_test_v2i16_x_add_neg32_0:
3121; GFX10:       ; %bb.0:
3122; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3123; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3124; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
3125; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
3126; GFX10-NEXT:    s_waitcnt vmcnt(0)
3127; GFX10-NEXT:    v_pk_sub_u16 v1, v1, 32
3128; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
3129; GFX10-NEXT:    s_endpgm
3130;
3131; GFX11-LABEL: v_test_v2i16_x_add_neg32_0:
3132; GFX11:       ; %bb.0:
3133; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
3134; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
3135; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3136; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3137; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3138; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
3139; GFX11-NEXT:    s_waitcnt vmcnt(0)
3140; GFX11-NEXT:    v_pk_sub_u16 v1, v1, 32
3141; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
3142; GFX11-NEXT:    s_endpgm
; Both buffers are indexed by the x work-item id, sign-extended to i64.
3143  %tid = call i32 @llvm.amdgcn.workitem.id.x()
3144  %tid.ext = sext i32 %tid to i64
3145  %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
3146  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
3147  %x = load <2 x i16>, ptr addrspace(1) %gep
; Operation under test, an add of -32 in element 0 and 0 in element 1.
3148  %result = add <2 x i16> %x, <i16 -32, i16 0>
3149  store <2 x i16> %result, ptr addrspace(1) %gep.out
3150  ret void
3151}
3152
3153; 16 and -16 are both inline immediates
; Kernel under test. Each work item loads one <2 x i16> from %in, adds the
; constant <i16 -16, i16 -16> and stores the sum to the same index of %out.
; Unlike the -32 variants, the checks below expect the add form to be kept.
; GFX9, GFX10 and GFX11 emit one v_pk_add_u16 with the operand -16. Both VI
; runs emit v_add_u16 and v_add_u16_sdwa with -16. Both SI runs emit v_add_i32
; with -16, and the SI SelectionDAG run follows up with v_bfi_b32 and an add
; of 0xfff00000.
; The check lines are autogenerated (see the NOTE at the top of the file), so
; regenerate them with the update script rather than editing them by hand.
3154define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
3155; SI-SDAG-LABEL: v_test_v2i16_x_add_neg16_neg16:
3156; SI-SDAG:       ; %bb.0:
3157; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
3158; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
3159; SI-SDAG-NEXT:    s_mov_b32 s6, 0
3160; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3161; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
3162; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
3163; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
3164; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
3165; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
3166; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
3167; SI-SDAG-NEXT:    v_add_i32_e32 v3, vcc, -16, v2
3168; SI-SDAG-NEXT:    s_mov_b32 s4, 0xffff0000
3169; SI-SDAG-NEXT:    v_bfi_b32 v2, s4, v2, v3
3170; SI-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 0xfff00000, v2
3171; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
3172; SI-SDAG-NEXT:    s_endpgm
3173;
3174; SI-GISEL-LABEL: v_test_v2i16_x_add_neg16_neg16:
3175; SI-GISEL:       ; %bb.0:
3176; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
3177; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3178; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
3179; SI-GISEL-NEXT:    s_mov_b32 s6, 0
3180; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
3181; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
3182; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
3183; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
3184; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
3185; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
3186; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, -16, v2
3187; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, -16, v3
3188; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
3189; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
3190; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
3191; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
3192; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
3193; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
3194; SI-GISEL-NEXT:    s_endpgm
3195;
3196; VI-SDAG-LABEL: v_test_v2i16_x_add_neg16_neg16:
3197; VI-SDAG:       ; %bb.0:
3198; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3199; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
3200; VI-SDAG-NEXT:    v_mov_b32_e32 v4, -16
3201; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
3202; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
3203; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
3204; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3205; VI-SDAG-NEXT:    flat_load_dword v3, v[0:1]
3206; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
3207; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
3208; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3209; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
3210; VI-SDAG-NEXT:    v_add_u16_e32 v2, -16, v3
3211; VI-SDAG-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3212; VI-SDAG-NEXT:    v_or_b32_e32 v2, v2, v3
3213; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
3214; VI-SDAG-NEXT:    s_endpgm
3215;
3216; VI-GISEL-LABEL: v_test_v2i16_x_add_neg16_neg16:
3217; VI-GISEL:       ; %bb.0:
3218; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3219; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
3220; VI-GISEL-NEXT:    v_mov_b32_e32 v4, -16
3221; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
3222; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
3223; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
3224; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
3225; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3226; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
3227; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
3228; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
3229; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
3230; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3231; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
3232; VI-GISEL-NEXT:    v_add_u16_e32 v2, -16, v3
3233; VI-GISEL-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3234; VI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
3235; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
3236; VI-GISEL-NEXT:    s_endpgm
3237;
3238; GFX9-LABEL: v_test_v2i16_x_add_neg16_neg16:
3239; GFX9:       ; %bb.0:
3240; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3241; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3242; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3243; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
3244; GFX9-NEXT:    s_waitcnt vmcnt(0)
3245; GFX9-NEXT:    v_pk_add_u16 v1, v1, -16 op_sel_hi:[1,0]
3246; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
3247; GFX9-NEXT:    s_endpgm
3248;
3249; GFX10-LABEL: v_test_v2i16_x_add_neg16_neg16:
3250; GFX10:       ; %bb.0:
3251; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3252; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3253; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
3254; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
3255; GFX10-NEXT:    s_waitcnt vmcnt(0)
3256; GFX10-NEXT:    v_pk_add_u16 v1, v1, -16 op_sel_hi:[1,0]
3257; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
3258; GFX10-NEXT:    s_endpgm
3259;
3260; GFX11-LABEL: v_test_v2i16_x_add_neg16_neg16:
3261; GFX11:       ; %bb.0:
3262; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
3263; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
3264; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3265; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3266; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3267; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
3268; GFX11-NEXT:    s_waitcnt vmcnt(0)
3269; GFX11-NEXT:    v_pk_add_u16 v1, v1, -16 op_sel_hi:[1,0]
3270; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
3271; GFX11-NEXT:    s_endpgm
; Both buffers are indexed by the x work-item id, sign-extended to i64.
3272  %tid = call i32 @llvm.amdgcn.workitem.id.x()
3273  %tid.ext = sext i32 %tid to i64
3274  %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
3275  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
3276  %x = load <2 x i16>, ptr addrspace(1) %gep
; Operation under test, an add of -16 in both elements.
3277  %result = add <2 x i16> %x, <i16 -16, i16 -16>
3278  store <2 x i16> %result, ptr addrspace(1) %gep.out
3279  ret void
3280}
3281
; Kernel under test. Each work item loads one <2 x i16> from %in, adds the
; constant <i16 0, i16 -16> (only the second element is changed) and stores
; the sum to the same index of %out.
; The checks below expect GFX9, GFX10 and GFX11 to emit one v_pk_sub_u16 with
; the operand 16 plus op_sel and op_sel_hi modifiers, so the subtract form is
; used here. Both VI runs instead keep v_add_u16_sdwa with -16 held in a
; register. The SI SelectionDAG run expects a single 32-bit add of 0xfff00000
; and the SI GlobalISel run an add of -16 on the shifted-down upper half.
; The check lines are autogenerated (see the NOTE at the top of the file), so
; regenerate them with the update script rather than editing them by hand.
3282define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
3283; SI-SDAG-LABEL: v_test_v2i16_x_add_0_neg16:
3284; SI-SDAG:       ; %bb.0:
3285; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
3286; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
3287; SI-SDAG-NEXT:    s_mov_b32 s6, 0
3288; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3289; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
3290; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
3291; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
3292; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
3293; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
3294; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
3295; SI-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 0xfff00000, v2
3296; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
3297; SI-SDAG-NEXT:    s_endpgm
3298;
3299; SI-GISEL-LABEL: v_test_v2i16_x_add_0_neg16:
3300; SI-GISEL:       ; %bb.0:
3301; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
3302; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3303; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
3304; SI-GISEL-NEXT:    s_mov_b32 s6, 0
3305; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
3306; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
3307; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
3308; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
3309; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
3310; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
3311; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, -16, v3
3312; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
3313; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
3314; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
3315; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
3316; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
3317; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
3318; SI-GISEL-NEXT:    s_endpgm
3319;
3320; VI-SDAG-LABEL: v_test_v2i16_x_add_0_neg16:
3321; VI-SDAG:       ; %bb.0:
3322; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3323; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
3324; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
3325; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
3326; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
3327; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3328; VI-SDAG-NEXT:    flat_load_dword v3, v[0:1]
3329; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
3330; VI-SDAG-NEXT:    v_mov_b32_e32 v2, -16
3331; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
3332; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3333; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
3334; VI-SDAG-NEXT:    v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3335; VI-SDAG-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
3336; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
3337; VI-SDAG-NEXT:    s_endpgm
3338;
3339; VI-GISEL-LABEL: v_test_v2i16_x_add_0_neg16:
3340; VI-GISEL:       ; %bb.0:
3341; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3342; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
3343; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
3344; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
3345; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
3346; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
3347; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3348; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
3349; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
3350; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
3351; VI-GISEL-NEXT:    v_mov_b32_e32 v2, -16
3352; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
3353; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3354; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
3355; VI-GISEL-NEXT:    v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3356; VI-GISEL-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
3357; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
3358; VI-GISEL-NEXT:    s_endpgm
3359;
3360; GFX9-LABEL: v_test_v2i16_x_add_0_neg16:
3361; GFX9:       ; %bb.0:
3362; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3363; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3364; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3365; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
3366; GFX9-NEXT:    s_waitcnt vmcnt(0)
3367; GFX9-NEXT:    v_pk_sub_u16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0]
3368; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
3369; GFX9-NEXT:    s_endpgm
3370;
3371; GFX10-LABEL: v_test_v2i16_x_add_0_neg16:
3372; GFX10:       ; %bb.0:
3373; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3374; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3375; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
3376; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
3377; GFX10-NEXT:    s_waitcnt vmcnt(0)
3378; GFX10-NEXT:    v_pk_sub_u16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0]
3379; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
3380; GFX10-NEXT:    s_endpgm
3381;
3382; GFX11-LABEL: v_test_v2i16_x_add_0_neg16:
3383; GFX11:       ; %bb.0:
3384; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
3385; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
3386; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3387; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3388; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3389; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
3390; GFX11-NEXT:    s_waitcnt vmcnt(0)
3391; GFX11-NEXT:    v_pk_sub_u16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0]
3392; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
3393; GFX11-NEXT:    s_endpgm
; Both buffers are indexed by the x work-item id, sign-extended to i64.
3394  %tid = call i32 @llvm.amdgcn.workitem.id.x()
3395  %tid.ext = sext i32 %tid to i64
3396  %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
3397  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
3398  %x = load <2 x i16>, ptr addrspace(1) %gep
; Operation under test, an add of 0 in element 0 and -16 in element 1.
3399  %result = add <2 x i16> %x, <i16 0, i16 -16>
3400  store <2 x i16> %result, ptr addrspace(1) %gep.out
3401  ret void
3402}
3403
; Loads the <2 x i16> at index workitem.id.x from %in, adds <i16 -16, i16 0>
; and stores the result to the same index of %out, so only element 0 changes.
; The GFX9, GFX10 and GFX11 checks expect the add of -16 to be emitted as
; v_pk_sub_u16 with the constant 16. The SI and VI checks keep an add of -16
; on the low half and carry the high half of the loaded dword through as is.
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them instead of editing them by hand.
3404define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
3405; SI-SDAG-LABEL: v_test_v2i16_x_add_neg16_0:
3406; SI-SDAG:       ; %bb.0:
3407; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
3408; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
3409; SI-SDAG-NEXT:    s_mov_b32 s6, 0
3410; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3411; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
3412; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
3413; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
3414; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
3415; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
3416; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
3417; SI-SDAG-NEXT:    v_add_i32_e32 v3, vcc, -16, v2
3418; SI-SDAG-NEXT:    s_mov_b32 s4, 0xffff
3419; SI-SDAG-NEXT:    v_bfi_b32 v2, s4, v3, v2
3420; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
3421; SI-SDAG-NEXT:    s_endpgm
3422;
3423; SI-GISEL-LABEL: v_test_v2i16_x_add_neg16_0:
3424; SI-GISEL:       ; %bb.0:
3425; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
3426; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3427; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
3428; SI-GISEL-NEXT:    s_mov_b32 s6, 0
3429; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
3430; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
3431; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
3432; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
3433; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
3434; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
3435; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, -16, v2
3436; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
3437; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
3438; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
3439; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
3440; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
3441; SI-GISEL-NEXT:    s_endpgm
3442;
3443; VI-SDAG-LABEL: v_test_v2i16_x_add_neg16_0:
3444; VI-SDAG:       ; %bb.0:
3445; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3446; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
3447; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
3448; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
3449; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
3450; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3451; VI-SDAG-NEXT:    flat_load_dword v3, v[0:1]
3452; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
3453; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
3454; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3455; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
3456; VI-SDAG-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
3457; VI-SDAG-NEXT:    v_add_u16_e32 v3, -16, v3
3458; VI-SDAG-NEXT:    v_or_b32_e32 v2, v3, v2
3459; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
3460; VI-SDAG-NEXT:    s_endpgm
3461;
3462; VI-GISEL-LABEL: v_test_v2i16_x_add_neg16_0:
3463; VI-GISEL:       ; %bb.0:
3464; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3465; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
3466; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
3467; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
3468; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
3469; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
3470; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3471; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
3472; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
3473; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
3474; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
3475; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3476; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
3477; VI-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
3478; VI-GISEL-NEXT:    v_add_u16_e32 v3, -16, v3
3479; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
3480; VI-GISEL-NEXT:    v_or_b32_e32 v2, v3, v2
3481; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
3482; VI-GISEL-NEXT:    s_endpgm
3483;
3484; GFX9-LABEL: v_test_v2i16_x_add_neg16_0:
3485; GFX9:       ; %bb.0:
3486; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3487; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3488; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3489; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
3490; GFX9-NEXT:    s_waitcnt vmcnt(0)
3491; GFX9-NEXT:    v_pk_sub_u16 v1, v1, 16
3492; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
3493; GFX9-NEXT:    s_endpgm
3494;
3495; GFX10-LABEL: v_test_v2i16_x_add_neg16_0:
3496; GFX10:       ; %bb.0:
3497; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3498; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3499; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
3500; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
3501; GFX10-NEXT:    s_waitcnt vmcnt(0)
3502; GFX10-NEXT:    v_pk_sub_u16 v1, v1, 16
3503; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
3504; GFX10-NEXT:    s_endpgm
3505;
3506; GFX11-LABEL: v_test_v2i16_x_add_neg16_0:
3507; GFX11:       ; %bb.0:
3508; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
3509; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
3510; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3511; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3512; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3513; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
3514; GFX11-NEXT:    s_waitcnt vmcnt(0)
3515; GFX11-NEXT:    v_pk_sub_u16 v1, v1, 16
3516; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
3517; GFX11-NEXT:    s_endpgm
3518  %tid = call i32 @llvm.amdgcn.workitem.id.x()
3519  %tid.ext = sext i32 %tid to i64
3520  %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
3521  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
3522  %x = load <2 x i16>, ptr addrspace(1) %gep
3523  %result = add <2 x i16> %x, <i16 -16, i16 0>
3524  store <2 x i16> %result, ptr addrspace(1) %gep.out
3525  ret void
3526}
3527
; Loads the <2 x i16> at index workitem.id.x from %in, adds -15360 to both
; elements and stores the result to the same index of %out.
; -15360 is 0xc400 as an i16, i.e. the 16-bit negation of 0x3c00; the "fpone"
; in the name presumably refers to 0x3c00 being the half-precision encoding
; of 1.0 (TODO confirm).
; Every check prefix expects an add with a literal form of 0xc400 (0xc400,
; 0xffffc400, 0xc4000000 or 0xc400c400); none of them expects a subtract.
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them instead of editing them by hand.
3528define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
3529; SI-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone:
3530; SI-SDAG:       ; %bb.0:
3531; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
3532; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
3533; SI-SDAG-NEXT:    s_mov_b32 s6, 0
3534; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3535; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
3536; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
3537; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
3538; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
3539; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
3540; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
3541; SI-SDAG-NEXT:    v_add_i32_e32 v3, vcc, 0xffffc400, v2
3542; SI-SDAG-NEXT:    s_mov_b32 s4, 0xffff0000
3543; SI-SDAG-NEXT:    v_bfi_b32 v2, s4, v2, v3
3544; SI-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 0xc4000000, v2
3545; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
3546; SI-SDAG-NEXT:    s_endpgm
3547;
3548; SI-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone:
3549; SI-GISEL:       ; %bb.0:
3550; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
3551; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3552; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
3553; SI-GISEL-NEXT:    s_mov_b32 s6, 0
3554; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
3555; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
3556; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
3557; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
3558; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
3559; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
3560; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0xffffc400, v2
3561; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0xffffc400, v3
3562; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
3563; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
3564; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
3565; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
3566; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
3567; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
3568; SI-GISEL-NEXT:    s_endpgm
3569;
3570; VI-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone:
3571; VI-SDAG:       ; %bb.0:
3572; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3573; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
3574; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffc400
3575; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
3576; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
3577; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
3578; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3579; VI-SDAG-NEXT:    flat_load_dword v3, v[0:1]
3580; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
3581; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
3582; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3583; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
3584; VI-SDAG-NEXT:    v_add_u16_e32 v2, 0xc400, v3
3585; VI-SDAG-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3586; VI-SDAG-NEXT:    v_or_b32_e32 v2, v2, v3
3587; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
3588; VI-SDAG-NEXT:    s_endpgm
3589;
3590; VI-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone:
3591; VI-GISEL:       ; %bb.0:
3592; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3593; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
3594; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffc400
3595; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
3596; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
3597; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
3598; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
3599; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3600; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
3601; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
3602; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
3603; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
3604; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3605; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
3606; VI-GISEL-NEXT:    v_add_u16_e32 v2, 0xc400, v3
3607; VI-GISEL-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3608; VI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
3609; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
3610; VI-GISEL-NEXT:    s_endpgm
3611;
3612; GFX9-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone:
3613; GFX9-SDAG:       ; %bb.0:
3614; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3615; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3616; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
3617; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
3618; GFX9-SDAG-NEXT:    s_movk_i32 s2, 0xc400
3619; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
3620; GFX9-SDAG-NEXT:    v_pk_add_u16 v1, v1, s2 op_sel_hi:[1,0]
3621; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
3622; GFX9-SDAG-NEXT:    s_endpgm
3623;
3624; GFX9-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone:
3625; GFX9-GISEL:       ; %bb.0:
3626; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3627; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3628; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc400c400
3629; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
3630; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
3631; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
3632; GFX9-GISEL-NEXT:    v_pk_add_u16 v1, v1, v2
3633; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
3634; GFX9-GISEL-NEXT:    s_endpgm
3635;
3636; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone:
3637; GFX10-SDAG:       ; %bb.0:
3638; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3639; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3640; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
3641; GFX10-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
3642; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
3643; GFX10-SDAG-NEXT:    v_pk_add_u16 v1, 0xffffc400, v1 op_sel_hi:[0,1]
3644; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
3645; GFX10-SDAG-NEXT:    s_endpgm
3646;
3647; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone:
3648; GFX10-GISEL:       ; %bb.0:
3649; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3650; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3651; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
3652; GFX10-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
3653; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
3654; GFX10-GISEL-NEXT:    v_pk_add_u16 v1, 0xc400c400, v1
3655; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
3656; GFX10-GISEL-NEXT:    s_endpgm
3657;
3658; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone:
3659; GFX11-SDAG:       ; %bb.0:
3660; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
3661; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
3662; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3663; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3664; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
3665; GFX11-SDAG-NEXT:    global_load_b32 v1, v0, s[2:3]
3666; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
3667; GFX11-SDAG-NEXT:    v_pk_add_u16 v1, 0xffffc400, v1 op_sel_hi:[0,1]
3668; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
3669; GFX11-SDAG-NEXT:    s_endpgm
3670;
3671; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone:
3672; GFX11-GISEL:       ; %bb.0:
3673; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
3674; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
3675; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3676; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3677; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
3678; GFX11-GISEL-NEXT:    global_load_b32 v1, v0, s[2:3]
3679; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
3680; GFX11-GISEL-NEXT:    v_pk_add_u16 v1, 0xc400c400, v1
3681; GFX11-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
3682; GFX11-GISEL-NEXT:    s_endpgm
3683  %tid = call i32 @llvm.amdgcn.workitem.id.x()
3684  %tid.ext = sext i32 %tid to i64
3685  %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
3686  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
3687  %x = load <2 x i16>, ptr addrspace(1) %gep
3688  %result = add <2 x i16> %x, <i16 -15360, i16 -15360>
3689  store <2 x i16> %result, ptr addrspace(1) %gep.out
3690  ret void
3691}
3692
; Loads the <2 x i16> at index workitem.id.x from %in, adds 17408 to both
; elements and stores the result to the same index of %out.
; 17408 is 0x4400, i.e. the 16-bit negation of 0xbc00; the "negfpone" in the
; name presumably refers to 0xbc00 being the half-precision encoding of -1.0
; (TODO confirm).
; Every check prefix expects an add with a literal form of 0x4400 (0x4400,
; 0x44000000 or 0x44004400); none of them expects a subtract.
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them instead of editing them by hand.
3693define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
3694; SI-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone:
3695; SI-SDAG:       ; %bb.0:
3696; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
3697; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
3698; SI-SDAG-NEXT:    s_mov_b32 s6, 0
3699; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3700; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
3701; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
3702; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
3703; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
3704; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
3705; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
3706; SI-SDAG-NEXT:    v_add_i32_e32 v3, vcc, 0x4400, v2
3707; SI-SDAG-NEXT:    s_mov_b32 s4, 0xffff0000
3708; SI-SDAG-NEXT:    v_bfi_b32 v2, s4, v2, v3
3709; SI-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 0x44000000, v2
3710; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
3711; SI-SDAG-NEXT:    s_endpgm
3712;
3713; SI-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone:
3714; SI-GISEL:       ; %bb.0:
3715; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
3716; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3717; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
3718; SI-GISEL-NEXT:    s_mov_b32 s6, 0
3719; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
3720; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
3721; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
3722; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
3723; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
3724; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
3725; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0x4400, v2
3726; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0x4400, v3
3727; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
3728; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
3729; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
3730; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
3731; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
3732; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
3733; SI-GISEL-NEXT:    s_endpgm
3734;
3735; VI-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone:
3736; VI-SDAG:       ; %bb.0:
3737; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3738; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
3739; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0x4400
3740; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
3741; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
3742; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
3743; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3744; VI-SDAG-NEXT:    flat_load_dword v3, v[0:1]
3745; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
3746; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
3747; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3748; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
3749; VI-SDAG-NEXT:    v_add_u16_e32 v2, 0x4400, v3
3750; VI-SDAG-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3751; VI-SDAG-NEXT:    v_or_b32_e32 v2, v2, v3
3752; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
3753; VI-SDAG-NEXT:    s_endpgm
3754;
3755; VI-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone:
3756; VI-GISEL:       ; %bb.0:
3757; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3758; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
3759; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x4400
3760; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
3761; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
3762; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
3763; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
3764; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3765; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
3766; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
3767; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
3768; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
3769; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3770; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
3771; VI-GISEL-NEXT:    v_add_u16_e32 v2, 0x4400, v3
3772; VI-GISEL-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3773; VI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
3774; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
3775; VI-GISEL-NEXT:    s_endpgm
3776;
3777; GFX9-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone:
3778; GFX9-SDAG:       ; %bb.0:
3779; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3780; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3781; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
3782; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
3783; GFX9-SDAG-NEXT:    s_movk_i32 s2, 0x4400
3784; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
3785; GFX9-SDAG-NEXT:    v_pk_add_u16 v1, v1, s2 op_sel_hi:[1,0]
3786; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
3787; GFX9-SDAG-NEXT:    s_endpgm
3788;
3789; GFX9-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone:
3790; GFX9-GISEL:       ; %bb.0:
3791; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3792; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3793; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0x44004400
3794; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
3795; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
3796; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
3797; GFX9-GISEL-NEXT:    v_pk_add_u16 v1, v1, v2
3798; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
3799; GFX9-GISEL-NEXT:    s_endpgm
3800;
3801; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone:
3802; GFX10-SDAG:       ; %bb.0:
3803; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3804; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3805; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
3806; GFX10-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
3807; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
3808; GFX10-SDAG-NEXT:    v_pk_add_u16 v1, 0x4400, v1 op_sel_hi:[0,1]
3809; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
3810; GFX10-SDAG-NEXT:    s_endpgm
3811;
3812; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone:
3813; GFX10-GISEL:       ; %bb.0:
3814; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3815; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3816; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
3817; GFX10-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
3818; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
3819; GFX10-GISEL-NEXT:    v_pk_add_u16 v1, 0x44004400, v1
3820; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
3821; GFX10-GISEL-NEXT:    s_endpgm
3822;
3823; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone:
3824; GFX11-SDAG:       ; %bb.0:
3825; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
3826; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
3827; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3828; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3829; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
3830; GFX11-SDAG-NEXT:    global_load_b32 v1, v0, s[2:3]
3831; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
3832; GFX11-SDAG-NEXT:    v_pk_add_u16 v1, 0x4400, v1 op_sel_hi:[0,1]
3833; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
3834; GFX11-SDAG-NEXT:    s_endpgm
3835;
3836; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone:
3837; GFX11-GISEL:       ; %bb.0:
3838; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
3839; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
3840; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3841; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3842; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
3843; GFX11-GISEL-NEXT:    global_load_b32 v1, v0, s[2:3]
3844; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
3845; GFX11-GISEL-NEXT:    v_pk_add_u16 v1, 0x44004400, v1
3846; GFX11-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
3847; GFX11-GISEL-NEXT:    s_endpgm
3848  %tid = call i32 @llvm.amdgcn.workitem.id.x()
3849  %tid.ext = sext i32 %tid to i64
3850  %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
3851  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
3852  %x = load <2 x i16>, ptr addrspace(1) %gep
3853  %result = add <2 x i16> %x, <i16 17408, i16 17408>
3854  store <2 x i16> %result, ptr addrspace(1) %gep.out
3855  ret void
3856}
3857
; Loads the <2 x i16> at index workitem.id.x from %in, adds 16384 (0x4000) to
; both elements and stores the result to the same index of %out.
; The GFX9, GFX10 and GFX11 checks expect v_pk_add_u16 with the inline operand
; 2.0 and op_sel:[0,1]; presumably this selects the upper 16 bits (0x4000) of
; the 32-bit encoding of 2.0 for both lanes (TODO confirm against the ISA
; documentation). The SI-SDAG checks also use the inline operand 2.0 for the
; add that updates the high half; the remaining SI and VI checks add the
; literal 0x4000.
; NOTE(review): in the neighbouring *_fpone tests the IR constant is the
; negated bit pattern, whereas here the constant is 0x4000 itself even though
; the name still contains "neg" - confirm the naming is intentional.
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them instead of editing them by hand.
3858define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
3859; SI-SDAG-LABEL: v_test_v2i16_x_add_neg_fptwo:
3860; SI-SDAG:       ; %bb.0:
3861; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
3862; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
3863; SI-SDAG-NEXT:    s_mov_b32 s6, 0
3864; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3865; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
3866; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
3867; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
3868; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
3869; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
3870; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
3871; SI-SDAG-NEXT:    v_add_i32_e32 v3, vcc, 0x4000, v2
3872; SI-SDAG-NEXT:    s_mov_b32 s4, 0xffff0000
3873; SI-SDAG-NEXT:    v_bfi_b32 v2, s4, v2, v3
3874; SI-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 2.0, v2
3875; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
3876; SI-SDAG-NEXT:    s_endpgm
3877;
3878; SI-GISEL-LABEL: v_test_v2i16_x_add_neg_fptwo:
3879; SI-GISEL:       ; %bb.0:
3880; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
3881; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3882; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
3883; SI-GISEL-NEXT:    s_mov_b32 s6, 0
3884; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
3885; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
3886; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
3887; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
3888; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
3889; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
3890; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0x4000, v2
3891; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0x4000, v3
3892; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
3893; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
3894; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
3895; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
3896; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
3897; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
3898; SI-GISEL-NEXT:    s_endpgm
3899;
3900; VI-SDAG-LABEL: v_test_v2i16_x_add_neg_fptwo:
3901; VI-SDAG:       ; %bb.0:
3902; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3903; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
3904; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0x4000
3905; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
3906; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
3907; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
3908; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3909; VI-SDAG-NEXT:    flat_load_dword v3, v[0:1]
3910; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
3911; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
3912; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3913; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
3914; VI-SDAG-NEXT:    v_add_u16_e32 v2, 0x4000, v3
3915; VI-SDAG-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3916; VI-SDAG-NEXT:    v_or_b32_e32 v2, v2, v3
3917; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
3918; VI-SDAG-NEXT:    s_endpgm
3919;
3920; VI-GISEL-LABEL: v_test_v2i16_x_add_neg_fptwo:
3921; VI-GISEL:       ; %bb.0:
3922; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3923; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
3924; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x4000
3925; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
3926; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
3927; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
3928; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
3929; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3930; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
3931; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
3932; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
3933; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
3934; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3935; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
3936; VI-GISEL-NEXT:    v_add_u16_e32 v2, 0x4000, v3
3937; VI-GISEL-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3938; VI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
3939; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
3940; VI-GISEL-NEXT:    s_endpgm
3941;
3942; GFX9-LABEL: v_test_v2i16_x_add_neg_fptwo:
3943; GFX9:       ; %bb.0:
3944; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3945; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3946; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3947; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
3948; GFX9-NEXT:    s_waitcnt vmcnt(0)
3949; GFX9-NEXT:    v_pk_add_u16 v1, v1, 2.0 op_sel:[0,1]
3950; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
3951; GFX9-NEXT:    s_endpgm
3952;
3953; GFX10-LABEL: v_test_v2i16_x_add_neg_fptwo:
3954; GFX10:       ; %bb.0:
3955; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3956; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3957; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
3958; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
3959; GFX10-NEXT:    s_waitcnt vmcnt(0)
3960; GFX10-NEXT:    v_pk_add_u16 v1, v1, 2.0 op_sel:[0,1]
3961; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
3962; GFX10-NEXT:    s_endpgm
3963;
3964; GFX11-LABEL: v_test_v2i16_x_add_neg_fptwo:
3965; GFX11:       ; %bb.0:
3966; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
3967; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
3968; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3969; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3970; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3971; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
3972; GFX11-NEXT:    s_waitcnt vmcnt(0)
3973; GFX11-NEXT:    v_pk_add_u16 v1, v1, 2.0 op_sel:[0,1]
3974; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
3975; GFX11-NEXT:    s_endpgm
3976  %tid = call i32 @llvm.amdgcn.workitem.id.x()
3977  %tid.ext = sext i32 %tid to i64
3978  %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
3979  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
3980  %x = load <2 x i16>, ptr addrspace(1) %gep
3981  %result = add <2 x i16> %x, <i16 16384, i16 16384>
3982  store <2 x i16> %result, ptr addrspace(1) %gep.out
3983  ret void
3984}
3985
3986define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
3987; SI-SDAG-LABEL: v_test_v2i16_x_add_neg_negfptwo:
3988; SI-SDAG:       ; %bb.0:
3989; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
3990; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
3991; SI-SDAG-NEXT:    s_mov_b32 s6, 0
3992; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3993; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
3994; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
3995; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
3996; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
3997; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
3998; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
3999; SI-SDAG-NEXT:    v_add_i32_e32 v3, vcc, 0xffffc000, v2
4000; SI-SDAG-NEXT:    s_mov_b32 s4, 0xffff0000
4001; SI-SDAG-NEXT:    v_bfi_b32 v2, s4, v2, v3
4002; SI-SDAG-NEXT:    v_add_i32_e32 v2, vcc, -2.0, v2
4003; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
4004; SI-SDAG-NEXT:    s_endpgm
4005;
4006; SI-GISEL-LABEL: v_test_v2i16_x_add_neg_negfptwo:
4007; SI-GISEL:       ; %bb.0:
4008; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
4009; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
4010; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
4011; SI-GISEL-NEXT:    s_mov_b32 s6, 0
4012; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
4013; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
4014; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
4015; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
4016; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
4017; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
4018; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0xffffc000, v2
4019; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0xffffc000, v3
4020; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
4021; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
4022; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
4023; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
4024; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
4025; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
4026; SI-GISEL-NEXT:    s_endpgm
4027;
4028; VI-SDAG-LABEL: v_test_v2i16_x_add_neg_negfptwo:
4029; VI-SDAG:       ; %bb.0:
4030; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4031; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
4032; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffc000
4033; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
4034; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
4035; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
4036; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
4037; VI-SDAG-NEXT:    flat_load_dword v3, v[0:1]
4038; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
4039; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
4040; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
4041; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
4042; VI-SDAG-NEXT:    v_add_u16_e32 v2, 0xc000, v3
4043; VI-SDAG-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
4044; VI-SDAG-NEXT:    v_or_b32_e32 v2, v2, v3
4045; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
4046; VI-SDAG-NEXT:    s_endpgm
4047;
4048; VI-GISEL-LABEL: v_test_v2i16_x_add_neg_negfptwo:
4049; VI-GISEL:       ; %bb.0:
4050; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4051; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
4052; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffc000
4053; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
4054; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
4055; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
4056; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
4057; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
4058; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
4059; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
4060; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
4061; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
4062; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
4063; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
4064; VI-GISEL-NEXT:    v_add_u16_e32 v2, 0xc000, v3
4065; VI-GISEL-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
4066; VI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
4067; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
4068; VI-GISEL-NEXT:    s_endpgm
4069;
4070; GFX9-LABEL: v_test_v2i16_x_add_neg_negfptwo:
4071; GFX9:       ; %bb.0:
4072; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4073; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
4074; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4075; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
4076; GFX9-NEXT:    s_waitcnt vmcnt(0)
4077; GFX9-NEXT:    v_pk_add_u16 v1, v1, -2.0 op_sel:[0,1]
4078; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
4079; GFX9-NEXT:    s_endpgm
4080;
4081; GFX10-LABEL: v_test_v2i16_x_add_neg_negfptwo:
4082; GFX10:       ; %bb.0:
4083; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4084; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
4085; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
4086; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
4087; GFX10-NEXT:    s_waitcnt vmcnt(0)
4088; GFX10-NEXT:    v_pk_add_u16 v1, v1, -2.0 op_sel:[0,1]
4089; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
4090; GFX10-NEXT:    s_endpgm
4091;
4092; GFX11-LABEL: v_test_v2i16_x_add_neg_negfptwo:
4093; GFX11:       ; %bb.0:
4094; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
4095; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
4096; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4097; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
4098; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
4099; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
4100; GFX11-NEXT:    s_waitcnt vmcnt(0)
4101; GFX11-NEXT:    v_pk_add_u16 v1, v1, -2.0 op_sel:[0,1]
4102; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
4103; GFX11-NEXT:    s_endpgm
4104  %tid = call i32 @llvm.amdgcn.workitem.id.x()
4105  %tid.ext = sext i32 %tid to i64
4106  %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
4107  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
4108  %x = load <2 x i16>, ptr addrspace(1) %gep
4109  %result = add <2 x i16> %x, <i16 -16384, i16 -16384>
4110  store <2 x i16> %result, ptr addrspace(1) %gep.out
4111  ret void
4112}
4113
; Kernel under test: loads the <2 x i16> element of %in selected by
; llvm.amdgcn.workitem.id.x, adds <i16 undef, i16 -32> to it and stores the
; sum to the same element of %out. Only the high lane has a defined addend
; (-32); the low lane's addend is undef.
;
; Per the file header, the interesting property is whether the add of a
; negative constant is emitted as a subtract of the negated constant. In the
; expectations below, the VI SelectionDAG checks and the shared GFX9 / GFX10 /
; GFX11 checks use a subtract of 32 (v_sub_u16_sdwa, v_pk_sub_u16), while the
; SI checks and the VI GlobalISel checks keep an add (of a 0xffe00000 or
; 0xffffffe0 literal, or of the result of v_not_b32 31).
;
; The assertion lines are autogenerated (see the NOTE at the top of the file);
; do not edit them by hand.
4114define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
4115; SI-SDAG-LABEL: v_test_v2i16_x_add_undef_neg32:
4116; SI-SDAG:       ; %bb.0:
4117; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
4118; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
4119; SI-SDAG-NEXT:    s_mov_b32 s6, 0
4120; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
4121; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
4122; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
4123; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
4124; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
4125; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
4126; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
4127; SI-SDAG-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
4128; SI-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 0xffe00000, v2
4129; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
4130; SI-SDAG-NEXT:    s_endpgm
4131;
4132; SI-GISEL-LABEL: v_test_v2i16_x_add_undef_neg32:
4133; SI-GISEL:       ; %bb.0:
4134; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
4135; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
4136; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
4137; SI-GISEL-NEXT:    s_mov_b32 s6, 0
4138; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
4139; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
4140; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
4141; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
4142; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
4143; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
4144; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0xffffffe0, v2
4145; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
4146; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
4147; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
4148; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
4149; SI-GISEL-NEXT:    s_endpgm
4150;
4151; VI-SDAG-LABEL: v_test_v2i16_x_add_undef_neg32:
4152; VI-SDAG:       ; %bb.0:
4153; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4154; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
4155; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
4156; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
4157; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
4158; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
4159; VI-SDAG-NEXT:    flat_load_dword v3, v[0:1]
4160; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
4161; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
4162; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 32
4163; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
4164; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
4165; VI-SDAG-NEXT:    v_sub_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
4166; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
4167; VI-SDAG-NEXT:    s_endpgm
4168;
4169; VI-GISEL-LABEL: v_test_v2i16_x_add_undef_neg32:
4170; VI-GISEL:       ; %bb.0:
4171; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4172; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
4173; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
4174; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
4175; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
4176; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
4177; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
4178; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
4179; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
4180; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
4181; VI-GISEL-NEXT:    v_not_b32_e32 v2, 31
4182; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
4183; VI-GISEL-NEXT:    s_and_b32 s0, 0xffff, s0
4184; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
4185; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
4186; VI-GISEL-NEXT:    v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
4187; VI-GISEL-NEXT:    v_or_b32_e32 v2, s0, v2
4188; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
4189; VI-GISEL-NEXT:    s_endpgm
4190;
4191; GFX9-LABEL: v_test_v2i16_x_add_undef_neg32:
4192; GFX9:       ; %bb.0:
4193; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4194; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
4195; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4196; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
4197; GFX9-NEXT:    s_waitcnt vmcnt(0)
4198; GFX9-NEXT:    v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
4199; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
4200; GFX9-NEXT:    s_endpgm
4201;
4202; GFX10-LABEL: v_test_v2i16_x_add_undef_neg32:
4203; GFX10:       ; %bb.0:
4204; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4205; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
4206; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
4207; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
4208; GFX10-NEXT:    s_waitcnt vmcnt(0)
4209; GFX10-NEXT:    v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
4210; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
4211; GFX10-NEXT:    s_endpgm
4212;
4213; GFX11-LABEL: v_test_v2i16_x_add_undef_neg32:
4214; GFX11:       ; %bb.0:
4215; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
4216; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
4217; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4218; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
4219; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
4220; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
4221; GFX11-NEXT:    s_waitcnt vmcnt(0)
4222; GFX11-NEXT:    v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
4223; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
4224; GFX11-NEXT:    s_endpgm
4225  %tid = call i32 @llvm.amdgcn.workitem.id.x()
4226  %tid.ext = sext i32 %tid to i64
4227  %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
4228  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
4229  %x = load <2 x i16>, ptr addrspace(1) %gep
  ; Operation under test: lane 0 addend is undef, lane 1 addend is -32.
4230  %result = add <2 x i16> %x, <i16 undef, i16 -32>
4231  store <2 x i16> %result, ptr addrspace(1) %gep.out
4232  ret void
4233}
4234
; Kernel under test: loads the <2 x i16> element of %in selected by
; llvm.amdgcn.workitem.id.x, adds <i16 -32, i16 undef> to it and stores the
; sum to the same element of %out. Lane-swapped counterpart of
; v_test_v2i16_x_add_undef_neg32: here the low lane has the defined addend
; (-32) and the high lane's addend is undef.
;
; In the expectations below, every SelectionDAG variant emits a subtract of 32
; (v_subrev_i32 on SI, v_subrev_u16 on VI, v_pk_sub_u16 on GFX9 / GFX10 /
; GFX11), whereas every GlobalISel variant keeps an add of -32 (a 0xffffffe0 or
; 0xffe0 literal, or the result of v_not_b32 31 on GFX9). That divergence is
; why the GFX9+ checks use separate per-selector prefixes in this function.
;
; The assertion lines are autogenerated (see the NOTE at the top of the file);
; do not edit them by hand.
4235define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
4236; SI-SDAG-LABEL: v_test_v2i16_x_add_neg32_undef:
4237; SI-SDAG:       ; %bb.0:
4238; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
4239; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
4240; SI-SDAG-NEXT:    s_mov_b32 s6, 0
4241; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
4242; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
4243; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
4244; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
4245; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
4246; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
4247; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
4248; SI-SDAG-NEXT:    v_subrev_i32_e32 v2, vcc, 32, v2
4249; SI-SDAG-NEXT:    v_and_b32_e32 v2, 0xffff, v2
4250; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
4251; SI-SDAG-NEXT:    s_endpgm
4252;
4253; SI-GISEL-LABEL: v_test_v2i16_x_add_neg32_undef:
4254; SI-GISEL:       ; %bb.0:
4255; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
4256; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
4257; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
4258; SI-GISEL-NEXT:    s_mov_b32 s6, 0
4259; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
4260; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
4261; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
4262; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
4263; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
4264; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0xffffffe0, v2
4265; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
4266; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
4267; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
4268; SI-GISEL-NEXT:    s_endpgm
4269;
4270; VI-SDAG-LABEL: v_test_v2i16_x_add_neg32_undef:
4271; VI-SDAG:       ; %bb.0:
4272; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4273; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
4274; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
4275; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
4276; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
4277; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
4278; VI-SDAG-NEXT:    flat_load_dword v3, v[0:1]
4279; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
4280; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
4281; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
4282; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
4283; VI-SDAG-NEXT:    v_subrev_u16_e32 v2, 32, v3
4284; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
4285; VI-SDAG-NEXT:    s_endpgm
4286;
4287; VI-GISEL-LABEL: v_test_v2i16_x_add_neg32_undef:
4288; VI-GISEL:       ; %bb.0:
4289; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4290; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
4291; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
4292; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
4293; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
4294; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
4295; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
4296; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
4297; VI-GISEL-NEXT:    s_and_b32 s2, 0xffff, s0
4298; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
4299; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
4300; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
4301; VI-GISEL-NEXT:    s_lshl_b32 s0, s2, 16
4302; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
4303; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
4304; VI-GISEL-NEXT:    v_add_u16_e32 v2, 0xffe0, v3
4305; VI-GISEL-NEXT:    v_or_b32_e32 v2, s0, v2
4306; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
4307; VI-GISEL-NEXT:    s_endpgm
4308;
4309; GFX9-SDAG-LABEL: v_test_v2i16_x_add_neg32_undef:
4310; GFX9-SDAG:       ; %bb.0:
4311; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4312; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
4313; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
4314; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
4315; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
4316; GFX9-SDAG-NEXT:    v_pk_sub_u16 v1, v1, 32
4317; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
4318; GFX9-SDAG-NEXT:    s_endpgm
4319;
4320; GFX9-GISEL-LABEL: v_test_v2i16_x_add_neg32_undef:
4321; GFX9-GISEL:       ; %bb.0:
4322; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4323; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
4324; GFX9-GISEL-NEXT:    v_not_b32_e32 v2, 31
4325; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
4326; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
4327; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
4328; GFX9-GISEL-NEXT:    v_pk_add_u16 v1, v1, v2
4329; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
4330; GFX9-GISEL-NEXT:    s_endpgm
4331;
4332; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg32_undef:
4333; GFX10-SDAG:       ; %bb.0:
4334; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4335; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
4336; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
4337; GFX10-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
4338; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
4339; GFX10-SDAG-NEXT:    v_pk_sub_u16 v1, v1, 32
4340; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
4341; GFX10-SDAG-NEXT:    s_endpgm
4342;
4343; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg32_undef:
4344; GFX10-GISEL:       ; %bb.0:
4345; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4346; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
4347; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
4348; GFX10-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
4349; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
4350; GFX10-GISEL-NEXT:    v_pk_add_u16 v1, 0xffffffe0, v1
4351; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
4352; GFX10-GISEL-NEXT:    s_endpgm
4353;
4354; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg32_undef:
4355; GFX11-SDAG:       ; %bb.0:
4356; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
4357; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
4358; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4359; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
4360; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
4361; GFX11-SDAG-NEXT:    global_load_b32 v1, v0, s[2:3]
4362; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
4363; GFX11-SDAG-NEXT:    v_pk_sub_u16 v1, v1, 32
4364; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
4365; GFX11-SDAG-NEXT:    s_endpgm
4366;
4367; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg32_undef:
4368; GFX11-GISEL:       ; %bb.0:
4369; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
4370; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
4371; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4372; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
4373; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
4374; GFX11-GISEL-NEXT:    global_load_b32 v1, v0, s[2:3]
4375; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
4376; GFX11-GISEL-NEXT:    v_pk_add_u16 v1, 0xffffffe0, v1
4377; GFX11-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
4378; GFX11-GISEL-NEXT:    s_endpgm
4379  %tid = call i32 @llvm.amdgcn.workitem.id.x()
4380  %tid.ext = sext i32 %tid to i64
4381  %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
4382  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
4383  %x = load <2 x i16>, ptr addrspace(1) %gep
  ; Operation under test: lane 0 addend is -32, lane 1 addend is undef.
4384  %result = add <2 x i16> %x, <i16 -32, i16 undef>
4385  store <2 x i16> %result, ptr addrspace(1) %gep.out
4386  ret void
4387}
4388
; AMDGPU intrinsic called by the kernels in this file; its i32 result is
; sign-extended and used as the element index into %in and %out.
4389declare i32 @llvm.amdgcn.workitem.id.x() #1
4390
; Attribute group #0 is attached to the kernel definitions.
4391attributes #0 = { nounwind }
; Attribute group #1 is attached to the intrinsic declaration above.
4392attributes #1 = { nounwind readnone }
4393