xref: /llvm-project/llvm/test/CodeGen/AMDGPU/sub.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6 %s
3; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
4; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
5; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
6
; Work-item id intrinsic. The i16, packed-i16 and v_sub_i64 tests below call it
; to index their input pointers per work-item.
7declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone speculatable
8
; Subtracts two i32 kernel arguments (%a - %b) and stores the difference to %out.
; Every target's checks expect the subtract to stay on the scalar unit
; (s_sub_i32 on verde/fiji/gfx900, s_sub_co_i32 on gfx1200), followed by a copy
; of the result into a VGPR for the store.
9define amdgpu_kernel void @s_sub_i32(ptr addrspace(1) %out, i32 %a, i32 %b) {
10; GFX6-LABEL: s_sub_i32:
11; GFX6:       ; %bb.0:
12; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
13; GFX6-NEXT:    s_mov_b32 s7, 0xf000
14; GFX6-NEXT:    s_mov_b32 s6, -1
15; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
16; GFX6-NEXT:    s_mov_b32 s4, s0
17; GFX6-NEXT:    s_sub_i32 s0, s2, s3
18; GFX6-NEXT:    s_mov_b32 s5, s1
19; GFX6-NEXT:    v_mov_b32_e32 v0, s0
20; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
21; GFX6-NEXT:    s_endpgm
22;
23; GFX8-LABEL: s_sub_i32:
24; GFX8:       ; %bb.0:
25; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
26; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
27; GFX8-NEXT:    s_sub_i32 s2, s2, s3
28; GFX8-NEXT:    v_mov_b32_e32 v0, s0
29; GFX8-NEXT:    v_mov_b32_e32 v1, s1
30; GFX8-NEXT:    v_mov_b32_e32 v2, s2
31; GFX8-NEXT:    flat_store_dword v[0:1], v2
32; GFX8-NEXT:    s_endpgm
33;
34; GFX9-LABEL: s_sub_i32:
35; GFX9:       ; %bb.0:
36; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
37; GFX9-NEXT:    v_mov_b32_e32 v0, 0
38; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
39; GFX9-NEXT:    s_sub_i32 s2, s2, s3
40; GFX9-NEXT:    v_mov_b32_e32 v1, s2
41; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
42; GFX9-NEXT:    s_endpgm
43;
44; GFX12-LABEL: s_sub_i32:
45; GFX12:       ; %bb.0:
46; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
47; GFX12-NEXT:    s_wait_kmcnt 0x0
48; GFX12-NEXT:    s_sub_co_i32 s2, s2, s3
49; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
50; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
51; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
52; GFX12-NEXT:    s_endpgm
  ; Both operands are kernel arguments, so no vector load feeds the subtract.
53  %result = sub i32 %a, %b
54  store i32 %result, ptr addrspace(1) %out
55  ret void
56}
57
; Computes 1234 - %a, with the constant on the left-hand side and %a a kernel
; argument, and stores the result to %out. The checks expect the constant to
; appear as the literal 0x4d2 in the first source operand of a scalar subtract.
58define amdgpu_kernel void @s_sub_imm_i32(ptr addrspace(1) %out, i32 %a) {
59; GFX6-LABEL: s_sub_imm_i32:
60; GFX6:       ; %bb.0:
61; GFX6-NEXT:    s_load_dword s6, s[4:5], 0xb
62; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
63; GFX6-NEXT:    s_mov_b32 s3, 0xf000
64; GFX6-NEXT:    s_mov_b32 s2, -1
65; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
66; GFX6-NEXT:    s_sub_i32 s4, 0x4d2, s6
67; GFX6-NEXT:    v_mov_b32_e32 v0, s4
68; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
69; GFX6-NEXT:    s_endpgm
70;
71; GFX8-LABEL: s_sub_imm_i32:
72; GFX8:       ; %bb.0:
73; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x2c
74; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
75; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
76; GFX8-NEXT:    s_sub_i32 s2, 0x4d2, s2
77; GFX8-NEXT:    v_mov_b32_e32 v0, s0
78; GFX8-NEXT:    v_mov_b32_e32 v1, s1
79; GFX8-NEXT:    v_mov_b32_e32 v2, s2
80; GFX8-NEXT:    flat_store_dword v[0:1], v2
81; GFX8-NEXT:    s_endpgm
82;
83; GFX9-LABEL: s_sub_imm_i32:
84; GFX9:       ; %bb.0:
85; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
86; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
87; GFX9-NEXT:    v_mov_b32_e32 v0, 0
88; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
89; GFX9-NEXT:    s_sub_i32 s2, 0x4d2, s2
90; GFX9-NEXT:    v_mov_b32_e32 v1, s2
91; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
92; GFX9-NEXT:    s_endpgm
93;
94; GFX12-LABEL: s_sub_imm_i32:
95; GFX12:       ; %bb.0:
96; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
97; GFX12-NEXT:    s_wait_kmcnt 0x0
98; GFX12-NEXT:    s_sub_co_i32 s2, 0x4d2, s2
99; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
100; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
101; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
102; GFX12-NEXT:    s_endpgm
  ; 1234 is 0x4d2, the literal the checks above look for.
103  %result = sub i32 1234, %a
104  store i32 %result, ptr addrspace(1) %out
105  ret void
106}
107
; Loads two adjacent i32 values from %in (elements 0 and 1), subtracts the
; second from the first, and stores the difference to %out. The checks expect
; the two loads to be fetched with one 64-bit load and the subtract to be a
; single vector instruction.
108define amdgpu_kernel void @test_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
109; GFX6-LABEL: test_sub_i32:
110; GFX6:       ; %bb.0:
111; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
112; GFX6-NEXT:    s_mov_b32 s7, 0xf000
113; GFX6-NEXT:    s_mov_b32 s6, -1
114; GFX6-NEXT:    s_mov_b32 s10, s6
115; GFX6-NEXT:    s_mov_b32 s11, s7
116; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
117; GFX6-NEXT:    s_mov_b32 s8, s2
118; GFX6-NEXT:    s_mov_b32 s9, s3
119; GFX6-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
120; GFX6-NEXT:    s_mov_b32 s4, s0
121; GFX6-NEXT:    s_mov_b32 s5, s1
122; GFX6-NEXT:    s_waitcnt vmcnt(0)
123; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
124; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
125; GFX6-NEXT:    s_endpgm
126;
127; GFX8-LABEL: test_sub_i32:
128; GFX8:       ; %bb.0:
129; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
130; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
131; GFX8-NEXT:    v_mov_b32_e32 v0, s2
132; GFX8-NEXT:    v_mov_b32_e32 v1, s3
133; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
134; GFX8-NEXT:    v_mov_b32_e32 v2, s0
135; GFX8-NEXT:    v_mov_b32_e32 v3, s1
136; GFX8-NEXT:    s_waitcnt vmcnt(0)
137; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v1
138; GFX8-NEXT:    flat_store_dword v[2:3], v0
139; GFX8-NEXT:    s_endpgm
140;
141; GFX9-LABEL: test_sub_i32:
142; GFX9:       ; %bb.0:
143; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
144; GFX9-NEXT:    v_mov_b32_e32 v2, 0
145; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
146; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
147; GFX9-NEXT:    s_waitcnt vmcnt(0)
148; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v1
149; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
150; GFX9-NEXT:    s_endpgm
151;
152; GFX12-LABEL: test_sub_i32:
153; GFX12:       ; %bb.0:
154; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
155; GFX12-NEXT:    v_mov_b32_e32 v2, 0
156; GFX12-NEXT:    s_wait_kmcnt 0x0
157; GFX12-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
158; GFX12-NEXT:    s_wait_loadcnt 0x0
159; GFX12-NEXT:    v_sub_nc_u32_e32 v0, v0, v1
160; GFX12-NEXT:    global_store_b32 v2, v0, s[0:1]
161; GFX12-NEXT:    s_endpgm
  ; %b_ptr addresses the i32 immediately after %in.
162  %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
163  %a = load i32, ptr addrspace(1) %in
164  %b = load i32, ptr addrspace(1) %b_ptr
165  %result = sub i32 %a, %b
166  store i32 %result, ptr addrspace(1) %out
167  ret void
168}
169
; Loads one i32 from %in, computes 123 - value, and stores the result to %out.
; The checks expect a vector subtract whose first source operand is the
; constant 0x7b.
170define amdgpu_kernel void @test_sub_imm_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
171; GFX6-LABEL: test_sub_imm_i32:
172; GFX6:       ; %bb.0:
173; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
174; GFX6-NEXT:    s_mov_b32 s7, 0xf000
175; GFX6-NEXT:    s_mov_b32 s6, -1
176; GFX6-NEXT:    s_mov_b32 s10, s6
177; GFX6-NEXT:    s_mov_b32 s11, s7
178; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
179; GFX6-NEXT:    s_mov_b32 s8, s2
180; GFX6-NEXT:    s_mov_b32 s9, s3
181; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0
182; GFX6-NEXT:    s_mov_b32 s4, s0
183; GFX6-NEXT:    s_mov_b32 s5, s1
184; GFX6-NEXT:    s_waitcnt vmcnt(0)
185; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, 0x7b, v0
186; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
187; GFX6-NEXT:    s_endpgm
188;
189; GFX8-LABEL: test_sub_imm_i32:
190; GFX8:       ; %bb.0:
191; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
192; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
193; GFX8-NEXT:    v_mov_b32_e32 v0, s2
194; GFX8-NEXT:    v_mov_b32_e32 v1, s3
195; GFX8-NEXT:    flat_load_dword v2, v[0:1]
196; GFX8-NEXT:    v_mov_b32_e32 v0, s0
197; GFX8-NEXT:    v_mov_b32_e32 v1, s1
198; GFX8-NEXT:    s_waitcnt vmcnt(0)
199; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 0x7b, v2
200; GFX8-NEXT:    flat_store_dword v[0:1], v2
201; GFX8-NEXT:    s_endpgm
202;
203; GFX9-LABEL: test_sub_imm_i32:
204; GFX9:       ; %bb.0:
205; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
206; GFX9-NEXT:    v_mov_b32_e32 v0, 0
207; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
208; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
209; GFX9-NEXT:    s_waitcnt vmcnt(0)
210; GFX9-NEXT:    v_sub_u32_e32 v1, 0x7b, v1
211; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
212; GFX9-NEXT:    s_endpgm
213;
214; GFX12-LABEL: test_sub_imm_i32:
215; GFX12:       ; %bb.0:
216; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
217; GFX12-NEXT:    v_mov_b32_e32 v0, 0
218; GFX12-NEXT:    s_wait_kmcnt 0x0
219; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
220; GFX12-NEXT:    s_wait_loadcnt 0x0
221; GFX12-NEXT:    v_sub_nc_u32_e32 v1, 0x7b, v1
222; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
223; GFX12-NEXT:    s_endpgm
224  %a = load i32, ptr addrspace(1) %in
  ; 123 is 0x7b, the constant operand the checks above look for.
225  %result = sub i32 123, %a
226  store i32 %result, ptr addrspace(1) %out
227  ret void
228}
229
; Loads two adjacent <2 x i32> vectors from %in, subtracts them element-wise,
; and stores the <2 x i32> result to %out. The checks expect both vectors to
; arrive in one 128-bit load and the subtract to be split into two 32-bit
; vector subtracts.
229define amdgpu_kernel void @test_sub_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
230; GFX6-LABEL: test_sub_v2i32:
231; GFX6:       ; %bb.0:
232; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
233; GFX6-NEXT:    s_mov_b32 s7, 0xf000
234; GFX6-NEXT:    s_mov_b32 s6, -1
235; GFX6-NEXT:    s_mov_b32 s10, s6
236; GFX6-NEXT:    s_mov_b32 s11, s7
237; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
238; GFX6-NEXT:    s_mov_b32 s8, s2
239; GFX6-NEXT:    s_mov_b32 s9, s3
240; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
241; GFX6-NEXT:    s_mov_b32 s4, s0
242; GFX6-NEXT:    s_mov_b32 s5, s1
243; GFX6-NEXT:    s_waitcnt vmcnt(0)
244; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
245; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
246; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
247; GFX6-NEXT:    s_endpgm
248;
249; GFX8-LABEL: test_sub_v2i32:
250; GFX8:       ; %bb.0:
251; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
252; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
253; GFX8-NEXT:    v_mov_b32_e32 v0, s2
254; GFX8-NEXT:    v_mov_b32_e32 v1, s3
255; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
256; GFX8-NEXT:    v_mov_b32_e32 v4, s0
257; GFX8-NEXT:    v_mov_b32_e32 v5, s1
258; GFX8-NEXT:    s_waitcnt vmcnt(0)
259; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, v1, v3
260; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v2
261; GFX8-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
262; GFX8-NEXT:    s_endpgm
263;
264; GFX9-LABEL: test_sub_v2i32:
265; GFX9:       ; %bb.0:
266; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
267; GFX9-NEXT:    v_mov_b32_e32 v4, 0
268; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
269; GFX9-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
270; GFX9-NEXT:    s_waitcnt vmcnt(0)
271; GFX9-NEXT:    v_sub_u32_e32 v1, v1, v3
272; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v2
273; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
274; GFX9-NEXT:    s_endpgm
275;
276; GFX12-LABEL: test_sub_v2i32:
277; GFX12:       ; %bb.0:
278; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
279; GFX12-NEXT:    v_mov_b32_e32 v4, 0
280; GFX12-NEXT:    s_wait_kmcnt 0x0
281; GFX12-NEXT:    global_load_b128 v[0:3], v4, s[2:3]
282; GFX12-NEXT:    s_wait_loadcnt 0x0
283; GFX12-NEXT:    v_sub_nc_u32_e32 v1, v1, v3
284; GFX12-NEXT:    v_sub_nc_u32_e32 v0, v0, v2
285; GFX12-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
286; GFX12-NEXT:    s_endpgm
  ; %b_ptr addresses the <2 x i32> immediately after %in.
287  %b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1
288  %a = load <2 x i32>, ptr addrspace(1) %in
289  %b = load <2 x i32>, ptr addrspace(1) %b_ptr
290  %result = sub <2 x i32> %a, %b
291  store <2 x i32> %result, ptr addrspace(1) %out
292  ret void
293}
295
; Loads two adjacent <4 x i32> vectors from %in, subtracts them element-wise,
; and stores the <4 x i32> result to %out. The checks expect two 128-bit loads
; (the second at byte offset 16) and four 32-bit vector subtracts. On gfx900 and
; gfx1200 the offset-16 load lands in v[0:3], so the subtract operands appear
; swapped in register order while still computing %a - %b.
296define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
297; GFX6-LABEL: test_sub_v4i32:
298; GFX6:       ; %bb.0:
299; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
300; GFX6-NEXT:    s_mov_b32 s7, 0xf000
301; GFX6-NEXT:    s_mov_b32 s6, -1
302; GFX6-NEXT:    s_mov_b32 s10, s6
303; GFX6-NEXT:    s_mov_b32 s11, s7
304; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
305; GFX6-NEXT:    s_mov_b32 s8, s2
306; GFX6-NEXT:    s_mov_b32 s9, s3
307; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
308; GFX6-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
309; GFX6-NEXT:    s_mov_b32 s4, s0
310; GFX6-NEXT:    s_mov_b32 s5, s1
311; GFX6-NEXT:    s_waitcnt vmcnt(0)
312; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v7
313; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v6
314; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v5
315; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
316; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
317; GFX6-NEXT:    s_endpgm
318;
319; GFX8-LABEL: test_sub_v4i32:
320; GFX8:       ; %bb.0:
321; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
322; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
323; GFX8-NEXT:    v_mov_b32_e32 v0, s2
324; GFX8-NEXT:    v_mov_b32_e32 v1, s3
325; GFX8-NEXT:    s_add_u32 s2, s2, 16
326; GFX8-NEXT:    s_addc_u32 s3, s3, 0
327; GFX8-NEXT:    v_mov_b32_e32 v5, s3
328; GFX8-NEXT:    v_mov_b32_e32 v4, s2
329; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
330; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
331; GFX8-NEXT:    v_mov_b32_e32 v8, s0
332; GFX8-NEXT:    v_mov_b32_e32 v9, s1
333; GFX8-NEXT:    s_waitcnt vmcnt(0)
334; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, v3, v7
335; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v2, v6
336; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, v1, v5
337; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v4
338; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
339; GFX8-NEXT:    s_endpgm
340;
341; GFX9-LABEL: test_sub_v4i32:
342; GFX9:       ; %bb.0:
343; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
344; GFX9-NEXT:    v_mov_b32_e32 v8, 0
345; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
346; GFX9-NEXT:    global_load_dwordx4 v[0:3], v8, s[2:3] offset:16
347; GFX9-NEXT:    global_load_dwordx4 v[4:7], v8, s[2:3]
348; GFX9-NEXT:    s_waitcnt vmcnt(0)
349; GFX9-NEXT:    v_sub_u32_e32 v3, v7, v3
350; GFX9-NEXT:    v_sub_u32_e32 v2, v6, v2
351; GFX9-NEXT:    v_sub_u32_e32 v1, v5, v1
352; GFX9-NEXT:    v_sub_u32_e32 v0, v4, v0
353; GFX9-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
354; GFX9-NEXT:    s_endpgm
355;
356; GFX12-LABEL: test_sub_v4i32:
357; GFX12:       ; %bb.0:
358; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
359; GFX12-NEXT:    v_mov_b32_e32 v8, 0
360; GFX12-NEXT:    s_wait_kmcnt 0x0
361; GFX12-NEXT:    s_clause 0x1
362; GFX12-NEXT:    global_load_b128 v[0:3], v8, s[2:3] offset:16
363; GFX12-NEXT:    global_load_b128 v[4:7], v8, s[2:3]
364; GFX12-NEXT:    s_wait_loadcnt 0x0
365; GFX12-NEXT:    v_sub_nc_u32_e32 v3, v7, v3
366; GFX12-NEXT:    v_sub_nc_u32_e32 v2, v6, v2
367; GFX12-NEXT:    v_sub_nc_u32_e32 v1, v5, v1
368; GFX12-NEXT:    v_sub_nc_u32_e32 v0, v4, v0
369; GFX12-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
370; GFX12-NEXT:    s_endpgm
  ; %b_ptr addresses the <4 x i32> immediately after %in (byte offset 16).
371  %b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1
372  %a = load <4 x i32>, ptr addrspace(1) %in
373  %b = load <4 x i32>, ptr addrspace(1) %b_ptr
374  %result = sub <4 x i32> %a, %b
375  store <4 x i32> %result, ptr addrspace(1) %out
376  ret void
377}
378
; Per-work-item i16 subtract. Indexes %in by the work-item id, loads two
; adjacent i16 values with volatile loads, subtracts them, and stores the i16
; result to %out. The checks expect two separate 16-bit loads, each followed by
; its own wait. The subtract is a 32-bit v_sub_i32 on verde and a 16-bit
; subtract (v_sub_u16 / v_sub_nc_u16) on the other targets.
379define amdgpu_kernel void @test_sub_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
380; GFX6-LABEL: test_sub_i16:
381; GFX6:       ; %bb.0:
382; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
383; GFX6-NEXT:    s_mov_b32 s7, 0xf000
384; GFX6-NEXT:    s_mov_b32 s10, 0
385; GFX6-NEXT:    s_mov_b32 s11, s7
386; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
387; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
388; GFX6-NEXT:    s_mov_b64 s[8:9], s[2:3]
389; GFX6-NEXT:    v_mov_b32_e32 v1, 0
390; GFX6-NEXT:    buffer_load_ushort v2, v[0:1], s[8:11], 0 addr64 glc
391; GFX6-NEXT:    s_waitcnt vmcnt(0)
392; GFX6-NEXT:    buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64 offset:2 glc
393; GFX6-NEXT:    s_waitcnt vmcnt(0)
394; GFX6-NEXT:    s_mov_b32 s6, -1
395; GFX6-NEXT:    s_mov_b32 s4, s0
396; GFX6-NEXT:    s_mov_b32 s5, s1
397; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v2, v0
398; GFX6-NEXT:    buffer_store_short v0, off, s[4:7], 0
399; GFX6-NEXT:    s_endpgm
400;
401; GFX8-LABEL: test_sub_i16:
402; GFX8:       ; %bb.0:
403; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
404; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
405; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
406; GFX8-NEXT:    v_mov_b32_e32 v1, s3
407; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
408; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
409; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
410; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
411; GFX8-NEXT:    flat_load_ushort v4, v[0:1] glc
412; GFX8-NEXT:    s_waitcnt vmcnt(0)
413; GFX8-NEXT:    flat_load_ushort v2, v[2:3] glc
414; GFX8-NEXT:    s_waitcnt vmcnt(0)
415; GFX8-NEXT:    v_mov_b32_e32 v0, s0
416; GFX8-NEXT:    v_mov_b32_e32 v1, s1
417; GFX8-NEXT:    v_sub_u16_e32 v2, v4, v2
418; GFX8-NEXT:    flat_store_short v[0:1], v2
419; GFX8-NEXT:    s_endpgm
420;
421; GFX9-LABEL: test_sub_i16:
422; GFX9:       ; %bb.0:
423; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
424; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
425; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
426; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
427; GFX9-NEXT:    s_waitcnt vmcnt(0)
428; GFX9-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc
429; GFX9-NEXT:    s_waitcnt vmcnt(0)
430; GFX9-NEXT:    v_mov_b32_e32 v0, 0
431; GFX9-NEXT:    v_sub_u16_e32 v1, v1, v2
432; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
433; GFX9-NEXT:    s_endpgm
434;
435; GFX12-LABEL: test_sub_i16:
436; GFX12:       ; %bb.0:
437; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
438; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
439; GFX12-NEXT:    v_mov_b32_e32 v2, 0
440; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
441; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
442; GFX12-NEXT:    s_wait_kmcnt 0x0
443; GFX12-NEXT:    global_load_u16 v1, v0, s[2:3] scope:SCOPE_SYS
444; GFX12-NEXT:    s_wait_loadcnt 0x0
445; GFX12-NEXT:    global_load_u16 v0, v0, s[2:3] offset:2 scope:SCOPE_SYS
446; GFX12-NEXT:    s_wait_loadcnt 0x0
447; GFX12-NEXT:    v_sub_nc_u16 v0, v1, v0
448; GFX12-NEXT:    global_store_b16 v2, v0, s[0:1]
449; GFX12-NEXT:    s_endpgm
450  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; %gep is this work-item's element and %b_ptr is the i16 right after it.
451  %gep = getelementptr i16, ptr addrspace(1) %in, i32 %tid
452  %b_ptr = getelementptr i16, ptr addrspace(1) %gep, i32 1
  ; Both loads are volatile, unlike the loads in the other tests in this file
  ; section.
453  %a = load volatile i16, ptr addrspace(1) %gep
454  %b = load volatile i16, ptr addrspace(1) %b_ptr
455  %result = sub i16 %a, %b
456  store i16 %result, ptr addrspace(1) %out
457  ret void
458}
459
; Per-work-item <2 x i16> subtract. Indexes %in by the work-item id, loads two
; adjacent <2 x i16> vectors, subtracts them element-wise, and stores the
; result to %out. The checks expect one 64-bit load on every target. The
; subtract is expected as two 32-bit subtracts with shift/mask/or repacking on
; verde, v_sub_u16 plus an SDWA v_sub_u16 on fiji, and a single v_pk_sub_i16 on
; gfx900 and gfx1200.
460define amdgpu_kernel void @test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
461; GFX6-LABEL: test_sub_v2i16:
462; GFX6:       ; %bb.0:
463; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
464; GFX6-NEXT:    s_mov_b32 s7, 0xf000
465; GFX6-NEXT:    s_mov_b32 s10, 0
466; GFX6-NEXT:    s_mov_b32 s11, s7
467; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
468; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
469; GFX6-NEXT:    s_mov_b64 s[8:9], s[2:3]
470; GFX6-NEXT:    v_mov_b32_e32 v1, 0
471; GFX6-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64
472; GFX6-NEXT:    s_mov_b32 s6, -1
473; GFX6-NEXT:    s_mov_b32 s4, s0
474; GFX6-NEXT:    s_mov_b32 s5, s1
475; GFX6-NEXT:    s_waitcnt vmcnt(0)
476; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
477; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
478; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
479; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v2, v3
480; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
481; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
482; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
483; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
484; GFX6-NEXT:    s_endpgm
485;
486; GFX8-LABEL: test_sub_v2i16:
487; GFX8:       ; %bb.0:
488; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
489; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
490; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
491; GFX8-NEXT:    v_mov_b32_e32 v1, s3
492; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
493; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
494; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
495; GFX8-NEXT:    v_mov_b32_e32 v2, s0
496; GFX8-NEXT:    v_mov_b32_e32 v3, s1
497; GFX8-NEXT:    s_waitcnt vmcnt(0)
498; GFX8-NEXT:    v_sub_u16_e32 v4, v0, v1
499; GFX8-NEXT:    v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
500; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
501; GFX8-NEXT:    flat_store_dword v[2:3], v0
502; GFX8-NEXT:    s_endpgm
503;
504; GFX9-LABEL: test_sub_v2i16:
505; GFX9:       ; %bb.0:
506; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
507; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
508; GFX9-NEXT:    v_mov_b32_e32 v2, 0
509; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
510; GFX9-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
511; GFX9-NEXT:    s_waitcnt vmcnt(0)
512; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v1
513; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
514; GFX9-NEXT:    s_endpgm
515;
516; GFX12-LABEL: test_sub_v2i16:
517; GFX12:       ; %bb.0:
518; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
519; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
520; GFX12-NEXT:    v_mov_b32_e32 v2, 0
521; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
522; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
523; GFX12-NEXT:    s_wait_kmcnt 0x0
524; GFX12-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
525; GFX12-NEXT:    s_wait_loadcnt 0x0
526; GFX12-NEXT:    v_pk_sub_i16 v0, v0, v1
527; GFX12-NEXT:    global_store_b32 v2, v0, s[0:1]
528; GFX12-NEXT:    s_endpgm
529  %tid = call i32 @llvm.amdgcn.workitem.id.x()
530  %gep = getelementptr <2 x i16>, ptr addrspace(1) %in, i32 %tid
  ; The index on this getelementptr is typed i16, whereas the i32 and scalar i16
  ; tests earlier in the file use an i32 index.
531  %b_ptr = getelementptr <2 x i16>, ptr addrspace(1) %gep, i16 1
532  %a = load <2 x i16>, ptr addrspace(1) %gep
533  %b = load <2 x i16>, ptr addrspace(1) %b_ptr
534  %result = sub <2 x i16> %a, %b
535  store <2 x i16> %result, ptr addrspace(1) %out
536  ret void
537}
538
; Per-work-item <4 x i16> subtract. Indexes %in by the work-item id, loads two
; adjacent <4 x i16> vectors, subtracts them element-wise, and stores the
; result to %out. The checks expect one 128-bit load on every target. The
; subtract is expected as four 32-bit subtracts with repacking on verde, two
; v_sub_u16 / SDWA pairs on fiji, and two v_pk_sub_i16 on gfx900 and gfx1200.
539define amdgpu_kernel void @test_sub_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
540; GFX6-LABEL: test_sub_v4i16:
541; GFX6:       ; %bb.0:
542; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
543; GFX6-NEXT:    s_mov_b32 s7, 0xf000
544; GFX6-NEXT:    s_mov_b32 s10, 0
545; GFX6-NEXT:    s_mov_b32 s11, s7
546; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
547; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
548; GFX6-NEXT:    s_mov_b64 s[8:9], s[2:3]
549; GFX6-NEXT:    v_mov_b32_e32 v1, 0
550; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
551; GFX6-NEXT:    s_mov_b32 s6, -1
552; GFX6-NEXT:    s_mov_b32 s4, s0
553; GFX6-NEXT:    s_mov_b32 s5, s1
554; GFX6-NEXT:    s_waitcnt vmcnt(0)
555; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
556; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
557; GFX6-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
558; GFX6-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
559; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
560; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
561; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v5, v7
562; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v4, v6
563; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
564; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
565; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
566; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
567; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
568; GFX6-NEXT:    v_or_b32_e32 v0, v0, v3
569; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
570; GFX6-NEXT:    s_endpgm
571;
572; GFX8-LABEL: test_sub_v4i16:
573; GFX8:       ; %bb.0:
574; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
575; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
576; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
577; GFX8-NEXT:    v_mov_b32_e32 v1, s3
578; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
579; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
580; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
581; GFX8-NEXT:    v_mov_b32_e32 v4, s0
582; GFX8-NEXT:    v_mov_b32_e32 v5, s1
583; GFX8-NEXT:    s_waitcnt vmcnt(0)
584; GFX8-NEXT:    v_sub_u16_e32 v6, v1, v3
585; GFX8-NEXT:    v_sub_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
586; GFX8-NEXT:    v_sub_u16_e32 v3, v0, v2
587; GFX8-NEXT:    v_sub_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
588; GFX8-NEXT:    v_or_b32_e32 v1, v6, v1
589; GFX8-NEXT:    v_or_b32_e32 v0, v3, v0
590; GFX8-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
591; GFX8-NEXT:    s_endpgm
592;
593; GFX9-LABEL: test_sub_v4i16:
594; GFX9:       ; %bb.0:
595; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
596; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
597; GFX9-NEXT:    v_mov_b32_e32 v4, 0
598; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
599; GFX9-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
600; GFX9-NEXT:    s_waitcnt vmcnt(0)
601; GFX9-NEXT:    v_pk_sub_i16 v1, v1, v3
602; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v2
603; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
604; GFX9-NEXT:    s_endpgm
605;
606; GFX12-LABEL: test_sub_v4i16:
607; GFX12:       ; %bb.0:
608; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
609; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
610; GFX12-NEXT:    v_mov_b32_e32 v4, 0
611; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
612; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
613; GFX12-NEXT:    s_wait_kmcnt 0x0
614; GFX12-NEXT:    global_load_b128 v[0:3], v0, s[2:3]
615; GFX12-NEXT:    s_wait_loadcnt 0x0
616; GFX12-NEXT:    v_pk_sub_i16 v1, v1, v3
617; GFX12-NEXT:    v_pk_sub_i16 v0, v0, v2
618; GFX12-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
619; GFX12-NEXT:    s_endpgm
620  %tid = call i32 @llvm.amdgcn.workitem.id.x()
621  %gep = getelementptr <4 x i16>, ptr addrspace(1) %in, i32 %tid
  ; The index on this getelementptr is typed i16, whereas the i32 and scalar i16
  ; tests earlier in the file use an i32 index.
622  %b_ptr = getelementptr <4 x i16>, ptr addrspace(1) %gep, i16 1
623  %a = load <4 x i16>, ptr addrspace(1) %gep
624  %b = load <4 x i16>, ptr addrspace(1) %b_ptr
625  %result = sub <4 x i16> %a, %b
626  store <4 x i16> %result, ptr addrspace(1) %out
627  ret void
628}
629
; Subtracts two i64 kernel arguments (%a - %b) and stores the result to %out
; with 8-byte alignment. The checks expect a scalar s_sub_u32 / s_subb_u32
; borrow pair on verde, fiji and gfx900, and a single s_sub_nc_u64 on gfx1200.
630define amdgpu_kernel void @s_sub_i64(ptr addrspace(1) noalias %out, i64 %a, i64 %b) nounwind {
631; GFX6-LABEL: s_sub_i64:
632; GFX6:       ; %bb.0:
633; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
634; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
635; GFX6-NEXT:    s_mov_b32 s7, 0xf000
636; GFX6-NEXT:    s_mov_b32 s6, -1
637; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
638; GFX6-NEXT:    s_sub_u32 s0, s0, s2
639; GFX6-NEXT:    s_subb_u32 s1, s1, s3
640; GFX6-NEXT:    v_mov_b32_e32 v0, s0
641; GFX6-NEXT:    v_mov_b32_e32 v1, s1
642; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
643; GFX6-NEXT:    s_endpgm
644;
645; GFX8-LABEL: s_sub_i64:
646; GFX8:       ; %bb.0:
647; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
648; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
649; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
650; GFX8-NEXT:    s_sub_u32 s0, s0, s2
651; GFX8-NEXT:    s_subb_u32 s1, s1, s3
652; GFX8-NEXT:    v_mov_b32_e32 v0, s4
653; GFX8-NEXT:    v_mov_b32_e32 v3, s1
654; GFX8-NEXT:    v_mov_b32_e32 v1, s5
655; GFX8-NEXT:    v_mov_b32_e32 v2, s0
656; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
657; GFX8-NEXT:    s_endpgm
658;
659; GFX9-LABEL: s_sub_i64:
660; GFX9:       ; %bb.0:
661; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
662; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
663; GFX9-NEXT:    v_mov_b32_e32 v2, 0
664; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
665; GFX9-NEXT:    s_sub_u32 s0, s0, s2
666; GFX9-NEXT:    s_subb_u32 s1, s1, s3
667; GFX9-NEXT:    v_mov_b32_e32 v0, s0
668; GFX9-NEXT:    v_mov_b32_e32 v1, s1
669; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
670; GFX9-NEXT:    s_endpgm
671;
672; GFX12-LABEL: s_sub_i64:
673; GFX12:       ; %bb.0:
674; GFX12-NEXT:    s_clause 0x1
675; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
676; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
677; GFX12-NEXT:    s_wait_kmcnt 0x0
678; GFX12-NEXT:    s_sub_nc_u64 s[0:1], s[0:1], s[2:3]
679; GFX12-NEXT:    v_mov_b32_e32 v2, 0
680; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
681; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
682; GFX12-NEXT:    s_endpgm
  ; Both operands are kernel arguments, so no vector load feeds the subtract.
683  %result = sub i64 %a, %b
684  store i64 %result, ptr addrspace(1) %out, align 8
685  ret void
686}
687
; Per-work-item i64 subtract. Indexes %inA and %inB by the work-item id, loads
; one i64 from each, computes a - b, and stores the result to %out with 8-byte
; alignment. The checks expect two 64-bit loads and a two-instruction vector
; subtract-with-borrow chain through vcc (vcc_lo on gfx1200).
688define amdgpu_kernel void @v_sub_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) nounwind {
689; GFX6-LABEL: v_sub_i64:
690; GFX6:       ; %bb.0:
691; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
692; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
693; GFX6-NEXT:    s_mov_b32 s11, 0xf000
694; GFX6-NEXT:    s_mov_b32 s14, 0
695; GFX6-NEXT:    s_mov_b32 s15, s11
696; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
697; GFX6-NEXT:    v_mov_b32_e32 v1, 0
698; GFX6-NEXT:    s_mov_b64 s[6:7], s[14:15]
699; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
700; GFX6-NEXT:    s_mov_b64 s[12:13], s[2:3]
701; GFX6-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
702; GFX6-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[12:15], 0 addr64
703; GFX6-NEXT:    s_mov_b32 s10, -1
704; GFX6-NEXT:    s_mov_b32 s8, s0
705; GFX6-NEXT:    s_mov_b32 s9, s1
706; GFX6-NEXT:    s_waitcnt vmcnt(0)
707; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
708; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
709; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
710; GFX6-NEXT:    s_endpgm
711;
712; GFX8-LABEL: v_sub_i64:
713; GFX8:       ; %bb.0:
714; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
715; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
716; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
717; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
718; GFX8-NEXT:    v_mov_b32_e32 v1, s3
719; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
720; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
721; GFX8-NEXT:    v_mov_b32_e32 v3, s5
722; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s4, v2
723; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
724; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
725; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
726; GFX8-NEXT:    s_waitcnt vmcnt(0)
727; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v2
728; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
729; GFX8-NEXT:    v_mov_b32_e32 v3, s1
730; GFX8-NEXT:    v_mov_b32_e32 v2, s0
731; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
732; GFX8-NEXT:    s_endpgm
733;
734; GFX9-LABEL: v_sub_i64:
735; GFX9:       ; %bb.0:
736; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
737; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
738; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
739; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
740; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[2:3]
741; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[6:7]
742; GFX9-NEXT:    v_mov_b32_e32 v4, 0
743; GFX9-NEXT:    s_waitcnt vmcnt(0)
744; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
745; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
746; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
747; GFX9-NEXT:    s_endpgm
748;
749; GFX12-LABEL: v_sub_i64:
750; GFX12:       ; %bb.0:
751; GFX12-NEXT:    s_clause 0x1
752; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
753; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
754; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
755; GFX12-NEXT:    v_mov_b32_e32 v4, 0
756; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
757; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
758; GFX12-NEXT:    s_wait_kmcnt 0x0
759; GFX12-NEXT:    s_clause 0x1
760; GFX12-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
761; GFX12-NEXT:    global_load_b64 v[2:3], v2, s[4:5]
762; GFX12-NEXT:    s_wait_loadcnt 0x0
763; GFX12-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v2
764; GFX12-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
765; GFX12-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
766; GFX12-NEXT:    s_endpgm
  ; This call site also carries a readnone attribute, in addition to the
  ; attributes on the declaration.
767  %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
  ; The same work-item index selects the element in both input buffers.
768  %a_ptr = getelementptr i64, ptr addrspace(1) %inA, i32 %tid
769  %b_ptr = getelementptr i64, ptr addrspace(1) %inB, i32 %tid
770  %a = load i64, ptr addrspace(1) %a_ptr
771  %b = load i64, ptr addrspace(1) %b_ptr
772  %result = sub i64 %a, %b
773  store i64 %result, ptr addrspace(1) %out, align 8
774  ret void
775}
776
; <2 x i64> subtraction test. Each work-item loads one <2 x i64> from %inA and
; one from %inB, both indexed by workitem.id.x, subtracts them element-wise
; (%a - %b) and stores the result to %out. Note that %out itself is not indexed
; by the work-item id.
; The per-target assertions are autogenerated (see the NOTE on the first line
; of the file); regenerate them with utils/update_llc_test_checks.py rather
; than editing them by hand. In every target's checks each 64-bit element shows
; up as a low-half subtract followed by a high-half subtract that takes vcc as
; an extra input.
777define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) {
778; GFX6-LABEL: v_test_sub_v2i64:
779; GFX6:       ; %bb.0:
780; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
781; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
782; GFX6-NEXT:    s_mov_b32 s11, 0xf000
783; GFX6-NEXT:    s_mov_b32 s14, 0
784; GFX6-NEXT:    s_mov_b32 s15, s11
785; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
786; GFX6-NEXT:    v_mov_b32_e32 v5, 0
787; GFX6-NEXT:    s_mov_b64 s[6:7], s[14:15]
788; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
789; GFX6-NEXT:    s_mov_b64 s[12:13], s[2:3]
790; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64
791; GFX6-NEXT:    buffer_load_dwordx4 v[4:7], v[4:5], s[12:15], 0 addr64
792; GFX6-NEXT:    s_mov_b32 s10, -1
793; GFX6-NEXT:    s_mov_b32 s8, s0
794; GFX6-NEXT:    s_mov_b32 s9, s1
795; GFX6-NEXT:    s_waitcnt vmcnt(0)
796; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v6, v2
797; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v7, v3, vcc
798; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v4, v0
799; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v5, v1, vcc
800; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
801; GFX6-NEXT:    s_endpgm
802;
803; GFX8-LABEL: v_test_sub_v2i64:
804; GFX8:       ; %bb.0:
805; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
806; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
807; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 4, v0
808; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
809; GFX8-NEXT:    v_mov_b32_e32 v1, s3
810; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
811; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
812; GFX8-NEXT:    v_mov_b32_e32 v3, s5
813; GFX8-NEXT:    v_add_u32_e32 v4, vcc, s4, v2
814; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v3, vcc
815; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
816; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
817; GFX8-NEXT:    s_waitcnt vmcnt(0)
818; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v2, v6
819; GFX8-NEXT:    v_subb_u32_e32 v3, vcc, v3, v7, vcc
820; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v4
821; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
822; GFX8-NEXT:    v_mov_b32_e32 v5, s1
823; GFX8-NEXT:    v_mov_b32_e32 v4, s0
824; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
825; GFX8-NEXT:    s_endpgm
826;
827; GFX9-LABEL: v_test_sub_v2i64:
828; GFX9:       ; %bb.0:
829; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
830; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
831; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 4, v0
832; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
833; GFX9-NEXT:    global_load_dwordx4 v[0:3], v8, s[2:3]
834; GFX9-NEXT:    global_load_dwordx4 v[4:7], v8, s[6:7]
835; GFX9-NEXT:    v_mov_b32_e32 v8, 0
836; GFX9-NEXT:    s_waitcnt vmcnt(0)
837; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, v2, v6
838; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v7, vcc
839; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v4
840; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
841; GFX9-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
842; GFX9-NEXT:    s_endpgm
843;
844; GFX12-LABEL: v_test_sub_v2i64:
845; GFX12:       ; %bb.0:
846; GFX12-NEXT:    s_clause 0x1
847; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
848; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
849; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
850; GFX12-NEXT:    v_mov_b32_e32 v8, 0
851; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
852; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
853; GFX12-NEXT:    s_wait_kmcnt 0x0
854; GFX12-NEXT:    s_clause 0x1
855; GFX12-NEXT:    global_load_b128 v[0:3], v4, s[2:3]
856; GFX12-NEXT:    global_load_b128 v[4:7], v4, s[4:5]
857; GFX12-NEXT:    s_wait_loadcnt 0x0
858; GFX12-NEXT:    v_sub_co_u32 v2, vcc_lo, v2, v6
859; GFX12-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo
860; GFX12-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v4
861; GFX12-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
862; GFX12-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
863; GFX12-NEXT:    s_endpgm
  ; Both input addresses are derived from the work-item id.
864  %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
865  %a_ptr = getelementptr <2 x i64>, ptr addrspace(1) %inA, i32 %tid
866  %b_ptr = getelementptr <2 x i64>, ptr addrspace(1) %inB, i32 %tid
867  %a = load <2 x i64>, ptr addrspace(1) %a_ptr
868  %b = load <2 x i64>, ptr addrspace(1) %b_ptr
  ; Operation under test: element-wise 64-bit subtract.
869  %result = sub <2 x i64> %a, %b
  ; Stored to the base of %out (no per-work-item offset).
870  store <2 x i64> %result, ptr addrspace(1) %out
871  ret void
872}
873
; <4 x i64> subtraction test. Each work-item loads one <4 x i64> from %inA and
; one from %inB, both indexed by workitem.id.x, subtracts them element-wise
; (%a - %b) and stores the result to %out. Note that %out itself is not indexed
; by the work-item id.
; The per-target assertions are autogenerated (see the NOTE on the first line
; of the file); regenerate them with utils/update_llc_test_checks.py rather
; than editing them by hand. In the checks each 256-bit vector is accessed as
; two 128-bit halves (at offset 0 and offset 16), giving four loads and two
; stores per target, and each 64-bit element is a low-half subtract followed by
; a high-half subtract that takes vcc as an extra input.
874define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) {
875; GFX6-LABEL: v_test_sub_v4i64:
876; GFX6:       ; %bb.0:
877; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
878; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
879; GFX6-NEXT:    s_mov_b32 s11, 0xf000
880; GFX6-NEXT:    s_mov_b32 s14, 0
881; GFX6-NEXT:    s_mov_b32 s15, s11
882; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
883; GFX6-NEXT:    s_mov_b64 s[12:13], s[2:3]
884; GFX6-NEXT:    v_lshlrev_b32_e32 v12, 5, v0
885; GFX6-NEXT:    v_mov_b32_e32 v13, 0
886; GFX6-NEXT:    s_mov_b64 s[6:7], s[14:15]
887; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[12:13], s[12:15], 0 addr64
888; GFX6-NEXT:    buffer_load_dwordx4 v[4:7], v[12:13], s[4:7], 0 addr64
889; GFX6-NEXT:    buffer_load_dwordx4 v[8:11], v[12:13], s[4:7], 0 addr64 offset:16
890; GFX6-NEXT:    buffer_load_dwordx4 v[12:15], v[12:13], s[12:15], 0 addr64 offset:16
891; GFX6-NEXT:    s_mov_b32 s10, -1
892; GFX6-NEXT:    s_mov_b32 s8, s0
893; GFX6-NEXT:    s_mov_b32 s9, s1
894; GFX6-NEXT:    s_waitcnt vmcnt(2)
895; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v6
896; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v3, v7, vcc
897; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
898; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
899; GFX6-NEXT:    s_waitcnt vmcnt(0)
900; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v14, v10
901; GFX6-NEXT:    v_subb_u32_e32 v7, vcc, v15, v11, vcc
902; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v12, v8
903; GFX6-NEXT:    v_subb_u32_e32 v5, vcc, v13, v9, vcc
904; GFX6-NEXT:    buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16
905; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
906; GFX6-NEXT:    s_endpgm
907;
908; GFX8-LABEL: v_test_sub_v4i64:
909; GFX8:       ; %bb.0:
910; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
911; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
912; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
913; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
914; GFX8-NEXT:    v_mov_b32_e32 v1, s3
915; GFX8-NEXT:    v_add_u32_e32 v8, vcc, s2, v0
916; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
917; GFX8-NEXT:    v_mov_b32_e32 v1, s5
918; GFX8-NEXT:    v_add_u32_e32 v12, vcc, s4, v0
919; GFX8-NEXT:    v_addc_u32_e32 v13, vcc, 0, v1, vcc
920; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[8:9]
921; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[12:13]
922; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 16, v8
923; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
924; GFX8-NEXT:    v_add_u32_e32 v12, vcc, 16, v12
925; GFX8-NEXT:    v_addc_u32_e32 v13, vcc, 0, v13, vcc
926; GFX8-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
927; GFX8-NEXT:    flat_load_dwordx4 v[12:15], v[12:13]
928; GFX8-NEXT:    v_mov_b32_e32 v17, s1
929; GFX8-NEXT:    v_mov_b32_e32 v16, s0
930; GFX8-NEXT:    s_add_u32 s0, s0, 16
931; GFX8-NEXT:    s_addc_u32 s1, s1, 0
932; GFX8-NEXT:    s_waitcnt vmcnt(2)
933; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v2, v6
934; GFX8-NEXT:    v_subb_u32_e32 v3, vcc, v3, v7, vcc
935; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v4
936; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
937; GFX8-NEXT:    flat_store_dwordx4 v[16:17], v[0:3]
938; GFX8-NEXT:    s_waitcnt vmcnt(1)
939; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, v10, v14
940; GFX8-NEXT:    v_subb_u32_e32 v7, vcc, v11, v15, vcc
941; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, v8, v12
942; GFX8-NEXT:    v_mov_b32_e32 v0, s0
943; GFX8-NEXT:    v_subb_u32_e32 v5, vcc, v9, v13, vcc
944; GFX8-NEXT:    v_mov_b32_e32 v1, s1
945; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
946; GFX8-NEXT:    s_endpgm
947;
948; GFX9-LABEL: v_test_sub_v4i64:
949; GFX9:       ; %bb.0:
950; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
951; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
952; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 5, v0
953; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
954; GFX9-NEXT:    global_load_dwordx4 v[0:3], v16, s[2:3]
955; GFX9-NEXT:    global_load_dwordx4 v[4:7], v16, s[6:7]
956; GFX9-NEXT:    global_load_dwordx4 v[8:11], v16, s[2:3] offset:16
957; GFX9-NEXT:    global_load_dwordx4 v[12:15], v16, s[6:7] offset:16
958; GFX9-NEXT:    v_mov_b32_e32 v16, 0
959; GFX9-NEXT:    s_waitcnt vmcnt(2)
960; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, v2, v6
961; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v7, vcc
962; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v4
963; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
964; GFX9-NEXT:    s_waitcnt vmcnt(0)
965; GFX9-NEXT:    v_sub_co_u32_e32 v6, vcc, v10, v14
966; GFX9-NEXT:    v_subb_co_u32_e32 v7, vcc, v11, v15, vcc
967; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, v8, v12
968; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v9, v13, vcc
969; GFX9-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
970; GFX9-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
971; GFX9-NEXT:    s_endpgm
972;
973; GFX12-LABEL: v_test_sub_v4i64:
974; GFX12:       ; %bb.0:
975; GFX12-NEXT:    s_clause 0x1
976; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
977; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
978; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
979; GFX12-NEXT:    v_mov_b32_e32 v16, 0
980; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
981; GFX12-NEXT:    v_lshlrev_b32_e32 v12, 5, v0
982; GFX12-NEXT:    s_wait_kmcnt 0x0
983; GFX12-NEXT:    s_clause 0x3
984; GFX12-NEXT:    global_load_b128 v[0:3], v12, s[2:3]
985; GFX12-NEXT:    global_load_b128 v[4:7], v12, s[4:5]
986; GFX12-NEXT:    global_load_b128 v[8:11], v12, s[2:3] offset:16
987; GFX12-NEXT:    global_load_b128 v[12:15], v12, s[4:5] offset:16
988; GFX12-NEXT:    s_wait_loadcnt 0x2
989; GFX12-NEXT:    v_sub_co_u32 v2, vcc_lo, v2, v6
990; GFX12-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo
991; GFX12-NEXT:    s_wait_loadcnt 0x0
992; GFX12-NEXT:    v_sub_co_u32 v10, vcc_lo, v10, v14
993; GFX12-NEXT:    v_sub_co_ci_u32_e32 v11, vcc_lo, v11, v15, vcc_lo
994; GFX12-NEXT:    v_sub_co_u32 v8, vcc_lo, v8, v12
995; GFX12-NEXT:    v_sub_co_ci_u32_e32 v9, vcc_lo, v9, v13, vcc_lo
996; GFX12-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v4
997; GFX12-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
998; GFX12-NEXT:    s_clause 0x1
999; GFX12-NEXT:    global_store_b128 v16, v[8:11], s[0:1] offset:16
1000; GFX12-NEXT:    global_store_b128 v16, v[0:3], s[0:1]
1001; GFX12-NEXT:    s_endpgm
  ; Both input addresses are derived from the work-item id.
1002  %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
1003  %a_ptr = getelementptr <4 x i64>, ptr addrspace(1) %inA, i32 %tid
1004  %b_ptr = getelementptr <4 x i64>, ptr addrspace(1) %inB, i32 %tid
1005  %a = load <4 x i64>, ptr addrspace(1) %a_ptr
1006  %b = load <4 x i64>, ptr addrspace(1) %b_ptr
  ; Operation under test: element-wise 64-bit subtract.
1007  %result = sub <4 x i64> %a, %b
  ; Stored to the base of %out (no per-work-item offset).
1008  store <4 x i64> %result, ptr addrspace(1) %out
1009  ret void
1010}
1011
1012; Make sure the VOP3 form of sub is initially selected. Otherwise a pair
1013; of copies from/to VCC would be necessary.
1014
; Pixel-shader test in which an i32 subtract (%v - %s) sits between an inline
; asm statement that defines vcc and one that consumes it, so the value in vcc
; is live across the subtract.
; In the autogenerated checks, the subtract on the two older targets is the
; _e64 encoding and names s[0:1], not vcc, as its extra SGPR-pair operand; the
; two newer targets use a subtract with no such operand. None of the checked
; sequences contains a copy to or from vcc.
1015define amdgpu_ps void @sub_select_vop3(i32 inreg %s, i32 %v) {
1016; GFX6-LABEL: sub_select_vop3:
1017; GFX6:       ; %bb.0:
1018; GFX6-NEXT:    v_subrev_i32_e64 v0, s[0:1], s0, v0
1019; GFX6-NEXT:    s_mov_b32 m0, -1
1020; GFX6-NEXT:    ;;#ASMSTART
1021; GFX6-NEXT:    ; def vcc
1022; GFX6-NEXT:    ;;#ASMEND
1023; GFX6-NEXT:    ds_write_b32 v0, v0
1024; GFX6-NEXT:    ;;#ASMSTART
1025; GFX6-NEXT:    ; use vcc
1026; GFX6-NEXT:    ;;#ASMEND
1027; GFX6-NEXT:    s_endpgm
1028;
1029; GFX8-LABEL: sub_select_vop3:
1030; GFX8:       ; %bb.0:
1031; GFX8-NEXT:    v_subrev_u32_e64 v0, s[0:1], s0, v0
1032; GFX8-NEXT:    s_mov_b32 m0, -1
1033; GFX8-NEXT:    ;;#ASMSTART
1034; GFX8-NEXT:    ; def vcc
1035; GFX8-NEXT:    ;;#ASMEND
1036; GFX8-NEXT:    ds_write_b32 v0, v0
1037; GFX8-NEXT:    ;;#ASMSTART
1038; GFX8-NEXT:    ; use vcc
1039; GFX8-NEXT:    ;;#ASMEND
1040; GFX8-NEXT:    s_endpgm
1041;
1042; GFX9-LABEL: sub_select_vop3:
1043; GFX9:       ; %bb.0:
1044; GFX9-NEXT:    v_subrev_u32_e32 v0, s0, v0
1045; GFX9-NEXT:    ;;#ASMSTART
1046; GFX9-NEXT:    ; def vcc
1047; GFX9-NEXT:    ;;#ASMEND
1048; GFX9-NEXT:    ds_write_b32 v0, v0
1049; GFX9-NEXT:    ;;#ASMSTART
1050; GFX9-NEXT:    ; use vcc
1051; GFX9-NEXT:    ;;#ASMEND
1052; GFX9-NEXT:    s_endpgm
1053;
1054; GFX12-LABEL: sub_select_vop3:
1055; GFX12:       ; %bb.0:
1056; GFX12-NEXT:    v_subrev_nc_u32_e32 v0, s0, v0
1057; GFX12-NEXT:    ;;#ASMSTART
1058; GFX12-NEXT:    ; def vcc
1059; GFX12-NEXT:    ;;#ASMEND
1060; GFX12-NEXT:    ds_store_b32 v0, v0
1061; GFX12-NEXT:    ;;#ASMSTART
1062; GFX12-NEXT:    ; use vcc
1063; GFX12-NEXT:    ;;#ASMEND
1064; GFX12-NEXT:    s_endpgm
  ; Produce a 64-bit value constrained to the vcc register.
1065  %vcc = call i64 asm sideeffect "; def vcc", "={vcc}"()
  ; Operation under test.
1066  %sub = sub i32 %v, %s
  ; The store address is undef; presumably the store exists only to give %sub
  ; a use (NOTE(review): confirm intent).
1067  store i32 %sub, ptr addrspace(3) undef
  ; Consume the vcc value after the subtract.
1068  call void asm sideeffect "; use vcc", "{vcc}"(i64 %vcc)
1069  ret void
1070}
1071