; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX7 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx801 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s

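; Test lowering of the llvm.usub.with.overflow and llvm.ssub.with.overflow
; intrinsics through GlobalISel. Each function consumes both results of the
; intrinsic by subtracting the zero-extended overflow bit from the arithmetic
; result.
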
define i32 @v_usubo_i32(i32 %a, i32 %b) {
; GFX7-LABEL: v_usubo_i32:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
; GFX7-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubo_i32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v1
; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v1
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_usubo_i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v1
; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %usubo = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
  %sub = extractvalue {i32, i1} %usubo, 0
  %of = extractvalue {i32, i1} %usubo, 1
  %of.zext = zext i1 %of to i32
  %ret = sub i32 %sub, %of.zext
  ret i32 %ret
}

define i64 @v_usubo_i64(i64 %a, i64 %b) {
; GFX7-LABEL: v_usubo_i64:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
; GFX7-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
; GFX7-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
; GFX7-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubo_i64:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_usubo_i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
; GFX9-NEXT:    v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %usubo = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
  %sub = extractvalue {i64, i1} %usubo, 0
  %of = extractvalue {i64, i1} %usubo, 1
  %of.zext = zext i1 %of to i64
  %ret = sub i64 %sub, %of.zext
  ret i64 %ret
}

define i8 @v_usubo_i8(i8 %a, i8 %b) {
; GFX7-LABEL: v_usubo_i8:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_and_b32_e32 v0, 0xff, v0
; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v1
; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v0
; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
; GFX7-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubo_i8:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_and_b32_e32 v0, 0xff, v0
; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v1
; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v1
; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v0
; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v1
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_usubo_i8:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_sub_u32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-NEXT:    v_cmp_ne_u32_sdwa s[4:5], v0, v0 src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
; GFX9-NEXT:    v_sub_u16_e32 v0, v0, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %usubo = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a, i8 %b)
  %sub = extractvalue {i8, i1} %usubo, 0
  %of = extractvalue {i8, i1} %usubo, 1
  %of.zext = zext i1 %of to i8
  %ret = sub i8 %sub, %of.zext
  ret i8 %ret
}

define i7 @v_usubo_i7(i7 %a, i7 %b) {
; GFX7-LABEL: v_usubo_i7:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_and_b32_e32 v0, 0x7f, v0
; GFX7-NEXT:    v_and_b32_e32 v1, 0x7f, v1
; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
; GFX7-NEXT:    v_and_b32_e32 v1, 0x7f, v0
; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
; GFX7-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubo_i7:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_and_b32_e32 v0, 0x7f, v0
; GFX8-NEXT:    v_and_b32_e32 v1, 0x7f, v1
; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v1
; GFX8-NEXT:    v_and_b32_e32 v1, 0x7f, v0
; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v1
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_usubo_i7:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v0, 0x7f, v0
; GFX9-NEXT:    v_and_b32_e32 v1, 0x7f, v1
; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v1
; GFX9-NEXT:    v_and_b32_e32 v1, 0x7f, v0
; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX9-NEXT:    v_sub_u16_e32 v0, v0, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %usubo = call {i7, i1} @llvm.usub.with.overflow.i7(i7 %a, i7 %b)
  %sub = extractvalue {i7, i1} %usubo, 0
  %of = extractvalue {i7, i1} %usubo, 1
  %of.zext = zext i1 %of to i7
  %ret = sub i7 %sub, %of.zext
  ret i7 %ret
}

define <2 x i32> @v_usubo_v2i32(<2 x i32> %a, <2 x i32> %b) {
; GFX7-LABEL: v_usubo_v2i32:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
; GFX7-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX7-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
; GFX7-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
; GFX7-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubo_v2i32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, v1, v3
; GFX8-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, v1, v3
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_usubo_v2i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT:    v_sub_co_u32_e32 v1, vcc, v1, v3
; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v2
; GFX9-NEXT:    v_sub_u32_e32 v1, v1, v3
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %usubo = call {<2 x i32>, <2 x i1>} @llvm.usub.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b)
  %sub = extractvalue {<2 x i32>, <2 x i1>} %usubo, 0
  %of = extractvalue {<2 x i32>, <2 x i1>} %usubo, 1
  %of.zext = zext <2 x i1> %of to <2 x i32>
  %ret = sub <2 x i32> %sub, %of.zext
  ret <2 x i32> %ret
}

define i32 @v_ssubo_i32(i32 %a, i32 %b) {
; GFX7-LABEL: v_ssubo_i32:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_sub_i32_e32 v2, vcc, v0, v1
; GFX7-NEXT:    v_cmp_lt_i32_e32 vcc, v2, v0
; GFX7-NEXT:    v_cmp_lt_i32_e64 s[4:5], 0, v1
; GFX7-NEXT:    s_xor_b64 s[4:5], s[4:5], vcc
; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v2, v0
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_ssubo_i32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v0, v1
; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, v2, v0
; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], 0, v1
; GFX8-NEXT:    s_xor_b64 s[4:5], s[4:5], vcc
; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v2, v0
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_ssubo_i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_sub_u32_e32 v2, v0, v1
; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, v2, v0
; GFX9-NEXT:    v_cmp_lt_i32_e64 s[4:5], 0, v1
; GFX9-NEXT:    s_xor_b64 s[4:5], s[4:5], vcc
; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX9-NEXT:    v_sub_u32_e32 v0, v2, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %ssubo = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %a, i32 %b)
  %sub = extractvalue {i32, i1} %ssubo, 0
  %of = extractvalue {i32, i1} %ssubo, 1
  %of.zext = zext i1 %of to i32
  %ret = sub i32 %sub, %of.zext
  ret i32 %ret
}

define i64 @v_ssubo_i64(i64 %a, i64 %b) {
; GFX7-LABEL: v_ssubo_i64:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_sub_i32_e32 v4, vcc, v0, v2
; GFX7-NEXT:    v_subb_u32_e32 v5, vcc, v1, v3, vcc
; GFX7-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
; GFX7-NEXT:    v_cmp_lt_i64_e64 s[4:5], 0, v[2:3]
; GFX7-NEXT:    s_xor_b64 s[4:5], s[4:5], vcc
; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v4, v0
; GFX7-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v5, vcc
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_ssubo_i64:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, v0, v2
; GFX8-NEXT:    v_subb_u32_e32 v5, vcc, v1, v3, vcc
; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
; GFX8-NEXT:    v_cmp_lt_i64_e64 s[4:5], 0, v[2:3]
; GFX8-NEXT:    s_xor_b64 s[4:5], s[4:5], vcc
; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v4, v0
; GFX8-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v5, vcc
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_ssubo_i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, v0, v2
; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v1, v3, vcc
; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
; GFX9-NEXT:    v_cmp_lt_i64_e64 s[4:5], 0, v[2:3]
; GFX9-NEXT:    s_xor_b64 s[4:5], s[4:5], vcc
; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v4, v0
; GFX9-NEXT:    v_subbrev_co_u32_e32 v1, vcc, 0, v5, vcc
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %ssubo = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %a, i64 %b)
  %sub = extractvalue {i64, i1} %ssubo, 0
  %of = extractvalue {i64, i1} %ssubo, 1
  %of.zext = zext i1 %of to i64
  %ret = sub i64 %sub, %of.zext
  ret i64 %ret
}

define <2 x i32> @v_ssubo_v2i32(<2 x i32> %a, <2 x i32> %b) {
; GFX7-LABEL: v_ssubo_v2i32:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_sub_i32_e32 v4, vcc, v0, v2
; GFX7-NEXT:    v_sub_i32_e32 v5, vcc, v1, v3
; GFX7-NEXT:    v_cmp_lt_i32_e32 vcc, v4, v0
; GFX7-NEXT:    v_cmp_lt_i32_e64 s[4:5], v5, v1
; GFX7-NEXT:    v_cmp_lt_i32_e64 s[6:7], 0, v2
; GFX7-NEXT:    v_cmp_lt_i32_e64 s[8:9], 0, v3
; GFX7-NEXT:    s_xor_b64 s[6:7], s[6:7], vcc
; GFX7-NEXT:    s_xor_b64 s[4:5], s[8:9], s[4:5]
; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[6:7]
; GFX7-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v4, v0
; GFX7-NEXT:    v_sub_i32_e32 v1, vcc, v5, v1
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_ssubo_v2i32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, v0, v2
; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, v1, v3
; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, v4, v0
; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v5, v1
; GFX8-NEXT:    v_cmp_lt_i32_e64 s[6:7], 0, v2
; GFX8-NEXT:    v_cmp_lt_i32_e64 s[8:9], 0, v3
; GFX8-NEXT:    s_xor_b64 s[6:7], s[6:7], vcc
; GFX8-NEXT:    s_xor_b64 s[4:5], s[8:9], s[4:5]
; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[6:7]
; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v4, v0
; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, v5, v1
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_ssubo_v2i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_sub_u32_e32 v4, v0, v2
; GFX9-NEXT:    v_sub_u32_e32 v5, v1, v3
; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, v4, v0
; GFX9-NEXT:    v_cmp_lt_i32_e64 s[4:5], v5, v1
; GFX9-NEXT:    v_cmp_lt_i32_e64 s[6:7], 0, v2
; GFX9-NEXT:    v_cmp_lt_i32_e64 s[8:9], 0, v3
; GFX9-NEXT:    s_xor_b64 s[6:7], s[6:7], vcc
; GFX9-NEXT:    s_xor_b64 s[4:5], s[8:9], s[4:5]
; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[6:7]
; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
; GFX9-NEXT:    v_sub_u32_e32 v0, v4, v0
; GFX9-NEXT:    v_sub_u32_e32 v1, v5, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %ssubo = call {<2 x i32>, <2 x i1>} @llvm.ssub.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b)
  %sub = extractvalue {<2 x i32>, <2 x i1>} %ssubo, 0
  %of = extractvalue {<2 x i32>, <2 x i1>} %ssubo, 1
  %of.zext = zext <2 x i1> %of to <2 x i32>
  %ret = sub <2 x i32> %sub, %of.zext
  ret <2 x i32> %ret
}

define i8 @v_ssubo_i8(i8 %a, i8 %b) {
; GFX7-LABEL: v_ssubo_i8:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_sub_i32_e32 v2, vcc, v0, v1
; GFX7-NEXT:    v_bfe_i32 v3, v2, 0, 8
; GFX7-NEXT:    v_bfe_i32 v0, v0, 0, 8
; GFX7-NEXT:    v_cmp_lt_i32_e32 vcc, v3, v0
; GFX7-NEXT:    v_bfe_i32 v0, v1, 0, 8
; GFX7-NEXT:    v_cmp_lt_i32_e64 s[4:5], 0, v0
; GFX7-NEXT:    s_xor_b64 s[4:5], s[4:5], vcc
; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v2, v0
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_ssubo_i8:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_sub_u16_e32 v2, v0, v1
; GFX8-NEXT:    v_bfe_i32 v3, v2, 0, 8
; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 8
; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, v3, v0
; GFX8-NEXT:    v_bfe_i32 v0, v1, 0, 8
; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], 0, v0
; GFX8-NEXT:    s_xor_b64 s[4:5], s[4:5], vcc
; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX8-NEXT:    v_sub_u16_e32 v0, v2, v0
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_ssubo_i8:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_sub_u16_e32 v2, v0, v1
; GFX9-NEXT:    v_cmp_lt_i32_sdwa s[4:5], sext(v2), sext(v0) src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_cmp_gt_i32_sdwa s[6:7], sext(v1), v0 src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    s_xor_b64 s[4:5], s[6:7], s[4:5]
; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX9-NEXT:    v_sub_u16_e32 v0, v2, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %ssubo = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a, i8 %b)
  %sub = extractvalue {i8, i1} %ssubo, 0
  %of = extractvalue {i8, i1} %ssubo, 1
  %of.zext = zext i1 %of to i8
  %ret = sub i8 %sub, %of.zext
  ret i8 %ret
}

define i7 @v_ssubo_i7(i7 %a, i7 %b) {
; GFX7-LABEL: v_ssubo_i7:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_sub_i32_e32 v2, vcc, v0, v1
; GFX7-NEXT:    v_bfe_i32 v3, v2, 0, 7
; GFX7-NEXT:    v_bfe_i32 v0, v0, 0, 7
; GFX7-NEXT:    v_cmp_lt_i32_e32 vcc, v3, v0
; GFX7-NEXT:    v_bfe_i32 v0, v1, 0, 7
; GFX7-NEXT:    v_cmp_lt_i32_e64 s[4:5], 0, v0
; GFX7-NEXT:    s_xor_b64 s[4:5], s[4:5], vcc
; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v2, v0
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_ssubo_i7:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_sub_u16_e32 v2, v0, v1
; GFX8-NEXT:    v_bfe_i32 v3, v2, 0, 7
; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 7
; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, v3, v0
; GFX8-NEXT:    v_bfe_i32 v0, v1, 0, 7
; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], 0, v0
; GFX8-NEXT:    s_xor_b64 s[4:5], s[4:5], vcc
; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX8-NEXT:    v_sub_u16_e32 v0, v2, v0
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_ssubo_i7:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_sub_u16_e32 v2, v0, v1
; GFX9-NEXT:    v_bfe_i32 v3, v2, 0, 7
; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 7
; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, v3, v0
; GFX9-NEXT:    v_bfe_i32 v0, v1, 0, 7
; GFX9-NEXT:    v_cmp_lt_i32_e64 s[4:5], 0, v0
; GFX9-NEXT:    s_xor_b64 s[4:5], s[4:5], vcc
; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX9-NEXT:    v_sub_u16_e32 v0, v2, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %ssubo = call {i7, i1} @llvm.ssub.with.overflow.i7(i7 %a, i7 %b)
  %sub = extractvalue {i7, i1} %ssubo, 0
  %of = extractvalue {i7, i1} %ssubo, 1
  %of.zext = zext i1 %of to i7
  %ret = sub i7 %sub, %of.zext
  ret i7 %ret
}

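; Uniform inputs: amdgpu_ps entry points whose i32/i64/v2i32 operands are
; passed inreg (in SGPRs), so the subtraction is selected to scalar (s_*)
; instructions in the checks below.
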
define amdgpu_ps i32 @s_usubo_i32(i32 inreg %a, i32 inreg %b) {
; GFX7-LABEL: s_usubo_i32:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_sub_u32 s0, s0, s1
; GFX7-NEXT:    s_cselect_b32 s1, 1, 0
; GFX7-NEXT:    s_sub_i32 s0, s0, s1
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_usubo_i32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_sub_u32 s0, s0, s1
; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
; GFX8-NEXT:    s_sub_i32 s0, s0, s1
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_usubo_i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_sub_u32 s0, s0, s1
; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
; GFX9-NEXT:    s_sub_i32 s0, s0, s1
; GFX9-NEXT:    ; return to shader part epilog
  %usubo = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
  %sub = extractvalue {i32, i1} %usubo, 0
  %of = extractvalue {i32, i1} %usubo, 1
  %of.zext = zext i1 %of to i32
  %ret = sub i32 %sub, %of.zext
  ret i32 %ret
}

define amdgpu_ps i64 @s_usubo_i64(i64 inreg %a, i64 inreg %b) {
; GFX7-LABEL: s_usubo_i64:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_sub_u32 s0, s0, s2
; GFX7-NEXT:    s_subb_u32 s1, s1, s3
; GFX7-NEXT:    s_cselect_b32 s2, 1, 0
; GFX7-NEXT:    s_sub_u32 s0, s0, s2
; GFX7-NEXT:    s_subb_u32 s1, s1, 0
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_usubo_i64:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_sub_u32 s0, s0, s2
; GFX8-NEXT:    s_subb_u32 s1, s1, s3
; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
; GFX8-NEXT:    s_sub_u32 s0, s0, s2
; GFX8-NEXT:    s_subb_u32 s1, s1, 0
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_usubo_i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_sub_u32 s0, s0, s2
; GFX9-NEXT:    s_subb_u32 s1, s1, s3
; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
; GFX9-NEXT:    s_sub_u32 s0, s0, s2
; GFX9-NEXT:    s_subb_u32 s1, s1, 0
; GFX9-NEXT:    ; return to shader part epilog
  %usubo = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
  %sub = extractvalue {i64, i1} %usubo, 0
  %of = extractvalue {i64, i1} %usubo, 1
  %of.zext = zext i1 %of to i64
  %ret = sub i64 %sub, %of.zext
  ret i64 %ret
}

define amdgpu_ps <2 x i32> @s_usubo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b) {
; GFX7-LABEL: s_usubo_v2i32:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_sub_u32 s0, s0, s2
; GFX7-NEXT:    s_cselect_b32 s2, 1, 0
; GFX7-NEXT:    s_sub_u32 s1, s1, s3
; GFX7-NEXT:    s_cselect_b32 s3, 1, 0
; GFX7-NEXT:    s_sub_i32 s0, s0, s2
; GFX7-NEXT:    s_sub_i32 s1, s1, s3
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_usubo_v2i32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_sub_u32 s0, s0, s2
; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
; GFX8-NEXT:    s_sub_u32 s1, s1, s3
; GFX8-NEXT:    s_cselect_b32 s3, 1, 0
; GFX8-NEXT:    s_sub_i32 s0, s0, s2
; GFX8-NEXT:    s_sub_i32 s1, s1, s3
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_usubo_v2i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_sub_u32 s0, s0, s2
; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
; GFX9-NEXT:    s_sub_u32 s1, s1, s3
; GFX9-NEXT:    s_cselect_b32 s3, 1, 0
; GFX9-NEXT:    s_sub_i32 s0, s0, s2
; GFX9-NEXT:    s_sub_i32 s1, s1, s3
; GFX9-NEXT:    ; return to shader part epilog
  %usubo = call {<2 x i32>, <2 x i1>} @llvm.usub.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b)
  %sub = extractvalue {<2 x i32>, <2 x i1>} %usubo, 0
  %of = extractvalue {<2 x i32>, <2 x i1>} %usubo, 1
  %of.zext = zext <2 x i1> %of to <2 x i32>
  %ret = sub <2 x i32> %sub, %of.zext
  ret <2 x i32> %ret
}

define i8 @s_usubo_i8(i8 %a, i8 %b) {
; GFX7-LABEL: s_usubo_i8:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_and_b32_e32 v0, 0xff, v0
; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v1
; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v0
; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
; GFX7-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: s_usubo_i8:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_and_b32_e32 v0, 0xff, v0
; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v1
; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v1
; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v0
; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v1
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: s_usubo_i8:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_sub_u32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-NEXT:    v_cmp_ne_u32_sdwa s[4:5], v0, v0 src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
; GFX9-NEXT:    v_sub_u16_e32 v0, v0, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %usubo = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a, i8 %b)
  %sub = extractvalue {i8, i1} %usubo, 0
  %of = extractvalue {i8, i1} %usubo, 1
  %of.zext = zext i1 %of to i8
  %ret = sub i8 %sub, %of.zext
  ret i8 %ret
}

define i7 @s_usubo_i7(i7 %a, i7 %b) {
; GFX7-LABEL: s_usubo_i7:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_and_b32_e32 v0, 0x7f, v0
; GFX7-NEXT:    v_and_b32_e32 v1, 0x7f, v1
; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
; GFX7-NEXT:    v_and_b32_e32 v1, 0x7f, v0
; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
; GFX7-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: s_usubo_i7:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_and_b32_e32 v0, 0x7f, v0
; GFX8-NEXT:    v_and_b32_e32 v1, 0x7f, v1
; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v1
; GFX8-NEXT:    v_and_b32_e32 v1, 0x7f, v0
; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v1
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: s_usubo_i7:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v0, 0x7f, v0
; GFX9-NEXT:    v_and_b32_e32 v1, 0x7f, v1
; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v1
; GFX9-NEXT:    v_and_b32_e32 v1, 0x7f, v0
; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX9-NEXT:    v_sub_u16_e32 v0, v0, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %usubo = call {i7, i1} @llvm.usub.with.overflow.i7(i7 %a, i7 %b)
  %sub = extractvalue {i7, i1} %usubo, 0
  %of = extractvalue {i7, i1} %usubo, 1
  %of.zext = zext i1 %of to i7
  %ret = sub i7 %sub, %of.zext
  ret i7 %ret
}

define amdgpu_ps i32 @s_ssubo_i32(i32 inreg %a, i32 inreg %b) {
; GFX7-LABEL: s_ssubo_i32:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_sub_i32 s2, s0, s1
; GFX7-NEXT:    s_cmp_lt_i32 s2, s0
; GFX7-NEXT:    s_cselect_b32 s0, 1, 0
; GFX7-NEXT:    s_cmp_gt_i32 s1, 0
; GFX7-NEXT:    s_cselect_b32 s1, 1, 0
; GFX7-NEXT:    s_xor_b32 s0, s1, s0
; GFX7-NEXT:    s_and_b32 s0, s0, 1
; GFX7-NEXT:    s_sub_i32 s0, s2, s0
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_ssubo_i32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_sub_i32 s2, s0, s1
; GFX8-NEXT:    s_cmp_lt_i32 s2, s0
; GFX8-NEXT:    s_cselect_b32 s0, 1, 0
; GFX8-NEXT:    s_cmp_gt_i32 s1, 0
; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
; GFX8-NEXT:    s_xor_b32 s0, s1, s0
; GFX8-NEXT:    s_and_b32 s0, s0, 1
; GFX8-NEXT:    s_sub_i32 s0, s2, s0
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_ssubo_i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_sub_i32 s2, s0, s1
; GFX9-NEXT:    s_cmp_lt_i32 s2, s0
; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
; GFX9-NEXT:    s_cmp_gt_i32 s1, 0
; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
; GFX9-NEXT:    s_xor_b32 s0, s1, s0
; GFX9-NEXT:    s_and_b32 s0, s0, 1
; GFX9-NEXT:    s_sub_i32 s0, s2, s0
; GFX9-NEXT:    ; return to shader part epilog
  %ssubo = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %a, i32 %b)
  %sub = extractvalue {i32, i1} %ssubo, 0
  %of = extractvalue {i32, i1} %ssubo, 1
  %of.zext = zext i1 %of to i32
  %ret = sub i32 %sub, %of.zext
  ret i32 %ret
}

define amdgpu_ps i64 @s_ssubo_i64(i64 inreg %a, i64 inreg %b) {
; GFX7-LABEL: s_ssubo_i64:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_sub_u32 s4, s0, s2
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    s_subb_u32 s5, s1, s3
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
; GFX7-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
; GFX7-NEXT:    v_mov_b32_e32 v1, s5
; GFX7-NEXT:    s_xor_b64 s[0:1], s[0:1], vcc
; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
; GFX7-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
; GFX7-NEXT:    v_readfirstlane_b32 s1, v1
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_ssubo_i64:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_sub_u32 s4, s0, s2
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    s_subb_u32 s5, s1, s3
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
; GFX8-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
; GFX8-NEXT:    v_mov_b32_e32 v1, s5
; GFX8-NEXT:    s_xor_b64 s[0:1], s[0:1], vcc
; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s4, v0
; GFX8-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_ssubo_i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_sub_u32 s4, s0, s2
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    s_subb_u32 s5, s1, s3
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
; GFX9-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
; GFX9-NEXT:    v_mov_b32_e32 v1, s5
; GFX9-NEXT:    s_xor_b64 s[0:1], s[0:1], vcc
; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s4, v0
; GFX9-NEXT:    v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
; GFX9-NEXT:    ; return to shader part epilog
  %ssubo = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %a, i64 %b)
  %sub = extractvalue {i64, i1} %ssubo, 0
  %of = extractvalue {i64, i1} %ssubo, 1
  %of.zext = zext i1 %of to i64
  %ret = sub i64 %sub, %of.zext
  ret i64 %ret
}

define amdgpu_ps <2 x i32> @s_ssubo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b) {
; GFX7-LABEL: s_ssubo_v2i32:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_sub_i32 s4, s0, s2
; GFX7-NEXT:    s_sub_i32 s5, s1, s3
; GFX7-NEXT:    s_cmp_lt_i32 s4, s0
; GFX7-NEXT:    s_cselect_b32 s0, 1, 0
; GFX7-NEXT:    s_cmp_lt_i32 s5, s1
; GFX7-NEXT:    s_cselect_b32 s1, 1, 0
; GFX7-NEXT:    s_cmp_gt_i32 s2, 0
; GFX7-NEXT:    s_cselect_b32 s2, 1, 0
; GFX7-NEXT:    s_cmp_gt_i32 s3, 0
; GFX7-NEXT:    s_cselect_b32 s3, 1, 0
; GFX7-NEXT:    s_xor_b32 s0, s2, s0
; GFX7-NEXT:    s_xor_b32 s1, s3, s1
; GFX7-NEXT:    s_and_b32 s0, s0, 1
; GFX7-NEXT:    s_and_b32 s1, s1, 1
; GFX7-NEXT:    s_sub_i32 s0, s4, s0
; GFX7-NEXT:    s_sub_i32 s1, s5, s1
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_ssubo_v2i32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_sub_i32 s4, s0, s2
; GFX8-NEXT:    s_sub_i32 s5, s1, s3
; GFX8-NEXT:    s_cmp_lt_i32 s4, s0
; GFX8-NEXT:    s_cselect_b32 s0, 1, 0
; GFX8-NEXT:    s_cmp_lt_i32 s5, s1
; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
; GFX8-NEXT:    s_cmp_gt_i32 s2, 0
; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
; GFX8-NEXT:    s_cmp_gt_i32 s3, 0
; GFX8-NEXT:    s_cselect_b32 s3, 1, 0
; GFX8-NEXT:    s_xor_b32 s0, s2, s0
; GFX8-NEXT:    s_xor_b32 s1, s3, s1
; GFX8-NEXT:    s_and_b32 s0, s0, 1
; GFX8-NEXT:    s_and_b32 s1, s1, 1
; GFX8-NEXT:    s_sub_i32 s0, s4, s0
; GFX8-NEXT:    s_sub_i32 s1, s5, s1
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_ssubo_v2i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_sub_i32 s4, s0, s2
; GFX9-NEXT:    s_sub_i32 s5, s1, s3
; GFX9-NEXT:    s_cmp_lt_i32 s4, s0
; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
; GFX9-NEXT:    s_cmp_lt_i32 s5, s1
; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
; GFX9-NEXT:    s_cmp_gt_i32 s2, 0
; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
; GFX9-NEXT:    s_cmp_gt_i32 s3, 0
; GFX9-NEXT:    s_cselect_b32 s3, 1, 0
; GFX9-NEXT:    s_xor_b32 s0, s2, s0
; GFX9-NEXT:    s_xor_b32 s1, s3, s1
; GFX9-NEXT:    s_and_b32 s0, s0, 1
; GFX9-NEXT:    s_and_b32 s1, s1, 1
; GFX9-NEXT:    s_sub_i32 s0, s4, s0
; GFX9-NEXT:    s_sub_i32 s1, s5, s1
; GFX9-NEXT:    ; return to shader part epilog
  %ssubo = call {<2 x i32>, <2 x i1>} @llvm.ssub.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b)
  %sub = extractvalue {<2 x i32>, <2 x i1>} %ssubo, 0
  %of = extractvalue {<2 x i32>, <2 x i1>} %ssubo, 1
  %of.zext = zext <2 x i1> %of to <2 x i32>
  %ret = sub <2 x i32> %sub, %of.zext
  ret <2 x i32> %ret
}

define i8 @s_ssubo_i8(i8 %a, i8 %b) {
; GFX7-LABEL: s_ssubo_i8:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_sub_i32_e32 v2, vcc, v0, v1
; GFX7-NEXT:    v_bfe_i32 v3, v2, 0, 8
; GFX7-NEXT:    v_bfe_i32 v0, v0, 0, 8
; GFX7-NEXT:    v_cmp_lt_i32_e32 vcc, v3, v0
; GFX7-NEXT:    v_bfe_i32 v0, v1, 0, 8
; GFX7-NEXT:    v_cmp_lt_i32_e64 s[4:5], 0, v0
; GFX7-NEXT:    s_xor_b64 s[4:5], s[4:5], vcc
; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v2, v0
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: s_ssubo_i8:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_sub_u16_e32 v2, v0, v1
; GFX8-NEXT:    v_bfe_i32 v3, v2, 0, 8
; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 8
; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, v3, v0
; GFX8-NEXT:    v_bfe_i32 v0, v1, 0, 8
; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], 0, v0
; GFX8-NEXT:    s_xor_b64 s[4:5], s[4:5], vcc
; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX8-NEXT:    v_sub_u16_e32 v0, v2, v0
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: s_ssubo_i8:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_sub_u16_e32 v2, v0, v1
; GFX9-NEXT:    v_cmp_lt_i32_sdwa s[4:5], sext(v2), sext(v0) src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_cmp_gt_i32_sdwa s[6:7], sext(v1), v0 src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    s_xor_b64 s[4:5], s[6:7], s[4:5]
; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX9-NEXT:    v_sub_u16_e32 v0, v2, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %ssubo = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a, i8 %b)
  %sub = extractvalue {i8, i1} %ssubo, 0
  %of = extractvalue {i8, i1} %ssubo, 1
  %of.zext = zext i1 %of to i8
  %ret = sub i8 %sub, %of.zext
  ret i8 %ret
}

define i7 @s_ssubo_i7(i7 %a, i7 %b) {
; GFX7-LABEL: s_ssubo_i7:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_sub_i32_e32 v2, vcc, v0, v1
; GFX7-NEXT:    v_bfe_i32 v3, v2, 0, 7
; GFX7-NEXT:    v_bfe_i32 v0, v0, 0, 7
; GFX7-NEXT:    v_cmp_lt_i32_e32 vcc, v3, v0
; GFX7-NEXT:    v_bfe_i32 v0, v1, 0, 7
; GFX7-NEXT:    v_cmp_lt_i32_e64 s[4:5], 0, v0
; GFX7-NEXT:    s_xor_b64 s[4:5], s[4:5], vcc
; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v2, v0
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: s_ssubo_i7:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_sub_u16_e32 v2, v0, v1
; GFX8-NEXT:    v_bfe_i32 v3, v2, 0, 7
; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 7
; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, v3, v0
; GFX8-NEXT:    v_bfe_i32 v0, v1, 0, 7
; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], 0, v0
; GFX8-NEXT:    s_xor_b64 s[4:5], s[4:5], vcc
; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX8-NEXT:    v_sub_u16_e32 v0, v2, v0
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: s_ssubo_i7:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_sub_u16_e32 v2, v0, v1
; GFX9-NEXT:    v_bfe_i32 v3, v2, 0, 7
; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 7
; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, v3, v0
; GFX9-NEXT:    v_bfe_i32 v0, v1, 0, 7
; GFX9-NEXT:    v_cmp_lt_i32_e64 s[4:5], 0, v0
; GFX9-NEXT:    s_xor_b64 s[4:5], s[4:5], vcc
; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX9-NEXT:    v_sub_u16_e32 v0, v2, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %ssubo = call {i7, i1} @llvm.ssub.with.overflow.i7(i7 %a, i7 %b)
  %sub = extractvalue {i7, i1} %ssubo, 0
  %of = extractvalue {i7, i1} %ssubo, 1
  %of.zext = zext i1 %of to i7
  %ret = sub i7 %sub, %of.zext
  ret i7 %ret
}

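; Mixed inputs: a uniform (inreg) first operand combined with a divergent
; VGPR second operand.
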
define amdgpu_ps i32 @usubo_i32_sv(i32 inreg %a, i32 %b) {
; GFX7-LABEL: usubo_i32_sv:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
; GFX7-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: usubo_i32_sv:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v1
; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: usubo_i32_sv:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s0, v0
; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v1
; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
; GFX9-NEXT:    ; return to shader part epilog
  %usubo = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
  %sub = extractvalue {i32, i1} %usubo, 0
  %of = extractvalue {i32, i1} %usubo, 1
  %of.zext = zext i1 %of to i32
  %ret = sub i32 %sub, %of.zext
  ret i32 %ret
}

define amdgpu_ps i16 @usubo_i16_sv(i16 inreg %a, i16 %b) {
; GFX7-LABEL: usubo_i16_sv:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v0
; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
; GFX7-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: usubo_i16_sv:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v0
; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v1
; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: usubo_i16_sv:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX9-NEXT:    v_sub_u32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX9-NEXT:    v_cmp_ne_u32_sdwa s[0:1], v0, v0 src0_sel:DWORD src1_sel:WORD_0
; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
; GFX9-NEXT:    v_sub_u16_e32 v0, v0, v1
; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
; GFX9-NEXT:    ; return to shader part epilog
  %usubo = call {i16, i1} @llvm.usub.with.overflow.i16(i16 %a, i16 %b)
  %sub = extractvalue {i16, i1} %usubo, 0
  %of = extractvalue {i16, i1} %usubo, 1
  %of.zext = zext i1 %of to i16
  %ret = sub i16 %sub, %of.zext
  ret i16 %ret
}

define amdgpu_ps i32 @ssubo_i32_sv(i32 inreg %a, i32 %b) {
; GFX7-LABEL: ssubo_i32_sv:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    v_sub_i32_e32 v1, vcc, s0, v0
; GFX7-NEXT:    v_cmp_gt_i32_e32 vcc, s0, v1
; GFX7-NEXT:    v_cmp_lt_i32_e64 s[0:1], 0, v0
; GFX7-NEXT:    s_xor_b64 s[0:1], s[0:1], vcc
; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v1, v0
; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: ssubo_i32_sv:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, s0, v0
; GFX8-NEXT:    v_cmp_gt_i32_e32 vcc, s0, v1
; GFX8-NEXT:    v_cmp_lt_i32_e64 s[0:1], 0, v0
; GFX8-NEXT:    s_xor_b64 s[0:1], s[0:1], vcc
; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v1, v0
; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: ssubo_i32_sv:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_sub_u32_e32 v1, s0, v0
; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, s0, v1
; GFX9-NEXT:    v_cmp_lt_i32_e64 s[0:1], 0, v0
; GFX9-NEXT:    s_xor_b64 s[0:1], s[0:1], vcc
; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; GFX9-NEXT:    v_sub_u32_e32 v0, v1, v0
; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
; GFX9-NEXT:    ; return to shader part epilog
  %ssubo = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %a, i32 %b)
  %sub = extractvalue {i32, i1} %ssubo, 0
  %of = extractvalue {i32, i1} %ssubo, 1
  %of.zext = zext i1 %of to i32
  %ret = sub i32 %sub, %of.zext
  ret i32 %ret
}

define amdgpu_ps i16 @ssubo_i16_sv(i16 inreg %a, i16 %b) {
; GFX7-LABEL: ssubo_i16_sv:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    v_sub_i32_e32 v1, vcc, s0, v0
; GFX7-NEXT:    v_bfe_i32 v2, v1, 0, 16
; GFX7-NEXT:    s_sext_i32_i16 s0, s0
; GFX7-NEXT:    v_bfe_i32 v0, v0, 0, 16
; GFX7-NEXT:    v_cmp_gt_i32_e32 vcc, s0, v2
; GFX7-NEXT:    v_cmp_lt_i32_e64 s[0:1], 0, v0
; GFX7-NEXT:    s_xor_b64 s[0:1], s[0:1], vcc
; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v1, v0
; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: ssubo_i16_sv:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    v_sub_u16_e32 v1, s0, v0
; GFX8-NEXT:    v_cmp_gt_i16_e32 vcc, s0, v1
; GFX8-NEXT:    v_cmp_lt_i16_e64 s[0:1], 0, v0
; GFX8-NEXT:    s_xor_b64 s[0:1], s[0:1], vcc
; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; GFX8-NEXT:    v_sub_u16_e32 v0, v1, v0
; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: ssubo_i16_sv:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_sub_u16_e32 v1, s0, v0
; GFX9-NEXT:    v_cmp_gt_i16_e32 vcc, s0, v1
; GFX9-NEXT:    v_cmp_lt_i16_e64 s[0:1], 0, v0
; GFX9-NEXT:    s_xor_b64 s[0:1], s[0:1], vcc
; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; GFX9-NEXT:    v_sub_u16_e32 v0, v1, v0
; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
; GFX9-NEXT:    ; return to shader part epilog
  %ssubo = call {i16, i1} @llvm.ssub.with.overflow.i16(i16 %a, i16 %b)
  %sub = extractvalue {i16, i1} %ssubo, 0
  %of = extractvalue {i16, i1} %ssubo, 1
  %of.zext = zext i1 %of to i16
  %ret = sub i16 %sub, %of.zext
  ret i16 %ret
}

declare {i7, i1} @llvm.usub.with.overflow.i7(i7 %a, i7 %b)
declare {i8, i1} @llvm.usub.with.overflow.i8(i8 %a, i8 %b)
declare {i16, i1} @llvm.usub.with.overflow.i16(i16 %a, i16 %b)
declare {i32, i1} @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
declare {i64, i1} @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
declare {<2 x i32>, <2 x i1>} @llvm.usub.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b)

declare {i7, i1} @llvm.ssub.with.overflow.i7(i7 %a, i7 %b)
declare {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a, i8 %b)
declare {i16, i1} @llvm.ssub.with.overflow.i16(i16 %a, i16 %b)
declare {i32, i1} @llvm.ssub.with.overflow.i32(i32 %a, i32 %b)
declare {i64, i1} @llvm.ssub.with.overflow.i64(i64 %a, i64 %b)
declare {<2 x i32>, <2 x i1>} @llvm.ssub.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b)
