xref: /llvm-project/llvm/test/CodeGen/AMDGPU/usubsat.ll (revision 0f3aeca16fc1de8d172fd14c908ebbd0fe61eeb4)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck --check-prefix=GFX6 %s
3; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck --check-prefix=GFX8 %s
4; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
5; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX10 %s
6; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX11,GFX11-TRUE16 %s
7; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX11,GFX11-FAKE16 %s
8
; Unsigned saturating subtract on a scalar i8 through @llvm.usub.sat.i8.
; Per the checks below: GFX6 masks both operands with 0xff and then emits
; v_max_u32 followed by a subtract of rhs; GFX8/GFX9 emit one clamped
; v_sub_u16_sdwa that reads BYTE_0 of each source; GFX10 and GFX11 mask both
; operands with 0xff and emit a clamped v_sub_nc_u16 (the true16 run operates
; on the v0.l / v0.h register halves).
9define i8 @v_usubsat_i8(i8 %lhs, i8 %rhs) {
10; GFX6-LABEL: v_usubsat_i8:
11; GFX6:       ; %bb.0:
12; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v1
14; GFX6-NEXT:    v_and_b32_e32 v0, 0xff, v0
15; GFX6-NEXT:    v_max_u32_e32 v0, v0, v1
16; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
17; GFX6-NEXT:    s_setpc_b64 s[30:31]
18;
19; GFX8-LABEL: v_usubsat_i8:
20; GFX8:       ; %bb.0:
21; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22; GFX8-NEXT:    v_sub_u16_sdwa v0, v0, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
23; GFX8-NEXT:    s_setpc_b64 s[30:31]
24;
25; GFX9-LABEL: v_usubsat_i8:
26; GFX9:       ; %bb.0:
27; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28; GFX9-NEXT:    v_sub_u16_sdwa v0, v0, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
29; GFX9-NEXT:    s_setpc_b64 s[30:31]
30;
31; GFX10-LABEL: v_usubsat_i8:
32; GFX10:       ; %bb.0:
33; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34; GFX10-NEXT:    v_and_b32_e32 v1, 0xff, v1
35; GFX10-NEXT:    v_and_b32_e32 v0, 0xff, v0
36; GFX10-NEXT:    v_sub_nc_u16 v0, v0, v1 clamp
37; GFX10-NEXT:    s_setpc_b64 s[30:31]
38;
39; GFX11-TRUE16-LABEL: v_usubsat_i8:
40; GFX11-TRUE16:       ; %bb.0:
41; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
42; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v1.l
43; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
44; GFX11-TRUE16-NEXT:    v_sub_nc_u16 v0.l, v0.l, v0.h clamp
45; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
46;
47; GFX11-FAKE16-LABEL: v_usubsat_i8:
48; GFX11-FAKE16:       ; %bb.0:
49; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
50; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
51; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
52; GFX11-FAKE16-NEXT:    v_sub_nc_u16 v0, v0, v1 clamp
53; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
54  %result = call i8 @llvm.usub.sat.i8(i8 %lhs, i8 %rhs)
55  ret i8 %result
56}
57
; Unsigned saturating subtract on a scalar i16 through @llvm.usub.sat.i16.
; Per the checks below: GFX6 masks both operands with 0xffff and emits
; v_max_u32 followed by a subtract of rhs; GFX8/GFX9 emit a single clamped
; v_sub_u16_e64; GFX10 and GFX11 emit a single clamped v_sub_nc_u16, with the
; true16 run first moving rhs (v1.l) into v0.h.
58define i16 @v_usubsat_i16(i16 %lhs, i16 %rhs) {
59; GFX6-LABEL: v_usubsat_i16:
60; GFX6:       ; %bb.0:
61; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
62; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
63; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
64; GFX6-NEXT:    v_max_u32_e32 v0, v0, v1
65; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
66; GFX6-NEXT:    s_setpc_b64 s[30:31]
67;
68; GFX8-LABEL: v_usubsat_i16:
69; GFX8:       ; %bb.0:
70; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
71; GFX8-NEXT:    v_sub_u16_e64 v0, v0, v1 clamp
72; GFX8-NEXT:    s_setpc_b64 s[30:31]
73;
74; GFX9-LABEL: v_usubsat_i16:
75; GFX9:       ; %bb.0:
76; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
77; GFX9-NEXT:    v_sub_u16_e64 v0, v0, v1 clamp
78; GFX9-NEXT:    s_setpc_b64 s[30:31]
79;
80; GFX10-LABEL: v_usubsat_i16:
81; GFX10:       ; %bb.0:
82; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
83; GFX10-NEXT:    v_sub_nc_u16 v0, v0, v1 clamp
84; GFX10-NEXT:    s_setpc_b64 s[30:31]
85;
86; GFX11-TRUE16-LABEL: v_usubsat_i16:
87; GFX11-TRUE16:       ; %bb.0:
88; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
89; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
90; GFX11-TRUE16-NEXT:    v_sub_nc_u16 v0.l, v0.l, v0.h clamp
91; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
92;
93; GFX11-FAKE16-LABEL: v_usubsat_i16:
94; GFX11-FAKE16:       ; %bb.0:
95; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
96; GFX11-FAKE16-NEXT:    v_sub_nc_u16 v0, v0, v1 clamp
97; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
98  %result = call i16 @llvm.usub.sat.i16(i16 %lhs, i16 %rhs)
99  ret i16 %result
100}
101
; Bit-hack spelling with no intrinsic call: (x ashr 15) & (x xor 32768).
; Per the checks below, GFX8 and newer emit a clamped 16-bit subtract of
; 0x8000 from x (GFX8/GFX9 materialize the constant in s4 first), while GFX6
; keeps a sign-extend / shift / xor / and sequence.
102define i16 @usubsat_as_bithack_i16(i16 %x) {
103; GFX6-LABEL: usubsat_as_bithack_i16:
104; GFX6:       ; %bb.0:
105; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
106; GFX6-NEXT:    v_bfe_i32 v1, v0, 0, 16
107; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 15, v1
108; GFX6-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
109; GFX6-NEXT:    v_and_b32_e32 v0, v1, v0
110; GFX6-NEXT:    s_setpc_b64 s[30:31]
111;
112; GFX8-LABEL: usubsat_as_bithack_i16:
113; GFX8:       ; %bb.0:
114; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
115; GFX8-NEXT:    s_movk_i32 s4, 0x8000
116; GFX8-NEXT:    v_sub_u16_e64 v0, v0, s4 clamp
117; GFX8-NEXT:    s_setpc_b64 s[30:31]
118;
119; GFX9-LABEL: usubsat_as_bithack_i16:
120; GFX9:       ; %bb.0:
121; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
122; GFX9-NEXT:    s_movk_i32 s4, 0x8000
123; GFX9-NEXT:    v_sub_u16_e64 v0, v0, s4 clamp
124; GFX9-NEXT:    s_setpc_b64 s[30:31]
125;
126; GFX10-LABEL: usubsat_as_bithack_i16:
127; GFX10:       ; %bb.0:
128; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
129; GFX10-NEXT:    v_sub_nc_u16 v0, v0, 0x8000 clamp
130; GFX10-NEXT:    s_setpc_b64 s[30:31]
131;
132; GFX11-TRUE16-LABEL: usubsat_as_bithack_i16:
133; GFX11-TRUE16:       ; %bb.0:
134; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
135; GFX11-TRUE16-NEXT:    v_sub_nc_u16 v0.l, v0.l, 0x8000 clamp
136; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
137;
138; GFX11-FAKE16-LABEL: usubsat_as_bithack_i16:
139; GFX11-FAKE16:       ; %bb.0:
140; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
141; GFX11-FAKE16-NEXT:    v_sub_nc_u16 v0, v0, 0x8000 clamp
142; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
143  %signsplat = ashr i16 %x, 15
144  %flipsign = xor i16 %x, 32768
145  %result = and i16 %signsplat, %flipsign
146  ret i16 %result
147}
148
; Bit-hack spelling that flips the sign bit with an add instead of an xor:
; (x ashr 15) & (x add 32768).
; Per the checks below, GFX8 and newer emit a clamped 16-bit subtract of
; 0x8000 from x, while GFX6 keeps a sign-extend / shift / add / and sequence.
149define i16 @usubsat_as_bithack2_i16(i16 %x) {
150; GFX6-LABEL: usubsat_as_bithack2_i16:
151; GFX6:       ; %bb.0:
152; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
153; GFX6-NEXT:    v_bfe_i32 v1, v0, 0, 16
154; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 15, v1
155; GFX6-NEXT:    v_add_i32_e32 v0, vcc, 0xffff8000, v0
156; GFX6-NEXT:    v_and_b32_e32 v0, v1, v0
157; GFX6-NEXT:    s_setpc_b64 s[30:31]
158;
159; GFX8-LABEL: usubsat_as_bithack2_i16:
160; GFX8:       ; %bb.0:
161; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
162; GFX8-NEXT:    s_movk_i32 s4, 0x8000
163; GFX8-NEXT:    v_sub_u16_e64 v0, v0, s4 clamp
164; GFX8-NEXT:    s_setpc_b64 s[30:31]
165;
166; GFX9-LABEL: usubsat_as_bithack2_i16:
167; GFX9:       ; %bb.0:
168; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
169; GFX9-NEXT:    s_movk_i32 s4, 0x8000
170; GFX9-NEXT:    v_sub_u16_e64 v0, v0, s4 clamp
171; GFX9-NEXT:    s_setpc_b64 s[30:31]
172;
173; GFX10-LABEL: usubsat_as_bithack2_i16:
174; GFX10:       ; %bb.0:
175; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
176; GFX10-NEXT:    v_sub_nc_u16 v0, v0, 0x8000 clamp
177; GFX10-NEXT:    s_setpc_b64 s[30:31]
178;
179; GFX11-TRUE16-LABEL: usubsat_as_bithack2_i16:
180; GFX11-TRUE16:       ; %bb.0:
181; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
182; GFX11-TRUE16-NEXT:    v_sub_nc_u16 v0.l, v0.l, 0x8000 clamp
183; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
184;
185; GFX11-FAKE16-LABEL: usubsat_as_bithack2_i16:
186; GFX11-FAKE16:       ; %bb.0:
187; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
188; GFX11-FAKE16-NEXT:    v_sub_nc_u16 v0, v0, 0x8000 clamp
189; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
190  %signsplat = ashr i16 %x, 15
191  %flipsign = add i16 %x, 32768
192  %result = and i16 %signsplat, %flipsign
193  ret i16 %result
194}
195
; Add-based bit-hack with the operands of the final `and` commuted:
; (x add 32768) & (x ashr 15).
; Per the checks below, GFX8 and newer still emit a clamped 16-bit subtract of
; 0x8000 from x; on GFX6 only the operand order of the final v_and differs
; from the non-commuted add form.
196define i16 @usubsat_as_bithack_commute_i16(i16 %x) {
197; GFX6-LABEL: usubsat_as_bithack_commute_i16:
198; GFX6:       ; %bb.0:
199; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
200; GFX6-NEXT:    v_bfe_i32 v1, v0, 0, 16
201; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 15, v1
202; GFX6-NEXT:    v_add_i32_e32 v0, vcc, 0xffff8000, v0
203; GFX6-NEXT:    v_and_b32_e32 v0, v0, v1
204; GFX6-NEXT:    s_setpc_b64 s[30:31]
205;
206; GFX8-LABEL: usubsat_as_bithack_commute_i16:
207; GFX8:       ; %bb.0:
208; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
209; GFX8-NEXT:    s_movk_i32 s4, 0x8000
210; GFX8-NEXT:    v_sub_u16_e64 v0, v0, s4 clamp
211; GFX8-NEXT:    s_setpc_b64 s[30:31]
212;
213; GFX9-LABEL: usubsat_as_bithack_commute_i16:
214; GFX9:       ; %bb.0:
215; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
216; GFX9-NEXT:    s_movk_i32 s4, 0x8000
217; GFX9-NEXT:    v_sub_u16_e64 v0, v0, s4 clamp
218; GFX9-NEXT:    s_setpc_b64 s[30:31]
219;
220; GFX10-LABEL: usubsat_as_bithack_commute_i16:
221; GFX10:       ; %bb.0:
222; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
223; GFX10-NEXT:    v_sub_nc_u16 v0, v0, 0x8000 clamp
224; GFX10-NEXT:    s_setpc_b64 s[30:31]
225;
226; GFX11-TRUE16-LABEL: usubsat_as_bithack_commute_i16:
227; GFX11-TRUE16:       ; %bb.0:
228; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
229; GFX11-TRUE16-NEXT:    v_sub_nc_u16 v0.l, v0.l, 0x8000 clamp
230; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
231;
232; GFX11-FAKE16-LABEL: usubsat_as_bithack_commute_i16:
233; GFX11-FAKE16:       ; %bb.0:
234; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
235; GFX11-FAKE16-NEXT:    v_sub_nc_u16 v0, v0, 0x8000 clamp
236; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
237  %signsplat = ashr i16 %x, 15
238  %flipsign = add i16 %x, 32768
239  %result = and i16 %flipsign, %signsplat
240  ret i16 %result
241}
242
; Unsigned saturating subtract on a scalar i32 through @llvm.usub.sat.i32.
; Per the checks below: GFX6 emits v_max_u32 followed by a subtract of rhs;
; GFX8 emits a clamped v_sub_u32_e64 that also names s[4:5]; GFX9 emits a
; clamped v_sub_u32_e64; GFX10 and GFX11 share the GFX10PLUS prefix and emit a
; clamped v_sub_nc_u32_e64.
243define i32 @v_usubsat_i32(i32 %lhs, i32 %rhs) {
244; GFX6-LABEL: v_usubsat_i32:
245; GFX6:       ; %bb.0:
246; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
247; GFX6-NEXT:    v_max_u32_e32 v0, v0, v1
248; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
249; GFX6-NEXT:    s_setpc_b64 s[30:31]
250;
251; GFX8-LABEL: v_usubsat_i32:
252; GFX8:       ; %bb.0:
253; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
254; GFX8-NEXT:    v_sub_u32_e64 v0, s[4:5], v0, v1 clamp
255; GFX8-NEXT:    s_setpc_b64 s[30:31]
256;
257; GFX9-LABEL: v_usubsat_i32:
258; GFX9:       ; %bb.0:
259; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
260; GFX9-NEXT:    v_sub_u32_e64 v0, v0, v1 clamp
261; GFX9-NEXT:    s_setpc_b64 s[30:31]
262;
263; GFX10PLUS-LABEL: v_usubsat_i32:
264; GFX10PLUS:       ; %bb.0:
265; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
266; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v0, v0, v1 clamp
267; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
268  %result = call i32 @llvm.usub.sat.i32(i32 %lhs, i32 %rhs)
269  ret i32 %result
270}
271
; Unsigned saturating subtract on <2 x i16> through @llvm.usub.sat.v2i16.
; Per the checks below: GFX6 handles each element separately (mask with
; 0xffff, v_max_u32, subtract) and repacks with a shift and an or; GFX8 emits
; a clamped v_sub_u16_sdwa on WORD_1 plus a clamped v_sub_u16_e64 and ors the
; halves; GFX9 and GFX10PLUS emit a single clamped v_pk_sub_u16.
272define <2 x i16> @v_usubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
273; GFX6-LABEL: v_usubsat_v2i16:
274; GFX6:       ; %bb.0:
275; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
276; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v3
277; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
278; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
279; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
280; GFX6-NEXT:    v_max_u32_e32 v1, v1, v3
281; GFX6-NEXT:    v_max_u32_e32 v0, v0, v2
282; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
283; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
284; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
285; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
286; GFX6-NEXT:    s_setpc_b64 s[30:31]
287;
288; GFX8-LABEL: v_usubsat_v2i16:
289; GFX8:       ; %bb.0:
290; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
291; GFX8-NEXT:    v_sub_u16_sdwa v2, v0, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
292; GFX8-NEXT:    v_sub_u16_e64 v0, v0, v1 clamp
293; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
294; GFX8-NEXT:    s_setpc_b64 s[30:31]
295;
296; GFX9-LABEL: v_usubsat_v2i16:
297; GFX9:       ; %bb.0:
298; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
299; GFX9-NEXT:    v_pk_sub_u16 v0, v0, v1 clamp
300; GFX9-NEXT:    s_setpc_b64 s[30:31]
301;
302; GFX10PLUS-LABEL: v_usubsat_v2i16:
303; GFX10PLUS:       ; %bb.0:
304; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
305; GFX10PLUS-NEXT:    v_pk_sub_u16 v0, v0, v1 clamp
306; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
307  %result = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
308  ret <2 x i16> %result
309}
310
; Unsigned saturating subtract on <3 x i16> through @llvm.usub.sat.v3i16.
; Per the checks below: GFX6 handles each of the three elements separately
; (mask, v_max_u32, subtract) and finishes with a shift/or and a
; v_alignbit_b32; GFX8 emits one clamped v_sub_u16_sdwa on WORD_1, two clamped
; v_sub_u16_e64 and one v_or; GFX9 and GFX10PLUS emit two clamped
; v_pk_sub_u16, in opposite register order from each other.
311define <3 x i16> @v_usubsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
312; GFX6-LABEL: v_usubsat_v3i16:
313; GFX6:       ; %bb.0:
314; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
315; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff, v4
316; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
317; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v3
318; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
319; GFX6-NEXT:    v_max_u32_e32 v1, v1, v6
320; GFX6-NEXT:    v_max_u32_e32 v0, v0, v3
321; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v4
322; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff, v5
323; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
324; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
325; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
326; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
327; GFX6-NEXT:    v_max_u32_e32 v1, v2, v5
328; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v1, v5
329; GFX6-NEXT:    v_alignbit_b32 v1, v2, v0, 16
330; GFX6-NEXT:    s_setpc_b64 s[30:31]
331;
332; GFX8-LABEL: v_usubsat_v3i16:
333; GFX8:       ; %bb.0:
334; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
335; GFX8-NEXT:    v_sub_u16_sdwa v4, v0, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
336; GFX8-NEXT:    v_sub_u16_e64 v0, v0, v2 clamp
337; GFX8-NEXT:    v_sub_u16_e64 v1, v1, v3 clamp
338; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
339; GFX8-NEXT:    s_setpc_b64 s[30:31]
340;
341; GFX9-LABEL: v_usubsat_v3i16:
342; GFX9:       ; %bb.0:
343; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
344; GFX9-NEXT:    v_pk_sub_u16 v1, v1, v3 clamp
345; GFX9-NEXT:    v_pk_sub_u16 v0, v0, v2 clamp
346; GFX9-NEXT:    s_setpc_b64 s[30:31]
347;
348; GFX10PLUS-LABEL: v_usubsat_v3i16:
349; GFX10PLUS:       ; %bb.0:
350; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
351; GFX10PLUS-NEXT:    v_pk_sub_u16 v0, v0, v2 clamp
352; GFX10PLUS-NEXT:    v_pk_sub_u16 v1, v1, v3 clamp
353; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
354  %result = call <3 x i16> @llvm.usub.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs)
355  ret <3 x i16> %result
356}
357
; Unsigned saturating subtract on <4 x i16> through @llvm.usub.sat.v4i16; the
; result is bitcast to <2 x float> and returned with that type.
; Per the checks below: GFX6 handles each of the four elements separately
; (mask, v_max_u32, subtract) and repacks each pair with a shift and an or;
; GFX8 emits a clamped v_sub_u16_sdwa plus a clamped v_sub_u16_e64 per dword
; and ors the halves; GFX9 and GFX10PLUS emit two clamped v_pk_sub_u16.
358define <2 x float> @v_usubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
359; GFX6-LABEL: v_usubsat_v4i16:
360; GFX6:       ; %bb.0:
361; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
362; GFX6-NEXT:    v_and_b32_e32 v9, 0xffff, v5
363; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
364; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v4
365; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
366; GFX6-NEXT:    v_max_u32_e32 v1, v1, v9
367; GFX6-NEXT:    v_max_u32_e32 v0, v0, v4
368; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v5
369; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff, v7
370; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v3
371; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff, v6
372; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
373; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
374; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
375; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
376; GFX6-NEXT:    v_max_u32_e32 v1, v2, v6
377; GFX6-NEXT:    v_max_u32_e32 v2, v3, v8
378; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v7
379; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v6
380; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
381; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
382; GFX6-NEXT:    s_setpc_b64 s[30:31]
383;
384; GFX8-LABEL: v_usubsat_v4i16:
385; GFX8:       ; %bb.0:
386; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
387; GFX8-NEXT:    v_sub_u16_sdwa v4, v0, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
388; GFX8-NEXT:    v_sub_u16_e64 v0, v0, v2 clamp
389; GFX8-NEXT:    v_sub_u16_sdwa v2, v1, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
390; GFX8-NEXT:    v_sub_u16_e64 v1, v1, v3 clamp
391; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
392; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
393; GFX8-NEXT:    s_setpc_b64 s[30:31]
394;
395; GFX9-LABEL: v_usubsat_v4i16:
396; GFX9:       ; %bb.0:
397; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
398; GFX9-NEXT:    v_pk_sub_u16 v0, v0, v2 clamp
399; GFX9-NEXT:    v_pk_sub_u16 v1, v1, v3 clamp
400; GFX9-NEXT:    s_setpc_b64 s[30:31]
401;
402; GFX10PLUS-LABEL: v_usubsat_v4i16:
403; GFX10PLUS:       ; %bb.0:
404; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
405; GFX10PLUS-NEXT:    v_pk_sub_u16 v0, v0, v2 clamp
406; GFX10PLUS-NEXT:    v_pk_sub_u16 v1, v1, v3 clamp
407; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
408  %result = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
409  %cast = bitcast <4 x i16> %result to <2 x float>
410  ret <2 x float> %cast
411}
412
; Unsigned saturating subtract on <2 x i32> through @llvm.usub.sat.v2i32.
; Per the checks below, every target handles the two elements independently:
; GFX6 with v_max_u32 followed by a subtract, GFX8/GFX9 with a clamped
; v_sub_u32_e64, and GFX10PLUS with a clamped v_sub_nc_u32_e64.
413define <2 x i32> @v_usubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
414; GFX6-LABEL: v_usubsat_v2i32:
415; GFX6:       ; %bb.0:
416; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
417; GFX6-NEXT:    v_max_u32_e32 v0, v0, v2
418; GFX6-NEXT:    v_max_u32_e32 v1, v1, v3
419; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
420; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
421; GFX6-NEXT:    s_setpc_b64 s[30:31]
422;
423; GFX8-LABEL: v_usubsat_v2i32:
424; GFX8:       ; %bb.0:
425; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
426; GFX8-NEXT:    v_sub_u32_e64 v0, s[4:5], v0, v2 clamp
427; GFX8-NEXT:    v_sub_u32_e64 v1, s[4:5], v1, v3 clamp
428; GFX8-NEXT:    s_setpc_b64 s[30:31]
429;
430; GFX9-LABEL: v_usubsat_v2i32:
431; GFX9:       ; %bb.0:
432; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
433; GFX9-NEXT:    v_sub_u32_e64 v0, v0, v2 clamp
434; GFX9-NEXT:    v_sub_u32_e64 v1, v1, v3 clamp
435; GFX9-NEXT:    s_setpc_b64 s[30:31]
436;
437; GFX10PLUS-LABEL: v_usubsat_v2i32:
438; GFX10PLUS:       ; %bb.0:
439; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
440; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v0, v0, v2 clamp
441; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v1, v1, v3 clamp
442; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
443  %result = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
444  ret <2 x i32> %result
445}
446
; Unsigned saturating subtract on <3 x i32> through @llvm.usub.sat.v3i32.
; Per the checks below, every target handles the three elements
; independently: GFX6 with v_max_u32 followed by a subtract, GFX8/GFX9 with a
; clamped v_sub_u32_e64, and GFX10PLUS with a clamped v_sub_nc_u32_e64.
447define <3 x i32> @v_usubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
448; GFX6-LABEL: v_usubsat_v3i32:
449; GFX6:       ; %bb.0:
450; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
451; GFX6-NEXT:    v_max_u32_e32 v0, v0, v3
452; GFX6-NEXT:    v_max_u32_e32 v1, v1, v4
453; GFX6-NEXT:    v_max_u32_e32 v2, v2, v5
454; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
455; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v4
456; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v5
457; GFX6-NEXT:    s_setpc_b64 s[30:31]
458;
459; GFX8-LABEL: v_usubsat_v3i32:
460; GFX8:       ; %bb.0:
461; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
462; GFX8-NEXT:    v_sub_u32_e64 v0, s[4:5], v0, v3 clamp
463; GFX8-NEXT:    v_sub_u32_e64 v1, s[4:5], v1, v4 clamp
464; GFX8-NEXT:    v_sub_u32_e64 v2, s[4:5], v2, v5 clamp
465; GFX8-NEXT:    s_setpc_b64 s[30:31]
466;
467; GFX9-LABEL: v_usubsat_v3i32:
468; GFX9:       ; %bb.0:
469; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
470; GFX9-NEXT:    v_sub_u32_e64 v0, v0, v3 clamp
471; GFX9-NEXT:    v_sub_u32_e64 v1, v1, v4 clamp
472; GFX9-NEXT:    v_sub_u32_e64 v2, v2, v5 clamp
473; GFX9-NEXT:    s_setpc_b64 s[30:31]
474;
475; GFX10PLUS-LABEL: v_usubsat_v3i32:
476; GFX10PLUS:       ; %bb.0:
477; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
478; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v0, v0, v3 clamp
479; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v1, v1, v4 clamp
480; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v2, v2, v5 clamp
481; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
482  %result = call <3 x i32> @llvm.usub.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs)
483  ret <3 x i32> %result
484}
485
; Unsigned saturating subtract on <4 x i32> through @llvm.usub.sat.v4i32.
; Per the checks below, every target handles the four elements independently:
; GFX6 with v_max_u32 followed by a subtract, GFX8/GFX9 with a clamped
; v_sub_u32_e64, and GFX10PLUS with a clamped v_sub_nc_u32_e64.
486define <4 x i32> @v_usubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
487; GFX6-LABEL: v_usubsat_v4i32:
488; GFX6:       ; %bb.0:
489; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
490; GFX6-NEXT:    v_max_u32_e32 v0, v0, v4
491; GFX6-NEXT:    v_max_u32_e32 v1, v1, v5
492; GFX6-NEXT:    v_max_u32_e32 v2, v2, v6
493; GFX6-NEXT:    v_max_u32_e32 v3, v3, v7
494; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
495; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v5
496; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v6
497; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v7
498; GFX6-NEXT:    s_setpc_b64 s[30:31]
499;
500; GFX8-LABEL: v_usubsat_v4i32:
501; GFX8:       ; %bb.0:
502; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
503; GFX8-NEXT:    v_sub_u32_e64 v0, s[4:5], v0, v4 clamp
504; GFX8-NEXT:    v_sub_u32_e64 v1, s[4:5], v1, v5 clamp
505; GFX8-NEXT:    v_sub_u32_e64 v2, s[4:5], v2, v6 clamp
506; GFX8-NEXT:    v_sub_u32_e64 v3, s[4:5], v3, v7 clamp
507; GFX8-NEXT:    s_setpc_b64 s[30:31]
508;
509; GFX9-LABEL: v_usubsat_v4i32:
510; GFX9:       ; %bb.0:
511; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
512; GFX9-NEXT:    v_sub_u32_e64 v0, v0, v4 clamp
513; GFX9-NEXT:    v_sub_u32_e64 v1, v1, v5 clamp
514; GFX9-NEXT:    v_sub_u32_e64 v2, v2, v6 clamp
515; GFX9-NEXT:    v_sub_u32_e64 v3, v3, v7 clamp
516; GFX9-NEXT:    s_setpc_b64 s[30:31]
517;
518; GFX10PLUS-LABEL: v_usubsat_v4i32:
519; GFX10PLUS:       ; %bb.0:
520; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
521; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v0, v0, v4 clamp
522; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v1, v1, v5 clamp
523; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v2, v2, v6 clamp
524; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v3, v3, v7 clamp
525; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
526  %result = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
527  ret <4 x i32> %result
528}
529
; Unsigned saturating subtract on <8 x i32> through @llvm.usub.sat.v8i32.
; Per the checks below, every target handles the eight elements
; independently, with lhs in v0-v7 and rhs in v8-v15: GFX6 with v_max_u32
; followed by a subtract, GFX8/GFX9 with a clamped v_sub_u32_e64, and
; GFX10PLUS with a clamped v_sub_nc_u32_e64.
530define <8 x i32> @v_usubsat_v8i32(<8 x i32> %lhs, <8 x i32> %rhs) {
531; GFX6-LABEL: v_usubsat_v8i32:
532; GFX6:       ; %bb.0:
533; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
534; GFX6-NEXT:    v_max_u32_e32 v0, v0, v8
535; GFX6-NEXT:    v_max_u32_e32 v1, v1, v9
536; GFX6-NEXT:    v_max_u32_e32 v2, v2, v10
537; GFX6-NEXT:    v_max_u32_e32 v3, v3, v11
538; GFX6-NEXT:    v_max_u32_e32 v4, v4, v12
539; GFX6-NEXT:    v_max_u32_e32 v5, v5, v13
540; GFX6-NEXT:    v_max_u32_e32 v6, v6, v14
541; GFX6-NEXT:    v_max_u32_e32 v7, v7, v15
542; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v8
543; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v9
544; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v10
545; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v11
546; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v4, v12
547; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v13
548; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v6, v14
549; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v7, v15
550; GFX6-NEXT:    s_setpc_b64 s[30:31]
551;
552; GFX8-LABEL: v_usubsat_v8i32:
553; GFX8:       ; %bb.0:
554; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
555; GFX8-NEXT:    v_sub_u32_e64 v0, s[4:5], v0, v8 clamp
556; GFX8-NEXT:    v_sub_u32_e64 v1, s[4:5], v1, v9 clamp
557; GFX8-NEXT:    v_sub_u32_e64 v2, s[4:5], v2, v10 clamp
558; GFX8-NEXT:    v_sub_u32_e64 v3, s[4:5], v3, v11 clamp
559; GFX8-NEXT:    v_sub_u32_e64 v4, s[4:5], v4, v12 clamp
560; GFX8-NEXT:    v_sub_u32_e64 v5, s[4:5], v5, v13 clamp
561; GFX8-NEXT:    v_sub_u32_e64 v6, s[4:5], v6, v14 clamp
562; GFX8-NEXT:    v_sub_u32_e64 v7, s[4:5], v7, v15 clamp
563; GFX8-NEXT:    s_setpc_b64 s[30:31]
564;
565; GFX9-LABEL: v_usubsat_v8i32:
566; GFX9:       ; %bb.0:
567; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
568; GFX9-NEXT:    v_sub_u32_e64 v0, v0, v8 clamp
569; GFX9-NEXT:    v_sub_u32_e64 v1, v1, v9 clamp
570; GFX9-NEXT:    v_sub_u32_e64 v2, v2, v10 clamp
571; GFX9-NEXT:    v_sub_u32_e64 v3, v3, v11 clamp
572; GFX9-NEXT:    v_sub_u32_e64 v4, v4, v12 clamp
573; GFX9-NEXT:    v_sub_u32_e64 v5, v5, v13 clamp
574; GFX9-NEXT:    v_sub_u32_e64 v6, v6, v14 clamp
575; GFX9-NEXT:    v_sub_u32_e64 v7, v7, v15 clamp
576; GFX9-NEXT:    s_setpc_b64 s[30:31]
577;
578; GFX10PLUS-LABEL: v_usubsat_v8i32:
579; GFX10PLUS:       ; %bb.0:
580; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
581; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v0, v0, v8 clamp
582; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v1, v1, v9 clamp
583; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v2, v2, v10 clamp
584; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v3, v3, v11 clamp
585; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v4, v4, v12 clamp
586; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v5, v5, v13 clamp
587; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v6, v6, v14 clamp
588; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v7, v7, v15 clamp
589; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
590  %result = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> %lhs, <8 x i32> %rhs)
591  ret <8 x i32> %result
592}
593
; Unsigned saturating subtract on <16 x i32> through @llvm.usub.sat.v16i32.
; Per the checks below, elements 0-14 subtract v16-v30 from v0-v14 directly,
; while the subtrahend for v15 is first loaded from memory addressed by s32
; (buffer_load_dword on GFX6-GFX10, scratch_load_b32 on GFX11) and consumed
; after an s_waitcnt vmcnt(0).
; NOTE(review): the loaded dword is presumably the last rhs element passed on
; the stack because the 32 argument dwords exceed the VGPRs used for
; arguments -- confirm against the AMDGPU calling convention.
; GFX10 and GFX11 have separate check prefixes here because of the differing
; load instruction.
594define <16 x i32> @v_usubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
595; GFX6-LABEL: v_usubsat_v16i32:
596; GFX6:       ; %bb.0:
597; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
598; GFX6-NEXT:    v_max_u32_e32 v0, v0, v16
599; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v16
600; GFX6-NEXT:    buffer_load_dword v16, off, s[0:3], s32
601; GFX6-NEXT:    v_max_u32_e32 v1, v1, v17
602; GFX6-NEXT:    v_max_u32_e32 v2, v2, v18
603; GFX6-NEXT:    v_max_u32_e32 v3, v3, v19
604; GFX6-NEXT:    v_max_u32_e32 v4, v4, v20
605; GFX6-NEXT:    v_max_u32_e32 v5, v5, v21
606; GFX6-NEXT:    v_max_u32_e32 v6, v6, v22
607; GFX6-NEXT:    v_max_u32_e32 v7, v7, v23
608; GFX6-NEXT:    v_max_u32_e32 v8, v8, v24
609; GFX6-NEXT:    v_max_u32_e32 v9, v9, v25
610; GFX6-NEXT:    v_max_u32_e32 v10, v10, v26
611; GFX6-NEXT:    v_max_u32_e32 v11, v11, v27
612; GFX6-NEXT:    v_max_u32_e32 v12, v12, v28
613; GFX6-NEXT:    v_max_u32_e32 v13, v13, v29
614; GFX6-NEXT:    v_max_u32_e32 v14, v14, v30
615; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v17
616; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v18
617; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v19
618; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v4, v20
619; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v21
620; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v6, v22
621; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v7, v23
622; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, v8, v24
623; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v9, v25
624; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v10, v26
625; GFX6-NEXT:    v_sub_i32_e32 v11, vcc, v11, v27
626; GFX6-NEXT:    v_sub_i32_e32 v12, vcc, v12, v28
627; GFX6-NEXT:    v_sub_i32_e32 v13, vcc, v13, v29
628; GFX6-NEXT:    v_sub_i32_e32 v14, vcc, v14, v30
629; GFX6-NEXT:    s_waitcnt vmcnt(0)
630; GFX6-NEXT:    v_max_u32_e32 v15, v15, v16
631; GFX6-NEXT:    v_sub_i32_e32 v15, vcc, v15, v16
632; GFX6-NEXT:    s_setpc_b64 s[30:31]
633;
634; GFX8-LABEL: v_usubsat_v16i32:
635; GFX8:       ; %bb.0:
636; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
637; GFX8-NEXT:    v_sub_u32_e64 v0, s[4:5], v0, v16 clamp
638; GFX8-NEXT:    buffer_load_dword v16, off, s[0:3], s32
639; GFX8-NEXT:    v_sub_u32_e64 v1, s[4:5], v1, v17 clamp
640; GFX8-NEXT:    v_sub_u32_e64 v2, s[4:5], v2, v18 clamp
641; GFX8-NEXT:    v_sub_u32_e64 v3, s[4:5], v3, v19 clamp
642; GFX8-NEXT:    v_sub_u32_e64 v4, s[4:5], v4, v20 clamp
643; GFX8-NEXT:    v_sub_u32_e64 v5, s[4:5], v5, v21 clamp
644; GFX8-NEXT:    v_sub_u32_e64 v6, s[4:5], v6, v22 clamp
645; GFX8-NEXT:    v_sub_u32_e64 v7, s[4:5], v7, v23 clamp
646; GFX8-NEXT:    v_sub_u32_e64 v8, s[4:5], v8, v24 clamp
647; GFX8-NEXT:    v_sub_u32_e64 v9, s[4:5], v9, v25 clamp
648; GFX8-NEXT:    v_sub_u32_e64 v10, s[4:5], v10, v26 clamp
649; GFX8-NEXT:    v_sub_u32_e64 v11, s[4:5], v11, v27 clamp
650; GFX8-NEXT:    v_sub_u32_e64 v12, s[4:5], v12, v28 clamp
651; GFX8-NEXT:    v_sub_u32_e64 v13, s[4:5], v13, v29 clamp
652; GFX8-NEXT:    v_sub_u32_e64 v14, s[4:5], v14, v30 clamp
653; GFX8-NEXT:    s_waitcnt vmcnt(0)
654; GFX8-NEXT:    v_sub_u32_e64 v15, s[4:5], v15, v16 clamp
655; GFX8-NEXT:    s_setpc_b64 s[30:31]
656;
657; GFX9-LABEL: v_usubsat_v16i32:
658; GFX9:       ; %bb.0:
659; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
660; GFX9-NEXT:    v_sub_u32_e64 v0, v0, v16 clamp
661; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32
662; GFX9-NEXT:    v_sub_u32_e64 v1, v1, v17 clamp
663; GFX9-NEXT:    v_sub_u32_e64 v2, v2, v18 clamp
664; GFX9-NEXT:    v_sub_u32_e64 v3, v3, v19 clamp
665; GFX9-NEXT:    v_sub_u32_e64 v4, v4, v20 clamp
666; GFX9-NEXT:    v_sub_u32_e64 v5, v5, v21 clamp
667; GFX9-NEXT:    v_sub_u32_e64 v6, v6, v22 clamp
668; GFX9-NEXT:    v_sub_u32_e64 v7, v7, v23 clamp
669; GFX9-NEXT:    v_sub_u32_e64 v8, v8, v24 clamp
670; GFX9-NEXT:    v_sub_u32_e64 v9, v9, v25 clamp
671; GFX9-NEXT:    v_sub_u32_e64 v10, v10, v26 clamp
672; GFX9-NEXT:    v_sub_u32_e64 v11, v11, v27 clamp
673; GFX9-NEXT:    v_sub_u32_e64 v12, v12, v28 clamp
674; GFX9-NEXT:    v_sub_u32_e64 v13, v13, v29 clamp
675; GFX9-NEXT:    v_sub_u32_e64 v14, v14, v30 clamp
676; GFX9-NEXT:    s_waitcnt vmcnt(0)
677; GFX9-NEXT:    v_sub_u32_e64 v15, v15, v16 clamp
678; GFX9-NEXT:    s_setpc_b64 s[30:31]
679;
680; GFX10-LABEL: v_usubsat_v16i32:
681; GFX10:       ; %bb.0:
682; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
683; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
684; GFX10-NEXT:    v_sub_nc_u32_e64 v0, v0, v16 clamp
685; GFX10-NEXT:    v_sub_nc_u32_e64 v1, v1, v17 clamp
686; GFX10-NEXT:    v_sub_nc_u32_e64 v2, v2, v18 clamp
687; GFX10-NEXT:    v_sub_nc_u32_e64 v3, v3, v19 clamp
688; GFX10-NEXT:    v_sub_nc_u32_e64 v4, v4, v20 clamp
689; GFX10-NEXT:    v_sub_nc_u32_e64 v5, v5, v21 clamp
690; GFX10-NEXT:    v_sub_nc_u32_e64 v6, v6, v22 clamp
691; GFX10-NEXT:    v_sub_nc_u32_e64 v7, v7, v23 clamp
692; GFX10-NEXT:    v_sub_nc_u32_e64 v8, v8, v24 clamp
693; GFX10-NEXT:    v_sub_nc_u32_e64 v9, v9, v25 clamp
694; GFX10-NEXT:    v_sub_nc_u32_e64 v10, v10, v26 clamp
695; GFX10-NEXT:    v_sub_nc_u32_e64 v11, v11, v27 clamp
696; GFX10-NEXT:    v_sub_nc_u32_e64 v12, v12, v28 clamp
697; GFX10-NEXT:    v_sub_nc_u32_e64 v13, v13, v29 clamp
698; GFX10-NEXT:    v_sub_nc_u32_e64 v14, v14, v30 clamp
699; GFX10-NEXT:    s_waitcnt vmcnt(0)
700; GFX10-NEXT:    v_sub_nc_u32_e64 v15, v15, v31 clamp
701; GFX10-NEXT:    s_setpc_b64 s[30:31]
702;
703; GFX11-LABEL: v_usubsat_v16i32:
704; GFX11:       ; %bb.0:
705; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
706; GFX11-NEXT:    scratch_load_b32 v31, off, s32
707; GFX11-NEXT:    v_sub_nc_u32_e64 v0, v0, v16 clamp
708; GFX11-NEXT:    v_sub_nc_u32_e64 v1, v1, v17 clamp
709; GFX11-NEXT:    v_sub_nc_u32_e64 v2, v2, v18 clamp
710; GFX11-NEXT:    v_sub_nc_u32_e64 v3, v3, v19 clamp
711; GFX11-NEXT:    v_sub_nc_u32_e64 v4, v4, v20 clamp
712; GFX11-NEXT:    v_sub_nc_u32_e64 v5, v5, v21 clamp
713; GFX11-NEXT:    v_sub_nc_u32_e64 v6, v6, v22 clamp
714; GFX11-NEXT:    v_sub_nc_u32_e64 v7, v7, v23 clamp
715; GFX11-NEXT:    v_sub_nc_u32_e64 v8, v8, v24 clamp
716; GFX11-NEXT:    v_sub_nc_u32_e64 v9, v9, v25 clamp
717; GFX11-NEXT:    v_sub_nc_u32_e64 v10, v10, v26 clamp
718; GFX11-NEXT:    v_sub_nc_u32_e64 v11, v11, v27 clamp
719; GFX11-NEXT:    v_sub_nc_u32_e64 v12, v12, v28 clamp
720; GFX11-NEXT:    v_sub_nc_u32_e64 v13, v13, v29 clamp
721; GFX11-NEXT:    v_sub_nc_u32_e64 v14, v14, v30 clamp
722; GFX11-NEXT:    s_waitcnt vmcnt(0)
723; GFX11-NEXT:    v_sub_nc_u32_e64 v15, v15, v31 clamp
724; GFX11-NEXT:    s_setpc_b64 s[30:31]
725  %result = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs)
726  ret <16 x i32> %result
727}
728
729
; Unsigned saturating subtract on a scalar i64 through @llvm.usub.sat.i64.
; Per the checks below, no target emits a clamped subtract for this width:
; each one emits a two-instruction 64-bit subtract (low half, then high half
; consuming vcc / vcc_lo), compares the difference in v[2:3] against lhs in
; v[0:1] with v_cmp_gt_u64, and uses two v_cndmask_b32 to pick between the
; difference and 0 for each half.
730define i64 @v_usubsat_i64(i64 %lhs, i64 %rhs) {
731; GFX6-LABEL: v_usubsat_i64:
732; GFX6:       ; %bb.0:
733; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
734; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v0, v2
735; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v1, v3, vcc
736; GFX6-NEXT:    v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
737; GFX6-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
738; GFX6-NEXT:    v_cndmask_b32_e64 v1, v3, 0, vcc
739; GFX6-NEXT:    s_setpc_b64 s[30:31]
740;
741; GFX8-LABEL: v_usubsat_i64:
742; GFX8:       ; %bb.0:
743; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
744; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v0, v2
745; GFX8-NEXT:    v_subb_u32_e32 v3, vcc, v1, v3, vcc
746; GFX8-NEXT:    v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
747; GFX8-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
748; GFX8-NEXT:    v_cndmask_b32_e64 v1, v3, 0, vcc
749; GFX8-NEXT:    s_setpc_b64 s[30:31]
750;
751; GFX9-LABEL: v_usubsat_i64:
752; GFX9:       ; %bb.0:
753; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
754; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, v0, v2
755; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
756; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
757; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
758; GFX9-NEXT:    v_cndmask_b32_e64 v1, v3, 0, vcc
759; GFX9-NEXT:    s_setpc_b64 s[30:31]
760;
761; GFX10PLUS-LABEL: v_usubsat_i64:
762; GFX10PLUS:       ; %bb.0:
763; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
764; GFX10PLUS-NEXT:    v_sub_co_u32 v2, vcc_lo, v0, v2
765; GFX10PLUS-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo
766; GFX10PLUS-NEXT:    v_cmp_gt_u64_e32 vcc_lo, v[2:3], v[0:1]
767; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc_lo
768; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v1, v3, 0, vcc_lo
769; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
770  %result = call i64 @llvm.usub.sat.i64(i64 %lhs, i64 %rhs)
771  ret i64 %result
772}
773
; Declarations of the llvm.usub.sat overloads called by the test functions in
; this file, one per tested scalar or vector type. Every declaration carries
; attribute group #0, which is defined after the list.
774declare i8 @llvm.usub.sat.i8(i8, i8) #0
775declare i16 @llvm.usub.sat.i16(i16, i16) #0
776declare <2 x i16> @llvm.usub.sat.v2i16(<2 x i16>, <2 x i16>) #0
777declare <3 x i16> @llvm.usub.sat.v3i16(<3 x i16>, <3 x i16>) #0
778declare <4 x i16> @llvm.usub.sat.v4i16(<4 x i16>, <4 x i16>) #0
779declare i32 @llvm.usub.sat.i32(i32, i32) #0
780declare <2 x i32> @llvm.usub.sat.v2i32(<2 x i32>, <2 x i32>) #0
781declare <3 x i32> @llvm.usub.sat.v3i32(<3 x i32>, <3 x i32>) #0
782declare <4 x i32> @llvm.usub.sat.v4i32(<4 x i32>, <4 x i32>) #0
783declare <8 x i32> @llvm.usub.sat.v8i32(<8 x i32>, <8 x i32>) #0
784declare <16 x i32> @llvm.usub.sat.v16i32(<16 x i32>, <16 x i32>) #0
785declare i64 @llvm.usub.sat.i64(i64, i64) #0
786
787attributes #0 = { nounwind readnone speculatable willreturn }
788