xref: /llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll (revision bfd9bc274586b0261e16e22ac50d50586a0152e2)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -o - %s | FileCheck -check-prefix=GFX6 %s
3; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
4; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
5; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
6; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
7
; Unsigned saturating subtraction of two i7 values passed as ordinary
; function arguments. The expected code shifts both operands left (by 25 on
; the GFX6 run, by 9 on the other runs), performs the saturating subtract
; (min followed by sub on the GFX6 run, a 16-bit sub with the clamp modifier
; elsewhere), and shifts the result back right by the same amount.
8define i7 @v_usubsat_i7(i7 %lhs, i7 %rhs) {
9; GFX6-LABEL: v_usubsat_i7:
10; GFX6:       ; %bb.0:
11; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 25, v0
13; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 25, v1
14; GFX6-NEXT:    v_min_u32_e32 v1, v0, v1
15; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
16; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 25, v0
17; GFX6-NEXT:    s_setpc_b64 s[30:31]
18;
19; GFX8-LABEL: v_usubsat_i7:
20; GFX8:       ; %bb.0:
21; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 9, v0
23; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 9, v1
24; GFX8-NEXT:    v_sub_u16_e64 v0, v0, v1 clamp
25; GFX8-NEXT:    v_lshrrev_b16_e32 v0, 9, v0
26; GFX8-NEXT:    s_setpc_b64 s[30:31]
27;
28; GFX9-LABEL: v_usubsat_i7:
29; GFX9:       ; %bb.0:
30; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 9, v0
32; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 9, v1
33; GFX9-NEXT:    v_sub_u16_e64 v0, v0, v1 clamp
34; GFX9-NEXT:    v_lshrrev_b16_e32 v0, 9, v0
35; GFX9-NEXT:    s_setpc_b64 s[30:31]
36;
37; GFX10PLUS-LABEL: v_usubsat_i7:
38; GFX10PLUS:       ; %bb.0:
39; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40; GFX10PLUS-NEXT:    v_lshlrev_b16 v0, 9, v0
41; GFX10PLUS-NEXT:    v_lshlrev_b16 v1, 9, v1
42; GFX10PLUS-NEXT:    v_sub_nc_u16 v0, v0, v1 clamp
43; GFX10PLUS-NEXT:    v_lshrrev_b16 v0, 9, v0
44; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  ; The single operation under test: the i7 usub.sat intrinsic.
45  %result = call i7 @llvm.usub.sat.i7(i7 %lhs, i7 %rhs)
46  ret i7 %result
47}
48
; Same i7 saturating subtraction as v_usubsat_i7, but declared amdgpu_ps with
; both operands marked inreg. The expected code works on s-registers; on the
; GFX6 run it stays entirely in scalar instructions (min, sub, shift), while
; the other runs use a vector clamp subtract and copy the result back to s0
; with v_readfirstlane_b32.
49define amdgpu_ps i7 @s_usubsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
50; GFX6-LABEL: s_usubsat_i7:
51; GFX6:       ; %bb.0:
52; GFX6-NEXT:    s_lshl_b32 s0, s0, 25
53; GFX6-NEXT:    s_lshl_b32 s1, s1, 25
54; GFX6-NEXT:    s_min_u32 s1, s0, s1
55; GFX6-NEXT:    s_sub_i32 s0, s0, s1
56; GFX6-NEXT:    s_lshr_b32 s0, s0, 25
57; GFX6-NEXT:    ; return to shader part epilog
58;
59; GFX8-LABEL: s_usubsat_i7:
60; GFX8:       ; %bb.0:
61; GFX8-NEXT:    s_lshl_b32 s1, s1, 9
62; GFX8-NEXT:    s_lshl_b32 s0, s0, 9
63; GFX8-NEXT:    v_mov_b32_e32 v0, s1
64; GFX8-NEXT:    v_sub_u16_e64 v0, s0, v0 clamp
65; GFX8-NEXT:    v_lshrrev_b16_e32 v0, 9, v0
66; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
67; GFX8-NEXT:    ; return to shader part epilog
68;
69; GFX9-LABEL: s_usubsat_i7:
70; GFX9:       ; %bb.0:
71; GFX9-NEXT:    s_lshl_b32 s1, s1, 9
72; GFX9-NEXT:    s_lshl_b32 s0, s0, 9
73; GFX9-NEXT:    v_mov_b32_e32 v0, s1
74; GFX9-NEXT:    v_sub_u16_e64 v0, s0, v0 clamp
75; GFX9-NEXT:    v_lshrrev_b16_e32 v0, 9, v0
76; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
77; GFX9-NEXT:    ; return to shader part epilog
78;
79; GFX10PLUS-LABEL: s_usubsat_i7:
80; GFX10PLUS:       ; %bb.0:
81; GFX10PLUS-NEXT:    s_lshl_b32 s0, s0, 9
82; GFX10PLUS-NEXT:    s_lshl_b32 s1, s1, 9
83; GFX10PLUS-NEXT:    v_sub_nc_u16 v0, s0, s1 clamp
84; GFX10PLUS-NEXT:    v_lshrrev_b16 v0, 9, v0
85; GFX10PLUS-NEXT:    v_readfirstlane_b32 s0, v0
86; GFX10PLUS-NEXT:    ; return to shader part epilog
  ; The single operation under test: the i7 usub.sat intrinsic on inreg inputs.
87  %result = call i7 @llvm.usub.sat.i7(i7 %lhs, i7 %rhs)
88  ret i7 %result
89}
90
; Unsigned saturating subtraction of two i8 values passed as ordinary
; function arguments. The expected code has the same shape as the i7 case,
; with shift amounts of 24 on the GFX6 run (32-bit min + sub) and 8 on the
; other runs (16-bit sub with the clamp modifier).
91define i8 @v_usubsat_i8(i8 %lhs, i8 %rhs) {
92; GFX6-LABEL: v_usubsat_i8:
93; GFX6:       ; %bb.0:
94; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
95; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
96; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
97; GFX6-NEXT:    v_min_u32_e32 v1, v0, v1
98; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
99; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
100; GFX6-NEXT:    s_setpc_b64 s[30:31]
101;
102; GFX8-LABEL: v_usubsat_i8:
103; GFX8:       ; %bb.0:
104; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
105; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
106; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
107; GFX8-NEXT:    v_sub_u16_e64 v0, v0, v1 clamp
108; GFX8-NEXT:    v_lshrrev_b16_e32 v0, 8, v0
109; GFX8-NEXT:    s_setpc_b64 s[30:31]
110;
111; GFX9-LABEL: v_usubsat_i8:
112; GFX9:       ; %bb.0:
113; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
114; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
115; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
116; GFX9-NEXT:    v_sub_u16_e64 v0, v0, v1 clamp
117; GFX9-NEXT:    v_lshrrev_b16_e32 v0, 8, v0
118; GFX9-NEXT:    s_setpc_b64 s[30:31]
119;
120; GFX10PLUS-LABEL: v_usubsat_i8:
121; GFX10PLUS:       ; %bb.0:
122; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
123; GFX10PLUS-NEXT:    v_lshlrev_b16 v0, 8, v0
124; GFX10PLUS-NEXT:    v_lshlrev_b16 v1, 8, v1
125; GFX10PLUS-NEXT:    v_sub_nc_u16 v0, v0, v1 clamp
126; GFX10PLUS-NEXT:    v_lshrrev_b16 v0, 8, v0
127; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  ; The single operation under test: the i8 usub.sat intrinsic.
128  %result = call i8 @llvm.usub.sat.i8(i8 %lhs, i8 %rhs)
129  ret i8 %result
130}
131
; Same i8 saturating subtraction as v_usubsat_i8, but declared amdgpu_ps with
; both operands marked inreg. The GFX6 run expects a purely scalar sequence
; (shift by 24, min, sub, shift back); the other runs shift by 8 in scalar
; registers, use a vector clamp subtract, and move the result to s0 with
; v_readfirstlane_b32.
132define amdgpu_ps i8 @s_usubsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
133; GFX6-LABEL: s_usubsat_i8:
134; GFX6:       ; %bb.0:
135; GFX6-NEXT:    s_lshl_b32 s0, s0, 24
136; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
137; GFX6-NEXT:    s_min_u32 s1, s0, s1
138; GFX6-NEXT:    s_sub_i32 s0, s0, s1
139; GFX6-NEXT:    s_lshr_b32 s0, s0, 24
140; GFX6-NEXT:    ; return to shader part epilog
141;
142; GFX8-LABEL: s_usubsat_i8:
143; GFX8:       ; %bb.0:
144; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
145; GFX8-NEXT:    s_lshl_b32 s0, s0, 8
146; GFX8-NEXT:    v_mov_b32_e32 v0, s1
147; GFX8-NEXT:    v_sub_u16_e64 v0, s0, v0 clamp
148; GFX8-NEXT:    v_lshrrev_b16_e32 v0, 8, v0
149; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
150; GFX8-NEXT:    ; return to shader part epilog
151;
152; GFX9-LABEL: s_usubsat_i8:
153; GFX9:       ; %bb.0:
154; GFX9-NEXT:    s_lshl_b32 s1, s1, 8
155; GFX9-NEXT:    s_lshl_b32 s0, s0, 8
156; GFX9-NEXT:    v_mov_b32_e32 v0, s1
157; GFX9-NEXT:    v_sub_u16_e64 v0, s0, v0 clamp
158; GFX9-NEXT:    v_lshrrev_b16_e32 v0, 8, v0
159; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
160; GFX9-NEXT:    ; return to shader part epilog
161;
162; GFX10PLUS-LABEL: s_usubsat_i8:
163; GFX10PLUS:       ; %bb.0:
164; GFX10PLUS-NEXT:    s_lshl_b32 s0, s0, 8
165; GFX10PLUS-NEXT:    s_lshl_b32 s1, s1, 8
166; GFX10PLUS-NEXT:    v_sub_nc_u16 v0, s0, s1 clamp
167; GFX10PLUS-NEXT:    v_lshrrev_b16 v0, 8, v0
168; GFX10PLUS-NEXT:    v_readfirstlane_b32 s0, v0
169; GFX10PLUS-NEXT:    ; return to shader part epilog
  ; The single operation under test: the i8 usub.sat intrinsic on inreg inputs.
170  %result = call i8 @llvm.usub.sat.i8(i8 %lhs, i8 %rhs)
171  ret i8 %result
172}
173
; Unsigned saturating subtraction on <2 x i8>. The vectors travel through the
; function boundary as i16 and are bitcast to/from <2 x i8> around the
; intrinsic call. Expected code per run, as recorded in the assertions below,
; is one min + sub pair per byte on GFX6, two 16-bit clamp subtracts on GFX8,
; and a single packed v_pk_sub_u16 with clamp on GFX9, GFX10 and GFX11
; (which differ only in how the two result bytes are re-packed).
174define i16 @v_usubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
175; GFX6-LABEL: v_usubsat_v2i8:
176; GFX6:       ; %bb.0:
177; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
178; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
179; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
180; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
181; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
182; GFX6-NEXT:    v_min_u32_e32 v1, v0, v1
183; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
184; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
185; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
186; GFX6-NEXT:    v_min_u32_e32 v2, v1, v2
187; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
188; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
189; GFX6-NEXT:    v_alignbit_b32 v0, v1, v0, 24
190; GFX6-NEXT:    s_setpc_b64 s[30:31]
191;
192; GFX8-LABEL: v_usubsat_v2i8:
193; GFX8:       ; %bb.0:
194; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
195; GFX8-NEXT:    v_mov_b32_e32 v2, 8
196; GFX8-NEXT:    v_lshrrev_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
197; GFX8-NEXT:    v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
198; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
199; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
200; GFX8-NEXT:    v_sub_u16_e64 v0, v0, v1 clamp
201; GFX8-NEXT:    v_sub_u16_e64 v1, v3, v2 clamp
202; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 8, v1
203; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
204; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
205; GFX8-NEXT:    s_setpc_b64 s[30:31]
206;
207; GFX9-LABEL: v_usubsat_v2i8:
208; GFX9:       ; %bb.0:
209; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
210; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
211; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
212; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
213; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
214; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
215; GFX9-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
216; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
217; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
218; GFX9-NEXT:    v_pk_sub_u16 v0, v0, v1 clamp
219; GFX9-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
220; GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
221; GFX9-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
222; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
223; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
224; GFX9-NEXT:    s_setpc_b64 s[30:31]
225;
226; GFX10-LABEL: v_usubsat_v2i8:
227; GFX10:       ; %bb.0:
228; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
229; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
230; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
231; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
232; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
233; GFX10-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
234; GFX10-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
235; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
236; GFX10-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
237; GFX10-NEXT:    v_pk_sub_u16 v0, v0, v1 clamp
238; GFX10-NEXT:    v_mov_b32_e32 v1, 0xff
239; GFX10-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
240; GFX10-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
241; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
242; GFX10-NEXT:    s_setpc_b64 s[30:31]
243;
244; GFX11-LABEL: v_usubsat_v2i8:
245; GFX11:       ; %bb.0:
246; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
247; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
248; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
249; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
250; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
251; GFX11-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
252; GFX11-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
253; GFX11-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
254; GFX11-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
255; GFX11-NEXT:    v_pk_sub_u16 v0, v0, v1 clamp
256; GFX11-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
257; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
258; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
259; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
260; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
261; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
262; GFX11-NEXT:    s_setpc_b64 s[30:31]
  ; Reinterpret the i16 arguments as <2 x i8>, saturating-subtract per element,
  ; then reinterpret the vector result as i16 for the return.
263  %lhs = bitcast i16 %lhs.arg to <2 x i8>
264  %rhs = bitcast i16 %rhs.arg to <2 x i8>
265  %result = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> %lhs, <2 x i8> %rhs)
266  %cast.result = bitcast <2 x i8> %result to i16
267  ret i16 %cast.result
268}
269
; <2 x i8> saturating subtraction with amdgpu_ps calling convention and inreg
; operands; the vectors are carried as i16 and bitcast around the intrinsic.
; In the expected code the byte extraction and widening shifts are done with
; scalar instructions. The subtract itself is scalar (min + sub) on the GFX6
; run and a vector clamp subtract on the other runs. Every run ends by moving
; the packed result to s0 with v_readfirstlane_b32.
270define amdgpu_ps i16 @s_usubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
271; GFX6-LABEL: s_usubsat_v2i8:
272; GFX6:       ; %bb.0:
273; GFX6-NEXT:    s_lshr_b32 s2, s0, 8
274; GFX6-NEXT:    s_lshr_b32 s3, s1, 8
275; GFX6-NEXT:    s_lshl_b32 s0, s0, 24
276; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
277; GFX6-NEXT:    s_min_u32 s1, s0, s1
278; GFX6-NEXT:    s_sub_i32 s0, s0, s1
279; GFX6-NEXT:    s_lshl_b32 s1, s2, 24
280; GFX6-NEXT:    s_lshl_b32 s2, s3, 24
281; GFX6-NEXT:    s_min_u32 s2, s1, s2
282; GFX6-NEXT:    s_sub_i32 s1, s1, s2
283; GFX6-NEXT:    s_lshr_b32 s1, s1, 24
284; GFX6-NEXT:    v_mov_b32_e32 v0, s0
285; GFX6-NEXT:    v_alignbit_b32 v0, s1, v0, 24
286; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
287; GFX6-NEXT:    ; return to shader part epilog
288;
289; GFX8-LABEL: s_usubsat_v2i8:
290; GFX8:       ; %bb.0:
291; GFX8-NEXT:    s_lshr_b32 s3, s1, 8
292; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
293; GFX8-NEXT:    s_lshr_b32 s2, s0, 8
294; GFX8-NEXT:    s_lshl_b32 s0, s0, 8
295; GFX8-NEXT:    v_mov_b32_e32 v0, s1
296; GFX8-NEXT:    s_lshl_b32 s1, s3, 8
297; GFX8-NEXT:    v_sub_u16_e64 v0, s0, v0 clamp
298; GFX8-NEXT:    s_lshl_b32 s0, s2, 8
299; GFX8-NEXT:    v_mov_b32_e32 v1, s1
300; GFX8-NEXT:    v_sub_u16_e64 v1, s0, v1 clamp
301; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 8, v1
302; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
303; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
304; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
305; GFX8-NEXT:    ; return to shader part epilog
306;
307; GFX9-LABEL: s_usubsat_v2i8:
308; GFX9:       ; %bb.0:
309; GFX9-NEXT:    s_lshr_b32 s2, s0, 8
310; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
311; GFX9-NEXT:    s_lshr_b32 s3, s1, 8
312; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
313; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
314; GFX9-NEXT:    s_lshl_b32 s0, s0, 0x80008
315; GFX9-NEXT:    s_lshl_b32 s2, s2, 8
316; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
317; GFX9-NEXT:    s_lshr_b32 s2, s1, 16
318; GFX9-NEXT:    s_lshl_b32 s1, s1, 0x80008
319; GFX9-NEXT:    s_lshl_b32 s2, s2, 8
320; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s2
321; GFX9-NEXT:    v_mov_b32_e32 v0, s1
322; GFX9-NEXT:    v_pk_sub_u16 v0, s0, v0 clamp
323; GFX9-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
324; GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
325; GFX9-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
326; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
327; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
328; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
329; GFX9-NEXT:    ; return to shader part epilog
330;
331; GFX10-LABEL: s_usubsat_v2i8:
332; GFX10:       ; %bb.0:
333; GFX10-NEXT:    s_lshr_b32 s2, s0, 8
334; GFX10-NEXT:    s_lshr_b32 s3, s1, 8
335; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
336; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
337; GFX10-NEXT:    s_lshr_b32 s2, s0, 16
338; GFX10-NEXT:    s_lshr_b32 s3, s1, 16
339; GFX10-NEXT:    s_lshl_b32 s0, s0, 0x80008
340; GFX10-NEXT:    s_lshl_b32 s2, s2, 8
341; GFX10-NEXT:    s_lshl_b32 s1, s1, 0x80008
342; GFX10-NEXT:    s_lshl_b32 s3, s3, 8
343; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
344; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
345; GFX10-NEXT:    v_mov_b32_e32 v1, 0xff
346; GFX10-NEXT:    v_pk_sub_u16 v0, s0, s1 clamp
347; GFX10-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
348; GFX10-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
349; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
350; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
351; GFX10-NEXT:    ; return to shader part epilog
352;
353; GFX11-LABEL: s_usubsat_v2i8:
354; GFX11:       ; %bb.0:
355; GFX11-NEXT:    s_lshr_b32 s2, s0, 8
356; GFX11-NEXT:    s_lshr_b32 s3, s1, 8
357; GFX11-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
358; GFX11-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
359; GFX11-NEXT:    s_lshr_b32 s2, s0, 16
360; GFX11-NEXT:    s_lshr_b32 s3, s1, 16
361; GFX11-NEXT:    s_lshl_b32 s0, s0, 0x80008
362; GFX11-NEXT:    s_lshl_b32 s2, s2, 8
363; GFX11-NEXT:    s_lshl_b32 s1, s1, 0x80008
364; GFX11-NEXT:    s_lshl_b32 s3, s3, 8
365; GFX11-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
366; GFX11-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
367; GFX11-NEXT:    v_pk_sub_u16 v0, s0, s1 clamp
368; GFX11-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
369; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
370; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
371; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
372; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
373; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
374; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
375; GFX11-NEXT:    ; return to shader part epilog
  ; Reinterpret the inreg i16 arguments as <2 x i8>, saturating-subtract per
  ; element, then reinterpret the vector result as i16 for the return.
376  %lhs = bitcast i16 %lhs.arg to <2 x i8>
377  %rhs = bitcast i16 %rhs.arg to <2 x i8>
378  %result = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> %lhs, <2 x i8> %rhs)
379  %cast.result = bitcast <2 x i8> %result to i16
380  ret i16 %cast.result
381}
382
; Unsigned saturating subtraction on <4 x i8>. The vectors travel through the
; function boundary as i32 and are bitcast to/from <4 x i8> around the
; intrinsic call. Expected code per run, as recorded in the assertions below,
; is four min + sub pairs on GFX6, four 16-bit clamp subtracts on GFX8, and
; two packed v_pk_sub_u16 clamp instructions on GFX9, GFX10 and GFX11,
; followed in every case by re-packing the four result bytes into v0.
383define i32 @v_usubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
384; GFX6-LABEL: v_usubsat_v4i8:
385; GFX6:       ; %bb.0:
386; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
387; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
388; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
389; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
390; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
391; GFX6-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
392; GFX6-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
393; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
394; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
395; GFX6-NEXT:    v_min_u32_e32 v1, v0, v1
396; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
397; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
398; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 24, v5
399; GFX6-NEXT:    v_min_u32_e32 v2, v1, v2
400; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
401; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
402; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 24, v6
403; GFX6-NEXT:    v_min_u32_e32 v3, v2, v3
404; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
405; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 24, v4
406; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 24, v7
407; GFX6-NEXT:    v_min_u32_e32 v4, v3, v4
408; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
409; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
410; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v4
411; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
412; GFX6-NEXT:    v_alignbit_b32 v0, v1, v0, 24
413; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
414; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
415; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
416; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
417; GFX6-NEXT:    s_setpc_b64 s[30:31]
418;
419; GFX8-LABEL: v_usubsat_v4i8:
420; GFX8:       ; %bb.0:
421; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
422; GFX8-NEXT:    v_mov_b32_e32 v2, 8
423; GFX8-NEXT:    v_lshrrev_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
424; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
425; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
426; GFX8-NEXT:    v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
427; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
428; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
429; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
430; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
431; GFX8-NEXT:    v_sub_u16_e64 v0, v0, v1 clamp
432; GFX8-NEXT:    v_sub_u16_e64 v1, v3, v2 clamp
433; GFX8-NEXT:    v_lshlrev_b16_e32 v2, 8, v4
434; GFX8-NEXT:    v_lshlrev_b16_e32 v3, 8, v6
435; GFX8-NEXT:    v_sub_u16_e64 v2, v2, v3 clamp
436; GFX8-NEXT:    v_lshlrev_b16_e32 v3, 8, v5
437; GFX8-NEXT:    v_lshlrev_b16_e32 v4, 8, v7
438; GFX8-NEXT:    v_sub_u16_e64 v3, v3, v4 clamp
439; GFX8-NEXT:    v_mov_b32_e32 v4, 0xff
440; GFX8-NEXT:    v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
441; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
442; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
443; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
444; GFX8-NEXT:    v_and_b32_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
445; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
446; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
447; GFX8-NEXT:    v_and_b32_sdwa v1, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
448; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
449; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
450; GFX8-NEXT:    s_setpc_b64 s[30:31]
451;
452; GFX9-LABEL: v_usubsat_v4i8:
453; GFX9:       ; %bb.0:
454; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
455; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
456; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
457; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
458; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff, v0
459; GFX9-NEXT:    v_alignbit_b32 v0, v3, v0, 16
460; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff, v1
461; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
462; GFX9-NEXT:    v_lshl_or_b32 v2, v2, 16, v6
463; GFX9-NEXT:    v_lshl_or_b32 v3, v4, 16, v3
464; GFX9-NEXT:    v_alignbit_b32 v1, v5, v1, 16
465; GFX9-NEXT:    v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
466; GFX9-NEXT:    v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
467; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
468; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
469; GFX9-NEXT:    v_pk_sub_u16 v2, v2, v3 clamp
470; GFX9-NEXT:    v_pk_sub_u16 v0, v0, v1 clamp
471; GFX9-NEXT:    v_pk_lshrrev_b16 v1, 8, v2 op_sel_hi:[0,1]
472; GFX9-NEXT:    v_mov_b32_e32 v3, 8
473; GFX9-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
474; GFX9-NEXT:    v_mov_b32_e32 v2, 0xff
475; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
476; GFX9-NEXT:    v_and_or_b32 v1, v1, v2, v3
477; GFX9-NEXT:    v_and_b32_e32 v2, 0xff, v0
478; GFX9-NEXT:    v_mov_b32_e32 v3, 24
479; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
480; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
481; GFX9-NEXT:    v_or3_b32 v0, v1, v2, v0
482; GFX9-NEXT:    s_setpc_b64 s[30:31]
483;
484; GFX10-LABEL: v_usubsat_v4i8:
485; GFX10:       ; %bb.0:
486; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
487; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
488; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
489; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff, v0
490; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
491; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff, v1
492; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
493; GFX10-NEXT:    v_alignbit_b32 v0, v3, v0, 16
494; GFX10-NEXT:    v_lshl_or_b32 v2, v2, 16, v4
495; GFX10-NEXT:    v_mov_b32_e32 v4, 24
496; GFX10-NEXT:    v_lshl_or_b32 v3, v5, 16, v6
497; GFX10-NEXT:    v_alignbit_b32 v1, v7, v1, 16
498; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
499; GFX10-NEXT:    v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
500; GFX10-NEXT:    v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
501; GFX10-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
502; GFX10-NEXT:    v_pk_sub_u16 v2, v2, v3 clamp
503; GFX10-NEXT:    v_pk_sub_u16 v0, v0, v1 clamp
504; GFX10-NEXT:    v_mov_b32_e32 v1, 8
505; GFX10-NEXT:    v_pk_lshrrev_b16 v2, 8, v2 op_sel_hi:[0,1]
506; GFX10-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
507; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
508; GFX10-NEXT:    v_and_b32_e32 v3, 0xff, v0
509; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
510; GFX10-NEXT:    v_and_or_b32 v1, 0xff, v2, v1
511; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
512; GFX10-NEXT:    v_or3_b32 v0, v1, v2, v0
513; GFX10-NEXT:    s_setpc_b64 s[30:31]
514;
515; GFX11-LABEL: v_usubsat_v4i8:
516; GFX11:       ; %bb.0:
517; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
518; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
519; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
520; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v0
521; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v1
522; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
523; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
524; GFX11-NEXT:    v_lshl_or_b32 v2, v2, 16, v4
525; GFX11-NEXT:    v_lshl_or_b32 v3, v3, 16, v5
526; GFX11-NEXT:    v_alignbit_b32 v0, v6, v0, 16
527; GFX11-NEXT:    v_alignbit_b32 v1, v7, v1, 16
528; GFX11-NEXT:    v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
529; GFX11-NEXT:    v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
530; GFX11-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
531; GFX11-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
532; GFX11-NEXT:    v_pk_sub_u16 v2, v2, v3 clamp
533; GFX11-NEXT:    v_pk_sub_u16 v0, v0, v1 clamp
534; GFX11-NEXT:    v_pk_lshrrev_b16 v1, 8, v2 op_sel_hi:[0,1]
535; GFX11-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
536; GFX11-NEXT:    v_bfe_u32 v2, v1, 16, 8
537; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v0
538; GFX11-NEXT:    v_bfe_u32 v0, v0, 16, 8
539; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
540; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
541; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
542; GFX11-NEXT:    v_and_or_b32 v1, 0xff, v1, v2
543; GFX11-NEXT:    v_or3_b32 v0, v1, v3, v0
544; GFX11-NEXT:    s_setpc_b64 s[30:31]
  ; Reinterpret the i32 arguments as <4 x i8>, saturating-subtract per element,
  ; then reinterpret the vector result as i32 for the return.
545  %lhs = bitcast i32 %lhs.arg to <4 x i8>
546  %rhs = bitcast i32 %rhs.arg to <4 x i8>
547  %result = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> %lhs, <4 x i8> %rhs)
548  %cast.result = bitcast <4 x i8> %result to i32
549  ret i32 %cast.result
550}
551
552define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
553; GFX6-LABEL: s_usubsat_v4i8:
554; GFX6:       ; %bb.0:
555; GFX6-NEXT:    s_lshr_b32 s2, s0, 8
556; GFX6-NEXT:    s_lshr_b32 s3, s0, 16
557; GFX6-NEXT:    s_lshr_b32 s4, s0, 24
558; GFX6-NEXT:    s_lshr_b32 s5, s1, 8
559; GFX6-NEXT:    s_lshr_b32 s6, s1, 16
560; GFX6-NEXT:    s_lshr_b32 s7, s1, 24
561; GFX6-NEXT:    s_lshl_b32 s0, s0, 24
562; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
563; GFX6-NEXT:    s_min_u32 s1, s0, s1
564; GFX6-NEXT:    s_sub_i32 s0, s0, s1
565; GFX6-NEXT:    s_lshl_b32 s1, s2, 24
566; GFX6-NEXT:    s_lshl_b32 s2, s5, 24
567; GFX6-NEXT:    s_min_u32 s2, s1, s2
568; GFX6-NEXT:    s_sub_i32 s1, s1, s2
569; GFX6-NEXT:    s_lshl_b32 s2, s3, 24
570; GFX6-NEXT:    s_lshl_b32 s3, s6, 24
571; GFX6-NEXT:    s_min_u32 s3, s2, s3
572; GFX6-NEXT:    s_sub_i32 s2, s2, s3
573; GFX6-NEXT:    s_lshl_b32 s3, s4, 24
574; GFX6-NEXT:    s_lshl_b32 s4, s7, 24
575; GFX6-NEXT:    s_min_u32 s4, s3, s4
576; GFX6-NEXT:    s_lshr_b32 s1, s1, 24
577; GFX6-NEXT:    s_lshr_b32 s2, s2, 24
578; GFX6-NEXT:    s_sub_i32 s3, s3, s4
579; GFX6-NEXT:    v_mov_b32_e32 v0, s0
580; GFX6-NEXT:    s_lshr_b32 s3, s3, 24
581; GFX6-NEXT:    v_alignbit_b32 v0, s1, v0, 24
582; GFX6-NEXT:    s_lshl_b32 s0, s2, 16
583; GFX6-NEXT:    v_or_b32_e32 v0, s0, v0
584; GFX6-NEXT:    s_lshl_b32 s0, s3, 24
585; GFX6-NEXT:    v_or_b32_e32 v0, s0, v0
586; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
587; GFX6-NEXT:    ; return to shader part epilog
588;
589; GFX8-LABEL: s_usubsat_v4i8:
590; GFX8:       ; %bb.0:
591; GFX8-NEXT:    s_lshr_b32 s5, s1, 8
592; GFX8-NEXT:    s_lshr_b32 s6, s1, 16
593; GFX8-NEXT:    s_lshr_b32 s7, s1, 24
594; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
595; GFX8-NEXT:    s_lshr_b32 s2, s0, 8
596; GFX8-NEXT:    s_lshr_b32 s3, s0, 16
597; GFX8-NEXT:    s_lshr_b32 s4, s0, 24
598; GFX8-NEXT:    s_lshl_b32 s0, s0, 8
599; GFX8-NEXT:    v_mov_b32_e32 v0, s1
600; GFX8-NEXT:    s_lshl_b32 s1, s5, 8
601; GFX8-NEXT:    v_sub_u16_e64 v0, s0, v0 clamp
602; GFX8-NEXT:    s_lshl_b32 s0, s2, 8
603; GFX8-NEXT:    v_mov_b32_e32 v1, s1
604; GFX8-NEXT:    v_sub_u16_e64 v1, s0, v1 clamp
605; GFX8-NEXT:    s_lshl_b32 s1, s6, 8
606; GFX8-NEXT:    v_mov_b32_e32 v4, 0xff
607; GFX8-NEXT:    s_lshl_b32 s0, s3, 8
608; GFX8-NEXT:    v_mov_b32_e32 v2, s1
609; GFX8-NEXT:    v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
610; GFX8-NEXT:    v_sub_u16_e64 v2, s0, v2 clamp
611; GFX8-NEXT:    s_lshl_b32 s1, s7, 8
612; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
613; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
614; GFX8-NEXT:    s_lshl_b32 s0, s4, 8
615; GFX8-NEXT:    v_mov_b32_e32 v3, s1
616; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
617; GFX8-NEXT:    v_and_b32_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
618; GFX8-NEXT:    v_sub_u16_e64 v3, s0, v3 clamp
619; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
620; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
621; GFX8-NEXT:    v_and_b32_sdwa v1, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
622; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
623; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
624; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
625; GFX8-NEXT:    ; return to shader part epilog
626;
627; GFX9-LABEL: s_usubsat_v4i8:
628; GFX9:       ; %bb.0:
629; GFX9-NEXT:    s_lshr_b32 s2, s0, 8
630; GFX9-NEXT:    s_lshr_b32 s3, s0, 16
631; GFX9-NEXT:    s_lshr_b32 s4, s0, 24
632; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
633; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s3, s4
634; GFX9-NEXT:    s_lshr_b32 s4, s0, 16
635; GFX9-NEXT:    s_lshl_b32 s0, s0, 0x80008
636; GFX9-NEXT:    s_lshl_b32 s4, s4, 8
637; GFX9-NEXT:    s_lshr_b32 s5, s1, 8
638; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s4
639; GFX9-NEXT:    s_lshr_b32 s4, s2, 16
640; GFX9-NEXT:    s_lshr_b32 s6, s1, 16
641; GFX9-NEXT:    s_lshr_b32 s7, s1, 24
642; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s5
643; GFX9-NEXT:    s_lshl_b32 s2, s2, 0x80008
644; GFX9-NEXT:    s_lshl_b32 s4, s4, 8
645; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s4
646; GFX9-NEXT:    s_lshr_b32 s4, s1, 16
647; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s6, s7
648; GFX9-NEXT:    s_lshl_b32 s1, s1, 0x80008
649; GFX9-NEXT:    s_lshl_b32 s4, s4, 8
650; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s4
651; GFX9-NEXT:    s_lshr_b32 s4, s3, 16
652; GFX9-NEXT:    s_lshl_b32 s3, s3, 0x80008
653; GFX9-NEXT:    s_lshl_b32 s4, s4, 8
654; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s3, s4
655; GFX9-NEXT:    v_mov_b32_e32 v0, s1
656; GFX9-NEXT:    v_pk_sub_u16 v0, s0, v0 clamp
657; GFX9-NEXT:    v_mov_b32_e32 v1, s3
658; GFX9-NEXT:    v_pk_sub_u16 v1, s2, v1 clamp
659; GFX9-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
660; GFX9-NEXT:    v_mov_b32_e32 v3, 8
661; GFX9-NEXT:    v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
662; GFX9-NEXT:    v_mov_b32_e32 v2, 0xff
663; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
664; GFX9-NEXT:    v_and_or_b32 v0, v0, v2, v3
665; GFX9-NEXT:    v_and_b32_e32 v2, 0xff, v1
666; GFX9-NEXT:    v_mov_b32_e32 v3, 24
667; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
668; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
669; GFX9-NEXT:    v_or3_b32 v0, v0, v2, v1
670; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
671; GFX9-NEXT:    ; return to shader part epilog
672;
673; GFX10-LABEL: s_usubsat_v4i8:
674; GFX10:       ; %bb.0:
675; GFX10-NEXT:    s_lshr_b32 s2, s0, 8
676; GFX10-NEXT:    s_lshr_b32 s3, s0, 16
677; GFX10-NEXT:    s_lshr_b32 s4, s0, 24
678; GFX10-NEXT:    s_lshr_b32 s5, s1, 8
679; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
680; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s3, s4
681; GFX10-NEXT:    s_lshr_b32 s6, s1, 16
682; GFX10-NEXT:    s_lshr_b32 s7, s1, 24
683; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s5
684; GFX10-NEXT:    s_lshr_b32 s4, s0, 16
685; GFX10-NEXT:    s_lshr_b32 s5, s2, 16
686; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s6, s7
687; GFX10-NEXT:    s_lshl_b32 s0, s0, 0x80008
688; GFX10-NEXT:    s_lshl_b32 s4, s4, 8
689; GFX10-NEXT:    s_lshl_b32 s2, s2, 0x80008
690; GFX10-NEXT:    s_lshl_b32 s5, s5, 8
691; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s4
692; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s2, s5
693; GFX10-NEXT:    s_lshr_b32 s4, s1, 16
694; GFX10-NEXT:    s_lshr_b32 s5, s3, 16
695; GFX10-NEXT:    s_lshl_b32 s1, s1, 0x80008
696; GFX10-NEXT:    s_lshl_b32 s4, s4, 8
697; GFX10-NEXT:    s_lshl_b32 s3, s3, 0x80008
698; GFX10-NEXT:    s_lshl_b32 s5, s5, 8
699; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s4
700; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s3, s5
701; GFX10-NEXT:    v_pk_sub_u16 v0, s0, s1 clamp
702; GFX10-NEXT:    v_pk_sub_u16 v1, s2, s3 clamp
703; GFX10-NEXT:    v_mov_b32_e32 v2, 8
704; GFX10-NEXT:    v_mov_b32_e32 v4, 24
705; GFX10-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
706; GFX10-NEXT:    v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
707; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
708; GFX10-NEXT:    v_and_b32_e32 v3, 0xff, v1
709; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
710; GFX10-NEXT:    v_and_or_b32 v0, 0xff, v0, v2
711; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
712; GFX10-NEXT:    v_or3_b32 v0, v0, v2, v1
713; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
714; GFX10-NEXT:    ; return to shader part epilog
715;
716; GFX11-LABEL: s_usubsat_v4i8:
717; GFX11:       ; %bb.0:
718; GFX11-NEXT:    s_lshr_b32 s2, s0, 8
719; GFX11-NEXT:    s_lshr_b32 s3, s0, 24
720; GFX11-NEXT:    s_lshr_b32 s4, s1, 8
721; GFX11-NEXT:    s_lshr_b32 s5, s1, 24
722; GFX11-NEXT:    s_pack_ll_b32_b16 s2, s0, s2
723; GFX11-NEXT:    s_pack_hl_b32_b16 s0, s0, s3
724; GFX11-NEXT:    s_pack_ll_b32_b16 s3, s1, s4
725; GFX11-NEXT:    s_lshr_b32 s4, s2, 16
726; GFX11-NEXT:    s_pack_hl_b32_b16 s1, s1, s5
727; GFX11-NEXT:    s_lshr_b32 s5, s3, 16
728; GFX11-NEXT:    s_lshl_b32 s2, s2, 0x80008
729; GFX11-NEXT:    s_lshl_b32 s4, s4, 8
730; GFX11-NEXT:    s_lshl_b32 s3, s3, 0x80008
731; GFX11-NEXT:    s_lshl_b32 s5, s5, 8
732; GFX11-NEXT:    s_pack_ll_b32_b16 s2, s2, s4
733; GFX11-NEXT:    s_pack_ll_b32_b16 s3, s3, s5
734; GFX11-NEXT:    s_lshr_b32 s4, s0, 16
735; GFX11-NEXT:    s_lshr_b32 s5, s1, 16
736; GFX11-NEXT:    v_pk_sub_u16 v0, s2, s3 clamp
737; GFX11-NEXT:    s_lshl_b32 s0, s0, 0x80008
738; GFX11-NEXT:    s_lshl_b32 s4, s4, 8
739; GFX11-NEXT:    s_lshl_b32 s1, s1, 0x80008
740; GFX11-NEXT:    s_lshl_b32 s2, s5, 8
741; GFX11-NEXT:    s_pack_ll_b32_b16 s0, s0, s4
742; GFX11-NEXT:    s_pack_ll_b32_b16 s1, s1, s2
743; GFX11-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
744; GFX11-NEXT:    v_pk_sub_u16 v1, s0, s1 clamp
745; GFX11-NEXT:    v_bfe_u32 v2, v0, 16, 8
746; GFX11-NEXT:    v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
747; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
748; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v1
749; GFX11-NEXT:    v_bfe_u32 v1, v1, 16, 8
750; GFX11-NEXT:    v_and_or_b32 v0, 0xff, v0, v2
751; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
752; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
753; GFX11-NEXT:    v_or3_b32 v0, v0, v2, v1
754; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
755; GFX11-NEXT:    ; return to shader part epilog
756  %lhs = bitcast i32 %lhs.arg to <4 x i8>
757  %rhs = bitcast i32 %rhs.arg to <4 x i8>
758  %result = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> %lhs, <4 x i8> %rhs)
759  %cast.result = bitcast <4 x i8> %result to i32
760  ret i32 %cast.result
761}
762
; i24 usub.sat with operands in v0/v1. In the checks for every prefix both
; operands are shifted left by 8, a 32-bit saturating subtract is done
; (v_min_u32 + v_sub_i32 on GFX6, a subtract with the clamp modifier on the
; others), and the result is shifted right by 8.
763define i24 @v_usubsat_i24(i24 %lhs, i24 %rhs) {
764; GFX6-LABEL: v_usubsat_i24:
765; GFX6:       ; %bb.0:
766; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
767; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
768; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
769; GFX6-NEXT:    v_min_u32_e32 v1, v0, v1
770; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
771; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
772; GFX6-NEXT:    s_setpc_b64 s[30:31]
773;
774; GFX8-LABEL: v_usubsat_i24:
775; GFX8:       ; %bb.0:
776; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
777; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
778; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
779; GFX8-NEXT:    v_sub_u32_e64 v0, s[4:5], v0, v1 clamp
780; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
781; GFX8-NEXT:    s_setpc_b64 s[30:31]
782;
783; GFX9-LABEL: v_usubsat_i24:
784; GFX9:       ; %bb.0:
785; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
786; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
787; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
788; GFX9-NEXT:    v_sub_u32_e64 v0, v0, v1 clamp
789; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
790; GFX9-NEXT:    s_setpc_b64 s[30:31]
791;
792; GFX10PLUS-LABEL: v_usubsat_i24:
793; GFX10PLUS:       ; %bb.0:
794; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
795; GFX10PLUS-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
796; GFX10PLUS-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
797; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v0, v0, v1 clamp
798; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
799; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
800  %result = call i24 @llvm.usub.sat.i24(i24 %lhs, i24 %rhs)
801  ret i24 %result
802}
803
; i24 usub.sat with amdgpu_ps inreg operands, which the checks show in s0/s1.
; Both operands are shifted left by 8 and the result right by 8 on every
; prefix. The GFX6 checks use only scalar (s_) instructions; the other prefixes
; do the clamped subtract with a v_sub instruction and copy the result back to
; s0 with v_readfirstlane_b32.
804define amdgpu_ps i24 @s_usubsat_i24(i24 inreg %lhs, i24 inreg %rhs) {
805; GFX6-LABEL: s_usubsat_i24:
806; GFX6:       ; %bb.0:
807; GFX6-NEXT:    s_lshl_b32 s0, s0, 8
808; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
809; GFX6-NEXT:    s_min_u32 s1, s0, s1
810; GFX6-NEXT:    s_sub_i32 s0, s0, s1
811; GFX6-NEXT:    s_lshr_b32 s0, s0, 8
812; GFX6-NEXT:    ; return to shader part epilog
813;
814; GFX8-LABEL: s_usubsat_i24:
815; GFX8:       ; %bb.0:
816; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
817; GFX8-NEXT:    s_lshl_b32 s0, s0, 8
818; GFX8-NEXT:    v_mov_b32_e32 v0, s1
819; GFX8-NEXT:    v_sub_u32_e64 v0, s[0:1], s0, v0 clamp
820; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
821; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
822; GFX8-NEXT:    ; return to shader part epilog
823;
824; GFX9-LABEL: s_usubsat_i24:
825; GFX9:       ; %bb.0:
826; GFX9-NEXT:    s_lshl_b32 s1, s1, 8
827; GFX9-NEXT:    s_lshl_b32 s0, s0, 8
828; GFX9-NEXT:    v_mov_b32_e32 v0, s1
829; GFX9-NEXT:    v_sub_u32_e64 v0, s0, v0 clamp
830; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
831; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
832; GFX9-NEXT:    ; return to shader part epilog
833;
834; GFX10PLUS-LABEL: s_usubsat_i24:
835; GFX10PLUS:       ; %bb.0:
836; GFX10PLUS-NEXT:    s_lshl_b32 s0, s0, 8
837; GFX10PLUS-NEXT:    s_lshl_b32 s1, s1, 8
838; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v0, s0, s1 clamp
839; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
840; GFX10PLUS-NEXT:    v_readfirstlane_b32 s0, v0
841; GFX10PLUS-NEXT:    ; return to shader part epilog
842  %result = call i24 @llvm.usub.sat.i24(i24 %lhs, i24 %rhs)
843  ret i24 %result
844}
845
; i32 usub.sat on v0/v1, with no shifts around the subtract. The GFX6 checks
; expect lhs - min(lhs, rhs) as v_min_u32 + v_sub_i32; the later prefixes
; expect a single subtract carrying the clamp modifier.
846define i32 @v_usubsat_i32(i32 %lhs, i32 %rhs) {
847; GFX6-LABEL: v_usubsat_i32:
848; GFX6:       ; %bb.0:
849; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
850; GFX6-NEXT:    v_min_u32_e32 v1, v0, v1
851; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
852; GFX6-NEXT:    s_setpc_b64 s[30:31]
853;
854; GFX8-LABEL: v_usubsat_i32:
855; GFX8:       ; %bb.0:
856; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
857; GFX8-NEXT:    v_sub_u32_e64 v0, s[4:5], v0, v1 clamp
858; GFX8-NEXT:    s_setpc_b64 s[30:31]
859;
860; GFX9-LABEL: v_usubsat_i32:
861; GFX9:       ; %bb.0:
862; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
863; GFX9-NEXT:    v_sub_u32_e64 v0, v0, v1 clamp
864; GFX9-NEXT:    s_setpc_b64 s[30:31]
865;
866; GFX10PLUS-LABEL: v_usubsat_i32:
867; GFX10PLUS:       ; %bb.0:
868; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
869; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v0, v0, v1 clamp
870; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
871  %result = call i32 @llvm.usub.sat.i32(i32 %lhs, i32 %rhs)
872  ret i32 %result
873}
874
; i32 usub.sat on inreg operands (s0/s1 in the checks). The GFX6 checks use
; s_min_u32 + s_sub_i32. The GFX8 and GFX9 checks first copy the rhs into v0,
; while the GFX10PLUS checks pass both SGPRs straight to v_sub_nc_u32. Every
; clamp-based prefix returns the result to s0 through v_readfirstlane_b32.
875define amdgpu_ps i32 @s_usubsat_i32(i32 inreg %lhs, i32 inreg %rhs) {
876; GFX6-LABEL: s_usubsat_i32:
877; GFX6:       ; %bb.0:
878; GFX6-NEXT:    s_min_u32 s1, s0, s1
879; GFX6-NEXT:    s_sub_i32 s0, s0, s1
880; GFX6-NEXT:    ; return to shader part epilog
881;
882; GFX8-LABEL: s_usubsat_i32:
883; GFX8:       ; %bb.0:
884; GFX8-NEXT:    v_mov_b32_e32 v0, s1
885; GFX8-NEXT:    v_sub_u32_e64 v0, s[0:1], s0, v0 clamp
886; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
887; GFX8-NEXT:    ; return to shader part epilog
888;
889; GFX9-LABEL: s_usubsat_i32:
890; GFX9:       ; %bb.0:
891; GFX9-NEXT:    v_mov_b32_e32 v0, s1
892; GFX9-NEXT:    v_sub_u32_e64 v0, s0, v0 clamp
893; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
894; GFX9-NEXT:    ; return to shader part epilog
895;
896; GFX10PLUS-LABEL: s_usubsat_i32:
897; GFX10PLUS:       ; %bb.0:
898; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v0, s0, s1 clamp
899; GFX10PLUS-NEXT:    v_readfirstlane_b32 s0, v0
900; GFX10PLUS-NEXT:    ; return to shader part epilog
901  %result = call i32 @llvm.usub.sat.i32(i32 %lhs, i32 %rhs)
902  ret i32 %result
903}
904
; Mixed operands: lhs is inreg (s0 in the checks) and rhs is in v0. The i32
; result is bitcast to float and, per the checks, is left in v0 with no
; v_readfirstlane_b32.
905define amdgpu_ps float @usubsat_i32_sv(i32 inreg %lhs, i32 %rhs) {
906; GFX6-LABEL: usubsat_i32_sv:
907; GFX6:       ; %bb.0:
908; GFX6-NEXT:    v_min_u32_e32 v0, s0, v0
909; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
910; GFX6-NEXT:    ; return to shader part epilog
911;
912; GFX8-LABEL: usubsat_i32_sv:
913; GFX8:       ; %bb.0:
914; GFX8-NEXT:    v_sub_u32_e64 v0, s[0:1], s0, v0 clamp
915; GFX8-NEXT:    ; return to shader part epilog
916;
917; GFX9-LABEL: usubsat_i32_sv:
918; GFX9:       ; %bb.0:
919; GFX9-NEXT:    v_sub_u32_e64 v0, s0, v0 clamp
920; GFX9-NEXT:    ; return to shader part epilog
921;
922; GFX10PLUS-LABEL: usubsat_i32_sv:
923; GFX10PLUS:       ; %bb.0:
924; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v0, s0, v0 clamp
925; GFX10PLUS-NEXT:    ; return to shader part epilog
926  %result = call i32 @llvm.usub.sat.i32(i32 %lhs, i32 %rhs)
927  %cast = bitcast i32 %result to float
928  ret float %cast
929}
930
; Mixed operands the other way round: lhs is in v0 and rhs is inreg (s0 in the
; checks). The GFX6 checks write the min to v1, so v0 still holds lhs for the
; subtract. The result is bitcast to float and left in v0.
931define amdgpu_ps float @usubsat_i32_vs(i32 %lhs, i32 inreg %rhs) {
932; GFX6-LABEL: usubsat_i32_vs:
933; GFX6:       ; %bb.0:
934; GFX6-NEXT:    v_min_u32_e32 v1, s0, v0
935; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
936; GFX6-NEXT:    ; return to shader part epilog
937;
938; GFX8-LABEL: usubsat_i32_vs:
939; GFX8:       ; %bb.0:
940; GFX8-NEXT:    v_sub_u32_e64 v0, s[0:1], v0, s0 clamp
941; GFX8-NEXT:    ; return to shader part epilog
942;
943; GFX9-LABEL: usubsat_i32_vs:
944; GFX9:       ; %bb.0:
945; GFX9-NEXT:    v_sub_u32_e64 v0, v0, s0 clamp
946; GFX9-NEXT:    ; return to shader part epilog
947;
948; GFX10PLUS-LABEL: usubsat_i32_vs:
949; GFX10PLUS:       ; %bb.0:
950; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v0, v0, s0 clamp
951; GFX10PLUS-NEXT:    ; return to shader part epilog
952  %result = call i32 @llvm.usub.sat.i32(i32 %lhs, i32 %rhs)
953  %cast = bitcast i32 %result to float
954  ret float %cast
955}
956
; <2 x i32> usub.sat. The checks expect the i32 sequence once per element, with
; lhs in v0-v1 and rhs in v2-v3.
957define <2 x i32> @v_usubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
958; GFX6-LABEL: v_usubsat_v2i32:
959; GFX6:       ; %bb.0:
960; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
961; GFX6-NEXT:    v_min_u32_e32 v2, v0, v2
962; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
963; GFX6-NEXT:    v_min_u32_e32 v2, v1, v3
964; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
965; GFX6-NEXT:    s_setpc_b64 s[30:31]
966;
967; GFX8-LABEL: v_usubsat_v2i32:
968; GFX8:       ; %bb.0:
969; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
970; GFX8-NEXT:    v_sub_u32_e64 v0, s[4:5], v0, v2 clamp
971; GFX8-NEXT:    v_sub_u32_e64 v1, s[4:5], v1, v3 clamp
972; GFX8-NEXT:    s_setpc_b64 s[30:31]
973;
974; GFX9-LABEL: v_usubsat_v2i32:
975; GFX9:       ; %bb.0:
976; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
977; GFX9-NEXT:    v_sub_u32_e64 v0, v0, v2 clamp
978; GFX9-NEXT:    v_sub_u32_e64 v1, v1, v3 clamp
979; GFX9-NEXT:    s_setpc_b64 s[30:31]
980;
981; GFX10PLUS-LABEL: v_usubsat_v2i32:
982; GFX10PLUS:       ; %bb.0:
983; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
984; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v0, v0, v2 clamp
985; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v1, v1, v3 clamp
986; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
987  %result = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
988  ret <2 x i32> %result
989}
990
; <2 x i32> usub.sat on inreg operands (lhs in s0-s1, rhs in s2-s3 in the
; checks). The GFX6 checks use s_min_u32 + s_sub_i32 per element. On the
; clamp-based prefixes each element is subtracted into a VGPR and the results
; return to s0-s1 through v_readfirstlane_b32.
991define amdgpu_ps <2 x i32> @s_usubsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inreg %rhs) {
992; GFX6-LABEL: s_usubsat_v2i32:
993; GFX6:       ; %bb.0:
994; GFX6-NEXT:    s_min_u32 s2, s0, s2
995; GFX6-NEXT:    s_sub_i32 s0, s0, s2
996; GFX6-NEXT:    s_min_u32 s2, s1, s3
997; GFX6-NEXT:    s_sub_i32 s1, s1, s2
998; GFX6-NEXT:    ; return to shader part epilog
999;
1000; GFX8-LABEL: s_usubsat_v2i32:
1001; GFX8:       ; %bb.0:
1002; GFX8-NEXT:    v_mov_b32_e32 v0, s2
1003; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1004; GFX8-NEXT:    v_sub_u32_e64 v0, s[4:5], s0, v0 clamp
1005; GFX8-NEXT:    v_sub_u32_e64 v1, s[0:1], s1, v1 clamp
1006; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
1007; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
1008; GFX8-NEXT:    ; return to shader part epilog
1009;
1010; GFX9-LABEL: s_usubsat_v2i32:
1011; GFX9:       ; %bb.0:
1012; GFX9-NEXT:    v_mov_b32_e32 v0, s2
1013; GFX9-NEXT:    v_mov_b32_e32 v1, s3
1014; GFX9-NEXT:    v_sub_u32_e64 v0, s0, v0 clamp
1015; GFX9-NEXT:    v_sub_u32_e64 v1, s1, v1 clamp
1016; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
1017; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
1018; GFX9-NEXT:    ; return to shader part epilog
1019;
1020; GFX10PLUS-LABEL: s_usubsat_v2i32:
1021; GFX10PLUS:       ; %bb.0:
1022; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v0, s0, s2 clamp
1023; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v1, s1, s3 clamp
1024; GFX10PLUS-NEXT:    v_readfirstlane_b32 s0, v0
1025; GFX10PLUS-NEXT:    v_readfirstlane_b32 s1, v1
1026; GFX10PLUS-NEXT:    ; return to shader part epilog
1027  %result = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
1028  ret <2 x i32> %result
1029}
1030
; <3 x i32> usub.sat. The checks expect the i32 sequence once per element, with
; lhs in v0-v2 and rhs in v3-v5.
1031define <3 x i32> @v_usubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
1032; GFX6-LABEL: v_usubsat_v3i32:
1033; GFX6:       ; %bb.0:
1034; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1035; GFX6-NEXT:    v_min_u32_e32 v3, v0, v3
1036; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
1037; GFX6-NEXT:    v_min_u32_e32 v3, v1, v4
1038; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
1039; GFX6-NEXT:    v_min_u32_e32 v3, v2, v5
1040; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
1041; GFX6-NEXT:    s_setpc_b64 s[30:31]
1042;
1043; GFX8-LABEL: v_usubsat_v3i32:
1044; GFX8:       ; %bb.0:
1045; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1046; GFX8-NEXT:    v_sub_u32_e64 v0, s[4:5], v0, v3 clamp
1047; GFX8-NEXT:    v_sub_u32_e64 v1, s[4:5], v1, v4 clamp
1048; GFX8-NEXT:    v_sub_u32_e64 v2, s[4:5], v2, v5 clamp
1049; GFX8-NEXT:    s_setpc_b64 s[30:31]
1050;
1051; GFX9-LABEL: v_usubsat_v3i32:
1052; GFX9:       ; %bb.0:
1053; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1054; GFX9-NEXT:    v_sub_u32_e64 v0, v0, v3 clamp
1055; GFX9-NEXT:    v_sub_u32_e64 v1, v1, v4 clamp
1056; GFX9-NEXT:    v_sub_u32_e64 v2, v2, v5 clamp
1057; GFX9-NEXT:    s_setpc_b64 s[30:31]
1058;
1059; GFX10PLUS-LABEL: v_usubsat_v3i32:
1060; GFX10PLUS:       ; %bb.0:
1061; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1062; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v0, v0, v3 clamp
1063; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v1, v1, v4 clamp
1064; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v2, v2, v5 clamp
1065; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
1066  %result = call <3 x i32> @llvm.usub.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs)
1067  ret <3 x i32> %result
1068}
1069
; <3 x i32> usub.sat on inreg operands (lhs in s0-s2, rhs in s3-s5 in the
; checks). The GFX6 checks use s_min_u32 + s_sub_i32 per element. On the
; clamp-based prefixes each element is subtracted into a VGPR and the results
; return to s0-s2 through v_readfirstlane_b32.
1070define amdgpu_ps <3 x i32> @s_usubsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inreg %rhs) {
1071; GFX6-LABEL: s_usubsat_v3i32:
1072; GFX6:       ; %bb.0:
1073; GFX6-NEXT:    s_min_u32 s3, s0, s3
1074; GFX6-NEXT:    s_sub_i32 s0, s0, s3
1075; GFX6-NEXT:    s_min_u32 s3, s1, s4
1076; GFX6-NEXT:    s_sub_i32 s1, s1, s3
1077; GFX6-NEXT:    s_min_u32 s3, s2, s5
1078; GFX6-NEXT:    s_sub_i32 s2, s2, s3
1079; GFX6-NEXT:    ; return to shader part epilog
1080;
1081; GFX8-LABEL: s_usubsat_v3i32:
1082; GFX8:       ; %bb.0:
1083; GFX8-NEXT:    v_mov_b32_e32 v0, s3
1084; GFX8-NEXT:    v_mov_b32_e32 v1, s4
1085; GFX8-NEXT:    v_mov_b32_e32 v2, s5
1086; GFX8-NEXT:    v_sub_u32_e64 v0, s[6:7], s0, v0 clamp
1087; GFX8-NEXT:    v_sub_u32_e64 v1, s[0:1], s1, v1 clamp
1088; GFX8-NEXT:    v_sub_u32_e64 v2, s[0:1], s2, v2 clamp
1089; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
1090; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
1091; GFX8-NEXT:    v_readfirstlane_b32 s2, v2
1092; GFX8-NEXT:    ; return to shader part epilog
1093;
1094; GFX9-LABEL: s_usubsat_v3i32:
1095; GFX9:       ; %bb.0:
1096; GFX9-NEXT:    v_mov_b32_e32 v0, s3
1097; GFX9-NEXT:    v_mov_b32_e32 v1, s4
1098; GFX9-NEXT:    v_mov_b32_e32 v2, s5
1099; GFX9-NEXT:    v_sub_u32_e64 v0, s0, v0 clamp
1100; GFX9-NEXT:    v_sub_u32_e64 v1, s1, v1 clamp
1101; GFX9-NEXT:    v_sub_u32_e64 v2, s2, v2 clamp
1102; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
1103; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
1104; GFX9-NEXT:    v_readfirstlane_b32 s2, v2
1105; GFX9-NEXT:    ; return to shader part epilog
1106;
1107; GFX10PLUS-LABEL: s_usubsat_v3i32:
1108; GFX10PLUS:       ; %bb.0:
1109; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v0, s0, s3 clamp
1110; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v1, s1, s4 clamp
1111; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v2, s2, s5 clamp
1112; GFX10PLUS-NEXT:    v_readfirstlane_b32 s0, v0
1113; GFX10PLUS-NEXT:    v_readfirstlane_b32 s1, v1
1114; GFX10PLUS-NEXT:    v_readfirstlane_b32 s2, v2
1115; GFX10PLUS-NEXT:    ; return to shader part epilog
1116  %result = call <3 x i32> @llvm.usub.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs)
1117  ret <3 x i32> %result
1118}
1119
; <4 x i32> usub.sat. The checks expect the i32 sequence once per element, with
; lhs in v0-v3 and rhs in v4-v7.
1120define <4 x i32> @v_usubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
1121; GFX6-LABEL: v_usubsat_v4i32:
1122; GFX6:       ; %bb.0:
1123; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1124; GFX6-NEXT:    v_min_u32_e32 v4, v0, v4
1125; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
1126; GFX6-NEXT:    v_min_u32_e32 v4, v1, v5
1127; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v4
1128; GFX6-NEXT:    v_min_u32_e32 v4, v2, v6
1129; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v4
1130; GFX6-NEXT:    v_min_u32_e32 v4, v3, v7
1131; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v4
1132; GFX6-NEXT:    s_setpc_b64 s[30:31]
1133;
1134; GFX8-LABEL: v_usubsat_v4i32:
1135; GFX8:       ; %bb.0:
1136; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1137; GFX8-NEXT:    v_sub_u32_e64 v0, s[4:5], v0, v4 clamp
1138; GFX8-NEXT:    v_sub_u32_e64 v1, s[4:5], v1, v5 clamp
1139; GFX8-NEXT:    v_sub_u32_e64 v2, s[4:5], v2, v6 clamp
1140; GFX8-NEXT:    v_sub_u32_e64 v3, s[4:5], v3, v7 clamp
1141; GFX8-NEXT:    s_setpc_b64 s[30:31]
1142;
1143; GFX9-LABEL: v_usubsat_v4i32:
1144; GFX9:       ; %bb.0:
1145; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1146; GFX9-NEXT:    v_sub_u32_e64 v0, v0, v4 clamp
1147; GFX9-NEXT:    v_sub_u32_e64 v1, v1, v5 clamp
1148; GFX9-NEXT:    v_sub_u32_e64 v2, v2, v6 clamp
1149; GFX9-NEXT:    v_sub_u32_e64 v3, v3, v7 clamp
1150; GFX9-NEXT:    s_setpc_b64 s[30:31]
1151;
1152; GFX10PLUS-LABEL: v_usubsat_v4i32:
1153; GFX10PLUS:       ; %bb.0:
1154; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1155; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v0, v0, v4 clamp
1156; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v1, v1, v5 clamp
1157; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v2, v2, v6 clamp
1158; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v3, v3, v7 clamp
1159; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
1160  %result = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
1161  ret <4 x i32> %result
1162}
1163
; <4 x i32> usub.sat on inreg operands (lhs in s0-s3, rhs in s4-s7 in the
; checks). The GFX6 checks use s_min_u32 + s_sub_i32 per element. On the
; clamp-based prefixes each element is subtracted into a VGPR and the results
; return to s0-s3 through v_readfirstlane_b32.
1164define amdgpu_ps <4 x i32> @s_usubsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inreg %rhs) {
1165; GFX6-LABEL: s_usubsat_v4i32:
1166; GFX6:       ; %bb.0:
1167; GFX6-NEXT:    s_min_u32 s4, s0, s4
1168; GFX6-NEXT:    s_sub_i32 s0, s0, s4
1169; GFX6-NEXT:    s_min_u32 s4, s1, s5
1170; GFX6-NEXT:    s_sub_i32 s1, s1, s4
1171; GFX6-NEXT:    s_min_u32 s4, s2, s6
1172; GFX6-NEXT:    s_sub_i32 s2, s2, s4
1173; GFX6-NEXT:    s_min_u32 s4, s3, s7
1174; GFX6-NEXT:    s_sub_i32 s3, s3, s4
1175; GFX6-NEXT:    ; return to shader part epilog
1176;
1177; GFX8-LABEL: s_usubsat_v4i32:
1178; GFX8:       ; %bb.0:
1179; GFX8-NEXT:    v_mov_b32_e32 v0, s4
1180; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1181; GFX8-NEXT:    v_mov_b32_e32 v2, s6
1182; GFX8-NEXT:    v_mov_b32_e32 v3, s7
1183; GFX8-NEXT:    v_sub_u32_e64 v0, s[8:9], s0, v0 clamp
1184; GFX8-NEXT:    v_sub_u32_e64 v1, s[0:1], s1, v1 clamp
1185; GFX8-NEXT:    v_sub_u32_e64 v2, s[0:1], s2, v2 clamp
1186; GFX8-NEXT:    v_sub_u32_e64 v3, s[0:1], s3, v3 clamp
1187; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
1188; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
1189; GFX8-NEXT:    v_readfirstlane_b32 s2, v2
1190; GFX8-NEXT:    v_readfirstlane_b32 s3, v3
1191; GFX8-NEXT:    ; return to shader part epilog
1192;
1193; GFX9-LABEL: s_usubsat_v4i32:
1194; GFX9:       ; %bb.0:
1195; GFX9-NEXT:    v_mov_b32_e32 v0, s4
1196; GFX9-NEXT:    v_mov_b32_e32 v1, s5
1197; GFX9-NEXT:    v_mov_b32_e32 v2, s6
1198; GFX9-NEXT:    v_mov_b32_e32 v3, s7
1199; GFX9-NEXT:    v_sub_u32_e64 v0, s0, v0 clamp
1200; GFX9-NEXT:    v_sub_u32_e64 v1, s1, v1 clamp
1201; GFX9-NEXT:    v_sub_u32_e64 v2, s2, v2 clamp
1202; GFX9-NEXT:    v_sub_u32_e64 v3, s3, v3 clamp
1203; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
1204; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
1205; GFX9-NEXT:    v_readfirstlane_b32 s2, v2
1206; GFX9-NEXT:    v_readfirstlane_b32 s3, v3
1207; GFX9-NEXT:    ; return to shader part epilog
1208;
1209; GFX10PLUS-LABEL: s_usubsat_v4i32:
1210; GFX10PLUS:       ; %bb.0:
1211; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v0, s0, s4 clamp
1212; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v1, s1, s5 clamp
1213; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v2, s2, s6 clamp
1214; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v3, s3, s7 clamp
1215; GFX10PLUS-NEXT:    v_readfirstlane_b32 s0, v0
1216; GFX10PLUS-NEXT:    v_readfirstlane_b32 s1, v1
1217; GFX10PLUS-NEXT:    v_readfirstlane_b32 s2, v2
1218; GFX10PLUS-NEXT:    v_readfirstlane_b32 s3, v3
1219; GFX10PLUS-NEXT:    ; return to shader part epilog
1220  %result = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
1221  ret <4 x i32> %result
1222}
1223
; <5 x i32> usub.sat. The checks expect the i32 sequence once per element, with
; lhs in v0-v4 and rhs in v5-v9.
1224define <5 x i32> @v_usubsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
1225; GFX6-LABEL: v_usubsat_v5i32:
1226; GFX6:       ; %bb.0:
1227; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1228; GFX6-NEXT:    v_min_u32_e32 v5, v0, v5
1229; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
1230; GFX6-NEXT:    v_min_u32_e32 v5, v1, v6
1231; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v5
1232; GFX6-NEXT:    v_min_u32_e32 v5, v2, v7
1233; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v5
1234; GFX6-NEXT:    v_min_u32_e32 v5, v3, v8
1235; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v5
1236; GFX6-NEXT:    v_min_u32_e32 v5, v4, v9
1237; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v4, v5
1238; GFX6-NEXT:    s_setpc_b64 s[30:31]
1239;
1240; GFX8-LABEL: v_usubsat_v5i32:
1241; GFX8:       ; %bb.0:
1242; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1243; GFX8-NEXT:    v_sub_u32_e64 v0, s[4:5], v0, v5 clamp
1244; GFX8-NEXT:    v_sub_u32_e64 v1, s[4:5], v1, v6 clamp
1245; GFX8-NEXT:    v_sub_u32_e64 v2, s[4:5], v2, v7 clamp
1246; GFX8-NEXT:    v_sub_u32_e64 v3, s[4:5], v3, v8 clamp
1247; GFX8-NEXT:    v_sub_u32_e64 v4, s[4:5], v4, v9 clamp
1248; GFX8-NEXT:    s_setpc_b64 s[30:31]
1249;
1250; GFX9-LABEL: v_usubsat_v5i32:
1251; GFX9:       ; %bb.0:
1252; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1253; GFX9-NEXT:    v_sub_u32_e64 v0, v0, v5 clamp
1254; GFX9-NEXT:    v_sub_u32_e64 v1, v1, v6 clamp
1255; GFX9-NEXT:    v_sub_u32_e64 v2, v2, v7 clamp
1256; GFX9-NEXT:    v_sub_u32_e64 v3, v3, v8 clamp
1257; GFX9-NEXT:    v_sub_u32_e64 v4, v4, v9 clamp
1258; GFX9-NEXT:    s_setpc_b64 s[30:31]
1259;
1260; GFX10PLUS-LABEL: v_usubsat_v5i32:
1261; GFX10PLUS:       ; %bb.0:
1262; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1263; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v0, v0, v5 clamp
1264; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v1, v1, v6 clamp
1265; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v2, v2, v7 clamp
1266; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v3, v3, v8 clamp
1267; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v4, v4, v9 clamp
1268; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
1269  %result = call <5 x i32> @llvm.usub.sat.v5i32(<5 x i32> %lhs, <5 x i32> %rhs)
1270  ret <5 x i32> %result
1271}
1272
; <5 x i32> usub.sat on inreg operands (lhs in s0-s4, rhs in s5-s9 in the
; checks). The GFX6 checks use s_min_u32 + s_sub_i32 per element. On the
; clamp-based prefixes each element is subtracted into a VGPR and the results
; return to s0-s4 through v_readfirstlane_b32.
1273define amdgpu_ps <5 x i32> @s_usubsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inreg %rhs) {
1274; GFX6-LABEL: s_usubsat_v5i32:
1275; GFX6:       ; %bb.0:
1276; GFX6-NEXT:    s_min_u32 s5, s0, s5
1277; GFX6-NEXT:    s_sub_i32 s0, s0, s5
1278; GFX6-NEXT:    s_min_u32 s5, s1, s6
1279; GFX6-NEXT:    s_sub_i32 s1, s1, s5
1280; GFX6-NEXT:    s_min_u32 s5, s2, s7
1281; GFX6-NEXT:    s_sub_i32 s2, s2, s5
1282; GFX6-NEXT:    s_min_u32 s5, s3, s8
1283; GFX6-NEXT:    s_sub_i32 s3, s3, s5
1284; GFX6-NEXT:    s_min_u32 s5, s4, s9
1285; GFX6-NEXT:    s_sub_i32 s4, s4, s5
1286; GFX6-NEXT:    ; return to shader part epilog
1287;
1288; GFX8-LABEL: s_usubsat_v5i32:
1289; GFX8:       ; %bb.0:
1290; GFX8-NEXT:    v_mov_b32_e32 v0, s5
1291; GFX8-NEXT:    v_mov_b32_e32 v1, s6
1292; GFX8-NEXT:    v_mov_b32_e32 v2, s7
1293; GFX8-NEXT:    v_mov_b32_e32 v3, s8
1294; GFX8-NEXT:    v_mov_b32_e32 v4, s9
1295; GFX8-NEXT:    v_sub_u32_e64 v0, s[10:11], s0, v0 clamp
1296; GFX8-NEXT:    v_sub_u32_e64 v1, s[0:1], s1, v1 clamp
1297; GFX8-NEXT:    v_sub_u32_e64 v2, s[0:1], s2, v2 clamp
1298; GFX8-NEXT:    v_sub_u32_e64 v3, s[0:1], s3, v3 clamp
1299; GFX8-NEXT:    v_sub_u32_e64 v4, s[0:1], s4, v4 clamp
1300; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
1301; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
1302; GFX8-NEXT:    v_readfirstlane_b32 s2, v2
1303; GFX8-NEXT:    v_readfirstlane_b32 s3, v3
1304; GFX8-NEXT:    v_readfirstlane_b32 s4, v4
1305; GFX8-NEXT:    ; return to shader part epilog
1306;
1307; GFX9-LABEL: s_usubsat_v5i32:
1308; GFX9:       ; %bb.0:
1309; GFX9-NEXT:    v_mov_b32_e32 v0, s5
1310; GFX9-NEXT:    v_mov_b32_e32 v1, s6
1311; GFX9-NEXT:    v_mov_b32_e32 v2, s7
1312; GFX9-NEXT:    v_mov_b32_e32 v3, s8
1313; GFX9-NEXT:    v_mov_b32_e32 v4, s9
1314; GFX9-NEXT:    v_sub_u32_e64 v0, s0, v0 clamp
1315; GFX9-NEXT:    v_sub_u32_e64 v1, s1, v1 clamp
1316; GFX9-NEXT:    v_sub_u32_e64 v2, s2, v2 clamp
1317; GFX9-NEXT:    v_sub_u32_e64 v3, s3, v3 clamp
1318; GFX9-NEXT:    v_sub_u32_e64 v4, s4, v4 clamp
1319; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
1320; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
1321; GFX9-NEXT:    v_readfirstlane_b32 s2, v2
1322; GFX9-NEXT:    v_readfirstlane_b32 s3, v3
1323; GFX9-NEXT:    v_readfirstlane_b32 s4, v4
1324; GFX9-NEXT:    ; return to shader part epilog
1325;
1326; GFX10PLUS-LABEL: s_usubsat_v5i32:
1327; GFX10PLUS:       ; %bb.0:
1328; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v0, s0, s5 clamp
1329; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v1, s1, s6 clamp
1330; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v2, s2, s7 clamp
1331; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v3, s3, s8 clamp
1332; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v4, s4, s9 clamp
1333; GFX10PLUS-NEXT:    v_readfirstlane_b32 s0, v0
1334; GFX10PLUS-NEXT:    v_readfirstlane_b32 s1, v1
1335; GFX10PLUS-NEXT:    v_readfirstlane_b32 s2, v2
1336; GFX10PLUS-NEXT:    v_readfirstlane_b32 s3, v3
1337; GFX10PLUS-NEXT:    v_readfirstlane_b32 s4, v4
1338; GFX10PLUS-NEXT:    ; return to shader part epilog
1339  %result = call <5 x i32> @llvm.usub.sat.v5i32(<5 x i32> %lhs, <5 x i32> %rhs)
1340  ret <5 x i32> %result
1341}
1342
; <16 x i32> usub.sat. The two operands take 32 dwords. The checks show v0-v30
; holding the first 31 and the last rhs element being loaded through s32
; (buffer_load_dword, or scratch_load_b32 on GFX11), with an s_waitcnt vmcnt(0)
; before the final subtract that consumes it. GFX10 and GFX11 have separate
; check blocks here; the visible difference between them is that load
; instruction.
1343define <16 x i32> @v_usubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
1344; GFX6-LABEL: v_usubsat_v16i32:
1345; GFX6:       ; %bb.0:
1346; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1347; GFX6-NEXT:    v_min_u32_e32 v16, v0, v16
1348; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v16
1349; GFX6-NEXT:    v_min_u32_e32 v16, v1, v17
1350; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v16
1351; GFX6-NEXT:    v_min_u32_e32 v16, v2, v18
1352; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v16
1353; GFX6-NEXT:    v_min_u32_e32 v16, v3, v19
1354; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v16
1355; GFX6-NEXT:    buffer_load_dword v16, off, s[0:3], s32
1356; GFX6-NEXT:    v_min_u32_e32 v17, v4, v20
1357; GFX6-NEXT:    v_min_u32_e32 v18, v5, v21
1358; GFX6-NEXT:    v_min_u32_e32 v19, v6, v22
1359; GFX6-NEXT:    v_min_u32_e32 v20, v7, v23
1360; GFX6-NEXT:    v_min_u32_e32 v21, v8, v24
1361; GFX6-NEXT:    v_min_u32_e32 v22, v9, v25
1362; GFX6-NEXT:    v_min_u32_e32 v23, v10, v26
1363; GFX6-NEXT:    v_min_u32_e32 v24, v11, v27
1364; GFX6-NEXT:    v_min_u32_e32 v25, v12, v28
1365; GFX6-NEXT:    v_min_u32_e32 v26, v13, v29
1366; GFX6-NEXT:    v_min_u32_e32 v27, v14, v30
1367; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v4, v17
1368; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v18
1369; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v6, v19
1370; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v7, v20
1371; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, v8, v21
1372; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v9, v22
1373; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v10, v23
1374; GFX6-NEXT:    v_sub_i32_e32 v11, vcc, v11, v24
1375; GFX6-NEXT:    v_sub_i32_e32 v12, vcc, v12, v25
1376; GFX6-NEXT:    v_sub_i32_e32 v13, vcc, v13, v26
1377; GFX6-NEXT:    v_sub_i32_e32 v14, vcc, v14, v27
1378; GFX6-NEXT:    s_waitcnt vmcnt(0)
1379; GFX6-NEXT:    v_min_u32_e32 v16, v15, v16
1380; GFX6-NEXT:    v_sub_i32_e32 v15, vcc, v15, v16
1381; GFX6-NEXT:    s_setpc_b64 s[30:31]
1382;
1383; GFX8-LABEL: v_usubsat_v16i32:
1384; GFX8:       ; %bb.0:
1385; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1386; GFX8-NEXT:    v_sub_u32_e64 v0, s[4:5], v0, v16 clamp
1387; GFX8-NEXT:    buffer_load_dword v16, off, s[0:3], s32
1388; GFX8-NEXT:    v_sub_u32_e64 v1, s[4:5], v1, v17 clamp
1389; GFX8-NEXT:    v_sub_u32_e64 v2, s[4:5], v2, v18 clamp
1390; GFX8-NEXT:    v_sub_u32_e64 v3, s[4:5], v3, v19 clamp
1391; GFX8-NEXT:    v_sub_u32_e64 v4, s[4:5], v4, v20 clamp
1392; GFX8-NEXT:    v_sub_u32_e64 v5, s[4:5], v5, v21 clamp
1393; GFX8-NEXT:    v_sub_u32_e64 v6, s[4:5], v6, v22 clamp
1394; GFX8-NEXT:    v_sub_u32_e64 v7, s[4:5], v7, v23 clamp
1395; GFX8-NEXT:    v_sub_u32_e64 v8, s[4:5], v8, v24 clamp
1396; GFX8-NEXT:    v_sub_u32_e64 v9, s[4:5], v9, v25 clamp
1397; GFX8-NEXT:    v_sub_u32_e64 v10, s[4:5], v10, v26 clamp
1398; GFX8-NEXT:    v_sub_u32_e64 v11, s[4:5], v11, v27 clamp
1399; GFX8-NEXT:    v_sub_u32_e64 v12, s[4:5], v12, v28 clamp
1400; GFX8-NEXT:    v_sub_u32_e64 v13, s[4:5], v13, v29 clamp
1401; GFX8-NEXT:    v_sub_u32_e64 v14, s[4:5], v14, v30 clamp
1402; GFX8-NEXT:    s_waitcnt vmcnt(0)
1403; GFX8-NEXT:    v_sub_u32_e64 v15, s[4:5], v15, v16 clamp
1404; GFX8-NEXT:    s_setpc_b64 s[30:31]
1405;
1406; GFX9-LABEL: v_usubsat_v16i32:
1407; GFX9:       ; %bb.0:
1408; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1409; GFX9-NEXT:    v_sub_u32_e64 v0, v0, v16 clamp
1410; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32
1411; GFX9-NEXT:    v_sub_u32_e64 v1, v1, v17 clamp
1412; GFX9-NEXT:    v_sub_u32_e64 v2, v2, v18 clamp
1413; GFX9-NEXT:    v_sub_u32_e64 v3, v3, v19 clamp
1414; GFX9-NEXT:    v_sub_u32_e64 v4, v4, v20 clamp
1415; GFX9-NEXT:    v_sub_u32_e64 v5, v5, v21 clamp
1416; GFX9-NEXT:    v_sub_u32_e64 v6, v6, v22 clamp
1417; GFX9-NEXT:    v_sub_u32_e64 v7, v7, v23 clamp
1418; GFX9-NEXT:    v_sub_u32_e64 v8, v8, v24 clamp
1419; GFX9-NEXT:    v_sub_u32_e64 v9, v9, v25 clamp
1420; GFX9-NEXT:    v_sub_u32_e64 v10, v10, v26 clamp
1421; GFX9-NEXT:    v_sub_u32_e64 v11, v11, v27 clamp
1422; GFX9-NEXT:    v_sub_u32_e64 v12, v12, v28 clamp
1423; GFX9-NEXT:    v_sub_u32_e64 v13, v13, v29 clamp
1424; GFX9-NEXT:    v_sub_u32_e64 v14, v14, v30 clamp
1425; GFX9-NEXT:    s_waitcnt vmcnt(0)
1426; GFX9-NEXT:    v_sub_u32_e64 v15, v15, v16 clamp
1427; GFX9-NEXT:    s_setpc_b64 s[30:31]
1428;
1429; GFX10-LABEL: v_usubsat_v16i32:
1430; GFX10:       ; %bb.0:
1431; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1432; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
1433; GFX10-NEXT:    v_sub_nc_u32_e64 v0, v0, v16 clamp
1434; GFX10-NEXT:    v_sub_nc_u32_e64 v1, v1, v17 clamp
1435; GFX10-NEXT:    v_sub_nc_u32_e64 v2, v2, v18 clamp
1436; GFX10-NEXT:    v_sub_nc_u32_e64 v3, v3, v19 clamp
1437; GFX10-NEXT:    v_sub_nc_u32_e64 v4, v4, v20 clamp
1438; GFX10-NEXT:    v_sub_nc_u32_e64 v5, v5, v21 clamp
1439; GFX10-NEXT:    v_sub_nc_u32_e64 v6, v6, v22 clamp
1440; GFX10-NEXT:    v_sub_nc_u32_e64 v7, v7, v23 clamp
1441; GFX10-NEXT:    v_sub_nc_u32_e64 v8, v8, v24 clamp
1442; GFX10-NEXT:    v_sub_nc_u32_e64 v9, v9, v25 clamp
1443; GFX10-NEXT:    v_sub_nc_u32_e64 v10, v10, v26 clamp
1444; GFX10-NEXT:    v_sub_nc_u32_e64 v11, v11, v27 clamp
1445; GFX10-NEXT:    v_sub_nc_u32_e64 v12, v12, v28 clamp
1446; GFX10-NEXT:    v_sub_nc_u32_e64 v13, v13, v29 clamp
1447; GFX10-NEXT:    v_sub_nc_u32_e64 v14, v14, v30 clamp
1448; GFX10-NEXT:    s_waitcnt vmcnt(0)
1449; GFX10-NEXT:    v_sub_nc_u32_e64 v15, v15, v31 clamp
1450; GFX10-NEXT:    s_setpc_b64 s[30:31]
1451;
1452; GFX11-LABEL: v_usubsat_v16i32:
1453; GFX11:       ; %bb.0:
1454; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1455; GFX11-NEXT:    scratch_load_b32 v31, off, s32
1456; GFX11-NEXT:    v_sub_nc_u32_e64 v0, v0, v16 clamp
1457; GFX11-NEXT:    v_sub_nc_u32_e64 v1, v1, v17 clamp
1458; GFX11-NEXT:    v_sub_nc_u32_e64 v2, v2, v18 clamp
1459; GFX11-NEXT:    v_sub_nc_u32_e64 v3, v3, v19 clamp
1460; GFX11-NEXT:    v_sub_nc_u32_e64 v4, v4, v20 clamp
1461; GFX11-NEXT:    v_sub_nc_u32_e64 v5, v5, v21 clamp
1462; GFX11-NEXT:    v_sub_nc_u32_e64 v6, v6, v22 clamp
1463; GFX11-NEXT:    v_sub_nc_u32_e64 v7, v7, v23 clamp
1464; GFX11-NEXT:    v_sub_nc_u32_e64 v8, v8, v24 clamp
1465; GFX11-NEXT:    v_sub_nc_u32_e64 v9, v9, v25 clamp
1466; GFX11-NEXT:    v_sub_nc_u32_e64 v10, v10, v26 clamp
1467; GFX11-NEXT:    v_sub_nc_u32_e64 v11, v11, v27 clamp
1468; GFX11-NEXT:    v_sub_nc_u32_e64 v12, v12, v28 clamp
1469; GFX11-NEXT:    v_sub_nc_u32_e64 v13, v13, v29 clamp
1470; GFX11-NEXT:    v_sub_nc_u32_e64 v14, v14, v30 clamp
1471; GFX11-NEXT:    s_waitcnt vmcnt(0)
1472; GFX11-NEXT:    v_sub_nc_u32_e64 v15, v15, v31 clamp
1473; GFX11-NEXT:    s_setpc_b64 s[30:31]
1474  %result = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs)
1475  ret <16 x i32> %result
1476}
1477
; Shader (amdgpu_ps) test of llvm.usub.sat on <16 x i32> where both operands
; are marked inreg. The expected code differs per RUN line. The tahiti checks
; pair s_min_u32 with s_sub_i32 for every element. The fiji and gfx900 checks
; first copy s16-s31 into VGPRs and then use a clamped v_sub_u32. The
; gfx1010/gfx1100 checks feed both SGPR operands straight into a clamped
; v_sub_nc_u32. Every VALU variant finishes with v_readfirstlane_b32 copies of
; v0-v15 into s0-s15.
; The assertions below are autogenerated, so regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
1478define amdgpu_ps <16 x i32> @s_usubsat_v16i32(<16 x i32> inreg %lhs, <16 x i32> inreg %rhs) {
1479; GFX6-LABEL: s_usubsat_v16i32:
1480; GFX6:       ; %bb.0:
1481; GFX6-NEXT:    s_min_u32 s16, s0, s16
1482; GFX6-NEXT:    s_sub_i32 s0, s0, s16
1483; GFX6-NEXT:    s_min_u32 s16, s1, s17
1484; GFX6-NEXT:    s_sub_i32 s1, s1, s16
1485; GFX6-NEXT:    s_min_u32 s16, s2, s18
1486; GFX6-NEXT:    s_sub_i32 s2, s2, s16
1487; GFX6-NEXT:    s_min_u32 s16, s3, s19
1488; GFX6-NEXT:    s_sub_i32 s3, s3, s16
1489; GFX6-NEXT:    s_min_u32 s16, s4, s20
1490; GFX6-NEXT:    s_sub_i32 s4, s4, s16
1491; GFX6-NEXT:    s_min_u32 s16, s5, s21
1492; GFX6-NEXT:    s_sub_i32 s5, s5, s16
1493; GFX6-NEXT:    s_min_u32 s16, s6, s22
1494; GFX6-NEXT:    s_sub_i32 s6, s6, s16
1495; GFX6-NEXT:    s_min_u32 s16, s7, s23
1496; GFX6-NEXT:    s_sub_i32 s7, s7, s16
1497; GFX6-NEXT:    s_min_u32 s16, s8, s24
1498; GFX6-NEXT:    s_sub_i32 s8, s8, s16
1499; GFX6-NEXT:    s_min_u32 s16, s9, s25
1500; GFX6-NEXT:    s_sub_i32 s9, s9, s16
1501; GFX6-NEXT:    s_min_u32 s16, s10, s26
1502; GFX6-NEXT:    s_sub_i32 s10, s10, s16
1503; GFX6-NEXT:    s_min_u32 s16, s11, s27
1504; GFX6-NEXT:    s_sub_i32 s11, s11, s16
1505; GFX6-NEXT:    s_min_u32 s16, s12, s28
1506; GFX6-NEXT:    s_sub_i32 s12, s12, s16
1507; GFX6-NEXT:    s_min_u32 s16, s13, s29
1508; GFX6-NEXT:    s_sub_i32 s13, s13, s16
1509; GFX6-NEXT:    s_min_u32 s16, s14, s30
1510; GFX6-NEXT:    s_sub_i32 s14, s14, s16
1511; GFX6-NEXT:    s_min_u32 s16, s15, s31
1512; GFX6-NEXT:    s_sub_i32 s15, s15, s16
1513; GFX6-NEXT:    ; return to shader part epilog
1514;
1515; GFX8-LABEL: s_usubsat_v16i32:
1516; GFX8:       ; %bb.0:
1517; GFX8-NEXT:    v_mov_b32_e32 v0, s16
1518; GFX8-NEXT:    v_mov_b32_e32 v1, s17
1519; GFX8-NEXT:    v_mov_b32_e32 v2, s18
1520; GFX8-NEXT:    v_mov_b32_e32 v3, s19
1521; GFX8-NEXT:    v_mov_b32_e32 v4, s20
1522; GFX8-NEXT:    v_mov_b32_e32 v5, s21
1523; GFX8-NEXT:    v_mov_b32_e32 v6, s22
1524; GFX8-NEXT:    v_mov_b32_e32 v7, s23
1525; GFX8-NEXT:    v_mov_b32_e32 v8, s24
1526; GFX8-NEXT:    v_mov_b32_e32 v9, s25
1527; GFX8-NEXT:    v_mov_b32_e32 v10, s26
1528; GFX8-NEXT:    v_mov_b32_e32 v11, s27
1529; GFX8-NEXT:    v_mov_b32_e32 v12, s28
1530; GFX8-NEXT:    v_mov_b32_e32 v13, s29
1531; GFX8-NEXT:    v_mov_b32_e32 v14, s30
1532; GFX8-NEXT:    v_mov_b32_e32 v15, s31
1533; GFX8-NEXT:    v_sub_u32_e64 v0, s[32:33], s0, v0 clamp
1534; GFX8-NEXT:    v_sub_u32_e64 v1, s[16:17], s1, v1 clamp
1535; GFX8-NEXT:    v_sub_u32_e64 v2, s[16:17], s2, v2 clamp
1536; GFX8-NEXT:    v_sub_u32_e64 v3, s[2:3], s3, v3 clamp
1537; GFX8-NEXT:    v_sub_u32_e64 v4, s[2:3], s4, v4 clamp
1538; GFX8-NEXT:    v_sub_u32_e64 v5, s[2:3], s5, v5 clamp
1539; GFX8-NEXT:    v_sub_u32_e64 v6, s[2:3], s6, v6 clamp
1540; GFX8-NEXT:    v_sub_u32_e64 v7, s[2:3], s7, v7 clamp
1541; GFX8-NEXT:    v_sub_u32_e64 v8, s[2:3], s8, v8 clamp
1542; GFX8-NEXT:    v_sub_u32_e64 v9, s[2:3], s9, v9 clamp
1543; GFX8-NEXT:    v_sub_u32_e64 v10, s[2:3], s10, v10 clamp
1544; GFX8-NEXT:    v_sub_u32_e64 v11, s[2:3], s11, v11 clamp
1545; GFX8-NEXT:    v_sub_u32_e64 v12, s[2:3], s12, v12 clamp
1546; GFX8-NEXT:    v_sub_u32_e64 v13, s[2:3], s13, v13 clamp
1547; GFX8-NEXT:    v_sub_u32_e64 v14, s[2:3], s14, v14 clamp
1548; GFX8-NEXT:    v_sub_u32_e64 v15, s[2:3], s15, v15 clamp
1549; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
1550; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
1551; GFX8-NEXT:    v_readfirstlane_b32 s2, v2
1552; GFX8-NEXT:    v_readfirstlane_b32 s3, v3
1553; GFX8-NEXT:    v_readfirstlane_b32 s4, v4
1554; GFX8-NEXT:    v_readfirstlane_b32 s5, v5
1555; GFX8-NEXT:    v_readfirstlane_b32 s6, v6
1556; GFX8-NEXT:    v_readfirstlane_b32 s7, v7
1557; GFX8-NEXT:    v_readfirstlane_b32 s8, v8
1558; GFX8-NEXT:    v_readfirstlane_b32 s9, v9
1559; GFX8-NEXT:    v_readfirstlane_b32 s10, v10
1560; GFX8-NEXT:    v_readfirstlane_b32 s11, v11
1561; GFX8-NEXT:    v_readfirstlane_b32 s12, v12
1562; GFX8-NEXT:    v_readfirstlane_b32 s13, v13
1563; GFX8-NEXT:    v_readfirstlane_b32 s14, v14
1564; GFX8-NEXT:    v_readfirstlane_b32 s15, v15
1565; GFX8-NEXT:    ; return to shader part epilog
1566;
1567; GFX9-LABEL: s_usubsat_v16i32:
1568; GFX9:       ; %bb.0:
1569; GFX9-NEXT:    v_mov_b32_e32 v0, s16
1570; GFX9-NEXT:    v_mov_b32_e32 v1, s17
1571; GFX9-NEXT:    v_mov_b32_e32 v2, s18
1572; GFX9-NEXT:    v_mov_b32_e32 v3, s19
1573; GFX9-NEXT:    v_mov_b32_e32 v4, s20
1574; GFX9-NEXT:    v_mov_b32_e32 v5, s21
1575; GFX9-NEXT:    v_mov_b32_e32 v6, s22
1576; GFX9-NEXT:    v_mov_b32_e32 v7, s23
1577; GFX9-NEXT:    v_mov_b32_e32 v8, s24
1578; GFX9-NEXT:    v_mov_b32_e32 v9, s25
1579; GFX9-NEXT:    v_mov_b32_e32 v10, s26
1580; GFX9-NEXT:    v_mov_b32_e32 v11, s27
1581; GFX9-NEXT:    v_mov_b32_e32 v12, s28
1582; GFX9-NEXT:    v_mov_b32_e32 v13, s29
1583; GFX9-NEXT:    v_mov_b32_e32 v14, s30
1584; GFX9-NEXT:    v_mov_b32_e32 v15, s31
1585; GFX9-NEXT:    v_sub_u32_e64 v0, s0, v0 clamp
1586; GFX9-NEXT:    v_sub_u32_e64 v1, s1, v1 clamp
1587; GFX9-NEXT:    v_sub_u32_e64 v2, s2, v2 clamp
1588; GFX9-NEXT:    v_sub_u32_e64 v3, s3, v3 clamp
1589; GFX9-NEXT:    v_sub_u32_e64 v4, s4, v4 clamp
1590; GFX9-NEXT:    v_sub_u32_e64 v5, s5, v5 clamp
1591; GFX9-NEXT:    v_sub_u32_e64 v6, s6, v6 clamp
1592; GFX9-NEXT:    v_sub_u32_e64 v7, s7, v7 clamp
1593; GFX9-NEXT:    v_sub_u32_e64 v8, s8, v8 clamp
1594; GFX9-NEXT:    v_sub_u32_e64 v9, s9, v9 clamp
1595; GFX9-NEXT:    v_sub_u32_e64 v10, s10, v10 clamp
1596; GFX9-NEXT:    v_sub_u32_e64 v11, s11, v11 clamp
1597; GFX9-NEXT:    v_sub_u32_e64 v12, s12, v12 clamp
1598; GFX9-NEXT:    v_sub_u32_e64 v13, s13, v13 clamp
1599; GFX9-NEXT:    v_sub_u32_e64 v14, s14, v14 clamp
1600; GFX9-NEXT:    v_sub_u32_e64 v15, s15, v15 clamp
1601; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
1602; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
1603; GFX9-NEXT:    v_readfirstlane_b32 s2, v2
1604; GFX9-NEXT:    v_readfirstlane_b32 s3, v3
1605; GFX9-NEXT:    v_readfirstlane_b32 s4, v4
1606; GFX9-NEXT:    v_readfirstlane_b32 s5, v5
1607; GFX9-NEXT:    v_readfirstlane_b32 s6, v6
1608; GFX9-NEXT:    v_readfirstlane_b32 s7, v7
1609; GFX9-NEXT:    v_readfirstlane_b32 s8, v8
1610; GFX9-NEXT:    v_readfirstlane_b32 s9, v9
1611; GFX9-NEXT:    v_readfirstlane_b32 s10, v10
1612; GFX9-NEXT:    v_readfirstlane_b32 s11, v11
1613; GFX9-NEXT:    v_readfirstlane_b32 s12, v12
1614; GFX9-NEXT:    v_readfirstlane_b32 s13, v13
1615; GFX9-NEXT:    v_readfirstlane_b32 s14, v14
1616; GFX9-NEXT:    v_readfirstlane_b32 s15, v15
1617; GFX9-NEXT:    ; return to shader part epilog
1618;
1619; GFX10PLUS-LABEL: s_usubsat_v16i32:
1620; GFX10PLUS:       ; %bb.0:
1621; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v0, s0, s16 clamp
1622; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v1, s1, s17 clamp
1623; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v2, s2, s18 clamp
1624; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v3, s3, s19 clamp
1625; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v4, s4, s20 clamp
1626; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v5, s5, s21 clamp
1627; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v6, s6, s22 clamp
1628; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v7, s7, s23 clamp
1629; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v8, s8, s24 clamp
1630; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v9, s9, s25 clamp
1631; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v10, s10, s26 clamp
1632; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v11, s11, s27 clamp
1633; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v12, s12, s28 clamp
1634; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v13, s13, s29 clamp
1635; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v14, s14, s30 clamp
1636; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v15, s15, s31 clamp
1637; GFX10PLUS-NEXT:    v_readfirstlane_b32 s0, v0
1638; GFX10PLUS-NEXT:    v_readfirstlane_b32 s1, v1
1639; GFX10PLUS-NEXT:    v_readfirstlane_b32 s2, v2
1640; GFX10PLUS-NEXT:    v_readfirstlane_b32 s3, v3
1641; GFX10PLUS-NEXT:    v_readfirstlane_b32 s4, v4
1642; GFX10PLUS-NEXT:    v_readfirstlane_b32 s5, v5
1643; GFX10PLUS-NEXT:    v_readfirstlane_b32 s6, v6
1644; GFX10PLUS-NEXT:    v_readfirstlane_b32 s7, v7
1645; GFX10PLUS-NEXT:    v_readfirstlane_b32 s8, v8
1646; GFX10PLUS-NEXT:    v_readfirstlane_b32 s9, v9
1647; GFX10PLUS-NEXT:    v_readfirstlane_b32 s10, v10
1648; GFX10PLUS-NEXT:    v_readfirstlane_b32 s11, v11
1649; GFX10PLUS-NEXT:    v_readfirstlane_b32 s12, v12
1650; GFX10PLUS-NEXT:    v_readfirstlane_b32 s13, v13
1651; GFX10PLUS-NEXT:    v_readfirstlane_b32 s14, v14
1652; GFX10PLUS-NEXT:    v_readfirstlane_b32 s15, v15
1653; GFX10PLUS-NEXT:    ; return to shader part epilog
1654  %result = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs)
1655  ret <16 x i32> %result
1656}
1657
; Test of llvm.usub.sat on a scalar i16 with ordinary (non-inreg) arguments.
; The tahiti checks shift both operands left by 16, apply v_min_u32 and
; v_sub_i32, then shift the difference right by 16. The fiji and gfx900 checks
; expect a single clamped v_sub_u16, and the gfx1010/gfx1100 checks expect a
; single clamped v_sub_nc_u16.
; The assertions below are autogenerated by utils/update_llc_test_checks.py.
1658define i16 @v_usubsat_i16(i16 %lhs, i16 %rhs) {
1659; GFX6-LABEL: v_usubsat_i16:
1660; GFX6:       ; %bb.0:
1661; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1662; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1663; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1664; GFX6-NEXT:    v_min_u32_e32 v1, v0, v1
1665; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
1666; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1667; GFX6-NEXT:    s_setpc_b64 s[30:31]
1668;
1669; GFX8-LABEL: v_usubsat_i16:
1670; GFX8:       ; %bb.0:
1671; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1672; GFX8-NEXT:    v_sub_u16_e64 v0, v0, v1 clamp
1673; GFX8-NEXT:    s_setpc_b64 s[30:31]
1674;
1675; GFX9-LABEL: v_usubsat_i16:
1676; GFX9:       ; %bb.0:
1677; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1678; GFX9-NEXT:    v_sub_u16_e64 v0, v0, v1 clamp
1679; GFX9-NEXT:    s_setpc_b64 s[30:31]
1680;
1681; GFX10PLUS-LABEL: v_usubsat_i16:
1682; GFX10PLUS:       ; %bb.0:
1683; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1684; GFX10PLUS-NEXT:    v_sub_nc_u16 v0, v0, v1 clamp
1685; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
1686  %result = call i16 @llvm.usub.sat.i16(i16 %lhs, i16 %rhs)
1687  ret i16 %result
1688}
1689
; Shader (amdgpu_ps) test of llvm.usub.sat on a scalar i16 where both operands
; are marked inreg. The tahiti checks stay on the scalar unit (s_lshl_b32 by
; 16, s_min_u32, s_sub_i32, s_lshr_b32 by 16). The fiji and gfx900 checks copy
; s1 into v0 before a clamped v_sub_u16, while the gfx1010/gfx1100 checks pass
; s0 and s1 directly to a clamped v_sub_nc_u16. The VALU variants then copy v0
; back into s0 with v_readfirstlane_b32.
; The assertions below are autogenerated by utils/update_llc_test_checks.py.
1690define amdgpu_ps i16 @s_usubsat_i16(i16 inreg %lhs, i16 inreg %rhs) {
1691; GFX6-LABEL: s_usubsat_i16:
1692; GFX6:       ; %bb.0:
1693; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
1694; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
1695; GFX6-NEXT:    s_min_u32 s1, s0, s1
1696; GFX6-NEXT:    s_sub_i32 s0, s0, s1
1697; GFX6-NEXT:    s_lshr_b32 s0, s0, 16
1698; GFX6-NEXT:    ; return to shader part epilog
1699;
1700; GFX8-LABEL: s_usubsat_i16:
1701; GFX8:       ; %bb.0:
1702; GFX8-NEXT:    v_mov_b32_e32 v0, s1
1703; GFX8-NEXT:    v_sub_u16_e64 v0, s0, v0 clamp
1704; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
1705; GFX8-NEXT:    ; return to shader part epilog
1706;
1707; GFX9-LABEL: s_usubsat_i16:
1708; GFX9:       ; %bb.0:
1709; GFX9-NEXT:    v_mov_b32_e32 v0, s1
1710; GFX9-NEXT:    v_sub_u16_e64 v0, s0, v0 clamp
1711; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
1712; GFX9-NEXT:    ; return to shader part epilog
1713;
1714; GFX10PLUS-LABEL: s_usubsat_i16:
1715; GFX10PLUS:       ; %bb.0:
1716; GFX10PLUS-NEXT:    v_sub_nc_u16 v0, s0, s1 clamp
1717; GFX10PLUS-NEXT:    v_readfirstlane_b32 s0, v0
1718; GFX10PLUS-NEXT:    ; return to shader part epilog
1719  %result = call i16 @llvm.usub.sat.i16(i16 %lhs, i16 %rhs)
1720  ret i16 %result
1721}
1722
; Mixed-operand shader test of llvm.usub.sat on i16 where only %lhs is marked
; inreg, and the i16 result is bitcast to half before being returned. In the
; checks the subtract takes s0 as its first source and v0 as its second. The
; tahiti checks use the shift / v_min_u32 / v_sub_i32 / shift sequence, the
; other runs use one clamped 16-bit subtract.
; The assertions below are autogenerated by utils/update_llc_test_checks.py.
1723define amdgpu_ps half @usubsat_i16_sv(i16 inreg %lhs, i16 %rhs) {
1724; GFX6-LABEL: usubsat_i16_sv:
1725; GFX6:       ; %bb.0:
1726; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
1727; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1728; GFX6-NEXT:    v_min_u32_e32 v0, s0, v0
1729; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
1730; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1731; GFX6-NEXT:    ; return to shader part epilog
1732;
1733; GFX8-LABEL: usubsat_i16_sv:
1734; GFX8:       ; %bb.0:
1735; GFX8-NEXT:    v_sub_u16_e64 v0, s0, v0 clamp
1736; GFX8-NEXT:    ; return to shader part epilog
1737;
1738; GFX9-LABEL: usubsat_i16_sv:
1739; GFX9:       ; %bb.0:
1740; GFX9-NEXT:    v_sub_u16_e64 v0, s0, v0 clamp
1741; GFX9-NEXT:    ; return to shader part epilog
1742;
1743; GFX10PLUS-LABEL: usubsat_i16_sv:
1744; GFX10PLUS:       ; %bb.0:
1745; GFX10PLUS-NEXT:    v_sub_nc_u16 v0, s0, v0 clamp
1746; GFX10PLUS-NEXT:    ; return to shader part epilog
1747  %result = call i16 @llvm.usub.sat.i16(i16 %lhs, i16 %rhs)
1748  %cast = bitcast i16 %result to half
1749  ret half %cast
1750}
1751
; Mixed-operand shader test of llvm.usub.sat on i16 where only %rhs is marked
; inreg, and the i16 result is bitcast to half before being returned. In the
; checks the subtract takes v0 as its first source, with s0 (or, on tahiti, the
; v_min_u32 of the shifted s0 and v0) as its second. The tahiti checks use the
; shift / v_min_u32 / v_sub_i32 / shift sequence, the other runs use one
; clamped 16-bit subtract.
; The assertions below are autogenerated by utils/update_llc_test_checks.py.
1752define amdgpu_ps half @usubsat_i16_vs(i16 %lhs, i16 inreg %rhs) {
1753; GFX6-LABEL: usubsat_i16_vs:
1754; GFX6:       ; %bb.0:
1755; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1756; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
1757; GFX6-NEXT:    v_min_u32_e32 v1, s0, v0
1758; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
1759; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1760; GFX6-NEXT:    ; return to shader part epilog
1761;
1762; GFX8-LABEL: usubsat_i16_vs:
1763; GFX8:       ; %bb.0:
1764; GFX8-NEXT:    v_sub_u16_e64 v0, v0, s0 clamp
1765; GFX8-NEXT:    ; return to shader part epilog
1766;
1767; GFX9-LABEL: usubsat_i16_vs:
1768; GFX9:       ; %bb.0:
1769; GFX9-NEXT:    v_sub_u16_e64 v0, v0, s0 clamp
1770; GFX9-NEXT:    ; return to shader part epilog
1771;
1772; GFX10PLUS-LABEL: usubsat_i16_vs:
1773; GFX10PLUS:       ; %bb.0:
1774; GFX10PLUS-NEXT:    v_sub_nc_u16 v0, v0, s0 clamp
1775; GFX10PLUS-NEXT:    ; return to shader part epilog
1776  %result = call i16 @llvm.usub.sat.i16(i16 %lhs, i16 %rhs)
1777  %cast = bitcast i16 %result to half
1778  ret half %cast
1779}
1780
; Test of llvm.usub.sat on <2 x i16> with ordinary (non-inreg) arguments.
; The tahiti checks handle the two elements separately (v0/v2 and v1/v3), each
; with the shift-by-16 / v_min_u32 / v_sub_i32 / shift-back sequence. The fiji
; checks use a clamped v_sub_u16 plus an SDWA v_sub_u16 on WORD_1 and merge the
; halves with v_or_b32. The gfx900 and gfx1010/gfx1100 checks expect a single
; clamped v_pk_sub_u16.
; The assertions below are autogenerated by utils/update_llc_test_checks.py.
1781define <2 x i16> @v_usubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
1782; GFX6-LABEL: v_usubsat_v2i16:
1783; GFX6:       ; %bb.0:
1784; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1785; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1786; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
1787; GFX6-NEXT:    v_min_u32_e32 v2, v0, v2
1788; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
1789; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1790; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
1791; GFX6-NEXT:    v_min_u32_e32 v2, v1, v2
1792; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
1793; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1794; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1795; GFX6-NEXT:    s_setpc_b64 s[30:31]
1796;
1797; GFX8-LABEL: v_usubsat_v2i16:
1798; GFX8:       ; %bb.0:
1799; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1800; GFX8-NEXT:    v_sub_u16_e64 v2, v0, v1 clamp
1801; GFX8-NEXT:    v_sub_u16_sdwa v0, v0, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1802; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
1803; GFX8-NEXT:    s_setpc_b64 s[30:31]
1804;
1805; GFX9-LABEL: v_usubsat_v2i16:
1806; GFX9:       ; %bb.0:
1807; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1808; GFX9-NEXT:    v_pk_sub_u16 v0, v0, v1 clamp
1809; GFX9-NEXT:    s_setpc_b64 s[30:31]
1810;
1811; GFX10PLUS-LABEL: v_usubsat_v2i16:
1812; GFX10PLUS:       ; %bb.0:
1813; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1814; GFX10PLUS-NEXT:    v_pk_sub_u16 v0, v0, v1 clamp
1815; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
1816  %result = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
1817  ret <2 x i16> %result
1818}
1819
; Shader (amdgpu_ps) test of llvm.usub.sat on <2 x i16> where both operands are
; marked inreg, and the result is bitcast to i32 before being returned.
; The tahiti checks do the per-element min/sub on the scalar unit and then pack
; the two results with v_alignbit_b32. The fiji checks shift s0 and s1 right by
; 16, then use a clamped v_sub_u16 plus an SDWA v_sub_u16 merged by v_or_b32.
; The gfx900 checks copy s1 into v0 for a clamped v_pk_sub_u16, and the
; gfx1010/gfx1100 checks pass s0 and s1 directly to v_pk_sub_u16. Every run
; ends by copying v0 into s0 with v_readfirstlane_b32.
; The assertions below are autogenerated by utils/update_llc_test_checks.py.
1820define amdgpu_ps i32 @s_usubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs) {
1821; GFX6-LABEL: s_usubsat_v2i16:
1822; GFX6:       ; %bb.0:
1823; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
1824; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
1825; GFX6-NEXT:    s_min_u32 s2, s0, s2
1826; GFX6-NEXT:    s_sub_i32 s0, s0, s2
1827; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
1828; GFX6-NEXT:    s_lshl_b32 s2, s3, 16
1829; GFX6-NEXT:    s_min_u32 s2, s1, s2
1830; GFX6-NEXT:    s_sub_i32 s1, s1, s2
1831; GFX6-NEXT:    s_lshr_b32 s1, s1, 16
1832; GFX6-NEXT:    v_mov_b32_e32 v0, s0
1833; GFX6-NEXT:    v_alignbit_b32 v0, s1, v0, 16
1834; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
1835; GFX6-NEXT:    ; return to shader part epilog
1836;
1837; GFX8-LABEL: s_usubsat_v2i16:
1838; GFX8:       ; %bb.0:
1839; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
1840; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
1841; GFX8-NEXT:    v_mov_b32_e32 v0, s1
1842; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1843; GFX8-NEXT:    v_mov_b32_e32 v2, s2
1844; GFX8-NEXT:    v_sub_u16_e64 v0, s0, v0 clamp
1845; GFX8-NEXT:    v_sub_u16_sdwa v1, v2, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1846; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
1847; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
1848; GFX8-NEXT:    ; return to shader part epilog
1849;
1850; GFX9-LABEL: s_usubsat_v2i16:
1851; GFX9:       ; %bb.0:
1852; GFX9-NEXT:    v_mov_b32_e32 v0, s1
1853; GFX9-NEXT:    v_pk_sub_u16 v0, s0, v0 clamp
1854; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
1855; GFX9-NEXT:    ; return to shader part epilog
1856;
1857; GFX10PLUS-LABEL: s_usubsat_v2i16:
1858; GFX10PLUS:       ; %bb.0:
1859; GFX10PLUS-NEXT:    v_pk_sub_u16 v0, s0, s1 clamp
1860; GFX10PLUS-NEXT:    v_readfirstlane_b32 s0, v0
1861; GFX10PLUS-NEXT:    ; return to shader part epilog
1862  %result = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
1863  %cast = bitcast <2 x i16> %result to i32
1864  ret i32 %cast
1865}
1866
; Mixed-operand shader test of llvm.usub.sat on <2 x i16> where only %lhs is
; marked inreg, and the result is bitcast to float before being returned.
; The tahiti checks run the shift / v_min_u32 / v_sub_i32 sequence per element
; with the SGPR value as first source and pack the halves with v_alignbit_b32.
; The fiji checks shift s0 right by 16 into s1, copy it to v2, and use a
; clamped v_sub_u16 plus an SDWA v_sub_u16 merged by v_or_b32. The gfx900 and
; gfx1010/gfx1100 checks expect a single clamped v_pk_sub_u16 of s0 and v0.
; The assertions below are autogenerated by utils/update_llc_test_checks.py.
1867define amdgpu_ps float @usubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
1868; GFX6-LABEL: usubsat_v2i16_sv:
1869; GFX6:       ; %bb.0:
1870; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
1871; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1872; GFX6-NEXT:    v_min_u32_e32 v0, s0, v0
1873; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
1874; GFX6-NEXT:    s_lshl_b32 s0, s1, 16
1875; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1876; GFX6-NEXT:    v_min_u32_e32 v1, s0, v1
1877; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s0, v1
1878; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1879; GFX6-NEXT:    v_alignbit_b32 v0, v1, v0, 16
1880; GFX6-NEXT:    ; return to shader part epilog
1881;
1882; GFX8-LABEL: usubsat_v2i16_sv:
1883; GFX8:       ; %bb.0:
1884; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
1885; GFX8-NEXT:    v_mov_b32_e32 v2, s1
1886; GFX8-NEXT:    v_sub_u16_e64 v1, s0, v0 clamp
1887; GFX8-NEXT:    v_sub_u16_sdwa v0, v2, v0 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1888; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
1889; GFX8-NEXT:    ; return to shader part epilog
1890;
1891; GFX9-LABEL: usubsat_v2i16_sv:
1892; GFX9:       ; %bb.0:
1893; GFX9-NEXT:    v_pk_sub_u16 v0, s0, v0 clamp
1894; GFX9-NEXT:    ; return to shader part epilog
1895;
1896; GFX10PLUS-LABEL: usubsat_v2i16_sv:
1897; GFX10PLUS:       ; %bb.0:
1898; GFX10PLUS-NEXT:    v_pk_sub_u16 v0, s0, v0 clamp
1899; GFX10PLUS-NEXT:    ; return to shader part epilog
1900  %result = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
1901  %cast = bitcast <2 x i16> %result to float
1902  ret float %cast
1903}
1904
; Mixed-operand shader test of llvm.usub.sat on <2 x i16> where only %rhs is
; marked inreg, and the result is bitcast to float before being returned.
; The tahiti checks run the shift / v_min_u32 / v_sub_i32 sequence per element
; with the VGPR value as first source and pack the halves with v_alignbit_b32.
; The fiji checks shift s0 right by 16 into s1, copy it to v2, and use a
; clamped v_sub_u16 plus an SDWA v_sub_u16 merged by v_or_b32. The gfx900 and
; gfx1010/gfx1100 checks expect a single clamped v_pk_sub_u16 of v0 and s0.
; The assertions below are autogenerated by utils/update_llc_test_checks.py.
1905define amdgpu_ps float @usubsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
1906; GFX6-LABEL: usubsat_v2i16_vs:
1907; GFX6:       ; %bb.0:
1908; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1909; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
1910; GFX6-NEXT:    v_min_u32_e32 v2, s0, v0
1911; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1912; GFX6-NEXT:    s_lshl_b32 s0, s1, 16
1913; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
1914; GFX6-NEXT:    v_min_u32_e32 v2, s0, v1
1915; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
1916; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1917; GFX6-NEXT:    v_alignbit_b32 v0, v1, v0, 16
1918; GFX6-NEXT:    ; return to shader part epilog
1919;
1920; GFX8-LABEL: usubsat_v2i16_vs:
1921; GFX8:       ; %bb.0:
1922; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
1923; GFX8-NEXT:    v_mov_b32_e32 v2, s1
1924; GFX8-NEXT:    v_sub_u16_e64 v1, v0, s0 clamp
1925; GFX8-NEXT:    v_sub_u16_sdwa v0, v0, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1926; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
1927; GFX8-NEXT:    ; return to shader part epilog
1928;
1929; GFX9-LABEL: usubsat_v2i16_vs:
1930; GFX9:       ; %bb.0:
1931; GFX9-NEXT:    v_pk_sub_u16 v0, v0, s0 clamp
1932; GFX9-NEXT:    ; return to shader part epilog
1933;
1934; GFX10PLUS-LABEL: usubsat_v2i16_vs:
1935; GFX10PLUS:       ; %bb.0:
1936; GFX10PLUS-NEXT:    v_pk_sub_u16 v0, v0, s0 clamp
1937; GFX10PLUS-NEXT:    ; return to shader part epilog
1938  %result = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
1939  %cast = bitcast <2 x i16> %result to float
1940  ret float %cast
1941}
1942
1943; FIXME: Enable the <3 x i16> tests below once v3i16 insert/extract is handled.
1944; define <3 x i16> @v_usubsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
1945;   %result = call <3 x i16> @llvm.usub.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs)
1946;   ret <3 x i16> %result
1947; }
1948
1949; define amdgpu_ps <3 x i16> @s_usubsat_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs) {
1950;   %result = call <3 x i16> @llvm.usub.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs)
1951;   ret <3 x i16> %result
1952; }
1953
; Test of llvm.usub.sat on <4 x i16> with ordinary (non-inreg) arguments, with
; the result bitcast to <2 x float> before being returned.
; The tahiti checks run the shift-by-16 / v_min_u32 / v_sub_i32 sequence once
; per element (v0-v3 against v4-v7) and pack the results pairwise with
; v_alignbit_b32. The fiji checks use two clamped v_sub_u16 / SDWA v_sub_u16
; pairs, each merged by v_or_b32. The gfx900 and gfx1010/gfx1100 checks expect
; two clamped v_pk_sub_u16 instructions.
; The assertions below are autogenerated by utils/update_llc_test_checks.py.
1954define <2 x float> @v_usubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
1955; GFX6-LABEL: v_usubsat_v4i16:
1956; GFX6:       ; %bb.0:
1957; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1958; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1959; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
1960; GFX6-NEXT:    v_min_u32_e32 v4, v0, v4
1961; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
1962; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1963; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
1964; GFX6-NEXT:    v_min_u32_e32 v4, v1, v4
1965; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v4
1966; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
1967; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
1968; GFX6-NEXT:    v_min_u32_e32 v4, v2, v4
1969; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v4
1970; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
1971; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
1972; GFX6-NEXT:    v_min_u32_e32 v4, v3, v4
1973; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v4
1974; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1975; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
1976; GFX6-NEXT:    v_alignbit_b32 v0, v1, v0, 16
1977; GFX6-NEXT:    v_alignbit_b32 v1, v3, v2, 16
1978; GFX6-NEXT:    s_setpc_b64 s[30:31]
1979;
1980; GFX8-LABEL: v_usubsat_v4i16:
1981; GFX8:       ; %bb.0:
1982; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1983; GFX8-NEXT:    v_sub_u16_e64 v4, v0, v2 clamp
1984; GFX8-NEXT:    v_sub_u16_sdwa v0, v0, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1985; GFX8-NEXT:    v_sub_u16_e64 v2, v1, v3 clamp
1986; GFX8-NEXT:    v_sub_u16_sdwa v1, v1, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1987; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
1988; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
1989; GFX8-NEXT:    s_setpc_b64 s[30:31]
1990;
1991; GFX9-LABEL: v_usubsat_v4i16:
1992; GFX9:       ; %bb.0:
1993; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1994; GFX9-NEXT:    v_pk_sub_u16 v0, v0, v2 clamp
1995; GFX9-NEXT:    v_pk_sub_u16 v1, v1, v3 clamp
1996; GFX9-NEXT:    s_setpc_b64 s[30:31]
1997;
1998; GFX10PLUS-LABEL: v_usubsat_v4i16:
1999; GFX10PLUS:       ; %bb.0:
2000; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2001; GFX10PLUS-NEXT:    v_pk_sub_u16 v0, v0, v2 clamp
2002; GFX10PLUS-NEXT:    v_pk_sub_u16 v1, v1, v3 clamp
2003; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
2004  %result = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
2005  %cast = bitcast <4 x i16> %result to <2 x float>
2006  ret <2 x float> %cast
2007}
2008
; Shader (amdgpu_ps) test of llvm.usub.sat on <4 x i16> where both operands are
; marked inreg, and the result is bitcast to <2 x i32> before being returned.
; The tahiti checks do the per-element shift / s_min_u32 / s_sub_i32 work on
; the scalar unit and pack the results pairwise with v_alignbit_b32. The fiji
; checks shift s0-s3 right by 16 and use clamped v_sub_u16 / SDWA v_sub_u16
; pairs merged by v_or_b32. The gfx900 checks copy s2 and s3 into VGPRs for two
; clamped v_pk_sub_u16, and the gfx1010/gfx1100 checks pass the SGPRs directly.
; Every run ends with v_readfirstlane_b32 copies of v0 and v1 into s0 and s1.
; The assertions below are autogenerated by utils/update_llc_test_checks.py.
2009define amdgpu_ps <2 x i32> @s_usubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %rhs) {
2010; GFX6-LABEL: s_usubsat_v4i16:
2011; GFX6:       ; %bb.0:
2012; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
2013; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
2014; GFX6-NEXT:    s_min_u32 s4, s0, s4
2015; GFX6-NEXT:    s_sub_i32 s0, s0, s4
2016; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
2017; GFX6-NEXT:    s_lshl_b32 s4, s5, 16
2018; GFX6-NEXT:    s_min_u32 s4, s1, s4
2019; GFX6-NEXT:    s_sub_i32 s1, s1, s4
2020; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
2021; GFX6-NEXT:    s_lshl_b32 s4, s6, 16
2022; GFX6-NEXT:    s_min_u32 s4, s2, s4
2023; GFX6-NEXT:    s_sub_i32 s2, s2, s4
2024; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
2025; GFX6-NEXT:    s_lshl_b32 s4, s7, 16
2026; GFX6-NEXT:    s_min_u32 s4, s3, s4
2027; GFX6-NEXT:    s_sub_i32 s3, s3, s4
2028; GFX6-NEXT:    s_lshr_b32 s1, s1, 16
2029; GFX6-NEXT:    s_lshr_b32 s3, s3, 16
2030; GFX6-NEXT:    v_mov_b32_e32 v0, s0
2031; GFX6-NEXT:    v_mov_b32_e32 v1, s2
2032; GFX6-NEXT:    v_alignbit_b32 v0, s1, v0, 16
2033; GFX6-NEXT:    v_alignbit_b32 v1, s3, v1, 16
2034; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
2035; GFX6-NEXT:    v_readfirstlane_b32 s1, v1
2036; GFX6-NEXT:    ; return to shader part epilog
2037;
2038; GFX8-LABEL: s_usubsat_v4i16:
2039; GFX8:       ; %bb.0:
2040; GFX8-NEXT:    s_lshr_b32 s4, s0, 16
2041; GFX8-NEXT:    s_lshr_b32 s6, s2, 16
2042; GFX8-NEXT:    s_lshr_b32 s5, s1, 16
2043; GFX8-NEXT:    s_lshr_b32 s7, s3, 16
2044; GFX8-NEXT:    v_mov_b32_e32 v1, s6
2045; GFX8-NEXT:    v_mov_b32_e32 v2, s4
2046; GFX8-NEXT:    v_mov_b32_e32 v0, s2
2047; GFX8-NEXT:    v_sub_u16_sdwa v1, v2, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2048; GFX8-NEXT:    v_mov_b32_e32 v2, s3
2049; GFX8-NEXT:    v_mov_b32_e32 v3, s7
2050; GFX8-NEXT:    v_mov_b32_e32 v4, s5
2051; GFX8-NEXT:    v_sub_u16_e64 v0, s0, v0 clamp
2052; GFX8-NEXT:    v_sub_u16_e64 v2, s1, v2 clamp
2053; GFX8-NEXT:    v_sub_u16_sdwa v3, v4, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2054; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
2055; GFX8-NEXT:    v_or_b32_e32 v1, v2, v3
2056; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
2057; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
2058; GFX8-NEXT:    ; return to shader part epilog
2059;
2060; GFX9-LABEL: s_usubsat_v4i16:
2061; GFX9:       ; %bb.0:
2062; GFX9-NEXT:    v_mov_b32_e32 v0, s2
2063; GFX9-NEXT:    v_mov_b32_e32 v1, s3
2064; GFX9-NEXT:    v_pk_sub_u16 v0, s0, v0 clamp
2065; GFX9-NEXT:    v_pk_sub_u16 v1, s1, v1 clamp
2066; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
2067; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
2068; GFX9-NEXT:    ; return to shader part epilog
2069;
2070; GFX10PLUS-LABEL: s_usubsat_v4i16:
2071; GFX10PLUS:       ; %bb.0:
2072; GFX10PLUS-NEXT:    v_pk_sub_u16 v0, s0, s2 clamp
2073; GFX10PLUS-NEXT:    v_pk_sub_u16 v1, s1, s3 clamp
2074; GFX10PLUS-NEXT:    v_readfirstlane_b32 s0, v0
2075; GFX10PLUS-NEXT:    v_readfirstlane_b32 s1, v1
2076; GFX10PLUS-NEXT:    ; return to shader part epilog
2077  %result = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
2078  %cast = bitcast <4 x i16> %result to <2 x i32>
2079  ret <2 x i32> %cast
2080}
2081
2082; FIXME: The <5 x i16> tests below are commented out and still need to be enabled.
2083; define <5 x i16> @v_usubsat_v5i16(<5 x i16> %lhs, <5 x i16> %rhs) {
2084;   %result = call <5 x i16> @llvm.usub.sat.v5i16(<5 x i16> %lhs, <5 x i16> %rhs)
2085;   ret <5 x i16> %result
2086; }
2087
2088; define amdgpu_ps <5 x i16> @s_usubsat_v5i16(<5 x i16> inreg %lhs, <5 x i16> inreg %rhs) {
2089;   %result = call <5 x i16> @llvm.usub.sat.v5i16(<5 x i16> %lhs, <5 x i16> %rhs)
2090;   ret <5 x i16> %result
2091; }
2092
; Test of llvm.usub.sat on <6 x i16> with ordinary (non-inreg) arguments, with
; the result bitcast to <3 x float> before being returned.
; The tahiti checks run the shift-by-16 / v_min_u32 / v_sub_i32 sequence once
; per element (v0-v5 against v6-v11) and pack the results pairwise with
; v_alignbit_b32. The fiji checks use three clamped v_sub_u16 / SDWA v_sub_u16
; pairs, each merged by v_or_b32. The gfx900 and gfx1010/gfx1100 checks expect
; three clamped v_pk_sub_u16 instructions.
; The assertions below are autogenerated by utils/update_llc_test_checks.py.
2093define <3 x float> @v_usubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
2094; GFX6-LABEL: v_usubsat_v6i16:
2095; GFX6:       ; %bb.0:
2096; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2097; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2098; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
2099; GFX6-NEXT:    v_min_u32_e32 v6, v0, v6
2100; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
2101; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2102; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
2103; GFX6-NEXT:    v_min_u32_e32 v6, v1, v6
2104; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v6
2105; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
2106; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v8
2107; GFX6-NEXT:    v_min_u32_e32 v6, v2, v6
2108; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v6
2109; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
2110; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v9
2111; GFX6-NEXT:    v_min_u32_e32 v6, v3, v6
2112; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v6
2113; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
2114; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v10
2115; GFX6-NEXT:    v_min_u32_e32 v6, v4, v6
2116; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v4, v6
2117; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
2118; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v11
2119; GFX6-NEXT:    v_min_u32_e32 v6, v5, v6
2120; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v6
2121; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
2122; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
2123; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
2124; GFX6-NEXT:    v_alignbit_b32 v0, v1, v0, 16
2125; GFX6-NEXT:    v_alignbit_b32 v1, v3, v2, 16
2126; GFX6-NEXT:    v_alignbit_b32 v2, v5, v4, 16
2127; GFX6-NEXT:    s_setpc_b64 s[30:31]
2128;
2129; GFX8-LABEL: v_usubsat_v6i16:
2130; GFX8:       ; %bb.0:
2131; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2132; GFX8-NEXT:    v_sub_u16_e64 v6, v0, v3 clamp
2133; GFX8-NEXT:    v_sub_u16_sdwa v0, v0, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2134; GFX8-NEXT:    v_sub_u16_e64 v3, v1, v4 clamp
2135; GFX8-NEXT:    v_sub_u16_sdwa v1, v1, v4 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2136; GFX8-NEXT:    v_sub_u16_e64 v4, v2, v5 clamp
2137; GFX8-NEXT:    v_sub_u16_sdwa v2, v2, v5 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2138; GFX8-NEXT:    v_or_b32_e32 v0, v6, v0
2139; GFX8-NEXT:    v_or_b32_e32 v1, v3, v1
2140; GFX8-NEXT:    v_or_b32_e32 v2, v4, v2
2141; GFX8-NEXT:    s_setpc_b64 s[30:31]
2142;
2143; GFX9-LABEL: v_usubsat_v6i16:
2144; GFX9:       ; %bb.0:
2145; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2146; GFX9-NEXT:    v_pk_sub_u16 v0, v0, v3 clamp
2147; GFX9-NEXT:    v_pk_sub_u16 v1, v1, v4 clamp
2148; GFX9-NEXT:    v_pk_sub_u16 v2, v2, v5 clamp
2149; GFX9-NEXT:    s_setpc_b64 s[30:31]
2150;
2151; GFX10PLUS-LABEL: v_usubsat_v6i16:
2152; GFX10PLUS:       ; %bb.0:
2153; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2154; GFX10PLUS-NEXT:    v_pk_sub_u16 v0, v0, v3 clamp
2155; GFX10PLUS-NEXT:    v_pk_sub_u16 v1, v1, v4 clamp
2156; GFX10PLUS-NEXT:    v_pk_sub_u16 v2, v2, v5 clamp
2157; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
2158  %result = call <6 x i16> @llvm.usub.sat.v6i16(<6 x i16> %lhs, <6 x i16> %rhs)
2159  %cast = bitcast <6 x i16> %result to <3 x float>
2160  ret <3 x float> %cast
2161}
2162
; <6 x i16> unsigned saturating subtract in an amdgpu_ps function where both
; operands are inreg; the result is bitcast to <3 x i32> for the return.
; Expected output below: tahiti shifts each element left by 16 and uses
; s_min_u32 + s_sub_i32 before repacking with v_alignbit; fiji copies the
; scalars to VGPRs and uses clamped v_sub_u16; gfx900 and newer use three
; clamped v_pk_sub_u16. Every target finishes with v_readfirstlane.
; The assertions are autogenerated; regenerate them rather than hand-editing.
2163define amdgpu_ps <3 x i32> @s_usubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inreg %rhs) {
2164; GFX6-LABEL: s_usubsat_v6i16:
2165; GFX6:       ; %bb.0:
2166; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
2167; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
2168; GFX6-NEXT:    s_min_u32 s6, s0, s6
2169; GFX6-NEXT:    s_sub_i32 s0, s0, s6
2170; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
2171; GFX6-NEXT:    s_lshl_b32 s6, s7, 16
2172; GFX6-NEXT:    s_min_u32 s6, s1, s6
2173; GFX6-NEXT:    s_sub_i32 s1, s1, s6
2174; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
2175; GFX6-NEXT:    s_lshl_b32 s6, s8, 16
2176; GFX6-NEXT:    s_min_u32 s6, s2, s6
2177; GFX6-NEXT:    s_sub_i32 s2, s2, s6
2178; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
2179; GFX6-NEXT:    s_lshl_b32 s6, s9, 16
2180; GFX6-NEXT:    s_min_u32 s6, s3, s6
2181; GFX6-NEXT:    s_sub_i32 s3, s3, s6
2182; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
2183; GFX6-NEXT:    s_lshl_b32 s6, s10, 16
2184; GFX6-NEXT:    s_min_u32 s6, s4, s6
2185; GFX6-NEXT:    s_sub_i32 s4, s4, s6
2186; GFX6-NEXT:    s_lshl_b32 s5, s5, 16
2187; GFX6-NEXT:    s_lshl_b32 s6, s11, 16
2188; GFX6-NEXT:    s_min_u32 s6, s5, s6
2189; GFX6-NEXT:    s_sub_i32 s5, s5, s6
2190; GFX6-NEXT:    s_lshr_b32 s1, s1, 16
2191; GFX6-NEXT:    s_lshr_b32 s3, s3, 16
2192; GFX6-NEXT:    s_lshr_b32 s5, s5, 16
2193; GFX6-NEXT:    v_mov_b32_e32 v0, s0
2194; GFX6-NEXT:    v_mov_b32_e32 v1, s2
2195; GFX6-NEXT:    v_mov_b32_e32 v2, s4
2196; GFX6-NEXT:    v_alignbit_b32 v0, s1, v0, 16
2197; GFX6-NEXT:    v_alignbit_b32 v1, s3, v1, 16
2198; GFX6-NEXT:    v_alignbit_b32 v2, s5, v2, 16
2199; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
2200; GFX6-NEXT:    v_readfirstlane_b32 s1, v1
2201; GFX6-NEXT:    v_readfirstlane_b32 s2, v2
2202; GFX6-NEXT:    ; return to shader part epilog
2203;
2204; GFX8-LABEL: s_usubsat_v6i16:
2205; GFX8:       ; %bb.0:
2206; GFX8-NEXT:    s_lshr_b32 s6, s0, 16
2207; GFX8-NEXT:    s_lshr_b32 s7, s1, 16
2208; GFX8-NEXT:    s_lshr_b32 s9, s3, 16
2209; GFX8-NEXT:    s_lshr_b32 s10, s4, 16
2210; GFX8-NEXT:    s_lshr_b32 s8, s2, 16
2211; GFX8-NEXT:    s_lshr_b32 s11, s5, 16
2212; GFX8-NEXT:    v_mov_b32_e32 v1, s9
2213; GFX8-NEXT:    v_mov_b32_e32 v2, s6
2214; GFX8-NEXT:    v_mov_b32_e32 v3, s10
2215; GFX8-NEXT:    v_mov_b32_e32 v4, s7
2216; GFX8-NEXT:    v_mov_b32_e32 v0, s3
2217; GFX8-NEXT:    v_sub_u16_sdwa v1, v2, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2218; GFX8-NEXT:    v_mov_b32_e32 v2, s4
2219; GFX8-NEXT:    v_sub_u16_sdwa v3, v4, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2220; GFX8-NEXT:    v_mov_b32_e32 v4, s5
2221; GFX8-NEXT:    v_mov_b32_e32 v5, s11
2222; GFX8-NEXT:    v_mov_b32_e32 v6, s8
2223; GFX8-NEXT:    v_sub_u16_e64 v0, s0, v0 clamp
2224; GFX8-NEXT:    v_sub_u16_e64 v2, s1, v2 clamp
2225; GFX8-NEXT:    v_sub_u16_e64 v4, s2, v4 clamp
2226; GFX8-NEXT:    v_sub_u16_sdwa v5, v6, v5 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2227; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
2228; GFX8-NEXT:    v_or_b32_e32 v1, v2, v3
2229; GFX8-NEXT:    v_or_b32_e32 v2, v4, v5
2230; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
2231; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
2232; GFX8-NEXT:    v_readfirstlane_b32 s2, v2
2233; GFX8-NEXT:    ; return to shader part epilog
2234;
2235; GFX9-LABEL: s_usubsat_v6i16:
2236; GFX9:       ; %bb.0:
2237; GFX9-NEXT:    v_mov_b32_e32 v0, s3
2238; GFX9-NEXT:    v_mov_b32_e32 v1, s4
2239; GFX9-NEXT:    v_mov_b32_e32 v2, s5
2240; GFX9-NEXT:    v_pk_sub_u16 v0, s0, v0 clamp
2241; GFX9-NEXT:    v_pk_sub_u16 v1, s1, v1 clamp
2242; GFX9-NEXT:    v_pk_sub_u16 v2, s2, v2 clamp
2243; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
2244; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
2245; GFX9-NEXT:    v_readfirstlane_b32 s2, v2
2246; GFX9-NEXT:    ; return to shader part epilog
2247;
2248; GFX10PLUS-LABEL: s_usubsat_v6i16:
2249; GFX10PLUS:       ; %bb.0:
2250; GFX10PLUS-NEXT:    v_pk_sub_u16 v0, s0, s3 clamp
2251; GFX10PLUS-NEXT:    v_pk_sub_u16 v1, s1, s4 clamp
2252; GFX10PLUS-NEXT:    v_pk_sub_u16 v2, s2, s5 clamp
2253; GFX10PLUS-NEXT:    v_readfirstlane_b32 s0, v0
2254; GFX10PLUS-NEXT:    v_readfirstlane_b32 s1, v1
2255; GFX10PLUS-NEXT:    v_readfirstlane_b32 s2, v2
2256; GFX10PLUS-NEXT:    ; return to shader part epilog
2257  %result = call <6 x i16> @llvm.usub.sat.v6i16(<6 x i16> %lhs, <6 x i16> %rhs)
2258  %cast = bitcast <6 x i16> %result to <3 x i32>
2259  ret <3 x i32> %cast
2260}
2261
; <8 x i16> unsigned saturating subtract with ordinary (non-inreg) operands;
; the result is bitcast to <4 x float> for the return.
; Expected output below: tahiti handles each of the eight elements with a
; 16-bit left shift, v_min_u32 and v_sub_i32, then repacks pairs with
; v_alignbit; fiji uses a clamped v_sub_u16 (e64 for the low half, sdwa for
; the high half) per dword and ORs the halves; gfx900 and newer use four
; clamped v_pk_sub_u16.
; The assertions are autogenerated; regenerate them rather than hand-editing.
2262define <4 x float> @v_usubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
2263; GFX6-LABEL: v_usubsat_v8i16:
2264; GFX6:       ; %bb.0:
2265; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2266; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2267; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
2268; GFX6-NEXT:    v_min_u32_e32 v8, v0, v8
2269; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v8
2270; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2271; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
2272; GFX6-NEXT:    v_min_u32_e32 v8, v1, v8
2273; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v8
2274; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
2275; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
2276; GFX6-NEXT:    v_min_u32_e32 v8, v2, v8
2277; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v8
2278; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
2279; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
2280; GFX6-NEXT:    v_min_u32_e32 v8, v3, v8
2281; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v8
2282; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
2283; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
2284; GFX6-NEXT:    v_min_u32_e32 v8, v4, v8
2285; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v4, v8
2286; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
2287; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v13
2288; GFX6-NEXT:    v_min_u32_e32 v8, v5, v8
2289; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v8
2290; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
2291; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v14
2292; GFX6-NEXT:    v_min_u32_e32 v8, v6, v8
2293; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v6, v8
2294; GFX6-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
2295; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v15
2296; GFX6-NEXT:    v_min_u32_e32 v8, v7, v8
2297; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v7, v8
2298; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
2299; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
2300; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
2301; GFX6-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
2302; GFX6-NEXT:    v_alignbit_b32 v0, v1, v0, 16
2303; GFX6-NEXT:    v_alignbit_b32 v1, v3, v2, 16
2304; GFX6-NEXT:    v_alignbit_b32 v2, v5, v4, 16
2305; GFX6-NEXT:    v_alignbit_b32 v3, v7, v6, 16
2306; GFX6-NEXT:    s_setpc_b64 s[30:31]
2307;
2308; GFX8-LABEL: v_usubsat_v8i16:
2309; GFX8:       ; %bb.0:
2310; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2311; GFX8-NEXT:    v_sub_u16_e64 v8, v0, v4 clamp
2312; GFX8-NEXT:    v_sub_u16_sdwa v0, v0, v4 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2313; GFX8-NEXT:    v_sub_u16_e64 v4, v1, v5 clamp
2314; GFX8-NEXT:    v_sub_u16_sdwa v1, v1, v5 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2315; GFX8-NEXT:    v_sub_u16_e64 v5, v2, v6 clamp
2316; GFX8-NEXT:    v_sub_u16_sdwa v2, v2, v6 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2317; GFX8-NEXT:    v_sub_u16_e64 v6, v3, v7 clamp
2318; GFX8-NEXT:    v_sub_u16_sdwa v3, v3, v7 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2319; GFX8-NEXT:    v_or_b32_e32 v0, v8, v0
2320; GFX8-NEXT:    v_or_b32_e32 v1, v4, v1
2321; GFX8-NEXT:    v_or_b32_e32 v2, v5, v2
2322; GFX8-NEXT:    v_or_b32_e32 v3, v6, v3
2323; GFX8-NEXT:    s_setpc_b64 s[30:31]
2324;
2325; GFX9-LABEL: v_usubsat_v8i16:
2326; GFX9:       ; %bb.0:
2327; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2328; GFX9-NEXT:    v_pk_sub_u16 v0, v0, v4 clamp
2329; GFX9-NEXT:    v_pk_sub_u16 v1, v1, v5 clamp
2330; GFX9-NEXT:    v_pk_sub_u16 v2, v2, v6 clamp
2331; GFX9-NEXT:    v_pk_sub_u16 v3, v3, v7 clamp
2332; GFX9-NEXT:    s_setpc_b64 s[30:31]
2333;
2334; GFX10PLUS-LABEL: v_usubsat_v8i16:
2335; GFX10PLUS:       ; %bb.0:
2336; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2337; GFX10PLUS-NEXT:    v_pk_sub_u16 v0, v0, v4 clamp
2338; GFX10PLUS-NEXT:    v_pk_sub_u16 v1, v1, v5 clamp
2339; GFX10PLUS-NEXT:    v_pk_sub_u16 v2, v2, v6 clamp
2340; GFX10PLUS-NEXT:    v_pk_sub_u16 v3, v3, v7 clamp
2341; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
2342  %result = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
2343  %cast = bitcast <8 x i16> %result to <4 x float>
2344  ret <4 x float> %cast
2345}
2346
; <8 x i16> unsigned saturating subtract in an amdgpu_ps function where both
; operands are inreg; the result is bitcast to <4 x i32> for the return.
; Expected output below: tahiti shifts each element left by 16 and uses
; s_min_u32 + s_sub_i32 before repacking with v_alignbit; fiji copies the
; scalars to VGPRs and uses clamped v_sub_u16; gfx900 and newer use four
; clamped v_pk_sub_u16. Every target finishes with v_readfirstlane.
; The assertions are autogenerated; regenerate them rather than hand-editing.
2347define amdgpu_ps <4 x i32> @s_usubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inreg %rhs) {
2348; GFX6-LABEL: s_usubsat_v8i16:
2349; GFX6:       ; %bb.0:
2350; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
2351; GFX6-NEXT:    s_lshl_b32 s8, s8, 16
2352; GFX6-NEXT:    s_min_u32 s8, s0, s8
2353; GFX6-NEXT:    s_sub_i32 s0, s0, s8
2354; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
2355; GFX6-NEXT:    s_lshl_b32 s8, s9, 16
2356; GFX6-NEXT:    s_min_u32 s8, s1, s8
2357; GFX6-NEXT:    s_sub_i32 s1, s1, s8
2358; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
2359; GFX6-NEXT:    s_lshl_b32 s8, s10, 16
2360; GFX6-NEXT:    s_min_u32 s8, s2, s8
2361; GFX6-NEXT:    s_sub_i32 s2, s2, s8
2362; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
2363; GFX6-NEXT:    s_lshl_b32 s8, s11, 16
2364; GFX6-NEXT:    s_min_u32 s8, s3, s8
2365; GFX6-NEXT:    s_sub_i32 s3, s3, s8
2366; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
2367; GFX6-NEXT:    s_lshl_b32 s8, s12, 16
2368; GFX6-NEXT:    s_min_u32 s8, s4, s8
2369; GFX6-NEXT:    s_sub_i32 s4, s4, s8
2370; GFX6-NEXT:    s_lshl_b32 s5, s5, 16
2371; GFX6-NEXT:    s_lshl_b32 s8, s13, 16
2372; GFX6-NEXT:    s_min_u32 s8, s5, s8
2373; GFX6-NEXT:    s_sub_i32 s5, s5, s8
2374; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
2375; GFX6-NEXT:    s_lshl_b32 s8, s14, 16
2376; GFX6-NEXT:    s_min_u32 s8, s6, s8
2377; GFX6-NEXT:    s_sub_i32 s6, s6, s8
2378; GFX6-NEXT:    s_lshl_b32 s7, s7, 16
2379; GFX6-NEXT:    s_lshl_b32 s8, s15, 16
2380; GFX6-NEXT:    s_min_u32 s8, s7, s8
2381; GFX6-NEXT:    s_sub_i32 s7, s7, s8
2382; GFX6-NEXT:    s_lshr_b32 s1, s1, 16
2383; GFX6-NEXT:    s_lshr_b32 s3, s3, 16
2384; GFX6-NEXT:    s_lshr_b32 s5, s5, 16
2385; GFX6-NEXT:    s_lshr_b32 s7, s7, 16
2386; GFX6-NEXT:    v_mov_b32_e32 v0, s0
2387; GFX6-NEXT:    v_mov_b32_e32 v1, s2
2388; GFX6-NEXT:    v_mov_b32_e32 v2, s4
2389; GFX6-NEXT:    v_mov_b32_e32 v3, s6
2390; GFX6-NEXT:    v_alignbit_b32 v0, s1, v0, 16
2391; GFX6-NEXT:    v_alignbit_b32 v1, s3, v1, 16
2392; GFX6-NEXT:    v_alignbit_b32 v2, s5, v2, 16
2393; GFX6-NEXT:    v_alignbit_b32 v3, s7, v3, 16
2394; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
2395; GFX6-NEXT:    v_readfirstlane_b32 s1, v1
2396; GFX6-NEXT:    v_readfirstlane_b32 s2, v2
2397; GFX6-NEXT:    v_readfirstlane_b32 s3, v3
2398; GFX6-NEXT:    ; return to shader part epilog
2399;
2400; GFX8-LABEL: s_usubsat_v8i16:
2401; GFX8:       ; %bb.0:
2402; GFX8-NEXT:    s_lshr_b32 s8, s0, 16
2403; GFX8-NEXT:    s_lshr_b32 s9, s1, 16
2404; GFX8-NEXT:    s_lshr_b32 s10, s2, 16
2405; GFX8-NEXT:    s_lshr_b32 s12, s4, 16
2406; GFX8-NEXT:    s_lshr_b32 s13, s5, 16
2407; GFX8-NEXT:    s_lshr_b32 s14, s6, 16
2408; GFX8-NEXT:    s_lshr_b32 s11, s3, 16
2409; GFX8-NEXT:    s_lshr_b32 s15, s7, 16
2410; GFX8-NEXT:    v_mov_b32_e32 v1, s12
2411; GFX8-NEXT:    v_mov_b32_e32 v2, s8
2412; GFX8-NEXT:    v_mov_b32_e32 v3, s13
2413; GFX8-NEXT:    v_mov_b32_e32 v4, s9
2414; GFX8-NEXT:    v_mov_b32_e32 v5, s14
2415; GFX8-NEXT:    v_mov_b32_e32 v6, s10
2416; GFX8-NEXT:    v_mov_b32_e32 v0, s4
2417; GFX8-NEXT:    v_sub_u16_sdwa v1, v2, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2418; GFX8-NEXT:    v_mov_b32_e32 v2, s5
2419; GFX8-NEXT:    v_sub_u16_sdwa v3, v4, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2420; GFX8-NEXT:    v_mov_b32_e32 v4, s6
2421; GFX8-NEXT:    v_sub_u16_sdwa v5, v6, v5 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2422; GFX8-NEXT:    v_mov_b32_e32 v6, s7
2423; GFX8-NEXT:    v_mov_b32_e32 v7, s15
2424; GFX8-NEXT:    v_mov_b32_e32 v8, s11
2425; GFX8-NEXT:    v_sub_u16_e64 v0, s0, v0 clamp
2426; GFX8-NEXT:    v_sub_u16_e64 v2, s1, v2 clamp
2427; GFX8-NEXT:    v_sub_u16_e64 v4, s2, v4 clamp
2428; GFX8-NEXT:    v_sub_u16_e64 v6, s3, v6 clamp
2429; GFX8-NEXT:    v_sub_u16_sdwa v7, v8, v7 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2430; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
2431; GFX8-NEXT:    v_or_b32_e32 v1, v2, v3
2432; GFX8-NEXT:    v_or_b32_e32 v2, v4, v5
2433; GFX8-NEXT:    v_or_b32_e32 v3, v6, v7
2434; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
2435; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
2436; GFX8-NEXT:    v_readfirstlane_b32 s2, v2
2437; GFX8-NEXT:    v_readfirstlane_b32 s3, v3
2438; GFX8-NEXT:    ; return to shader part epilog
2439;
2440; GFX9-LABEL: s_usubsat_v8i16:
2441; GFX9:       ; %bb.0:
2442; GFX9-NEXT:    v_mov_b32_e32 v0, s4
2443; GFX9-NEXT:    v_mov_b32_e32 v1, s5
2444; GFX9-NEXT:    v_mov_b32_e32 v2, s6
2445; GFX9-NEXT:    v_mov_b32_e32 v3, s7
2446; GFX9-NEXT:    v_pk_sub_u16 v0, s0, v0 clamp
2447; GFX9-NEXT:    v_pk_sub_u16 v1, s1, v1 clamp
2448; GFX9-NEXT:    v_pk_sub_u16 v2, s2, v2 clamp
2449; GFX9-NEXT:    v_pk_sub_u16 v3, s3, v3 clamp
2450; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
2451; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
2452; GFX9-NEXT:    v_readfirstlane_b32 s2, v2
2453; GFX9-NEXT:    v_readfirstlane_b32 s3, v3
2454; GFX9-NEXT:    ; return to shader part epilog
2455;
2456; GFX10PLUS-LABEL: s_usubsat_v8i16:
2457; GFX10PLUS:       ; %bb.0:
2458; GFX10PLUS-NEXT:    v_pk_sub_u16 v0, s0, s4 clamp
2459; GFX10PLUS-NEXT:    v_pk_sub_u16 v1, s1, s5 clamp
2460; GFX10PLUS-NEXT:    v_pk_sub_u16 v2, s2, s6 clamp
2461; GFX10PLUS-NEXT:    v_pk_sub_u16 v3, s3, s7 clamp
2462; GFX10PLUS-NEXT:    v_readfirstlane_b32 s0, v0
2463; GFX10PLUS-NEXT:    v_readfirstlane_b32 s1, v1
2464; GFX10PLUS-NEXT:    v_readfirstlane_b32 s2, v2
2465; GFX10PLUS-NEXT:    v_readfirstlane_b32 s3, v3
2466; GFX10PLUS-NEXT:    ; return to shader part epilog
2467  %result = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
2468  %cast = bitcast <8 x i16> %result to <4 x i32>
2469  ret <4 x i32> %cast
2470}
2471
; i48 unsigned saturating subtract with ordinary (non-inreg) operands.
; Expected output below: tahiti masks the high dword of each operand with
; 0xffff, does a 64-bit subtract with borrow, and selects zero when the high
; dword of the difference is not equal to its own low 16 bits; fiji and newer
; shift both operands left by 16, subtract with borrow, select zero on the
; carry condition, and shift the result right by 16.
; The assertions are autogenerated; regenerate them rather than hand-editing.
2472define i48 @v_usubsat_i48(i48 %lhs, i48 %rhs) {
2473; GFX6-LABEL: v_usubsat_i48:
2474; GFX6:       ; %bb.0:
2475; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2476; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
2477; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v3
2478; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
2479; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
2480; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
2481; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v1
2482; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
2483; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
2484; GFX6-NEXT:    v_or_b32_e32 v0, v0, v3
2485; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, v1, v2
2486; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
2487; GFX6-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc
2488; GFX6-NEXT:    s_setpc_b64 s[30:31]
2489;
2490; GFX8-LABEL: v_usubsat_i48:
2491; GFX8:       ; %bb.0:
2492; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2493; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
2494; GFX8-NEXT:    v_lshlrev_b64 v[2:3], 16, v[2:3]
2495; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v2
2496; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
2497; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
2498; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
2499; GFX8-NEXT:    v_lshrrev_b64 v[0:1], 16, v[0:1]
2500; GFX8-NEXT:    s_setpc_b64 s[30:31]
2501;
2502; GFX9-LABEL: v_usubsat_i48:
2503; GFX9:       ; %bb.0:
2504; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2505; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
2506; GFX9-NEXT:    v_lshlrev_b64 v[2:3], 16, v[2:3]
2507; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
2508; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
2509; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
2510; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
2511; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 16, v[0:1]
2512; GFX9-NEXT:    s_setpc_b64 s[30:31]
2513;
2514; GFX10PLUS-LABEL: v_usubsat_i48:
2515; GFX10PLUS:       ; %bb.0:
2516; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2517; GFX10PLUS-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
2518; GFX10PLUS-NEXT:    v_lshlrev_b64 v[2:3], 16, v[2:3]
2519; GFX10PLUS-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v2
2520; GFX10PLUS-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
2521; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc_lo
2522; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc_lo
2523; GFX10PLUS-NEXT:    v_lshrrev_b64 v[0:1], 16, v[0:1]
2524; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
2525  %result = call i48 @llvm.usub.sat.i48(i48 %lhs, i48 %rhs)
2526  ret i48 %result
2527}
2528
; i48 unsigned saturating subtract in an amdgpu_ps function where both
; operands are inreg; the expected code is entirely scalar (s_* instructions).
; Expected output below: tahiti masks the high dwords with 0xffff, subtracts
; with borrow, and selects zero when the high dword of the difference is not
; equal to its own low 16 bits; fiji and newer shift both operands left by 16,
; do s_sub_u32/s_subb_u32, select zero with s_cselect_b64, and shift the
; result right by 16.
; The assertions are autogenerated; regenerate them rather than hand-editing.
2529define amdgpu_ps i48 @s_usubsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
2530; GFX6-LABEL: s_usubsat_i48:
2531; GFX6:       ; %bb.0:
2532; GFX6-NEXT:    s_sub_u32 s0, s0, s2
2533; GFX6-NEXT:    s_cselect_b32 s2, 1, 0
2534; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
2535; GFX6-NEXT:    s_and_b32 s3, s3, 0xffff
2536; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
2537; GFX6-NEXT:    s_subb_u32 s2, s1, s3
2538; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
2539; GFX6-NEXT:    s_cmp_lg_u32 s2, s1
2540; GFX6-NEXT:    s_cselect_b32 s2, 1, 0
2541; GFX6-NEXT:    s_lshr_b32 s3, s0, 16
2542; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
2543; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
2544; GFX6-NEXT:    s_or_b32 s0, s0, s3
2545; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
2546; GFX6-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
2547; GFX6-NEXT:    ; return to shader part epilog
2548;
2549; GFX8-LABEL: s_usubsat_i48:
2550; GFX8:       ; %bb.0:
2551; GFX8-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
2552; GFX8-NEXT:    s_lshl_b64 s[2:3], s[2:3], 16
2553; GFX8-NEXT:    s_sub_u32 s0, s0, s2
2554; GFX8-NEXT:    s_subb_u32 s1, s1, s3
2555; GFX8-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
2556; GFX8-NEXT:    s_lshr_b64 s[0:1], s[0:1], 16
2557; GFX8-NEXT:    ; return to shader part epilog
2558;
2559; GFX9-LABEL: s_usubsat_i48:
2560; GFX9:       ; %bb.0:
2561; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
2562; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 16
2563; GFX9-NEXT:    s_sub_u32 s0, s0, s2
2564; GFX9-NEXT:    s_subb_u32 s1, s1, s3
2565; GFX9-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
2566; GFX9-NEXT:    s_lshr_b64 s[0:1], s[0:1], 16
2567; GFX9-NEXT:    ; return to shader part epilog
2568;
2569; GFX10PLUS-LABEL: s_usubsat_i48:
2570; GFX10PLUS:       ; %bb.0:
2571; GFX10PLUS-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
2572; GFX10PLUS-NEXT:    s_lshl_b64 s[2:3], s[2:3], 16
2573; GFX10PLUS-NEXT:    s_sub_u32 s0, s0, s2
2574; GFX10PLUS-NEXT:    s_subb_u32 s1, s1, s3
2575; GFX10PLUS-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
2576; GFX10PLUS-NEXT:    s_lshr_b64 s[0:1], s[0:1], 16
2577; GFX10PLUS-NEXT:    ; return to shader part epilog
2578  %result = call i48 @llvm.usub.sat.i48(i48 %lhs, i48 %rhs)
2579  ret i48 %result
2580}
2581
; Mixed-operand i48 unsigned saturating subtract: %lhs is inreg, %rhs is not.
; The i48 result is zero-extended to i64 and bitcast to <2 x float>.
; Expected output below: the subtract is done on the VALU with the s-register
; operand as the minuend (v_sub with s0 first). tahiti masks the high dwords
; with 0xffff and compares the high result against its low 16 bits; fiji and
; newer use the shift-left-16 / subtract / select-zero / shift-right-16 form.
; The assertions are autogenerated; regenerate them rather than hand-editing.
2582define amdgpu_ps <2 x float> @usubsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
2583; GFX6-LABEL: usubsat_i48_sv:
2584; GFX6:       ; %bb.0:
2585; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
2586; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
2587; GFX6-NEXT:    v_mov_b32_e32 v2, s1
2588; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
2589; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
2590; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
2591; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v1
2592; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
2593; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
2594; GFX6-NEXT:    v_or_b32_e32 v0, v0, v3
2595; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, v1, v2
2596; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
2597; GFX6-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc
2598; GFX6-NEXT:    ; return to shader part epilog
2599;
2600; GFX8-LABEL: usubsat_i48_sv:
2601; GFX8:       ; %bb.0:
2602; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
2603; GFX8-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
2604; GFX8-NEXT:    v_mov_b32_e32 v2, s1
2605; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
2606; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
2607; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
2608; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
2609; GFX8-NEXT:    v_lshrrev_b64 v[0:1], 16, v[0:1]
2610; GFX8-NEXT:    ; return to shader part epilog
2611;
2612; GFX9-LABEL: usubsat_i48_sv:
2613; GFX9:       ; %bb.0:
2614; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
2615; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
2616; GFX9-NEXT:    v_mov_b32_e32 v2, s1
2617; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s0, v0
2618; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
2619; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
2620; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
2621; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 16, v[0:1]
2622; GFX9-NEXT:    ; return to shader part epilog
2623;
2624; GFX10PLUS-LABEL: usubsat_i48_sv:
2625; GFX10PLUS:       ; %bb.0:
2626; GFX10PLUS-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
2627; GFX10PLUS-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
2628; GFX10PLUS-NEXT:    v_sub_co_u32 v0, vcc_lo, s0, v0
2629; GFX10PLUS-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
2630; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc_lo
2631; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc_lo
2632; GFX10PLUS-NEXT:    v_lshrrev_b64 v[0:1], 16, v[0:1]
2633; GFX10PLUS-NEXT:    ; return to shader part epilog
2634  %result = call i48 @llvm.usub.sat.i48(i48 %lhs, i48 %rhs)
2635  %ext.result = zext i48 %result to i64
2636  %cast = bitcast i64 %ext.result to <2 x float>
2637  ret <2 x float> %cast
2638}
2639
; Mixed-operand i48 unsigned saturating subtract: %lhs is not inreg, %rhs is.
; The i48 result is zero-extended to i64 and bitcast to <2 x float>.
; Expected output below: the s-register operand is the subtrahend, so the low
; half uses v_subrev (or v_sub with s0 as the second source on gfx1010 and
; newer). tahiti masks the high dwords with 0xffff and compares the high
; result against its low 16 bits; fiji and newer use the shift-left-16 /
; subtract / select-zero / shift-right-16 form.
; The assertions are autogenerated; regenerate them rather than hand-editing.
2640define amdgpu_ps <2 x float> @usubsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
2641; GFX6-LABEL: usubsat_i48_vs:
2642; GFX6:       ; %bb.0:
2643; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
2644; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
2645; GFX6-NEXT:    v_mov_b32_e32 v2, s1
2646; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s0, v0
2647; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
2648; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
2649; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v1
2650; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
2651; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
2652; GFX6-NEXT:    v_or_b32_e32 v0, v0, v3
2653; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, v1, v2
2654; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
2655; GFX6-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc
2656; GFX6-NEXT:    ; return to shader part epilog
2657;
2658; GFX8-LABEL: usubsat_i48_vs:
2659; GFX8:       ; %bb.0:
2660; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
2661; GFX8-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
2662; GFX8-NEXT:    v_mov_b32_e32 v2, s1
2663; GFX8-NEXT:    v_subrev_u32_e32 v0, vcc, s0, v0
2664; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
2665; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
2666; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
2667; GFX8-NEXT:    v_lshrrev_b64 v[0:1], 16, v[0:1]
2668; GFX8-NEXT:    ; return to shader part epilog
2669;
2670; GFX9-LABEL: usubsat_i48_vs:
2671; GFX9:       ; %bb.0:
2672; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
2673; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
2674; GFX9-NEXT:    v_mov_b32_e32 v2, s1
2675; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s0, v0
2676; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
2677; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
2678; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
2679; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 16, v[0:1]
2680; GFX9-NEXT:    ; return to shader part epilog
2681;
2682; GFX10PLUS-LABEL: usubsat_i48_vs:
2683; GFX10PLUS:       ; %bb.0:
2684; GFX10PLUS-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
2685; GFX10PLUS-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
2686; GFX10PLUS-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, s0
2687; GFX10PLUS-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
2688; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc_lo
2689; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc_lo
2690; GFX10PLUS-NEXT:    v_lshrrev_b64 v[0:1], 16, v[0:1]
2691; GFX10PLUS-NEXT:    ; return to shader part epilog
2692  %result = call i48 @llvm.usub.sat.i48(i48 %lhs, i48 %rhs)
2693  %ext.result = zext i48 %result to i64
2694  %cast = bitcast i64 %ext.result to <2 x float>
2695  ret <2 x float> %cast
2696}
2697
; i64 unsigned saturating subtract with ordinary (non-inreg) operands.
; Expected output on every target below is the same shape: a low-half subtract
; that writes a carry, a high-half subtract-with-borrow, and two v_cndmask
; selects that replace both halves with zero when the final carry is set.
; Only the mnemonics and the carry register (vcc vs vcc_lo) differ by target.
; The assertions are autogenerated; regenerate them rather than hand-editing.
2698define i64 @v_usubsat_i64(i64 %lhs, i64 %rhs) {
2699; GFX6-LABEL: v_usubsat_i64:
2700; GFX6:       ; %bb.0:
2701; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2702; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
2703; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
2704; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
2705; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
2706; GFX6-NEXT:    s_setpc_b64 s[30:31]
2707;
2708; GFX8-LABEL: v_usubsat_i64:
2709; GFX8:       ; %bb.0:
2710; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2711; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v2
2712; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
2713; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
2714; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
2715; GFX8-NEXT:    s_setpc_b64 s[30:31]
2716;
2717; GFX9-LABEL: v_usubsat_i64:
2718; GFX9:       ; %bb.0:
2719; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2720; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
2721; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
2722; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
2723; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
2724; GFX9-NEXT:    s_setpc_b64 s[30:31]
2725;
2726; GFX10PLUS-LABEL: v_usubsat_i64:
2727; GFX10PLUS:       ; %bb.0:
2728; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2729; GFX10PLUS-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v2
2730; GFX10PLUS-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
2731; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc_lo
2732; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc_lo
2733; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
2734  %result = call i64 @llvm.usub.sat.i64(i64 %lhs, i64 %rhs)
2735  ret i64 %result
2736}
2737
; i64 unsigned saturating subtract in an amdgpu_ps function where both
; operands are inreg.
; Expected output is identical on all four check prefixes below: s_sub_u32,
; s_subb_u32, then s_cselect_b64 choosing between 0 and the difference.
; The assertions are autogenerated; regenerate them rather than hand-editing.
2738define amdgpu_ps i64 @s_usubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
2739; GFX6-LABEL: s_usubsat_i64:
2740; GFX6:       ; %bb.0:
2741; GFX6-NEXT:    s_sub_u32 s0, s0, s2
2742; GFX6-NEXT:    s_subb_u32 s1, s1, s3
2743; GFX6-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
2744; GFX6-NEXT:    ; return to shader part epilog
2745;
2746; GFX8-LABEL: s_usubsat_i64:
2747; GFX8:       ; %bb.0:
2748; GFX8-NEXT:    s_sub_u32 s0, s0, s2
2749; GFX8-NEXT:    s_subb_u32 s1, s1, s3
2750; GFX8-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
2751; GFX8-NEXT:    ; return to shader part epilog
2752;
2753; GFX9-LABEL: s_usubsat_i64:
2754; GFX9:       ; %bb.0:
2755; GFX9-NEXT:    s_sub_u32 s0, s0, s2
2756; GFX9-NEXT:    s_subb_u32 s1, s1, s3
2757; GFX9-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
2758; GFX9-NEXT:    ; return to shader part epilog
2759;
2760; GFX10PLUS-LABEL: s_usubsat_i64:
2761; GFX10PLUS:       ; %bb.0:
2762; GFX10PLUS-NEXT:    s_sub_u32 s0, s0, s2
2763; GFX10PLUS-NEXT:    s_subb_u32 s1, s1, s3
2764; GFX10PLUS-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
2765; GFX10PLUS-NEXT:    ; return to shader part epilog
2766  %result = call i64 @llvm.usub.sat.i64(i64 %lhs, i64 %rhs)
2767  ret i64 %result
2768}
2769
; Mixed-operand i64 unsigned saturating subtract: %lhs is inreg, %rhs is not.
; The i64 result is bitcast to <2 x float> for the return.
; Expected output below: VALU subtract / subtract-with-borrow with the
; s-register operand as the minuend, then two v_cndmask selects of zero on the
; carry. tahiti, fiji and gfx900 first copy s1 into v2 for the high half;
; gfx1010 and newer pass s1 directly.
; The assertions are autogenerated; regenerate them rather than hand-editing.
2770define amdgpu_ps <2 x float> @usubsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
2771; GFX6-LABEL: usubsat_i64_sv:
2772; GFX6:       ; %bb.0:
2773; GFX6-NEXT:    v_mov_b32_e32 v2, s1
2774; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
2775; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
2776; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
2777; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
2778; GFX6-NEXT:    ; return to shader part epilog
2779;
2780; GFX8-LABEL: usubsat_i64_sv:
2781; GFX8:       ; %bb.0:
2782; GFX8-NEXT:    v_mov_b32_e32 v2, s1
2783; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
2784; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
2785; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
2786; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
2787; GFX8-NEXT:    ; return to shader part epilog
2788;
2789; GFX9-LABEL: usubsat_i64_sv:
2790; GFX9:       ; %bb.0:
2791; GFX9-NEXT:    v_mov_b32_e32 v2, s1
2792; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s0, v0
2793; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
2794; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
2795; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
2796; GFX9-NEXT:    ; return to shader part epilog
2797;
2798; GFX10PLUS-LABEL: usubsat_i64_sv:
2799; GFX10PLUS:       ; %bb.0:
2800; GFX10PLUS-NEXT:    v_sub_co_u32 v0, vcc_lo, s0, v0
2801; GFX10PLUS-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
2802; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc_lo
2803; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc_lo
2804; GFX10PLUS-NEXT:    ; return to shader part epilog
2805  %result = call i64 @llvm.usub.sat.i64(i64 %lhs, i64 %rhs)
2806  %cast = bitcast i64 %result to <2 x float>
2807  ret <2 x float> %cast
2808}
2809
; Mixed-operand i64 unsigned saturating subtract: %lhs is not inreg, %rhs is.
; The i64 result is bitcast to <2 x float> for the return.
; Expected output below: the s-register operand is the subtrahend, so tahiti,
; fiji and gfx900 use v_subrev for the low half and copy s1 into v2 for the
; high half; gfx1010 and newer use v_sub_co_u32 with s0 as the second source
; and v_subrev_co_ci_u32 with s1. Both halves are then replaced with zero via
; v_cndmask when the carry is set.
; The assertions are autogenerated; regenerate them rather than hand-editing.
2810define amdgpu_ps <2 x float> @usubsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
2811; GFX6-LABEL: usubsat_i64_vs:
2812; GFX6:       ; %bb.0:
2813; GFX6-NEXT:    v_mov_b32_e32 v2, s1
2814; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s0, v0
2815; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
2816; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
2817; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
2818; GFX6-NEXT:    ; return to shader part epilog
2819;
2820; GFX8-LABEL: usubsat_i64_vs:
2821; GFX8:       ; %bb.0:
2822; GFX8-NEXT:    v_mov_b32_e32 v2, s1
2823; GFX8-NEXT:    v_subrev_u32_e32 v0, vcc, s0, v0
2824; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
2825; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
2826; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
2827; GFX8-NEXT:    ; return to shader part epilog
2828;
2829; GFX9-LABEL: usubsat_i64_vs:
2830; GFX9:       ; %bb.0:
2831; GFX9-NEXT:    v_mov_b32_e32 v2, s1
2832; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s0, v0
2833; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
2834; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
2835; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
2836; GFX9-NEXT:    ; return to shader part epilog
2837;
2838; GFX10PLUS-LABEL: usubsat_i64_vs:
2839; GFX10PLUS:       ; %bb.0:
2840; GFX10PLUS-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, s0
2841; GFX10PLUS-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
2842; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc_lo
2843; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc_lo
2844; GFX10PLUS-NEXT:    ; return to shader part epilog
2845  %result = call i64 @llvm.usub.sat.i64(i64 %lhs, i64 %rhs)
2846  %cast = bitcast i64 %result to <2 x float>
2847  ret <2 x float> %cast
2848}
2849
; <2 x i64> unsigned saturating subtract with ordinary (non-inreg) operands.
; Expected output below: tahiti, fiji and gfx900 process the two elements one
; after the other, each as subtract / subtract-with-borrow / two v_cndmask
; selects of zero on vcc. gfx1010 and gfx1100 interleave the two elements and
; keep the second element's carry in an SGPR; the two targets are checked under
; separate prefixes here because that SGPR differs (s4 on gfx1010, s0 on
; gfx1100).
; The assertions are autogenerated; regenerate them rather than hand-editing.
2850define <2 x i64> @v_usubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
2851; GFX6-LABEL: v_usubsat_v2i64:
2852; GFX6:       ; %bb.0:
2853; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2854; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
2855; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
2856; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
2857; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
2858; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v6
2859; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v3, v7, vcc
2860; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
2861; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
2862; GFX6-NEXT:    s_setpc_b64 s[30:31]
2863;
2864; GFX8-LABEL: v_usubsat_v2i64:
2865; GFX8:       ; %bb.0:
2866; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2867; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v4
2868; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
2869; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
2870; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
2871; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v2, v6
2872; GFX8-NEXT:    v_subb_u32_e32 v3, vcc, v3, v7, vcc
2873; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
2874; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
2875; GFX8-NEXT:    s_setpc_b64 s[30:31]
2876;
2877; GFX9-LABEL: v_usubsat_v2i64:
2878; GFX9:       ; %bb.0:
2879; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2880; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v4
2881; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
2882; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
2883; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
2884; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, v2, v6
2885; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v7, vcc
2886; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
2887; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
2888; GFX9-NEXT:    s_setpc_b64 s[30:31]
2889;
2890; GFX10-LABEL: v_usubsat_v2i64:
2891; GFX10:       ; %bb.0:
2892; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2893; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v4
2894; GFX10-NEXT:    v_sub_co_u32 v2, s4, v2, v6
2895; GFX10-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
2896; GFX10-NEXT:    v_sub_co_ci_u32_e64 v3, s4, v3, v7, s4
2897; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc_lo
2898; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc_lo
2899; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, 0, s4
2900; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, 0, s4
2901; GFX10-NEXT:    s_setpc_b64 s[30:31]
2902;
2903; GFX11-LABEL: v_usubsat_v2i64:
2904; GFX11:       ; %bb.0:
2905; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2906; GFX11-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v4
2907; GFX11-NEXT:    v_sub_co_u32 v2, s0, v2, v6
2908; GFX11-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
2909; GFX11-NEXT:    v_sub_co_ci_u32_e64 v3, s0, v3, v7, s0
2910; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc_lo
2911; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc_lo
2912; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, 0, s0
2913; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, 0, s0
2914; GFX11-NEXT:    s_setpc_b64 s[30:31]
2915  %result = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
2916  ret <2 x i64> %result
2917}
2918
; s_usubsat_v2i64: llvm.usub.sat on <2 x i64> in an amdgpu_ps function with
; both operands marked inreg. The expectations use only scalar instructions:
; per element an s_sub_u32 / s_subb_u32 pair followed by one s_cselect_b64
; choosing between 0 and the 64-bit difference. The expected instruction
; sequence is the same for every checked target.
2919define amdgpu_ps <2 x i64> @s_usubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inreg %rhs) {
2920; GFX6-LABEL: s_usubsat_v2i64:
2921; GFX6:       ; %bb.0:
2922; GFX6-NEXT:    s_sub_u32 s0, s0, s4
2923; GFX6-NEXT:    s_subb_u32 s1, s1, s5
2924; GFX6-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
2925; GFX6-NEXT:    s_sub_u32 s2, s2, s6
2926; GFX6-NEXT:    s_subb_u32 s3, s3, s7
2927; GFX6-NEXT:    s_cselect_b64 s[2:3], 0, s[2:3]
2928; GFX6-NEXT:    ; return to shader part epilog
2929;
2930; GFX8-LABEL: s_usubsat_v2i64:
2931; GFX8:       ; %bb.0:
2932; GFX8-NEXT:    s_sub_u32 s0, s0, s4
2933; GFX8-NEXT:    s_subb_u32 s1, s1, s5
2934; GFX8-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
2935; GFX8-NEXT:    s_sub_u32 s2, s2, s6
2936; GFX8-NEXT:    s_subb_u32 s3, s3, s7
2937; GFX8-NEXT:    s_cselect_b64 s[2:3], 0, s[2:3]
2938; GFX8-NEXT:    ; return to shader part epilog
2939;
2940; GFX9-LABEL: s_usubsat_v2i64:
2941; GFX9:       ; %bb.0:
2942; GFX9-NEXT:    s_sub_u32 s0, s0, s4
2943; GFX9-NEXT:    s_subb_u32 s1, s1, s5
2944; GFX9-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
2945; GFX9-NEXT:    s_sub_u32 s2, s2, s6
2946; GFX9-NEXT:    s_subb_u32 s3, s3, s7
2947; GFX9-NEXT:    s_cselect_b64 s[2:3], 0, s[2:3]
2948; GFX9-NEXT:    ; return to shader part epilog
2949;
2950; GFX10PLUS-LABEL: s_usubsat_v2i64:
2951; GFX10PLUS:       ; %bb.0:
2952; GFX10PLUS-NEXT:    s_sub_u32 s0, s0, s4
2953; GFX10PLUS-NEXT:    s_subb_u32 s1, s1, s5
2954; GFX10PLUS-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
2955; GFX10PLUS-NEXT:    s_sub_u32 s2, s2, s6
2956; GFX10PLUS-NEXT:    s_subb_u32 s3, s3, s7
2957; GFX10PLUS-NEXT:    s_cselect_b64 s[2:3], 0, s[2:3]
2958; GFX10PLUS-NEXT:    ; return to shader part epilog
  ; Element-wise unsigned saturating subtract; the result is returned unchanged.
2959  %result = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
2960  ret <2 x i64> %result
2961}
2962
; s_usubsat_i128: llvm.usub.sat on a single i128 in an amdgpu_ps function with
; both operands marked inreg. The expectations are a four-instruction scalar
; borrow chain (one s_sub_u32 followed by three s_subb_u32) and then two
; s_cselect_b64, one for s[0:1] and one for s[2:3], each choosing between 0
; and the computed difference. The expected instruction sequence is the same
; for every checked target.
2963define amdgpu_ps i128 @s_usubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
2964; GFX6-LABEL: s_usubsat_i128:
2965; GFX6:       ; %bb.0:
2966; GFX6-NEXT:    s_sub_u32 s0, s0, s4
2967; GFX6-NEXT:    s_subb_u32 s1, s1, s5
2968; GFX6-NEXT:    s_subb_u32 s2, s2, s6
2969; GFX6-NEXT:    s_subb_u32 s3, s3, s7
2970; GFX6-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
2971; GFX6-NEXT:    s_cselect_b64 s[2:3], 0, s[2:3]
2972; GFX6-NEXT:    ; return to shader part epilog
2973;
2974; GFX8-LABEL: s_usubsat_i128:
2975; GFX8:       ; %bb.0:
2976; GFX8-NEXT:    s_sub_u32 s0, s0, s4
2977; GFX8-NEXT:    s_subb_u32 s1, s1, s5
2978; GFX8-NEXT:    s_subb_u32 s2, s2, s6
2979; GFX8-NEXT:    s_subb_u32 s3, s3, s7
2980; GFX8-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
2981; GFX8-NEXT:    s_cselect_b64 s[2:3], 0, s[2:3]
2982; GFX8-NEXT:    ; return to shader part epilog
2983;
2984; GFX9-LABEL: s_usubsat_i128:
2985; GFX9:       ; %bb.0:
2986; GFX9-NEXT:    s_sub_u32 s0, s0, s4
2987; GFX9-NEXT:    s_subb_u32 s1, s1, s5
2988; GFX9-NEXT:    s_subb_u32 s2, s2, s6
2989; GFX9-NEXT:    s_subb_u32 s3, s3, s7
2990; GFX9-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
2991; GFX9-NEXT:    s_cselect_b64 s[2:3], 0, s[2:3]
2992; GFX9-NEXT:    ; return to shader part epilog
2993;
2994; GFX10PLUS-LABEL: s_usubsat_i128:
2995; GFX10PLUS:       ; %bb.0:
2996; GFX10PLUS-NEXT:    s_sub_u32 s0, s0, s4
2997; GFX10PLUS-NEXT:    s_subb_u32 s1, s1, s5
2998; GFX10PLUS-NEXT:    s_subb_u32 s2, s2, s6
2999; GFX10PLUS-NEXT:    s_subb_u32 s3, s3, s7
3000; GFX10PLUS-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
3001; GFX10PLUS-NEXT:    s_cselect_b64 s[2:3], 0, s[2:3]
3002; GFX10PLUS-NEXT:    ; return to shader part epilog
  ; Unsigned saturating subtract on the full 128-bit value.
3003  %result = call i128 @llvm.usub.sat.i128(i128 %lhs, i128 %rhs)
3004  ret i128 %result
3005}
3006
; usubsat_i128_sv: llvm.usub.sat on i128 in an amdgpu_ps function where only
; %lhs is marked inreg; the expectations read it from s0-s3 and %rhs from
; v0-v3. The result is bitcast to <4 x float> for the return.
; In the tahiti, fiji and gfx900 expectations s0 is used directly by the first
; subtract, while s1-s3 are first copied into VGPRs with v_mov_b32 before the
; subtract-with-borrow instructions. The gfx1010/gfx1100 expectations use
; s1-s3 directly as operands and contain no v_mov_b32 copies.
; All targets finish with four v_cndmask_b32 selects against 0 on vcc/vcc_lo.
3007define amdgpu_ps <4 x float> @usubsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
3008; GFX6-LABEL: usubsat_i128_sv:
3009; GFX6:       ; %bb.0:
3010; GFX6-NEXT:    v_mov_b32_e32 v4, s1
3011; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
3012; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v4, v1, vcc
3013; GFX6-NEXT:    v_mov_b32_e32 v4, s2
3014; GFX6-NEXT:    v_mov_b32_e32 v5, s3
3015; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v4, v2, vcc
3016; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v5, v3, vcc
3017; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
3018; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
3019; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
3020; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
3021; GFX6-NEXT:    ; return to shader part epilog
3022;
3023; GFX8-LABEL: usubsat_i128_sv:
3024; GFX8:       ; %bb.0:
3025; GFX8-NEXT:    v_mov_b32_e32 v4, s1
3026; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
3027; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v4, v1, vcc
3028; GFX8-NEXT:    v_mov_b32_e32 v4, s2
3029; GFX8-NEXT:    v_mov_b32_e32 v5, s3
3030; GFX8-NEXT:    v_subb_u32_e32 v2, vcc, v4, v2, vcc
3031; GFX8-NEXT:    v_subb_u32_e32 v3, vcc, v5, v3, vcc
3032; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
3033; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
3034; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
3035; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
3036; GFX8-NEXT:    ; return to shader part epilog
3037;
3038; GFX9-LABEL: usubsat_i128_sv:
3039; GFX9:       ; %bb.0:
3040; GFX9-NEXT:    v_mov_b32_e32 v4, s1
3041; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s0, v0
3042; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v4, v1, vcc
3043; GFX9-NEXT:    v_mov_b32_e32 v4, s2
3044; GFX9-NEXT:    v_mov_b32_e32 v5, s3
3045; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v4, v2, vcc
3046; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v5, v3, vcc
3047; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
3048; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
3049; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
3050; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
3051; GFX9-NEXT:    ; return to shader part epilog
3052;
3053; GFX10PLUS-LABEL: usubsat_i128_sv:
3054; GFX10PLUS:       ; %bb.0:
3055; GFX10PLUS-NEXT:    v_sub_co_u32 v0, vcc_lo, s0, v0
3056; GFX10PLUS-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
3057; GFX10PLUS-NEXT:    v_sub_co_ci_u32_e32 v2, vcc_lo, s2, v2, vcc_lo
3058; GFX10PLUS-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, s3, v3, vcc_lo
3059; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc_lo
3060; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc_lo
3061; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc_lo
3062; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc_lo
3063; GFX10PLUS-NEXT:    ; return to shader part epilog
  ; Saturating subtract, then reinterpret the 128 result bits as <4 x float>.
3064  %result = call i128 @llvm.usub.sat.i128(i128 %lhs, i128 %rhs)
3065  %cast = bitcast i128 %result to <4 x float>
3066  ret <4 x float> %cast
3067}
3068
; usubsat_i128_vs: mirror of usubsat_i128_sv with the operand kinds swapped;
; here only %rhs is marked inreg, so the expectations read %lhs from v0-v3 and
; %rhs from s0-s3. The result is bitcast to <4 x float> for the return.
; In the tahiti, fiji and gfx900 expectations the low word uses a v_subrev
; form with s0 as its first source, and s1-s3 are copied into VGPRs with
; v_mov_b32 before the subtract-with-borrow instructions. The gfx1010/gfx1100
; expectations use v_sub_co_u32 for the low word and v_subrev_co_ci_u32 with
; s1-s3 as direct operands, without any v_mov_b32 copies.
; All targets finish with four v_cndmask_b32 selects against 0 on vcc/vcc_lo.
3069define amdgpu_ps <4 x float> @usubsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
3070; GFX6-LABEL: usubsat_i128_vs:
3071; GFX6:       ; %bb.0:
3072; GFX6-NEXT:    v_mov_b32_e32 v4, s1
3073; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s0, v0
3074; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v4, vcc
3075; GFX6-NEXT:    v_mov_b32_e32 v4, s2
3076; GFX6-NEXT:    v_mov_b32_e32 v5, s3
3077; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v2, v4, vcc
3078; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v3, v5, vcc
3079; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
3080; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
3081; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
3082; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
3083; GFX6-NEXT:    ; return to shader part epilog
3084;
3085; GFX8-LABEL: usubsat_i128_vs:
3086; GFX8:       ; %bb.0:
3087; GFX8-NEXT:    v_mov_b32_e32 v4, s1
3088; GFX8-NEXT:    v_subrev_u32_e32 v0, vcc, s0, v0
3089; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v1, v4, vcc
3090; GFX8-NEXT:    v_mov_b32_e32 v4, s2
3091; GFX8-NEXT:    v_mov_b32_e32 v5, s3
3092; GFX8-NEXT:    v_subb_u32_e32 v2, vcc, v2, v4, vcc
3093; GFX8-NEXT:    v_subb_u32_e32 v3, vcc, v3, v5, vcc
3094; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
3095; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
3096; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
3097; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
3098; GFX8-NEXT:    ; return to shader part epilog
3099;
3100; GFX9-LABEL: usubsat_i128_vs:
3101; GFX9:       ; %bb.0:
3102; GFX9-NEXT:    v_mov_b32_e32 v4, s1
3103; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s0, v0
3104; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v4, vcc
3105; GFX9-NEXT:    v_mov_b32_e32 v4, s2
3106; GFX9-NEXT:    v_mov_b32_e32 v5, s3
3107; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v4, vcc
3108; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v5, vcc
3109; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
3110; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
3111; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
3112; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
3113; GFX9-NEXT:    ; return to shader part epilog
3114;
3115; GFX10PLUS-LABEL: usubsat_i128_vs:
3116; GFX10PLUS:       ; %bb.0:
3117; GFX10PLUS-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, s0
3118; GFX10PLUS-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
3119; GFX10PLUS-NEXT:    v_subrev_co_ci_u32_e32 v2, vcc_lo, s2, v2, vcc_lo
3120; GFX10PLUS-NEXT:    v_subrev_co_ci_u32_e32 v3, vcc_lo, s3, v3, vcc_lo
3121; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc_lo
3122; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc_lo
3123; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc_lo
3124; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc_lo
3125; GFX10PLUS-NEXT:    ; return to shader part epilog
  ; Saturating subtract, then reinterpret the 128 result bits as <4 x float>.
3126  %result = call i128 @llvm.usub.sat.i128(i128 %lhs, i128 %rhs)
3127  %cast = bitcast i128 %result to <4 x float>
3128  ret <4 x float> %cast
3129}
3130
; v_usubsat_v2i128: llvm.usub.sat on <2 x i128> with both operands passed as
; plain (non-inreg) arguments of a default-calling-convention function; the
; expectations read %lhs from v0-v7 and %rhs from v8-v15.
; Per 128-bit element the expectations are a four-instruction borrow chain
; (one subtract plus three subtract-with-borrow) and then four v_cndmask_b32
; selects between the difference and 0, driven by the carry register written
; by the last subtract of that chain. The gfx1010 and gfx1100 expectations
; interleave the two chains and keep the second element's carry in an SGPR
; (s4 / s0) while the first element uses vcc_lo.
3131define <2 x i128> @v_usubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
3132; GFX6-LABEL: v_usubsat_v2i128:
3133; GFX6:       ; %bb.0:
3134; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3135; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v8
3136; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v9, vcc
3137; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v2, v10, vcc
3138; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v3, v11, vcc
3139; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
3140; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
3141; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
3142; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
3143; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v4, v12
3144; GFX6-NEXT:    v_subb_u32_e32 v5, vcc, v5, v13, vcc
3145; GFX6-NEXT:    v_subb_u32_e32 v6, vcc, v6, v14, vcc
3146; GFX6-NEXT:    v_subb_u32_e32 v7, vcc, v7, v15, vcc
3147; GFX6-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
3148; GFX6-NEXT:    v_cndmask_b32_e64 v5, v5, 0, vcc
3149; GFX6-NEXT:    v_cndmask_b32_e64 v6, v6, 0, vcc
3150; GFX6-NEXT:    v_cndmask_b32_e64 v7, v7, 0, vcc
3151; GFX6-NEXT:    s_setpc_b64 s[30:31]
3152;
3153; GFX8-LABEL: v_usubsat_v2i128:
3154; GFX8:       ; %bb.0:
3155; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3156; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v8
3157; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v1, v9, vcc
3158; GFX8-NEXT:    v_subb_u32_e32 v2, vcc, v2, v10, vcc
3159; GFX8-NEXT:    v_subb_u32_e32 v3, vcc, v3, v11, vcc
3160; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
3161; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
3162; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
3163; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
3164; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, v4, v12
3165; GFX8-NEXT:    v_subb_u32_e32 v5, vcc, v5, v13, vcc
3166; GFX8-NEXT:    v_subb_u32_e32 v6, vcc, v6, v14, vcc
3167; GFX8-NEXT:    v_subb_u32_e32 v7, vcc, v7, v15, vcc
3168; GFX8-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
3169; GFX8-NEXT:    v_cndmask_b32_e64 v5, v5, 0, vcc
3170; GFX8-NEXT:    v_cndmask_b32_e64 v6, v6, 0, vcc
3171; GFX8-NEXT:    v_cndmask_b32_e64 v7, v7, 0, vcc
3172; GFX8-NEXT:    s_setpc_b64 s[30:31]
3173;
3174; GFX9-LABEL: v_usubsat_v2i128:
3175; GFX9:       ; %bb.0:
3176; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3177; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v8
3178; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v9, vcc
3179; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v10, vcc
3180; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v11, vcc
3181; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
3182; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
3183; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
3184; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
3185; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, v4, v12
3186; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v5, v13, vcc
3187; GFX9-NEXT:    v_subb_co_u32_e32 v6, vcc, v6, v14, vcc
3188; GFX9-NEXT:    v_subb_co_u32_e32 v7, vcc, v7, v15, vcc
3189; GFX9-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
3190; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, 0, vcc
3191; GFX9-NEXT:    v_cndmask_b32_e64 v6, v6, 0, vcc
3192; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, 0, vcc
3193; GFX9-NEXT:    s_setpc_b64 s[30:31]
3194;
3195; GFX10-LABEL: v_usubsat_v2i128:
3196; GFX10:       ; %bb.0:
3197; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3198; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v8
3199; GFX10-NEXT:    v_sub_co_u32 v4, s4, v4, v12
3200; GFX10-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v9, vcc_lo
3201; GFX10-NEXT:    v_sub_co_ci_u32_e64 v5, s4, v5, v13, s4
3202; GFX10-NEXT:    v_sub_co_ci_u32_e32 v2, vcc_lo, v2, v10, vcc_lo
3203; GFX10-NEXT:    v_sub_co_ci_u32_e64 v6, s4, v6, v14, s4
3204; GFX10-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v11, vcc_lo
3205; GFX10-NEXT:    v_sub_co_ci_u32_e64 v7, s4, v7, v15, s4
3206; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc_lo
3207; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc_lo
3208; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc_lo
3209; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc_lo
3210; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, 0, s4
3211; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, 0, s4
3212; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, 0, s4
3213; GFX10-NEXT:    v_cndmask_b32_e64 v7, v7, 0, s4
3214; GFX10-NEXT:    s_setpc_b64 s[30:31]
3215;
3216; GFX11-LABEL: v_usubsat_v2i128:
3217; GFX11:       ; %bb.0:
3218; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3219; GFX11-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v8
3220; GFX11-NEXT:    v_sub_co_u32 v4, s0, v4, v12
3221; GFX11-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v9, vcc_lo
3222; GFX11-NEXT:    v_sub_co_ci_u32_e64 v5, s0, v5, v13, s0
3223; GFX11-NEXT:    v_sub_co_ci_u32_e32 v2, vcc_lo, v2, v10, vcc_lo
3224; GFX11-NEXT:    v_sub_co_ci_u32_e64 v6, s0, v6, v14, s0
3225; GFX11-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v11, vcc_lo
3226; GFX11-NEXT:    v_sub_co_ci_u32_e64 v7, s0, v7, v15, s0
3227; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc_lo
3228; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc_lo
3229; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc_lo
3230; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc_lo
3231; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, 0, s0
3232; GFX11-NEXT:    v_cndmask_b32_e64 v5, v5, 0, s0
3233; GFX11-NEXT:    v_cndmask_b32_e64 v6, v6, 0, s0
3234; GFX11-NEXT:    v_cndmask_b32_e64 v7, v7, 0, s0
3235; GFX11-NEXT:    s_setpc_b64 s[30:31]
  ; Element-wise unsigned saturating subtract; the result is returned unchanged.
3236  %result = call <2 x i128> @llvm.usub.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
3237  ret <2 x i128> %result
3238}
3239
; s_usubsat_v2i128: llvm.usub.sat on <2 x i128> in an amdgpu_ps function with
; both operands marked inreg; the expectations read %lhs from s0-s7 and %rhs
; from s8-s15. Per 128-bit element the expectations are a scalar borrow chain
; (one s_sub_u32 followed by three s_subb_u32) and then two s_cselect_b64,
; each choosing between 0 and one 64-bit half of the difference. The expected
; instruction sequence is the same for every checked target.
3240define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> inreg %rhs) {
3241; GFX6-LABEL: s_usubsat_v2i128:
3242; GFX6:       ; %bb.0:
3243; GFX6-NEXT:    s_sub_u32 s0, s0, s8
3244; GFX6-NEXT:    s_subb_u32 s1, s1, s9
3245; GFX6-NEXT:    s_subb_u32 s2, s2, s10
3246; GFX6-NEXT:    s_subb_u32 s3, s3, s11
3247; GFX6-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
3248; GFX6-NEXT:    s_cselect_b64 s[2:3], 0, s[2:3]
3249; GFX6-NEXT:    s_sub_u32 s4, s4, s12
3250; GFX6-NEXT:    s_subb_u32 s5, s5, s13
3251; GFX6-NEXT:    s_subb_u32 s6, s6, s14
3252; GFX6-NEXT:    s_subb_u32 s7, s7, s15
3253; GFX6-NEXT:    s_cselect_b64 s[4:5], 0, s[4:5]
3254; GFX6-NEXT:    s_cselect_b64 s[6:7], 0, s[6:7]
3255; GFX6-NEXT:    ; return to shader part epilog
3256;
3257; GFX8-LABEL: s_usubsat_v2i128:
3258; GFX8:       ; %bb.0:
3259; GFX8-NEXT:    s_sub_u32 s0, s0, s8
3260; GFX8-NEXT:    s_subb_u32 s1, s1, s9
3261; GFX8-NEXT:    s_subb_u32 s2, s2, s10
3262; GFX8-NEXT:    s_subb_u32 s3, s3, s11
3263; GFX8-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
3264; GFX8-NEXT:    s_cselect_b64 s[2:3], 0, s[2:3]
3265; GFX8-NEXT:    s_sub_u32 s4, s4, s12
3266; GFX8-NEXT:    s_subb_u32 s5, s5, s13
3267; GFX8-NEXT:    s_subb_u32 s6, s6, s14
3268; GFX8-NEXT:    s_subb_u32 s7, s7, s15
3269; GFX8-NEXT:    s_cselect_b64 s[4:5], 0, s[4:5]
3270; GFX8-NEXT:    s_cselect_b64 s[6:7], 0, s[6:7]
3271; GFX8-NEXT:    ; return to shader part epilog
3272;
3273; GFX9-LABEL: s_usubsat_v2i128:
3274; GFX9:       ; %bb.0:
3275; GFX9-NEXT:    s_sub_u32 s0, s0, s8
3276; GFX9-NEXT:    s_subb_u32 s1, s1, s9
3277; GFX9-NEXT:    s_subb_u32 s2, s2, s10
3278; GFX9-NEXT:    s_subb_u32 s3, s3, s11
3279; GFX9-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
3280; GFX9-NEXT:    s_cselect_b64 s[2:3], 0, s[2:3]
3281; GFX9-NEXT:    s_sub_u32 s4, s4, s12
3282; GFX9-NEXT:    s_subb_u32 s5, s5, s13
3283; GFX9-NEXT:    s_subb_u32 s6, s6, s14
3284; GFX9-NEXT:    s_subb_u32 s7, s7, s15
3285; GFX9-NEXT:    s_cselect_b64 s[4:5], 0, s[4:5]
3286; GFX9-NEXT:    s_cselect_b64 s[6:7], 0, s[6:7]
3287; GFX9-NEXT:    ; return to shader part epilog
3288;
3289; GFX10PLUS-LABEL: s_usubsat_v2i128:
3290; GFX10PLUS:       ; %bb.0:
3291; GFX10PLUS-NEXT:    s_sub_u32 s0, s0, s8
3292; GFX10PLUS-NEXT:    s_subb_u32 s1, s1, s9
3293; GFX10PLUS-NEXT:    s_subb_u32 s2, s2, s10
3294; GFX10PLUS-NEXT:    s_subb_u32 s3, s3, s11
3295; GFX10PLUS-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
3296; GFX10PLUS-NEXT:    s_cselect_b64 s[2:3], 0, s[2:3]
3297; GFX10PLUS-NEXT:    s_sub_u32 s4, s4, s12
3298; GFX10PLUS-NEXT:    s_subb_u32 s5, s5, s13
3299; GFX10PLUS-NEXT:    s_subb_u32 s6, s6, s14
3300; GFX10PLUS-NEXT:    s_subb_u32 s7, s7, s15
3301; GFX10PLUS-NEXT:    s_cselect_b64 s[4:5], 0, s[4:5]
3302; GFX10PLUS-NEXT:    s_cselect_b64 s[6:7], 0, s[6:7]
3303; GFX10PLUS-NEXT:    ; return to shader part epilog
  ; Element-wise unsigned saturating subtract; the result is returned unchanged.
3304  %result = call <2 x i128> @llvm.usub.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
3305  ret <2 x i128> %result
3306}
3307
3308declare i7 @llvm.usub.sat.i7(i7, i7) #0
3309declare i8 @llvm.usub.sat.i8(i8, i8) #0
3310declare <2 x i8> @llvm.usub.sat.v2i8(<2 x i8>, <2 x i8>) #0
3311declare <4 x i8> @llvm.usub.sat.v4i8(<4 x i8>, <4 x i8>) #0
3312
3313declare i16 @llvm.usub.sat.i16(i16, i16) #0
3314declare <2 x i16> @llvm.usub.sat.v2i16(<2 x i16>, <2 x i16>) #0
3315declare <3 x i16> @llvm.usub.sat.v3i16(<3 x i16>, <3 x i16>) #0
3316declare <4 x i16> @llvm.usub.sat.v4i16(<4 x i16>, <4 x i16>) #0
3317declare <5 x i16> @llvm.usub.sat.v5i16(<5 x i16>, <5 x i16>) #0
3318declare <6 x i16> @llvm.usub.sat.v6i16(<6 x i16>, <6 x i16>) #0
3319declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16>, <8 x i16>) #0
3320
3321declare i24 @llvm.usub.sat.i24(i24, i24) #0
3322
3323declare i32 @llvm.usub.sat.i32(i32, i32) #0
3324declare <2 x i32> @llvm.usub.sat.v2i32(<2 x i32>, <2 x i32>) #0
3325declare <3 x i32> @llvm.usub.sat.v3i32(<3 x i32>, <3 x i32>) #0
3326declare <4 x i32> @llvm.usub.sat.v4i32(<4 x i32>, <4 x i32>) #0
3327declare <5 x i32> @llvm.usub.sat.v5i32(<5 x i32>, <5 x i32>) #0
3328declare <16 x i32> @llvm.usub.sat.v16i32(<16 x i32>, <16 x i32>) #0
3329
3330declare i48 @llvm.usub.sat.i48(i48, i48) #0
3331
3332declare i64 @llvm.usub.sat.i64(i64, i64) #0
3333declare <2 x i64> @llvm.usub.sat.v2i64(<2 x i64>, <2 x i64>) #0
3334
3335declare i128 @llvm.usub.sat.i128(i128, i128) #0
3336declare <2 x i128> @llvm.usub.sat.v2i128(<2 x i128>, <2 x i128>) #0
3337
3338attributes #0 = { nounwind readnone speculatable willreturn }
3339