xref: /llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll (revision bfd9bc274586b0261e16e22ac50d50586a0152e2)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -o - %s | FileCheck -check-prefix=GFX6 %s
3; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
4; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
5; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
6; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
7
; Signed saturating subtract of two i7 values through @llvm.ssub.sat.i7. The
; function has no explicit calling convention and its arguments are not inreg.
; The check lines are autogenerated (see the NOTE at the top of the file), so
; regenerate them with the update script rather than editing them by hand.
; What the expected output shows: the tahiti and fiji runs clamp the right-hand
; side with a max/min pair and then do a plain subtract, working on operands
; shifted left by 25 (32-bit ops) or 9 (16-bit ops) and shifting the result
; back arithmetically; the gfx900, gfx1010 and gfx1100 runs use one 16-bit
; subtract carrying the clamp modifier between the same shift-by-9 steps.
8define i7 @v_ssubsat_i7(i7 %lhs, i7 %rhs) {
9; GFX6-LABEL: v_ssubsat_i7:
10; GFX6:       ; %bb.0:
11; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 25, v0
13; GFX6-NEXT:    v_max_i32_e32 v2, -1, v0
14; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 25, v1
15; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 0x80000001, v2
16; GFX6-NEXT:    v_min_i32_e32 v3, -1, v0
17; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 0x80000000, v3
18; GFX6-NEXT:    v_max_i32_e32 v1, v2, v1
19; GFX6-NEXT:    v_min_i32_e32 v1, v1, v3
20; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
21; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 25, v0
22; GFX6-NEXT:    s_setpc_b64 s[30:31]
23;
24; GFX8-LABEL: v_ssubsat_i7:
25; GFX8:       ; %bb.0:
26; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 9, v0
28; GFX8-NEXT:    v_max_i16_e32 v2, -1, v0
29; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 9, v1
30; GFX8-NEXT:    v_add_u16_e32 v2, 0x8001, v2
31; GFX8-NEXT:    v_min_i16_e32 v3, -1, v0
32; GFX8-NEXT:    v_add_u16_e32 v3, 0x8000, v3
33; GFX8-NEXT:    v_max_i16_e32 v1, v2, v1
34; GFX8-NEXT:    v_min_i16_e32 v1, v1, v3
35; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v1
36; GFX8-NEXT:    v_ashrrev_i16_e32 v0, 9, v0
37; GFX8-NEXT:    s_setpc_b64 s[30:31]
38;
39; GFX9-LABEL: v_ssubsat_i7:
40; GFX9:       ; %bb.0:
41; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
42; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 9, v0
43; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 9, v1
44; GFX9-NEXT:    v_sub_i16 v0, v0, v1 clamp
45; GFX9-NEXT:    v_ashrrev_i16_e32 v0, 9, v0
46; GFX9-NEXT:    s_setpc_b64 s[30:31]
47;
48; GFX10PLUS-LABEL: v_ssubsat_i7:
49; GFX10PLUS:       ; %bb.0:
50; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
51; GFX10PLUS-NEXT:    v_lshlrev_b16 v0, 9, v0
52; GFX10PLUS-NEXT:    v_lshlrev_b16 v1, 9, v1
53; GFX10PLUS-NEXT:    v_sub_nc_i16 v0, v0, v1 clamp
54; GFX10PLUS-NEXT:    v_ashrrev_i16 v0, 9, v0
55; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
56  %result = call i7 @llvm.ssub.sat.i7(i7 %lhs, i7 %rhs)
57  ret i7 %result
58}
59
; Same i7 signed saturating subtract as the v_ variant, but declared amdgpu_ps
; with both operands marked inreg. In the expected output the operands arrive
; in s0/s1 and the result is left in s0.
; The tahiti run stays entirely on scalar 32-bit instructions (shift by 25,
; max/min clamp, subtract, arithmetic shift back). The fiji run does the same
; with a shift of 9 and s_sext_i32_i16 re-extensions between the steps. The
; gfx900, gfx1010 and gfx1100 runs issue a vector 16-bit subtract with clamp
; and copy the result back to s0 with v_readfirstlane_b32; gfx900 first moves
; the right-hand operand into a vector register.
; The check lines are autogenerated; regenerate instead of hand-editing.
60define amdgpu_ps i7 @s_ssubsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
61; GFX6-LABEL: s_ssubsat_i7:
62; GFX6:       ; %bb.0:
63; GFX6-NEXT:    s_lshl_b32 s0, s0, 25
64; GFX6-NEXT:    s_max_i32 s2, s0, -1
65; GFX6-NEXT:    s_lshl_b32 s1, s1, 25
66; GFX6-NEXT:    s_add_i32 s2, s2, 0x80000001
67; GFX6-NEXT:    s_min_i32 s3, s0, -1
68; GFX6-NEXT:    s_add_i32 s3, s3, 0x80000000
69; GFX6-NEXT:    s_max_i32 s1, s2, s1
70; GFX6-NEXT:    s_min_i32 s1, s1, s3
71; GFX6-NEXT:    s_sub_i32 s0, s0, s1
72; GFX6-NEXT:    s_ashr_i32 s0, s0, 25
73; GFX6-NEXT:    ; return to shader part epilog
74;
75; GFX8-LABEL: s_ssubsat_i7:
76; GFX8:       ; %bb.0:
77; GFX8-NEXT:    s_lshl_b32 s0, s0, 9
78; GFX8-NEXT:    s_sext_i32_i16 s2, s0
79; GFX8-NEXT:    s_sext_i32_i16 s3, -1
80; GFX8-NEXT:    s_max_i32 s4, s2, s3
81; GFX8-NEXT:    s_lshl_b32 s1, s1, 9
82; GFX8-NEXT:    s_addk_i32 s4, 0x8001
83; GFX8-NEXT:    s_min_i32 s2, s2, s3
84; GFX8-NEXT:    s_sext_i32_i16 s3, s4
85; GFX8-NEXT:    s_sext_i32_i16 s1, s1
86; GFX8-NEXT:    s_addk_i32 s2, 0x8000
87; GFX8-NEXT:    s_max_i32 s1, s3, s1
88; GFX8-NEXT:    s_sext_i32_i16 s1, s1
89; GFX8-NEXT:    s_sext_i32_i16 s2, s2
90; GFX8-NEXT:    s_min_i32 s1, s1, s2
91; GFX8-NEXT:    s_sub_i32 s0, s0, s1
92; GFX8-NEXT:    s_sext_i32_i16 s0, s0
93; GFX8-NEXT:    s_ashr_i32 s0, s0, 9
94; GFX8-NEXT:    ; return to shader part epilog
95;
96; GFX9-LABEL: s_ssubsat_i7:
97; GFX9:       ; %bb.0:
98; GFX9-NEXT:    s_lshl_b32 s1, s1, 9
99; GFX9-NEXT:    s_lshl_b32 s0, s0, 9
100; GFX9-NEXT:    v_mov_b32_e32 v0, s1
101; GFX9-NEXT:    v_sub_i16 v0, s0, v0 clamp
102; GFX9-NEXT:    v_ashrrev_i16_e32 v0, 9, v0
103; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
104; GFX9-NEXT:    ; return to shader part epilog
105;
106; GFX10PLUS-LABEL: s_ssubsat_i7:
107; GFX10PLUS:       ; %bb.0:
108; GFX10PLUS-NEXT:    s_lshl_b32 s0, s0, 9
109; GFX10PLUS-NEXT:    s_lshl_b32 s1, s1, 9
110; GFX10PLUS-NEXT:    v_sub_nc_i16 v0, s0, s1 clamp
111; GFX10PLUS-NEXT:    v_ashrrev_i16 v0, 9, v0
112; GFX10PLUS-NEXT:    v_readfirstlane_b32 s0, v0
113; GFX10PLUS-NEXT:    ; return to shader part epilog
114  %result = call i7 @llvm.ssub.sat.i7(i7 %lhs, i7 %rhs)
115  ret i7 %result
116}
117
; Signed saturating subtract of two i8 values through @llvm.ssub.sat.i8, with
; no explicit calling convention and non-inreg arguments.
; The expected output has the same instruction sequences as the i7 vector
; test; only the shift amounts differ: 24 for the 32-bit tahiti sequence and 8
; for the 16-bit sequences of the other runs. The tahiti and fiji runs use the
; max/min clamp followed by a plain subtract; the gfx900, gfx1010 and gfx1100
; runs use a single 16-bit subtract with the clamp modifier.
; The check lines are autogenerated; regenerate instead of hand-editing.
118define i8 @v_ssubsat_i8(i8 %lhs, i8 %rhs) {
119; GFX6-LABEL: v_ssubsat_i8:
120; GFX6:       ; %bb.0:
121; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
122; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
123; GFX6-NEXT:    v_max_i32_e32 v2, -1, v0
124; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
125; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 0x80000001, v2
126; GFX6-NEXT:    v_min_i32_e32 v3, -1, v0
127; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 0x80000000, v3
128; GFX6-NEXT:    v_max_i32_e32 v1, v2, v1
129; GFX6-NEXT:    v_min_i32_e32 v1, v1, v3
130; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
131; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
132; GFX6-NEXT:    s_setpc_b64 s[30:31]
133;
134; GFX8-LABEL: v_ssubsat_i8:
135; GFX8:       ; %bb.0:
136; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
137; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
138; GFX8-NEXT:    v_max_i16_e32 v2, -1, v0
139; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
140; GFX8-NEXT:    v_add_u16_e32 v2, 0x8001, v2
141; GFX8-NEXT:    v_min_i16_e32 v3, -1, v0
142; GFX8-NEXT:    v_add_u16_e32 v3, 0x8000, v3
143; GFX8-NEXT:    v_max_i16_e32 v1, v2, v1
144; GFX8-NEXT:    v_min_i16_e32 v1, v1, v3
145; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v1
146; GFX8-NEXT:    v_ashrrev_i16_e32 v0, 8, v0
147; GFX8-NEXT:    s_setpc_b64 s[30:31]
148;
149; GFX9-LABEL: v_ssubsat_i8:
150; GFX9:       ; %bb.0:
151; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
152; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
153; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
154; GFX9-NEXT:    v_sub_i16 v0, v0, v1 clamp
155; GFX9-NEXT:    v_ashrrev_i16_e32 v0, 8, v0
156; GFX9-NEXT:    s_setpc_b64 s[30:31]
157;
158; GFX10PLUS-LABEL: v_ssubsat_i8:
159; GFX10PLUS:       ; %bb.0:
160; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
161; GFX10PLUS-NEXT:    v_lshlrev_b16 v0, 8, v0
162; GFX10PLUS-NEXT:    v_lshlrev_b16 v1, 8, v1
163; GFX10PLUS-NEXT:    v_sub_nc_i16 v0, v0, v1 clamp
164; GFX10PLUS-NEXT:    v_ashrrev_i16 v0, 8, v0
165; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
166  %result = call i8 @llvm.ssub.sat.i8(i8 %lhs, i8 %rhs)
167  ret i8 %result
168}
169
; i8 signed saturating subtract declared amdgpu_ps with both operands marked
; inreg. In the expected output the operands arrive in s0/s1 and the result is
; left in s0.
; The sequences match the i7 scalar test with shift amounts of 24 (tahiti,
; 32-bit scalar ops) and 8 (all other runs). The fiji run keeps the work on
; scalar instructions with s_sext_i32_i16 re-extensions; the gfx900, gfx1010
; and gfx1100 runs use a vector 16-bit subtract with clamp and then
; v_readfirstlane_b32 to return the value in s0.
; The check lines are autogenerated; regenerate instead of hand-editing.
170define amdgpu_ps i8 @s_ssubsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
171; GFX6-LABEL: s_ssubsat_i8:
172; GFX6:       ; %bb.0:
173; GFX6-NEXT:    s_lshl_b32 s0, s0, 24
174; GFX6-NEXT:    s_max_i32 s2, s0, -1
175; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
176; GFX6-NEXT:    s_add_i32 s2, s2, 0x80000001
177; GFX6-NEXT:    s_min_i32 s3, s0, -1
178; GFX6-NEXT:    s_add_i32 s3, s3, 0x80000000
179; GFX6-NEXT:    s_max_i32 s1, s2, s1
180; GFX6-NEXT:    s_min_i32 s1, s1, s3
181; GFX6-NEXT:    s_sub_i32 s0, s0, s1
182; GFX6-NEXT:    s_ashr_i32 s0, s0, 24
183; GFX6-NEXT:    ; return to shader part epilog
184;
185; GFX8-LABEL: s_ssubsat_i8:
186; GFX8:       ; %bb.0:
187; GFX8-NEXT:    s_lshl_b32 s0, s0, 8
188; GFX8-NEXT:    s_sext_i32_i16 s2, s0
189; GFX8-NEXT:    s_sext_i32_i16 s3, -1
190; GFX8-NEXT:    s_max_i32 s4, s2, s3
191; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
192; GFX8-NEXT:    s_addk_i32 s4, 0x8001
193; GFX8-NEXT:    s_min_i32 s2, s2, s3
194; GFX8-NEXT:    s_sext_i32_i16 s3, s4
195; GFX8-NEXT:    s_sext_i32_i16 s1, s1
196; GFX8-NEXT:    s_addk_i32 s2, 0x8000
197; GFX8-NEXT:    s_max_i32 s1, s3, s1
198; GFX8-NEXT:    s_sext_i32_i16 s1, s1
199; GFX8-NEXT:    s_sext_i32_i16 s2, s2
200; GFX8-NEXT:    s_min_i32 s1, s1, s2
201; GFX8-NEXT:    s_sub_i32 s0, s0, s1
202; GFX8-NEXT:    s_sext_i32_i16 s0, s0
203; GFX8-NEXT:    s_ashr_i32 s0, s0, 8
204; GFX8-NEXT:    ; return to shader part epilog
205;
206; GFX9-LABEL: s_ssubsat_i8:
207; GFX9:       ; %bb.0:
208; GFX9-NEXT:    s_lshl_b32 s1, s1, 8
209; GFX9-NEXT:    s_lshl_b32 s0, s0, 8
210; GFX9-NEXT:    v_mov_b32_e32 v0, s1
211; GFX9-NEXT:    v_sub_i16 v0, s0, v0 clamp
212; GFX9-NEXT:    v_ashrrev_i16_e32 v0, 8, v0
213; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
214; GFX9-NEXT:    ; return to shader part epilog
215;
216; GFX10PLUS-LABEL: s_ssubsat_i8:
217; GFX10PLUS:       ; %bb.0:
218; GFX10PLUS-NEXT:    s_lshl_b32 s0, s0, 8
219; GFX10PLUS-NEXT:    s_lshl_b32 s1, s1, 8
220; GFX10PLUS-NEXT:    v_sub_nc_i16 v0, s0, s1 clamp
221; GFX10PLUS-NEXT:    v_ashrrev_i16 v0, 8, v0
222; GFX10PLUS-NEXT:    v_readfirstlane_b32 s0, v0
223; GFX10PLUS-NEXT:    ; return to shader part epilog
224  %result = call i8 @llvm.ssub.sat.i8(i8 %lhs, i8 %rhs)
225  ret i8 %result
226}
227
; Signed saturating subtract on <2 x i8> through @llvm.ssub.sat.v2i8. The
; vectors are passed and returned as plain i16 values: each argument is
; bitcast to <2 x i8> before the call and the result is bitcast back to i16.
; In the expected output the tahiti and fiji runs split out the two bytes,
; apply the max/min clamp plus subtract to each byte separately, and rebuild
; the i16 with 0xff masks, a shift by 8 and an OR. The gfx900, gfx1010 and
; gfx1100 runs place the bytes in the two 16-bit halves of a register, shift
; both halves left by 8, use a single packed subtract with clamp, shift back
; and repack. The gfx1010 and gfx1100 runs have separate check blocks here
; because their repacking instructions differ (SDWA forms on gfx1010, plain
; shift/and/or on gfx1100).
; The check lines are autogenerated; regenerate instead of hand-editing.
228define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
229; GFX6-LABEL: v_ssubsat_v2i8:
230; GFX6:       ; %bb.0:
231; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
232; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
233; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
234; GFX6-NEXT:    v_max_i32_e32 v4, -1, v0
235; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
236; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
237; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x80000001, v4
238; GFX6-NEXT:    v_min_i32_e32 v5, -1, v0
239; GFX6-NEXT:    v_bfrev_b32_e32 v6, 1
240; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
241; GFX6-NEXT:    v_max_i32_e32 v1, v4, v1
242; GFX6-NEXT:    v_min_i32_e32 v1, v1, v5
243; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
244; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
245; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
246; GFX6-NEXT:    v_max_i32_e32 v3, -1, v1
247; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 0x80000001, v3
248; GFX6-NEXT:    v_min_i32_e32 v4, -1, v1
249; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x80000000, v4
250; GFX6-NEXT:    v_max_i32_e32 v2, v3, v2
251; GFX6-NEXT:    v_min_i32_e32 v2, v2, v4
252; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
253; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 24, v1
254; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
255; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v1
256; GFX6-NEXT:    v_and_b32_e32 v0, 0xff, v0
257; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
258; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
259; GFX6-NEXT:    s_setpc_b64 s[30:31]
260;
261; GFX8-LABEL: v_ssubsat_v2i8:
262; GFX8:       ; %bb.0:
263; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
264; GFX8-NEXT:    v_mov_b32_e32 v2, 8
265; GFX8-NEXT:    v_lshrrev_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
266; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
267; GFX8-NEXT:    v_max_i16_e32 v4, -1, v0
268; GFX8-NEXT:    v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
269; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
270; GFX8-NEXT:    v_add_u16_e32 v4, 0x8001, v4
271; GFX8-NEXT:    v_min_i16_e32 v5, -1, v0
272; GFX8-NEXT:    v_add_u16_e32 v5, 0x8000, v5
273; GFX8-NEXT:    v_max_i16_e32 v1, v4, v1
274; GFX8-NEXT:    v_min_i16_e32 v1, v1, v5
275; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v1
276; GFX8-NEXT:    v_max_i16_e32 v1, -1, v3
277; GFX8-NEXT:    v_add_u16_e32 v1, 0x8001, v1
278; GFX8-NEXT:    v_min_i16_e32 v4, -1, v3
279; GFX8-NEXT:    v_add_u16_e32 v4, 0x8000, v4
280; GFX8-NEXT:    v_max_i16_e32 v1, v1, v2
281; GFX8-NEXT:    v_min_i16_e32 v1, v1, v4
282; GFX8-NEXT:    v_sub_u16_e32 v1, v3, v1
283; GFX8-NEXT:    v_mov_b32_e32 v2, 0xff
284; GFX8-NEXT:    v_and_b32_sdwa v1, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
285; GFX8-NEXT:    v_and_b32_sdwa v0, sext(v0), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
286; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
287; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
288; GFX8-NEXT:    s_setpc_b64 s[30:31]
289;
290; GFX9-LABEL: v_ssubsat_v2i8:
291; GFX9:       ; %bb.0:
292; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
293; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
294; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
295; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
296; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
297; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
298; GFX9-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
299; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
300; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
301; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v1 clamp
302; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
303; GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
304; GFX9-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
305; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
306; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
307; GFX9-NEXT:    s_setpc_b64 s[30:31]
308;
309; GFX10-LABEL: v_ssubsat_v2i8:
310; GFX10:       ; %bb.0:
311; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
312; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
313; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
314; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
315; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
316; GFX10-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
317; GFX10-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
318; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
319; GFX10-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
320; GFX10-NEXT:    v_pk_sub_i16 v0, v0, v1 clamp
321; GFX10-NEXT:    v_mov_b32_e32 v1, 0xff
322; GFX10-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
323; GFX10-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
324; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
325; GFX10-NEXT:    s_setpc_b64 s[30:31]
326;
327; GFX11-LABEL: v_ssubsat_v2i8:
328; GFX11:       ; %bb.0:
329; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
330; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
331; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
332; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
333; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
334; GFX11-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
335; GFX11-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
336; GFX11-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
337; GFX11-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
338; GFX11-NEXT:    v_pk_sub_i16 v0, v0, v1 clamp
339; GFX11-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
340; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
341; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
342; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
343; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
344; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
345; GFX11-NEXT:    s_setpc_b64 s[30:31]
346  %lhs = bitcast i16 %lhs.arg to <2 x i8>
347  %rhs = bitcast i16 %rhs.arg to <2 x i8>
348  %result = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> %lhs, <2 x i8> %rhs)
349  %cast.result = bitcast <2 x i8> %result to i16
350  ret i16 %cast.result
351}
352
; <2 x i8> signed saturating subtract declared amdgpu_ps with both i16
; operands marked inreg. Each argument is bitcast from i16 to <2 x i8> for the
; @llvm.ssub.sat.v2i8 call and the result is bitcast back to i16.
; In the expected output the tahiti and fiji runs stay on scalar instructions:
; each byte gets its own max/min clamp and subtract, and the two results are
; masked with 0xff and merged with a shift by 8 and an OR. The gfx900, gfx1010
; and gfx1100 runs build packed 16-bit operands with s_pack_ll_b32_b16, use a
; packed vector subtract with clamp, repack the bytes on the vector side and
; return the value in s0 through v_readfirstlane_b32. The gfx1010 and gfx1100
; runs have separate check blocks because their repacking instructions differ.
; The check lines are autogenerated; regenerate instead of hand-editing.
353define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
354; GFX6-LABEL: s_ssubsat_v2i8:
355; GFX6:       ; %bb.0:
356; GFX6-NEXT:    s_lshr_b32 s2, s0, 8
357; GFX6-NEXT:    s_lshl_b32 s0, s0, 24
358; GFX6-NEXT:    s_max_i32 s4, s0, -1
359; GFX6-NEXT:    s_lshr_b32 s3, s1, 8
360; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
361; GFX6-NEXT:    s_add_i32 s4, s4, 0x80000001
362; GFX6-NEXT:    s_min_i32 s5, s0, -1
363; GFX6-NEXT:    s_add_i32 s5, s5, 0x80000000
364; GFX6-NEXT:    s_max_i32 s1, s4, s1
365; GFX6-NEXT:    s_min_i32 s1, s1, s5
366; GFX6-NEXT:    s_sub_i32 s0, s0, s1
367; GFX6-NEXT:    s_lshl_b32 s1, s2, 24
368; GFX6-NEXT:    s_lshl_b32 s2, s3, 24
369; GFX6-NEXT:    s_max_i32 s3, s1, -1
370; GFX6-NEXT:    s_add_i32 s3, s3, 0x80000001
371; GFX6-NEXT:    s_min_i32 s4, s1, -1
372; GFX6-NEXT:    s_add_i32 s4, s4, 0x80000000
373; GFX6-NEXT:    s_max_i32 s2, s3, s2
374; GFX6-NEXT:    s_min_i32 s2, s2, s4
375; GFX6-NEXT:    s_sub_i32 s1, s1, s2
376; GFX6-NEXT:    s_ashr_i32 s1, s1, 24
377; GFX6-NEXT:    s_ashr_i32 s0, s0, 24
378; GFX6-NEXT:    s_and_b32 s1, s1, 0xff
379; GFX6-NEXT:    s_and_b32 s0, s0, 0xff
380; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
381; GFX6-NEXT:    s_or_b32 s0, s0, s1
382; GFX6-NEXT:    ; return to shader part epilog
383;
384; GFX8-LABEL: s_ssubsat_v2i8:
385; GFX8:       ; %bb.0:
386; GFX8-NEXT:    s_lshr_b32 s2, s0, 8
387; GFX8-NEXT:    s_lshl_b32 s0, s0, 8
388; GFX8-NEXT:    s_sext_i32_i16 s4, s0
389; GFX8-NEXT:    s_sext_i32_i16 s5, -1
390; GFX8-NEXT:    s_max_i32 s6, s4, s5
391; GFX8-NEXT:    s_lshr_b32 s3, s1, 8
392; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
393; GFX8-NEXT:    s_addk_i32 s6, 0x8001
394; GFX8-NEXT:    s_min_i32 s4, s4, s5
395; GFX8-NEXT:    s_sext_i32_i16 s6, s6
396; GFX8-NEXT:    s_sext_i32_i16 s1, s1
397; GFX8-NEXT:    s_addk_i32 s4, 0x8000
398; GFX8-NEXT:    s_max_i32 s1, s6, s1
399; GFX8-NEXT:    s_sext_i32_i16 s1, s1
400; GFX8-NEXT:    s_sext_i32_i16 s4, s4
401; GFX8-NEXT:    s_min_i32 s1, s1, s4
402; GFX8-NEXT:    s_sub_i32 s0, s0, s1
403; GFX8-NEXT:    s_lshl_b32 s1, s2, 8
404; GFX8-NEXT:    s_lshl_b32 s2, s3, 8
405; GFX8-NEXT:    s_sext_i32_i16 s3, s1
406; GFX8-NEXT:    s_max_i32 s4, s3, s5
407; GFX8-NEXT:    s_addk_i32 s4, 0x8001
408; GFX8-NEXT:    s_min_i32 s3, s3, s5
409; GFX8-NEXT:    s_sext_i32_i16 s4, s4
410; GFX8-NEXT:    s_sext_i32_i16 s2, s2
411; GFX8-NEXT:    s_addk_i32 s3, 0x8000
412; GFX8-NEXT:    s_max_i32 s2, s4, s2
413; GFX8-NEXT:    s_sext_i32_i16 s2, s2
414; GFX8-NEXT:    s_sext_i32_i16 s3, s3
415; GFX8-NEXT:    s_min_i32 s2, s2, s3
416; GFX8-NEXT:    s_sub_i32 s1, s1, s2
417; GFX8-NEXT:    s_sext_i32_i16 s1, s1
418; GFX8-NEXT:    s_sext_i32_i16 s0, s0
419; GFX8-NEXT:    s_ashr_i32 s1, s1, 8
420; GFX8-NEXT:    s_ashr_i32 s0, s0, 8
421; GFX8-NEXT:    s_and_b32 s1, s1, 0xff
422; GFX8-NEXT:    s_and_b32 s0, s0, 0xff
423; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
424; GFX8-NEXT:    s_or_b32 s0, s0, s1
425; GFX8-NEXT:    ; return to shader part epilog
426;
427; GFX9-LABEL: s_ssubsat_v2i8:
428; GFX9:       ; %bb.0:
429; GFX9-NEXT:    s_lshr_b32 s2, s0, 8
430; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
431; GFX9-NEXT:    s_lshr_b32 s3, s1, 8
432; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
433; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
434; GFX9-NEXT:    s_lshl_b32 s0, s0, 0x80008
435; GFX9-NEXT:    s_lshl_b32 s2, s2, 8
436; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
437; GFX9-NEXT:    s_lshr_b32 s2, s1, 16
438; GFX9-NEXT:    s_lshl_b32 s1, s1, 0x80008
439; GFX9-NEXT:    s_lshl_b32 s2, s2, 8
440; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s2
441; GFX9-NEXT:    v_mov_b32_e32 v0, s1
442; GFX9-NEXT:    v_pk_sub_i16 v0, s0, v0 clamp
443; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
444; GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
445; GFX9-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
446; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
447; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
448; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
449; GFX9-NEXT:    ; return to shader part epilog
450;
451; GFX10-LABEL: s_ssubsat_v2i8:
452; GFX10:       ; %bb.0:
453; GFX10-NEXT:    s_lshr_b32 s2, s0, 8
454; GFX10-NEXT:    s_lshr_b32 s3, s1, 8
455; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
456; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
457; GFX10-NEXT:    s_lshr_b32 s2, s0, 16
458; GFX10-NEXT:    s_lshr_b32 s3, s1, 16
459; GFX10-NEXT:    s_lshl_b32 s0, s0, 0x80008
460; GFX10-NEXT:    s_lshl_b32 s2, s2, 8
461; GFX10-NEXT:    s_lshl_b32 s1, s1, 0x80008
462; GFX10-NEXT:    s_lshl_b32 s3, s3, 8
463; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
464; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
465; GFX10-NEXT:    v_mov_b32_e32 v1, 0xff
466; GFX10-NEXT:    v_pk_sub_i16 v0, s0, s1 clamp
467; GFX10-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
468; GFX10-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
469; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
470; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
471; GFX10-NEXT:    ; return to shader part epilog
472;
473; GFX11-LABEL: s_ssubsat_v2i8:
474; GFX11:       ; %bb.0:
475; GFX11-NEXT:    s_lshr_b32 s2, s0, 8
476; GFX11-NEXT:    s_lshr_b32 s3, s1, 8
477; GFX11-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
478; GFX11-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
479; GFX11-NEXT:    s_lshr_b32 s2, s0, 16
480; GFX11-NEXT:    s_lshr_b32 s3, s1, 16
481; GFX11-NEXT:    s_lshl_b32 s0, s0, 0x80008
482; GFX11-NEXT:    s_lshl_b32 s2, s2, 8
483; GFX11-NEXT:    s_lshl_b32 s1, s1, 0x80008
484; GFX11-NEXT:    s_lshl_b32 s3, s3, 8
485; GFX11-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
486; GFX11-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
487; GFX11-NEXT:    v_pk_sub_i16 v0, s0, s1 clamp
488; GFX11-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
489; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
490; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
491; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
492; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
493; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
494; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
495; GFX11-NEXT:    ; return to shader part epilog
496  %lhs = bitcast i16 %lhs.arg to <2 x i8>
497  %rhs = bitcast i16 %rhs.arg to <2 x i8>
498  %result = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> %lhs, <2 x i8> %rhs)
499  %cast.result = bitcast <2 x i8> %result to i16
500  ret i16 %cast.result
501}
502
; Signed saturating subtract on <4 x i8> through @llvm.ssub.sat.v4i8. The
; vectors are passed and returned as plain i32 values: each argument is
; bitcast to <4 x i8> before the call and the result is bitcast back to i32.
; In the expected output the tahiti and fiji runs extract the four bytes,
; apply the max/min clamp plus subtract to each byte separately, and merge the
; results with 0xff masks, shifts by 8/16/24 and ORs. The gfx900, gfx1010 and
; gfx1100 runs spread the bytes over two registers of packed 16-bit halves,
; issue two packed subtracts with clamp, and reassemble the i32 from the
; shifted-back halves. The gfx1010 and gfx1100 runs have separate check blocks
; because their scheduling and repacking instructions differ.
; The check lines are autogenerated; regenerate instead of hand-editing.
503define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
504; GFX6-LABEL: v_ssubsat_v4i8:
505; GFX6:       ; %bb.0:
506; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
507; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
508; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
509; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
510; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
511; GFX6-NEXT:    v_max_i32_e32 v8, -1, v0
512; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
513; GFX6-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
514; GFX6-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
515; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
516; GFX6-NEXT:    v_add_i32_e32 v8, vcc, 0x80000001, v8
517; GFX6-NEXT:    v_min_i32_e32 v10, -1, v0
518; GFX6-NEXT:    v_bfrev_b32_e32 v11, 1
519; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
520; GFX6-NEXT:    v_max_i32_e32 v1, v8, v1
521; GFX6-NEXT:    v_min_i32_e32 v1, v1, v10
522; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
523; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
524; GFX6-NEXT:    v_mov_b32_e32 v9, 0x80000001
525; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 24, v5
526; GFX6-NEXT:    v_max_i32_e32 v5, -1, v1
527; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
528; GFX6-NEXT:    v_min_i32_e32 v8, -1, v1
529; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
530; GFX6-NEXT:    v_max_i32_e32 v2, v5, v2
531; GFX6-NEXT:    v_min_i32_e32 v2, v2, v8
532; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
533; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
534; GFX6-NEXT:    v_max_i32_e32 v5, -1, v2
535; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 24, v6
536; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
537; GFX6-NEXT:    v_min_i32_e32 v6, -1, v2
538; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v11
539; GFX6-NEXT:    v_max_i32_e32 v3, v5, v3
540; GFX6-NEXT:    v_min_i32_e32 v3, v3, v6
541; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
542; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 24, v4
543; GFX6-NEXT:    v_max_i32_e32 v5, -1, v3
544; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 24, v1
545; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 24, v7
546; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
547; GFX6-NEXT:    v_min_i32_e32 v6, -1, v3
548; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
549; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v11
550; GFX6-NEXT:    v_max_i32_e32 v4, v5, v4
551; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v1
552; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 24, v2
553; GFX6-NEXT:    v_min_i32_e32 v4, v4, v6
554; GFX6-NEXT:    v_and_b32_e32 v0, 0xff, v0
555; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
556; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v4
557; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
558; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v2
559; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 24, v3
560; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
561; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
562; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v3
563; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
564; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
565; GFX6-NEXT:    s_setpc_b64 s[30:31]
566;
567; GFX8-LABEL: v_ssubsat_v4i8:
568; GFX8:       ; %bb.0:
569; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
570; GFX8-NEXT:    v_mov_b32_e32 v2, 8
571; GFX8-NEXT:    v_lshrrev_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
572; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
573; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
574; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
575; GFX8-NEXT:    v_max_i16_e32 v8, -1, v0
576; GFX8-NEXT:    v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
577; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
578; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
579; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
580; GFX8-NEXT:    v_add_u16_e32 v8, 0x8001, v8
581; GFX8-NEXT:    v_min_i16_e32 v9, -1, v0
582; GFX8-NEXT:    v_add_u16_e32 v9, 0x8000, v9
583; GFX8-NEXT:    v_max_i16_e32 v1, v8, v1
584; GFX8-NEXT:    v_min_i16_e32 v1, v1, v9
585; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v1
586; GFX8-NEXT:    v_max_i16_e32 v1, -1, v3
587; GFX8-NEXT:    v_add_u16_e32 v1, 0x8001, v1
588; GFX8-NEXT:    v_min_i16_e32 v8, -1, v3
589; GFX8-NEXT:    v_add_u16_e32 v8, 0x8000, v8
590; GFX8-NEXT:    v_max_i16_e32 v1, v1, v2
591; GFX8-NEXT:    v_lshlrev_b16_e32 v2, 8, v4
592; GFX8-NEXT:    v_min_i16_e32 v1, v1, v8
593; GFX8-NEXT:    v_max_i16_e32 v4, -1, v2
594; GFX8-NEXT:    v_sub_u16_e32 v1, v3, v1
595; GFX8-NEXT:    v_lshlrev_b16_e32 v3, 8, v6
596; GFX8-NEXT:    v_add_u16_e32 v4, 0x8001, v4
597; GFX8-NEXT:    v_min_i16_e32 v6, -1, v2
598; GFX8-NEXT:    v_add_u16_e32 v6, 0x8000, v6
599; GFX8-NEXT:    v_max_i16_e32 v3, v4, v3
600; GFX8-NEXT:    v_min_i16_e32 v3, v3, v6
601; GFX8-NEXT:    v_sub_u16_e32 v2, v2, v3
602; GFX8-NEXT:    v_lshlrev_b16_e32 v3, 8, v5
603; GFX8-NEXT:    v_max_i16_e32 v5, -1, v3
604; GFX8-NEXT:    v_lshlrev_b16_e32 v4, 8, v7
605; GFX8-NEXT:    v_add_u16_e32 v5, 0x8001, v5
606; GFX8-NEXT:    v_min_i16_e32 v6, -1, v3
607; GFX8-NEXT:    v_add_u16_e32 v6, 0x8000, v6
608; GFX8-NEXT:    v_max_i16_e32 v4, v5, v4
609; GFX8-NEXT:    v_min_i16_e32 v4, v4, v6
610; GFX8-NEXT:    v_sub_u16_e32 v3, v3, v4
611; GFX8-NEXT:    v_mov_b32_e32 v4, 0xff
612; GFX8-NEXT:    v_and_b32_sdwa v1, sext(v1), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
613; GFX8-NEXT:    v_and_b32_sdwa v0, sext(v0), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
614; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
615; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
616; GFX8-NEXT:    v_and_b32_sdwa v1, sext(v2), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
617; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
618; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
619; GFX8-NEXT:    v_and_b32_sdwa v1, sext(v3), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
620; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
621; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
622; GFX8-NEXT:    s_setpc_b64 s[30:31]
623;
624; GFX9-LABEL: v_ssubsat_v4i8:
625; GFX9:       ; %bb.0:
626; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
627; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
628; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
629; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
630; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff, v0
631; GFX9-NEXT:    v_alignbit_b32 v0, v3, v0, 16
632; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff, v1
633; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
634; GFX9-NEXT:    v_lshl_or_b32 v2, v2, 16, v6
635; GFX9-NEXT:    v_lshl_or_b32 v3, v4, 16, v3
636; GFX9-NEXT:    v_alignbit_b32 v1, v5, v1, 16
637; GFX9-NEXT:    v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
638; GFX9-NEXT:    v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
639; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
640; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
641; GFX9-NEXT:    v_pk_sub_i16 v2, v2, v3 clamp
642; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v1 clamp
643; GFX9-NEXT:    v_pk_ashrrev_i16 v1, 8, v2 op_sel_hi:[0,1]
644; GFX9-NEXT:    v_mov_b32_e32 v3, 8
645; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
646; GFX9-NEXT:    v_mov_b32_e32 v2, 0xff
647; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
648; GFX9-NEXT:    v_and_or_b32 v1, v1, v2, v3
649; GFX9-NEXT:    v_and_b32_e32 v2, 0xff, v0
650; GFX9-NEXT:    v_mov_b32_e32 v3, 24
651; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
652; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
653; GFX9-NEXT:    v_or3_b32 v0, v1, v2, v0
654; GFX9-NEXT:    s_setpc_b64 s[30:31]
655;
656; GFX10-LABEL: v_ssubsat_v4i8:
657; GFX10:       ; %bb.0:
658; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
659; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
660; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
661; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff, v0
662; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
663; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff, v1
664; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
665; GFX10-NEXT:    v_alignbit_b32 v0, v3, v0, 16
666; GFX10-NEXT:    v_lshl_or_b32 v2, v2, 16, v4
667; GFX10-NEXT:    v_mov_b32_e32 v4, 24
668; GFX10-NEXT:    v_lshl_or_b32 v3, v5, 16, v6
669; GFX10-NEXT:    v_alignbit_b32 v1, v7, v1, 16
670; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
671; GFX10-NEXT:    v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
672; GFX10-NEXT:    v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
673; GFX10-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
674; GFX10-NEXT:    v_pk_sub_i16 v2, v2, v3 clamp
675; GFX10-NEXT:    v_pk_sub_i16 v0, v0, v1 clamp
676; GFX10-NEXT:    v_mov_b32_e32 v1, 8
677; GFX10-NEXT:    v_pk_ashrrev_i16 v2, 8, v2 op_sel_hi:[0,1]
678; GFX10-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
679; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
680; GFX10-NEXT:    v_and_b32_e32 v3, 0xff, v0
681; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
682; GFX10-NEXT:    v_and_or_b32 v1, 0xff, v2, v1
683; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
684; GFX10-NEXT:    v_or3_b32 v0, v1, v2, v0
685; GFX10-NEXT:    s_setpc_b64 s[30:31]
686;
687; GFX11-LABEL: v_ssubsat_v4i8:
688; GFX11:       ; %bb.0:
689; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
690; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
691; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
692; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v0
693; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v1
694; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
695; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
696; GFX11-NEXT:    v_lshl_or_b32 v2, v2, 16, v4
697; GFX11-NEXT:    v_lshl_or_b32 v3, v3, 16, v5
698; GFX11-NEXT:    v_alignbit_b32 v0, v6, v0, 16
699; GFX11-NEXT:    v_alignbit_b32 v1, v7, v1, 16
700; GFX11-NEXT:    v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
701; GFX11-NEXT:    v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
702; GFX11-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
703; GFX11-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
704; GFX11-NEXT:    v_pk_sub_i16 v2, v2, v3 clamp
705; GFX11-NEXT:    v_pk_sub_i16 v0, v0, v1 clamp
706; GFX11-NEXT:    v_pk_ashrrev_i16 v1, 8, v2 op_sel_hi:[0,1]
707; GFX11-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
708; GFX11-NEXT:    v_bfe_u32 v2, v1, 16, 8
709; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v0
710; GFX11-NEXT:    v_bfe_u32 v0, v0, 16, 8
711; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
712; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
713; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
714; GFX11-NEXT:    v_and_or_b32 v1, 0xff, v1, v2
715; GFX11-NEXT:    v_or3_b32 v0, v1, v3, v0
716; GFX11-NEXT:    s_setpc_b64 s[30:31]
717  %lhs = bitcast i32 %lhs.arg to <4 x i8>
718  %rhs = bitcast i32 %rhs.arg to <4 x i8>
719  %result = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> %lhs, <4 x i8> %rhs)
720  %cast.result = bitcast <4 x i8> %result to i32
721  ret i32 %cast.result
722}
723
; Uniform (inreg, amdgpu_ps) variant of the <4 x i8> signed saturating subtract.
; Both operands arrive packed in an inreg i32, are bitcast to <4 x i8>, passed to
; llvm.ssub.sat.v4i8, and the result is bitcast back to i32.
; The tahiti and fiji checks below process each byte separately with scalar
; shift/max/min/sub sequences and then reassemble the four bytes with and/shl/or.
; The gfx900, gfx1010 and gfx1100 checks repack the bytes into 16-bit lanes, use
; v_pk_sub_i16 with the clamp modifier, and copy the packed result back to s0
; with v_readfirstlane_b32.
; The check lines are autogenerated (see the NOTE on the first line of the file);
; regenerate them with the update script instead of editing them by hand.
724define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
725; GFX6-LABEL: s_ssubsat_v4i8:
726; GFX6:       ; %bb.0:
727; GFX6-NEXT:    s_lshr_b32 s2, s0, 8
728; GFX6-NEXT:    s_lshr_b32 s3, s0, 16
729; GFX6-NEXT:    s_lshr_b32 s4, s0, 24
730; GFX6-NEXT:    s_lshl_b32 s0, s0, 24
731; GFX6-NEXT:    s_max_i32 s8, s0, -1
732; GFX6-NEXT:    s_lshr_b32 s5, s1, 8
733; GFX6-NEXT:    s_lshr_b32 s6, s1, 16
734; GFX6-NEXT:    s_lshr_b32 s7, s1, 24
735; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
736; GFX6-NEXT:    s_add_i32 s8, s8, 0x80000001
737; GFX6-NEXT:    s_min_i32 s9, s0, -1
738; GFX6-NEXT:    s_add_i32 s9, s9, 0x80000000
739; GFX6-NEXT:    s_max_i32 s1, s8, s1
740; GFX6-NEXT:    s_min_i32 s1, s1, s9
741; GFX6-NEXT:    s_sub_i32 s0, s0, s1
742; GFX6-NEXT:    s_lshl_b32 s1, s2, 24
743; GFX6-NEXT:    s_lshl_b32 s2, s5, 24
744; GFX6-NEXT:    s_max_i32 s5, s1, -1
745; GFX6-NEXT:    s_add_i32 s5, s5, 0x80000001
746; GFX6-NEXT:    s_min_i32 s8, s1, -1
747; GFX6-NEXT:    s_add_i32 s8, s8, 0x80000000
748; GFX6-NEXT:    s_max_i32 s2, s5, s2
749; GFX6-NEXT:    s_min_i32 s2, s2, s8
750; GFX6-NEXT:    s_sub_i32 s1, s1, s2
751; GFX6-NEXT:    s_lshl_b32 s2, s3, 24
752; GFX6-NEXT:    s_max_i32 s5, s2, -1
753; GFX6-NEXT:    s_lshl_b32 s3, s6, 24
754; GFX6-NEXT:    s_add_i32 s5, s5, 0x80000001
755; GFX6-NEXT:    s_min_i32 s6, s2, -1
756; GFX6-NEXT:    s_add_i32 s6, s6, 0x80000000
757; GFX6-NEXT:    s_max_i32 s3, s5, s3
758; GFX6-NEXT:    s_min_i32 s3, s3, s6
759; GFX6-NEXT:    s_sub_i32 s2, s2, s3
760; GFX6-NEXT:    s_lshl_b32 s3, s4, 24
761; GFX6-NEXT:    s_max_i32 s5, s3, -1
762; GFX6-NEXT:    s_ashr_i32 s1, s1, 24
763; GFX6-NEXT:    s_lshl_b32 s4, s7, 24
764; GFX6-NEXT:    s_add_i32 s5, s5, 0x80000001
765; GFX6-NEXT:    s_min_i32 s6, s3, -1
766; GFX6-NEXT:    s_ashr_i32 s0, s0, 24
767; GFX6-NEXT:    s_add_i32 s6, s6, 0x80000000
768; GFX6-NEXT:    s_max_i32 s4, s5, s4
769; GFX6-NEXT:    s_and_b32 s1, s1, 0xff
770; GFX6-NEXT:    s_ashr_i32 s2, s2, 24
771; GFX6-NEXT:    s_min_i32 s4, s4, s6
772; GFX6-NEXT:    s_and_b32 s0, s0, 0xff
773; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
774; GFX6-NEXT:    s_sub_i32 s3, s3, s4
775; GFX6-NEXT:    s_or_b32 s0, s0, s1
776; GFX6-NEXT:    s_and_b32 s1, s2, 0xff
777; GFX6-NEXT:    s_ashr_i32 s3, s3, 24
778; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
779; GFX6-NEXT:    s_or_b32 s0, s0, s1
780; GFX6-NEXT:    s_and_b32 s1, s3, 0xff
781; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
782; GFX6-NEXT:    s_or_b32 s0, s0, s1
783; GFX6-NEXT:    ; return to shader part epilog
784;
785; GFX8-LABEL: s_ssubsat_v4i8:
786; GFX8:       ; %bb.0:
787; GFX8-NEXT:    s_lshr_b32 s2, s0, 8
788; GFX8-NEXT:    s_lshr_b32 s3, s0, 16
789; GFX8-NEXT:    s_lshr_b32 s4, s0, 24
790; GFX8-NEXT:    s_lshl_b32 s0, s0, 8
791; GFX8-NEXT:    s_sext_i32_i16 s8, s0
792; GFX8-NEXT:    s_sext_i32_i16 s9, -1
793; GFX8-NEXT:    s_max_i32 s10, s8, s9
794; GFX8-NEXT:    s_lshr_b32 s5, s1, 8
795; GFX8-NEXT:    s_lshr_b32 s6, s1, 16
796; GFX8-NEXT:    s_lshr_b32 s7, s1, 24
797; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
798; GFX8-NEXT:    s_addk_i32 s10, 0x8001
799; GFX8-NEXT:    s_min_i32 s8, s8, s9
800; GFX8-NEXT:    s_sext_i32_i16 s10, s10
801; GFX8-NEXT:    s_sext_i32_i16 s1, s1
802; GFX8-NEXT:    s_addk_i32 s8, 0x8000
803; GFX8-NEXT:    s_max_i32 s1, s10, s1
804; GFX8-NEXT:    s_sext_i32_i16 s1, s1
805; GFX8-NEXT:    s_sext_i32_i16 s8, s8
806; GFX8-NEXT:    s_min_i32 s1, s1, s8
807; GFX8-NEXT:    s_sub_i32 s0, s0, s1
808; GFX8-NEXT:    s_lshl_b32 s1, s2, 8
809; GFX8-NEXT:    s_lshl_b32 s2, s5, 8
810; GFX8-NEXT:    s_sext_i32_i16 s5, s1
811; GFX8-NEXT:    s_max_i32 s8, s5, s9
812; GFX8-NEXT:    s_addk_i32 s8, 0x8001
813; GFX8-NEXT:    s_min_i32 s5, s5, s9
814; GFX8-NEXT:    s_sext_i32_i16 s8, s8
815; GFX8-NEXT:    s_sext_i32_i16 s2, s2
816; GFX8-NEXT:    s_addk_i32 s5, 0x8000
817; GFX8-NEXT:    s_max_i32 s2, s8, s2
818; GFX8-NEXT:    s_sext_i32_i16 s2, s2
819; GFX8-NEXT:    s_sext_i32_i16 s5, s5
820; GFX8-NEXT:    s_min_i32 s2, s2, s5
821; GFX8-NEXT:    s_sub_i32 s1, s1, s2
822; GFX8-NEXT:    s_lshl_b32 s2, s3, 8
823; GFX8-NEXT:    s_sext_i32_i16 s5, s2
824; GFX8-NEXT:    s_lshl_b32 s3, s6, 8
825; GFX8-NEXT:    s_max_i32 s6, s5, s9
826; GFX8-NEXT:    s_addk_i32 s6, 0x8001
827; GFX8-NEXT:    s_min_i32 s5, s5, s9
828; GFX8-NEXT:    s_sext_i32_i16 s6, s6
829; GFX8-NEXT:    s_sext_i32_i16 s3, s3
830; GFX8-NEXT:    s_addk_i32 s5, 0x8000
831; GFX8-NEXT:    s_max_i32 s3, s6, s3
832; GFX8-NEXT:    s_sext_i32_i16 s3, s3
833; GFX8-NEXT:    s_sext_i32_i16 s5, s5
834; GFX8-NEXT:    s_min_i32 s3, s3, s5
835; GFX8-NEXT:    s_sub_i32 s2, s2, s3
836; GFX8-NEXT:    s_lshl_b32 s3, s4, 8
837; GFX8-NEXT:    s_sext_i32_i16 s5, s3
838; GFX8-NEXT:    s_max_i32 s6, s5, s9
839; GFX8-NEXT:    s_lshl_b32 s4, s7, 8
840; GFX8-NEXT:    s_addk_i32 s6, 0x8001
841; GFX8-NEXT:    s_min_i32 s5, s5, s9
842; GFX8-NEXT:    s_sext_i32_i16 s6, s6
843; GFX8-NEXT:    s_sext_i32_i16 s4, s4
844; GFX8-NEXT:    s_sext_i32_i16 s1, s1
845; GFX8-NEXT:    s_addk_i32 s5, 0x8000
846; GFX8-NEXT:    s_max_i32 s4, s6, s4
847; GFX8-NEXT:    s_sext_i32_i16 s0, s0
848; GFX8-NEXT:    s_ashr_i32 s1, s1, 8
849; GFX8-NEXT:    s_sext_i32_i16 s4, s4
850; GFX8-NEXT:    s_sext_i32_i16 s5, s5
851; GFX8-NEXT:    s_ashr_i32 s0, s0, 8
852; GFX8-NEXT:    s_sext_i32_i16 s2, s2
853; GFX8-NEXT:    s_min_i32 s4, s4, s5
854; GFX8-NEXT:    s_and_b32 s1, s1, 0xff
855; GFX8-NEXT:    s_ashr_i32 s2, s2, 8
856; GFX8-NEXT:    s_sub_i32 s3, s3, s4
857; GFX8-NEXT:    s_and_b32 s0, s0, 0xff
858; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
859; GFX8-NEXT:    s_sext_i32_i16 s3, s3
860; GFX8-NEXT:    s_or_b32 s0, s0, s1
861; GFX8-NEXT:    s_and_b32 s1, s2, 0xff
862; GFX8-NEXT:    s_ashr_i32 s3, s3, 8
863; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
864; GFX8-NEXT:    s_or_b32 s0, s0, s1
865; GFX8-NEXT:    s_and_b32 s1, s3, 0xff
866; GFX8-NEXT:    s_lshl_b32 s1, s1, 24
867; GFX8-NEXT:    s_or_b32 s0, s0, s1
868; GFX8-NEXT:    ; return to shader part epilog
869;
870; GFX9-LABEL: s_ssubsat_v4i8:
871; GFX9:       ; %bb.0:
872; GFX9-NEXT:    s_lshr_b32 s2, s0, 8
873; GFX9-NEXT:    s_lshr_b32 s3, s0, 16
874; GFX9-NEXT:    s_lshr_b32 s4, s0, 24
875; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
876; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s3, s4
877; GFX9-NEXT:    s_lshr_b32 s4, s0, 16
878; GFX9-NEXT:    s_lshl_b32 s0, s0, 0x80008
879; GFX9-NEXT:    s_lshl_b32 s4, s4, 8
880; GFX9-NEXT:    s_lshr_b32 s5, s1, 8
881; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s4
882; GFX9-NEXT:    s_lshr_b32 s4, s2, 16
883; GFX9-NEXT:    s_lshr_b32 s6, s1, 16
884; GFX9-NEXT:    s_lshr_b32 s7, s1, 24
885; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s5
886; GFX9-NEXT:    s_lshl_b32 s2, s2, 0x80008
887; GFX9-NEXT:    s_lshl_b32 s4, s4, 8
888; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s4
889; GFX9-NEXT:    s_lshr_b32 s4, s1, 16
890; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s6, s7
891; GFX9-NEXT:    s_lshl_b32 s1, s1, 0x80008
892; GFX9-NEXT:    s_lshl_b32 s4, s4, 8
893; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s4
894; GFX9-NEXT:    s_lshr_b32 s4, s3, 16
895; GFX9-NEXT:    s_lshl_b32 s3, s3, 0x80008
896; GFX9-NEXT:    s_lshl_b32 s4, s4, 8
897; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s3, s4
898; GFX9-NEXT:    v_mov_b32_e32 v0, s1
899; GFX9-NEXT:    v_pk_sub_i16 v0, s0, v0 clamp
900; GFX9-NEXT:    v_mov_b32_e32 v1, s3
901; GFX9-NEXT:    v_pk_sub_i16 v1, s2, v1 clamp
902; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
903; GFX9-NEXT:    v_mov_b32_e32 v3, 8
904; GFX9-NEXT:    v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1]
905; GFX9-NEXT:    v_mov_b32_e32 v2, 0xff
906; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
907; GFX9-NEXT:    v_and_or_b32 v0, v0, v2, v3
908; GFX9-NEXT:    v_and_b32_e32 v2, 0xff, v1
909; GFX9-NEXT:    v_mov_b32_e32 v3, 24
910; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
911; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
912; GFX9-NEXT:    v_or3_b32 v0, v0, v2, v1
913; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
914; GFX9-NEXT:    ; return to shader part epilog
915;
916; GFX10-LABEL: s_ssubsat_v4i8:
917; GFX10:       ; %bb.0:
918; GFX10-NEXT:    s_lshr_b32 s2, s0, 8
919; GFX10-NEXT:    s_lshr_b32 s3, s0, 16
920; GFX10-NEXT:    s_lshr_b32 s4, s0, 24
921; GFX10-NEXT:    s_lshr_b32 s5, s1, 8
922; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
923; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s3, s4
924; GFX10-NEXT:    s_lshr_b32 s6, s1, 16
925; GFX10-NEXT:    s_lshr_b32 s7, s1, 24
926; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s5
927; GFX10-NEXT:    s_lshr_b32 s4, s0, 16
928; GFX10-NEXT:    s_lshr_b32 s5, s2, 16
929; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s6, s7
930; GFX10-NEXT:    s_lshl_b32 s0, s0, 0x80008
931; GFX10-NEXT:    s_lshl_b32 s4, s4, 8
932; GFX10-NEXT:    s_lshl_b32 s2, s2, 0x80008
933; GFX10-NEXT:    s_lshl_b32 s5, s5, 8
934; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s4
935; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s2, s5
936; GFX10-NEXT:    s_lshr_b32 s4, s1, 16
937; GFX10-NEXT:    s_lshr_b32 s5, s3, 16
938; GFX10-NEXT:    s_lshl_b32 s1, s1, 0x80008
939; GFX10-NEXT:    s_lshl_b32 s4, s4, 8
940; GFX10-NEXT:    s_lshl_b32 s3, s3, 0x80008
941; GFX10-NEXT:    s_lshl_b32 s5, s5, 8
942; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s4
943; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s3, s5
944; GFX10-NEXT:    v_pk_sub_i16 v0, s0, s1 clamp
945; GFX10-NEXT:    v_pk_sub_i16 v1, s2, s3 clamp
946; GFX10-NEXT:    v_mov_b32_e32 v2, 8
947; GFX10-NEXT:    v_mov_b32_e32 v4, 24
948; GFX10-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
949; GFX10-NEXT:    v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1]
950; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
951; GFX10-NEXT:    v_and_b32_e32 v3, 0xff, v1
952; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
953; GFX10-NEXT:    v_and_or_b32 v0, 0xff, v0, v2
954; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
955; GFX10-NEXT:    v_or3_b32 v0, v0, v2, v1
956; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
957; GFX10-NEXT:    ; return to shader part epilog
958;
959; GFX11-LABEL: s_ssubsat_v4i8:
960; GFX11:       ; %bb.0:
961; GFX11-NEXT:    s_lshr_b32 s2, s0, 8
962; GFX11-NEXT:    s_lshr_b32 s3, s0, 24
963; GFX11-NEXT:    s_lshr_b32 s4, s1, 8
964; GFX11-NEXT:    s_lshr_b32 s5, s1, 24
965; GFX11-NEXT:    s_pack_ll_b32_b16 s2, s0, s2
966; GFX11-NEXT:    s_pack_hl_b32_b16 s0, s0, s3
967; GFX11-NEXT:    s_pack_ll_b32_b16 s3, s1, s4
968; GFX11-NEXT:    s_lshr_b32 s4, s2, 16
969; GFX11-NEXT:    s_pack_hl_b32_b16 s1, s1, s5
970; GFX11-NEXT:    s_lshr_b32 s5, s3, 16
971; GFX11-NEXT:    s_lshl_b32 s2, s2, 0x80008
972; GFX11-NEXT:    s_lshl_b32 s4, s4, 8
973; GFX11-NEXT:    s_lshl_b32 s3, s3, 0x80008
974; GFX11-NEXT:    s_lshl_b32 s5, s5, 8
975; GFX11-NEXT:    s_pack_ll_b32_b16 s2, s2, s4
976; GFX11-NEXT:    s_pack_ll_b32_b16 s3, s3, s5
977; GFX11-NEXT:    s_lshr_b32 s4, s0, 16
978; GFX11-NEXT:    s_lshr_b32 s5, s1, 16
979; GFX11-NEXT:    v_pk_sub_i16 v0, s2, s3 clamp
980; GFX11-NEXT:    s_lshl_b32 s0, s0, 0x80008
981; GFX11-NEXT:    s_lshl_b32 s4, s4, 8
982; GFX11-NEXT:    s_lshl_b32 s1, s1, 0x80008
983; GFX11-NEXT:    s_lshl_b32 s2, s5, 8
984; GFX11-NEXT:    s_pack_ll_b32_b16 s0, s0, s4
985; GFX11-NEXT:    s_pack_ll_b32_b16 s1, s1, s2
986; GFX11-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
987; GFX11-NEXT:    v_pk_sub_i16 v1, s0, s1 clamp
988; GFX11-NEXT:    v_bfe_u32 v2, v0, 16, 8
989; GFX11-NEXT:    v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1]
990; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
991; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v1
992; GFX11-NEXT:    v_bfe_u32 v1, v1, 16, 8
993; GFX11-NEXT:    v_and_or_b32 v0, 0xff, v0, v2
994; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
995; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
996; GFX11-NEXT:    v_or3_b32 v0, v0, v2, v1
997; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
998; GFX11-NEXT:    ; return to shader part epilog
999  %lhs = bitcast i32 %lhs.arg to <4 x i8>
1000  %rhs = bitcast i32 %rhs.arg to <4 x i8>
1001  %result = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> %lhs, <4 x i8> %rhs)
1002  %cast.result = bitcast <4 x i8> %result to i32
1003  ret i32 %cast.result
1004}
1005
; Signed saturating subtract on the odd-width type i24, default calling
; convention (operands in v0/v1, result in v0 per the checks below).
; The body is a single call to llvm.ssub.sat.i24.
; In the tahiti, gfx900 and GFX10PLUS checks both operands are shifted left by 8,
; a 32-bit saturating subtract is done (max/min clamp on tahiti, the clamp
; modifier on the newer targets), and the result is shifted back with an
; arithmetic right shift by 8.
; The fiji checks instead subtract first, sign-extend 24-bit fields with
; v_bfe_i32, and use v_cndmask_b32 to choose between the raw difference and a
; value computed from the sign-extended difference.
1006define i24 @v_ssubsat_i24(i24 %lhs, i24 %rhs) {
1007; GFX6-LABEL: v_ssubsat_i24:
1008; GFX6:       ; %bb.0:
1009; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1010; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
1011; GFX6-NEXT:    v_max_i32_e32 v2, -1, v0
1012; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
1013; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 0x80000001, v2
1014; GFX6-NEXT:    v_min_i32_e32 v3, -1, v0
1015; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 0x80000000, v3
1016; GFX6-NEXT:    v_max_i32_e32 v1, v2, v1
1017; GFX6-NEXT:    v_min_i32_e32 v1, v1, v3
1018; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
1019; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 8, v0
1020; GFX6-NEXT:    s_setpc_b64 s[30:31]
1021;
1022; GFX8-LABEL: v_ssubsat_i24:
1023; GFX8:       ; %bb.0:
1024; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1025; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v0, v1
1026; GFX8-NEXT:    v_bfe_i32 v3, v2, 0, 24
1027; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 24
1028; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v3, v0
1029; GFX8-NEXT:    v_bfe_i32 v0, v1, 0, 24
1030; GFX8-NEXT:    v_cmp_lt_i32_e64 s[6:7], 0, v0
1031; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 23, v3
1032; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0xff800000, v0
1033; GFX8-NEXT:    s_xor_b64 vcc, s[6:7], s[4:5]
1034; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
1035; GFX8-NEXT:    s_setpc_b64 s[30:31]
1036;
1037; GFX9-LABEL: v_ssubsat_i24:
1038; GFX9:       ; %bb.0:
1039; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1040; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
1041; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
1042; GFX9-NEXT:    v_sub_i32 v0, v0, v1 clamp
1043; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 8, v0
1044; GFX9-NEXT:    s_setpc_b64 s[30:31]
1045;
1046; GFX10PLUS-LABEL: v_ssubsat_i24:
1047; GFX10PLUS:       ; %bb.0:
1048; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1049; GFX10PLUS-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
1050; GFX10PLUS-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
1051; GFX10PLUS-NEXT:    v_sub_nc_i32 v0, v0, v1 clamp
1052; GFX10PLUS-NEXT:    v_ashrrev_i32_e32 v0, 8, v0
1053; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
1054  %result = call i24 @llvm.ssub.sat.i24(i24 %lhs, i24 %rhs)
1055  ret i24 %result
1056}
1057
; Uniform (inreg, amdgpu_ps) variant of the i24 signed saturating subtract:
; operands arrive in s0/s1 and the result is left in s0 per the checks below.
; The body is a single call to llvm.ssub.sat.i24.
; The tahiti checks stay entirely scalar (shift left by 8, max/min clamp, sub,
; arithmetic shift right by 8). The fiji checks use a scalar sub followed by
; s_bfe_i32 sign extraction, compares and s_cselect_b32. The gfx900 and
; GFX10PLUS checks use a vector subtract with the clamp modifier and move the
; result back to s0 with v_readfirstlane_b32.
1058define amdgpu_ps i24 @s_ssubsat_i24(i24 inreg %lhs, i24 inreg %rhs) {
1059; GFX6-LABEL: s_ssubsat_i24:
1060; GFX6:       ; %bb.0:
1061; GFX6-NEXT:    s_lshl_b32 s0, s0, 8
1062; GFX6-NEXT:    s_max_i32 s2, s0, -1
1063; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
1064; GFX6-NEXT:    s_add_i32 s2, s2, 0x80000001
1065; GFX6-NEXT:    s_min_i32 s3, s0, -1
1066; GFX6-NEXT:    s_add_i32 s3, s3, 0x80000000
1067; GFX6-NEXT:    s_max_i32 s1, s2, s1
1068; GFX6-NEXT:    s_min_i32 s1, s1, s3
1069; GFX6-NEXT:    s_sub_i32 s0, s0, s1
1070; GFX6-NEXT:    s_ashr_i32 s0, s0, 8
1071; GFX6-NEXT:    ; return to shader part epilog
1072;
1073; GFX8-LABEL: s_ssubsat_i24:
1074; GFX8:       ; %bb.0:
1075; GFX8-NEXT:    s_sub_i32 s2, s0, s1
1076; GFX8-NEXT:    s_bfe_i32 s3, s2, 0x180000
1077; GFX8-NEXT:    s_bfe_i32 s0, s0, 0x180000
1078; GFX8-NEXT:    s_cmp_lt_i32 s3, s0
1079; GFX8-NEXT:    s_cselect_b32 s0, 1, 0
1080; GFX8-NEXT:    s_bfe_i32 s1, s1, 0x180000
1081; GFX8-NEXT:    s_cmp_gt_i32 s1, 0
1082; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
1083; GFX8-NEXT:    s_xor_b32 s0, s1, s0
1084; GFX8-NEXT:    s_ashr_i32 s1, s3, 23
1085; GFX8-NEXT:    s_add_i32 s1, s1, 0xff800000
1086; GFX8-NEXT:    s_and_b32 s0, s0, 1
1087; GFX8-NEXT:    s_cmp_lg_u32 s0, 0
1088; GFX8-NEXT:    s_cselect_b32 s0, s1, s2
1089; GFX8-NEXT:    ; return to shader part epilog
1090;
1091; GFX9-LABEL: s_ssubsat_i24:
1092; GFX9:       ; %bb.0:
1093; GFX9-NEXT:    s_lshl_b32 s1, s1, 8
1094; GFX9-NEXT:    s_lshl_b32 s0, s0, 8
1095; GFX9-NEXT:    v_mov_b32_e32 v0, s1
1096; GFX9-NEXT:    v_sub_i32 v0, s0, v0 clamp
1097; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 8, v0
1098; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
1099; GFX9-NEXT:    ; return to shader part epilog
1100;
1101; GFX10PLUS-LABEL: s_ssubsat_i24:
1102; GFX10PLUS:       ; %bb.0:
1103; GFX10PLUS-NEXT:    s_lshl_b32 s0, s0, 8
1104; GFX10PLUS-NEXT:    s_lshl_b32 s1, s1, 8
1105; GFX10PLUS-NEXT:    v_sub_nc_i32 v0, s0, s1 clamp
1106; GFX10PLUS-NEXT:    v_ashrrev_i32_e32 v0, 8, v0
1107; GFX10PLUS-NEXT:    v_readfirstlane_b32 s0, v0
1108; GFX10PLUS-NEXT:    ; return to shader part epilog
1109  %result = call i24 @llvm.ssub.sat.i24(i24 %lhs, i24 %rhs)
1110  ret i24 %result
1111}
1112
; Signed saturating subtract on i32, default calling convention (operands in
; v0/v1, result in v0 per the checks below). The body is a single call to
; llvm.ssub.sat.i32.
; The tahiti and fiji checks expand the operation: rhs is clamped to the range
; [max(lhs, -1) + 0x80000001, min(lhs, -1) + 0x80000000] and then subtracted
; from lhs with an ordinary sub. The gfx900 checks expect a single v_sub_i32
; with the clamp modifier, and the GFX10PLUS checks a single v_sub_nc_i32 with
; the clamp modifier.
1113define i32 @v_ssubsat_i32(i32 %lhs, i32 %rhs) {
1114; GFX6-LABEL: v_ssubsat_i32:
1115; GFX6:       ; %bb.0:
1116; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1117; GFX6-NEXT:    v_max_i32_e32 v2, -1, v0
1118; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 0x80000001, v2
1119; GFX6-NEXT:    v_min_i32_e32 v3, -1, v0
1120; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 0x80000000, v3
1121; GFX6-NEXT:    v_max_i32_e32 v1, v2, v1
1122; GFX6-NEXT:    v_min_i32_e32 v1, v1, v3
1123; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
1124; GFX6-NEXT:    s_setpc_b64 s[30:31]
1125;
1126; GFX8-LABEL: v_ssubsat_i32:
1127; GFX8:       ; %bb.0:
1128; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1129; GFX8-NEXT:    v_max_i32_e32 v2, -1, v0
1130; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x80000001, v2
1131; GFX8-NEXT:    v_min_i32_e32 v3, -1, v0
1132; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x80000000, v3
1133; GFX8-NEXT:    v_max_i32_e32 v1, v2, v1
1134; GFX8-NEXT:    v_min_i32_e32 v1, v1, v3
1135; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v1
1136; GFX8-NEXT:    s_setpc_b64 s[30:31]
1137;
1138; GFX9-LABEL: v_ssubsat_i32:
1139; GFX9:       ; %bb.0:
1140; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1141; GFX9-NEXT:    v_sub_i32 v0, v0, v1 clamp
1142; GFX9-NEXT:    s_setpc_b64 s[30:31]
1143;
1144; GFX10PLUS-LABEL: v_ssubsat_i32:
1145; GFX10PLUS:       ; %bb.0:
1146; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1147; GFX10PLUS-NEXT:    v_sub_nc_i32 v0, v0, v1 clamp
1148; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
1149  %result = call i32 @llvm.ssub.sat.i32(i32 %lhs, i32 %rhs)
1150  ret i32 %result
1151}
1152
; Uniform (inreg, amdgpu_ps) variant of the i32 signed saturating subtract:
; operands arrive in s0/s1 and the result is left in s0 per the checks below.
; The body is a single call to llvm.ssub.sat.i32.
; The tahiti and fiji checks use a purely scalar max/min/add/sub expansion.
; The gfx900 checks copy s1 into v0, run v_sub_i32 with the clamp modifier and
; read the result back with v_readfirstlane_b32. The GFX10PLUS checks feed both
; scalar registers directly to v_sub_nc_i32 with clamp before the readfirstlane.
1153define amdgpu_ps i32 @s_ssubsat_i32(i32 inreg %lhs, i32 inreg %rhs) {
1154; GFX6-LABEL: s_ssubsat_i32:
1155; GFX6:       ; %bb.0:
1156; GFX6-NEXT:    s_max_i32 s2, s0, -1
1157; GFX6-NEXT:    s_add_i32 s2, s2, 0x80000001
1158; GFX6-NEXT:    s_min_i32 s3, s0, -1
1159; GFX6-NEXT:    s_add_i32 s3, s3, 0x80000000
1160; GFX6-NEXT:    s_max_i32 s1, s2, s1
1161; GFX6-NEXT:    s_min_i32 s1, s1, s3
1162; GFX6-NEXT:    s_sub_i32 s0, s0, s1
1163; GFX6-NEXT:    ; return to shader part epilog
1164;
1165; GFX8-LABEL: s_ssubsat_i32:
1166; GFX8:       ; %bb.0:
1167; GFX8-NEXT:    s_max_i32 s2, s0, -1
1168; GFX8-NEXT:    s_add_i32 s2, s2, 0x80000001
1169; GFX8-NEXT:    s_min_i32 s3, s0, -1
1170; GFX8-NEXT:    s_add_i32 s3, s3, 0x80000000
1171; GFX8-NEXT:    s_max_i32 s1, s2, s1
1172; GFX8-NEXT:    s_min_i32 s1, s1, s3
1173; GFX8-NEXT:    s_sub_i32 s0, s0, s1
1174; GFX8-NEXT:    ; return to shader part epilog
1175;
1176; GFX9-LABEL: s_ssubsat_i32:
1177; GFX9:       ; %bb.0:
1178; GFX9-NEXT:    v_mov_b32_e32 v0, s1
1179; GFX9-NEXT:    v_sub_i32 v0, s0, v0 clamp
1180; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
1181; GFX9-NEXT:    ; return to shader part epilog
1182;
1183; GFX10PLUS-LABEL: s_ssubsat_i32:
1184; GFX10PLUS:       ; %bb.0:
1185; GFX10PLUS-NEXT:    v_sub_nc_i32 v0, s0, s1 clamp
1186; GFX10PLUS-NEXT:    v_readfirstlane_b32 s0, v0
1187; GFX10PLUS-NEXT:    ; return to shader part epilog
1188  %result = call i32 @llvm.ssub.sat.i32(i32 %lhs, i32 %rhs)
1189  ret i32 %result
1190}
1191
; Mixed-operand i32 signed saturating subtract: lhs is inreg (s0 in the checks)
; and rhs is a regular argument (v0 in the checks). The result of
; llvm.ssub.sat.i32 is bitcast to float and returned; the checks leave it in v0
; with no readfirstlane.
; NOTE(review): the float return type is presumably there to get a VGPR return
; value from an amdgpu_ps function - confirm against the calling convention.
; The tahiti and fiji checks compute the clamp bounds from lhs with scalar
; max/min/add and then clamp and subtract with vector instructions; the gfx900
; and GFX10PLUS checks expect one vector subtract with the clamp modifier.
1192define amdgpu_ps float @ssubsat_i32_sv(i32 inreg %lhs, i32 %rhs) {
1193; GFX6-LABEL: ssubsat_i32_sv:
1194; GFX6:       ; %bb.0:
1195; GFX6-NEXT:    s_max_i32 s1, s0, -1
1196; GFX6-NEXT:    s_add_i32 s1, s1, 0x80000001
1197; GFX6-NEXT:    s_min_i32 s2, s0, -1
1198; GFX6-NEXT:    s_add_i32 s2, s2, 0x80000000
1199; GFX6-NEXT:    v_max_i32_e32 v0, s1, v0
1200; GFX6-NEXT:    v_min_i32_e32 v0, s2, v0
1201; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
1202; GFX6-NEXT:    ; return to shader part epilog
1203;
1204; GFX8-LABEL: ssubsat_i32_sv:
1205; GFX8:       ; %bb.0:
1206; GFX8-NEXT:    s_max_i32 s1, s0, -1
1207; GFX8-NEXT:    s_add_i32 s1, s1, 0x80000001
1208; GFX8-NEXT:    s_min_i32 s2, s0, -1
1209; GFX8-NEXT:    s_add_i32 s2, s2, 0x80000000
1210; GFX8-NEXT:    v_max_i32_e32 v0, s1, v0
1211; GFX8-NEXT:    v_min_i32_e32 v0, s2, v0
1212; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
1213; GFX8-NEXT:    ; return to shader part epilog
1214;
1215; GFX9-LABEL: ssubsat_i32_sv:
1216; GFX9:       ; %bb.0:
1217; GFX9-NEXT:    v_sub_i32 v0, s0, v0 clamp
1218; GFX9-NEXT:    ; return to shader part epilog
1219;
1220; GFX10PLUS-LABEL: ssubsat_i32_sv:
1221; GFX10PLUS:       ; %bb.0:
1222; GFX10PLUS-NEXT:    v_sub_nc_i32 v0, s0, v0 clamp
1223; GFX10PLUS-NEXT:    ; return to shader part epilog
1224  %result = call i32 @llvm.ssub.sat.i32(i32 %lhs, i32 %rhs)
1225  %cast = bitcast i32 %result to float
1226  ret float %cast
1227}
1228
; Mixed-operand i32 signed saturating subtract with the operand kinds swapped
; relative to ssubsat_i32_sv: lhs is a regular argument (v0 in the checks) and
; rhs is inreg (s0 in the checks). The result of llvm.ssub.sat.i32 is bitcast
; to float and returned; the checks leave it in v0 with no readfirstlane.
; NOTE(review): the float return type is presumably there to get a VGPR return
; value from an amdgpu_ps function - confirm against the calling convention.
; The tahiti and fiji checks derive the clamp bounds from lhs with vector
; max/min/add and use s0 directly as an operand of v_max_i32; the gfx900 and
; GFX10PLUS checks expect one vector subtract with the clamp modifier.
1229define amdgpu_ps float @ssubsat_i32_vs(i32 %lhs, i32 inreg %rhs) {
1230; GFX6-LABEL: ssubsat_i32_vs:
1231; GFX6:       ; %bb.0:
1232; GFX6-NEXT:    v_max_i32_e32 v1, -1, v0
1233; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 0x80000001, v1
1234; GFX6-NEXT:    v_min_i32_e32 v2, -1, v0
1235; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 0x80000000, v2
1236; GFX6-NEXT:    v_max_i32_e32 v1, s0, v1
1237; GFX6-NEXT:    v_min_i32_e32 v1, v1, v2
1238; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
1239; GFX6-NEXT:    ; return to shader part epilog
1240;
1241; GFX8-LABEL: ssubsat_i32_vs:
1242; GFX8:       ; %bb.0:
1243; GFX8-NEXT:    v_max_i32_e32 v1, -1, v0
1244; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x80000001, v1
1245; GFX8-NEXT:    v_min_i32_e32 v2, -1, v0
1246; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x80000000, v2
1247; GFX8-NEXT:    v_max_i32_e32 v1, s0, v1
1248; GFX8-NEXT:    v_min_i32_e32 v1, v1, v2
1249; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v1
1250; GFX8-NEXT:    ; return to shader part epilog
1251;
1252; GFX9-LABEL: ssubsat_i32_vs:
1253; GFX9:       ; %bb.0:
1254; GFX9-NEXT:    v_sub_i32 v0, v0, s0 clamp
1255; GFX9-NEXT:    ; return to shader part epilog
1256;
1257; GFX10PLUS-LABEL: ssubsat_i32_vs:
1258; GFX10PLUS:       ; %bb.0:
1259; GFX10PLUS-NEXT:    v_sub_nc_i32 v0, v0, s0 clamp
1260; GFX10PLUS-NEXT:    ; return to shader part epilog
1261  %result = call i32 @llvm.ssub.sat.i32(i32 %lhs, i32 %rhs)
1262  %cast = bitcast i32 %result to float
1263  ret float %cast
1264}
1265
; Signed saturating subtract on <2 x i32>, default calling convention. Per the
; checks below lhs occupies v0-v1, rhs occupies v2-v3, and the result is
; returned in v0-v1. The body is a single call to llvm.ssub.sat.v2i32.
; Every target handles the two elements independently: the tahiti and fiji
; checks repeat the scalar-width max/min/add/sub expansion once per element,
; while the gfx900 and GFX10PLUS checks expect one clamped vector subtract per
; element.
1266define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
1267; GFX6-LABEL: v_ssubsat_v2i32:
1268; GFX6:       ; %bb.0:
1269; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1270; GFX6-NEXT:    v_max_i32_e32 v4, -1, v0
1271; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x80000001, v4
1272; GFX6-NEXT:    v_min_i32_e32 v5, -1, v0
1273; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 0x80000000, v5
1274; GFX6-NEXT:    v_max_i32_e32 v2, v4, v2
1275; GFX6-NEXT:    v_min_i32_e32 v2, v2, v5
1276; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
1277; GFX6-NEXT:    v_max_i32_e32 v2, -1, v1
1278; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 0x80000001, v2
1279; GFX6-NEXT:    v_min_i32_e32 v4, -1, v1
1280; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x80000000, v4
1281; GFX6-NEXT:    v_max_i32_e32 v2, v2, v3
1282; GFX6-NEXT:    v_min_i32_e32 v2, v2, v4
1283; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
1284; GFX6-NEXT:    s_setpc_b64 s[30:31]
1285;
1286; GFX8-LABEL: v_ssubsat_v2i32:
1287; GFX8:       ; %bb.0:
1288; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1289; GFX8-NEXT:    v_max_i32_e32 v4, -1, v0
1290; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x80000001, v4
1291; GFX8-NEXT:    v_min_i32_e32 v5, -1, v0
1292; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x80000000, v5
1293; GFX8-NEXT:    v_max_i32_e32 v2, v4, v2
1294; GFX8-NEXT:    v_min_i32_e32 v2, v2, v5
1295; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v2
1296; GFX8-NEXT:    v_max_i32_e32 v2, -1, v1
1297; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x80000001, v2
1298; GFX8-NEXT:    v_min_i32_e32 v4, -1, v1
1299; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x80000000, v4
1300; GFX8-NEXT:    v_max_i32_e32 v2, v2, v3
1301; GFX8-NEXT:    v_min_i32_e32 v2, v2, v4
1302; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, v1, v2
1303; GFX8-NEXT:    s_setpc_b64 s[30:31]
1304;
1305; GFX9-LABEL: v_ssubsat_v2i32:
1306; GFX9:       ; %bb.0:
1307; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1308; GFX9-NEXT:    v_sub_i32 v0, v0, v2 clamp
1309; GFX9-NEXT:    v_sub_i32 v1, v1, v3 clamp
1310; GFX9-NEXT:    s_setpc_b64 s[30:31]
1311;
1312; GFX10PLUS-LABEL: v_ssubsat_v2i32:
1313; GFX10PLUS:       ; %bb.0:
1314; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1315; GFX10PLUS-NEXT:    v_sub_nc_i32 v0, v0, v2 clamp
1316; GFX10PLUS-NEXT:    v_sub_nc_i32 v1, v1, v3 clamp
1317; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
1318  %result = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
1319  ret <2 x i32> %result
1320}
1321
; Uniform (inreg, amdgpu_ps) variant of the <2 x i32> signed saturating
; subtract. Per the checks below lhs occupies s0-s1, rhs occupies s2-s3, and
; the result is left in s0-s1. The body is a single call to
; llvm.ssub.sat.v2i32.
; The tahiti and fiji checks repeat the scalar max/min/add/sub expansion once
; per element. The gfx900 checks copy each rhs element into a VGPR before the
; clamped v_sub_i32, whereas the GFX10PLUS checks pass both scalar registers
; straight to v_sub_nc_i32; both then use v_readfirstlane_b32 per element.
1322define amdgpu_ps <2 x i32> @s_ssubsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inreg %rhs) {
1323; GFX6-LABEL: s_ssubsat_v2i32:
1324; GFX6:       ; %bb.0:
1325; GFX6-NEXT:    s_max_i32 s4, s0, -1
1326; GFX6-NEXT:    s_add_i32 s4, s4, 0x80000001
1327; GFX6-NEXT:    s_min_i32 s5, s0, -1
1328; GFX6-NEXT:    s_add_i32 s5, s5, 0x80000000
1329; GFX6-NEXT:    s_max_i32 s2, s4, s2
1330; GFX6-NEXT:    s_min_i32 s2, s2, s5
1331; GFX6-NEXT:    s_sub_i32 s0, s0, s2
1332; GFX6-NEXT:    s_max_i32 s2, s1, -1
1333; GFX6-NEXT:    s_add_i32 s2, s2, 0x80000001
1334; GFX6-NEXT:    s_min_i32 s4, s1, -1
1335; GFX6-NEXT:    s_add_i32 s4, s4, 0x80000000
1336; GFX6-NEXT:    s_max_i32 s2, s2, s3
1337; GFX6-NEXT:    s_min_i32 s2, s2, s4
1338; GFX6-NEXT:    s_sub_i32 s1, s1, s2
1339; GFX6-NEXT:    ; return to shader part epilog
1340;
1341; GFX8-LABEL: s_ssubsat_v2i32:
1342; GFX8:       ; %bb.0:
1343; GFX8-NEXT:    s_max_i32 s4, s0, -1
1344; GFX8-NEXT:    s_add_i32 s4, s4, 0x80000001
1345; GFX8-NEXT:    s_min_i32 s5, s0, -1
1346; GFX8-NEXT:    s_add_i32 s5, s5, 0x80000000
1347; GFX8-NEXT:    s_max_i32 s2, s4, s2
1348; GFX8-NEXT:    s_min_i32 s2, s2, s5
1349; GFX8-NEXT:    s_sub_i32 s0, s0, s2
1350; GFX8-NEXT:    s_max_i32 s2, s1, -1
1351; GFX8-NEXT:    s_add_i32 s2, s2, 0x80000001
1352; GFX8-NEXT:    s_min_i32 s4, s1, -1
1353; GFX8-NEXT:    s_add_i32 s4, s4, 0x80000000
1354; GFX8-NEXT:    s_max_i32 s2, s2, s3
1355; GFX8-NEXT:    s_min_i32 s2, s2, s4
1356; GFX8-NEXT:    s_sub_i32 s1, s1, s2
1357; GFX8-NEXT:    ; return to shader part epilog
1358;
1359; GFX9-LABEL: s_ssubsat_v2i32:
1360; GFX9:       ; %bb.0:
1361; GFX9-NEXT:    v_mov_b32_e32 v0, s2
1362; GFX9-NEXT:    v_mov_b32_e32 v1, s3
1363; GFX9-NEXT:    v_sub_i32 v0, s0, v0 clamp
1364; GFX9-NEXT:    v_sub_i32 v1, s1, v1 clamp
1365; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
1366; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
1367; GFX9-NEXT:    ; return to shader part epilog
1368;
1369; GFX10PLUS-LABEL: s_ssubsat_v2i32:
1370; GFX10PLUS:       ; %bb.0:
1371; GFX10PLUS-NEXT:    v_sub_nc_i32 v0, s0, s2 clamp
1372; GFX10PLUS-NEXT:    v_sub_nc_i32 v1, s1, s3 clamp
1373; GFX10PLUS-NEXT:    v_readfirstlane_b32 s0, v0
1374; GFX10PLUS-NEXT:    v_readfirstlane_b32 s1, v1
1375; GFX10PLUS-NEXT:    ; return to shader part epilog
1376  %result = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
1377  ret <2 x i32> %result
1378}
1379
; Signed saturating subtract on <3 x i32>, default calling convention. Per the
; checks below lhs occupies v0-v2, rhs occupies v3-v5, and the result is
; returned in v0-v2. The body is a single call to llvm.ssub.sat.v3i32.
; The tahiti and fiji checks repeat the max/min/add/sub expansion once per
; element. In those sequences the two bound constants are also materialized in
; registers: v9 via v_bfrev_b32 of 1 (bit reverse, i.e. 0x80000000) and v7 via
; v_mov_b32 of 0x80000001; some adds use those registers while others use the
; literal directly. The gfx900 and GFX10PLUS checks expect one clamped vector
; subtract per element.
1380define <3 x i32> @v_ssubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
1381; GFX6-LABEL: v_ssubsat_v3i32:
1382; GFX6:       ; %bb.0:
1383; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1384; GFX6-NEXT:    v_max_i32_e32 v6, -1, v0
1385; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 0x80000001, v6
1386; GFX6-NEXT:    v_min_i32_e32 v8, -1, v0
1387; GFX6-NEXT:    v_bfrev_b32_e32 v9, 1
1388; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
1389; GFX6-NEXT:    v_max_i32_e32 v3, v6, v3
1390; GFX6-NEXT:    v_min_i32_e32 v3, v3, v8
1391; GFX6-NEXT:    v_mov_b32_e32 v7, 0x80000001
1392; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
1393; GFX6-NEXT:    v_max_i32_e32 v3, -1, v1
1394; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
1395; GFX6-NEXT:    v_min_i32_e32 v6, -1, v1
1396; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
1397; GFX6-NEXT:    v_max_i32_e32 v3, v3, v4
1398; GFX6-NEXT:    v_min_i32_e32 v3, v3, v6
1399; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
1400; GFX6-NEXT:    v_max_i32_e32 v3, -1, v2
1401; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 0x80000001, v3
1402; GFX6-NEXT:    v_min_i32_e32 v4, -1, v2
1403; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x80000000, v4
1404; GFX6-NEXT:    v_max_i32_e32 v3, v3, v5
1405; GFX6-NEXT:    v_min_i32_e32 v3, v3, v4
1406; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
1407; GFX6-NEXT:    s_setpc_b64 s[30:31]
1408;
1409; GFX8-LABEL: v_ssubsat_v3i32:
1410; GFX8:       ; %bb.0:
1411; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1412; GFX8-NEXT:    v_max_i32_e32 v6, -1, v0
1413; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0x80000001, v6
1414; GFX8-NEXT:    v_min_i32_e32 v8, -1, v0
1415; GFX8-NEXT:    v_bfrev_b32_e32 v9, 1
1416; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v9
1417; GFX8-NEXT:    v_max_i32_e32 v3, v6, v3
1418; GFX8-NEXT:    v_min_i32_e32 v3, v3, v8
1419; GFX8-NEXT:    v_mov_b32_e32 v7, 0x80000001
1420; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v3
1421; GFX8-NEXT:    v_max_i32_e32 v3, -1, v1
1422; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v7
1423; GFX8-NEXT:    v_min_i32_e32 v6, -1, v1
1424; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v9
1425; GFX8-NEXT:    v_max_i32_e32 v3, v3, v4
1426; GFX8-NEXT:    v_min_i32_e32 v3, v3, v6
1427; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, v1, v3
1428; GFX8-NEXT:    v_max_i32_e32 v3, -1, v2
1429; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x80000001, v3
1430; GFX8-NEXT:    v_min_i32_e32 v4, -1, v2
1431; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x80000000, v4
1432; GFX8-NEXT:    v_max_i32_e32 v3, v3, v5
1433; GFX8-NEXT:    v_min_i32_e32 v3, v3, v4
1434; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v2, v3
1435; GFX8-NEXT:    s_setpc_b64 s[30:31]
1436;
1437; GFX9-LABEL: v_ssubsat_v3i32:
1438; GFX9:       ; %bb.0:
1439; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1440; GFX9-NEXT:    v_sub_i32 v0, v0, v3 clamp
1441; GFX9-NEXT:    v_sub_i32 v1, v1, v4 clamp
1442; GFX9-NEXT:    v_sub_i32 v2, v2, v5 clamp
1443; GFX9-NEXT:    s_setpc_b64 s[30:31]
1444;
1445; GFX10PLUS-LABEL: v_ssubsat_v3i32:
1446; GFX10PLUS:       ; %bb.0:
1447; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1448; GFX10PLUS-NEXT:    v_sub_nc_i32 v0, v0, v3 clamp
1449; GFX10PLUS-NEXT:    v_sub_nc_i32 v1, v1, v4 clamp
1450; GFX10PLUS-NEXT:    v_sub_nc_i32 v2, v2, v5 clamp
1451; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
1452  %result = call <3 x i32> @llvm.ssub.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs)
1453  ret <3 x i32> %result
1454}
1455
; Signed saturating subtract of two <3 x i32> vectors passed `inreg` to an
; amdgpu_ps function. The checks below show lhs in s0-s2, rhs in s3-s5 and the
; result left in s0-s2.
;
; Expected selection per element, as pinned by the checks:
;   * tahiti and fiji runs: rhs is limited to the range
;     [max(lhs, -1) + 0x80000001, min(lhs, -1) + 0x80000000] with
;     s_max_i32/s_min_i32, then subtracted from lhs with s_sub_i32.
;   * gfx900 run: rhs is copied to a VGPR, v_sub_i32 with the clamp modifier
;     does the subtract, and v_readfirstlane_b32 moves the result to an SGPR.
;   * gfx1010 and gfx1100 runs: v_sub_nc_i32 with clamp reads both SGPR
;     operands directly, followed by v_readfirstlane_b32.
;
; The check lines are autogenerated (see the first line of this file);
; regenerate them with utils/update_llc_test_checks.py rather than by hand.
1456define amdgpu_ps <3 x i32> @s_ssubsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inreg %rhs) {
1457; GFX6-LABEL: s_ssubsat_v3i32:
1458; GFX6:       ; %bb.0:
1459; GFX6-NEXT:    s_max_i32 s6, s0, -1
1460; GFX6-NEXT:    s_add_i32 s6, s6, 0x80000001
1461; GFX6-NEXT:    s_min_i32 s7, s0, -1
1462; GFX6-NEXT:    s_add_i32 s7, s7, 0x80000000
1463; GFX6-NEXT:    s_max_i32 s3, s6, s3
1464; GFX6-NEXT:    s_min_i32 s3, s3, s7
1465; GFX6-NEXT:    s_sub_i32 s0, s0, s3
1466; GFX6-NEXT:    s_max_i32 s3, s1, -1
1467; GFX6-NEXT:    s_add_i32 s3, s3, 0x80000001
1468; GFX6-NEXT:    s_min_i32 s6, s1, -1
1469; GFX6-NEXT:    s_add_i32 s6, s6, 0x80000000
1470; GFX6-NEXT:    s_max_i32 s3, s3, s4
1471; GFX6-NEXT:    s_min_i32 s3, s3, s6
1472; GFX6-NEXT:    s_sub_i32 s1, s1, s3
1473; GFX6-NEXT:    s_max_i32 s3, s2, -1
1474; GFX6-NEXT:    s_add_i32 s3, s3, 0x80000001
1475; GFX6-NEXT:    s_min_i32 s4, s2, -1
1476; GFX6-NEXT:    s_add_i32 s4, s4, 0x80000000
1477; GFX6-NEXT:    s_max_i32 s3, s3, s5
1478; GFX6-NEXT:    s_min_i32 s3, s3, s4
1479; GFX6-NEXT:    s_sub_i32 s2, s2, s3
1480; GFX6-NEXT:    ; return to shader part epilog
1481;
1482; GFX8-LABEL: s_ssubsat_v3i32:
1483; GFX8:       ; %bb.0:
1484; GFX8-NEXT:    s_max_i32 s6, s0, -1
1485; GFX8-NEXT:    s_add_i32 s6, s6, 0x80000001
1486; GFX8-NEXT:    s_min_i32 s7, s0, -1
1487; GFX8-NEXT:    s_add_i32 s7, s7, 0x80000000
1488; GFX8-NEXT:    s_max_i32 s3, s6, s3
1489; GFX8-NEXT:    s_min_i32 s3, s3, s7
1490; GFX8-NEXT:    s_sub_i32 s0, s0, s3
1491; GFX8-NEXT:    s_max_i32 s3, s1, -1
1492; GFX8-NEXT:    s_add_i32 s3, s3, 0x80000001
1493; GFX8-NEXT:    s_min_i32 s6, s1, -1
1494; GFX8-NEXT:    s_add_i32 s6, s6, 0x80000000
1495; GFX8-NEXT:    s_max_i32 s3, s3, s4
1496; GFX8-NEXT:    s_min_i32 s3, s3, s6
1497; GFX8-NEXT:    s_sub_i32 s1, s1, s3
1498; GFX8-NEXT:    s_max_i32 s3, s2, -1
1499; GFX8-NEXT:    s_add_i32 s3, s3, 0x80000001
1500; GFX8-NEXT:    s_min_i32 s4, s2, -1
1501; GFX8-NEXT:    s_add_i32 s4, s4, 0x80000000
1502; GFX8-NEXT:    s_max_i32 s3, s3, s5
1503; GFX8-NEXT:    s_min_i32 s3, s3, s4
1504; GFX8-NEXT:    s_sub_i32 s2, s2, s3
1505; GFX8-NEXT:    ; return to shader part epilog
1506;
1507; GFX9-LABEL: s_ssubsat_v3i32:
1508; GFX9:       ; %bb.0:
1509; GFX9-NEXT:    v_mov_b32_e32 v0, s3
1510; GFX9-NEXT:    v_mov_b32_e32 v1, s4
1511; GFX9-NEXT:    v_mov_b32_e32 v2, s5
1512; GFX9-NEXT:    v_sub_i32 v0, s0, v0 clamp
1513; GFX9-NEXT:    v_sub_i32 v1, s1, v1 clamp
1514; GFX9-NEXT:    v_sub_i32 v2, s2, v2 clamp
1515; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
1516; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
1517; GFX9-NEXT:    v_readfirstlane_b32 s2, v2
1518; GFX9-NEXT:    ; return to shader part epilog
1519;
1520; GFX10PLUS-LABEL: s_ssubsat_v3i32:
1521; GFX10PLUS:       ; %bb.0:
1522; GFX10PLUS-NEXT:    v_sub_nc_i32 v0, s0, s3 clamp
1523; GFX10PLUS-NEXT:    v_sub_nc_i32 v1, s1, s4 clamp
1524; GFX10PLUS-NEXT:    v_sub_nc_i32 v2, s2, s5 clamp
1525; GFX10PLUS-NEXT:    v_readfirstlane_b32 s0, v0
1526; GFX10PLUS-NEXT:    v_readfirstlane_b32 s1, v1
1527; GFX10PLUS-NEXT:    v_readfirstlane_b32 s2, v2
1528; GFX10PLUS-NEXT:    ; return to shader part epilog
1529  %result = call <3 x i32> @llvm.ssub.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs)
1530  ret <3 x i32> %result
1531}
1532
; Signed saturating subtract of two <4 x i32> vectors in a function with the
; default calling convention. The checks below show lhs in v0-v3, rhs in v4-v7
; and the result left in v0-v3, returning through s_setpc_b64 s[30:31].
;
; Expected selection per element, as pinned by the checks:
;   * tahiti and fiji runs: rhs is limited to the range
;     [max(lhs, -1) + 0x80000001, min(lhs, -1) + 0x80000000] with
;     v_max_i32/v_min_i32 and then subtracted from lhs. Some adds take the
;     constant as a literal, others reuse a VGPR holding it (v9 from
;     v_mov_b32 of 0x80000001, v11 from v_bfrev_b32 of 1). The two runs differ
;     only in the add/sub mnemonics (v_add_i32/v_sub_i32 on tahiti,
;     v_add_u32/v_sub_u32 on fiji).
;   * gfx900 run: one v_sub_i32 with the clamp modifier per element.
;   * gfx1010 and gfx1100 runs: one v_sub_nc_i32 with clamp per element.
;
; The check lines are autogenerated (see the first line of this file);
; regenerate them with utils/update_llc_test_checks.py rather than by hand.
1533define <4 x i32> @v_ssubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
1534; GFX6-LABEL: v_ssubsat_v4i32:
1535; GFX6:       ; %bb.0:
1536; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1537; GFX6-NEXT:    v_max_i32_e32 v8, -1, v0
1538; GFX6-NEXT:    v_add_i32_e32 v8, vcc, 0x80000001, v8
1539; GFX6-NEXT:    v_min_i32_e32 v10, -1, v0
1540; GFX6-NEXT:    v_bfrev_b32_e32 v11, 1
1541; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
1542; GFX6-NEXT:    v_max_i32_e32 v4, v8, v4
1543; GFX6-NEXT:    v_min_i32_e32 v4, v4, v10
1544; GFX6-NEXT:    v_mov_b32_e32 v9, 0x80000001
1545; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
1546; GFX6-NEXT:    v_max_i32_e32 v4, -1, v1
1547; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
1548; GFX6-NEXT:    v_min_i32_e32 v8, -1, v1
1549; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
1550; GFX6-NEXT:    v_max_i32_e32 v4, v4, v5
1551; GFX6-NEXT:    v_min_i32_e32 v4, v4, v8
1552; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v4
1553; GFX6-NEXT:    v_max_i32_e32 v4, -1, v2
1554; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
1555; GFX6-NEXT:    v_min_i32_e32 v5, -1, v2
1556; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v11
1557; GFX6-NEXT:    v_max_i32_e32 v4, v4, v6
1558; GFX6-NEXT:    v_min_i32_e32 v4, v4, v5
1559; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v4
1560; GFX6-NEXT:    v_max_i32_e32 v4, -1, v3
1561; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x80000001, v4
1562; GFX6-NEXT:    v_min_i32_e32 v5, -1, v3
1563; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 0x80000000, v5
1564; GFX6-NEXT:    v_max_i32_e32 v4, v4, v7
1565; GFX6-NEXT:    v_min_i32_e32 v4, v4, v5
1566; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v4
1567; GFX6-NEXT:    s_setpc_b64 s[30:31]
1568;
1569; GFX8-LABEL: v_ssubsat_v4i32:
1570; GFX8:       ; %bb.0:
1571; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1572; GFX8-NEXT:    v_max_i32_e32 v8, -1, v0
1573; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0x80000001, v8
1574; GFX8-NEXT:    v_min_i32_e32 v10, -1, v0
1575; GFX8-NEXT:    v_bfrev_b32_e32 v11, 1
1576; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v10, v11
1577; GFX8-NEXT:    v_max_i32_e32 v4, v8, v4
1578; GFX8-NEXT:    v_min_i32_e32 v4, v4, v10
1579; GFX8-NEXT:    v_mov_b32_e32 v9, 0x80000001
1580; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v4
1581; GFX8-NEXT:    v_max_i32_e32 v4, -1, v1
1582; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v9
1583; GFX8-NEXT:    v_min_i32_e32 v8, -1, v1
1584; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v11
1585; GFX8-NEXT:    v_max_i32_e32 v4, v4, v5
1586; GFX8-NEXT:    v_min_i32_e32 v4, v4, v8
1587; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, v1, v4
1588; GFX8-NEXT:    v_max_i32_e32 v4, -1, v2
1589; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v9
1590; GFX8-NEXT:    v_min_i32_e32 v5, -1, v2
1591; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v11
1592; GFX8-NEXT:    v_max_i32_e32 v4, v4, v6
1593; GFX8-NEXT:    v_min_i32_e32 v4, v4, v5
1594; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v2, v4
1595; GFX8-NEXT:    v_max_i32_e32 v4, -1, v3
1596; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x80000001, v4
1597; GFX8-NEXT:    v_min_i32_e32 v5, -1, v3
1598; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x80000000, v5
1599; GFX8-NEXT:    v_max_i32_e32 v4, v4, v7
1600; GFX8-NEXT:    v_min_i32_e32 v4, v4, v5
1601; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, v3, v4
1602; GFX8-NEXT:    s_setpc_b64 s[30:31]
1603;
1604; GFX9-LABEL: v_ssubsat_v4i32:
1605; GFX9:       ; %bb.0:
1606; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1607; GFX9-NEXT:    v_sub_i32 v0, v0, v4 clamp
1608; GFX9-NEXT:    v_sub_i32 v1, v1, v5 clamp
1609; GFX9-NEXT:    v_sub_i32 v2, v2, v6 clamp
1610; GFX9-NEXT:    v_sub_i32 v3, v3, v7 clamp
1611; GFX9-NEXT:    s_setpc_b64 s[30:31]
1612;
1613; GFX10PLUS-LABEL: v_ssubsat_v4i32:
1614; GFX10PLUS:       ; %bb.0:
1615; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1616; GFX10PLUS-NEXT:    v_sub_nc_i32 v0, v0, v4 clamp
1617; GFX10PLUS-NEXT:    v_sub_nc_i32 v1, v1, v5 clamp
1618; GFX10PLUS-NEXT:    v_sub_nc_i32 v2, v2, v6 clamp
1619; GFX10PLUS-NEXT:    v_sub_nc_i32 v3, v3, v7 clamp
1620; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
1621  %result = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
1622  ret <4 x i32> %result
1623}
1624
; Signed saturating subtract of two <4 x i32> vectors passed `inreg` to an
; amdgpu_ps function. The checks below show lhs in s0-s3, rhs in s4-s7 and the
; result left in s0-s3.
;
; Expected selection per element, as pinned by the checks:
;   * tahiti and fiji runs: rhs is limited to the range
;     [max(lhs, -1) + 0x80000001, min(lhs, -1) + 0x80000000] with
;     s_max_i32/s_min_i32, then subtracted from lhs with s_sub_i32.
;   * gfx900 run: rhs is copied to a VGPR, v_sub_i32 with the clamp modifier
;     does the subtract, and v_readfirstlane_b32 moves the result to an SGPR.
;   * gfx1010 and gfx1100 runs: v_sub_nc_i32 with clamp reads both SGPR
;     operands directly, followed by v_readfirstlane_b32.
;
; The check lines are autogenerated (see the first line of this file);
; regenerate them with utils/update_llc_test_checks.py rather than by hand.
1625define amdgpu_ps <4 x i32> @s_ssubsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inreg %rhs) {
1626; GFX6-LABEL: s_ssubsat_v4i32:
1627; GFX6:       ; %bb.0:
1628; GFX6-NEXT:    s_max_i32 s8, s0, -1
1629; GFX6-NEXT:    s_add_i32 s8, s8, 0x80000001
1630; GFX6-NEXT:    s_min_i32 s9, s0, -1
1631; GFX6-NEXT:    s_add_i32 s9, s9, 0x80000000
1632; GFX6-NEXT:    s_max_i32 s4, s8, s4
1633; GFX6-NEXT:    s_min_i32 s4, s4, s9
1634; GFX6-NEXT:    s_sub_i32 s0, s0, s4
1635; GFX6-NEXT:    s_max_i32 s4, s1, -1
1636; GFX6-NEXT:    s_add_i32 s4, s4, 0x80000001
1637; GFX6-NEXT:    s_min_i32 s8, s1, -1
1638; GFX6-NEXT:    s_add_i32 s8, s8, 0x80000000
1639; GFX6-NEXT:    s_max_i32 s4, s4, s5
1640; GFX6-NEXT:    s_min_i32 s4, s4, s8
1641; GFX6-NEXT:    s_sub_i32 s1, s1, s4
1642; GFX6-NEXT:    s_max_i32 s4, s2, -1
1643; GFX6-NEXT:    s_add_i32 s4, s4, 0x80000001
1644; GFX6-NEXT:    s_min_i32 s5, s2, -1
1645; GFX6-NEXT:    s_add_i32 s5, s5, 0x80000000
1646; GFX6-NEXT:    s_max_i32 s4, s4, s6
1647; GFX6-NEXT:    s_min_i32 s4, s4, s5
1648; GFX6-NEXT:    s_sub_i32 s2, s2, s4
1649; GFX6-NEXT:    s_max_i32 s4, s3, -1
1650; GFX6-NEXT:    s_add_i32 s4, s4, 0x80000001
1651; GFX6-NEXT:    s_min_i32 s5, s3, -1
1652; GFX6-NEXT:    s_add_i32 s5, s5, 0x80000000
1653; GFX6-NEXT:    s_max_i32 s4, s4, s7
1654; GFX6-NEXT:    s_min_i32 s4, s4, s5
1655; GFX6-NEXT:    s_sub_i32 s3, s3, s4
1656; GFX6-NEXT:    ; return to shader part epilog
1657;
1658; GFX8-LABEL: s_ssubsat_v4i32:
1659; GFX8:       ; %bb.0:
1660; GFX8-NEXT:    s_max_i32 s8, s0, -1
1661; GFX8-NEXT:    s_add_i32 s8, s8, 0x80000001
1662; GFX8-NEXT:    s_min_i32 s9, s0, -1
1663; GFX8-NEXT:    s_add_i32 s9, s9, 0x80000000
1664; GFX8-NEXT:    s_max_i32 s4, s8, s4
1665; GFX8-NEXT:    s_min_i32 s4, s4, s9
1666; GFX8-NEXT:    s_sub_i32 s0, s0, s4
1667; GFX8-NEXT:    s_max_i32 s4, s1, -1
1668; GFX8-NEXT:    s_add_i32 s4, s4, 0x80000001
1669; GFX8-NEXT:    s_min_i32 s8, s1, -1
1670; GFX8-NEXT:    s_add_i32 s8, s8, 0x80000000
1671; GFX8-NEXT:    s_max_i32 s4, s4, s5
1672; GFX8-NEXT:    s_min_i32 s4, s4, s8
1673; GFX8-NEXT:    s_sub_i32 s1, s1, s4
1674; GFX8-NEXT:    s_max_i32 s4, s2, -1
1675; GFX8-NEXT:    s_add_i32 s4, s4, 0x80000001
1676; GFX8-NEXT:    s_min_i32 s5, s2, -1
1677; GFX8-NEXT:    s_add_i32 s5, s5, 0x80000000
1678; GFX8-NEXT:    s_max_i32 s4, s4, s6
1679; GFX8-NEXT:    s_min_i32 s4, s4, s5
1680; GFX8-NEXT:    s_sub_i32 s2, s2, s4
1681; GFX8-NEXT:    s_max_i32 s4, s3, -1
1682; GFX8-NEXT:    s_add_i32 s4, s4, 0x80000001
1683; GFX8-NEXT:    s_min_i32 s5, s3, -1
1684; GFX8-NEXT:    s_add_i32 s5, s5, 0x80000000
1685; GFX8-NEXT:    s_max_i32 s4, s4, s7
1686; GFX8-NEXT:    s_min_i32 s4, s4, s5
1687; GFX8-NEXT:    s_sub_i32 s3, s3, s4
1688; GFX8-NEXT:    ; return to shader part epilog
1689;
1690; GFX9-LABEL: s_ssubsat_v4i32:
1691; GFX9:       ; %bb.0:
1692; GFX9-NEXT:    v_mov_b32_e32 v0, s4
1693; GFX9-NEXT:    v_mov_b32_e32 v1, s5
1694; GFX9-NEXT:    v_mov_b32_e32 v2, s6
1695; GFX9-NEXT:    v_mov_b32_e32 v3, s7
1696; GFX9-NEXT:    v_sub_i32 v0, s0, v0 clamp
1697; GFX9-NEXT:    v_sub_i32 v1, s1, v1 clamp
1698; GFX9-NEXT:    v_sub_i32 v2, s2, v2 clamp
1699; GFX9-NEXT:    v_sub_i32 v3, s3, v3 clamp
1700; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
1701; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
1702; GFX9-NEXT:    v_readfirstlane_b32 s2, v2
1703; GFX9-NEXT:    v_readfirstlane_b32 s3, v3
1704; GFX9-NEXT:    ; return to shader part epilog
1705;
1706; GFX10PLUS-LABEL: s_ssubsat_v4i32:
1707; GFX10PLUS:       ; %bb.0:
1708; GFX10PLUS-NEXT:    v_sub_nc_i32 v0, s0, s4 clamp
1709; GFX10PLUS-NEXT:    v_sub_nc_i32 v1, s1, s5 clamp
1710; GFX10PLUS-NEXT:    v_sub_nc_i32 v2, s2, s6 clamp
1711; GFX10PLUS-NEXT:    v_sub_nc_i32 v3, s3, s7 clamp
1712; GFX10PLUS-NEXT:    v_readfirstlane_b32 s0, v0
1713; GFX10PLUS-NEXT:    v_readfirstlane_b32 s1, v1
1714; GFX10PLUS-NEXT:    v_readfirstlane_b32 s2, v2
1715; GFX10PLUS-NEXT:    v_readfirstlane_b32 s3, v3
1716; GFX10PLUS-NEXT:    ; return to shader part epilog
1717  %result = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
1718  ret <4 x i32> %result
1719}
1720
; Signed saturating subtract of two <5 x i32> vectors in a function with the
; default calling convention. The checks below show lhs in v0-v4, rhs in v5-v9
; and the result left in v0-v4, returning through s_setpc_b64 s[30:31].
;
; Expected selection per element, as pinned by the checks:
;   * tahiti and fiji runs: rhs is limited to the range
;     [max(lhs, -1) + 0x80000001, min(lhs, -1) + 0x80000000] with
;     v_max_i32/v_min_i32 and then subtracted from lhs. Some adds take the
;     constant as a literal, others reuse a VGPR holding it (v11 from
;     v_mov_b32 of 0x80000001, v13 from v_bfrev_b32 of 1). The two runs differ
;     only in the add/sub mnemonics (v_add_i32/v_sub_i32 on tahiti,
;     v_add_u32/v_sub_u32 on fiji).
;   * gfx900 run: one v_sub_i32 with the clamp modifier per element.
;   * gfx1010 and gfx1100 runs: one v_sub_nc_i32 with clamp per element.
;
; The check lines are autogenerated (see the first line of this file);
; regenerate them with utils/update_llc_test_checks.py rather than by hand.
1721define <5 x i32> @v_ssubsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
1722; GFX6-LABEL: v_ssubsat_v5i32:
1723; GFX6:       ; %bb.0:
1724; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1725; GFX6-NEXT:    v_max_i32_e32 v10, -1, v0
1726; GFX6-NEXT:    v_add_i32_e32 v10, vcc, 0x80000001, v10
1727; GFX6-NEXT:    v_min_i32_e32 v12, -1, v0
1728; GFX6-NEXT:    v_bfrev_b32_e32 v13, 1
1729; GFX6-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
1730; GFX6-NEXT:    v_max_i32_e32 v5, v10, v5
1731; GFX6-NEXT:    v_min_i32_e32 v5, v5, v12
1732; GFX6-NEXT:    v_mov_b32_e32 v11, 0x80000001
1733; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
1734; GFX6-NEXT:    v_max_i32_e32 v5, -1, v1
1735; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v11
1736; GFX6-NEXT:    v_min_i32_e32 v10, -1, v1
1737; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
1738; GFX6-NEXT:    v_max_i32_e32 v5, v5, v6
1739; GFX6-NEXT:    v_min_i32_e32 v5, v5, v10
1740; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v5
1741; GFX6-NEXT:    v_max_i32_e32 v5, -1, v2
1742; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v11
1743; GFX6-NEXT:    v_min_i32_e32 v6, -1, v2
1744; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v13
1745; GFX6-NEXT:    v_max_i32_e32 v5, v5, v7
1746; GFX6-NEXT:    v_min_i32_e32 v5, v5, v6
1747; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v5
1748; GFX6-NEXT:    v_max_i32_e32 v5, -1, v3
1749; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v11
1750; GFX6-NEXT:    v_min_i32_e32 v6, -1, v3
1751; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v13
1752; GFX6-NEXT:    v_max_i32_e32 v5, v5, v8
1753; GFX6-NEXT:    v_min_i32_e32 v5, v5, v6
1754; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v5
1755; GFX6-NEXT:    v_max_i32_e32 v5, -1, v4
1756; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 0x80000001, v5
1757; GFX6-NEXT:    v_min_i32_e32 v6, -1, v4
1758; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 0x80000000, v6
1759; GFX6-NEXT:    v_max_i32_e32 v5, v5, v9
1760; GFX6-NEXT:    v_min_i32_e32 v5, v5, v6
1761; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v4, v5
1762; GFX6-NEXT:    s_setpc_b64 s[30:31]
1763;
1764; GFX8-LABEL: v_ssubsat_v5i32:
1765; GFX8:       ; %bb.0:
1766; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1767; GFX8-NEXT:    v_max_i32_e32 v10, -1, v0
1768; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 0x80000001, v10
1769; GFX8-NEXT:    v_min_i32_e32 v12, -1, v0
1770; GFX8-NEXT:    v_bfrev_b32_e32 v13, 1
1771; GFX8-NEXT:    v_add_u32_e32 v12, vcc, v12, v13
1772; GFX8-NEXT:    v_max_i32_e32 v5, v10, v5
1773; GFX8-NEXT:    v_min_i32_e32 v5, v5, v12
1774; GFX8-NEXT:    v_mov_b32_e32 v11, 0x80000001
1775; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v5
1776; GFX8-NEXT:    v_max_i32_e32 v5, -1, v1
1777; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v11
1778; GFX8-NEXT:    v_min_i32_e32 v10, -1, v1
1779; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v10, v13
1780; GFX8-NEXT:    v_max_i32_e32 v5, v5, v6
1781; GFX8-NEXT:    v_min_i32_e32 v5, v5, v10
1782; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, v1, v5
1783; GFX8-NEXT:    v_max_i32_e32 v5, -1, v2
1784; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v11
1785; GFX8-NEXT:    v_min_i32_e32 v6, -1, v2
1786; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v13
1787; GFX8-NEXT:    v_max_i32_e32 v5, v5, v7
1788; GFX8-NEXT:    v_min_i32_e32 v5, v5, v6
1789; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v2, v5
1790; GFX8-NEXT:    v_max_i32_e32 v5, -1, v3
1791; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v11
1792; GFX8-NEXT:    v_min_i32_e32 v6, -1, v3
1793; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v13
1794; GFX8-NEXT:    v_max_i32_e32 v5, v5, v8
1795; GFX8-NEXT:    v_min_i32_e32 v5, v5, v6
1796; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, v3, v5
1797; GFX8-NEXT:    v_max_i32_e32 v5, -1, v4
1798; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x80000001, v5
1799; GFX8-NEXT:    v_min_i32_e32 v6, -1, v4
1800; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0x80000000, v6
1801; GFX8-NEXT:    v_max_i32_e32 v5, v5, v9
1802; GFX8-NEXT:    v_min_i32_e32 v5, v5, v6
1803; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, v4, v5
1804; GFX8-NEXT:    s_setpc_b64 s[30:31]
1805;
1806; GFX9-LABEL: v_ssubsat_v5i32:
1807; GFX9:       ; %bb.0:
1808; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1809; GFX9-NEXT:    v_sub_i32 v0, v0, v5 clamp
1810; GFX9-NEXT:    v_sub_i32 v1, v1, v6 clamp
1811; GFX9-NEXT:    v_sub_i32 v2, v2, v7 clamp
1812; GFX9-NEXT:    v_sub_i32 v3, v3, v8 clamp
1813; GFX9-NEXT:    v_sub_i32 v4, v4, v9 clamp
1814; GFX9-NEXT:    s_setpc_b64 s[30:31]
1815;
1816; GFX10PLUS-LABEL: v_ssubsat_v5i32:
1817; GFX10PLUS:       ; %bb.0:
1818; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1819; GFX10PLUS-NEXT:    v_sub_nc_i32 v0, v0, v5 clamp
1820; GFX10PLUS-NEXT:    v_sub_nc_i32 v1, v1, v6 clamp
1821; GFX10PLUS-NEXT:    v_sub_nc_i32 v2, v2, v7 clamp
1822; GFX10PLUS-NEXT:    v_sub_nc_i32 v3, v3, v8 clamp
1823; GFX10PLUS-NEXT:    v_sub_nc_i32 v4, v4, v9 clamp
1824; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
1825  %result = call <5 x i32> @llvm.ssub.sat.v5i32(<5 x i32> %lhs, <5 x i32> %rhs)
1826  ret <5 x i32> %result
1827}
1828
; Signed saturating subtract of two <5 x i32> vectors passed `inreg` to an
; amdgpu_ps function. The checks below show lhs in s0-s4, rhs in s5-s9 and the
; result left in s0-s4.
;
; Expected selection per element, as pinned by the checks:
;   * tahiti and fiji runs: rhs is limited to the range
;     [max(lhs, -1) + 0x80000001, min(lhs, -1) + 0x80000000] with
;     s_max_i32/s_min_i32, then subtracted from lhs with s_sub_i32.
;   * gfx900 run: rhs is copied to a VGPR, v_sub_i32 with the clamp modifier
;     does the subtract, and v_readfirstlane_b32 moves the result to an SGPR.
;   * gfx1010 and gfx1100 runs: v_sub_nc_i32 with clamp reads both SGPR
;     operands directly, followed by v_readfirstlane_b32.
;
; The check lines are autogenerated (see the first line of this file);
; regenerate them with utils/update_llc_test_checks.py rather than by hand.
1829define amdgpu_ps <5 x i32> @s_ssubsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inreg %rhs) {
1830; GFX6-LABEL: s_ssubsat_v5i32:
1831; GFX6:       ; %bb.0:
1832; GFX6-NEXT:    s_max_i32 s10, s0, -1
1833; GFX6-NEXT:    s_add_i32 s10, s10, 0x80000001
1834; GFX6-NEXT:    s_min_i32 s11, s0, -1
1835; GFX6-NEXT:    s_add_i32 s11, s11, 0x80000000
1836; GFX6-NEXT:    s_max_i32 s5, s10, s5
1837; GFX6-NEXT:    s_min_i32 s5, s5, s11
1838; GFX6-NEXT:    s_sub_i32 s0, s0, s5
1839; GFX6-NEXT:    s_max_i32 s5, s1, -1
1840; GFX6-NEXT:    s_add_i32 s5, s5, 0x80000001
1841; GFX6-NEXT:    s_min_i32 s10, s1, -1
1842; GFX6-NEXT:    s_add_i32 s10, s10, 0x80000000
1843; GFX6-NEXT:    s_max_i32 s5, s5, s6
1844; GFX6-NEXT:    s_min_i32 s5, s5, s10
1845; GFX6-NEXT:    s_sub_i32 s1, s1, s5
1846; GFX6-NEXT:    s_max_i32 s5, s2, -1
1847; GFX6-NEXT:    s_add_i32 s5, s5, 0x80000001
1848; GFX6-NEXT:    s_min_i32 s6, s2, -1
1849; GFX6-NEXT:    s_add_i32 s6, s6, 0x80000000
1850; GFX6-NEXT:    s_max_i32 s5, s5, s7
1851; GFX6-NEXT:    s_min_i32 s5, s5, s6
1852; GFX6-NEXT:    s_sub_i32 s2, s2, s5
1853; GFX6-NEXT:    s_max_i32 s5, s3, -1
1854; GFX6-NEXT:    s_add_i32 s5, s5, 0x80000001
1855; GFX6-NEXT:    s_min_i32 s6, s3, -1
1856; GFX6-NEXT:    s_add_i32 s6, s6, 0x80000000
1857; GFX6-NEXT:    s_max_i32 s5, s5, s8
1858; GFX6-NEXT:    s_min_i32 s5, s5, s6
1859; GFX6-NEXT:    s_sub_i32 s3, s3, s5
1860; GFX6-NEXT:    s_max_i32 s5, s4, -1
1861; GFX6-NEXT:    s_add_i32 s5, s5, 0x80000001
1862; GFX6-NEXT:    s_min_i32 s6, s4, -1
1863; GFX6-NEXT:    s_add_i32 s6, s6, 0x80000000
1864; GFX6-NEXT:    s_max_i32 s5, s5, s9
1865; GFX6-NEXT:    s_min_i32 s5, s5, s6
1866; GFX6-NEXT:    s_sub_i32 s4, s4, s5
1867; GFX6-NEXT:    ; return to shader part epilog
1868;
1869; GFX8-LABEL: s_ssubsat_v5i32:
1870; GFX8:       ; %bb.0:
1871; GFX8-NEXT:    s_max_i32 s10, s0, -1
1872; GFX8-NEXT:    s_add_i32 s10, s10, 0x80000001
1873; GFX8-NEXT:    s_min_i32 s11, s0, -1
1874; GFX8-NEXT:    s_add_i32 s11, s11, 0x80000000
1875; GFX8-NEXT:    s_max_i32 s5, s10, s5
1876; GFX8-NEXT:    s_min_i32 s5, s5, s11
1877; GFX8-NEXT:    s_sub_i32 s0, s0, s5
1878; GFX8-NEXT:    s_max_i32 s5, s1, -1
1879; GFX8-NEXT:    s_add_i32 s5, s5, 0x80000001
1880; GFX8-NEXT:    s_min_i32 s10, s1, -1
1881; GFX8-NEXT:    s_add_i32 s10, s10, 0x80000000
1882; GFX8-NEXT:    s_max_i32 s5, s5, s6
1883; GFX8-NEXT:    s_min_i32 s5, s5, s10
1884; GFX8-NEXT:    s_sub_i32 s1, s1, s5
1885; GFX8-NEXT:    s_max_i32 s5, s2, -1
1886; GFX8-NEXT:    s_add_i32 s5, s5, 0x80000001
1887; GFX8-NEXT:    s_min_i32 s6, s2, -1
1888; GFX8-NEXT:    s_add_i32 s6, s6, 0x80000000
1889; GFX8-NEXT:    s_max_i32 s5, s5, s7
1890; GFX8-NEXT:    s_min_i32 s5, s5, s6
1891; GFX8-NEXT:    s_sub_i32 s2, s2, s5
1892; GFX8-NEXT:    s_max_i32 s5, s3, -1
1893; GFX8-NEXT:    s_add_i32 s5, s5, 0x80000001
1894; GFX8-NEXT:    s_min_i32 s6, s3, -1
1895; GFX8-NEXT:    s_add_i32 s6, s6, 0x80000000
1896; GFX8-NEXT:    s_max_i32 s5, s5, s8
1897; GFX8-NEXT:    s_min_i32 s5, s5, s6
1898; GFX8-NEXT:    s_sub_i32 s3, s3, s5
1899; GFX8-NEXT:    s_max_i32 s5, s4, -1
1900; GFX8-NEXT:    s_add_i32 s5, s5, 0x80000001
1901; GFX8-NEXT:    s_min_i32 s6, s4, -1
1902; GFX8-NEXT:    s_add_i32 s6, s6, 0x80000000
1903; GFX8-NEXT:    s_max_i32 s5, s5, s9
1904; GFX8-NEXT:    s_min_i32 s5, s5, s6
1905; GFX8-NEXT:    s_sub_i32 s4, s4, s5
1906; GFX8-NEXT:    ; return to shader part epilog
1907;
1908; GFX9-LABEL: s_ssubsat_v5i32:
1909; GFX9:       ; %bb.0:
1910; GFX9-NEXT:    v_mov_b32_e32 v0, s5
1911; GFX9-NEXT:    v_mov_b32_e32 v1, s6
1912; GFX9-NEXT:    v_mov_b32_e32 v2, s7
1913; GFX9-NEXT:    v_mov_b32_e32 v3, s8
1914; GFX9-NEXT:    v_mov_b32_e32 v4, s9
1915; GFX9-NEXT:    v_sub_i32 v0, s0, v0 clamp
1916; GFX9-NEXT:    v_sub_i32 v1, s1, v1 clamp
1917; GFX9-NEXT:    v_sub_i32 v2, s2, v2 clamp
1918; GFX9-NEXT:    v_sub_i32 v3, s3, v3 clamp
1919; GFX9-NEXT:    v_sub_i32 v4, s4, v4 clamp
1920; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
1921; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
1922; GFX9-NEXT:    v_readfirstlane_b32 s2, v2
1923; GFX9-NEXT:    v_readfirstlane_b32 s3, v3
1924; GFX9-NEXT:    v_readfirstlane_b32 s4, v4
1925; GFX9-NEXT:    ; return to shader part epilog
1926;
1927; GFX10PLUS-LABEL: s_ssubsat_v5i32:
1928; GFX10PLUS:       ; %bb.0:
1929; GFX10PLUS-NEXT:    v_sub_nc_i32 v0, s0, s5 clamp
1930; GFX10PLUS-NEXT:    v_sub_nc_i32 v1, s1, s6 clamp
1931; GFX10PLUS-NEXT:    v_sub_nc_i32 v2, s2, s7 clamp
1932; GFX10PLUS-NEXT:    v_sub_nc_i32 v3, s3, s8 clamp
1933; GFX10PLUS-NEXT:    v_sub_nc_i32 v4, s4, s9 clamp
1934; GFX10PLUS-NEXT:    v_readfirstlane_b32 s0, v0
1935; GFX10PLUS-NEXT:    v_readfirstlane_b32 s1, v1
1936; GFX10PLUS-NEXT:    v_readfirstlane_b32 s2, v2
1937; GFX10PLUS-NEXT:    v_readfirstlane_b32 s3, v3
1938; GFX10PLUS-NEXT:    v_readfirstlane_b32 s4, v4
1939; GFX10PLUS-NEXT:    ; return to shader part epilog
1940  %result = call <5 x i32> @llvm.ssub.sat.v5i32(<5 x i32> %lhs, <5 x i32> %rhs)
1941  ret <5 x i32> %result
1942}
1943
1944define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
1945; GFX6-LABEL: v_ssubsat_v16i32:
1946; GFX6:       ; %bb.0:
1947; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1948; GFX6-NEXT:    v_max_i32_e32 v32, -1, v0
1949; GFX6-NEXT:    v_mov_b32_e32 v31, 0x80000001
1950; GFX6-NEXT:    v_add_i32_e32 v32, vcc, v32, v31
1951; GFX6-NEXT:    v_max_i32_e32 v32, v32, v16
1952; GFX6-NEXT:    v_min_i32_e32 v33, -1, v0
1953; GFX6-NEXT:    v_bfrev_b32_e32 v16, 1
1954; GFX6-NEXT:    v_add_i32_e32 v33, vcc, v33, v16
1955; GFX6-NEXT:    v_min_i32_e32 v32, v32, v33
1956; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v32
1957; GFX6-NEXT:    v_max_i32_e32 v32, -1, v1
1958; GFX6-NEXT:    v_add_i32_e32 v32, vcc, v32, v31
1959; GFX6-NEXT:    v_max_i32_e32 v17, v32, v17
1960; GFX6-NEXT:    v_min_i32_e32 v32, -1, v1
1961; GFX6-NEXT:    v_add_i32_e32 v32, vcc, v32, v16
1962; GFX6-NEXT:    v_min_i32_e32 v17, v17, v32
1963; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v17
1964; GFX6-NEXT:    v_max_i32_e32 v17, -1, v2
1965; GFX6-NEXT:    v_add_i32_e32 v17, vcc, v17, v31
1966; GFX6-NEXT:    v_max_i32_e32 v17, v17, v18
1967; GFX6-NEXT:    v_min_i32_e32 v18, -1, v2
1968; GFX6-NEXT:    v_add_i32_e32 v18, vcc, v18, v16
1969; GFX6-NEXT:    v_min_i32_e32 v17, v17, v18
1970; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v17
1971; GFX6-NEXT:    v_max_i32_e32 v17, -1, v3
1972; GFX6-NEXT:    v_add_i32_e32 v17, vcc, v17, v31
1973; GFX6-NEXT:    v_max_i32_e32 v17, v17, v19
1974; GFX6-NEXT:    buffer_load_dword v19, off, s[0:3], s32
1975; GFX6-NEXT:    v_min_i32_e32 v18, -1, v3
1976; GFX6-NEXT:    v_add_i32_e32 v18, vcc, v18, v16
1977; GFX6-NEXT:    v_min_i32_e32 v17, v17, v18
1978; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v17
1979; GFX6-NEXT:    v_max_i32_e32 v17, -1, v4
1980; GFX6-NEXT:    v_add_i32_e32 v17, vcc, v17, v31
1981; GFX6-NEXT:    v_min_i32_e32 v18, -1, v4
1982; GFX6-NEXT:    v_max_i32_e32 v17, v17, v20
1983; GFX6-NEXT:    v_add_i32_e32 v18, vcc, v18, v16
1984; GFX6-NEXT:    v_min_i32_e32 v17, v17, v18
1985; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v4, v17
1986; GFX6-NEXT:    v_max_i32_e32 v17, -1, v5
1987; GFX6-NEXT:    v_add_i32_e32 v17, vcc, v17, v31
1988; GFX6-NEXT:    v_min_i32_e32 v18, -1, v5
1989; GFX6-NEXT:    v_max_i32_e32 v17, v17, v21
1990; GFX6-NEXT:    v_add_i32_e32 v18, vcc, v18, v16
1991; GFX6-NEXT:    v_min_i32_e32 v17, v17, v18
1992; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v17
1993; GFX6-NEXT:    v_max_i32_e32 v17, -1, v6
1994; GFX6-NEXT:    v_add_i32_e32 v17, vcc, v17, v31
1995; GFX6-NEXT:    v_min_i32_e32 v18, -1, v6
1996; GFX6-NEXT:    v_max_i32_e32 v17, v17, v22
1997; GFX6-NEXT:    v_add_i32_e32 v18, vcc, v18, v16
1998; GFX6-NEXT:    v_min_i32_e32 v17, v17, v18
1999; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v6, v17
2000; GFX6-NEXT:    v_max_i32_e32 v17, -1, v7
2001; GFX6-NEXT:    v_add_i32_e32 v17, vcc, v17, v31
2002; GFX6-NEXT:    v_min_i32_e32 v18, -1, v7
2003; GFX6-NEXT:    v_max_i32_e32 v17, v17, v23
2004; GFX6-NEXT:    v_add_i32_e32 v18, vcc, v18, v16
2005; GFX6-NEXT:    v_min_i32_e32 v17, v17, v18
2006; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v7, v17
2007; GFX6-NEXT:    v_max_i32_e32 v17, -1, v8
2008; GFX6-NEXT:    v_add_i32_e32 v17, vcc, v17, v31
2009; GFX6-NEXT:    v_min_i32_e32 v18, -1, v8
2010; GFX6-NEXT:    v_max_i32_e32 v17, v17, v24
2011; GFX6-NEXT:    v_add_i32_e32 v18, vcc, v18, v16
2012; GFX6-NEXT:    v_min_i32_e32 v17, v17, v18
2013; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, v8, v17
2014; GFX6-NEXT:    v_max_i32_e32 v17, -1, v9
2015; GFX6-NEXT:    v_add_i32_e32 v17, vcc, v17, v31
2016; GFX6-NEXT:    v_min_i32_e32 v18, -1, v9
2017; GFX6-NEXT:    v_max_i32_e32 v17, v17, v25
2018; GFX6-NEXT:    v_add_i32_e32 v18, vcc, v18, v16
2019; GFX6-NEXT:    v_min_i32_e32 v17, v17, v18
2020; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v9, v17
2021; GFX6-NEXT:    v_max_i32_e32 v17, -1, v10
2022; GFX6-NEXT:    v_add_i32_e32 v17, vcc, v17, v31
2023; GFX6-NEXT:    v_min_i32_e32 v18, -1, v10
2024; GFX6-NEXT:    v_max_i32_e32 v17, v17, v26
2025; GFX6-NEXT:    v_add_i32_e32 v18, vcc, v18, v16
2026; GFX6-NEXT:    v_min_i32_e32 v17, v17, v18
2027; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v10, v17
2028; GFX6-NEXT:    v_max_i32_e32 v17, -1, v11
2029; GFX6-NEXT:    v_add_i32_e32 v17, vcc, v17, v31
2030; GFX6-NEXT:    v_min_i32_e32 v18, -1, v11
2031; GFX6-NEXT:    v_max_i32_e32 v17, v17, v27
2032; GFX6-NEXT:    v_add_i32_e32 v18, vcc, v18, v16
2033; GFX6-NEXT:    v_min_i32_e32 v17, v17, v18
2034; GFX6-NEXT:    v_sub_i32_e32 v11, vcc, v11, v17
2035; GFX6-NEXT:    v_max_i32_e32 v17, -1, v12
2036; GFX6-NEXT:    v_add_i32_e32 v17, vcc, v17, v31
2037; GFX6-NEXT:    v_min_i32_e32 v18, -1, v12
2038; GFX6-NEXT:    v_max_i32_e32 v17, v17, v28
2039; GFX6-NEXT:    v_add_i32_e32 v18, vcc, v18, v16
2040; GFX6-NEXT:    v_min_i32_e32 v17, v17, v18
2041; GFX6-NEXT:    v_sub_i32_e32 v12, vcc, v12, v17
2042; GFX6-NEXT:    v_max_i32_e32 v17, -1, v13
2043; GFX6-NEXT:    v_add_i32_e32 v17, vcc, v17, v31
2044; GFX6-NEXT:    v_min_i32_e32 v18, -1, v13
2045; GFX6-NEXT:    v_max_i32_e32 v17, v17, v29
2046; GFX6-NEXT:    v_add_i32_e32 v18, vcc, v18, v16
2047; GFX6-NEXT:    v_min_i32_e32 v17, v17, v18
2048; GFX6-NEXT:    v_sub_i32_e32 v13, vcc, v13, v17
2049; GFX6-NEXT:    v_max_i32_e32 v17, -1, v14
2050; GFX6-NEXT:    v_add_i32_e32 v17, vcc, v17, v31
2051; GFX6-NEXT:    v_min_i32_e32 v18, -1, v14
2052; GFX6-NEXT:    v_max_i32_e32 v17, v17, v30
2053; GFX6-NEXT:    v_add_i32_e32 v18, vcc, v18, v16
2054; GFX6-NEXT:    v_min_i32_e32 v17, v17, v18
2055; GFX6-NEXT:    v_sub_i32_e32 v14, vcc, v14, v17
2056; GFX6-NEXT:    v_max_i32_e32 v17, -1, v15
2057; GFX6-NEXT:    v_add_i32_e32 v17, vcc, v17, v31
2058; GFX6-NEXT:    v_min_i32_e32 v18, -1, v15
2059; GFX6-NEXT:    v_add_i32_e32 v16, vcc, v18, v16
2060; GFX6-NEXT:    s_waitcnt vmcnt(0)
2061; GFX6-NEXT:    v_max_i32_e32 v17, v17, v19
2062; GFX6-NEXT:    v_min_i32_e32 v16, v17, v16
2063; GFX6-NEXT:    v_sub_i32_e32 v15, vcc, v15, v16
2064; GFX6-NEXT:    s_setpc_b64 s[30:31]
2065;
2066; GFX8-LABEL: v_ssubsat_v16i32:
2067; GFX8:       ; %bb.0:
2068; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2069; GFX8-NEXT:    v_max_i32_e32 v32, -1, v0
2070; GFX8-NEXT:    v_mov_b32_e32 v31, 0x80000001
2071; GFX8-NEXT:    v_add_u32_e32 v32, vcc, v32, v31
2072; GFX8-NEXT:    v_max_i32_e32 v32, v32, v16
2073; GFX8-NEXT:    v_min_i32_e32 v33, -1, v0
2074; GFX8-NEXT:    v_bfrev_b32_e32 v16, 1
2075; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v16
2076; GFX8-NEXT:    v_min_i32_e32 v32, v32, v33
2077; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v32
2078; GFX8-NEXT:    v_max_i32_e32 v32, -1, v1
2079; GFX8-NEXT:    v_add_u32_e32 v32, vcc, v32, v31
2080; GFX8-NEXT:    v_max_i32_e32 v17, v32, v17
2081; GFX8-NEXT:    v_min_i32_e32 v32, -1, v1
2082; GFX8-NEXT:    v_add_u32_e32 v32, vcc, v32, v16
2083; GFX8-NEXT:    v_min_i32_e32 v17, v17, v32
2084; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, v1, v17
2085; GFX8-NEXT:    v_max_i32_e32 v17, -1, v2
2086; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v31
2087; GFX8-NEXT:    v_max_i32_e32 v17, v17, v18
2088; GFX8-NEXT:    v_min_i32_e32 v18, -1, v2
2089; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v16
2090; GFX8-NEXT:    v_min_i32_e32 v17, v17, v18
2091; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v2, v17
2092; GFX8-NEXT:    v_max_i32_e32 v17, -1, v3
2093; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v31
2094; GFX8-NEXT:    v_max_i32_e32 v17, v17, v19
2095; GFX8-NEXT:    buffer_load_dword v19, off, s[0:3], s32
2096; GFX8-NEXT:    v_min_i32_e32 v18, -1, v3
2097; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v16
2098; GFX8-NEXT:    v_min_i32_e32 v17, v17, v18
2099; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, v3, v17
2100; GFX8-NEXT:    v_max_i32_e32 v17, -1, v4
2101; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v31
2102; GFX8-NEXT:    v_min_i32_e32 v18, -1, v4
2103; GFX8-NEXT:    v_max_i32_e32 v17, v17, v20
2104; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v16
2105; GFX8-NEXT:    v_min_i32_e32 v17, v17, v18
2106; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, v4, v17
2107; GFX8-NEXT:    v_max_i32_e32 v17, -1, v5
2108; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v31
2109; GFX8-NEXT:    v_min_i32_e32 v18, -1, v5
2110; GFX8-NEXT:    v_max_i32_e32 v17, v17, v21
2111; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v16
2112; GFX8-NEXT:    v_min_i32_e32 v17, v17, v18
2113; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, v5, v17
2114; GFX8-NEXT:    v_max_i32_e32 v17, -1, v6
2115; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v31
2116; GFX8-NEXT:    v_min_i32_e32 v18, -1, v6
2117; GFX8-NEXT:    v_max_i32_e32 v17, v17, v22
2118; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v16
2119; GFX8-NEXT:    v_min_i32_e32 v17, v17, v18
2120; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, v6, v17
2121; GFX8-NEXT:    v_max_i32_e32 v17, -1, v7
2122; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v31
2123; GFX8-NEXT:    v_min_i32_e32 v18, -1, v7
2124; GFX8-NEXT:    v_max_i32_e32 v17, v17, v23
2125; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v16
2126; GFX8-NEXT:    v_min_i32_e32 v17, v17, v18
2127; GFX8-NEXT:    v_sub_u32_e32 v7, vcc, v7, v17
2128; GFX8-NEXT:    v_max_i32_e32 v17, -1, v8
2129; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v31
2130; GFX8-NEXT:    v_min_i32_e32 v18, -1, v8
2131; GFX8-NEXT:    v_max_i32_e32 v17, v17, v24
2132; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v16
2133; GFX8-NEXT:    v_min_i32_e32 v17, v17, v18
2134; GFX8-NEXT:    v_sub_u32_e32 v8, vcc, v8, v17
2135; GFX8-NEXT:    v_max_i32_e32 v17, -1, v9
2136; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v31
2137; GFX8-NEXT:    v_min_i32_e32 v18, -1, v9
2138; GFX8-NEXT:    v_max_i32_e32 v17, v17, v25
2139; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v16
2140; GFX8-NEXT:    v_min_i32_e32 v17, v17, v18
2141; GFX8-NEXT:    v_sub_u32_e32 v9, vcc, v9, v17
2142; GFX8-NEXT:    v_max_i32_e32 v17, -1, v10
2143; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v31
2144; GFX8-NEXT:    v_min_i32_e32 v18, -1, v10
2145; GFX8-NEXT:    v_max_i32_e32 v17, v17, v26
2146; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v16
2147; GFX8-NEXT:    v_min_i32_e32 v17, v17, v18
2148; GFX8-NEXT:    v_sub_u32_e32 v10, vcc, v10, v17
2149; GFX8-NEXT:    v_max_i32_e32 v17, -1, v11
2150; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v31
2151; GFX8-NEXT:    v_min_i32_e32 v18, -1, v11
2152; GFX8-NEXT:    v_max_i32_e32 v17, v17, v27
2153; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v16
2154; GFX8-NEXT:    v_min_i32_e32 v17, v17, v18
2155; GFX8-NEXT:    v_sub_u32_e32 v11, vcc, v11, v17
2156; GFX8-NEXT:    v_max_i32_e32 v17, -1, v12
2157; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v31
2158; GFX8-NEXT:    v_min_i32_e32 v18, -1, v12
2159; GFX8-NEXT:    v_max_i32_e32 v17, v17, v28
2160; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v16
2161; GFX8-NEXT:    v_min_i32_e32 v17, v17, v18
2162; GFX8-NEXT:    v_sub_u32_e32 v12, vcc, v12, v17
2163; GFX8-NEXT:    v_max_i32_e32 v17, -1, v13
2164; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v31
2165; GFX8-NEXT:    v_min_i32_e32 v18, -1, v13
2166; GFX8-NEXT:    v_max_i32_e32 v17, v17, v29
2167; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v16
2168; GFX8-NEXT:    v_min_i32_e32 v17, v17, v18
2169; GFX8-NEXT:    v_sub_u32_e32 v13, vcc, v13, v17
2170; GFX8-NEXT:    v_max_i32_e32 v17, -1, v14
2171; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v31
2172; GFX8-NEXT:    v_min_i32_e32 v18, -1, v14
2173; GFX8-NEXT:    v_max_i32_e32 v17, v17, v30
2174; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v16
2175; GFX8-NEXT:    v_min_i32_e32 v17, v17, v18
2176; GFX8-NEXT:    v_sub_u32_e32 v14, vcc, v14, v17
2177; GFX8-NEXT:    v_max_i32_e32 v17, -1, v15
2178; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v31
2179; GFX8-NEXT:    v_min_i32_e32 v18, -1, v15
2180; GFX8-NEXT:    v_add_u32_e32 v16, vcc, v18, v16
2181; GFX8-NEXT:    s_waitcnt vmcnt(0)
2182; GFX8-NEXT:    v_max_i32_e32 v17, v17, v19
2183; GFX8-NEXT:    v_min_i32_e32 v16, v17, v16
2184; GFX8-NEXT:    v_sub_u32_e32 v15, vcc, v15, v16
2185; GFX8-NEXT:    s_setpc_b64 s[30:31]
2186;
2187; GFX9-LABEL: v_ssubsat_v16i32:
2188; GFX9:       ; %bb.0:
2189; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2190; GFX9-NEXT:    v_sub_i32 v0, v0, v16 clamp
2191; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32
2192; GFX9-NEXT:    v_sub_i32 v1, v1, v17 clamp
2193; GFX9-NEXT:    v_sub_i32 v2, v2, v18 clamp
2194; GFX9-NEXT:    v_sub_i32 v3, v3, v19 clamp
2195; GFX9-NEXT:    v_sub_i32 v4, v4, v20 clamp
2196; GFX9-NEXT:    v_sub_i32 v5, v5, v21 clamp
2197; GFX9-NEXT:    v_sub_i32 v6, v6, v22 clamp
2198; GFX9-NEXT:    v_sub_i32 v7, v7, v23 clamp
2199; GFX9-NEXT:    v_sub_i32 v8, v8, v24 clamp
2200; GFX9-NEXT:    v_sub_i32 v9, v9, v25 clamp
2201; GFX9-NEXT:    v_sub_i32 v10, v10, v26 clamp
2202; GFX9-NEXT:    v_sub_i32 v11, v11, v27 clamp
2203; GFX9-NEXT:    v_sub_i32 v12, v12, v28 clamp
2204; GFX9-NEXT:    v_sub_i32 v13, v13, v29 clamp
2205; GFX9-NEXT:    v_sub_i32 v14, v14, v30 clamp
2206; GFX9-NEXT:    s_waitcnt vmcnt(0)
2207; GFX9-NEXT:    v_sub_i32 v15, v15, v16 clamp
2208; GFX9-NEXT:    s_setpc_b64 s[30:31]
2209;
2210; GFX10-LABEL: v_ssubsat_v16i32:
2211; GFX10:       ; %bb.0:
2212; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2213; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
2214; GFX10-NEXT:    v_sub_nc_i32 v0, v0, v16 clamp
2215; GFX10-NEXT:    v_sub_nc_i32 v1, v1, v17 clamp
2216; GFX10-NEXT:    v_sub_nc_i32 v2, v2, v18 clamp
2217; GFX10-NEXT:    v_sub_nc_i32 v3, v3, v19 clamp
2218; GFX10-NEXT:    v_sub_nc_i32 v4, v4, v20 clamp
2219; GFX10-NEXT:    v_sub_nc_i32 v5, v5, v21 clamp
2220; GFX10-NEXT:    v_sub_nc_i32 v6, v6, v22 clamp
2221; GFX10-NEXT:    v_sub_nc_i32 v7, v7, v23 clamp
2222; GFX10-NEXT:    v_sub_nc_i32 v8, v8, v24 clamp
2223; GFX10-NEXT:    v_sub_nc_i32 v9, v9, v25 clamp
2224; GFX10-NEXT:    v_sub_nc_i32 v10, v10, v26 clamp
2225; GFX10-NEXT:    v_sub_nc_i32 v11, v11, v27 clamp
2226; GFX10-NEXT:    v_sub_nc_i32 v12, v12, v28 clamp
2227; GFX10-NEXT:    v_sub_nc_i32 v13, v13, v29 clamp
2228; GFX10-NEXT:    v_sub_nc_i32 v14, v14, v30 clamp
2229; GFX10-NEXT:    s_waitcnt vmcnt(0)
2230; GFX10-NEXT:    v_sub_nc_i32 v15, v15, v31 clamp
2231; GFX10-NEXT:    s_setpc_b64 s[30:31]
2232;
2233; GFX11-LABEL: v_ssubsat_v16i32:
2234; GFX11:       ; %bb.0:
2235; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2236; GFX11-NEXT:    scratch_load_b32 v31, off, s32
2237; GFX11-NEXT:    v_sub_nc_i32 v0, v0, v16 clamp
2238; GFX11-NEXT:    v_sub_nc_i32 v1, v1, v17 clamp
2239; GFX11-NEXT:    v_sub_nc_i32 v2, v2, v18 clamp
2240; GFX11-NEXT:    v_sub_nc_i32 v3, v3, v19 clamp
2241; GFX11-NEXT:    v_sub_nc_i32 v4, v4, v20 clamp
2242; GFX11-NEXT:    v_sub_nc_i32 v5, v5, v21 clamp
2243; GFX11-NEXT:    v_sub_nc_i32 v6, v6, v22 clamp
2244; GFX11-NEXT:    v_sub_nc_i32 v7, v7, v23 clamp
2245; GFX11-NEXT:    v_sub_nc_i32 v8, v8, v24 clamp
2246; GFX11-NEXT:    v_sub_nc_i32 v9, v9, v25 clamp
2247; GFX11-NEXT:    v_sub_nc_i32 v10, v10, v26 clamp
2248; GFX11-NEXT:    v_sub_nc_i32 v11, v11, v27 clamp
2249; GFX11-NEXT:    v_sub_nc_i32 v12, v12, v28 clamp
2250; GFX11-NEXT:    v_sub_nc_i32 v13, v13, v29 clamp
2251; GFX11-NEXT:    v_sub_nc_i32 v14, v14, v30 clamp
2252; GFX11-NEXT:    s_waitcnt vmcnt(0)
2253; GFX11-NEXT:    v_sub_nc_i32 v15, v15, v31 clamp
2254; GFX11-NEXT:    s_setpc_b64 s[30:31]
2255  %result = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs)
2256  ret <16 x i32> %result
2257}
2258
; Saturating signed subtract (llvm.ssub.sat) on <16 x i32> in an amdgpu_ps
; function with both operands marked inreg. The checks below show lhs in
; s0-s15 and rhs in s16-s31, with the result written back to s0-s15.
;  * GFX6 and GFX8 expect a per-element SALU sequence: rhs is clamped to the
;    range [max(lhs, -1) + 0x80000001, min(lhs, -1) + 0x80000000] and the
;    clamped value is then subtracted from lhs.
;  * GFX9 expects rhs copied into VGPRs, v_sub_i32 with the clamp modifier,
;    then v_readfirstlane_b32 to move each result back to an SGPR.
;  * GFX10 and GFX11 share one check block: v_sub_nc_i32 with clamp taking
;    both SGPR operands directly, followed by v_readfirstlane_b32.
; The assertions are autogenerated (see the NOTE on the first line of the
; file); regenerate them with the update script instead of editing by hand.
2259define amdgpu_ps <16 x i32> @s_ssubsat_v16i32(<16 x i32> inreg %lhs, <16 x i32> inreg %rhs) {
2260; GFX6-LABEL: s_ssubsat_v16i32:
2261; GFX6:       ; %bb.0:
2262; GFX6-NEXT:    s_max_i32 s32, s0, -1
2263; GFX6-NEXT:    s_add_i32 s32, s32, 0x80000001
2264; GFX6-NEXT:    s_min_i32 s33, s0, -1
2265; GFX6-NEXT:    s_add_i32 s33, s33, 0x80000000
2266; GFX6-NEXT:    s_max_i32 s16, s32, s16
2267; GFX6-NEXT:    s_min_i32 s16, s16, s33
2268; GFX6-NEXT:    s_sub_i32 s0, s0, s16
2269; GFX6-NEXT:    s_max_i32 s16, s1, -1
2270; GFX6-NEXT:    s_add_i32 s16, s16, 0x80000001
2271; GFX6-NEXT:    s_min_i32 s32, s1, -1
2272; GFX6-NEXT:    s_add_i32 s32, s32, 0x80000000
2273; GFX6-NEXT:    s_max_i32 s16, s16, s17
2274; GFX6-NEXT:    s_min_i32 s16, s16, s32
2275; GFX6-NEXT:    s_sub_i32 s1, s1, s16
2276; GFX6-NEXT:    s_max_i32 s16, s2, -1
2277; GFX6-NEXT:    s_add_i32 s16, s16, 0x80000001
2278; GFX6-NEXT:    s_min_i32 s17, s2, -1
2279; GFX6-NEXT:    s_add_i32 s17, s17, 0x80000000
2280; GFX6-NEXT:    s_max_i32 s16, s16, s18
2281; GFX6-NEXT:    s_min_i32 s16, s16, s17
2282; GFX6-NEXT:    s_sub_i32 s2, s2, s16
2283; GFX6-NEXT:    s_max_i32 s16, s3, -1
2284; GFX6-NEXT:    s_add_i32 s16, s16, 0x80000001
2285; GFX6-NEXT:    s_min_i32 s17, s3, -1
2286; GFX6-NEXT:    s_add_i32 s17, s17, 0x80000000
2287; GFX6-NEXT:    s_max_i32 s16, s16, s19
2288; GFX6-NEXT:    s_min_i32 s16, s16, s17
2289; GFX6-NEXT:    s_sub_i32 s3, s3, s16
2290; GFX6-NEXT:    s_max_i32 s16, s4, -1
2291; GFX6-NEXT:    s_add_i32 s16, s16, 0x80000001
2292; GFX6-NEXT:    s_min_i32 s17, s4, -1
2293; GFX6-NEXT:    s_add_i32 s17, s17, 0x80000000
2294; GFX6-NEXT:    s_max_i32 s16, s16, s20
2295; GFX6-NEXT:    s_min_i32 s16, s16, s17
2296; GFX6-NEXT:    s_sub_i32 s4, s4, s16
2297; GFX6-NEXT:    s_max_i32 s16, s5, -1
2298; GFX6-NEXT:    s_add_i32 s16, s16, 0x80000001
2299; GFX6-NEXT:    s_min_i32 s17, s5, -1
2300; GFX6-NEXT:    s_add_i32 s17, s17, 0x80000000
2301; GFX6-NEXT:    s_max_i32 s16, s16, s21
2302; GFX6-NEXT:    s_min_i32 s16, s16, s17
2303; GFX6-NEXT:    s_sub_i32 s5, s5, s16
2304; GFX6-NEXT:    s_max_i32 s16, s6, -1
2305; GFX6-NEXT:    s_add_i32 s16, s16, 0x80000001
2306; GFX6-NEXT:    s_min_i32 s17, s6, -1
2307; GFX6-NEXT:    s_add_i32 s17, s17, 0x80000000
2308; GFX6-NEXT:    s_max_i32 s16, s16, s22
2309; GFX6-NEXT:    s_min_i32 s16, s16, s17
2310; GFX6-NEXT:    s_sub_i32 s6, s6, s16
2311; GFX6-NEXT:    s_max_i32 s16, s7, -1
2312; GFX6-NEXT:    s_add_i32 s16, s16, 0x80000001
2313; GFX6-NEXT:    s_min_i32 s17, s7, -1
2314; GFX6-NEXT:    s_add_i32 s17, s17, 0x80000000
2315; GFX6-NEXT:    s_max_i32 s16, s16, s23
2316; GFX6-NEXT:    s_min_i32 s16, s16, s17
2317; GFX6-NEXT:    s_sub_i32 s7, s7, s16
2318; GFX6-NEXT:    s_max_i32 s16, s8, -1
2319; GFX6-NEXT:    s_add_i32 s16, s16, 0x80000001
2320; GFX6-NEXT:    s_min_i32 s17, s8, -1
2321; GFX6-NEXT:    s_add_i32 s17, s17, 0x80000000
2322; GFX6-NEXT:    s_max_i32 s16, s16, s24
2323; GFX6-NEXT:    s_min_i32 s16, s16, s17
2324; GFX6-NEXT:    s_sub_i32 s8, s8, s16
2325; GFX6-NEXT:    s_max_i32 s16, s9, -1
2326; GFX6-NEXT:    s_add_i32 s16, s16, 0x80000001
2327; GFX6-NEXT:    s_min_i32 s17, s9, -1
2328; GFX6-NEXT:    s_add_i32 s17, s17, 0x80000000
2329; GFX6-NEXT:    s_max_i32 s16, s16, s25
2330; GFX6-NEXT:    s_min_i32 s16, s16, s17
2331; GFX6-NEXT:    s_sub_i32 s9, s9, s16
2332; GFX6-NEXT:    s_max_i32 s16, s10, -1
2333; GFX6-NEXT:    s_add_i32 s16, s16, 0x80000001
2334; GFX6-NEXT:    s_min_i32 s17, s10, -1
2335; GFX6-NEXT:    s_add_i32 s17, s17, 0x80000000
2336; GFX6-NEXT:    s_max_i32 s16, s16, s26
2337; GFX6-NEXT:    s_min_i32 s16, s16, s17
2338; GFX6-NEXT:    s_sub_i32 s10, s10, s16
2339; GFX6-NEXT:    s_max_i32 s16, s11, -1
2340; GFX6-NEXT:    s_add_i32 s16, s16, 0x80000001
2341; GFX6-NEXT:    s_min_i32 s17, s11, -1
2342; GFX6-NEXT:    s_add_i32 s17, s17, 0x80000000
2343; GFX6-NEXT:    s_max_i32 s16, s16, s27
2344; GFX6-NEXT:    s_min_i32 s16, s16, s17
2345; GFX6-NEXT:    s_sub_i32 s11, s11, s16
2346; GFX6-NEXT:    s_max_i32 s16, s12, -1
2347; GFX6-NEXT:    s_add_i32 s16, s16, 0x80000001
2348; GFX6-NEXT:    s_min_i32 s17, s12, -1
2349; GFX6-NEXT:    s_add_i32 s17, s17, 0x80000000
2350; GFX6-NEXT:    s_max_i32 s16, s16, s28
2351; GFX6-NEXT:    s_min_i32 s16, s16, s17
2352; GFX6-NEXT:    s_sub_i32 s12, s12, s16
2353; GFX6-NEXT:    s_max_i32 s16, s13, -1
2354; GFX6-NEXT:    s_add_i32 s16, s16, 0x80000001
2355; GFX6-NEXT:    s_min_i32 s17, s13, -1
2356; GFX6-NEXT:    s_add_i32 s17, s17, 0x80000000
2357; GFX6-NEXT:    s_max_i32 s16, s16, s29
2358; GFX6-NEXT:    s_min_i32 s16, s16, s17
2359; GFX6-NEXT:    s_sub_i32 s13, s13, s16
2360; GFX6-NEXT:    s_max_i32 s16, s14, -1
2361; GFX6-NEXT:    s_add_i32 s16, s16, 0x80000001
2362; GFX6-NEXT:    s_min_i32 s17, s14, -1
2363; GFX6-NEXT:    s_add_i32 s17, s17, 0x80000000
2364; GFX6-NEXT:    s_max_i32 s16, s16, s30
2365; GFX6-NEXT:    s_min_i32 s16, s16, s17
2366; GFX6-NEXT:    s_sub_i32 s14, s14, s16
2367; GFX6-NEXT:    s_max_i32 s16, s15, -1
2368; GFX6-NEXT:    s_add_i32 s16, s16, 0x80000001
2369; GFX6-NEXT:    s_min_i32 s17, s15, -1
2370; GFX6-NEXT:    s_add_i32 s17, s17, 0x80000000
2371; GFX6-NEXT:    s_max_i32 s16, s16, s31
2372; GFX6-NEXT:    s_min_i32 s16, s16, s17
2373; GFX6-NEXT:    s_sub_i32 s15, s15, s16
2374; GFX6-NEXT:    ; return to shader part epilog
2375;
2376; GFX8-LABEL: s_ssubsat_v16i32:
2377; GFX8:       ; %bb.0:
2378; GFX8-NEXT:    s_max_i32 s32, s0, -1
2379; GFX8-NEXT:    s_add_i32 s32, s32, 0x80000001
2380; GFX8-NEXT:    s_min_i32 s33, s0, -1
2381; GFX8-NEXT:    s_add_i32 s33, s33, 0x80000000
2382; GFX8-NEXT:    s_max_i32 s16, s32, s16
2383; GFX8-NEXT:    s_min_i32 s16, s16, s33
2384; GFX8-NEXT:    s_sub_i32 s0, s0, s16
2385; GFX8-NEXT:    s_max_i32 s16, s1, -1
2386; GFX8-NEXT:    s_add_i32 s16, s16, 0x80000001
2387; GFX8-NEXT:    s_min_i32 s32, s1, -1
2388; GFX8-NEXT:    s_add_i32 s32, s32, 0x80000000
2389; GFX8-NEXT:    s_max_i32 s16, s16, s17
2390; GFX8-NEXT:    s_min_i32 s16, s16, s32
2391; GFX8-NEXT:    s_sub_i32 s1, s1, s16
2392; GFX8-NEXT:    s_max_i32 s16, s2, -1
2393; GFX8-NEXT:    s_add_i32 s16, s16, 0x80000001
2394; GFX8-NEXT:    s_min_i32 s17, s2, -1
2395; GFX8-NEXT:    s_add_i32 s17, s17, 0x80000000
2396; GFX8-NEXT:    s_max_i32 s16, s16, s18
2397; GFX8-NEXT:    s_min_i32 s16, s16, s17
2398; GFX8-NEXT:    s_sub_i32 s2, s2, s16
2399; GFX8-NEXT:    s_max_i32 s16, s3, -1
2400; GFX8-NEXT:    s_add_i32 s16, s16, 0x80000001
2401; GFX8-NEXT:    s_min_i32 s17, s3, -1
2402; GFX8-NEXT:    s_add_i32 s17, s17, 0x80000000
2403; GFX8-NEXT:    s_max_i32 s16, s16, s19
2404; GFX8-NEXT:    s_min_i32 s16, s16, s17
2405; GFX8-NEXT:    s_sub_i32 s3, s3, s16
2406; GFX8-NEXT:    s_max_i32 s16, s4, -1
2407; GFX8-NEXT:    s_add_i32 s16, s16, 0x80000001
2408; GFX8-NEXT:    s_min_i32 s17, s4, -1
2409; GFX8-NEXT:    s_add_i32 s17, s17, 0x80000000
2410; GFX8-NEXT:    s_max_i32 s16, s16, s20
2411; GFX8-NEXT:    s_min_i32 s16, s16, s17
2412; GFX8-NEXT:    s_sub_i32 s4, s4, s16
2413; GFX8-NEXT:    s_max_i32 s16, s5, -1
2414; GFX8-NEXT:    s_add_i32 s16, s16, 0x80000001
2415; GFX8-NEXT:    s_min_i32 s17, s5, -1
2416; GFX8-NEXT:    s_add_i32 s17, s17, 0x80000000
2417; GFX8-NEXT:    s_max_i32 s16, s16, s21
2418; GFX8-NEXT:    s_min_i32 s16, s16, s17
2419; GFX8-NEXT:    s_sub_i32 s5, s5, s16
2420; GFX8-NEXT:    s_max_i32 s16, s6, -1
2421; GFX8-NEXT:    s_add_i32 s16, s16, 0x80000001
2422; GFX8-NEXT:    s_min_i32 s17, s6, -1
2423; GFX8-NEXT:    s_add_i32 s17, s17, 0x80000000
2424; GFX8-NEXT:    s_max_i32 s16, s16, s22
2425; GFX8-NEXT:    s_min_i32 s16, s16, s17
2426; GFX8-NEXT:    s_sub_i32 s6, s6, s16
2427; GFX8-NEXT:    s_max_i32 s16, s7, -1
2428; GFX8-NEXT:    s_add_i32 s16, s16, 0x80000001
2429; GFX8-NEXT:    s_min_i32 s17, s7, -1
2430; GFX8-NEXT:    s_add_i32 s17, s17, 0x80000000
2431; GFX8-NEXT:    s_max_i32 s16, s16, s23
2432; GFX8-NEXT:    s_min_i32 s16, s16, s17
2433; GFX8-NEXT:    s_sub_i32 s7, s7, s16
2434; GFX8-NEXT:    s_max_i32 s16, s8, -1
2435; GFX8-NEXT:    s_add_i32 s16, s16, 0x80000001
2436; GFX8-NEXT:    s_min_i32 s17, s8, -1
2437; GFX8-NEXT:    s_add_i32 s17, s17, 0x80000000
2438; GFX8-NEXT:    s_max_i32 s16, s16, s24
2439; GFX8-NEXT:    s_min_i32 s16, s16, s17
2440; GFX8-NEXT:    s_sub_i32 s8, s8, s16
2441; GFX8-NEXT:    s_max_i32 s16, s9, -1
2442; GFX8-NEXT:    s_add_i32 s16, s16, 0x80000001
2443; GFX8-NEXT:    s_min_i32 s17, s9, -1
2444; GFX8-NEXT:    s_add_i32 s17, s17, 0x80000000
2445; GFX8-NEXT:    s_max_i32 s16, s16, s25
2446; GFX8-NEXT:    s_min_i32 s16, s16, s17
2447; GFX8-NEXT:    s_sub_i32 s9, s9, s16
2448; GFX8-NEXT:    s_max_i32 s16, s10, -1
2449; GFX8-NEXT:    s_add_i32 s16, s16, 0x80000001
2450; GFX8-NEXT:    s_min_i32 s17, s10, -1
2451; GFX8-NEXT:    s_add_i32 s17, s17, 0x80000000
2452; GFX8-NEXT:    s_max_i32 s16, s16, s26
2453; GFX8-NEXT:    s_min_i32 s16, s16, s17
2454; GFX8-NEXT:    s_sub_i32 s10, s10, s16
2455; GFX8-NEXT:    s_max_i32 s16, s11, -1
2456; GFX8-NEXT:    s_add_i32 s16, s16, 0x80000001
2457; GFX8-NEXT:    s_min_i32 s17, s11, -1
2458; GFX8-NEXT:    s_add_i32 s17, s17, 0x80000000
2459; GFX8-NEXT:    s_max_i32 s16, s16, s27
2460; GFX8-NEXT:    s_min_i32 s16, s16, s17
2461; GFX8-NEXT:    s_sub_i32 s11, s11, s16
2462; GFX8-NEXT:    s_max_i32 s16, s12, -1
2463; GFX8-NEXT:    s_add_i32 s16, s16, 0x80000001
2464; GFX8-NEXT:    s_min_i32 s17, s12, -1
2465; GFX8-NEXT:    s_add_i32 s17, s17, 0x80000000
2466; GFX8-NEXT:    s_max_i32 s16, s16, s28
2467; GFX8-NEXT:    s_min_i32 s16, s16, s17
2468; GFX8-NEXT:    s_sub_i32 s12, s12, s16
2469; GFX8-NEXT:    s_max_i32 s16, s13, -1
2470; GFX8-NEXT:    s_add_i32 s16, s16, 0x80000001
2471; GFX8-NEXT:    s_min_i32 s17, s13, -1
2472; GFX8-NEXT:    s_add_i32 s17, s17, 0x80000000
2473; GFX8-NEXT:    s_max_i32 s16, s16, s29
2474; GFX8-NEXT:    s_min_i32 s16, s16, s17
2475; GFX8-NEXT:    s_sub_i32 s13, s13, s16
2476; GFX8-NEXT:    s_max_i32 s16, s14, -1
2477; GFX8-NEXT:    s_add_i32 s16, s16, 0x80000001
2478; GFX8-NEXT:    s_min_i32 s17, s14, -1
2479; GFX8-NEXT:    s_add_i32 s17, s17, 0x80000000
2480; GFX8-NEXT:    s_max_i32 s16, s16, s30
2481; GFX8-NEXT:    s_min_i32 s16, s16, s17
2482; GFX8-NEXT:    s_sub_i32 s14, s14, s16
2483; GFX8-NEXT:    s_max_i32 s16, s15, -1
2484; GFX8-NEXT:    s_add_i32 s16, s16, 0x80000001
2485; GFX8-NEXT:    s_min_i32 s17, s15, -1
2486; GFX8-NEXT:    s_add_i32 s17, s17, 0x80000000
2487; GFX8-NEXT:    s_max_i32 s16, s16, s31
2488; GFX8-NEXT:    s_min_i32 s16, s16, s17
2489; GFX8-NEXT:    s_sub_i32 s15, s15, s16
2490; GFX8-NEXT:    ; return to shader part epilog
2491;
2492; GFX9-LABEL: s_ssubsat_v16i32:
2493; GFX9:       ; %bb.0:
2494; GFX9-NEXT:    v_mov_b32_e32 v0, s16
2495; GFX9-NEXT:    v_mov_b32_e32 v1, s17
2496; GFX9-NEXT:    v_mov_b32_e32 v2, s18
2497; GFX9-NEXT:    v_mov_b32_e32 v3, s19
2498; GFX9-NEXT:    v_mov_b32_e32 v4, s20
2499; GFX9-NEXT:    v_mov_b32_e32 v5, s21
2500; GFX9-NEXT:    v_mov_b32_e32 v6, s22
2501; GFX9-NEXT:    v_mov_b32_e32 v7, s23
2502; GFX9-NEXT:    v_mov_b32_e32 v8, s24
2503; GFX9-NEXT:    v_mov_b32_e32 v9, s25
2504; GFX9-NEXT:    v_mov_b32_e32 v10, s26
2505; GFX9-NEXT:    v_mov_b32_e32 v11, s27
2506; GFX9-NEXT:    v_mov_b32_e32 v12, s28
2507; GFX9-NEXT:    v_mov_b32_e32 v13, s29
2508; GFX9-NEXT:    v_mov_b32_e32 v14, s30
2509; GFX9-NEXT:    v_mov_b32_e32 v15, s31
2510; GFX9-NEXT:    v_sub_i32 v0, s0, v0 clamp
2511; GFX9-NEXT:    v_sub_i32 v1, s1, v1 clamp
2512; GFX9-NEXT:    v_sub_i32 v2, s2, v2 clamp
2513; GFX9-NEXT:    v_sub_i32 v3, s3, v3 clamp
2514; GFX9-NEXT:    v_sub_i32 v4, s4, v4 clamp
2515; GFX9-NEXT:    v_sub_i32 v5, s5, v5 clamp
2516; GFX9-NEXT:    v_sub_i32 v6, s6, v6 clamp
2517; GFX9-NEXT:    v_sub_i32 v7, s7, v7 clamp
2518; GFX9-NEXT:    v_sub_i32 v8, s8, v8 clamp
2519; GFX9-NEXT:    v_sub_i32 v9, s9, v9 clamp
2520; GFX9-NEXT:    v_sub_i32 v10, s10, v10 clamp
2521; GFX9-NEXT:    v_sub_i32 v11, s11, v11 clamp
2522; GFX9-NEXT:    v_sub_i32 v12, s12, v12 clamp
2523; GFX9-NEXT:    v_sub_i32 v13, s13, v13 clamp
2524; GFX9-NEXT:    v_sub_i32 v14, s14, v14 clamp
2525; GFX9-NEXT:    v_sub_i32 v15, s15, v15 clamp
2526; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
2527; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
2528; GFX9-NEXT:    v_readfirstlane_b32 s2, v2
2529; GFX9-NEXT:    v_readfirstlane_b32 s3, v3
2530; GFX9-NEXT:    v_readfirstlane_b32 s4, v4
2531; GFX9-NEXT:    v_readfirstlane_b32 s5, v5
2532; GFX9-NEXT:    v_readfirstlane_b32 s6, v6
2533; GFX9-NEXT:    v_readfirstlane_b32 s7, v7
2534; GFX9-NEXT:    v_readfirstlane_b32 s8, v8
2535; GFX9-NEXT:    v_readfirstlane_b32 s9, v9
2536; GFX9-NEXT:    v_readfirstlane_b32 s10, v10
2537; GFX9-NEXT:    v_readfirstlane_b32 s11, v11
2538; GFX9-NEXT:    v_readfirstlane_b32 s12, v12
2539; GFX9-NEXT:    v_readfirstlane_b32 s13, v13
2540; GFX9-NEXT:    v_readfirstlane_b32 s14, v14
2541; GFX9-NEXT:    v_readfirstlane_b32 s15, v15
2542; GFX9-NEXT:    ; return to shader part epilog
2543;
2544; GFX10PLUS-LABEL: s_ssubsat_v16i32:
2545; GFX10PLUS:       ; %bb.0:
2546; GFX10PLUS-NEXT:    v_sub_nc_i32 v0, s0, s16 clamp
2547; GFX10PLUS-NEXT:    v_sub_nc_i32 v1, s1, s17 clamp
2548; GFX10PLUS-NEXT:    v_sub_nc_i32 v2, s2, s18 clamp
2549; GFX10PLUS-NEXT:    v_sub_nc_i32 v3, s3, s19 clamp
2550; GFX10PLUS-NEXT:    v_sub_nc_i32 v4, s4, s20 clamp
2551; GFX10PLUS-NEXT:    v_sub_nc_i32 v5, s5, s21 clamp
2552; GFX10PLUS-NEXT:    v_sub_nc_i32 v6, s6, s22 clamp
2553; GFX10PLUS-NEXT:    v_sub_nc_i32 v7, s7, s23 clamp
2554; GFX10PLUS-NEXT:    v_sub_nc_i32 v8, s8, s24 clamp
2555; GFX10PLUS-NEXT:    v_sub_nc_i32 v9, s9, s25 clamp
2556; GFX10PLUS-NEXT:    v_sub_nc_i32 v10, s10, s26 clamp
2557; GFX10PLUS-NEXT:    v_sub_nc_i32 v11, s11, s27 clamp
2558; GFX10PLUS-NEXT:    v_sub_nc_i32 v12, s12, s28 clamp
2559; GFX10PLUS-NEXT:    v_sub_nc_i32 v13, s13, s29 clamp
2560; GFX10PLUS-NEXT:    v_sub_nc_i32 v14, s14, s30 clamp
2561; GFX10PLUS-NEXT:    v_sub_nc_i32 v15, s15, s31 clamp
2562; GFX10PLUS-NEXT:    v_readfirstlane_b32 s0, v0
2563; GFX10PLUS-NEXT:    v_readfirstlane_b32 s1, v1
2564; GFX10PLUS-NEXT:    v_readfirstlane_b32 s2, v2
2565; GFX10PLUS-NEXT:    v_readfirstlane_b32 s3, v3
2566; GFX10PLUS-NEXT:    v_readfirstlane_b32 s4, v4
2567; GFX10PLUS-NEXT:    v_readfirstlane_b32 s5, v5
2568; GFX10PLUS-NEXT:    v_readfirstlane_b32 s6, v6
2569; GFX10PLUS-NEXT:    v_readfirstlane_b32 s7, v7
2570; GFX10PLUS-NEXT:    v_readfirstlane_b32 s8, v8
2571; GFX10PLUS-NEXT:    v_readfirstlane_b32 s9, v9
2572; GFX10PLUS-NEXT:    v_readfirstlane_b32 s10, v10
2573; GFX10PLUS-NEXT:    v_readfirstlane_b32 s11, v11
2574; GFX10PLUS-NEXT:    v_readfirstlane_b32 s12, v12
2575; GFX10PLUS-NEXT:    v_readfirstlane_b32 s13, v13
2576; GFX10PLUS-NEXT:    v_readfirstlane_b32 s14, v14
2577; GFX10PLUS-NEXT:    v_readfirstlane_b32 s15, v15
2578; GFX10PLUS-NEXT:    ; return to shader part epilog
2579  %result = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs)
2580  ret <16 x i32> %result
2581}
2582
; Saturating signed subtract (llvm.ssub.sat) on a single i16 with both
; operands in VGPRs (v0 = lhs, v1 = rhs in the checks below).
;  * GFX6 checks: both operands are shifted left by 16, the max/add/min clamp
;    and the subtract run as 32-bit ops on the shifted values, and the result
;    is shifted back down with v_ashrrev_i32.
;  * GFX8 checks: the same clamp-then-subtract shape using 16-bit VALU ops
;    with the 0x8001 and 0x8000 constants.
;  * GFX9 checks: one v_sub_i16 with the clamp modifier.
;  * GFX10 and GFX11 share one check block: one v_sub_nc_i16 with clamp.
2583define i16 @v_ssubsat_i16(i16 %lhs, i16 %rhs) {
2584; GFX6-LABEL: v_ssubsat_i16:
2585; GFX6:       ; %bb.0:
2586; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2587; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2588; GFX6-NEXT:    v_max_i32_e32 v2, -1, v0
2589; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2590; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 0x80000001, v2
2591; GFX6-NEXT:    v_min_i32_e32 v3, -1, v0
2592; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 0x80000000, v3
2593; GFX6-NEXT:    v_max_i32_e32 v1, v2, v1
2594; GFX6-NEXT:    v_min_i32_e32 v1, v1, v3
2595; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
2596; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
2597; GFX6-NEXT:    s_setpc_b64 s[30:31]
2598;
2599; GFX8-LABEL: v_ssubsat_i16:
2600; GFX8:       ; %bb.0:
2601; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2602; GFX8-NEXT:    v_max_i16_e32 v2, -1, v0
2603; GFX8-NEXT:    v_add_u16_e32 v2, 0x8001, v2
2604; GFX8-NEXT:    v_min_i16_e32 v3, -1, v0
2605; GFX8-NEXT:    v_add_u16_e32 v3, 0x8000, v3
2606; GFX8-NEXT:    v_max_i16_e32 v1, v2, v1
2607; GFX8-NEXT:    v_min_i16_e32 v1, v1, v3
2608; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v1
2609; GFX8-NEXT:    s_setpc_b64 s[30:31]
2610;
2611; GFX9-LABEL: v_ssubsat_i16:
2612; GFX9:       ; %bb.0:
2613; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2614; GFX9-NEXT:    v_sub_i16 v0, v0, v1 clamp
2615; GFX9-NEXT:    s_setpc_b64 s[30:31]
2616;
2617; GFX10PLUS-LABEL: v_ssubsat_i16:
2618; GFX10PLUS:       ; %bb.0:
2619; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2620; GFX10PLUS-NEXT:    v_sub_nc_i16 v0, v0, v1 clamp
2621; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
2622  %result = call i16 @llvm.ssub.sat.i16(i16 %lhs, i16 %rhs)
2623  ret i16 %result
2624}
2625
; Saturating signed subtract (llvm.ssub.sat) on a single i16 in an amdgpu_ps
; function with both operands marked inreg (s0 = lhs, s1 = rhs in the checks
; below); the result is returned in s0.
;  * GFX6 checks: operands are shifted left by 16, clamped and subtracted as
;    32-bit SALU ops, then shifted back with s_ashr_i32.
;  * GFX8 checks: values are sign-extended with s_sext_i32_i16 around each
;    32-bit s_max/s_min, and the 0x8001 / 0x8000 offsets are added with
;    s_addk_i32.
;  * GFX9 checks: rhs is copied to a VGPR, v_sub_i16 with clamp does the
;    subtract, and v_readfirstlane_b32 moves the result back to s0.
;  * GFX10 and GFX11 share one check block: v_sub_nc_i16 with clamp takes
;    both SGPRs directly, followed by v_readfirstlane_b32.
2626define amdgpu_ps i16 @s_ssubsat_i16(i16 inreg %lhs, i16 inreg %rhs) {
2627; GFX6-LABEL: s_ssubsat_i16:
2628; GFX6:       ; %bb.0:
2629; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
2630; GFX6-NEXT:    s_max_i32 s2, s0, -1
2631; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
2632; GFX6-NEXT:    s_add_i32 s2, s2, 0x80000001
2633; GFX6-NEXT:    s_min_i32 s3, s0, -1
2634; GFX6-NEXT:    s_add_i32 s3, s3, 0x80000000
2635; GFX6-NEXT:    s_max_i32 s1, s2, s1
2636; GFX6-NEXT:    s_min_i32 s1, s1, s3
2637; GFX6-NEXT:    s_sub_i32 s0, s0, s1
2638; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
2639; GFX6-NEXT:    ; return to shader part epilog
2640;
2641; GFX8-LABEL: s_ssubsat_i16:
2642; GFX8:       ; %bb.0:
2643; GFX8-NEXT:    s_sext_i32_i16 s2, s0
2644; GFX8-NEXT:    s_sext_i32_i16 s3, -1
2645; GFX8-NEXT:    s_max_i32 s4, s2, s3
2646; GFX8-NEXT:    s_addk_i32 s4, 0x8001
2647; GFX8-NEXT:    s_min_i32 s2, s2, s3
2648; GFX8-NEXT:    s_sext_i32_i16 s3, s4
2649; GFX8-NEXT:    s_sext_i32_i16 s1, s1
2650; GFX8-NEXT:    s_addk_i32 s2, 0x8000
2651; GFX8-NEXT:    s_max_i32 s1, s3, s1
2652; GFX8-NEXT:    s_sext_i32_i16 s1, s1
2653; GFX8-NEXT:    s_sext_i32_i16 s2, s2
2654; GFX8-NEXT:    s_min_i32 s1, s1, s2
2655; GFX8-NEXT:    s_sub_i32 s0, s0, s1
2656; GFX8-NEXT:    ; return to shader part epilog
2657;
2658; GFX9-LABEL: s_ssubsat_i16:
2659; GFX9:       ; %bb.0:
2660; GFX9-NEXT:    v_mov_b32_e32 v0, s1
2661; GFX9-NEXT:    v_sub_i16 v0, s0, v0 clamp
2662; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
2663; GFX9-NEXT:    ; return to shader part epilog
2664;
2665; GFX10PLUS-LABEL: s_ssubsat_i16:
2666; GFX10PLUS:       ; %bb.0:
2667; GFX10PLUS-NEXT:    v_sub_nc_i16 v0, s0, s1 clamp
2668; GFX10PLUS-NEXT:    v_readfirstlane_b32 s0, v0
2669; GFX10PLUS-NEXT:    ; return to shader part epilog
2670  %result = call i16 @llvm.ssub.sat.i16(i16 %lhs, i16 %rhs)
2671  ret i16 %result
2672}
2673
; Mixed-operand i16 saturating signed subtract: lhs is inreg (s0 in the
; checks below) and rhs is a regular argument (v0). The i16 result is bitcast
; to half and returned in v0.
;  * GFX6 checks: the clamp bounds are computed from lhs on the SALU, then
;    applied to the shifted rhs with v_max_i32 / v_min_i32 before the
;    subtract and the final arithmetic shift right by 16.
;  * GFX8 checks: bounds are computed on the SALU (s_sext_i32_i16,
;    s_addk_i32) and applied with 16-bit VALU max/min/sub.
;  * GFX9 checks: one v_sub_i16 with clamp; the shared GFX10/GFX11 block
;    expects one v_sub_nc_i16 with clamp.
2674define amdgpu_ps half @ssubsat_i16_sv(i16 inreg %lhs, i16 %rhs) {
2675; GFX6-LABEL: ssubsat_i16_sv:
2676; GFX6:       ; %bb.0:
2677; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
2678; GFX6-NEXT:    s_max_i32 s1, s0, -1
2679; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2680; GFX6-NEXT:    s_add_i32 s1, s1, 0x80000001
2681; GFX6-NEXT:    s_min_i32 s2, s0, -1
2682; GFX6-NEXT:    s_add_i32 s2, s2, 0x80000000
2683; GFX6-NEXT:    v_max_i32_e32 v0, s1, v0
2684; GFX6-NEXT:    v_min_i32_e32 v0, s2, v0
2685; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
2686; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
2687; GFX6-NEXT:    ; return to shader part epilog
2688;
2689; GFX8-LABEL: ssubsat_i16_sv:
2690; GFX8:       ; %bb.0:
2691; GFX8-NEXT:    s_sext_i32_i16 s1, s0
2692; GFX8-NEXT:    s_sext_i32_i16 s2, -1
2693; GFX8-NEXT:    s_max_i32 s3, s1, s2
2694; GFX8-NEXT:    s_addk_i32 s3, 0x8001
2695; GFX8-NEXT:    s_min_i32 s1, s1, s2
2696; GFX8-NEXT:    s_addk_i32 s1, 0x8000
2697; GFX8-NEXT:    v_max_i16_e32 v0, s3, v0
2698; GFX8-NEXT:    v_min_i16_e32 v0, s1, v0
2699; GFX8-NEXT:    v_sub_u16_e32 v0, s0, v0
2700; GFX8-NEXT:    ; return to shader part epilog
2701;
2702; GFX9-LABEL: ssubsat_i16_sv:
2703; GFX9:       ; %bb.0:
2704; GFX9-NEXT:    v_sub_i16 v0, s0, v0 clamp
2705; GFX9-NEXT:    ; return to shader part epilog
2706;
2707; GFX10PLUS-LABEL: ssubsat_i16_sv:
2708; GFX10PLUS:       ; %bb.0:
2709; GFX10PLUS-NEXT:    v_sub_nc_i16 v0, s0, v0 clamp
2710; GFX10PLUS-NEXT:    ; return to shader part epilog
2711  %result = call i16 @llvm.ssub.sat.i16(i16 %lhs, i16 %rhs)
2712  %cast = bitcast i16 %result to half
2713  ret half %cast
2714}
2715
; Mixed-operand i16 saturating signed subtract, mirror of the sv variant:
; lhs is a regular argument (v0 in the checks below) and rhs is inreg (s0).
; The i16 result is bitcast to half and returned in v0.
;  * GFX6 checks: lhs is shifted left by 16 and the clamp bounds are computed
;    on the VALU; the shifted SGPR rhs feeds v_max_i32 directly, then
;    v_min_i32, the subtract and an arithmetic shift right by 16.
;  * GFX8 checks: 16-bit VALU max/add/min/sub with s0 used directly as a
;    v_max_i16 operand.
;  * GFX9 checks: one v_sub_i16 with clamp; the shared GFX10/GFX11 block
;    expects one v_sub_nc_i16 with clamp.
2716define amdgpu_ps half @ssubsat_i16_vs(i16 %lhs, i16 inreg %rhs) {
2717; GFX6-LABEL: ssubsat_i16_vs:
2718; GFX6:       ; %bb.0:
2719; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2720; GFX6-NEXT:    v_max_i32_e32 v1, -1, v0
2721; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
2722; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 0x80000001, v1
2723; GFX6-NEXT:    v_min_i32_e32 v2, -1, v0
2724; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 0x80000000, v2
2725; GFX6-NEXT:    v_max_i32_e32 v1, s0, v1
2726; GFX6-NEXT:    v_min_i32_e32 v1, v1, v2
2727; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
2728; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
2729; GFX6-NEXT:    ; return to shader part epilog
2730;
2731; GFX8-LABEL: ssubsat_i16_vs:
2732; GFX8:       ; %bb.0:
2733; GFX8-NEXT:    v_max_i16_e32 v1, -1, v0
2734; GFX8-NEXT:    v_add_u16_e32 v1, 0x8001, v1
2735; GFX8-NEXT:    v_min_i16_e32 v2, -1, v0
2736; GFX8-NEXT:    v_add_u16_e32 v2, 0x8000, v2
2737; GFX8-NEXT:    v_max_i16_e32 v1, s0, v1
2738; GFX8-NEXT:    v_min_i16_e32 v1, v1, v2
2739; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v1
2740; GFX8-NEXT:    ; return to shader part epilog
2741;
2742; GFX9-LABEL: ssubsat_i16_vs:
2743; GFX9:       ; %bb.0:
2744; GFX9-NEXT:    v_sub_i16 v0, v0, s0 clamp
2745; GFX9-NEXT:    ; return to shader part epilog
2746;
2747; GFX10PLUS-LABEL: ssubsat_i16_vs:
2748; GFX10PLUS:       ; %bb.0:
2749; GFX10PLUS-NEXT:    v_sub_nc_i16 v0, v0, s0 clamp
2750; GFX10PLUS-NEXT:    ; return to shader part epilog
2751  %result = call i16 @llvm.ssub.sat.i16(i16 %lhs, i16 %rhs)
2752  %cast = bitcast i16 %result to half
2753  ret half %cast
2754}
2755
; Saturating signed subtract (llvm.ssub.sat) on <2 x i16> with both operands
; in VGPRs.
;  * GFX6 checks: each element sits in its own VGPR (lhs in v0/v1, rhs in
;    v2/v3); every element is shifted left by 16, clamped and subtracted as
;    32-bit ops, then shifted back with v_ashrrev_i32.
;  * GFX8 checks: lhs is packed in v0 and rhs in v1; the low half uses plain
;    16-bit VALU ops, the high half uses SDWA forms selecting WORD_1, and the
;    two halves are merged with v_or_b32.
;  * GFX9 checks and the shared GFX10/GFX11 block: one v_pk_sub_i16 with the
;    clamp modifier.
2756define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
2757; GFX6-LABEL: v_ssubsat_v2i16:
2758; GFX6:       ; %bb.0:
2759; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2760; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2761; GFX6-NEXT:    v_max_i32_e32 v4, -1, v0
2762; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
2763; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x80000001, v4
2764; GFX6-NEXT:    v_min_i32_e32 v5, -1, v0
2765; GFX6-NEXT:    v_bfrev_b32_e32 v6, 1
2766; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
2767; GFX6-NEXT:    v_max_i32_e32 v2, v4, v2
2768; GFX6-NEXT:    v_min_i32_e32 v2, v2, v5
2769; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2770; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
2771; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
2772; GFX6-NEXT:    v_max_i32_e32 v3, -1, v1
2773; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 0x80000001, v3
2774; GFX6-NEXT:    v_min_i32_e32 v4, -1, v1
2775; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x80000000, v4
2776; GFX6-NEXT:    v_max_i32_e32 v2, v3, v2
2777; GFX6-NEXT:    v_min_i32_e32 v2, v2, v4
2778; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
2779; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
2780; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
2781; GFX6-NEXT:    s_setpc_b64 s[30:31]
2782;
2783; GFX8-LABEL: v_ssubsat_v2i16:
2784; GFX8:       ; %bb.0:
2785; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2786; GFX8-NEXT:    v_max_i16_e32 v2, -1, v0
2787; GFX8-NEXT:    v_add_u16_e32 v2, 0x8001, v2
2788; GFX8-NEXT:    v_min_i16_e32 v3, -1, v0
2789; GFX8-NEXT:    v_add_u16_e32 v3, 0x8000, v3
2790; GFX8-NEXT:    v_max_i16_e32 v2, v2, v1
2791; GFX8-NEXT:    v_min_i16_e32 v2, v2, v3
2792; GFX8-NEXT:    v_mov_b32_e32 v3, -1
2793; GFX8-NEXT:    v_max_i16_sdwa v4, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2794; GFX8-NEXT:    v_add_u16_e32 v4, 0x8001, v4
2795; GFX8-NEXT:    v_min_i16_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2796; GFX8-NEXT:    v_add_u16_e32 v3, 0x8000, v3
2797; GFX8-NEXT:    v_max_i16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2798; GFX8-NEXT:    v_min_i16_e32 v1, v1, v3
2799; GFX8-NEXT:    v_sub_u16_e32 v2, v0, v2
2800; GFX8-NEXT:    v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2801; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
2802; GFX8-NEXT:    s_setpc_b64 s[30:31]
2803;
2804; GFX9-LABEL: v_ssubsat_v2i16:
2805; GFX9:       ; %bb.0:
2806; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2807; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v1 clamp
2808; GFX9-NEXT:    s_setpc_b64 s[30:31]
2809;
2810; GFX10PLUS-LABEL: v_ssubsat_v2i16:
2811; GFX10PLUS:       ; %bb.0:
2812; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2813; GFX10PLUS-NEXT:    v_pk_sub_i16 v0, v0, v1 clamp
2814; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
2815  %result = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
2816  ret <2 x i16> %result
2817}
2818
; Saturating signed subtract (llvm.ssub.sat) on <2 x i16> in an amdgpu_ps
; function with both operands marked inreg. The <2 x i16> result is bitcast
; to i32 and returned in s0.
;  * GFX6 checks: each element sits in its own SGPR (lhs in s0/s1, rhs in
;    s2/s3); each is shifted left by 16, clamped, subtracted and shifted
;    back, then the two halves are masked with 0xffff and packed with
;    s_lshl_b32 / s_or_b32.
;  * GFX8 checks: lhs is packed in s0 and rhs in s1; the high halves are
;    extracted with s_lshr_b32, each half goes through the s_sext_i32_i16 /
;    s_max / s_min / s_addk_i32 sequence, and the results are masked and
;    packed into s0.
;  * GFX9 checks: rhs is copied to a VGPR, v_pk_sub_i16 with clamp does the
;    subtract, and v_readfirstlane_b32 moves the result back to s0.
;  * GFX10 and GFX11 share one check block: v_pk_sub_i16 with clamp takes
;    both SGPRs directly, followed by v_readfirstlane_b32.
2819define amdgpu_ps i32 @s_ssubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs) {
2820; GFX6-LABEL: s_ssubsat_v2i16:
2821; GFX6:       ; %bb.0:
2822; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
2823; GFX6-NEXT:    s_max_i32 s4, s0, -1
2824; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
2825; GFX6-NEXT:    s_add_i32 s4, s4, 0x80000001
2826; GFX6-NEXT:    s_min_i32 s5, s0, -1
2827; GFX6-NEXT:    s_add_i32 s5, s5, 0x80000000
2828; GFX6-NEXT:    s_max_i32 s2, s4, s2
2829; GFX6-NEXT:    s_min_i32 s2, s2, s5
2830; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
2831; GFX6-NEXT:    s_sub_i32 s0, s0, s2
2832; GFX6-NEXT:    s_lshl_b32 s2, s3, 16
2833; GFX6-NEXT:    s_max_i32 s3, s1, -1
2834; GFX6-NEXT:    s_add_i32 s3, s3, 0x80000001
2835; GFX6-NEXT:    s_min_i32 s4, s1, -1
2836; GFX6-NEXT:    s_add_i32 s4, s4, 0x80000000
2837; GFX6-NEXT:    s_max_i32 s2, s3, s2
2838; GFX6-NEXT:    s_min_i32 s2, s2, s4
2839; GFX6-NEXT:    s_sub_i32 s1, s1, s2
2840; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
2841; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
2842; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
2843; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
2844; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
2845; GFX6-NEXT:    s_or_b32 s0, s0, s1
2846; GFX6-NEXT:    ; return to shader part epilog
2847;
2848; GFX8-LABEL: s_ssubsat_v2i16:
2849; GFX8:       ; %bb.0:
2850; GFX8-NEXT:    s_sext_i32_i16 s4, s0
2851; GFX8-NEXT:    s_sext_i32_i16 s5, -1
2852; GFX8-NEXT:    s_max_i32 s6, s4, s5
2853; GFX8-NEXT:    s_addk_i32 s6, 0x8001
2854; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
2855; GFX8-NEXT:    s_min_i32 s4, s4, s5
2856; GFX8-NEXT:    s_sext_i32_i16 s6, s6
2857; GFX8-NEXT:    s_sext_i32_i16 s1, s1
2858; GFX8-NEXT:    s_addk_i32 s4, 0x8000
2859; GFX8-NEXT:    s_max_i32 s1, s6, s1
2860; GFX8-NEXT:    s_sext_i32_i16 s1, s1
2861; GFX8-NEXT:    s_sext_i32_i16 s4, s4
2862; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
2863; GFX8-NEXT:    s_min_i32 s1, s1, s4
2864; GFX8-NEXT:    s_sub_i32 s0, s0, s1
2865; GFX8-NEXT:    s_sext_i32_i16 s1, s2
2866; GFX8-NEXT:    s_max_i32 s4, s1, s5
2867; GFX8-NEXT:    s_addk_i32 s4, 0x8001
2868; GFX8-NEXT:    s_min_i32 s1, s1, s5
2869; GFX8-NEXT:    s_sext_i32_i16 s4, s4
2870; GFX8-NEXT:    s_sext_i32_i16 s3, s3
2871; GFX8-NEXT:    s_addk_i32 s1, 0x8000
2872; GFX8-NEXT:    s_max_i32 s3, s4, s3
2873; GFX8-NEXT:    s_sext_i32_i16 s3, s3
2874; GFX8-NEXT:    s_sext_i32_i16 s1, s1
2875; GFX8-NEXT:    s_min_i32 s1, s3, s1
2876; GFX8-NEXT:    s_sub_i32 s1, s2, s1
2877; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
2878; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
2879; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
2880; GFX8-NEXT:    s_or_b32 s0, s0, s1
2881; GFX8-NEXT:    ; return to shader part epilog
2882;
2883; GFX9-LABEL: s_ssubsat_v2i16:
2884; GFX9:       ; %bb.0:
2885; GFX9-NEXT:    v_mov_b32_e32 v0, s1
2886; GFX9-NEXT:    v_pk_sub_i16 v0, s0, v0 clamp
2887; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
2888; GFX9-NEXT:    ; return to shader part epilog
2889;
2890; GFX10PLUS-LABEL: s_ssubsat_v2i16:
2891; GFX10PLUS:       ; %bb.0:
2892; GFX10PLUS-NEXT:    v_pk_sub_i16 v0, s0, s1 clamp
2893; GFX10PLUS-NEXT:    v_readfirstlane_b32 s0, v0
2894; GFX10PLUS-NEXT:    ; return to shader part epilog
2895  %result = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
2896  %cast = bitcast <2 x i16> %result to i32
2897  ret i32 %cast
2898}
2899
; Signed saturating subtract of <2 x i16> in an amdgpu_ps function where only
; %lhs is marked inreg; the <2 x i16> result is bitcast to float for the return.
; In the expected output below %lhs appears in scalar registers (s0, plus s1 on
; GFX6) and %rhs in vector registers (v0, plus v1 on GFX6).
; GFX9 and GFX10PLUS expect a single v_pk_sub_i16 with the clamp modifier.
; The GFX6 and GFX8 checks instead expect, per element, a max/min clamp of the
; rhs followed by a plain subtract, then an OR that repacks the two halves.
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
2900define amdgpu_ps float @ssubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
2901; GFX6-LABEL: ssubsat_v2i16_sv:
2902; GFX6:       ; %bb.0:
2903; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
2904; GFX6-NEXT:    s_max_i32 s2, s0, -1
2905; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2906; GFX6-NEXT:    s_add_i32 s2, s2, 0x80000001
2907; GFX6-NEXT:    s_min_i32 s3, s0, -1
2908; GFX6-NEXT:    s_add_i32 s3, s3, 0x80000000
2909; GFX6-NEXT:    v_max_i32_e32 v0, s2, v0
2910; GFX6-NEXT:    v_min_i32_e32 v0, s3, v0
2911; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
2912; GFX6-NEXT:    s_lshl_b32 s0, s1, 16
2913; GFX6-NEXT:    s_max_i32 s1, s0, -1
2914; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2915; GFX6-NEXT:    s_add_i32 s1, s1, 0x80000001
2916; GFX6-NEXT:    s_min_i32 s2, s0, -1
2917; GFX6-NEXT:    s_add_i32 s2, s2, 0x80000000
2918; GFX6-NEXT:    v_max_i32_e32 v1, s1, v1
2919; GFX6-NEXT:    v_min_i32_e32 v1, s2, v1
2920; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s0, v1
2921; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
2922; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
2923; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
2924; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
2925; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2926; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
2927; GFX6-NEXT:    ; return to shader part epilog
2928;
2929; GFX8-LABEL: ssubsat_v2i16_sv:
2930; GFX8:       ; %bb.0:
2931; GFX8-NEXT:    s_sext_i32_i16 s2, s0
2932; GFX8-NEXT:    s_sext_i32_i16 s3, -1
2933; GFX8-NEXT:    s_max_i32 s4, s2, s3
2934; GFX8-NEXT:    s_addk_i32 s4, 0x8001
2935; GFX8-NEXT:    s_min_i32 s2, s2, s3
2936; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
2937; GFX8-NEXT:    s_addk_i32 s2, 0x8000
2938; GFX8-NEXT:    v_max_i16_e32 v1, s4, v0
2939; GFX8-NEXT:    v_min_i16_e32 v1, s2, v1
2940; GFX8-NEXT:    s_sext_i32_i16 s2, s1
2941; GFX8-NEXT:    s_max_i32 s4, s2, s3
2942; GFX8-NEXT:    s_addk_i32 s4, 0x8001
2943; GFX8-NEXT:    s_min_i32 s2, s2, s3
2944; GFX8-NEXT:    v_mov_b32_e32 v2, s4
2945; GFX8-NEXT:    s_addk_i32 s2, 0x8000
2946; GFX8-NEXT:    v_max_i16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2947; GFX8-NEXT:    v_min_i16_e32 v0, s2, v0
2948; GFX8-NEXT:    v_mov_b32_e32 v2, s1
2949; GFX8-NEXT:    v_sub_u16_e32 v1, s0, v1
2950; GFX8-NEXT:    v_sub_u16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2951; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
2952; GFX8-NEXT:    ; return to shader part epilog
2953;
2954; GFX9-LABEL: ssubsat_v2i16_sv:
2955; GFX9:       ; %bb.0:
2956; GFX9-NEXT:    v_pk_sub_i16 v0, s0, v0 clamp
2957; GFX9-NEXT:    ; return to shader part epilog
2958;
2959; GFX10PLUS-LABEL: ssubsat_v2i16_sv:
2960; GFX10PLUS:       ; %bb.0:
2961; GFX10PLUS-NEXT:    v_pk_sub_i16 v0, s0, v0 clamp
2962; GFX10PLUS-NEXT:    ; return to shader part epilog
; IR under test: one intrinsic call; the bitcast only repackages the 32-bit
; <2 x i16> result as the float return value.
2963  %result = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
2964  %cast = bitcast <2 x i16> %result to float
2965  ret float %cast
2966}
2967
; Signed saturating subtract of <2 x i16> in an amdgpu_ps function where only
; %rhs is marked inreg (the mirror image of a scalar-lhs/vector-rhs test); the
; <2 x i16> result is bitcast to float for the return.
; In the expected output below %lhs appears in vector registers (v0, plus v1 on
; GFX6) and %rhs in scalar registers (s0, plus s1 on GFX6).
; GFX9 and GFX10PLUS expect a single v_pk_sub_i16 with the clamp modifier and
; s0 as the second source. The GFX6 and GFX8 checks expect a per-element
; max/min clamp of the rhs, a plain subtract, and an OR to repack the halves.
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
2968define amdgpu_ps float @ssubsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
2969; GFX6-LABEL: ssubsat_v2i16_vs:
2970; GFX6:       ; %bb.0:
2971; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2972; GFX6-NEXT:    v_max_i32_e32 v2, -1, v0
2973; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
2974; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 0x80000001, v2
2975; GFX6-NEXT:    v_min_i32_e32 v4, -1, v0
2976; GFX6-NEXT:    v_bfrev_b32_e32 v5, 1
2977; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
2978; GFX6-NEXT:    v_max_i32_e32 v2, s0, v2
2979; GFX6-NEXT:    v_min_i32_e32 v2, v2, v4
2980; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2981; GFX6-NEXT:    v_mov_b32_e32 v3, 0x80000001
2982; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
2983; GFX6-NEXT:    v_max_i32_e32 v2, -1, v1
2984; GFX6-NEXT:    s_lshl_b32 s0, s1, 16
2985; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
2986; GFX6-NEXT:    v_min_i32_e32 v3, -1, v1
2987; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 0x80000000, v3
2988; GFX6-NEXT:    v_max_i32_e32 v2, s0, v2
2989; GFX6-NEXT:    v_min_i32_e32 v2, v2, v3
2990; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
2991; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
2992; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
2993; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
2994; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
2995; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2996; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
2997; GFX6-NEXT:    ; return to shader part epilog
2998;
2999; GFX8-LABEL: ssubsat_v2i16_vs:
3000; GFX8:       ; %bb.0:
3001; GFX8-NEXT:    v_max_i16_e32 v1, -1, v0
3002; GFX8-NEXT:    v_add_u16_e32 v1, 0x8001, v1
3003; GFX8-NEXT:    v_min_i16_e32 v2, -1, v0
3004; GFX8-NEXT:    v_add_u16_e32 v2, 0x8000, v2
3005; GFX8-NEXT:    v_max_i16_e32 v1, s0, v1
3006; GFX8-NEXT:    v_min_i16_e32 v1, v1, v2
3007; GFX8-NEXT:    v_mov_b32_e32 v2, -1
3008; GFX8-NEXT:    v_max_i16_sdwa v3, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3009; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
3010; GFX8-NEXT:    v_add_u16_e32 v3, 0x8001, v3
3011; GFX8-NEXT:    v_min_i16_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3012; GFX8-NEXT:    v_add_u16_e32 v2, 0x8000, v2
3013; GFX8-NEXT:    v_max_i16_e32 v3, s1, v3
3014; GFX8-NEXT:    v_min_i16_e32 v2, v3, v2
3015; GFX8-NEXT:    v_sub_u16_e32 v1, v0, v1
3016; GFX8-NEXT:    v_sub_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3017; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
3018; GFX8-NEXT:    ; return to shader part epilog
3019;
3020; GFX9-LABEL: ssubsat_v2i16_vs:
3021; GFX9:       ; %bb.0:
3022; GFX9-NEXT:    v_pk_sub_i16 v0, v0, s0 clamp
3023; GFX9-NEXT:    ; return to shader part epilog
3024;
3025; GFX10PLUS-LABEL: ssubsat_v2i16_vs:
3026; GFX10PLUS:       ; %bb.0:
3027; GFX10PLUS-NEXT:    v_pk_sub_i16 v0, v0, s0 clamp
3028; GFX10PLUS-NEXT:    ; return to shader part epilog
; IR under test: one intrinsic call; the bitcast only repackages the 32-bit
; <2 x i16> result as the float return value.
3029  %result = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
3030  %cast = bitcast <2 x i16> %result to float
3031  ret float %cast
3032}
3033
3034; FIXME: v3i16 insert/extract
3035; define <3 x i16> @v_ssubsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
3036;   %result = call <3 x i16> @llvm.ssub.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs)
3037;   ret <3 x i16> %result
3038; }
3039
3040; define amdgpu_ps <3 x i16> @s_ssubsat_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs) {
3041;   %result = call <3 x i16> @llvm.ssub.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs)
3042;   ret <3 x i16> %result
3043; }
3044
; Signed saturating subtract of <4 x i16> in a function with no explicit
; calling convention and no inreg arguments; the <4 x i16> result is bitcast to
; <2 x float> for the return. Every expected sequence below starts with
; s_waitcnt and returns through s_setpc_b64 s[30:31].
; GFX9 and GFX10PLUS expect two v_pk_sub_i16 instructions with the clamp
; modifier (v0 with v2, v1 with v3). The GFX6 checks handle each of the four
; elements in its own 32-bit register (lhs in v0-v3, rhs in v4-v7), shifted
; left by 16 before the max/min clamp and subtract; the GFX8 checks work on
; 16-bit halves of v0-v3 using SDWA WORD_1 selects for the high halves.
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
3045define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
3046; GFX6-LABEL: v_ssubsat_v4i16:
3047; GFX6:       ; %bb.0:
3048; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3049; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
3050; GFX6-NEXT:    v_max_i32_e32 v8, -1, v0
3051; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
3052; GFX6-NEXT:    v_add_i32_e32 v8, vcc, 0x80000001, v8
3053; GFX6-NEXT:    v_min_i32_e32 v10, -1, v0
3054; GFX6-NEXT:    v_bfrev_b32_e32 v11, 1
3055; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
3056; GFX6-NEXT:    v_max_i32_e32 v4, v8, v4
3057; GFX6-NEXT:    v_min_i32_e32 v4, v4, v10
3058; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
3059; GFX6-NEXT:    v_mov_b32_e32 v9, 0x80000001
3060; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
3061; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
3062; GFX6-NEXT:    v_max_i32_e32 v5, -1, v1
3063; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
3064; GFX6-NEXT:    v_min_i32_e32 v8, -1, v1
3065; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
3066; GFX6-NEXT:    v_max_i32_e32 v4, v5, v4
3067; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
3068; GFX6-NEXT:    v_min_i32_e32 v4, v4, v8
3069; GFX6-NEXT:    v_max_i32_e32 v5, -1, v2
3070; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v4
3071; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
3072; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
3073; GFX6-NEXT:    v_min_i32_e32 v6, -1, v2
3074; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v11
3075; GFX6-NEXT:    v_max_i32_e32 v4, v5, v4
3076; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
3077; GFX6-NEXT:    v_min_i32_e32 v4, v4, v6
3078; GFX6-NEXT:    v_max_i32_e32 v5, -1, v3
3079; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v4
3080; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
3081; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
3082; GFX6-NEXT:    v_min_i32_e32 v6, -1, v3
3083; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v11
3084; GFX6-NEXT:    v_max_i32_e32 v4, v5, v4
3085; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
3086; GFX6-NEXT:    v_min_i32_e32 v4, v4, v6
3087; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
3088; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v4
3089; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
3090; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
3091; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 16, v3
3092; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
3093; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
3094; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
3095; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v2
3096; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v3
3097; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
3098; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
3099; GFX6-NEXT:    s_setpc_b64 s[30:31]
3100;
3101; GFX8-LABEL: v_ssubsat_v4i16:
3102; GFX8:       ; %bb.0:
3103; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3104; GFX8-NEXT:    v_max_i16_e32 v4, -1, v0
3105; GFX8-NEXT:    v_add_u16_e32 v4, 0x8001, v4
3106; GFX8-NEXT:    v_min_i16_e32 v5, -1, v0
3107; GFX8-NEXT:    v_add_u16_e32 v5, 0x8000, v5
3108; GFX8-NEXT:    v_max_i16_e32 v4, v4, v2
3109; GFX8-NEXT:    v_min_i16_e32 v4, v4, v5
3110; GFX8-NEXT:    v_mov_b32_e32 v5, -1
3111; GFX8-NEXT:    v_max_i16_sdwa v6, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3112; GFX8-NEXT:    v_add_u16_e32 v6, 0x8001, v6
3113; GFX8-NEXT:    v_min_i16_sdwa v7, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3114; GFX8-NEXT:    v_add_u16_e32 v7, 0x8000, v7
3115; GFX8-NEXT:    v_max_i16_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3116; GFX8-NEXT:    v_max_i16_e32 v6, -1, v1
3117; GFX8-NEXT:    v_min_i16_e32 v2, v2, v7
3118; GFX8-NEXT:    v_add_u16_e32 v6, 0x8001, v6
3119; GFX8-NEXT:    v_min_i16_e32 v7, -1, v1
3120; GFX8-NEXT:    v_add_u16_e32 v7, 0x8000, v7
3121; GFX8-NEXT:    v_max_i16_e32 v6, v6, v3
3122; GFX8-NEXT:    v_min_i16_e32 v6, v6, v7
3123; GFX8-NEXT:    v_max_i16_sdwa v7, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3124; GFX8-NEXT:    v_add_u16_e32 v7, 0x8001, v7
3125; GFX8-NEXT:    v_min_i16_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3126; GFX8-NEXT:    v_add_u16_e32 v5, 0x8000, v5
3127; GFX8-NEXT:    v_max_i16_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3128; GFX8-NEXT:    v_min_i16_e32 v3, v3, v5
3129; GFX8-NEXT:    v_sub_u16_e32 v4, v0, v4
3130; GFX8-NEXT:    v_sub_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3131; GFX8-NEXT:    v_sub_u16_e32 v2, v1, v6
3132; GFX8-NEXT:    v_sub_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3133; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
3134; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
3135; GFX8-NEXT:    s_setpc_b64 s[30:31]
3136;
3137; GFX9-LABEL: v_ssubsat_v4i16:
3138; GFX9:       ; %bb.0:
3139; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3140; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v2 clamp
3141; GFX9-NEXT:    v_pk_sub_i16 v1, v1, v3 clamp
3142; GFX9-NEXT:    s_setpc_b64 s[30:31]
3143;
3144; GFX10PLUS-LABEL: v_ssubsat_v4i16:
3145; GFX10PLUS:       ; %bb.0:
3146; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3147; GFX10PLUS-NEXT:    v_pk_sub_i16 v0, v0, v2 clamp
3148; GFX10PLUS-NEXT:    v_pk_sub_i16 v1, v1, v3 clamp
3149; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
; IR under test: one intrinsic call; the bitcast only repackages the 64-bit
; <4 x i16> result as the <2 x float> return value.
3150  %result = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
3151  %cast = bitcast <4 x i16> %result to <2 x float>
3152  ret <2 x float> %cast
3153}
3154
; Signed saturating subtract of <4 x i16> in an amdgpu_ps function where both
; operands are marked inreg; the <4 x i16> result is bitcast to <2 x i32> for
; the return. In the expected output below both operands appear in scalar
; registers and the result is left in s0/s1.
; The GFX6 and GFX8 checks stay entirely in scalar instructions (s_max_i32 /
; s_min_i32 clamp of the rhs, s_sub_i32, then and/shift/or to repack).
; GFX9 and GFX10PLUS expect two v_pk_sub_i16 instructions with the clamp
; modifier followed by v_readfirstlane_b32 to move each result back to a
; scalar register; the GFX9 checks first copy the rhs (s2, s3) into v0/v1 with
; v_mov_b32, while the GFX10PLUS checks pass both scalar operands directly.
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
3155define amdgpu_ps <2 x i32> @s_ssubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %rhs) {
3156; GFX6-LABEL: s_ssubsat_v4i16:
3157; GFX6:       ; %bb.0:
3158; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
3159; GFX6-NEXT:    s_max_i32 s8, s0, -1
3160; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
3161; GFX6-NEXT:    s_add_i32 s8, s8, 0x80000001
3162; GFX6-NEXT:    s_min_i32 s9, s0, -1
3163; GFX6-NEXT:    s_add_i32 s9, s9, 0x80000000
3164; GFX6-NEXT:    s_max_i32 s4, s8, s4
3165; GFX6-NEXT:    s_min_i32 s4, s4, s9
3166; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
3167; GFX6-NEXT:    s_sub_i32 s0, s0, s4
3168; GFX6-NEXT:    s_lshl_b32 s4, s5, 16
3169; GFX6-NEXT:    s_max_i32 s5, s1, -1
3170; GFX6-NEXT:    s_add_i32 s5, s5, 0x80000001
3171; GFX6-NEXT:    s_min_i32 s8, s1, -1
3172; GFX6-NEXT:    s_add_i32 s8, s8, 0x80000000
3173; GFX6-NEXT:    s_max_i32 s4, s5, s4
3174; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
3175; GFX6-NEXT:    s_min_i32 s4, s4, s8
3176; GFX6-NEXT:    s_max_i32 s5, s2, -1
3177; GFX6-NEXT:    s_sub_i32 s1, s1, s4
3178; GFX6-NEXT:    s_lshl_b32 s4, s6, 16
3179; GFX6-NEXT:    s_add_i32 s5, s5, 0x80000001
3180; GFX6-NEXT:    s_min_i32 s6, s2, -1
3181; GFX6-NEXT:    s_add_i32 s6, s6, 0x80000000
3182; GFX6-NEXT:    s_max_i32 s4, s5, s4
3183; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
3184; GFX6-NEXT:    s_min_i32 s4, s4, s6
3185; GFX6-NEXT:    s_max_i32 s5, s3, -1
3186; GFX6-NEXT:    s_sub_i32 s2, s2, s4
3187; GFX6-NEXT:    s_lshl_b32 s4, s7, 16
3188; GFX6-NEXT:    s_add_i32 s5, s5, 0x80000001
3189; GFX6-NEXT:    s_min_i32 s6, s3, -1
3190; GFX6-NEXT:    s_add_i32 s6, s6, 0x80000000
3191; GFX6-NEXT:    s_max_i32 s4, s5, s4
3192; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
3193; GFX6-NEXT:    s_min_i32 s4, s4, s6
3194; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
3195; GFX6-NEXT:    s_sub_i32 s3, s3, s4
3196; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
3197; GFX6-NEXT:    s_ashr_i32 s2, s2, 16
3198; GFX6-NEXT:    s_ashr_i32 s3, s3, 16
3199; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
3200; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
3201; GFX6-NEXT:    s_or_b32 s0, s0, s1
3202; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
3203; GFX6-NEXT:    s_and_b32 s2, s3, 0xffff
3204; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
3205; GFX6-NEXT:    s_or_b32 s1, s1, s2
3206; GFX6-NEXT:    ; return to shader part epilog
3207;
3208; GFX8-LABEL: s_ssubsat_v4i16:
3209; GFX8:       ; %bb.0:
3210; GFX8-NEXT:    s_sext_i32_i16 s8, s0
3211; GFX8-NEXT:    s_sext_i32_i16 s9, -1
3212; GFX8-NEXT:    s_max_i32 s10, s8, s9
3213; GFX8-NEXT:    s_addk_i32 s10, 0x8001
3214; GFX8-NEXT:    s_lshr_b32 s6, s2, 16
3215; GFX8-NEXT:    s_min_i32 s8, s8, s9
3216; GFX8-NEXT:    s_sext_i32_i16 s10, s10
3217; GFX8-NEXT:    s_sext_i32_i16 s2, s2
3218; GFX8-NEXT:    s_addk_i32 s8, 0x8000
3219; GFX8-NEXT:    s_max_i32 s2, s10, s2
3220; GFX8-NEXT:    s_sext_i32_i16 s2, s2
3221; GFX8-NEXT:    s_sext_i32_i16 s8, s8
3222; GFX8-NEXT:    s_lshr_b32 s4, s0, 16
3223; GFX8-NEXT:    s_min_i32 s2, s2, s8
3224; GFX8-NEXT:    s_sub_i32 s0, s0, s2
3225; GFX8-NEXT:    s_sext_i32_i16 s2, s4
3226; GFX8-NEXT:    s_max_i32 s8, s2, s9
3227; GFX8-NEXT:    s_addk_i32 s8, 0x8001
3228; GFX8-NEXT:    s_min_i32 s2, s2, s9
3229; GFX8-NEXT:    s_sext_i32_i16 s8, s8
3230; GFX8-NEXT:    s_sext_i32_i16 s6, s6
3231; GFX8-NEXT:    s_addk_i32 s2, 0x8000
3232; GFX8-NEXT:    s_max_i32 s6, s8, s6
3233; GFX8-NEXT:    s_sext_i32_i16 s6, s6
3234; GFX8-NEXT:    s_sext_i32_i16 s2, s2
3235; GFX8-NEXT:    s_min_i32 s2, s6, s2
3236; GFX8-NEXT:    s_sub_i32 s2, s4, s2
3237; GFX8-NEXT:    s_sext_i32_i16 s4, s1
3238; GFX8-NEXT:    s_max_i32 s6, s4, s9
3239; GFX8-NEXT:    s_addk_i32 s6, 0x8001
3240; GFX8-NEXT:    s_lshr_b32 s7, s3, 16
3241; GFX8-NEXT:    s_min_i32 s4, s4, s9
3242; GFX8-NEXT:    s_sext_i32_i16 s6, s6
3243; GFX8-NEXT:    s_sext_i32_i16 s3, s3
3244; GFX8-NEXT:    s_addk_i32 s4, 0x8000
3245; GFX8-NEXT:    s_max_i32 s3, s6, s3
3246; GFX8-NEXT:    s_sext_i32_i16 s3, s3
3247; GFX8-NEXT:    s_sext_i32_i16 s4, s4
3248; GFX8-NEXT:    s_lshr_b32 s5, s1, 16
3249; GFX8-NEXT:    s_min_i32 s3, s3, s4
3250; GFX8-NEXT:    s_sub_i32 s1, s1, s3
3251; GFX8-NEXT:    s_sext_i32_i16 s3, s5
3252; GFX8-NEXT:    s_max_i32 s4, s3, s9
3253; GFX8-NEXT:    s_addk_i32 s4, 0x8001
3254; GFX8-NEXT:    s_min_i32 s3, s3, s9
3255; GFX8-NEXT:    s_sext_i32_i16 s4, s4
3256; GFX8-NEXT:    s_sext_i32_i16 s6, s7
3257; GFX8-NEXT:    s_addk_i32 s3, 0x8000
3258; GFX8-NEXT:    s_max_i32 s4, s4, s6
3259; GFX8-NEXT:    s_sext_i32_i16 s4, s4
3260; GFX8-NEXT:    s_sext_i32_i16 s3, s3
3261; GFX8-NEXT:    s_min_i32 s3, s4, s3
3262; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
3263; GFX8-NEXT:    s_sub_i32 s3, s5, s3
3264; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
3265; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
3266; GFX8-NEXT:    s_or_b32 s0, s0, s2
3267; GFX8-NEXT:    s_and_b32 s2, 0xffff, s3
3268; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
3269; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
3270; GFX8-NEXT:    s_or_b32 s1, s1, s2
3271; GFX8-NEXT:    ; return to shader part epilog
3272;
3273; GFX9-LABEL: s_ssubsat_v4i16:
3274; GFX9:       ; %bb.0:
3275; GFX9-NEXT:    v_mov_b32_e32 v0, s2
3276; GFX9-NEXT:    v_mov_b32_e32 v1, s3
3277; GFX9-NEXT:    v_pk_sub_i16 v0, s0, v0 clamp
3278; GFX9-NEXT:    v_pk_sub_i16 v1, s1, v1 clamp
3279; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
3280; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
3281; GFX9-NEXT:    ; return to shader part epilog
3282;
3283; GFX10PLUS-LABEL: s_ssubsat_v4i16:
3284; GFX10PLUS:       ; %bb.0:
3285; GFX10PLUS-NEXT:    v_pk_sub_i16 v0, s0, s2 clamp
3286; GFX10PLUS-NEXT:    v_pk_sub_i16 v1, s1, s3 clamp
3287; GFX10PLUS-NEXT:    v_readfirstlane_b32 s0, v0
3288; GFX10PLUS-NEXT:    v_readfirstlane_b32 s1, v1
3289; GFX10PLUS-NEXT:    ; return to shader part epilog
; IR under test: one intrinsic call; the bitcast only repackages the 64-bit
; <4 x i16> result as the <2 x i32> return value.
3290  %result = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
3291  %cast = bitcast <4 x i16> %result to <2 x i32>
3292  ret <2 x i32> %cast
3293}
3294
3295; FIXME: the v5i16 tests below are commented out; re-enable them once they work.
3296; define <5 x i16> @v_ssubsat_v5i16(<5 x i16> %lhs, <5 x i16> %rhs) {
3297;   %result = call <5 x i16> @llvm.ssub.sat.v5i16(<5 x i16> %lhs, <5 x i16> %rhs)
3298;   ret <5 x i16> %result
3299; }
3300
3301; define amdgpu_ps <5 x i16> @s_ssubsat_v5i16(<5 x i16> inreg %lhs, <5 x i16> inreg %rhs) {
3302;   %result = call <5 x i16> @llvm.ssub.sat.v5i16(<5 x i16> %lhs, <5 x i16> %rhs)
3303;   ret <5 x i16> %result
3304; }
3305
; Signed saturating subtract of <6 x i16> in a function with no explicit
; calling convention and no inreg arguments; the <6 x i16> result is bitcast to
; <3 x float> for the return. Every expected sequence below starts with
; s_waitcnt and returns through s_setpc_b64 s[30:31].
; GFX9 and GFX10PLUS expect three v_pk_sub_i16 instructions with the clamp
; modifier (v0 with v3, v1 with v4, v2 with v5). The GFX6 checks handle each of
; the six elements in its own 32-bit register (lhs in v0-v5, rhs in v6-v11),
; shifted left by 16 before the max/min clamp and subtract; the GFX8 checks
; work on 16-bit halves of v0-v5 using SDWA WORD_1 selects for the high halves.
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
3306define <3 x float> @v_ssubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
3307; GFX6-LABEL: v_ssubsat_v6i16:
3308; GFX6:       ; %bb.0:
3309; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3310; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
3311; GFX6-NEXT:    v_max_i32_e32 v12, -1, v0
3312; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
3313; GFX6-NEXT:    v_add_i32_e32 v12, vcc, 0x80000001, v12
3314; GFX6-NEXT:    v_min_i32_e32 v14, -1, v0
3315; GFX6-NEXT:    v_bfrev_b32_e32 v15, 1
3316; GFX6-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
3317; GFX6-NEXT:    v_max_i32_e32 v6, v12, v6
3318; GFX6-NEXT:    v_min_i32_e32 v6, v6, v14
3319; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
3320; GFX6-NEXT:    v_mov_b32_e32 v13, 0x80000001
3321; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
3322; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
3323; GFX6-NEXT:    v_max_i32_e32 v7, -1, v1
3324; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v7, v13
3325; GFX6-NEXT:    v_min_i32_e32 v12, -1, v1
3326; GFX6-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
3327; GFX6-NEXT:    v_max_i32_e32 v6, v7, v6
3328; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
3329; GFX6-NEXT:    v_min_i32_e32 v6, v6, v12
3330; GFX6-NEXT:    v_max_i32_e32 v7, -1, v2
3331; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v6
3332; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v8
3333; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v7, v13
3334; GFX6-NEXT:    v_min_i32_e32 v8, -1, v2
3335; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v8, v15
3336; GFX6-NEXT:    v_max_i32_e32 v6, v7, v6
3337; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
3338; GFX6-NEXT:    v_min_i32_e32 v6, v6, v8
3339; GFX6-NEXT:    v_max_i32_e32 v7, -1, v3
3340; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v6
3341; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v9
3342; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v7, v13
3343; GFX6-NEXT:    v_min_i32_e32 v8, -1, v3
3344; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v8, v15
3345; GFX6-NEXT:    v_max_i32_e32 v6, v7, v6
3346; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
3347; GFX6-NEXT:    v_min_i32_e32 v6, v6, v8
3348; GFX6-NEXT:    v_max_i32_e32 v7, -1, v4
3349; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v6
3350; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v10
3351; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v7, v13
3352; GFX6-NEXT:    v_min_i32_e32 v8, -1, v4
3353; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v8, v15
3354; GFX6-NEXT:    v_max_i32_e32 v6, v7, v6
3355; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
3356; GFX6-NEXT:    v_min_i32_e32 v6, v6, v8
3357; GFX6-NEXT:    v_max_i32_e32 v7, -1, v5
3358; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v4, v6
3359; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v11
3360; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v7, v13
3361; GFX6-NEXT:    v_min_i32_e32 v8, -1, v5
3362; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
3363; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v8, v15
3364; GFX6-NEXT:    v_max_i32_e32 v6, v7, v6
3365; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
3366; GFX6-NEXT:    v_min_i32_e32 v6, v6, v8
3367; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
3368; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
3369; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 16, v3
3370; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v6
3371; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
3372; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
3373; GFX6-NEXT:    v_ashrrev_i32_e32 v5, 16, v5
3374; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
3375; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v2
3376; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v3
3377; GFX6-NEXT:    v_ashrrev_i32_e32 v4, 16, v4
3378; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
3379; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v5
3380; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
3381; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v4
3382; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
3383; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
3384; GFX6-NEXT:    s_setpc_b64 s[30:31]
3385;
3386; GFX8-LABEL: v_ssubsat_v6i16:
3387; GFX8:       ; %bb.0:
3388; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3389; GFX8-NEXT:    v_max_i16_e32 v6, -1, v0
3390; GFX8-NEXT:    v_add_u16_e32 v6, 0x8001, v6
3391; GFX8-NEXT:    v_min_i16_e32 v7, -1, v0
3392; GFX8-NEXT:    v_add_u16_e32 v7, 0x8000, v7
3393; GFX8-NEXT:    v_max_i16_e32 v6, v6, v3
3394; GFX8-NEXT:    v_min_i16_e32 v6, v6, v7
3395; GFX8-NEXT:    v_mov_b32_e32 v7, -1
3396; GFX8-NEXT:    v_max_i16_sdwa v8, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3397; GFX8-NEXT:    v_add_u16_e32 v8, 0x8001, v8
3398; GFX8-NEXT:    v_min_i16_sdwa v9, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3399; GFX8-NEXT:    v_add_u16_e32 v9, 0x8000, v9
3400; GFX8-NEXT:    v_max_i16_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3401; GFX8-NEXT:    v_max_i16_e32 v8, -1, v1
3402; GFX8-NEXT:    v_min_i16_e32 v3, v3, v9
3403; GFX8-NEXT:    v_add_u16_e32 v8, 0x8001, v8
3404; GFX8-NEXT:    v_min_i16_e32 v9, -1, v1
3405; GFX8-NEXT:    v_add_u16_e32 v9, 0x8000, v9
3406; GFX8-NEXT:    v_max_i16_e32 v8, v8, v4
3407; GFX8-NEXT:    v_min_i16_e32 v8, v8, v9
3408; GFX8-NEXT:    v_max_i16_sdwa v9, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3409; GFX8-NEXT:    v_add_u16_e32 v9, 0x8001, v9
3410; GFX8-NEXT:    v_min_i16_sdwa v10, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3411; GFX8-NEXT:    v_add_u16_e32 v10, 0x8000, v10
3412; GFX8-NEXT:    v_max_i16_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3413; GFX8-NEXT:    v_max_i16_e32 v9, -1, v2
3414; GFX8-NEXT:    v_min_i16_e32 v4, v4, v10
3415; GFX8-NEXT:    v_add_u16_e32 v9, 0x8001, v9
3416; GFX8-NEXT:    v_min_i16_e32 v10, -1, v2
3417; GFX8-NEXT:    v_add_u16_e32 v10, 0x8000, v10
3418; GFX8-NEXT:    v_max_i16_e32 v9, v9, v5
3419; GFX8-NEXT:    v_min_i16_e32 v9, v9, v10
3420; GFX8-NEXT:    v_max_i16_sdwa v10, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3421; GFX8-NEXT:    v_add_u16_e32 v10, 0x8001, v10
3422; GFX8-NEXT:    v_min_i16_sdwa v7, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3423; GFX8-NEXT:    v_add_u16_e32 v7, 0x8000, v7
3424; GFX8-NEXT:    v_max_i16_sdwa v5, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3425; GFX8-NEXT:    v_min_i16_e32 v5, v5, v7
3426; GFX8-NEXT:    v_sub_u16_e32 v6, v0, v6
3427; GFX8-NEXT:    v_sub_u16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3428; GFX8-NEXT:    v_sub_u16_e32 v3, v1, v8
3429; GFX8-NEXT:    v_sub_u16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3430; GFX8-NEXT:    v_or_b32_e32 v1, v3, v1
3431; GFX8-NEXT:    v_sub_u16_e32 v3, v2, v9
3432; GFX8-NEXT:    v_sub_u16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3433; GFX8-NEXT:    v_or_b32_e32 v0, v6, v0
3434; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
3435; GFX8-NEXT:    s_setpc_b64 s[30:31]
3436;
3437; GFX9-LABEL: v_ssubsat_v6i16:
3438; GFX9:       ; %bb.0:
3439; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3440; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v3 clamp
3441; GFX9-NEXT:    v_pk_sub_i16 v1, v1, v4 clamp
3442; GFX9-NEXT:    v_pk_sub_i16 v2, v2, v5 clamp
3443; GFX9-NEXT:    s_setpc_b64 s[30:31]
3444;
3445; GFX10PLUS-LABEL: v_ssubsat_v6i16:
3446; GFX10PLUS:       ; %bb.0:
3447; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3448; GFX10PLUS-NEXT:    v_pk_sub_i16 v0, v0, v3 clamp
3449; GFX10PLUS-NEXT:    v_pk_sub_i16 v1, v1, v4 clamp
3450; GFX10PLUS-NEXT:    v_pk_sub_i16 v2, v2, v5 clamp
3451; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
; IR under test: one intrinsic call; the bitcast only repackages the 96-bit
; <6 x i16> result as the <3 x float> return value.
3452  %result = call <6 x i16> @llvm.ssub.sat.v6i16(<6 x i16> %lhs, <6 x i16> %rhs)
3453  %cast = bitcast <6 x i16> %result to <3 x float>
3454  ret <3 x float> %cast
3455}
3456
3457define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inreg %rhs) {
3458; GFX6-LABEL: s_ssubsat_v6i16:
3459; GFX6:       ; %bb.0:
3460; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
3461; GFX6-NEXT:    s_max_i32 s12, s0, -1
3462; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
3463; GFX6-NEXT:    s_add_i32 s12, s12, 0x80000001
3464; GFX6-NEXT:    s_min_i32 s13, s0, -1
3465; GFX6-NEXT:    s_add_i32 s13, s13, 0x80000000
3466; GFX6-NEXT:    s_max_i32 s6, s12, s6
3467; GFX6-NEXT:    s_min_i32 s6, s6, s13
3468; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
3469; GFX6-NEXT:    s_sub_i32 s0, s0, s6
3470; GFX6-NEXT:    s_lshl_b32 s6, s7, 16
3471; GFX6-NEXT:    s_max_i32 s7, s1, -1
3472; GFX6-NEXT:    s_add_i32 s7, s7, 0x80000001
3473; GFX6-NEXT:    s_min_i32 s12, s1, -1
3474; GFX6-NEXT:    s_add_i32 s12, s12, 0x80000000
3475; GFX6-NEXT:    s_max_i32 s6, s7, s6
3476; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
3477; GFX6-NEXT:    s_min_i32 s6, s6, s12
3478; GFX6-NEXT:    s_max_i32 s7, s2, -1
3479; GFX6-NEXT:    s_sub_i32 s1, s1, s6
3480; GFX6-NEXT:    s_lshl_b32 s6, s8, 16
3481; GFX6-NEXT:    s_add_i32 s7, s7, 0x80000001
3482; GFX6-NEXT:    s_min_i32 s8, s2, -1
3483; GFX6-NEXT:    s_add_i32 s8, s8, 0x80000000
3484; GFX6-NEXT:    s_max_i32 s6, s7, s6
3485; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
3486; GFX6-NEXT:    s_min_i32 s6, s6, s8
3487; GFX6-NEXT:    s_max_i32 s7, s3, -1
3488; GFX6-NEXT:    s_sub_i32 s2, s2, s6
3489; GFX6-NEXT:    s_lshl_b32 s6, s9, 16
3490; GFX6-NEXT:    s_add_i32 s7, s7, 0x80000001
3491; GFX6-NEXT:    s_min_i32 s8, s3, -1
3492; GFX6-NEXT:    s_add_i32 s8, s8, 0x80000000
3493; GFX6-NEXT:    s_max_i32 s6, s7, s6
3494; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
3495; GFX6-NEXT:    s_min_i32 s6, s6, s8
3496; GFX6-NEXT:    s_max_i32 s7, s4, -1
3497; GFX6-NEXT:    s_sub_i32 s3, s3, s6
3498; GFX6-NEXT:    s_lshl_b32 s6, s10, 16
3499; GFX6-NEXT:    s_add_i32 s7, s7, 0x80000001
3500; GFX6-NEXT:    s_min_i32 s8, s4, -1
3501; GFX6-NEXT:    s_add_i32 s8, s8, 0x80000000
3502; GFX6-NEXT:    s_max_i32 s6, s7, s6
3503; GFX6-NEXT:    s_lshl_b32 s5, s5, 16
3504; GFX6-NEXT:    s_min_i32 s6, s6, s8
3505; GFX6-NEXT:    s_max_i32 s7, s5, -1
3506; GFX6-NEXT:    s_sub_i32 s4, s4, s6
3507; GFX6-NEXT:    s_lshl_b32 s6, s11, 16
3508; GFX6-NEXT:    s_add_i32 s7, s7, 0x80000001
3509; GFX6-NEXT:    s_min_i32 s8, s5, -1
3510; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
3511; GFX6-NEXT:    s_add_i32 s8, s8, 0x80000000
3512; GFX6-NEXT:    s_max_i32 s6, s7, s6
3513; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
3514; GFX6-NEXT:    s_min_i32 s6, s6, s8
3515; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
3516; GFX6-NEXT:    s_ashr_i32 s2, s2, 16
3517; GFX6-NEXT:    s_ashr_i32 s3, s3, 16
3518; GFX6-NEXT:    s_sub_i32 s5, s5, s6
3519; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
3520; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
3521; GFX6-NEXT:    s_ashr_i32 s5, s5, 16
3522; GFX6-NEXT:    s_or_b32 s0, s0, s1
3523; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
3524; GFX6-NEXT:    s_and_b32 s2, s3, 0xffff
3525; GFX6-NEXT:    s_ashr_i32 s4, s4, 16
3526; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
3527; GFX6-NEXT:    s_and_b32 s3, s5, 0xffff
3528; GFX6-NEXT:    s_or_b32 s1, s1, s2
3529; GFX6-NEXT:    s_and_b32 s2, s4, 0xffff
3530; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
3531; GFX6-NEXT:    s_or_b32 s2, s2, s3
3532; GFX6-NEXT:    ; return to shader part epilog
3533;
3534; GFX8-LABEL: s_ssubsat_v6i16:
3535; GFX8:       ; %bb.0:
3536; GFX8-NEXT:    s_sext_i32_i16 s12, s0
3537; GFX8-NEXT:    s_sext_i32_i16 s13, -1
3538; GFX8-NEXT:    s_max_i32 s14, s12, s13
3539; GFX8-NEXT:    s_addk_i32 s14, 0x8001
3540; GFX8-NEXT:    s_lshr_b32 s9, s3, 16
3541; GFX8-NEXT:    s_min_i32 s12, s12, s13
3542; GFX8-NEXT:    s_sext_i32_i16 s14, s14
3543; GFX8-NEXT:    s_sext_i32_i16 s3, s3
3544; GFX8-NEXT:    s_addk_i32 s12, 0x8000
3545; GFX8-NEXT:    s_max_i32 s3, s14, s3
3546; GFX8-NEXT:    s_sext_i32_i16 s3, s3
3547; GFX8-NEXT:    s_sext_i32_i16 s12, s12
3548; GFX8-NEXT:    s_lshr_b32 s6, s0, 16
3549; GFX8-NEXT:    s_min_i32 s3, s3, s12
3550; GFX8-NEXT:    s_sub_i32 s0, s0, s3
3551; GFX8-NEXT:    s_sext_i32_i16 s3, s6
3552; GFX8-NEXT:    s_max_i32 s12, s3, s13
3553; GFX8-NEXT:    s_addk_i32 s12, 0x8001
3554; GFX8-NEXT:    s_min_i32 s3, s3, s13
3555; GFX8-NEXT:    s_sext_i32_i16 s12, s12
3556; GFX8-NEXT:    s_sext_i32_i16 s9, s9
3557; GFX8-NEXT:    s_addk_i32 s3, 0x8000
3558; GFX8-NEXT:    s_max_i32 s9, s12, s9
3559; GFX8-NEXT:    s_sext_i32_i16 s9, s9
3560; GFX8-NEXT:    s_sext_i32_i16 s3, s3
3561; GFX8-NEXT:    s_min_i32 s3, s9, s3
3562; GFX8-NEXT:    s_sub_i32 s3, s6, s3
3563; GFX8-NEXT:    s_sext_i32_i16 s6, s1
3564; GFX8-NEXT:    s_max_i32 s9, s6, s13
3565; GFX8-NEXT:    s_addk_i32 s9, 0x8001
3566; GFX8-NEXT:    s_lshr_b32 s10, s4, 16
3567; GFX8-NEXT:    s_min_i32 s6, s6, s13
3568; GFX8-NEXT:    s_sext_i32_i16 s9, s9
3569; GFX8-NEXT:    s_sext_i32_i16 s4, s4
3570; GFX8-NEXT:    s_addk_i32 s6, 0x8000
3571; GFX8-NEXT:    s_max_i32 s4, s9, s4
3572; GFX8-NEXT:    s_sext_i32_i16 s4, s4
3573; GFX8-NEXT:    s_sext_i32_i16 s6, s6
3574; GFX8-NEXT:    s_lshr_b32 s7, s1, 16
3575; GFX8-NEXT:    s_min_i32 s4, s4, s6
3576; GFX8-NEXT:    s_sub_i32 s1, s1, s4
3577; GFX8-NEXT:    s_sext_i32_i16 s4, s7
3578; GFX8-NEXT:    s_max_i32 s6, s4, s13
3579; GFX8-NEXT:    s_addk_i32 s6, 0x8001
3580; GFX8-NEXT:    s_min_i32 s4, s4, s13
3581; GFX8-NEXT:    s_sext_i32_i16 s6, s6
3582; GFX8-NEXT:    s_sext_i32_i16 s9, s10
3583; GFX8-NEXT:    s_addk_i32 s4, 0x8000
3584; GFX8-NEXT:    s_max_i32 s6, s6, s9
3585; GFX8-NEXT:    s_sext_i32_i16 s6, s6
3586; GFX8-NEXT:    s_sext_i32_i16 s4, s4
3587; GFX8-NEXT:    s_min_i32 s4, s6, s4
3588; GFX8-NEXT:    s_sext_i32_i16 s6, s2
3589; GFX8-NEXT:    s_sub_i32 s4, s7, s4
3590; GFX8-NEXT:    s_max_i32 s7, s6, s13
3591; GFX8-NEXT:    s_addk_i32 s7, 0x8001
3592; GFX8-NEXT:    s_lshr_b32 s11, s5, 16
3593; GFX8-NEXT:    s_min_i32 s6, s6, s13
3594; GFX8-NEXT:    s_sext_i32_i16 s7, s7
3595; GFX8-NEXT:    s_sext_i32_i16 s5, s5
3596; GFX8-NEXT:    s_addk_i32 s6, 0x8000
3597; GFX8-NEXT:    s_max_i32 s5, s7, s5
3598; GFX8-NEXT:    s_sext_i32_i16 s5, s5
3599; GFX8-NEXT:    s_sext_i32_i16 s6, s6
3600; GFX8-NEXT:    s_lshr_b32 s8, s2, 16
3601; GFX8-NEXT:    s_min_i32 s5, s5, s6
3602; GFX8-NEXT:    s_sub_i32 s2, s2, s5
3603; GFX8-NEXT:    s_sext_i32_i16 s5, s8
3604; GFX8-NEXT:    s_max_i32 s6, s5, s13
3605; GFX8-NEXT:    s_addk_i32 s6, 0x8001
3606; GFX8-NEXT:    s_min_i32 s5, s5, s13
3607; GFX8-NEXT:    s_sext_i32_i16 s6, s6
3608; GFX8-NEXT:    s_sext_i32_i16 s7, s11
3609; GFX8-NEXT:    s_addk_i32 s5, 0x8000
3610; GFX8-NEXT:    s_max_i32 s6, s6, s7
3611; GFX8-NEXT:    s_and_b32 s3, 0xffff, s3
3612; GFX8-NEXT:    s_sext_i32_i16 s6, s6
3613; GFX8-NEXT:    s_sext_i32_i16 s5, s5
3614; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
3615; GFX8-NEXT:    s_lshl_b32 s3, s3, 16
3616; GFX8-NEXT:    s_min_i32 s5, s6, s5
3617; GFX8-NEXT:    s_or_b32 s0, s0, s3
3618; GFX8-NEXT:    s_and_b32 s3, 0xffff, s4
3619; GFX8-NEXT:    s_sub_i32 s5, s8, s5
3620; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
3621; GFX8-NEXT:    s_lshl_b32 s3, s3, 16
3622; GFX8-NEXT:    s_or_b32 s1, s1, s3
3623; GFX8-NEXT:    s_and_b32 s3, 0xffff, s5
3624; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
3625; GFX8-NEXT:    s_lshl_b32 s3, s3, 16
3626; GFX8-NEXT:    s_or_b32 s2, s2, s3
3627; GFX8-NEXT:    ; return to shader part epilog
3628;
3629; GFX9-LABEL: s_ssubsat_v6i16:
3630; GFX9:       ; %bb.0:
3631; GFX9-NEXT:    v_mov_b32_e32 v0, s3
3632; GFX9-NEXT:    v_mov_b32_e32 v1, s4
3633; GFX9-NEXT:    v_mov_b32_e32 v2, s5
3634; GFX9-NEXT:    v_pk_sub_i16 v0, s0, v0 clamp
3635; GFX9-NEXT:    v_pk_sub_i16 v1, s1, v1 clamp
3636; GFX9-NEXT:    v_pk_sub_i16 v2, s2, v2 clamp
3637; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
3638; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
3639; GFX9-NEXT:    v_readfirstlane_b32 s2, v2
3640; GFX9-NEXT:    ; return to shader part epilog
3641;
3642; GFX10PLUS-LABEL: s_ssubsat_v6i16:
3643; GFX10PLUS:       ; %bb.0:
3644; GFX10PLUS-NEXT:    v_pk_sub_i16 v0, s0, s3 clamp
3645; GFX10PLUS-NEXT:    v_pk_sub_i16 v1, s1, s4 clamp
3646; GFX10PLUS-NEXT:    v_pk_sub_i16 v2, s2, s5 clamp
3647; GFX10PLUS-NEXT:    v_readfirstlane_b32 s0, v0
3648; GFX10PLUS-NEXT:    v_readfirstlane_b32 s1, v1
3649; GFX10PLUS-NEXT:    v_readfirstlane_b32 s2, v2
3650; GFX10PLUS-NEXT:    ; return to shader part epilog
3651  %result = call <6 x i16> @llvm.ssub.sat.v6i16(<6 x i16> %lhs, <6 x i16> %rhs)
3652  %cast = bitcast <6 x i16> %result to <3 x i32>
3653  ret <3 x i32> %cast
3654}
3655
; Signed saturating subtract of <8 x i16> operands using the default calling
; convention; the <8 x i16> result is returned bitcast to <4 x float>.
; The GFX9 and GFX10PLUS checks expect one v_pk_sub_i16 with clamp per pair of
; elements, while the GFX6 and GFX8 checks expect a max/min/sub sequence for
; each element.
3656define <4 x float> @v_ssubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
3657; GFX6-LABEL: v_ssubsat_v8i16:
3658; GFX6:       ; %bb.0:
3659; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3660; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
3661; GFX6-NEXT:    v_max_i32_e32 v16, -1, v0
3662; GFX6-NEXT:    v_mov_b32_e32 v17, 0x80000001
3663; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
3664; GFX6-NEXT:    v_add_i32_e32 v16, vcc, v16, v17
3665; GFX6-NEXT:    v_min_i32_e32 v18, -1, v0
3666; GFX6-NEXT:    v_bfrev_b32_e32 v19, 1
3667; GFX6-NEXT:    v_add_i32_e32 v18, vcc, v18, v19
3668; GFX6-NEXT:    v_max_i32_e32 v8, v16, v8
3669; GFX6-NEXT:    v_min_i32_e32 v8, v8, v18
3670; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
3671; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v8
3672; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
3673; GFX6-NEXT:    v_max_i32_e32 v9, -1, v1
3674; GFX6-NEXT:    v_add_i32_e32 v9, vcc, v9, v17
3675; GFX6-NEXT:    v_min_i32_e32 v16, -1, v1
3676; GFX6-NEXT:    v_add_i32_e32 v16, vcc, v16, v19
3677; GFX6-NEXT:    v_max_i32_e32 v8, v9, v8
3678; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
3679; GFX6-NEXT:    v_min_i32_e32 v8, v8, v16
3680; GFX6-NEXT:    v_max_i32_e32 v9, -1, v2
3681; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v8
3682; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
3683; GFX6-NEXT:    v_add_i32_e32 v9, vcc, v9, v17
3684; GFX6-NEXT:    v_min_i32_e32 v10, -1, v2
3685; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v10, v19
3686; GFX6-NEXT:    v_max_i32_e32 v8, v9, v8
3687; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
3688; GFX6-NEXT:    v_min_i32_e32 v8, v8, v10
3689; GFX6-NEXT:    v_max_i32_e32 v9, -1, v3
3690; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v8
3691; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
3692; GFX6-NEXT:    v_add_i32_e32 v9, vcc, v9, v17
3693; GFX6-NEXT:    v_min_i32_e32 v10, -1, v3
3694; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v10, v19
3695; GFX6-NEXT:    v_max_i32_e32 v8, v9, v8
3696; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
3697; GFX6-NEXT:    v_min_i32_e32 v8, v8, v10
3698; GFX6-NEXT:    v_max_i32_e32 v9, -1, v4
3699; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v8
3700; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
3701; GFX6-NEXT:    v_add_i32_e32 v9, vcc, v9, v17
3702; GFX6-NEXT:    v_min_i32_e32 v10, -1, v4
3703; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v10, v19
3704; GFX6-NEXT:    v_max_i32_e32 v8, v9, v8
3705; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
3706; GFX6-NEXT:    v_min_i32_e32 v8, v8, v10
3707; GFX6-NEXT:    v_max_i32_e32 v9, -1, v5
3708; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v4, v8
3709; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v13
3710; GFX6-NEXT:    v_add_i32_e32 v9, vcc, v9, v17
3711; GFX6-NEXT:    v_min_i32_e32 v10, -1, v5
3712; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v10, v19
3713; GFX6-NEXT:    v_max_i32_e32 v8, v9, v8
3714; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
3715; GFX6-NEXT:    v_min_i32_e32 v8, v8, v10
3716; GFX6-NEXT:    v_max_i32_e32 v9, -1, v6
3717; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v8
3718; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v14
3719; GFX6-NEXT:    v_add_i32_e32 v9, vcc, v9, v17
3720; GFX6-NEXT:    v_min_i32_e32 v10, -1, v6
3721; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v10, v19
3722; GFX6-NEXT:    v_max_i32_e32 v8, v9, v8
3723; GFX6-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
3724; GFX6-NEXT:    v_min_i32_e32 v8, v8, v10
3725; GFX6-NEXT:    v_max_i32_e32 v9, -1, v7
3726; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
3727; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v6, v8
3728; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v15
3729; GFX6-NEXT:    v_add_i32_e32 v9, vcc, v9, v17
3730; GFX6-NEXT:    v_min_i32_e32 v10, -1, v7
3731; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
3732; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v10, v19
3733; GFX6-NEXT:    v_max_i32_e32 v8, v9, v8
3734; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
3735; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
3736; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 16, v3
3737; GFX6-NEXT:    v_min_i32_e32 v8, v8, v10
3738; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
3739; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
3740; GFX6-NEXT:    v_ashrrev_i32_e32 v5, 16, v5
3741; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v7, v8
3742; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
3743; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v2
3744; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v3
3745; GFX6-NEXT:    v_ashrrev_i32_e32 v4, 16, v4
3746; GFX6-NEXT:    v_ashrrev_i32_e32 v7, 16, v7
3747; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
3748; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v5
3749; GFX6-NEXT:    v_ashrrev_i32_e32 v6, 16, v6
3750; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
3751; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v4
3752; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
3753; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v7
3754; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
3755; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v6
3756; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
3757; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
3758; GFX6-NEXT:    s_setpc_b64 s[30:31]
3759;
3760; GFX8-LABEL: v_ssubsat_v8i16:
3761; GFX8:       ; %bb.0:
3762; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3763; GFX8-NEXT:    v_max_i16_e32 v8, -1, v0
3764; GFX8-NEXT:    v_add_u16_e32 v8, 0x8001, v8
3765; GFX8-NEXT:    v_min_i16_e32 v9, -1, v0
3766; GFX8-NEXT:    v_add_u16_e32 v9, 0x8000, v9
3767; GFX8-NEXT:    v_max_i16_e32 v8, v8, v4
3768; GFX8-NEXT:    v_min_i16_e32 v8, v8, v9
3769; GFX8-NEXT:    v_mov_b32_e32 v9, -1
3770; GFX8-NEXT:    v_max_i16_sdwa v10, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3771; GFX8-NEXT:    v_add_u16_e32 v10, 0x8001, v10
3772; GFX8-NEXT:    v_min_i16_sdwa v11, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3773; GFX8-NEXT:    v_add_u16_e32 v11, 0x8000, v11
3774; GFX8-NEXT:    v_max_i16_sdwa v4, v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3775; GFX8-NEXT:    v_max_i16_e32 v10, -1, v1
3776; GFX8-NEXT:    v_min_i16_e32 v4, v4, v11
3777; GFX8-NEXT:    v_add_u16_e32 v10, 0x8001, v10
3778; GFX8-NEXT:    v_min_i16_e32 v11, -1, v1
3779; GFX8-NEXT:    v_add_u16_e32 v11, 0x8000, v11
3780; GFX8-NEXT:    v_max_i16_e32 v10, v10, v5
3781; GFX8-NEXT:    v_min_i16_e32 v10, v10, v11
3782; GFX8-NEXT:    v_max_i16_sdwa v11, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3783; GFX8-NEXT:    v_add_u16_e32 v11, 0x8001, v11
3784; GFX8-NEXT:    v_min_i16_sdwa v12, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3785; GFX8-NEXT:    v_add_u16_e32 v12, 0x8000, v12
3786; GFX8-NEXT:    v_max_i16_sdwa v5, v11, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3787; GFX8-NEXT:    v_max_i16_e32 v11, -1, v2
3788; GFX8-NEXT:    v_min_i16_e32 v5, v5, v12
3789; GFX8-NEXT:    v_add_u16_e32 v11, 0x8001, v11
3790; GFX8-NEXT:    v_min_i16_e32 v12, -1, v2
3791; GFX8-NEXT:    v_add_u16_e32 v12, 0x8000, v12
3792; GFX8-NEXT:    v_max_i16_e32 v11, v11, v6
3793; GFX8-NEXT:    v_min_i16_e32 v11, v11, v12
3794; GFX8-NEXT:    v_max_i16_sdwa v12, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3795; GFX8-NEXT:    v_add_u16_e32 v12, 0x8001, v12
3796; GFX8-NEXT:    v_min_i16_sdwa v13, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3797; GFX8-NEXT:    v_add_u16_e32 v13, 0x8000, v13
3798; GFX8-NEXT:    v_max_i16_sdwa v6, v12, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3799; GFX8-NEXT:    v_max_i16_e32 v12, -1, v3
3800; GFX8-NEXT:    v_min_i16_e32 v6, v6, v13
3801; GFX8-NEXT:    v_add_u16_e32 v12, 0x8001, v12
3802; GFX8-NEXT:    v_min_i16_e32 v13, -1, v3
3803; GFX8-NEXT:    v_add_u16_e32 v13, 0x8000, v13
3804; GFX8-NEXT:    v_max_i16_e32 v12, v12, v7
3805; GFX8-NEXT:    v_min_i16_e32 v12, v12, v13
3806; GFX8-NEXT:    v_max_i16_sdwa v13, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3807; GFX8-NEXT:    v_add_u16_e32 v13, 0x8001, v13
3808; GFX8-NEXT:    v_min_i16_sdwa v9, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3809; GFX8-NEXT:    v_add_u16_e32 v9, 0x8000, v9
3810; GFX8-NEXT:    v_max_i16_sdwa v7, v13, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3811; GFX8-NEXT:    v_sub_u16_e32 v8, v0, v8
3812; GFX8-NEXT:    v_sub_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3813; GFX8-NEXT:    v_sub_u16_e32 v4, v1, v10
3814; GFX8-NEXT:    v_sub_u16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3815; GFX8-NEXT:    v_min_i16_e32 v7, v7, v9
3816; GFX8-NEXT:    v_or_b32_e32 v1, v4, v1
3817; GFX8-NEXT:    v_sub_u16_e32 v4, v2, v11
3818; GFX8-NEXT:    v_sub_u16_sdwa v2, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3819; GFX8-NEXT:    v_or_b32_e32 v2, v4, v2
3820; GFX8-NEXT:    v_sub_u16_e32 v4, v3, v12
3821; GFX8-NEXT:    v_sub_u16_sdwa v3, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3822; GFX8-NEXT:    v_or_b32_e32 v0, v8, v0
3823; GFX8-NEXT:    v_or_b32_e32 v3, v4, v3
3824; GFX8-NEXT:    s_setpc_b64 s[30:31]
3825;
3826; GFX9-LABEL: v_ssubsat_v8i16:
3827; GFX9:       ; %bb.0:
3828; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3829; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v4 clamp
3830; GFX9-NEXT:    v_pk_sub_i16 v1, v1, v5 clamp
3831; GFX9-NEXT:    v_pk_sub_i16 v2, v2, v6 clamp
3832; GFX9-NEXT:    v_pk_sub_i16 v3, v3, v7 clamp
3833; GFX9-NEXT:    s_setpc_b64 s[30:31]
3834;
3835; GFX10PLUS-LABEL: v_ssubsat_v8i16:
3836; GFX10PLUS:       ; %bb.0:
3837; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3838; GFX10PLUS-NEXT:    v_pk_sub_i16 v0, v0, v4 clamp
3839; GFX10PLUS-NEXT:    v_pk_sub_i16 v1, v1, v5 clamp
3840; GFX10PLUS-NEXT:    v_pk_sub_i16 v2, v2, v6 clamp
3841; GFX10PLUS-NEXT:    v_pk_sub_i16 v3, v3, v7 clamp
3842; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
3843  %result = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
3844  %cast = bitcast <8 x i16> %result to <4 x float>
3845  ret <4 x float> %cast
3846}
3847
; Signed saturating subtract of <8 x i16> operands passed inreg to an
; amdgpu_ps function; the <8 x i16> result is returned bitcast to <4 x i32>.
; The GFX6 and GFX8 checks stay entirely on scalar instructions, whereas the
; GFX9 and GFX10PLUS checks expect v_pk_sub_i16 with clamp followed by
; v_readfirstlane_b32 to copy each result dword into an s register.
3848define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inreg %rhs) {
3849; GFX6-LABEL: s_ssubsat_v8i16:
3850; GFX6:       ; %bb.0:
3851; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
3852; GFX6-NEXT:    s_max_i32 s16, s0, -1
3853; GFX6-NEXT:    s_lshl_b32 s8, s8, 16
3854; GFX6-NEXT:    s_add_i32 s16, s16, 0x80000001
3855; GFX6-NEXT:    s_min_i32 s17, s0, -1
3856; GFX6-NEXT:    s_add_i32 s17, s17, 0x80000000
3857; GFX6-NEXT:    s_max_i32 s8, s16, s8
3858; GFX6-NEXT:    s_min_i32 s8, s8, s17
3859; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
3860; GFX6-NEXT:    s_sub_i32 s0, s0, s8
3861; GFX6-NEXT:    s_lshl_b32 s8, s9, 16
3862; GFX6-NEXT:    s_max_i32 s9, s1, -1
3863; GFX6-NEXT:    s_add_i32 s9, s9, 0x80000001
3864; GFX6-NEXT:    s_min_i32 s16, s1, -1
3865; GFX6-NEXT:    s_add_i32 s16, s16, 0x80000000
3866; GFX6-NEXT:    s_max_i32 s8, s9, s8
3867; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
3868; GFX6-NEXT:    s_min_i32 s8, s8, s16
3869; GFX6-NEXT:    s_max_i32 s9, s2, -1
3870; GFX6-NEXT:    s_sub_i32 s1, s1, s8
3871; GFX6-NEXT:    s_lshl_b32 s8, s10, 16
3872; GFX6-NEXT:    s_add_i32 s9, s9, 0x80000001
3873; GFX6-NEXT:    s_min_i32 s10, s2, -1
3874; GFX6-NEXT:    s_add_i32 s10, s10, 0x80000000
3875; GFX6-NEXT:    s_max_i32 s8, s9, s8
3876; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
3877; GFX6-NEXT:    s_min_i32 s8, s8, s10
3878; GFX6-NEXT:    s_max_i32 s9, s3, -1
3879; GFX6-NEXT:    s_sub_i32 s2, s2, s8
3880; GFX6-NEXT:    s_lshl_b32 s8, s11, 16
3881; GFX6-NEXT:    s_add_i32 s9, s9, 0x80000001
3882; GFX6-NEXT:    s_min_i32 s10, s3, -1
3883; GFX6-NEXT:    s_add_i32 s10, s10, 0x80000000
3884; GFX6-NEXT:    s_max_i32 s8, s9, s8
3885; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
3886; GFX6-NEXT:    s_min_i32 s8, s8, s10
3887; GFX6-NEXT:    s_max_i32 s9, s4, -1
3888; GFX6-NEXT:    s_sub_i32 s3, s3, s8
3889; GFX6-NEXT:    s_lshl_b32 s8, s12, 16
3890; GFX6-NEXT:    s_add_i32 s9, s9, 0x80000001
3891; GFX6-NEXT:    s_min_i32 s10, s4, -1
3892; GFX6-NEXT:    s_add_i32 s10, s10, 0x80000000
3893; GFX6-NEXT:    s_max_i32 s8, s9, s8
3894; GFX6-NEXT:    s_lshl_b32 s5, s5, 16
3895; GFX6-NEXT:    s_min_i32 s8, s8, s10
3896; GFX6-NEXT:    s_max_i32 s9, s5, -1
3897; GFX6-NEXT:    s_sub_i32 s4, s4, s8
3898; GFX6-NEXT:    s_lshl_b32 s8, s13, 16
3899; GFX6-NEXT:    s_add_i32 s9, s9, 0x80000001
3900; GFX6-NEXT:    s_min_i32 s10, s5, -1
3901; GFX6-NEXT:    s_add_i32 s10, s10, 0x80000000
3902; GFX6-NEXT:    s_max_i32 s8, s9, s8
3903; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
3904; GFX6-NEXT:    s_min_i32 s8, s8, s10
3905; GFX6-NEXT:    s_max_i32 s9, s6, -1
3906; GFX6-NEXT:    s_sub_i32 s5, s5, s8
3907; GFX6-NEXT:    s_lshl_b32 s8, s14, 16
3908; GFX6-NEXT:    s_add_i32 s9, s9, 0x80000001
3909; GFX6-NEXT:    s_min_i32 s10, s6, -1
3910; GFX6-NEXT:    s_add_i32 s10, s10, 0x80000000
3911; GFX6-NEXT:    s_max_i32 s8, s9, s8
3912; GFX6-NEXT:    s_lshl_b32 s7, s7, 16
3913; GFX6-NEXT:    s_min_i32 s8, s8, s10
3914; GFX6-NEXT:    s_max_i32 s9, s7, -1
3915; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
3916; GFX6-NEXT:    s_sub_i32 s6, s6, s8
3917; GFX6-NEXT:    s_lshl_b32 s8, s15, 16
3918; GFX6-NEXT:    s_add_i32 s9, s9, 0x80000001
3919; GFX6-NEXT:    s_min_i32 s10, s7, -1
3920; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
3921; GFX6-NEXT:    s_add_i32 s10, s10, 0x80000000
3922; GFX6-NEXT:    s_max_i32 s8, s9, s8
3923; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
3924; GFX6-NEXT:    s_ashr_i32 s2, s2, 16
3925; GFX6-NEXT:    s_ashr_i32 s3, s3, 16
3926; GFX6-NEXT:    s_min_i32 s8, s8, s10
3927; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
3928; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
3929; GFX6-NEXT:    s_ashr_i32 s5, s5, 16
3930; GFX6-NEXT:    s_sub_i32 s7, s7, s8
3931; GFX6-NEXT:    s_or_b32 s0, s0, s1
3932; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
3933; GFX6-NEXT:    s_and_b32 s2, s3, 0xffff
3934; GFX6-NEXT:    s_ashr_i32 s4, s4, 16
3935; GFX6-NEXT:    s_ashr_i32 s7, s7, 16
3936; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
3937; GFX6-NEXT:    s_and_b32 s3, s5, 0xffff
3938; GFX6-NEXT:    s_ashr_i32 s6, s6, 16
3939; GFX6-NEXT:    s_or_b32 s1, s1, s2
3940; GFX6-NEXT:    s_and_b32 s2, s4, 0xffff
3941; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
3942; GFX6-NEXT:    s_and_b32 s4, s7, 0xffff
3943; GFX6-NEXT:    s_or_b32 s2, s2, s3
3944; GFX6-NEXT:    s_and_b32 s3, s6, 0xffff
3945; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
3946; GFX6-NEXT:    s_or_b32 s3, s3, s4
3947; GFX6-NEXT:    ; return to shader part epilog
3948;
3949; GFX8-LABEL: s_ssubsat_v8i16:
3950; GFX8:       ; %bb.0:
3951; GFX8-NEXT:    s_sext_i32_i16 s16, s0
3952; GFX8-NEXT:    s_sext_i32_i16 s17, -1
3953; GFX8-NEXT:    s_max_i32 s18, s16, s17
3954; GFX8-NEXT:    s_addk_i32 s18, 0x8001
3955; GFX8-NEXT:    s_lshr_b32 s12, s4, 16
3956; GFX8-NEXT:    s_min_i32 s16, s16, s17
3957; GFX8-NEXT:    s_sext_i32_i16 s18, s18
3958; GFX8-NEXT:    s_sext_i32_i16 s4, s4
3959; GFX8-NEXT:    s_addk_i32 s16, 0x8000
3960; GFX8-NEXT:    s_max_i32 s4, s18, s4
3961; GFX8-NEXT:    s_sext_i32_i16 s4, s4
3962; GFX8-NEXT:    s_sext_i32_i16 s16, s16
3963; GFX8-NEXT:    s_lshr_b32 s8, s0, 16
3964; GFX8-NEXT:    s_min_i32 s4, s4, s16
3965; GFX8-NEXT:    s_sub_i32 s0, s0, s4
3966; GFX8-NEXT:    s_sext_i32_i16 s4, s8
3967; GFX8-NEXT:    s_max_i32 s16, s4, s17
3968; GFX8-NEXT:    s_addk_i32 s16, 0x8001
3969; GFX8-NEXT:    s_min_i32 s4, s4, s17
3970; GFX8-NEXT:    s_sext_i32_i16 s16, s16
3971; GFX8-NEXT:    s_sext_i32_i16 s12, s12
3972; GFX8-NEXT:    s_addk_i32 s4, 0x8000
3973; GFX8-NEXT:    s_max_i32 s12, s16, s12
3974; GFX8-NEXT:    s_sext_i32_i16 s12, s12
3975; GFX8-NEXT:    s_sext_i32_i16 s4, s4
3976; GFX8-NEXT:    s_min_i32 s4, s12, s4
3977; GFX8-NEXT:    s_sub_i32 s4, s8, s4
3978; GFX8-NEXT:    s_sext_i32_i16 s8, s1
3979; GFX8-NEXT:    s_max_i32 s12, s8, s17
3980; GFX8-NEXT:    s_addk_i32 s12, 0x8001
3981; GFX8-NEXT:    s_lshr_b32 s13, s5, 16
3982; GFX8-NEXT:    s_min_i32 s8, s8, s17
3983; GFX8-NEXT:    s_sext_i32_i16 s12, s12
3984; GFX8-NEXT:    s_sext_i32_i16 s5, s5
3985; GFX8-NEXT:    s_addk_i32 s8, 0x8000
3986; GFX8-NEXT:    s_max_i32 s5, s12, s5
3987; GFX8-NEXT:    s_sext_i32_i16 s5, s5
3988; GFX8-NEXT:    s_sext_i32_i16 s8, s8
3989; GFX8-NEXT:    s_lshr_b32 s9, s1, 16
3990; GFX8-NEXT:    s_min_i32 s5, s5, s8
3991; GFX8-NEXT:    s_sub_i32 s1, s1, s5
3992; GFX8-NEXT:    s_sext_i32_i16 s5, s9
3993; GFX8-NEXT:    s_max_i32 s8, s5, s17
3994; GFX8-NEXT:    s_addk_i32 s8, 0x8001
3995; GFX8-NEXT:    s_min_i32 s5, s5, s17
3996; GFX8-NEXT:    s_sext_i32_i16 s8, s8
3997; GFX8-NEXT:    s_sext_i32_i16 s12, s13
3998; GFX8-NEXT:    s_addk_i32 s5, 0x8000
3999; GFX8-NEXT:    s_max_i32 s8, s8, s12
4000; GFX8-NEXT:    s_sext_i32_i16 s8, s8
4001; GFX8-NEXT:    s_sext_i32_i16 s5, s5
4002; GFX8-NEXT:    s_min_i32 s5, s8, s5
4003; GFX8-NEXT:    s_sext_i32_i16 s8, s2
4004; GFX8-NEXT:    s_sub_i32 s5, s9, s5
4005; GFX8-NEXT:    s_max_i32 s9, s8, s17
4006; GFX8-NEXT:    s_addk_i32 s9, 0x8001
4007; GFX8-NEXT:    s_lshr_b32 s14, s6, 16
4008; GFX8-NEXT:    s_min_i32 s8, s8, s17
4009; GFX8-NEXT:    s_sext_i32_i16 s9, s9
4010; GFX8-NEXT:    s_sext_i32_i16 s6, s6
4011; GFX8-NEXT:    s_addk_i32 s8, 0x8000
4012; GFX8-NEXT:    s_max_i32 s6, s9, s6
4013; GFX8-NEXT:    s_sext_i32_i16 s6, s6
4014; GFX8-NEXT:    s_sext_i32_i16 s8, s8
4015; GFX8-NEXT:    s_lshr_b32 s10, s2, 16
4016; GFX8-NEXT:    s_min_i32 s6, s6, s8
4017; GFX8-NEXT:    s_sub_i32 s2, s2, s6
4018; GFX8-NEXT:    s_sext_i32_i16 s6, s10
4019; GFX8-NEXT:    s_max_i32 s8, s6, s17
4020; GFX8-NEXT:    s_addk_i32 s8, 0x8001
4021; GFX8-NEXT:    s_min_i32 s6, s6, s17
4022; GFX8-NEXT:    s_sext_i32_i16 s8, s8
4023; GFX8-NEXT:    s_sext_i32_i16 s9, s14
4024; GFX8-NEXT:    s_addk_i32 s6, 0x8000
4025; GFX8-NEXT:    s_max_i32 s8, s8, s9
4026; GFX8-NEXT:    s_sext_i32_i16 s8, s8
4027; GFX8-NEXT:    s_sext_i32_i16 s6, s6
4028; GFX8-NEXT:    s_min_i32 s6, s8, s6
4029; GFX8-NEXT:    s_sext_i32_i16 s8, s3
4030; GFX8-NEXT:    s_max_i32 s9, s8, s17
4031; GFX8-NEXT:    s_addk_i32 s9, 0x8001
4032; GFX8-NEXT:    s_lshr_b32 s15, s7, 16
4033; GFX8-NEXT:    s_min_i32 s8, s8, s17
4034; GFX8-NEXT:    s_sext_i32_i16 s9, s9
4035; GFX8-NEXT:    s_sext_i32_i16 s7, s7
4036; GFX8-NEXT:    s_addk_i32 s8, 0x8000
4037; GFX8-NEXT:    s_max_i32 s7, s9, s7
4038; GFX8-NEXT:    s_sext_i32_i16 s7, s7
4039; GFX8-NEXT:    s_sext_i32_i16 s8, s8
4040; GFX8-NEXT:    s_lshr_b32 s11, s3, 16
4041; GFX8-NEXT:    s_min_i32 s7, s7, s8
4042; GFX8-NEXT:    s_sub_i32 s3, s3, s7
4043; GFX8-NEXT:    s_sext_i32_i16 s7, s11
4044; GFX8-NEXT:    s_max_i32 s8, s7, s17
4045; GFX8-NEXT:    s_addk_i32 s8, 0x8001
4046; GFX8-NEXT:    s_and_b32 s4, 0xffff, s4
4047; GFX8-NEXT:    s_min_i32 s7, s7, s17
4048; GFX8-NEXT:    s_sext_i32_i16 s8, s8
4049; GFX8-NEXT:    s_sext_i32_i16 s9, s15
4050; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
4051; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
4052; GFX8-NEXT:    s_addk_i32 s7, 0x8000
4053; GFX8-NEXT:    s_max_i32 s8, s8, s9
4054; GFX8-NEXT:    s_or_b32 s0, s0, s4
4055; GFX8-NEXT:    s_and_b32 s4, 0xffff, s5
4056; GFX8-NEXT:    s_sub_i32 s6, s10, s6
4057; GFX8-NEXT:    s_sext_i32_i16 s8, s8
4058; GFX8-NEXT:    s_sext_i32_i16 s7, s7
4059; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
4060; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
4061; GFX8-NEXT:    s_min_i32 s7, s8, s7
4062; GFX8-NEXT:    s_or_b32 s1, s1, s4
4063; GFX8-NEXT:    s_and_b32 s4, 0xffff, s6
4064; GFX8-NEXT:    s_sub_i32 s7, s11, s7
4065; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
4066; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
4067; GFX8-NEXT:    s_or_b32 s2, s2, s4
4068; GFX8-NEXT:    s_and_b32 s4, 0xffff, s7
4069; GFX8-NEXT:    s_and_b32 s3, 0xffff, s3
4070; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
4071; GFX8-NEXT:    s_or_b32 s3, s3, s4
4072; GFX8-NEXT:    ; return to shader part epilog
4073;
4074; GFX9-LABEL: s_ssubsat_v8i16:
4075; GFX9:       ; %bb.0:
4076; GFX9-NEXT:    v_mov_b32_e32 v0, s4
4077; GFX9-NEXT:    v_mov_b32_e32 v1, s5
4078; GFX9-NEXT:    v_mov_b32_e32 v2, s6
4079; GFX9-NEXT:    v_mov_b32_e32 v3, s7
4080; GFX9-NEXT:    v_pk_sub_i16 v0, s0, v0 clamp
4081; GFX9-NEXT:    v_pk_sub_i16 v1, s1, v1 clamp
4082; GFX9-NEXT:    v_pk_sub_i16 v2, s2, v2 clamp
4083; GFX9-NEXT:    v_pk_sub_i16 v3, s3, v3 clamp
4084; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
4085; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
4086; GFX9-NEXT:    v_readfirstlane_b32 s2, v2
4087; GFX9-NEXT:    v_readfirstlane_b32 s3, v3
4088; GFX9-NEXT:    ; return to shader part epilog
4089;
4090; GFX10PLUS-LABEL: s_ssubsat_v8i16:
4091; GFX10PLUS:       ; %bb.0:
4092; GFX10PLUS-NEXT:    v_pk_sub_i16 v0, s0, s4 clamp
4093; GFX10PLUS-NEXT:    v_pk_sub_i16 v1, s1, s5 clamp
4094; GFX10PLUS-NEXT:    v_pk_sub_i16 v2, s2, s6 clamp
4095; GFX10PLUS-NEXT:    v_pk_sub_i16 v3, s3, s7 clamp
4096; GFX10PLUS-NEXT:    v_readfirstlane_b32 s0, v0
4097; GFX10PLUS-NEXT:    v_readfirstlane_b32 s1, v1
4098; GFX10PLUS-NEXT:    v_readfirstlane_b32 s2, v2
4099; GFX10PLUS-NEXT:    v_readfirstlane_b32 s3, v3
4100; GFX10PLUS-NEXT:    ; return to shader part epilog
4101  %result = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
4102  %cast = bitcast <8 x i16> %result to <4 x i32>
4103  ret <4 x i32> %cast
4104}
4105
; Signed saturating subtract of i48 operands using the default calling
; convention. The GFX9, GFX10 and GFX11 checks expect both operands to be
; shifted left by 16 before the 64-bit subtract and the selected result to be
; arithmetically shifted right by 16; the GFX6 and GFX8 checks expect
; v_bfe_i32 extracts and no 64-bit shifts.
4106define i48 @v_ssubsat_i48(i48 %lhs, i48 %rhs) {
4107; GFX6-LABEL: v_ssubsat_i48:
4108; GFX6:       ; %bb.0:
4109; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4110; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v0, v2
4111; GFX6-NEXT:    v_subb_u32_e32 v6, vcc, v1, v3, vcc
4112; GFX6-NEXT:    v_bfe_i32 v5, v4, 0, 16
4113; GFX6-NEXT:    v_bfe_i32 v1, v0, 0, 16
4114; GFX6-NEXT:    v_bfe_i32 v3, v2, 0, 16
4115; GFX6-NEXT:    v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1]
4116; GFX6-NEXT:    v_cmp_lt_i64_e64 s[6:7], 0, v[2:3]
4117; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v5
4118; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 0xffff8000, v0
4119; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 15, v5
4120; GFX6-NEXT:    s_xor_b64 vcc, s[6:7], s[4:5]
4121; GFX6-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
4122; GFX6-NEXT:    v_cndmask_b32_e32 v1, v6, v2, vcc
4123; GFX6-NEXT:    s_setpc_b64 s[30:31]
4124;
4125; GFX8-LABEL: v_ssubsat_i48:
4126; GFX8:       ; %bb.0:
4127; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4128; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, v0, v2
4129; GFX8-NEXT:    v_subb_u32_e32 v6, vcc, v1, v3, vcc
4130; GFX8-NEXT:    v_bfe_i32 v5, v4, 0, 16
4131; GFX8-NEXT:    v_bfe_i32 v1, v0, 0, 16
4132; GFX8-NEXT:    v_bfe_i32 v3, v2, 0, 16
4133; GFX8-NEXT:    v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1]
4134; GFX8-NEXT:    v_cmp_lt_i64_e64 s[6:7], 0, v[2:3]
4135; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v5
4136; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0xffff8000, v0
4137; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 15, v5
4138; GFX8-NEXT:    s_xor_b64 vcc, s[6:7], s[4:5]
4139; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
4140; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v2, vcc
4141; GFX8-NEXT:    s_setpc_b64 s[30:31]
4142;
4143; GFX9-LABEL: v_ssubsat_i48:
4144; GFX9:       ; %bb.0:
4145; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4146; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
4147; GFX9-NEXT:    v_lshlrev_b64 v[2:3], 16, v[2:3]
4148; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, v0, v2
4149; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v1, v3, vcc
4150; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
4151; GFX9-NEXT:    v_cmp_lt_i64_e64 s[4:5], 0, v[2:3]
4152; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v5
4153; GFX9-NEXT:    v_add_u32_e32 v1, 0x80000000, v0
4154; GFX9-NEXT:    s_xor_b64 vcc, s[4:5], vcc
4155; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
4156; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
4157; GFX9-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
4158; GFX9-NEXT:    s_setpc_b64 s[30:31]
4159;
4160; GFX10-LABEL: v_ssubsat_i48:
4161; GFX10:       ; %bb.0:
4162; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4163; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
4164; GFX10-NEXT:    v_lshlrev_b64 v[2:3], 16, v[2:3]
4165; GFX10-NEXT:    v_sub_co_u32 v4, vcc_lo, v0, v2
4166; GFX10-NEXT:    v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
4167; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3]
4168; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
4169; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, v[4:5], v[0:1]
4170; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v6
4171; GFX10-NEXT:    s_xor_b32 vcc_lo, vcc_lo, s4
4172; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc_lo
4173; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
4174; GFX10-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
4175; GFX10-NEXT:    s_setpc_b64 s[30:31]
4176;
4177; GFX11-LABEL: v_ssubsat_i48:
4178; GFX11:       ; %bb.0:
4179; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4180; GFX11-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
4181; GFX11-NEXT:    v_lshlrev_b64 v[2:3], 16, v[2:3]
4182; GFX11-NEXT:    v_sub_co_u32 v4, vcc_lo, v0, v2
4183; GFX11-NEXT:    v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
4184; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3]
4185; GFX11-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
4186; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, v[4:5], v[0:1]
4187; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v6
4188; GFX11-NEXT:    s_xor_b32 vcc_lo, vcc_lo, s0
4189; GFX11-NEXT:    v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1
4190; GFX11-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
4191; GFX11-NEXT:    s_setpc_b64 s[30:31]
4192  %result = call i48 @llvm.ssub.sat.i48(i48 %lhs, i48 %rhs)
4193  ret i48 %result
4194}
4195
; Signed saturating subtract of i48 operands passed inreg to an amdgpu_ps
; function. The GFX6 and GFX8 checks expect s_bfe_i64 extracts with operand
; 0x300000, and the GFX9, GFX10 and GFX11 checks expect s_lshl_b64 by 16
; before the subtract and v_ashrrev_i64 by 16 afterwards. In every check
; block the two result dwords are read back with v_readfirstlane_b32.
4196define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
4197; GFX6-LABEL: s_ssubsat_i48:
4198; GFX6:       ; %bb.0:
4199; GFX6-NEXT:    s_sub_u32 s4, s0, s2
4200; GFX6-NEXT:    s_subb_u32 s3, s1, s3
4201; GFX6-NEXT:    s_bfe_i64 s[0:1], s[0:1], 0x300000
4202; GFX6-NEXT:    v_mov_b32_e32 v0, s0
4203; GFX6-NEXT:    s_bfe_i64 s[6:7], s[4:5], 0x300000
4204; GFX6-NEXT:    v_mov_b32_e32 v1, s1
4205; GFX6-NEXT:    s_bfe_i64 s[0:1], s[2:3], 0x300000
4206; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1]
4207; GFX6-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[0:1], 0
4208; GFX6-NEXT:    s_ashr_i32 s2, s7, 31
4209; GFX6-NEXT:    s_ashr_i32 s5, s7, 15
4210; GFX6-NEXT:    s_addk_i32 s2, 0x8000
4211; GFX6-NEXT:    v_mov_b32_e32 v0, s5
4212; GFX6-NEXT:    v_mov_b32_e32 v1, s2
4213; GFX6-NEXT:    v_mov_b32_e32 v2, s4
4214; GFX6-NEXT:    v_mov_b32_e32 v3, s3
4215; GFX6-NEXT:    s_xor_b64 vcc, s[0:1], vcc
4216; GFX6-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
4217; GFX6-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
4218; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
4219; GFX6-NEXT:    v_readfirstlane_b32 s1, v1
4220; GFX6-NEXT:    ; return to shader part epilog
4221;
4222; GFX8-LABEL: s_ssubsat_i48:
4223; GFX8:       ; %bb.0:
4224; GFX8-NEXT:    s_sub_u32 s4, s0, s2
4225; GFX8-NEXT:    s_subb_u32 s3, s1, s3
4226; GFX8-NEXT:    s_bfe_i64 s[0:1], s[0:1], 0x300000
4227; GFX8-NEXT:    v_mov_b32_e32 v0, s0
4228; GFX8-NEXT:    s_bfe_i64 s[6:7], s[4:5], 0x300000
4229; GFX8-NEXT:    v_mov_b32_e32 v1, s1
4230; GFX8-NEXT:    s_bfe_i64 s[0:1], s[2:3], 0x300000
4231; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1]
4232; GFX8-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[0:1], 0
4233; GFX8-NEXT:    s_ashr_i32 s2, s7, 31
4234; GFX8-NEXT:    s_ashr_i32 s5, s7, 15
4235; GFX8-NEXT:    s_addk_i32 s2, 0x8000
4236; GFX8-NEXT:    v_mov_b32_e32 v0, s5
4237; GFX8-NEXT:    v_mov_b32_e32 v1, s2
4238; GFX8-NEXT:    v_mov_b32_e32 v2, s4
4239; GFX8-NEXT:    v_mov_b32_e32 v3, s3
4240; GFX8-NEXT:    s_xor_b64 vcc, s[0:1], vcc
4241; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
4242; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
4243; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
4244; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
4245; GFX8-NEXT:    ; return to shader part epilog
4246;
4247; GFX9-LABEL: s_ssubsat_i48:
4248; GFX9:       ; %bb.0:
4249; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
4250; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 16
4251; GFX9-NEXT:    s_sub_u32 s4, s0, s2
4252; GFX9-NEXT:    v_mov_b32_e32 v0, s0
4253; GFX9-NEXT:    s_subb_u32 s5, s1, s3
4254; GFX9-NEXT:    v_mov_b32_e32 v1, s1
4255; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
4256; GFX9-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
4257; GFX9-NEXT:    s_ashr_i32 s2, s5, 31
4258; GFX9-NEXT:    s_add_i32 s3, s2, 0x80000000
4259; GFX9-NEXT:    v_mov_b32_e32 v0, s2
4260; GFX9-NEXT:    v_mov_b32_e32 v1, s3
4261; GFX9-NEXT:    v_mov_b32_e32 v2, s4
4262; GFX9-NEXT:    v_mov_b32_e32 v3, s5
4263; GFX9-NEXT:    s_xor_b64 vcc, s[0:1], vcc
4264; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
4265; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
4266; GFX9-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
4267; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
4268; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
4269; GFX9-NEXT:    ; return to shader part epilog
4270;
4271; GFX10-LABEL: s_ssubsat_i48:
4272; GFX10:       ; %bb.0:
4273; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
4274; GFX10-NEXT:    s_lshl_b64 s[2:3], s[2:3], 16
4275; GFX10-NEXT:    s_sub_u32 s4, s0, s2
4276; GFX10-NEXT:    s_subb_u32 s5, s1, s3
4277; GFX10-NEXT:    v_mov_b32_e32 v0, s4
4278; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, s[4:5], s[0:1]
4279; GFX10-NEXT:    v_cmp_gt_i64_e64 s1, s[2:3], 0
4280; GFX10-NEXT:    v_mov_b32_e32 v1, s5
4281; GFX10-NEXT:    s_ashr_i32 s2, s5, 31
4282; GFX10-NEXT:    s_add_i32 s3, s2, 0x80000000
4283; GFX10-NEXT:    s_xor_b32 s0, s1, s0
4284; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s2, s0
4285; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s3, s0
4286; GFX10-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
4287; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
4288; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
4289; GFX10-NEXT:    ; return to shader part epilog
4290;
4291; GFX11-LABEL: s_ssubsat_i48:
4292; GFX11:       ; %bb.0:
4293; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
4294; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 16
4295; GFX11-NEXT:    s_sub_u32 s4, s0, s2
4296; GFX11-NEXT:    s_subb_u32 s5, s1, s3
4297; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
4298; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, s[4:5], s[0:1]
4299; GFX11-NEXT:    v_cmp_gt_i64_e64 s1, s[2:3], 0
4300; GFX11-NEXT:    s_ashr_i32 s2, s5, 31
4301; GFX11-NEXT:    s_add_i32 s3, s2, 0x80000000
4302; GFX11-NEXT:    s_xor_b32 s0, s1, s0
4303; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, s2, s0
4304; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s3, s0
4305; GFX11-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
4306; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
4307; GFX11-NEXT:    v_readfirstlane_b32 s1, v1
4308; GFX11-NEXT:    ; return to shader part epilog
4309  %result = call i48 @llvm.ssub.sat.i48(i48 %lhs, i48 %rhs)
4310  ret i48 %result
4311}
4312
; Signed saturating subtraction on i48 where %lhs is inreg and %rhs is not.
; The expected code reads %lhs from s[0:1] and %rhs from v[0:1]. The i48 result
; is zero-extended to i64 and bitcast to <2 x float> for the return value.
4313define amdgpu_ps <2 x float> @ssubsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
4314; GFX6-LABEL: ssubsat_i48_sv:
4315; GFX6:       ; %bb.0:
4316; GFX6-NEXT:    v_mov_b32_e32 v3, s1
4317; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s0, v0
4318; GFX6-NEXT:    v_subb_u32_e32 v4, vcc, v3, v1, vcc
4319; GFX6-NEXT:    v_bfe_i32 v3, v2, 0, 16
4320; GFX6-NEXT:    s_bfe_i64 s[0:1], s[0:1], 0x300000
4321; GFX6-NEXT:    v_bfe_i32 v1, v0, 0, 16
4322; GFX6-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3]
4323; GFX6-NEXT:    v_cmp_lt_i64_e64 s[2:3], 0, v[0:1]
4324; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
4325; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 15, v3
4326; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 0xffff8000, v0
4327; GFX6-NEXT:    s_xor_b64 vcc, s[2:3], s[0:1]
4328; GFX6-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
4329; GFX6-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
4330; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
4331; GFX6-NEXT:    ; return to shader part epilog
4332;
4333; GFX8-LABEL: ssubsat_i48_sv:
4334; GFX8:       ; %bb.0:
4335; GFX8-NEXT:    v_mov_b32_e32 v3, s1
4336; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s0, v0
4337; GFX8-NEXT:    v_subb_u32_e32 v4, vcc, v3, v1, vcc
4338; GFX8-NEXT:    v_bfe_i32 v3, v2, 0, 16
4339; GFX8-NEXT:    s_bfe_i64 s[0:1], s[0:1], 0x300000
4340; GFX8-NEXT:    v_bfe_i32 v1, v0, 0, 16
4341; GFX8-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3]
4342; GFX8-NEXT:    v_cmp_lt_i64_e64 s[2:3], 0, v[0:1]
4343; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
4344; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 15, v3
4345; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xffff8000, v0
4346; GFX8-NEXT:    s_xor_b64 vcc, s[2:3], s[0:1]
4347; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
4348; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
4349; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v1
4350; GFX8-NEXT:    ; return to shader part epilog
4351;
4352; GFX9-LABEL: ssubsat_i48_sv:
4353; GFX9:       ; %bb.0:
4354; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
4355; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
4356; GFX9-NEXT:    v_mov_b32_e32 v3, s1
4357; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, s0, v0
4358; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v1, vcc
4359; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3]
4360; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], 0, v[0:1]
4361; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
4362; GFX9-NEXT:    v_add_u32_e32 v1, 0x80000000, v0
4363; GFX9-NEXT:    s_xor_b64 vcc, s[0:1], vcc
4364; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
4365; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
4366; GFX9-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
4367; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
4368; GFX9-NEXT:    ; return to shader part epilog
4369;
4370; GFX10-LABEL: ssubsat_i48_sv:
4371; GFX10:       ; %bb.0:
4372; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
4373; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
4374; GFX10-NEXT:    v_sub_co_u32 v2, vcc_lo, s0, v0
4375; GFX10-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
4376; GFX10-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
4377; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
4378; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, 0, v[0:1]
4379; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v4
4380; GFX10-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
4381; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc_lo
4382; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
4383; GFX10-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
4384; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
4385; GFX10-NEXT:    ; return to shader part epilog
4386;
4387; GFX11-LABEL: ssubsat_i48_sv:
4388; GFX11:       ; %bb.0:
4389; GFX11-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
4390; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
4391; GFX11-NEXT:    v_sub_co_u32 v2, vcc_lo, s0, v0
4392; GFX11-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
4393; GFX11-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
4394; GFX11-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
4395; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, 0, v[0:1]
4396; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v4
4397; GFX11-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
4398; GFX11-NEXT:    v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1
4399; GFX11-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
4400; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
4401; GFX11-NEXT:    ; return to shader part epilog
4402  %result = call i48 @llvm.ssub.sat.i48(i48 %lhs, i48 %rhs)
  ; Widen the i48 result to i64 with zeros so it can be bitcast to two floats.
4403  %ext.result = zext i48 %result to i64
4404  %cast = bitcast i64 %ext.result to <2 x float>
4405  ret <2 x float> %cast
4406}
4407
; Signed saturating subtraction on i48 where %rhs is inreg and %lhs is not
; (the operand placement is the reverse of ssubsat_i48_sv in its signature).
; The expected code reads %lhs from v[0:1] and %rhs from s[0:1]. The i48 result
; is zero-extended to i64 and bitcast to <2 x float> for the return value.
4408define amdgpu_ps <2 x float> @ssubsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
4409; GFX6-LABEL: ssubsat_i48_vs:
4410; GFX6:       ; %bb.0:
4411; GFX6-NEXT:    v_mov_b32_e32 v3, s1
4412; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s0, v0
4413; GFX6-NEXT:    v_subb_u32_e32 v4, vcc, v1, v3, vcc
4414; GFX6-NEXT:    v_bfe_i32 v3, v2, 0, 16
4415; GFX6-NEXT:    v_bfe_i32 v1, v0, 0, 16
4416; GFX6-NEXT:    s_bfe_i64 s[0:1], s[0:1], 0x300000
4417; GFX6-NEXT:    v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1]
4418; GFX6-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[0:1], 0
4419; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
4420; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 15, v3
4421; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 0xffff8000, v0
4422; GFX6-NEXT:    s_xor_b64 vcc, s[0:1], s[2:3]
4423; GFX6-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
4424; GFX6-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
4425; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
4426; GFX6-NEXT:    ; return to shader part epilog
4427;
4428; GFX8-LABEL: ssubsat_i48_vs:
4429; GFX8:       ; %bb.0:
4430; GFX8-NEXT:    v_mov_b32_e32 v3, s1
4431; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s0, v0
4432; GFX8-NEXT:    v_subb_u32_e32 v4, vcc, v1, v3, vcc
4433; GFX8-NEXT:    v_bfe_i32 v3, v2, 0, 16
4434; GFX8-NEXT:    v_bfe_i32 v1, v0, 0, 16
4435; GFX8-NEXT:    s_bfe_i64 s[0:1], s[0:1], 0x300000
4436; GFX8-NEXT:    v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1]
4437; GFX8-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[0:1], 0
4438; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
4439; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 15, v3
4440; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xffff8000, v0
4441; GFX8-NEXT:    s_xor_b64 vcc, s[0:1], s[2:3]
4442; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
4443; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
4444; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v1
4445; GFX8-NEXT:    ; return to shader part epilog
4446;
4447; GFX9-LABEL: ssubsat_i48_vs:
4448; GFX9:       ; %bb.0:
4449; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
4450; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
4451; GFX9-NEXT:    v_mov_b32_e32 v3, s1
4452; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s0, v0
4453; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
4454; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1]
4455; GFX9-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[0:1], 0
4456; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
4457; GFX9-NEXT:    v_add_u32_e32 v1, 0x80000000, v0
4458; GFX9-NEXT:    s_xor_b64 vcc, s[0:1], vcc
4459; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
4460; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
4461; GFX9-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
4462; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
4463; GFX9-NEXT:    ; return to shader part epilog
4464;
4465; GFX10-LABEL: ssubsat_i48_vs:
4466; GFX10:       ; %bb.0:
4467; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
4468; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
4469; GFX10-NEXT:    v_sub_co_u32 v2, vcc_lo, v0, s0
4470; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
4471; GFX10-NEXT:    v_cmp_gt_i64_e64 s0, s[0:1], 0
4472; GFX10-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
4473; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
4474; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v4
4475; GFX10-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
4476; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc_lo
4477; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
4478; GFX10-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
4479; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
4480; GFX10-NEXT:    ; return to shader part epilog
4481;
4482; GFX11-LABEL: ssubsat_i48_vs:
4483; GFX11:       ; %bb.0:
4484; GFX11-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
4485; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
4486; GFX11-NEXT:    v_sub_co_u32 v2, vcc_lo, v0, s0
4487; GFX11-NEXT:    v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
4488; GFX11-NEXT:    v_cmp_gt_i64_e64 s0, s[0:1], 0
4489; GFX11-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
4490; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
4491; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v4
4492; GFX11-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
4493; GFX11-NEXT:    v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1
4494; GFX11-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
4495; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
4496; GFX11-NEXT:    ; return to shader part epilog
4497  %result = call i48 @llvm.ssub.sat.i48(i48 %lhs, i48 %rhs)
  ; Widen the i48 result to i64 with zeros so it can be bitcast to two floats.
4498  %ext.result = zext i48 %result to i64
4499  %cast = bitcast i64 %ext.result to <2 x float>
4500  ret <2 x float> %cast
4501}
4502
; Signed saturating subtraction on i64 in a function with no explicit calling
; convention. The expected code takes %lhs in v[0:1] and %rhs in v[2:3], leaves
; the result in v[0:1] and returns with s_setpc_b64.
; In every expectation the overflow condition is the xor of (difference < lhs)
; and (0 < rhs). When it is set, the low word becomes the high word of the
; difference shifted right arithmetically by 31, and the high word becomes that
; same value plus 0x80000000.
4503define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
4504; GFX6-LABEL: v_ssubsat_i64:
4505; GFX6:       ; %bb.0:
4506; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4507; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v0, v2
4508; GFX6-NEXT:    v_subb_u32_e32 v5, vcc, v1, v3, vcc
4509; GFX6-NEXT:    v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1]
4510; GFX6-NEXT:    v_cmp_lt_i64_e64 s[6:7], 0, v[2:3]
4511; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v5
4512; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 0x80000000, v0
4513; GFX6-NEXT:    s_xor_b64 vcc, s[6:7], s[4:5]
4514; GFX6-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
4515; GFX6-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
4516; GFX6-NEXT:    s_setpc_b64 s[30:31]
4517;
4518; GFX8-LABEL: v_ssubsat_i64:
4519; GFX8:       ; %bb.0:
4520; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4521; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, v0, v2
4522; GFX8-NEXT:    v_subb_u32_e32 v5, vcc, v1, v3, vcc
4523; GFX8-NEXT:    v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1]
4524; GFX8-NEXT:    v_cmp_lt_i64_e64 s[6:7], 0, v[2:3]
4525; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v5
4526; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x80000000, v0
4527; GFX8-NEXT:    s_xor_b64 vcc, s[6:7], s[4:5]
4528; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
4529; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
4530; GFX8-NEXT:    s_setpc_b64 s[30:31]
4531;
4532; GFX9-LABEL: v_ssubsat_i64:
4533; GFX9:       ; %bb.0:
4534; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4535; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, v0, v2
4536; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v1, v3, vcc
4537; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
4538; GFX9-NEXT:    v_cmp_lt_i64_e64 s[4:5], 0, v[2:3]
4539; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v5
4540; GFX9-NEXT:    v_add_u32_e32 v1, 0x80000000, v0
4541; GFX9-NEXT:    s_xor_b64 vcc, s[4:5], vcc
4542; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
4543; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
4544; GFX9-NEXT:    s_setpc_b64 s[30:31]
4545;
4546; GFX10-LABEL: v_ssubsat_i64:
4547; GFX10:       ; %bb.0:
4548; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4549; GFX10-NEXT:    v_sub_co_u32 v4, vcc_lo, v0, v2
4550; GFX10-NEXT:    v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
4551; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, 0, v[2:3]
4552; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
4553; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
4554; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v6
4555; GFX10-NEXT:    s_xor_b32 vcc_lo, s4, vcc_lo
4556; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc_lo
4557; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
4558; GFX10-NEXT:    s_setpc_b64 s[30:31]
4559;
4560; GFX11-LABEL: v_ssubsat_i64:
4561; GFX11:       ; %bb.0:
4562; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4563; GFX11-NEXT:    v_sub_co_u32 v4, vcc_lo, v0, v2
4564; GFX11-NEXT:    v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
4565; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, 0, v[2:3]
4566; GFX11-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
4567; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
4568; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v6
4569; GFX11-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
4570; GFX11-NEXT:    v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1
4571; GFX11-NEXT:    s_setpc_b64 s[30:31]
4572  %result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs)
4573  ret i64 %result
4574}
4575
; Signed saturating subtraction on i64 in an amdgpu_ps function where both
; operands are inreg. The expected code reads %lhs from s[0:1] and %rhs from
; s[2:3] and performs the subtraction with s_sub_u32 / s_subb_u32.
; The compares and the final select still use vector instructions, so the
; selected value is copied back into s0 and s1 with v_readfirstlane_b32.
4576define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
4577; GFX6-LABEL: s_ssubsat_i64:
4578; GFX6:       ; %bb.0:
4579; GFX6-NEXT:    s_sub_u32 s4, s0, s2
4580; GFX6-NEXT:    v_mov_b32_e32 v0, s0
4581; GFX6-NEXT:    s_subb_u32 s5, s1, s3
4582; GFX6-NEXT:    v_mov_b32_e32 v1, s1
4583; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
4584; GFX6-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
4585; GFX6-NEXT:    s_ashr_i32 s2, s5, 31
4586; GFX6-NEXT:    s_add_i32 s3, s2, 0x80000000
4587; GFX6-NEXT:    v_mov_b32_e32 v0, s2
4588; GFX6-NEXT:    v_mov_b32_e32 v1, s3
4589; GFX6-NEXT:    v_mov_b32_e32 v2, s4
4590; GFX6-NEXT:    v_mov_b32_e32 v3, s5
4591; GFX6-NEXT:    s_xor_b64 vcc, s[0:1], vcc
4592; GFX6-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
4593; GFX6-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
4594; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
4595; GFX6-NEXT:    v_readfirstlane_b32 s1, v1
4596; GFX6-NEXT:    ; return to shader part epilog
4597;
4598; GFX8-LABEL: s_ssubsat_i64:
4599; GFX8:       ; %bb.0:
4600; GFX8-NEXT:    s_sub_u32 s4, s0, s2
4601; GFX8-NEXT:    v_mov_b32_e32 v0, s0
4602; GFX8-NEXT:    s_subb_u32 s5, s1, s3
4603; GFX8-NEXT:    v_mov_b32_e32 v1, s1
4604; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
4605; GFX8-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
4606; GFX8-NEXT:    s_ashr_i32 s2, s5, 31
4607; GFX8-NEXT:    s_add_i32 s3, s2, 0x80000000
4608; GFX8-NEXT:    v_mov_b32_e32 v0, s2
4609; GFX8-NEXT:    v_mov_b32_e32 v1, s3
4610; GFX8-NEXT:    v_mov_b32_e32 v2, s4
4611; GFX8-NEXT:    v_mov_b32_e32 v3, s5
4612; GFX8-NEXT:    s_xor_b64 vcc, s[0:1], vcc
4613; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
4614; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
4615; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
4616; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
4617; GFX8-NEXT:    ; return to shader part epilog
4618;
4619; GFX9-LABEL: s_ssubsat_i64:
4620; GFX9:       ; %bb.0:
4621; GFX9-NEXT:    s_sub_u32 s4, s0, s2
4622; GFX9-NEXT:    v_mov_b32_e32 v0, s0
4623; GFX9-NEXT:    s_subb_u32 s5, s1, s3
4624; GFX9-NEXT:    v_mov_b32_e32 v1, s1
4625; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
4626; GFX9-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
4627; GFX9-NEXT:    s_ashr_i32 s2, s5, 31
4628; GFX9-NEXT:    s_add_i32 s3, s2, 0x80000000
4629; GFX9-NEXT:    v_mov_b32_e32 v0, s2
4630; GFX9-NEXT:    v_mov_b32_e32 v1, s3
4631; GFX9-NEXT:    v_mov_b32_e32 v2, s4
4632; GFX9-NEXT:    v_mov_b32_e32 v3, s5
4633; GFX9-NEXT:    s_xor_b64 vcc, s[0:1], vcc
4634; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
4635; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
4636; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
4637; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
4638; GFX9-NEXT:    ; return to shader part epilog
4639;
4640; GFX10-LABEL: s_ssubsat_i64:
4641; GFX10:       ; %bb.0:
4642; GFX10-NEXT:    s_sub_u32 s4, s0, s2
4643; GFX10-NEXT:    s_subb_u32 s5, s1, s3
4644; GFX10-NEXT:    v_mov_b32_e32 v0, s4
4645; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, s[4:5], s[0:1]
4646; GFX10-NEXT:    v_cmp_gt_i64_e64 s1, s[2:3], 0
4647; GFX10-NEXT:    v_mov_b32_e32 v1, s5
4648; GFX10-NEXT:    s_ashr_i32 s2, s5, 31
4649; GFX10-NEXT:    s_add_i32 s3, s2, 0x80000000
4650; GFX10-NEXT:    s_xor_b32 s0, s1, s0
4651; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s2, s0
4652; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s3, s0
4653; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
4654; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
4655; GFX10-NEXT:    ; return to shader part epilog
4656;
4657; GFX11-LABEL: s_ssubsat_i64:
4658; GFX11:       ; %bb.0:
4659; GFX11-NEXT:    s_sub_u32 s4, s0, s2
4660; GFX11-NEXT:    s_subb_u32 s5, s1, s3
4661; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
4662; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, s[4:5], s[0:1]
4663; GFX11-NEXT:    v_cmp_gt_i64_e64 s1, s[2:3], 0
4664; GFX11-NEXT:    s_ashr_i32 s2, s5, 31
4665; GFX11-NEXT:    s_add_i32 s3, s2, 0x80000000
4666; GFX11-NEXT:    s_xor_b32 s0, s1, s0
4667; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, s2, s0
4668; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s3, s0
4669; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
4670; GFX11-NEXT:    v_readfirstlane_b32 s1, v1
4671; GFX11-NEXT:    ; return to shader part epilog
4672  %result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs)
4673  ret i64 %result
4674}
4675
; Signed saturating subtraction on i64 where %lhs is inreg and %rhs is not.
; The expected code reads %lhs from s[0:1] and %rhs from v[0:1] and leaves the
; result in v[0:1]. The i64 result is bitcast to <2 x float> for the return.
4676define amdgpu_ps <2 x float> @ssubsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
4677; GFX6-LABEL: ssubsat_i64_sv:
4678; GFX6:       ; %bb.0:
4679; GFX6-NEXT:    v_mov_b32_e32 v3, s1
4680; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s0, v0
4681; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v3, v1, vcc
4682; GFX6-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3]
4683; GFX6-NEXT:    v_cmp_lt_i64_e64 s[2:3], 0, v[0:1]
4684; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
4685; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 0x80000000, v0
4686; GFX6-NEXT:    s_xor_b64 vcc, s[2:3], s[0:1]
4687; GFX6-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
4688; GFX6-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
4689; GFX6-NEXT:    ; return to shader part epilog
4690;
4691; GFX8-LABEL: ssubsat_i64_sv:
4692; GFX8:       ; %bb.0:
4693; GFX8-NEXT:    v_mov_b32_e32 v3, s1
4694; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s0, v0
4695; GFX8-NEXT:    v_subb_u32_e32 v3, vcc, v3, v1, vcc
4696; GFX8-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3]
4697; GFX8-NEXT:    v_cmp_lt_i64_e64 s[2:3], 0, v[0:1]
4698; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
4699; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x80000000, v0
4700; GFX8-NEXT:    s_xor_b64 vcc, s[2:3], s[0:1]
4701; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
4702; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
4703; GFX8-NEXT:    ; return to shader part epilog
4704;
4705; GFX9-LABEL: ssubsat_i64_sv:
4706; GFX9:       ; %bb.0:
4707; GFX9-NEXT:    v_mov_b32_e32 v3, s1
4708; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, s0, v0
4709; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v1, vcc
4710; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3]
4711; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], 0, v[0:1]
4712; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
4713; GFX9-NEXT:    v_add_u32_e32 v1, 0x80000000, v0
4714; GFX9-NEXT:    s_xor_b64 vcc, s[0:1], vcc
4715; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
4716; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
4717; GFX9-NEXT:    ; return to shader part epilog
4718;
4719; GFX10-LABEL: ssubsat_i64_sv:
4720; GFX10:       ; %bb.0:
4721; GFX10-NEXT:    v_sub_co_u32 v2, vcc_lo, s0, v0
4722; GFX10-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
4723; GFX10-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
4724; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
4725; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, 0, v[0:1]
4726; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v4
4727; GFX10-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
4728; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc_lo
4729; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
4730; GFX10-NEXT:    ; return to shader part epilog
4731;
4732; GFX11-LABEL: ssubsat_i64_sv:
4733; GFX11:       ; %bb.0:
4734; GFX11-NEXT:    v_sub_co_u32 v2, vcc_lo, s0, v0
4735; GFX11-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
4736; GFX11-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
4737; GFX11-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
4738; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, 0, v[0:1]
4739; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v4
4740; GFX11-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
4741; GFX11-NEXT:    v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1
4742; GFX11-NEXT:    ; return to shader part epilog
4743  %result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs)
4744  %cast = bitcast i64 %result to <2 x float>
4745  ret <2 x float> %cast
4746}
4747
; Signed saturating subtraction on i64 where %rhs is inreg and %lhs is not.
; The expected code reads %lhs from v[0:1] and %rhs from s[0:1] and leaves the
; result in v[0:1]. The i64 result is bitcast to <2 x float> for the return.
4748define amdgpu_ps <2 x float> @ssubsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
4749; GFX6-LABEL: ssubsat_i64_vs:
4750; GFX6:       ; %bb.0:
4751; GFX6-NEXT:    v_mov_b32_e32 v3, s1
4752; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s0, v0
4753; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v1, v3, vcc
4754; GFX6-NEXT:    v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1]
4755; GFX6-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[0:1], 0
4756; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
4757; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 0x80000000, v0
4758; GFX6-NEXT:    s_xor_b64 vcc, s[0:1], s[2:3]
4759; GFX6-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
4760; GFX6-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
4761; GFX6-NEXT:    ; return to shader part epilog
4762;
4763; GFX8-LABEL: ssubsat_i64_vs:
4764; GFX8:       ; %bb.0:
4765; GFX8-NEXT:    v_mov_b32_e32 v3, s1
4766; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s0, v0
4767; GFX8-NEXT:    v_subb_u32_e32 v3, vcc, v1, v3, vcc
4768; GFX8-NEXT:    v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1]
4769; GFX8-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[0:1], 0
4770; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
4771; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x80000000, v0
4772; GFX8-NEXT:    s_xor_b64 vcc, s[0:1], s[2:3]
4773; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
4774; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
4775; GFX8-NEXT:    ; return to shader part epilog
4776;
4777; GFX9-LABEL: ssubsat_i64_vs:
4778; GFX9:       ; %bb.0:
4779; GFX9-NEXT:    v_mov_b32_e32 v3, s1
4780; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s0, v0
4781; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
4782; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1]
4783; GFX9-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[0:1], 0
4784; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
4785; GFX9-NEXT:    v_add_u32_e32 v1, 0x80000000, v0
4786; GFX9-NEXT:    s_xor_b64 vcc, s[0:1], vcc
4787; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
4788; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
4789; GFX9-NEXT:    ; return to shader part epilog
4790;
4791; GFX10-LABEL: ssubsat_i64_vs:
4792; GFX10:       ; %bb.0:
4793; GFX10-NEXT:    v_sub_co_u32 v2, vcc_lo, v0, s0
4794; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
4795; GFX10-NEXT:    v_cmp_gt_i64_e64 s0, s[0:1], 0
4796; GFX10-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
4797; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
4798; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v4
4799; GFX10-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
4800; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc_lo
4801; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
4802; GFX10-NEXT:    ; return to shader part epilog
4803;
4804; GFX11-LABEL: ssubsat_i64_vs:
4805; GFX11:       ; %bb.0:
4806; GFX11-NEXT:    v_sub_co_u32 v2, vcc_lo, v0, s0
4807; GFX11-NEXT:    v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
4808; GFX11-NEXT:    v_cmp_gt_i64_e64 s0, s[0:1], 0
4809; GFX11-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
4810; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
4811; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v4
4812; GFX11-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
4813; GFX11-NEXT:    v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1
4814; GFX11-NEXT:    ; return to shader part epilog
4815  %result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs)
4816  %cast = bitcast i64 %result to <2 x float>
4817  ret <2 x float> %cast
4818}
4819
; Signed saturating subtraction on a two-element i64 vector in a function with
; no explicit calling convention. The expected code takes %lhs in v[0:3] and
; %rhs in v[4:7] and leaves the result in v[0:3].
; Each element gets its own subtract, compare pair, xor and select. The GFX6,
; GFX8 and GFX9 expectations finish element 0 before starting element 1, while
; the GFX10 and GFX11 expectations interleave the two elements.
4820define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
4821; GFX6-LABEL: v_ssubsat_v2i64:
4822; GFX6:       ; %bb.0:
4823; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4824; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, v0, v4
4825; GFX6-NEXT:    v_subb_u32_e32 v9, vcc, v1, v5, vcc
4826; GFX6-NEXT:    v_cmp_lt_i64_e64 s[4:5], v[8:9], v[0:1]
4827; GFX6-NEXT:    v_cmp_lt_i64_e64 s[6:7], 0, v[4:5]
4828; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v9
4829; GFX6-NEXT:    v_bfrev_b32_e32 v1, 1
4830; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v0, v1
4831; GFX6-NEXT:    s_xor_b64 vcc, s[6:7], s[4:5]
4832; GFX6-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
4833; GFX6-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
4834; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v2, v6
4835; GFX6-NEXT:    v_subb_u32_e32 v5, vcc, v3, v7, vcc
4836; GFX6-NEXT:    v_cmp_lt_i64_e64 s[4:5], v[4:5], v[2:3]
4837; GFX6-NEXT:    v_cmp_lt_i64_e64 s[6:7], 0, v[6:7]
4838; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 31, v5
4839; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 0x80000000, v2
4840; GFX6-NEXT:    s_xor_b64 vcc, s[6:7], s[4:5]
4841; GFX6-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
4842; GFX6-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
4843; GFX6-NEXT:    s_setpc_b64 s[30:31]
4844;
4845; GFX8-LABEL: v_ssubsat_v2i64:
4846; GFX8:       ; %bb.0:
4847; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4848; GFX8-NEXT:    v_sub_u32_e32 v8, vcc, v0, v4
4849; GFX8-NEXT:    v_subb_u32_e32 v9, vcc, v1, v5, vcc
4850; GFX8-NEXT:    v_cmp_lt_i64_e64 s[4:5], v[8:9], v[0:1]
4851; GFX8-NEXT:    v_cmp_lt_i64_e64 s[6:7], 0, v[4:5]
4852; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v9
4853; GFX8-NEXT:    v_bfrev_b32_e32 v1, 1
4854; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v0, v1
4855; GFX8-NEXT:    s_xor_b64 vcc, s[6:7], s[4:5]
4856; GFX8-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
4857; GFX8-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
4858; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, v2, v6
4859; GFX8-NEXT:    v_subb_u32_e32 v5, vcc, v3, v7, vcc
4860; GFX8-NEXT:    v_cmp_lt_i64_e64 s[4:5], v[4:5], v[2:3]
4861; GFX8-NEXT:    v_cmp_lt_i64_e64 s[6:7], 0, v[6:7]
4862; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 31, v5
4863; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x80000000, v2
4864; GFX8-NEXT:    s_xor_b64 vcc, s[6:7], s[4:5]
4865; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
4866; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
4867; GFX8-NEXT:    s_setpc_b64 s[30:31]
4868;
4869; GFX9-LABEL: v_ssubsat_v2i64:
4870; GFX9:       ; %bb.0:
4871; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4872; GFX9-NEXT:    v_sub_co_u32_e32 v8, vcc, v0, v4
4873; GFX9-NEXT:    v_subb_co_u32_e32 v9, vcc, v1, v5, vcc
4874; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[8:9], v[0:1]
4875; GFX9-NEXT:    v_cmp_lt_i64_e64 s[4:5], 0, v[4:5]
4876; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v9
4877; GFX9-NEXT:    v_add_u32_e32 v1, 0x80000000, v0
4878; GFX9-NEXT:    s_xor_b64 vcc, s[4:5], vcc
4879; GFX9-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
4880; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
4881; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, v2, v6
4882; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v3, v7, vcc
4883; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[2:3]
4884; GFX9-NEXT:    v_cmp_lt_i64_e64 s[4:5], 0, v[6:7]
4885; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v5
4886; GFX9-NEXT:    v_add_u32_e32 v3, 0x80000000, v2
4887; GFX9-NEXT:    s_xor_b64 vcc, s[4:5], vcc
4888; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
4889; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
4890; GFX9-NEXT:    s_setpc_b64 s[30:31]
4891;
4892; GFX10-LABEL: v_ssubsat_v2i64:
4893; GFX10:       ; %bb.0:
4894; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4895; GFX10-NEXT:    v_sub_co_u32 v8, vcc_lo, v0, v4
4896; GFX10-NEXT:    v_sub_co_ci_u32_e32 v9, vcc_lo, v1, v5, vcc_lo
4897; GFX10-NEXT:    v_sub_co_u32 v10, vcc_lo, v2, v6
4898; GFX10-NEXT:    v_sub_co_ci_u32_e32 v11, vcc_lo, v3, v7, vcc_lo
4899; GFX10-NEXT:    v_ashrrev_i32_e32 v12, 31, v9
4900; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1]
4901; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, 0, v[4:5]
4902; GFX10-NEXT:    v_ashrrev_i32_e32 v4, 31, v11
4903; GFX10-NEXT:    v_cmp_lt_i64_e64 s5, v[10:11], v[2:3]
4904; GFX10-NEXT:    v_cmp_lt_i64_e64 s6, 0, v[6:7]
4905; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v12
4906; GFX10-NEXT:    v_add_nc_u32_e32 v3, 0x80000000, v4
4907; GFX10-NEXT:    s_xor_b32 vcc_lo, s4, vcc_lo
4908; GFX10-NEXT:    v_cndmask_b32_e32 v0, v8, v12, vcc_lo
4909; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc_lo
4910; GFX10-NEXT:    s_xor_b32 vcc_lo, s6, s5
4911; GFX10-NEXT:    v_cndmask_b32_e32 v2, v10, v4, vcc_lo
4912; GFX10-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc_lo
4913; GFX10-NEXT:    s_setpc_b64 s[30:31]
4914;
4915; GFX11-LABEL: v_ssubsat_v2i64:
4916; GFX11:       ; %bb.0:
4917; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4918; GFX11-NEXT:    v_sub_co_u32 v8, vcc_lo, v0, v4
4919; GFX11-NEXT:    v_sub_co_ci_u32_e32 v9, vcc_lo, v1, v5, vcc_lo
4920; GFX11-NEXT:    v_sub_co_u32 v10, vcc_lo, v2, v6
4921; GFX11-NEXT:    v_sub_co_ci_u32_e32 v11, vcc_lo, v3, v7, vcc_lo
4922; GFX11-NEXT:    v_ashrrev_i32_e32 v12, 31, v9
4923; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1]
4924; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, 0, v[4:5]
4925; GFX11-NEXT:    v_ashrrev_i32_e32 v4, 31, v11
4926; GFX11-NEXT:    v_cmp_lt_i64_e64 s1, v[10:11], v[2:3]
4927; GFX11-NEXT:    v_cmp_lt_i64_e64 s2, 0, v[6:7]
4928; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v12
4929; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0x80000000, v4
4930; GFX11-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
4931; GFX11-NEXT:    v_dual_cndmask_b32 v0, v8, v12 :: v_dual_cndmask_b32 v1, v9, v1
4932; GFX11-NEXT:    s_xor_b32 vcc_lo, s2, s1
4933; GFX11-NEXT:    v_dual_cndmask_b32 v2, v10, v4 :: v_dual_cndmask_b32 v3, v11, v3
4934; GFX11-NEXT:    s_setpc_b64 s[30:31]
4935  %result = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
4936  ret <2 x i64> %result
4937}
4938
; Signed saturating subtract of <2 x i64> with both operands marked inreg.
; The expected code subtracts each 64-bit element on the scalar unit
; (s_sub_u32 followed by s_subb_u32). For each element the checks compute
; overflow = (rhs > 0) xor (difference < lhs), build the saturated value from
; the sign word of the difference (s_ashr_i32 by 31 gives the low dword, that
; value plus 0x80000000 gives the high dword), choose between difference and
; saturated value with v_cndmask_b32, and copy the four result dwords back to
; s0 through s3 with v_readfirstlane_b32.
4939define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inreg %rhs) {
4940; GFX6-LABEL: s_ssubsat_v2i64:
4941; GFX6:       ; %bb.0:
4942; GFX6-NEXT:    s_sub_u32 s8, s0, s4
4943; GFX6-NEXT:    v_mov_b32_e32 v0, s0
4944; GFX6-NEXT:    s_subb_u32 s9, s1, s5
4945; GFX6-NEXT:    v_mov_b32_e32 v1, s1
4946; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
4947; GFX6-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[4:5], 0
4948; GFX6-NEXT:    s_ashr_i32 s4, s9, 31
4949; GFX6-NEXT:    s_add_i32 s5, s4, 0x80000000
4950; GFX6-NEXT:    v_mov_b32_e32 v0, s4
4951; GFX6-NEXT:    v_mov_b32_e32 v1, s5
4952; GFX6-NEXT:    v_mov_b32_e32 v2, s8
4953; GFX6-NEXT:    v_mov_b32_e32 v3, s9
4954; GFX6-NEXT:    s_xor_b64 vcc, s[0:1], vcc
4955; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
4956; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
4957; GFX6-NEXT:    s_sub_u32 s0, s2, s6
4958; GFX6-NEXT:    v_mov_b32_e32 v0, s2
4959; GFX6-NEXT:    s_subb_u32 s1, s3, s7
4960; GFX6-NEXT:    v_mov_b32_e32 v1, s3
4961; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
4962; GFX6-NEXT:    v_cmp_gt_i64_e64 s[2:3], s[6:7], 0
4963; GFX6-NEXT:    s_ashr_i32 s4, s1, 31
4964; GFX6-NEXT:    s_add_i32 s5, s4, 0x80000000
4965; GFX6-NEXT:    v_mov_b32_e32 v0, s4
4966; GFX6-NEXT:    v_mov_b32_e32 v1, s5
4967; GFX6-NEXT:    v_mov_b32_e32 v4, s0
4968; GFX6-NEXT:    v_mov_b32_e32 v5, s1
4969; GFX6-NEXT:    s_xor_b64 vcc, s[2:3], vcc
4970; GFX6-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
4971; GFX6-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
4972; GFX6-NEXT:    v_readfirstlane_b32 s0, v2
4973; GFX6-NEXT:    v_readfirstlane_b32 s1, v3
4974; GFX6-NEXT:    v_readfirstlane_b32 s2, v0
4975; GFX6-NEXT:    v_readfirstlane_b32 s3, v1
4976; GFX6-NEXT:    ; return to shader part epilog
4977;
4978; GFX8-LABEL: s_ssubsat_v2i64:
4979; GFX8:       ; %bb.0:
4980; GFX8-NEXT:    s_sub_u32 s8, s0, s4
4981; GFX8-NEXT:    v_mov_b32_e32 v0, s0
4982; GFX8-NEXT:    s_subb_u32 s9, s1, s5
4983; GFX8-NEXT:    v_mov_b32_e32 v1, s1
4984; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
4985; GFX8-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[4:5], 0
4986; GFX8-NEXT:    s_ashr_i32 s4, s9, 31
4987; GFX8-NEXT:    s_add_i32 s5, s4, 0x80000000
4988; GFX8-NEXT:    v_mov_b32_e32 v0, s4
4989; GFX8-NEXT:    v_mov_b32_e32 v1, s5
4990; GFX8-NEXT:    v_mov_b32_e32 v2, s8
4991; GFX8-NEXT:    v_mov_b32_e32 v3, s9
4992; GFX8-NEXT:    s_xor_b64 vcc, s[0:1], vcc
4993; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
4994; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
4995; GFX8-NEXT:    s_sub_u32 s0, s2, s6
4996; GFX8-NEXT:    v_mov_b32_e32 v0, s2
4997; GFX8-NEXT:    s_subb_u32 s1, s3, s7
4998; GFX8-NEXT:    v_mov_b32_e32 v1, s3
4999; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
5000; GFX8-NEXT:    v_cmp_gt_i64_e64 s[2:3], s[6:7], 0
5001; GFX8-NEXT:    s_ashr_i32 s4, s1, 31
5002; GFX8-NEXT:    s_add_i32 s5, s4, 0x80000000
5003; GFX8-NEXT:    v_mov_b32_e32 v0, s4
5004; GFX8-NEXT:    v_mov_b32_e32 v1, s5
5005; GFX8-NEXT:    v_mov_b32_e32 v4, s0
5006; GFX8-NEXT:    v_mov_b32_e32 v5, s1
5007; GFX8-NEXT:    s_xor_b64 vcc, s[2:3], vcc
5008; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
5009; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
5010; GFX8-NEXT:    v_readfirstlane_b32 s0, v2
5011; GFX8-NEXT:    v_readfirstlane_b32 s1, v3
5012; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
5013; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
5014; GFX8-NEXT:    ; return to shader part epilog
5015;
5016; GFX9-LABEL: s_ssubsat_v2i64:
5017; GFX9:       ; %bb.0:
5018; GFX9-NEXT:    s_sub_u32 s8, s0, s4
5019; GFX9-NEXT:    v_mov_b32_e32 v0, s0
5020; GFX9-NEXT:    s_subb_u32 s9, s1, s5
5021; GFX9-NEXT:    v_mov_b32_e32 v1, s1
5022; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
5023; GFX9-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[4:5], 0
5024; GFX9-NEXT:    s_ashr_i32 s4, s9, 31
5025; GFX9-NEXT:    s_add_i32 s5, s4, 0x80000000
5026; GFX9-NEXT:    v_mov_b32_e32 v0, s4
5027; GFX9-NEXT:    v_mov_b32_e32 v1, s5
5028; GFX9-NEXT:    v_mov_b32_e32 v2, s8
5029; GFX9-NEXT:    v_mov_b32_e32 v3, s9
5030; GFX9-NEXT:    s_xor_b64 vcc, s[0:1], vcc
5031; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
5032; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
5033; GFX9-NEXT:    s_sub_u32 s0, s2, s6
5034; GFX9-NEXT:    v_mov_b32_e32 v0, s2
5035; GFX9-NEXT:    s_subb_u32 s1, s3, s7
5036; GFX9-NEXT:    v_mov_b32_e32 v1, s3
5037; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
5038; GFX9-NEXT:    v_cmp_gt_i64_e64 s[2:3], s[6:7], 0
5039; GFX9-NEXT:    s_ashr_i32 s4, s1, 31
5040; GFX9-NEXT:    s_add_i32 s5, s4, 0x80000000
5041; GFX9-NEXT:    v_mov_b32_e32 v0, s4
5042; GFX9-NEXT:    v_mov_b32_e32 v1, s5
5043; GFX9-NEXT:    v_mov_b32_e32 v4, s0
5044; GFX9-NEXT:    v_mov_b32_e32 v5, s1
5045; GFX9-NEXT:    s_xor_b64 vcc, s[2:3], vcc
5046; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
5047; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
5048; GFX9-NEXT:    v_readfirstlane_b32 s0, v2
5049; GFX9-NEXT:    v_readfirstlane_b32 s1, v3
5050; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
5051; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
5052; GFX9-NEXT:    ; return to shader part epilog
5053;
5054; GFX10-LABEL: s_ssubsat_v2i64:
5055; GFX10:       ; %bb.0:
5056; GFX10-NEXT:    s_sub_u32 s8, s0, s4
5057; GFX10-NEXT:    s_subb_u32 s9, s1, s5
5058; GFX10-NEXT:    v_mov_b32_e32 v0, s8
5059; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, s[8:9], s[0:1]
5060; GFX10-NEXT:    v_cmp_gt_i64_e64 s1, s[4:5], 0
5061; GFX10-NEXT:    s_ashr_i32 s4, s9, 31
5062; GFX10-NEXT:    v_mov_b32_e32 v1, s9
5063; GFX10-NEXT:    s_add_i32 s5, s4, 0x80000000
5064; GFX10-NEXT:    s_xor_b32 s8, s1, s0
5065; GFX10-NEXT:    s_sub_u32 s0, s2, s6
5066; GFX10-NEXT:    s_subb_u32 s1, s3, s7
5067; GFX10-NEXT:    v_mov_b32_e32 v2, s0
5068; GFX10-NEXT:    v_cmp_lt_i64_e64 s2, s[0:1], s[2:3]
5069; GFX10-NEXT:    v_cmp_gt_i64_e64 s3, s[6:7], 0
5070; GFX10-NEXT:    v_mov_b32_e32 v3, s1
5071; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s4, s8
5072; GFX10-NEXT:    s_ashr_i32 s4, s1, 31
5073; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s5, s8
5074; GFX10-NEXT:    s_add_i32 s0, s4, 0x80000000
5075; GFX10-NEXT:    s_xor_b32 s1, s3, s2
5076; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s4, s1
5077; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, s0, s1
5078; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
5079; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
5080; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
5081; GFX10-NEXT:    v_readfirstlane_b32 s3, v3
5082; GFX10-NEXT:    ; return to shader part epilog
5083;
5084; GFX11-LABEL: s_ssubsat_v2i64:
5085; GFX11:       ; %bb.0:
5086; GFX11-NEXT:    s_sub_u32 s8, s0, s4
5087; GFX11-NEXT:    s_subb_u32 s9, s1, s5
5088; GFX11-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
5089; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, s[8:9], s[0:1]
5090; GFX11-NEXT:    v_cmp_gt_i64_e64 s1, s[4:5], 0
5091; GFX11-NEXT:    s_ashr_i32 s4, s9, 31
5092; GFX11-NEXT:    s_add_i32 s5, s4, 0x80000000
5093; GFX11-NEXT:    s_xor_b32 s8, s1, s0
5094; GFX11-NEXT:    s_sub_u32 s0, s2, s6
5095; GFX11-NEXT:    s_subb_u32 s1, s3, s7
5096; GFX11-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
5097; GFX11-NEXT:    v_cmp_lt_i64_e64 s2, s[0:1], s[2:3]
5098; GFX11-NEXT:    v_cmp_gt_i64_e64 s3, s[6:7], 0
5099; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, s4, s8
5100; GFX11-NEXT:    s_ashr_i32 s4, s1, 31
5101; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s5, s8
5102; GFX11-NEXT:    s_add_i32 s0, s4, 0x80000000
5103; GFX11-NEXT:    s_xor_b32 s1, s3, s2
5104; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, s4, s1
5105; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, s0, s1
5106; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
5107; GFX11-NEXT:    v_readfirstlane_b32 s1, v1
5108; GFX11-NEXT:    v_readfirstlane_b32 s2, v2
5109; GFX11-NEXT:    v_readfirstlane_b32 s3, v3
5110; GFX11-NEXT:    ; return to shader part epilog
5111  %result = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
5112  ret <2 x i64> %result
5113}
5114
; Signed saturating subtract of an i128 with both operands marked inreg.
; The expected code forms the 128-bit difference with one s_sub_u32 followed
; by three s_subb_u32. The 128-bit "difference < lhs" bit is assembled from an
; unsigned compare of the low 64 bits and a signed compare of the high 64
; bits, taking the low-half result only when the high halves are equal; the
; "rhs > 0" bit is assembled the same way against zero. The xor of those two
; bits is the overflow condition that selects, per dword, between the
; difference and the saturated value (sign word of the difference for the low
; three dwords, sign word plus 0x80000000 for the top dword). The selected
; dwords are copied back to s0 through s3 with v_readfirstlane_b32.
5115define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
5116; GFX6-LABEL: s_ssubsat_i128:
5117; GFX6:       ; %bb.0:
5118; GFX6-NEXT:    s_sub_u32 s8, s0, s4
5119; GFX6-NEXT:    v_mov_b32_e32 v0, s0
5120; GFX6-NEXT:    s_subb_u32 s9, s1, s5
5121; GFX6-NEXT:    v_mov_b32_e32 v1, s1
5122; GFX6-NEXT:    s_subb_u32 s10, s2, s6
5123; GFX6-NEXT:    v_mov_b32_e32 v2, s2
5124; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1]
5125; GFX6-NEXT:    s_subb_u32 s11, s3, s7
5126; GFX6-NEXT:    v_mov_b32_e32 v3, s3
5127; GFX6-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
5128; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, s[10:11], v[2:3]
5129; GFX6-NEXT:    v_cmp_gt_u64_e64 s[0:1], s[4:5], 0
5130; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
5131; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[10:11], v[2:3]
5132; GFX6-NEXT:    v_mov_b32_e32 v3, s9
5133; GFX6-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
5134; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
5135; GFX6-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[6:7], 0
5136; GFX6-NEXT:    v_cmp_eq_u64_e64 vcc, s[6:7], 0
5137; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
5138; GFX6-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
5139; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
5140; GFX6-NEXT:    s_ashr_i32 s0, s11, 31
5141; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
5142; GFX6-NEXT:    s_add_i32 s1, s0, 0x80000000
5143; GFX6-NEXT:    v_mov_b32_e32 v1, s0
5144; GFX6-NEXT:    v_mov_b32_e32 v2, s8
5145; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
5146; GFX6-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
5147; GFX6-NEXT:    v_cndmask_b32_e32 v2, v3, v1, vcc
5148; GFX6-NEXT:    v_mov_b32_e32 v3, s1
5149; GFX6-NEXT:    v_mov_b32_e32 v4, s10
5150; GFX6-NEXT:    v_mov_b32_e32 v5, s11
5151; GFX6-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
5152; GFX6-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
5153; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
5154; GFX6-NEXT:    v_readfirstlane_b32 s1, v2
5155; GFX6-NEXT:    v_readfirstlane_b32 s2, v1
5156; GFX6-NEXT:    v_readfirstlane_b32 s3, v3
5157; GFX6-NEXT:    ; return to shader part epilog
5158;
5159; GFX8-LABEL: s_ssubsat_i128:
5160; GFX8:       ; %bb.0:
5161; GFX8-NEXT:    s_sub_u32 s8, s0, s4
5162; GFX8-NEXT:    s_subb_u32 s9, s1, s5
5163; GFX8-NEXT:    v_mov_b32_e32 v0, s0
5164; GFX8-NEXT:    s_subb_u32 s10, s2, s6
5165; GFX8-NEXT:    v_mov_b32_e32 v1, s1
5166; GFX8-NEXT:    s_subb_u32 s11, s3, s7
5167; GFX8-NEXT:    v_mov_b32_e32 v2, s2
5168; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1]
5169; GFX8-NEXT:    v_mov_b32_e32 v3, s3
5170; GFX8-NEXT:    s_cmp_eq_u64 s[10:11], s[2:3]
5171; GFX8-NEXT:    s_cselect_b32 s0, 1, 0
5172; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
5173; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[10:11], v[2:3]
5174; GFX8-NEXT:    s_and_b32 s0, 1, s0
5175; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
5176; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
5177; GFX8-NEXT:    v_cmp_gt_u64_e64 s[0:1], s[4:5], 0
5178; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
5179; GFX8-NEXT:    s_cmp_eq_u64 s[6:7], 0
5180; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
5181; GFX8-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[6:7], 0
5182; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
5183; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
5184; GFX8-NEXT:    s_and_b32 s0, 1, s2
5185; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
5186; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
5187; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
5188; GFX8-NEXT:    s_ashr_i32 s0, s11, 31
5189; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
5190; GFX8-NEXT:    s_add_i32 s1, s0, 0x80000000
5191; GFX8-NEXT:    v_mov_b32_e32 v1, s0
5192; GFX8-NEXT:    v_mov_b32_e32 v2, s8
5193; GFX8-NEXT:    v_mov_b32_e32 v3, s9
5194; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
5195; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
5196; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v1, vcc
5197; GFX8-NEXT:    v_mov_b32_e32 v3, s1
5198; GFX8-NEXT:    v_mov_b32_e32 v4, s10
5199; GFX8-NEXT:    v_mov_b32_e32 v5, s11
5200; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
5201; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
5202; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
5203; GFX8-NEXT:    v_readfirstlane_b32 s1, v2
5204; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
5205; GFX8-NEXT:    v_readfirstlane_b32 s3, v3
5206; GFX8-NEXT:    ; return to shader part epilog
5207;
5208; GFX9-LABEL: s_ssubsat_i128:
5209; GFX9:       ; %bb.0:
5210; GFX9-NEXT:    s_sub_u32 s8, s0, s4
5211; GFX9-NEXT:    s_subb_u32 s9, s1, s5
5212; GFX9-NEXT:    v_mov_b32_e32 v0, s0
5213; GFX9-NEXT:    s_subb_u32 s10, s2, s6
5214; GFX9-NEXT:    v_mov_b32_e32 v1, s1
5215; GFX9-NEXT:    s_subb_u32 s11, s3, s7
5216; GFX9-NEXT:    v_mov_b32_e32 v2, s2
5217; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1]
5218; GFX9-NEXT:    v_mov_b32_e32 v3, s3
5219; GFX9-NEXT:    s_cmp_eq_u64 s[10:11], s[2:3]
5220; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
5221; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
5222; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[10:11], v[2:3]
5223; GFX9-NEXT:    s_and_b32 s0, 1, s0
5224; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
5225; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
5226; GFX9-NEXT:    v_cmp_gt_u64_e64 s[0:1], s[4:5], 0
5227; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
5228; GFX9-NEXT:    s_cmp_eq_u64 s[6:7], 0
5229; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
5230; GFX9-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[6:7], 0
5231; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
5232; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
5233; GFX9-NEXT:    s_and_b32 s0, 1, s2
5234; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
5235; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
5236; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
5237; GFX9-NEXT:    s_ashr_i32 s0, s11, 31
5238; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
5239; GFX9-NEXT:    s_add_i32 s1, s0, 0x80000000
5240; GFX9-NEXT:    v_mov_b32_e32 v1, s0
5241; GFX9-NEXT:    v_mov_b32_e32 v2, s8
5242; GFX9-NEXT:    v_mov_b32_e32 v3, s9
5243; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
5244; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
5245; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v1, vcc
5246; GFX9-NEXT:    v_mov_b32_e32 v3, s1
5247; GFX9-NEXT:    v_mov_b32_e32 v4, s10
5248; GFX9-NEXT:    v_mov_b32_e32 v5, s11
5249; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
5250; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
5251; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
5252; GFX9-NEXT:    v_readfirstlane_b32 s1, v2
5253; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
5254; GFX9-NEXT:    v_readfirstlane_b32 s3, v3
5255; GFX9-NEXT:    ; return to shader part epilog
5256;
5257; GFX10-LABEL: s_ssubsat_i128:
5258; GFX10:       ; %bb.0:
5259; GFX10-NEXT:    s_sub_u32 s8, s0, s4
5260; GFX10-NEXT:    s_subb_u32 s9, s1, s5
5261; GFX10-NEXT:    s_subb_u32 s10, s2, s6
5262; GFX10-NEXT:    v_cmp_lt_u64_e64 s0, s[8:9], s[0:1]
5263; GFX10-NEXT:    s_subb_u32 s11, s3, s7
5264; GFX10-NEXT:    s_cmp_eq_u64 s[10:11], s[2:3]
5265; GFX10-NEXT:    s_cselect_b32 s12, 1, 0
5266; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
5267; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, s[10:11], s[2:3]
5268; GFX10-NEXT:    v_cmp_gt_u64_e64 s2, s[4:5], 0
5269; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
5270; GFX10-NEXT:    s_and_b32 s0, 1, s12
5271; GFX10-NEXT:    s_cmp_eq_u64 s[6:7], 0
5272; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s2
5273; GFX10-NEXT:    v_cmp_gt_i64_e64 s2, s[6:7], 0
5274; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
5275; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
5276; GFX10-NEXT:    s_ashr_i32 s0, s11, 31
5277; GFX10-NEXT:    s_and_b32 s1, 1, s1
5278; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s2
5279; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
5280; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s1
5281; GFX10-NEXT:    s_add_i32 s1, s0, 0x80000000
5282; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc_lo
5283; GFX10-NEXT:    v_mov_b32_e32 v2, s9
5284; GFX10-NEXT:    v_mov_b32_e32 v3, s11
5285; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
5286; GFX10-NEXT:    v_mov_b32_e32 v1, s8
5287; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
5288; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
5289; GFX10-NEXT:    v_mov_b32_e32 v0, s10
5290; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s0, vcc_lo
5291; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s0, vcc_lo
5292; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s0, vcc_lo
5293; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, s1, vcc_lo
5294; GFX10-NEXT:    v_readfirstlane_b32 s0, v1
5295; GFX10-NEXT:    v_readfirstlane_b32 s1, v2
5296; GFX10-NEXT:    v_readfirstlane_b32 s2, v0
5297; GFX10-NEXT:    v_readfirstlane_b32 s3, v3
5298; GFX10-NEXT:    ; return to shader part epilog
5299;
5300; GFX11-LABEL: s_ssubsat_i128:
5301; GFX11:       ; %bb.0:
5302; GFX11-NEXT:    s_sub_u32 s8, s0, s4
5303; GFX11-NEXT:    s_subb_u32 s9, s1, s5
5304; GFX11-NEXT:    s_subb_u32 s10, s2, s6
5305; GFX11-NEXT:    v_cmp_lt_u64_e64 s0, s[8:9], s[0:1]
5306; GFX11-NEXT:    s_subb_u32 s11, s3, s7
5307; GFX11-NEXT:    s_cmp_eq_u64 s[10:11], s[2:3]
5308; GFX11-NEXT:    s_cselect_b32 s12, 1, 0
5309; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
5310; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, s[10:11], s[2:3]
5311; GFX11-NEXT:    v_cmp_gt_u64_e64 s2, s[4:5], 0
5312; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
5313; GFX11-NEXT:    s_and_b32 s0, 1, s12
5314; GFX11-NEXT:    s_cmp_eq_u64 s[6:7], 0
5315; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s2
5316; GFX11-NEXT:    v_cmp_gt_i64_e64 s2, s[6:7], 0
5317; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
5318; GFX11-NEXT:    s_cselect_b32 s1, 1, 0
5319; GFX11-NEXT:    s_ashr_i32 s0, s11, 31
5320; GFX11-NEXT:    s_and_b32 s1, 1, s1
5321; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s2
5322; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
5323; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s1
5324; GFX11-NEXT:    s_add_i32 s1, s0, 0x80000000
5325; GFX11-NEXT:    v_dual_cndmask_b32 v1, v3, v2 :: v_dual_mov_b32 v2, s9
5326; GFX11-NEXT:    v_mov_b32_e32 v3, s11
5327; GFX11-NEXT:    v_xor_b32_e32 v0, v1, v0
5328; GFX11-NEXT:    v_dual_mov_b32 v1, s8 :: v_dual_and_b32 v0, 1, v0
5329; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
5330; GFX11-NEXT:    v_mov_b32_e32 v0, s10
5331; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s0, vcc_lo
5332; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, s0, vcc_lo
5333; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, s0, vcc_lo
5334; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, s1, vcc_lo
5335; GFX11-NEXT:    v_readfirstlane_b32 s0, v1
5336; GFX11-NEXT:    v_readfirstlane_b32 s1, v2
5337; GFX11-NEXT:    v_readfirstlane_b32 s2, v0
5338; GFX11-NEXT:    v_readfirstlane_b32 s3, v3
5339; GFX11-NEXT:    ; return to shader part epilog
5340  %result = call i128 @llvm.ssub.sat.i128(i128 %lhs, i128 %rhs)
5341  ret i128 %result
5342}
5343
; Signed saturating subtract of i128 where the lhs is marked inreg and the rhs
; is not; the result is bitcast to <4 x float>. In the checks the lhs is read
; from s0 through s3 and the rhs from v0 through v3, and the difference is
; produced by a vector subtract-with-borrow chain that carries through vcc
; (vcc_lo on the GFX10 and GFX11 runs). The overflow bit is built as in the
; all-scalar i128 test: (128-bit lhs > difference) xor (128-bit rhs > 0), each
; composed from an unsigned low-half compare, a signed high-half compare and
; a high-half equality select. The final v_cndmask_b32 instructions leave the
; result in v0 through v3, so no v_readfirstlane_b32 is expected here.
5344define amdgpu_ps <4 x float> @ssubsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
5345; GFX6-LABEL: ssubsat_i128_sv:
5346; GFX6:       ; %bb.0:
5347; GFX6-NEXT:    v_mov_b32_e32 v5, s1
5348; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s0, v0
5349; GFX6-NEXT:    v_subb_u32_e32 v5, vcc, v5, v1, vcc
5350; GFX6-NEXT:    v_mov_b32_e32 v6, s2
5351; GFX6-NEXT:    v_mov_b32_e32 v7, s3
5352; GFX6-NEXT:    v_subb_u32_e32 v6, vcc, v6, v2, vcc
5353; GFX6-NEXT:    v_subb_u32_e32 v7, vcc, v7, v3, vcc
5354; GFX6-NEXT:    v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5]
5355; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
5356; GFX6-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[6:7]
5357; GFX6-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
5358; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[2:3], v[6:7]
5359; GFX6-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
5360; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, 0, v[0:1]
5361; GFX6-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
5362; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, 0, v[2:3]
5363; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
5364; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
5365; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 31, v7
5366; GFX6-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
5367; GFX6-NEXT:    v_xor_b32_e32 v0, v0, v8
5368; GFX6-NEXT:    v_bfrev_b32_e32 v1, 1
5369; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v2, v1
5370; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
5371; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
5372; GFX6-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
5373; GFX6-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
5374; GFX6-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
5375; GFX6-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
5376; GFX6-NEXT:    ; return to shader part epilog
5377;
5378; GFX8-LABEL: ssubsat_i128_sv:
5379; GFX8:       ; %bb.0:
5380; GFX8-NEXT:    v_mov_b32_e32 v5, s1
5381; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, s0, v0
5382; GFX8-NEXT:    v_subb_u32_e32 v5, vcc, v5, v1, vcc
5383; GFX8-NEXT:    v_mov_b32_e32 v6, s2
5384; GFX8-NEXT:    v_mov_b32_e32 v7, s3
5385; GFX8-NEXT:    v_subb_u32_e32 v6, vcc, v6, v2, vcc
5386; GFX8-NEXT:    v_subb_u32_e32 v7, vcc, v7, v3, vcc
5387; GFX8-NEXT:    v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5]
5388; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
5389; GFX8-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[6:7]
5390; GFX8-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
5391; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, s[2:3], v[6:7]
5392; GFX8-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
5393; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, 0, v[0:1]
5394; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
5395; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, 0, v[2:3]
5396; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
5397; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
5398; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 31, v7
5399; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
5400; GFX8-NEXT:    v_xor_b32_e32 v0, v0, v8
5401; GFX8-NEXT:    v_bfrev_b32_e32 v1, 1
5402; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v2, v1
5403; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
5404; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
5405; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
5406; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
5407; GFX8-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
5408; GFX8-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
5409; GFX8-NEXT:    ; return to shader part epilog
5410;
5411; GFX9-LABEL: ssubsat_i128_sv:
5412; GFX9:       ; %bb.0:
5413; GFX9-NEXT:    v_mov_b32_e32 v5, s1
5414; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, s0, v0
5415; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v5, v1, vcc
5416; GFX9-NEXT:    v_mov_b32_e32 v6, s2
5417; GFX9-NEXT:    v_mov_b32_e32 v7, s3
5418; GFX9-NEXT:    v_subb_co_u32_e32 v6, vcc, v6, v2, vcc
5419; GFX9-NEXT:    v_subb_co_u32_e32 v7, vcc, v7, v3, vcc
5420; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5]
5421; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
5422; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[6:7]
5423; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
5424; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, s[2:3], v[6:7]
5425; GFX9-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
5426; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, 0, v[0:1]
5427; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
5428; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, 0, v[2:3]
5429; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
5430; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
5431; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v7
5432; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
5433; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v8
5434; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
5435; GFX9-NEXT:    v_add_u32_e32 v3, 0x80000000, v2
5436; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
5437; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
5438; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
5439; GFX9-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
5440; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
5441; GFX9-NEXT:    ; return to shader part epilog
5442;
5443; GFX10-LABEL: ssubsat_i128_sv:
5444; GFX10:       ; %bb.0:
5445; GFX10-NEXT:    v_sub_co_u32 v4, vcc_lo, s0, v0
5446; GFX10-NEXT:    v_sub_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo
5447; GFX10-NEXT:    v_sub_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo
5448; GFX10-NEXT:    v_sub_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo
5449; GFX10-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[4:5]
5450; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc_lo
5451; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[6:7]
5452; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc_lo
5453; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, 0, v[0:1]
5454; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
5455; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3]
5456; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
5457; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[6:7]
5458; GFX10-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc_lo
5459; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3]
5460; GFX10-NEXT:    v_ashrrev_i32_e32 v2, 31, v7
5461; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
5462; GFX10-NEXT:    v_add_nc_u32_e32 v3, 0x80000000, v2
5463; GFX10-NEXT:    v_xor_b32_e32 v0, v0, v8
5464; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
5465; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
5466; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc_lo
5467; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc_lo
5468; GFX10-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc_lo
5469; GFX10-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc_lo
5470; GFX10-NEXT:    ; return to shader part epilog
5471;
5472; GFX11-LABEL: ssubsat_i128_sv:
5473; GFX11:       ; %bb.0:
5474; GFX11-NEXT:    v_sub_co_u32 v4, vcc_lo, s0, v0
5475; GFX11-NEXT:    v_sub_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo
5476; GFX11-NEXT:    v_sub_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo
5477; GFX11-NEXT:    v_sub_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo
5478; GFX11-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[4:5]
5479; GFX11-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc_lo
5480; GFX11-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[6:7]
5481; GFX11-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc_lo
5482; GFX11-NEXT:    v_cmp_lt_u64_e32 vcc_lo, 0, v[0:1]
5483; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
5484; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3]
5485; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
5486; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[6:7]
5487; GFX11-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc_lo
5488; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3]
5489; GFX11-NEXT:    v_ashrrev_i32_e32 v2, 31, v7
5490; GFX11-NEXT:    v_dual_cndmask_b32 v0, v1, v0 :: v_dual_add_nc_u32 v3, 0x80000000, v2
5491; GFX11-NEXT:    v_xor_b32_e32 v0, v0, v8
5492; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
5493; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
5494; GFX11-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc_lo
5495; GFX11-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc_lo
5496; GFX11-NEXT:    v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v3, v7, v3
5497; GFX11-NEXT:    ; return to shader part epilog
5498  %result = call i128 @llvm.ssub.sat.i128(i128 %lhs, i128 %rhs)
5499  %cast = bitcast i128 %result to <4 x float>
5500  ret <4 x float> %cast
5501}
5502
5503define amdgpu_ps <4 x float> @ssubsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
5504; GFX6-LABEL: ssubsat_i128_vs:
5505; GFX6:       ; %bb.0:
5506; GFX6-NEXT:    v_mov_b32_e32 v5, s1
5507; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s0, v0
5508; GFX6-NEXT:    v_subb_u32_e32 v5, vcc, v1, v5, vcc
5509; GFX6-NEXT:    v_mov_b32_e32 v6, s2
5510; GFX6-NEXT:    v_mov_b32_e32 v7, s3
5511; GFX6-NEXT:    v_subb_u32_e32 v6, vcc, v2, v6, vcc
5512; GFX6-NEXT:    v_subb_u32_e32 v7, vcc, v3, v7, vcc
5513; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1]
5514; GFX6-NEXT:    v_cmp_gt_u64_e64 s[0:1], s[0:1], 0
5515; GFX6-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
5516; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, v[6:7], v[2:3]
5517; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
5518; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
5519; GFX6-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
5520; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
5521; GFX6-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
5522; GFX6-NEXT:    v_cmp_eq_u64_e64 vcc, s[2:3], 0
5523; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
5524; GFX6-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
5525; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
5526; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 31, v7
5527; GFX6-NEXT:    v_bfrev_b32_e32 v1, 1
5528; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v2, v1
5529; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
5530; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
5531; GFX6-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
5532; GFX6-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
5533; GFX6-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
5534; GFX6-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
5535; GFX6-NEXT:    ; return to shader part epilog
5536;
5537; GFX8-LABEL: ssubsat_i128_vs:
5538; GFX8:       ; %bb.0:
5539; GFX8-NEXT:    v_mov_b32_e32 v5, s1
5540; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s0, v0
5541; GFX8-NEXT:    v_subb_u32_e32 v5, vcc, v1, v5, vcc
5542; GFX8-NEXT:    v_mov_b32_e32 v6, s2
5543; GFX8-NEXT:    v_mov_b32_e32 v7, s3
5544; GFX8-NEXT:    v_subb_u32_e32 v6, vcc, v2, v6, vcc
5545; GFX8-NEXT:    v_subb_u32_e32 v7, vcc, v3, v7, vcc
5546; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1]
5547; GFX8-NEXT:    v_cmp_gt_u64_e64 s[0:1], s[0:1], 0
5548; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
5549; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, v[6:7], v[2:3]
5550; GFX8-NEXT:    s_cmp_eq_u64 s[2:3], 0
5551; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
5552; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
5553; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
5554; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
5555; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
5556; GFX8-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
5557; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
5558; GFX8-NEXT:    s_and_b32 s0, 1, s4
5559; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
5560; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
5561; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
5562; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 31, v7
5563; GFX8-NEXT:    v_bfrev_b32_e32 v1, 1
5564; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v2, v1
5565; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
5566; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
5567; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
5568; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
5569; GFX8-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
5570; GFX8-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
5571; GFX8-NEXT:    ; return to shader part epilog
5572;
5573; GFX9-LABEL: ssubsat_i128_vs:
5574; GFX9:       ; %bb.0:
5575; GFX9-NEXT:    v_mov_b32_e32 v5, s1
5576; GFX9-NEXT:    v_subrev_co_u32_e32 v4, vcc, s0, v0
5577; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v1, v5, vcc
5578; GFX9-NEXT:    v_mov_b32_e32 v6, s2
5579; GFX9-NEXT:    v_mov_b32_e32 v7, s3
5580; GFX9-NEXT:    v_subb_co_u32_e32 v6, vcc, v2, v6, vcc
5581; GFX9-NEXT:    v_subb_co_u32_e32 v7, vcc, v3, v7, vcc
5582; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1]
5583; GFX9-NEXT:    v_cmp_gt_u64_e64 s[0:1], s[0:1], 0
5584; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
5585; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[6:7], v[2:3]
5586; GFX9-NEXT:    s_cmp_eq_u64 s[2:3], 0
5587; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
5588; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
5589; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
5590; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
5591; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
5592; GFX9-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
5593; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
5594; GFX9-NEXT:    s_and_b32 s0, 1, s4
5595; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
5596; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
5597; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
5598; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v7
5599; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
5600; GFX9-NEXT:    v_add_u32_e32 v3, 0x80000000, v2
5601; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
5602; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
5603; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
5604; GFX9-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
5605; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
5606; GFX9-NEXT:    ; return to shader part epilog
5607;
5608; GFX10-LABEL: ssubsat_i128_vs:
5609; GFX10:       ; %bb.0:
5610; GFX10-NEXT:    v_sub_co_u32 v4, vcc_lo, v0, s0
5611; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo
5612; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo
5613; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo
5614; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1]
5615; GFX10-NEXT:    v_cmp_gt_u64_e64 s0, s[0:1], 0
5616; GFX10-NEXT:    s_cmp_eq_u64 s[2:3], 0
5617; GFX10-NEXT:    s_cselect_b32 s4, 1, 0
5618; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
5619; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[2:3]
5620; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s0
5621; GFX10-NEXT:    v_cmp_gt_i64_e64 s0, s[2:3], 0
5622; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
5623; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3]
5624; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s0
5625; GFX10-NEXT:    s_and_b32 s0, 1, s4
5626; GFX10-NEXT:    v_ashrrev_i32_e32 v2, 31, v7
5627; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
5628; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
5629; GFX10-NEXT:    v_add_nc_u32_e32 v3, 0x80000000, v2
5630; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v8, vcc_lo
5631; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
5632; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
5633; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
5634; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc_lo
5635; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc_lo
5636; GFX10-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc_lo
5637; GFX10-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc_lo
5638; GFX10-NEXT:    ; return to shader part epilog
5639;
5640; GFX11-LABEL: ssubsat_i128_vs:
5641; GFX11:       ; %bb.0:
5642; GFX11-NEXT:    v_sub_co_u32 v4, vcc_lo, v0, s0
5643; GFX11-NEXT:    v_subrev_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo
5644; GFX11-NEXT:    v_subrev_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo
5645; GFX11-NEXT:    v_subrev_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo
5646; GFX11-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1]
5647; GFX11-NEXT:    v_cmp_gt_u64_e64 s0, s[0:1], 0
5648; GFX11-NEXT:    s_cmp_eq_u64 s[2:3], 0
5649; GFX11-NEXT:    s_cselect_b32 s4, 1, 0
5650; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
5651; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[2:3]
5652; GFX11-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s0
5653; GFX11-NEXT:    v_cmp_gt_i64_e64 s0, s[2:3], 0
5654; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
5655; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3]
5656; GFX11-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s0
5657; GFX11-NEXT:    s_and_b32 s0, 1, s4
5658; GFX11-NEXT:    v_ashrrev_i32_e32 v2, 31, v7
5659; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
5660; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
5661; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0x80000000, v2
5662; GFX11-NEXT:    v_cndmask_b32_e32 v1, v9, v8, vcc_lo
5663; GFX11-NEXT:    v_xor_b32_e32 v0, v1, v0
5664; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
5665; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
5666; GFX11-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc_lo
5667; GFX11-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc_lo
5668; GFX11-NEXT:    v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v3, v7, v3
5669; GFX11-NEXT:    ; return to shader part epilog
5670  %result = call i128 @llvm.ssub.sat.i128(i128 %lhs, i128 %rhs)
5671  %cast = bitcast i128 %result to <4 x float>
5672  ret <4 x float> %cast
5673}
5674
; Signed saturating subtract on a <2 x i128> vector (llvm.ssub.sat.v2i128),
; using the default calling convention; the expected code reads both operands
; from VGPRs (v0-v7 for %lhs, v8-v15 for %rhs).
;
; The expected code treats the two 128-bit elements independently. For each
; element it:
;   * subtracts the four dwords with a borrow chain,
;   * computes "result < lhs" and "0 < rhs" as 128-bit signed compares (an
;     unsigned compare of the low 64 bits and a signed compare of the high
;     64 bits, selected by whether the high 64 bits are equal) and XORs the
;     two bits to form the overflow condition,
;   * on overflow, replaces the difference with a saturation value built from
;     the sign-fill of the difference's top dword (arithmetic shift right by
;     31), with 0x80000000 added into the top dword.
;
; In the gfx10 and gfx11 runs the sequences for the two elements are
; interleaved, and the selects for the second element take their condition
; from an SGPR instead of vcc_lo.
;
; The check lines are generated by utils/update_llc_test_checks.py (see the
; note at the top of this file); regenerate them with that script rather than
; editing them by hand.
5675define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
5676; GFX6-LABEL: v_ssubsat_v2i128:
5677; GFX6:       ; %bb.0:
5678; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5679; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, v0, v8
5680; GFX6-NEXT:    v_subb_u32_e32 v17, vcc, v1, v9, vcc
5681; GFX6-NEXT:    v_subb_u32_e32 v18, vcc, v2, v10, vcc
5682; GFX6-NEXT:    v_subb_u32_e32 v19, vcc, v3, v11, vcc
5683; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, v[16:17], v[0:1]
5684; GFX6-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
5685; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, v[18:19], v[2:3]
5686; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
5687; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, v[18:19], v[2:3]
5688; GFX6-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
5689; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, 0, v[8:9]
5690; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
5691; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, 0, v[10:11]
5692; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
5693; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
5694; GFX6-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
5695; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
5696; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 31, v19
5697; GFX6-NEXT:    v_bfrev_b32_e32 v1, 1
5698; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v2, v1
5699; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
5700; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
5701; GFX6-NEXT:    v_cndmask_b32_e32 v0, v16, v2, vcc
5702; GFX6-NEXT:    v_cndmask_b32_e32 v1, v17, v2, vcc
5703; GFX6-NEXT:    v_cndmask_b32_e32 v2, v18, v2, vcc
5704; GFX6-NEXT:    v_cndmask_b32_e32 v3, v19, v3, vcc
5705; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, v4, v12
5706; GFX6-NEXT:    v_subb_u32_e32 v9, vcc, v5, v13, vcc
5707; GFX6-NEXT:    v_subb_u32_e32 v10, vcc, v6, v14, vcc
5708; GFX6-NEXT:    v_subb_u32_e32 v11, vcc, v7, v15, vcc
5709; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5]
5710; GFX6-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
5711; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7]
5712; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
5713; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7]
5714; GFX6-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
5715; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, 0, v[12:13]
5716; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
5717; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, 0, v[14:15]
5718; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
5719; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[14:15]
5720; GFX6-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
5721; GFX6-NEXT:    v_xor_b32_e32 v4, v5, v4
5722; GFX6-NEXT:    v_ashrrev_i32_e32 v6, 31, v11
5723; GFX6-NEXT:    v_add_i32_e32 v7, vcc, 0x80000000, v6
5724; GFX6-NEXT:    v_and_b32_e32 v4, 1, v4
5725; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
5726; GFX6-NEXT:    v_cndmask_b32_e32 v4, v8, v6, vcc
5727; GFX6-NEXT:    v_cndmask_b32_e32 v5, v9, v6, vcc
5728; GFX6-NEXT:    v_cndmask_b32_e32 v6, v10, v6, vcc
5729; GFX6-NEXT:    v_cndmask_b32_e32 v7, v11, v7, vcc
5730; GFX6-NEXT:    s_setpc_b64 s[30:31]
5731;
5732; GFX8-LABEL: v_ssubsat_v2i128:
5733; GFX8:       ; %bb.0:
5734; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5735; GFX8-NEXT:    v_sub_u32_e32 v16, vcc, v0, v8
5736; GFX8-NEXT:    v_subb_u32_e32 v17, vcc, v1, v9, vcc
5737; GFX8-NEXT:    v_subb_u32_e32 v18, vcc, v2, v10, vcc
5738; GFX8-NEXT:    v_subb_u32_e32 v19, vcc, v3, v11, vcc
5739; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, v[16:17], v[0:1]
5740; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
5741; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, v[18:19], v[2:3]
5742; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
5743; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[18:19], v[2:3]
5744; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
5745; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, 0, v[8:9]
5746; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
5747; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, 0, v[10:11]
5748; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
5749; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
5750; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
5751; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
5752; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 31, v19
5753; GFX8-NEXT:    v_bfrev_b32_e32 v1, 1
5754; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v2, v1
5755; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
5756; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
5757; GFX8-NEXT:    v_cndmask_b32_e32 v0, v16, v2, vcc
5758; GFX8-NEXT:    v_cndmask_b32_e32 v1, v17, v2, vcc
5759; GFX8-NEXT:    v_cndmask_b32_e32 v2, v18, v2, vcc
5760; GFX8-NEXT:    v_cndmask_b32_e32 v3, v19, v3, vcc
5761; GFX8-NEXT:    v_sub_u32_e32 v8, vcc, v4, v12
5762; GFX8-NEXT:    v_subb_u32_e32 v9, vcc, v5, v13, vcc
5763; GFX8-NEXT:    v_subb_u32_e32 v10, vcc, v6, v14, vcc
5764; GFX8-NEXT:    v_subb_u32_e32 v11, vcc, v7, v15, vcc
5765; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5]
5766; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
5767; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7]
5768; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
5769; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7]
5770; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
5771; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, 0, v[12:13]
5772; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
5773; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, 0, v[14:15]
5774; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
5775; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[14:15]
5776; GFX8-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
5777; GFX8-NEXT:    v_xor_b32_e32 v4, v5, v4
5778; GFX8-NEXT:    v_ashrrev_i32_e32 v6, 31, v11
5779; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x80000000, v6
5780; GFX8-NEXT:    v_and_b32_e32 v4, 1, v4
5781; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
5782; GFX8-NEXT:    v_cndmask_b32_e32 v4, v8, v6, vcc
5783; GFX8-NEXT:    v_cndmask_b32_e32 v5, v9, v6, vcc
5784; GFX8-NEXT:    v_cndmask_b32_e32 v6, v10, v6, vcc
5785; GFX8-NEXT:    v_cndmask_b32_e32 v7, v11, v7, vcc
5786; GFX8-NEXT:    s_setpc_b64 s[30:31]
5787;
5788; GFX9-LABEL: v_ssubsat_v2i128:
5789; GFX9:       ; %bb.0:
5790; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5791; GFX9-NEXT:    v_sub_co_u32_e32 v16, vcc, v0, v8
5792; GFX9-NEXT:    v_subb_co_u32_e32 v17, vcc, v1, v9, vcc
5793; GFX9-NEXT:    v_subb_co_u32_e32 v18, vcc, v2, v10, vcc
5794; GFX9-NEXT:    v_subb_co_u32_e32 v19, vcc, v3, v11, vcc
5795; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, v[16:17], v[0:1]
5796; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
5797; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[18:19], v[2:3]
5798; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
5799; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[18:19], v[2:3]
5800; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
5801; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, 0, v[8:9]
5802; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
5803; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, 0, v[10:11]
5804; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
5805; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
5806; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
5807; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
5808; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v19
5809; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
5810; GFX9-NEXT:    v_add_u32_e32 v3, 0x80000000, v2
5811; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
5812; GFX9-NEXT:    v_cndmask_b32_e32 v0, v16, v2, vcc
5813; GFX9-NEXT:    v_cndmask_b32_e32 v1, v17, v2, vcc
5814; GFX9-NEXT:    v_cndmask_b32_e32 v2, v18, v2, vcc
5815; GFX9-NEXT:    v_cndmask_b32_e32 v3, v19, v3, vcc
5816; GFX9-NEXT:    v_sub_co_u32_e32 v8, vcc, v4, v12
5817; GFX9-NEXT:    v_subb_co_u32_e32 v9, vcc, v5, v13, vcc
5818; GFX9-NEXT:    v_subb_co_u32_e32 v10, vcc, v6, v14, vcc
5819; GFX9-NEXT:    v_subb_co_u32_e32 v11, vcc, v7, v15, vcc
5820; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5]
5821; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
5822; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7]
5823; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
5824; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7]
5825; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
5826; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, 0, v[12:13]
5827; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
5828; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, 0, v[14:15]
5829; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
5830; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[14:15]
5831; GFX9-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
5832; GFX9-NEXT:    v_xor_b32_e32 v4, v5, v4
5833; GFX9-NEXT:    v_ashrrev_i32_e32 v6, 31, v11
5834; GFX9-NEXT:    v_and_b32_e32 v4, 1, v4
5835; GFX9-NEXT:    v_add_u32_e32 v7, 0x80000000, v6
5836; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
5837; GFX9-NEXT:    v_cndmask_b32_e32 v4, v8, v6, vcc
5838; GFX9-NEXT:    v_cndmask_b32_e32 v5, v9, v6, vcc
5839; GFX9-NEXT:    v_cndmask_b32_e32 v6, v10, v6, vcc
5840; GFX9-NEXT:    v_cndmask_b32_e32 v7, v11, v7, vcc
5841; GFX9-NEXT:    s_setpc_b64 s[30:31]
5842;
5843; GFX10-LABEL: v_ssubsat_v2i128:
5844; GFX10:       ; %bb.0:
5845; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5846; GFX10-NEXT:    v_sub_co_u32 v16, vcc_lo, v0, v8
5847; GFX10-NEXT:    v_sub_co_ci_u32_e32 v17, vcc_lo, v1, v9, vcc_lo
5848; GFX10-NEXT:    v_sub_co_ci_u32_e32 v18, vcc_lo, v2, v10, vcc_lo
5849; GFX10-NEXT:    v_sub_co_ci_u32_e32 v19, vcc_lo, v3, v11, vcc_lo
5850; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[16:17], v[0:1]
5851; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
5852; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[18:19], v[2:3]
5853; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
5854; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[18:19], v[2:3]
5855; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
5856; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, 0, v[8:9]
5857; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
5858; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, 0, v[10:11]
5859; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
5860; GFX10-NEXT:    v_sub_co_u32 v8, vcc_lo, v4, v12
5861; GFX10-NEXT:    v_sub_co_ci_u32_e32 v9, vcc_lo, v5, v13, vcc_lo
5862; GFX10-NEXT:    v_sub_co_ci_u32_e32 v20, vcc_lo, v6, v14, vcc_lo
5863; GFX10-NEXT:    v_sub_co_ci_u32_e32 v21, vcc_lo, v7, v15, vcc_lo
5864; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[10:11]
5865; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc_lo
5866; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[8:9], v[4:5]
5867; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
5868; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
5869; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[20:21], v[6:7]
5870; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
5871; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc_lo
5872; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, 0, v[12:13]
5873; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
5874; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, 0, v[14:15]
5875; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
5876; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[20:21], v[6:7]
5877; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v21
5878; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc_lo
5879; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15]
5880; GFX10-NEXT:    v_add_nc_u32_e32 v7, 0x80000000, v6
5881; GFX10-NEXT:    v_cndmask_b32_e32 v2, v5, v4, vcc_lo
5882; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
5883; GFX10-NEXT:    v_xor_b32_e32 v1, v2, v1
5884; GFX10-NEXT:    v_ashrrev_i32_e32 v2, 31, v19
5885; GFX10-NEXT:    v_and_b32_e32 v3, 1, v1
5886; GFX10-NEXT:    v_add_nc_u32_e32 v4, 0x80000000, v2
5887; GFX10-NEXT:    v_cndmask_b32_e32 v0, v16, v2, vcc_lo
5888; GFX10-NEXT:    v_cndmask_b32_e32 v1, v17, v2, vcc_lo
5889; GFX10-NEXT:    v_cndmask_b32_e32 v2, v18, v2, vcc_lo
5890; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, v3
5891; GFX10-NEXT:    v_cndmask_b32_e32 v3, v19, v4, vcc_lo
5892; GFX10-NEXT:    v_cndmask_b32_e64 v4, v8, v6, s4
5893; GFX10-NEXT:    v_cndmask_b32_e64 v5, v9, v6, s4
5894; GFX10-NEXT:    v_cndmask_b32_e64 v6, v20, v6, s4
5895; GFX10-NEXT:    v_cndmask_b32_e64 v7, v21, v7, s4
5896; GFX10-NEXT:    s_setpc_b64 s[30:31]
5897;
5898; GFX11-LABEL: v_ssubsat_v2i128:
5899; GFX11:       ; %bb.0:
5900; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5901; GFX11-NEXT:    v_sub_co_u32 v16, vcc_lo, v0, v8
5902; GFX11-NEXT:    v_sub_co_ci_u32_e32 v17, vcc_lo, v1, v9, vcc_lo
5903; GFX11-NEXT:    v_sub_co_ci_u32_e32 v18, vcc_lo, v2, v10, vcc_lo
5904; GFX11-NEXT:    v_sub_co_ci_u32_e32 v19, vcc_lo, v3, v11, vcc_lo
5905; GFX11-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[16:17], v[0:1]
5906; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
5907; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[18:19], v[2:3]
5908; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
5909; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[18:19], v[2:3]
5910; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
5911; GFX11-NEXT:    v_cmp_lt_u64_e32 vcc_lo, 0, v[8:9]
5912; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
5913; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc_lo, 0, v[10:11]
5914; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
5915; GFX11-NEXT:    v_sub_co_u32 v8, vcc_lo, v4, v12
5916; GFX11-NEXT:    v_sub_co_ci_u32_e32 v9, vcc_lo, v5, v13, vcc_lo
5917; GFX11-NEXT:    v_sub_co_ci_u32_e32 v20, vcc_lo, v6, v14, vcc_lo
5918; GFX11-NEXT:    v_sub_co_ci_u32_e32 v21, vcc_lo, v7, v15, vcc_lo
5919; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[10:11]
5920; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc_lo
5921; GFX11-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[8:9], v[4:5]
5922; GFX11-NEXT:    v_xor_b32_e32 v0, v1, v0
5923; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
5924; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[20:21], v[6:7]
5925; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc_lo
5926; GFX11-NEXT:    v_cmp_lt_u64_e32 vcc_lo, 0, v[12:13]
5927; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
5928; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc_lo, 0, v[14:15]
5929; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
5930; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[20:21], v[6:7]
5931; GFX11-NEXT:    v_ashrrev_i32_e32 v6, 31, v21
5932; GFX11-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc_lo
5933; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15]
5934; GFX11-NEXT:    v_dual_cndmask_b32 v2, v5, v4 :: v_dual_add_nc_u32 v7, 0x80000000, v6
5935; GFX11-NEXT:    v_xor_b32_e32 v1, v2, v1
5936; GFX11-NEXT:    v_ashrrev_i32_e32 v2, 31, v19
5937; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
5938; GFX11-NEXT:    v_add_nc_u32_e32 v4, 0x80000000, v2
5939; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
5940; GFX11-NEXT:    v_dual_cndmask_b32 v0, v16, v2 :: v_dual_and_b32 v3, 1, v1
5941; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, 0, v3
5942; GFX11-NEXT:    v_cndmask_b32_e32 v1, v17, v2, vcc_lo
5943; GFX11-NEXT:    v_dual_cndmask_b32 v2, v18, v2 :: v_dual_cndmask_b32 v3, v19, v4
5944; GFX11-NEXT:    v_cndmask_b32_e64 v4, v8, v6, s0
5945; GFX11-NEXT:    v_cndmask_b32_e64 v5, v9, v6, s0
5946; GFX11-NEXT:    v_cndmask_b32_e64 v6, v20, v6, s0
5947; GFX11-NEXT:    v_cndmask_b32_e64 v7, v21, v7, s0
5948; GFX11-NEXT:    s_setpc_b64 s[30:31]
5949  %result = call <2 x i128> @llvm.ssub.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
5950  ret <2 x i128> %result
5951}
5952
5953define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> inreg %rhs) {
5954; GFX6-LABEL: s_ssubsat_v2i128:
5955; GFX6:       ; %bb.0:
5956; GFX6-NEXT:    s_sub_u32 s16, s0, s8
5957; GFX6-NEXT:    v_mov_b32_e32 v0, s0
5958; GFX6-NEXT:    s_subb_u32 s17, s1, s9
5959; GFX6-NEXT:    v_mov_b32_e32 v1, s1
5960; GFX6-NEXT:    s_subb_u32 s18, s2, s10
5961; GFX6-NEXT:    v_mov_b32_e32 v2, s2
5962; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[16:17], v[0:1]
5963; GFX6-NEXT:    s_subb_u32 s19, s3, s11
5964; GFX6-NEXT:    v_mov_b32_e32 v3, s3
5965; GFX6-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
5966; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, s[18:19], v[2:3]
5967; GFX6-NEXT:    v_cmp_gt_u64_e64 s[0:1], s[8:9], 0
5968; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
5969; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[18:19], v[2:3]
5970; GFX6-NEXT:    v_mov_b32_e32 v3, s17
5971; GFX6-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
5972; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
5973; GFX6-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[10:11], 0
5974; GFX6-NEXT:    v_cmp_eq_u64_e64 vcc, s[10:11], 0
5975; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
5976; GFX6-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
5977; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
5978; GFX6-NEXT:    s_ashr_i32 s0, s19, 31
5979; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
5980; GFX6-NEXT:    s_add_i32 s1, s0, 0x80000000
5981; GFX6-NEXT:    v_mov_b32_e32 v1, s0
5982; GFX6-NEXT:    v_mov_b32_e32 v2, s16
5983; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
5984; GFX6-NEXT:    v_cndmask_b32_e32 v4, v2, v1, vcc
5985; GFX6-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
5986; GFX6-NEXT:    v_mov_b32_e32 v0, s1
5987; GFX6-NEXT:    v_mov_b32_e32 v2, s18
5988; GFX6-NEXT:    v_mov_b32_e32 v3, s19
5989; GFX6-NEXT:    v_cndmask_b32_e32 v6, v2, v1, vcc
5990; GFX6-NEXT:    v_cndmask_b32_e32 v7, v3, v0, vcc
5991; GFX6-NEXT:    s_sub_u32 s0, s4, s12
5992; GFX6-NEXT:    v_mov_b32_e32 v0, s4
5993; GFX6-NEXT:    s_subb_u32 s1, s5, s13
5994; GFX6-NEXT:    v_mov_b32_e32 v1, s5
5995; GFX6-NEXT:    s_subb_u32 s2, s6, s14
5996; GFX6-NEXT:    v_mov_b32_e32 v2, s6
5997; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
5998; GFX6-NEXT:    s_subb_u32 s3, s7, s15
5999; GFX6-NEXT:    v_mov_b32_e32 v3, s7
6000; GFX6-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
6001; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
6002; GFX6-NEXT:    v_cmp_gt_u64_e64 s[4:5], s[12:13], 0
6003; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
6004; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[2:3], v[2:3]
6005; GFX6-NEXT:    v_mov_b32_e32 v3, s1
6006; GFX6-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
6007; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
6008; GFX6-NEXT:    v_cmp_gt_i64_e64 s[4:5], s[14:15], 0
6009; GFX6-NEXT:    v_cmp_eq_u64_e64 vcc, s[14:15], 0
6010; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
6011; GFX6-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
6012; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
6013; GFX6-NEXT:    s_ashr_i32 s4, s3, 31
6014; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
6015; GFX6-NEXT:    s_add_i32 s5, s4, 0x80000000
6016; GFX6-NEXT:    v_mov_b32_e32 v1, s4
6017; GFX6-NEXT:    v_mov_b32_e32 v2, s0
6018; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
6019; GFX6-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
6020; GFX6-NEXT:    v_cndmask_b32_e32 v2, v3, v1, vcc
6021; GFX6-NEXT:    v_mov_b32_e32 v3, s5
6022; GFX6-NEXT:    v_mov_b32_e32 v8, s2
6023; GFX6-NEXT:    v_mov_b32_e32 v9, s3
6024; GFX6-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
6025; GFX6-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
6026; GFX6-NEXT:    v_readfirstlane_b32 s0, v4
6027; GFX6-NEXT:    v_readfirstlane_b32 s1, v5
6028; GFX6-NEXT:    v_readfirstlane_b32 s2, v6
6029; GFX6-NEXT:    v_readfirstlane_b32 s3, v7
6030; GFX6-NEXT:    v_readfirstlane_b32 s4, v0
6031; GFX6-NEXT:    v_readfirstlane_b32 s5, v2
6032; GFX6-NEXT:    v_readfirstlane_b32 s6, v1
6033; GFX6-NEXT:    v_readfirstlane_b32 s7, v3
6034; GFX6-NEXT:    ; return to shader part epilog
6035;
6036; GFX8-LABEL: s_ssubsat_v2i128:
6037; GFX8:       ; %bb.0:
6038; GFX8-NEXT:    s_sub_u32 s16, s0, s8
6039; GFX8-NEXT:    s_subb_u32 s17, s1, s9
6040; GFX8-NEXT:    v_mov_b32_e32 v0, s0
6041; GFX8-NEXT:    s_subb_u32 s18, s2, s10
6042; GFX8-NEXT:    v_mov_b32_e32 v1, s1
6043; GFX8-NEXT:    s_subb_u32 s19, s3, s11
6044; GFX8-NEXT:    v_mov_b32_e32 v2, s2
6045; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[16:17], v[0:1]
6046; GFX8-NEXT:    v_mov_b32_e32 v3, s3
6047; GFX8-NEXT:    s_cmp_eq_u64 s[18:19], s[2:3]
6048; GFX8-NEXT:    s_cselect_b32 s0, 1, 0
6049; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
6050; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[18:19], v[2:3]
6051; GFX8-NEXT:    s_and_b32 s0, 1, s0
6052; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
6053; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
6054; GFX8-NEXT:    v_cmp_gt_u64_e64 s[0:1], s[8:9], 0
6055; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
6056; GFX8-NEXT:    s_cmp_eq_u64 s[10:11], 0
6057; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
6058; GFX8-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[10:11], 0
6059; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
6060; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
6061; GFX8-NEXT:    s_and_b32 s0, 1, s2
6062; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
6063; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
6064; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
6065; GFX8-NEXT:    s_ashr_i32 s0, s19, 31
6066; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
6067; GFX8-NEXT:    s_add_i32 s1, s0, 0x80000000
6068; GFX8-NEXT:    v_mov_b32_e32 v1, s0
6069; GFX8-NEXT:    v_mov_b32_e32 v2, s16
6070; GFX8-NEXT:    v_mov_b32_e32 v3, s17
6071; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
6072; GFX8-NEXT:    v_cndmask_b32_e32 v4, v2, v1, vcc
6073; GFX8-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
6074; GFX8-NEXT:    v_mov_b32_e32 v0, s1
6075; GFX8-NEXT:    v_mov_b32_e32 v2, s18
6076; GFX8-NEXT:    v_mov_b32_e32 v3, s19
6077; GFX8-NEXT:    s_sub_u32 s0, s4, s12
6078; GFX8-NEXT:    v_cndmask_b32_e32 v6, v2, v1, vcc
6079; GFX8-NEXT:    v_cndmask_b32_e32 v7, v3, v0, vcc
6080; GFX8-NEXT:    s_subb_u32 s1, s5, s13
6081; GFX8-NEXT:    v_mov_b32_e32 v0, s4
6082; GFX8-NEXT:    s_subb_u32 s2, s6, s14
6083; GFX8-NEXT:    v_mov_b32_e32 v1, s5
6084; GFX8-NEXT:    s_subb_u32 s3, s7, s15
6085; GFX8-NEXT:    v_mov_b32_e32 v2, s6
6086; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
6087; GFX8-NEXT:    v_mov_b32_e32 v3, s7
6088; GFX8-NEXT:    s_cmp_eq_u64 s[2:3], s[6:7]
6089; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
6090; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
6091; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
6092; GFX8-NEXT:    s_and_b32 s4, 1, s4
6093; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
6094; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
6095; GFX8-NEXT:    v_cmp_gt_u64_e64 s[4:5], s[12:13], 0
6096; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
6097; GFX8-NEXT:    s_cmp_eq_u64 s[14:15], 0
6098; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
6099; GFX8-NEXT:    v_cmp_gt_i64_e64 s[4:5], s[14:15], 0
6100; GFX8-NEXT:    s_cselect_b32 s6, 1, 0
6101; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
6102; GFX8-NEXT:    s_and_b32 s4, 1, s6
6103; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
6104; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
6105; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
6106; GFX8-NEXT:    s_ashr_i32 s4, s3, 31
6107; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
6108; GFX8-NEXT:    s_add_i32 s5, s4, 0x80000000
6109; GFX8-NEXT:    v_mov_b32_e32 v1, s4
6110; GFX8-NEXT:    v_mov_b32_e32 v2, s0
6111; GFX8-NEXT:    v_mov_b32_e32 v3, s1
6112; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
6113; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
6114; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v1, vcc
6115; GFX8-NEXT:    v_mov_b32_e32 v3, s5
6116; GFX8-NEXT:    v_mov_b32_e32 v8, s2
6117; GFX8-NEXT:    v_mov_b32_e32 v9, s3
6118; GFX8-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
6119; GFX8-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
6120; GFX8-NEXT:    v_readfirstlane_b32 s0, v4
6121; GFX8-NEXT:    v_readfirstlane_b32 s1, v5
6122; GFX8-NEXT:    v_readfirstlane_b32 s2, v6
6123; GFX8-NEXT:    v_readfirstlane_b32 s3, v7
6124; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
6125; GFX8-NEXT:    v_readfirstlane_b32 s5, v2
6126; GFX8-NEXT:    v_readfirstlane_b32 s6, v1
6127; GFX8-NEXT:    v_readfirstlane_b32 s7, v3
6128; GFX8-NEXT:    ; return to shader part epilog
6129;
6130; GFX9-LABEL: s_ssubsat_v2i128:
6131; GFX9:       ; %bb.0:
6132; GFX9-NEXT:    s_sub_u32 s16, s0, s8
6133; GFX9-NEXT:    s_subb_u32 s17, s1, s9
6134; GFX9-NEXT:    v_mov_b32_e32 v0, s0
6135; GFX9-NEXT:    s_subb_u32 s18, s2, s10
6136; GFX9-NEXT:    v_mov_b32_e32 v1, s1
6137; GFX9-NEXT:    s_subb_u32 s19, s3, s11
6138; GFX9-NEXT:    v_mov_b32_e32 v2, s2
6139; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[16:17], v[0:1]
6140; GFX9-NEXT:    v_mov_b32_e32 v3, s3
6141; GFX9-NEXT:    s_cmp_eq_u64 s[18:19], s[2:3]
6142; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
6143; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
6144; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[18:19], v[2:3]
6145; GFX9-NEXT:    s_and_b32 s0, 1, s0
6146; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
6147; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
6148; GFX9-NEXT:    v_cmp_gt_u64_e64 s[0:1], s[8:9], 0
6149; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
6150; GFX9-NEXT:    s_cmp_eq_u64 s[10:11], 0
6151; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
6152; GFX9-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[10:11], 0
6153; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
6154; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
6155; GFX9-NEXT:    s_and_b32 s0, 1, s2
6156; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
6157; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
6158; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
6159; GFX9-NEXT:    s_ashr_i32 s0, s19, 31
6160; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
6161; GFX9-NEXT:    s_add_i32 s1, s0, 0x80000000
6162; GFX9-NEXT:    v_mov_b32_e32 v1, s0
6163; GFX9-NEXT:    v_mov_b32_e32 v2, s16
6164; GFX9-NEXT:    v_mov_b32_e32 v3, s17
6165; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
6166; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v1, vcc
6167; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
6168; GFX9-NEXT:    v_mov_b32_e32 v0, s1
6169; GFX9-NEXT:    v_mov_b32_e32 v2, s18
6170; GFX9-NEXT:    v_mov_b32_e32 v3, s19
6171; GFX9-NEXT:    s_sub_u32 s0, s4, s12
6172; GFX9-NEXT:    v_cndmask_b32_e32 v6, v2, v1, vcc
6173; GFX9-NEXT:    v_cndmask_b32_e32 v7, v3, v0, vcc
6174; GFX9-NEXT:    s_subb_u32 s1, s5, s13
6175; GFX9-NEXT:    v_mov_b32_e32 v0, s4
6176; GFX9-NEXT:    s_subb_u32 s2, s6, s14
6177; GFX9-NEXT:    v_mov_b32_e32 v1, s5
6178; GFX9-NEXT:    s_subb_u32 s3, s7, s15
6179; GFX9-NEXT:    v_mov_b32_e32 v2, s6
6180; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
6181; GFX9-NEXT:    v_mov_b32_e32 v3, s7
6182; GFX9-NEXT:    s_cmp_eq_u64 s[2:3], s[6:7]
6183; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
6184; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
6185; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
6186; GFX9-NEXT:    s_and_b32 s4, 1, s4
6187; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
6188; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
6189; GFX9-NEXT:    v_cmp_gt_u64_e64 s[4:5], s[12:13], 0
6190; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
6191; GFX9-NEXT:    s_cmp_eq_u64 s[14:15], 0
6192; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
6193; GFX9-NEXT:    v_cmp_gt_i64_e64 s[4:5], s[14:15], 0
6194; GFX9-NEXT:    s_cselect_b32 s6, 1, 0
6195; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
6196; GFX9-NEXT:    s_and_b32 s4, 1, s6
6197; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
6198; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
6199; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
6200; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
6201; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
6202; GFX9-NEXT:    s_add_i32 s5, s4, 0x80000000
6203; GFX9-NEXT:    v_mov_b32_e32 v1, s4
6204; GFX9-NEXT:    v_mov_b32_e32 v2, s0
6205; GFX9-NEXT:    v_mov_b32_e32 v3, s1
6206; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
6207; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
6208; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v1, vcc
6209; GFX9-NEXT:    v_mov_b32_e32 v3, s5
6210; GFX9-NEXT:    v_mov_b32_e32 v8, s2
6211; GFX9-NEXT:    v_mov_b32_e32 v9, s3
6212; GFX9-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
6213; GFX9-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
6214; GFX9-NEXT:    v_readfirstlane_b32 s0, v4
6215; GFX9-NEXT:    v_readfirstlane_b32 s1, v5
6216; GFX9-NEXT:    v_readfirstlane_b32 s2, v6
6217; GFX9-NEXT:    v_readfirstlane_b32 s3, v7
6218; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
6219; GFX9-NEXT:    v_readfirstlane_b32 s5, v2
6220; GFX9-NEXT:    v_readfirstlane_b32 s6, v1
6221; GFX9-NEXT:    v_readfirstlane_b32 s7, v3
6222; GFX9-NEXT:    ; return to shader part epilog
6223;
6224; GFX10-LABEL: s_ssubsat_v2i128:
6225; GFX10:       ; %bb.0:
6226; GFX10-NEXT:    s_sub_u32 s18, s0, s8
6227; GFX10-NEXT:    s_subb_u32 s19, s1, s9
6228; GFX10-NEXT:    s_subb_u32 s16, s2, s10
6229; GFX10-NEXT:    v_cmp_lt_u64_e64 s0, s[18:19], s[0:1]
6230; GFX10-NEXT:    s_subb_u32 s17, s3, s11
6231; GFX10-NEXT:    s_cmp_eq_u64 s[16:17], s[2:3]
6232; GFX10-NEXT:    s_cselect_b32 s20, 1, 0
6233; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
6234; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, s[16:17], s[2:3]
6235; GFX10-NEXT:    v_cmp_gt_u64_e64 s2, s[8:9], 0
6236; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
6237; GFX10-NEXT:    s_and_b32 s0, 1, s20
6238; GFX10-NEXT:    s_cmp_eq_u64 s[10:11], 0
6239; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s2
6240; GFX10-NEXT:    v_cmp_gt_i64_e64 s2, s[10:11], 0
6241; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
6242; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
6243; GFX10-NEXT:    s_ashr_i32 s8, s17, 31
6244; GFX10-NEXT:    s_and_b32 s1, 1, s1
6245; GFX10-NEXT:    s_add_i32 s9, s8, 0x80000000
6246; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s2
6247; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
6248; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s1
6249; GFX10-NEXT:    s_sub_u32 s0, s4, s12
6250; GFX10-NEXT:    s_subb_u32 s1, s5, s13
6251; GFX10-NEXT:    s_subb_u32 s2, s6, s14
6252; GFX10-NEXT:    v_cmp_lt_u64_e64 s4, s[0:1], s[4:5]
6253; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc_lo
6254; GFX10-NEXT:    s_subb_u32 s3, s7, s15
6255; GFX10-NEXT:    v_mov_b32_e32 v5, s0
6256; GFX10-NEXT:    s_cmp_eq_u64 s[2:3], s[6:7]
6257; GFX10-NEXT:    v_mov_b32_e32 v6, s1
6258; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
6259; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s4
6260; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, s[2:3], s[6:7]
6261; GFX10-NEXT:    v_cmp_gt_u64_e64 s6, s[12:13], 0
6262; GFX10-NEXT:    s_cselect_b32 s10, 1, 0
6263; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
6264; GFX10-NEXT:    v_mov_b32_e32 v7, s3
6265; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s4
6266; GFX10-NEXT:    s_and_b32 s4, 1, s10
6267; GFX10-NEXT:    s_cmp_eq_u64 s[14:15], 0
6268; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s6
6269; GFX10-NEXT:    v_cmp_gt_i64_e64 s6, s[14:15], 0
6270; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s4
6271; GFX10-NEXT:    s_cselect_b32 s5, 1, 0
6272; GFX10-NEXT:    s_ashr_i32 s4, s3, 31
6273; GFX10-NEXT:    s_and_b32 s5, 1, s5
6274; GFX10-NEXT:    s_add_i32 s0, s4, 0x80000000
6275; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s6
6276; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc_lo
6277; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s5
6278; GFX10-NEXT:    v_cndmask_b32_e32 v2, v4, v3, vcc_lo
6279; GFX10-NEXT:    v_mov_b32_e32 v3, s18
6280; GFX10-NEXT:    v_mov_b32_e32 v4, s19
6281; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
6282; GFX10-NEXT:    v_mov_b32_e32 v0, s16
6283; GFX10-NEXT:    v_xor_b32_e32 v1, v2, v1
6284; GFX10-NEXT:    v_mov_b32_e32 v2, s17
6285; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, s8, vcc_lo
6286; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, s8, vcc_lo
6287; GFX10-NEXT:    v_and_b32_e32 v1, 1, v1
6288; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s8, vcc_lo
6289; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s9, vcc_lo
6290; GFX10-NEXT:    v_readfirstlane_b32 s1, v4
6291; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
6292; GFX10-NEXT:    v_mov_b32_e32 v1, s2
6293; GFX10-NEXT:    v_readfirstlane_b32 s2, v0
6294; GFX10-NEXT:    v_readfirstlane_b32 s3, v2
6295; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, s4, vcc_lo
6296; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, s4, vcc_lo
6297; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s4, vcc_lo
6298; GFX10-NEXT:    v_cndmask_b32_e64 v7, v7, s0, vcc_lo
6299; GFX10-NEXT:    v_readfirstlane_b32 s0, v3
6300; GFX10-NEXT:    v_readfirstlane_b32 s4, v5
6301; GFX10-NEXT:    v_readfirstlane_b32 s5, v6
6302; GFX10-NEXT:    v_readfirstlane_b32 s6, v1
6303; GFX10-NEXT:    v_readfirstlane_b32 s7, v7
6304; GFX10-NEXT:    ; return to shader part epilog
6305;
6306; GFX11-LABEL: s_ssubsat_v2i128:
6307; GFX11:       ; %bb.0:
6308; GFX11-NEXT:    s_sub_u32 s18, s0, s8
6309; GFX11-NEXT:    s_subb_u32 s19, s1, s9
6310; GFX11-NEXT:    s_subb_u32 s16, s2, s10
6311; GFX11-NEXT:    v_cmp_lt_u64_e64 s0, s[18:19], s[0:1]
6312; GFX11-NEXT:    s_subb_u32 s17, s3, s11
6313; GFX11-NEXT:    s_cmp_eq_u64 s[16:17], s[2:3]
6314; GFX11-NEXT:    s_cselect_b32 s20, 1, 0
6315; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
6316; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, s[16:17], s[2:3]
6317; GFX11-NEXT:    v_cmp_gt_u64_e64 s2, s[8:9], 0
6318; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
6319; GFX11-NEXT:    s_and_b32 s0, 1, s20
6320; GFX11-NEXT:    s_cmp_eq_u64 s[10:11], 0
6321; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s2
6322; GFX11-NEXT:    v_cmp_gt_i64_e64 s2, s[10:11], 0
6323; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
6324; GFX11-NEXT:    s_cselect_b32 s1, 1, 0
6325; GFX11-NEXT:    s_ashr_i32 s8, s17, 31
6326; GFX11-NEXT:    s_and_b32 s1, 1, s1
6327; GFX11-NEXT:    s_add_i32 s9, s8, 0x80000000
6328; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s2
6329; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
6330; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s1
6331; GFX11-NEXT:    s_sub_u32 s0, s4, s12
6332; GFX11-NEXT:    s_subb_u32 s1, s5, s13
6333; GFX11-NEXT:    s_subb_u32 s2, s6, s14
6334; GFX11-NEXT:    v_cmp_lt_u64_e64 s4, s[0:1], s[4:5]
6335; GFX11-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc_lo
6336; GFX11-NEXT:    s_subb_u32 s3, s7, s15
6337; GFX11-NEXT:    v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v7, s3
6338; GFX11-NEXT:    s_cmp_eq_u64 s[2:3], s[6:7]
6339; GFX11-NEXT:    v_xor_b32_e32 v0, v1, v0
6340; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s4
6341; GFX11-NEXT:    v_cmp_lt_i64_e64 s4, s[2:3], s[6:7]
6342; GFX11-NEXT:    v_cmp_gt_u64_e64 s6, s[12:13], 0
6343; GFX11-NEXT:    s_cselect_b32 s10, 1, 0
6344; GFX11-NEXT:    v_mov_b32_e32 v5, s0
6345; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s4
6346; GFX11-NEXT:    s_and_b32 s4, 1, s10
6347; GFX11-NEXT:    s_cmp_eq_u64 s[14:15], 0
6348; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s6
6349; GFX11-NEXT:    v_cmp_gt_i64_e64 s6, s[14:15], 0
6350; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s4
6351; GFX11-NEXT:    s_cselect_b32 s5, 1, 0
6352; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
6353; GFX11-NEXT:    s_and_b32 s5, 1, s5
6354; GFX11-NEXT:    s_ashr_i32 s4, s3, 31
6355; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc_lo
6356; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s6
6357; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s5
6358; GFX11-NEXT:    s_add_i32 s0, s4, 0x80000000
6359; GFX11-NEXT:    v_dual_cndmask_b32 v2, v4, v3 :: v_dual_mov_b32 v3, s18
6360; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
6361; GFX11-NEXT:    v_mov_b32_e32 v0, s16
6362; GFX11-NEXT:    v_xor_b32_e32 v1, v2, v1
6363; GFX11-NEXT:    v_mov_b32_e32 v4, s19
6364; GFX11-NEXT:    v_mov_b32_e32 v2, s17
6365; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, s8, vcc_lo
6366; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, s8, vcc_lo
6367; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
6368; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, s8, vcc_lo
6369; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, s9, vcc_lo
6370; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
6371; GFX11-NEXT:    v_mov_b32_e32 v1, s2
6372; GFX11-NEXT:    v_readfirstlane_b32 s1, v4
6373; GFX11-NEXT:    v_readfirstlane_b32 s2, v0
6374; GFX11-NEXT:    v_readfirstlane_b32 s3, v2
6375; GFX11-NEXT:    v_cndmask_b32_e64 v5, v5, s4, vcc_lo
6376; GFX11-NEXT:    v_cndmask_b32_e64 v6, v6, s4, vcc_lo
6377; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s4, vcc_lo
6378; GFX11-NEXT:    v_cndmask_b32_e64 v7, v7, s0, vcc_lo
6379; GFX11-NEXT:    v_readfirstlane_b32 s0, v3
6380; GFX11-NEXT:    v_readfirstlane_b32 s4, v5
6381; GFX11-NEXT:    v_readfirstlane_b32 s5, v6
6382; GFX11-NEXT:    v_readfirstlane_b32 s6, v1
6383; GFX11-NEXT:    v_readfirstlane_b32 s7, v7
6384; GFX11-NEXT:    ; return to shader part epilog
6385  %result = call <2 x i128> @llvm.ssub.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
6386  ret <2 x i128> %result
6387}
6388
; Declarations of the llvm.ssub.sat.* intrinsic overloads, grouped below by
; scalar/element bit width. Every declaration references attribute group #0,
; which is defined after the last declaration.
;
; 7-bit and 8-bit: i7 and i8 scalars, plus 2- and 4-element i8 vectors.
6389declare i7 @llvm.ssub.sat.i7(i7, i7) #0
6390declare i8 @llvm.ssub.sat.i8(i8, i8) #0
6391declare <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8>, <2 x i8>) #0
6392declare <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8>, <4 x i8>) #0
6393
; 16-bit: scalar plus vectors of 2, 3, 4, 5, 6 and 8 elements.
6394declare i16 @llvm.ssub.sat.i16(i16, i16) #0
6395declare <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16>, <2 x i16>) #0
6396declare <3 x i16> @llvm.ssub.sat.v3i16(<3 x i16>, <3 x i16>) #0
6397declare <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16>, <4 x i16>) #0
6398declare <5 x i16> @llvm.ssub.sat.v5i16(<5 x i16>, <5 x i16>) #0
6399declare <6 x i16> @llvm.ssub.sat.v6i16(<6 x i16>, <6 x i16>) #0
6400declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16>, <8 x i16>) #0
6401
; 24-bit: scalar only.
6402declare i24 @llvm.ssub.sat.i24(i24, i24) #0
6403
; 32-bit: scalar plus vectors of 2, 3, 4, 5 and 16 elements.
6404declare i32 @llvm.ssub.sat.i32(i32, i32) #0
6405declare <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32>, <2 x i32>) #0
6406declare <3 x i32> @llvm.ssub.sat.v3i32(<3 x i32>, <3 x i32>) #0
6407declare <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32>, <4 x i32>) #0
6408declare <5 x i32> @llvm.ssub.sat.v5i32(<5 x i32>, <5 x i32>) #0
6409declare <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32>, <16 x i32>) #0
6410
; 48-bit: scalar only.
6411declare i48 @llvm.ssub.sat.i48(i48, i48) #0
6412
; 64-bit: scalar plus a 2-element vector.
6413declare i64 @llvm.ssub.sat.i64(i64, i64) #0
6414declare <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64>, <2 x i64>) #0
6415
; 128-bit: scalar plus a 2-element vector.
6416declare i128 @llvm.ssub.sat.i128(i128, i128) #0
6417declare <2 x i128> @llvm.ssub.sat.v2i128(<2 x i128>, <2 x i128>) #0
6418
; Attribute group #0, shared by every declaration above: nounwind, readnone,
; speculatable and willreturn.
6419attributes #0 = { nounwind readnone speculatable willreturn }
6420