xref: /llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll (revision bfd9bc274586b0261e16e22ac50d50586a0152e2)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -o - %s | FileCheck -check-prefix=GFX6 %s
3; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
4; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
5; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
6; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
7
; Signed saturating add of two i7 values using the default calling convention.
; The body is a single call to llvm.sadd.sat.i7 whose result is returned.
; In the expected code both operands are shifted left (by 25 in the 32-bit
; tahiti sequence, by 9 in the 16-bit sequences), added with saturation, and
; the sum is arithmetic-shifted right by the same amount. The gfx900 and
; gfx10+ sequences use a single add with the clamp modifier instead of the
; explicit min/max/sub clamping.
8define i7 @v_saddsat_i7(i7 %lhs, i7 %rhs) {
9; GFX6-LABEL: v_saddsat_i7:
10; GFX6:       ; %bb.0:
11; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 25, v0
13; GFX6-NEXT:    v_min_i32_e32 v3, 0, v0
14; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 25, v1
15; GFX6-NEXT:    v_max_i32_e32 v2, 0, v0
16; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 0x80000000, v3
17; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 0x7fffffff, v2
18; GFX6-NEXT:    v_max_i32_e32 v1, v3, v1
19; GFX6-NEXT:    v_min_i32_e32 v1, v1, v2
20; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
21; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 25, v0
22; GFX6-NEXT:    s_setpc_b64 s[30:31]
23;
24; GFX8-LABEL: v_saddsat_i7:
25; GFX8:       ; %bb.0:
26; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 9, v0
28; GFX8-NEXT:    v_min_i16_e32 v3, 0, v0
29; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 9, v1
30; GFX8-NEXT:    v_max_i16_e32 v2, 0, v0
31; GFX8-NEXT:    v_sub_u16_e32 v3, 0x8000, v3
32; GFX8-NEXT:    v_sub_u16_e32 v2, 0x7fff, v2
33; GFX8-NEXT:    v_max_i16_e32 v1, v3, v1
34; GFX8-NEXT:    v_min_i16_e32 v1, v1, v2
35; GFX8-NEXT:    v_add_u16_e32 v0, v0, v1
36; GFX8-NEXT:    v_ashrrev_i16_e32 v0, 9, v0
37; GFX8-NEXT:    s_setpc_b64 s[30:31]
38;
39; GFX9-LABEL: v_saddsat_i7:
40; GFX9:       ; %bb.0:
41; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
42; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 9, v0
43; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 9, v1
44; GFX9-NEXT:    v_add_i16 v0, v0, v1 clamp
45; GFX9-NEXT:    v_ashrrev_i16_e32 v0, 9, v0
46; GFX9-NEXT:    s_setpc_b64 s[30:31]
47;
48; GFX10PLUS-LABEL: v_saddsat_i7:
49; GFX10PLUS:       ; %bb.0:
50; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
51; GFX10PLUS-NEXT:    v_lshlrev_b16 v0, 9, v0
52; GFX10PLUS-NEXT:    v_lshlrev_b16 v1, 9, v1
53; GFX10PLUS-NEXT:    v_add_nc_i16 v0, v0, v1 clamp
54; GFX10PLUS-NEXT:    v_ashrrev_i16 v0, 9, v0
55; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
; IR under test: one saturating-add intrinsic call, result returned directly.
56  %result = call i7 @llvm.sadd.sat.i7(i7 %lhs, i7 %rhs)
57  ret i7 %result
58}
59
; Signed saturating add of two i7 values with the amdgpu_ps calling
; convention and both arguments marked inreg. The expected code takes the
; operands from s0/s1 and leaves the result in s0. The tahiti and fiji
; sequences stay entirely in scalar instructions; the gfx900 and gfx10+
; sequences do the clamped add in a vector instruction and copy the result
; back with v_readfirstlane_b32.
60define amdgpu_ps i7 @s_saddsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
61; GFX6-LABEL: s_saddsat_i7:
62; GFX6:       ; %bb.0:
63; GFX6-NEXT:    s_lshl_b32 s0, s0, 25
64; GFX6-NEXT:    s_min_i32 s3, s0, 0
65; GFX6-NEXT:    s_lshl_b32 s1, s1, 25
66; GFX6-NEXT:    s_max_i32 s2, s0, 0
67; GFX6-NEXT:    s_sub_i32 s3, 0x80000000, s3
68; GFX6-NEXT:    s_sub_i32 s2, 0x7fffffff, s2
69; GFX6-NEXT:    s_max_i32 s1, s3, s1
70; GFX6-NEXT:    s_min_i32 s1, s1, s2
71; GFX6-NEXT:    s_add_i32 s0, s0, s1
72; GFX6-NEXT:    s_ashr_i32 s0, s0, 25
73; GFX6-NEXT:    ; return to shader part epilog
74;
75; GFX8-LABEL: s_saddsat_i7:
76; GFX8:       ; %bb.0:
77; GFX8-NEXT:    s_lshl_b32 s0, s0, 9
78; GFX8-NEXT:    s_sext_i32_i16 s2, s0
79; GFX8-NEXT:    s_sext_i32_i16 s3, 0
80; GFX8-NEXT:    s_max_i32 s4, s2, s3
81; GFX8-NEXT:    s_min_i32 s2, s2, s3
82; GFX8-NEXT:    s_lshl_b32 s1, s1, 9
83; GFX8-NEXT:    s_sub_i32 s2, 0xffff8000, s2
84; GFX8-NEXT:    s_sext_i32_i16 s2, s2
85; GFX8-NEXT:    s_sext_i32_i16 s1, s1
86; GFX8-NEXT:    s_sub_i32 s4, 0x7fff, s4
87; GFX8-NEXT:    s_max_i32 s1, s2, s1
88; GFX8-NEXT:    s_sext_i32_i16 s1, s1
89; GFX8-NEXT:    s_sext_i32_i16 s2, s4
90; GFX8-NEXT:    s_min_i32 s1, s1, s2
91; GFX8-NEXT:    s_add_i32 s0, s0, s1
92; GFX8-NEXT:    s_sext_i32_i16 s0, s0
93; GFX8-NEXT:    s_ashr_i32 s0, s0, 9
94; GFX8-NEXT:    ; return to shader part epilog
95;
96; GFX9-LABEL: s_saddsat_i7:
97; GFX9:       ; %bb.0:
98; GFX9-NEXT:    s_lshl_b32 s1, s1, 9
99; GFX9-NEXT:    s_lshl_b32 s0, s0, 9
100; GFX9-NEXT:    v_mov_b32_e32 v0, s1
101; GFX9-NEXT:    v_add_i16 v0, s0, v0 clamp
102; GFX9-NEXT:    v_ashrrev_i16_e32 v0, 9, v0
103; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
104; GFX9-NEXT:    ; return to shader part epilog
105;
106; GFX10PLUS-LABEL: s_saddsat_i7:
107; GFX10PLUS:       ; %bb.0:
108; GFX10PLUS-NEXT:    s_lshl_b32 s0, s0, 9
109; GFX10PLUS-NEXT:    s_lshl_b32 s1, s1, 9
110; GFX10PLUS-NEXT:    v_add_nc_i16 v0, s0, s1 clamp
111; GFX10PLUS-NEXT:    v_ashrrev_i16 v0, 9, v0
112; GFX10PLUS-NEXT:    v_readfirstlane_b32 s0, v0
113; GFX10PLUS-NEXT:    ; return to shader part epilog
; IR under test: one saturating-add intrinsic call, result returned directly.
114  %result = call i7 @llvm.sadd.sat.i7(i7 %lhs, i7 %rhs)
115  ret i7 %result
116}
117
; Signed saturating add of two i8 values using the default calling convention.
; The body is a single call to llvm.sadd.sat.i8 whose result is returned.
; The expected sequences have the same shape as the i7 variant, with shift
; amounts of 24 (32-bit tahiti sequence) and 8 (16-bit sequences).
118define i8 @v_saddsat_i8(i8 %lhs, i8 %rhs) {
119; GFX6-LABEL: v_saddsat_i8:
120; GFX6:       ; %bb.0:
121; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
122; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
123; GFX6-NEXT:    v_min_i32_e32 v3, 0, v0
124; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
125; GFX6-NEXT:    v_max_i32_e32 v2, 0, v0
126; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 0x80000000, v3
127; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 0x7fffffff, v2
128; GFX6-NEXT:    v_max_i32_e32 v1, v3, v1
129; GFX6-NEXT:    v_min_i32_e32 v1, v1, v2
130; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
131; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
132; GFX6-NEXT:    s_setpc_b64 s[30:31]
133;
134; GFX8-LABEL: v_saddsat_i8:
135; GFX8:       ; %bb.0:
136; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
137; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
138; GFX8-NEXT:    v_min_i16_e32 v3, 0, v0
139; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
140; GFX8-NEXT:    v_max_i16_e32 v2, 0, v0
141; GFX8-NEXT:    v_sub_u16_e32 v3, 0x8000, v3
142; GFX8-NEXT:    v_sub_u16_e32 v2, 0x7fff, v2
143; GFX8-NEXT:    v_max_i16_e32 v1, v3, v1
144; GFX8-NEXT:    v_min_i16_e32 v1, v1, v2
145; GFX8-NEXT:    v_add_u16_e32 v0, v0, v1
146; GFX8-NEXT:    v_ashrrev_i16_e32 v0, 8, v0
147; GFX8-NEXT:    s_setpc_b64 s[30:31]
148;
149; GFX9-LABEL: v_saddsat_i8:
150; GFX9:       ; %bb.0:
151; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
152; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
153; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
154; GFX9-NEXT:    v_add_i16 v0, v0, v1 clamp
155; GFX9-NEXT:    v_ashrrev_i16_e32 v0, 8, v0
156; GFX9-NEXT:    s_setpc_b64 s[30:31]
157;
158; GFX10PLUS-LABEL: v_saddsat_i8:
159; GFX10PLUS:       ; %bb.0:
160; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
161; GFX10PLUS-NEXT:    v_lshlrev_b16 v0, 8, v0
162; GFX10PLUS-NEXT:    v_lshlrev_b16 v1, 8, v1
163; GFX10PLUS-NEXT:    v_add_nc_i16 v0, v0, v1 clamp
164; GFX10PLUS-NEXT:    v_ashrrev_i16 v0, 8, v0
165; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
; IR under test: one saturating-add intrinsic call, result returned directly.
166  %result = call i8 @llvm.sadd.sat.i8(i8 %lhs, i8 %rhs)
167  ret i8 %result
168}
169
; Signed saturating add of two i8 values with the amdgpu_ps calling
; convention and both arguments marked inreg. The expected code takes the
; operands from s0/s1 and leaves the result in s0. The gfx900 and gfx10+
; sequences do the clamped add in a vector instruction and copy the result
; back with v_readfirstlane_b32.
170define amdgpu_ps i8 @s_saddsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
171; GFX6-LABEL: s_saddsat_i8:
172; GFX6:       ; %bb.0:
173; GFX6-NEXT:    s_lshl_b32 s0, s0, 24
174; GFX6-NEXT:    s_min_i32 s3, s0, 0
175; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
176; GFX6-NEXT:    s_max_i32 s2, s0, 0
177; GFX6-NEXT:    s_sub_i32 s3, 0x80000000, s3
178; GFX6-NEXT:    s_sub_i32 s2, 0x7fffffff, s2
179; GFX6-NEXT:    s_max_i32 s1, s3, s1
180; GFX6-NEXT:    s_min_i32 s1, s1, s2
181; GFX6-NEXT:    s_add_i32 s0, s0, s1
182; GFX6-NEXT:    s_ashr_i32 s0, s0, 24
183; GFX6-NEXT:    ; return to shader part epilog
184;
185; GFX8-LABEL: s_saddsat_i8:
186; GFX8:       ; %bb.0:
187; GFX8-NEXT:    s_lshl_b32 s0, s0, 8
188; GFX8-NEXT:    s_sext_i32_i16 s2, s0
189; GFX8-NEXT:    s_sext_i32_i16 s3, 0
190; GFX8-NEXT:    s_max_i32 s4, s2, s3
191; GFX8-NEXT:    s_min_i32 s2, s2, s3
192; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
193; GFX8-NEXT:    s_sub_i32 s2, 0xffff8000, s2
194; GFX8-NEXT:    s_sext_i32_i16 s2, s2
195; GFX8-NEXT:    s_sext_i32_i16 s1, s1
196; GFX8-NEXT:    s_sub_i32 s4, 0x7fff, s4
197; GFX8-NEXT:    s_max_i32 s1, s2, s1
198; GFX8-NEXT:    s_sext_i32_i16 s1, s1
199; GFX8-NEXT:    s_sext_i32_i16 s2, s4
200; GFX8-NEXT:    s_min_i32 s1, s1, s2
201; GFX8-NEXT:    s_add_i32 s0, s0, s1
202; GFX8-NEXT:    s_sext_i32_i16 s0, s0
203; GFX8-NEXT:    s_ashr_i32 s0, s0, 8
204; GFX8-NEXT:    ; return to shader part epilog
205;
206; GFX9-LABEL: s_saddsat_i8:
207; GFX9:       ; %bb.0:
208; GFX9-NEXT:    s_lshl_b32 s1, s1, 8
209; GFX9-NEXT:    s_lshl_b32 s0, s0, 8
210; GFX9-NEXT:    v_mov_b32_e32 v0, s1
211; GFX9-NEXT:    v_add_i16 v0, s0, v0 clamp
212; GFX9-NEXT:    v_ashrrev_i16_e32 v0, 8, v0
213; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
214; GFX9-NEXT:    ; return to shader part epilog
215;
216; GFX10PLUS-LABEL: s_saddsat_i8:
217; GFX10PLUS:       ; %bb.0:
218; GFX10PLUS-NEXT:    s_lshl_b32 s0, s0, 8
219; GFX10PLUS-NEXT:    s_lshl_b32 s1, s1, 8
220; GFX10PLUS-NEXT:    v_add_nc_i16 v0, s0, s1 clamp
221; GFX10PLUS-NEXT:    v_ashrrev_i16 v0, 8, v0
222; GFX10PLUS-NEXT:    v_readfirstlane_b32 s0, v0
223; GFX10PLUS-NEXT:    ; return to shader part epilog
; IR under test: one saturating-add intrinsic call, result returned directly.
224  %result = call i8 @llvm.sadd.sat.i8(i8 %lhs, i8 %rhs)
225  ret i8 %result
226}
227
; Signed saturating add of <2 x i8> using the default calling convention.
; Each vector is passed and returned as an i16 and bitcast to or from
; <2 x i8> in the body. The tahiti and fiji sequences clamp each byte
; separately and recombine the two bytes with and/shift/or; the gfx900,
; gfx1010 and gfx1100 sequences use the packed 16-bit add with clamp
; (v_pk_add_i16). gfx1010 and gfx1100 have separate check prefixes here
; because their final byte-repacking instructions differ.
228define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
229; GFX6-LABEL: v_saddsat_v2i8:
230; GFX6:       ; %bb.0:
231; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
232; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
233; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
234; GFX6-NEXT:    v_min_i32_e32 v5, 0, v0
235; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
236; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
237; GFX6-NEXT:    v_max_i32_e32 v4, 0, v0
238; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, 0x80000000, v5
239; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, 0x7fffffff, v4
240; GFX6-NEXT:    v_max_i32_e32 v1, v5, v1
241; GFX6-NEXT:    v_min_i32_e32 v1, v1, v4
242; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
243; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
244; GFX6-NEXT:    v_min_i32_e32 v4, 0, v1
245; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
246; GFX6-NEXT:    v_max_i32_e32 v3, 0, v1
247; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, 0x80000000, v4
248; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 0x7fffffff, v3
249; GFX6-NEXT:    v_max_i32_e32 v2, v4, v2
250; GFX6-NEXT:    v_min_i32_e32 v2, v2, v3
251; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
252; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 24, v1
253; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
254; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v1
255; GFX6-NEXT:    v_and_b32_e32 v0, 0xff, v0
256; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
257; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
258; GFX6-NEXT:    s_setpc_b64 s[30:31]
259;
260; GFX8-LABEL: v_saddsat_v2i8:
261; GFX8:       ; %bb.0:
262; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
263; GFX8-NEXT:    v_mov_b32_e32 v2, 8
264; GFX8-NEXT:    v_lshrrev_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
265; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
266; GFX8-NEXT:    v_min_i16_e32 v5, 0, v0
267; GFX8-NEXT:    v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
268; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
269; GFX8-NEXT:    v_max_i16_e32 v4, 0, v0
270; GFX8-NEXT:    v_sub_u16_e32 v5, 0x8000, v5
271; GFX8-NEXT:    v_sub_u16_e32 v4, 0x7fff, v4
272; GFX8-NEXT:    v_max_i16_e32 v1, v5, v1
273; GFX8-NEXT:    v_min_i16_e32 v1, v1, v4
274; GFX8-NEXT:    v_min_i16_e32 v4, 0, v3
275; GFX8-NEXT:    v_add_u16_e32 v0, v0, v1
276; GFX8-NEXT:    v_max_i16_e32 v1, 0, v3
277; GFX8-NEXT:    v_sub_u16_e32 v4, 0x8000, v4
278; GFX8-NEXT:    v_sub_u16_e32 v1, 0x7fff, v1
279; GFX8-NEXT:    v_max_i16_e32 v2, v4, v2
280; GFX8-NEXT:    v_min_i16_e32 v1, v2, v1
281; GFX8-NEXT:    v_add_u16_e32 v1, v3, v1
282; GFX8-NEXT:    v_mov_b32_e32 v2, 0xff
283; GFX8-NEXT:    v_and_b32_sdwa v1, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
284; GFX8-NEXT:    v_and_b32_sdwa v0, sext(v0), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
285; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
286; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
287; GFX8-NEXT:    s_setpc_b64 s[30:31]
288;
289; GFX9-LABEL: v_saddsat_v2i8:
290; GFX9:       ; %bb.0:
291; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
292; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
293; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
294; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
295; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
296; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
297; GFX9-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
298; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
299; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
300; GFX9-NEXT:    v_pk_add_i16 v0, v0, v1 clamp
301; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
302; GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
303; GFX9-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
304; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
305; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
306; GFX9-NEXT:    s_setpc_b64 s[30:31]
307;
308; GFX10-LABEL: v_saddsat_v2i8:
309; GFX10:       ; %bb.0:
310; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
311; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
312; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
313; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
314; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
315; GFX10-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
316; GFX10-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
317; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
318; GFX10-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
319; GFX10-NEXT:    v_pk_add_i16 v0, v0, v1 clamp
320; GFX10-NEXT:    v_mov_b32_e32 v1, 0xff
321; GFX10-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
322; GFX10-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
323; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
324; GFX10-NEXT:    s_setpc_b64 s[30:31]
325;
326; GFX11-LABEL: v_saddsat_v2i8:
327; GFX11:       ; %bb.0:
328; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
329; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
330; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
331; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
332; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
333; GFX11-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
334; GFX11-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
335; GFX11-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
336; GFX11-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
337; GFX11-NEXT:    v_pk_add_i16 v0, v0, v1 clamp
338; GFX11-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
339; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
340; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
341; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
342; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
343; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
344; GFX11-NEXT:    s_setpc_b64 s[30:31]
; IR under test: reinterpret both i16 arguments as <2 x i8>, apply the
; saturating add to the vectors, and reinterpret the result as i16.
345  %lhs = bitcast i16 %lhs.arg to <2 x i8>
346  %rhs = bitcast i16 %rhs.arg to <2 x i8>
347  %result = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> %lhs, <2 x i8> %rhs)
348  %cast.result = bitcast <2 x i8> %result to i16
349  ret i16 %cast.result
350}
351
; Signed saturating add of <2 x i8> with the amdgpu_ps calling convention
; and both arguments marked inreg. Each vector is passed and returned as an
; i16 and bitcast to or from <2 x i8> in the body. The tahiti and fiji
; sequences stay entirely in scalar instructions; the gfx900, gfx1010 and
; gfx1100 sequences pack the operands with s_pack_ll_b32_b16, do the clamped
; add with v_pk_add_i16, and copy the result back to s0 with
; v_readfirstlane_b32.
352define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
353; GFX6-LABEL: s_saddsat_v2i8:
354; GFX6:       ; %bb.0:
355; GFX6-NEXT:    s_lshr_b32 s2, s0, 8
356; GFX6-NEXT:    s_lshl_b32 s0, s0, 24
357; GFX6-NEXT:    s_min_i32 s5, s0, 0
358; GFX6-NEXT:    s_lshr_b32 s3, s1, 8
359; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
360; GFX6-NEXT:    s_max_i32 s4, s0, 0
361; GFX6-NEXT:    s_sub_i32 s5, 0x80000000, s5
362; GFX6-NEXT:    s_sub_i32 s4, 0x7fffffff, s4
363; GFX6-NEXT:    s_max_i32 s1, s5, s1
364; GFX6-NEXT:    s_min_i32 s1, s1, s4
365; GFX6-NEXT:    s_add_i32 s0, s0, s1
366; GFX6-NEXT:    s_lshl_b32 s1, s2, 24
367; GFX6-NEXT:    s_min_i32 s4, s1, 0
368; GFX6-NEXT:    s_lshl_b32 s2, s3, 24
369; GFX6-NEXT:    s_max_i32 s3, s1, 0
370; GFX6-NEXT:    s_sub_i32 s4, 0x80000000, s4
371; GFX6-NEXT:    s_sub_i32 s3, 0x7fffffff, s3
372; GFX6-NEXT:    s_max_i32 s2, s4, s2
373; GFX6-NEXT:    s_min_i32 s2, s2, s3
374; GFX6-NEXT:    s_add_i32 s1, s1, s2
375; GFX6-NEXT:    s_ashr_i32 s1, s1, 24
376; GFX6-NEXT:    s_ashr_i32 s0, s0, 24
377; GFX6-NEXT:    s_and_b32 s1, s1, 0xff
378; GFX6-NEXT:    s_and_b32 s0, s0, 0xff
379; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
380; GFX6-NEXT:    s_or_b32 s0, s0, s1
381; GFX6-NEXT:    ; return to shader part epilog
382;
383; GFX8-LABEL: s_saddsat_v2i8:
384; GFX8:       ; %bb.0:
385; GFX8-NEXT:    s_lshr_b32 s2, s0, 8
386; GFX8-NEXT:    s_lshl_b32 s0, s0, 8
387; GFX8-NEXT:    s_sext_i32_i16 s4, s0
388; GFX8-NEXT:    s_sext_i32_i16 s5, 0
389; GFX8-NEXT:    s_max_i32 s6, s4, s5
390; GFX8-NEXT:    s_min_i32 s4, s4, s5
391; GFX8-NEXT:    s_lshr_b32 s3, s1, 8
392; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
393; GFX8-NEXT:    s_sub_i32 s4, 0xffff8000, s4
394; GFX8-NEXT:    s_sext_i32_i16 s4, s4
395; GFX8-NEXT:    s_sext_i32_i16 s1, s1
396; GFX8-NEXT:    s_sub_i32 s6, 0x7fff, s6
397; GFX8-NEXT:    s_max_i32 s1, s4, s1
398; GFX8-NEXT:    s_sext_i32_i16 s1, s1
399; GFX8-NEXT:    s_sext_i32_i16 s4, s6
400; GFX8-NEXT:    s_min_i32 s1, s1, s4
401; GFX8-NEXT:    s_add_i32 s0, s0, s1
402; GFX8-NEXT:    s_lshl_b32 s1, s2, 8
403; GFX8-NEXT:    s_lshl_b32 s2, s3, 8
404; GFX8-NEXT:    s_sext_i32_i16 s3, s1
405; GFX8-NEXT:    s_max_i32 s4, s3, s5
406; GFX8-NEXT:    s_min_i32 s3, s3, s5
407; GFX8-NEXT:    s_sub_i32 s3, 0xffff8000, s3
408; GFX8-NEXT:    s_sext_i32_i16 s3, s3
409; GFX8-NEXT:    s_sext_i32_i16 s2, s2
410; GFX8-NEXT:    s_sub_i32 s4, 0x7fff, s4
411; GFX8-NEXT:    s_max_i32 s2, s3, s2
412; GFX8-NEXT:    s_sext_i32_i16 s2, s2
413; GFX8-NEXT:    s_sext_i32_i16 s3, s4
414; GFX8-NEXT:    s_min_i32 s2, s2, s3
415; GFX8-NEXT:    s_add_i32 s1, s1, s2
416; GFX8-NEXT:    s_sext_i32_i16 s1, s1
417; GFX8-NEXT:    s_sext_i32_i16 s0, s0
418; GFX8-NEXT:    s_ashr_i32 s1, s1, 8
419; GFX8-NEXT:    s_ashr_i32 s0, s0, 8
420; GFX8-NEXT:    s_and_b32 s1, s1, 0xff
421; GFX8-NEXT:    s_and_b32 s0, s0, 0xff
422; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
423; GFX8-NEXT:    s_or_b32 s0, s0, s1
424; GFX8-NEXT:    ; return to shader part epilog
425;
426; GFX9-LABEL: s_saddsat_v2i8:
427; GFX9:       ; %bb.0:
428; GFX9-NEXT:    s_lshr_b32 s2, s0, 8
429; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
430; GFX9-NEXT:    s_lshr_b32 s3, s1, 8
431; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
432; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
433; GFX9-NEXT:    s_lshl_b32 s0, s0, 0x80008
434; GFX9-NEXT:    s_lshl_b32 s2, s2, 8
435; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
436; GFX9-NEXT:    s_lshr_b32 s2, s1, 16
437; GFX9-NEXT:    s_lshl_b32 s1, s1, 0x80008
438; GFX9-NEXT:    s_lshl_b32 s2, s2, 8
439; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s2
440; GFX9-NEXT:    v_mov_b32_e32 v0, s1
441; GFX9-NEXT:    v_pk_add_i16 v0, s0, v0 clamp
442; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
443; GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
444; GFX9-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
445; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
446; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
447; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
448; GFX9-NEXT:    ; return to shader part epilog
449;
450; GFX10-LABEL: s_saddsat_v2i8:
451; GFX10:       ; %bb.0:
452; GFX10-NEXT:    s_lshr_b32 s2, s0, 8
453; GFX10-NEXT:    s_lshr_b32 s3, s1, 8
454; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
455; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
456; GFX10-NEXT:    s_lshr_b32 s2, s0, 16
457; GFX10-NEXT:    s_lshr_b32 s3, s1, 16
458; GFX10-NEXT:    s_lshl_b32 s0, s0, 0x80008
459; GFX10-NEXT:    s_lshl_b32 s2, s2, 8
460; GFX10-NEXT:    s_lshl_b32 s1, s1, 0x80008
461; GFX10-NEXT:    s_lshl_b32 s3, s3, 8
462; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
463; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
464; GFX10-NEXT:    v_mov_b32_e32 v1, 0xff
465; GFX10-NEXT:    v_pk_add_i16 v0, s0, s1 clamp
466; GFX10-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
467; GFX10-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
468; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
469; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
470; GFX10-NEXT:    ; return to shader part epilog
471;
472; GFX11-LABEL: s_saddsat_v2i8:
473; GFX11:       ; %bb.0:
474; GFX11-NEXT:    s_lshr_b32 s2, s0, 8
475; GFX11-NEXT:    s_lshr_b32 s3, s1, 8
476; GFX11-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
477; GFX11-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
478; GFX11-NEXT:    s_lshr_b32 s2, s0, 16
479; GFX11-NEXT:    s_lshr_b32 s3, s1, 16
480; GFX11-NEXT:    s_lshl_b32 s0, s0, 0x80008
481; GFX11-NEXT:    s_lshl_b32 s2, s2, 8
482; GFX11-NEXT:    s_lshl_b32 s1, s1, 0x80008
483; GFX11-NEXT:    s_lshl_b32 s3, s3, 8
484; GFX11-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
485; GFX11-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
486; GFX11-NEXT:    v_pk_add_i16 v0, s0, s1 clamp
487; GFX11-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
488; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
489; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
490; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
491; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
492; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
493; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
494; GFX11-NEXT:    ; return to shader part epilog
; IR under test: reinterpret both i16 arguments as <2 x i8>, apply the
; saturating add to the vectors, and reinterpret the result as i16.
495  %lhs = bitcast i16 %lhs.arg to <2 x i8>
496  %rhs = bitcast i16 %rhs.arg to <2 x i8>
497  %result = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> %lhs, <2 x i8> %rhs)
498  %cast.result = bitcast <2 x i8> %result to i16
499  ret i16 %cast.result
500}
501
; Signed saturating add of <4 x i8> using the default calling convention.
; Each vector is passed and returned as an i32 and bitcast to or from
; <4 x i8> in the body. The tahiti and fiji sequences clamp each of the four
; bytes separately and merge them with and/shift/or; the gfx900, gfx1010 and
; gfx1100 sequences split the bytes across two packed 16-bit registers and
; use two v_pk_add_i16 clamp instructions before repacking.
502define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
503; GFX6-LABEL: v_saddsat_v4i8:
504; GFX6:       ; %bb.0:
505; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
506; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
507; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
508; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
509; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
510; GFX6-NEXT:    v_min_i32_e32 v10, 0, v0
511; GFX6-NEXT:    v_bfrev_b32_e32 v11, 1
512; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
513; GFX6-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
514; GFX6-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
515; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
516; GFX6-NEXT:    v_max_i32_e32 v8, 0, v0
517; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v11, v10
518; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, 0x7fffffff, v8
519; GFX6-NEXT:    v_max_i32_e32 v1, v10, v1
520; GFX6-NEXT:    v_min_i32_e32 v1, v1, v8
521; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
522; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
523; GFX6-NEXT:    v_min_i32_e32 v8, 0, v1
524; GFX6-NEXT:    v_bfrev_b32_e32 v9, -2
525; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 24, v5
526; GFX6-NEXT:    v_max_i32_e32 v5, 0, v1
527; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, v11, v8
528; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v9, v5
529; GFX6-NEXT:    v_max_i32_e32 v2, v8, v2
530; GFX6-NEXT:    v_min_i32_e32 v2, v2, v5
531; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
532; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
533; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 24, v6
534; GFX6-NEXT:    v_min_i32_e32 v6, 0, v2
535; GFX6-NEXT:    v_max_i32_e32 v5, 0, v2
536; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v11, v6
537; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v9, v5
538; GFX6-NEXT:    v_max_i32_e32 v3, v6, v3
539; GFX6-NEXT:    v_min_i32_e32 v3, v3, v5
540; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
541; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 24, v4
542; GFX6-NEXT:    v_min_i32_e32 v6, 0, v3
543; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 24, v1
544; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 24, v7
545; GFX6-NEXT:    v_max_i32_e32 v5, 0, v3
546; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v11, v6
547; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
548; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v9, v5
549; GFX6-NEXT:    v_max_i32_e32 v4, v6, v4
550; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v1
551; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 24, v2
552; GFX6-NEXT:    v_min_i32_e32 v4, v4, v5
553; GFX6-NEXT:    v_and_b32_e32 v0, 0xff, v0
554; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
555; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
556; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
557; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v2
558; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 24, v3
559; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
560; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
561; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v3
562; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
563; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
564; GFX6-NEXT:    s_setpc_b64 s[30:31]
565;
566; GFX8-LABEL: v_saddsat_v4i8:
567; GFX8:       ; %bb.0:
568; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
569; GFX8-NEXT:    v_mov_b32_e32 v2, 8
570; GFX8-NEXT:    v_lshrrev_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
571; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
572; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
573; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
574; GFX8-NEXT:    v_min_i16_e32 v9, 0, v0
575; GFX8-NEXT:    v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
576; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
577; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
578; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
579; GFX8-NEXT:    v_max_i16_e32 v8, 0, v0
580; GFX8-NEXT:    v_sub_u16_e32 v9, 0x8000, v9
581; GFX8-NEXT:    v_sub_u16_e32 v8, 0x7fff, v8
582; GFX8-NEXT:    v_max_i16_e32 v1, v9, v1
583; GFX8-NEXT:    v_min_i16_e32 v1, v1, v8
584; GFX8-NEXT:    v_min_i16_e32 v8, 0, v3
585; GFX8-NEXT:    v_add_u16_e32 v0, v0, v1
586; GFX8-NEXT:    v_max_i16_e32 v1, 0, v3
587; GFX8-NEXT:    v_sub_u16_e32 v8, 0x8000, v8
588; GFX8-NEXT:    v_sub_u16_e32 v1, 0x7fff, v1
589; GFX8-NEXT:    v_max_i16_e32 v2, v8, v2
590; GFX8-NEXT:    v_min_i16_e32 v1, v2, v1
591; GFX8-NEXT:    v_lshlrev_b16_e32 v2, 8, v4
592; GFX8-NEXT:    v_add_u16_e32 v1, v3, v1
593; GFX8-NEXT:    v_lshlrev_b16_e32 v3, 8, v6
594; GFX8-NEXT:    v_min_i16_e32 v6, 0, v2
595; GFX8-NEXT:    v_max_i16_e32 v4, 0, v2
596; GFX8-NEXT:    v_sub_u16_e32 v6, 0x8000, v6
597; GFX8-NEXT:    v_sub_u16_e32 v4, 0x7fff, v4
598; GFX8-NEXT:    v_max_i16_e32 v3, v6, v3
599; GFX8-NEXT:    v_min_i16_e32 v3, v3, v4
600; GFX8-NEXT:    v_add_u16_e32 v2, v2, v3
601; GFX8-NEXT:    v_lshlrev_b16_e32 v3, 8, v5
602; GFX8-NEXT:    v_min_i16_e32 v6, 0, v3
603; GFX8-NEXT:    v_lshlrev_b16_e32 v4, 8, v7
604; GFX8-NEXT:    v_max_i16_e32 v5, 0, v3
605; GFX8-NEXT:    v_sub_u16_e32 v6, 0x8000, v6
606; GFX8-NEXT:    v_sub_u16_e32 v5, 0x7fff, v5
607; GFX8-NEXT:    v_max_i16_e32 v4, v6, v4
608; GFX8-NEXT:    v_min_i16_e32 v4, v4, v5
609; GFX8-NEXT:    v_add_u16_e32 v3, v3, v4
610; GFX8-NEXT:    v_mov_b32_e32 v4, 0xff
611; GFX8-NEXT:    v_and_b32_sdwa v1, sext(v1), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
612; GFX8-NEXT:    v_and_b32_sdwa v0, sext(v0), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
613; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
614; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
615; GFX8-NEXT:    v_and_b32_sdwa v1, sext(v2), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
616; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
617; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
618; GFX8-NEXT:    v_and_b32_sdwa v1, sext(v3), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
619; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
620; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
621; GFX8-NEXT:    s_setpc_b64 s[30:31]
622;
623; GFX9-LABEL: v_saddsat_v4i8:
624; GFX9:       ; %bb.0:
625; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
626; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
627; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
628; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
629; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff, v0
630; GFX9-NEXT:    v_alignbit_b32 v0, v3, v0, 16
631; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff, v1
632; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
633; GFX9-NEXT:    v_lshl_or_b32 v2, v2, 16, v6
634; GFX9-NEXT:    v_lshl_or_b32 v3, v4, 16, v3
635; GFX9-NEXT:    v_alignbit_b32 v1, v5, v1, 16
636; GFX9-NEXT:    v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
637; GFX9-NEXT:    v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
638; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
639; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
640; GFX9-NEXT:    v_pk_add_i16 v2, v2, v3 clamp
641; GFX9-NEXT:    v_pk_add_i16 v0, v0, v1 clamp
642; GFX9-NEXT:    v_pk_ashrrev_i16 v1, 8, v2 op_sel_hi:[0,1]
643; GFX9-NEXT:    v_mov_b32_e32 v3, 8
644; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
645; GFX9-NEXT:    v_mov_b32_e32 v2, 0xff
646; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
647; GFX9-NEXT:    v_and_or_b32 v1, v1, v2, v3
648; GFX9-NEXT:    v_and_b32_e32 v2, 0xff, v0
649; GFX9-NEXT:    v_mov_b32_e32 v3, 24
650; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
651; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
652; GFX9-NEXT:    v_or3_b32 v0, v1, v2, v0
653; GFX9-NEXT:    s_setpc_b64 s[30:31]
654;
655; GFX10-LABEL: v_saddsat_v4i8:
656; GFX10:       ; %bb.0:
657; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
658; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
659; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
660; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff, v0
661; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
662; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff, v1
663; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
664; GFX10-NEXT:    v_alignbit_b32 v0, v3, v0, 16
665; GFX10-NEXT:    v_lshl_or_b32 v2, v2, 16, v4
666; GFX10-NEXT:    v_mov_b32_e32 v4, 24
667; GFX10-NEXT:    v_lshl_or_b32 v3, v5, 16, v6
668; GFX10-NEXT:    v_alignbit_b32 v1, v7, v1, 16
669; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
670; GFX10-NEXT:    v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
671; GFX10-NEXT:    v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
672; GFX10-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
673; GFX10-NEXT:    v_pk_add_i16 v2, v2, v3 clamp
674; GFX10-NEXT:    v_pk_add_i16 v0, v0, v1 clamp
675; GFX10-NEXT:    v_mov_b32_e32 v1, 8
676; GFX10-NEXT:    v_pk_ashrrev_i16 v2, 8, v2 op_sel_hi:[0,1]
677; GFX10-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
678; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
679; GFX10-NEXT:    v_and_b32_e32 v3, 0xff, v0
680; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
681; GFX10-NEXT:    v_and_or_b32 v1, 0xff, v2, v1
682; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
683; GFX10-NEXT:    v_or3_b32 v0, v1, v2, v0
684; GFX10-NEXT:    s_setpc_b64 s[30:31]
685;
686; GFX11-LABEL: v_saddsat_v4i8:
687; GFX11:       ; %bb.0:
688; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
689; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
690; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
691; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v0
692; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v1
693; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
694; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
695; GFX11-NEXT:    v_lshl_or_b32 v2, v2, 16, v4
696; GFX11-NEXT:    v_lshl_or_b32 v3, v3, 16, v5
697; GFX11-NEXT:    v_alignbit_b32 v0, v6, v0, 16
698; GFX11-NEXT:    v_alignbit_b32 v1, v7, v1, 16
699; GFX11-NEXT:    v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
700; GFX11-NEXT:    v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
701; GFX11-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
702; GFX11-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
703; GFX11-NEXT:    v_pk_add_i16 v2, v2, v3 clamp
704; GFX11-NEXT:    v_pk_add_i16 v0, v0, v1 clamp
705; GFX11-NEXT:    v_pk_ashrrev_i16 v1, 8, v2 op_sel_hi:[0,1]
706; GFX11-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
707; GFX11-NEXT:    v_bfe_u32 v2, v1, 16, 8
708; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v0
709; GFX11-NEXT:    v_bfe_u32 v0, v0, 16, 8
710; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
711; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
712; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
713; GFX11-NEXT:    v_and_or_b32 v1, 0xff, v1, v2
714; GFX11-NEXT:    v_or3_b32 v0, v1, v3, v0
715; GFX11-NEXT:    s_setpc_b64 s[30:31]
; IR under test: reinterpret both i32 arguments as <4 x i8>, apply the
; saturating add to the vectors, and reinterpret the result as i32.
716  %lhs = bitcast i32 %lhs.arg to <4 x i8>
717  %rhs = bitcast i32 %rhs.arg to <4 x i8>
718  %result = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> %lhs, <4 x i8> %rhs)
719  %cast.result = bitcast <4 x i8> %result to i32
720  ret i32 %cast.result
721}
722
; Scalar form of the <4 x i8> signed saturating add. The shader entry point
; (amdgpu_ps) takes both operands as inreg i32 values, bitcasts them to
; <4 x i8>, calls llvm.sadd.sat.v4i8 and bitcasts the result back to i32.
; What the autogenerated checks below pin down for each run line
;   - GFX6 checks expand every byte on its own in 32 bits. The byte is shifted
;     left by 24, the addend is clamped with s_max_i32/s_min_i32 against bounds
;     derived from 0x80000000 and 0x7fffffff, added, shifted back with
;     s_ashr_i32 and masked with 0xff before the bytes are or-ed together.
;   - GFX8 checks follow the same scheme with bytes shifted left by 8, using
;     s_sext_i32_i16 and the bounds 0xffff8000 and 0x7fff.
;   - GFX9, GFX10 and GFX11 checks pack byte pairs with s_pack_*_b32_b16, use
;     v_pk_add_i16 with the clamp modifier, shift back with v_pk_ashrrev_i16
;     and move the repacked result to s0 with v_readfirstlane_b32.
; The check lines are generated output (see the note on the first line of the
; file); regenerate them with the script instead of editing them by hand.
723define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
724; GFX6-LABEL: s_saddsat_v4i8:
725; GFX6:       ; %bb.0:
726; GFX6-NEXT:    s_lshr_b32 s2, s0, 8
727; GFX6-NEXT:    s_lshr_b32 s3, s0, 16
728; GFX6-NEXT:    s_lshr_b32 s4, s0, 24
729; GFX6-NEXT:    s_lshl_b32 s0, s0, 24
730; GFX6-NEXT:    s_min_i32 s9, s0, 0
731; GFX6-NEXT:    s_lshr_b32 s5, s1, 8
732; GFX6-NEXT:    s_lshr_b32 s6, s1, 16
733; GFX6-NEXT:    s_lshr_b32 s7, s1, 24
734; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
735; GFX6-NEXT:    s_max_i32 s8, s0, 0
736; GFX6-NEXT:    s_sub_i32 s9, 0x80000000, s9
737; GFX6-NEXT:    s_sub_i32 s8, 0x7fffffff, s8
738; GFX6-NEXT:    s_max_i32 s1, s9, s1
739; GFX6-NEXT:    s_min_i32 s1, s1, s8
740; GFX6-NEXT:    s_add_i32 s0, s0, s1
741; GFX6-NEXT:    s_lshl_b32 s1, s2, 24
742; GFX6-NEXT:    s_min_i32 s8, s1, 0
743; GFX6-NEXT:    s_lshl_b32 s2, s5, 24
744; GFX6-NEXT:    s_max_i32 s5, s1, 0
745; GFX6-NEXT:    s_sub_i32 s8, 0x80000000, s8
746; GFX6-NEXT:    s_sub_i32 s5, 0x7fffffff, s5
747; GFX6-NEXT:    s_max_i32 s2, s8, s2
748; GFX6-NEXT:    s_min_i32 s2, s2, s5
749; GFX6-NEXT:    s_add_i32 s1, s1, s2
750; GFX6-NEXT:    s_lshl_b32 s2, s3, 24
751; GFX6-NEXT:    s_lshl_b32 s3, s6, 24
752; GFX6-NEXT:    s_min_i32 s6, s2, 0
753; GFX6-NEXT:    s_max_i32 s5, s2, 0
754; GFX6-NEXT:    s_sub_i32 s6, 0x80000000, s6
755; GFX6-NEXT:    s_sub_i32 s5, 0x7fffffff, s5
756; GFX6-NEXT:    s_max_i32 s3, s6, s3
757; GFX6-NEXT:    s_min_i32 s3, s3, s5
758; GFX6-NEXT:    s_add_i32 s2, s2, s3
759; GFX6-NEXT:    s_lshl_b32 s3, s4, 24
760; GFX6-NEXT:    s_min_i32 s6, s3, 0
761; GFX6-NEXT:    s_ashr_i32 s1, s1, 24
762; GFX6-NEXT:    s_lshl_b32 s4, s7, 24
763; GFX6-NEXT:    s_max_i32 s5, s3, 0
764; GFX6-NEXT:    s_sub_i32 s6, 0x80000000, s6
765; GFX6-NEXT:    s_ashr_i32 s0, s0, 24
766; GFX6-NEXT:    s_sub_i32 s5, 0x7fffffff, s5
767; GFX6-NEXT:    s_max_i32 s4, s6, s4
768; GFX6-NEXT:    s_and_b32 s1, s1, 0xff
769; GFX6-NEXT:    s_ashr_i32 s2, s2, 24
770; GFX6-NEXT:    s_min_i32 s4, s4, s5
771; GFX6-NEXT:    s_and_b32 s0, s0, 0xff
772; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
773; GFX6-NEXT:    s_add_i32 s3, s3, s4
774; GFX6-NEXT:    s_or_b32 s0, s0, s1
775; GFX6-NEXT:    s_and_b32 s1, s2, 0xff
776; GFX6-NEXT:    s_ashr_i32 s3, s3, 24
777; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
778; GFX6-NEXT:    s_or_b32 s0, s0, s1
779; GFX6-NEXT:    s_and_b32 s1, s3, 0xff
780; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
781; GFX6-NEXT:    s_or_b32 s0, s0, s1
782; GFX6-NEXT:    ; return to shader part epilog
783;
784; GFX8-LABEL: s_saddsat_v4i8:
785; GFX8:       ; %bb.0:
786; GFX8-NEXT:    s_lshr_b32 s2, s0, 8
787; GFX8-NEXT:    s_lshr_b32 s3, s0, 16
788; GFX8-NEXT:    s_lshr_b32 s4, s0, 24
789; GFX8-NEXT:    s_lshl_b32 s0, s0, 8
790; GFX8-NEXT:    s_sext_i32_i16 s8, s0
791; GFX8-NEXT:    s_sext_i32_i16 s9, 0
792; GFX8-NEXT:    s_max_i32 s10, s8, s9
793; GFX8-NEXT:    s_min_i32 s8, s8, s9
794; GFX8-NEXT:    s_lshr_b32 s5, s1, 8
795; GFX8-NEXT:    s_lshr_b32 s6, s1, 16
796; GFX8-NEXT:    s_lshr_b32 s7, s1, 24
797; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
798; GFX8-NEXT:    s_sub_i32 s8, 0xffff8000, s8
799; GFX8-NEXT:    s_sext_i32_i16 s8, s8
800; GFX8-NEXT:    s_sext_i32_i16 s1, s1
801; GFX8-NEXT:    s_sub_i32 s10, 0x7fff, s10
802; GFX8-NEXT:    s_max_i32 s1, s8, s1
803; GFX8-NEXT:    s_sext_i32_i16 s1, s1
804; GFX8-NEXT:    s_sext_i32_i16 s8, s10
805; GFX8-NEXT:    s_min_i32 s1, s1, s8
806; GFX8-NEXT:    s_add_i32 s0, s0, s1
807; GFX8-NEXT:    s_lshl_b32 s1, s2, 8
808; GFX8-NEXT:    s_lshl_b32 s2, s5, 8
809; GFX8-NEXT:    s_sext_i32_i16 s5, s1
810; GFX8-NEXT:    s_max_i32 s8, s5, s9
811; GFX8-NEXT:    s_min_i32 s5, s5, s9
812; GFX8-NEXT:    s_sub_i32 s5, 0xffff8000, s5
813; GFX8-NEXT:    s_sext_i32_i16 s5, s5
814; GFX8-NEXT:    s_sext_i32_i16 s2, s2
815; GFX8-NEXT:    s_sub_i32 s8, 0x7fff, s8
816; GFX8-NEXT:    s_max_i32 s2, s5, s2
817; GFX8-NEXT:    s_sext_i32_i16 s2, s2
818; GFX8-NEXT:    s_sext_i32_i16 s5, s8
819; GFX8-NEXT:    s_min_i32 s2, s2, s5
820; GFX8-NEXT:    s_add_i32 s1, s1, s2
821; GFX8-NEXT:    s_lshl_b32 s2, s3, 8
822; GFX8-NEXT:    s_sext_i32_i16 s5, s2
823; GFX8-NEXT:    s_lshl_b32 s3, s6, 8
824; GFX8-NEXT:    s_max_i32 s6, s5, s9
825; GFX8-NEXT:    s_min_i32 s5, s5, s9
826; GFX8-NEXT:    s_sub_i32 s5, 0xffff8000, s5
827; GFX8-NEXT:    s_sext_i32_i16 s5, s5
828; GFX8-NEXT:    s_sext_i32_i16 s3, s3
829; GFX8-NEXT:    s_sub_i32 s6, 0x7fff, s6
830; GFX8-NEXT:    s_max_i32 s3, s5, s3
831; GFX8-NEXT:    s_sext_i32_i16 s3, s3
832; GFX8-NEXT:    s_sext_i32_i16 s5, s6
833; GFX8-NEXT:    s_min_i32 s3, s3, s5
834; GFX8-NEXT:    s_add_i32 s2, s2, s3
835; GFX8-NEXT:    s_lshl_b32 s3, s4, 8
836; GFX8-NEXT:    s_sext_i32_i16 s5, s3
837; GFX8-NEXT:    s_max_i32 s6, s5, s9
838; GFX8-NEXT:    s_min_i32 s5, s5, s9
839; GFX8-NEXT:    s_lshl_b32 s4, s7, 8
840; GFX8-NEXT:    s_sub_i32 s5, 0xffff8000, s5
841; GFX8-NEXT:    s_sext_i32_i16 s5, s5
842; GFX8-NEXT:    s_sext_i32_i16 s4, s4
843; GFX8-NEXT:    s_sext_i32_i16 s1, s1
844; GFX8-NEXT:    s_sub_i32 s6, 0x7fff, s6
845; GFX8-NEXT:    s_max_i32 s4, s5, s4
846; GFX8-NEXT:    s_sext_i32_i16 s0, s0
847; GFX8-NEXT:    s_ashr_i32 s1, s1, 8
848; GFX8-NEXT:    s_sext_i32_i16 s4, s4
849; GFX8-NEXT:    s_sext_i32_i16 s5, s6
850; GFX8-NEXT:    s_ashr_i32 s0, s0, 8
851; GFX8-NEXT:    s_sext_i32_i16 s2, s2
852; GFX8-NEXT:    s_min_i32 s4, s4, s5
853; GFX8-NEXT:    s_and_b32 s1, s1, 0xff
854; GFX8-NEXT:    s_ashr_i32 s2, s2, 8
855; GFX8-NEXT:    s_add_i32 s3, s3, s4
856; GFX8-NEXT:    s_and_b32 s0, s0, 0xff
857; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
858; GFX8-NEXT:    s_sext_i32_i16 s3, s3
859; GFX8-NEXT:    s_or_b32 s0, s0, s1
860; GFX8-NEXT:    s_and_b32 s1, s2, 0xff
861; GFX8-NEXT:    s_ashr_i32 s3, s3, 8
862; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
863; GFX8-NEXT:    s_or_b32 s0, s0, s1
864; GFX8-NEXT:    s_and_b32 s1, s3, 0xff
865; GFX8-NEXT:    s_lshl_b32 s1, s1, 24
866; GFX8-NEXT:    s_or_b32 s0, s0, s1
867; GFX8-NEXT:    ; return to shader part epilog
868;
869; GFX9-LABEL: s_saddsat_v4i8:
870; GFX9:       ; %bb.0:
871; GFX9-NEXT:    s_lshr_b32 s2, s0, 8
872; GFX9-NEXT:    s_lshr_b32 s3, s0, 16
873; GFX9-NEXT:    s_lshr_b32 s4, s0, 24
874; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
875; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s3, s4
876; GFX9-NEXT:    s_lshr_b32 s4, s0, 16
877; GFX9-NEXT:    s_lshl_b32 s0, s0, 0x80008
878; GFX9-NEXT:    s_lshl_b32 s4, s4, 8
879; GFX9-NEXT:    s_lshr_b32 s5, s1, 8
880; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s4
881; GFX9-NEXT:    s_lshr_b32 s4, s2, 16
882; GFX9-NEXT:    s_lshr_b32 s6, s1, 16
883; GFX9-NEXT:    s_lshr_b32 s7, s1, 24
884; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s5
885; GFX9-NEXT:    s_lshl_b32 s2, s2, 0x80008
886; GFX9-NEXT:    s_lshl_b32 s4, s4, 8
887; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s4
888; GFX9-NEXT:    s_lshr_b32 s4, s1, 16
889; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s6, s7
890; GFX9-NEXT:    s_lshl_b32 s1, s1, 0x80008
891; GFX9-NEXT:    s_lshl_b32 s4, s4, 8
892; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s4
893; GFX9-NEXT:    s_lshr_b32 s4, s3, 16
894; GFX9-NEXT:    s_lshl_b32 s3, s3, 0x80008
895; GFX9-NEXT:    s_lshl_b32 s4, s4, 8
896; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s3, s4
897; GFX9-NEXT:    v_mov_b32_e32 v0, s1
898; GFX9-NEXT:    v_pk_add_i16 v0, s0, v0 clamp
899; GFX9-NEXT:    v_mov_b32_e32 v1, s3
900; GFX9-NEXT:    v_pk_add_i16 v1, s2, v1 clamp
901; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
902; GFX9-NEXT:    v_mov_b32_e32 v3, 8
903; GFX9-NEXT:    v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1]
904; GFX9-NEXT:    v_mov_b32_e32 v2, 0xff
905; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
906; GFX9-NEXT:    v_and_or_b32 v0, v0, v2, v3
907; GFX9-NEXT:    v_and_b32_e32 v2, 0xff, v1
908; GFX9-NEXT:    v_mov_b32_e32 v3, 24
909; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
910; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
911; GFX9-NEXT:    v_or3_b32 v0, v0, v2, v1
912; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
913; GFX9-NEXT:    ; return to shader part epilog
914;
915; GFX10-LABEL: s_saddsat_v4i8:
916; GFX10:       ; %bb.0:
917; GFX10-NEXT:    s_lshr_b32 s2, s0, 8
918; GFX10-NEXT:    s_lshr_b32 s3, s0, 16
919; GFX10-NEXT:    s_lshr_b32 s4, s0, 24
920; GFX10-NEXT:    s_lshr_b32 s5, s1, 8
921; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
922; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s3, s4
923; GFX10-NEXT:    s_lshr_b32 s6, s1, 16
924; GFX10-NEXT:    s_lshr_b32 s7, s1, 24
925; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s5
926; GFX10-NEXT:    s_lshr_b32 s4, s0, 16
927; GFX10-NEXT:    s_lshr_b32 s5, s2, 16
928; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s6, s7
929; GFX10-NEXT:    s_lshl_b32 s0, s0, 0x80008
930; GFX10-NEXT:    s_lshl_b32 s4, s4, 8
931; GFX10-NEXT:    s_lshl_b32 s2, s2, 0x80008
932; GFX10-NEXT:    s_lshl_b32 s5, s5, 8
933; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s4
934; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s2, s5
935; GFX10-NEXT:    s_lshr_b32 s4, s1, 16
936; GFX10-NEXT:    s_lshr_b32 s5, s3, 16
937; GFX10-NEXT:    s_lshl_b32 s1, s1, 0x80008
938; GFX10-NEXT:    s_lshl_b32 s4, s4, 8
939; GFX10-NEXT:    s_lshl_b32 s3, s3, 0x80008
940; GFX10-NEXT:    s_lshl_b32 s5, s5, 8
941; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s4
942; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s3, s5
943; GFX10-NEXT:    v_pk_add_i16 v0, s0, s1 clamp
944; GFX10-NEXT:    v_pk_add_i16 v1, s2, s3 clamp
945; GFX10-NEXT:    v_mov_b32_e32 v2, 8
946; GFX10-NEXT:    v_mov_b32_e32 v4, 24
947; GFX10-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
948; GFX10-NEXT:    v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1]
949; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
950; GFX10-NEXT:    v_and_b32_e32 v3, 0xff, v1
951; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
952; GFX10-NEXT:    v_and_or_b32 v0, 0xff, v0, v2
953; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
954; GFX10-NEXT:    v_or3_b32 v0, v0, v2, v1
955; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
956; GFX10-NEXT:    ; return to shader part epilog
957;
958; GFX11-LABEL: s_saddsat_v4i8:
959; GFX11:       ; %bb.0:
960; GFX11-NEXT:    s_lshr_b32 s2, s0, 8
961; GFX11-NEXT:    s_lshr_b32 s3, s0, 24
962; GFX11-NEXT:    s_lshr_b32 s4, s1, 8
963; GFX11-NEXT:    s_lshr_b32 s5, s1, 24
964; GFX11-NEXT:    s_pack_ll_b32_b16 s2, s0, s2
965; GFX11-NEXT:    s_pack_hl_b32_b16 s0, s0, s3
966; GFX11-NEXT:    s_pack_ll_b32_b16 s3, s1, s4
967; GFX11-NEXT:    s_lshr_b32 s4, s2, 16
968; GFX11-NEXT:    s_pack_hl_b32_b16 s1, s1, s5
969; GFX11-NEXT:    s_lshr_b32 s5, s3, 16
970; GFX11-NEXT:    s_lshl_b32 s2, s2, 0x80008
971; GFX11-NEXT:    s_lshl_b32 s4, s4, 8
972; GFX11-NEXT:    s_lshl_b32 s3, s3, 0x80008
973; GFX11-NEXT:    s_lshl_b32 s5, s5, 8
974; GFX11-NEXT:    s_pack_ll_b32_b16 s2, s2, s4
975; GFX11-NEXT:    s_pack_ll_b32_b16 s3, s3, s5
976; GFX11-NEXT:    s_lshr_b32 s4, s0, 16
977; GFX11-NEXT:    s_lshr_b32 s5, s1, 16
978; GFX11-NEXT:    v_pk_add_i16 v0, s2, s3 clamp
979; GFX11-NEXT:    s_lshl_b32 s0, s0, 0x80008
980; GFX11-NEXT:    s_lshl_b32 s4, s4, 8
981; GFX11-NEXT:    s_lshl_b32 s1, s1, 0x80008
982; GFX11-NEXT:    s_lshl_b32 s2, s5, 8
983; GFX11-NEXT:    s_pack_ll_b32_b16 s0, s0, s4
984; GFX11-NEXT:    s_pack_ll_b32_b16 s1, s1, s2
985; GFX11-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
986; GFX11-NEXT:    v_pk_add_i16 v1, s0, s1 clamp
987; GFX11-NEXT:    v_bfe_u32 v2, v0, 16, 8
988; GFX11-NEXT:    v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1]
989; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
990; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v1
991; GFX11-NEXT:    v_bfe_u32 v1, v1, 16, 8
992; GFX11-NEXT:    v_and_or_b32 v0, 0xff, v0, v2
993; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
994; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
995; GFX11-NEXT:    v_or3_b32 v0, v0, v2, v1
996; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
997; GFX11-NEXT:    ; return to shader part epilog
998  %lhs = bitcast i32 %lhs.arg to <4 x i8>
999  %rhs = bitcast i32 %rhs.arg to <4 x i8>
1000  %result = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> %lhs, <4 x i8> %rhs)
1001  %cast.result = bitcast <4 x i8> %result to i32
1002  ret i32 %cast.result
1003}
1004
; Signed saturating add on i24 with the default calling convention. The body
; is a single llvm.sadd.sat.i24 call whose result is returned; in the checks
; the operands are v0/v1 and the result is left in v0.
; What the autogenerated checks below pin down for each run line
;   - GFX6 checks shift both operands left by 8, clamp the addend with
;     v_max_i32/v_min_i32 against bounds derived from 0x80000000 and
;     0x7fffffff, add, and shift the sum back with v_ashrrev_i32.
;   - GFX8 checks add first, then use 24-bit sign extracts (v_bfe_i32 with
;     offset 0 and width 24) and two compares to pick, via v_cndmask_b32,
;     between the plain sum and a value built from the sum's sign and
;     0xff800000.
;   - GFX9 checks shift left by 8, use v_add_i32 with clamp, and shift back.
;   - GFX10PLUS checks do the same with v_add_nc_i32 and clamp.
; The check lines are generated output; regenerate them with the update
; script instead of editing them by hand.
1005define i24 @v_saddsat_i24(i24 %lhs, i24 %rhs) {
1006; GFX6-LABEL: v_saddsat_i24:
1007; GFX6:       ; %bb.0:
1008; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1009; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
1010; GFX6-NEXT:    v_min_i32_e32 v3, 0, v0
1011; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
1012; GFX6-NEXT:    v_max_i32_e32 v2, 0, v0
1013; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 0x80000000, v3
1014; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 0x7fffffff, v2
1015; GFX6-NEXT:    v_max_i32_e32 v1, v3, v1
1016; GFX6-NEXT:    v_min_i32_e32 v1, v1, v2
1017; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
1018; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 8, v0
1019; GFX6-NEXT:    s_setpc_b64 s[30:31]
1020;
1021; GFX8-LABEL: v_saddsat_i24:
1022; GFX8:       ; %bb.0:
1023; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1024; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v0, v1
1025; GFX8-NEXT:    v_bfe_i32 v3, v2, 0, 24
1026; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 24
1027; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v3, v0
1028; GFX8-NEXT:    v_bfe_i32 v0, v1, 0, 24
1029; GFX8-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v0
1030; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 23, v3
1031; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0xff800000, v0
1032; GFX8-NEXT:    s_xor_b64 vcc, s[6:7], s[4:5]
1033; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
1034; GFX8-NEXT:    s_setpc_b64 s[30:31]
1035;
1036; GFX9-LABEL: v_saddsat_i24:
1037; GFX9:       ; %bb.0:
1038; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1039; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
1040; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
1041; GFX9-NEXT:    v_add_i32 v0, v0, v1 clamp
1042; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 8, v0
1043; GFX9-NEXT:    s_setpc_b64 s[30:31]
1044;
1045; GFX10PLUS-LABEL: v_saddsat_i24:
1046; GFX10PLUS:       ; %bb.0:
1047; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1048; GFX10PLUS-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
1049; GFX10PLUS-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
1050; GFX10PLUS-NEXT:    v_add_nc_i32 v0, v0, v1 clamp
1051; GFX10PLUS-NEXT:    v_ashrrev_i32_e32 v0, 8, v0
1052; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
1053  %result = call i24 @llvm.sadd.sat.i24(i24 %lhs, i24 %rhs)
1054  ret i24 %result
1055}
1056
; Scalar form of the i24 signed saturating add. The shader entry point
; (amdgpu_ps) takes both operands as inreg values and returns the result of
; one llvm.sadd.sat.i24 call; in the checks the operands are s0/s1 and the
; result ends up in s0.
; What the autogenerated checks below pin down for each run line
;   - GFX6 checks shift both operands left by 8, clamp the addend with
;     s_max_i32/s_min_i32 against bounds derived from 0x80000000 and
;     0x7fffffff, add, and shift back with s_ashr_i32.
;   - GFX8 checks add first, then use s_bfe_i32 with 0x180000, compares and
;     s_cselect_b32 to pick between the plain sum and a value built from the
;     sum's sign and 0xff800000.
;   - GFX9 checks copy one shifted operand to v0, use v_add_i32 with clamp,
;     shift back and read the result into s0 with v_readfirstlane_b32.
;   - GFX10PLUS checks feed both SGPRs straight into v_add_nc_i32 with clamp.
; The check lines are generated output; regenerate them with the update
; script instead of editing them by hand.
1057define amdgpu_ps i24 @s_saddsat_i24(i24 inreg %lhs, i24 inreg %rhs) {
1058; GFX6-LABEL: s_saddsat_i24:
1059; GFX6:       ; %bb.0:
1060; GFX6-NEXT:    s_lshl_b32 s0, s0, 8
1061; GFX6-NEXT:    s_min_i32 s3, s0, 0
1062; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
1063; GFX6-NEXT:    s_max_i32 s2, s0, 0
1064; GFX6-NEXT:    s_sub_i32 s3, 0x80000000, s3
1065; GFX6-NEXT:    s_sub_i32 s2, 0x7fffffff, s2
1066; GFX6-NEXT:    s_max_i32 s1, s3, s1
1067; GFX6-NEXT:    s_min_i32 s1, s1, s2
1068; GFX6-NEXT:    s_add_i32 s0, s0, s1
1069; GFX6-NEXT:    s_ashr_i32 s0, s0, 8
1070; GFX6-NEXT:    ; return to shader part epilog
1071;
1072; GFX8-LABEL: s_saddsat_i24:
1073; GFX8:       ; %bb.0:
1074; GFX8-NEXT:    s_add_i32 s2, s0, s1
1075; GFX8-NEXT:    s_bfe_i32 s3, s2, 0x180000
1076; GFX8-NEXT:    s_bfe_i32 s0, s0, 0x180000
1077; GFX8-NEXT:    s_cmp_lt_i32 s3, s0
1078; GFX8-NEXT:    s_cselect_b32 s0, 1, 0
1079; GFX8-NEXT:    s_bfe_i32 s1, s1, 0x180000
1080; GFX8-NEXT:    s_cmp_lt_i32 s1, 0
1081; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
1082; GFX8-NEXT:    s_xor_b32 s0, s1, s0
1083; GFX8-NEXT:    s_ashr_i32 s1, s3, 23
1084; GFX8-NEXT:    s_add_i32 s1, s1, 0xff800000
1085; GFX8-NEXT:    s_and_b32 s0, s0, 1
1086; GFX8-NEXT:    s_cmp_lg_u32 s0, 0
1087; GFX8-NEXT:    s_cselect_b32 s0, s1, s2
1088; GFX8-NEXT:    ; return to shader part epilog
1089;
1090; GFX9-LABEL: s_saddsat_i24:
1091; GFX9:       ; %bb.0:
1092; GFX9-NEXT:    s_lshl_b32 s1, s1, 8
1093; GFX9-NEXT:    s_lshl_b32 s0, s0, 8
1094; GFX9-NEXT:    v_mov_b32_e32 v0, s1
1095; GFX9-NEXT:    v_add_i32 v0, s0, v0 clamp
1096; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 8, v0
1097; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
1098; GFX9-NEXT:    ; return to shader part epilog
1099;
1100; GFX10PLUS-LABEL: s_saddsat_i24:
1101; GFX10PLUS:       ; %bb.0:
1102; GFX10PLUS-NEXT:    s_lshl_b32 s0, s0, 8
1103; GFX10PLUS-NEXT:    s_lshl_b32 s1, s1, 8
1104; GFX10PLUS-NEXT:    v_add_nc_i32 v0, s0, s1 clamp
1105; GFX10PLUS-NEXT:    v_ashrrev_i32_e32 v0, 8, v0
1106; GFX10PLUS-NEXT:    v_readfirstlane_b32 s0, v0
1107; GFX10PLUS-NEXT:    ; return to shader part epilog
1108  %result = call i24 @llvm.sadd.sat.i24(i24 %lhs, i24 %rhs)
1109  ret i24 %result
1110}
1111
; Signed saturating add on i32 with the default calling convention. The body
; is a single llvm.sadd.sat.i32 call; in the checks the operands are v0/v1 and
; the result is left in v0.
; What the autogenerated checks below pin down for each run line
;   - GFX6 and GFX8 checks clamp the addend with v_max_i32/v_min_i32 against
;     bounds derived from 0x80000000 and 0x7fffffff and then add. The two
;     sequences differ only in the add/sub mnemonics (v_*_i32 on GFX6,
;     v_*_u32 on GFX8).
;   - GFX9 checks expect one v_add_i32 with the clamp modifier.
;   - GFX10PLUS checks expect one v_add_nc_i32 with the clamp modifier.
; The check lines are generated output; regenerate them with the update
; script instead of editing them by hand.
1112define i32 @v_saddsat_i32(i32 %lhs, i32 %rhs) {
1113; GFX6-LABEL: v_saddsat_i32:
1114; GFX6:       ; %bb.0:
1115; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1116; GFX6-NEXT:    v_min_i32_e32 v3, 0, v0
1117; GFX6-NEXT:    v_max_i32_e32 v2, 0, v0
1118; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 0x80000000, v3
1119; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 0x7fffffff, v2
1120; GFX6-NEXT:    v_max_i32_e32 v1, v3, v1
1121; GFX6-NEXT:    v_min_i32_e32 v1, v1, v2
1122; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
1123; GFX6-NEXT:    s_setpc_b64 s[30:31]
1124;
1125; GFX8-LABEL: v_saddsat_i32:
1126; GFX8:       ; %bb.0:
1127; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1128; GFX8-NEXT:    v_min_i32_e32 v3, 0, v0
1129; GFX8-NEXT:    v_max_i32_e32 v2, 0, v0
1130; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, 0x80000000, v3
1131; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 0x7fffffff, v2
1132; GFX8-NEXT:    v_max_i32_e32 v1, v3, v1
1133; GFX8-NEXT:    v_min_i32_e32 v1, v1, v2
1134; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
1135; GFX8-NEXT:    s_setpc_b64 s[30:31]
1136;
1137; GFX9-LABEL: v_saddsat_i32:
1138; GFX9:       ; %bb.0:
1139; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1140; GFX9-NEXT:    v_add_i32 v0, v0, v1 clamp
1141; GFX9-NEXT:    s_setpc_b64 s[30:31]
1142;
1143; GFX10PLUS-LABEL: v_saddsat_i32:
1144; GFX10PLUS:       ; %bb.0:
1145; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1146; GFX10PLUS-NEXT:    v_add_nc_i32 v0, v0, v1 clamp
1147; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
1148  %result = call i32 @llvm.sadd.sat.i32(i32 %lhs, i32 %rhs)
1149  ret i32 %result
1150}
1151
; Scalar form of the i32 signed saturating add. The shader entry point
; (amdgpu_ps) takes both operands as inreg values and returns the result of
; one llvm.sadd.sat.i32 call; in the checks the operands are s0/s1 and the
; result ends up in s0.
; What the autogenerated checks below pin down for each run line
;   - GFX6 and GFX8 checks expect the same all-scalar sequence, which clamps
;     the addend with s_max_i32/s_min_i32 against bounds derived from
;     0x80000000 and 0x7fffffff and then uses s_add_i32.
;   - GFX9 checks copy s1 to v0, use v_add_i32 with clamp and read the result
;     back into s0 with v_readfirstlane_b32.
;   - GFX10PLUS checks feed both SGPRs straight into v_add_nc_i32 with clamp
;     before the v_readfirstlane_b32.
; The check lines are generated output; regenerate them with the update
; script instead of editing them by hand.
1152define amdgpu_ps i32 @s_saddsat_i32(i32 inreg %lhs, i32 inreg %rhs) {
1153; GFX6-LABEL: s_saddsat_i32:
1154; GFX6:       ; %bb.0:
1155; GFX6-NEXT:    s_min_i32 s3, s0, 0
1156; GFX6-NEXT:    s_max_i32 s2, s0, 0
1157; GFX6-NEXT:    s_sub_i32 s3, 0x80000000, s3
1158; GFX6-NEXT:    s_sub_i32 s2, 0x7fffffff, s2
1159; GFX6-NEXT:    s_max_i32 s1, s3, s1
1160; GFX6-NEXT:    s_min_i32 s1, s1, s2
1161; GFX6-NEXT:    s_add_i32 s0, s0, s1
1162; GFX6-NEXT:    ; return to shader part epilog
1163;
1164; GFX8-LABEL: s_saddsat_i32:
1165; GFX8:       ; %bb.0:
1166; GFX8-NEXT:    s_min_i32 s3, s0, 0
1167; GFX8-NEXT:    s_max_i32 s2, s0, 0
1168; GFX8-NEXT:    s_sub_i32 s3, 0x80000000, s3
1169; GFX8-NEXT:    s_sub_i32 s2, 0x7fffffff, s2
1170; GFX8-NEXT:    s_max_i32 s1, s3, s1
1171; GFX8-NEXT:    s_min_i32 s1, s1, s2
1172; GFX8-NEXT:    s_add_i32 s0, s0, s1
1173; GFX8-NEXT:    ; return to shader part epilog
1174;
1175; GFX9-LABEL: s_saddsat_i32:
1176; GFX9:       ; %bb.0:
1177; GFX9-NEXT:    v_mov_b32_e32 v0, s1
1178; GFX9-NEXT:    v_add_i32 v0, s0, v0 clamp
1179; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
1180; GFX9-NEXT:    ; return to shader part epilog
1181;
1182; GFX10PLUS-LABEL: s_saddsat_i32:
1183; GFX10PLUS:       ; %bb.0:
1184; GFX10PLUS-NEXT:    v_add_nc_i32 v0, s0, s1 clamp
1185; GFX10PLUS-NEXT:    v_readfirstlane_b32 s0, v0
1186; GFX10PLUS-NEXT:    ; return to shader part epilog
1187  %result = call i32 @llvm.sadd.sat.i32(i32 %lhs, i32 %rhs)
1188  ret i32 %result
1189}
1190
; Mixed-operand i32 signed saturating add. Only the left operand is inreg (s0
; in the checks), the right operand is v0. The llvm.sadd.sat.i32 result is
; bitcast to float for the return and is left in v0 in the checks.
; What the autogenerated checks below pin down for each run line
;   - GFX6 and GFX8 checks compute the two bounds from s0 with scalar
;     s_min_i32/s_max_i32/s_sub_i32, then clamp and add with vector
;     v_max_i32/v_min_i32 and an add (v_add_i32 on GFX6, v_add_u32 on GFX8).
;   - GFX9 checks expect one v_add_i32 with clamp that takes s0 as its first
;     source operand.
;   - GFX10PLUS checks expect the same shape with v_add_nc_i32.
; The check lines are generated output; regenerate them with the update
; script instead of editing them by hand.
1191define amdgpu_ps float @saddsat_i32_sv(i32 inreg %lhs, i32 %rhs) {
1192; GFX6-LABEL: saddsat_i32_sv:
1193; GFX6:       ; %bb.0:
1194; GFX6-NEXT:    s_min_i32 s2, s0, 0
1195; GFX6-NEXT:    s_max_i32 s1, s0, 0
1196; GFX6-NEXT:    s_sub_i32 s2, 0x80000000, s2
1197; GFX6-NEXT:    s_sub_i32 s1, 0x7fffffff, s1
1198; GFX6-NEXT:    v_max_i32_e32 v0, s2, v0
1199; GFX6-NEXT:    v_min_i32_e32 v0, s1, v0
1200; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
1201; GFX6-NEXT:    ; return to shader part epilog
1202;
1203; GFX8-LABEL: saddsat_i32_sv:
1204; GFX8:       ; %bb.0:
1205; GFX8-NEXT:    s_min_i32 s2, s0, 0
1206; GFX8-NEXT:    s_max_i32 s1, s0, 0
1207; GFX8-NEXT:    s_sub_i32 s2, 0x80000000, s2
1208; GFX8-NEXT:    s_sub_i32 s1, 0x7fffffff, s1
1209; GFX8-NEXT:    v_max_i32_e32 v0, s2, v0
1210; GFX8-NEXT:    v_min_i32_e32 v0, s1, v0
1211; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1212; GFX8-NEXT:    ; return to shader part epilog
1213;
1214; GFX9-LABEL: saddsat_i32_sv:
1215; GFX9:       ; %bb.0:
1216; GFX9-NEXT:    v_add_i32 v0, s0, v0 clamp
1217; GFX9-NEXT:    ; return to shader part epilog
1218;
1219; GFX10PLUS-LABEL: saddsat_i32_sv:
1220; GFX10PLUS:       ; %bb.0:
1221; GFX10PLUS-NEXT:    v_add_nc_i32 v0, s0, v0 clamp
1222; GFX10PLUS-NEXT:    ; return to shader part epilog
1223  %result = call i32 @llvm.sadd.sat.i32(i32 %lhs, i32 %rhs)
1224  %cast = bitcast i32 %result to float
1225  ret float %cast
1226}
1227
; Mixed-operand i32 signed saturating add with the operand kinds swapped
; relative to saddsat_i32_sv. The left operand is v0 in the checks and only
; the right operand is inreg (s0). The llvm.sadd.sat.i32 result is bitcast to
; float for the return and is left in v0 in the checks.
; What the autogenerated checks below pin down for each run line
;   - GFX6 and GFX8 checks compute the bounds from v0 with vector
;     v_min_i32/v_max_i32 and subtracts, clamp s0 against them, and add
;     (v_*_i32 add/sub mnemonics on GFX6, v_*_u32 on GFX8).
;   - GFX9 checks expect one v_add_i32 with clamp that takes s0 as its second
;     source operand.
;   - GFX10PLUS checks expect the same shape with v_add_nc_i32.
; The check lines are generated output; regenerate them with the update
; script instead of editing them by hand.
1228define amdgpu_ps float @saddsat_i32_vs(i32 %lhs, i32 inreg %rhs) {
1229; GFX6-LABEL: saddsat_i32_vs:
1230; GFX6:       ; %bb.0:
1231; GFX6-NEXT:    v_min_i32_e32 v2, 0, v0
1232; GFX6-NEXT:    v_max_i32_e32 v1, 0, v0
1233; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 0x80000000, v2
1234; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, 0x7fffffff, v1
1235; GFX6-NEXT:    v_max_i32_e32 v2, s0, v2
1236; GFX6-NEXT:    v_min_i32_e32 v1, v2, v1
1237; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
1238; GFX6-NEXT:    ; return to shader part epilog
1239;
1240; GFX8-LABEL: saddsat_i32_vs:
1241; GFX8:       ; %bb.0:
1242; GFX8-NEXT:    v_min_i32_e32 v2, 0, v0
1243; GFX8-NEXT:    v_max_i32_e32 v1, 0, v0
1244; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 0x80000000, v2
1245; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, 0x7fffffff, v1
1246; GFX8-NEXT:    v_max_i32_e32 v2, s0, v2
1247; GFX8-NEXT:    v_min_i32_e32 v1, v2, v1
1248; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
1249; GFX8-NEXT:    ; return to shader part epilog
1250;
1251; GFX9-LABEL: saddsat_i32_vs:
1252; GFX9:       ; %bb.0:
1253; GFX9-NEXT:    v_add_i32 v0, v0, s0 clamp
1254; GFX9-NEXT:    ; return to shader part epilog
1255;
1256; GFX10PLUS-LABEL: saddsat_i32_vs:
1257; GFX10PLUS:       ; %bb.0:
1258; GFX10PLUS-NEXT:    v_add_nc_i32 v0, v0, s0 clamp
1259; GFX10PLUS-NEXT:    ; return to shader part epilog
1260  %result = call i32 @llvm.sadd.sat.i32(i32 %lhs, i32 %rhs)
1261  %cast = bitcast i32 %result to float
1262  ret float %cast
1263}
1264
; Signed saturating add on <2 x i32> with the default calling convention. The
; body is a single llvm.sadd.sat.v2i32 call. In the checks the left operand
; elements are v0/v1, the right operand elements are v2/v3, and the results
; are left in v0/v1.
; What the autogenerated checks below pin down for each run line
;   - GFX6 and GFX8 checks expand each element separately, clamping the addend
;     with v_max_i32/v_min_i32 against bounds derived from 0x80000000 and
;     0x7fffffff and then adding (v_*_i32 add/sub mnemonics on GFX6, v_*_u32
;     on GFX8).
;   - GFX9 checks expect one v_add_i32 with clamp per element.
;   - GFX10PLUS checks expect one v_add_nc_i32 with clamp per element.
; The check lines are generated output; regenerate them with the update
; script instead of editing them by hand.
1265define <2 x i32> @v_saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
1266; GFX6-LABEL: v_saddsat_v2i32:
1267; GFX6:       ; %bb.0:
1268; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1269; GFX6-NEXT:    v_min_i32_e32 v5, 0, v0
1270; GFX6-NEXT:    v_max_i32_e32 v4, 0, v0
1271; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, 0x80000000, v5
1272; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, 0x7fffffff, v4
1273; GFX6-NEXT:    v_max_i32_e32 v2, v5, v2
1274; GFX6-NEXT:    v_min_i32_e32 v2, v2, v4
1275; GFX6-NEXT:    v_min_i32_e32 v4, 0, v1
1276; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
1277; GFX6-NEXT:    v_max_i32_e32 v2, 0, v1
1278; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, 0x80000000, v4
1279; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 0x7fffffff, v2
1280; GFX6-NEXT:    v_max_i32_e32 v3, v4, v3
1281; GFX6-NEXT:    v_min_i32_e32 v2, v3, v2
1282; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
1283; GFX6-NEXT:    s_setpc_b64 s[30:31]
1284;
1285; GFX8-LABEL: v_saddsat_v2i32:
1286; GFX8:       ; %bb.0:
1287; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1288; GFX8-NEXT:    v_min_i32_e32 v5, 0, v0
1289; GFX8-NEXT:    v_max_i32_e32 v4, 0, v0
1290; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, 0x80000000, v5
1291; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, 0x7fffffff, v4
1292; GFX8-NEXT:    v_max_i32_e32 v2, v5, v2
1293; GFX8-NEXT:    v_min_i32_e32 v2, v2, v4
1294; GFX8-NEXT:    v_min_i32_e32 v4, 0, v1
1295; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
1296; GFX8-NEXT:    v_max_i32_e32 v2, 0, v1
1297; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, 0x80000000, v4
1298; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 0x7fffffff, v2
1299; GFX8-NEXT:    v_max_i32_e32 v3, v4, v3
1300; GFX8-NEXT:    v_min_i32_e32 v2, v3, v2
1301; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v2
1302; GFX8-NEXT:    s_setpc_b64 s[30:31]
1303;
1304; GFX9-LABEL: v_saddsat_v2i32:
1305; GFX9:       ; %bb.0:
1306; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1307; GFX9-NEXT:    v_add_i32 v0, v0, v2 clamp
1308; GFX9-NEXT:    v_add_i32 v1, v1, v3 clamp
1309; GFX9-NEXT:    s_setpc_b64 s[30:31]
1310;
1311; GFX10PLUS-LABEL: v_saddsat_v2i32:
1312; GFX10PLUS:       ; %bb.0:
1313; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1314; GFX10PLUS-NEXT:    v_add_nc_i32 v0, v0, v2 clamp
1315; GFX10PLUS-NEXT:    v_add_nc_i32 v1, v1, v3 clamp
1316; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
1317  %result = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
1318  ret <2 x i32> %result
1319}
1320
; Scalar form of the <2 x i32> signed saturating add. The shader entry point
; (amdgpu_ps) takes both vectors as inreg values and returns the result of one
; llvm.sadd.sat.v2i32 call. In the checks the left operand elements are s0/s1,
; the right operand elements are s2/s3, and the results end up in s0/s1.
; What the autogenerated checks below pin down for each run line
;   - GFX6 and GFX8 checks expect the same all-scalar sequence, expanding each
;     element with s_max_i32/s_min_i32 against bounds derived from 0x80000000
;     and 0x7fffffff followed by s_add_i32.
;   - GFX9 checks copy s2/s3 to v0/v1, use v_add_i32 with clamp per element
;     and read the results back with v_readfirstlane_b32.
;   - GFX10PLUS checks feed the SGPR pairs straight into v_add_nc_i32 with
;     clamp before the v_readfirstlane_b32 reads.
; The check lines are generated output; regenerate them with the update
; script instead of editing them by hand.
1321define amdgpu_ps <2 x i32> @s_saddsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inreg %rhs) {
1322; GFX6-LABEL: s_saddsat_v2i32:
1323; GFX6:       ; %bb.0:
1324; GFX6-NEXT:    s_min_i32 s5, s0, 0
1325; GFX6-NEXT:    s_max_i32 s4, s0, 0
1326; GFX6-NEXT:    s_sub_i32 s5, 0x80000000, s5
1327; GFX6-NEXT:    s_sub_i32 s4, 0x7fffffff, s4
1328; GFX6-NEXT:    s_max_i32 s2, s5, s2
1329; GFX6-NEXT:    s_min_i32 s2, s2, s4
1330; GFX6-NEXT:    s_min_i32 s4, s1, 0
1331; GFX6-NEXT:    s_add_i32 s0, s0, s2
1332; GFX6-NEXT:    s_max_i32 s2, s1, 0
1333; GFX6-NEXT:    s_sub_i32 s4, 0x80000000, s4
1334; GFX6-NEXT:    s_sub_i32 s2, 0x7fffffff, s2
1335; GFX6-NEXT:    s_max_i32 s3, s4, s3
1336; GFX6-NEXT:    s_min_i32 s2, s3, s2
1337; GFX6-NEXT:    s_add_i32 s1, s1, s2
1338; GFX6-NEXT:    ; return to shader part epilog
1339;
1340; GFX8-LABEL: s_saddsat_v2i32:
1341; GFX8:       ; %bb.0:
1342; GFX8-NEXT:    s_min_i32 s5, s0, 0
1343; GFX8-NEXT:    s_max_i32 s4, s0, 0
1344; GFX8-NEXT:    s_sub_i32 s5, 0x80000000, s5
1345; GFX8-NEXT:    s_sub_i32 s4, 0x7fffffff, s4
1346; GFX8-NEXT:    s_max_i32 s2, s5, s2
1347; GFX8-NEXT:    s_min_i32 s2, s2, s4
1348; GFX8-NEXT:    s_min_i32 s4, s1, 0
1349; GFX8-NEXT:    s_add_i32 s0, s0, s2
1350; GFX8-NEXT:    s_max_i32 s2, s1, 0
1351; GFX8-NEXT:    s_sub_i32 s4, 0x80000000, s4
1352; GFX8-NEXT:    s_sub_i32 s2, 0x7fffffff, s2
1353; GFX8-NEXT:    s_max_i32 s3, s4, s3
1354; GFX8-NEXT:    s_min_i32 s2, s3, s2
1355; GFX8-NEXT:    s_add_i32 s1, s1, s2
1356; GFX8-NEXT:    ; return to shader part epilog
1357;
1358; GFX9-LABEL: s_saddsat_v2i32:
1359; GFX9:       ; %bb.0:
1360; GFX9-NEXT:    v_mov_b32_e32 v0, s2
1361; GFX9-NEXT:    v_mov_b32_e32 v1, s3
1362; GFX9-NEXT:    v_add_i32 v0, s0, v0 clamp
1363; GFX9-NEXT:    v_add_i32 v1, s1, v1 clamp
1364; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
1365; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
1366; GFX9-NEXT:    ; return to shader part epilog
1367;
1368; GFX10PLUS-LABEL: s_saddsat_v2i32:
1369; GFX10PLUS:       ; %bb.0:
1370; GFX10PLUS-NEXT:    v_add_nc_i32 v0, s0, s2 clamp
1371; GFX10PLUS-NEXT:    v_add_nc_i32 v1, s1, s3 clamp
1372; GFX10PLUS-NEXT:    v_readfirstlane_b32 s0, v0
1373; GFX10PLUS-NEXT:    v_readfirstlane_b32 s1, v1
1374; GFX10PLUS-NEXT:    ; return to shader part epilog
1375  %result = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
1376  ret <2 x i32> %result
1377}
1378
; Signed saturating add on <3 x i32> with the default calling convention. The
; body is a single llvm.sadd.sat.v3i32 call. In the checks the left operand
; elements are v0..v2, the right operand elements are v3..v5, and the results
; are left in v0..v2.
; What the autogenerated checks below pin down for each run line
;   - GFX6 and GFX8 checks expand each element separately, clamping the addend
;     with v_max_i32/v_min_i32 and then adding (v_*_i32 add/sub mnemonics on
;     GFX6, v_*_u32 on GFX8). For the second element the upper bound comes
;     from a register set with v_bfrev_b32 v7, -2 (the bit reverse of -2 is
;     0x7fffffff); the other elements use the 0x7fffffff literal directly.
;   - GFX9 checks expect one v_add_i32 with clamp per element.
;   - GFX10PLUS checks expect one v_add_nc_i32 with clamp per element.
; The check lines are generated output; regenerate them with the update
; script instead of editing them by hand.
1379define <3 x i32> @v_saddsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
1380; GFX6-LABEL: v_saddsat_v3i32:
1381; GFX6:       ; %bb.0:
1382; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1383; GFX6-NEXT:    v_min_i32_e32 v8, 0, v0
1384; GFX6-NEXT:    v_max_i32_e32 v6, 0, v0
1385; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, 0x80000000, v8
1386; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, 0x7fffffff, v6
1387; GFX6-NEXT:    v_max_i32_e32 v3, v8, v3
1388; GFX6-NEXT:    v_min_i32_e32 v3, v3, v6
1389; GFX6-NEXT:    v_min_i32_e32 v6, 0, v1
1390; GFX6-NEXT:    v_bfrev_b32_e32 v7, -2
1391; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
1392; GFX6-NEXT:    v_max_i32_e32 v3, 0, v1
1393; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, 0x80000000, v6
1394; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v7, v3
1395; GFX6-NEXT:    v_max_i32_e32 v4, v6, v4
1396; GFX6-NEXT:    v_min_i32_e32 v3, v4, v3
1397; GFX6-NEXT:    v_min_i32_e32 v4, 0, v2
1398; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
1399; GFX6-NEXT:    v_max_i32_e32 v3, 0, v2
1400; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, 0x80000000, v4
1401; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 0x7fffffff, v3
1402; GFX6-NEXT:    v_max_i32_e32 v4, v4, v5
1403; GFX6-NEXT:    v_min_i32_e32 v3, v4, v3
1404; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
1405; GFX6-NEXT:    s_setpc_b64 s[30:31]
1406;
1407; GFX8-LABEL: v_saddsat_v3i32:
1408; GFX8:       ; %bb.0:
1409; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1410; GFX8-NEXT:    v_min_i32_e32 v8, 0, v0
1411; GFX8-NEXT:    v_max_i32_e32 v6, 0, v0
1412; GFX8-NEXT:    v_sub_u32_e32 v8, vcc, 0x80000000, v8
1413; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, 0x7fffffff, v6
1414; GFX8-NEXT:    v_max_i32_e32 v3, v8, v3
1415; GFX8-NEXT:    v_min_i32_e32 v3, v3, v6
1416; GFX8-NEXT:    v_min_i32_e32 v6, 0, v1
1417; GFX8-NEXT:    v_bfrev_b32_e32 v7, -2
1418; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v3
1419; GFX8-NEXT:    v_max_i32_e32 v3, 0, v1
1420; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, 0x80000000, v6
1421; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, v7, v3
1422; GFX8-NEXT:    v_max_i32_e32 v4, v6, v4
1423; GFX8-NEXT:    v_min_i32_e32 v3, v4, v3
1424; GFX8-NEXT:    v_min_i32_e32 v4, 0, v2
1425; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v3
1426; GFX8-NEXT:    v_max_i32_e32 v3, 0, v2
1427; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, 0x80000000, v4
1428; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, 0x7fffffff, v3
1429; GFX8-NEXT:    v_max_i32_e32 v4, v4, v5
1430; GFX8-NEXT:    v_min_i32_e32 v3, v4, v3
1431; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
1432; GFX8-NEXT:    s_setpc_b64 s[30:31]
1433;
1434; GFX9-LABEL: v_saddsat_v3i32:
1435; GFX9:       ; %bb.0:
1436; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1437; GFX9-NEXT:    v_add_i32 v0, v0, v3 clamp
1438; GFX9-NEXT:    v_add_i32 v1, v1, v4 clamp
1439; GFX9-NEXT:    v_add_i32 v2, v2, v5 clamp
1440; GFX9-NEXT:    s_setpc_b64 s[30:31]
1441;
1442; GFX10PLUS-LABEL: v_saddsat_v3i32:
1443; GFX10PLUS:       ; %bb.0:
1444; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1445; GFX10PLUS-NEXT:    v_add_nc_i32 v0, v0, v3 clamp
1446; GFX10PLUS-NEXT:    v_add_nc_i32 v1, v1, v4 clamp
1447; GFX10PLUS-NEXT:    v_add_nc_i32 v2, v2, v5 clamp
1448; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
1449  %result = call <3 x i32> @llvm.sadd.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs)
1450  ret <3 x i32> %result
1451}
1452
; s_saddsat_v3i32 - signed saturating add of two <3 x i32> vectors in an
; amdgpu_ps function whose operands are marked inreg (they appear as s0-s5 in
; the expected output, and the results are left in s0-s2).
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing by hand.
;  * The GFX6 and GFX8 expectations stay on scalar instructions. Per lane the
;    rhs is clamped to [0x80000000 - min(lhs, 0), 0x7fffffff - max(lhs, 0)]
;    with s_min/s_max/s_sub and then added to the lhs with a plain s_add_i32.
;  * The GFX9 expectations copy the rhs into VGPRs, use v_add_i32 with the
;    clamp modifier, and move each result back with v_readfirstlane_b32.
;  * The GFX10PLUS expectations feed both SGPR operands straight into
;    v_add_nc_i32 ... clamp, so no v_mov copies precede the readfirstlanes.
1453define amdgpu_ps <3 x i32> @s_saddsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inreg %rhs) {
1454; GFX6-LABEL: s_saddsat_v3i32:
1455; GFX6:       ; %bb.0:
1456; GFX6-NEXT:    s_min_i32 s7, s0, 0
1457; GFX6-NEXT:    s_max_i32 s6, s0, 0
1458; GFX6-NEXT:    s_sub_i32 s7, 0x80000000, s7
1459; GFX6-NEXT:    s_sub_i32 s6, 0x7fffffff, s6
1460; GFX6-NEXT:    s_max_i32 s3, s7, s3
1461; GFX6-NEXT:    s_min_i32 s3, s3, s6
1462; GFX6-NEXT:    s_min_i32 s6, s1, 0
1463; GFX6-NEXT:    s_add_i32 s0, s0, s3
1464; GFX6-NEXT:    s_max_i32 s3, s1, 0
1465; GFX6-NEXT:    s_sub_i32 s6, 0x80000000, s6
1466; GFX6-NEXT:    s_sub_i32 s3, 0x7fffffff, s3
1467; GFX6-NEXT:    s_max_i32 s4, s6, s4
1468; GFX6-NEXT:    s_min_i32 s3, s4, s3
1469; GFX6-NEXT:    s_min_i32 s4, s2, 0
1470; GFX6-NEXT:    s_add_i32 s1, s1, s3
1471; GFX6-NEXT:    s_max_i32 s3, s2, 0
1472; GFX6-NEXT:    s_sub_i32 s4, 0x80000000, s4
1473; GFX6-NEXT:    s_sub_i32 s3, 0x7fffffff, s3
1474; GFX6-NEXT:    s_max_i32 s4, s4, s5
1475; GFX6-NEXT:    s_min_i32 s3, s4, s3
1476; GFX6-NEXT:    s_add_i32 s2, s2, s3
1477; GFX6-NEXT:    ; return to shader part epilog
1478;
1479; GFX8-LABEL: s_saddsat_v3i32:
1480; GFX8:       ; %bb.0:
1481; GFX8-NEXT:    s_min_i32 s7, s0, 0
1482; GFX8-NEXT:    s_max_i32 s6, s0, 0
1483; GFX8-NEXT:    s_sub_i32 s7, 0x80000000, s7
1484; GFX8-NEXT:    s_sub_i32 s6, 0x7fffffff, s6
1485; GFX8-NEXT:    s_max_i32 s3, s7, s3
1486; GFX8-NEXT:    s_min_i32 s3, s3, s6
1487; GFX8-NEXT:    s_min_i32 s6, s1, 0
1488; GFX8-NEXT:    s_add_i32 s0, s0, s3
1489; GFX8-NEXT:    s_max_i32 s3, s1, 0
1490; GFX8-NEXT:    s_sub_i32 s6, 0x80000000, s6
1491; GFX8-NEXT:    s_sub_i32 s3, 0x7fffffff, s3
1492; GFX8-NEXT:    s_max_i32 s4, s6, s4
1493; GFX8-NEXT:    s_min_i32 s3, s4, s3
1494; GFX8-NEXT:    s_min_i32 s4, s2, 0
1495; GFX8-NEXT:    s_add_i32 s1, s1, s3
1496; GFX8-NEXT:    s_max_i32 s3, s2, 0
1497; GFX8-NEXT:    s_sub_i32 s4, 0x80000000, s4
1498; GFX8-NEXT:    s_sub_i32 s3, 0x7fffffff, s3
1499; GFX8-NEXT:    s_max_i32 s4, s4, s5
1500; GFX8-NEXT:    s_min_i32 s3, s4, s3
1501; GFX8-NEXT:    s_add_i32 s2, s2, s3
1502; GFX8-NEXT:    ; return to shader part epilog
1503;
1504; GFX9-LABEL: s_saddsat_v3i32:
1505; GFX9:       ; %bb.0:
1506; GFX9-NEXT:    v_mov_b32_e32 v0, s3
1507; GFX9-NEXT:    v_mov_b32_e32 v1, s4
1508; GFX9-NEXT:    v_mov_b32_e32 v2, s5
1509; GFX9-NEXT:    v_add_i32 v0, s0, v0 clamp
1510; GFX9-NEXT:    v_add_i32 v1, s1, v1 clamp
1511; GFX9-NEXT:    v_add_i32 v2, s2, v2 clamp
1512; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
1513; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
1514; GFX9-NEXT:    v_readfirstlane_b32 s2, v2
1515; GFX9-NEXT:    ; return to shader part epilog
1516;
1517; GFX10PLUS-LABEL: s_saddsat_v3i32:
1518; GFX10PLUS:       ; %bb.0:
1519; GFX10PLUS-NEXT:    v_add_nc_i32 v0, s0, s3 clamp
1520; GFX10PLUS-NEXT:    v_add_nc_i32 v1, s1, s4 clamp
1521; GFX10PLUS-NEXT:    v_add_nc_i32 v2, s2, s5 clamp
1522; GFX10PLUS-NEXT:    v_readfirstlane_b32 s0, v0
1523; GFX10PLUS-NEXT:    v_readfirstlane_b32 s1, v1
1524; GFX10PLUS-NEXT:    v_readfirstlane_b32 s2, v2
1525; GFX10PLUS-NEXT:    ; return to shader part epilog
1526  %result = call <3 x i32> @llvm.sadd.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs)
1527  ret <3 x i32> %result
1528}
1529
; v_saddsat_v4i32 - signed saturating add of two <4 x i32> vectors in a
; function with no explicit calling convention (operands appear as v0-v7 and
; every expected sequence ends in s_setpc_b64 s[30:31]).
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing by hand.
;  * For GFX6 and GFX8 each lane clamps the rhs to
;    [0x80000000 - min(lhs, 0), 0x7fffffff - max(lhs, 0)] with
;    v_min/v_max/v_sub and then performs an ordinary add. Some bounds are read
;    from v11 / v9, which are set up by v_bfrev_b32 of 1 and -2 (bit-reversed,
;    those are the same 0x80000000 / 0x7fffffff values); the remaining bounds
;    stay as 32-bit literals. The visible difference between the two targets
;    is the add/sub mnemonic suffix (_i32 on GFX6, _u32 on GFX8).
;  * For GFX9 and GFX10PLUS a single clamped add per lane is expected
;    (v_add_i32 and v_add_nc_i32 respectively, with the clamp modifier).
1530define <4 x i32> @v_saddsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
1531; GFX6-LABEL: v_saddsat_v4i32:
1532; GFX6:       ; %bb.0:
1533; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1534; GFX6-NEXT:    v_min_i32_e32 v10, 0, v0
1535; GFX6-NEXT:    v_bfrev_b32_e32 v11, 1
1536; GFX6-NEXT:    v_max_i32_e32 v8, 0, v0
1537; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v11, v10
1538; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, 0x7fffffff, v8
1539; GFX6-NEXT:    v_max_i32_e32 v4, v10, v4
1540; GFX6-NEXT:    v_min_i32_e32 v4, v4, v8
1541; GFX6-NEXT:    v_min_i32_e32 v8, 0, v1
1542; GFX6-NEXT:    v_bfrev_b32_e32 v9, -2
1543; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
1544; GFX6-NEXT:    v_max_i32_e32 v4, 0, v1
1545; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, 0x80000000, v8
1546; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v9, v4
1547; GFX6-NEXT:    v_max_i32_e32 v5, v8, v5
1548; GFX6-NEXT:    v_min_i32_e32 v4, v5, v4
1549; GFX6-NEXT:    v_min_i32_e32 v5, 0, v2
1550; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
1551; GFX6-NEXT:    v_max_i32_e32 v4, 0, v2
1552; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, 0x80000000, v5
1553; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v9, v4
1554; GFX6-NEXT:    v_max_i32_e32 v5, v5, v6
1555; GFX6-NEXT:    v_min_i32_e32 v4, v5, v4
1556; GFX6-NEXT:    v_min_i32_e32 v5, 0, v3
1557; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
1558; GFX6-NEXT:    v_max_i32_e32 v4, 0, v3
1559; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, 0x80000000, v5
1560; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, 0x7fffffff, v4
1561; GFX6-NEXT:    v_max_i32_e32 v5, v5, v7
1562; GFX6-NEXT:    v_min_i32_e32 v4, v5, v4
1563; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
1564; GFX6-NEXT:    s_setpc_b64 s[30:31]
1565;
1566; GFX8-LABEL: v_saddsat_v4i32:
1567; GFX8:       ; %bb.0:
1568; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1569; GFX8-NEXT:    v_min_i32_e32 v10, 0, v0
1570; GFX8-NEXT:    v_bfrev_b32_e32 v11, 1
1571; GFX8-NEXT:    v_max_i32_e32 v8, 0, v0
1572; GFX8-NEXT:    v_sub_u32_e32 v10, vcc, v11, v10
1573; GFX8-NEXT:    v_sub_u32_e32 v8, vcc, 0x7fffffff, v8
1574; GFX8-NEXT:    v_max_i32_e32 v4, v10, v4
1575; GFX8-NEXT:    v_min_i32_e32 v4, v4, v8
1576; GFX8-NEXT:    v_min_i32_e32 v8, 0, v1
1577; GFX8-NEXT:    v_bfrev_b32_e32 v9, -2
1578; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v4
1579; GFX8-NEXT:    v_max_i32_e32 v4, 0, v1
1580; GFX8-NEXT:    v_sub_u32_e32 v8, vcc, 0x80000000, v8
1581; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, v9, v4
1582; GFX8-NEXT:    v_max_i32_e32 v5, v8, v5
1583; GFX8-NEXT:    v_min_i32_e32 v4, v5, v4
1584; GFX8-NEXT:    v_min_i32_e32 v5, 0, v2
1585; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v4
1586; GFX8-NEXT:    v_max_i32_e32 v4, 0, v2
1587; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, 0x80000000, v5
1588; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, v9, v4
1589; GFX8-NEXT:    v_max_i32_e32 v5, v5, v6
1590; GFX8-NEXT:    v_min_i32_e32 v4, v5, v4
1591; GFX8-NEXT:    v_min_i32_e32 v5, 0, v3
1592; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
1593; GFX8-NEXT:    v_max_i32_e32 v4, 0, v3
1594; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, 0x80000000, v5
1595; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, 0x7fffffff, v4
1596; GFX8-NEXT:    v_max_i32_e32 v5, v5, v7
1597; GFX8-NEXT:    v_min_i32_e32 v4, v5, v4
1598; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v4
1599; GFX8-NEXT:    s_setpc_b64 s[30:31]
1600;
1601; GFX9-LABEL: v_saddsat_v4i32:
1602; GFX9:       ; %bb.0:
1603; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1604; GFX9-NEXT:    v_add_i32 v0, v0, v4 clamp
1605; GFX9-NEXT:    v_add_i32 v1, v1, v5 clamp
1606; GFX9-NEXT:    v_add_i32 v2, v2, v6 clamp
1607; GFX9-NEXT:    v_add_i32 v3, v3, v7 clamp
1608; GFX9-NEXT:    s_setpc_b64 s[30:31]
1609;
1610; GFX10PLUS-LABEL: v_saddsat_v4i32:
1611; GFX10PLUS:       ; %bb.0:
1612; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1613; GFX10PLUS-NEXT:    v_add_nc_i32 v0, v0, v4 clamp
1614; GFX10PLUS-NEXT:    v_add_nc_i32 v1, v1, v5 clamp
1615; GFX10PLUS-NEXT:    v_add_nc_i32 v2, v2, v6 clamp
1616; GFX10PLUS-NEXT:    v_add_nc_i32 v3, v3, v7 clamp
1617; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
1618  %result = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
1619  ret <4 x i32> %result
1620}
1621
; s_saddsat_v4i32 - signed saturating add of two <4 x i32> vectors in an
; amdgpu_ps function whose operands are marked inreg (they appear as s0-s7 in
; the expected output, and the results are left in s0-s3).
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing by hand.
;  * The GFX6 and GFX8 expectations stay on scalar instructions. Per lane the
;    rhs is clamped to [0x80000000 - min(lhs, 0), 0x7fffffff - max(lhs, 0)]
;    with s_min/s_max/s_sub and then added to the lhs with a plain s_add_i32.
;  * The GFX9 expectations copy the rhs into VGPRs, use v_add_i32 with the
;    clamp modifier, and move each result back with v_readfirstlane_b32.
;  * The GFX10PLUS expectations feed both SGPR operands straight into
;    v_add_nc_i32 ... clamp, so no v_mov copies precede the readfirstlanes.
1622define amdgpu_ps <4 x i32> @s_saddsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inreg %rhs) {
1623; GFX6-LABEL: s_saddsat_v4i32:
1624; GFX6:       ; %bb.0:
1625; GFX6-NEXT:    s_min_i32 s9, s0, 0
1626; GFX6-NEXT:    s_max_i32 s8, s0, 0
1627; GFX6-NEXT:    s_sub_i32 s9, 0x80000000, s9
1628; GFX6-NEXT:    s_sub_i32 s8, 0x7fffffff, s8
1629; GFX6-NEXT:    s_max_i32 s4, s9, s4
1630; GFX6-NEXT:    s_min_i32 s4, s4, s8
1631; GFX6-NEXT:    s_min_i32 s8, s1, 0
1632; GFX6-NEXT:    s_add_i32 s0, s0, s4
1633; GFX6-NEXT:    s_max_i32 s4, s1, 0
1634; GFX6-NEXT:    s_sub_i32 s8, 0x80000000, s8
1635; GFX6-NEXT:    s_sub_i32 s4, 0x7fffffff, s4
1636; GFX6-NEXT:    s_max_i32 s5, s8, s5
1637; GFX6-NEXT:    s_min_i32 s4, s5, s4
1638; GFX6-NEXT:    s_min_i32 s5, s2, 0
1639; GFX6-NEXT:    s_add_i32 s1, s1, s4
1640; GFX6-NEXT:    s_max_i32 s4, s2, 0
1641; GFX6-NEXT:    s_sub_i32 s5, 0x80000000, s5
1642; GFX6-NEXT:    s_sub_i32 s4, 0x7fffffff, s4
1643; GFX6-NEXT:    s_max_i32 s5, s5, s6
1644; GFX6-NEXT:    s_min_i32 s4, s5, s4
1645; GFX6-NEXT:    s_min_i32 s5, s3, 0
1646; GFX6-NEXT:    s_add_i32 s2, s2, s4
1647; GFX6-NEXT:    s_max_i32 s4, s3, 0
1648; GFX6-NEXT:    s_sub_i32 s5, 0x80000000, s5
1649; GFX6-NEXT:    s_sub_i32 s4, 0x7fffffff, s4
1650; GFX6-NEXT:    s_max_i32 s5, s5, s7
1651; GFX6-NEXT:    s_min_i32 s4, s5, s4
1652; GFX6-NEXT:    s_add_i32 s3, s3, s4
1653; GFX6-NEXT:    ; return to shader part epilog
1654;
1655; GFX8-LABEL: s_saddsat_v4i32:
1656; GFX8:       ; %bb.0:
1657; GFX8-NEXT:    s_min_i32 s9, s0, 0
1658; GFX8-NEXT:    s_max_i32 s8, s0, 0
1659; GFX8-NEXT:    s_sub_i32 s9, 0x80000000, s9
1660; GFX8-NEXT:    s_sub_i32 s8, 0x7fffffff, s8
1661; GFX8-NEXT:    s_max_i32 s4, s9, s4
1662; GFX8-NEXT:    s_min_i32 s4, s4, s8
1663; GFX8-NEXT:    s_min_i32 s8, s1, 0
1664; GFX8-NEXT:    s_add_i32 s0, s0, s4
1665; GFX8-NEXT:    s_max_i32 s4, s1, 0
1666; GFX8-NEXT:    s_sub_i32 s8, 0x80000000, s8
1667; GFX8-NEXT:    s_sub_i32 s4, 0x7fffffff, s4
1668; GFX8-NEXT:    s_max_i32 s5, s8, s5
1669; GFX8-NEXT:    s_min_i32 s4, s5, s4
1670; GFX8-NEXT:    s_min_i32 s5, s2, 0
1671; GFX8-NEXT:    s_add_i32 s1, s1, s4
1672; GFX8-NEXT:    s_max_i32 s4, s2, 0
1673; GFX8-NEXT:    s_sub_i32 s5, 0x80000000, s5
1674; GFX8-NEXT:    s_sub_i32 s4, 0x7fffffff, s4
1675; GFX8-NEXT:    s_max_i32 s5, s5, s6
1676; GFX8-NEXT:    s_min_i32 s4, s5, s4
1677; GFX8-NEXT:    s_min_i32 s5, s3, 0
1678; GFX8-NEXT:    s_add_i32 s2, s2, s4
1679; GFX8-NEXT:    s_max_i32 s4, s3, 0
1680; GFX8-NEXT:    s_sub_i32 s5, 0x80000000, s5
1681; GFX8-NEXT:    s_sub_i32 s4, 0x7fffffff, s4
1682; GFX8-NEXT:    s_max_i32 s5, s5, s7
1683; GFX8-NEXT:    s_min_i32 s4, s5, s4
1684; GFX8-NEXT:    s_add_i32 s3, s3, s4
1685; GFX8-NEXT:    ; return to shader part epilog
1686;
1687; GFX9-LABEL: s_saddsat_v4i32:
1688; GFX9:       ; %bb.0:
1689; GFX9-NEXT:    v_mov_b32_e32 v0, s4
1690; GFX9-NEXT:    v_mov_b32_e32 v1, s5
1691; GFX9-NEXT:    v_mov_b32_e32 v2, s6
1692; GFX9-NEXT:    v_mov_b32_e32 v3, s7
1693; GFX9-NEXT:    v_add_i32 v0, s0, v0 clamp
1694; GFX9-NEXT:    v_add_i32 v1, s1, v1 clamp
1695; GFX9-NEXT:    v_add_i32 v2, s2, v2 clamp
1696; GFX9-NEXT:    v_add_i32 v3, s3, v3 clamp
1697; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
1698; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
1699; GFX9-NEXT:    v_readfirstlane_b32 s2, v2
1700; GFX9-NEXT:    v_readfirstlane_b32 s3, v3
1701; GFX9-NEXT:    ; return to shader part epilog
1702;
1703; GFX10PLUS-LABEL: s_saddsat_v4i32:
1704; GFX10PLUS:       ; %bb.0:
1705; GFX10PLUS-NEXT:    v_add_nc_i32 v0, s0, s4 clamp
1706; GFX10PLUS-NEXT:    v_add_nc_i32 v1, s1, s5 clamp
1707; GFX10PLUS-NEXT:    v_add_nc_i32 v2, s2, s6 clamp
1708; GFX10PLUS-NEXT:    v_add_nc_i32 v3, s3, s7 clamp
1709; GFX10PLUS-NEXT:    v_readfirstlane_b32 s0, v0
1710; GFX10PLUS-NEXT:    v_readfirstlane_b32 s1, v1
1711; GFX10PLUS-NEXT:    v_readfirstlane_b32 s2, v2
1712; GFX10PLUS-NEXT:    v_readfirstlane_b32 s3, v3
1713; GFX10PLUS-NEXT:    ; return to shader part epilog
1714  %result = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
1715  ret <4 x i32> %result
1716}
1717
; v_saddsat_v5i32 - signed saturating add of two <5 x i32> vectors in a
; function with no explicit calling convention (operands appear as v0-v9 and
; every expected sequence ends in s_setpc_b64 s[30:31]).
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing by hand.
;  * For GFX6 and GFX8 each lane clamps the rhs to
;    [0x80000000 - min(lhs, 0), 0x7fffffff - max(lhs, 0)] with
;    v_min/v_max/v_sub and then performs an ordinary add. Some bounds are read
;    from v13 / v11, which are set up by v_bfrev_b32 of 1 and -2 (bit-reversed,
;    those are the same 0x80000000 / 0x7fffffff values); the remaining bounds
;    stay as 32-bit literals. The visible difference between the two targets
;    is the add/sub mnemonic suffix (_i32 on GFX6, _u32 on GFX8).
;  * For GFX9 and GFX10PLUS a single clamped add per lane is expected
;    (v_add_i32 and v_add_nc_i32 respectively, with the clamp modifier).
1718define <5 x i32> @v_saddsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
1719; GFX6-LABEL: v_saddsat_v5i32:
1720; GFX6:       ; %bb.0:
1721; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1722; GFX6-NEXT:    v_min_i32_e32 v12, 0, v0
1723; GFX6-NEXT:    v_bfrev_b32_e32 v13, 1
1724; GFX6-NEXT:    v_max_i32_e32 v10, 0, v0
1725; GFX6-NEXT:    v_sub_i32_e32 v12, vcc, v13, v12
1726; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, 0x7fffffff, v10
1727; GFX6-NEXT:    v_max_i32_e32 v5, v12, v5
1728; GFX6-NEXT:    v_min_i32_e32 v5, v5, v10
1729; GFX6-NEXT:    v_min_i32_e32 v10, 0, v1
1730; GFX6-NEXT:    v_bfrev_b32_e32 v11, -2
1731; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v5
1732; GFX6-NEXT:    v_max_i32_e32 v5, 0, v1
1733; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v13, v10
1734; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v11, v5
1735; GFX6-NEXT:    v_max_i32_e32 v6, v10, v6
1736; GFX6-NEXT:    v_min_i32_e32 v5, v6, v5
1737; GFX6-NEXT:    v_min_i32_e32 v6, 0, v2
1738; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
1739; GFX6-NEXT:    v_max_i32_e32 v5, 0, v2
1740; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, 0x80000000, v6
1741; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v11, v5
1742; GFX6-NEXT:    v_max_i32_e32 v6, v6, v7
1743; GFX6-NEXT:    v_min_i32_e32 v5, v6, v5
1744; GFX6-NEXT:    v_min_i32_e32 v6, 0, v3
1745; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
1746; GFX6-NEXT:    v_max_i32_e32 v5, 0, v3
1747; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, 0x80000000, v6
1748; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v11, v5
1749; GFX6-NEXT:    v_max_i32_e32 v6, v6, v8
1750; GFX6-NEXT:    v_min_i32_e32 v5, v6, v5
1751; GFX6-NEXT:    v_min_i32_e32 v6, 0, v4
1752; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
1753; GFX6-NEXT:    v_max_i32_e32 v5, 0, v4
1754; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, 0x80000000, v6
1755; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, 0x7fffffff, v5
1756; GFX6-NEXT:    v_max_i32_e32 v6, v6, v9
1757; GFX6-NEXT:    v_min_i32_e32 v5, v6, v5
1758; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
1759; GFX6-NEXT:    s_setpc_b64 s[30:31]
1760;
1761; GFX8-LABEL: v_saddsat_v5i32:
1762; GFX8:       ; %bb.0:
1763; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1764; GFX8-NEXT:    v_min_i32_e32 v12, 0, v0
1765; GFX8-NEXT:    v_bfrev_b32_e32 v13, 1
1766; GFX8-NEXT:    v_max_i32_e32 v10, 0, v0
1767; GFX8-NEXT:    v_sub_u32_e32 v12, vcc, v13, v12
1768; GFX8-NEXT:    v_sub_u32_e32 v10, vcc, 0x7fffffff, v10
1769; GFX8-NEXT:    v_max_i32_e32 v5, v12, v5
1770; GFX8-NEXT:    v_min_i32_e32 v5, v5, v10
1771; GFX8-NEXT:    v_min_i32_e32 v10, 0, v1
1772; GFX8-NEXT:    v_bfrev_b32_e32 v11, -2
1773; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v5
1774; GFX8-NEXT:    v_max_i32_e32 v5, 0, v1
1775; GFX8-NEXT:    v_sub_u32_e32 v10, vcc, v13, v10
1776; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, v11, v5
1777; GFX8-NEXT:    v_max_i32_e32 v6, v10, v6
1778; GFX8-NEXT:    v_min_i32_e32 v5, v6, v5
1779; GFX8-NEXT:    v_min_i32_e32 v6, 0, v2
1780; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v5
1781; GFX8-NEXT:    v_max_i32_e32 v5, 0, v2
1782; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, 0x80000000, v6
1783; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, v11, v5
1784; GFX8-NEXT:    v_max_i32_e32 v6, v6, v7
1785; GFX8-NEXT:    v_min_i32_e32 v5, v6, v5
1786; GFX8-NEXT:    v_min_i32_e32 v6, 0, v3
1787; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v5
1788; GFX8-NEXT:    v_max_i32_e32 v5, 0, v3
1789; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, 0x80000000, v6
1790; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, v11, v5
1791; GFX8-NEXT:    v_max_i32_e32 v6, v6, v8
1792; GFX8-NEXT:    v_min_i32_e32 v5, v6, v5
1793; GFX8-NEXT:    v_min_i32_e32 v6, 0, v4
1794; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
1795; GFX8-NEXT:    v_max_i32_e32 v5, 0, v4
1796; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, 0x80000000, v6
1797; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, 0x7fffffff, v5
1798; GFX8-NEXT:    v_max_i32_e32 v6, v6, v9
1799; GFX8-NEXT:    v_min_i32_e32 v5, v6, v5
1800; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
1801; GFX8-NEXT:    s_setpc_b64 s[30:31]
1802;
1803; GFX9-LABEL: v_saddsat_v5i32:
1804; GFX9:       ; %bb.0:
1805; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1806; GFX9-NEXT:    v_add_i32 v0, v0, v5 clamp
1807; GFX9-NEXT:    v_add_i32 v1, v1, v6 clamp
1808; GFX9-NEXT:    v_add_i32 v2, v2, v7 clamp
1809; GFX9-NEXT:    v_add_i32 v3, v3, v8 clamp
1810; GFX9-NEXT:    v_add_i32 v4, v4, v9 clamp
1811; GFX9-NEXT:    s_setpc_b64 s[30:31]
1812;
1813; GFX10PLUS-LABEL: v_saddsat_v5i32:
1814; GFX10PLUS:       ; %bb.0:
1815; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1816; GFX10PLUS-NEXT:    v_add_nc_i32 v0, v0, v5 clamp
1817; GFX10PLUS-NEXT:    v_add_nc_i32 v1, v1, v6 clamp
1818; GFX10PLUS-NEXT:    v_add_nc_i32 v2, v2, v7 clamp
1819; GFX10PLUS-NEXT:    v_add_nc_i32 v3, v3, v8 clamp
1820; GFX10PLUS-NEXT:    v_add_nc_i32 v4, v4, v9 clamp
1821; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
1822  %result = call <5 x i32> @llvm.sadd.sat.v5i32(<5 x i32> %lhs, <5 x i32> %rhs)
1823  ret <5 x i32> %result
1824}
1825
; s_saddsat_v5i32 - signed saturating add of two <5 x i32> vectors in an
; amdgpu_ps function whose operands are marked inreg (they appear as s0-s9 in
; the expected output, and the results are left in s0-s4).
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing by hand.
;  * The GFX6 and GFX8 expectations stay on scalar instructions. Per lane the
;    rhs is clamped to [0x80000000 - min(lhs, 0), 0x7fffffff - max(lhs, 0)]
;    with s_min/s_max/s_sub and then added to the lhs with a plain s_add_i32.
;  * The GFX9 expectations copy the rhs into VGPRs, use v_add_i32 with the
;    clamp modifier, and move each result back with v_readfirstlane_b32.
;  * The GFX10PLUS expectations feed both SGPR operands straight into
;    v_add_nc_i32 ... clamp, so no v_mov copies precede the readfirstlanes.
1826define amdgpu_ps <5 x i32> @s_saddsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inreg %rhs) {
1827; GFX6-LABEL: s_saddsat_v5i32:
1828; GFX6:       ; %bb.0:
1829; GFX6-NEXT:    s_min_i32 s11, s0, 0
1830; GFX6-NEXT:    s_max_i32 s10, s0, 0
1831; GFX6-NEXT:    s_sub_i32 s11, 0x80000000, s11
1832; GFX6-NEXT:    s_sub_i32 s10, 0x7fffffff, s10
1833; GFX6-NEXT:    s_max_i32 s5, s11, s5
1834; GFX6-NEXT:    s_min_i32 s5, s5, s10
1835; GFX6-NEXT:    s_min_i32 s10, s1, 0
1836; GFX6-NEXT:    s_add_i32 s0, s0, s5
1837; GFX6-NEXT:    s_max_i32 s5, s1, 0
1838; GFX6-NEXT:    s_sub_i32 s10, 0x80000000, s10
1839; GFX6-NEXT:    s_sub_i32 s5, 0x7fffffff, s5
1840; GFX6-NEXT:    s_max_i32 s6, s10, s6
1841; GFX6-NEXT:    s_min_i32 s5, s6, s5
1842; GFX6-NEXT:    s_min_i32 s6, s2, 0
1843; GFX6-NEXT:    s_add_i32 s1, s1, s5
1844; GFX6-NEXT:    s_max_i32 s5, s2, 0
1845; GFX6-NEXT:    s_sub_i32 s6, 0x80000000, s6
1846; GFX6-NEXT:    s_sub_i32 s5, 0x7fffffff, s5
1847; GFX6-NEXT:    s_max_i32 s6, s6, s7
1848; GFX6-NEXT:    s_min_i32 s5, s6, s5
1849; GFX6-NEXT:    s_min_i32 s6, s3, 0
1850; GFX6-NEXT:    s_add_i32 s2, s2, s5
1851; GFX6-NEXT:    s_max_i32 s5, s3, 0
1852; GFX6-NEXT:    s_sub_i32 s6, 0x80000000, s6
1853; GFX6-NEXT:    s_sub_i32 s5, 0x7fffffff, s5
1854; GFX6-NEXT:    s_max_i32 s6, s6, s8
1855; GFX6-NEXT:    s_min_i32 s5, s6, s5
1856; GFX6-NEXT:    s_min_i32 s6, s4, 0
1857; GFX6-NEXT:    s_add_i32 s3, s3, s5
1858; GFX6-NEXT:    s_max_i32 s5, s4, 0
1859; GFX6-NEXT:    s_sub_i32 s6, 0x80000000, s6
1860; GFX6-NEXT:    s_sub_i32 s5, 0x7fffffff, s5
1861; GFX6-NEXT:    s_max_i32 s6, s6, s9
1862; GFX6-NEXT:    s_min_i32 s5, s6, s5
1863; GFX6-NEXT:    s_add_i32 s4, s4, s5
1864; GFX6-NEXT:    ; return to shader part epilog
1865;
1866; GFX8-LABEL: s_saddsat_v5i32:
1867; GFX8:       ; %bb.0:
1868; GFX8-NEXT:    s_min_i32 s11, s0, 0
1869; GFX8-NEXT:    s_max_i32 s10, s0, 0
1870; GFX8-NEXT:    s_sub_i32 s11, 0x80000000, s11
1871; GFX8-NEXT:    s_sub_i32 s10, 0x7fffffff, s10
1872; GFX8-NEXT:    s_max_i32 s5, s11, s5
1873; GFX8-NEXT:    s_min_i32 s5, s5, s10
1874; GFX8-NEXT:    s_min_i32 s10, s1, 0
1875; GFX8-NEXT:    s_add_i32 s0, s0, s5
1876; GFX8-NEXT:    s_max_i32 s5, s1, 0
1877; GFX8-NEXT:    s_sub_i32 s10, 0x80000000, s10
1878; GFX8-NEXT:    s_sub_i32 s5, 0x7fffffff, s5
1879; GFX8-NEXT:    s_max_i32 s6, s10, s6
1880; GFX8-NEXT:    s_min_i32 s5, s6, s5
1881; GFX8-NEXT:    s_min_i32 s6, s2, 0
1882; GFX8-NEXT:    s_add_i32 s1, s1, s5
1883; GFX8-NEXT:    s_max_i32 s5, s2, 0
1884; GFX8-NEXT:    s_sub_i32 s6, 0x80000000, s6
1885; GFX8-NEXT:    s_sub_i32 s5, 0x7fffffff, s5
1886; GFX8-NEXT:    s_max_i32 s6, s6, s7
1887; GFX8-NEXT:    s_min_i32 s5, s6, s5
1888; GFX8-NEXT:    s_min_i32 s6, s3, 0
1889; GFX8-NEXT:    s_add_i32 s2, s2, s5
1890; GFX8-NEXT:    s_max_i32 s5, s3, 0
1891; GFX8-NEXT:    s_sub_i32 s6, 0x80000000, s6
1892; GFX8-NEXT:    s_sub_i32 s5, 0x7fffffff, s5
1893; GFX8-NEXT:    s_max_i32 s6, s6, s8
1894; GFX8-NEXT:    s_min_i32 s5, s6, s5
1895; GFX8-NEXT:    s_min_i32 s6, s4, 0
1896; GFX8-NEXT:    s_add_i32 s3, s3, s5
1897; GFX8-NEXT:    s_max_i32 s5, s4, 0
1898; GFX8-NEXT:    s_sub_i32 s6, 0x80000000, s6
1899; GFX8-NEXT:    s_sub_i32 s5, 0x7fffffff, s5
1900; GFX8-NEXT:    s_max_i32 s6, s6, s9
1901; GFX8-NEXT:    s_min_i32 s5, s6, s5
1902; GFX8-NEXT:    s_add_i32 s4, s4, s5
1903; GFX8-NEXT:    ; return to shader part epilog
1904;
1905; GFX9-LABEL: s_saddsat_v5i32:
1906; GFX9:       ; %bb.0:
1907; GFX9-NEXT:    v_mov_b32_e32 v0, s5
1908; GFX9-NEXT:    v_mov_b32_e32 v1, s6
1909; GFX9-NEXT:    v_mov_b32_e32 v2, s7
1910; GFX9-NEXT:    v_mov_b32_e32 v3, s8
1911; GFX9-NEXT:    v_mov_b32_e32 v4, s9
1912; GFX9-NEXT:    v_add_i32 v0, s0, v0 clamp
1913; GFX9-NEXT:    v_add_i32 v1, s1, v1 clamp
1914; GFX9-NEXT:    v_add_i32 v2, s2, v2 clamp
1915; GFX9-NEXT:    v_add_i32 v3, s3, v3 clamp
1916; GFX9-NEXT:    v_add_i32 v4, s4, v4 clamp
1917; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
1918; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
1919; GFX9-NEXT:    v_readfirstlane_b32 s2, v2
1920; GFX9-NEXT:    v_readfirstlane_b32 s3, v3
1921; GFX9-NEXT:    v_readfirstlane_b32 s4, v4
1922; GFX9-NEXT:    ; return to shader part epilog
1923;
1924; GFX10PLUS-LABEL: s_saddsat_v5i32:
1925; GFX10PLUS:       ; %bb.0:
1926; GFX10PLUS-NEXT:    v_add_nc_i32 v0, s0, s5 clamp
1927; GFX10PLUS-NEXT:    v_add_nc_i32 v1, s1, s6 clamp
1928; GFX10PLUS-NEXT:    v_add_nc_i32 v2, s2, s7 clamp
1929; GFX10PLUS-NEXT:    v_add_nc_i32 v3, s3, s8 clamp
1930; GFX10PLUS-NEXT:    v_add_nc_i32 v4, s4, s9 clamp
1931; GFX10PLUS-NEXT:    v_readfirstlane_b32 s0, v0
1932; GFX10PLUS-NEXT:    v_readfirstlane_b32 s1, v1
1933; GFX10PLUS-NEXT:    v_readfirstlane_b32 s2, v2
1934; GFX10PLUS-NEXT:    v_readfirstlane_b32 s3, v3
1935; GFX10PLUS-NEXT:    v_readfirstlane_b32 s4, v4
1936; GFX10PLUS-NEXT:    ; return to shader part epilog
1937  %result = call <5 x i32> @llvm.sadd.sat.v5i32(<5 x i32> %lhs, <5 x i32> %rhs)
1938  ret <5 x i32> %result
1939}
1940
1941define <16 x i32> @v_saddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
1942; GFX6-LABEL: v_saddsat_v16i32:
1943; GFX6:       ; %bb.0:
1944; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1945; GFX6-NEXT:    v_min_i32_e32 v32, 0, v0
1946; GFX6-NEXT:    v_bfrev_b32_e32 v31, 1
1947; GFX6-NEXT:    v_sub_i32_e32 v32, vcc, v31, v32
1948; GFX6-NEXT:    v_max_i32_e32 v32, v32, v16
1949; GFX6-NEXT:    v_max_i32_e32 v33, 0, v0
1950; GFX6-NEXT:    v_bfrev_b32_e32 v16, -2
1951; GFX6-NEXT:    v_sub_i32_e32 v33, vcc, v16, v33
1952; GFX6-NEXT:    v_min_i32_e32 v32, v32, v33
1953; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v32
1954; GFX6-NEXT:    v_min_i32_e32 v32, 0, v1
1955; GFX6-NEXT:    v_sub_i32_e32 v32, vcc, v31, v32
1956; GFX6-NEXT:    v_max_i32_e32 v17, v32, v17
1957; GFX6-NEXT:    v_max_i32_e32 v32, 0, v1
1958; GFX6-NEXT:    v_sub_i32_e32 v32, vcc, v16, v32
1959; GFX6-NEXT:    v_min_i32_e32 v17, v17, v32
1960; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v17
1961; GFX6-NEXT:    v_min_i32_e32 v17, 0, v2
1962; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v31, v17
1963; GFX6-NEXT:    v_max_i32_e32 v17, v17, v18
1964; GFX6-NEXT:    v_max_i32_e32 v18, 0, v2
1965; GFX6-NEXT:    v_sub_i32_e32 v18, vcc, v16, v18
1966; GFX6-NEXT:    v_min_i32_e32 v17, v17, v18
1967; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v17
1968; GFX6-NEXT:    v_min_i32_e32 v17, 0, v3
1969; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v31, v17
1970; GFX6-NEXT:    v_max_i32_e32 v17, v17, v19
1971; GFX6-NEXT:    buffer_load_dword v19, off, s[0:3], s32
1972; GFX6-NEXT:    v_max_i32_e32 v18, 0, v3
1973; GFX6-NEXT:    v_sub_i32_e32 v18, vcc, v16, v18
1974; GFX6-NEXT:    v_min_i32_e32 v17, v17, v18
1975; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v17
1976; GFX6-NEXT:    v_min_i32_e32 v17, 0, v4
1977; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v31, v17
1978; GFX6-NEXT:    v_max_i32_e32 v18, 0, v4
1979; GFX6-NEXT:    v_max_i32_e32 v17, v17, v20
1980; GFX6-NEXT:    v_sub_i32_e32 v18, vcc, v16, v18
1981; GFX6-NEXT:    v_min_i32_e32 v17, v17, v18
1982; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v17
1983; GFX6-NEXT:    v_min_i32_e32 v17, 0, v5
1984; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v31, v17
1985; GFX6-NEXT:    v_max_i32_e32 v18, 0, v5
1986; GFX6-NEXT:    v_max_i32_e32 v17, v17, v21
1987; GFX6-NEXT:    v_sub_i32_e32 v18, vcc, v16, v18
1988; GFX6-NEXT:    v_min_i32_e32 v17, v17, v18
1989; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v17
1990; GFX6-NEXT:    v_min_i32_e32 v17, 0, v6
1991; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v31, v17
1992; GFX6-NEXT:    v_max_i32_e32 v18, 0, v6
1993; GFX6-NEXT:    v_max_i32_e32 v17, v17, v22
1994; GFX6-NEXT:    v_sub_i32_e32 v18, vcc, v16, v18
1995; GFX6-NEXT:    v_min_i32_e32 v17, v17, v18
1996; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v17
1997; GFX6-NEXT:    v_min_i32_e32 v17, 0, v7
1998; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v31, v17
1999; GFX6-NEXT:    v_max_i32_e32 v18, 0, v7
2000; GFX6-NEXT:    v_max_i32_e32 v17, v17, v23
2001; GFX6-NEXT:    v_sub_i32_e32 v18, vcc, v16, v18
2002; GFX6-NEXT:    v_min_i32_e32 v17, v17, v18
2003; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v7, v17
2004; GFX6-NEXT:    v_min_i32_e32 v17, 0, v8
2005; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v31, v17
2006; GFX6-NEXT:    v_max_i32_e32 v18, 0, v8
2007; GFX6-NEXT:    v_max_i32_e32 v17, v17, v24
2008; GFX6-NEXT:    v_sub_i32_e32 v18, vcc, v16, v18
2009; GFX6-NEXT:    v_min_i32_e32 v17, v17, v18
2010; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v8, v17
2011; GFX6-NEXT:    v_min_i32_e32 v17, 0, v9
2012; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v31, v17
2013; GFX6-NEXT:    v_max_i32_e32 v18, 0, v9
2014; GFX6-NEXT:    v_max_i32_e32 v17, v17, v25
2015; GFX6-NEXT:    v_sub_i32_e32 v18, vcc, v16, v18
2016; GFX6-NEXT:    v_min_i32_e32 v17, v17, v18
2017; GFX6-NEXT:    v_add_i32_e32 v9, vcc, v9, v17
2018; GFX6-NEXT:    v_min_i32_e32 v17, 0, v10
2019; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v31, v17
2020; GFX6-NEXT:    v_max_i32_e32 v18, 0, v10
2021; GFX6-NEXT:    v_max_i32_e32 v17, v17, v26
2022; GFX6-NEXT:    v_sub_i32_e32 v18, vcc, v16, v18
2023; GFX6-NEXT:    v_min_i32_e32 v17, v17, v18
2024; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v10, v17
2025; GFX6-NEXT:    v_min_i32_e32 v17, 0, v11
2026; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v31, v17
2027; GFX6-NEXT:    v_max_i32_e32 v18, 0, v11
2028; GFX6-NEXT:    v_max_i32_e32 v17, v17, v27
2029; GFX6-NEXT:    v_sub_i32_e32 v18, vcc, v16, v18
2030; GFX6-NEXT:    v_min_i32_e32 v17, v17, v18
2031; GFX6-NEXT:    v_add_i32_e32 v11, vcc, v11, v17
2032; GFX6-NEXT:    v_min_i32_e32 v17, 0, v12
2033; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v31, v17
2034; GFX6-NEXT:    v_max_i32_e32 v18, 0, v12
2035; GFX6-NEXT:    v_max_i32_e32 v17, v17, v28
2036; GFX6-NEXT:    v_sub_i32_e32 v18, vcc, v16, v18
2037; GFX6-NEXT:    v_min_i32_e32 v17, v17, v18
2038; GFX6-NEXT:    v_add_i32_e32 v12, vcc, v12, v17
2039; GFX6-NEXT:    v_min_i32_e32 v17, 0, v13
2040; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v31, v17
2041; GFX6-NEXT:    v_max_i32_e32 v18, 0, v13
2042; GFX6-NEXT:    v_max_i32_e32 v17, v17, v29
2043; GFX6-NEXT:    v_sub_i32_e32 v18, vcc, v16, v18
2044; GFX6-NEXT:    v_min_i32_e32 v17, v17, v18
2045; GFX6-NEXT:    v_add_i32_e32 v13, vcc, v13, v17
2046; GFX6-NEXT:    v_min_i32_e32 v17, 0, v14
2047; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v31, v17
2048; GFX6-NEXT:    v_max_i32_e32 v18, 0, v14
2049; GFX6-NEXT:    v_max_i32_e32 v17, v17, v30
2050; GFX6-NEXT:    v_sub_i32_e32 v18, vcc, v16, v18
2051; GFX6-NEXT:    v_min_i32_e32 v17, v17, v18
2052; GFX6-NEXT:    v_add_i32_e32 v14, vcc, v14, v17
2053; GFX6-NEXT:    v_max_i32_e32 v17, 0, v15
2054; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, v16, v17
2055; GFX6-NEXT:    v_min_i32_e32 v17, 0, v15
2056; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v31, v17
2057; GFX6-NEXT:    s_waitcnt vmcnt(0)
2058; GFX6-NEXT:    v_max_i32_e32 v17, v17, v19
2059; GFX6-NEXT:    v_min_i32_e32 v16, v17, v16
2060; GFX6-NEXT:    v_add_i32_e32 v15, vcc, v15, v16
2061; GFX6-NEXT:    s_setpc_b64 s[30:31]
2062;
2063; GFX8-LABEL: v_saddsat_v16i32:
2064; GFX8:       ; %bb.0:
2065; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2066; GFX8-NEXT:    v_min_i32_e32 v32, 0, v0
2067; GFX8-NEXT:    v_bfrev_b32_e32 v31, 1
2068; GFX8-NEXT:    v_sub_u32_e32 v32, vcc, v31, v32
2069; GFX8-NEXT:    v_max_i32_e32 v32, v32, v16
2070; GFX8-NEXT:    v_max_i32_e32 v33, 0, v0
2071; GFX8-NEXT:    v_bfrev_b32_e32 v16, -2
2072; GFX8-NEXT:    v_sub_u32_e32 v33, vcc, v16, v33
2073; GFX8-NEXT:    v_min_i32_e32 v32, v32, v33
2074; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v32
2075; GFX8-NEXT:    v_min_i32_e32 v32, 0, v1
2076; GFX8-NEXT:    v_sub_u32_e32 v32, vcc, v31, v32
2077; GFX8-NEXT:    v_max_i32_e32 v17, v32, v17
2078; GFX8-NEXT:    v_max_i32_e32 v32, 0, v1
2079; GFX8-NEXT:    v_sub_u32_e32 v32, vcc, v16, v32
2080; GFX8-NEXT:    v_min_i32_e32 v17, v17, v32
2081; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v17
2082; GFX8-NEXT:    v_min_i32_e32 v17, 0, v2
2083; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v31, v17
2084; GFX8-NEXT:    v_max_i32_e32 v17, v17, v18
2085; GFX8-NEXT:    v_max_i32_e32 v18, 0, v2
2086; GFX8-NEXT:    v_sub_u32_e32 v18, vcc, v16, v18
2087; GFX8-NEXT:    v_min_i32_e32 v17, v17, v18
2088; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v17
2089; GFX8-NEXT:    v_min_i32_e32 v17, 0, v3
2090; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v31, v17
2091; GFX8-NEXT:    v_max_i32_e32 v17, v17, v19
2092; GFX8-NEXT:    buffer_load_dword v19, off, s[0:3], s32
2093; GFX8-NEXT:    v_max_i32_e32 v18, 0, v3
2094; GFX8-NEXT:    v_sub_u32_e32 v18, vcc, v16, v18
2095; GFX8-NEXT:    v_min_i32_e32 v17, v17, v18
2096; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v17
2097; GFX8-NEXT:    v_min_i32_e32 v17, 0, v4
2098; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v31, v17
2099; GFX8-NEXT:    v_max_i32_e32 v18, 0, v4
2100; GFX8-NEXT:    v_max_i32_e32 v17, v17, v20
2101; GFX8-NEXT:    v_sub_u32_e32 v18, vcc, v16, v18
2102; GFX8-NEXT:    v_min_i32_e32 v17, v17, v18
2103; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v17
2104; GFX8-NEXT:    v_min_i32_e32 v17, 0, v5
2105; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v31, v17
2106; GFX8-NEXT:    v_max_i32_e32 v18, 0, v5
2107; GFX8-NEXT:    v_max_i32_e32 v17, v17, v21
2108; GFX8-NEXT:    v_sub_u32_e32 v18, vcc, v16, v18
2109; GFX8-NEXT:    v_min_i32_e32 v17, v17, v18
2110; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v17
2111; GFX8-NEXT:    v_min_i32_e32 v17, 0, v6
2112; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v31, v17
2113; GFX8-NEXT:    v_max_i32_e32 v18, 0, v6
2114; GFX8-NEXT:    v_max_i32_e32 v17, v17, v22
2115; GFX8-NEXT:    v_sub_u32_e32 v18, vcc, v16, v18
2116; GFX8-NEXT:    v_min_i32_e32 v17, v17, v18
2117; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v17
2118; GFX8-NEXT:    v_min_i32_e32 v17, 0, v7
2119; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v31, v17
2120; GFX8-NEXT:    v_max_i32_e32 v18, 0, v7
2121; GFX8-NEXT:    v_max_i32_e32 v17, v17, v23
2122; GFX8-NEXT:    v_sub_u32_e32 v18, vcc, v16, v18
2123; GFX8-NEXT:    v_min_i32_e32 v17, v17, v18
2124; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v17
2125; GFX8-NEXT:    v_min_i32_e32 v17, 0, v8
2126; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v31, v17
2127; GFX8-NEXT:    v_max_i32_e32 v18, 0, v8
2128; GFX8-NEXT:    v_max_i32_e32 v17, v17, v24
2129; GFX8-NEXT:    v_sub_u32_e32 v18, vcc, v16, v18
2130; GFX8-NEXT:    v_min_i32_e32 v17, v17, v18
2131; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v17
2132; GFX8-NEXT:    v_min_i32_e32 v17, 0, v9
2133; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v31, v17
2134; GFX8-NEXT:    v_max_i32_e32 v18, 0, v9
2135; GFX8-NEXT:    v_max_i32_e32 v17, v17, v25
2136; GFX8-NEXT:    v_sub_u32_e32 v18, vcc, v16, v18
2137; GFX8-NEXT:    v_min_i32_e32 v17, v17, v18
2138; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v17
2139; GFX8-NEXT:    v_min_i32_e32 v17, 0, v10
2140; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v31, v17
2141; GFX8-NEXT:    v_max_i32_e32 v18, 0, v10
2142; GFX8-NEXT:    v_max_i32_e32 v17, v17, v26
2143; GFX8-NEXT:    v_sub_u32_e32 v18, vcc, v16, v18
2144; GFX8-NEXT:    v_min_i32_e32 v17, v17, v18
2145; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v10, v17
2146; GFX8-NEXT:    v_min_i32_e32 v17, 0, v11
2147; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v31, v17
2148; GFX8-NEXT:    v_max_i32_e32 v18, 0, v11
2149; GFX8-NEXT:    v_max_i32_e32 v17, v17, v27
2150; GFX8-NEXT:    v_sub_u32_e32 v18, vcc, v16, v18
2151; GFX8-NEXT:    v_min_i32_e32 v17, v17, v18
2152; GFX8-NEXT:    v_add_u32_e32 v11, vcc, v11, v17
2153; GFX8-NEXT:    v_min_i32_e32 v17, 0, v12
2154; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v31, v17
2155; GFX8-NEXT:    v_max_i32_e32 v18, 0, v12
2156; GFX8-NEXT:    v_max_i32_e32 v17, v17, v28
2157; GFX8-NEXT:    v_sub_u32_e32 v18, vcc, v16, v18
2158; GFX8-NEXT:    v_min_i32_e32 v17, v17, v18
2159; GFX8-NEXT:    v_add_u32_e32 v12, vcc, v12, v17
2160; GFX8-NEXT:    v_min_i32_e32 v17, 0, v13
2161; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v31, v17
2162; GFX8-NEXT:    v_max_i32_e32 v18, 0, v13
2163; GFX8-NEXT:    v_max_i32_e32 v17, v17, v29
2164; GFX8-NEXT:    v_sub_u32_e32 v18, vcc, v16, v18
2165; GFX8-NEXT:    v_min_i32_e32 v17, v17, v18
2166; GFX8-NEXT:    v_add_u32_e32 v13, vcc, v13, v17
2167; GFX8-NEXT:    v_min_i32_e32 v17, 0, v14
2168; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v31, v17
2169; GFX8-NEXT:    v_max_i32_e32 v18, 0, v14
2170; GFX8-NEXT:    v_max_i32_e32 v17, v17, v30
2171; GFX8-NEXT:    v_sub_u32_e32 v18, vcc, v16, v18
2172; GFX8-NEXT:    v_min_i32_e32 v17, v17, v18
2173; GFX8-NEXT:    v_add_u32_e32 v14, vcc, v14, v17
2174; GFX8-NEXT:    v_max_i32_e32 v17, 0, v15
2175; GFX8-NEXT:    v_sub_u32_e32 v16, vcc, v16, v17
2176; GFX8-NEXT:    v_min_i32_e32 v17, 0, v15
2177; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v31, v17
2178; GFX8-NEXT:    s_waitcnt vmcnt(0)
2179; GFX8-NEXT:    v_max_i32_e32 v17, v17, v19
2180; GFX8-NEXT:    v_min_i32_e32 v16, v17, v16
2181; GFX8-NEXT:    v_add_u32_e32 v15, vcc, v15, v16
2182; GFX8-NEXT:    s_setpc_b64 s[30:31]
2183;
2184; GFX9-LABEL: v_saddsat_v16i32:
2185; GFX9:       ; %bb.0:
2186; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2187; GFX9-NEXT:    v_add_i32 v0, v0, v16 clamp
2188; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32
2189; GFX9-NEXT:    v_add_i32 v1, v1, v17 clamp
2190; GFX9-NEXT:    v_add_i32 v2, v2, v18 clamp
2191; GFX9-NEXT:    v_add_i32 v3, v3, v19 clamp
2192; GFX9-NEXT:    v_add_i32 v4, v4, v20 clamp
2193; GFX9-NEXT:    v_add_i32 v5, v5, v21 clamp
2194; GFX9-NEXT:    v_add_i32 v6, v6, v22 clamp
2195; GFX9-NEXT:    v_add_i32 v7, v7, v23 clamp
2196; GFX9-NEXT:    v_add_i32 v8, v8, v24 clamp
2197; GFX9-NEXT:    v_add_i32 v9, v9, v25 clamp
2198; GFX9-NEXT:    v_add_i32 v10, v10, v26 clamp
2199; GFX9-NEXT:    v_add_i32 v11, v11, v27 clamp
2200; GFX9-NEXT:    v_add_i32 v12, v12, v28 clamp
2201; GFX9-NEXT:    v_add_i32 v13, v13, v29 clamp
2202; GFX9-NEXT:    v_add_i32 v14, v14, v30 clamp
2203; GFX9-NEXT:    s_waitcnt vmcnt(0)
2204; GFX9-NEXT:    v_add_i32 v15, v15, v16 clamp
2205; GFX9-NEXT:    s_setpc_b64 s[30:31]
2206;
2207; GFX10-LABEL: v_saddsat_v16i32:
2208; GFX10:       ; %bb.0:
2209; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2210; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
2211; GFX10-NEXT:    v_add_nc_i32 v0, v0, v16 clamp
2212; GFX10-NEXT:    v_add_nc_i32 v1, v1, v17 clamp
2213; GFX10-NEXT:    v_add_nc_i32 v2, v2, v18 clamp
2214; GFX10-NEXT:    v_add_nc_i32 v3, v3, v19 clamp
2215; GFX10-NEXT:    v_add_nc_i32 v4, v4, v20 clamp
2216; GFX10-NEXT:    v_add_nc_i32 v5, v5, v21 clamp
2217; GFX10-NEXT:    v_add_nc_i32 v6, v6, v22 clamp
2218; GFX10-NEXT:    v_add_nc_i32 v7, v7, v23 clamp
2219; GFX10-NEXT:    v_add_nc_i32 v8, v8, v24 clamp
2220; GFX10-NEXT:    v_add_nc_i32 v9, v9, v25 clamp
2221; GFX10-NEXT:    v_add_nc_i32 v10, v10, v26 clamp
2222; GFX10-NEXT:    v_add_nc_i32 v11, v11, v27 clamp
2223; GFX10-NEXT:    v_add_nc_i32 v12, v12, v28 clamp
2224; GFX10-NEXT:    v_add_nc_i32 v13, v13, v29 clamp
2225; GFX10-NEXT:    v_add_nc_i32 v14, v14, v30 clamp
2226; GFX10-NEXT:    s_waitcnt vmcnt(0)
2227; GFX10-NEXT:    v_add_nc_i32 v15, v15, v31 clamp
2228; GFX10-NEXT:    s_setpc_b64 s[30:31]
2229;
2230; GFX11-LABEL: v_saddsat_v16i32:
2231; GFX11:       ; %bb.0:
2232; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2233; GFX11-NEXT:    scratch_load_b32 v31, off, s32
2234; GFX11-NEXT:    v_add_nc_i32 v0, v0, v16 clamp
2235; GFX11-NEXT:    v_add_nc_i32 v1, v1, v17 clamp
2236; GFX11-NEXT:    v_add_nc_i32 v2, v2, v18 clamp
2237; GFX11-NEXT:    v_add_nc_i32 v3, v3, v19 clamp
2238; GFX11-NEXT:    v_add_nc_i32 v4, v4, v20 clamp
2239; GFX11-NEXT:    v_add_nc_i32 v5, v5, v21 clamp
2240; GFX11-NEXT:    v_add_nc_i32 v6, v6, v22 clamp
2241; GFX11-NEXT:    v_add_nc_i32 v7, v7, v23 clamp
2242; GFX11-NEXT:    v_add_nc_i32 v8, v8, v24 clamp
2243; GFX11-NEXT:    v_add_nc_i32 v9, v9, v25 clamp
2244; GFX11-NEXT:    v_add_nc_i32 v10, v10, v26 clamp
2245; GFX11-NEXT:    v_add_nc_i32 v11, v11, v27 clamp
2246; GFX11-NEXT:    v_add_nc_i32 v12, v12, v28 clamp
2247; GFX11-NEXT:    v_add_nc_i32 v13, v13, v29 clamp
2248; GFX11-NEXT:    v_add_nc_i32 v14, v14, v30 clamp
2249; GFX11-NEXT:    s_waitcnt vmcnt(0)
2250; GFX11-NEXT:    v_add_nc_i32 v15, v15, v31 clamp
2251; GFX11-NEXT:    s_setpc_b64 s[30:31]
2252  %result = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs)
2253  ret <16 x i32> %result
2254}
2255
; Signed saturating add of two <16 x i32> vectors in an amdgpu_ps function
; with both operands marked inreg. In the expected output below, %lhs is read
; from s0-s15 and %rhs from s16-s31, and the result is left in s0-s15.
; - GFX6 and GFX8 expect a per-element scalar sequence in which the rhs
;   element is clamped between (0x80000000 - min(lhs, 0)) and
;   (0x7fffffff - max(lhs, 0)) and then added to lhs with s_add_i32.
; - GFX9 expects rhs copied into VGPRs, v_add_i32 with clamp, and then
;   v_readfirstlane_b32 back into s0-s15.
; - GFX10 and GFX11 share the GFX10PLUS checks, where v_add_nc_i32 with clamp
;   takes both SGPR operands directly before the v_readfirstlane_b32 copies.
2256define amdgpu_ps <16 x i32> @s_saddsat_v16i32(<16 x i32> inreg %lhs, <16 x i32> inreg %rhs) {
2257; GFX6-LABEL: s_saddsat_v16i32:
2258; GFX6:       ; %bb.0:
2259; GFX6-NEXT:    s_min_i32 s33, s0, 0
2260; GFX6-NEXT:    s_max_i32 s32, s0, 0
2261; GFX6-NEXT:    s_sub_i32 s33, 0x80000000, s33
2262; GFX6-NEXT:    s_sub_i32 s32, 0x7fffffff, s32
2263; GFX6-NEXT:    s_max_i32 s16, s33, s16
2264; GFX6-NEXT:    s_min_i32 s16, s16, s32
2265; GFX6-NEXT:    s_min_i32 s32, s1, 0
2266; GFX6-NEXT:    s_add_i32 s0, s0, s16
2267; GFX6-NEXT:    s_max_i32 s16, s1, 0
2268; GFX6-NEXT:    s_sub_i32 s32, 0x80000000, s32
2269; GFX6-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
2270; GFX6-NEXT:    s_max_i32 s17, s32, s17
2271; GFX6-NEXT:    s_min_i32 s16, s17, s16
2272; GFX6-NEXT:    s_min_i32 s17, s2, 0
2273; GFX6-NEXT:    s_add_i32 s1, s1, s16
2274; GFX6-NEXT:    s_max_i32 s16, s2, 0
2275; GFX6-NEXT:    s_sub_i32 s17, 0x80000000, s17
2276; GFX6-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
2277; GFX6-NEXT:    s_max_i32 s17, s17, s18
2278; GFX6-NEXT:    s_min_i32 s16, s17, s16
2279; GFX6-NEXT:    s_min_i32 s17, s3, 0
2280; GFX6-NEXT:    s_add_i32 s2, s2, s16
2281; GFX6-NEXT:    s_max_i32 s16, s3, 0
2282; GFX6-NEXT:    s_sub_i32 s17, 0x80000000, s17
2283; GFX6-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
2284; GFX6-NEXT:    s_max_i32 s17, s17, s19
2285; GFX6-NEXT:    s_min_i32 s16, s17, s16
2286; GFX6-NEXT:    s_min_i32 s17, s4, 0
2287; GFX6-NEXT:    s_add_i32 s3, s3, s16
2288; GFX6-NEXT:    s_max_i32 s16, s4, 0
2289; GFX6-NEXT:    s_sub_i32 s17, 0x80000000, s17
2290; GFX6-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
2291; GFX6-NEXT:    s_max_i32 s17, s17, s20
2292; GFX6-NEXT:    s_min_i32 s16, s17, s16
2293; GFX6-NEXT:    s_min_i32 s17, s5, 0
2294; GFX6-NEXT:    s_add_i32 s4, s4, s16
2295; GFX6-NEXT:    s_max_i32 s16, s5, 0
2296; GFX6-NEXT:    s_sub_i32 s17, 0x80000000, s17
2297; GFX6-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
2298; GFX6-NEXT:    s_max_i32 s17, s17, s21
2299; GFX6-NEXT:    s_min_i32 s16, s17, s16
2300; GFX6-NEXT:    s_min_i32 s17, s6, 0
2301; GFX6-NEXT:    s_add_i32 s5, s5, s16
2302; GFX6-NEXT:    s_max_i32 s16, s6, 0
2303; GFX6-NEXT:    s_sub_i32 s17, 0x80000000, s17
2304; GFX6-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
2305; GFX6-NEXT:    s_max_i32 s17, s17, s22
2306; GFX6-NEXT:    s_min_i32 s16, s17, s16
2307; GFX6-NEXT:    s_min_i32 s17, s7, 0
2308; GFX6-NEXT:    s_add_i32 s6, s6, s16
2309; GFX6-NEXT:    s_max_i32 s16, s7, 0
2310; GFX6-NEXT:    s_sub_i32 s17, 0x80000000, s17
2311; GFX6-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
2312; GFX6-NEXT:    s_max_i32 s17, s17, s23
2313; GFX6-NEXT:    s_min_i32 s16, s17, s16
2314; GFX6-NEXT:    s_min_i32 s17, s8, 0
2315; GFX6-NEXT:    s_add_i32 s7, s7, s16
2316; GFX6-NEXT:    s_max_i32 s16, s8, 0
2317; GFX6-NEXT:    s_sub_i32 s17, 0x80000000, s17
2318; GFX6-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
2319; GFX6-NEXT:    s_max_i32 s17, s17, s24
2320; GFX6-NEXT:    s_min_i32 s16, s17, s16
2321; GFX6-NEXT:    s_min_i32 s17, s9, 0
2322; GFX6-NEXT:    s_add_i32 s8, s8, s16
2323; GFX6-NEXT:    s_max_i32 s16, s9, 0
2324; GFX6-NEXT:    s_sub_i32 s17, 0x80000000, s17
2325; GFX6-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
2326; GFX6-NEXT:    s_max_i32 s17, s17, s25
2327; GFX6-NEXT:    s_min_i32 s16, s17, s16
2328; GFX6-NEXT:    s_min_i32 s17, s10, 0
2329; GFX6-NEXT:    s_add_i32 s9, s9, s16
2330; GFX6-NEXT:    s_max_i32 s16, s10, 0
2331; GFX6-NEXT:    s_sub_i32 s17, 0x80000000, s17
2332; GFX6-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
2333; GFX6-NEXT:    s_max_i32 s17, s17, s26
2334; GFX6-NEXT:    s_min_i32 s16, s17, s16
2335; GFX6-NEXT:    s_min_i32 s17, s11, 0
2336; GFX6-NEXT:    s_add_i32 s10, s10, s16
2337; GFX6-NEXT:    s_max_i32 s16, s11, 0
2338; GFX6-NEXT:    s_sub_i32 s17, 0x80000000, s17
2339; GFX6-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
2340; GFX6-NEXT:    s_max_i32 s17, s17, s27
2341; GFX6-NEXT:    s_min_i32 s16, s17, s16
2342; GFX6-NEXT:    s_min_i32 s17, s12, 0
2343; GFX6-NEXT:    s_add_i32 s11, s11, s16
2344; GFX6-NEXT:    s_max_i32 s16, s12, 0
2345; GFX6-NEXT:    s_sub_i32 s17, 0x80000000, s17
2346; GFX6-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
2347; GFX6-NEXT:    s_max_i32 s17, s17, s28
2348; GFX6-NEXT:    s_min_i32 s16, s17, s16
2349; GFX6-NEXT:    s_min_i32 s17, s13, 0
2350; GFX6-NEXT:    s_add_i32 s12, s12, s16
2351; GFX6-NEXT:    s_max_i32 s16, s13, 0
2352; GFX6-NEXT:    s_sub_i32 s17, 0x80000000, s17
2353; GFX6-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
2354; GFX6-NEXT:    s_max_i32 s17, s17, s29
2355; GFX6-NEXT:    s_min_i32 s16, s17, s16
2356; GFX6-NEXT:    s_min_i32 s17, s14, 0
2357; GFX6-NEXT:    s_add_i32 s13, s13, s16
2358; GFX6-NEXT:    s_max_i32 s16, s14, 0
2359; GFX6-NEXT:    s_sub_i32 s17, 0x80000000, s17
2360; GFX6-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
2361; GFX6-NEXT:    s_max_i32 s17, s17, s30
2362; GFX6-NEXT:    s_min_i32 s16, s17, s16
2363; GFX6-NEXT:    s_min_i32 s17, s15, 0
2364; GFX6-NEXT:    s_add_i32 s14, s14, s16
2365; GFX6-NEXT:    s_max_i32 s16, s15, 0
2366; GFX6-NEXT:    s_sub_i32 s17, 0x80000000, s17
2367; GFX6-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
2368; GFX6-NEXT:    s_max_i32 s17, s17, s31
2369; GFX6-NEXT:    s_min_i32 s16, s17, s16
2370; GFX6-NEXT:    s_add_i32 s15, s15, s16
2371; GFX6-NEXT:    ; return to shader part epilog
2372;
2373; GFX8-LABEL: s_saddsat_v16i32:
2374; GFX8:       ; %bb.0:
2375; GFX8-NEXT:    s_min_i32 s33, s0, 0
2376; GFX8-NEXT:    s_max_i32 s32, s0, 0
2377; GFX8-NEXT:    s_sub_i32 s33, 0x80000000, s33
2378; GFX8-NEXT:    s_sub_i32 s32, 0x7fffffff, s32
2379; GFX8-NEXT:    s_max_i32 s16, s33, s16
2380; GFX8-NEXT:    s_min_i32 s16, s16, s32
2381; GFX8-NEXT:    s_min_i32 s32, s1, 0
2382; GFX8-NEXT:    s_add_i32 s0, s0, s16
2383; GFX8-NEXT:    s_max_i32 s16, s1, 0
2384; GFX8-NEXT:    s_sub_i32 s32, 0x80000000, s32
2385; GFX8-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
2386; GFX8-NEXT:    s_max_i32 s17, s32, s17
2387; GFX8-NEXT:    s_min_i32 s16, s17, s16
2388; GFX8-NEXT:    s_min_i32 s17, s2, 0
2389; GFX8-NEXT:    s_add_i32 s1, s1, s16
2390; GFX8-NEXT:    s_max_i32 s16, s2, 0
2391; GFX8-NEXT:    s_sub_i32 s17, 0x80000000, s17
2392; GFX8-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
2393; GFX8-NEXT:    s_max_i32 s17, s17, s18
2394; GFX8-NEXT:    s_min_i32 s16, s17, s16
2395; GFX8-NEXT:    s_min_i32 s17, s3, 0
2396; GFX8-NEXT:    s_add_i32 s2, s2, s16
2397; GFX8-NEXT:    s_max_i32 s16, s3, 0
2398; GFX8-NEXT:    s_sub_i32 s17, 0x80000000, s17
2399; GFX8-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
2400; GFX8-NEXT:    s_max_i32 s17, s17, s19
2401; GFX8-NEXT:    s_min_i32 s16, s17, s16
2402; GFX8-NEXT:    s_min_i32 s17, s4, 0
2403; GFX8-NEXT:    s_add_i32 s3, s3, s16
2404; GFX8-NEXT:    s_max_i32 s16, s4, 0
2405; GFX8-NEXT:    s_sub_i32 s17, 0x80000000, s17
2406; GFX8-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
2407; GFX8-NEXT:    s_max_i32 s17, s17, s20
2408; GFX8-NEXT:    s_min_i32 s16, s17, s16
2409; GFX8-NEXT:    s_min_i32 s17, s5, 0
2410; GFX8-NEXT:    s_add_i32 s4, s4, s16
2411; GFX8-NEXT:    s_max_i32 s16, s5, 0
2412; GFX8-NEXT:    s_sub_i32 s17, 0x80000000, s17
2413; GFX8-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
2414; GFX8-NEXT:    s_max_i32 s17, s17, s21
2415; GFX8-NEXT:    s_min_i32 s16, s17, s16
2416; GFX8-NEXT:    s_min_i32 s17, s6, 0
2417; GFX8-NEXT:    s_add_i32 s5, s5, s16
2418; GFX8-NEXT:    s_max_i32 s16, s6, 0
2419; GFX8-NEXT:    s_sub_i32 s17, 0x80000000, s17
2420; GFX8-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
2421; GFX8-NEXT:    s_max_i32 s17, s17, s22
2422; GFX8-NEXT:    s_min_i32 s16, s17, s16
2423; GFX8-NEXT:    s_min_i32 s17, s7, 0
2424; GFX8-NEXT:    s_add_i32 s6, s6, s16
2425; GFX8-NEXT:    s_max_i32 s16, s7, 0
2426; GFX8-NEXT:    s_sub_i32 s17, 0x80000000, s17
2427; GFX8-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
2428; GFX8-NEXT:    s_max_i32 s17, s17, s23
2429; GFX8-NEXT:    s_min_i32 s16, s17, s16
2430; GFX8-NEXT:    s_min_i32 s17, s8, 0
2431; GFX8-NEXT:    s_add_i32 s7, s7, s16
2432; GFX8-NEXT:    s_max_i32 s16, s8, 0
2433; GFX8-NEXT:    s_sub_i32 s17, 0x80000000, s17
2434; GFX8-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
2435; GFX8-NEXT:    s_max_i32 s17, s17, s24
2436; GFX8-NEXT:    s_min_i32 s16, s17, s16
2437; GFX8-NEXT:    s_min_i32 s17, s9, 0
2438; GFX8-NEXT:    s_add_i32 s8, s8, s16
2439; GFX8-NEXT:    s_max_i32 s16, s9, 0
2440; GFX8-NEXT:    s_sub_i32 s17, 0x80000000, s17
2441; GFX8-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
2442; GFX8-NEXT:    s_max_i32 s17, s17, s25
2443; GFX8-NEXT:    s_min_i32 s16, s17, s16
2444; GFX8-NEXT:    s_min_i32 s17, s10, 0
2445; GFX8-NEXT:    s_add_i32 s9, s9, s16
2446; GFX8-NEXT:    s_max_i32 s16, s10, 0
2447; GFX8-NEXT:    s_sub_i32 s17, 0x80000000, s17
2448; GFX8-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
2449; GFX8-NEXT:    s_max_i32 s17, s17, s26
2450; GFX8-NEXT:    s_min_i32 s16, s17, s16
2451; GFX8-NEXT:    s_min_i32 s17, s11, 0
2452; GFX8-NEXT:    s_add_i32 s10, s10, s16
2453; GFX8-NEXT:    s_max_i32 s16, s11, 0
2454; GFX8-NEXT:    s_sub_i32 s17, 0x80000000, s17
2455; GFX8-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
2456; GFX8-NEXT:    s_max_i32 s17, s17, s27
2457; GFX8-NEXT:    s_min_i32 s16, s17, s16
2458; GFX8-NEXT:    s_min_i32 s17, s12, 0
2459; GFX8-NEXT:    s_add_i32 s11, s11, s16
2460; GFX8-NEXT:    s_max_i32 s16, s12, 0
2461; GFX8-NEXT:    s_sub_i32 s17, 0x80000000, s17
2462; GFX8-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
2463; GFX8-NEXT:    s_max_i32 s17, s17, s28
2464; GFX8-NEXT:    s_min_i32 s16, s17, s16
2465; GFX8-NEXT:    s_min_i32 s17, s13, 0
2466; GFX8-NEXT:    s_add_i32 s12, s12, s16
2467; GFX8-NEXT:    s_max_i32 s16, s13, 0
2468; GFX8-NEXT:    s_sub_i32 s17, 0x80000000, s17
2469; GFX8-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
2470; GFX8-NEXT:    s_max_i32 s17, s17, s29
2471; GFX8-NEXT:    s_min_i32 s16, s17, s16
2472; GFX8-NEXT:    s_min_i32 s17, s14, 0
2473; GFX8-NEXT:    s_add_i32 s13, s13, s16
2474; GFX8-NEXT:    s_max_i32 s16, s14, 0
2475; GFX8-NEXT:    s_sub_i32 s17, 0x80000000, s17
2476; GFX8-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
2477; GFX8-NEXT:    s_max_i32 s17, s17, s30
2478; GFX8-NEXT:    s_min_i32 s16, s17, s16
2479; GFX8-NEXT:    s_min_i32 s17, s15, 0
2480; GFX8-NEXT:    s_add_i32 s14, s14, s16
2481; GFX8-NEXT:    s_max_i32 s16, s15, 0
2482; GFX8-NEXT:    s_sub_i32 s17, 0x80000000, s17
2483; GFX8-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
2484; GFX8-NEXT:    s_max_i32 s17, s17, s31
2485; GFX8-NEXT:    s_min_i32 s16, s17, s16
2486; GFX8-NEXT:    s_add_i32 s15, s15, s16
2487; GFX8-NEXT:    ; return to shader part epilog
2488;
2489; GFX9-LABEL: s_saddsat_v16i32:
2490; GFX9:       ; %bb.0:
2491; GFX9-NEXT:    v_mov_b32_e32 v0, s16
2492; GFX9-NEXT:    v_mov_b32_e32 v1, s17
2493; GFX9-NEXT:    v_mov_b32_e32 v2, s18
2494; GFX9-NEXT:    v_mov_b32_e32 v3, s19
2495; GFX9-NEXT:    v_mov_b32_e32 v4, s20
2496; GFX9-NEXT:    v_mov_b32_e32 v5, s21
2497; GFX9-NEXT:    v_mov_b32_e32 v6, s22
2498; GFX9-NEXT:    v_mov_b32_e32 v7, s23
2499; GFX9-NEXT:    v_mov_b32_e32 v8, s24
2500; GFX9-NEXT:    v_mov_b32_e32 v9, s25
2501; GFX9-NEXT:    v_mov_b32_e32 v10, s26
2502; GFX9-NEXT:    v_mov_b32_e32 v11, s27
2503; GFX9-NEXT:    v_mov_b32_e32 v12, s28
2504; GFX9-NEXT:    v_mov_b32_e32 v13, s29
2505; GFX9-NEXT:    v_mov_b32_e32 v14, s30
2506; GFX9-NEXT:    v_mov_b32_e32 v15, s31
2507; GFX9-NEXT:    v_add_i32 v0, s0, v0 clamp
2508; GFX9-NEXT:    v_add_i32 v1, s1, v1 clamp
2509; GFX9-NEXT:    v_add_i32 v2, s2, v2 clamp
2510; GFX9-NEXT:    v_add_i32 v3, s3, v3 clamp
2511; GFX9-NEXT:    v_add_i32 v4, s4, v4 clamp
2512; GFX9-NEXT:    v_add_i32 v5, s5, v5 clamp
2513; GFX9-NEXT:    v_add_i32 v6, s6, v6 clamp
2514; GFX9-NEXT:    v_add_i32 v7, s7, v7 clamp
2515; GFX9-NEXT:    v_add_i32 v8, s8, v8 clamp
2516; GFX9-NEXT:    v_add_i32 v9, s9, v9 clamp
2517; GFX9-NEXT:    v_add_i32 v10, s10, v10 clamp
2518; GFX9-NEXT:    v_add_i32 v11, s11, v11 clamp
2519; GFX9-NEXT:    v_add_i32 v12, s12, v12 clamp
2520; GFX9-NEXT:    v_add_i32 v13, s13, v13 clamp
2521; GFX9-NEXT:    v_add_i32 v14, s14, v14 clamp
2522; GFX9-NEXT:    v_add_i32 v15, s15, v15 clamp
2523; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
2524; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
2525; GFX9-NEXT:    v_readfirstlane_b32 s2, v2
2526; GFX9-NEXT:    v_readfirstlane_b32 s3, v3
2527; GFX9-NEXT:    v_readfirstlane_b32 s4, v4
2528; GFX9-NEXT:    v_readfirstlane_b32 s5, v5
2529; GFX9-NEXT:    v_readfirstlane_b32 s6, v6
2530; GFX9-NEXT:    v_readfirstlane_b32 s7, v7
2531; GFX9-NEXT:    v_readfirstlane_b32 s8, v8
2532; GFX9-NEXT:    v_readfirstlane_b32 s9, v9
2533; GFX9-NEXT:    v_readfirstlane_b32 s10, v10
2534; GFX9-NEXT:    v_readfirstlane_b32 s11, v11
2535; GFX9-NEXT:    v_readfirstlane_b32 s12, v12
2536; GFX9-NEXT:    v_readfirstlane_b32 s13, v13
2537; GFX9-NEXT:    v_readfirstlane_b32 s14, v14
2538; GFX9-NEXT:    v_readfirstlane_b32 s15, v15
2539; GFX9-NEXT:    ; return to shader part epilog
2540;
2541; GFX10PLUS-LABEL: s_saddsat_v16i32:
2542; GFX10PLUS:       ; %bb.0:
2543; GFX10PLUS-NEXT:    v_add_nc_i32 v0, s0, s16 clamp
2544; GFX10PLUS-NEXT:    v_add_nc_i32 v1, s1, s17 clamp
2545; GFX10PLUS-NEXT:    v_add_nc_i32 v2, s2, s18 clamp
2546; GFX10PLUS-NEXT:    v_add_nc_i32 v3, s3, s19 clamp
2547; GFX10PLUS-NEXT:    v_add_nc_i32 v4, s4, s20 clamp
2548; GFX10PLUS-NEXT:    v_add_nc_i32 v5, s5, s21 clamp
2549; GFX10PLUS-NEXT:    v_add_nc_i32 v6, s6, s22 clamp
2550; GFX10PLUS-NEXT:    v_add_nc_i32 v7, s7, s23 clamp
2551; GFX10PLUS-NEXT:    v_add_nc_i32 v8, s8, s24 clamp
2552; GFX10PLUS-NEXT:    v_add_nc_i32 v9, s9, s25 clamp
2553; GFX10PLUS-NEXT:    v_add_nc_i32 v10, s10, s26 clamp
2554; GFX10PLUS-NEXT:    v_add_nc_i32 v11, s11, s27 clamp
2555; GFX10PLUS-NEXT:    v_add_nc_i32 v12, s12, s28 clamp
2556; GFX10PLUS-NEXT:    v_add_nc_i32 v13, s13, s29 clamp
2557; GFX10PLUS-NEXT:    v_add_nc_i32 v14, s14, s30 clamp
2558; GFX10PLUS-NEXT:    v_add_nc_i32 v15, s15, s31 clamp
2559; GFX10PLUS-NEXT:    v_readfirstlane_b32 s0, v0
2560; GFX10PLUS-NEXT:    v_readfirstlane_b32 s1, v1
2561; GFX10PLUS-NEXT:    v_readfirstlane_b32 s2, v2
2562; GFX10PLUS-NEXT:    v_readfirstlane_b32 s3, v3
2563; GFX10PLUS-NEXT:    v_readfirstlane_b32 s4, v4
2564; GFX10PLUS-NEXT:    v_readfirstlane_b32 s5, v5
2565; GFX10PLUS-NEXT:    v_readfirstlane_b32 s6, v6
2566; GFX10PLUS-NEXT:    v_readfirstlane_b32 s7, v7
2567; GFX10PLUS-NEXT:    v_readfirstlane_b32 s8, v8
2568; GFX10PLUS-NEXT:    v_readfirstlane_b32 s9, v9
2569; GFX10PLUS-NEXT:    v_readfirstlane_b32 s10, v10
2570; GFX10PLUS-NEXT:    v_readfirstlane_b32 s11, v11
2571; GFX10PLUS-NEXT:    v_readfirstlane_b32 s12, v12
2572; GFX10PLUS-NEXT:    v_readfirstlane_b32 s13, v13
2573; GFX10PLUS-NEXT:    v_readfirstlane_b32 s14, v14
2574; GFX10PLUS-NEXT:    v_readfirstlane_b32 s15, v15
2575; GFX10PLUS-NEXT:    ; return to shader part epilog
2576  %result = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs)
2577  ret <16 x i32> %result
2578}
2579
; Signed saturating add of two i16 values with the default calling
; convention. The expected code reads %lhs from v0 and %rhs from v1, writes the
; result to v0, and returns with s_setpc_b64.
; - GFX6 expects both operands shifted left by 16, the 32-bit min/max clamp
;   sequence, the add, and an arithmetic shift right by 16 of the sum.
; - GFX8 expects the same clamp sequence on 16-bit VALU instructions, using
;   the bounds 0x8000 and 0x7fff, with no shifts.
; - GFX9 expects a single v_add_i16 with clamp; the GFX10PLUS checks expect a
;   single v_add_nc_i16 with clamp.
2580define i16 @v_saddsat_i16(i16 %lhs, i16 %rhs) {
2581; GFX6-LABEL: v_saddsat_i16:
2582; GFX6:       ; %bb.0:
2583; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2584; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2585; GFX6-NEXT:    v_min_i32_e32 v3, 0, v0
2586; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2587; GFX6-NEXT:    v_max_i32_e32 v2, 0, v0
2588; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 0x80000000, v3
2589; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 0x7fffffff, v2
2590; GFX6-NEXT:    v_max_i32_e32 v1, v3, v1
2591; GFX6-NEXT:    v_min_i32_e32 v1, v1, v2
2592; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
2593; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
2594; GFX6-NEXT:    s_setpc_b64 s[30:31]
2595;
2596; GFX8-LABEL: v_saddsat_i16:
2597; GFX8:       ; %bb.0:
2598; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2599; GFX8-NEXT:    v_min_i16_e32 v3, 0, v0
2600; GFX8-NEXT:    v_max_i16_e32 v2, 0, v0
2601; GFX8-NEXT:    v_sub_u16_e32 v3, 0x8000, v3
2602; GFX8-NEXT:    v_sub_u16_e32 v2, 0x7fff, v2
2603; GFX8-NEXT:    v_max_i16_e32 v1, v3, v1
2604; GFX8-NEXT:    v_min_i16_e32 v1, v1, v2
2605; GFX8-NEXT:    v_add_u16_e32 v0, v0, v1
2606; GFX8-NEXT:    s_setpc_b64 s[30:31]
2607;
2608; GFX9-LABEL: v_saddsat_i16:
2609; GFX9:       ; %bb.0:
2610; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2611; GFX9-NEXT:    v_add_i16 v0, v0, v1 clamp
2612; GFX9-NEXT:    s_setpc_b64 s[30:31]
2613;
2614; GFX10PLUS-LABEL: v_saddsat_i16:
2615; GFX10PLUS:       ; %bb.0:
2616; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2617; GFX10PLUS-NEXT:    v_add_nc_i16 v0, v0, v1 clamp
2618; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
2619  %result = call i16 @llvm.sadd.sat.i16(i16 %lhs, i16 %rhs)
2620  ret i16 %result
2621}
2622
; Signed saturating add of two i16 values in an amdgpu_ps function with both
; operands marked inreg. The expected code reads %lhs from s0 and %rhs from s1
; and leaves the result in s0.
; - GFX6 expects both operands shifted left by 16, the 32-bit scalar min/max
;   clamp sequence, s_add_i32, and s_ashr_i32 by 16.
; - GFX8 expects the operands and intermediate values sign-extended with
;   s_sext_i32_i16 around 32-bit scalar min/max, using the bounds 0xffff8000
;   and 0x7fff.
; - GFX9 expects rhs copied to v0, v_add_i16 with clamp, and
;   v_readfirstlane_b32 into s0.
; - The GFX10PLUS checks expect v_add_nc_i16 with clamp on both SGPRs,
;   followed by v_readfirstlane_b32 into s0.
2623define amdgpu_ps i16 @s_saddsat_i16(i16 inreg %lhs, i16 inreg %rhs) {
2624; GFX6-LABEL: s_saddsat_i16:
2625; GFX6:       ; %bb.0:
2626; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
2627; GFX6-NEXT:    s_min_i32 s3, s0, 0
2628; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
2629; GFX6-NEXT:    s_max_i32 s2, s0, 0
2630; GFX6-NEXT:    s_sub_i32 s3, 0x80000000, s3
2631; GFX6-NEXT:    s_sub_i32 s2, 0x7fffffff, s2
2632; GFX6-NEXT:    s_max_i32 s1, s3, s1
2633; GFX6-NEXT:    s_min_i32 s1, s1, s2
2634; GFX6-NEXT:    s_add_i32 s0, s0, s1
2635; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
2636; GFX6-NEXT:    ; return to shader part epilog
2637;
2638; GFX8-LABEL: s_saddsat_i16:
2639; GFX8:       ; %bb.0:
2640; GFX8-NEXT:    s_sext_i32_i16 s2, s0
2641; GFX8-NEXT:    s_sext_i32_i16 s3, 0
2642; GFX8-NEXT:    s_max_i32 s4, s2, s3
2643; GFX8-NEXT:    s_min_i32 s2, s2, s3
2644; GFX8-NEXT:    s_sub_i32 s2, 0xffff8000, s2
2645; GFX8-NEXT:    s_sext_i32_i16 s2, s2
2646; GFX8-NEXT:    s_sext_i32_i16 s1, s1
2647; GFX8-NEXT:    s_sub_i32 s4, 0x7fff, s4
2648; GFX8-NEXT:    s_max_i32 s1, s2, s1
2649; GFX8-NEXT:    s_sext_i32_i16 s1, s1
2650; GFX8-NEXT:    s_sext_i32_i16 s2, s4
2651; GFX8-NEXT:    s_min_i32 s1, s1, s2
2652; GFX8-NEXT:    s_add_i32 s0, s0, s1
2653; GFX8-NEXT:    ; return to shader part epilog
2654;
2655; GFX9-LABEL: s_saddsat_i16:
2656; GFX9:       ; %bb.0:
2657; GFX9-NEXT:    v_mov_b32_e32 v0, s1
2658; GFX9-NEXT:    v_add_i16 v0, s0, v0 clamp
2659; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
2660; GFX9-NEXT:    ; return to shader part epilog
2661;
2662; GFX10PLUS-LABEL: s_saddsat_i16:
2663; GFX10PLUS:       ; %bb.0:
2664; GFX10PLUS-NEXT:    v_add_nc_i16 v0, s0, s1 clamp
2665; GFX10PLUS-NEXT:    v_readfirstlane_b32 s0, v0
2666; GFX10PLUS-NEXT:    ; return to shader part epilog
2667  %result = call i16 @llvm.sadd.sat.i16(i16 %lhs, i16 %rhs)
2668  ret i16 %result
2669}
2670
; Mixed-operand i16 saturating add in an amdgpu_ps function: %lhs is inreg
; and %rhs is not. The expected code reads %lhs from s0 and %rhs from v0. The
; i16 result is bitcast to half for the return value, and the checks show it
; left in v0 with no v_readfirstlane_b32.
; - GFX6 expects the clamp bounds computed with scalar instructions on the
;   shifted lhs, then applied to rhs with v_max_i32 and v_min_i32.
; - GFX8 expects the bounds computed from the sign-extended lhs with scalar
;   instructions, then applied with v_max_i16 and v_min_i16.
; - GFX9 expects a single v_add_i16 with clamp; the GFX10PLUS checks expect a
;   single v_add_nc_i16 with clamp.
2671define amdgpu_ps half @saddsat_i16_sv(i16 inreg %lhs, i16 %rhs) {
2672; GFX6-LABEL: saddsat_i16_sv:
2673; GFX6:       ; %bb.0:
2674; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
2675; GFX6-NEXT:    s_min_i32 s2, s0, 0
2676; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2677; GFX6-NEXT:    s_max_i32 s1, s0, 0
2678; GFX6-NEXT:    s_sub_i32 s2, 0x80000000, s2
2679; GFX6-NEXT:    s_sub_i32 s1, 0x7fffffff, s1
2680; GFX6-NEXT:    v_max_i32_e32 v0, s2, v0
2681; GFX6-NEXT:    v_min_i32_e32 v0, s1, v0
2682; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
2683; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
2684; GFX6-NEXT:    ; return to shader part epilog
2685;
2686; GFX8-LABEL: saddsat_i16_sv:
2687; GFX8:       ; %bb.0:
2688; GFX8-NEXT:    s_sext_i32_i16 s1, s0
2689; GFX8-NEXT:    s_sext_i32_i16 s2, 0
2690; GFX8-NEXT:    s_max_i32 s3, s1, s2
2691; GFX8-NEXT:    s_min_i32 s1, s1, s2
2692; GFX8-NEXT:    s_sub_i32 s1, 0xffff8000, s1
2693; GFX8-NEXT:    s_sub_i32 s3, 0x7fff, s3
2694; GFX8-NEXT:    v_max_i16_e32 v0, s1, v0
2695; GFX8-NEXT:    v_min_i16_e32 v0, s3, v0
2696; GFX8-NEXT:    v_add_u16_e32 v0, s0, v0
2697; GFX8-NEXT:    ; return to shader part epilog
2698;
2699; GFX9-LABEL: saddsat_i16_sv:
2700; GFX9:       ; %bb.0:
2701; GFX9-NEXT:    v_add_i16 v0, s0, v0 clamp
2702; GFX9-NEXT:    ; return to shader part epilog
2703;
2704; GFX10PLUS-LABEL: saddsat_i16_sv:
2705; GFX10PLUS:       ; %bb.0:
2706; GFX10PLUS-NEXT:    v_add_nc_i16 v0, s0, v0 clamp
2707; GFX10PLUS-NEXT:    ; return to shader part epilog
2708  %result = call i16 @llvm.sadd.sat.i16(i16 %lhs, i16 %rhs)
2709  %cast = bitcast i16 %result to half
2710  ret half %cast
2711}
2712
; Mixed-operand i16 saturating add in an amdgpu_ps function with the operand
; kinds swapped relative to saddsat_i16_sv: %rhs is inreg and %lhs is not.
; The expected code reads %lhs from v0 and %rhs from s0. The i16 result is
; bitcast to half for the return value, and the checks show it left in v0.
; - GFX6 expects the clamp bounds computed with VALU instructions on the
;   shifted lhs, with the shifted s0 used as a source of v_max_i32.
; - GFX8 expects the 16-bit VALU clamp sequence with s0 used directly as a
;   source of v_max_i16.
; - GFX9 expects a single v_add_i16 with clamp; the GFX10PLUS checks expect a
;   single v_add_nc_i16 with clamp.
2713define amdgpu_ps half @saddsat_i16_vs(i16 %lhs, i16 inreg %rhs) {
2714; GFX6-LABEL: saddsat_i16_vs:
2715; GFX6:       ; %bb.0:
2716; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2717; GFX6-NEXT:    v_min_i32_e32 v2, 0, v0
2718; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
2719; GFX6-NEXT:    v_max_i32_e32 v1, 0, v0
2720; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 0x80000000, v2
2721; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, 0x7fffffff, v1
2722; GFX6-NEXT:    v_max_i32_e32 v2, s0, v2
2723; GFX6-NEXT:    v_min_i32_e32 v1, v2, v1
2724; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
2725; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
2726; GFX6-NEXT:    ; return to shader part epilog
2727;
2728; GFX8-LABEL: saddsat_i16_vs:
2729; GFX8:       ; %bb.0:
2730; GFX8-NEXT:    v_min_i16_e32 v2, 0, v0
2731; GFX8-NEXT:    v_max_i16_e32 v1, 0, v0
2732; GFX8-NEXT:    v_sub_u16_e32 v2, 0x8000, v2
2733; GFX8-NEXT:    v_sub_u16_e32 v1, 0x7fff, v1
2734; GFX8-NEXT:    v_max_i16_e32 v2, s0, v2
2735; GFX8-NEXT:    v_min_i16_e32 v1, v2, v1
2736; GFX8-NEXT:    v_add_u16_e32 v0, v0, v1
2737; GFX8-NEXT:    ; return to shader part epilog
2738;
2739; GFX9-LABEL: saddsat_i16_vs:
2740; GFX9:       ; %bb.0:
2741; GFX9-NEXT:    v_add_i16 v0, v0, s0 clamp
2742; GFX9-NEXT:    ; return to shader part epilog
2743;
2744; GFX10PLUS-LABEL: saddsat_i16_vs:
2745; GFX10PLUS:       ; %bb.0:
2746; GFX10PLUS-NEXT:    v_add_nc_i16 v0, v0, s0 clamp
2747; GFX10PLUS-NEXT:    ; return to shader part epilog
2748  %result = call i16 @llvm.sadd.sat.i16(i16 %lhs, i16 %rhs)
2749  %cast = bitcast i16 %result to half
2750  ret half %cast
2751}
2752
; Signed saturating add of two <2 x i16> vectors with the default calling
; convention.
; - GFX6 expects the elements in separate registers (lhs in v0 and v1, rhs in
;   v2 and v3). Each element is shifted left by 16, run through the 32-bit
;   clamp sequence, added, and shifted back with v_ashrrev_i32; the results
;   are left in v0 and v1.
; - GFX8 expects the packed operands in v0 and v1. The low half uses the
;   plain 16-bit VALU clamp sequence, the high half uses SDWA forms with
;   WORD_1 selects, and the halves are combined with v_or_b32.
; - GFX9 and the GFX10PLUS checks expect a single v_pk_add_i16 with clamp.
2753define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
2754; GFX6-LABEL: v_saddsat_v2i16:
2755; GFX6:       ; %bb.0:
2756; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2757; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2758; GFX6-NEXT:    v_min_i32_e32 v5, 0, v0
2759; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
2760; GFX6-NEXT:    v_max_i32_e32 v4, 0, v0
2761; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, 0x80000000, v5
2762; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, 0x7fffffff, v4
2763; GFX6-NEXT:    v_max_i32_e32 v2, v5, v2
2764; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2765; GFX6-NEXT:    v_min_i32_e32 v2, v2, v4
2766; GFX6-NEXT:    v_min_i32_e32 v4, 0, v1
2767; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
2768; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
2769; GFX6-NEXT:    v_max_i32_e32 v3, 0, v1
2770; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, 0x80000000, v4
2771; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 0x7fffffff, v3
2772; GFX6-NEXT:    v_max_i32_e32 v2, v4, v2
2773; GFX6-NEXT:    v_min_i32_e32 v2, v2, v3
2774; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
2775; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
2776; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
2777; GFX6-NEXT:    s_setpc_b64 s[30:31]
2778;
2779; GFX8-LABEL: v_saddsat_v2i16:
2780; GFX8:       ; %bb.0:
2781; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2782; GFX8-NEXT:    v_min_i16_e32 v3, 0, v0
2783; GFX8-NEXT:    v_max_i16_e32 v2, 0, v0
2784; GFX8-NEXT:    v_sub_u16_e32 v3, 0x8000, v3
2785; GFX8-NEXT:    v_sub_u16_e32 v2, 0x7fff, v2
2786; GFX8-NEXT:    v_max_i16_e32 v3, v3, v1
2787; GFX8-NEXT:    v_min_i16_e32 v2, v3, v2
2788; GFX8-NEXT:    v_mov_b32_e32 v3, 0
2789; GFX8-NEXT:    v_max_i16_sdwa v4, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2790; GFX8-NEXT:    v_min_i16_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2791; GFX8-NEXT:    v_sub_u16_e32 v3, 0x8000, v3
2792; GFX8-NEXT:    v_sub_u16_e32 v4, 0x7fff, v4
2793; GFX8-NEXT:    v_max_i16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2794; GFX8-NEXT:    v_min_i16_e32 v1, v1, v4
2795; GFX8-NEXT:    v_add_u16_e32 v2, v0, v2
2796; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2797; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
2798; GFX8-NEXT:    s_setpc_b64 s[30:31]
2799;
2800; GFX9-LABEL: v_saddsat_v2i16:
2801; GFX9:       ; %bb.0:
2802; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2803; GFX9-NEXT:    v_pk_add_i16 v0, v0, v1 clamp
2804; GFX9-NEXT:    s_setpc_b64 s[30:31]
2805;
2806; GFX10PLUS-LABEL: v_saddsat_v2i16:
2807; GFX10PLUS:       ; %bb.0:
2808; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2809; GFX10PLUS-NEXT:    v_pk_add_i16 v0, v0, v1 clamp
2810; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
2811  %result = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
2812  ret <2 x i16> %result
2813}
2814
; Signed saturating add of two <2 x i16> vectors in an amdgpu_ps function
; with both operands marked inreg. The vector result is bitcast to i32 for the
; return value, and every expected sequence leaves it in s0.
; - GFX6 expects the elements in separate SGPRs (lhs in s0 and s1, rhs in s2
;   and s3). Each element goes through the shifted 32-bit scalar clamp
;   sequence, then the two results are masked with 0xffff and packed into s0
;   with a shift and s_or_b32.
; - GFX8 expects the packed operands in s0 and s1. The high halves are
;   extracted with s_lshr_b32 by 16, each half goes through the sign-extended
;   scalar clamp sequence, and the results are masked and packed into s0.
; - GFX9 expects rhs copied to v0, v_pk_add_i16 with clamp, and
;   v_readfirstlane_b32 into s0.
; - The GFX10PLUS checks expect v_pk_add_i16 with clamp on both SGPRs,
;   followed by v_readfirstlane_b32 into s0.
2815define amdgpu_ps i32 @s_saddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs) {
2816; GFX6-LABEL: s_saddsat_v2i16:
2817; GFX6:       ; %bb.0:
2818; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
2819; GFX6-NEXT:    s_min_i32 s5, s0, 0
2820; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
2821; GFX6-NEXT:    s_max_i32 s4, s0, 0
2822; GFX6-NEXT:    s_sub_i32 s5, 0x80000000, s5
2823; GFX6-NEXT:    s_sub_i32 s4, 0x7fffffff, s4
2824; GFX6-NEXT:    s_max_i32 s2, s5, s2
2825; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
2826; GFX6-NEXT:    s_min_i32 s2, s2, s4
2827; GFX6-NEXT:    s_min_i32 s4, s1, 0
2828; GFX6-NEXT:    s_add_i32 s0, s0, s2
2829; GFX6-NEXT:    s_lshl_b32 s2, s3, 16
2830; GFX6-NEXT:    s_max_i32 s3, s1, 0
2831; GFX6-NEXT:    s_sub_i32 s4, 0x80000000, s4
2832; GFX6-NEXT:    s_sub_i32 s3, 0x7fffffff, s3
2833; GFX6-NEXT:    s_max_i32 s2, s4, s2
2834; GFX6-NEXT:    s_min_i32 s2, s2, s3
2835; GFX6-NEXT:    s_add_i32 s1, s1, s2
2836; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
2837; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
2838; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
2839; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
2840; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
2841; GFX6-NEXT:    s_or_b32 s0, s0, s1
2842; GFX6-NEXT:    ; return to shader part epilog
2843;
2844; GFX8-LABEL: s_saddsat_v2i16:
2845; GFX8:       ; %bb.0:
2846; GFX8-NEXT:    s_sext_i32_i16 s4, s0
2847; GFX8-NEXT:    s_sext_i32_i16 s5, 0
2848; GFX8-NEXT:    s_max_i32 s6, s4, s5
2849; GFX8-NEXT:    s_min_i32 s4, s4, s5
2850; GFX8-NEXT:    s_sub_i32 s4, 0xffff8000, s4
2851; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
2852; GFX8-NEXT:    s_sext_i32_i16 s4, s4
2853; GFX8-NEXT:    s_sext_i32_i16 s1, s1
2854; GFX8-NEXT:    s_sub_i32 s6, 0x7fff, s6
2855; GFX8-NEXT:    s_max_i32 s1, s4, s1
2856; GFX8-NEXT:    s_sext_i32_i16 s1, s1
2857; GFX8-NEXT:    s_sext_i32_i16 s4, s6
2858; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
2859; GFX8-NEXT:    s_min_i32 s1, s1, s4
2860; GFX8-NEXT:    s_add_i32 s0, s0, s1
2861; GFX8-NEXT:    s_sext_i32_i16 s1, s2
2862; GFX8-NEXT:    s_max_i32 s4, s1, s5
2863; GFX8-NEXT:    s_min_i32 s1, s1, s5
2864; GFX8-NEXT:    s_sub_i32 s1, 0xffff8000, s1
2865; GFX8-NEXT:    s_sext_i32_i16 s1, s1
2866; GFX8-NEXT:    s_sext_i32_i16 s3, s3
2867; GFX8-NEXT:    s_sub_i32 s4, 0x7fff, s4
2868; GFX8-NEXT:    s_max_i32 s1, s1, s3
2869; GFX8-NEXT:    s_sext_i32_i16 s1, s1
2870; GFX8-NEXT:    s_sext_i32_i16 s3, s4
2871; GFX8-NEXT:    s_min_i32 s1, s1, s3
2872; GFX8-NEXT:    s_add_i32 s2, s2, s1
2873; GFX8-NEXT:    s_and_b32 s1, 0xffff, s2
2874; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
2875; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
2876; GFX8-NEXT:    s_or_b32 s0, s0, s1
2877; GFX8-NEXT:    ; return to shader part epilog
2878;
2879; GFX9-LABEL: s_saddsat_v2i16:
2880; GFX9:       ; %bb.0:
2881; GFX9-NEXT:    v_mov_b32_e32 v0, s1
2882; GFX9-NEXT:    v_pk_add_i16 v0, s0, v0 clamp
2883; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
2884; GFX9-NEXT:    ; return to shader part epilog
2885;
2886; GFX10PLUS-LABEL: s_saddsat_v2i16:
2887; GFX10PLUS:       ; %bb.0:
2888; GFX10PLUS-NEXT:    v_pk_add_i16 v0, s0, s1 clamp
2889; GFX10PLUS-NEXT:    v_readfirstlane_b32 s0, v0
2890; GFX10PLUS-NEXT:    ; return to shader part epilog
2891  %result = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
2892  %cast = bitcast <2 x i16> %result to i32
2893  ret i32 %cast
2894}
2895
; Signed saturating add (llvm.sadd.sat) of two <2 x i16> values in an
; amdgpu_ps function where %lhs is marked inreg and %rhs is not (the "sv"
; operand mix). In the expected output %lhs shows up in s0 (s0/s1 on GFX6)
; and %rhs in v0 (v0/v1 on GFX6).
;
; Expected lowering per target, as captured by the autogenerated checks:
;  - GFX6 expands each element separately: operands are shifted left by 16,
;    the rhs is clamped between (0x80000000 - min(lhs, 0)) and
;    (0x7fffffff - max(lhs, 0)) with 32-bit min/max, added, shifted back and
;    the two halves are repacked with and/shl/or.
;  - GFX8 computes the clamp bounds with scalar 32-bit min/max on the
;    sign-extended lhs halves, then uses 16-bit VALU min/max/add, with SDWA
;    forms for the high half.
;  - GFX9 and GFX10PLUS expect a single v_pk_add_i16 with the clamp modifier.
2896define amdgpu_ps float @saddsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
2897; GFX6-LABEL: saddsat_v2i16_sv:
2898; GFX6:       ; %bb.0:
2899; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
2900; GFX6-NEXT:    s_min_i32 s3, s0, 0
2901; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2902; GFX6-NEXT:    s_max_i32 s2, s0, 0
2903; GFX6-NEXT:    s_sub_i32 s3, 0x80000000, s3
2904; GFX6-NEXT:    s_sub_i32 s2, 0x7fffffff, s2
2905; GFX6-NEXT:    v_max_i32_e32 v0, s3, v0
2906; GFX6-NEXT:    v_min_i32_e32 v0, s2, v0
2907; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
2908; GFX6-NEXT:    s_lshl_b32 s0, s1, 16
2909; GFX6-NEXT:    s_min_i32 s2, s0, 0
2910; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2911; GFX6-NEXT:    s_max_i32 s1, s0, 0
2912; GFX6-NEXT:    s_sub_i32 s2, 0x80000000, s2
2913; GFX6-NEXT:    s_sub_i32 s1, 0x7fffffff, s1
2914; GFX6-NEXT:    v_max_i32_e32 v1, s2, v1
2915; GFX6-NEXT:    v_min_i32_e32 v1, s1, v1
2916; GFX6-NEXT:    v_add_i32_e32 v1, vcc, s0, v1
2917; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
2918; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
2919; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
2920; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
2921; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2922; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
2923; GFX6-NEXT:    ; return to shader part epilog
2924;
2925; GFX8-LABEL: saddsat_v2i16_sv:
2926; GFX8:       ; %bb.0:
2927; GFX8-NEXT:    s_sext_i32_i16 s2, s0
2928; GFX8-NEXT:    s_sext_i32_i16 s3, 0
2929; GFX8-NEXT:    s_max_i32 s4, s2, s3
2930; GFX8-NEXT:    s_min_i32 s2, s2, s3
2931; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
2932; GFX8-NEXT:    s_sub_i32 s2, 0xffff8000, s2
2933; GFX8-NEXT:    s_sub_i32 s4, 0x7fff, s4
2934; GFX8-NEXT:    v_max_i16_e32 v1, s2, v0
2935; GFX8-NEXT:    s_sext_i32_i16 s2, s1
2936; GFX8-NEXT:    v_min_i16_e32 v1, s4, v1
2937; GFX8-NEXT:    s_max_i32 s4, s2, s3
2938; GFX8-NEXT:    s_min_i32 s2, s2, s3
2939; GFX8-NEXT:    s_sub_i32 s2, 0xffff8000, s2
2940; GFX8-NEXT:    v_mov_b32_e32 v2, s2
2941; GFX8-NEXT:    s_sub_i32 s4, 0x7fff, s4
2942; GFX8-NEXT:    v_max_i16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2943; GFX8-NEXT:    v_min_i16_e32 v0, s4, v0
2944; GFX8-NEXT:    v_mov_b32_e32 v2, s1
2945; GFX8-NEXT:    v_add_u16_e32 v1, s0, v1
2946; GFX8-NEXT:    v_add_u16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2947; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
2948; GFX8-NEXT:    ; return to shader part epilog
2949;
2950; GFX9-LABEL: saddsat_v2i16_sv:
2951; GFX9:       ; %bb.0:
2952; GFX9-NEXT:    v_pk_add_i16 v0, s0, v0 clamp
2953; GFX9-NEXT:    ; return to shader part epilog
2954;
2955; GFX10PLUS-LABEL: saddsat_v2i16_sv:
2956; GFX10PLUS:       ; %bb.0:
2957; GFX10PLUS-NEXT:    v_pk_add_i16 v0, s0, v0 clamp
2958; GFX10PLUS-NEXT:    ; return to shader part epilog
2959  %result = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
  ; Reinterpret the packed 2 x 16-bit result as the declared float return type.
2960  %cast = bitcast <2 x i16> %result to float
2961  ret float %cast
2962}
2963
; Signed saturating add (llvm.sadd.sat) of two <2 x i16> values in an
; amdgpu_ps function where %rhs is marked inreg and %lhs is not (the "vs"
; operand mix, mirror image of saddsat_v2i16_sv). In the expected output
; %lhs shows up in v0 (v0/v1 on GFX6) and %rhs in s0 (s0/s1 on GFX6).
;
; Expected lowering per target, as captured by the autogenerated checks:
;  - GFX6 expands each element with 32-bit VALU min/max/sub/add on values
;    shifted left by 16, then shifts back and repacks the halves.
;  - GFX8 uses 16-bit VALU min/max/sub/add, with SDWA forms reading or
;    writing the high half (WORD_1), and a scalar shift to extract the high
;    half of the rhs.
;  - GFX9 and GFX10PLUS expect a single v_pk_add_i16 with the clamp modifier.
2964define amdgpu_ps float @saddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
2965; GFX6-LABEL: saddsat_v2i16_vs:
2966; GFX6:       ; %bb.0:
2967; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2968; GFX6-NEXT:    v_min_i32_e32 v3, 0, v0
2969; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
2970; GFX6-NEXT:    v_max_i32_e32 v2, 0, v0
2971; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 0x80000000, v3
2972; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 0x7fffffff, v2
2973; GFX6-NEXT:    v_max_i32_e32 v3, s0, v3
2974; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2975; GFX6-NEXT:    v_min_i32_e32 v2, v3, v2
2976; GFX6-NEXT:    v_min_i32_e32 v3, 0, v1
2977; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
2978; GFX6-NEXT:    s_lshl_b32 s0, s1, 16
2979; GFX6-NEXT:    v_max_i32_e32 v2, 0, v1
2980; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 0x80000000, v3
2981; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 0x7fffffff, v2
2982; GFX6-NEXT:    v_max_i32_e32 v3, s0, v3
2983; GFX6-NEXT:    v_min_i32_e32 v2, v3, v2
2984; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
2985; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
2986; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
2987; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
2988; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
2989; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2990; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
2991; GFX6-NEXT:    ; return to shader part epilog
2992;
2993; GFX8-LABEL: saddsat_v2i16_vs:
2994; GFX8:       ; %bb.0:
2995; GFX8-NEXT:    v_min_i16_e32 v2, 0, v0
2996; GFX8-NEXT:    v_max_i16_e32 v1, 0, v0
2997; GFX8-NEXT:    v_sub_u16_e32 v2, 0x8000, v2
2998; GFX8-NEXT:    v_sub_u16_e32 v1, 0x7fff, v1
2999; GFX8-NEXT:    v_max_i16_e32 v2, s0, v2
3000; GFX8-NEXT:    v_min_i16_e32 v1, v2, v1
3001; GFX8-NEXT:    v_mov_b32_e32 v2, 0
3002; GFX8-NEXT:    v_max_i16_sdwa v3, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3003; GFX8-NEXT:    v_min_i16_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3004; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
3005; GFX8-NEXT:    v_sub_u16_e32 v2, 0x8000, v2
3006; GFX8-NEXT:    v_sub_u16_e32 v3, 0x7fff, v3
3007; GFX8-NEXT:    v_max_i16_e32 v2, s1, v2
3008; GFX8-NEXT:    v_min_i16_e32 v2, v2, v3
3009; GFX8-NEXT:    v_add_u16_e32 v1, v0, v1
3010; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3011; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
3012; GFX8-NEXT:    ; return to shader part epilog
3013;
3014; GFX9-LABEL: saddsat_v2i16_vs:
3015; GFX9:       ; %bb.0:
3016; GFX9-NEXT:    v_pk_add_i16 v0, v0, s0 clamp
3017; GFX9-NEXT:    ; return to shader part epilog
3018;
3019; GFX10PLUS-LABEL: saddsat_v2i16_vs:
3020; GFX10PLUS:       ; %bb.0:
3021; GFX10PLUS-NEXT:    v_pk_add_i16 v0, v0, s0 clamp
3022; GFX10PLUS-NEXT:    ; return to shader part epilog
3023  %result = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
  ; Reinterpret the packed 2 x 16-bit result as the declared float return type.
3024  %cast = bitcast <2 x i16> %result to float
3025  ret float %cast
3026}
3027
3028; FIXME: The v3i16 tests below are commented out pending v3i16 insert/extract support.
3029; define <3 x i16> @v_saddsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
3030;   %result = call <3 x i16> @llvm.sadd.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs)
3031;   ret <3 x i16> %result
3032; }
3033
3034; define amdgpu_ps <3 x i16> @s_saddsat_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs) {
3035;   %result = call <3 x i16> @llvm.sadd.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs)
3036;   ret <3 x i16> %result
3037; }
3038
; Signed saturating add (llvm.sadd.sat) of two <4 x i16> values in a function
; with the default calling convention and no inreg arguments. Every expected
; sequence starts with s_waitcnt and returns via s_setpc_b64 s[30:31], and
; all arithmetic is on VGPRs.
;
; Expected lowering per target, as captured by the autogenerated checks:
;  - GFX6 expands all four elements separately with 32-bit VALU
;    min/max/sub/add on values shifted left by 16, then repacks pairs of
;    results into v0 and v1. The 0x80000000 and 0x7fffffff bounds are
;    materialized with v_bfrev_b32 (of 1 and -2) and reused.
;  - GFX8 uses 16-bit VALU min/max/sub/add, with SDWA forms for the high
;    half of each 32-bit register.
;  - GFX9 and GFX10PLUS expect two v_pk_add_i16 instructions with clamp, one
;    per 32-bit register.
3039define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
3040; GFX6-LABEL: v_saddsat_v4i16:
3041; GFX6:       ; %bb.0:
3042; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3043; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
3044; GFX6-NEXT:    v_min_i32_e32 v10, 0, v0
3045; GFX6-NEXT:    v_bfrev_b32_e32 v11, 1
3046; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
3047; GFX6-NEXT:    v_max_i32_e32 v8, 0, v0
3048; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v11, v10
3049; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, 0x7fffffff, v8
3050; GFX6-NEXT:    v_max_i32_e32 v4, v10, v4
3051; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
3052; GFX6-NEXT:    v_min_i32_e32 v4, v4, v8
3053; GFX6-NEXT:    v_min_i32_e32 v8, 0, v1
3054; GFX6-NEXT:    v_bfrev_b32_e32 v9, -2
3055; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
3056; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
3057; GFX6-NEXT:    v_max_i32_e32 v5, 0, v1
3058; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, v11, v8
3059; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v9, v5
3060; GFX6-NEXT:    v_max_i32_e32 v4, v8, v4
3061; GFX6-NEXT:    v_min_i32_e32 v4, v4, v5
3062; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
3063; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
3064; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
3065; GFX6-NEXT:    v_min_i32_e32 v6, 0, v2
3066; GFX6-NEXT:    v_max_i32_e32 v5, 0, v2
3067; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v11, v6
3068; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v9, v5
3069; GFX6-NEXT:    v_max_i32_e32 v4, v6, v4
3070; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
3071; GFX6-NEXT:    v_min_i32_e32 v4, v4, v5
3072; GFX6-NEXT:    v_min_i32_e32 v6, 0, v3
3073; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
3074; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
3075; GFX6-NEXT:    v_max_i32_e32 v5, 0, v3
3076; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v11, v6
3077; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v9, v5
3078; GFX6-NEXT:    v_max_i32_e32 v4, v6, v4
3079; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
3080; GFX6-NEXT:    v_min_i32_e32 v4, v4, v5
3081; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
3082; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
3083; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
3084; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
3085; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 16, v3
3086; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
3087; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
3088; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
3089; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v2
3090; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v3
3091; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
3092; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
3093; GFX6-NEXT:    s_setpc_b64 s[30:31]
3094;
3095; GFX8-LABEL: v_saddsat_v4i16:
3096; GFX8:       ; %bb.0:
3097; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3098; GFX8-NEXT:    v_min_i16_e32 v5, 0, v0
3099; GFX8-NEXT:    v_max_i16_e32 v4, 0, v0
3100; GFX8-NEXT:    v_sub_u16_e32 v5, 0x8000, v5
3101; GFX8-NEXT:    v_sub_u16_e32 v4, 0x7fff, v4
3102; GFX8-NEXT:    v_max_i16_e32 v5, v5, v2
3103; GFX8-NEXT:    v_min_i16_e32 v4, v5, v4
3104; GFX8-NEXT:    v_mov_b32_e32 v5, 0
3105; GFX8-NEXT:    v_min_i16_sdwa v7, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3106; GFX8-NEXT:    v_max_i16_sdwa v6, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3107; GFX8-NEXT:    v_sub_u16_e32 v7, 0x8000, v7
3108; GFX8-NEXT:    v_sub_u16_e32 v6, 0x7fff, v6
3109; GFX8-NEXT:    v_max_i16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3110; GFX8-NEXT:    v_min_i16_e32 v7, 0, v1
3111; GFX8-NEXT:    v_min_i16_e32 v2, v2, v6
3112; GFX8-NEXT:    v_max_i16_e32 v6, 0, v1
3113; GFX8-NEXT:    v_sub_u16_e32 v7, 0x8000, v7
3114; GFX8-NEXT:    v_sub_u16_e32 v6, 0x7fff, v6
3115; GFX8-NEXT:    v_max_i16_e32 v7, v7, v3
3116; GFX8-NEXT:    v_min_i16_e32 v6, v7, v6
3117; GFX8-NEXT:    v_max_i16_sdwa v7, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3118; GFX8-NEXT:    v_min_i16_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3119; GFX8-NEXT:    v_sub_u16_e32 v5, 0x8000, v5
3120; GFX8-NEXT:    v_sub_u16_e32 v7, 0x7fff, v7
3121; GFX8-NEXT:    v_max_i16_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3122; GFX8-NEXT:    v_min_i16_e32 v3, v3, v7
3123; GFX8-NEXT:    v_add_u16_e32 v4, v0, v4
3124; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3125; GFX8-NEXT:    v_add_u16_e32 v2, v1, v6
3126; GFX8-NEXT:    v_add_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3127; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
3128; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
3129; GFX8-NEXT:    s_setpc_b64 s[30:31]
3130;
3131; GFX9-LABEL: v_saddsat_v4i16:
3132; GFX9:       ; %bb.0:
3133; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3134; GFX9-NEXT:    v_pk_add_i16 v0, v0, v2 clamp
3135; GFX9-NEXT:    v_pk_add_i16 v1, v1, v3 clamp
3136; GFX9-NEXT:    s_setpc_b64 s[30:31]
3137;
3138; GFX10PLUS-LABEL: v_saddsat_v4i16:
3139; GFX10PLUS:       ; %bb.0:
3140; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3141; GFX10PLUS-NEXT:    v_pk_add_i16 v0, v0, v2 clamp
3142; GFX10PLUS-NEXT:    v_pk_add_i16 v1, v1, v3 clamp
3143; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
3144  %result = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
  ; Reinterpret the 4 x 16-bit result as the declared <2 x float> return type.
3145  %cast = bitcast <4 x i16> %result to <2 x float>
3146  ret <2 x float> %cast
3147}
3148
; Signed saturating add (llvm.sadd.sat) of two <4 x i16> values in an
; amdgpu_ps function where both operands are marked inreg. In the expected
; output the result is left in s0/s1.
;
; Expected lowering per target, as captured by the autogenerated checks:
;  - GFX6 expands all four elements with scalar 32-bit min/max/sub/add on
;    values shifted left by 16, then shifts back and repacks pairs of
;    results with and/shl/or.
;  - GFX8 also stays on the scalar unit: each 16-bit half is sign-extended
;    with s_sext_i32_i16 (high halves are first extracted with s_lshr_b32),
;    clamped with 32-bit min/max against 0xffff8000 / 0x7fff based bounds,
;    added and repacked.
;  - GFX9 copies the rhs into VGPRs, uses v_pk_add_i16 with clamp, and moves
;    the results back to SGPRs with v_readfirstlane_b32.
;  - GFX10PLUS passes both SGPR operands directly to v_pk_add_i16 with clamp
;    and then uses v_readfirstlane_b32.
3149define amdgpu_ps <2 x i32> @s_saddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %rhs) {
3150; GFX6-LABEL: s_saddsat_v4i16:
3151; GFX6:       ; %bb.0:
3152; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
3153; GFX6-NEXT:    s_min_i32 s9, s0, 0
3154; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
3155; GFX6-NEXT:    s_max_i32 s8, s0, 0
3156; GFX6-NEXT:    s_sub_i32 s9, 0x80000000, s9
3157; GFX6-NEXT:    s_sub_i32 s8, 0x7fffffff, s8
3158; GFX6-NEXT:    s_max_i32 s4, s9, s4
3159; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
3160; GFX6-NEXT:    s_min_i32 s4, s4, s8
3161; GFX6-NEXT:    s_min_i32 s8, s1, 0
3162; GFX6-NEXT:    s_add_i32 s0, s0, s4
3163; GFX6-NEXT:    s_lshl_b32 s4, s5, 16
3164; GFX6-NEXT:    s_max_i32 s5, s1, 0
3165; GFX6-NEXT:    s_sub_i32 s8, 0x80000000, s8
3166; GFX6-NEXT:    s_sub_i32 s5, 0x7fffffff, s5
3167; GFX6-NEXT:    s_max_i32 s4, s8, s4
3168; GFX6-NEXT:    s_min_i32 s4, s4, s5
3169; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
3170; GFX6-NEXT:    s_add_i32 s1, s1, s4
3171; GFX6-NEXT:    s_lshl_b32 s4, s6, 16
3172; GFX6-NEXT:    s_min_i32 s6, s2, 0
3173; GFX6-NEXT:    s_max_i32 s5, s2, 0
3174; GFX6-NEXT:    s_sub_i32 s6, 0x80000000, s6
3175; GFX6-NEXT:    s_sub_i32 s5, 0x7fffffff, s5
3176; GFX6-NEXT:    s_max_i32 s4, s6, s4
3177; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
3178; GFX6-NEXT:    s_min_i32 s4, s4, s5
3179; GFX6-NEXT:    s_min_i32 s6, s3, 0
3180; GFX6-NEXT:    s_add_i32 s2, s2, s4
3181; GFX6-NEXT:    s_lshl_b32 s4, s7, 16
3182; GFX6-NEXT:    s_max_i32 s5, s3, 0
3183; GFX6-NEXT:    s_sub_i32 s6, 0x80000000, s6
3184; GFX6-NEXT:    s_sub_i32 s5, 0x7fffffff, s5
3185; GFX6-NEXT:    s_max_i32 s4, s6, s4
3186; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
3187; GFX6-NEXT:    s_min_i32 s4, s4, s5
3188; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
3189; GFX6-NEXT:    s_add_i32 s3, s3, s4
3190; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
3191; GFX6-NEXT:    s_ashr_i32 s2, s2, 16
3192; GFX6-NEXT:    s_ashr_i32 s3, s3, 16
3193; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
3194; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
3195; GFX6-NEXT:    s_or_b32 s0, s0, s1
3196; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
3197; GFX6-NEXT:    s_and_b32 s2, s3, 0xffff
3198; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
3199; GFX6-NEXT:    s_or_b32 s1, s1, s2
3200; GFX6-NEXT:    ; return to shader part epilog
3201;
3202; GFX8-LABEL: s_saddsat_v4i16:
3203; GFX8:       ; %bb.0:
3204; GFX8-NEXT:    s_sext_i32_i16 s8, s0
3205; GFX8-NEXT:    s_sext_i32_i16 s9, 0
3206; GFX8-NEXT:    s_max_i32 s10, s8, s9
3207; GFX8-NEXT:    s_min_i32 s8, s8, s9
3208; GFX8-NEXT:    s_sub_i32 s8, 0xffff8000, s8
3209; GFX8-NEXT:    s_lshr_b32 s6, s2, 16
3210; GFX8-NEXT:    s_sext_i32_i16 s8, s8
3211; GFX8-NEXT:    s_sext_i32_i16 s2, s2
3212; GFX8-NEXT:    s_sub_i32 s10, 0x7fff, s10
3213; GFX8-NEXT:    s_max_i32 s2, s8, s2
3214; GFX8-NEXT:    s_sext_i32_i16 s2, s2
3215; GFX8-NEXT:    s_sext_i32_i16 s8, s10
3216; GFX8-NEXT:    s_lshr_b32 s4, s0, 16
3217; GFX8-NEXT:    s_min_i32 s2, s2, s8
3218; GFX8-NEXT:    s_add_i32 s0, s0, s2
3219; GFX8-NEXT:    s_sext_i32_i16 s2, s4
3220; GFX8-NEXT:    s_max_i32 s8, s2, s9
3221; GFX8-NEXT:    s_min_i32 s2, s2, s9
3222; GFX8-NEXT:    s_sub_i32 s2, 0xffff8000, s2
3223; GFX8-NEXT:    s_sext_i32_i16 s2, s2
3224; GFX8-NEXT:    s_sext_i32_i16 s6, s6
3225; GFX8-NEXT:    s_sub_i32 s8, 0x7fff, s8
3226; GFX8-NEXT:    s_max_i32 s2, s2, s6
3227; GFX8-NEXT:    s_sext_i32_i16 s2, s2
3228; GFX8-NEXT:    s_sext_i32_i16 s6, s8
3229; GFX8-NEXT:    s_min_i32 s2, s2, s6
3230; GFX8-NEXT:    s_add_i32 s4, s4, s2
3231; GFX8-NEXT:    s_sext_i32_i16 s2, s1
3232; GFX8-NEXT:    s_max_i32 s6, s2, s9
3233; GFX8-NEXT:    s_min_i32 s2, s2, s9
3234; GFX8-NEXT:    s_sub_i32 s2, 0xffff8000, s2
3235; GFX8-NEXT:    s_lshr_b32 s7, s3, 16
3236; GFX8-NEXT:    s_sext_i32_i16 s2, s2
3237; GFX8-NEXT:    s_sext_i32_i16 s3, s3
3238; GFX8-NEXT:    s_sub_i32 s6, 0x7fff, s6
3239; GFX8-NEXT:    s_max_i32 s2, s2, s3
3240; GFX8-NEXT:    s_sext_i32_i16 s2, s2
3241; GFX8-NEXT:    s_sext_i32_i16 s3, s6
3242; GFX8-NEXT:    s_lshr_b32 s5, s1, 16
3243; GFX8-NEXT:    s_min_i32 s2, s2, s3
3244; GFX8-NEXT:    s_add_i32 s1, s1, s2
3245; GFX8-NEXT:    s_sext_i32_i16 s2, s5
3246; GFX8-NEXT:    s_max_i32 s3, s2, s9
3247; GFX8-NEXT:    s_min_i32 s2, s2, s9
3248; GFX8-NEXT:    s_sub_i32 s2, 0xffff8000, s2
3249; GFX8-NEXT:    s_sext_i32_i16 s2, s2
3250; GFX8-NEXT:    s_sext_i32_i16 s6, s7
3251; GFX8-NEXT:    s_sub_i32 s3, 0x7fff, s3
3252; GFX8-NEXT:    s_max_i32 s2, s2, s6
3253; GFX8-NEXT:    s_sext_i32_i16 s2, s2
3254; GFX8-NEXT:    s_sext_i32_i16 s3, s3
3255; GFX8-NEXT:    s_min_i32 s2, s2, s3
3256; GFX8-NEXT:    s_add_i32 s5, s5, s2
3257; GFX8-NEXT:    s_and_b32 s2, 0xffff, s4
3258; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
3259; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
3260; GFX8-NEXT:    s_or_b32 s0, s0, s2
3261; GFX8-NEXT:    s_and_b32 s2, 0xffff, s5
3262; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
3263; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
3264; GFX8-NEXT:    s_or_b32 s1, s1, s2
3265; GFX8-NEXT:    ; return to shader part epilog
3266;
3267; GFX9-LABEL: s_saddsat_v4i16:
3268; GFX9:       ; %bb.0:
3269; GFX9-NEXT:    v_mov_b32_e32 v0, s2
3270; GFX9-NEXT:    v_mov_b32_e32 v1, s3
3271; GFX9-NEXT:    v_pk_add_i16 v0, s0, v0 clamp
3272; GFX9-NEXT:    v_pk_add_i16 v1, s1, v1 clamp
3273; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
3274; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
3275; GFX9-NEXT:    ; return to shader part epilog
3276;
3277; GFX10PLUS-LABEL: s_saddsat_v4i16:
3278; GFX10PLUS:       ; %bb.0:
3279; GFX10PLUS-NEXT:    v_pk_add_i16 v0, s0, s2 clamp
3280; GFX10PLUS-NEXT:    v_pk_add_i16 v1, s1, s3 clamp
3281; GFX10PLUS-NEXT:    v_readfirstlane_b32 s0, v0
3282; GFX10PLUS-NEXT:    v_readfirstlane_b32 s1, v1
3283; GFX10PLUS-NEXT:    ; return to shader part epilog
3284  %result = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
  ; Reinterpret the 4 x 16-bit result as the declared <2 x i32> return type.
3285  %cast = bitcast <4 x i16> %result to <2 x i32>
3286  ret <2 x i32> %cast
3287}
3288
3289; FIXME: The v5i16 tests below are commented out; re-enable them once they can be supported.
3290; define <5 x i16> @v_saddsat_v5i16(<5 x i16> %lhs, <5 x i16> %rhs) {
3291;   %result = call <5 x i16> @llvm.sadd.sat.v5i16(<5 x i16> %lhs, <5 x i16> %rhs)
3292;   ret <5 x i16> %result
3293; }
3294
3295; define amdgpu_ps <5 x i16> @s_saddsat_v5i16(<5 x i16> inreg %lhs, <5 x i16> inreg %rhs) {
3296;   %result = call <5 x i16> @llvm.sadd.sat.v5i16(<5 x i16> %lhs, <5 x i16> %rhs)
3297;   ret <5 x i16> %result
3298; }
3299
; Signed saturating add (llvm.sadd.sat) of two <6 x i16> values in a function
; with the default calling convention and no inreg arguments. Every expected
; sequence starts with s_waitcnt and returns via s_setpc_b64 s[30:31], and
; all arithmetic is on VGPRs.
;
; Expected lowering per target, as captured by the autogenerated checks:
;  - GFX6 expands all six elements separately with 32-bit VALU
;    min/max/sub/add on values shifted left by 16, then repacks pairs of
;    results into v0, v1 and v2. The 0x80000000 and 0x7fffffff bounds are
;    materialized with v_bfrev_b32 (of 1 and -2) and reused.
;  - GFX8 uses 16-bit VALU min/max/sub/add, with SDWA forms for the high
;    half of each 32-bit register.
;  - GFX9 and GFX10PLUS expect three v_pk_add_i16 instructions with clamp,
;    one per 32-bit register.
3300define <3 x float> @v_saddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
3301; GFX6-LABEL: v_saddsat_v6i16:
3302; GFX6:       ; %bb.0:
3303; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3304; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
3305; GFX6-NEXT:    v_min_i32_e32 v14, 0, v0
3306; GFX6-NEXT:    v_bfrev_b32_e32 v15, 1
3307; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
3308; GFX6-NEXT:    v_max_i32_e32 v12, 0, v0
3309; GFX6-NEXT:    v_sub_i32_e32 v14, vcc, v15, v14
3310; GFX6-NEXT:    v_sub_i32_e32 v12, vcc, 0x7fffffff, v12
3311; GFX6-NEXT:    v_max_i32_e32 v6, v14, v6
3312; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
3313; GFX6-NEXT:    v_min_i32_e32 v6, v6, v12
3314; GFX6-NEXT:    v_min_i32_e32 v12, 0, v1
3315; GFX6-NEXT:    v_bfrev_b32_e32 v13, -2
3316; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
3317; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
3318; GFX6-NEXT:    v_max_i32_e32 v7, 0, v1
3319; GFX6-NEXT:    v_sub_i32_e32 v12, vcc, v15, v12
3320; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v13, v7
3321; GFX6-NEXT:    v_max_i32_e32 v6, v12, v6
3322; GFX6-NEXT:    v_min_i32_e32 v6, v6, v7
3323; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
3324; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v6
3325; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v8
3326; GFX6-NEXT:    v_min_i32_e32 v8, 0, v2
3327; GFX6-NEXT:    v_max_i32_e32 v7, 0, v2
3328; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, v15, v8
3329; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v13, v7
3330; GFX6-NEXT:    v_max_i32_e32 v6, v8, v6
3331; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
3332; GFX6-NEXT:    v_min_i32_e32 v6, v6, v7
3333; GFX6-NEXT:    v_min_i32_e32 v8, 0, v3
3334; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
3335; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v9
3336; GFX6-NEXT:    v_max_i32_e32 v7, 0, v3
3337; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, v15, v8
3338; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v13, v7
3339; GFX6-NEXT:    v_max_i32_e32 v6, v8, v6
3340; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
3341; GFX6-NEXT:    v_min_i32_e32 v6, v6, v7
3342; GFX6-NEXT:    v_min_i32_e32 v8, 0, v4
3343; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
3344; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v10
3345; GFX6-NEXT:    v_max_i32_e32 v7, 0, v4
3346; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, v15, v8
3347; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v13, v7
3348; GFX6-NEXT:    v_max_i32_e32 v6, v8, v6
3349; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
3350; GFX6-NEXT:    v_min_i32_e32 v6, v6, v7
3351; GFX6-NEXT:    v_min_i32_e32 v8, 0, v5
3352; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
3353; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v11
3354; GFX6-NEXT:    v_max_i32_e32 v7, 0, v5
3355; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, v15, v8
3356; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
3357; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v13, v7
3358; GFX6-NEXT:    v_max_i32_e32 v6, v8, v6
3359; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
3360; GFX6-NEXT:    v_min_i32_e32 v6, v6, v7
3361; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
3362; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
3363; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 16, v3
3364; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
3365; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
3366; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
3367; GFX6-NEXT:    v_ashrrev_i32_e32 v5, 16, v5
3368; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
3369; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v2
3370; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v3
3371; GFX6-NEXT:    v_ashrrev_i32_e32 v4, 16, v4
3372; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
3373; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v5
3374; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
3375; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v4
3376; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
3377; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
3378; GFX6-NEXT:    s_setpc_b64 s[30:31]
3379;
3380; GFX8-LABEL: v_saddsat_v6i16:
3381; GFX8:       ; %bb.0:
3382; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3383; GFX8-NEXT:    v_min_i16_e32 v7, 0, v0
3384; GFX8-NEXT:    v_max_i16_e32 v6, 0, v0
3385; GFX8-NEXT:    v_sub_u16_e32 v7, 0x8000, v7
3386; GFX8-NEXT:    v_sub_u16_e32 v6, 0x7fff, v6
3387; GFX8-NEXT:    v_max_i16_e32 v7, v7, v3
3388; GFX8-NEXT:    v_min_i16_e32 v6, v7, v6
3389; GFX8-NEXT:    v_mov_b32_e32 v7, 0
3390; GFX8-NEXT:    v_min_i16_sdwa v9, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3391; GFX8-NEXT:    v_max_i16_sdwa v8, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3392; GFX8-NEXT:    v_sub_u16_e32 v9, 0x8000, v9
3393; GFX8-NEXT:    v_sub_u16_e32 v8, 0x7fff, v8
3394; GFX8-NEXT:    v_max_i16_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3395; GFX8-NEXT:    v_min_i16_e32 v9, 0, v1
3396; GFX8-NEXT:    v_min_i16_e32 v3, v3, v8
3397; GFX8-NEXT:    v_max_i16_e32 v8, 0, v1
3398; GFX8-NEXT:    v_sub_u16_e32 v9, 0x8000, v9
3399; GFX8-NEXT:    v_sub_u16_e32 v8, 0x7fff, v8
3400; GFX8-NEXT:    v_max_i16_e32 v9, v9, v4
3401; GFX8-NEXT:    v_min_i16_sdwa v10, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3402; GFX8-NEXT:    v_min_i16_e32 v8, v9, v8
3403; GFX8-NEXT:    v_max_i16_sdwa v9, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3404; GFX8-NEXT:    v_sub_u16_e32 v10, 0x8000, v10
3405; GFX8-NEXT:    v_sub_u16_e32 v9, 0x7fff, v9
3406; GFX8-NEXT:    v_max_i16_sdwa v4, v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3407; GFX8-NEXT:    v_min_i16_e32 v10, 0, v2
3408; GFX8-NEXT:    v_min_i16_e32 v4, v4, v9
3409; GFX8-NEXT:    v_max_i16_e32 v9, 0, v2
3410; GFX8-NEXT:    v_sub_u16_e32 v10, 0x8000, v10
3411; GFX8-NEXT:    v_sub_u16_e32 v9, 0x7fff, v9
3412; GFX8-NEXT:    v_max_i16_e32 v10, v10, v5
3413; GFX8-NEXT:    v_min_i16_e32 v9, v10, v9
3414; GFX8-NEXT:    v_max_i16_sdwa v10, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3415; GFX8-NEXT:    v_min_i16_sdwa v7, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3416; GFX8-NEXT:    v_sub_u16_e32 v7, 0x8000, v7
3417; GFX8-NEXT:    v_sub_u16_e32 v10, 0x7fff, v10
3418; GFX8-NEXT:    v_max_i16_sdwa v5, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3419; GFX8-NEXT:    v_min_i16_e32 v5, v5, v10
3420; GFX8-NEXT:    v_add_u16_e32 v6, v0, v6
3421; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3422; GFX8-NEXT:    v_add_u16_e32 v3, v1, v8
3423; GFX8-NEXT:    v_add_u16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3424; GFX8-NEXT:    v_or_b32_e32 v1, v3, v1
3425; GFX8-NEXT:    v_add_u16_e32 v3, v2, v9
3426; GFX8-NEXT:    v_add_u16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3427; GFX8-NEXT:    v_or_b32_e32 v0, v6, v0
3428; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
3429; GFX8-NEXT:    s_setpc_b64 s[30:31]
3430;
3431; GFX9-LABEL: v_saddsat_v6i16:
3432; GFX9:       ; %bb.0:
3433; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3434; GFX9-NEXT:    v_pk_add_i16 v0, v0, v3 clamp
3435; GFX9-NEXT:    v_pk_add_i16 v1, v1, v4 clamp
3436; GFX9-NEXT:    v_pk_add_i16 v2, v2, v5 clamp
3437; GFX9-NEXT:    s_setpc_b64 s[30:31]
3438;
3439; GFX10PLUS-LABEL: v_saddsat_v6i16:
3440; GFX10PLUS:       ; %bb.0:
3441; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3442; GFX10PLUS-NEXT:    v_pk_add_i16 v0, v0, v3 clamp
3443; GFX10PLUS-NEXT:    v_pk_add_i16 v1, v1, v4 clamp
3444; GFX10PLUS-NEXT:    v_pk_add_i16 v2, v2, v5 clamp
3445; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
3446  %result = call <6 x i16> @llvm.sadd.sat.v6i16(<6 x i16> %lhs, <6 x i16> %rhs)
  ; Reinterpret the 6 x 16-bit result as the declared <3 x float> return type.
3447  %cast = bitcast <6 x i16> %result to <3 x float>
3448  ret <3 x float> %cast
3449}
3450
3451define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inreg %rhs) {
3452; GFX6-LABEL: s_saddsat_v6i16:
3453; GFX6:       ; %bb.0:
3454; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
3455; GFX6-NEXT:    s_min_i32 s13, s0, 0
3456; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
3457; GFX6-NEXT:    s_max_i32 s12, s0, 0
3458; GFX6-NEXT:    s_sub_i32 s13, 0x80000000, s13
3459; GFX6-NEXT:    s_sub_i32 s12, 0x7fffffff, s12
3460; GFX6-NEXT:    s_max_i32 s6, s13, s6
3461; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
3462; GFX6-NEXT:    s_min_i32 s6, s6, s12
3463; GFX6-NEXT:    s_min_i32 s12, s1, 0
3464; GFX6-NEXT:    s_add_i32 s0, s0, s6
3465; GFX6-NEXT:    s_lshl_b32 s6, s7, 16
3466; GFX6-NEXT:    s_max_i32 s7, s1, 0
3467; GFX6-NEXT:    s_sub_i32 s12, 0x80000000, s12
3468; GFX6-NEXT:    s_sub_i32 s7, 0x7fffffff, s7
3469; GFX6-NEXT:    s_max_i32 s6, s12, s6
3470; GFX6-NEXT:    s_min_i32 s6, s6, s7
3471; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
3472; GFX6-NEXT:    s_add_i32 s1, s1, s6
3473; GFX6-NEXT:    s_lshl_b32 s6, s8, 16
3474; GFX6-NEXT:    s_min_i32 s8, s2, 0
3475; GFX6-NEXT:    s_max_i32 s7, s2, 0
3476; GFX6-NEXT:    s_sub_i32 s8, 0x80000000, s8
3477; GFX6-NEXT:    s_sub_i32 s7, 0x7fffffff, s7
3478; GFX6-NEXT:    s_max_i32 s6, s8, s6
3479; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
3480; GFX6-NEXT:    s_min_i32 s6, s6, s7
3481; GFX6-NEXT:    s_min_i32 s8, s3, 0
3482; GFX6-NEXT:    s_add_i32 s2, s2, s6
3483; GFX6-NEXT:    s_lshl_b32 s6, s9, 16
3484; GFX6-NEXT:    s_max_i32 s7, s3, 0
3485; GFX6-NEXT:    s_sub_i32 s8, 0x80000000, s8
3486; GFX6-NEXT:    s_sub_i32 s7, 0x7fffffff, s7
3487; GFX6-NEXT:    s_max_i32 s6, s8, s6
3488; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
3489; GFX6-NEXT:    s_min_i32 s6, s6, s7
3490; GFX6-NEXT:    s_min_i32 s8, s4, 0
3491; GFX6-NEXT:    s_add_i32 s3, s3, s6
3492; GFX6-NEXT:    s_lshl_b32 s6, s10, 16
3493; GFX6-NEXT:    s_max_i32 s7, s4, 0
3494; GFX6-NEXT:    s_sub_i32 s8, 0x80000000, s8
3495; GFX6-NEXT:    s_sub_i32 s7, 0x7fffffff, s7
3496; GFX6-NEXT:    s_max_i32 s6, s8, s6
3497; GFX6-NEXT:    s_lshl_b32 s5, s5, 16
3498; GFX6-NEXT:    s_min_i32 s6, s6, s7
3499; GFX6-NEXT:    s_min_i32 s8, s5, 0
3500; GFX6-NEXT:    s_add_i32 s4, s4, s6
3501; GFX6-NEXT:    s_lshl_b32 s6, s11, 16
3502; GFX6-NEXT:    s_max_i32 s7, s5, 0
3503; GFX6-NEXT:    s_sub_i32 s8, 0x80000000, s8
3504; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
3505; GFX6-NEXT:    s_sub_i32 s7, 0x7fffffff, s7
3506; GFX6-NEXT:    s_max_i32 s6, s8, s6
3507; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
3508; GFX6-NEXT:    s_min_i32 s6, s6, s7
3509; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
3510; GFX6-NEXT:    s_ashr_i32 s2, s2, 16
3511; GFX6-NEXT:    s_ashr_i32 s3, s3, 16
3512; GFX6-NEXT:    s_add_i32 s5, s5, s6
3513; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
3514; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
3515; GFX6-NEXT:    s_ashr_i32 s5, s5, 16
3516; GFX6-NEXT:    s_or_b32 s0, s0, s1
3517; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
3518; GFX6-NEXT:    s_and_b32 s2, s3, 0xffff
3519; GFX6-NEXT:    s_ashr_i32 s4, s4, 16
3520; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
3521; GFX6-NEXT:    s_and_b32 s3, s5, 0xffff
3522; GFX6-NEXT:    s_or_b32 s1, s1, s2
3523; GFX6-NEXT:    s_and_b32 s2, s4, 0xffff
3524; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
3525; GFX6-NEXT:    s_or_b32 s2, s2, s3
3526; GFX6-NEXT:    ; return to shader part epilog
3527;
3528; GFX8-LABEL: s_saddsat_v6i16:
3529; GFX8:       ; %bb.0:
3530; GFX8-NEXT:    s_sext_i32_i16 s12, s0
3531; GFX8-NEXT:    s_sext_i32_i16 s13, 0
3532; GFX8-NEXT:    s_max_i32 s14, s12, s13
3533; GFX8-NEXT:    s_min_i32 s12, s12, s13
3534; GFX8-NEXT:    s_sub_i32 s12, 0xffff8000, s12
3535; GFX8-NEXT:    s_lshr_b32 s9, s3, 16
3536; GFX8-NEXT:    s_sext_i32_i16 s12, s12
3537; GFX8-NEXT:    s_sext_i32_i16 s3, s3
3538; GFX8-NEXT:    s_sub_i32 s14, 0x7fff, s14
3539; GFX8-NEXT:    s_max_i32 s3, s12, s3
3540; GFX8-NEXT:    s_sext_i32_i16 s3, s3
3541; GFX8-NEXT:    s_sext_i32_i16 s12, s14
3542; GFX8-NEXT:    s_lshr_b32 s6, s0, 16
3543; GFX8-NEXT:    s_min_i32 s3, s3, s12
3544; GFX8-NEXT:    s_add_i32 s0, s0, s3
3545; GFX8-NEXT:    s_sext_i32_i16 s3, s6
3546; GFX8-NEXT:    s_max_i32 s12, s3, s13
3547; GFX8-NEXT:    s_min_i32 s3, s3, s13
3548; GFX8-NEXT:    s_sub_i32 s3, 0xffff8000, s3
3549; GFX8-NEXT:    s_sext_i32_i16 s3, s3
3550; GFX8-NEXT:    s_sext_i32_i16 s9, s9
3551; GFX8-NEXT:    s_sub_i32 s12, 0x7fff, s12
3552; GFX8-NEXT:    s_max_i32 s3, s3, s9
3553; GFX8-NEXT:    s_sext_i32_i16 s3, s3
3554; GFX8-NEXT:    s_sext_i32_i16 s9, s12
3555; GFX8-NEXT:    s_min_i32 s3, s3, s9
3556; GFX8-NEXT:    s_add_i32 s6, s6, s3
3557; GFX8-NEXT:    s_sext_i32_i16 s3, s1
3558; GFX8-NEXT:    s_max_i32 s9, s3, s13
3559; GFX8-NEXT:    s_min_i32 s3, s3, s13
3560; GFX8-NEXT:    s_sub_i32 s3, 0xffff8000, s3
3561; GFX8-NEXT:    s_lshr_b32 s10, s4, 16
3562; GFX8-NEXT:    s_sext_i32_i16 s3, s3
3563; GFX8-NEXT:    s_sext_i32_i16 s4, s4
3564; GFX8-NEXT:    s_sub_i32 s9, 0x7fff, s9
3565; GFX8-NEXT:    s_max_i32 s3, s3, s4
3566; GFX8-NEXT:    s_sext_i32_i16 s3, s3
3567; GFX8-NEXT:    s_sext_i32_i16 s4, s9
3568; GFX8-NEXT:    s_lshr_b32 s7, s1, 16
3569; GFX8-NEXT:    s_min_i32 s3, s3, s4
3570; GFX8-NEXT:    s_add_i32 s1, s1, s3
3571; GFX8-NEXT:    s_sext_i32_i16 s3, s7
3572; GFX8-NEXT:    s_max_i32 s4, s3, s13
3573; GFX8-NEXT:    s_min_i32 s3, s3, s13
3574; GFX8-NEXT:    s_sub_i32 s3, 0xffff8000, s3
3575; GFX8-NEXT:    s_sext_i32_i16 s3, s3
3576; GFX8-NEXT:    s_sext_i32_i16 s9, s10
3577; GFX8-NEXT:    s_sub_i32 s4, 0x7fff, s4
3578; GFX8-NEXT:    s_max_i32 s3, s3, s9
3579; GFX8-NEXT:    s_sext_i32_i16 s3, s3
3580; GFX8-NEXT:    s_sext_i32_i16 s4, s4
3581; GFX8-NEXT:    s_min_i32 s3, s3, s4
3582; GFX8-NEXT:    s_add_i32 s7, s7, s3
3583; GFX8-NEXT:    s_sext_i32_i16 s3, s2
3584; GFX8-NEXT:    s_max_i32 s4, s3, s13
3585; GFX8-NEXT:    s_min_i32 s3, s3, s13
3586; GFX8-NEXT:    s_sub_i32 s3, 0xffff8000, s3
3587; GFX8-NEXT:    s_lshr_b32 s11, s5, 16
3588; GFX8-NEXT:    s_sext_i32_i16 s3, s3
3589; GFX8-NEXT:    s_sext_i32_i16 s5, s5
3590; GFX8-NEXT:    s_sub_i32 s4, 0x7fff, s4
3591; GFX8-NEXT:    s_max_i32 s3, s3, s5
3592; GFX8-NEXT:    s_sext_i32_i16 s3, s3
3593; GFX8-NEXT:    s_sext_i32_i16 s4, s4
3594; GFX8-NEXT:    s_lshr_b32 s8, s2, 16
3595; GFX8-NEXT:    s_min_i32 s3, s3, s4
3596; GFX8-NEXT:    s_add_i32 s2, s2, s3
3597; GFX8-NEXT:    s_sext_i32_i16 s3, s8
3598; GFX8-NEXT:    s_max_i32 s4, s3, s13
3599; GFX8-NEXT:    s_min_i32 s3, s3, s13
3600; GFX8-NEXT:    s_sub_i32 s3, 0xffff8000, s3
3601; GFX8-NEXT:    s_sext_i32_i16 s3, s3
3602; GFX8-NEXT:    s_sext_i32_i16 s5, s11
3603; GFX8-NEXT:    s_sub_i32 s4, 0x7fff, s4
3604; GFX8-NEXT:    s_max_i32 s3, s3, s5
3605; GFX8-NEXT:    s_sext_i32_i16 s3, s3
3606; GFX8-NEXT:    s_sext_i32_i16 s4, s4
3607; GFX8-NEXT:    s_min_i32 s3, s3, s4
3608; GFX8-NEXT:    s_add_i32 s8, s8, s3
3609; GFX8-NEXT:    s_and_b32 s3, 0xffff, s6
3610; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
3611; GFX8-NEXT:    s_lshl_b32 s3, s3, 16
3612; GFX8-NEXT:    s_or_b32 s0, s0, s3
3613; GFX8-NEXT:    s_and_b32 s3, 0xffff, s7
3614; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
3615; GFX8-NEXT:    s_lshl_b32 s3, s3, 16
3616; GFX8-NEXT:    s_or_b32 s1, s1, s3
3617; GFX8-NEXT:    s_and_b32 s3, 0xffff, s8
3618; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
3619; GFX8-NEXT:    s_lshl_b32 s3, s3, 16
3620; GFX8-NEXT:    s_or_b32 s2, s2, s3
3621; GFX8-NEXT:    ; return to shader part epilog
3622;
3623; GFX9-LABEL: s_saddsat_v6i16:
3624; GFX9:       ; %bb.0:
3625; GFX9-NEXT:    v_mov_b32_e32 v0, s3
3626; GFX9-NEXT:    v_mov_b32_e32 v1, s4
3627; GFX9-NEXT:    v_mov_b32_e32 v2, s5
3628; GFX9-NEXT:    v_pk_add_i16 v0, s0, v0 clamp
3629; GFX9-NEXT:    v_pk_add_i16 v1, s1, v1 clamp
3630; GFX9-NEXT:    v_pk_add_i16 v2, s2, v2 clamp
3631; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
3632; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
3633; GFX9-NEXT:    v_readfirstlane_b32 s2, v2
3634; GFX9-NEXT:    ; return to shader part epilog
3635;
3636; GFX10PLUS-LABEL: s_saddsat_v6i16:
3637; GFX10PLUS:       ; %bb.0:
3638; GFX10PLUS-NEXT:    v_pk_add_i16 v0, s0, s3 clamp
3639; GFX10PLUS-NEXT:    v_pk_add_i16 v1, s1, s4 clamp
3640; GFX10PLUS-NEXT:    v_pk_add_i16 v2, s2, s5 clamp
3641; GFX10PLUS-NEXT:    v_readfirstlane_b32 s0, v0
3642; GFX10PLUS-NEXT:    v_readfirstlane_b32 s1, v1
3643; GFX10PLUS-NEXT:    v_readfirstlane_b32 s2, v2
3644; GFX10PLUS-NEXT:    ; return to shader part epilog
3645  %result = call <6 x i16> @llvm.sadd.sat.v6i16(<6 x i16> %lhs, <6 x i16> %rhs)
3646  %cast = bitcast <6 x i16> %result to <3 x i32>
3647  ret <3 x i32> %cast
3648}
3649
; v_saddsat_v8i16 - llvm.sadd.sat on <8 x i16> with ordinary (non-inreg)
; arguments; the saturated sum is bitcast to <4 x float> for the return.
; The gfx900 and gfx1010/gfx1100 checks expect one "v_pk_add_i16 ... clamp"
; per packed pair of lanes (four in total). The tahiti checks widen every lane
; with a 16-bit left shift and clamp with 32-bit min/max before the add, then
; shift back and repack. The fiji checks clamp with 16-bit min/max and use the
; SDWA forms to reach the high half of each register.
3650define <4 x float> @v_saddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
3651; GFX6-LABEL: v_saddsat_v8i16:
3652; GFX6:       ; %bb.0:
3653; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3654; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
3655; GFX6-NEXT:    v_min_i32_e32 v18, 0, v0
3656; GFX6-NEXT:    v_bfrev_b32_e32 v19, 1
3657; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
3658; GFX6-NEXT:    v_max_i32_e32 v16, 0, v0
3659; GFX6-NEXT:    v_bfrev_b32_e32 v17, -2
3660; GFX6-NEXT:    v_sub_i32_e32 v18, vcc, v19, v18
3661; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, v17, v16
3662; GFX6-NEXT:    v_max_i32_e32 v8, v18, v8
3663; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
3664; GFX6-NEXT:    v_min_i32_e32 v8, v8, v16
3665; GFX6-NEXT:    v_min_i32_e32 v16, 0, v1
3666; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v8
3667; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
3668; GFX6-NEXT:    v_max_i32_e32 v9, 0, v1
3669; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, v19, v16
3670; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v17, v9
3671; GFX6-NEXT:    v_max_i32_e32 v8, v16, v8
3672; GFX6-NEXT:    v_min_i32_e32 v8, v8, v9
3673; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
3674; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v8
3675; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
3676; GFX6-NEXT:    v_min_i32_e32 v10, 0, v2
3677; GFX6-NEXT:    v_max_i32_e32 v9, 0, v2
3678; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v19, v10
3679; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v17, v9
3680; GFX6-NEXT:    v_max_i32_e32 v8, v10, v8
3681; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
3682; GFX6-NEXT:    v_min_i32_e32 v8, v8, v9
3683; GFX6-NEXT:    v_min_i32_e32 v10, 0, v3
3684; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
3685; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
3686; GFX6-NEXT:    v_max_i32_e32 v9, 0, v3
3687; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v19, v10
3688; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v17, v9
3689; GFX6-NEXT:    v_max_i32_e32 v8, v10, v8
3690; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
3691; GFX6-NEXT:    v_min_i32_e32 v8, v8, v9
3692; GFX6-NEXT:    v_min_i32_e32 v10, 0, v4
3693; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v8
3694; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
3695; GFX6-NEXT:    v_max_i32_e32 v9, 0, v4
3696; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v19, v10
3697; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v17, v9
3698; GFX6-NEXT:    v_max_i32_e32 v8, v10, v8
3699; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
3700; GFX6-NEXT:    v_min_i32_e32 v8, v8, v9
3701; GFX6-NEXT:    v_min_i32_e32 v10, 0, v5
3702; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
3703; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v13
3704; GFX6-NEXT:    v_max_i32_e32 v9, 0, v5
3705; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v19, v10
3706; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v17, v9
3707; GFX6-NEXT:    v_max_i32_e32 v8, v10, v8
3708; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
3709; GFX6-NEXT:    v_min_i32_e32 v8, v8, v9
3710; GFX6-NEXT:    v_min_i32_e32 v10, 0, v6
3711; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
3712; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v14
3713; GFX6-NEXT:    v_max_i32_e32 v9, 0, v6
3714; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v19, v10
3715; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v17, v9
3716; GFX6-NEXT:    v_max_i32_e32 v8, v10, v8
3717; GFX6-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
3718; GFX6-NEXT:    v_min_i32_e32 v8, v8, v9
3719; GFX6-NEXT:    v_min_i32_e32 v10, 0, v7
3720; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
3721; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
3722; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v15
3723; GFX6-NEXT:    v_max_i32_e32 v9, 0, v7
3724; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v19, v10
3725; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
3726; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v17, v9
3727; GFX6-NEXT:    v_max_i32_e32 v8, v10, v8
3728; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
3729; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
3730; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 16, v3
3731; GFX6-NEXT:    v_min_i32_e32 v8, v8, v9
3732; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
3733; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
3734; GFX6-NEXT:    v_ashrrev_i32_e32 v5, 16, v5
3735; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
3736; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
3737; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v2
3738; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v3
3739; GFX6-NEXT:    v_ashrrev_i32_e32 v4, 16, v4
3740; GFX6-NEXT:    v_ashrrev_i32_e32 v7, 16, v7
3741; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
3742; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v5
3743; GFX6-NEXT:    v_ashrrev_i32_e32 v6, 16, v6
3744; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
3745; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v4
3746; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
3747; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v7
3748; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
3749; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v6
3750; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
3751; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
3752; GFX6-NEXT:    s_setpc_b64 s[30:31]
3753;
3754; GFX8-LABEL: v_saddsat_v8i16:
3755; GFX8:       ; %bb.0:
3756; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3757; GFX8-NEXT:    v_min_i16_e32 v9, 0, v0
3758; GFX8-NEXT:    v_max_i16_e32 v8, 0, v0
3759; GFX8-NEXT:    v_sub_u16_e32 v9, 0x8000, v9
3760; GFX8-NEXT:    v_sub_u16_e32 v8, 0x7fff, v8
3761; GFX8-NEXT:    v_max_i16_e32 v9, v9, v4
3762; GFX8-NEXT:    v_min_i16_e32 v8, v9, v8
3763; GFX8-NEXT:    v_mov_b32_e32 v9, 0
3764; GFX8-NEXT:    v_min_i16_sdwa v11, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3765; GFX8-NEXT:    v_max_i16_sdwa v10, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3766; GFX8-NEXT:    v_sub_u16_e32 v11, 0x8000, v11
3767; GFX8-NEXT:    v_sub_u16_e32 v10, 0x7fff, v10
3768; GFX8-NEXT:    v_max_i16_sdwa v4, v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3769; GFX8-NEXT:    v_min_i16_e32 v11, 0, v1
3770; GFX8-NEXT:    v_min_i16_e32 v4, v4, v10
3771; GFX8-NEXT:    v_max_i16_e32 v10, 0, v1
3772; GFX8-NEXT:    v_sub_u16_e32 v11, 0x8000, v11
3773; GFX8-NEXT:    v_sub_u16_e32 v10, 0x7fff, v10
3774; GFX8-NEXT:    v_max_i16_e32 v11, v11, v5
3775; GFX8-NEXT:    v_min_i16_sdwa v12, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3776; GFX8-NEXT:    v_min_i16_e32 v10, v11, v10
3777; GFX8-NEXT:    v_max_i16_sdwa v11, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3778; GFX8-NEXT:    v_sub_u16_e32 v12, 0x8000, v12
3779; GFX8-NEXT:    v_sub_u16_e32 v11, 0x7fff, v11
3780; GFX8-NEXT:    v_max_i16_sdwa v5, v12, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3781; GFX8-NEXT:    v_min_i16_e32 v12, 0, v2
3782; GFX8-NEXT:    v_min_i16_e32 v5, v5, v11
3783; GFX8-NEXT:    v_max_i16_e32 v11, 0, v2
3784; GFX8-NEXT:    v_sub_u16_e32 v12, 0x8000, v12
3785; GFX8-NEXT:    v_sub_u16_e32 v11, 0x7fff, v11
3786; GFX8-NEXT:    v_max_i16_e32 v12, v12, v6
3787; GFX8-NEXT:    v_min_i16_sdwa v13, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3788; GFX8-NEXT:    v_min_i16_e32 v11, v12, v11
3789; GFX8-NEXT:    v_max_i16_sdwa v12, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3790; GFX8-NEXT:    v_sub_u16_e32 v13, 0x8000, v13
3791; GFX8-NEXT:    v_sub_u16_e32 v12, 0x7fff, v12
3792; GFX8-NEXT:    v_max_i16_sdwa v6, v13, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3793; GFX8-NEXT:    v_min_i16_e32 v13, 0, v3
3794; GFX8-NEXT:    v_min_i16_e32 v6, v6, v12
3795; GFX8-NEXT:    v_max_i16_e32 v12, 0, v3
3796; GFX8-NEXT:    v_sub_u16_e32 v13, 0x8000, v13
3797; GFX8-NEXT:    v_sub_u16_e32 v12, 0x7fff, v12
3798; GFX8-NEXT:    v_max_i16_e32 v13, v13, v7
3799; GFX8-NEXT:    v_min_i16_e32 v12, v13, v12
3800; GFX8-NEXT:    v_max_i16_sdwa v13, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3801; GFX8-NEXT:    v_min_i16_sdwa v9, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3802; GFX8-NEXT:    v_sub_u16_e32 v9, 0x8000, v9
3803; GFX8-NEXT:    v_sub_u16_e32 v13, 0x7fff, v13
3804; GFX8-NEXT:    v_max_i16_sdwa v7, v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3805; GFX8-NEXT:    v_add_u16_e32 v8, v0, v8
3806; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3807; GFX8-NEXT:    v_add_u16_e32 v4, v1, v10
3808; GFX8-NEXT:    v_add_u16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3809; GFX8-NEXT:    v_min_i16_e32 v7, v7, v13
3810; GFX8-NEXT:    v_or_b32_e32 v1, v4, v1
3811; GFX8-NEXT:    v_add_u16_e32 v4, v2, v11
3812; GFX8-NEXT:    v_add_u16_sdwa v2, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3813; GFX8-NEXT:    v_or_b32_e32 v2, v4, v2
3814; GFX8-NEXT:    v_add_u16_e32 v4, v3, v12
3815; GFX8-NEXT:    v_add_u16_sdwa v3, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3816; GFX8-NEXT:    v_or_b32_e32 v0, v8, v0
3817; GFX8-NEXT:    v_or_b32_e32 v3, v4, v3
3818; GFX8-NEXT:    s_setpc_b64 s[30:31]
3819;
3820; GFX9-LABEL: v_saddsat_v8i16:
3821; GFX9:       ; %bb.0:
3822; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3823; GFX9-NEXT:    v_pk_add_i16 v0, v0, v4 clamp
3824; GFX9-NEXT:    v_pk_add_i16 v1, v1, v5 clamp
3825; GFX9-NEXT:    v_pk_add_i16 v2, v2, v6 clamp
3826; GFX9-NEXT:    v_pk_add_i16 v3, v3, v7 clamp
3827; GFX9-NEXT:    s_setpc_b64 s[30:31]
3828;
3829; GFX10PLUS-LABEL: v_saddsat_v8i16:
3830; GFX10PLUS:       ; %bb.0:
3831; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3832; GFX10PLUS-NEXT:    v_pk_add_i16 v0, v0, v4 clamp
3833; GFX10PLUS-NEXT:    v_pk_add_i16 v1, v1, v5 clamp
3834; GFX10PLUS-NEXT:    v_pk_add_i16 v2, v2, v6 clamp
3835; GFX10PLUS-NEXT:    v_pk_add_i16 v3, v3, v7 clamp
3836; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
; IR under test. The bitcast only changes the element type of the 128-bit
; result so that it is returned as <4 x float>.
3837  %result = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
3838  %cast = bitcast <8 x i16> %result to <4 x float>
3839  ret <4 x float> %cast
3840}
3841
; s_saddsat_v8i16 - llvm.sadd.sat on <8 x i16> in an amdgpu_ps function whose
; two operands are both marked inreg; the result is bitcast to <4 x i32>.
; Every check block takes its inputs from s-registers and leaves the result in
; s0-s3. The gfx900 checks copy the rhs registers into VGPRs before each
; "v_pk_add_i16 ... clamp"; the gfx1010/gfx1100 checks pass both s-register
; operands directly. Both read the sums back with v_readfirstlane_b32.
; The tahiti and fiji checks stay on scalar instructions (s_min_i32,
; s_max_i32, s_add_i32) and repack the halves with s_and_b32 / s_lshl_b32 /
; s_or_b32.
3842define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inreg %rhs) {
3843; GFX6-LABEL: s_saddsat_v8i16:
3844; GFX6:       ; %bb.0:
3845; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
3846; GFX6-NEXT:    s_min_i32 s17, s0, 0
3847; GFX6-NEXT:    s_lshl_b32 s8, s8, 16
3848; GFX6-NEXT:    s_max_i32 s16, s0, 0
3849; GFX6-NEXT:    s_sub_i32 s17, 0x80000000, s17
3850; GFX6-NEXT:    s_sub_i32 s16, 0x7fffffff, s16
3851; GFX6-NEXT:    s_max_i32 s8, s17, s8
3852; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
3853; GFX6-NEXT:    s_min_i32 s8, s8, s16
3854; GFX6-NEXT:    s_min_i32 s16, s1, 0
3855; GFX6-NEXT:    s_add_i32 s0, s0, s8
3856; GFX6-NEXT:    s_lshl_b32 s8, s9, 16
3857; GFX6-NEXT:    s_max_i32 s9, s1, 0
3858; GFX6-NEXT:    s_sub_i32 s16, 0x80000000, s16
3859; GFX6-NEXT:    s_sub_i32 s9, 0x7fffffff, s9
3860; GFX6-NEXT:    s_max_i32 s8, s16, s8
3861; GFX6-NEXT:    s_min_i32 s8, s8, s9
3862; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
3863; GFX6-NEXT:    s_add_i32 s1, s1, s8
3864; GFX6-NEXT:    s_lshl_b32 s8, s10, 16
3865; GFX6-NEXT:    s_min_i32 s10, s2, 0
3866; GFX6-NEXT:    s_max_i32 s9, s2, 0
3867; GFX6-NEXT:    s_sub_i32 s10, 0x80000000, s10
3868; GFX6-NEXT:    s_sub_i32 s9, 0x7fffffff, s9
3869; GFX6-NEXT:    s_max_i32 s8, s10, s8
3870; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
3871; GFX6-NEXT:    s_min_i32 s8, s8, s9
3872; GFX6-NEXT:    s_min_i32 s10, s3, 0
3873; GFX6-NEXT:    s_add_i32 s2, s2, s8
3874; GFX6-NEXT:    s_lshl_b32 s8, s11, 16
3875; GFX6-NEXT:    s_max_i32 s9, s3, 0
3876; GFX6-NEXT:    s_sub_i32 s10, 0x80000000, s10
3877; GFX6-NEXT:    s_sub_i32 s9, 0x7fffffff, s9
3878; GFX6-NEXT:    s_max_i32 s8, s10, s8
3879; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
3880; GFX6-NEXT:    s_min_i32 s8, s8, s9
3881; GFX6-NEXT:    s_min_i32 s10, s4, 0
3882; GFX6-NEXT:    s_add_i32 s3, s3, s8
3883; GFX6-NEXT:    s_lshl_b32 s8, s12, 16
3884; GFX6-NEXT:    s_max_i32 s9, s4, 0
3885; GFX6-NEXT:    s_sub_i32 s10, 0x80000000, s10
3886; GFX6-NEXT:    s_sub_i32 s9, 0x7fffffff, s9
3887; GFX6-NEXT:    s_max_i32 s8, s10, s8
3888; GFX6-NEXT:    s_lshl_b32 s5, s5, 16
3889; GFX6-NEXT:    s_min_i32 s8, s8, s9
3890; GFX6-NEXT:    s_min_i32 s10, s5, 0
3891; GFX6-NEXT:    s_add_i32 s4, s4, s8
3892; GFX6-NEXT:    s_lshl_b32 s8, s13, 16
3893; GFX6-NEXT:    s_max_i32 s9, s5, 0
3894; GFX6-NEXT:    s_sub_i32 s10, 0x80000000, s10
3895; GFX6-NEXT:    s_sub_i32 s9, 0x7fffffff, s9
3896; GFX6-NEXT:    s_max_i32 s8, s10, s8
3897; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
3898; GFX6-NEXT:    s_min_i32 s8, s8, s9
3899; GFX6-NEXT:    s_min_i32 s10, s6, 0
3900; GFX6-NEXT:    s_add_i32 s5, s5, s8
3901; GFX6-NEXT:    s_lshl_b32 s8, s14, 16
3902; GFX6-NEXT:    s_max_i32 s9, s6, 0
3903; GFX6-NEXT:    s_sub_i32 s10, 0x80000000, s10
3904; GFX6-NEXT:    s_sub_i32 s9, 0x7fffffff, s9
3905; GFX6-NEXT:    s_max_i32 s8, s10, s8
3906; GFX6-NEXT:    s_lshl_b32 s7, s7, 16
3907; GFX6-NEXT:    s_min_i32 s8, s8, s9
3908; GFX6-NEXT:    s_min_i32 s10, s7, 0
3909; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
3910; GFX6-NEXT:    s_add_i32 s6, s6, s8
3911; GFX6-NEXT:    s_lshl_b32 s8, s15, 16
3912; GFX6-NEXT:    s_max_i32 s9, s7, 0
3913; GFX6-NEXT:    s_sub_i32 s10, 0x80000000, s10
3914; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
3915; GFX6-NEXT:    s_sub_i32 s9, 0x7fffffff, s9
3916; GFX6-NEXT:    s_max_i32 s8, s10, s8
3917; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
3918; GFX6-NEXT:    s_ashr_i32 s2, s2, 16
3919; GFX6-NEXT:    s_ashr_i32 s3, s3, 16
3920; GFX6-NEXT:    s_min_i32 s8, s8, s9
3921; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
3922; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
3923; GFX6-NEXT:    s_ashr_i32 s5, s5, 16
3924; GFX6-NEXT:    s_add_i32 s7, s7, s8
3925; GFX6-NEXT:    s_or_b32 s0, s0, s1
3926; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
3927; GFX6-NEXT:    s_and_b32 s2, s3, 0xffff
3928; GFX6-NEXT:    s_ashr_i32 s4, s4, 16
3929; GFX6-NEXT:    s_ashr_i32 s7, s7, 16
3930; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
3931; GFX6-NEXT:    s_and_b32 s3, s5, 0xffff
3932; GFX6-NEXT:    s_ashr_i32 s6, s6, 16
3933; GFX6-NEXT:    s_or_b32 s1, s1, s2
3934; GFX6-NEXT:    s_and_b32 s2, s4, 0xffff
3935; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
3936; GFX6-NEXT:    s_and_b32 s4, s7, 0xffff
3937; GFX6-NEXT:    s_or_b32 s2, s2, s3
3938; GFX6-NEXT:    s_and_b32 s3, s6, 0xffff
3939; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
3940; GFX6-NEXT:    s_or_b32 s3, s3, s4
3941; GFX6-NEXT:    ; return to shader part epilog
3942;
3943; GFX8-LABEL: s_saddsat_v8i16:
3944; GFX8:       ; %bb.0:
3945; GFX8-NEXT:    s_sext_i32_i16 s16, s0
3946; GFX8-NEXT:    s_sext_i32_i16 s17, 0
3947; GFX8-NEXT:    s_max_i32 s18, s16, s17
3948; GFX8-NEXT:    s_min_i32 s16, s16, s17
3949; GFX8-NEXT:    s_sub_i32 s16, 0xffff8000, s16
3950; GFX8-NEXT:    s_lshr_b32 s12, s4, 16
3951; GFX8-NEXT:    s_sext_i32_i16 s16, s16
3952; GFX8-NEXT:    s_sext_i32_i16 s4, s4
3953; GFX8-NEXT:    s_sub_i32 s18, 0x7fff, s18
3954; GFX8-NEXT:    s_max_i32 s4, s16, s4
3955; GFX8-NEXT:    s_sext_i32_i16 s4, s4
3956; GFX8-NEXT:    s_sext_i32_i16 s16, s18
3957; GFX8-NEXT:    s_lshr_b32 s8, s0, 16
3958; GFX8-NEXT:    s_min_i32 s4, s4, s16
3959; GFX8-NEXT:    s_add_i32 s0, s0, s4
3960; GFX8-NEXT:    s_sext_i32_i16 s4, s8
3961; GFX8-NEXT:    s_max_i32 s16, s4, s17
3962; GFX8-NEXT:    s_min_i32 s4, s4, s17
3963; GFX8-NEXT:    s_sub_i32 s4, 0xffff8000, s4
3964; GFX8-NEXT:    s_sext_i32_i16 s4, s4
3965; GFX8-NEXT:    s_sext_i32_i16 s12, s12
3966; GFX8-NEXT:    s_sub_i32 s16, 0x7fff, s16
3967; GFX8-NEXT:    s_max_i32 s4, s4, s12
3968; GFX8-NEXT:    s_sext_i32_i16 s4, s4
3969; GFX8-NEXT:    s_sext_i32_i16 s12, s16
3970; GFX8-NEXT:    s_min_i32 s4, s4, s12
3971; GFX8-NEXT:    s_add_i32 s8, s8, s4
3972; GFX8-NEXT:    s_sext_i32_i16 s4, s1
3973; GFX8-NEXT:    s_max_i32 s12, s4, s17
3974; GFX8-NEXT:    s_min_i32 s4, s4, s17
3975; GFX8-NEXT:    s_sub_i32 s4, 0xffff8000, s4
3976; GFX8-NEXT:    s_lshr_b32 s13, s5, 16
3977; GFX8-NEXT:    s_sext_i32_i16 s4, s4
3978; GFX8-NEXT:    s_sext_i32_i16 s5, s5
3979; GFX8-NEXT:    s_sub_i32 s12, 0x7fff, s12
3980; GFX8-NEXT:    s_max_i32 s4, s4, s5
3981; GFX8-NEXT:    s_sext_i32_i16 s4, s4
3982; GFX8-NEXT:    s_sext_i32_i16 s5, s12
3983; GFX8-NEXT:    s_lshr_b32 s9, s1, 16
3984; GFX8-NEXT:    s_min_i32 s4, s4, s5
3985; GFX8-NEXT:    s_add_i32 s1, s1, s4
3986; GFX8-NEXT:    s_sext_i32_i16 s4, s9
3987; GFX8-NEXT:    s_max_i32 s5, s4, s17
3988; GFX8-NEXT:    s_min_i32 s4, s4, s17
3989; GFX8-NEXT:    s_sub_i32 s4, 0xffff8000, s4
3990; GFX8-NEXT:    s_sext_i32_i16 s4, s4
3991; GFX8-NEXT:    s_sext_i32_i16 s12, s13
3992; GFX8-NEXT:    s_sub_i32 s5, 0x7fff, s5
3993; GFX8-NEXT:    s_max_i32 s4, s4, s12
3994; GFX8-NEXT:    s_sext_i32_i16 s4, s4
3995; GFX8-NEXT:    s_sext_i32_i16 s5, s5
3996; GFX8-NEXT:    s_min_i32 s4, s4, s5
3997; GFX8-NEXT:    s_add_i32 s9, s9, s4
3998; GFX8-NEXT:    s_sext_i32_i16 s4, s2
3999; GFX8-NEXT:    s_max_i32 s5, s4, s17
4000; GFX8-NEXT:    s_min_i32 s4, s4, s17
4001; GFX8-NEXT:    s_sub_i32 s4, 0xffff8000, s4
4002; GFX8-NEXT:    s_lshr_b32 s14, s6, 16
4003; GFX8-NEXT:    s_sext_i32_i16 s4, s4
4004; GFX8-NEXT:    s_sext_i32_i16 s6, s6
4005; GFX8-NEXT:    s_sub_i32 s5, 0x7fff, s5
4006; GFX8-NEXT:    s_max_i32 s4, s4, s6
4007; GFX8-NEXT:    s_sext_i32_i16 s4, s4
4008; GFX8-NEXT:    s_sext_i32_i16 s5, s5
4009; GFX8-NEXT:    s_lshr_b32 s10, s2, 16
4010; GFX8-NEXT:    s_min_i32 s4, s4, s5
4011; GFX8-NEXT:    s_add_i32 s2, s2, s4
4012; GFX8-NEXT:    s_sext_i32_i16 s4, s10
4013; GFX8-NEXT:    s_max_i32 s5, s4, s17
4014; GFX8-NEXT:    s_min_i32 s4, s4, s17
4015; GFX8-NEXT:    s_sub_i32 s4, 0xffff8000, s4
4016; GFX8-NEXT:    s_sext_i32_i16 s4, s4
4017; GFX8-NEXT:    s_sext_i32_i16 s6, s14
4018; GFX8-NEXT:    s_sub_i32 s5, 0x7fff, s5
4019; GFX8-NEXT:    s_max_i32 s4, s4, s6
4020; GFX8-NEXT:    s_sext_i32_i16 s4, s4
4021; GFX8-NEXT:    s_sext_i32_i16 s5, s5
4022; GFX8-NEXT:    s_min_i32 s4, s4, s5
4023; GFX8-NEXT:    s_add_i32 s10, s10, s4
4024; GFX8-NEXT:    s_sext_i32_i16 s4, s3
4025; GFX8-NEXT:    s_max_i32 s5, s4, s17
4026; GFX8-NEXT:    s_min_i32 s4, s4, s17
4027; GFX8-NEXT:    s_sub_i32 s4, 0xffff8000, s4
4028; GFX8-NEXT:    s_sext_i32_i16 s4, s4
4029; GFX8-NEXT:    s_sext_i32_i16 s6, s7
4030; GFX8-NEXT:    s_sub_i32 s5, 0x7fff, s5
4031; GFX8-NEXT:    s_max_i32 s4, s4, s6
4032; GFX8-NEXT:    s_sext_i32_i16 s4, s4
4033; GFX8-NEXT:    s_sext_i32_i16 s5, s5
4034; GFX8-NEXT:    s_lshr_b32 s11, s3, 16
4035; GFX8-NEXT:    s_min_i32 s4, s4, s5
4036; GFX8-NEXT:    s_add_i32 s3, s3, s4
4037; GFX8-NEXT:    s_sext_i32_i16 s4, s11
4038; GFX8-NEXT:    s_max_i32 s5, s4, s17
4039; GFX8-NEXT:    s_min_i32 s4, s4, s17
4040; GFX8-NEXT:    s_lshr_b32 s15, s7, 16
4041; GFX8-NEXT:    s_sub_i32 s4, 0xffff8000, s4
4042; GFX8-NEXT:    s_sext_i32_i16 s4, s4
4043; GFX8-NEXT:    s_sext_i32_i16 s6, s15
4044; GFX8-NEXT:    s_sub_i32 s5, 0x7fff, s5
4045; GFX8-NEXT:    s_max_i32 s4, s4, s6
4046; GFX8-NEXT:    s_sext_i32_i16 s4, s4
4047; GFX8-NEXT:    s_sext_i32_i16 s5, s5
4048; GFX8-NEXT:    s_min_i32 s4, s4, s5
4049; GFX8-NEXT:    s_add_i32 s11, s11, s4
4050; GFX8-NEXT:    s_and_b32 s4, 0xffff, s8
4051; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
4052; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
4053; GFX8-NEXT:    s_or_b32 s0, s0, s4
4054; GFX8-NEXT:    s_and_b32 s4, 0xffff, s9
4055; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
4056; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
4057; GFX8-NEXT:    s_or_b32 s1, s1, s4
4058; GFX8-NEXT:    s_and_b32 s4, 0xffff, s10
4059; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
4060; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
4061; GFX8-NEXT:    s_or_b32 s2, s2, s4
4062; GFX8-NEXT:    s_and_b32 s4, 0xffff, s11
4063; GFX8-NEXT:    s_and_b32 s3, 0xffff, s3
4064; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
4065; GFX8-NEXT:    s_or_b32 s3, s3, s4
4066; GFX8-NEXT:    ; return to shader part epilog
4067;
4068; GFX9-LABEL: s_saddsat_v8i16:
4069; GFX9:       ; %bb.0:
4070; GFX9-NEXT:    v_mov_b32_e32 v0, s4
4071; GFX9-NEXT:    v_mov_b32_e32 v1, s5
4072; GFX9-NEXT:    v_mov_b32_e32 v2, s6
4073; GFX9-NEXT:    v_mov_b32_e32 v3, s7
4074; GFX9-NEXT:    v_pk_add_i16 v0, s0, v0 clamp
4075; GFX9-NEXT:    v_pk_add_i16 v1, s1, v1 clamp
4076; GFX9-NEXT:    v_pk_add_i16 v2, s2, v2 clamp
4077; GFX9-NEXT:    v_pk_add_i16 v3, s3, v3 clamp
4078; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
4079; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
4080; GFX9-NEXT:    v_readfirstlane_b32 s2, v2
4081; GFX9-NEXT:    v_readfirstlane_b32 s3, v3
4082; GFX9-NEXT:    ; return to shader part epilog
4083;
4084; GFX10PLUS-LABEL: s_saddsat_v8i16:
4085; GFX10PLUS:       ; %bb.0:
4086; GFX10PLUS-NEXT:    v_pk_add_i16 v0, s0, s4 clamp
4087; GFX10PLUS-NEXT:    v_pk_add_i16 v1, s1, s5 clamp
4088; GFX10PLUS-NEXT:    v_pk_add_i16 v2, s2, s6 clamp
4089; GFX10PLUS-NEXT:    v_pk_add_i16 v3, s3, s7 clamp
4090; GFX10PLUS-NEXT:    v_readfirstlane_b32 s0, v0
4091; GFX10PLUS-NEXT:    v_readfirstlane_b32 s1, v1
4092; GFX10PLUS-NEXT:    v_readfirstlane_b32 s2, v2
4093; GFX10PLUS-NEXT:    v_readfirstlane_b32 s3, v3
4094; GFX10PLUS-NEXT:    ; return to shader part epilog
; IR under test. The bitcast only changes the element type of the 128-bit
; result so that it is returned as <4 x i32>.
4095  %result = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
4096  %cast = bitcast <8 x i16> %result to <4 x i32>
4097  ret <4 x i32> %cast
4098}
4099
; v_saddsat_i48 - llvm.sadd.sat on i48 (a width that is not a multiple of 32)
; with ordinary (non-inreg) arguments.
; The gfx900/gfx1010/gfx1100 checks shift both operands left by 16, add them as
; a 64-bit pair, choose between the sum and the saturation value with
; v_cndmask_b32, and finish with a 16-bit v_ashrrev_i64.
; The tahiti/fiji checks add the unshifted dwords and build the operands of the
; 64-bit compares with "v_bfe_i32 ..., 0, 16".
; NOTE(review): in the tahiti/fiji checks each v_bfe_i32 reads a low-dword
; register (v4, v0, v2) rather than the matching high-dword register
; (v6, v1, v3) - confirm this is the intended codegen. These assertions are
; autogenerated; regenerate with utils/update_llc_test_checks.py instead of
; editing them by hand.
4100define i48 @v_saddsat_i48(i48 %lhs, i48 %rhs) {
4101; GFX6-LABEL: v_saddsat_i48:
4102; GFX6:       ; %bb.0:
4103; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4104; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v0, v2
4105; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, v1, v3, vcc
4106; GFX6-NEXT:    v_bfe_i32 v5, v4, 0, 16
4107; GFX6-NEXT:    v_bfe_i32 v1, v0, 0, 16
4108; GFX6-NEXT:    v_bfe_i32 v3, v2, 0, 16
4109; GFX6-NEXT:    v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1]
4110; GFX6-NEXT:    v_cmp_gt_i64_e64 s[6:7], 0, v[2:3]
4111; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v5
4112; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 0xffff8000, v0
4113; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 15, v5
4114; GFX6-NEXT:    s_xor_b64 vcc, s[6:7], s[4:5]
4115; GFX6-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
4116; GFX6-NEXT:    v_cndmask_b32_e32 v1, v6, v2, vcc
4117; GFX6-NEXT:    s_setpc_b64 s[30:31]
4118;
4119; GFX8-LABEL: v_saddsat_i48:
4120; GFX8:       ; %bb.0:
4121; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4122; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v0, v2
4123; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, v1, v3, vcc
4124; GFX8-NEXT:    v_bfe_i32 v5, v4, 0, 16
4125; GFX8-NEXT:    v_bfe_i32 v1, v0, 0, 16
4126; GFX8-NEXT:    v_bfe_i32 v3, v2, 0, 16
4127; GFX8-NEXT:    v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1]
4128; GFX8-NEXT:    v_cmp_gt_i64_e64 s[6:7], 0, v[2:3]
4129; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v5
4130; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0xffff8000, v0
4131; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 15, v5
4132; GFX8-NEXT:    s_xor_b64 vcc, s[6:7], s[4:5]
4133; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
4134; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v2, vcc
4135; GFX8-NEXT:    s_setpc_b64 s[30:31]
4136;
4137; GFX9-LABEL: v_saddsat_i48:
4138; GFX9:       ; %bb.0:
4139; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4140; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
4141; GFX9-NEXT:    v_lshlrev_b64 v[2:3], 16, v[2:3]
4142; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v0, v2
4143; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v1, v3, vcc
4144; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
4145; GFX9-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
4146; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v5
4147; GFX9-NEXT:    v_add_u32_e32 v1, 0x80000000, v0
4148; GFX9-NEXT:    s_xor_b64 vcc, s[4:5], vcc
4149; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
4150; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
4151; GFX9-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
4152; GFX9-NEXT:    s_setpc_b64 s[30:31]
4153;
4154; GFX10-LABEL: v_saddsat_i48:
4155; GFX10:       ; %bb.0:
4156; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4157; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
4158; GFX10-NEXT:    v_lshlrev_b64 v[2:3], 16, v[2:3]
4159; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v0, v2
4160; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
4161; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3]
4162; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
4163; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, v[4:5], v[0:1]
4164; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v6
4165; GFX10-NEXT:    s_xor_b32 vcc_lo, vcc_lo, s4
4166; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc_lo
4167; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
4168; GFX10-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
4169; GFX10-NEXT:    s_setpc_b64 s[30:31]
4170;
4171; GFX11-LABEL: v_saddsat_i48:
4172; GFX11:       ; %bb.0:
4173; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4174; GFX11-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
4175; GFX11-NEXT:    v_lshlrev_b64 v[2:3], 16, v[2:3]
4176; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v0, v2
4177; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
4178; GFX11-NEXT:    v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3]
4179; GFX11-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
4180; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, v[4:5], v[0:1]
4181; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v6
4182; GFX11-NEXT:    s_xor_b32 vcc_lo, vcc_lo, s0
4183; GFX11-NEXT:    v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1
4184; GFX11-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
4185; GFX11-NEXT:    s_setpc_b64 s[30:31]
; IR under test: a single saturating add, returned as-is.
4186  %result = call i48 @llvm.sadd.sat.i48(i48 %lhs, i48 %rhs)
4187  ret i48 %result
4188}
4189
; s_saddsat_i48 - llvm.sadd.sat on i48 in an amdgpu_ps function whose two
; operands are both marked inreg. Every check block takes its inputs from
; s0-s3 and returns the result in s0/s1 through v_readfirstlane_b32.
; The gfx900/gfx1010/gfx1100 checks shift both operands left by 16 with
; s_lshl_b64, add with s_add_u32/s_addc_u32, select the saturation value with
; v_cndmask_b32, and shift back with a 16-bit v_ashrrev_i64.
; The tahiti/fiji checks add the unshifted dwords and extend with
; "s_bfe_i64 ..., 0x300000" before the 64-bit compares.
; NOTE(review): in the tahiti/fiji checks the carry sum is written to s3, yet
; s_bfe_i64 afterwards reads s[4:5] and s[2:3]; s5 has not been written before
; that read and s3 no longer holds the incoming rhs high half - confirm this is
; the intended codegen. These assertions are autogenerated; regenerate with
; utils/update_llc_test_checks.py instead of editing them by hand.
4190define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
4191; GFX6-LABEL: s_saddsat_i48:
4192; GFX6:       ; %bb.0:
4193; GFX6-NEXT:    s_add_u32 s4, s0, s2
4194; GFX6-NEXT:    s_addc_u32 s3, s1, s3
4195; GFX6-NEXT:    s_bfe_i64 s[0:1], s[0:1], 0x300000
4196; GFX6-NEXT:    v_mov_b32_e32 v0, s0
4197; GFX6-NEXT:    s_bfe_i64 s[6:7], s[4:5], 0x300000
4198; GFX6-NEXT:    v_mov_b32_e32 v1, s1
4199; GFX6-NEXT:    s_bfe_i64 s[0:1], s[2:3], 0x300000
4200; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1]
4201; GFX6-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[0:1], 0
4202; GFX6-NEXT:    s_ashr_i32 s2, s7, 31
4203; GFX6-NEXT:    s_ashr_i32 s5, s7, 15
4204; GFX6-NEXT:    s_addk_i32 s2, 0x8000
4205; GFX6-NEXT:    v_mov_b32_e32 v0, s5
4206; GFX6-NEXT:    v_mov_b32_e32 v1, s2
4207; GFX6-NEXT:    v_mov_b32_e32 v2, s4
4208; GFX6-NEXT:    v_mov_b32_e32 v3, s3
4209; GFX6-NEXT:    s_xor_b64 vcc, s[0:1], vcc
4210; GFX6-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
4211; GFX6-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
4212; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
4213; GFX6-NEXT:    v_readfirstlane_b32 s1, v1
4214; GFX6-NEXT:    ; return to shader part epilog
4215;
4216; GFX8-LABEL: s_saddsat_i48:
4217; GFX8:       ; %bb.0:
4218; GFX8-NEXT:    s_add_u32 s4, s0, s2
4219; GFX8-NEXT:    s_addc_u32 s3, s1, s3
4220; GFX8-NEXT:    s_bfe_i64 s[0:1], s[0:1], 0x300000
4221; GFX8-NEXT:    v_mov_b32_e32 v0, s0
4222; GFX8-NEXT:    s_bfe_i64 s[6:7], s[4:5], 0x300000
4223; GFX8-NEXT:    v_mov_b32_e32 v1, s1
4224; GFX8-NEXT:    s_bfe_i64 s[0:1], s[2:3], 0x300000
4225; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1]
4226; GFX8-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[0:1], 0
4227; GFX8-NEXT:    s_ashr_i32 s2, s7, 31
4228; GFX8-NEXT:    s_ashr_i32 s5, s7, 15
4229; GFX8-NEXT:    s_addk_i32 s2, 0x8000
4230; GFX8-NEXT:    v_mov_b32_e32 v0, s5
4231; GFX8-NEXT:    v_mov_b32_e32 v1, s2
4232; GFX8-NEXT:    v_mov_b32_e32 v2, s4
4233; GFX8-NEXT:    v_mov_b32_e32 v3, s3
4234; GFX8-NEXT:    s_xor_b64 vcc, s[0:1], vcc
4235; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
4236; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
4237; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
4238; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
4239; GFX8-NEXT:    ; return to shader part epilog
4240;
4241; GFX9-LABEL: s_saddsat_i48:
4242; GFX9:       ; %bb.0:
4243; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
4244; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 16
4245; GFX9-NEXT:    s_add_u32 s4, s0, s2
4246; GFX9-NEXT:    v_mov_b32_e32 v0, s0
4247; GFX9-NEXT:    s_addc_u32 s5, s1, s3
4248; GFX9-NEXT:    v_mov_b32_e32 v1, s1
4249; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
4250; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
4251; GFX9-NEXT:    s_ashr_i32 s2, s5, 31
4252; GFX9-NEXT:    s_add_i32 s3, s2, 0x80000000
4253; GFX9-NEXT:    v_mov_b32_e32 v0, s2
4254; GFX9-NEXT:    v_mov_b32_e32 v1, s3
4255; GFX9-NEXT:    v_mov_b32_e32 v2, s4
4256; GFX9-NEXT:    v_mov_b32_e32 v3, s5
4257; GFX9-NEXT:    s_xor_b64 vcc, s[0:1], vcc
4258; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
4259; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
4260; GFX9-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
4261; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
4262; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
4263; GFX9-NEXT:    ; return to shader part epilog
4264;
4265; GFX10-LABEL: s_saddsat_i48:
4266; GFX10:       ; %bb.0:
4267; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
4268; GFX10-NEXT:    s_lshl_b64 s[2:3], s[2:3], 16
4269; GFX10-NEXT:    s_add_u32 s4, s0, s2
4270; GFX10-NEXT:    s_addc_u32 s5, s1, s3
4271; GFX10-NEXT:    v_mov_b32_e32 v0, s4
4272; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, s[4:5], s[0:1]
4273; GFX10-NEXT:    v_cmp_lt_i64_e64 s1, s[2:3], 0
4274; GFX10-NEXT:    v_mov_b32_e32 v1, s5
4275; GFX10-NEXT:    s_ashr_i32 s2, s5, 31
4276; GFX10-NEXT:    s_add_i32 s3, s2, 0x80000000
4277; GFX10-NEXT:    s_xor_b32 s0, s1, s0
4278; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s2, s0
4279; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s3, s0
4280; GFX10-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
4281; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
4282; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
4283; GFX10-NEXT:    ; return to shader part epilog
4284;
4285; GFX11-LABEL: s_saddsat_i48:
4286; GFX11:       ; %bb.0:
4287; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
4288; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 16
4289; GFX11-NEXT:    s_add_u32 s4, s0, s2
4290; GFX11-NEXT:    s_addc_u32 s5, s1, s3
4291; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
4292; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, s[4:5], s[0:1]
4293; GFX11-NEXT:    v_cmp_lt_i64_e64 s1, s[2:3], 0
4294; GFX11-NEXT:    s_ashr_i32 s2, s5, 31
4295; GFX11-NEXT:    s_add_i32 s3, s2, 0x80000000
4296; GFX11-NEXT:    s_xor_b32 s0, s1, s0
4297; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, s2, s0
4298; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s3, s0
4299; GFX11-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
4300; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
4301; GFX11-NEXT:    v_readfirstlane_b32 s1, v1
4302; GFX11-NEXT:    ; return to shader part epilog
; IR under test: a single saturating add, returned as-is.
4303  %result = call i48 @llvm.sadd.sat.i48(i48 %lhs, i48 %rhs)
4304  ret i48 %result
4305}
4306
; Signed saturating add on i48 where %lhs is marked inreg and %rhs is not.
; The i48 result is zero-extended to i64 and returned bitcast to <2 x float>.
; In the expected output %lhs is read from s[0:1] and %rhs from v[0:1].
; The GFX6 and GFX8 checks use 16-bit signed bitfield extracts (v_bfe_i32 and
; s_bfe_i64 with 0x300000) ahead of the 64-bit compares; the GFX9, GFX10 and
; GFX11 checks instead shift both operands left by 16, do the compare/select
; on the shifted values, and shift back with v_ashrrev_i64.
; The final v_and_b32 with 0xffff presumably implements the zext - confirm.
; NOTE(review): in the GFX6 and GFX8 checks the v_bfe_i32 sources are v2 and
; v0 (low words) rather than v4 and v1 (high words) - confirm this against
; current llc output when regenerating.
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them instead of editing by hand.
4307define amdgpu_ps <2 x float> @saddsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
4308; GFX6-LABEL: saddsat_i48_sv:
4309; GFX6:       ; %bb.0:
4310; GFX6-NEXT:    v_mov_b32_e32 v3, s1
4311; GFX6-NEXT:    v_add_i32_e32 v2, vcc, s0, v0
4312; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v3, v1, vcc
4313; GFX6-NEXT:    v_bfe_i32 v3, v2, 0, 16
4314; GFX6-NEXT:    s_bfe_i64 s[0:1], s[0:1], 0x300000
4315; GFX6-NEXT:    v_bfe_i32 v1, v0, 0, 16
4316; GFX6-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3]
4317; GFX6-NEXT:    v_cmp_gt_i64_e64 s[2:3], 0, v[0:1]
4318; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
4319; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 15, v3
4320; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 0xffff8000, v0
4321; GFX6-NEXT:    s_xor_b64 vcc, s[2:3], s[0:1]
4322; GFX6-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
4323; GFX6-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
4324; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
4325; GFX6-NEXT:    ; return to shader part epilog
4326;
4327; GFX8-LABEL: saddsat_i48_sv:
4328; GFX8:       ; %bb.0:
4329; GFX8-NEXT:    v_mov_b32_e32 v3, s1
4330; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v0
4331; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v3, v1, vcc
4332; GFX8-NEXT:    v_bfe_i32 v3, v2, 0, 16
4333; GFX8-NEXT:    s_bfe_i64 s[0:1], s[0:1], 0x300000
4334; GFX8-NEXT:    v_bfe_i32 v1, v0, 0, 16
4335; GFX8-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3]
4336; GFX8-NEXT:    v_cmp_gt_i64_e64 s[2:3], 0, v[0:1]
4337; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
4338; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 15, v3
4339; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xffff8000, v0
4340; GFX8-NEXT:    s_xor_b64 vcc, s[2:3], s[0:1]
4341; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
4342; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
4343; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v1
4344; GFX8-NEXT:    ; return to shader part epilog
4345;
4346; GFX9-LABEL: saddsat_i48_sv:
4347; GFX9:       ; %bb.0:
4348; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
4349; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
4350; GFX9-NEXT:    v_mov_b32_e32 v3, s1
4351; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v0
4352; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v1, vcc
4353; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3]
4354; GFX9-NEXT:    v_cmp_gt_i64_e64 s[0:1], 0, v[0:1]
4355; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
4356; GFX9-NEXT:    v_add_u32_e32 v1, 0x80000000, v0
4357; GFX9-NEXT:    s_xor_b64 vcc, s[0:1], vcc
4358; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
4359; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
4360; GFX9-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
4361; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
4362; GFX9-NEXT:    ; return to shader part epilog
4363;
4364; GFX10-LABEL: saddsat_i48_sv:
4365; GFX10:       ; %bb.0:
4366; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
4367; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
4368; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, s0, v0
4369; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
4370; GFX10-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
4371; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
4372; GFX10-NEXT:    v_cmp_gt_i64_e64 s0, 0, v[0:1]
4373; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v4
4374; GFX10-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
4375; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc_lo
4376; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
4377; GFX10-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
4378; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
4379; GFX10-NEXT:    ; return to shader part epilog
4380;
4381; GFX11-LABEL: saddsat_i48_sv:
4382; GFX11:       ; %bb.0:
4383; GFX11-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
4384; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
4385; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, s0, v0
4386; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
4387; GFX11-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
4388; GFX11-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
4389; GFX11-NEXT:    v_cmp_gt_i64_e64 s0, 0, v[0:1]
4390; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v4
4391; GFX11-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
4392; GFX11-NEXT:    v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1
4393; GFX11-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
4394; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
4395; GFX11-NEXT:    ; return to shader part epilog
4396  %result = call i48 @llvm.sadd.sat.i48(i48 %lhs, i48 %rhs)
4397  %ext.result = zext i48 %result to i64
4398  %cast = bitcast i64 %ext.result to <2 x float>
4399  ret <2 x float> %cast
4400}
4401
; Signed saturating add on i48 where %rhs is marked inreg and %lhs is not
; (the operand roles are swapped relative to the _sv variant's signature).
; The i48 result is zero-extended to i64 and returned bitcast to <2 x float>.
; In the expected output %lhs is read from v[0:1] and %rhs from s[0:1].
; The GFX6 and GFX8 checks use 16-bit signed bitfield extracts (v_bfe_i32 and
; s_bfe_i64 with 0x300000) ahead of the 64-bit compares; the GFX9, GFX10 and
; GFX11 checks instead shift both operands left by 16, do the compare/select
; on the shifted values, and shift back with v_ashrrev_i64.
; The final v_and_b32 with 0xffff presumably implements the zext - confirm.
; NOTE(review): in the GFX6 and GFX8 checks the v_bfe_i32 sources are v2 and
; v0 (low words) rather than v4 and v1 (high words) - confirm this against
; current llc output when regenerating.
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them instead of editing by hand.
4402define amdgpu_ps <2 x float> @saddsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
4403; GFX6-LABEL: saddsat_i48_vs:
4404; GFX6:       ; %bb.0:
4405; GFX6-NEXT:    v_mov_b32_e32 v3, s1
4406; GFX6-NEXT:    v_add_i32_e32 v2, vcc, s0, v0
4407; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v1, v3, vcc
4408; GFX6-NEXT:    v_bfe_i32 v3, v2, 0, 16
4409; GFX6-NEXT:    v_bfe_i32 v1, v0, 0, 16
4410; GFX6-NEXT:    s_bfe_i64 s[0:1], s[0:1], 0x300000
4411; GFX6-NEXT:    v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1]
4412; GFX6-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[0:1], 0
4413; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
4414; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 15, v3
4415; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 0xffff8000, v0
4416; GFX6-NEXT:    s_xor_b64 vcc, s[0:1], s[2:3]
4417; GFX6-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
4418; GFX6-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
4419; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
4420; GFX6-NEXT:    ; return to shader part epilog
4421;
4422; GFX8-LABEL: saddsat_i48_vs:
4423; GFX8:       ; %bb.0:
4424; GFX8-NEXT:    v_mov_b32_e32 v3, s1
4425; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v0
4426; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v1, v3, vcc
4427; GFX8-NEXT:    v_bfe_i32 v3, v2, 0, 16
4428; GFX8-NEXT:    v_bfe_i32 v1, v0, 0, 16
4429; GFX8-NEXT:    s_bfe_i64 s[0:1], s[0:1], 0x300000
4430; GFX8-NEXT:    v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1]
4431; GFX8-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[0:1], 0
4432; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
4433; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 15, v3
4434; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xffff8000, v0
4435; GFX8-NEXT:    s_xor_b64 vcc, s[0:1], s[2:3]
4436; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
4437; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
4438; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v1
4439; GFX8-NEXT:    ; return to shader part epilog
4440;
4441; GFX9-LABEL: saddsat_i48_vs:
4442; GFX9:       ; %bb.0:
4443; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
4444; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
4445; GFX9-NEXT:    v_mov_b32_e32 v3, s1
4446; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v0
4447; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
4448; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1]
4449; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[0:1], 0
4450; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
4451; GFX9-NEXT:    v_add_u32_e32 v1, 0x80000000, v0
4452; GFX9-NEXT:    s_xor_b64 vcc, s[0:1], vcc
4453; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
4454; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
4455; GFX9-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
4456; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
4457; GFX9-NEXT:    ; return to shader part epilog
4458;
4459; GFX10-LABEL: saddsat_i48_vs:
4460; GFX10:       ; %bb.0:
4461; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
4462; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
4463; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v0, s0
4464; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
4465; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, s[0:1], 0
4466; GFX10-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
4467; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
4468; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v4
4469; GFX10-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
4470; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc_lo
4471; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
4472; GFX10-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
4473; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
4474; GFX10-NEXT:    ; return to shader part epilog
4475;
4476; GFX11-LABEL: saddsat_i48_vs:
4477; GFX11:       ; %bb.0:
4478; GFX11-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
4479; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
4480; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v0, s0
4481; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
4482; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, s[0:1], 0
4483; GFX11-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
4484; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
4485; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v4
4486; GFX11-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
4487; GFX11-NEXT:    v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1
4488; GFX11-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
4489; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
4490; GFX11-NEXT:    ; return to shader part epilog
4491  %result = call i48 @llvm.sadd.sat.i48(i48 %lhs, i48 %rhs)
4492  %ext.result = zext i48 %result to i64
4493  %cast = bitcast i64 %ext.result to <2 x float>
4494  ret <2 x float> %cast
4495}
4496
; Signed saturating add on i64 with both operands as plain (non-inreg)
; arguments and no explicit calling convention; returns the i64 directly.
; In the expected output %lhs is read from v[0:1] and %rhs from v[2:3]. Every
; target adds with carry, then uses the XOR of two signed 64-bit compares
; (sum < lhs, and 0 > rhs) to select between the raw sum and a replacement
; value whose low word is the sum's high word shifted right arithmetically by
; 31 and whose high word is that value plus 0x80000000.
; Each sequence ends with s_setpc_b64 s[30:31].
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them instead of editing by hand.
4497define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
4498; GFX6-LABEL: v_saddsat_i64:
4499; GFX6:       ; %bb.0:
4500; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4501; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v0, v2
4502; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v1, v3, vcc
4503; GFX6-NEXT:    v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1]
4504; GFX6-NEXT:    v_cmp_gt_i64_e64 s[6:7], 0, v[2:3]
4505; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v5
4506; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 0x80000000, v0
4507; GFX6-NEXT:    s_xor_b64 vcc, s[6:7], s[4:5]
4508; GFX6-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
4509; GFX6-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
4510; GFX6-NEXT:    s_setpc_b64 s[30:31]
4511;
4512; GFX8-LABEL: v_saddsat_i64:
4513; GFX8:       ; %bb.0:
4514; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4515; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v0, v2
4516; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v1, v3, vcc
4517; GFX8-NEXT:    v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1]
4518; GFX8-NEXT:    v_cmp_gt_i64_e64 s[6:7], 0, v[2:3]
4519; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v5
4520; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x80000000, v0
4521; GFX8-NEXT:    s_xor_b64 vcc, s[6:7], s[4:5]
4522; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
4523; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
4524; GFX8-NEXT:    s_setpc_b64 s[30:31]
4525;
4526; GFX9-LABEL: v_saddsat_i64:
4527; GFX9:       ; %bb.0:
4528; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4529; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v0, v2
4530; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v1, v3, vcc
4531; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
4532; GFX9-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
4533; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v5
4534; GFX9-NEXT:    v_add_u32_e32 v1, 0x80000000, v0
4535; GFX9-NEXT:    s_xor_b64 vcc, s[4:5], vcc
4536; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
4537; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
4538; GFX9-NEXT:    s_setpc_b64 s[30:31]
4539;
4540; GFX10-LABEL: v_saddsat_i64:
4541; GFX10:       ; %bb.0:
4542; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4543; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v0, v2
4544; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
4545; GFX10-NEXT:    v_cmp_gt_i64_e64 s4, 0, v[2:3]
4546; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
4547; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
4548; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v6
4549; GFX10-NEXT:    s_xor_b32 vcc_lo, s4, vcc_lo
4550; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc_lo
4551; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
4552; GFX10-NEXT:    s_setpc_b64 s[30:31]
4553;
4554; GFX11-LABEL: v_saddsat_i64:
4555; GFX11:       ; %bb.0:
4556; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4557; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v0, v2
4558; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
4559; GFX11-NEXT:    v_cmp_gt_i64_e64 s0, 0, v[2:3]
4560; GFX11-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
4561; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
4562; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v6
4563; GFX11-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
4564; GFX11-NEXT:    v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1
4565; GFX11-NEXT:    s_setpc_b64 s[30:31]
4566  %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs)
4567  ret i64 %result
4568}
4569
; Signed saturating add on i64 with both operands marked inreg, using the
; amdgpu_ps calling convention; returns the i64 directly.
; In the expected output %lhs is read from s[0:1] and %rhs from s[2:3], and the
; add itself is scalar (s_add_u32 + s_addc_u32). The overflow compares and the
; final select are still vector instructions (v_cmp_lt_i64 / v_cndmask_b32),
; so the selected halves are moved back to s0/s1 with v_readfirstlane_b32.
; The GFX6, GFX8 and GFX9 checks copy the scalar operands into VGPRs first;
; the GFX10 and GFX11 checks pass SGPRs straight to the e64 encodings.
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them instead of editing by hand.
4570define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
4571; GFX6-LABEL: s_saddsat_i64:
4572; GFX6:       ; %bb.0:
4573; GFX6-NEXT:    s_add_u32 s4, s0, s2
4574; GFX6-NEXT:    v_mov_b32_e32 v0, s0
4575; GFX6-NEXT:    s_addc_u32 s5, s1, s3
4576; GFX6-NEXT:    v_mov_b32_e32 v1, s1
4577; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
4578; GFX6-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
4579; GFX6-NEXT:    s_ashr_i32 s2, s5, 31
4580; GFX6-NEXT:    s_add_i32 s3, s2, 0x80000000
4581; GFX6-NEXT:    v_mov_b32_e32 v0, s2
4582; GFX6-NEXT:    v_mov_b32_e32 v1, s3
4583; GFX6-NEXT:    v_mov_b32_e32 v2, s4
4584; GFX6-NEXT:    v_mov_b32_e32 v3, s5
4585; GFX6-NEXT:    s_xor_b64 vcc, s[0:1], vcc
4586; GFX6-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
4587; GFX6-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
4588; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
4589; GFX6-NEXT:    v_readfirstlane_b32 s1, v1
4590; GFX6-NEXT:    ; return to shader part epilog
4591;
4592; GFX8-LABEL: s_saddsat_i64:
4593; GFX8:       ; %bb.0:
4594; GFX8-NEXT:    s_add_u32 s4, s0, s2
4595; GFX8-NEXT:    v_mov_b32_e32 v0, s0
4596; GFX8-NEXT:    s_addc_u32 s5, s1, s3
4597; GFX8-NEXT:    v_mov_b32_e32 v1, s1
4598; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
4599; GFX8-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
4600; GFX8-NEXT:    s_ashr_i32 s2, s5, 31
4601; GFX8-NEXT:    s_add_i32 s3, s2, 0x80000000
4602; GFX8-NEXT:    v_mov_b32_e32 v0, s2
4603; GFX8-NEXT:    v_mov_b32_e32 v1, s3
4604; GFX8-NEXT:    v_mov_b32_e32 v2, s4
4605; GFX8-NEXT:    v_mov_b32_e32 v3, s5
4606; GFX8-NEXT:    s_xor_b64 vcc, s[0:1], vcc
4607; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
4608; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
4609; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
4610; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
4611; GFX8-NEXT:    ; return to shader part epilog
4612;
4613; GFX9-LABEL: s_saddsat_i64:
4614; GFX9:       ; %bb.0:
4615; GFX9-NEXT:    s_add_u32 s4, s0, s2
4616; GFX9-NEXT:    v_mov_b32_e32 v0, s0
4617; GFX9-NEXT:    s_addc_u32 s5, s1, s3
4618; GFX9-NEXT:    v_mov_b32_e32 v1, s1
4619; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
4620; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
4621; GFX9-NEXT:    s_ashr_i32 s2, s5, 31
4622; GFX9-NEXT:    s_add_i32 s3, s2, 0x80000000
4623; GFX9-NEXT:    v_mov_b32_e32 v0, s2
4624; GFX9-NEXT:    v_mov_b32_e32 v1, s3
4625; GFX9-NEXT:    v_mov_b32_e32 v2, s4
4626; GFX9-NEXT:    v_mov_b32_e32 v3, s5
4627; GFX9-NEXT:    s_xor_b64 vcc, s[0:1], vcc
4628; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
4629; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
4630; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
4631; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
4632; GFX9-NEXT:    ; return to shader part epilog
4633;
4634; GFX10-LABEL: s_saddsat_i64:
4635; GFX10:       ; %bb.0:
4636; GFX10-NEXT:    s_add_u32 s4, s0, s2
4637; GFX10-NEXT:    s_addc_u32 s5, s1, s3
4638; GFX10-NEXT:    v_mov_b32_e32 v0, s4
4639; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, s[4:5], s[0:1]
4640; GFX10-NEXT:    v_cmp_lt_i64_e64 s1, s[2:3], 0
4641; GFX10-NEXT:    v_mov_b32_e32 v1, s5
4642; GFX10-NEXT:    s_ashr_i32 s2, s5, 31
4643; GFX10-NEXT:    s_add_i32 s3, s2, 0x80000000
4644; GFX10-NEXT:    s_xor_b32 s0, s1, s0
4645; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s2, s0
4646; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s3, s0
4647; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
4648; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
4649; GFX10-NEXT:    ; return to shader part epilog
4650;
4651; GFX11-LABEL: s_saddsat_i64:
4652; GFX11:       ; %bb.0:
4653; GFX11-NEXT:    s_add_u32 s4, s0, s2
4654; GFX11-NEXT:    s_addc_u32 s5, s1, s3
4655; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
4656; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, s[4:5], s[0:1]
4657; GFX11-NEXT:    v_cmp_lt_i64_e64 s1, s[2:3], 0
4658; GFX11-NEXT:    s_ashr_i32 s2, s5, 31
4659; GFX11-NEXT:    s_add_i32 s3, s2, 0x80000000
4660; GFX11-NEXT:    s_xor_b32 s0, s1, s0
4661; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, s2, s0
4662; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s3, s0
4663; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
4664; GFX11-NEXT:    v_readfirstlane_b32 s1, v1
4665; GFX11-NEXT:    ; return to shader part epilog
4666  %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs)
4667  ret i64 %result
4668}
4669
; Signed saturating add on i64 where %lhs is marked inreg and %rhs is not.
; The i64 result is returned bitcast to <2 x float>.
; In the expected output %lhs is read from s[0:1] and %rhs from v[0:1]; the
; add, both signed compares and the select are all vector instructions.
; The GFX6, GFX8 and GFX9 checks copy s1 into v3 before the carry add, while
; the GFX10 and GFX11 checks use s1 directly as an operand of
; v_add_co_ci_u32.
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them instead of editing by hand.
4670define amdgpu_ps <2 x float> @saddsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
4671; GFX6-LABEL: saddsat_i64_sv:
4672; GFX6:       ; %bb.0:
4673; GFX6-NEXT:    v_mov_b32_e32 v3, s1
4674; GFX6-NEXT:    v_add_i32_e32 v2, vcc, s0, v0
4675; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v3, v1, vcc
4676; GFX6-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3]
4677; GFX6-NEXT:    v_cmp_gt_i64_e64 s[2:3], 0, v[0:1]
4678; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
4679; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 0x80000000, v0
4680; GFX6-NEXT:    s_xor_b64 vcc, s[2:3], s[0:1]
4681; GFX6-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
4682; GFX6-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
4683; GFX6-NEXT:    ; return to shader part epilog
4684;
4685; GFX8-LABEL: saddsat_i64_sv:
4686; GFX8:       ; %bb.0:
4687; GFX8-NEXT:    v_mov_b32_e32 v3, s1
4688; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v0
4689; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v3, v1, vcc
4690; GFX8-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3]
4691; GFX8-NEXT:    v_cmp_gt_i64_e64 s[2:3], 0, v[0:1]
4692; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
4693; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x80000000, v0
4694; GFX8-NEXT:    s_xor_b64 vcc, s[2:3], s[0:1]
4695; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
4696; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
4697; GFX8-NEXT:    ; return to shader part epilog
4698;
4699; GFX9-LABEL: saddsat_i64_sv:
4700; GFX9:       ; %bb.0:
4701; GFX9-NEXT:    v_mov_b32_e32 v3, s1
4702; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v0
4703; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v1, vcc
4704; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3]
4705; GFX9-NEXT:    v_cmp_gt_i64_e64 s[0:1], 0, v[0:1]
4706; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
4707; GFX9-NEXT:    v_add_u32_e32 v1, 0x80000000, v0
4708; GFX9-NEXT:    s_xor_b64 vcc, s[0:1], vcc
4709; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
4710; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
4711; GFX9-NEXT:    ; return to shader part epilog
4712;
4713; GFX10-LABEL: saddsat_i64_sv:
4714; GFX10:       ; %bb.0:
4715; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, s0, v0
4716; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
4717; GFX10-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
4718; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
4719; GFX10-NEXT:    v_cmp_gt_i64_e64 s0, 0, v[0:1]
4720; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v4
4721; GFX10-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
4722; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc_lo
4723; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
4724; GFX10-NEXT:    ; return to shader part epilog
4725;
4726; GFX11-LABEL: saddsat_i64_sv:
4727; GFX11:       ; %bb.0:
4728; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, s0, v0
4729; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
4730; GFX11-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
4731; GFX11-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
4732; GFX11-NEXT:    v_cmp_gt_i64_e64 s0, 0, v[0:1]
4733; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v4
4734; GFX11-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
4735; GFX11-NEXT:    v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1
4736; GFX11-NEXT:    ; return to shader part epilog
4737  %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs)
4738  %cast = bitcast i64 %result to <2 x float>
4739  ret <2 x float> %cast
4740}
4741
; Signed saturating add on i64 where %rhs is marked inreg and %lhs is not
; (the operand roles are swapped relative to the _sv variant's signature).
; The i64 result is returned bitcast to <2 x float>.
; In the expected output %lhs is read from v[0:1] and %rhs from s[0:1]; the
; "rhs < 0" compare is done on the SGPR pair and the "sum < lhs" compare on
; VGPR pairs, and their XOR drives the v_cndmask_b32 select.
; The GFX6, GFX8 and GFX9 checks copy s1 into v3 before the carry add, while
; the GFX10 and GFX11 checks use s1 directly as an operand of
; v_add_co_ci_u32.
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them instead of editing by hand.
4742define amdgpu_ps <2 x float> @saddsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
4743; GFX6-LABEL: saddsat_i64_vs:
4744; GFX6:       ; %bb.0:
4745; GFX6-NEXT:    v_mov_b32_e32 v3, s1
4746; GFX6-NEXT:    v_add_i32_e32 v2, vcc, s0, v0
4747; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v1, v3, vcc
4748; GFX6-NEXT:    v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1]
4749; GFX6-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[0:1], 0
4750; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
4751; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 0x80000000, v0
4752; GFX6-NEXT:    s_xor_b64 vcc, s[0:1], s[2:3]
4753; GFX6-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
4754; GFX6-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
4755; GFX6-NEXT:    ; return to shader part epilog
4756;
4757; GFX8-LABEL: saddsat_i64_vs:
4758; GFX8:       ; %bb.0:
4759; GFX8-NEXT:    v_mov_b32_e32 v3, s1
4760; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v0
4761; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v1, v3, vcc
4762; GFX8-NEXT:    v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1]
4763; GFX8-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[0:1], 0
4764; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
4765; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x80000000, v0
4766; GFX8-NEXT:    s_xor_b64 vcc, s[0:1], s[2:3]
4767; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
4768; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
4769; GFX8-NEXT:    ; return to shader part epilog
4770;
4771; GFX9-LABEL: saddsat_i64_vs:
4772; GFX9:       ; %bb.0:
4773; GFX9-NEXT:    v_mov_b32_e32 v3, s1
4774; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v0
4775; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
4776; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1]
4777; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[0:1], 0
4778; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
4779; GFX9-NEXT:    v_add_u32_e32 v1, 0x80000000, v0
4780; GFX9-NEXT:    s_xor_b64 vcc, s[0:1], vcc
4781; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
4782; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
4783; GFX9-NEXT:    ; return to shader part epilog
4784;
4785; GFX10-LABEL: saddsat_i64_vs:
4786; GFX10:       ; %bb.0:
4787; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v0, s0
4788; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
4789; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, s[0:1], 0
4790; GFX10-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
4791; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
4792; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v4
4793; GFX10-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
4794; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc_lo
4795; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
4796; GFX10-NEXT:    ; return to shader part epilog
4797;
4798; GFX11-LABEL: saddsat_i64_vs:
4799; GFX11:       ; %bb.0:
4800; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v0, s0
4801; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
4802; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, s[0:1], 0
4803; GFX11-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
4804; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
4805; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v4
4806; GFX11-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
4807; GFX11-NEXT:    v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1
4808; GFX11-NEXT:    ; return to shader part epilog
4809  %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs)
4810  %cast = bitcast i64 %result to <2 x float>
4811  ret <2 x float> %cast
4812}
4813
; Signed saturating add on <2 x i64> with both operands as plain (non-inreg)
; arguments and no explicit calling convention; returns the vector directly.
; In the expected output %lhs is read from v[0:3] and %rhs from v[4:7], and
; each 64-bit element gets its own add-with-carry, compare pair and select.
; The GFX6, GFX8 and GFX9 checks finish element 0 before starting element 1;
; the GFX10 and GFX11 checks issue both adds first and interleave the rest.
; In the GFX6 and GFX8 checks element 0 takes its additive constant from a
; register set up with v_bfrev_b32 (bit-reverse of 1), while element 1 uses
; the 0x80000000 literal directly.
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them instead of editing by hand.
4814define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
4815; GFX6-LABEL: v_saddsat_v2i64:
4816; GFX6:       ; %bb.0:
4817; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4818; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v0, v4
4819; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, v1, v5, vcc
4820; GFX6-NEXT:    v_cmp_lt_i64_e64 s[4:5], v[8:9], v[0:1]
4821; GFX6-NEXT:    v_cmp_gt_i64_e64 s[6:7], 0, v[4:5]
4822; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v9
4823; GFX6-NEXT:    v_bfrev_b32_e32 v1, 1
4824; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v0, v1
4825; GFX6-NEXT:    s_xor_b64 vcc, s[6:7], s[4:5]
4826; GFX6-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
4827; GFX6-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
4828; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v2, v6
4829; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v3, v7, vcc
4830; GFX6-NEXT:    v_cmp_lt_i64_e64 s[4:5], v[4:5], v[2:3]
4831; GFX6-NEXT:    v_cmp_gt_i64_e64 s[6:7], 0, v[6:7]
4832; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 31, v5
4833; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 0x80000000, v2
4834; GFX6-NEXT:    s_xor_b64 vcc, s[6:7], s[4:5]
4835; GFX6-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
4836; GFX6-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
4837; GFX6-NEXT:    s_setpc_b64 s[30:31]
4838;
4839; GFX8-LABEL: v_saddsat_v2i64:
4840; GFX8:       ; %bb.0:
4841; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4842; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v0, v4
4843; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, v1, v5, vcc
4844; GFX8-NEXT:    v_cmp_lt_i64_e64 s[4:5], v[8:9], v[0:1]
4845; GFX8-NEXT:    v_cmp_gt_i64_e64 s[6:7], 0, v[4:5]
4846; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v9
4847; GFX8-NEXT:    v_bfrev_b32_e32 v1, 1
4848; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v0, v1
4849; GFX8-NEXT:    s_xor_b64 vcc, s[6:7], s[4:5]
4850; GFX8-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
4851; GFX8-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
4852; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v2, v6
4853; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v3, v7, vcc
4854; GFX8-NEXT:    v_cmp_lt_i64_e64 s[4:5], v[4:5], v[2:3]
4855; GFX8-NEXT:    v_cmp_gt_i64_e64 s[6:7], 0, v[6:7]
4856; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 31, v5
4857; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x80000000, v2
4858; GFX8-NEXT:    s_xor_b64 vcc, s[6:7], s[4:5]
4859; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
4860; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
4861; GFX8-NEXT:    s_setpc_b64 s[30:31]
4862;
4863; GFX9-LABEL: v_saddsat_v2i64:
4864; GFX9:       ; %bb.0:
4865; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4866; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v0, v4
4867; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, v1, v5, vcc
4868; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[8:9], v[0:1]
4869; GFX9-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[4:5]
4870; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v9
4871; GFX9-NEXT:    v_add_u32_e32 v1, 0x80000000, v0
4872; GFX9-NEXT:    s_xor_b64 vcc, s[4:5], vcc
4873; GFX9-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
4874; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
4875; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v2, v6
4876; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v3, v7, vcc
4877; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[2:3]
4878; GFX9-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[6:7]
4879; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v5
4880; GFX9-NEXT:    v_add_u32_e32 v3, 0x80000000, v2
4881; GFX9-NEXT:    s_xor_b64 vcc, s[4:5], vcc
4882; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
4883; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
4884; GFX9-NEXT:    s_setpc_b64 s[30:31]
4885;
4886; GFX10-LABEL: v_saddsat_v2i64:
4887; GFX10:       ; %bb.0:
4888; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4889; GFX10-NEXT:    v_add_co_u32 v8, vcc_lo, v0, v4
4890; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v1, v5, vcc_lo
4891; GFX10-NEXT:    v_add_co_u32 v10, vcc_lo, v2, v6
4892; GFX10-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, v3, v7, vcc_lo
4893; GFX10-NEXT:    v_ashrrev_i32_e32 v12, 31, v9
4894; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1]
4895; GFX10-NEXT:    v_cmp_gt_i64_e64 s4, 0, v[4:5]
4896; GFX10-NEXT:    v_ashrrev_i32_e32 v4, 31, v11
4897; GFX10-NEXT:    v_cmp_lt_i64_e64 s5, v[10:11], v[2:3]
4898; GFX10-NEXT:    v_cmp_gt_i64_e64 s6, 0, v[6:7]
4899; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v12
4900; GFX10-NEXT:    v_add_nc_u32_e32 v3, 0x80000000, v4
4901; GFX10-NEXT:    s_xor_b32 vcc_lo, s4, vcc_lo
4902; GFX10-NEXT:    v_cndmask_b32_e32 v0, v8, v12, vcc_lo
4903; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc_lo
4904; GFX10-NEXT:    s_xor_b32 vcc_lo, s6, s5
4905; GFX10-NEXT:    v_cndmask_b32_e32 v2, v10, v4, vcc_lo
4906; GFX10-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc_lo
4907; GFX10-NEXT:    s_setpc_b64 s[30:31]
4908;
4909; GFX11-LABEL: v_saddsat_v2i64:
4910; GFX11:       ; %bb.0:
4911; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4912; GFX11-NEXT:    v_add_co_u32 v8, vcc_lo, v0, v4
4913; GFX11-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v1, v5, vcc_lo
4914; GFX11-NEXT:    v_add_co_u32 v10, vcc_lo, v2, v6
4915; GFX11-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, v3, v7, vcc_lo
4916; GFX11-NEXT:    v_ashrrev_i32_e32 v12, 31, v9
4917; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1]
4918; GFX11-NEXT:    v_cmp_gt_i64_e64 s0, 0, v[4:5]
4919; GFX11-NEXT:    v_ashrrev_i32_e32 v4, 31, v11
4920; GFX11-NEXT:    v_cmp_lt_i64_e64 s1, v[10:11], v[2:3]
4921; GFX11-NEXT:    v_cmp_gt_i64_e64 s2, 0, v[6:7]
4922; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v12
4923; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0x80000000, v4
4924; GFX11-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
4925; GFX11-NEXT:    v_dual_cndmask_b32 v0, v8, v12 :: v_dual_cndmask_b32 v1, v9, v1
4926; GFX11-NEXT:    s_xor_b32 vcc_lo, s2, s1
4927; GFX11-NEXT:    v_dual_cndmask_b32 v2, v10, v4 :: v_dual_cndmask_b32 v3, v11, v3
4928; GFX11-NEXT:    s_setpc_b64 s[30:31]
4929  %result = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
4930  ret <2 x i64> %result
4931}
4932
; Signed saturating add of two <2 x i64> vectors through the
; llvm.sadd.sat.v2i64 intrinsic. The function uses the amdgpu_ps calling
; convention and both operands are marked inreg. In the expected output for
; every checked target the operands are read from s0-s7 and the four result
; dwords are copied into s0-s3 with v_readfirstlane_b32.
; The check lines in this function are autogenerated (see the note on the
; first line of the file, which names utils/update_llc_test_checks.py).
4933define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inreg %rhs) {
4934; GFX6-LABEL: s_saddsat_v2i64:
4935; GFX6:       ; %bb.0:
4936; GFX6-NEXT:    s_add_u32 s8, s0, s4
4937; GFX6-NEXT:    v_mov_b32_e32 v0, s0
4938; GFX6-NEXT:    s_addc_u32 s9, s1, s5
4939; GFX6-NEXT:    v_mov_b32_e32 v1, s1
4940; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
4941; GFX6-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[4:5], 0
4942; GFX6-NEXT:    s_ashr_i32 s4, s9, 31
4943; GFX6-NEXT:    s_add_i32 s5, s4, 0x80000000
4944; GFX6-NEXT:    v_mov_b32_e32 v0, s4
4945; GFX6-NEXT:    v_mov_b32_e32 v1, s5
4946; GFX6-NEXT:    v_mov_b32_e32 v2, s8
4947; GFX6-NEXT:    v_mov_b32_e32 v3, s9
4948; GFX6-NEXT:    s_xor_b64 vcc, s[0:1], vcc
4949; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
4950; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
4951; GFX6-NEXT:    s_add_u32 s0, s2, s6
4952; GFX6-NEXT:    v_mov_b32_e32 v0, s2
4953; GFX6-NEXT:    s_addc_u32 s1, s3, s7
4954; GFX6-NEXT:    v_mov_b32_e32 v1, s3
4955; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
4956; GFX6-NEXT:    v_cmp_lt_i64_e64 s[2:3], s[6:7], 0
4957; GFX6-NEXT:    s_ashr_i32 s4, s1, 31
4958; GFX6-NEXT:    s_add_i32 s5, s4, 0x80000000
4959; GFX6-NEXT:    v_mov_b32_e32 v0, s4
4960; GFX6-NEXT:    v_mov_b32_e32 v1, s5
4961; GFX6-NEXT:    v_mov_b32_e32 v4, s0
4962; GFX6-NEXT:    v_mov_b32_e32 v5, s1
4963; GFX6-NEXT:    s_xor_b64 vcc, s[2:3], vcc
4964; GFX6-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
4965; GFX6-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
4966; GFX6-NEXT:    v_readfirstlane_b32 s0, v2
4967; GFX6-NEXT:    v_readfirstlane_b32 s1, v3
4968; GFX6-NEXT:    v_readfirstlane_b32 s2, v0
4969; GFX6-NEXT:    v_readfirstlane_b32 s3, v1
4970; GFX6-NEXT:    ; return to shader part epilog
4971;
4972; GFX8-LABEL: s_saddsat_v2i64:
4973; GFX8:       ; %bb.0:
4974; GFX8-NEXT:    s_add_u32 s8, s0, s4
4975; GFX8-NEXT:    v_mov_b32_e32 v0, s0
4976; GFX8-NEXT:    s_addc_u32 s9, s1, s5
4977; GFX8-NEXT:    v_mov_b32_e32 v1, s1
4978; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
4979; GFX8-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[4:5], 0
4980; GFX8-NEXT:    s_ashr_i32 s4, s9, 31
4981; GFX8-NEXT:    s_add_i32 s5, s4, 0x80000000
4982; GFX8-NEXT:    v_mov_b32_e32 v0, s4
4983; GFX8-NEXT:    v_mov_b32_e32 v1, s5
4984; GFX8-NEXT:    v_mov_b32_e32 v2, s8
4985; GFX8-NEXT:    v_mov_b32_e32 v3, s9
4986; GFX8-NEXT:    s_xor_b64 vcc, s[0:1], vcc
4987; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
4988; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
4989; GFX8-NEXT:    s_add_u32 s0, s2, s6
4990; GFX8-NEXT:    v_mov_b32_e32 v0, s2
4991; GFX8-NEXT:    s_addc_u32 s1, s3, s7
4992; GFX8-NEXT:    v_mov_b32_e32 v1, s3
4993; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
4994; GFX8-NEXT:    v_cmp_lt_i64_e64 s[2:3], s[6:7], 0
4995; GFX8-NEXT:    s_ashr_i32 s4, s1, 31
4996; GFX8-NEXT:    s_add_i32 s5, s4, 0x80000000
4997; GFX8-NEXT:    v_mov_b32_e32 v0, s4
4998; GFX8-NEXT:    v_mov_b32_e32 v1, s5
4999; GFX8-NEXT:    v_mov_b32_e32 v4, s0
5000; GFX8-NEXT:    v_mov_b32_e32 v5, s1
5001; GFX8-NEXT:    s_xor_b64 vcc, s[2:3], vcc
5002; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
5003; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
5004; GFX8-NEXT:    v_readfirstlane_b32 s0, v2
5005; GFX8-NEXT:    v_readfirstlane_b32 s1, v3
5006; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
5007; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
5008; GFX8-NEXT:    ; return to shader part epilog
5009;
5010; GFX9-LABEL: s_saddsat_v2i64:
5011; GFX9:       ; %bb.0:
5012; GFX9-NEXT:    s_add_u32 s8, s0, s4
5013; GFX9-NEXT:    v_mov_b32_e32 v0, s0
5014; GFX9-NEXT:    s_addc_u32 s9, s1, s5
5015; GFX9-NEXT:    v_mov_b32_e32 v1, s1
5016; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
5017; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[4:5], 0
5018; GFX9-NEXT:    s_ashr_i32 s4, s9, 31
5019; GFX9-NEXT:    s_add_i32 s5, s4, 0x80000000
5020; GFX9-NEXT:    v_mov_b32_e32 v0, s4
5021; GFX9-NEXT:    v_mov_b32_e32 v1, s5
5022; GFX9-NEXT:    v_mov_b32_e32 v2, s8
5023; GFX9-NEXT:    v_mov_b32_e32 v3, s9
5024; GFX9-NEXT:    s_xor_b64 vcc, s[0:1], vcc
5025; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
5026; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
5027; GFX9-NEXT:    s_add_u32 s0, s2, s6
5028; GFX9-NEXT:    v_mov_b32_e32 v0, s2
5029; GFX9-NEXT:    s_addc_u32 s1, s3, s7
5030; GFX9-NEXT:    v_mov_b32_e32 v1, s3
5031; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
5032; GFX9-NEXT:    v_cmp_lt_i64_e64 s[2:3], s[6:7], 0
5033; GFX9-NEXT:    s_ashr_i32 s4, s1, 31
5034; GFX9-NEXT:    s_add_i32 s5, s4, 0x80000000
5035; GFX9-NEXT:    v_mov_b32_e32 v0, s4
5036; GFX9-NEXT:    v_mov_b32_e32 v1, s5
5037; GFX9-NEXT:    v_mov_b32_e32 v4, s0
5038; GFX9-NEXT:    v_mov_b32_e32 v5, s1
5039; GFX9-NEXT:    s_xor_b64 vcc, s[2:3], vcc
5040; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
5041; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
5042; GFX9-NEXT:    v_readfirstlane_b32 s0, v2
5043; GFX9-NEXT:    v_readfirstlane_b32 s1, v3
5044; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
5045; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
5046; GFX9-NEXT:    ; return to shader part epilog
5047;
5048; GFX10-LABEL: s_saddsat_v2i64:
5049; GFX10:       ; %bb.0:
5050; GFX10-NEXT:    s_add_u32 s8, s0, s4
5051; GFX10-NEXT:    s_addc_u32 s9, s1, s5
5052; GFX10-NEXT:    v_mov_b32_e32 v0, s8
5053; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, s[8:9], s[0:1]
5054; GFX10-NEXT:    v_cmp_lt_i64_e64 s1, s[4:5], 0
5055; GFX10-NEXT:    s_ashr_i32 s4, s9, 31
5056; GFX10-NEXT:    v_mov_b32_e32 v1, s9
5057; GFX10-NEXT:    s_add_i32 s5, s4, 0x80000000
5058; GFX10-NEXT:    s_xor_b32 s8, s1, s0
5059; GFX10-NEXT:    s_add_u32 s0, s2, s6
5060; GFX10-NEXT:    s_addc_u32 s1, s3, s7
5061; GFX10-NEXT:    v_mov_b32_e32 v2, s0
5062; GFX10-NEXT:    v_cmp_lt_i64_e64 s2, s[0:1], s[2:3]
5063; GFX10-NEXT:    v_cmp_lt_i64_e64 s3, s[6:7], 0
5064; GFX10-NEXT:    v_mov_b32_e32 v3, s1
5065; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s4, s8
5066; GFX10-NEXT:    s_ashr_i32 s4, s1, 31
5067; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s5, s8
5068; GFX10-NEXT:    s_add_i32 s0, s4, 0x80000000
5069; GFX10-NEXT:    s_xor_b32 s1, s3, s2
5070; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s4, s1
5071; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, s0, s1
5072; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
5073; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
5074; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
5075; GFX10-NEXT:    v_readfirstlane_b32 s3, v3
5076; GFX10-NEXT:    ; return to shader part epilog
5077;
5078; GFX11-LABEL: s_saddsat_v2i64:
5079; GFX11:       ; %bb.0:
5080; GFX11-NEXT:    s_add_u32 s8, s0, s4
5081; GFX11-NEXT:    s_addc_u32 s9, s1, s5
5082; GFX11-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
5083; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, s[8:9], s[0:1]
5084; GFX11-NEXT:    v_cmp_lt_i64_e64 s1, s[4:5], 0
5085; GFX11-NEXT:    s_ashr_i32 s4, s9, 31
5086; GFX11-NEXT:    s_add_i32 s5, s4, 0x80000000
5087; GFX11-NEXT:    s_xor_b32 s8, s1, s0
5088; GFX11-NEXT:    s_add_u32 s0, s2, s6
5089; GFX11-NEXT:    s_addc_u32 s1, s3, s7
5090; GFX11-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
5091; GFX11-NEXT:    v_cmp_lt_i64_e64 s2, s[0:1], s[2:3]
5092; GFX11-NEXT:    v_cmp_lt_i64_e64 s3, s[6:7], 0
5093; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, s4, s8
5094; GFX11-NEXT:    s_ashr_i32 s4, s1, 31
5095; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s5, s8
5096; GFX11-NEXT:    s_add_i32 s0, s4, 0x80000000
5097; GFX11-NEXT:    s_xor_b32 s1, s3, s2
5098; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, s4, s1
5099; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, s0, s1
5100; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
5101; GFX11-NEXT:    v_readfirstlane_b32 s1, v1
5102; GFX11-NEXT:    v_readfirstlane_b32 s2, v2
5103; GFX11-NEXT:    v_readfirstlane_b32 s3, v3
5104; GFX11-NEXT:    ; return to shader part epilog
; Intrinsic call under test; its result is returned unchanged.
5105  %result = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
5106  ret <2 x i64> %result
5107}
5108
; Signed saturating add of two i128 values through the llvm.sadd.sat.i128
; intrinsic. The function uses the amdgpu_ps calling convention and both
; operands are marked inreg. In the expected output for every checked target
; the operands are read from s0-s7 and the four result dwords are copied
; into s0-s3 with v_readfirstlane_b32.
; The check lines in this function are autogenerated (see the note on the
; first line of the file, which names utils/update_llc_test_checks.py).
5109define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
5110; GFX6-LABEL: s_saddsat_i128:
5111; GFX6:       ; %bb.0:
5112; GFX6-NEXT:    s_add_u32 s4, s0, s4
5113; GFX6-NEXT:    v_mov_b32_e32 v0, s0
5114; GFX6-NEXT:    s_addc_u32 s5, s1, s5
5115; GFX6-NEXT:    v_mov_b32_e32 v1, s1
5116; GFX6-NEXT:    s_addc_u32 s8, s2, s6
5117; GFX6-NEXT:    v_mov_b32_e32 v2, s2
5118; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
5119; GFX6-NEXT:    s_addc_u32 s9, s3, s7
5120; GFX6-NEXT:    v_mov_b32_e32 v3, s3
5121; GFX6-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
5122; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[2:3]
5123; GFX6-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[6:7], 0
5124; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
5125; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[2:3]
5126; GFX6-NEXT:    v_mov_b32_e32 v2, s4
5127; GFX6-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
5128; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
5129; GFX6-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], 0
5130; GFX6-NEXT:    v_mov_b32_e32 v3, s5
5131; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[0:1]
5132; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
5133; GFX6-NEXT:    s_ashr_i32 s0, s9, 31
5134; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
5135; GFX6-NEXT:    s_add_i32 s1, s0, 0x80000000
5136; GFX6-NEXT:    v_mov_b32_e32 v1, s0
5137; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
5138; GFX6-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
5139; GFX6-NEXT:    v_cndmask_b32_e32 v2, v3, v1, vcc
5140; GFX6-NEXT:    v_mov_b32_e32 v3, s1
5141; GFX6-NEXT:    v_mov_b32_e32 v4, s8
5142; GFX6-NEXT:    v_mov_b32_e32 v5, s9
5143; GFX6-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
5144; GFX6-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
5145; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
5146; GFX6-NEXT:    v_readfirstlane_b32 s1, v2
5147; GFX6-NEXT:    v_readfirstlane_b32 s2, v1
5148; GFX6-NEXT:    v_readfirstlane_b32 s3, v3
5149; GFX6-NEXT:    ; return to shader part epilog
5150;
5151; GFX8-LABEL: s_saddsat_i128:
5152; GFX8:       ; %bb.0:
5153; GFX8-NEXT:    s_add_u32 s4, s0, s4
5154; GFX8-NEXT:    s_addc_u32 s5, s1, s5
5155; GFX8-NEXT:    v_mov_b32_e32 v0, s0
5156; GFX8-NEXT:    s_addc_u32 s8, s2, s6
5157; GFX8-NEXT:    v_mov_b32_e32 v1, s1
5158; GFX8-NEXT:    s_addc_u32 s9, s3, s7
5159; GFX8-NEXT:    v_mov_b32_e32 v2, s2
5160; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
5161; GFX8-NEXT:    v_mov_b32_e32 v3, s3
5162; GFX8-NEXT:    s_cmp_eq_u64 s[8:9], s[2:3]
5163; GFX8-NEXT:    s_cselect_b32 s0, 1, 0
5164; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
5165; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[2:3]
5166; GFX8-NEXT:    s_and_b32 s0, 1, s0
5167; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
5168; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
5169; GFX8-NEXT:    s_cmp_eq_u64 s[6:7], 0
5170; GFX8-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[6:7], 0
5171; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
5172; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
5173; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
5174; GFX8-NEXT:    s_and_b32 s0, 1, s2
5175; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
5176; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[0:1]
5177; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
5178; GFX8-NEXT:    s_ashr_i32 s0, s9, 31
5179; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
5180; GFX8-NEXT:    s_add_i32 s1, s0, 0x80000000
5181; GFX8-NEXT:    v_mov_b32_e32 v1, s0
5182; GFX8-NEXT:    v_mov_b32_e32 v2, s4
5183; GFX8-NEXT:    v_mov_b32_e32 v3, s5
5184; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
5185; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
5186; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v1, vcc
5187; GFX8-NEXT:    v_mov_b32_e32 v3, s1
5188; GFX8-NEXT:    v_mov_b32_e32 v4, s8
5189; GFX8-NEXT:    v_mov_b32_e32 v5, s9
5190; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
5191; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
5192; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
5193; GFX8-NEXT:    v_readfirstlane_b32 s1, v2
5194; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
5195; GFX8-NEXT:    v_readfirstlane_b32 s3, v3
5196; GFX8-NEXT:    ; return to shader part epilog
5197;
5198; GFX9-LABEL: s_saddsat_i128:
5199; GFX9:       ; %bb.0:
5200; GFX9-NEXT:    s_add_u32 s4, s0, s4
5201; GFX9-NEXT:    s_addc_u32 s5, s1, s5
5202; GFX9-NEXT:    v_mov_b32_e32 v0, s0
5203; GFX9-NEXT:    s_addc_u32 s8, s2, s6
5204; GFX9-NEXT:    v_mov_b32_e32 v1, s1
5205; GFX9-NEXT:    s_addc_u32 s9, s3, s7
5206; GFX9-NEXT:    v_mov_b32_e32 v2, s2
5207; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
5208; GFX9-NEXT:    v_mov_b32_e32 v3, s3
5209; GFX9-NEXT:    s_cmp_eq_u64 s[8:9], s[2:3]
5210; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
5211; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
5212; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[2:3]
5213; GFX9-NEXT:    s_and_b32 s0, 1, s0
5214; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
5215; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
5216; GFX9-NEXT:    s_cmp_eq_u64 s[6:7], 0
5217; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[6:7], 0
5218; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
5219; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
5220; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
5221; GFX9-NEXT:    s_and_b32 s0, 1, s2
5222; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
5223; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[0:1]
5224; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
5225; GFX9-NEXT:    s_ashr_i32 s0, s9, 31
5226; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
5227; GFX9-NEXT:    s_add_i32 s1, s0, 0x80000000
5228; GFX9-NEXT:    v_mov_b32_e32 v1, s0
5229; GFX9-NEXT:    v_mov_b32_e32 v2, s4
5230; GFX9-NEXT:    v_mov_b32_e32 v3, s5
5231; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
5232; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
5233; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v1, vcc
5234; GFX9-NEXT:    v_mov_b32_e32 v3, s1
5235; GFX9-NEXT:    v_mov_b32_e32 v4, s8
5236; GFX9-NEXT:    v_mov_b32_e32 v5, s9
5237; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
5238; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
5239; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
5240; GFX9-NEXT:    v_readfirstlane_b32 s1, v2
5241; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
5242; GFX9-NEXT:    v_readfirstlane_b32 s3, v3
5243; GFX9-NEXT:    ; return to shader part epilog
5244;
5245; GFX10-LABEL: s_saddsat_i128:
5246; GFX10:       ; %bb.0:
5247; GFX10-NEXT:    s_add_u32 s4, s0, s4
5248; GFX10-NEXT:    s_addc_u32 s5, s1, s5
5249; GFX10-NEXT:    s_addc_u32 s8, s2, s6
5250; GFX10-NEXT:    v_cmp_lt_u64_e64 s0, s[4:5], s[0:1]
5251; GFX10-NEXT:    s_addc_u32 s9, s3, s7
5252; GFX10-NEXT:    s_cmp_eq_u64 s[8:9], s[2:3]
5253; GFX10-NEXT:    v_mov_b32_e32 v3, s9
5254; GFX10-NEXT:    s_cselect_b32 s10, 1, 0
5255; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
5256; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, s[8:9], s[2:3]
5257; GFX10-NEXT:    v_cmp_lt_i64_e64 s2, s[6:7], 0
5258; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
5259; GFX10-NEXT:    s_and_b32 s0, 1, s10
5260; GFX10-NEXT:    s_cmp_eq_u64 s[6:7], 0
5261; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s2
5262; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
5263; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
5264; GFX10-NEXT:    s_and_b32 s1, 1, s1
5265; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, s1
5266; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
5267; GFX10-NEXT:    v_cndmask_b32_e64 v1, v2, 0, s0
5268; GFX10-NEXT:    v_mov_b32_e32 v2, s5
5269; GFX10-NEXT:    s_ashr_i32 s0, s9, 31
5270; GFX10-NEXT:    s_add_i32 s1, s0, 0x80000000
5271; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
5272; GFX10-NEXT:    v_mov_b32_e32 v1, s4
5273; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
5274; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
5275; GFX10-NEXT:    v_mov_b32_e32 v0, s8
5276; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s0, vcc_lo
5277; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s0, vcc_lo
5278; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s0, vcc_lo
5279; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, s1, vcc_lo
5280; GFX10-NEXT:    v_readfirstlane_b32 s0, v1
5281; GFX10-NEXT:    v_readfirstlane_b32 s1, v2
5282; GFX10-NEXT:    v_readfirstlane_b32 s2, v0
5283; GFX10-NEXT:    v_readfirstlane_b32 s3, v3
5284; GFX10-NEXT:    ; return to shader part epilog
5285;
5286; GFX11-LABEL: s_saddsat_i128:
5287; GFX11:       ; %bb.0:
5288; GFX11-NEXT:    s_add_u32 s4, s0, s4
5289; GFX11-NEXT:    s_addc_u32 s5, s1, s5
5290; GFX11-NEXT:    s_addc_u32 s8, s2, s6
5291; GFX11-NEXT:    v_cmp_lt_u64_e64 s0, s[4:5], s[0:1]
5292; GFX11-NEXT:    s_addc_u32 s9, s3, s7
5293; GFX11-NEXT:    s_cmp_eq_u64 s[8:9], s[2:3]
5294; GFX11-NEXT:    v_mov_b32_e32 v3, s9
5295; GFX11-NEXT:    s_cselect_b32 s10, 1, 0
5296; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
5297; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, s[8:9], s[2:3]
5298; GFX11-NEXT:    v_cmp_lt_i64_e64 s2, s[6:7], 0
5299; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
5300; GFX11-NEXT:    s_and_b32 s0, 1, s10
5301; GFX11-NEXT:    s_cmp_eq_u64 s[6:7], 0
5302; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s2
5303; GFX11-NEXT:    s_cselect_b32 s1, 1, 0
5304; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
5305; GFX11-NEXT:    s_and_b32 s1, 1, s1
5306; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, 0, s1
5307; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
5308; GFX11-NEXT:    v_cndmask_b32_e64 v1, v2, 0, s0
5309; GFX11-NEXT:    v_mov_b32_e32 v2, s5
5310; GFX11-NEXT:    s_ashr_i32 s0, s9, 31
5311; GFX11-NEXT:    s_add_i32 s1, s0, 0x80000000
5312; GFX11-NEXT:    v_xor_b32_e32 v0, v1, v0
5313; GFX11-NEXT:    v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 1, v0
5314; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
5315; GFX11-NEXT:    v_mov_b32_e32 v0, s8
5316; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s0, vcc_lo
5317; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, s0, vcc_lo
5318; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, s0, vcc_lo
5319; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, s1, vcc_lo
5320; GFX11-NEXT:    v_readfirstlane_b32 s0, v1
5321; GFX11-NEXT:    v_readfirstlane_b32 s1, v2
5322; GFX11-NEXT:    v_readfirstlane_b32 s2, v0
5323; GFX11-NEXT:    v_readfirstlane_b32 s3, v3
5324; GFX11-NEXT:    ; return to shader part epilog
; Intrinsic call under test; its result is returned unchanged.
5325  %result = call i128 @llvm.sadd.sat.i128(i128 %lhs, i128 %rhs)
5326  ret i128 %result
5327}
5328
; Signed saturating add of two i128 values through the llvm.sadd.sat.i128
; intrinsic, with mixed operand placement: %lhs is marked inreg and %rhs is
; not. In the expected output for every checked target %lhs is read from
; s0-s3, %rhs from v0-v3, and the result is left in v0-v3 (there is no
; v_readfirstlane_b32 copy here). The function uses the amdgpu_ps calling
; convention and returns the result bitcast to <4 x float>.
; The check lines in this function are autogenerated (see the note on the
; first line of the file, which names utils/update_llc_test_checks.py).
5329define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
5330; GFX6-LABEL: saddsat_i128_sv:
5331; GFX6:       ; %bb.0:
5332; GFX6-NEXT:    v_mov_b32_e32 v4, s1
5333; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
5334; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v4, v1, vcc
5335; GFX6-NEXT:    v_mov_b32_e32 v4, s2
5336; GFX6-NEXT:    v_mov_b32_e32 v5, s3
5337; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v4, v2, vcc
5338; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v5, v3, vcc
5339; GFX6-NEXT:    v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
5340; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
5341; GFX6-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[4:5]
5342; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
5343; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[2:3], v[4:5]
5344; GFX6-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
5345; GFX6-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[2:3]
5346; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
5347; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
5348; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 31, v5
5349; GFX6-NEXT:    v_cndmask_b32_e64 v2, v7, 0, vcc
5350; GFX6-NEXT:    v_xor_b32_e32 v2, v2, v6
5351; GFX6-NEXT:    v_bfrev_b32_e32 v6, 1
5352; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v3, v6
5353; GFX6-NEXT:    v_and_b32_e32 v2, 1, v2
5354; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
5355; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
5356; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
5357; GFX6-NEXT:    v_cndmask_b32_e32 v2, v4, v3, vcc
5358; GFX6-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
5359; GFX6-NEXT:    ; return to shader part epilog
5360;
5361; GFX8-LABEL: saddsat_i128_sv:
5362; GFX8:       ; %bb.0:
5363; GFX8-NEXT:    v_mov_b32_e32 v4, s1
5364; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
5365; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v4, v1, vcc
5366; GFX8-NEXT:    v_mov_b32_e32 v4, s2
5367; GFX8-NEXT:    v_mov_b32_e32 v5, s3
5368; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v4, v2, vcc
5369; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v5, v3, vcc
5370; GFX8-NEXT:    v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
5371; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
5372; GFX8-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[4:5]
5373; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
5374; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, s[2:3], v[4:5]
5375; GFX8-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
5376; GFX8-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[2:3]
5377; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
5378; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
5379; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 31, v5
5380; GFX8-NEXT:    v_cndmask_b32_e64 v2, v7, 0, vcc
5381; GFX8-NEXT:    v_xor_b32_e32 v2, v2, v6
5382; GFX8-NEXT:    v_bfrev_b32_e32 v6, 1
5383; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v3, v6
5384; GFX8-NEXT:    v_and_b32_e32 v2, 1, v2
5385; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
5386; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
5387; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
5388; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v3, vcc
5389; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
5390; GFX8-NEXT:    ; return to shader part epilog
5391;
5392; GFX9-LABEL: saddsat_i128_sv:
5393; GFX9:       ; %bb.0:
5394; GFX9-NEXT:    v_mov_b32_e32 v4, s1
5395; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
5396; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
5397; GFX9-NEXT:    v_mov_b32_e32 v4, s2
5398; GFX9-NEXT:    v_mov_b32_e32 v5, s3
5399; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v2, vcc
5400; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v3, vcc
5401; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
5402; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
5403; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[4:5]
5404; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
5405; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, s[2:3], v[4:5]
5406; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
5407; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[2:3]
5408; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
5409; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
5410; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v5
5411; GFX9-NEXT:    v_cndmask_b32_e64 v2, v7, 0, vcc
5412; GFX9-NEXT:    v_xor_b32_e32 v2, v2, v6
5413; GFX9-NEXT:    v_and_b32_e32 v2, 1, v2
5414; GFX9-NEXT:    v_add_u32_e32 v6, 0x80000000, v3
5415; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
5416; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
5417; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
5418; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v3, vcc
5419; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
5420; GFX9-NEXT:    ; return to shader part epilog
5421;
5422; GFX10-LABEL: saddsat_i128_sv:
5423; GFX10:       ; %bb.0:
5424; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, s0, v0
5425; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
5426; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, s2, v2, vcc_lo
5427; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, s3, v3, vcc_lo
5428; GFX10-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1]
5429; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc_lo
5430; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[4:5]
5431; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc_lo
5432; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3]
5433; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc_lo
5434; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[4:5]
5435; GFX10-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc_lo
5436; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3]
5437; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 31, v5
5438; GFX10-NEXT:    v_cndmask_b32_e64 v2, v8, 0, vcc_lo
5439; GFX10-NEXT:    v_xor_b32_e32 v2, v2, v6
5440; GFX10-NEXT:    v_add_nc_u32_e32 v6, 0x80000000, v3
5441; GFX10-NEXT:    v_and_b32_e32 v2, 1, v2
5442; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
5443; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
5444; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
5445; GFX10-NEXT:    v_cndmask_b32_e32 v2, v4, v3, vcc_lo
5446; GFX10-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc_lo
5447; GFX10-NEXT:    ; return to shader part epilog
5448;
5449; GFX11-LABEL: saddsat_i128_sv:
5450; GFX11:       ; %bb.0:
5451; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, s0, v0
5452; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
5453; GFX11-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, s2, v2, vcc_lo
5454; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, s3, v3, vcc_lo
5455; GFX11-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1]
5456; GFX11-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc_lo
5457; GFX11-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[4:5]
5458; GFX11-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc_lo
5459; GFX11-NEXT:    v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3]
5460; GFX11-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc_lo
5461; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[4:5]
5462; GFX11-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc_lo
5463; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3]
5464; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 31, v5
5465; GFX11-NEXT:    v_cndmask_b32_e64 v2, v8, 0, vcc_lo
5466; GFX11-NEXT:    v_xor_b32_e32 v2, v2, v6
5467; GFX11-NEXT:    v_add_nc_u32_e32 v6, 0x80000000, v3
5468; GFX11-NEXT:    v_and_b32_e32 v2, 1, v2
5469; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
5470; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
5471; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
5472; GFX11-NEXT:    v_dual_cndmask_b32 v2, v4, v3 :: v_dual_cndmask_b32 v3, v5, v6
5473; GFX11-NEXT:    ; return to shader part epilog
; Intrinsic call under test, followed by a bitcast of the 128-bit result to
; the declared <4 x float> return type.
5474  %result = call i128 @llvm.sadd.sat.i128(i128 %lhs, i128 %rhs)
5475  %cast = bitcast i128 %result to <4 x float>
5476  ret <4 x float> %cast
5477}
5478
; Signed saturating add of two i128 values through the llvm.sadd.sat.i128
; intrinsic, with mixed operand placement: %rhs is marked inreg and %lhs is
; not (the mirror image of saddsat_i128_sv). In the expected output for every
; checked target %lhs is read from v0-v3, %rhs from s0-s3, and the result is
; left in v0-v3 (there is no v_readfirstlane_b32 copy here). The function
; uses the amdgpu_ps calling convention and returns the result bitcast to
; <4 x float>.
; The check lines in this function are autogenerated (see the note on the
; first line of the file, which names utils/update_llc_test_checks.py).
5479define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
5480; GFX6-LABEL: saddsat_i128_vs:
5481; GFX6:       ; %bb.0:
5482; GFX6-NEXT:    v_mov_b32_e32 v5, s1
5483; GFX6-NEXT:    v_add_i32_e32 v4, vcc, s0, v0
5484; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v1, v5, vcc
5485; GFX6-NEXT:    v_mov_b32_e32 v6, s2
5486; GFX6-NEXT:    v_mov_b32_e32 v7, s3
5487; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, v2, v6, vcc
5488; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v3, v7, vcc
5489; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1]
5490; GFX6-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
5491; GFX6-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
5492; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, v[6:7], v[2:3]
5493; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
5494; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
5495; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 31, v7
5496; GFX6-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
5497; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
5498; GFX6-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[2:3], 0
5499; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[0:1]
5500; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
5501; GFX6-NEXT:    v_bfrev_b32_e32 v1, 1
5502; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v2, v1
5503; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
5504; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
5505; GFX6-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
5506; GFX6-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
5507; GFX6-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
5508; GFX6-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
5509; GFX6-NEXT:    ; return to shader part epilog
5510;
5511; GFX8-LABEL: saddsat_i128_vs:
5512; GFX8:       ; %bb.0:
5513; GFX8-NEXT:    v_mov_b32_e32 v5, s1
5514; GFX8-NEXT:    v_add_u32_e32 v4, vcc, s0, v0
5515; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v1, v5, vcc
5516; GFX8-NEXT:    v_mov_b32_e32 v6, s2
5517; GFX8-NEXT:    v_mov_b32_e32 v7, s3
5518; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, v2, v6, vcc
5519; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, v3, v7, vcc
5520; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1]
5521; GFX8-NEXT:    s_cmp_eq_u64 s[2:3], 0
5522; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
5523; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, v[6:7], v[2:3]
5524; GFX8-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
5525; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
5526; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
5527; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
5528; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
5529; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
5530; GFX8-NEXT:    s_and_b32 s0, 1, s4
5531; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
5532; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[0:1]
5533; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
5534; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 31, v7
5535; GFX8-NEXT:    v_bfrev_b32_e32 v1, 1
5536; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v2, v1
5537; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
5538; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
5539; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
5540; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
5541; GFX8-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
5542; GFX8-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
5543; GFX8-NEXT:    ; return to shader part epilog
5544;
5545; GFX9-LABEL: saddsat_i128_vs:
5546; GFX9:       ; %bb.0:
5547; GFX9-NEXT:    v_mov_b32_e32 v5, s1
5548; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, s0, v0
5549; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v1, v5, vcc
5550; GFX9-NEXT:    v_mov_b32_e32 v6, s2
5551; GFX9-NEXT:    v_mov_b32_e32 v7, s3
5552; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v2, v6, vcc
5553; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v3, v7, vcc
5554; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1]
5555; GFX9-NEXT:    s_cmp_eq_u64 s[2:3], 0
5556; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
5557; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[6:7], v[2:3]
5558; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
5559; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
5560; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
5561; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
5562; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
5563; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
5564; GFX9-NEXT:    s_and_b32 s0, 1, s4
5565; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
5566; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[0:1]
5567; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
5568; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v7
5569; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
5570; GFX9-NEXT:    v_add_u32_e32 v3, 0x80000000, v2
5571; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
5572; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
5573; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
5574; GFX9-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
5575; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
5576; GFX9-NEXT:    ; return to shader part epilog
5577;
5578; GFX10-LABEL: saddsat_i128_vs:
5579; GFX10:       ; %bb.0:
5580; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v0, s0
5581; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo
5582; GFX10-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo
5583; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo
5584; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1]
5585; GFX10-NEXT:    s_cmp_eq_u64 s[2:3], 0
5586; GFX10-NEXT:    v_cmp_lt_i64_e64 s1, s[2:3], 0
5587; GFX10-NEXT:    s_cselect_b32 s0, 1, 0
5588; GFX10-NEXT:    s_and_b32 s0, 1, s0
5589; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
5590; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[2:3]
5591; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s1
5592; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
5593; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
5594; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3]
5595; GFX10-NEXT:    v_ashrrev_i32_e32 v2, 31, v7
5596; GFX10-NEXT:    v_add_nc_u32_e32 v3, 0x80000000, v2
5597; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
5598; GFX10-NEXT:    v_cndmask_b32_e64 v1, v8, 0, s0
5599; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
5600; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
5601; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
5602; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc_lo
5603; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc_lo
5604; GFX10-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc_lo
5605; GFX10-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc_lo
5606; GFX10-NEXT:    ; return to shader part epilog
5607;
5608; GFX11-LABEL: saddsat_i128_vs:
5609; GFX11:       ; %bb.0:
5610; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v0, s0
5611; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo
5612; GFX11-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo
5613; GFX11-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo
5614; GFX11-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1]
5615; GFX11-NEXT:    s_cmp_eq_u64 s[2:3], 0
5616; GFX11-NEXT:    v_cmp_lt_i64_e64 s1, s[2:3], 0
5617; GFX11-NEXT:    s_cselect_b32 s0, 1, 0
5618; GFX11-NEXT:    s_and_b32 s0, 1, s0
5619; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
5620; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[2:3]
5621; GFX11-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s1
5622; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
5623; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
5624; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3]
5625; GFX11-NEXT:    v_ashrrev_i32_e32 v2, 31, v7
5626; GFX11-NEXT:    v_dual_cndmask_b32 v0, v1, v0 :: v_dual_add_nc_u32 v3, 0x80000000, v2
5627; GFX11-NEXT:    v_cndmask_b32_e64 v1, v8, 0, s0
5628; GFX11-NEXT:    v_xor_b32_e32 v0, v1, v0
5629; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
5630; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
5631; GFX11-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc_lo
5632; GFX11-NEXT:    v_dual_cndmask_b32 v0, v4, v2 :: v_dual_cndmask_b32 v3, v7, v3
5633; GFX11-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc_lo
5634; GFX11-NEXT:    ; return to shader part epilog
; Intrinsic call under test, followed by a bitcast of the 128-bit result to
; the declared <4 x float> return type.
5635  %result = call i128 @llvm.sadd.sat.i128(i128 %lhs, i128 %rhs)
5636  %cast = bitcast i128 %result to <4 x float>
5637  ret <4 x float> %cast
5638}
5639
; Signed saturating add of two <2 x i128> vectors passed with the default
; calling convention (no inreg); the expected code below operates on VGPRs.
; For each 128-bit element the checks expect a four-dword add-with-carry
; chain, an overflow bit built from compares, selects and an xor, and a final
; group of v_cndmask selects. The value selected on overflow is the high dword
; of the sum shifted right arithmetically by 31 for the low three dwords, and
; that same value plus 0x80000000 for the high dword.
; The assertion lines are autogenerated (see the header of this file);
; regenerate them with utils/update_llc_test_checks.py instead of hand editing.
5640define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
5641; GFX6-LABEL: v_saddsat_v2i128:
5642; GFX6:       ; %bb.0:
5643; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5644; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v0, v8
5645; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, v1, v9, vcc
5646; GFX6-NEXT:    v_addc_u32_e32 v16, vcc, v2, v10, vcc
5647; GFX6-NEXT:    v_addc_u32_e32 v17, vcc, v3, v11, vcc
5648; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, v[8:9], v[0:1]
5649; GFX6-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
5650; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, v[16:17], v[2:3]
5651; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
5652; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, v[16:17], v[2:3]
5653; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 31, v17
5654; GFX6-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
5655; GFX6-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[10:11]
5656; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
5657; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
5658; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
5659; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
5660; GFX6-NEXT:    v_bfrev_b32_e32 v1, 1
5661; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v2, v1
5662; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
5663; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
5664; GFX6-NEXT:    v_cndmask_b32_e32 v0, v8, v2, vcc
5665; GFX6-NEXT:    v_cndmask_b32_e32 v1, v9, v2, vcc
5666; GFX6-NEXT:    v_cndmask_b32_e32 v2, v16, v2, vcc
5667; GFX6-NEXT:    v_cndmask_b32_e32 v3, v17, v3, vcc
5668; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v4, v12
5669; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, v5, v13, vcc
5670; GFX6-NEXT:    v_addc_u32_e32 v10, vcc, v6, v14, vcc
5671; GFX6-NEXT:    v_addc_u32_e32 v11, vcc, v7, v15, vcc
5672; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5]
5673; GFX6-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
5674; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7]
5675; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
5676; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7]
5677; GFX6-NEXT:    v_ashrrev_i32_e32 v6, 31, v11
5678; GFX6-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
5679; GFX6-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[14:15]
5680; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
5681; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[14:15]
5682; GFX6-NEXT:    v_cndmask_b32_e64 v5, v5, 0, vcc
5683; GFX6-NEXT:    v_xor_b32_e32 v4, v5, v4
5684; GFX6-NEXT:    v_add_i32_e32 v7, vcc, 0x80000000, v6
5685; GFX6-NEXT:    v_and_b32_e32 v4, 1, v4
5686; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
5687; GFX6-NEXT:    v_cndmask_b32_e32 v4, v8, v6, vcc
5688; GFX6-NEXT:    v_cndmask_b32_e32 v5, v9, v6, vcc
5689; GFX6-NEXT:    v_cndmask_b32_e32 v6, v10, v6, vcc
5690; GFX6-NEXT:    v_cndmask_b32_e32 v7, v11, v7, vcc
5691; GFX6-NEXT:    s_setpc_b64 s[30:31]
5692;
5693; GFX8-LABEL: v_saddsat_v2i128:
5694; GFX8:       ; %bb.0:
5695; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5696; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v0, v8
5697; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, v1, v9, vcc
5698; GFX8-NEXT:    v_addc_u32_e32 v16, vcc, v2, v10, vcc
5699; GFX8-NEXT:    v_addc_u32_e32 v17, vcc, v3, v11, vcc
5700; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, v[8:9], v[0:1]
5701; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
5702; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, v[16:17], v[2:3]
5703; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
5704; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[16:17], v[2:3]
5705; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 31, v17
5706; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
5707; GFX8-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[10:11]
5708; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
5709; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
5710; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
5711; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
5712; GFX8-NEXT:    v_bfrev_b32_e32 v1, 1
5713; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v2, v1
5714; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
5715; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
5716; GFX8-NEXT:    v_cndmask_b32_e32 v0, v8, v2, vcc
5717; GFX8-NEXT:    v_cndmask_b32_e32 v1, v9, v2, vcc
5718; GFX8-NEXT:    v_cndmask_b32_e32 v2, v16, v2, vcc
5719; GFX8-NEXT:    v_cndmask_b32_e32 v3, v17, v3, vcc
5720; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v4, v12
5721; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, v5, v13, vcc
5722; GFX8-NEXT:    v_addc_u32_e32 v10, vcc, v6, v14, vcc
5723; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, v7, v15, vcc
5724; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5]
5725; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
5726; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7]
5727; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
5728; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7]
5729; GFX8-NEXT:    v_ashrrev_i32_e32 v6, 31, v11
5730; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
5731; GFX8-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[14:15]
5732; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
5733; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[14:15]
5734; GFX8-NEXT:    v_cndmask_b32_e64 v5, v5, 0, vcc
5735; GFX8-NEXT:    v_xor_b32_e32 v4, v5, v4
5736; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x80000000, v6
5737; GFX8-NEXT:    v_and_b32_e32 v4, 1, v4
5738; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
5739; GFX8-NEXT:    v_cndmask_b32_e32 v4, v8, v6, vcc
5740; GFX8-NEXT:    v_cndmask_b32_e32 v5, v9, v6, vcc
5741; GFX8-NEXT:    v_cndmask_b32_e32 v6, v10, v6, vcc
5742; GFX8-NEXT:    v_cndmask_b32_e32 v7, v11, v7, vcc
5743; GFX8-NEXT:    s_setpc_b64 s[30:31]
5744;
5745; GFX9-LABEL: v_saddsat_v2i128:
5746; GFX9:       ; %bb.0:
5747; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5748; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v0, v8
5749; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, v1, v9, vcc
5750; GFX9-NEXT:    v_addc_co_u32_e32 v16, vcc, v2, v10, vcc
5751; GFX9-NEXT:    v_addc_co_u32_e32 v17, vcc, v3, v11, vcc
5752; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, v[8:9], v[0:1]
5753; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
5754; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[16:17], v[2:3]
5755; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
5756; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[16:17], v[2:3]
5757; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v17
5758; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
5759; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[10:11]
5760; GFX9-NEXT:    v_add_u32_e32 v3, 0x80000000, v2
5761; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
5762; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
5763; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
5764; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
5765; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
5766; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
5767; GFX9-NEXT:    v_cndmask_b32_e32 v0, v8, v2, vcc
5768; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v2, vcc
5769; GFX9-NEXT:    v_cndmask_b32_e32 v2, v16, v2, vcc
5770; GFX9-NEXT:    v_cndmask_b32_e32 v3, v17, v3, vcc
5771; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v4, v12
5772; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, v5, v13, vcc
5773; GFX9-NEXT:    v_addc_co_u32_e32 v10, vcc, v6, v14, vcc
5774; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, v7, v15, vcc
5775; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5]
5776; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
5777; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7]
5778; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
5779; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7]
5780; GFX9-NEXT:    v_ashrrev_i32_e32 v6, 31, v11
5781; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
5782; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[14:15]
5783; GFX9-NEXT:    v_add_u32_e32 v7, 0x80000000, v6
5784; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
5785; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[14:15]
5786; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, 0, vcc
5787; GFX9-NEXT:    v_xor_b32_e32 v4, v5, v4
5788; GFX9-NEXT:    v_and_b32_e32 v4, 1, v4
5789; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
5790; GFX9-NEXT:    v_cndmask_b32_e32 v4, v8, v6, vcc
5791; GFX9-NEXT:    v_cndmask_b32_e32 v5, v9, v6, vcc
5792; GFX9-NEXT:    v_cndmask_b32_e32 v6, v10, v6, vcc
5793; GFX9-NEXT:    v_cndmask_b32_e32 v7, v11, v7, vcc
5794; GFX9-NEXT:    s_setpc_b64 s[30:31]
5795;
5796; GFX10-LABEL: v_saddsat_v2i128:
5797; GFX10:       ; %bb.0:
5798; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5799; GFX10-NEXT:    v_add_co_u32 v8, vcc_lo, v0, v8
5800; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v1, v9, vcc_lo
5801; GFX10-NEXT:    v_add_co_ci_u32_e32 v16, vcc_lo, v2, v10, vcc_lo
5802; GFX10-NEXT:    v_add_co_ci_u32_e32 v17, vcc_lo, v3, v11, vcc_lo
5803; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[8:9], v[0:1]
5804; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
5805; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[16:17], v[2:3]
5806; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
5807; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[16:17], v[2:3]
5808; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
5809; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc_lo, 0, v[10:11]
5810; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
5811; GFX10-NEXT:    v_add_co_u32 v12, vcc_lo, v4, v12
5812; GFX10-NEXT:    v_add_co_ci_u32_e32 v13, vcc_lo, v5, v13, vcc_lo
5813; GFX10-NEXT:    v_add_co_ci_u32_e32 v18, vcc_lo, v6, v14, vcc_lo
5814; GFX10-NEXT:    v_add_co_ci_u32_e32 v19, vcc_lo, v7, v15, vcc_lo
5815; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[10:11]
5816; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc_lo
5817; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[12:13], v[4:5]
5818; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
5819; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
5820; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[18:19], v[6:7]
5821; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
5822; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc_lo
5823; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc_lo, 0, v[14:15]
5824; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
5825; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[18:19], v[6:7]
5826; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v19
5827; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc_lo
5828; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15]
5829; GFX10-NEXT:    v_add_nc_u32_e32 v7, 0x80000000, v6
5830; GFX10-NEXT:    v_cndmask_b32_e64 v2, v4, 0, vcc_lo
5831; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
5832; GFX10-NEXT:    v_xor_b32_e32 v1, v2, v1
5833; GFX10-NEXT:    v_ashrrev_i32_e32 v2, 31, v17
5834; GFX10-NEXT:    v_and_b32_e32 v3, 1, v1
5835; GFX10-NEXT:    v_add_nc_u32_e32 v4, 0x80000000, v2
5836; GFX10-NEXT:    v_cndmask_b32_e32 v0, v8, v2, vcc_lo
5837; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v2, vcc_lo
5838; GFX10-NEXT:    v_cndmask_b32_e32 v2, v16, v2, vcc_lo
5839; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, v3
5840; GFX10-NEXT:    v_cndmask_b32_e32 v3, v17, v4, vcc_lo
5841; GFX10-NEXT:    v_cndmask_b32_e64 v4, v12, v6, s4
5842; GFX10-NEXT:    v_cndmask_b32_e64 v5, v13, v6, s4
5843; GFX10-NEXT:    v_cndmask_b32_e64 v6, v18, v6, s4
5844; GFX10-NEXT:    v_cndmask_b32_e64 v7, v19, v7, s4
5845; GFX10-NEXT:    s_setpc_b64 s[30:31]
5846;
5847; GFX11-LABEL: v_saddsat_v2i128:
5848; GFX11:       ; %bb.0:
5849; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5850; GFX11-NEXT:    v_add_co_u32 v8, vcc_lo, v0, v8
5851; GFX11-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v1, v9, vcc_lo
5852; GFX11-NEXT:    v_add_co_ci_u32_e32 v16, vcc_lo, v2, v10, vcc_lo
5853; GFX11-NEXT:    v_add_co_ci_u32_e32 v17, vcc_lo, v3, v11, vcc_lo
5854; GFX11-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[8:9], v[0:1]
5855; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
5856; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[16:17], v[2:3]
5857; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
5858; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[16:17], v[2:3]
5859; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
5860; GFX11-NEXT:    v_cmp_gt_i64_e32 vcc_lo, 0, v[10:11]
5861; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
5862; GFX11-NEXT:    v_add_co_u32 v12, vcc_lo, v4, v12
5863; GFX11-NEXT:    v_add_co_ci_u32_e32 v13, vcc_lo, v5, v13, vcc_lo
5864; GFX11-NEXT:    v_add_co_ci_u32_e32 v18, vcc_lo, v6, v14, vcc_lo
5865; GFX11-NEXT:    v_add_co_ci_u32_e32 v19, vcc_lo, v7, v15, vcc_lo
5866; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[10:11]
5867; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc_lo
5868; GFX11-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[12:13], v[4:5]
5869; GFX11-NEXT:    v_xor_b32_e32 v0, v1, v0
5870; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
5871; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[18:19], v[6:7]
5872; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc_lo
5873; GFX11-NEXT:    v_cmp_gt_i64_e32 vcc_lo, 0, v[14:15]
5874; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
5875; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[18:19], v[6:7]
5876; GFX11-NEXT:    v_ashrrev_i32_e32 v6, 31, v19
5877; GFX11-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc_lo
5878; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15]
5879; GFX11-NEXT:    v_add_nc_u32_e32 v7, 0x80000000, v6
5880; GFX11-NEXT:    v_cndmask_b32_e64 v2, v4, 0, vcc_lo
5881; GFX11-NEXT:    v_xor_b32_e32 v1, v2, v1
5882; GFX11-NEXT:    v_ashrrev_i32_e32 v2, 31, v17
5883; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
5884; GFX11-NEXT:    v_add_nc_u32_e32 v4, 0x80000000, v2
5885; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
5886; GFX11-NEXT:    v_dual_cndmask_b32 v0, v8, v2 :: v_dual_and_b32 v3, 1, v1
5887; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, 0, v3
5888; GFX11-NEXT:    v_cndmask_b32_e32 v1, v9, v2, vcc_lo
5889; GFX11-NEXT:    v_dual_cndmask_b32 v2, v16, v2 :: v_dual_cndmask_b32 v3, v17, v4
5890; GFX11-NEXT:    v_cndmask_b32_e64 v4, v12, v6, s0
5891; GFX11-NEXT:    v_cndmask_b32_e64 v5, v13, v6, s0
5892; GFX11-NEXT:    v_cndmask_b32_e64 v6, v18, v6, s0
5893; GFX11-NEXT:    v_cndmask_b32_e64 v7, v19, v7, s0
5894; GFX11-NEXT:    s_setpc_b64 s[30:31]
5895  %result = call <2 x i128> @llvm.sadd.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
5896  ret <2 x i128> %result
5897}
5898
; Signed saturating add of two <2 x i128> vectors in an amdgpu_ps function
; whose operands are both marked inreg. The expected code below performs the
; 128-bit additions with scalar s_add_u32 / s_addc_u32 chains, still does the
; final per-dword selects with v_cndmask, and then copies the eight result
; dwords into s0-s7 with v_readfirstlane_b32 before the shader epilog.
; The assertion lines are autogenerated (see the header of this file);
; regenerate them with utils/update_llc_test_checks.py instead of hand editing.
5899define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> inreg %rhs) {
5900; GFX6-LABEL: s_saddsat_v2i128:
5901; GFX6:       ; %bb.0:
5902; GFX6-NEXT:    s_add_u32 s8, s0, s8
5903; GFX6-NEXT:    v_mov_b32_e32 v0, s0
5904; GFX6-NEXT:    s_addc_u32 s9, s1, s9
5905; GFX6-NEXT:    v_mov_b32_e32 v1, s1
5906; GFX6-NEXT:    s_addc_u32 s16, s2, s10
5907; GFX6-NEXT:    v_mov_b32_e32 v2, s2
5908; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1]
5909; GFX6-NEXT:    s_addc_u32 s17, s3, s11
5910; GFX6-NEXT:    v_mov_b32_e32 v3, s3
5911; GFX6-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
5912; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, s[16:17], v[2:3]
5913; GFX6-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[10:11], 0
5914; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
5915; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[16:17], v[2:3]
5916; GFX6-NEXT:    v_mov_b32_e32 v2, s8
5917; GFX6-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
5918; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
5919; GFX6-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[10:11], 0
5920; GFX6-NEXT:    v_mov_b32_e32 v3, s9
5921; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[0:1]
5922; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
5923; GFX6-NEXT:    s_ashr_i32 s0, s17, 31
5924; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
5925; GFX6-NEXT:    s_add_i32 s1, s0, 0x80000000
5926; GFX6-NEXT:    v_mov_b32_e32 v1, s0
5927; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
5928; GFX6-NEXT:    v_cndmask_b32_e32 v4, v2, v1, vcc
5929; GFX6-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
5930; GFX6-NEXT:    v_mov_b32_e32 v0, s1
5931; GFX6-NEXT:    v_mov_b32_e32 v2, s16
5932; GFX6-NEXT:    v_mov_b32_e32 v3, s17
5933; GFX6-NEXT:    v_cndmask_b32_e32 v6, v2, v1, vcc
5934; GFX6-NEXT:    v_cndmask_b32_e32 v7, v3, v0, vcc
5935; GFX6-NEXT:    s_add_u32 s0, s4, s12
5936; GFX6-NEXT:    v_mov_b32_e32 v0, s4
5937; GFX6-NEXT:    s_addc_u32 s1, s5, s13
5938; GFX6-NEXT:    v_mov_b32_e32 v1, s5
5939; GFX6-NEXT:    s_addc_u32 s2, s6, s14
5940; GFX6-NEXT:    v_mov_b32_e32 v2, s6
5941; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
5942; GFX6-NEXT:    s_addc_u32 s3, s7, s15
5943; GFX6-NEXT:    v_mov_b32_e32 v3, s7
5944; GFX6-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
5945; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
5946; GFX6-NEXT:    v_cmp_lt_i64_e64 s[4:5], s[14:15], 0
5947; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
5948; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[2:3], v[2:3]
5949; GFX6-NEXT:    v_mov_b32_e32 v2, s0
5950; GFX6-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
5951; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
5952; GFX6-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[14:15], 0
5953; GFX6-NEXT:    v_mov_b32_e32 v3, s1
5954; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[4:5]
5955; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
5956; GFX6-NEXT:    s_ashr_i32 s4, s3, 31
5957; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
5958; GFX6-NEXT:    s_add_i32 s5, s4, 0x80000000
5959; GFX6-NEXT:    v_mov_b32_e32 v1, s4
5960; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
5961; GFX6-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
5962; GFX6-NEXT:    v_cndmask_b32_e32 v2, v3, v1, vcc
5963; GFX6-NEXT:    v_mov_b32_e32 v3, s5
5964; GFX6-NEXT:    v_mov_b32_e32 v8, s2
5965; GFX6-NEXT:    v_mov_b32_e32 v9, s3
5966; GFX6-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
5967; GFX6-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
5968; GFX6-NEXT:    v_readfirstlane_b32 s0, v4
5969; GFX6-NEXT:    v_readfirstlane_b32 s1, v5
5970; GFX6-NEXT:    v_readfirstlane_b32 s2, v6
5971; GFX6-NEXT:    v_readfirstlane_b32 s3, v7
5972; GFX6-NEXT:    v_readfirstlane_b32 s4, v0
5973; GFX6-NEXT:    v_readfirstlane_b32 s5, v2
5974; GFX6-NEXT:    v_readfirstlane_b32 s6, v1
5975; GFX6-NEXT:    v_readfirstlane_b32 s7, v3
5976; GFX6-NEXT:    ; return to shader part epilog
5977;
5978; GFX8-LABEL: s_saddsat_v2i128:
5979; GFX8:       ; %bb.0:
5980; GFX8-NEXT:    s_add_u32 s8, s0, s8
5981; GFX8-NEXT:    s_addc_u32 s9, s1, s9
5982; GFX8-NEXT:    v_mov_b32_e32 v0, s0
5983; GFX8-NEXT:    s_addc_u32 s16, s2, s10
5984; GFX8-NEXT:    v_mov_b32_e32 v1, s1
5985; GFX8-NEXT:    s_addc_u32 s17, s3, s11
5986; GFX8-NEXT:    v_mov_b32_e32 v2, s2
5987; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1]
5988; GFX8-NEXT:    v_mov_b32_e32 v3, s3
5989; GFX8-NEXT:    s_cmp_eq_u64 s[16:17], s[2:3]
5990; GFX8-NEXT:    s_cselect_b32 s0, 1, 0
5991; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
5992; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[16:17], v[2:3]
5993; GFX8-NEXT:    s_and_b32 s0, 1, s0
5994; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
5995; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
5996; GFX8-NEXT:    s_cmp_eq_u64 s[10:11], 0
5997; GFX8-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[10:11], 0
5998; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
5999; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
6000; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
6001; GFX8-NEXT:    s_and_b32 s0, 1, s2
6002; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
6003; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[0:1]
6004; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
6005; GFX8-NEXT:    s_ashr_i32 s0, s17, 31
6006; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
6007; GFX8-NEXT:    s_add_i32 s1, s0, 0x80000000
6008; GFX8-NEXT:    v_mov_b32_e32 v1, s0
6009; GFX8-NEXT:    v_mov_b32_e32 v2, s8
6010; GFX8-NEXT:    v_mov_b32_e32 v3, s9
6011; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
6012; GFX8-NEXT:    v_cndmask_b32_e32 v4, v2, v1, vcc
6013; GFX8-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
6014; GFX8-NEXT:    v_mov_b32_e32 v0, s1
6015; GFX8-NEXT:    v_mov_b32_e32 v2, s16
6016; GFX8-NEXT:    v_mov_b32_e32 v3, s17
6017; GFX8-NEXT:    s_add_u32 s0, s4, s12
6018; GFX8-NEXT:    v_cndmask_b32_e32 v6, v2, v1, vcc
6019; GFX8-NEXT:    v_cndmask_b32_e32 v7, v3, v0, vcc
6020; GFX8-NEXT:    s_addc_u32 s1, s5, s13
6021; GFX8-NEXT:    v_mov_b32_e32 v0, s4
6022; GFX8-NEXT:    s_addc_u32 s2, s6, s14
6023; GFX8-NEXT:    v_mov_b32_e32 v1, s5
6024; GFX8-NEXT:    s_addc_u32 s3, s7, s15
6025; GFX8-NEXT:    v_mov_b32_e32 v2, s6
6026; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
6027; GFX8-NEXT:    v_mov_b32_e32 v3, s7
6028; GFX8-NEXT:    s_cmp_eq_u64 s[2:3], s[6:7]
6029; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
6030; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
6031; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
6032; GFX8-NEXT:    s_and_b32 s4, 1, s4
6033; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
6034; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
6035; GFX8-NEXT:    s_cmp_eq_u64 s[14:15], 0
6036; GFX8-NEXT:    v_cmp_lt_i64_e64 s[4:5], s[14:15], 0
6037; GFX8-NEXT:    s_cselect_b32 s6, 1, 0
6038; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
6039; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
6040; GFX8-NEXT:    s_and_b32 s4, 1, s6
6041; GFX8-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, s4
6042; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[4:5]
6043; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
6044; GFX8-NEXT:    s_ashr_i32 s4, s3, 31
6045; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
6046; GFX8-NEXT:    s_add_i32 s5, s4, 0x80000000
6047; GFX8-NEXT:    v_mov_b32_e32 v1, s4
6048; GFX8-NEXT:    v_mov_b32_e32 v2, s0
6049; GFX8-NEXT:    v_mov_b32_e32 v3, s1
6050; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
6051; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
6052; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v1, vcc
6053; GFX8-NEXT:    v_mov_b32_e32 v3, s5
6054; GFX8-NEXT:    v_mov_b32_e32 v8, s2
6055; GFX8-NEXT:    v_mov_b32_e32 v9, s3
6056; GFX8-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
6057; GFX8-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
6058; GFX8-NEXT:    v_readfirstlane_b32 s0, v4
6059; GFX8-NEXT:    v_readfirstlane_b32 s1, v5
6060; GFX8-NEXT:    v_readfirstlane_b32 s2, v6
6061; GFX8-NEXT:    v_readfirstlane_b32 s3, v7
6062; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
6063; GFX8-NEXT:    v_readfirstlane_b32 s5, v2
6064; GFX8-NEXT:    v_readfirstlane_b32 s6, v1
6065; GFX8-NEXT:    v_readfirstlane_b32 s7, v3
6066; GFX8-NEXT:    ; return to shader part epilog
6067;
6068; GFX9-LABEL: s_saddsat_v2i128:
6069; GFX9:       ; %bb.0:
6070; GFX9-NEXT:    s_add_u32 s8, s0, s8
6071; GFX9-NEXT:    s_addc_u32 s9, s1, s9
6072; GFX9-NEXT:    v_mov_b32_e32 v0, s0
6073; GFX9-NEXT:    s_addc_u32 s16, s2, s10
6074; GFX9-NEXT:    v_mov_b32_e32 v1, s1
6075; GFX9-NEXT:    s_addc_u32 s17, s3, s11
6076; GFX9-NEXT:    v_mov_b32_e32 v2, s2
6077; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1]
6078; GFX9-NEXT:    v_mov_b32_e32 v3, s3
6079; GFX9-NEXT:    s_cmp_eq_u64 s[16:17], s[2:3]
6080; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
6081; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
6082; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[16:17], v[2:3]
6083; GFX9-NEXT:    s_and_b32 s0, 1, s0
6084; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
6085; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
6086; GFX9-NEXT:    s_cmp_eq_u64 s[10:11], 0
6087; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[10:11], 0
6088; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
6089; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
6090; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
6091; GFX9-NEXT:    s_and_b32 s0, 1, s2
6092; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
6093; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[0:1]
6094; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
6095; GFX9-NEXT:    s_ashr_i32 s0, s17, 31
6096; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
6097; GFX9-NEXT:    s_add_i32 s1, s0, 0x80000000
6098; GFX9-NEXT:    v_mov_b32_e32 v1, s0
6099; GFX9-NEXT:    v_mov_b32_e32 v2, s8
6100; GFX9-NEXT:    v_mov_b32_e32 v3, s9
6101; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
6102; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v1, vcc
6103; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
6104; GFX9-NEXT:    v_mov_b32_e32 v0, s1
6105; GFX9-NEXT:    v_mov_b32_e32 v2, s16
6106; GFX9-NEXT:    v_mov_b32_e32 v3, s17
6107; GFX9-NEXT:    s_add_u32 s0, s4, s12
6108; GFX9-NEXT:    v_cndmask_b32_e32 v6, v2, v1, vcc
6109; GFX9-NEXT:    v_cndmask_b32_e32 v7, v3, v0, vcc
6110; GFX9-NEXT:    s_addc_u32 s1, s5, s13
6111; GFX9-NEXT:    v_mov_b32_e32 v0, s4
6112; GFX9-NEXT:    s_addc_u32 s2, s6, s14
6113; GFX9-NEXT:    v_mov_b32_e32 v1, s5
6114; GFX9-NEXT:    s_addc_u32 s3, s7, s15
6115; GFX9-NEXT:    v_mov_b32_e32 v2, s6
6116; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
6117; GFX9-NEXT:    v_mov_b32_e32 v3, s7
6118; GFX9-NEXT:    s_cmp_eq_u64 s[2:3], s[6:7]
6119; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
6120; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
6121; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
6122; GFX9-NEXT:    s_and_b32 s4, 1, s4
6123; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
6124; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
6125; GFX9-NEXT:    s_cmp_eq_u64 s[14:15], 0
6126; GFX9-NEXT:    v_cmp_lt_i64_e64 s[4:5], s[14:15], 0
6127; GFX9-NEXT:    s_cselect_b32 s6, 1, 0
6128; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
6129; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
6130; GFX9-NEXT:    s_and_b32 s4, 1, s6
6131; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, s4
6132; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[4:5]
6133; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
6134; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
6135; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
6136; GFX9-NEXT:    s_add_i32 s5, s4, 0x80000000
6137; GFX9-NEXT:    v_mov_b32_e32 v1, s4
6138; GFX9-NEXT:    v_mov_b32_e32 v2, s0
6139; GFX9-NEXT:    v_mov_b32_e32 v3, s1
6140; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
6141; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
6142; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v1, vcc
6143; GFX9-NEXT:    v_mov_b32_e32 v3, s5
6144; GFX9-NEXT:    v_mov_b32_e32 v8, s2
6145; GFX9-NEXT:    v_mov_b32_e32 v9, s3
6146; GFX9-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
6147; GFX9-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
6148; GFX9-NEXT:    v_readfirstlane_b32 s0, v4
6149; GFX9-NEXT:    v_readfirstlane_b32 s1, v5
6150; GFX9-NEXT:    v_readfirstlane_b32 s2, v6
6151; GFX9-NEXT:    v_readfirstlane_b32 s3, v7
6152; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
6153; GFX9-NEXT:    v_readfirstlane_b32 s5, v2
6154; GFX9-NEXT:    v_readfirstlane_b32 s6, v1
6155; GFX9-NEXT:    v_readfirstlane_b32 s7, v3
6156; GFX9-NEXT:    ; return to shader part epilog
6157;
6158; GFX10-LABEL: s_saddsat_v2i128:
6159; GFX10:       ; %bb.0:
6160; GFX10-NEXT:    s_add_u32 s8, s0, s8
6161; GFX10-NEXT:    s_addc_u32 s9, s1, s9
6162; GFX10-NEXT:    s_addc_u32 s16, s2, s10
6163; GFX10-NEXT:    v_cmp_lt_u64_e64 s0, s[8:9], s[0:1]
6164; GFX10-NEXT:    s_addc_u32 s17, s3, s11
6165; GFX10-NEXT:    v_mov_b32_e32 v4, s9
6166; GFX10-NEXT:    s_cmp_eq_u64 s[16:17], s[2:3]
6167; GFX10-NEXT:    s_cselect_b32 s18, 1, 0
6168; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
6169; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, s[16:17], s[2:3]
6170; GFX10-NEXT:    v_cmp_lt_i64_e64 s2, s[10:11], 0
6171; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
6172; GFX10-NEXT:    s_and_b32 s0, 1, s18
6173; GFX10-NEXT:    s_cmp_eq_u64 s[10:11], 0
6174; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s2
6175; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
6176; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
6177; GFX10-NEXT:    s_and_b32 s1, 1, s1
6178; GFX10-NEXT:    s_ashr_i32 s10, s17, 31
6179; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, s1
6180; GFX10-NEXT:    s_add_i32 s11, s10, 0x80000000
6181; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
6182; GFX10-NEXT:    v_cndmask_b32_e64 v1, v2, 0, s0
6183; GFX10-NEXT:    s_add_u32 s0, s4, s12
6184; GFX10-NEXT:    s_addc_u32 s1, s5, s13
6185; GFX10-NEXT:    s_addc_u32 s2, s6, s14
6186; GFX10-NEXT:    v_cmp_lt_u64_e64 s4, s[0:1], s[4:5]
6187; GFX10-NEXT:    s_addc_u32 s3, s7, s15
6188; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
6189; GFX10-NEXT:    s_cmp_eq_u64 s[2:3], s[6:7]
6190; GFX10-NEXT:    v_mov_b32_e32 v5, s0
6191; GFX10-NEXT:    s_cselect_b32 s12, 1, 0
6192; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s4
6193; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, s[2:3], s[6:7]
6194; GFX10-NEXT:    v_cmp_lt_i64_e64 s6, s[14:15], 0
6195; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
6196; GFX10-NEXT:    v_mov_b32_e32 v6, s1
6197; GFX10-NEXT:    v_mov_b32_e32 v7, s3
6198; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s4
6199; GFX10-NEXT:    s_and_b32 s4, 1, s12
6200; GFX10-NEXT:    s_cmp_eq_u64 s[14:15], 0
6201; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s6
6202; GFX10-NEXT:    s_cselect_b32 s5, 1, 0
6203; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s4
6204; GFX10-NEXT:    s_and_b32 s5, 1, s5
6205; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, s5
6206; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc_lo
6207; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
6208; GFX10-NEXT:    v_mov_b32_e32 v0, s16
6209; GFX10-NEXT:    v_cndmask_b32_e64 v2, v3, 0, s4
6210; GFX10-NEXT:    v_mov_b32_e32 v3, s8
6211; GFX10-NEXT:    s_ashr_i32 s4, s3, 31
6212; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, s10, vcc_lo
6213; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s10, vcc_lo
6214; GFX10-NEXT:    v_xor_b32_e32 v1, v2, v1
6215; GFX10-NEXT:    v_mov_b32_e32 v2, s17
6216; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, s10, vcc_lo
6217; GFX10-NEXT:    s_add_i32 s0, s4, 0x80000000
6218; GFX10-NEXT:    v_readfirstlane_b32 s1, v4
6219; GFX10-NEXT:    v_and_b32_e32 v1, 1, v1
6220; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s11, vcc_lo
6221; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
6222; GFX10-NEXT:    v_mov_b32_e32 v1, s2
6223; GFX10-NEXT:    v_readfirstlane_b32 s2, v0
6224; GFX10-NEXT:    v_readfirstlane_b32 s3, v2
6225; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, s4, vcc_lo
6226; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, s4, vcc_lo
6227; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s4, vcc_lo
6228; GFX10-NEXT:    v_cndmask_b32_e64 v7, v7, s0, vcc_lo
6229; GFX10-NEXT:    v_readfirstlane_b32 s0, v3
6230; GFX10-NEXT:    v_readfirstlane_b32 s4, v5
6231; GFX10-NEXT:    v_readfirstlane_b32 s5, v6
6232; GFX10-NEXT:    v_readfirstlane_b32 s6, v1
6233; GFX10-NEXT:    v_readfirstlane_b32 s7, v7
6234; GFX10-NEXT:    ; return to shader part epilog
6235;
6236; GFX11-LABEL: s_saddsat_v2i128:
6237; GFX11:       ; %bb.0:
6238; GFX11-NEXT:    s_add_u32 s8, s0, s8
6239; GFX11-NEXT:    s_addc_u32 s9, s1, s9
6240; GFX11-NEXT:    s_addc_u32 s16, s2, s10
6241; GFX11-NEXT:    v_cmp_lt_u64_e64 s0, s[8:9], s[0:1]
6242; GFX11-NEXT:    s_addc_u32 s17, s3, s11
6243; GFX11-NEXT:    s_cmp_eq_u64 s[16:17], s[2:3]
6244; GFX11-NEXT:    s_cselect_b32 s18, 1, 0
6245; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
6246; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, s[16:17], s[2:3]
6247; GFX11-NEXT:    v_cmp_lt_i64_e64 s2, s[10:11], 0
6248; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
6249; GFX11-NEXT:    s_and_b32 s0, 1, s18
6250; GFX11-NEXT:    s_cmp_eq_u64 s[10:11], 0
6251; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s2
6252; GFX11-NEXT:    s_cselect_b32 s1, 1, 0
6253; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
6254; GFX11-NEXT:    s_and_b32 s1, 1, s1
6255; GFX11-NEXT:    s_ashr_i32 s10, s17, 31
6256; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, 0, s1
6257; GFX11-NEXT:    s_add_i32 s11, s10, 0x80000000
6258; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
6259; GFX11-NEXT:    v_cndmask_b32_e64 v1, v2, 0, s0
6260; GFX11-NEXT:    s_add_u32 s0, s4, s12
6261; GFX11-NEXT:    s_addc_u32 s1, s5, s13
6262; GFX11-NEXT:    s_addc_u32 s2, s6, s14
6263; GFX11-NEXT:    v_cmp_lt_u64_e64 s4, s[0:1], s[4:5]
6264; GFX11-NEXT:    s_addc_u32 s3, s7, s15
6265; GFX11-NEXT:    v_xor_b32_e32 v0, v1, v0
6266; GFX11-NEXT:    s_cmp_eq_u64 s[2:3], s[6:7]
6267; GFX11-NEXT:    v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v7, s3
6268; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s4
6269; GFX11-NEXT:    v_cmp_lt_i64_e64 s4, s[2:3], s[6:7]
6270; GFX11-NEXT:    s_cselect_b32 s12, 1, 0
6271; GFX11-NEXT:    v_cmp_lt_i64_e64 s6, s[14:15], 0
6272; GFX11-NEXT:    v_dual_mov_b32 v5, s0 :: v_dual_and_b32 v0, 1, v0
6273; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s4
6274; GFX11-NEXT:    s_and_b32 s4, 1, s12
6275; GFX11-NEXT:    s_cmp_eq_u64 s[14:15], 0
6276; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s6
6277; GFX11-NEXT:    s_cselect_b32 s5, 1, 0
6278; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s4
6279; GFX11-NEXT:    s_and_b32 s5, 1, s5
6280; GFX11-NEXT:    v_cmp_ne_u32_e64 s4, 0, s5
6281; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc_lo
6282; GFX11-NEXT:    v_cndmask_b32_e64 v2, v3, 0, s4
6283; GFX11-NEXT:    v_mov_b32_e32 v3, s8
6284; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
6285; GFX11-NEXT:    v_mov_b32_e32 v0, s16
6286; GFX11-NEXT:    s_ashr_i32 s4, s3, 31
6287; GFX11-NEXT:    v_xor_b32_e32 v1, v2, v1
6288; GFX11-NEXT:    v_mov_b32_e32 v4, s9
6289; GFX11-NEXT:    v_mov_b32_e32 v2, s17
6290; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, s10, vcc_lo
6291; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, s10, vcc_lo
6292; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
6293; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, s10, vcc_lo
6294; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, s11, vcc_lo
6295; GFX11-NEXT:    s_add_i32 s0, s4, 0x80000000
6296; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
6297; GFX11-NEXT:    v_mov_b32_e32 v1, s2
6298; GFX11-NEXT:    v_readfirstlane_b32 s1, v4
6299; GFX11-NEXT:    v_readfirstlane_b32 s2, v0
6300; GFX11-NEXT:    v_readfirstlane_b32 s3, v2
6301; GFX11-NEXT:    v_cndmask_b32_e64 v5, v5, s4, vcc_lo
6302; GFX11-NEXT:    v_cndmask_b32_e64 v6, v6, s4, vcc_lo
6303; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s4, vcc_lo
6304; GFX11-NEXT:    v_cndmask_b32_e64 v7, v7, s0, vcc_lo
6305; GFX11-NEXT:    v_readfirstlane_b32 s0, v3
6306; GFX11-NEXT:    v_readfirstlane_b32 s4, v5
6307; GFX11-NEXT:    v_readfirstlane_b32 s5, v6
6308; GFX11-NEXT:    v_readfirstlane_b32 s6, v1
6309; GFX11-NEXT:    v_readfirstlane_b32 s7, v7
6310; GFX11-NEXT:    ; return to shader part epilog
6311  %result = call <2 x i128> @llvm.sadd.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
6312  ret <2 x i128> %result
6313}
6314
; Declarations of the llvm.sadd.sat intrinsic (signed saturating addition),
; one per overload listed here. Each overload takes two operands of a single
; type and returns that same type, and every declaration uses attribute
; group #0.
;
; Sub-byte and byte-sized elements - i7 scalar, i8 scalar, and i8 vectors of
; 2 and 4 elements.
6315declare i7 @llvm.sadd.sat.i7(i7, i7) #0
6316declare i8 @llvm.sadd.sat.i8(i8, i8) #0
6317declare <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8>, <2 x i8>) #0
6318declare <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8>, <4 x i8>) #0
6319
; 16-bit elements - scalar and vectors of 2, 3, 4, 5, 6 and 8 elements.
6320declare i16 @llvm.sadd.sat.i16(i16, i16) #0
6321declare <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16>, <2 x i16>) #0
6322declare <3 x i16> @llvm.sadd.sat.v3i16(<3 x i16>, <3 x i16>) #0
6323declare <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16>, <4 x i16>) #0
6324declare <5 x i16> @llvm.sadd.sat.v5i16(<5 x i16>, <5 x i16>) #0
6325declare <6 x i16> @llvm.sadd.sat.v6i16(<6 x i16>, <6 x i16>) #0
6326declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16>, <8 x i16>) #0
6327
; 24-bit scalar.
6328declare i24 @llvm.sadd.sat.i24(i24, i24) #0
6329
; 32-bit elements - scalar and vectors of 2, 3, 4, 5 and 16 elements.
6330declare i32 @llvm.sadd.sat.i32(i32, i32) #0
6331declare <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32>, <2 x i32>) #0
6332declare <3 x i32> @llvm.sadd.sat.v3i32(<3 x i32>, <3 x i32>) #0
6333declare <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32>, <4 x i32>) #0
6334declare <5 x i32> @llvm.sadd.sat.v5i32(<5 x i32>, <5 x i32>) #0
6335declare <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32>, <16 x i32>) #0
6336
; 48-bit scalar.
6337declare i48 @llvm.sadd.sat.i48(i48, i48) #0
6338
; 64-bit elements - scalar and a 2-element vector.
6339declare i64 @llvm.sadd.sat.i64(i64, i64) #0
6340declare <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64>, <2 x i64>) #0
6341
; 128-bit elements - scalar and a 2-element vector.
6342declare i128 @llvm.sadd.sat.i128(i128, i128) #0
6343declare <2 x i128> @llvm.sadd.sat.v2i128(<2 x i128>, <2 x i128>) #0
6344
; Attribute group #0, referenced by each llvm.sadd.sat declaration listed
; above. It marks the intrinsic as never unwinding (nounwind), not accessing
; memory (readnone), safe to execute speculatively (speculatable), and
; always returning to its caller (willreturn).
6345attributes #0 = { nounwind readnone speculatable willreturn }
6346