; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512

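; Each function below calls an llvm.ssub.with.overflow intrinsic, stores the
; subtraction result to %p2, and returns the per-lane overflow flags
; sign-extended to i32 masks. For legal i32/i64 lane types the CHECK lines
; show the generic vector lowering
;   res = lhs - rhs;  ovf = (rhs > 0) ^ (lhs > res)
; built from pcmpgt compares (mask-register compares plus kxor on AVX512).
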
declare {<1 x i32>, <1 x i1>} @llvm.ssub.with.overflow.v1i32(<1 x i32>, <1 x i32>)
declare {<2 x i32>, <2 x i1>} @llvm.ssub.with.overflow.v2i32(<2 x i32>, <2 x i32>)
declare {<3 x i32>, <3 x i1>} @llvm.ssub.with.overflow.v3i32(<3 x i32>, <3 x i32>)
declare {<4 x i32>, <4 x i1>} @llvm.ssub.with.overflow.v4i32(<4 x i32>, <4 x i32>)
declare {<6 x i32>, <6 x i1>} @llvm.ssub.with.overflow.v6i32(<6 x i32>, <6 x i32>)
declare {<8 x i32>, <8 x i1>} @llvm.ssub.with.overflow.v8i32(<8 x i32>, <8 x i32>)
declare {<16 x i32>, <16 x i1>} @llvm.ssub.with.overflow.v16i32(<16 x i32>, <16 x i32>)

declare {<16 x i8>, <16 x i1>} @llvm.ssub.with.overflow.v16i8(<16 x i8>, <16 x i8>)
declare {<8 x i16>, <8 x i1>} @llvm.ssub.with.overflow.v8i16(<8 x i16>, <8 x i16>)
declare {<2 x i64>, <2 x i1>} @llvm.ssub.with.overflow.v2i64(<2 x i64>, <2 x i64>)

declare {<4 x i24>, <4 x i1>} @llvm.ssub.with.overflow.v4i24(<4 x i24>, <4 x i24>)
declare {<4 x i1>, <4 x i1>} @llvm.ssub.with.overflow.v4i1(<4 x i1>, <4 x i1>)
declare {<2 x i128>, <2 x i1>} @llvm.ssub.with.overflow.v2i128(<2 x i128>, <2 x i128>)

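; <1 x i32> is scalarized: subl sets OF, seto captures it, and negl widens
; the flag byte into the 0/-1 lane the sext expects.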
define <1 x i32> @ssubo_v1i32(<1 x i32> %a0, <1 x i32> %a1, ptr %p2) nounwind {
; CHECK-LABEL: ssubo_v1i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    subl %esi, %edi
; CHECK-NEXT:    seto %al
; CHECK-NEXT:    negl %eax
; CHECK-NEXT:    movl %edi, (%rdx)
; CHECK-NEXT:    retq
  %t = call {<1 x i32>, <1 x i1>} @llvm.ssub.with.overflow.v1i32(<1 x i32> %a0, <1 x i32> %a1)
  %val = extractvalue {<1 x i32>, <1 x i1>} %t, 0
  %obit = extractvalue {<1 x i32>, <1 x i1>} %t, 1
  %res = sext <1 x i1> %obit to <1 x i32>
  store <1 x i32> %val, ptr %p2
  ret <1 x i32> %res
}

define <2 x i32> @ssubo_v2i32(<2 x i32> %a0, <2 x i32> %a1, ptr %p2) nounwind {
; SSE-LABEL: ssubo_v2i32:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    psubd %xmm1, %xmm3
; SSE-NEXT:    pcmpgtd %xmm2, %xmm1
; SSE-NEXT:    pcmpgtd %xmm3, %xmm0
; SSE-NEXT:    pxor %xmm1, %xmm0
; SSE-NEXT:    movq %xmm3, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: ssubo_v2i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpcmpgtd %xmm2, %xmm1, %xmm2
; AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpxor %xmm0, %xmm2, %xmm0
; AVX-NEXT:    vmovq %xmm1, (%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: ssubo_v2i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vpcmpgtd %xmm2, %xmm1, %k0
; AVX512-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpcmpgtd %xmm1, %xmm0, %k1
; AVX512-NEXT:    kxorw %k1, %k0, %k1
; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT:    vmovq %xmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<2 x i32>, <2 x i1>} @llvm.ssub.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1)
  %val = extractvalue {<2 x i32>, <2 x i1>} %t, 0
  %obit = extractvalue {<2 x i32>, <2 x i1>} %t, 1
  %res = sext <2 x i1> %obit to <2 x i32>
  store <2 x i32> %val, ptr %p2
  ret <2 x i32> %res
}

define <3 x i32> @ssubo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind {
; SSE2-LABEL: ssubo_v3i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    psubd %xmm1, %xmm3
; SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
; SSE2-NEXT:    pcmpgtd %xmm3, %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm0
; SSE2-NEXT:    movq %xmm3, (%rdi)
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
; SSE2-NEXT:    movd %xmm1, 8(%rdi)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: ssubo_v3i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pxor %xmm2, %xmm2
; SSSE3-NEXT:    movdqa %xmm0, %xmm3
; SSSE3-NEXT:    psubd %xmm1, %xmm3
; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm1
; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
; SSSE3-NEXT:    pxor %xmm1, %xmm0
; SSSE3-NEXT:    movq %xmm3, (%rdi)
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
; SSSE3-NEXT:    movd %xmm1, 8(%rdi)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: ssubo_v3i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    psubd %xmm1, %xmm3
; SSE41-NEXT:    pcmpgtd %xmm2, %xmm1
; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
; SSE41-NEXT:    pxor %xmm1, %xmm0
; SSE41-NEXT:    pextrd $2, %xmm3, 8(%rdi)
; SSE41-NEXT:    movq %xmm3, (%rdi)
; SSE41-NEXT:    retq
;
; AVX-LABEL: ssubo_v3i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpcmpgtd %xmm2, %xmm1, %xmm2
; AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpxor %xmm0, %xmm2, %xmm0
; AVX-NEXT:    vpextrd $2, %xmm1, 8(%rdi)
; AVX-NEXT:    vmovq %xmm1, (%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: ssubo_v3i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vpcmpgtd %xmm2, %xmm1, %k0
; AVX512-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpcmpgtd %xmm1, %xmm0, %k1
; AVX512-NEXT:    kxorw %k1, %k0, %k1
; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT:    vpextrd $2, %xmm1, 8(%rdi)
; AVX512-NEXT:    vmovq %xmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<3 x i32>, <3 x i1>} @llvm.ssub.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1)
  %val = extractvalue {<3 x i32>, <3 x i1>} %t, 0
  %obit = extractvalue {<3 x i32>, <3 x i1>} %t, 1
  %res = sext <3 x i1> %obit to <3 x i32>
  store <3 x i32> %val, ptr %p2
  ret <3 x i32> %res
}

define <4 x i32> @ssubo_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %p2) nounwind {
; SSE-LABEL: ssubo_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    psubd %xmm1, %xmm3
; SSE-NEXT:    pcmpgtd %xmm2, %xmm1
; SSE-NEXT:    pcmpgtd %xmm3, %xmm0
; SSE-NEXT:    pxor %xmm1, %xmm0
; SSE-NEXT:    movdqa %xmm3, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: ssubo_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpcmpgtd %xmm2, %xmm1, %xmm2
; AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpxor %xmm0, %xmm2, %xmm0
; AVX-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: ssubo_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vpcmpgtd %xmm2, %xmm1, %k0
; AVX512-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpcmpgtd %xmm1, %xmm0, %k1
; AVX512-NEXT:    kxorw %k1, %k0, %k1
; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<4 x i32>, <4 x i1>} @llvm.ssub.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> %a1)
  %val = extractvalue {<4 x i32>, <4 x i1>} %t, 0
  %obit = extractvalue {<4 x i32>, <4 x i1>} %t, 1
  %res = sext <4 x i1> %obit to <4 x i32>
  store <4 x i32> %val, ptr %p2
  ret <4 x i32> %res
}

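; <6 x i32> is not a legal type: it is widened to v8i32 and split into two
; xmm halves on SSE, where the argument vectors also arrive in GPRs/stack and
; the result is returned through a hidden sret pointer; the value store is
; split into a 16-byte and an 8-byte piece.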
define <6 x i32> @ssubo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
; SSE2-LABEL: ssubo_v6i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movq %rdi, %rax
; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    movd %r8d, %xmm1
; SSE2-NEXT:    movd %ecx, %xmm2
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT:    movd %edx, %xmm1
; SSE2-NEXT:    movd %esi, %xmm3
; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT:    movd %r9d, %xmm1
; SSE2-NEXT:    movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
; SSE2-NEXT:    movdqa %xmm3, %xmm4
; SSE2-NEXT:    psubd %xmm0, %xmm4
; SSE2-NEXT:    pcmpgtd %xmm4, %xmm3
; SSE2-NEXT:    pxor %xmm5, %xmm5
; SSE2-NEXT:    pcmpgtd %xmm5, %xmm0
; SSE2-NEXT:    pxor %xmm3, %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:    psubd %xmm2, %xmm3
; SSE2-NEXT:    pcmpgtd %xmm3, %xmm1
; SSE2-NEXT:    pcmpgtd %xmm5, %xmm2
; SSE2-NEXT:    pxor %xmm1, %xmm2
; SSE2-NEXT:    movq %xmm3, 16(%rcx)
; SSE2-NEXT:    movdqa %xmm4, (%rcx)
; SSE2-NEXT:    movq %xmm2, 16(%rdi)
; SSE2-NEXT:    movdqa %xmm0, (%rdi)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: ssubo_v6i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movq %rdi, %rax
; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT:    movd %r8d, %xmm1
; SSSE3-NEXT:    movd %ecx, %xmm2
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSSE3-NEXT:    movd %edx, %xmm1
; SSSE3-NEXT:    movd %esi, %xmm3
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSSE3-NEXT:    movd %r9d, %xmm1
; SSSE3-NEXT:    movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
; SSSE3-NEXT:    movdqa %xmm3, %xmm4
; SSSE3-NEXT:    psubd %xmm0, %xmm4
; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm3
; SSSE3-NEXT:    pxor %xmm5, %xmm5
; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm0
; SSSE3-NEXT:    pxor %xmm3, %xmm0
; SSSE3-NEXT:    movdqa %xmm1, %xmm3
; SSSE3-NEXT:    psubd %xmm2, %xmm3
; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm1
; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm2
; SSSE3-NEXT:    pxor %xmm1, %xmm2
; SSSE3-NEXT:    movq %xmm3, 16(%rcx)
; SSSE3-NEXT:    movdqa %xmm4, (%rcx)
; SSSE3-NEXT:    movq %xmm2, 16(%rdi)
; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: ssubo_v6i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movq %rdi, %rax
; SSE41-NEXT:    movd %esi, %xmm1
; SSE41-NEXT:    pinsrd $1, %edx, %xmm1
; SSE41-NEXT:    pinsrd $2, %ecx, %xmm1
; SSE41-NEXT:    pinsrd $3, %r8d, %xmm1
; SSE41-NEXT:    movd %r9d, %xmm0
; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm0
; SSE41-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm2
; SSE41-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm3
; SSE41-NEXT:    pinsrd $2, {{[0-9]+}}(%rsp), %xmm3
; SSE41-NEXT:    pinsrd $3, {{[0-9]+}}(%rsp), %xmm3
; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
; SSE41-NEXT:    movdqa %xmm1, %xmm4
; SSE41-NEXT:    psubd %xmm3, %xmm4
; SSE41-NEXT:    pcmpgtd %xmm4, %xmm1
; SSE41-NEXT:    pxor %xmm5, %xmm5
; SSE41-NEXT:    pcmpgtd %xmm5, %xmm3
; SSE41-NEXT:    pxor %xmm1, %xmm3
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psubd %xmm2, %xmm1
; SSE41-NEXT:    pcmpgtd %xmm5, %xmm2
; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
; SSE41-NEXT:    pxor %xmm2, %xmm0
; SSE41-NEXT:    movq %xmm1, 16(%rcx)
; SSE41-NEXT:    movdqa %xmm4, (%rcx)
; SSE41-NEXT:    movq %xmm0, 16(%rdi)
; SSE41-NEXT:    movdqa %xmm3, (%rdi)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: ssubo_v6i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm2, %xmm4
; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm1, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT:    vpsubd %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm4, %xmm4
; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX1-NEXT:    vxorps %ymm0, %ymm3, %ymm0
; AVX1-NEXT:    vmovq %xmm2, 16(%rdi)
; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ssubo_v6i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpcmpgtd %ymm2, %ymm1, %ymm2
; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpxor %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vmovq %xmm2, 16(%rdi)
; AVX2-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: ssubo_v6i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vpcmpgtd %ymm2, %ymm1, %k0
; AVX512-NEXT:    vpsubd %ymm1, %ymm0, %ymm1
; AVX512-NEXT:    vpcmpgtd %ymm1, %ymm0, %k1
; AVX512-NEXT:    kxorw %k1, %k0, %k1
; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512-NEXT:    vmovq %xmm2, 16(%rdi)
; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<6 x i32>, <6 x i1>} @llvm.ssub.with.overflow.v6i32(<6 x i32> %a0, <6 x i32> %a1)
  %val = extractvalue {<6 x i32>, <6 x i1>} %t, 0
  %obit = extractvalue {<6 x i32>, <6 x i1>} %t, 1
  %res = sext <6 x i1> %obit to <6 x i32>
  store <6 x i32> %val, ptr %p2
  ret <6 x i32> %res
}

define <8 x i32> @ssubo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind {
; SSE-LABEL: ssubo_v8i32:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm4, %xmm4
; SSE-NEXT:    movdqa %xmm0, %xmm5
; SSE-NEXT:    psubd %xmm2, %xmm5
; SSE-NEXT:    pcmpgtd %xmm4, %xmm2
; SSE-NEXT:    pcmpgtd %xmm5, %xmm0
; SSE-NEXT:    pxor %xmm2, %xmm0
; SSE-NEXT:    movdqa %xmm1, %xmm2
; SSE-NEXT:    psubd %xmm3, %xmm2
; SSE-NEXT:    pcmpgtd %xmm4, %xmm3
; SSE-NEXT:    pcmpgtd %xmm2, %xmm1
; SSE-NEXT:    pxor %xmm3, %xmm1
; SSE-NEXT:    movdqa %xmm2, 16(%rdi)
; SSE-NEXT:    movdqa %xmm5, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: ssubo_v8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm2, %xmm4
; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm1, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT:    vpsubd %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm4, %xmm4
; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX1-NEXT:    vxorps %ymm0, %ymm3, %ymm0
; AVX1-NEXT:    vmovdqa %xmm2, 16(%rdi)
; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ssubo_v8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpcmpgtd %ymm2, %ymm1, %ymm2
; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpxor %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vmovdqa %ymm1, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: ssubo_v8i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vpcmpgtd %ymm2, %ymm1, %k0
; AVX512-NEXT:    vpsubd %ymm1, %ymm0, %ymm1
; AVX512-NEXT:    vpcmpgtd %ymm1, %ymm0, %k1
; AVX512-NEXT:    kxorw %k1, %k0, %k1
; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512-NEXT:    vmovdqa %ymm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<8 x i32>, <8 x i1>} @llvm.ssub.with.overflow.v8i32(<8 x i32> %a0, <8 x i32> %a1)
  %val = extractvalue {<8 x i32>, <8 x i1>} %t, 0
  %obit = extractvalue {<8 x i32>, <8 x i1>} %t, 1
  %res = sext <8 x i1> %obit to <8 x i32>
  store <8 x i32> %val, ptr %p2
  ret <8 x i32> %res
}

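; At v16i32 the AVX512 lowering stays native (zmm compares into k-registers,
; vpternlogd materializing the -1 lanes), while SSE/AVX repeat the 128/256-bit
; pattern per chunk and AVX1/AVX2 pack the overflow words through
; packssdw/packsswb before sign-extending back to i32 lanes.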
define <16 x i32> @ssubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwind {
; SSE-LABEL: ssubo_v16i32:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm9, %xmm9
; SSE-NEXT:    movdqa %xmm0, %xmm8
; SSE-NEXT:    psubd %xmm4, %xmm8
; SSE-NEXT:    pcmpgtd %xmm9, %xmm4
; SSE-NEXT:    pcmpgtd %xmm8, %xmm0
; SSE-NEXT:    pxor %xmm4, %xmm0
; SSE-NEXT:    movdqa %xmm1, %xmm4
; SSE-NEXT:    psubd %xmm5, %xmm4
; SSE-NEXT:    pcmpgtd %xmm9, %xmm5
; SSE-NEXT:    pcmpgtd %xmm4, %xmm1
; SSE-NEXT:    pxor %xmm5, %xmm1
; SSE-NEXT:    movdqa %xmm2, %xmm5
; SSE-NEXT:    psubd %xmm6, %xmm5
; SSE-NEXT:    pcmpgtd %xmm9, %xmm6
; SSE-NEXT:    pcmpgtd %xmm5, %xmm2
; SSE-NEXT:    pxor %xmm6, %xmm2
; SSE-NEXT:    movdqa %xmm3, %xmm6
; SSE-NEXT:    psubd %xmm7, %xmm6
; SSE-NEXT:    pcmpgtd %xmm9, %xmm7
; SSE-NEXT:    pcmpgtd %xmm6, %xmm3
; SSE-NEXT:    pxor %xmm7, %xmm3
; SSE-NEXT:    movdqa %xmm6, 48(%rdi)
; SSE-NEXT:    movdqa %xmm5, 32(%rdi)
; SSE-NEXT:    movdqa %xmm4, 16(%rdi)
; SSE-NEXT:    movdqa %xmm8, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: ssubo_v16i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
; AVX1-NEXT:    vpcmpgtd %xmm5, %xmm4, %xmm6
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
; AVX1-NEXT:    vpsubd %xmm4, %xmm7, %xmm4
; AVX1-NEXT:    vpcmpgtd %xmm4, %xmm7, %xmm7
; AVX1-NEXT:    vpxor %xmm7, %xmm6, %xmm6
; AVX1-NEXT:    vpcmpgtd %xmm5, %xmm3, %xmm7
; AVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm3
; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpxor %xmm1, %xmm7, %xmm1
; AVX1-NEXT:    vpackssdw %xmm6, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm6
; AVX1-NEXT:    vpcmpgtd %xmm5, %xmm6, %xmm7
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm8
; AVX1-NEXT:    vpsubd %xmm6, %xmm8, %xmm6
; AVX1-NEXT:    vpcmpgtd %xmm6, %xmm8, %xmm8
; AVX1-NEXT:    vpxor %xmm7, %xmm8, %xmm7
; AVX1-NEXT:    vpcmpgtd %xmm5, %xmm2, %xmm5
; AVX1-NEXT:    vpsubd %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm0, %xmm5, %xmm0
; AVX1-NEXT:    vpackssdw %xmm7, %xmm0, %xmm0
; AVX1-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm5
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm5, %ymm0
; AVX1-NEXT:    vpacksswb %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm5
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm5, %ymm1
; AVX1-NEXT:    vmovdqa %xmm4, 48(%rdi)
; AVX1-NEXT:    vmovdqa %xmm3, 32(%rdi)
; AVX1-NEXT:    vmovdqa %xmm6, 16(%rdi)
; AVX1-NEXT:    vmovdqa %xmm2, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ssubo_v16i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
; AVX2-NEXT:    vpcmpgtd %ymm4, %ymm3, %ymm5
; AVX2-NEXT:    vpsubd %ymm3, %ymm1, %ymm3
; AVX2-NEXT:    vpcmpgtd %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpxor %ymm1, %ymm5, %ymm1
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm5
; AVX2-NEXT:    vpackssdw %xmm5, %xmm1, %xmm1
; AVX2-NEXT:    vpcmpgtd %ymm4, %ymm2, %ymm4
; AVX2-NEXT:    vpsubd %ymm2, %ymm0, %ymm2
; AVX2-NEXT:    vpcmpgtd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpxor %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    vpacksswb %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpmovsxbd %xmm1, %ymm1
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12,16,16,16,16,20,20,20,20,24,24,24,24,28,28,28,28]
; AVX2-NEXT:    vmovdqa %ymm3, 32(%rdi)
; AVX2-NEXT:    vmovdqa %ymm2, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: ssubo_v16i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vpcmpgtd %zmm2, %zmm1, %k0
; AVX512-NEXT:    vpsubd %zmm1, %zmm0, %zmm1
; AVX512-NEXT:    vpcmpgtd %zmm1, %zmm0, %k1
; AVX512-NEXT:    kxorw %k1, %k0, %k1
; AVX512-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512-NEXT:    vmovdqa64 %zmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<16 x i32>, <16 x i1>} @llvm.ssub.with.overflow.v16i32(<16 x i32> %a0, <16 x i32> %a1)
  %val = extractvalue {<16 x i32>, <16 x i1>} %t, 0
  %obit = extractvalue {<16 x i32>, <16 x i1>} %t, 1
  %res = sext <16 x i1> %obit to <16 x i32>
  store <16 x i32> %val, ptr %p2
  ret <16 x i32> %res
}

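; For i8/i16 lanes there is no cheap signed-compare trick, so the lowering
; subtracts twice, once saturating (psubsb/psubsw) and once wrapping
; (psubb/psubw), and flags overflow wherever the two results differ.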
define <16 x i32> @ssubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind {
; SSE2-LABEL: ssubo_v16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psubsb %xmm1, %xmm2
; SSE2-NEXT:    psubb %xmm1, %xmm0
; SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
; SSE2-NEXT:    pcmpeqd %xmm3, %xmm3
; SSE2-NEXT:    pxor %xmm2, %xmm3
; SSE2-NEXT:    movdqa %xmm3, %xmm4
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    movdqa %xmm3, %xmm1
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pslld $31, %xmm1
; SSE2-NEXT:    psrad $31, %xmm1
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    movdqa %xmm3, %xmm2
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    pslld $31, %xmm2
; SSE2-NEXT:    psrad $31, %xmm2
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pslld $31, %xmm3
; SSE2-NEXT:    psrad $31, %xmm3
; SSE2-NEXT:    movdqa %xmm0, (%rdi)
; SSE2-NEXT:    movdqa %xmm4, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: ssubo_v16i8:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    psubsb %xmm1, %xmm2
; SSSE3-NEXT:    psubb %xmm1, %xmm0
; SSSE3-NEXT:    pcmpeqb %xmm0, %xmm2
; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm3
; SSSE3-NEXT:    pxor %xmm2, %xmm3
; SSSE3-NEXT:    movdqa %xmm3, %xmm4
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3]
; SSSE3-NEXT:    movdqa %xmm3, %xmm1
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    pslld $31, %xmm1
; SSSE3-NEXT:    psrad $31, %xmm1
; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSSE3-NEXT:    movdqa %xmm3, %xmm2
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; SSSE3-NEXT:    pslld $31, %xmm2
; SSSE3-NEXT:    psrad $31, %xmm2
; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    pslld $31, %xmm3
; SSSE3-NEXT:    psrad $31, %xmm3
; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
; SSSE3-NEXT:    movdqa %xmm4, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: ssubo_v16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psubsb %xmm1, %xmm2
; SSE41-NEXT:    psubb %xmm1, %xmm0
; SSE41-NEXT:    pcmpeqb %xmm0, %xmm2
; SSE41-NEXT:    pcmpeqd %xmm3, %xmm3
; SSE41-NEXT:    pxor %xmm2, %xmm3
; SSE41-NEXT:    pmovsxbd %xmm3, %xmm4
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE41-NEXT:    pslld $31, %xmm1
; SSE41-NEXT:    psrad $31, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; SSE41-NEXT:    pslld $31, %xmm2
; SSE41-NEXT:    psrad $31, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; SSE41-NEXT:    pslld $31, %xmm3
; SSE41-NEXT:    psrad $31, %xmm3
; SSE41-NEXT:    movdqa %xmm0, (%rdi)
; SSE41-NEXT:    movdqa %xmm4, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: ssubo_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsubsb %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm3, %xmm0
; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
; AVX1-NEXT:    vpmovsxbd %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; AVX1-NEXT:    vpmovsxbd %xmm2, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT:    vmovdqa %xmm3, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ssubo_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsubsb %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm3
; AVX2-NEXT:    vpcmpeqb %xmm2, %xmm3, %xmm0
; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpmovsxbd %xmm1, %ymm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX2-NEXT:    vpmovsxbd %xmm1, %ymm1
; AVX2-NEXT:    vmovdqa %xmm3, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: ssubo_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsubsb %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vpsubb %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpcmpneqb %xmm2, %xmm1, %k1
; AVX512-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<16 x i8>, <16 x i1>} @llvm.ssub.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1)
  %val = extractvalue {<16 x i8>, <16 x i1>} %t, 0
  %obit = extractvalue {<16 x i8>, <16 x i1>} %t, 1
  %res = sext <16 x i1> %obit to <16 x i32>
  store <16 x i8> %val, ptr %p2
  ret <16 x i32> %res
}

define <8 x i32> @ssubo_v8i16(<8 x i16> %a0, <8 x i16> %a1, ptr %p2) nounwind {
; SSE2-LABEL: ssubo_v8i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psubsw %xmm1, %xmm2
; SSE2-NEXT:    psubw %xmm1, %xmm0
; SSE2-NEXT:    pcmpeqw %xmm0, %xmm2
; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE2-NEXT:    pxor %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pslld $31, %xmm1
; SSE2-NEXT:    psrad $31, %xmm1
; SSE2-NEXT:    movdqa %xmm0, (%rdi)
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: ssubo_v8i16:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    psubsw %xmm1, %xmm2
; SSSE3-NEXT:    psubw %xmm1, %xmm0
; SSSE3-NEXT:    pcmpeqw %xmm0, %xmm2
; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
; SSSE3-NEXT:    pxor %xmm2, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm2
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    pslld $31, %xmm1
; SSSE3-NEXT:    psrad $31, %xmm1
; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
; SSSE3-NEXT:    movdqa %xmm2, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: ssubo_v8i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psubsw %xmm1, %xmm2
; SSE41-NEXT:    psubw %xmm1, %xmm0
; SSE41-NEXT:    pcmpeqw %xmm0, %xmm2
; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE41-NEXT:    pxor %xmm2, %xmm1
; SSE41-NEXT:    pmovsxwd %xmm1, %xmm2
; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE41-NEXT:    pslld $31, %xmm1
; SSE41-NEXT:    psrad $31, %xmm1
; SSE41-NEXT:    movdqa %xmm0, (%rdi)
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: ssubo_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsubsw %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpcmpeqw %xmm2, %xmm1, %xmm0
; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm2
; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ssubo_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsubsw %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vpsubw %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpcmpeqw %xmm2, %xmm1, %xmm0
; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: ssubo_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsubsw %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vpsubw %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpcmpneqw %xmm2, %xmm1, %k1
; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<8 x i16>, <8 x i1>} @llvm.ssub.with.overflow.v8i16(<8 x i16> %a0, <8 x i16> %a1)
  %val = extractvalue {<8 x i16>, <8 x i1>} %t, 0
  %obit = extractvalue {<8 x i16>, <8 x i1>} %t, 1
  %res = sext <8 x i1> %obit to <8 x i32>
  store <8 x i16> %val, ptr %p2
  ret <8 x i32> %res
}

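; Baseline SSE has no pcmpgtq, so the signed i64 compares are emulated with
; 32-bit pcmpgtd/pcmpeqd: the low dwords are biased by 0x80000000 so they
; order as unsigned, then hi-greater is combined with hi-equal-and-lo-greater.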
define <2 x i32> @ssubo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind {
; SSE2-LABEL: ssubo_v2i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    pxor %xmm2, %xmm3
; SSE2-NEXT:    psubq %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, (%rdi)
; SSE2-NEXT:    pxor %xmm2, %xmm0
; SSE2-NEXT:    movdqa %xmm3, %xmm4
; SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
; SSE2-NEXT:    pcmpeqd %xmm3, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT:    pand %xmm4, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
; SSE2-NEXT:    por %xmm0, %xmm3
; SSE2-NEXT:    pxor %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    pcmpgtd %xmm2, %xmm0
; SSE2-NEXT:    pcmpeqd %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    pxor %xmm3, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: ssubo_v2i64:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; SSSE3-NEXT:    movdqa %xmm0, %xmm3
; SSSE3-NEXT:    pxor %xmm2, %xmm3
; SSSE3-NEXT:    psubq %xmm1, %xmm0
; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
; SSSE3-NEXT:    pxor %xmm2, %xmm0
; SSSE3-NEXT:    movdqa %xmm3, %xmm4
; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm4
; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm0
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSSE3-NEXT:    pand %xmm4, %xmm0
; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
; SSSE3-NEXT:    por %xmm0, %xmm3
; SSSE3-NEXT:    pxor %xmm2, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm0
; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm1
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSSE3-NEXT:    pand %xmm0, %xmm1
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSSE3-NEXT:    por %xmm1, %xmm0
; SSSE3-NEXT:    pxor %xmm3, %xmm0
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: ssubo_v2i64:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648]
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    pxor %xmm2, %xmm3
; SSE41-NEXT:    psubq %xmm1, %xmm0
; SSE41-NEXT:    movdqa %xmm0, (%rdi)
; SSE41-NEXT:    pxor %xmm2, %xmm0
; SSE41-NEXT:    movdqa %xmm3, %xmm4
; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
; SSE41-NEXT:    pcmpeqd %xmm3, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE41-NEXT:    pand %xmm4, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
; SSE41-NEXT:    por %xmm0, %xmm3
; SSE41-NEXT:    pxor %xmm2, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    pcmpgtd %xmm2, %xmm0
; SSE41-NEXT:    pcmpeqd %xmm2, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE41-NEXT:    pand %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE41-NEXT:    por %xmm1, %xmm0
; SSE41-NEXT:    pxor %xmm3, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: ssubo_v2i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpcmpgtq %xmm2, %xmm1, %xmm2
; AVX-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpxor %xmm0, %xmm2, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: ssubo_v2i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vpcmpgtq %xmm2, %xmm1, %k0
; AVX512-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpcmpgtq %xmm1, %xmm0, %k1
; AVX512-NEXT:    kxorw %k1, %k0, %k1
; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<2 x i64>, <2 x i1>} @llvm.ssub.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1)
  %val = extractvalue {<2 x i64>, <2 x i1>} %t, 0
  %obit = extractvalue {<2 x i64>, <2 x i1>} %t, 1
  %res = sext <2 x i1> %obit to <2 x i32>
  store <2 x i64> %val, ptr %p2
  ret <2 x i32> %res
}

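; i24 lanes live sign-extended in i32 elements (pslld/psrad by 8); after the
; i32 psubd, re-truncating and comparing against the raw result detects lanes
; that left the i24 range, and the 3-byte values are written back with
; overlapping word+byte stores.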
define <4 x i32> @ssubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind {
; SSE2-LABEL: ssubo_v4i24:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pslld $8, %xmm1
; SSE2-NEXT:    psrad $8, %xmm1
; SSE2-NEXT:    pslld $8, %xmm2
; SSE2-NEXT:    psrad $8, %xmm2
; SSE2-NEXT:    psubd %xmm1, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm1
; SSE2-NEXT:    pslld $8, %xmm1
; SSE2-NEXT:    psrad $8, %xmm1
; SSE2-NEXT:    pcmpeqd %xmm2, %xmm1
; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm0
; SSE2-NEXT:    movd %xmm2, %eax
; SSE2-NEXT:    movw %ax, (%rdi)
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3]
; SSE2-NEXT:    movd %xmm1, %ecx
; SSE2-NEXT:    movw %cx, 9(%rdi)
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; SSE2-NEXT:    movd %xmm1, %edx
; SSE2-NEXT:    movw %dx, 6(%rdi)
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
; SSE2-NEXT:    movd %xmm1, %esi
; SSE2-NEXT:    movw %si, 3(%rdi)
; SSE2-NEXT:    shrl $16, %eax
; SSE2-NEXT:    movb %al, 2(%rdi)
; SSE2-NEXT:    shrl $16, %ecx
; SSE2-NEXT:    movb %cl, 11(%rdi)
; SSE2-NEXT:    shrl $16, %edx
; SSE2-NEXT:    movb %dl, 8(%rdi)
; SSE2-NEXT:    shrl $16, %esi
; SSE2-NEXT:    movb %sil, 5(%rdi)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: ssubo_v4i24:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pslld $8, %xmm1
; SSSE3-NEXT:    psrad $8, %xmm1
; SSSE3-NEXT:    pslld $8, %xmm2
; SSSE3-NEXT:    psrad $8, %xmm2
; SSSE3-NEXT:    psubd %xmm1, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-NEXT:    pslld $8, %xmm1
; SSSE3-NEXT:    psrad $8, %xmm1
; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm1
; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm0
; SSSE3-NEXT:    pxor %xmm1, %xmm0
; SSSE3-NEXT:    movd %xmm2, %eax
; SSSE3-NEXT:    movw %ax, (%rdi)
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3]
; SSSE3-NEXT:    movd %xmm1, %ecx
; SSSE3-NEXT:    movw %cx, 9(%rdi)
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; SSSE3-NEXT:    movd %xmm1, %edx
; SSSE3-NEXT:    movw %dx, 6(%rdi)
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
; SSSE3-NEXT:    movd %xmm1, %esi
; SSSE3-NEXT:    movw %si, 3(%rdi)
; SSSE3-NEXT:    shrl $16, %eax
; SSSE3-NEXT:    movb %al, 2(%rdi)
; SSSE3-NEXT:    shrl $16, %ecx
; SSSE3-NEXT:    movb %cl, 11(%rdi)
; SSSE3-NEXT:    shrl $16, %edx
; SSSE3-NEXT:    movb %dl, 8(%rdi)
; SSSE3-NEXT:    shrl $16, %esi
; SSSE3-NEXT:    movb %sil, 5(%rdi)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: ssubo_v4i24:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pslld $8, %xmm1
; SSE41-NEXT:    psrad $8, %xmm1
; SSE41-NEXT:    pslld $8, %xmm0
; SSE41-NEXT:    psrad $8, %xmm0
; SSE41-NEXT:    psubd %xmm1, %xmm0
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    pslld $8, %xmm2
; SSE41-NEXT:    psrad $8, %xmm2
; SSE41-NEXT:    pcmpeqd %xmm0, %xmm2
; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE41-NEXT:    pxor %xmm2, %xmm1
; SSE41-NEXT:    pextrd $3, %xmm0, %eax
; SSE41-NEXT:    movw %ax, 9(%rdi)
; SSE41-NEXT:    pextrd $2, %xmm0, %ecx
; SSE41-NEXT:    movw %cx, 6(%rdi)
; SSE41-NEXT:    pextrd $1, %xmm0, %edx
; SSE41-NEXT:    movw %dx, 3(%rdi)
; SSE41-NEXT:    movd %xmm0, %esi
; SSE41-NEXT:    movw %si, (%rdi)
; SSE41-NEXT:    shrl $16, %eax
; SSE41-NEXT:    movb %al, 11(%rdi)
; SSE41-NEXT:    shrl $16, %ecx
; SSE41-NEXT:    movb %cl, 8(%rdi)
; SSE41-NEXT:    shrl $16, %edx
; SSE41-NEXT:    movb %dl, 5(%rdi)
; SSE41-NEXT:    shrl $16, %esi
; SSE41-NEXT:    movb %sil, 2(%rdi)
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: ssubo_v4i24:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslld $8, %xmm1, %xmm1
; AVX-NEXT:    vpsrad $8, %xmm1, %xmm1
; AVX-NEXT:    vpslld $8, %xmm0, %xmm0
; AVX-NEXT:    vpsrad $8, %xmm0, %xmm0
; AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vpslld $8, %xmm1, %xmm0
; AVX-NEXT:    vpsrad $8, %xmm0, %xmm0
; AVX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpextrd $3, %xmm1, %eax
; AVX-NEXT:    movw %ax, 9(%rdi)
; AVX-NEXT:    vpextrd $2, %xmm1, %ecx
; AVX-NEXT:    movw %cx, 6(%rdi)
; AVX-NEXT:    vpextrd $1, %xmm1, %edx
; AVX-NEXT:    movw %dx, 3(%rdi)
; AVX-NEXT:    vmovd %xmm1, %esi
; AVX-NEXT:    movw %si, (%rdi)
; AVX-NEXT:    shrl $16, %eax
; AVX-NEXT:    movb %al, 11(%rdi)
; AVX-NEXT:    shrl $16, %ecx
; AVX-NEXT:    movb %cl, 8(%rdi)
; AVX-NEXT:    shrl $16, %edx
; AVX-NEXT:    movb %dl, 5(%rdi)
; AVX-NEXT:    shrl $16, %esi
; AVX-NEXT:    movb %sil, 2(%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: ssubo_v4i24:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpslld $8, %xmm1, %xmm1
; AVX512-NEXT:    vpsrad $8, %xmm1, %xmm1
; AVX512-NEXT:    vpslld $8, %xmm0, %xmm0
; AVX512-NEXT:    vpsrad $8, %xmm0, %xmm0
; AVX512-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpslld $8, %xmm1, %xmm0
; AVX512-NEXT:    vpsrad $8, %xmm0, %xmm0
; AVX512-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
; AVX512-NEXT:    vpextrd $3, %xmm1, %eax
; AVX512-NEXT:    movw %ax, 9(%rdi)
; AVX512-NEXT:    vpextrd $2, %xmm1, %ecx
; AVX512-NEXT:    movw %cx, 6(%rdi)
; AVX512-NEXT:    vpextrd $1, %xmm1, %edx
; AVX512-NEXT:    movw %dx, 3(%rdi)
; AVX512-NEXT:    vmovd %xmm1, %esi
; AVX512-NEXT:    movw %si, (%rdi)
; AVX512-NEXT:    shrl $16, %eax
; AVX512-NEXT:    movb %al, 11(%rdi)
; AVX512-NEXT:    shrl $16, %ecx
; AVX512-NEXT:    movb %cl, 8(%rdi)
; AVX512-NEXT:    shrl $16, %edx
; AVX512-NEXT:    movb %dl, 5(%rdi)
; AVX512-NEXT:    shrl $16, %esi
; AVX512-NEXT:    movb %sil, 2(%rdi)
; AVX512-NEXT:    retq
  %t = call {<4 x i24>, <4 x i1>} @llvm.ssub.with.overflow.v4i24(<4 x i24> %a0, <4 x i24> %a1)
  %val = extractvalue {<4 x i24>, <4 x i1>} %t, 0
  %obit = extractvalue {<4 x i24>, <4 x i1>} %t, 1
  %res = sext <4 x i1> %obit to <4 x i32>
  store <4 x i24> %val, ptr %p2
  ret <4 x i32> %res
}

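; For i1 lanes the difference is simply xor(a0, a1); the only overflowing
; case is 0 - (-1), so the overflow mask reduces to andn(a0, a1).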
define <4 x i32> @ssubo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
; SSE-LABEL: ssubo_v4i1:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    pxor %xmm1, %xmm2
; SSE-NEXT:    pslld $31, %xmm2
; SSE-NEXT:    movmskps %xmm2, %eax
; SSE-NEXT:    pandn %xmm1, %xmm0
; SSE-NEXT:    pslld $31, %xmm0
; SSE-NEXT:    psrad $31, %xmm0
; SSE-NEXT:    movb %al, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: ssubo_v4i1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vpslld $31, %xmm2, %xmm2
; AVX-NEXT:    vmovmskps %xmm2, %eax
; AVX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX-NEXT:    vpsrad $31, %xmm0, %xmm0
; AVX-NEXT:    movb %al, (%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: ssubo_v4i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vpslld $31, %xmm2, %xmm2
; AVX512-NEXT:    vptestmd %xmm2, %xmm2, %k0
; AVX512-NEXT:    vpandn %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX512-NEXT:    vpsrad $31, %xmm0, %xmm0
; AVX512-NEXT:    kmovd %k0, %eax
; AVX512-NEXT:    movb %al, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<4 x i1>, <4 x i1>} @llvm.ssub.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)
  %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0
  %obit = extractvalue {<4 x i1>, <4 x i1>} %t, 1
  %res = sext <4 x i1> %obit to <4 x i32>
  store <4 x i1> %val, ptr %p2
  ret <4 x i32> %res
}

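; i128 elements are scalarized to subq/sbbq pairs, with seto reading the
; signed overflow flag of each 128-bit subtraction.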
define <2 x i32> @ssubo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind {
; SSE2-LABEL: ssubo_v2i128:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; SSE2-NEXT:    subq %r8, %rdi
; SSE2-NEXT:    sbbq %r9, %rsi
; SSE2-NEXT:    seto %r8b
; SSE2-NEXT:    subq {{[0-9]+}}(%rsp), %rdx
; SSE2-NEXT:    sbbq {{[0-9]+}}(%rsp), %rcx
; SSE2-NEXT:    seto %r9b
; SSE2-NEXT:    movzbl %r9b, %r9d
; SSE2-NEXT:    negl %r9d
; SSE2-NEXT:    movd %r9d, %xmm1
; SSE2-NEXT:    movzbl %r8b, %r8d
; SSE2-NEXT:    negl %r8d
; SSE2-NEXT:    movd %r8d, %xmm0
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    movq %rdx, 16(%rax)
; SSE2-NEXT:    movq %rdi, (%rax)
; SSE2-NEXT:    movq %rcx, 24(%rax)
; SSE2-NEXT:    movq %rsi, 8(%rax)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: ssubo_v2i128:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; SSSE3-NEXT:    subq %r8, %rdi
; SSSE3-NEXT:    sbbq %r9, %rsi
; SSSE3-NEXT:    seto %r8b
; SSSE3-NEXT:    subq {{[0-9]+}}(%rsp), %rdx
; SSSE3-NEXT:    sbbq {{[0-9]+}}(%rsp), %rcx
; SSSE3-NEXT:    seto %r9b
; SSSE3-NEXT:    movzbl %r9b, %r9d
; SSSE3-NEXT:    negl %r9d
; SSSE3-NEXT:    movd %r9d, %xmm1
; SSSE3-NEXT:    movzbl %r8b, %r8d
; SSSE3-NEXT:    negl %r8d
; SSSE3-NEXT:    movd %r8d, %xmm0
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    movq %rdx, 16(%rax)
; SSSE3-NEXT:    movq %rdi, (%rax)
; SSSE3-NEXT:    movq %rcx, 24(%rax)
; SSSE3-NEXT:    movq %rsi, 8(%rax)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: ssubo_v2i128:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; SSE41-NEXT:    subq %r8, %rdi
; SSE41-NEXT:    sbbq %r9, %rsi
; SSE41-NEXT:    seto %r8b
; SSE41-NEXT:    subq {{[0-9]+}}(%rsp), %rdx
; SSE41-NEXT:    sbbq {{[0-9]+}}(%rsp), %rcx
; SSE41-NEXT:    seto %r9b
; SSE41-NEXT:    movzbl %r9b, %r9d
; SSE41-NEXT:    negl %r9d
; SSE41-NEXT:    movzbl %r8b, %r8d
; SSE41-NEXT:    negl %r8d
; SSE41-NEXT:    movd %r8d, %xmm0
; SSE41-NEXT:    pinsrd $1, %r9d, %xmm0
; SSE41-NEXT:    movq %rdx, 16(%rax)
; SSE41-NEXT:    movq %rdi, (%rax)
; SSE41-NEXT:    movq %rcx, 24(%rax)
; SSE41-NEXT:    movq %rsi, 8(%rax)
; SSE41-NEXT:    retq
;
; AVX-LABEL: ssubo_v2i128:
; AVX:       # %bb.0:
; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT:    subq %r8, %rdi
; AVX-NEXT:    sbbq %r9, %rsi
; AVX-NEXT:    seto %r8b
; AVX-NEXT:    subq {{[0-9]+}}(%rsp), %rdx
; AVX-NEXT:    sbbq {{[0-9]+}}(%rsp), %rcx
; AVX-NEXT:    seto %r9b
; AVX-NEXT:    movzbl %r9b, %r9d
; AVX-NEXT:    negl %r9d
; AVX-NEXT:    movzbl %r8b, %r8d
; AVX-NEXT:    negl %r8d
; AVX-NEXT:    vmovd %r8d, %xmm0
; AVX-NEXT:    vpinsrd $1, %r9d, %xmm0, %xmm0
; AVX-NEXT:    movq %rdx, 16(%rax)
; AVX-NEXT:    movq %rdi, (%rax)
; AVX-NEXT:    movq %rcx, 24(%rax)
; AVX-NEXT:    movq %rsi, 8(%rax)
; AVX-NEXT:    retq
;
; AVX512-LABEL: ssubo_v2i128:
; AVX512:       # %bb.0:
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    subq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT:    sbbq {{[0-9]+}}(%rsp), %rcx
; AVX512-NEXT:    seto %r10b
; AVX512-NEXT:    kmovd %r10d, %k0
; AVX512-NEXT:    subq %r8, %rdi
; AVX512-NEXT:    sbbq %r9, %rsi
; AVX512-NEXT:    seto %r8b
; AVX512-NEXT:    andl $1, %r8d
; AVX512-NEXT:    kmovw %r8d, %k1
; AVX512-NEXT:    kshiftlw $1, %k0, %k0
; AVX512-NEXT:    korw %k0, %k1, %k1
; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT:    movq %rdx, 16(%rax)
; AVX512-NEXT:    movq %rdi, (%rax)
; AVX512-NEXT:    movq %rcx, 24(%rax)
; AVX512-NEXT:    movq %rsi, 8(%rax)
; AVX512-NEXT:    retq
  %t = call {<2 x i128>, <2 x i1>} @llvm.ssub.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1)
  %val = extractvalue {<2 x i128>, <2 x i1>} %t, 0
  %obit = extractvalue {<2 x i128>, <2 x i1>} %t, 1
  %res = sext <2 x i1> %obit to <2 x i32>
  store <2 x i128> %val, ptr %p2
  ret <2 x i32> %res
}
