; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512
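; Each test below calls one of the llvm.usub.with.overflow.* intrinsics,
; stores the subtraction result through %p2, and returns the overflow bits
; sign-extended to i32 elements.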

declare {<1 x i32>, <1 x i1>} @llvm.usub.with.overflow.v1i32(<1 x i32>, <1 x i32>)
declare {<2 x i32>, <2 x i1>} @llvm.usub.with.overflow.v2i32(<2 x i32>, <2 x i32>)
declare {<3 x i32>, <3 x i1>} @llvm.usub.with.overflow.v3i32(<3 x i32>, <3 x i32>)
declare {<4 x i32>, <4 x i1>} @llvm.usub.with.overflow.v4i32(<4 x i32>, <4 x i32>)
declare {<6 x i32>, <6 x i1>} @llvm.usub.with.overflow.v6i32(<6 x i32>, <6 x i32>)
declare {<8 x i32>, <8 x i1>} @llvm.usub.with.overflow.v8i32(<8 x i32>, <8 x i32>)
declare {<16 x i32>, <16 x i1>} @llvm.usub.with.overflow.v16i32(<16 x i32>, <16 x i32>)

declare {<16 x i8>, <16 x i1>} @llvm.usub.with.overflow.v16i8(<16 x i8>, <16 x i8>)
declare {<8 x i16>, <8 x i1>} @llvm.usub.with.overflow.v8i16(<8 x i16>, <8 x i16>)
declare {<2 x i64>, <2 x i1>} @llvm.usub.with.overflow.v2i64(<2 x i64>, <2 x i64>)

declare {<4 x i24>, <4 x i1>} @llvm.usub.with.overflow.v4i24(<4 x i24>, <4 x i24>)
declare {<4 x i1>, <4 x i1>} @llvm.usub.with.overflow.v4i1(<4 x i1>, <4 x i1>)
declare {<2 x i128>, <2 x i1>} @llvm.usub.with.overflow.v2i128(<2 x i128>, <2 x i128>)

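; v1i32 is scalarized: subl sets the carry flag and sbbl %eax, %eax turns the
; borrow into an all-ones/zero mask.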
define <1 x i32> @usubo_v1i32(<1 x i32> %a0, <1 x i32> %a1, ptr %p2) nounwind {
; CHECK-LABEL: usubo_v1i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    subl %esi, %edi
; CHECK-NEXT:    sbbl %eax, %eax
; CHECK-NEXT:    movl %edi, (%rdx)
; CHECK-NEXT:    retq
  %t = call {<1 x i32>, <1 x i1>} @llvm.usub.with.overflow.v1i32(<1 x i32> %a0, <1 x i32> %a1)
  %val = extractvalue {<1 x i32>, <1 x i1>} %t, 0
  %obit = extractvalue {<1 x i32>, <1 x i1>} %t, 1
  %res = sext <1 x i1> %obit to <1 x i32>
  store <1 x i32> %val, ptr %p2
  ret <1 x i32> %res
}

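; Pre-SSE4.1 there is no unsigned vector compare, so both the input and the
; result are biased with 0x80000000 (pxor) and compared with signed pcmpgtd:
; unsigned overflow occurred iff (a - b) >u a. SSE4.1 checks the same thing
; via pminud(sub, a) == sub plus an invert, and AVX512 folds it into a
; vpcmpnleud mask.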
define <2 x i32> @usubo_v2i32(<2 x i32> %a0, <2 x i32> %a1, ptr %p2) nounwind {
; SSE2-LABEL: usubo_v2i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    pxor %xmm2, %xmm3
; SSE2-NEXT:    psubd %xmm1, %xmm0
; SSE2-NEXT:    pxor %xmm0, %xmm2
; SSE2-NEXT:    pcmpgtd %xmm3, %xmm2
; SSE2-NEXT:    movq %xmm0, (%rdi)
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: usubo_v2i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSSE3-NEXT:    movdqa %xmm0, %xmm3
; SSSE3-NEXT:    pxor %xmm2, %xmm3
; SSSE3-NEXT:    psubd %xmm1, %xmm0
; SSSE3-NEXT:    pxor %xmm0, %xmm2
; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm2
; SSSE3-NEXT:    movq %xmm0, (%rdi)
; SSSE3-NEXT:    movdqa %xmm2, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: usubo_v2i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psubd %xmm1, %xmm2
; SSE41-NEXT:    pminud %xmm2, %xmm0
; SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE41-NEXT:    pxor %xmm1, %xmm0
; SSE41-NEXT:    movq %xmm2, (%rdi)
; SSE41-NEXT:    retq
;
; AVX-LABEL: usubo_v2i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vpminud %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vpcmpeqd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vmovq %xmm1, (%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: usubo_v2i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpcmpnleud %xmm0, %xmm1, %k1
; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT:    vmovq %xmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<2 x i32>, <2 x i1>} @llvm.usub.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1)
  %val = extractvalue {<2 x i32>, <2 x i1>} %t, 0
  %obit = extractvalue {<2 x i32>, <2 x i1>} %t, 1
  %res = sext <2 x i1> %obit to <2 x i32>
  store <2 x i32> %val, ptr %p2
  ret <2 x i32> %res
}

define <3 x i32> @usubo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind {
; SSE2-LABEL: usubo_v3i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    pxor %xmm2, %xmm3
; SSE2-NEXT:    psubd %xmm1, %xmm0
; SSE2-NEXT:    pxor %xmm0, %xmm2
; SSE2-NEXT:    pcmpgtd %xmm3, %xmm2
; SSE2-NEXT:    movq %xmm0, (%rdi)
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT:    movd %xmm0, 8(%rdi)
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: usubo_v3i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSSE3-NEXT:    movdqa %xmm0, %xmm3
; SSSE3-NEXT:    pxor %xmm2, %xmm3
; SSSE3-NEXT:    psubd %xmm1, %xmm0
; SSSE3-NEXT:    pxor %xmm0, %xmm2
; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm2
; SSSE3-NEXT:    movq %xmm0, (%rdi)
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSSE3-NEXT:    movd %xmm0, 8(%rdi)
; SSSE3-NEXT:    movdqa %xmm2, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: usubo_v3i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psubd %xmm1, %xmm2
; SSE41-NEXT:    pminud %xmm2, %xmm0
; SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE41-NEXT:    pxor %xmm1, %xmm0
; SSE41-NEXT:    pextrd $2, %xmm2, 8(%rdi)
; SSE41-NEXT:    movq %xmm2, (%rdi)
; SSE41-NEXT:    retq
;
; AVX-LABEL: usubo_v3i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vpminud %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vpcmpeqd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpextrd $2, %xmm1, 8(%rdi)
; AVX-NEXT:    vmovq %xmm1, (%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: usubo_v3i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpcmpnleud %xmm0, %xmm1, %k1
; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT:    vpextrd $2, %xmm1, 8(%rdi)
; AVX512-NEXT:    vmovq %xmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<3 x i32>, <3 x i1>} @llvm.usub.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1)
  %val = extractvalue {<3 x i32>, <3 x i1>} %t, 0
  %obit = extractvalue {<3 x i32>, <3 x i1>} %t, 1
  %res = sext <3 x i1> %obit to <3 x i32>
  store <3 x i32> %val, ptr %p2
  ret <3 x i32> %res
}

define <4 x i32> @usubo_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %p2) nounwind {
; SSE2-LABEL: usubo_v4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    pxor %xmm2, %xmm3
; SSE2-NEXT:    psubd %xmm1, %xmm0
; SSE2-NEXT:    pxor %xmm0, %xmm2
; SSE2-NEXT:    pcmpgtd %xmm3, %xmm2
; SSE2-NEXT:    movdqa %xmm0, (%rdi)
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: usubo_v4i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSSE3-NEXT:    movdqa %xmm0, %xmm3
; SSSE3-NEXT:    pxor %xmm2, %xmm3
; SSSE3-NEXT:    psubd %xmm1, %xmm0
; SSSE3-NEXT:    pxor %xmm0, %xmm2
; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm2
; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
; SSSE3-NEXT:    movdqa %xmm2, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: usubo_v4i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psubd %xmm1, %xmm2
; SSE41-NEXT:    pminud %xmm2, %xmm0
; SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE41-NEXT:    pxor %xmm1, %xmm0
; SSE41-NEXT:    movdqa %xmm2, (%rdi)
; SSE41-NEXT:    retq
;
; AVX-LABEL: usubo_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vpminud %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vpcmpeqd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: usubo_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpcmpnleud %xmm0, %xmm1, %k1
; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<4 x i32>, <4 x i1>} @llvm.usub.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> %a1)
  %val = extractvalue {<4 x i32>, <4 x i1>} %t, 0
  %obit = extractvalue {<4 x i32>, <4 x i1>} %t, 1
  %res = sext <4 x i1> %obit to <4 x i32>
  store <4 x i32> %val, ptr %p2
  ret <4 x i32> %res
}

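; <6 x i32> is an illegal type: the SSE variants rebuild the operand vectors
; from GPR and stack arguments and return via sret, while the AVX variants
; compute a padded <8 x i32> and split the value store into 16-byte and
; 8-byte pieces.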
define <6 x i32> @usubo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
; SSE2-LABEL: usubo_v6i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movq %rdi, %rax
; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
; SSE2-NEXT:    movd %r8d, %xmm0
; SSE2-NEXT:    movd %ecx, %xmm1
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    movd %edx, %xmm2
; SSE2-NEXT:    movd %esi, %xmm0
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT:    movd %r9d, %xmm1
; SSE2-NEXT:    movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    psubd %xmm3, %xmm4
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT:    movdqa %xmm4, (%rcx)
; SSE2-NEXT:    pxor %xmm3, %xmm4
; SSE2-NEXT:    pxor %xmm3, %xmm0
; SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psubd %xmm2, %xmm0
; SSE2-NEXT:    movq %xmm0, 16(%rcx)
; SSE2-NEXT:    pxor %xmm3, %xmm0
; SSE2-NEXT:    pxor %xmm3, %xmm1
; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
; SSE2-NEXT:    movq %xmm0, 16(%rdi)
; SSE2-NEXT:    movdqa %xmm4, (%rdi)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: usubo_v6i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movq %rdi, %rax
; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
; SSSE3-NEXT:    movd %r8d, %xmm0
; SSSE3-NEXT:    movd %ecx, %xmm1
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSSE3-NEXT:    movd %edx, %xmm2
; SSSE3-NEXT:    movd %esi, %xmm0
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSSE3-NEXT:    movd %r9d, %xmm1
; SSSE3-NEXT:    movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
; SSSE3-NEXT:    movdqa %xmm0, %xmm4
; SSSE3-NEXT:    psubd %xmm3, %xmm4
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
; SSSE3-NEXT:    movdqa %xmm4, (%rcx)
; SSSE3-NEXT:    pxor %xmm3, %xmm4
; SSSE3-NEXT:    pxor %xmm3, %xmm0
; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm4
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    psubd %xmm2, %xmm0
; SSSE3-NEXT:    movq %xmm0, 16(%rcx)
; SSSE3-NEXT:    pxor %xmm3, %xmm0
; SSSE3-NEXT:    pxor %xmm3, %xmm1
; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
; SSSE3-NEXT:    movq %xmm0, 16(%rdi)
; SSSE3-NEXT:    movdqa %xmm4, (%rdi)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: usubo_v6i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movq %rdi, %rax
; SSE41-NEXT:    movd %esi, %xmm0
; SSE41-NEXT:    pinsrd $1, %edx, %xmm0
; SSE41-NEXT:    pinsrd $2, %ecx, %xmm0
; SSE41-NEXT:    pinsrd $3, %r8d, %xmm0
; SSE41-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm1
; SSE41-NEXT:    movd %r9d, %xmm2
; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm2
; SSE41-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm3
; SSE41-NEXT:    pinsrd $2, {{[0-9]+}}(%rsp), %xmm3
; SSE41-NEXT:    pinsrd $3, {{[0-9]+}}(%rsp), %xmm3
; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
; SSE41-NEXT:    movdqa %xmm0, %xmm4
; SSE41-NEXT:    psubd %xmm3, %xmm4
; SSE41-NEXT:    pminud %xmm4, %xmm0
; SSE41-NEXT:    pcmpeqd %xmm4, %xmm0
; SSE41-NEXT:    pcmpeqd %xmm3, %xmm3
; SSE41-NEXT:    pxor %xmm3, %xmm0
; SSE41-NEXT:    movdqa %xmm2, %xmm5
; SSE41-NEXT:    psubd %xmm1, %xmm5
; SSE41-NEXT:    pminud %xmm5, %xmm2
; SSE41-NEXT:    pcmpeqd %xmm5, %xmm2
; SSE41-NEXT:    pxor %xmm3, %xmm2
; SSE41-NEXT:    movq %xmm5, 16(%rcx)
; SSE41-NEXT:    movdqa %xmm4, (%rcx)
; SSE41-NEXT:    movq %xmm2, 16(%rdi)
; SSE41-NEXT:    movdqa %xmm0, (%rdi)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: usubo_v6i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpsubd %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpminud %xmm3, %xmm2, %xmm3
; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm3
; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpminud %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT:    vmovq %xmm2, 16(%rdi)
; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: usubo_v6i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vpminud %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vmovq %xmm2, 16(%rdi)
; AVX2-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: usubo_v6i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsubd %ymm1, %ymm0, %ymm1
; AVX512-NEXT:    vpcmpnleud %ymm0, %ymm1, %k1
; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512-NEXT:    vmovq %xmm2, 16(%rdi)
; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<6 x i32>, <6 x i1>} @llvm.usub.with.overflow.v6i32(<6 x i32> %a0, <6 x i32> %a1)
  %val = extractvalue {<6 x i32>, <6 x i1>} %t, 0
  %obit = extractvalue {<6 x i32>, <6 x i1>} %t, 1
  %res = sext <6 x i1> %obit to <6 x i32>
  store <6 x i32> %val, ptr %p2
  ret <6 x i32> %res
}

define <8 x i32> @usubo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind {
; SSE2-LABEL: usubo_v8i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT:    movdqa %xmm0, %xmm5
; SSE2-NEXT:    pxor %xmm4, %xmm5
; SSE2-NEXT:    psubd %xmm2, %xmm0
; SSE2-NEXT:    movdqa %xmm0, (%rdi)
; SSE2-NEXT:    pxor %xmm4, %xmm0
; SSE2-NEXT:    pcmpgtd %xmm5, %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    pxor %xmm4, %xmm2
; SSE2-NEXT:    psubd %xmm3, %xmm1
; SSE2-NEXT:    pxor %xmm1, %xmm4
; SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
; SSE2-NEXT:    movdqa %xmm1, 16(%rdi)
; SSE2-NEXT:    movdqa %xmm4, %xmm1
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: usubo_v8i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
; SSSE3-NEXT:    movdqa %xmm0, %xmm5
; SSSE3-NEXT:    pxor %xmm4, %xmm5
; SSSE3-NEXT:    psubd %xmm2, %xmm0
; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
; SSSE3-NEXT:    pxor %xmm4, %xmm0
; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm0
; SSSE3-NEXT:    movdqa %xmm1, %xmm2
; SSSE3-NEXT:    pxor %xmm4, %xmm2
; SSSE3-NEXT:    psubd %xmm3, %xmm1
; SSSE3-NEXT:    pxor %xmm1, %xmm4
; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm4
; SSSE3-NEXT:    movdqa %xmm1, 16(%rdi)
; SSSE3-NEXT:    movdqa %xmm4, %xmm1
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: usubo_v8i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm4
; SSE41-NEXT:    psubd %xmm2, %xmm4
; SSE41-NEXT:    pminud %xmm4, %xmm0
; SSE41-NEXT:    pcmpeqd %xmm4, %xmm0
; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2
; SSE41-NEXT:    pxor %xmm2, %xmm0
; SSE41-NEXT:    movdqa %xmm1, %xmm5
; SSE41-NEXT:    psubd %xmm3, %xmm5
; SSE41-NEXT:    pminud %xmm5, %xmm1
; SSE41-NEXT:    pcmpeqd %xmm5, %xmm1
; SSE41-NEXT:    pxor %xmm2, %xmm1
; SSE41-NEXT:    movdqa %xmm5, 16(%rdi)
; SSE41-NEXT:    movdqa %xmm4, (%rdi)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: usubo_v8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpsubd %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpminud %xmm3, %xmm2, %xmm3
; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm3
; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpminud %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT:    vmovdqa %xmm2, 16(%rdi)
; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: usubo_v8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vpminud %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa %ymm1, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: usubo_v8i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsubd %ymm1, %ymm0, %ymm1
; AVX512-NEXT:    vpcmpnleud %ymm0, %ymm1, %k1
; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512-NEXT:    vmovdqa %ymm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<8 x i32>, <8 x i1>} @llvm.usub.with.overflow.v8i32(<8 x i32> %a0, <8 x i32> %a1)
  %val = extractvalue {<8 x i32>, <8 x i1>} %t, 0
  %obit = extractvalue {<8 x i32>, <8 x i1>} %t, 1
  %res = sext <8 x i1> %obit to <8 x i32>
  store <8 x i32> %val, ptr %p2
  ret <8 x i32> %res
}

define <16 x i32> @usubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwind {
; SSE2-LABEL: usubo_v16i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT:    movdqa %xmm0, %xmm9
; SSE2-NEXT:    pxor %xmm8, %xmm9
; SSE2-NEXT:    psubd %xmm4, %xmm0
; SSE2-NEXT:    movdqa %xmm0, (%rdi)
; SSE2-NEXT:    pxor %xmm8, %xmm0
; SSE2-NEXT:    pcmpgtd %xmm9, %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm4
; SSE2-NEXT:    pxor %xmm8, %xmm4
; SSE2-NEXT:    psubd %xmm5, %xmm1
; SSE2-NEXT:    movdqa %xmm1, 16(%rdi)
; SSE2-NEXT:    pxor %xmm8, %xmm1
; SSE2-NEXT:    pcmpgtd %xmm4, %xmm1
; SSE2-NEXT:    movdqa %xmm2, %xmm4
; SSE2-NEXT:    pxor %xmm8, %xmm4
; SSE2-NEXT:    psubd %xmm6, %xmm2
; SSE2-NEXT:    movdqa %xmm2, 32(%rdi)
; SSE2-NEXT:    pxor %xmm8, %xmm2
; SSE2-NEXT:    pcmpgtd %xmm4, %xmm2
; SSE2-NEXT:    movdqa %xmm3, %xmm4
; SSE2-NEXT:    pxor %xmm8, %xmm4
; SSE2-NEXT:    psubd %xmm7, %xmm3
; SSE2-NEXT:    pxor %xmm3, %xmm8
; SSE2-NEXT:    pcmpgtd %xmm4, %xmm8
; SSE2-NEXT:    movdqa %xmm3, 48(%rdi)
; SSE2-NEXT:    movdqa %xmm8, %xmm3
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: usubo_v16i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648]
; SSSE3-NEXT:    movdqa %xmm0, %xmm9
; SSSE3-NEXT:    pxor %xmm8, %xmm9
; SSSE3-NEXT:    psubd %xmm4, %xmm0
; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
; SSSE3-NEXT:    pxor %xmm8, %xmm0
; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm0
; SSSE3-NEXT:    movdqa %xmm1, %xmm4
; SSSE3-NEXT:    pxor %xmm8, %xmm4
; SSSE3-NEXT:    psubd %xmm5, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, 16(%rdi)
; SSSE3-NEXT:    pxor %xmm8, %xmm1
; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm1
; SSSE3-NEXT:    movdqa %xmm2, %xmm4
; SSSE3-NEXT:    pxor %xmm8, %xmm4
; SSSE3-NEXT:    psubd %xmm6, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, 32(%rdi)
; SSSE3-NEXT:    pxor %xmm8, %xmm2
; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm2
; SSSE3-NEXT:    movdqa %xmm3, %xmm4
; SSSE3-NEXT:    pxor %xmm8, %xmm4
; SSSE3-NEXT:    psubd %xmm7, %xmm3
; SSSE3-NEXT:    pxor %xmm3, %xmm8
; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm8
; SSSE3-NEXT:    movdqa %xmm3, 48(%rdi)
; SSSE3-NEXT:    movdqa %xmm8, %xmm3
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: usubo_v16i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm8
; SSE41-NEXT:    psubd %xmm4, %xmm8
; SSE41-NEXT:    pminud %xmm8, %xmm0
; SSE41-NEXT:    pcmpeqd %xmm8, %xmm0
; SSE41-NEXT:    pcmpeqd %xmm4, %xmm4
; SSE41-NEXT:    pxor %xmm4, %xmm0
; SSE41-NEXT:    movdqa %xmm1, %xmm9
; SSE41-NEXT:    psubd %xmm5, %xmm9
; SSE41-NEXT:    pminud %xmm9, %xmm1
; SSE41-NEXT:    pcmpeqd %xmm9, %xmm1
; SSE41-NEXT:    pxor %xmm4, %xmm1
; SSE41-NEXT:    movdqa %xmm2, %xmm5
; SSE41-NEXT:    psubd %xmm6, %xmm5
; SSE41-NEXT:    pminud %xmm5, %xmm2
; SSE41-NEXT:    pcmpeqd %xmm5, %xmm2
; SSE41-NEXT:    pxor %xmm4, %xmm2
; SSE41-NEXT:    movdqa %xmm3, %xmm6
; SSE41-NEXT:    psubd %xmm7, %xmm6
; SSE41-NEXT:    pminud %xmm6, %xmm3
; SSE41-NEXT:    pcmpeqd %xmm6, %xmm3
; SSE41-NEXT:    pxor %xmm4, %xmm3
; SSE41-NEXT:    movdqa %xmm6, 48(%rdi)
; SSE41-NEXT:    movdqa %xmm5, 32(%rdi)
; SSE41-NEXT:    movdqa %xmm9, 16(%rdi)
; SSE41-NEXT:    movdqa %xmm8, (%rdi)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: usubo_v16i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT:    vpsubd %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpminud %xmm5, %xmm4, %xmm5
; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm4, %xmm5
; AVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm3
; AVX1-NEXT:    vpminud %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpackssdw %xmm5, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
; AVX1-NEXT:    vpsubd %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpminud %xmm6, %xmm5, %xmm6
; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm5, %xmm6
; AVX1-NEXT:    vpsubd %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpminud %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vpackssdw %xmm6, %xmm0, %xmm0
; AVX1-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm6, %xmm6
; AVX1-NEXT:    vpxor %xmm6, %xmm0, %xmm0
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm7
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm7, %ymm0
; AVX1-NEXT:    vpacksswb %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpxor %xmm6, %xmm1, %xmm1
; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm6
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm6, %ymm1
; AVX1-NEXT:    vmovdqa %xmm4, 48(%rdi)
; AVX1-NEXT:    vmovdqa %xmm3, 32(%rdi)
; AVX1-NEXT:    vmovdqa %xmm5, 16(%rdi)
; AVX1-NEXT:    vmovdqa %xmm2, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: usubo_v16i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsubd %ymm3, %ymm1, %ymm3
; AVX2-NEXT:    vpminud %ymm1, %ymm3, %ymm1
; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm3, %ymm1
; AVX2-NEXT:    vpcmpeqd %ymm4, %ymm4, %ymm4
; AVX2-NEXT:    vpxor %ymm4, %ymm1, %ymm1
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm5
; AVX2-NEXT:    vpackssdw %xmm5, %xmm1, %xmm1
; AVX2-NEXT:    vpsubd %ymm2, %ymm0, %ymm2
; AVX2-NEXT:    vpminud %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vpacksswb %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpmovsxbd %xmm1, %ymm1
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12,16,16,16,16,20,20,20,20,24,24,24,24,28,28,28,28]
; AVX2-NEXT:    vpxor %ymm4, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa %ymm3, 32(%rdi)
; AVX2-NEXT:    vmovdqa %ymm2, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: usubo_v16i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsubd %zmm1, %zmm0, %zmm1
; AVX512-NEXT:    vpcmpnleud %zmm0, %zmm1, %k1
; AVX512-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512-NEXT:    vmovdqa64 %zmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<16 x i32>, <16 x i1>} @llvm.usub.with.overflow.v16i32(<16 x i32> %a0, <16 x i32> %a1)
  %val = extractvalue {<16 x i32>, <16 x i1>} %t, 0
  %obit = extractvalue {<16 x i32>, <16 x i1>} %t, 1
  %res = sext <16 x i1> %obit to <16 x i32>
  store <16 x i32> %val, ptr %p2
  ret <16 x i32> %res
}

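; Most of the v16i8 code is not the overflow check itself (pminub/pcmpeqb)
; but the sign-extension of the <16 x i1> mask to <16 x i32>, done with
; unpack/shift sequences on SSE2 and pmovsxbd on SSE4.1/AVX.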
define <16 x i32> @usubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind {
; SSE2-LABEL: usubo_v16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    psubb %xmm1, %xmm4
; SSE2-NEXT:    pminub %xmm4, %xmm0
; SSE2-NEXT:    pcmpeqb %xmm4, %xmm0
; SSE2-NEXT:    pcmpeqd %xmm3, %xmm3
; SSE2-NEXT:    pxor %xmm0, %xmm3
; SSE2-NEXT:    movdqa %xmm3, %xmm0
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    movdqa %xmm3, %xmm1
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pslld $31, %xmm1
; SSE2-NEXT:    psrad $31, %xmm1
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    movdqa %xmm3, %xmm2
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    pslld $31, %xmm2
; SSE2-NEXT:    psrad $31, %xmm2
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pslld $31, %xmm3
; SSE2-NEXT:    psrad $31, %xmm3
; SSE2-NEXT:    movdqa %xmm4, (%rdi)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: usubo_v16i8:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa %xmm0, %xmm4
; SSSE3-NEXT:    psubb %xmm1, %xmm4
; SSSE3-NEXT:    pminub %xmm4, %xmm0
; SSSE3-NEXT:    pcmpeqb %xmm4, %xmm0
; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm3
; SSSE3-NEXT:    pxor %xmm0, %xmm3
; SSSE3-NEXT:    movdqa %xmm3, %xmm0
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSSE3-NEXT:    movdqa %xmm3, %xmm1
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    pslld $31, %xmm1
; SSSE3-NEXT:    psrad $31, %xmm1
; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSSE3-NEXT:    movdqa %xmm3, %xmm2
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; SSSE3-NEXT:    pslld $31, %xmm2
; SSSE3-NEXT:    psrad $31, %xmm2
; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    pslld $31, %xmm3
; SSSE3-NEXT:    psrad $31, %xmm3
; SSSE3-NEXT:    movdqa %xmm4, (%rdi)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: usubo_v16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm4
; SSE41-NEXT:    psubb %xmm1, %xmm4
; SSE41-NEXT:    pminub %xmm4, %xmm0
; SSE41-NEXT:    pcmpeqb %xmm4, %xmm0
; SSE41-NEXT:    pcmpeqd %xmm3, %xmm3
; SSE41-NEXT:    pxor %xmm0, %xmm3
; SSE41-NEXT:    pmovsxbd %xmm3, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE41-NEXT:    pslld $31, %xmm1
; SSE41-NEXT:    psrad $31, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; SSE41-NEXT:    pslld $31, %xmm2
; SSE41-NEXT:    psrad $31, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; SSE41-NEXT:    pslld $31, %xmm3
; SSE41-NEXT:    psrad $31, %xmm3
; SSE41-NEXT:    movdqa %xmm4, (%rdi)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: usubo_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vpminub %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vpcmpeqb %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[1,1,1,1]
; AVX1-NEXT:    vpmovsxbd %xmm3, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; AVX1-NEXT:    vpmovsxbd %xmm3, %xmm3
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT:    vmovdqa %xmm2, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: usubo_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vpminub %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    vpcmpeqb %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpmovsxbd %xmm1, %ymm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX2-NEXT:    vpmovsxbd %xmm1, %ymm1
; AVX2-NEXT:    vmovdqa %xmm2, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: usubo_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsubb %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpcmpnleub %xmm0, %xmm1, %k1
; AVX512-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<16 x i8>, <16 x i1>} @llvm.usub.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1)
  %val = extractvalue {<16 x i8>, <16 x i1>} %t, 0
  %obit = extractvalue {<16 x i8>, <16 x i1>} %t, 1
  %res = sext <16 x i1> %obit to <16 x i32>
  store <16 x i8> %val, ptr %p2
  ret <16 x i32> %res
}

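; i16 follows the i32 pattern: biased pcmpgtw on SSE2, pminuw/pcmpeqw on
; SSE4.1, vpcmpnleuw on AVX512, followed by widening the mask to <8 x i32>.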
define <8 x i32> @usubo_v8i16(<8 x i16> %a0, <8 x i16> %a1, ptr %p2) nounwind {
; SSE2-LABEL: usubo_v8i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    pxor %xmm2, %xmm3
; SSE2-NEXT:    psubw %xmm1, %xmm0
; SSE2-NEXT:    pxor %xmm0, %xmm2
; SSE2-NEXT:    pcmpgtw %xmm3, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm1
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pslld $31, %xmm2
; SSE2-NEXT:    psrad $31, %xmm2
; SSE2-NEXT:    movdqa %xmm0, (%rdi)
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm2, %xmm1
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: usubo_v8i16:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; SSSE3-NEXT:    movdqa %xmm0, %xmm3
; SSSE3-NEXT:    pxor %xmm2, %xmm3
; SSSE3-NEXT:    psubw %xmm1, %xmm0
; SSSE3-NEXT:    pxor %xmm0, %xmm2
; SSSE3-NEXT:    pcmpgtw %xmm3, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    pslld $31, %xmm2
; SSSE3-NEXT:    psrad $31, %xmm2
; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: usubo_v8i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psubw %xmm1, %xmm2
; SSE41-NEXT:    pminuw %xmm2, %xmm0
; SSE41-NEXT:    pcmpeqw %xmm2, %xmm0
; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE41-NEXT:    pxor %xmm0, %xmm1
; SSE41-NEXT:    pmovsxwd %xmm1, %xmm0
; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE41-NEXT:    pslld $31, %xmm1
; SSE41-NEXT:    psrad $31, %xmm1
; SSE41-NEXT:    movdqa %xmm2, (%rdi)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: usubo_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpminuw %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpcmpeqw %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm2
; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: usubo_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsubw %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpminuw %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vpcmpeqw %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: usubo_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsubw %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpcmpnleuw %xmm0, %xmm1, %k1
; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<8 x i16>, <8 x i1>} @llvm.usub.with.overflow.v8i16(<8 x i16> %a0, <8 x i16> %a1)
  %val = extractvalue {<8 x i16>, <8 x i1>} %t, 0
  %obit = extractvalue {<8 x i16>, <8 x i1>} %t, 1
  %res = sext <8 x i1> %obit to <8 x i32>
  store <8 x i16> %val, ptr %p2
  ret <8 x i32> %res
}

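; SSE2 has no 64-bit compare, so the biased i64 halves are compared with
; 32-bit pcmpeqd/pcmpgtd and the dword results recombined; AVX uses vpcmpgtq
; on biased values and AVX512 a native vpcmpnleuq.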
define <2 x i32> @usubo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind {
; SSE-LABEL: usubo_v2i64:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    pxor %xmm2, %xmm3
; SSE-NEXT:    psubq %xmm1, %xmm0
; SSE-NEXT:    pxor %xmm0, %xmm2
; SSE-NEXT:    movdqa %xmm2, %xmm1
; SSE-NEXT:    pcmpeqd %xmm3, %xmm1
; SSE-NEXT:    pcmpgtd %xmm3, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,3,3,3]
; SSE-NEXT:    pand %xmm3, %xmm4
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,3,3]
; SSE-NEXT:    por %xmm4, %xmm1
; SSE-NEXT:    movdqa %xmm0, (%rdi)
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: usubo_v2i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT:    # xmm2 = mem[0,0]
; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm3
; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm0
; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: usubo_v2i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm3
; AVX2-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm0
; AVX2-NEXT:    vpcmpgtq %xmm3, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: usubo_v2i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpcmpnleuq %xmm0, %xmm1, %k1
; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<2 x i64>, <2 x i1>} @llvm.usub.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1)
  %val = extractvalue {<2 x i64>, <2 x i1>} %t, 0
  %obit = extractvalue {<2 x i64>, <2 x i1>} %t, 1
  %res = sext <2 x i1> %obit to <2 x i32>
  store <2 x i64> %val, ptr %p2
  ret <2 x i32> %res
}

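; i24 elements are promoted to i32 with both operands masked to 24 bits;
; overflow is detected by re-masking the result and comparing, and the value
; is stored as four 3-byte chunks (a word store plus a byte store each).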
define <4 x i32> @usubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind {
; SSE2-LABEL: usubo_v4i24:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
; SSE2-NEXT:    pand %xmm3, %xmm1
; SSE2-NEXT:    pand %xmm3, %xmm2
; SSE2-NEXT:    psubd %xmm1, %xmm2
; SSE2-NEXT:    pand %xmm2, %xmm3
; SSE2-NEXT:    pcmpeqd %xmm2, %xmm3
; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE2-NEXT:    pxor %xmm3, %xmm0
; SSE2-NEXT:    movd %xmm2, %eax
; SSE2-NEXT:    movw %ax, (%rdi)
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3]
; SSE2-NEXT:    movd %xmm1, %ecx
; SSE2-NEXT:    movw %cx, 9(%rdi)
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; SSE2-NEXT:    movd %xmm1, %edx
; SSE2-NEXT:    movw %dx, 6(%rdi)
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
; SSE2-NEXT:    movd %xmm1, %esi
; SSE2-NEXT:    movw %si, 3(%rdi)
; SSE2-NEXT:    shrl $16, %eax
; SSE2-NEXT:    movb %al, 2(%rdi)
; SSE2-NEXT:    shrl $16, %ecx
; SSE2-NEXT:    movb %cl, 11(%rdi)
; SSE2-NEXT:    shrl $16, %edx
; SSE2-NEXT:    movb %dl, 8(%rdi)
; SSE2-NEXT:    shrl $16, %esi
; SSE2-NEXT:    movb %sil, 5(%rdi)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: usubo_v4i24:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
; SSSE3-NEXT:    pand %xmm3, %xmm1
; SSSE3-NEXT:    pand %xmm3, %xmm2
; SSSE3-NEXT:    psubd %xmm1, %xmm2
; SSSE3-NEXT:    pand %xmm2, %xmm3
; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm3
; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm0
; SSSE3-NEXT:    pxor %xmm3, %xmm0
; SSSE3-NEXT:    movd %xmm2, %eax
; SSSE3-NEXT:    movw %ax, (%rdi)
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3]
; SSSE3-NEXT:    movd %xmm1, %ecx
; SSSE3-NEXT:    movw %cx, 9(%rdi)
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; SSSE3-NEXT:    movd %xmm1, %edx
; SSSE3-NEXT:    movw %dx, 6(%rdi)
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
; SSSE3-NEXT:    movd %xmm1, %esi
; SSSE3-NEXT:    movw %si, 3(%rdi)
; SSSE3-NEXT:    shrl $16, %eax
; SSSE3-NEXT:    movb %al, 2(%rdi)
; SSSE3-NEXT:    shrl $16, %ecx
; SSSE3-NEXT:    movb %cl, 11(%rdi)
; SSSE3-NEXT:    shrl $16, %edx
; SSSE3-NEXT:    movb %dl, 8(%rdi)
; SSSE3-NEXT:    shrl $16, %esi
; SSSE3-NEXT:    movb %sil, 5(%rdi)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: usubo_v4i24:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
; SSE41-NEXT:    pand %xmm2, %xmm1
; SSE41-NEXT:    pand %xmm2, %xmm0
; SSE41-NEXT:    psubd %xmm1, %xmm0
; SSE41-NEXT:    pand %xmm0, %xmm2
; SSE41-NEXT:    pcmpeqd %xmm0, %xmm2
; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE41-NEXT:    pxor %xmm2, %xmm1
; SSE41-NEXT:    pextrd $3, %xmm0, %eax
; SSE41-NEXT:    movw %ax, 9(%rdi)
; SSE41-NEXT:    pextrd $2, %xmm0, %ecx
; SSE41-NEXT:    movw %cx, 6(%rdi)
; SSE41-NEXT:    pextrd $1, %xmm0, %edx
; SSE41-NEXT:    movw %dx, 3(%rdi)
; SSE41-NEXT:    movd %xmm0, %esi
; SSE41-NEXT:    movw %si, (%rdi)
; SSE41-NEXT:    shrl $16, %eax
; SSE41-NEXT:    movb %al, 11(%rdi)
; SSE41-NEXT:    shrl $16, %ecx
; SSE41-NEXT:    movb %cl, 8(%rdi)
; SSE41-NEXT:    shrl $16, %edx
; SSE41-NEXT:    movb %dl, 5(%rdi)
; SSE41-NEXT:    shrl $16, %esi
; SSE41-NEXT:    movb %sil, 2(%rdi)
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: usubo_v4i24:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
; AVX1-NEXT:    vandps %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vandps %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm0
; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpextrd $3, %xmm1, %eax
; AVX1-NEXT:    movw %ax, 9(%rdi)
; AVX1-NEXT:    vpextrd $2, %xmm1, %ecx
; AVX1-NEXT:    movw %cx, 6(%rdi)
; AVX1-NEXT:    vpextrd $1, %xmm1, %edx
; AVX1-NEXT:    movw %dx, 3(%rdi)
; AVX1-NEXT:    vmovd %xmm1, %esi
; AVX1-NEXT:    movw %si, (%rdi)
; AVX1-NEXT:    shrl $16, %eax
; AVX1-NEXT:    movb %al, 11(%rdi)
; AVX1-NEXT:    shrl $16, %ecx
; AVX1-NEXT:    movb %cl, 8(%rdi)
; AVX1-NEXT:    shrl $16, %edx
; AVX1-NEXT:    movb %dl, 5(%rdi)
; AVX1-NEXT:    shrl $16, %esi
; AVX1-NEXT:    movb %sil, 2(%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: usubo_v4i24:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm0
; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpextrd $3, %xmm1, %eax
; AVX2-NEXT:    movw %ax, 9(%rdi)
; AVX2-NEXT:    vpextrd $2, %xmm1, %ecx
; AVX2-NEXT:    movw %cx, 6(%rdi)
; AVX2-NEXT:    vpextrd $1, %xmm1, %edx
; AVX2-NEXT:    movw %dx, 3(%rdi)
; AVX2-NEXT:    vmovd %xmm1, %esi
; AVX2-NEXT:    movw %si, (%rdi)
; AVX2-NEXT:    shrl $16, %eax
; AVX2-NEXT:    movb %al, 11(%rdi)
; AVX2-NEXT:    shrl $16, %ecx
; AVX2-NEXT:    movb %cl, 8(%rdi)
; AVX2-NEXT:    shrl $16, %edx
; AVX2-NEXT:    movb %dl, 5(%rdi)
; AVX2-NEXT:    shrl $16, %esi
; AVX2-NEXT:    movb %sil, 2(%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: usubo_v4i24:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm0
; AVX512-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
; AVX512-NEXT:    vpextrd $3, %xmm1, %eax
; AVX512-NEXT:    movw %ax, 9(%rdi)
; AVX512-NEXT:    vpextrd $2, %xmm1, %ecx
; AVX512-NEXT:    movw %cx, 6(%rdi)
; AVX512-NEXT:    vpextrd $1, %xmm1, %edx
; AVX512-NEXT:    movw %dx, 3(%rdi)
; AVX512-NEXT:    vmovd %xmm1, %esi
; AVX512-NEXT:    movw %si, (%rdi)
; AVX512-NEXT:    shrl $16, %eax
; AVX512-NEXT:    movb %al, 11(%rdi)
; AVX512-NEXT:    shrl $16, %ecx
; AVX512-NEXT:    movb %cl, 8(%rdi)
; AVX512-NEXT:    shrl $16, %edx
; AVX512-NEXT:    movb %dl, 5(%rdi)
; AVX512-NEXT:    shrl $16, %esi
; AVX512-NEXT:    movb %sil, 2(%rdi)
; AVX512-NEXT:    retq
  %t = call {<4 x i24>, <4 x i1>} @llvm.usub.with.overflow.v4i24(<4 x i24> %a0, <4 x i24> %a1)
  %val = extractvalue {<4 x i24>, <4 x i1>} %t, 0
  %obit = extractvalue {<4 x i24>, <4 x i1>} %t, 1
  %res = sext <4 x i1> %obit to <4 x i32>
  store <4 x i24> %val, ptr %p2
  ret <4 x i32> %res
}

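; For i1 elements the subtraction itself is just xor, stored as a 4-bit
; immediate via movmskps (vptestmd on AVX512); the overflow condition a <u b
; reduces to ~a & b, computed with pandn.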
define <4 x i32> @usubo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
; SSE-LABEL: usubo_v4i1:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    pxor %xmm1, %xmm2
; SSE-NEXT:    pslld $31, %xmm2
; SSE-NEXT:    movmskps %xmm2, %eax
; SSE-NEXT:    pandn %xmm1, %xmm0
; SSE-NEXT:    pslld $31, %xmm0
; SSE-NEXT:    psrad $31, %xmm0
; SSE-NEXT:    movb %al, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: usubo_v4i1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vpslld $31, %xmm2, %xmm2
; AVX-NEXT:    vmovmskps %xmm2, %eax
; AVX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX-NEXT:    vpsrad $31, %xmm0, %xmm0
; AVX-NEXT:    movb %al, (%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: usubo_v4i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vpslld $31, %xmm2, %xmm2
; AVX512-NEXT:    vptestmd %xmm2, %xmm2, %k0
; AVX512-NEXT:    vpandn %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX512-NEXT:    vpsrad $31, %xmm0, %xmm0
; AVX512-NEXT:    kmovd %k0, %eax
; AVX512-NEXT:    movb %al, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<4 x i1>, <4 x i1>} @llvm.usub.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)
  %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0
  %obit = extractvalue {<4 x i1>, <4 x i1>} %t, 1
  %res = sext <4 x i1> %obit to <4 x i32>
  store <4 x i1> %val, ptr %p2
  ret <4 x i32> %res
}

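; i128 elements are scalarized through GPR pairs: sub/sbb chains produce each
; lane and a trailing sbb (setb into a mask register on AVX512) captures the
; borrow.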
define <2 x i32> @usubo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind {
; SSE2-LABEL: usubo_v2i128:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; SSE2-NEXT:    xorl %r10d, %r10d
; SSE2-NEXT:    subq {{[0-9]+}}(%rsp), %rdx
; SSE2-NEXT:    sbbq {{[0-9]+}}(%rsp), %rcx
; SSE2-NEXT:    movl $0, %r11d
; SSE2-NEXT:    sbbl %r11d, %r11d
; SSE2-NEXT:    subq %r8, %rdi
; SSE2-NEXT:    sbbq %r9, %rsi
; SSE2-NEXT:    movd %r11d, %xmm1
; SSE2-NEXT:    sbbl %r10d, %r10d
; SSE2-NEXT:    movd %r10d, %xmm0
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    movq %rdx, 16(%rax)
; SSE2-NEXT:    movq %rdi, (%rax)
; SSE2-NEXT:    movq %rcx, 24(%rax)
; SSE2-NEXT:    movq %rsi, 8(%rax)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: usubo_v2i128:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; SSSE3-NEXT:    xorl %r10d, %r10d
; SSSE3-NEXT:    subq {{[0-9]+}}(%rsp), %rdx
; SSSE3-NEXT:    sbbq {{[0-9]+}}(%rsp), %rcx
; SSSE3-NEXT:    movl $0, %r11d
; SSSE3-NEXT:    sbbl %r11d, %r11d
; SSSE3-NEXT:    subq %r8, %rdi
; SSSE3-NEXT:    sbbq %r9, %rsi
; SSSE3-NEXT:    movd %r11d, %xmm1
; SSSE3-NEXT:    sbbl %r10d, %r10d
; SSSE3-NEXT:    movd %r10d, %xmm0
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    movq %rdx, 16(%rax)
; SSSE3-NEXT:    movq %rdi, (%rax)
; SSSE3-NEXT:    movq %rcx, 24(%rax)
; SSSE3-NEXT:    movq %rsi, 8(%rax)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: usubo_v2i128:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; SSE41-NEXT:    xorl %r10d, %r10d
; SSE41-NEXT:    subq {{[0-9]+}}(%rsp), %rdx
; SSE41-NEXT:    sbbq {{[0-9]+}}(%rsp), %rcx
; SSE41-NEXT:    movl $0, %r11d
; SSE41-NEXT:    sbbl %r11d, %r11d
; SSE41-NEXT:    subq %r8, %rdi
; SSE41-NEXT:    sbbq %r9, %rsi
; SSE41-NEXT:    sbbl %r10d, %r10d
; SSE41-NEXT:    movd %r10d, %xmm0
; SSE41-NEXT:    pinsrd $1, %r11d, %xmm0
; SSE41-NEXT:    movq %rdx, 16(%rax)
; SSE41-NEXT:    movq %rdi, (%rax)
; SSE41-NEXT:    movq %rcx, 24(%rax)
; SSE41-NEXT:    movq %rsi, 8(%rax)
; SSE41-NEXT:    retq
;
; AVX-LABEL: usubo_v2i128:
; AVX:       # %bb.0:
; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT:    xorl %r10d, %r10d
; AVX-NEXT:    subq {{[0-9]+}}(%rsp), %rdx
; AVX-NEXT:    sbbq {{[0-9]+}}(%rsp), %rcx
; AVX-NEXT:    movl $0, %r11d
; AVX-NEXT:    sbbl %r11d, %r11d
; AVX-NEXT:    subq %r8, %rdi
; AVX-NEXT:    sbbq %r9, %rsi
; AVX-NEXT:    sbbl %r10d, %r10d
; AVX-NEXT:    vmovd %r10d, %xmm0
; AVX-NEXT:    vpinsrd $1, %r11d, %xmm0, %xmm0
; AVX-NEXT:    movq %rdx, 16(%rax)
; AVX-NEXT:    movq %rdi, (%rax)
; AVX-NEXT:    movq %rcx, 24(%rax)
; AVX-NEXT:    movq %rsi, 8(%rax)
; AVX-NEXT:    retq
;
; AVX512-LABEL: usubo_v2i128:
; AVX512:       # %bb.0:
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    subq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT:    sbbq {{[0-9]+}}(%rsp), %rcx
; AVX512-NEXT:    setb %r10b
; AVX512-NEXT:    kmovd %r10d, %k0
; AVX512-NEXT:    subq %r8, %rdi
; AVX512-NEXT:    sbbq %r9, %rsi
; AVX512-NEXT:    setb %r8b
; AVX512-NEXT:    andl $1, %r8d
; AVX512-NEXT:    kmovw %r8d, %k1
; AVX512-NEXT:    kshiftlw $1, %k0, %k0
; AVX512-NEXT:    korw %k0, %k1, %k1
; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT:    movq %rdx, 16(%rax)
; AVX512-NEXT:    movq %rdi, (%rax)
; AVX512-NEXT:    movq %rcx, 24(%rax)
; AVX512-NEXT:    movq %rsi, 8(%rax)
; AVX512-NEXT:    retq
  %t = call {<2 x i128>, <2 x i1>} @llvm.usub.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1)
  %val = extractvalue {<2 x i128>, <2 x i1>} %t, 0
  %obit = extractvalue {<2 x i128>, <2 x i1>} %t, 1
  %res = sext <2 x i1> %obit to <2 x i32>
  store <2 x i128> %val, ptr %p2
  ret <2 x i32> %res
}