; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW

declare {<1 x i32>, <1 x i1>} @llvm.umul.with.overflow.v1i32(<1 x i32>, <1 x i32>)
declare {<2 x i32>, <2 x i1>} @llvm.umul.with.overflow.v2i32(<2 x i32>, <2 x i32>)
declare {<3 x i32>, <3 x i1>} @llvm.umul.with.overflow.v3i32(<3 x i32>, <3 x i32>)
declare {<4 x i32>, <4 x i1>} @llvm.umul.with.overflow.v4i32(<4 x i32>, <4 x i32>)
declare {<6 x i32>, <6 x i1>} @llvm.umul.with.overflow.v6i32(<6 x i32>, <6 x i32>)
declare {<8 x i32>, <8 x i1>} @llvm.umul.with.overflow.v8i32(<8 x i32>, <8 x i32>)
declare {<16 x i32>, <16 x i1>} @llvm.umul.with.overflow.v16i32(<16 x i32>, <16 x i32>)

declare {<16 x i8>, <16 x i1>} @llvm.umul.with.overflow.v16i8(<16 x i8>, <16 x i8>)
declare {<32 x i8>, <32 x i1>} @llvm.umul.with.overflow.v32i8(<32 x i8>, <32 x i8>)
declare {<64 x i8>, <64 x i1>} @llvm.umul.with.overflow.v64i8(<64 x i8>, <64 x i8>)
declare {<8 x i16>, <8 x i1>} @llvm.umul.with.overflow.v8i16(<8 x i16>, <8 x i16>)
declare {<2 x i64>, <2 x i1>} @llvm.umul.with.overflow.v2i64(<2 x i64>, <2 x i64>)

declare {<4 x i24>, <4 x i1>} @llvm.umul.with.overflow.v4i24(<4 x i24>, <4 x i24>)
declare {<4 x i1>, <4 x i1>} @llvm.umul.with.overflow.v4i1(<4 x i1>, <4 x i1>)
declare {<2 x i128>, <2 x i1>} @llvm.umul.with.overflow.v2i128(<2 x i128>, <2 x i128>)

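; v1i32 is scalarized: a plain 32-bit MUL sets OF, which SETO/NEG turns into
; the all-ones/zero lane mask that the sext of the overflow bit requires.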
define <1 x i32> @umulo_v1i32(<1 x i32> %a0, <1 x i32> %a1, ptr %p2) nounwind {
; CHECK-LABEL: umulo_v1i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movq %rdx, %rcx
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    xorl %edi, %edi
; CHECK-NEXT:    mull %esi
; CHECK-NEXT:    seto %dil
; CHECK-NEXT:    negl %edi
; CHECK-NEXT:    movl %eax, (%rcx)
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    retq
  %t = call {<1 x i32>, <1 x i1>} @llvm.umul.with.overflow.v1i32(<1 x i32> %a0, <1 x i32> %a1)
  %val = extractvalue {<1 x i32>, <1 x i1>} %t, 0
  %obit = extractvalue {<1 x i32>, <1 x i1>} %t, 1
  %res = sext <1 x i1> %obit to <1 x i32>
  store <1 x i32> %val, ptr %p2
  ret <1 x i32> %res
}

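; With no vector overflow flag, the lanes are widened and fed to PMULUDQ: the
; low 32 bits of each 64-bit product form the result, and a nonzero high half
; (compare against zero, then invert) marks an overflowing lane.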
define <2 x i32> @umulo_v2i32(<2 x i32> %a0, <2 x i32> %a1, ptr %p2) nounwind {
; SSE2-LABEL: umulo_v2i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,3,2,3]
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    pcmpeqd %xmm0, %xmm1
; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SSE2-NEXT:    movq %xmm1, (%rdi)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: umulo_v2i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
; SSSE3-NEXT:    pmuludq %xmm1, %xmm2
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,3,2,3]
; SSSE3-NEXT:    pxor %xmm1, %xmm1
; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm0
; SSSE3-NEXT:    pxor %xmm1, %xmm0
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SSSE3-NEXT:    movq %xmm1, (%rdi)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: umulo_v2i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
; SSE41-NEXT:    pmuludq %xmm1, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,3,2,3]
; SSE41-NEXT:    pxor %xmm1, %xmm1
; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
; SSE41-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE41-NEXT:    pxor %xmm1, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SSE41-NEXT:    movq %xmm1, (%rdi)
; SSE41-NEXT:    retq
;
; AVX-LABEL: umulo_v2i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX-NEXT:    vpmuludq %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX-NEXT:    vmovq %xmm1, (%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: umulo_v2i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX512-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
; AVX512-NEXT:    vmovq %xmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<2 x i32>, <2 x i1>} @llvm.umul.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1)
  %val = extractvalue {<2 x i32>, <2 x i1>} %t, 0
  %obit = extractvalue {<2 x i32>, <2 x i1>} %t, 1
  %res = sext <2 x i1> %obit to <2 x i32>
  store <2 x i32> %val, ptr %p2
  ret <2 x i32> %res
}

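; The odd <3 x i32> count legalizes like <4 x i32>; only the store differs,
; splitting into a 64-bit movq plus a 32-bit extract of the third element.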
define <3 x i32> @umulo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind {
; SSE2-LABEL: umulo_v3i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm4
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE2-NEXT:    pxor %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE2-NEXT:    movd %xmm2, 8(%rdi)
; SSE2-NEXT:    movq %xmm0, (%rdi)
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: umulo_v3i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSSE3-NEXT:    pmuludq %xmm1, %xmm0
; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSSE3-NEXT:    pmuludq %xmm2, %xmm4
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSSE3-NEXT:    pxor %xmm2, %xmm2
; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm2
; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
; SSSE3-NEXT:    pxor %xmm2, %xmm1
; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSSE3-NEXT:    movd %xmm2, 8(%rdi)
; SSSE3-NEXT:    movq %xmm0, (%rdi)
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: umulo_v3i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE41-NEXT:    pmuludq %xmm2, %xmm3
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    pmuludq %xmm1, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; SSE41-NEXT:    pxor %xmm3, %xmm3
; SSE41-NEXT:    pcmpeqd %xmm2, %xmm3
; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2
; SSE41-NEXT:    pxor %xmm3, %xmm2
; SSE41-NEXT:    pmulld %xmm1, %xmm0
; SSE41-NEXT:    pextrd $2, %xmm0, 8(%rdi)
; SSE41-NEXT:    movq %xmm0, (%rdi)
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: umulo_v3i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpxor %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpextrd $2, %xmm0, 8(%rdi)
; AVX1-NEXT:    vmovq %xmm0, (%rdi)
; AVX1-NEXT:    vmovdqa %xmm2, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: umulo_v3i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX2-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm3
; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3]
; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX2-NEXT:    vpxor %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpextrd $2, %xmm0, 8(%rdi)
; AVX2-NEXT:    vmovq %xmm0, (%rdi)
; AVX2-NEXT:    vmovdqa %xmm2, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: umulo_v3i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; AVX512-NEXT:    vpmuludq %xmm3, %xmm4, %xmm3
; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [1,5,3,7]
; AVX512-NEXT:    vpermi2d %xmm3, %xmm2, %xmm4
; AVX512-NEXT:    vptestmd %xmm4, %xmm4, %k1
; AVX512-NEXT:    vpmulld %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT:    vpextrd $2, %xmm1, 8(%rdi)
; AVX512-NEXT:    vmovq %xmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<3 x i32>, <3 x i1>} @llvm.umul.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1)
  %val = extractvalue {<3 x i32>, <3 x i1>} %t, 0
  %obit = extractvalue {<3 x i32>, <3 x i1>} %t, 1
  %res = sext <3 x i1> %obit to <3 x i32>
  store <3 x i32> %val, ptr %p2
  ret <3 x i32> %res
}

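; SSE4.1+ computes the result with PMULLD while the even/odd PMULUDQ high
; halves feed the overflow compare; AVX512 instead tests the packed high
; halves straight into a mask register with VPTESTMD.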
define <4 x i32> @umulo_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %p2) nounwind {
; SSE2-LABEL: umulo_v4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm4
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE2-NEXT:    pxor %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT:    movdqa %xmm0, (%rdi)
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: umulo_v4i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSSE3-NEXT:    pmuludq %xmm1, %xmm0
; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSSE3-NEXT:    pmuludq %xmm2, %xmm4
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSSE3-NEXT:    pxor %xmm2, %xmm2
; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm2
; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
; SSSE3-NEXT:    pxor %xmm2, %xmm1
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: umulo_v4i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE41-NEXT:    pmuludq %xmm2, %xmm3
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    pmuludq %xmm1, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; SSE41-NEXT:    pxor %xmm3, %xmm3
; SSE41-NEXT:    pcmpeqd %xmm2, %xmm3
; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2
; SSE41-NEXT:    pxor %xmm3, %xmm2
; SSE41-NEXT:    pmulld %xmm1, %xmm0
; SSE41-NEXT:    movdqa %xmm0, (%rdi)
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: umulo_v4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpxor %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, (%rdi)
; AVX1-NEXT:    vmovdqa %xmm2, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: umulo_v4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX2-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm3
; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3]
; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX2-NEXT:    vpxor %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, (%rdi)
; AVX2-NEXT:    vmovdqa %xmm2, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: umulo_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; AVX512-NEXT:    vpmuludq %xmm3, %xmm4, %xmm3
; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [1,5,3,7]
; AVX512-NEXT:    vpermi2d %xmm3, %xmm2, %xmm4
; AVX512-NEXT:    vptestmd %xmm4, %xmm4, %k1
; AVX512-NEXT:    vpmulld %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<4 x i32>, <4 x i1>} @llvm.umul.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> %a1)
  %val = extractvalue {<4 x i32>, <4 x i1>} %t, 0
  %obit = extractvalue {<4 x i32>, <4 x i1>} %t, 1
  %res = sext <4 x i1> %obit to <4 x i32>
  store <4 x i32> %val, ptr %p2
  ret <4 x i32> %res
}

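; <6 x i32> is widened to <8 x i32>. The illegal type is passed piecewise in
; GPRs and stack slots, so much of the SSE code just reassembles the vectors
; before the usual even/odd PMULUDQ overflow check runs.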
define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
; SSE2-LABEL: umulo_v6i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movq %rdi, %rax
; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE2-NEXT:    movd %r8d, %xmm0
; SSE2-NEXT:    movd %ecx, %xmm1
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    movd %edx, %xmm0
; SSE2-NEXT:    movd %esi, %xmm3
; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
; SSE2-NEXT:    movd %r9d, %xmm1
; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; SSE2-NEXT:    pxor %xmm4, %xmm4
; SSE2-NEXT:    pcmpeqd %xmm4, %xmm1
; SSE2-NEXT:    pcmpeqd %xmm5, %xmm5
; SSE2-NEXT:    pxor %xmm5, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT:    movd {{.*#+}} xmm6 = mem[0],zero,zero,zero
; SSE2-NEXT:    pmuludq %xmm2, %xmm6
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1]
; SSE2-NEXT:    pcmpeqd %xmm4, %xmm7
; SSE2-NEXT:    pxor %xmm5, %xmm7
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
; SSE2-NEXT:    movq %xmm0, 16(%rcx)
; SSE2-NEXT:    movdqa %xmm3, (%rcx)
; SSE2-NEXT:    movq %xmm7, 16(%rdi)
; SSE2-NEXT:    movdqa %xmm1, (%rdi)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: umulo_v6i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movq %rdi, %rax
; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSSE3-NEXT:    movd %r8d, %xmm0
; SSSE3-NEXT:    movd %ecx, %xmm1
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSSE3-NEXT:    movd %edx, %xmm0
; SSSE3-NEXT:    movd %esi, %xmm3
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
; SSSE3-NEXT:    movd %r9d, %xmm1
; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT:    pmuludq %xmm1, %xmm0
; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
; SSSE3-NEXT:    pmuludq %xmm2, %xmm3
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSSE3-NEXT:    pmuludq %xmm4, %xmm2
; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; SSSE3-NEXT:    pxor %xmm4, %xmm4
; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm1
; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm5
; SSSE3-NEXT:    pxor %xmm5, %xmm1
; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSSE3-NEXT:    movd {{.*#+}} xmm6 = mem[0],zero,zero,zero
; SSSE3-NEXT:    pmuludq %xmm2, %xmm6
; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,3,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1]
; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm7
; SSSE3-NEXT:    pxor %xmm5, %xmm7
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
; SSSE3-NEXT:    movq %xmm0, 16(%rcx)
; SSSE3-NEXT:    movdqa %xmm3, (%rcx)
; SSSE3-NEXT:    movq %xmm7, 16(%rdi)
; SSSE3-NEXT:    movdqa %xmm1, (%rdi)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: umulo_v6i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movq %rdi, %rax
; SSE41-NEXT:    movl {{[0-9]+}}(%rsp), %edi
; SSE41-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE41-NEXT:    movd %r9d, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    pmuludq %xmm2, %xmm1
; SSE41-NEXT:    pinsrd $1, %edi, %xmm2
; SSE41-NEXT:    movl {{[0-9]+}}(%rsp), %r9d
; SSE41-NEXT:    pinsrd $1, %r9d, %xmm0
; SSE41-NEXT:    pmulld %xmm2, %xmm0
; SSE41-NEXT:    movd %esi, %xmm2
; SSE41-NEXT:    pinsrd $1, %edx, %xmm2
; SSE41-NEXT:    pinsrd $2, %ecx, %xmm2
; SSE41-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm3
; SSE41-NEXT:    pinsrd $2, {{[0-9]+}}(%rsp), %xmm3
; SSE41-NEXT:    movdqa %xmm3, %xmm4
; SSE41-NEXT:    pmuludq %xmm2, %xmm3
; SSE41-NEXT:    pinsrd $3, %r8d, %xmm2
; SSE41-NEXT:    pinsrd $3, {{[0-9]+}}(%rsp), %xmm4
; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
; SSE41-NEXT:    pmuludq %xmm5, %xmm6
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3],xmm3[4,5],xmm6[6,7]
; SSE41-NEXT:    pxor %xmm5, %xmm5
; SSE41-NEXT:    pcmpeqd %xmm5, %xmm3
; SSE41-NEXT:    pcmpeqd %xmm6, %xmm6
; SSE41-NEXT:    pxor %xmm6, %xmm3
; SSE41-NEXT:    movd %edi, %xmm7
; SSE41-NEXT:    movd %r9d, %xmm8
; SSE41-NEXT:    pmuludq %xmm7, %xmm8
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3],xmm1[4,5],xmm8[6,7]
; SSE41-NEXT:    pcmpeqd %xmm5, %xmm1
; SSE41-NEXT:    pxor %xmm6, %xmm1
; SSE41-NEXT:    pmulld %xmm2, %xmm4
; SSE41-NEXT:    movq %xmm0, 16(%rcx)
; SSE41-NEXT:    movdqa %xmm4, (%rcx)
; SSE41-NEXT:    movq %xmm1, 16(%rax)
; SSE41-NEXT:    movdqa %xmm3, (%rax)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: umulo_v6i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
; AVX1-NEXT:    vpmuludq %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpmuludq %xmm3, %xmm4, %xmm5
; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3],xmm5[4,5],xmm2[6,7]
; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm2, %xmm2
; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm6, %xmm6
; AVX1-NEXT:    vpxor %xmm6, %xmm2, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
; AVX1-NEXT:    vpmuludq %xmm7, %xmm8, %xmm7
; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm8
; AVX1-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3],xmm8[4,5],xmm7[6,7]
; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm7, %xmm5
; AVX1-NEXT:    vpxor %xmm6, %xmm5, %xmm5
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm5, %ymm2
; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpmulld %xmm3, %xmm4, %xmm1
; AVX1-NEXT:    vmovq %xmm1, 16(%rdi)
; AVX1-NEXT:    vmovdqa %xmm0, (%rdi)
; AVX1-NEXT:    vmovaps %ymm2, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: umulo_v6i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
; AVX2-NEXT:    vpmuludq %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm3
; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[1,1,3,3,5,5,7,7]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7]
; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm2, %ymm2
; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
; AVX2-NEXT:    vpxor %ymm3, %ymm2, %ymm2
; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vmovq %xmm1, 16(%rdi)
; AVX2-NEXT:    vmovdqa %xmm0, (%rdi)
; AVX2-NEXT:    vmovdqa %ymm2, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: umulo_v6i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmuludq %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vpshufd {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7]
; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm0[1,1,3,3,5,5,7,7]
; AVX512-NEXT:    vpmuludq %ymm3, %ymm4, %ymm3
; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [1,9,3,11,5,13,7,15]
; AVX512-NEXT:    vpermi2d %ymm3, %ymm2, %ymm4
; AVX512-NEXT:    vptestmd %ymm4, %ymm4, %k1
; AVX512-NEXT:    vpmulld %ymm1, %ymm0, %ymm1
; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512-NEXT:    vmovq %xmm2, 16(%rdi)
; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<6 x i32>, <6 x i1>} @llvm.umul.with.overflow.v6i32(<6 x i32> %a0, <6 x i32> %a1)
  %val = extractvalue {<6 x i32>, <6 x i1>} %t, 0
  %obit = extractvalue {<6 x i32>, <6 x i1>} %t, 1
  %res = sext <6 x i1> %obit to <6 x i32>
  store <6 x i32> %val, ptr %p2
  ret <6 x i32> %res
}

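; Pre-AVX2 targets split the 256-bit operation into two 128-bit halves;
; AVX2 and AVX512 run the even/odd PMULUDQ overflow check on full YMM values.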
define <8 x i32> @umulo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind {
; SSE2-LABEL: umulo_v8i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm4
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm5, %xmm6
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT:    pxor %xmm5, %xmm5
; SSE2-NEXT:    pcmpeqd %xmm5, %xmm0
; SSE2-NEXT:    pcmpeqd %xmm7, %xmm7
; SSE2-NEXT:    pxor %xmm7, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm3, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm8, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm3[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1]
; SSE2-NEXT:    pcmpeqd %xmm5, %xmm2
; SSE2-NEXT:    pxor %xmm7, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE2-NEXT:    movdqa %xmm1, 16(%rdi)
; SSE2-NEXT:    movdqa %xmm4, (%rdi)
; SSE2-NEXT:    movdqa %xmm2, %xmm1
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: umulo_v8i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa %xmm0, %xmm4
; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
; SSSE3-NEXT:    pmuludq %xmm2, %xmm4
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,3,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3]
; SSSE3-NEXT:    pmuludq %xmm5, %xmm6
; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSSE3-NEXT:    pxor %xmm5, %xmm5
; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm0
; SSSE3-NEXT:    pcmpeqd %xmm7, %xmm7
; SSSE3-NEXT:    pxor %xmm7, %xmm0
; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm1[1,1,3,3]
; SSSE3-NEXT:    pmuludq %xmm3, %xmm1
; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSSE3-NEXT:    pmuludq %xmm8, %xmm3
; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm3[1,3,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1]
; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm2
; SSSE3-NEXT:    pxor %xmm7, %xmm2
; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[0,2,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSSE3-NEXT:    movdqa %xmm1, 16(%rdi)
; SSSE3-NEXT:    movdqa %xmm4, (%rdi)
; SSSE3-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: umulo_v8i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
; SSE41-NEXT:    pmuludq %xmm4, %xmm5
; SSE41-NEXT:    movdqa %xmm0, %xmm4
; SSE41-NEXT:    pmuludq %xmm2, %xmm4
; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7]
; SSE41-NEXT:    pxor %xmm6, %xmm6
; SSE41-NEXT:    pcmpeqd %xmm6, %xmm4
; SSE41-NEXT:    pcmpeqd %xmm7, %xmm7
; SSE41-NEXT:    pxor %xmm7, %xmm4
; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
; SSE41-NEXT:    pshufd {{.*#+}} xmm8 = xmm1[1,1,3,3]
; SSE41-NEXT:    pmuludq %xmm5, %xmm8
; SSE41-NEXT:    movdqa %xmm1, %xmm5
; SSE41-NEXT:    pmuludq %xmm3, %xmm5
; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,3],xmm5[4,5],xmm8[6,7]
; SSE41-NEXT:    pcmpeqd %xmm6, %xmm5
; SSE41-NEXT:    pxor %xmm7, %xmm5
; SSE41-NEXT:    pmulld %xmm2, %xmm0
; SSE41-NEXT:    pmulld %xmm3, %xmm1
; SSE41-NEXT:    movdqa %xmm1, 16(%rdi)
; SSE41-NEXT:    movdqa %xmm0, (%rdi)
; SSE41-NEXT:    movdqa %xmm4, %xmm0
; SSE41-NEXT:    movdqa %xmm5, %xmm1
; SSE41-NEXT:    retq
;
; AVX1-LABEL: umulo_v8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
; AVX1-NEXT:    vpmuludq %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpmuludq %xmm3, %xmm4, %xmm5
; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3],xmm5[4,5],xmm2[6,7]
; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm2, %xmm2
; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm6, %xmm6
; AVX1-NEXT:    vpxor %xmm6, %xmm2, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
; AVX1-NEXT:    vpmuludq %xmm7, %xmm8, %xmm7
; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm8
; AVX1-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3],xmm8[4,5],xmm7[6,7]
; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm7, %xmm5
; AVX1-NEXT:    vpxor %xmm6, %xmm5, %xmm5
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm5, %ymm2
; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpmulld %xmm3, %xmm4, %xmm1
; AVX1-NEXT:    vmovdqa %xmm1, 16(%rdi)
; AVX1-NEXT:    vmovdqa %xmm0, (%rdi)
; AVX1-NEXT:    vmovaps %ymm2, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: umulo_v8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
; AVX2-NEXT:    vpmuludq %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm3
; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[1,1,3,3,5,5,7,7]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7]
; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm2, %ymm2
; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
; AVX2-NEXT:    vpxor %ymm3, %ymm2, %ymm2
; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa %ymm0, (%rdi)
; AVX2-NEXT:    vmovdqa %ymm2, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: umulo_v8i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmuludq %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vpshufd {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7]
; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm0[1,1,3,3,5,5,7,7]
; AVX512-NEXT:    vpmuludq %ymm3, %ymm4, %ymm3
; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [1,9,3,11,5,13,7,15]
; AVX512-NEXT:    vpermi2d %ymm3, %ymm2, %ymm4
; AVX512-NEXT:    vptestmd %ymm4, %ymm4, %k1
; AVX512-NEXT:    vpmulld %ymm1, %ymm0, %ymm1
; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512-NEXT:    vmovdqa %ymm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<8 x i32>, <8 x i1>} @llvm.umul.with.overflow.v8i32(<8 x i32> %a0, <8 x i32> %a1)
  %val = extractvalue {<8 x i32>, <8 x i1>} %t, 0
  %obit = extractvalue {<8 x i32>, <8 x i1>} %t, 1
  %res = sext <8 x i1> %obit to <8 x i32>
  store <8 x i32> %val, ptr %p2
  ret <8 x i32> %res
}

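; <16 x i32> scales the same pattern: four XMM pieces on SSE, two YMM halves
; on AVX (with the masks packed down and sign-extended back), and on AVX512 a
; single ZMM PMULUDQ pair whose packed high halves VPTESTMD turns into a k-mask.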
define <16 x i32> @umulo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwind {
; SSE2-LABEL: umulo_v16i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm9
; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm9
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm4[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm8, %xmm10
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm10[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE2-NEXT:    pxor %xmm11, %xmm11
; SSE2-NEXT:    pcmpeqd %xmm11, %xmm0
; SSE2-NEXT:    pcmpeqd %xmm12, %xmm12
; SSE2-NEXT:    pxor %xmm12, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm5, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm8, %xmm5
; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm5[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1]
; SSE2-NEXT:    pcmpeqd %xmm11, %xmm4
; SSE2-NEXT:    pxor %xmm12, %xmm4
; SSE2-NEXT:    pshufd {{.*#+}} xmm14 = xmm2[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm6, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm2[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm13 = xmm6[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm14, %xmm13
; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm13[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1]
; SSE2-NEXT:    pcmpeqd %xmm11, %xmm8
; SSE2-NEXT:    pxor %xmm12, %xmm8
; SSE2-NEXT:    pshufd {{.*#+}} xmm14 = xmm3[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm7, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm14, %xmm7
; SSE2-NEXT:    pshufd {{.*#+}} xmm14 = xmm7[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1]
; SSE2-NEXT:    pcmpeqd %xmm11, %xmm6
; SSE2-NEXT:    pxor %xmm12, %xmm6
; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm13[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
; SSE2-NEXT:    movdqa %xmm3, 48(%rdi)
; SSE2-NEXT:    movdqa %xmm2, 32(%rdi)
; SSE2-NEXT:    movdqa %xmm1, 16(%rdi)
; SSE2-NEXT:    movdqa %xmm9, (%rdi)
; SSE2-NEXT:    movdqa %xmm4, %xmm1
; SSE2-NEXT:    movdqa %xmm8, %xmm2
; SSE2-NEXT:    movdqa %xmm6, %xmm3
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: umulo_v16i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa %xmm0, %xmm9
; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
; SSSE3-NEXT:    pmuludq %xmm4, %xmm9
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[1,3,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm4[1,1,3,3]
; SSSE3-NEXT:    pmuludq %xmm8, %xmm10
; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm10[1,3,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSSE3-NEXT:    pxor %xmm11, %xmm11
; SSSE3-NEXT:    pcmpeqd %xmm11, %xmm0
; SSSE3-NEXT:    pcmpeqd %xmm12, %xmm12
; SSSE3-NEXT:    pxor %xmm12, %xmm0
; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm1[1,1,3,3]
; SSSE3-NEXT:    pmuludq %xmm5, %xmm1
; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,3,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSSE3-NEXT:    pmuludq %xmm8, %xmm5
; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm5[1,3,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1]
; SSSE3-NEXT:    pcmpeqd %xmm11, %xmm4
; SSSE3-NEXT:    pxor %xmm12, %xmm4
; SSSE3-NEXT:    pshufd {{.*#+}} xmm14 = xmm2[1,1,3,3]
; SSSE3-NEXT:    pmuludq %xmm6, %xmm2
; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm2[1,3,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm13 = xmm6[1,1,3,3]
; SSSE3-NEXT:    pmuludq %xmm14, %xmm13
; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm13[1,3,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1]
; SSSE3-NEXT:    pcmpeqd %xmm11, %xmm8
; SSSE3-NEXT:    pxor %xmm12, %xmm8
; SSSE3-NEXT:    pshufd {{.*#+}} xmm14 = xmm3[1,1,3,3]
; SSSE3-NEXT:    pmuludq %xmm7, %xmm3
; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[1,3,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
; SSSE3-NEXT:    pmuludq %xmm14, %xmm7
; SSSE3-NEXT:    pshufd {{.*#+}} xmm14 = xmm7[1,3,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1]
; SSSE3-NEXT:    pcmpeqd %xmm11, %xmm6
; SSSE3-NEXT:    pxor %xmm12, %xmm6
; SSSE3-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm13[0,2,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[0,2,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
; SSSE3-NEXT:    movdqa %xmm3, 48(%rdi)
; SSSE3-NEXT:    movdqa %xmm2, 32(%rdi)
; SSSE3-NEXT:    movdqa %xmm1, 16(%rdi)
; SSSE3-NEXT:    movdqa %xmm9, (%rdi)
; SSSE3-NEXT:    movdqa %xmm4, %xmm1
; SSSE3-NEXT:    movdqa %xmm8, %xmm2
; SSSE3-NEXT:    movdqa %xmm6, %xmm3
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: umulo_v16i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm8 = xmm4[1,1,3,3]
; SSE41-NEXT:    pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3]
; SSE41-NEXT:    pmuludq %xmm8, %xmm9
; SSE41-NEXT:    movdqa %xmm0, %xmm8
; SSE41-NEXT:    pmuludq %xmm4, %xmm8
; SSE41-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3]
; SSE41-NEXT:    pblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5],xmm9[6,7]
; SSE41-NEXT:    pxor %xmm12, %xmm12
; SSE41-NEXT:    pcmpeqd %xmm12, %xmm8
; SSE41-NEXT:    pcmpeqd %xmm13, %xmm13
; SSE41-NEXT:    pxor %xmm13, %xmm8
; SSE41-NEXT:    pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3]
; SSE41-NEXT:    pshufd {{.*#+}} xmm10 = xmm1[1,1,3,3]
; SSE41-NEXT:    pmuludq %xmm9, %xmm10
; SSE41-NEXT:    movdqa %xmm1, %xmm9
; SSE41-NEXT:    pmuludq %xmm5, %xmm9
; SSE41-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3]
; SSE41-NEXT:    pblendw {{.*#+}} xmm9 = xmm9[0,1],xmm10[2,3],xmm9[4,5],xmm10[6,7]
; SSE41-NEXT:    pcmpeqd %xmm12, %xmm9
; SSE41-NEXT:    pxor %xmm13, %xmm9
; SSE41-NEXT:    pshufd {{.*#+}} xmm10 = xmm6[1,1,3,3]
; SSE41-NEXT:    pshufd {{.*#+}} xmm11 = xmm2[1,1,3,3]
; SSE41-NEXT:    pmuludq %xmm10, %xmm11
; SSE41-NEXT:    movdqa %xmm2, %xmm10
; SSE41-NEXT:    pmuludq %xmm6, %xmm10
; SSE41-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3]
; SSE41-NEXT:    pblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3],xmm10[4,5],xmm11[6,7]
; SSE41-NEXT:    pcmpeqd %xmm12, %xmm10
; SSE41-NEXT:    pxor %xmm13, %xmm10
; SSE41-NEXT:    pshufd {{.*#+}} xmm11 = xmm7[1,1,3,3]
; SSE41-NEXT:    pshufd {{.*#+}} xmm14 = xmm3[1,1,3,3]
; SSE41-NEXT:    pmuludq %xmm11, %xmm14
; SSE41-NEXT:    movdqa %xmm3, %xmm11
; SSE41-NEXT:    pmuludq %xmm7, %xmm11
; SSE41-NEXT:    pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3]
; SSE41-NEXT:    pblendw {{.*#+}} xmm11 = xmm11[0,1],xmm14[2,3],xmm11[4,5],xmm14[6,7]
; SSE41-NEXT:    pcmpeqd %xmm12, %xmm11
; SSE41-NEXT:    pxor %xmm13, %xmm11
; SSE41-NEXT:    pmulld %xmm4, %xmm0
; SSE41-NEXT:    pmulld %xmm5, %xmm1
; SSE41-NEXT:    pmulld %xmm6, %xmm2
; SSE41-NEXT:    pmulld %xmm7, %xmm3
; SSE41-NEXT:    movdqa %xmm3, 48(%rdi)
; SSE41-NEXT:    movdqa %xmm2, 32(%rdi)
; SSE41-NEXT:    movdqa %xmm1, 16(%rdi)
; SSE41-NEXT:    movdqa %xmm0, (%rdi)
; SSE41-NEXT:    movdqa %xmm8, %xmm0
; SSE41-NEXT:    movdqa %xmm9, %xmm1
; SSE41-NEXT:    movdqa %xmm10, %xmm2
; SSE41-NEXT:    movdqa %xmm11, %xmm3
; SSE41-NEXT:    retq
;
; AVX1-LABEL: umulo_v16i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm5[1,1,3,3]
; AVX1-NEXT:    vpmuludq %xmm6, %xmm7, %xmm6
; AVX1-NEXT:    vpmuludq %xmm4, %xmm5, %xmm7
; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3],xmm7[4,5],xmm6[6,7]
; AVX1-NEXT:    vpxor %xmm7, %xmm7, %xmm7
; AVX1-NEXT:    vpcmpeqd %xmm7, %xmm6, %xmm6
; AVX1-NEXT:    vpshufd {{.*#+}} xmm8 = xmm3[1,1,3,3]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm9 = xmm1[1,1,3,3]
; AVX1-NEXT:    vpmuludq %xmm8, %xmm9, %xmm8
; AVX1-NEXT:    vpmuludq %xmm3, %xmm1, %xmm9
; AVX1-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3],xmm9[4,5],xmm8[6,7]
; AVX1-NEXT:    vpcmpeqd %xmm7, %xmm8, %xmm8
; AVX1-NEXT:    vpackssdw %xmm6, %xmm8, %xmm6
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm8
; AVX1-NEXT:    vpshufd {{.*#+}} xmm9 = xmm8[1,1,3,3]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm10
; AVX1-NEXT:    vpshufd {{.*#+}} xmm11 = xmm10[1,1,3,3]
; AVX1-NEXT:    vpmuludq %xmm9, %xmm11, %xmm9
; AVX1-NEXT:    vpmuludq %xmm8, %xmm10, %xmm11
; AVX1-NEXT:    vpshufd {{.*#+}} xmm11 = xmm11[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3],xmm11[4,5],xmm9[6,7]
; AVX1-NEXT:    vpcmpeqd %xmm7, %xmm9, %xmm9
; AVX1-NEXT:    vpshufd {{.*#+}} xmm11 = xmm2[1,1,3,3]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm12 = xmm0[1,1,3,3]
; AVX1-NEXT:    vpmuludq %xmm11, %xmm12, %xmm11
; AVX1-NEXT:    vpmuludq %xmm2, %xmm0, %xmm12
; AVX1-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3],xmm12[4,5],xmm11[6,7]
; AVX1-NEXT:    vpcmpeqd %xmm7, %xmm11, %xmm7
; AVX1-NEXT:    vpackssdw %xmm9, %xmm7, %xmm7
; AVX1-NEXT:    vpacksswb %xmm6, %xmm7, %xmm7
; AVX1-NEXT:    vpcmpeqd %xmm9, %xmm9, %xmm9
; AVX1-NEXT:    vpxor %xmm7, %xmm9, %xmm7
; AVX1-NEXT:    vpmulld %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpmulld %xmm8, %xmm10, %xmm8
; AVX1-NEXT:    vpmulld %xmm3, %xmm1, %xmm3
; AVX1-NEXT:    vpmulld %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpmovsxbd %xmm7, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm7[1,1,1,1]
; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vpacksswb %xmm6, %xmm6, %xmm1
; AVX1-NEXT:    vpxor %xmm1, %xmm9, %xmm1
; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm5
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm5, %ymm1
; AVX1-NEXT:    vmovdqa %xmm4, 48(%rdi)
; AVX1-NEXT:    vmovdqa %xmm3, 32(%rdi)
; AVX1-NEXT:    vmovdqa %xmm8, 16(%rdi)
; AVX1-NEXT:    vmovdqa %xmm2, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: umulo_v16i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufd {{.*#+}} ymm4 = ymm3[1,1,3,3,5,5,7,7]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm5 = ymm1[1,1,3,3,5,5,7,7]
; AVX2-NEXT:    vpmuludq %ymm4, %ymm5, %ymm4
; AVX2-NEXT:    vpmuludq %ymm3, %ymm1, %ymm5
; AVX2-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[1,1,3,3,5,5,7,7]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4],ymm4[5],ymm5[6],ymm4[7]
; AVX2-NEXT:    vpxor %xmm5, %xmm5, %xmm5
; AVX2-NEXT:    vpcmpeqd %ymm5, %ymm4, %ymm4
; AVX2-NEXT:    vpcmpeqd %ymm6, %ymm6, %ymm6
; AVX2-NEXT:    vpxor %ymm6, %ymm4, %ymm4
; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm7
; AVX2-NEXT:    vpackssdw %xmm7, %xmm4, %xmm4
; AVX2-NEXT:    vpshufd {{.*#+}} ymm7 = ymm2[1,1,3,3,5,5,7,7]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm8 = ymm0[1,1,3,3,5,5,7,7]
; AVX2-NEXT:    vpmuludq %ymm7, %ymm8, %ymm7
; AVX2-NEXT:    vpmuludq %ymm2, %ymm0, %ymm8
; AVX2-NEXT:    vpshufd {{.*#+}} ymm8 = ymm8[1,1,3,3,5,5,7,7]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4],ymm7[5],ymm8[6],ymm7[7]
; AVX2-NEXT:    vpcmpeqd %ymm5, %ymm7, %ymm5
; AVX2-NEXT:    vpmulld %ymm2, %ymm0, %ymm2
; AVX2-NEXT:    vpmulld %ymm3, %ymm1, %ymm3
; AVX2-NEXT:    vpacksswb %xmm4, %xmm4, %xmm0
; AVX2-NEXT:    vpmovsxbd %xmm0, %ymm1
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm5[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12,16,16,16,16,20,20,20,20,24,24,24,24,28,28,28,28]
; AVX2-NEXT:    vpxor %ymm6, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa %ymm3, 32(%rdi)
; AVX2-NEXT:    vmovdqa %ymm2, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: umulo_v16i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmuludq %zmm1, %zmm0, %zmm2
; AVX512-NEXT:    vpshufd {{.*#+}} zmm3 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; AVX512-NEXT:    vpshufd {{.*#+}} zmm4 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; AVX512-NEXT:    vpmuludq %zmm3, %zmm4, %zmm3
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31]
; AVX512-NEXT:    vpermi2d %zmm3, %zmm2, %zmm4
; AVX512-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512-NEXT:    vpmulld %zmm1, %zmm0, %zmm1
; AVX512-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512-NEXT:    vmovdqa64 %zmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<16 x i32>, <16 x i1>} @llvm.umul.with.overflow.v16i32(<16 x i32> %a0, <16 x i32> %a1)
  %val = extractvalue {<16 x i32>, <16 x i1>} %t, 0
  %obit = extractvalue {<16 x i32>, <16 x i1>} %t, 1
  %res = sext <16 x i1> %obit to <16 x i32>
  store <16 x i32> %val, ptr %p2
  ret <16 x i32> %res
}

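; There is no byte multiply, so the inputs are zero-extended to i16 and fed to
; PMULLW; the repacked low bytes form the product, and a nonzero high byte
; (PSRLW $8, pack, compare, invert) marks overflow before the <16 x i1> mask
; is sign-extended to <16 x i32>.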
define <16 x i32> @umulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind {
; SSE2-LABEL: umulo_v16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
; SSE2-NEXT:    movdqa %xmm0, %xmm5
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15]
; SSE2-NEXT:    pmullw %xmm3, %xmm5
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT:    movdqa %xmm5, %xmm3
; SSE2-NEXT:    pand %xmm4, %xmm3
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE2-NEXT:    pmullw %xmm1, %xmm0
; SSE2-NEXT:    pand %xmm0, %xmm4
; SSE2-NEXT:    packuswb %xmm3, %xmm4
; SSE2-NEXT:    psrlw $8, %xmm5
; SSE2-NEXT:    psrlw $8, %xmm0
; SSE2-NEXT:    packuswb %xmm5, %xmm0
; SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
; SSE2-NEXT:    pcmpeqd %xmm3, %xmm3
; SSE2-NEXT:    pxor %xmm2, %xmm3
; SSE2-NEXT:    movdqa %xmm3, %xmm0
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    movdqa %xmm3, %xmm1
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pslld $31, %xmm1
; SSE2-NEXT:    psrad $31, %xmm1
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    movdqa %xmm3, %xmm2
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    pslld $31, %xmm2
; SSE2-NEXT:    psrad $31, %xmm2
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pslld $31, %xmm3
; SSE2-NEXT:    psrad $31, %xmm3
; SSE2-NEXT:    movdqa %xmm4, (%rdi)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: umulo_v16i8:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pxor %xmm2, %xmm2
; SSSE3-NEXT:    movdqa %xmm1, %xmm3
; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm5
; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15]
; SSSE3-NEXT:    pmullw %xmm3, %xmm5
; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; SSSE3-NEXT:    movdqa %xmm5, %xmm3
; SSSE3-NEXT:    pand %xmm4, %xmm3
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSSE3-NEXT:    pmullw %xmm1, %xmm0
; SSSE3-NEXT:    pand %xmm0, %xmm4
; SSSE3-NEXT:    packuswb %xmm3, %xmm4
; SSSE3-NEXT:    psrlw $8, %xmm5
; SSSE3-NEXT:    psrlw $8, %xmm0
; SSSE3-NEXT:    packuswb %xmm5, %xmm0
; SSSE3-NEXT:    pcmpeqb %xmm0, %xmm2
; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm3
; SSSE3-NEXT:    pxor %xmm2, %xmm3
; SSSE3-NEXT:    movdqa %xmm3, %xmm0
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSSE3-NEXT:    movdqa %xmm3, %xmm1
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    pslld $31, %xmm1
; SSSE3-NEXT:    psrad $31, %xmm1
; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSSE3-NEXT:    movdqa %xmm3, %xmm2
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; SSSE3-NEXT:    pslld $31, %xmm2
; SSSE3-NEXT:    psrad $31, %xmm2
; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    pslld $31, %xmm3
; SSSE3-NEXT:    psrad $31, %xmm3
; SSSE3-NEXT:    movdqa %xmm4, (%rdi)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: umulo_v16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
; SSE41-NEXT:    pmullw %xmm1, %xmm0
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    pand %xmm4, %xmm1
; SSE41-NEXT:    pmullw %xmm3, %xmm5
; SSE41-NEXT:    pand %xmm5, %xmm4
; SSE41-NEXT:    packuswb %xmm1, %xmm4
; SSE41-NEXT:    psrlw $8, %xmm0
; SSE41-NEXT:    psrlw $8, %xmm5
; SSE41-NEXT:    packuswb %xmm0, %xmm5
; SSE41-NEXT:    pcmpeqb %xmm2, %xmm5
; SSE41-NEXT:    pcmpeqd %xmm3, %xmm3
; SSE41-NEXT:    pxor %xmm5, %xmm3
; SSE41-NEXT:    pmovsxbd %xmm3, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE41-NEXT:    pslld $31, %xmm1
; SSE41-NEXT:    psrad $31, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; SSE41-NEXT:    pslld $31, %xmm2
; SSE41-NEXT:    psrad $31, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; SSE41-NEXT:    pslld $31, %xmm3
; SSE41-NEXT:    psrad $31, %xmm3
; SSE41-NEXT:    movdqa %xmm4, (%rdi)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: umulo_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
; AVX1-NEXT:    vpmullw %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm5
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm1
; AVX1-NEXT:    vpackuswb %xmm5, %xmm1, %xmm4
; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
; AVX1-NEXT:    vpmovsxbd %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; AVX1-NEXT:    vpmovsxbd %xmm2, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT:    vmovdqa %xmm4, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: umulo_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm2
; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1202; AVX2-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
1203; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
1204; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm1
1205; AVX2-NEXT:    vpmovsxbd %xmm1, %ymm0
1206; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
1207; AVX2-NEXT:    vpmovsxbd %xmm1, %ymm1
1208; AVX2-NEXT:    vmovdqa %xmm2, (%rdi)
1209; AVX2-NEXT:    retq
1210;
1211; AVX512F-LABEL: umulo_v16i8:
1212; AVX512F:       # %bb.0:
1213; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
1214; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1215; AVX512F-NEXT:    vpmullw %ymm1, %ymm0, %ymm1
1216; AVX512F-NEXT:    vpsrlw $8, %ymm1, %ymm0
1217; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1218; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
1219; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
1220; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
1221; AVX512F-NEXT:    vpmovdb %zmm1, (%rdi)
1222; AVX512F-NEXT:    retq
1223;
1224; AVX512BW-LABEL: umulo_v16i8:
1225; AVX512BW:       # %bb.0:
1226; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
1227; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1228; AVX512BW-NEXT:    vpmullw %ymm1, %ymm0, %ymm1
1229; AVX512BW-NEXT:    vpsrlw $8, %ymm1, %ymm0
1230; AVX512BW-NEXT:    vptestmw %ymm0, %ymm0, %k1
1231; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
1232; AVX512BW-NEXT:    vpmovwb %ymm1, (%rdi)
1233; AVX512BW-NEXT:    retq
1234  %t = call {<16 x i8>, <16 x i1>} @llvm.umul.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1)
1235  %val = extractvalue {<16 x i8>, <16 x i1>} %t, 0
1236  %obit = extractvalue {<16 x i8>, <16 x i1>} %t, 1
1237  %res = sext <16 x i1> %obit to <16 x i32>
1238  store <16 x i8> %val, ptr %p2
1239  ret <16 x i32> %res
1240}
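; Informal summary of the v16i8 lowering checked above (this comment is not
; part of the autogenerated assertions): each i8 lane is zero-extended to i16
; (punpck*bw/pmovzxbw against a zero vector), multiplied with pmullw, and the
; low bytes are packed back down (pand 255 + packuswb) to form the stored
; product. The high bytes (psrlw $8 + packuswb) are compared against zero and
; the comparison inverted (pcmpeqb, then pxor with all-ones) to build the
; overflow mask, which is then sign-extended to i32 lanes. A rough per-lane
; equivalent, with hypothetical names, for illustration only:
;   %az = zext i8 %a to i16
;   %bz = zext i8 %b to i16
;   %m  = mul i16 %az, %bz
;   %lo = trunc i16 %m to i8         ; product byte stored through %p2
;   %hi = lshr i16 %m, 8
;   %ov = icmp ne i16 %hi, 0         ; lane overflows iff the high byte != 0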
1241
1242define <32 x i32> @umulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, ptr %p2) nounwind {
1243; SSE2-LABEL: umulo_v32i8:
1244; SSE2:       # %bb.0:
1245; SSE2-NEXT:    movq %rdi, %rax
1246; SSE2-NEXT:    pxor %xmm5, %xmm5
1247; SSE2-NEXT:    movdqa %xmm2, %xmm4
1248; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
1249; SSE2-NEXT:    movdqa %xmm0, %xmm6
1250; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15]
1251; SSE2-NEXT:    pmullw %xmm4, %xmm6
1252; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
1253; SSE2-NEXT:    movdqa %xmm6, %xmm7
1254; SSE2-NEXT:    pand %xmm4, %xmm7
1255; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
1256; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
1257; SSE2-NEXT:    pmullw %xmm2, %xmm0
1258; SSE2-NEXT:    movdqa %xmm0, %xmm2
1259; SSE2-NEXT:    pand %xmm4, %xmm2
1260; SSE2-NEXT:    packuswb %xmm7, %xmm2
1261; SSE2-NEXT:    movdqa %xmm3, %xmm7
1262; SSE2-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15]
1263; SSE2-NEXT:    movdqa %xmm1, %xmm8
1264; SSE2-NEXT:    punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15]
1265; SSE2-NEXT:    pmullw %xmm7, %xmm8
1266; SSE2-NEXT:    movdqa %xmm8, %xmm7
1267; SSE2-NEXT:    pand %xmm4, %xmm7
1268; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
1269; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
1270; SSE2-NEXT:    pmullw %xmm3, %xmm1
1271; SSE2-NEXT:    pand %xmm1, %xmm4
1272; SSE2-NEXT:    packuswb %xmm7, %xmm4
1273; SSE2-NEXT:    psrlw $8, %xmm8
1274; SSE2-NEXT:    psrlw $8, %xmm1
1275; SSE2-NEXT:    packuswb %xmm8, %xmm1
1276; SSE2-NEXT:    pcmpeqb %xmm5, %xmm1
1277; SSE2-NEXT:    pcmpeqd %xmm3, %xmm3
1278; SSE2-NEXT:    pxor %xmm3, %xmm1
1279; SSE2-NEXT:    psrlw $8, %xmm6
1280; SSE2-NEXT:    psrlw $8, %xmm0
1281; SSE2-NEXT:    packuswb %xmm6, %xmm0
1282; SSE2-NEXT:    pcmpeqb %xmm5, %xmm0
1283; SSE2-NEXT:    pxor %xmm3, %xmm0
1284; SSE2-NEXT:    movdqa %xmm0, %xmm3
1285; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1286; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
1287; SSE2-NEXT:    pslld $31, %xmm3
1288; SSE2-NEXT:    psrad $31, %xmm3
1289; SSE2-NEXT:    movdqa %xmm0, %xmm5
1290; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1291; SSE2-NEXT:    movdqa %xmm5, %xmm6
1292; SSE2-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3]
1293; SSE2-NEXT:    pslld $31, %xmm6
1294; SSE2-NEXT:    psrad $31, %xmm6
1295; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
1296; SSE2-NEXT:    pslld $31, %xmm5
1297; SSE2-NEXT:    psrad $31, %xmm5
1298; SSE2-NEXT:    movdqa %xmm1, %xmm7
1299; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1300; SSE2-NEXT:    punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7]
1301; SSE2-NEXT:    pslld $31, %xmm7
1302; SSE2-NEXT:    psrad $31, %xmm7
1303; SSE2-NEXT:    movdqa %xmm1, %xmm8
1304; SSE2-NEXT:    punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1305; SSE2-NEXT:    movdqa %xmm8, %xmm9
1306; SSE2-NEXT:    punpcklwd {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3]
1307; SSE2-NEXT:    pslld $31, %xmm9
1308; SSE2-NEXT:    psrad $31, %xmm9
1309; SSE2-NEXT:    punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7]
1310; SSE2-NEXT:    pslld $31, %xmm8
1311; SSE2-NEXT:    psrad $31, %xmm8
1312; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1313; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1314; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1315; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
1316; SSE2-NEXT:    movdqa %xmm4, 16(%rsi)
1317; SSE2-NEXT:    movdqa %xmm2, (%rsi)
1318; SSE2-NEXT:    movdqa %xmm1, 64(%rdi)
1319; SSE2-NEXT:    movdqa %xmm0, (%rdi)
1320; SSE2-NEXT:    movdqa %xmm8, 112(%rdi)
1321; SSE2-NEXT:    movdqa %xmm9, 96(%rdi)
1322; SSE2-NEXT:    movdqa %xmm7, 80(%rdi)
1323; SSE2-NEXT:    movdqa %xmm5, 48(%rdi)
1324; SSE2-NEXT:    movdqa %xmm6, 32(%rdi)
1325; SSE2-NEXT:    movdqa %xmm3, 16(%rdi)
1326; SSE2-NEXT:    retq
1327;
1328; SSSE3-LABEL: umulo_v32i8:
1329; SSSE3:       # %bb.0:
1330; SSSE3-NEXT:    movq %rdi, %rax
1331; SSSE3-NEXT:    pxor %xmm5, %xmm5
1332; SSSE3-NEXT:    movdqa %xmm2, %xmm4
1333; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
1334; SSSE3-NEXT:    movdqa %xmm0, %xmm6
1335; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15]
1336; SSSE3-NEXT:    pmullw %xmm4, %xmm6
1337; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
1338; SSSE3-NEXT:    movdqa %xmm6, %xmm7
1339; SSSE3-NEXT:    pand %xmm4, %xmm7
1340; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
1341; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
1342; SSSE3-NEXT:    pmullw %xmm2, %xmm0
1343; SSSE3-NEXT:    movdqa %xmm0, %xmm2
1344; SSSE3-NEXT:    pand %xmm4, %xmm2
1345; SSSE3-NEXT:    packuswb %xmm7, %xmm2
1346; SSSE3-NEXT:    movdqa %xmm3, %xmm7
1347; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15]
1348; SSSE3-NEXT:    movdqa %xmm1, %xmm8
1349; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15]
1350; SSSE3-NEXT:    pmullw %xmm7, %xmm8
1351; SSSE3-NEXT:    movdqa %xmm8, %xmm7
1352; SSSE3-NEXT:    pand %xmm4, %xmm7
1353; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
1354; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
1355; SSSE3-NEXT:    pmullw %xmm3, %xmm1
1356; SSSE3-NEXT:    pand %xmm1, %xmm4
1357; SSSE3-NEXT:    packuswb %xmm7, %xmm4
1358; SSSE3-NEXT:    psrlw $8, %xmm8
1359; SSSE3-NEXT:    psrlw $8, %xmm1
1360; SSSE3-NEXT:    packuswb %xmm8, %xmm1
1361; SSSE3-NEXT:    pcmpeqb %xmm5, %xmm1
1362; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm3
1363; SSSE3-NEXT:    pxor %xmm3, %xmm1
1364; SSSE3-NEXT:    psrlw $8, %xmm6
1365; SSSE3-NEXT:    psrlw $8, %xmm0
1366; SSSE3-NEXT:    packuswb %xmm6, %xmm0
1367; SSSE3-NEXT:    pcmpeqb %xmm5, %xmm0
1368; SSSE3-NEXT:    pxor %xmm3, %xmm0
1369; SSSE3-NEXT:    movdqa %xmm0, %xmm3
1370; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1371; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
1372; SSSE3-NEXT:    pslld $31, %xmm3
1373; SSSE3-NEXT:    psrad $31, %xmm3
1374; SSSE3-NEXT:    movdqa %xmm0, %xmm5
1375; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1376; SSSE3-NEXT:    movdqa %xmm5, %xmm6
1377; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3]
1378; SSSE3-NEXT:    pslld $31, %xmm6
1379; SSSE3-NEXT:    psrad $31, %xmm6
1380; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
1381; SSSE3-NEXT:    pslld $31, %xmm5
1382; SSSE3-NEXT:    psrad $31, %xmm5
1383; SSSE3-NEXT:    movdqa %xmm1, %xmm7
1384; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1385; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7]
1386; SSSE3-NEXT:    pslld $31, %xmm7
1387; SSSE3-NEXT:    psrad $31, %xmm7
1388; SSSE3-NEXT:    movdqa %xmm1, %xmm8
1389; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1390; SSSE3-NEXT:    movdqa %xmm8, %xmm9
1391; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3]
1392; SSSE3-NEXT:    pslld $31, %xmm9
1393; SSSE3-NEXT:    psrad $31, %xmm9
1394; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7]
1395; SSSE3-NEXT:    pslld $31, %xmm8
1396; SSSE3-NEXT:    psrad $31, %xmm8
1397; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1398; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1399; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1400; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
1401; SSSE3-NEXT:    movdqa %xmm4, 16(%rsi)
1402; SSSE3-NEXT:    movdqa %xmm2, (%rsi)
1403; SSSE3-NEXT:    movdqa %xmm1, 64(%rdi)
1404; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
1405; SSSE3-NEXT:    movdqa %xmm8, 112(%rdi)
1406; SSSE3-NEXT:    movdqa %xmm9, 96(%rdi)
1407; SSSE3-NEXT:    movdqa %xmm7, 80(%rdi)
1408; SSSE3-NEXT:    movdqa %xmm5, 48(%rdi)
1409; SSSE3-NEXT:    movdqa %xmm6, 32(%rdi)
1410; SSSE3-NEXT:    movdqa %xmm3, 16(%rdi)
1411; SSSE3-NEXT:    retq
1412;
1413; SSE41-LABEL: umulo_v32i8:
1414; SSE41:       # %bb.0:
1415; SSE41-NEXT:    movq %rdi, %rax
1416; SSE41-NEXT:    pxor %xmm7, %xmm7
1417; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
1418; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15]
1419; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1420; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15]
1421; SSE41-NEXT:    pmullw %xmm2, %xmm0
1422; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
1423; SSE41-NEXT:    movdqa %xmm0, %xmm6
1424; SSE41-NEXT:    pand %xmm2, %xmm6
1425; SSE41-NEXT:    pmullw %xmm5, %xmm4
1426; SSE41-NEXT:    movdqa %xmm4, %xmm5
1427; SSE41-NEXT:    pand %xmm2, %xmm5
1428; SSE41-NEXT:    packuswb %xmm6, %xmm5
1429; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm8 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
1430; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15]
1431; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
1432; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15]
1433; SSE41-NEXT:    pmullw %xmm3, %xmm1
1434; SSE41-NEXT:    movdqa %xmm1, %xmm3
1435; SSE41-NEXT:    pand %xmm2, %xmm3
1436; SSE41-NEXT:    pmullw %xmm8, %xmm6
1437; SSE41-NEXT:    pand %xmm6, %xmm2
1438; SSE41-NEXT:    packuswb %xmm3, %xmm2
1439; SSE41-NEXT:    psrlw $8, %xmm1
1440; SSE41-NEXT:    psrlw $8, %xmm6
1441; SSE41-NEXT:    packuswb %xmm1, %xmm6
1442; SSE41-NEXT:    pcmpeqb %xmm7, %xmm6
1443; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
1444; SSE41-NEXT:    pxor %xmm1, %xmm6
1445; SSE41-NEXT:    psrlw $8, %xmm0
1446; SSE41-NEXT:    psrlw $8, %xmm4
1447; SSE41-NEXT:    packuswb %xmm0, %xmm4
1448; SSE41-NEXT:    pcmpeqb %xmm7, %xmm4
1449; SSE41-NEXT:    pxor %xmm1, %xmm4
1450; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1]
1451; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1452; SSE41-NEXT:    pslld $31, %xmm0
1453; SSE41-NEXT:    psrad $31, %xmm0
1454; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3]
1455; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
1456; SSE41-NEXT:    pslld $31, %xmm1
1457; SSE41-NEXT:    psrad $31, %xmm1
1458; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[3,3,3,3]
1459; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
1460; SSE41-NEXT:    pslld $31, %xmm3
1461; SSE41-NEXT:    psrad $31, %xmm3
1462; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[1,1,1,1]
1463; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
1464; SSE41-NEXT:    pslld $31, %xmm7
1465; SSE41-NEXT:    psrad $31, %xmm7
1466; SSE41-NEXT:    pshufd {{.*#+}} xmm8 = xmm6[2,3,2,3]
1467; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero
1468; SSE41-NEXT:    pslld $31, %xmm8
1469; SSE41-NEXT:    psrad $31, %xmm8
1470; SSE41-NEXT:    pshufd {{.*#+}} xmm9 = xmm6[3,3,3,3]
1471; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero
1472; SSE41-NEXT:    pslld $31, %xmm9
1473; SSE41-NEXT:    psrad $31, %xmm9
1474; SSE41-NEXT:    pmovsxbd %xmm4, %xmm4
1475; SSE41-NEXT:    pmovsxbd %xmm6, %xmm6
1476; SSE41-NEXT:    movdqa %xmm2, 16(%rsi)
1477; SSE41-NEXT:    movdqa %xmm5, (%rsi)
1478; SSE41-NEXT:    movdqa %xmm6, 64(%rdi)
1479; SSE41-NEXT:    movdqa %xmm4, (%rdi)
1480; SSE41-NEXT:    movdqa %xmm9, 112(%rdi)
1481; SSE41-NEXT:    movdqa %xmm8, 96(%rdi)
1482; SSE41-NEXT:    movdqa %xmm7, 80(%rdi)
1483; SSE41-NEXT:    movdqa %xmm3, 48(%rdi)
1484; SSE41-NEXT:    movdqa %xmm1, 32(%rdi)
1485; SSE41-NEXT:    movdqa %xmm0, 16(%rdi)
1486; SSE41-NEXT:    retq
1487;
1488; AVX1-LABEL: umulo_v32i8:
1489; AVX1:       # %bb.0:
1490; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1491; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
1492; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
1493; AVX1-NEXT:    vpmullw %xmm3, %xmm4, %xmm3
1494; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
1495; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm4
1496; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
1497; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1498; AVX1-NEXT:    vpmullw %xmm6, %xmm7, %xmm6
1499; AVX1-NEXT:    vpand %xmm5, %xmm6, %xmm7
1500; AVX1-NEXT:    vpackuswb %xmm4, %xmm7, %xmm4
1501; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
1502; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
1503; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1504; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
1505; AVX1-NEXT:    vpmullw %xmm7, %xmm8, %xmm7
1506; AVX1-NEXT:    vpand %xmm5, %xmm7, %xmm8
1507; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
1508; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1509; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1510; AVX1-NEXT:    vpand %xmm5, %xmm0, %xmm1
1511; AVX1-NEXT:    vpackuswb %xmm8, %xmm1, %xmm5
1512; AVX1-NEXT:    vpsrlw $8, %xmm7, %xmm1
1513; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
1514; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1515; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm0
1516; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
1517; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm7
1518; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm0
1519; AVX1-NEXT:    vpsrlw $8, %xmm6, %xmm3
1520; AVX1-NEXT:    vpackuswb %xmm0, %xmm3, %xmm0
1521; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm0
1522; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm1
1523; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm0
1524; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
1525; AVX1-NEXT:    vpmovsxbd %xmm2, %xmm2
1526; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1527; AVX1-NEXT:    vpmovsxbd %xmm7, %xmm2
1528; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm7[1,1,1,1]
1529; AVX1-NEXT:    vpmovsxbd %xmm3, %xmm3
1530; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
1531; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
1532; AVX1-NEXT:    vpmovsxbd %xmm3, %xmm3
1533; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
1534; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm1
1535; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
1536; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm7[2,3,2,3]
1537; AVX1-NEXT:    vpmovsxbd %xmm3, %xmm3
1538; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm7[3,3,3,3]
1539; AVX1-NEXT:    vpmovsxbd %xmm6, %xmm6
1540; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm3, %ymm3
1541; AVX1-NEXT:    vmovdqa %xmm5, 16(%rdi)
1542; AVX1-NEXT:    vmovdqa %xmm4, (%rdi)
1543; AVX1-NEXT:    retq
1544;
1545; AVX2-LABEL: umulo_v32i8:
1546; AVX2:       # %bb.0:
1547; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1548; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
1549; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
1550; AVX2-NEXT:    vpmullw %ymm3, %ymm4, %ymm3
1551; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1552; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm5
1553; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
1554; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
1555; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
1556; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm1
1557; AVX2-NEXT:    vpackuswb %ymm5, %ymm1, %ymm4
1558; AVX2-NEXT:    vpsrlw $8, %ymm3, %ymm1
1559; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
1560; AVX2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
1561; AVX2-NEXT:    vpcmpeqb %ymm2, %ymm0, %ymm0
1562; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
1563; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm1
1564; AVX2-NEXT:    vpmovsxbd %xmm1, %ymm0
1565; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
1566; AVX2-NEXT:    vpmovsxbd %xmm3, %ymm2
1567; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
1568; AVX2-NEXT:    vpmovsxbd %xmm1, %ymm1
1569; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
1570; AVX2-NEXT:    vpmovsxbd %xmm3, %ymm3
1571; AVX2-NEXT:    vmovdqa %ymm4, (%rdi)
1572; AVX2-NEXT:    retq
1573;
1574; AVX512F-LABEL: umulo_v32i8:
1575; AVX512F:       # %bb.0:
1576; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
1577; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
1578; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm3
1579; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
1580; AVX512F-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
1581; AVX512F-NEXT:    vpsrlw $8, %ymm2, %ymm3
1582; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
1583; AVX512F-NEXT:    vptestmd %zmm3, %zmm3, %k1
1584; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
1585; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1586; AVX512F-NEXT:    vpmullw %ymm1, %ymm0, %ymm3
1587; AVX512F-NEXT:    vpsrlw $8, %ymm3, %ymm0
1588; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1589; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k2
1590; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1
1591; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1
1592; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
1593; AVX512F-NEXT:    vpmovdb %zmm2, 16(%rdi)
1594; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
1595; AVX512F-NEXT:    vpmovdb %zmm2, (%rdi)
1596; AVX512F-NEXT:    retq
1597;
1598; AVX512BW-LABEL: umulo_v32i8:
1599; AVX512BW:       # %bb.0:
1600; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
1601; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
1602; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm2
1603; AVX512BW-NEXT:    vpsrlw $8, %zmm2, %zmm0
1604; AVX512BW-NEXT:    vptestmw %zmm0, %zmm0, %k1
1605; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
1606; AVX512BW-NEXT:    kshiftrd $16, %k1, %k1
1607; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1
1608; AVX512BW-NEXT:    vpmovwb %zmm2, (%rdi)
1609; AVX512BW-NEXT:    retq
1610  %t = call {<32 x i8>, <32 x i1>} @llvm.umul.with.overflow.v32i8(<32 x i8> %a0, <32 x i8> %a1)
1611  %val = extractvalue {<32 x i8>, <32 x i1>} %t, 0
1612  %obit = extractvalue {<32 x i8>, <32 x i1>} %t, 1
1613  %res = sext <32 x i1> %obit to <32 x i32>
1614  store <32 x i8> %val, ptr %p2
1615  ret <32 x i32> %res
1616}
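; Informal ABI note on umulo_v32i8 (not part of the autogenerated assertions):
; the <32 x i32> result is 128 bytes, so the SSE variants return it through a
; hidden sret pointer per the SysV x86-64 ABI. That pointer arrives in %rdi
; and is echoed in %rax on return (hence the leading "movq %rdi, %rax"),
; shifting %p2 into %rsi; the products are therefore stored to (%rsi) while
; the sign-extended overflow lanes go to offsets of (%rdi). The AVX targets
; can return the value directly in ymm/zmm registers, so there %rdi still
; holds %p2.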
1617
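; The v64i8 variant below repeats the same widen/pmullw/pack pattern across
; four 128-bit chunks on the SSE targets, and its <64 x i32> result is again
; returned through a hidden sret pointer there (note the leading
; "movq %rdi, %rax", with the products stored to (%rsi)).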
1618define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind {
1619; SSE2-LABEL: umulo_v64i8:
1620; SSE2:       # %bb.0:
1621; SSE2-NEXT:    movq %rdi, %rax
1622; SSE2-NEXT:    pxor %xmm10, %xmm10
1623; SSE2-NEXT:    movdqa %xmm4, %xmm8
1624; SSE2-NEXT:    punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm10[8],xmm8[9],xmm10[9],xmm8[10],xmm10[10],xmm8[11],xmm10[11],xmm8[12],xmm10[12],xmm8[13],xmm10[13],xmm8[14],xmm10[14],xmm8[15],xmm10[15]
1625; SSE2-NEXT:    movdqa %xmm0, %xmm11
1626; SSE2-NEXT:    punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15]
1627; SSE2-NEXT:    pmullw %xmm8, %xmm11
1628; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
1629; SSE2-NEXT:    movdqa %xmm11, %xmm9
1630; SSE2-NEXT:    pand %xmm8, %xmm9
1631; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7]
1632; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
1633; SSE2-NEXT:    pmullw %xmm4, %xmm0
1634; SSE2-NEXT:    movdqa %xmm0, %xmm4
1635; SSE2-NEXT:    pand %xmm8, %xmm4
1636; SSE2-NEXT:    packuswb %xmm9, %xmm4
1637; SSE2-NEXT:    movdqa %xmm5, %xmm9
1638; SSE2-NEXT:    punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15]
1639; SSE2-NEXT:    movdqa %xmm1, %xmm12
1640; SSE2-NEXT:    punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15]
1641; SSE2-NEXT:    pmullw %xmm9, %xmm12
1642; SSE2-NEXT:    movdqa %xmm12, %xmm9
1643; SSE2-NEXT:    pand %xmm8, %xmm9
1644; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3],xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7]
1645; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7]
1646; SSE2-NEXT:    pmullw %xmm5, %xmm1
1647; SSE2-NEXT:    movdqa %xmm1, %xmm5
1648; SSE2-NEXT:    pand %xmm8, %xmm5
1649; SSE2-NEXT:    packuswb %xmm9, %xmm5
1650; SSE2-NEXT:    movdqa %xmm6, %xmm9
1651; SSE2-NEXT:    punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15]
1652; SSE2-NEXT:    movdqa %xmm2, %xmm13
1653; SSE2-NEXT:    punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm10[8],xmm13[9],xmm10[9],xmm13[10],xmm10[10],xmm13[11],xmm10[11],xmm13[12],xmm10[12],xmm13[13],xmm10[13],xmm13[14],xmm10[14],xmm13[15],xmm10[15]
1654; SSE2-NEXT:    pmullw %xmm9, %xmm13
1655; SSE2-NEXT:    movdqa %xmm13, %xmm14
1656; SSE2-NEXT:    pand %xmm8, %xmm14
1657; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3],xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7]
1658; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7]
1659; SSE2-NEXT:    pmullw %xmm6, %xmm2
1660; SSE2-NEXT:    movdqa %xmm2, %xmm9
1661; SSE2-NEXT:    pand %xmm8, %xmm9
1662; SSE2-NEXT:    packuswb %xmm14, %xmm9
1663; SSE2-NEXT:    movdqa %xmm7, %xmm6
1664; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm10[8],xmm6[9],xmm10[9],xmm6[10],xmm10[10],xmm6[11],xmm10[11],xmm6[12],xmm10[12],xmm6[13],xmm10[13],xmm6[14],xmm10[14],xmm6[15],xmm10[15]
1665; SSE2-NEXT:    movdqa %xmm3, %xmm14
1666; SSE2-NEXT:    punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm10[8],xmm14[9],xmm10[9],xmm14[10],xmm10[10],xmm14[11],xmm10[11],xmm14[12],xmm10[12],xmm14[13],xmm10[13],xmm14[14],xmm10[14],xmm14[15],xmm10[15]
1667; SSE2-NEXT:    pmullw %xmm6, %xmm14
1668; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
1669; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3],xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7]
1670; SSE2-NEXT:    pmullw %xmm7, %xmm3
1671; SSE2-NEXT:    movdqa %xmm14, %xmm6
1672; SSE2-NEXT:    pand %xmm8, %xmm6
1673; SSE2-NEXT:    pand %xmm3, %xmm8
1674; SSE2-NEXT:    packuswb %xmm6, %xmm8
1675; SSE2-NEXT:    psrlw $8, %xmm14
1676; SSE2-NEXT:    psrlw $8, %xmm3
1677; SSE2-NEXT:    packuswb %xmm14, %xmm3
1678; SSE2-NEXT:    psrlw $8, %xmm13
1679; SSE2-NEXT:    psrlw $8, %xmm2
1680; SSE2-NEXT:    packuswb %xmm13, %xmm2
1681; SSE2-NEXT:    psrlw $8, %xmm12
1682; SSE2-NEXT:    psrlw $8, %xmm1
1683; SSE2-NEXT:    packuswb %xmm12, %xmm1
1684; SSE2-NEXT:    psrlw $8, %xmm11
1685; SSE2-NEXT:    psrlw $8, %xmm0
1686; SSE2-NEXT:    packuswb %xmm11, %xmm0
1687; SSE2-NEXT:    pcmpeqb %xmm10, %xmm3
1688; SSE2-NEXT:    pcmpeqb %xmm10, %xmm2
1689; SSE2-NEXT:    pcmpeqb %xmm10, %xmm1
1690; SSE2-NEXT:    pcmpeqb %xmm10, %xmm0
1691; SSE2-NEXT:    pcmpeqd %xmm6, %xmm6
1692; SSE2-NEXT:    pxor %xmm6, %xmm3
1693; SSE2-NEXT:    pxor %xmm6, %xmm2
1694; SSE2-NEXT:    pxor %xmm6, %xmm1
1695; SSE2-NEXT:    pxor %xmm6, %xmm0
1696; SSE2-NEXT:    movdqa %xmm0, %xmm6
1697; SSE2-NEXT:    movdqa %xmm8, 48(%rsi)
1698; SSE2-NEXT:    movdqa %xmm1, %xmm7
1699; SSE2-NEXT:    movdqa %xmm9, 32(%rsi)
1700; SSE2-NEXT:    movdqa %xmm2, %xmm8
1701; SSE2-NEXT:    movdqa %xmm5, 16(%rsi)
1702; SSE2-NEXT:    movdqa %xmm3, %xmm5
1703; SSE2-NEXT:    movdqa %xmm4, (%rsi)
1704; SSE2-NEXT:    movdqa %xmm3, %xmm4
1705; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1706; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3]
1707; SSE2-NEXT:    movdqa %xmm3, 192(%rdi)
1708; SSE2-NEXT:    movdqa %xmm2, %xmm3
1709; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1710; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
1711; SSE2-NEXT:    movdqa %xmm2, 128(%rdi)
1712; SSE2-NEXT:    movdqa %xmm1, %xmm2
1713; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1714; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
1715; SSE2-NEXT:    movdqa %xmm1, 64(%rdi)
1716; SSE2-NEXT:    movdqa %xmm0, %xmm1
1717; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1718; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1719; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1720; SSE2-NEXT:    movdqa %xmm0, (%rdi)
1721; SSE2-NEXT:    movdqa %xmm4, %xmm0
1722; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3]
1723; SSE2-NEXT:    pslld $31, %xmm4
1724; SSE2-NEXT:    psrad $31, %xmm4
1725; SSE2-NEXT:    movdqa %xmm4, 224(%rdi)
1726; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
1727; SSE2-NEXT:    pslld $31, %xmm0
1728; SSE2-NEXT:    psrad $31, %xmm0
1729; SSE2-NEXT:    movdqa %xmm0, 240(%rdi)
1730; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1731; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1732; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
1733; SSE2-NEXT:    pslld $31, %xmm5
1734; SSE2-NEXT:    psrad $31, %xmm5
1735; SSE2-NEXT:    movdqa %xmm5, 208(%rdi)
1736; SSE2-NEXT:    movdqa %xmm3, %xmm0
1737; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3]
1738; SSE2-NEXT:    pslld $31, %xmm3
1739; SSE2-NEXT:    psrad $31, %xmm3
1740; SSE2-NEXT:    movdqa %xmm3, 160(%rdi)
1741; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
1742; SSE2-NEXT:    pslld $31, %xmm0
1743; SSE2-NEXT:    psrad $31, %xmm0
1744; SSE2-NEXT:    movdqa %xmm0, 176(%rdi)
1745; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1746; SSE2-NEXT:    punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1747; SSE2-NEXT:    punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7]
1748; SSE2-NEXT:    pslld $31, %xmm8
1749; SSE2-NEXT:    psrad $31, %xmm8
1750; SSE2-NEXT:    movdqa %xmm8, 144(%rdi)
1751; SSE2-NEXT:    movdqa %xmm2, %xmm0
1752; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
1753; SSE2-NEXT:    pslld $31, %xmm2
1754; SSE2-NEXT:    psrad $31, %xmm2
1755; SSE2-NEXT:    movdqa %xmm2, 96(%rdi)
1756; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
1757; SSE2-NEXT:    pslld $31, %xmm0
1758; SSE2-NEXT:    psrad $31, %xmm0
1759; SSE2-NEXT:    movdqa %xmm0, 112(%rdi)
1760; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1761; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1762; SSE2-NEXT:    punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7]
1763; SSE2-NEXT:    pslld $31, %xmm7
1764; SSE2-NEXT:    psrad $31, %xmm7
1765; SSE2-NEXT:    movdqa %xmm7, 80(%rdi)
1766; SSE2-NEXT:    movdqa %xmm1, %xmm0
1767; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
1768; SSE2-NEXT:    pslld $31, %xmm1
1769; SSE2-NEXT:    psrad $31, %xmm1
1770; SSE2-NEXT:    movdqa %xmm1, 32(%rdi)
1771; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
1772; SSE2-NEXT:    pslld $31, %xmm0
1773; SSE2-NEXT:    psrad $31, %xmm0
1774; SSE2-NEXT:    movdqa %xmm0, 48(%rdi)
1775; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1776; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7]
1777; SSE2-NEXT:    pslld $31, %xmm6
1778; SSE2-NEXT:    psrad $31, %xmm6
1779; SSE2-NEXT:    movdqa %xmm6, 16(%rdi)
1780; SSE2-NEXT:    retq
1781;
1782; SSSE3-LABEL: umulo_v64i8:
1783; SSSE3:       # %bb.0:
1784; SSSE3-NEXT:    movq %rdi, %rax
1785; SSSE3-NEXT:    pxor %xmm10, %xmm10
1786; SSSE3-NEXT:    movdqa %xmm4, %xmm8
1787; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm10[8],xmm8[9],xmm10[9],xmm8[10],xmm10[10],xmm8[11],xmm10[11],xmm8[12],xmm10[12],xmm8[13],xmm10[13],xmm8[14],xmm10[14],xmm8[15],xmm10[15]
1788; SSSE3-NEXT:    movdqa %xmm0, %xmm11
1789; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15]
1790; SSSE3-NEXT:    pmullw %xmm8, %xmm11
1791; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
1792; SSSE3-NEXT:    movdqa %xmm11, %xmm9
1793; SSSE3-NEXT:    pand %xmm8, %xmm9
1794; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7]
1795; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
1796; SSSE3-NEXT:    pmullw %xmm4, %xmm0
1797; SSSE3-NEXT:    movdqa %xmm0, %xmm4
1798; SSSE3-NEXT:    pand %xmm8, %xmm4
1799; SSSE3-NEXT:    packuswb %xmm9, %xmm4
1800; SSSE3-NEXT:    movdqa %xmm5, %xmm9
1801; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15]
1802; SSSE3-NEXT:    movdqa %xmm1, %xmm12
1803; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15]
1804; SSSE3-NEXT:    pmullw %xmm9, %xmm12
1805; SSSE3-NEXT:    movdqa %xmm12, %xmm9
1806; SSSE3-NEXT:    pand %xmm8, %xmm9
1807; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3],xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7]
1808; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7]
1809; SSSE3-NEXT:    pmullw %xmm5, %xmm1
1810; SSSE3-NEXT:    movdqa %xmm1, %xmm5
1811; SSSE3-NEXT:    pand %xmm8, %xmm5
1812; SSSE3-NEXT:    packuswb %xmm9, %xmm5
1813; SSSE3-NEXT:    movdqa %xmm6, %xmm9
1814; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15]
1815; SSSE3-NEXT:    movdqa %xmm2, %xmm13
1816; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm10[8],xmm13[9],xmm10[9],xmm13[10],xmm10[10],xmm13[11],xmm10[11],xmm13[12],xmm10[12],xmm13[13],xmm10[13],xmm13[14],xmm10[14],xmm13[15],xmm10[15]
1817; SSSE3-NEXT:    pmullw %xmm9, %xmm13
1818; SSSE3-NEXT:    movdqa %xmm13, %xmm14
1819; SSSE3-NEXT:    pand %xmm8, %xmm14
1820; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3],xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7]
1821; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7]
1822; SSSE3-NEXT:    pmullw %xmm6, %xmm2
1823; SSSE3-NEXT:    movdqa %xmm2, %xmm9
1824; SSSE3-NEXT:    pand %xmm8, %xmm9
1825; SSSE3-NEXT:    packuswb %xmm14, %xmm9
1826; SSSE3-NEXT:    movdqa %xmm7, %xmm6
1827; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm10[8],xmm6[9],xmm10[9],xmm6[10],xmm10[10],xmm6[11],xmm10[11],xmm6[12],xmm10[12],xmm6[13],xmm10[13],xmm6[14],xmm10[14],xmm6[15],xmm10[15]
1828; SSSE3-NEXT:    movdqa %xmm3, %xmm14
1829; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm10[8],xmm14[9],xmm10[9],xmm14[10],xmm10[10],xmm14[11],xmm10[11],xmm14[12],xmm10[12],xmm14[13],xmm10[13],xmm14[14],xmm10[14],xmm14[15],xmm10[15]
1830; SSSE3-NEXT:    pmullw %xmm6, %xmm14
1831; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
1832; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3],xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7]
1833; SSSE3-NEXT:    pmullw %xmm7, %xmm3
1834; SSSE3-NEXT:    movdqa %xmm14, %xmm6
1835; SSSE3-NEXT:    pand %xmm8, %xmm6
1836; SSSE3-NEXT:    pand %xmm3, %xmm8
1837; SSSE3-NEXT:    packuswb %xmm6, %xmm8
1838; SSSE3-NEXT:    psrlw $8, %xmm14
1839; SSSE3-NEXT:    psrlw $8, %xmm3
1840; SSSE3-NEXT:    packuswb %xmm14, %xmm3
1841; SSSE3-NEXT:    psrlw $8, %xmm13
1842; SSSE3-NEXT:    psrlw $8, %xmm2
1843; SSSE3-NEXT:    packuswb %xmm13, %xmm2
1844; SSSE3-NEXT:    psrlw $8, %xmm12
1845; SSSE3-NEXT:    psrlw $8, %xmm1
1846; SSSE3-NEXT:    packuswb %xmm12, %xmm1
1847; SSSE3-NEXT:    psrlw $8, %xmm11
1848; SSSE3-NEXT:    psrlw $8, %xmm0
1849; SSSE3-NEXT:    packuswb %xmm11, %xmm0
1850; SSSE3-NEXT:    pcmpeqb %xmm10, %xmm3
1851; SSSE3-NEXT:    pcmpeqb %xmm10, %xmm2
1852; SSSE3-NEXT:    pcmpeqb %xmm10, %xmm1
1853; SSSE3-NEXT:    pcmpeqb %xmm10, %xmm0
1854; SSSE3-NEXT:    pcmpeqd %xmm6, %xmm6
1855; SSSE3-NEXT:    pxor %xmm6, %xmm3
1856; SSSE3-NEXT:    pxor %xmm6, %xmm2
1857; SSSE3-NEXT:    pxor %xmm6, %xmm1
1858; SSSE3-NEXT:    pxor %xmm6, %xmm0
1859; SSSE3-NEXT:    movdqa %xmm0, %xmm6
1860; SSSE3-NEXT:    movdqa %xmm8, 48(%rsi)
1861; SSSE3-NEXT:    movdqa %xmm1, %xmm7
1862; SSSE3-NEXT:    movdqa %xmm9, 32(%rsi)
1863; SSSE3-NEXT:    movdqa %xmm2, %xmm8
1864; SSSE3-NEXT:    movdqa %xmm5, 16(%rsi)
1865; SSSE3-NEXT:    movdqa %xmm3, %xmm5
1866; SSSE3-NEXT:    movdqa %xmm4, (%rsi)
1867; SSSE3-NEXT:    movdqa %xmm3, %xmm4
1868; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1869; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3]
1870; SSSE3-NEXT:    movdqa %xmm3, 192(%rdi)
1871; SSSE3-NEXT:    movdqa %xmm2, %xmm3
1872; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1873; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
1874; SSSE3-NEXT:    movdqa %xmm2, 128(%rdi)
1875; SSSE3-NEXT:    movdqa %xmm1, %xmm2
1876; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1877; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
1878; SSSE3-NEXT:    movdqa %xmm1, 64(%rdi)
1879; SSSE3-NEXT:    movdqa %xmm0, %xmm1
1880; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1881; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1882; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1883; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
1884; SSSE3-NEXT:    movdqa %xmm4, %xmm0
1885; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3]
1886; SSSE3-NEXT:    pslld $31, %xmm4
1887; SSSE3-NEXT:    psrad $31, %xmm4
1888; SSSE3-NEXT:    movdqa %xmm4, 224(%rdi)
1889; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
1890; SSSE3-NEXT:    pslld $31, %xmm0
1891; SSSE3-NEXT:    psrad $31, %xmm0
1892; SSSE3-NEXT:    movdqa %xmm0, 240(%rdi)
1893; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1894; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1895; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
1896; SSSE3-NEXT:    pslld $31, %xmm5
1897; SSSE3-NEXT:    psrad $31, %xmm5
1898; SSSE3-NEXT:    movdqa %xmm5, 208(%rdi)
1899; SSSE3-NEXT:    movdqa %xmm3, %xmm0
1900; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3]
1901; SSSE3-NEXT:    pslld $31, %xmm3
1902; SSSE3-NEXT:    psrad $31, %xmm3
1903; SSSE3-NEXT:    movdqa %xmm3, 160(%rdi)
1904; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
1905; SSSE3-NEXT:    pslld $31, %xmm0
1906; SSSE3-NEXT:    psrad $31, %xmm0
1907; SSSE3-NEXT:    movdqa %xmm0, 176(%rdi)
1908; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1909; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1910; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7]
1911; SSSE3-NEXT:    pslld $31, %xmm8
1912; SSSE3-NEXT:    psrad $31, %xmm8
1913; SSSE3-NEXT:    movdqa %xmm8, 144(%rdi)
1914; SSSE3-NEXT:    movdqa %xmm2, %xmm0
1915; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
1916; SSSE3-NEXT:    pslld $31, %xmm2
1917; SSSE3-NEXT:    psrad $31, %xmm2
1918; SSSE3-NEXT:    movdqa %xmm2, 96(%rdi)
1919; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
1920; SSSE3-NEXT:    pslld $31, %xmm0
1921; SSSE3-NEXT:    psrad $31, %xmm0
1922; SSSE3-NEXT:    movdqa %xmm0, 112(%rdi)
1923; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1924; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1925; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7]
1926; SSSE3-NEXT:    pslld $31, %xmm7
1927; SSSE3-NEXT:    psrad $31, %xmm7
1928; SSSE3-NEXT:    movdqa %xmm7, 80(%rdi)
1929; SSSE3-NEXT:    movdqa %xmm1, %xmm0
1930; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
1931; SSSE3-NEXT:    pslld $31, %xmm1
1932; SSSE3-NEXT:    psrad $31, %xmm1
1933; SSSE3-NEXT:    movdqa %xmm1, 32(%rdi)
1934; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
1935; SSSE3-NEXT:    pslld $31, %xmm0
1936; SSSE3-NEXT:    psrad $31, %xmm0
1937; SSSE3-NEXT:    movdqa %xmm0, 48(%rdi)
1938; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1939; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7]
1940; SSSE3-NEXT:    pslld $31, %xmm6
1941; SSSE3-NEXT:    psrad $31, %xmm6
1942; SSSE3-NEXT:    movdqa %xmm6, 16(%rdi)
1943; SSSE3-NEXT:    retq
1944;
1945; SSE41-LABEL: umulo_v64i8:
1946; SSE41:       # %bb.0:
1947; SSE41-NEXT:    movq %rdi, %rax
1948; SSE41-NEXT:    pxor %xmm13, %xmm13
1949; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm10 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
1950; SSE41-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm13[8],xmm4[9],xmm13[9],xmm4[10],xmm13[10],xmm4[11],xmm13[11],xmm4[12],xmm13[12],xmm4[13],xmm13[13],xmm4[14],xmm13[14],xmm4[15],xmm13[15]
1951; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm8 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1952; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm13[8],xmm0[9],xmm13[9],xmm0[10],xmm13[10],xmm0[11],xmm13[11],xmm0[12],xmm13[12],xmm0[13],xmm13[13],xmm0[14],xmm13[14],xmm0[15],xmm13[15]
1953; SSE41-NEXT:    pmullw %xmm4, %xmm0
1954; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255]
1955; SSE41-NEXT:    movdqa %xmm0, %xmm4
1956; SSE41-NEXT:    pand %xmm9, %xmm4
1957; SSE41-NEXT:    pmullw %xmm10, %xmm8
1958; SSE41-NEXT:    movdqa %xmm8, %xmm10
1959; SSE41-NEXT:    pand %xmm9, %xmm10
1960; SSE41-NEXT:    packuswb %xmm4, %xmm10
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm11 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm13[8],xmm5[9],xmm13[9],xmm5[10],xmm13[10],xmm5[11],xmm13[11],xmm5[12],xmm13[12],xmm5[13],xmm13[13],xmm5[14],xmm13[14],xmm5[15],xmm13[15]
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm13[8],xmm1[9],xmm13[9],xmm1[10],xmm13[10],xmm1[11],xmm13[11],xmm1[12],xmm13[12],xmm1[13],xmm13[13],xmm1[14],xmm13[14],xmm1[15],xmm13[15]
; SSE41-NEXT:    pmullw %xmm5, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm5
; SSE41-NEXT:    pand %xmm9, %xmm5
; SSE41-NEXT:    pmullw %xmm11, %xmm4
; SSE41-NEXT:    movdqa %xmm4, %xmm11
; SSE41-NEXT:    pand %xmm9, %xmm11
; SSE41-NEXT:    packuswb %xmm5, %xmm11
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm12 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm13[8],xmm6[9],xmm13[9],xmm6[10],xmm13[10],xmm6[11],xmm13[11],xmm6[12],xmm13[12],xmm6[13],xmm13[13],xmm6[14],xmm13[14],xmm6[15],xmm13[15]
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm13[8],xmm2[9],xmm13[9],xmm2[10],xmm13[10],xmm2[11],xmm13[11],xmm2[12],xmm13[12],xmm2[13],xmm13[13],xmm2[14],xmm13[14],xmm2[15],xmm13[15]
; SSE41-NEXT:    pmullw %xmm6, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm6
; SSE41-NEXT:    pand %xmm9, %xmm6
; SSE41-NEXT:    pmullw %xmm12, %xmm5
; SSE41-NEXT:    movdqa %xmm5, %xmm12
; SSE41-NEXT:    pand %xmm9, %xmm12
; SSE41-NEXT:    packuswb %xmm6, %xmm12
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm14 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm13[8],xmm7[9],xmm13[9],xmm7[10],xmm13[10],xmm7[11],xmm13[11],xmm7[12],xmm13[12],xmm7[13],xmm13[13],xmm7[14],xmm13[14],xmm7[15],xmm13[15]
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm6 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm13[8],xmm3[9],xmm13[9],xmm3[10],xmm13[10],xmm3[11],xmm13[11],xmm3[12],xmm13[12],xmm3[13],xmm13[13],xmm3[14],xmm13[14],xmm3[15],xmm13[15]
; SSE41-NEXT:    pmullw %xmm7, %xmm3
; SSE41-NEXT:    pmullw %xmm14, %xmm6
; SSE41-NEXT:    movdqa %xmm3, %xmm7
; SSE41-NEXT:    pand %xmm9, %xmm7
; SSE41-NEXT:    pand %xmm6, %xmm9
; SSE41-NEXT:    packuswb %xmm7, %xmm9
; SSE41-NEXT:    psrlw $8, %xmm3
; SSE41-NEXT:    psrlw $8, %xmm6
; SSE41-NEXT:    packuswb %xmm3, %xmm6
; SSE41-NEXT:    psrlw $8, %xmm2
; SSE41-NEXT:    psrlw $8, %xmm5
; SSE41-NEXT:    packuswb %xmm2, %xmm5
; SSE41-NEXT:    psrlw $8, %xmm1
; SSE41-NEXT:    psrlw $8, %xmm4
; SSE41-NEXT:    packuswb %xmm1, %xmm4
; SSE41-NEXT:    psrlw $8, %xmm0
; SSE41-NEXT:    psrlw $8, %xmm8
; SSE41-NEXT:    packuswb %xmm0, %xmm8
; SSE41-NEXT:    pcmpeqb %xmm13, %xmm6
; SSE41-NEXT:    pcmpeqb %xmm13, %xmm5
; SSE41-NEXT:    pcmpeqb %xmm13, %xmm4
; SSE41-NEXT:    pcmpeqb %xmm13, %xmm8
; SSE41-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE41-NEXT:    pxor %xmm0, %xmm6
; SSE41-NEXT:    pxor %xmm0, %xmm5
; SSE41-NEXT:    pxor %xmm0, %xmm4
; SSE41-NEXT:    pxor %xmm0, %xmm8
; SSE41-NEXT:    movdqa %xmm9, 48(%rsi)
; SSE41-NEXT:    movdqa %xmm12, 32(%rsi)
; SSE41-NEXT:    movdqa %xmm11, 16(%rsi)
; SSE41-NEXT:    movdqa %xmm10, (%rsi)
; SSE41-NEXT:    pmovsxbd %xmm6, %xmm0
; SSE41-NEXT:    movdqa %xmm0, 192(%rdi)
; SSE41-NEXT:    pmovsxbd %xmm5, %xmm0
; SSE41-NEXT:    movdqa %xmm0, 128(%rdi)
; SSE41-NEXT:    pmovsxbd %xmm4, %xmm0
; SSE41-NEXT:    movdqa %xmm0, 64(%rdi)
; SSE41-NEXT:    pmovsxbd %xmm8, %xmm0
; SSE41-NEXT:    movdqa %xmm0, (%rdi)
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3]
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE41-NEXT:    pslld $31, %xmm0
; SSE41-NEXT:    psrad $31, %xmm0
; SSE41-NEXT:    movdqa %xmm0, 224(%rdi)
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3]
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE41-NEXT:    pslld $31, %xmm0
; SSE41-NEXT:    psrad $31, %xmm0
; SSE41-NEXT:    movdqa %xmm0, 240(%rdi)
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1]
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE41-NEXT:    pslld $31, %xmm0
; SSE41-NEXT:    psrad $31, %xmm0
; SSE41-NEXT:    movdqa %xmm0, 208(%rdi)
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3]
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE41-NEXT:    pslld $31, %xmm0
; SSE41-NEXT:    psrad $31, %xmm0
; SSE41-NEXT:    movdqa %xmm0, 160(%rdi)
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3]
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE41-NEXT:    pslld $31, %xmm0
; SSE41-NEXT:    psrad $31, %xmm0
; SSE41-NEXT:    movdqa %xmm0, 176(%rdi)
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1]
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE41-NEXT:    pslld $31, %xmm0
; SSE41-NEXT:    psrad $31, %xmm0
; SSE41-NEXT:    movdqa %xmm0, 144(%rdi)
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE41-NEXT:    pslld $31, %xmm0
; SSE41-NEXT:    psrad $31, %xmm0
; SSE41-NEXT:    movdqa %xmm0, 96(%rdi)
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3]
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE41-NEXT:    pslld $31, %xmm0
; SSE41-NEXT:    psrad $31, %xmm0
; SSE41-NEXT:    movdqa %xmm0, 112(%rdi)
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1]
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE41-NEXT:    pslld $31, %xmm0
; SSE41-NEXT:    psrad $31, %xmm0
; SSE41-NEXT:    movdqa %xmm0, 80(%rdi)
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3]
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE41-NEXT:    pslld $31, %xmm0
; SSE41-NEXT:    psrad $31, %xmm0
; SSE41-NEXT:    movdqa %xmm0, 32(%rdi)
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm8[3,3,3,3]
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE41-NEXT:    pslld $31, %xmm0
; SSE41-NEXT:    psrad $31, %xmm0
; SSE41-NEXT:    movdqa %xmm0, 48(%rdi)
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1]
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE41-NEXT:    pslld $31, %xmm0
; SSE41-NEXT:    psrad $31, %xmm0
; SSE41-NEXT:    movdqa %xmm0, 16(%rdi)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: umulo_v64i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    movq %rdi, %rax
; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15]
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15]
; AVX1-NEXT:    vpmullw %xmm4, %xmm6, %xmm6
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT:    vpand %xmm7, %xmm6, %xmm4
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm8 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm9 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT:    vpmullw %xmm8, %xmm9, %xmm8
; AVX1-NEXT:    vpand %xmm7, %xmm8, %xmm9
; AVX1-NEXT:    vpackuswb %xmm4, %xmm9, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15]
; AVX1-NEXT:    vpmullw %xmm9, %xmm10, %xmm9
; AVX1-NEXT:    vpand %xmm7, %xmm9, %xmm11
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm10
; AVX1-NEXT:    vpand %xmm7, %xmm10, %xmm0
; AVX1-NEXT:    vpackuswb %xmm11, %xmm0, %xmm0
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15]
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm11 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15]
; AVX1-NEXT:    vpmullw %xmm2, %xmm11, %xmm11
; AVX1-NEXT:    vpand %xmm7, %xmm11, %xmm2
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm12 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm13 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT:    vpmullw %xmm12, %xmm13, %xmm12
; AVX1-NEXT:    vpand %xmm7, %xmm12, %xmm13
; AVX1-NEXT:    vpackuswb %xmm2, %xmm13, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm13 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15]
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm14 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15]
; AVX1-NEXT:    vpmullw %xmm13, %xmm14, %xmm13
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT:    vpmullw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm7, %xmm13, %xmm3
; AVX1-NEXT:    vpand %xmm7, %xmm1, %xmm7
; AVX1-NEXT:    vpackuswb %xmm3, %xmm7, %xmm7
; AVX1-NEXT:    vpsrlw $8, %xmm13, %xmm3
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpackuswb %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm11, %xmm3
; AVX1-NEXT:    vpsrlw $8, %xmm12, %xmm11
; AVX1-NEXT:    vpackuswb %xmm3, %xmm11, %xmm3
; AVX1-NEXT:    vpsrlw $8, %xmm9, %xmm9
; AVX1-NEXT:    vpsrlw $8, %xmm10, %xmm10
; AVX1-NEXT:    vpackuswb %xmm9, %xmm10, %xmm9
; AVX1-NEXT:    vpsrlw $8, %xmm6, %xmm6
; AVX1-NEXT:    vpsrlw $8, %xmm8, %xmm8
; AVX1-NEXT:    vpackuswb %xmm6, %xmm8, %xmm6
; AVX1-NEXT:    vpcmpeqb %xmm5, %xmm1, %xmm1
; AVX1-NEXT:    vpcmpeqb %xmm5, %xmm3, %xmm3
; AVX1-NEXT:    vpcmpeqb %xmm5, %xmm9, %xmm8
; AVX1-NEXT:    vpcmpeqb %xmm5, %xmm6, %xmm9
; AVX1-NEXT:    vpcmpeqd %xmm10, %xmm10, %xmm10
; AVX1-NEXT:    vpxor %xmm1, %xmm10, %xmm6
; AVX1-NEXT:    vpxor %xmm3, %xmm10, %xmm5
; AVX1-NEXT:    vpxor %xmm10, %xmm8, %xmm3
; AVX1-NEXT:    vpxor %xmm10, %xmm9, %xmm1
; AVX1-NEXT:    vmovdqa %xmm7, 48(%rsi)
; AVX1-NEXT:    vmovdqa %xmm2, 32(%rsi)
; AVX1-NEXT:    vmovdqa %xmm0, 16(%rsi)
; AVX1-NEXT:    vmovdqa %xmm4, (%rsi)
; AVX1-NEXT:    vpmovsxbd %xmm6, %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, 192(%rdi)
; AVX1-NEXT:    vpmovsxbd %xmm5, %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, 128(%rdi)
; AVX1-NEXT:    vpmovsxbd %xmm3, %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, 64(%rdi)
; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, (%rdi)
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm6[2,3,2,3]
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, 224(%rdi)
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm6[3,3,3,3]
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, 240(%rdi)
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm6[1,1,1,1]
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, 208(%rdi)
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm5[2,3,2,3]
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, 160(%rdi)
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm5[3,3,3,3]
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, 176(%rdi)
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm5[1,1,1,1]
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, 144(%rdi)
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, 96(%rdi)
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm3[3,3,3,3]
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, 112(%rdi)
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm3[1,1,1,1]
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, 80(%rdi)
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, 32(%rdi)
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, 48(%rdi)
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, 16(%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: umulo_v64i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    movq %rdi, %rax
; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15],ymm2[24],ymm4[24],ymm2[25],ymm4[25],ymm2[26],ymm4[26],ymm2[27],ymm4[27],ymm2[28],ymm4[28],ymm2[29],ymm4[29],ymm2[30],ymm4[30],ymm2[31],ymm4[31]
; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm6 = ymm0[8],ymm4[8],ymm0[9],ymm4[9],ymm0[10],ymm4[10],ymm0[11],ymm4[11],ymm0[12],ymm4[12],ymm0[13],ymm4[13],ymm0[14],ymm4[14],ymm0[15],ymm4[15],ymm0[24],ymm4[24],ymm0[25],ymm4[25],ymm0[26],ymm4[26],ymm0[27],ymm4[27],ymm0[28],ymm4[28],ymm0[29],ymm4[29],ymm0[30],ymm4[30],ymm0[31],ymm4[31]
; AVX2-NEXT:    vpmullw %ymm5, %ymm6, %ymm5
; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT:    vpand %ymm6, %ymm5, %ymm7
; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[16],ymm4[16],ymm2[17],ymm4[17],ymm2[18],ymm4[18],ymm2[19],ymm4[19],ymm2[20],ymm4[20],ymm2[21],ymm4[21],ymm2[22],ymm4[22],ymm2[23],ymm4[23]
; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[4],ymm4[4],ymm0[5],ymm4[5],ymm0[6],ymm4[6],ymm0[7],ymm4[7],ymm0[16],ymm4[16],ymm0[17],ymm4[17],ymm0[18],ymm4[18],ymm0[19],ymm4[19],ymm0[20],ymm4[20],ymm0[21],ymm4[21],ymm0[22],ymm4[22],ymm0[23],ymm4[23]
; AVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm2
; AVX2-NEXT:    vpand %ymm6, %ymm2, %ymm0
; AVX2-NEXT:    vpackuswb %ymm7, %ymm0, %ymm0
; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm7 = ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15],ymm3[24],ymm4[24],ymm3[25],ymm4[25],ymm3[26],ymm4[26],ymm3[27],ymm4[27],ymm3[28],ymm4[28],ymm3[29],ymm4[29],ymm3[30],ymm4[30],ymm3[31],ymm4[31]
; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm8 = ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11],ymm1[12],ymm4[12],ymm1[13],ymm4[13],ymm1[14],ymm4[14],ymm1[15],ymm4[15],ymm1[24],ymm4[24],ymm1[25],ymm4[25],ymm1[26],ymm4[26],ymm1[27],ymm4[27],ymm1[28],ymm4[28],ymm1[29],ymm4[29],ymm1[30],ymm4[30],ymm1[31],ymm4[31]
; AVX2-NEXT:    vpmullw %ymm7, %ymm8, %ymm7
; AVX2-NEXT:    vpand %ymm6, %ymm7, %ymm8
; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[16],ymm4[16],ymm3[17],ymm4[17],ymm3[18],ymm4[18],ymm3[19],ymm4[19],ymm3[20],ymm4[20],ymm3[21],ymm4[21],ymm3[22],ymm4[22],ymm3[23],ymm4[23]
; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[4],ymm4[4],ymm1[5],ymm4[5],ymm1[6],ymm4[6],ymm1[7],ymm4[7],ymm1[16],ymm4[16],ymm1[17],ymm4[17],ymm1[18],ymm4[18],ymm1[19],ymm4[19],ymm1[20],ymm4[20],ymm1[21],ymm4[21],ymm1[22],ymm4[22],ymm1[23],ymm4[23]
; AVX2-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm6, %ymm1, %ymm3
; AVX2-NEXT:    vpackuswb %ymm8, %ymm3, %ymm3
; AVX2-NEXT:    vpsrlw $8, %ymm7, %ymm6
; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT:    vpackuswb %ymm6, %ymm1, %ymm1
; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm1
; AVX2-NEXT:    vpcmpeqd %ymm6, %ymm6, %ymm6
; AVX2-NEXT:    vpxor %ymm6, %ymm1, %ymm1
; AVX2-NEXT:    vpsrlw $8, %ymm5, %ymm5
; AVX2-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX2-NEXT:    vpackuswb %ymm5, %ymm2, %ymm2
; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm2, %ymm2
; AVX2-NEXT:    vpxor %ymm6, %ymm2, %ymm2
; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[2,3,2,3]
; AVX2-NEXT:    vpmovsxbd %xmm4, %ymm4
; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm5
; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3]
; AVX2-NEXT:    vpmovsxbd %xmm6, %ymm6
; AVX2-NEXT:    vpshufd {{.*#+}} xmm7 = xmm1[2,3,2,3]
; AVX2-NEXT:    vpmovsxbd %xmm7, %ymm7
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm8
; AVX2-NEXT:    vpshufd {{.*#+}} xmm9 = xmm8[2,3,2,3]
; AVX2-NEXT:    vpmovsxbd %xmm9, %ymm9
; AVX2-NEXT:    vpmovsxbd %xmm2, %ymm2
; AVX2-NEXT:    vpmovsxbd %xmm5, %ymm5
; AVX2-NEXT:    vpmovsxbd %xmm1, %ymm1
; AVX2-NEXT:    vpmovsxbd %xmm8, %ymm8
; AVX2-NEXT:    vmovdqa %ymm3, 32(%rsi)
; AVX2-NEXT:    vmovdqa %ymm0, (%rsi)
; AVX2-NEXT:    vmovdqa %ymm8, 192(%rdi)
; AVX2-NEXT:    vmovdqa %ymm1, 128(%rdi)
; AVX2-NEXT:    vmovdqa %ymm5, 64(%rdi)
; AVX2-NEXT:    vmovdqa %ymm2, (%rdi)
; AVX2-NEXT:    vmovdqa %ymm9, 224(%rdi)
; AVX2-NEXT:    vmovdqa %ymm7, 160(%rdi)
; AVX2-NEXT:    vmovdqa %ymm6, 96(%rdi)
; AVX2-NEXT:    vmovdqa %ymm4, 32(%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: umulo_v64i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
; AVX512F-NEXT:    vextracti128 $1, %ymm2, %xmm3
; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm5
; AVX512F-NEXT:    vextracti128 $1, %ymm5, %xmm4
; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
; AVX512F-NEXT:    vpmullw %ymm3, %ymm4, %ymm4
; AVX512F-NEXT:    vpsrlw $8, %ymm4, %ymm3
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
; AVX512F-NEXT:    vptestmd %zmm3, %zmm3, %k1
; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
; AVX512F-NEXT:    vpmullw %ymm2, %ymm3, %ymm5
; AVX512F-NEXT:    vpsrlw $8, %ymm5, %ymm2
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512F-NEXT:    vptestmd %zmm2, %zmm2, %k2
; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm3
; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
; AVX512F-NEXT:    vpmullw %ymm2, %ymm3, %ymm6
; AVX512F-NEXT:    vpsrlw $8, %ymm6, %ymm2
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512F-NEXT:    vptestmd %zmm2, %zmm2, %k3
; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512F-NEXT:    vpmullw %ymm1, %ymm0, %ymm7
; AVX512F-NEXT:    vpsrlw $8, %ymm7, %ymm0
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k4
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 {%k4} {z} = -1
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm1 {%k3} {z} = -1
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm2 {%k2} {z} = -1
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
; AVX512F-NEXT:    vpmovdb %zmm4, 48(%rdi)
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
; AVX512F-NEXT:    vpmovdb %zmm4, 32(%rdi)
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm6[0],zero,ymm6[1],zero,ymm6[2],zero,ymm6[3],zero,ymm6[4],zero,ymm6[5],zero,ymm6[6],zero,ymm6[7],zero,ymm6[8],zero,ymm6[9],zero,ymm6[10],zero,ymm6[11],zero,ymm6[12],zero,ymm6[13],zero,ymm6[14],zero,ymm6[15],zero
; AVX512F-NEXT:    vpmovdb %zmm4, 16(%rdi)
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm7[0],zero,ymm7[1],zero,ymm7[2],zero,ymm7[3],zero,ymm7[4],zero,ymm7[5],zero,ymm7[6],zero,ymm7[7],zero,ymm7[8],zero,ymm7[9],zero,ymm7[10],zero,ymm7[11],zero,ymm7[12],zero,ymm7[13],zero,ymm7[14],zero,ymm7[15],zero
; AVX512F-NEXT:    vpmovdb %zmm4, (%rdi)
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: umulo_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm4 = zmm0[8],zmm2[8],zmm0[9],zmm2[9],zmm0[10],zmm2[10],zmm0[11],zmm2[11],zmm0[12],zmm2[12],zmm0[13],zmm2[13],zmm0[14],zmm2[14],zmm0[15],zmm2[15],zmm0[24],zmm2[24],zmm0[25],zmm2[25],zmm0[26],zmm2[26],zmm0[27],zmm2[27],zmm0[28],zmm2[28],zmm0[29],zmm2[29],zmm0[30],zmm2[30],zmm0[31],zmm2[31],zmm0[40],zmm2[40],zmm0[41],zmm2[41],zmm0[42],zmm2[42],zmm0[43],zmm2[43],zmm0[44],zmm2[44],zmm0[45],zmm2[45],zmm0[46],zmm2[46],zmm0[47],zmm2[47],zmm0[56],zmm2[56],zmm0[57],zmm2[57],zmm0[58],zmm2[58],zmm0[59],zmm2[59],zmm0[60],zmm2[60],zmm0[61],zmm2[61],zmm0[62],zmm2[62],zmm0[63],zmm2[63]
; AVX512BW-NEXT:    vpmullw %zmm3, %zmm4, %zmm3
; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512BW-NEXT:    vpandq %zmm4, %zmm3, %zmm5
; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm2[0],zmm0[1],zmm2[1],zmm0[2],zmm2[2],zmm0[3],zmm2[3],zmm0[4],zmm2[4],zmm0[5],zmm2[5],zmm0[6],zmm2[6],zmm0[7],zmm2[7],zmm0[16],zmm2[16],zmm0[17],zmm2[17],zmm0[18],zmm2[18],zmm0[19],zmm2[19],zmm0[20],zmm2[20],zmm0[21],zmm2[21],zmm0[22],zmm2[22],zmm0[23],zmm2[23],zmm0[32],zmm2[32],zmm0[33],zmm2[33],zmm0[34],zmm2[34],zmm0[35],zmm2[35],zmm0[36],zmm2[36],zmm0[37],zmm2[37],zmm0[38],zmm2[38],zmm0[39],zmm2[39],zmm0[48],zmm2[48],zmm0[49],zmm2[49],zmm0[50],zmm2[50],zmm0[51],zmm2[51],zmm0[52],zmm2[52],zmm0[53],zmm2[53],zmm0[54],zmm2[54],zmm0[55],zmm2[55]
; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpandq %zmm4, %zmm0, %zmm1
; AVX512BW-NEXT:    vpackuswb %zmm5, %zmm1, %zmm4
; AVX512BW-NEXT:    vpsrlw $8, %zmm3, %zmm1
; AVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vptestmb %zmm0, %zmm0, %k1
; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512BW-NEXT:    kshiftrq $16, %k1, %k2
; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm1 {%k2} {z} = -1
; AVX512BW-NEXT:    kshiftrq $32, %k1, %k2
; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm2 {%k2} {z} = -1
; AVX512BW-NEXT:    kshiftrq $48, %k1, %k1
; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1
; AVX512BW-NEXT:    vmovdqa64 %zmm4, (%rdi)
; AVX512BW-NEXT:    retq
  %t = call {<64 x i8>, <64 x i1>} @llvm.umul.with.overflow.v64i8(<64 x i8> %a0, <64 x i8> %a1)
  %val = extractvalue {<64 x i8>, <64 x i1>} %t, 0
  %obit = extractvalue {<64 x i8>, <64 x i1>} %t, 1
  %res = sext <64 x i1> %obit to <64 x i32>
  store <64 x i8> %val, ptr %p2
  ret <64 x i32> %res
}
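; For <8 x i16> the overflow test maps directly onto vector hardware: pmullw
; produces the stored low half of each 16x16->32 product and pmulhuw the high
; half, and a lane overflows iff that high half is nonzero.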
define <8 x i32> @umulo_v8i16(<8 x i16> %a0, <8 x i16> %a1, ptr %p2) nounwind {
; SSE2-LABEL: umulo_v8i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pmullw %xmm1, %xmm2
; SSE2-NEXT:    pmulhuw %xmm0, %xmm1
; SSE2-NEXT:    pxor %xmm0, %xmm0
; SSE2-NEXT:    pcmpeqw %xmm0, %xmm1
; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE2-NEXT:    pxor %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pslld $31, %xmm1
; SSE2-NEXT:    psrad $31, %xmm1
; SSE2-NEXT:    movdqa %xmm2, (%rdi)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: umulo_v8i16:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pmullw %xmm1, %xmm2
; SSSE3-NEXT:    pmulhuw %xmm0, %xmm1
; SSSE3-NEXT:    pxor %xmm0, %xmm0
; SSSE3-NEXT:    pcmpeqw %xmm0, %xmm1
; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm0
; SSSE3-NEXT:    pxor %xmm0, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    pslld $31, %xmm1
; SSSE3-NEXT:    psrad $31, %xmm1
; SSSE3-NEXT:    movdqa %xmm2, (%rdi)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: umulo_v8i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    pmullw %xmm1, %xmm2
; SSE41-NEXT:    pmulhuw %xmm0, %xmm1
; SSE41-NEXT:    pxor %xmm0, %xmm0
; SSE41-NEXT:    pcmpeqw %xmm0, %xmm1
; SSE41-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE41-NEXT:    pxor %xmm0, %xmm1
; SSE41-NEXT:    pmovsxwd %xmm1, %xmm0
; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE41-NEXT:    pslld $31, %xmm1
; SSE41-NEXT:    psrad $31, %xmm1
; SSE41-NEXT:    movdqa %xmm2, (%rdi)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: umulo_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm1
; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    vmovdqa %xmm2, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: umulo_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT:    vmovdqa %xmm2, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: umulo_v8i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmullw %xmm1, %xmm0, %xmm2
; AVX512F-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
; AVX512F-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX512F-NEXT:    vptestmd %ymm0, %ymm0, %k1
; AVX512F-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512F-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512F-NEXT:    vmovdqa %xmm2, (%rdi)
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: umulo_v8i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmullw %xmm1, %xmm0, %xmm2
; AVX512BW-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vptestmw %xmm0, %xmm0, %k1
; AVX512BW-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512BW-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512BW-NEXT:    vmovdqa %xmm2, (%rdi)
; AVX512BW-NEXT:    retq
  %t = call {<8 x i16>, <8 x i1>} @llvm.umul.with.overflow.v8i16(<8 x i16> %a0, <8 x i16> %a1)
  %val = extractvalue {<8 x i16>, <8 x i1>} %t, 0
  %obit = extractvalue {<8 x i16>, <8 x i1>} %t, 1
  %res = sext <8 x i1> %obit to <8 x i32>
  store <8 x i16> %val, ptr %p2
  ret <8 x i32> %res
}
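; <2 x i64> has no vector unsigned multiply-high, so both lanes are
; scalarized through mulq; each lane's overflow flag is captured from OF
; (cmovoq of a -1 constant before AVX512, seto into a k-register with
; AVX512) and the products are repacked with punpcklqdq.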
define <2 x i32> @umulo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind {
; SSE2-LABEL: umulo_v2i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE2-NEXT:    movq %xmm2, %rcx
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE2-NEXT:    movq %xmm2, %rsi
; SSE2-NEXT:    movq %xmm0, %rax
; SSE2-NEXT:    movq %xmm1, %rdx
; SSE2-NEXT:    xorl %r8d, %r8d
; SSE2-NEXT:    mulq %rdx
; SSE2-NEXT:    movq $-1, %r9
; SSE2-NEXT:    movl $0, %r10d
; SSE2-NEXT:    cmovoq %r9, %r10
; SSE2-NEXT:    movq %rax, %xmm1
; SSE2-NEXT:    movq %rcx, %rax
; SSE2-NEXT:    mulq %rsi
; SSE2-NEXT:    movq %rax, %xmm0
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movq %r10, %xmm0
; SSE2-NEXT:    cmovoq %r9, %r8
; SSE2-NEXT:    movq %r8, %xmm2
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    movdqa %xmm1, (%rdi)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: umulo_v2i64:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSSE3-NEXT:    movq %xmm2, %rcx
; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSSE3-NEXT:    movq %xmm2, %rsi
; SSSE3-NEXT:    movq %xmm0, %rax
; SSSE3-NEXT:    movq %xmm1, %rdx
; SSSE3-NEXT:    xorl %r8d, %r8d
; SSSE3-NEXT:    mulq %rdx
; SSSE3-NEXT:    movq $-1, %r9
; SSSE3-NEXT:    movl $0, %r10d
; SSSE3-NEXT:    cmovoq %r9, %r10
; SSSE3-NEXT:    movq %rax, %xmm1
; SSSE3-NEXT:    movq %rcx, %rax
; SSSE3-NEXT:    mulq %rsi
; SSSE3-NEXT:    movq %rax, %xmm0
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSSE3-NEXT:    movq %r10, %xmm0
; SSSE3-NEXT:    cmovoq %r9, %r8
; SSSE3-NEXT:    movq %r8, %xmm2
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT:    movdqa %xmm1, (%rdi)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: umulo_v2i64:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movq %xmm0, %rcx
; SSE41-NEXT:    movq %xmm1, %rsi
; SSE41-NEXT:    pextrq $1, %xmm0, %rax
; SSE41-NEXT:    pextrq $1, %xmm1, %rdx
; SSE41-NEXT:    xorl %r8d, %r8d
; SSE41-NEXT:    mulq %rdx
; SSE41-NEXT:    movq $-1, %r9
; SSE41-NEXT:    movl $0, %r10d
; SSE41-NEXT:    cmovoq %r9, %r10
; SSE41-NEXT:    movq %rax, %xmm0
; SSE41-NEXT:    movq %rcx, %rax
; SSE41-NEXT:    mulq %rsi
; SSE41-NEXT:    movq %rax, %xmm1
; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE41-NEXT:    movq %r10, %xmm0
; SSE41-NEXT:    cmovoq %r9, %r8
; SSE41-NEXT:    movq %r8, %xmm2
; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE41-NEXT:    movdqa %xmm1, (%rdi)
; SSE41-NEXT:    retq
;
; AVX-LABEL: umulo_v2i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovq %xmm0, %rcx
; AVX-NEXT:    vmovq %xmm1, %rsi
; AVX-NEXT:    vpextrq $1, %xmm0, %rax
; AVX-NEXT:    vpextrq $1, %xmm1, %rdx
; AVX-NEXT:    xorl %r8d, %r8d
; AVX-NEXT:    mulq %rdx
; AVX-NEXT:    movq $-1, %r9
; AVX-NEXT:    movl $0, %r10d
; AVX-NEXT:    cmovoq %r9, %r10
; AVX-NEXT:    vmovq %rax, %xmm0
; AVX-NEXT:    movq %rcx, %rax
; AVX-NEXT:    mulq %rsi
; AVX-NEXT:    vmovq %rax, %xmm1
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; AVX-NEXT:    vmovq %r10, %xmm0
; AVX-NEXT:    cmovoq %r9, %r8
; AVX-NEXT:    vmovq %r8, %xmm2
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: umulo_v2i64:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovq %xmm0, %rcx
; AVX512F-NEXT:    vmovq %xmm1, %rsi
; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
; AVX512F-NEXT:    vpextrq $1, %xmm1, %rdx
; AVX512F-NEXT:    mulq %rdx
; AVX512F-NEXT:    seto %r8b
; AVX512F-NEXT:    vmovq %rax, %xmm0
; AVX512F-NEXT:    movq %rcx, %rax
; AVX512F-NEXT:    mulq %rsi
; AVX512F-NEXT:    vmovq %rax, %xmm1
; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; AVX512F-NEXT:    seto %al
; AVX512F-NEXT:    andl $1, %eax
; AVX512F-NEXT:    kmovw %eax, %k0
; AVX512F-NEXT:    kmovw %r8d, %k1
; AVX512F-NEXT:    kshiftlw $15, %k1, %k1
; AVX512F-NEXT:    kshiftrw $14, %k1, %k1
; AVX512F-NEXT:    korw %k1, %k0, %k1
; AVX512F-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512F-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512F-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: umulo_v2i64:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovq %xmm0, %rcx
; AVX512BW-NEXT:    vmovq %xmm1, %rsi
; AVX512BW-NEXT:    vpextrq $1, %xmm0, %rax
; AVX512BW-NEXT:    vpextrq $1, %xmm1, %rdx
; AVX512BW-NEXT:    mulq %rdx
; AVX512BW-NEXT:    seto %r8b
; AVX512BW-NEXT:    vmovq %rax, %xmm0
; AVX512BW-NEXT:    movq %rcx, %rax
; AVX512BW-NEXT:    mulq %rsi
; AVX512BW-NEXT:    vmovq %rax, %xmm1
; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; AVX512BW-NEXT:    seto %al
; AVX512BW-NEXT:    andl $1, %eax
; AVX512BW-NEXT:    kmovw %eax, %k0
; AVX512BW-NEXT:    kmovd %r8d, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $14, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k1
; AVX512BW-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512BW-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX512BW-NEXT:    retq
  %t = call {<2 x i64>, <2 x i1>} @llvm.umul.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1)
  %val = extractvalue {<2 x i64>, <2 x i1>} %t, 0
  %obit = extractvalue {<2 x i64>, <2 x i1>} %t, 1
  %res = sext <2 x i1> %obit to <2 x i32>
  store <2 x i64> %val, ptr %p2
  ret <2 x i32> %res
}
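; <4 x i24> is promoted to <4 x i32>: the inputs are masked to 24 bits and a
; lane overflows if either the high 32 bits of the 32x32 product (computed
; with pmuludq on the even/odd lanes) are nonzero or the low product has bits
; set above bit 23 (psrld $24). The 3-byte lanes are stored with movw/movb
; pairs at byte offsets 0, 3, 6 and 9.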
define <4 x i32> @umulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind {
; SSE2-LABEL: umulo_v4i24:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    pand %xmm0, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm0, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSE2-NEXT:    pxor %xmm4, %xmm4
; SSE2-NEXT:    pcmpeqd %xmm4, %xmm3
; SSE2-NEXT:    pcmpeqd %xmm5, %xmm5
; SSE2-NEXT:    pxor %xmm3, %xmm5
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT:    psrld $24, %xmm0
; SSE2-NEXT:    pcmpgtd %xmm4, %xmm0
; SSE2-NEXT:    por %xmm5, %xmm0
; SSE2-NEXT:    movd %xmm2, %eax
; SSE2-NEXT:    movw %ax, (%rdi)
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; SSE2-NEXT:    movd %xmm2, %ecx
; SSE2-NEXT:    movw %cx, 6(%rdi)
; SSE2-NEXT:    movd %xmm1, %edx
; SSE2-NEXT:    movw %dx, 3(%rdi)
; SSE2-NEXT:    shrl $16, %eax
; SSE2-NEXT:    movb %al, 2(%rdi)
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    movw %ax, 9(%rdi)
; SSE2-NEXT:    shrl $16, %ecx
; SSE2-NEXT:    movb %cl, 8(%rdi)
; SSE2-NEXT:    shrl $16, %edx
; SSE2-NEXT:    movb %dl, 5(%rdi)
; SSE2-NEXT:    shrl $16, %eax
; SSE2-NEXT:    movb %al, 11(%rdi)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: umulo_v4i24:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
; SSSE3-NEXT:    pand %xmm0, %xmm1
; SSSE3-NEXT:    pand %xmm0, %xmm2
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
; SSSE3-NEXT:    pmuludq %xmm1, %xmm2
; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSSE3-NEXT:    pmuludq %xmm0, %xmm1
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSSE3-NEXT:    pxor %xmm4, %xmm4
; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm3
; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm5
; SSSE3-NEXT:    pxor %xmm3, %xmm5
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSSE3-NEXT:    psrld $24, %xmm0
; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm0
; SSSE3-NEXT:    por %xmm5, %xmm0
; SSSE3-NEXT:    movd %xmm2, %eax
; SSSE3-NEXT:    movw %ax, (%rdi)
; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; SSSE3-NEXT:    movd %xmm2, %ecx
; SSSE3-NEXT:    movw %cx, 6(%rdi)
; SSSE3-NEXT:    movd %xmm1, %edx
; SSSE3-NEXT:    movw %dx, 3(%rdi)
; SSSE3-NEXT:    shrl $16, %eax
; SSSE3-NEXT:    movb %al, 2(%rdi)
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
; SSSE3-NEXT:    movd %xmm1, %eax
; SSSE3-NEXT:    movw %ax, 9(%rdi)
; SSSE3-NEXT:    shrl $16, %ecx
; SSSE3-NEXT:    movb %cl, 8(%rdi)
; SSSE3-NEXT:    shrl $16, %edx
; SSSE3-NEXT:    movb %dl, 5(%rdi)
; SSSE3-NEXT:    shrl $16, %eax
; SSSE3-NEXT:    movb %al, 11(%rdi)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: umulo_v4i24:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
; SSE41-NEXT:    pand %xmm2, %xmm0
; SSE41-NEXT:    pand %xmm2, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE41-NEXT:    pmuludq %xmm2, %xmm3
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    pmuludq %xmm1, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; SSE41-NEXT:    pxor %xmm3, %xmm3
; SSE41-NEXT:    pcmpeqd %xmm3, %xmm2
; SSE41-NEXT:    pcmpeqd %xmm4, %xmm4
; SSE41-NEXT:    pxor %xmm2, %xmm4
; SSE41-NEXT:    pmulld %xmm0, %xmm1
; SSE41-NEXT:    pextrd $3, %xmm1, %eax
; SSE41-NEXT:    pextrd $2, %xmm1, %ecx
; SSE41-NEXT:    pextrd $1, %xmm1, %edx
; SSE41-NEXT:    movd %xmm1, %esi
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    psrld $24, %xmm0
; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
; SSE41-NEXT:    por %xmm4, %xmm0
; SSE41-NEXT:    movw %ax, 9(%rdi)
; SSE41-NEXT:    movw %cx, 6(%rdi)
; SSE41-NEXT:    movw %dx, 3(%rdi)
; SSE41-NEXT:    movw %si, (%rdi)
; SSE41-NEXT:    shrl $16, %eax
; SSE41-NEXT:    movb %al, 11(%rdi)
; SSE41-NEXT:    shrl $16, %ecx
; SSE41-NEXT:    movb %cl, 8(%rdi)
; SSE41-NEXT:    shrl $16, %edx
; SSE41-NEXT:    movb %dl, 5(%rdi)
; SSE41-NEXT:    shrl $16, %esi
; SSE41-NEXT:    movb %sil, 2(%rdi)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: umulo_v4i24:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
; AVX1-NEXT:    vandps %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vandps %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-NEXT:    vshufps {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
; AVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpsrld $24, %xmm1, %xmm0
; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpextrd $3, %xmm1, %eax
; AVX1-NEXT:    movw %ax, 9(%rdi)
; AVX1-NEXT:    vpextrd $2, %xmm1, %ecx
; AVX1-NEXT:    movw %cx, 6(%rdi)
; AVX1-NEXT:    vpextrd $1, %xmm1, %edx
; AVX1-NEXT:    movw %dx, 3(%rdi)
; AVX1-NEXT:    vmovd %xmm1, %esi
; AVX1-NEXT:    movw %si, (%rdi)
; AVX1-NEXT:    shrl $16, %eax
; AVX1-NEXT:    movb %al, 11(%rdi)
; AVX1-NEXT:    shrl $16, %ecx
; AVX1-NEXT:    movb %cl, 8(%rdi)
; AVX1-NEXT:    shrl $16, %edx
; AVX1-NEXT:    movb %dl, 5(%rdi)
; AVX1-NEXT:    shrl $16, %esi
; AVX1-NEXT:    movb %sil, 2(%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: umulo_v4i24:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX2-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm3
; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3]
; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
; AVX2-NEXT:    vpxor %xmm4, %xmm2, %xmm2
; AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpsrld $24, %xmm1, %xmm0
; AVX2-NEXT:    vpcmpgtd %xmm3, %xmm0, %xmm0
; AVX2-NEXT:    vpor %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpextrd $3, %xmm1, %eax
; AVX2-NEXT:    movw %ax, 9(%rdi)
; AVX2-NEXT:    vpextrd $2, %xmm1, %ecx
; AVX2-NEXT:    movw %cx, 6(%rdi)
; AVX2-NEXT:    vpextrd $1, %xmm1, %edx
; AVX2-NEXT:    movw %dx, 3(%rdi)
; AVX2-NEXT:    vmovd %xmm1, %esi
; AVX2-NEXT:    movw %si, (%rdi)
; AVX2-NEXT:    shrl $16, %eax
; AVX2-NEXT:    movb %al, 11(%rdi)
; AVX2-NEXT:    shrl $16, %ecx
; AVX2-NEXT:    movb %cl, 8(%rdi)
; AVX2-NEXT:    shrl $16, %edx
; AVX2-NEXT:    movb %dl, 5(%rdi)
; AVX2-NEXT:    shrl $16, %esi
; AVX2-NEXT:    movb %sil, 2(%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: umulo_v4i24:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; AVX512-NEXT:    vpmuludq %xmm3, %xmm4, %xmm3
; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [1,5,3,7]
; AVX512-NEXT:    vpermi2d %xmm3, %xmm2, %xmm4
; AVX512-NEXT:    vpmulld %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpsrld $24, %xmm1, %xmm0
; AVX512-NEXT:    vpor %xmm4, %xmm0, %xmm0
; AVX512-NEXT:    vptestmd %xmm0, %xmm0, %k1
; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT:    vpextrd $3, %xmm1, %eax
; AVX512-NEXT:    movw %ax, 9(%rdi)
; AVX512-NEXT:    vpextrd $2, %xmm1, %ecx
; AVX512-NEXT:    movw %cx, 6(%rdi)
; AVX512-NEXT:    vpextrd $1, %xmm1, %edx
; AVX512-NEXT:    movw %dx, 3(%rdi)
; AVX512-NEXT:    vmovd %xmm1, %esi
; AVX512-NEXT:    movw %si, (%rdi)
; AVX512-NEXT:    shrl $16, %eax
; AVX512-NEXT:    movb %al, 11(%rdi)
; AVX512-NEXT:    shrl $16, %ecx
; AVX512-NEXT:    movb %cl, 8(%rdi)
; AVX512-NEXT:    shrl $16, %edx
; AVX512-NEXT:    movb %dl, 5(%rdi)
; AVX512-NEXT:    shrl $16, %esi
; AVX512-NEXT:    movb %sil, 2(%rdi)
; AVX512-NEXT:    retq
  %t = call {<4 x i24>, <4 x i1>} @llvm.umul.with.overflow.v4i24(<4 x i24> %a0, <4 x i24> %a1)
  %val = extractvalue {<4 x i24>, <4 x i1>} %t, 0
  %obit = extractvalue {<4 x i24>, <4 x i1>} %t, 1
  %res = sext <4 x i1> %obit to <4 x i32>
  store <4 x i24> %val, ptr %p2
  ret <4 x i32> %res
}
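; For <4 x i1> an unsigned 1-bit multiply can never overflow, so the whole op
; folds to an AND: the four result bits are packed with movmskps (or a
; vptestmd mask on AVX512) for the store, and the overflow vector is all-zero.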
define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
; SSE-LABEL: umulo_v4i1:
; SSE:       # %bb.0:
; SSE-NEXT:    pand %xmm1, %xmm0
; SSE-NEXT:    pslld $31, %xmm0
; SSE-NEXT:    movmskps %xmm0, %eax
; SSE-NEXT:    movb %al, (%rdi)
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: umulo_v4i1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX-NEXT:    vmovmskps %xmm0, %eax
; AVX-NEXT:    movb %al, (%rdi)
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: umulo_v4i1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX512F-NEXT:    vptestmd %xmm0, %xmm0, %k0
; AVX512F-NEXT:    kmovw %k0, %eax
; AVX512F-NEXT:    movb %al, (%rdi)
; AVX512F-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: umulo_v4i1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX512BW-NEXT:    vptestmd %xmm0, %xmm0, %k0
; AVX512BW-NEXT:    kmovd %k0, %eax
; AVX512BW-NEXT:    movb %al, (%rdi)
; AVX512BW-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT:    retq
  %t = call {<4 x i1>, <4 x i1>} @llvm.umul.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)
  %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0
  %obit = extractvalue {<4 x i1>, <4 x i1>} %t, 1
  %res = sext <4 x i1> %obit to <4 x i32>
  store <4 x i1> %val, ptr %p2
  ret <4 x i32> %res
}
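; <2 x i128> is expanded per lane into the schoolbook three-mulq sequence;
; overflow is the OR of the nonzero-high-word tests (testq/setne), the seto
; of each cross product, and the setb carry of the final high-half add.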
define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind {
; SSE2-LABEL: umulo_v2i128:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pushq %rbp
; SSE2-NEXT:    pushq %r15
; SSE2-NEXT:    pushq %r14
; SSE2-NEXT:    pushq %r12
; SSE2-NEXT:    pushq %rbx
; SSE2-NEXT:    movq %r9, %r11
; SSE2-NEXT:    movq %rcx, %r10
; SSE2-NEXT:    movq %rdx, %rcx
; SSE2-NEXT:    movq %rsi, %rax
; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r14
; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r9
; SSE2-NEXT:    testq %r11, %r11
; SSE2-NEXT:    setne %dl
; SSE2-NEXT:    testq %rsi, %rsi
; SSE2-NEXT:    setne %bpl
; SSE2-NEXT:    andb %dl, %bpl
; SSE2-NEXT:    mulq %r8
; SSE2-NEXT:    movq %rax, %rsi
; SSE2-NEXT:    seto %r15b
; SSE2-NEXT:    movq %r11, %rax
; SSE2-NEXT:    mulq %rdi
; SSE2-NEXT:    seto %r12b
; SSE2-NEXT:    orb %r15b, %r12b
; SSE2-NEXT:    orb %bpl, %r12b
; SSE2-NEXT:    leaq (%rsi,%rax), %r11
; SSE2-NEXT:    movq %rdi, %rax
; SSE2-NEXT:    mulq %r8
; SSE2-NEXT:    movq %rax, %rdi
; SSE2-NEXT:    movq %rdx, %rsi
; SSE2-NEXT:    addq %r11, %rsi
; SSE2-NEXT:    setb %r11b
; SSE2-NEXT:    orb %r12b, %r11b
; SSE2-NEXT:    testq %r9, %r9
; SSE2-NEXT:    setne %al
; SSE2-NEXT:    testq %r10, %r10
; SSE2-NEXT:    setne %bpl
; SSE2-NEXT:    andb %al, %bpl
; SSE2-NEXT:    movq %r10, %rax
; SSE2-NEXT:    mulq %r14
; SSE2-NEXT:    movq %rax, %r8
; SSE2-NEXT:    seto %r10b
; SSE2-NEXT:    movq %r9, %rax
; SSE2-NEXT:    mulq %rcx
; SSE2-NEXT:    seto %r9b
; SSE2-NEXT:    orb %r10b, %r9b
; SSE2-NEXT:    orb %bpl, %r9b
; SSE2-NEXT:    addq %rax, %r8
; SSE2-NEXT:    movq %rcx, %rax
; SSE2-NEXT:    mulq %r14
; SSE2-NEXT:    addq %r8, %rdx
; SSE2-NEXT:    setb %cl
; SSE2-NEXT:    orb %r9b, %cl
; SSE2-NEXT:    movzbl %cl, %ecx
; SSE2-NEXT:    negl %ecx
; SSE2-NEXT:    movd %ecx, %xmm1
; SSE2-NEXT:    movzbl %r11b, %ecx
; SSE2-NEXT:    negl %ecx
; SSE2-NEXT:    movd %ecx, %xmm0
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    movq %rax, 16(%rbx)
; SSE2-NEXT:    movq %rdi, (%rbx)
; SSE2-NEXT:    movq %rdx, 24(%rbx)
; SSE2-NEXT:    movq %rsi, 8(%rbx)
; SSE2-NEXT:    popq %rbx
; SSE2-NEXT:    popq %r12
; SSE2-NEXT:    popq %r14
; SSE2-NEXT:    popq %r15
; SSE2-NEXT:    popq %rbp
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: umulo_v2i128:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pushq %rbp
; SSSE3-NEXT:    pushq %r15
; SSSE3-NEXT:    pushq %r14
; SSSE3-NEXT:    pushq %r12
; SSSE3-NEXT:    pushq %rbx
; SSSE3-NEXT:    movq %r9, %r11
; SSSE3-NEXT:    movq %rcx, %r10
; SSSE3-NEXT:    movq %rdx, %rcx
; SSSE3-NEXT:    movq %rsi, %rax
; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %r14
; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %r9
; SSSE3-NEXT:    testq %r11, %r11
; SSSE3-NEXT:    setne %dl
; SSSE3-NEXT:    testq %rsi, %rsi
; SSSE3-NEXT:    setne %bpl
; SSSE3-NEXT:    andb %dl, %bpl
; SSSE3-NEXT:    mulq %r8
; SSSE3-NEXT:    movq %rax, %rsi
; SSSE3-NEXT:    seto %r15b
; SSSE3-NEXT:    movq %r11, %rax
; SSSE3-NEXT:    mulq %rdi
; SSSE3-NEXT:    seto %r12b
; SSSE3-NEXT:    orb %r15b, %r12b
; SSSE3-NEXT:    orb %bpl, %r12b
; SSSE3-NEXT:    leaq (%rsi,%rax), %r11
; SSSE3-NEXT:    movq %rdi, %rax
; SSSE3-NEXT:    mulq %r8
; SSSE3-NEXT:    movq %rax, %rdi
; SSSE3-NEXT:    movq %rdx, %rsi
; SSSE3-NEXT:    addq %r11, %rsi
; SSSE3-NEXT:    setb %r11b
; SSSE3-NEXT:    orb %r12b, %r11b
; SSSE3-NEXT:    testq %r9, %r9
; SSSE3-NEXT:    setne %al
; SSSE3-NEXT:    testq %r10, %r10
; SSSE3-NEXT:    setne %bpl
; SSSE3-NEXT:    andb %al, %bpl
; SSSE3-NEXT:    movq %r10, %rax
; SSSE3-NEXT:    mulq %r14
; SSSE3-NEXT:    movq %rax, %r8
; SSSE3-NEXT:    seto %r10b
; SSSE3-NEXT:    movq %r9, %rax
; SSSE3-NEXT:    mulq %rcx
; SSSE3-NEXT:    seto %r9b
; SSSE3-NEXT:    orb %r10b, %r9b
; SSSE3-NEXT:    orb %bpl, %r9b
; SSSE3-NEXT:    addq %rax, %r8
; SSSE3-NEXT:    movq %rcx, %rax
; SSSE3-NEXT:    mulq %r14
; SSSE3-NEXT:    addq %r8, %rdx
; SSSE3-NEXT:    setb %cl
; SSSE3-NEXT:    orb %r9b, %cl
; SSSE3-NEXT:    movzbl %cl, %ecx
; SSSE3-NEXT:    negl %ecx
; SSSE3-NEXT:    movd %ecx, %xmm1
; SSSE3-NEXT:    movzbl %r11b, %ecx
; SSSE3-NEXT:    negl %ecx
; SSSE3-NEXT:    movd %ecx, %xmm0
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    movq %rax, 16(%rbx)
; SSSE3-NEXT:    movq %rdi, (%rbx)
; SSSE3-NEXT:    movq %rdx, 24(%rbx)
; SSSE3-NEXT:    movq %rsi, 8(%rbx)
; SSSE3-NEXT:    popq %rbx
; SSSE3-NEXT:    popq %r12
; SSSE3-NEXT:    popq %r14
; SSSE3-NEXT:    popq %r15
; SSSE3-NEXT:    popq %rbp
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: umulo_v2i128:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pushq %rbp
; SSE41-NEXT:    pushq %r15
; SSE41-NEXT:    pushq %r14
; SSE41-NEXT:    pushq %r12
; SSE41-NEXT:    pushq %rbx
; SSE41-NEXT:    movq %r9, %r11
; SSE41-NEXT:    movq %rcx, %r10
; SSE41-NEXT:    movq %rdx, %rcx
; SSE41-NEXT:    movq %rsi, %rax
; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %r14
; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %r9
; SSE41-NEXT:    testq %r11, %r11
; SSE41-NEXT:    setne %dl
; SSE41-NEXT:    testq %rsi, %rsi
; SSE41-NEXT:    setne %bpl
; SSE41-NEXT:    andb %dl, %bpl
; SSE41-NEXT:    mulq %r8
; SSE41-NEXT:    movq %rax, %rsi
; SSE41-NEXT:    seto %r15b
; SSE41-NEXT:    movq %r11, %rax
; SSE41-NEXT:    mulq %rdi
; SSE41-NEXT:    seto %r12b
; SSE41-NEXT:    orb %r15b, %r12b
; SSE41-NEXT:    orb %bpl, %r12b
; SSE41-NEXT:    leaq (%rsi,%rax), %r11
; SSE41-NEXT:    movq %rdi, %rax
; SSE41-NEXT:    mulq %r8
; SSE41-NEXT:    movq %rax, %rdi
; SSE41-NEXT:    movq %rdx, %rsi
; SSE41-NEXT:    addq %r11, %rsi
; SSE41-NEXT:    setb %r11b
; SSE41-NEXT:    orb %r12b, %r11b
; SSE41-NEXT:    testq %r9, %r9
; SSE41-NEXT:    setne %al
; SSE41-NEXT:    testq %r10, %r10
; SSE41-NEXT:    setne %bpl
; SSE41-NEXT:    andb %al, %bpl
; SSE41-NEXT:    movq %r10, %rax
; SSE41-NEXT:    mulq %r14
; SSE41-NEXT:    movq %rax, %r8
; SSE41-NEXT:    seto %r10b
; SSE41-NEXT:    movq %r9, %rax
; SSE41-NEXT:    mulq %rcx
; SSE41-NEXT:    seto %r9b
; SSE41-NEXT:    orb %r10b, %r9b
; SSE41-NEXT:    orb %bpl, %r9b
; SSE41-NEXT:    addq %rax, %r8
; SSE41-NEXT:    movq %rcx, %rax
; SSE41-NEXT:    mulq %r14
; SSE41-NEXT:    addq %r8, %rdx
; SSE41-NEXT:    setb %cl
; SSE41-NEXT:    orb %r9b, %cl
; SSE41-NEXT:    movzbl %cl, %ecx
; SSE41-NEXT:    negl %ecx
; SSE41-NEXT:    movzbl %r11b, %r8d
; SSE41-NEXT:    negl %r8d
; SSE41-NEXT:    movd %r8d, %xmm0
; SSE41-NEXT:    pinsrd $1, %ecx, %xmm0
; SSE41-NEXT:    movq %rax, 16(%rbx)
; SSE41-NEXT:    movq %rdi, (%rbx)
; SSE41-NEXT:    movq %rdx, 24(%rbx)
; SSE41-NEXT:    movq %rsi, 8(%rbx)
; SSE41-NEXT:    popq %rbx
; SSE41-NEXT:    popq %r12
; SSE41-NEXT:    popq %r14
; SSE41-NEXT:    popq %r15
; SSE41-NEXT:    popq %rbp
; SSE41-NEXT:    retq
;
; AVX-LABEL: umulo_v2i128:
; AVX:       # %bb.0:
; AVX-NEXT:    pushq %rbp
; AVX-NEXT:    pushq %r15
; AVX-NEXT:    pushq %r14
; AVX-NEXT:    pushq %r12
; AVX-NEXT:    pushq %rbx
; AVX-NEXT:    movq %r9, %r11
; AVX-NEXT:    movq %rcx, %r10
; AVX-NEXT:    movq %rdx, %rcx
; AVX-NEXT:    movq %rsi, %rax
; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %r14
; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %r9
; AVX-NEXT:    testq %r11, %r11
; AVX-NEXT:    setne %dl
; AVX-NEXT:    testq %rsi, %rsi
; AVX-NEXT:    setne %bpl
; AVX-NEXT:    andb %dl, %bpl
; AVX-NEXT:    mulq %r8
; AVX-NEXT:    movq %rax, %rsi
; AVX-NEXT:    seto %r15b
; AVX-NEXT:    movq %r11, %rax
; AVX-NEXT:    mulq %rdi
; AVX-NEXT:    seto %r12b
; AVX-NEXT:    orb %r15b, %r12b
; AVX-NEXT:    orb %bpl, %r12b
; AVX-NEXT:    leaq (%rsi,%rax), %r11
; AVX-NEXT:    movq %rdi, %rax
; AVX-NEXT:    mulq %r8
; AVX-NEXT:    movq %rax, %rdi
; AVX-NEXT:    movq %rdx, %rsi
; AVX-NEXT:    addq %r11, %rsi
; AVX-NEXT:    setb %r11b
; AVX-NEXT:    orb %r12b, %r11b
; AVX-NEXT:    testq %r9, %r9
; AVX-NEXT:    setne %al
; AVX-NEXT:    testq %r10, %r10
; AVX-NEXT:    setne %bpl
; AVX-NEXT:    andb %al, %bpl
; AVX-NEXT:    movq %r10, %rax
; AVX-NEXT:    mulq %r14
; AVX-NEXT:    movq %rax, %r8
; AVX-NEXT:    seto %r10b
; AVX-NEXT:    movq %r9, %rax
; AVX-NEXT:    mulq %rcx
; AVX-NEXT:    seto %r9b
; AVX-NEXT:    orb %r10b, %r9b
; AVX-NEXT:    orb %bpl, %r9b
; AVX-NEXT:    addq %rax, %r8
; AVX-NEXT:    movq %rcx, %rax
; AVX-NEXT:    mulq %r14
; AVX-NEXT:    addq %r8, %rdx
; AVX-NEXT:    setb %cl
; AVX-NEXT:    orb %r9b, %cl
; AVX-NEXT:    movzbl %cl, %ecx
; AVX-NEXT:    negl %ecx
; AVX-NEXT:    movzbl %r11b, %r8d
; AVX-NEXT:    negl %r8d
; AVX-NEXT:    vmovd %r8d, %xmm0
; AVX-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
; AVX-NEXT:    movq %rax, 16(%rbx)
; AVX-NEXT:    movq %rdi, (%rbx)
; AVX-NEXT:    movq %rdx, 24(%rbx)
; AVX-NEXT:    movq %rsi, 8(%rbx)
; AVX-NEXT:    popq %rbx
; AVX-NEXT:    popq %r12
; AVX-NEXT:    popq %r14
; AVX-NEXT:    popq %r15
; AVX-NEXT:    popq %rbp
; AVX-NEXT:    retq
;
; AVX512F-LABEL: umulo_v2i128:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    pushq %rbp
; AVX512F-NEXT:    pushq %r15
; AVX512F-NEXT:    pushq %r14
; AVX512F-NEXT:    pushq %r12
; AVX512F-NEXT:    pushq %rbx
; AVX512F-NEXT:    movq %rcx, %rax
; AVX512F-NEXT:    movq %rdx, %rcx
; AVX512F-NEXT:    movq %rsi, %r10
; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r14
; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
; AVX512F-NEXT:    testq %rsi, %rsi
; AVX512F-NEXT:    setne %dl
; AVX512F-NEXT:    testq %rax, %rax
; AVX512F-NEXT:    setne %bpl
; AVX512F-NEXT:    andb %dl, %bpl
3209; AVX512F-NEXT:    mulq %r14
3210; AVX512F-NEXT:    movq %rax, %r11
3211; AVX512F-NEXT:    seto %r15b
3212; AVX512F-NEXT:    movq %rsi, %rax
3213; AVX512F-NEXT:    mulq %rcx
3214; AVX512F-NEXT:    seto %r12b
3215; AVX512F-NEXT:    orb %r15b, %r12b
3216; AVX512F-NEXT:    orb %bpl, %r12b
3217; AVX512F-NEXT:    addq %rax, %r11
3218; AVX512F-NEXT:    movq %rcx, %rax
3219; AVX512F-NEXT:    mulq %r14
3220; AVX512F-NEXT:    movq %rax, %rsi
3221; AVX512F-NEXT:    movq %rdx, %rcx
3222; AVX512F-NEXT:    addq %r11, %rcx
3223; AVX512F-NEXT:    setb %al
3224; AVX512F-NEXT:    orb %r12b, %al
3225; AVX512F-NEXT:    kmovw %eax, %k0
3226; AVX512F-NEXT:    testq %r9, %r9
3227; AVX512F-NEXT:    setne %al
3228; AVX512F-NEXT:    testq %r10, %r10
3229; AVX512F-NEXT:    setne %r11b
3230; AVX512F-NEXT:    andb %al, %r11b
3231; AVX512F-NEXT:    movq %r10, %rax
3232; AVX512F-NEXT:    mulq %r8
3233; AVX512F-NEXT:    movq %rax, %r10
3234; AVX512F-NEXT:    seto %bpl
3235; AVX512F-NEXT:    movq %r9, %rax
3236; AVX512F-NEXT:    mulq %rdi
3237; AVX512F-NEXT:    seto %r9b
3238; AVX512F-NEXT:    orb %bpl, %r9b
3239; AVX512F-NEXT:    orb %r11b, %r9b
3240; AVX512F-NEXT:    addq %rax, %r10
3241; AVX512F-NEXT:    movq %rdi, %rax
3242; AVX512F-NEXT:    mulq %r8
3243; AVX512F-NEXT:    addq %r10, %rdx
3244; AVX512F-NEXT:    setb %dil
3245; AVX512F-NEXT:    orb %r9b, %dil
3246; AVX512F-NEXT:    andl $1, %edi
3247; AVX512F-NEXT:    kmovw %edi, %k1
3248; AVX512F-NEXT:    kshiftlw $1, %k0, %k0
3249; AVX512F-NEXT:    korw %k0, %k1, %k1
3250; AVX512F-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
3251; AVX512F-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
3252; AVX512F-NEXT:    movq %rsi, 16(%rbx)
3253; AVX512F-NEXT:    movq %rax, (%rbx)
3254; AVX512F-NEXT:    movq %rcx, 24(%rbx)
3255; AVX512F-NEXT:    movq %rdx, 8(%rbx)
3256; AVX512F-NEXT:    popq %rbx
3257; AVX512F-NEXT:    popq %r12
3258; AVX512F-NEXT:    popq %r14
3259; AVX512F-NEXT:    popq %r15
3260; AVX512F-NEXT:    popq %rbp
3261; AVX512F-NEXT:    retq
3262;
3263; AVX512BW-LABEL: umulo_v2i128:
3264; AVX512BW:       # %bb.0:
3265; AVX512BW-NEXT:    pushq %rbp
3266; AVX512BW-NEXT:    pushq %r15
3267; AVX512BW-NEXT:    pushq %r14
3268; AVX512BW-NEXT:    pushq %r12
3269; AVX512BW-NEXT:    pushq %rbx
3270; AVX512BW-NEXT:    movq %rcx, %rax
3271; AVX512BW-NEXT:    movq %rdx, %rcx
3272; AVX512BW-NEXT:    movq %rsi, %r10
3273; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
3274; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r14
3275; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
3276; AVX512BW-NEXT:    testq %rsi, %rsi
3277; AVX512BW-NEXT:    setne %dl
3278; AVX512BW-NEXT:    testq %rax, %rax
3279; AVX512BW-NEXT:    setne %bpl
3280; AVX512BW-NEXT:    andb %dl, %bpl
3281; AVX512BW-NEXT:    mulq %r14
3282; AVX512BW-NEXT:    movq %rax, %r11
3283; AVX512BW-NEXT:    seto %r15b
3284; AVX512BW-NEXT:    movq %rsi, %rax
3285; AVX512BW-NEXT:    mulq %rcx
3286; AVX512BW-NEXT:    seto %r12b
3287; AVX512BW-NEXT:    orb %r15b, %r12b
3288; AVX512BW-NEXT:    orb %bpl, %r12b
3289; AVX512BW-NEXT:    addq %rax, %r11
3290; AVX512BW-NEXT:    movq %rcx, %rax
3291; AVX512BW-NEXT:    mulq %r14
3292; AVX512BW-NEXT:    movq %rax, %rsi
3293; AVX512BW-NEXT:    movq %rdx, %rcx
3294; AVX512BW-NEXT:    addq %r11, %rcx
3295; AVX512BW-NEXT:    setb %al
3296; AVX512BW-NEXT:    orb %r12b, %al
3297; AVX512BW-NEXT:    kmovd %eax, %k0
3298; AVX512BW-NEXT:    testq %r9, %r9
3299; AVX512BW-NEXT:    setne %al
3300; AVX512BW-NEXT:    testq %r10, %r10
3301; AVX512BW-NEXT:    setne %r11b
3302; AVX512BW-NEXT:    andb %al, %r11b
3303; AVX512BW-NEXT:    movq %r10, %rax
3304; AVX512BW-NEXT:    mulq %r8
3305; AVX512BW-NEXT:    movq %rax, %r10
3306; AVX512BW-NEXT:    seto %bpl
3307; AVX512BW-NEXT:    movq %r9, %rax
3308; AVX512BW-NEXT:    mulq %rdi
3309; AVX512BW-NEXT:    seto %r9b
3310; AVX512BW-NEXT:    orb %bpl, %r9b
3311; AVX512BW-NEXT:    orb %r11b, %r9b
3312; AVX512BW-NEXT:    addq %rax, %r10
3313; AVX512BW-NEXT:    movq %rdi, %rax
3314; AVX512BW-NEXT:    mulq %r8
3315; AVX512BW-NEXT:    addq %r10, %rdx
3316; AVX512BW-NEXT:    setb %dil
3317; AVX512BW-NEXT:    orb %r9b, %dil
3318; AVX512BW-NEXT:    andl $1, %edi
3319; AVX512BW-NEXT:    kmovw %edi, %k1
3320; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
3321; AVX512BW-NEXT:    korw %k0, %k1, %k1
3322; AVX512BW-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
3323; AVX512BW-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
3324; AVX512BW-NEXT:    movq %rsi, 16(%rbx)
3325; AVX512BW-NEXT:    movq %rax, (%rbx)
3326; AVX512BW-NEXT:    movq %rcx, 24(%rbx)
3327; AVX512BW-NEXT:    movq %rdx, 8(%rbx)
3328; AVX512BW-NEXT:    popq %rbx
3329; AVX512BW-NEXT:    popq %r12
3330; AVX512BW-NEXT:    popq %r14
3331; AVX512BW-NEXT:    popq %r15
3332; AVX512BW-NEXT:    popq %rbp
3333; AVX512BW-NEXT:    retq
3334  %t = call {<2 x i128>, <2 x i1>} @llvm.umul.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1)
3335  %val = extractvalue {<2 x i128>, <2 x i1>} %t, 0
3336  %obit = extractvalue {<2 x i128>, <2 x i1>} %t, 1
3337  %res = sext <2 x i1> %obit to <2 x i32>
3338  store <2 x i128> %val, ptr %p2
3339  ret <2 x i32> %res
3340}
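
; Note: there is no legal 128-bit element vector type, so umulo.v2i128 is
; fully scalarized: each lane becomes a schoolbook 128x128->128 expansion
; built from three 64-bit mulq's. Roughly, per lane (operand halves named
; a_lo/a_hi/b_lo/b_hi purely for illustration), the overflow bit checked
; above is:
;   ovf  = (a_hi != 0) & (b_hi != 0)          ; a_hi*b_hi would reach bit 128
;   ovf |= hi64(a_hi * b_lo) != 0             ; seto after each cross mulq
;   ovf |= hi64(a_lo * b_hi) != 0
;   cross = lo64(a_hi*b_lo) + lo64(a_lo*b_hi) ; leaq/addq; this sum cannot
;                                             ; carry unless the first term
;                                             ; already fired
;   ovf |= carry(hi64(a_lo * b_lo) + cross)   ; setb after the final addq
; The SSE/AVX variants sign-extend the two flags with movzbl/negl plus
; movd/pinsrd (or punpckldq), while AVX512 routes them through a mask
; register (kmovw/kshiftlw/korw) and materializes the sext as a zero-masked
; vmovdqa32 of all-ones.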