; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2   | FileCheck %s --check-prefixes=SSE,SSE2,X86-SSE,X86-SSE2
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE4,X86-SSE,X86-SSE4
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2,X64-SSE,X64-SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2,-slow-pmulld | FileCheck %s --check-prefixes=SSE,SSE4,X64-SSE,X64-SSE4,X64-SSE4-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2,+slow-pmulld | FileCheck %s --check-prefixes=SSE,SSE4,X64-SSE,X64-SSE4,X64-SSE4-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop  | FileCheck %s --check-prefixes=X64-AVX,X64-XOP
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64-AVX,X64-AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=X64-AVX,X64-AVX512DQ

;
; PowOf2 (uniform)
;
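; A uniform power-of-two multiplier should fold to a single vector shift-left
; (psllq/pslld/psllw). v16i8 has no byte-granularity shift, so the checks
; below expect a word shift followed by a pand mask that clears the bits
; shifted across byte lanes.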

define <2 x i64> @mul_v2i64_8(<2 x i64> %a0) nounwind {
; SSE-LABEL: mul_v2i64_8:
; SSE:       # %bb.0:
; SSE-NEXT:    psllq $3, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; X64-AVX-LABEL: mul_v2i64_8:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vpsllq $3, %xmm0, %xmm0
; X64-AVX-NEXT:    retq
  %1 = mul <2 x i64> %a0, <i64 8, i64 8>
  ret <2 x i64> %1
}

define <4 x i32> @mul_v4i32_8(<4 x i32> %a0) nounwind {
; SSE-LABEL: mul_v4i32_8:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $3, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; X64-AVX-LABEL: mul_v4i32_8:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vpslld $3, %xmm0, %xmm0
; X64-AVX-NEXT:    retq
  %1 = mul <4 x i32> %a0, <i32 8, i32 8, i32 8, i32 8>
  ret <4 x i32> %1
}

define <8 x i16> @mul_v8i16_8(<8 x i16> %a0) nounwind {
; SSE-LABEL: mul_v8i16_8:
; SSE:       # %bb.0:
; SSE-NEXT:    psllw $3, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; X64-AVX-LABEL: mul_v8i16_8:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vpsllw $3, %xmm0, %xmm0
; X64-AVX-NEXT:    retq
  %1 = mul <8 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  ret <8 x i16> %1
}

define <16 x i8> @mul_v16i8_32(<16 x i8> %a0) nounwind {
; X86-SSE-LABEL: mul_v16i8_32:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    psllw $5, %xmm0
; X86-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT:    retl
;
; X64-SSE-LABEL: mul_v16i8_32:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    psllw $5, %xmm0
; X64-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT:    retq
;
; X64-XOP-LABEL: mul_v16i8_32:
; X64-XOP:       # %bb.0:
; X64-XOP-NEXT:    vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-XOP-NEXT:    retq
;
; X64-AVX2-LABEL: mul_v16i8_32:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vpsllw $5, %xmm0, %xmm0
; X64-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX2-NEXT:    retq
;
; X64-AVX512DQ-LABEL: mul_v16i8_32:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vpsllw $5, %xmm0, %xmm0
; X64-AVX512DQ-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; X64-AVX512DQ-NEXT:    retq
  %1 = mul <16 x i8> %a0, <i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32>
  ret <16 x i8> %1
}

;
; PowOf2 (non-uniform)
;
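; Non-uniform power-of-two multipliers need per-lane shift amounts: AVX2 has
; variable shifts (vpsllvq/vpsllvd), XOP has vpshl*, while plain SSE falls
; back to two immediate shifts plus a blend for v2i64 or to a multiply
; (pmulld, or pmuludq shuffles on SSE2) for v4i32.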

define <2 x i64> @mul_v2i64_32_8(<2 x i64> %a0) nounwind {
; SSE2-LABEL: mul_v2i64_32_8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psllq $5, %xmm1
; SSE2-NEXT:    psllq $3, %xmm0
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT:    ret{{[l|q]}}
;
; SSE4-LABEL: mul_v2i64_32_8:
; SSE4:       # %bb.0:
; SSE4-NEXT:    movdqa %xmm0, %xmm1
; SSE4-NEXT:    psllq $3, %xmm1
; SSE4-NEXT:    psllq $5, %xmm0
; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE4-NEXT:    ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v2i64_32_8:
; X64-XOP:       # %bb.0:
; X64-XOP-NEXT:    vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-XOP-NEXT:    retq
;
; X64-AVX2-LABEL: mul_v2i64_32_8:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX2-NEXT:    retq
;
; X64-AVX512DQ-LABEL: mul_v2i64_32_8:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX512DQ-NEXT:    retq
  %1 = mul <2 x i64> %a0, <i64 32, i64 8>
  ret <2 x i64> %1
}

define <4 x i32> @mul_v4i32_1_2_4_8(<4 x i32> %a0) nounwind {
; X86-SSE2-LABEL: mul_v4i32_1_2_4_8:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X86-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-SSE2-NEXT:    retl
;
; X86-SSE4-LABEL: mul_v4i32_1_2_4_8:
; X86-SSE4:       # %bb.0:
; X86-SSE4-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE4-NEXT:    retl
;
; X64-SSE2-LABEL: mul_v4i32_1_2_4_8:
; X64-SSE2:       # %bb.0:
; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X64-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X64-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-SSE2-NEXT:    retq
;
; X64-SSE4-LABEL: mul_v4i32_1_2_4_8:
; X64-SSE4:       # %bb.0:
; X64-SSE4-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE4-NEXT:    retq
;
; X64-XOP-LABEL: mul_v4i32_1_2_4_8:
; X64-XOP:       # %bb.0:
; X64-XOP-NEXT:    vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-XOP-NEXT:    retq
;
; X64-AVX2-LABEL: mul_v4i32_1_2_4_8:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX2-NEXT:    retq
;
; X64-AVX512DQ-LABEL: mul_v4i32_1_2_4_8:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX512DQ-NEXT:    retq
  %1 = mul <4 x i32> %a0, <i32 1, i32 2, i32 4, i32 8>
  ret <4 x i32> %1
}

define <4 x i32> @mul_v4i32_1_2_4_8_optsize(<4 x i32> %a0) nounwind optsize {
; SSE2-LABEL: mul_v4i32_1_2_4_8_optsize:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [1,2,4,8]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    ret{{[l|q]}}
;
; X86-SSE4-LABEL: mul_v4i32_1_2_4_8_optsize:
; X86-SSE4:       # %bb.0:
; X86-SSE4-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE4-NEXT:    retl
;
; X64-SSE4-LABEL: mul_v4i32_1_2_4_8_optsize:
; X64-SSE4:       # %bb.0:
; X64-SSE4-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE4-NEXT:    retq
;
; X64-XOP-LABEL: mul_v4i32_1_2_4_8_optsize:
; X64-XOP:       # %bb.0:
; X64-XOP-NEXT:    vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-XOP-NEXT:    retq
;
; X64-AVX2-LABEL: mul_v4i32_1_2_4_8_optsize:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX2-NEXT:    retq
;
; X64-AVX512DQ-LABEL: mul_v4i32_1_2_4_8_optsize:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX512DQ-NEXT:    retq
  %1 = mul <4 x i32> %a0, <i32 1, i32 2, i32 4, i32 8>
  ret <4 x i32> %1
}

define <8 x i16> @mul_v8i16_1_2_4_8_16_32_64_128(<8 x i16> %a0) nounwind {
; X86-SSE-LABEL: mul_v8i16_1_2_4_8_16_32_64_128:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [1,2,4,8,16,32,64,128]
; X86-SSE-NEXT:    retl
;
; X64-SSE-LABEL: mul_v8i16_1_2_4_8_16_32_64_128:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2,4,8,16,32,64,128]
; X64-SSE-NEXT:    retq
;
; X64-XOP-LABEL: mul_v8i16_1_2_4_8_16_32_64_128:
; X64-XOP:       # %bb.0:
; X64-XOP-NEXT:    vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-XOP-NEXT:    retq
;
; X64-AVX2-LABEL: mul_v8i16_1_2_4_8_16_32_64_128:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,8,16,32,64,128]
; X64-AVX2-NEXT:    retq
;
; X64-AVX512DQ-LABEL: mul_v8i16_1_2_4_8_16_32_64_128:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,8,16,32,64,128]
; X64-AVX512DQ-NEXT:    retq
  %1 = mul <8 x i16> %a0, <i16 1, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128>
  ret <8 x i16> %1
}

define <16 x i8> @mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8(<16 x i8> %a0) nounwind {
; SSE2-LABEL: mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [1,2,4,8,1,2,4,8]
; SSE2-NEXT:    pmullw %xmm2, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm3, %xmm1
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pmullw %xmm2, %xmm0
; SSE2-NEXT:    pand %xmm3, %xmm0
; SSE2-NEXT:    packuswb %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; X86-SSE4-LABEL: mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8:
; X86-SSE4:       # %bb.0:
; X86-SSE4-NEXT:    movdqa %xmm0, %xmm1
; X86-SSE4-NEXT:    pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [0,2,0,8,0,2,0,8,0,2,0,8,0,2,0,8]
; X86-SSE4-NEXT:    psllw $8, %xmm1
; X86-SSE4-NEXT:    pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [1,0,4,0,1,0,4,0,1,0,4,0,1,0,4,0]
; X86-SSE4-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE4-NEXT:    por %xmm1, %xmm0
; X86-SSE4-NEXT:    retl
;
; X64-SSE4-LABEL: mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8:
; X64-SSE4:       # %bb.0:
; X64-SSE4-NEXT:    movdqa %xmm0, %xmm1
; X64-SSE4-NEXT:    pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,2,0,8,0,2,0,8,0,2,0,8,0,2,0,8]
; X64-SSE4-NEXT:    psllw $8, %xmm1
; X64-SSE4-NEXT:    pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,0,4,0,1,0,4,0,1,0,4,0,1,0,4,0]
; X64-SSE4-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE4-NEXT:    por %xmm1, %xmm0
; X64-SSE4-NEXT:    retq
;
; X64-XOP-LABEL: mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8:
; X64-XOP:       # %bb.0:
; X64-XOP-NEXT:    vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-XOP-NEXT:    retq
;
; X64-AVX2-LABEL: mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; X64-AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,1,2,4,8,1,2,4,8,1,2,4,8]
; X64-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT:    vzeroupper
; X64-AVX2-NEXT:    retq
;
; X64-AVX512DQ-LABEL: mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; X64-AVX512DQ-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; X64-AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
; X64-AVX512DQ-NEXT:    vzeroupper
; X64-AVX512DQ-NEXT:    retq
  %1 = mul <16 x i8> %a0, <i8 1, i8 2, i8 4, i8 8, i8 1, i8 2, i8 4, i8 8, i8 1, i8 2, i8 4, i8 8, i8 1, i8 2, i8 4, i8 8>
  ret <16 x i8> %1
}

;
; PowOf2 + 1 (uniform)
;
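; For x * (2^n + 1) the expected lowering is shift-and-add, x + (x << n),
; e.g. psllq $4 + paddq for the *17 cases; byte vectors again pair the word
; shift with a pand mask, and 256-bit types split into two 128-bit halves
; unless the target has AVX2 ymm integer ops.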

define <2 x i64> @mul_v2i64_17(<2 x i64> %a0) nounwind {
; SSE-LABEL: mul_v2i64_17:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psllq $4, %xmm1
; SSE-NEXT:    paddq %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; X64-AVX-LABEL: mul_v2i64_17:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vpsllq $4, %xmm0, %xmm1
; X64-AVX-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT:    retq
  %1 = mul <2 x i64> %a0, <i64 17, i64 17>
  ret <2 x i64> %1
}

define <4 x i32> @mul_v4i32_17(<4 x i32> %a0) nounwind {
; SSE-LABEL: mul_v4i32_17:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    pslld $4, %xmm1
; SSE-NEXT:    paddd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; X64-AVX-LABEL: mul_v4i32_17:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vpslld $4, %xmm0, %xmm1
; X64-AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT:    retq
  %1 = mul <4 x i32> %a0, <i32 17, i32 17, i32 17, i32 17>
  ret <4 x i32> %1
}

define <8 x i16> @mul_v8i16_17(<8 x i16> %a0) nounwind {
; SSE-LABEL: mul_v8i16_17:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psllw $4, %xmm1
; SSE-NEXT:    paddw %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; X64-AVX-LABEL: mul_v8i16_17:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vpsllw $4, %xmm0, %xmm1
; X64-AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT:    retq
  %1 = mul <8 x i16> %a0, <i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17>
  ret <8 x i16> %1
}

define <16 x i8> @mul_v16i8_17(<16 x i8> %a0) nounwind {
; X86-SSE-LABEL: mul_v16i8_17:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movdqa %xmm0, %xmm1
; X86-SSE-NEXT:    psllw $4, %xmm1
; X86-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE-NEXT:    paddb %xmm1, %xmm0
; X86-SSE-NEXT:    retl
;
; X64-SSE-LABEL: mul_v16i8_17:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movdqa %xmm0, %xmm1
; X64-SSE-NEXT:    psllw $4, %xmm1
; X64-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-SSE-NEXT:    paddb %xmm1, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-XOP-LABEL: mul_v16i8_17:
; X64-XOP:       # %bb.0:
; X64-XOP-NEXT:    vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; X64-XOP-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; X64-XOP-NEXT:    retq
;
; X64-AVX2-LABEL: mul_v16i8_17:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vpsllw $4, %xmm0, %xmm1
; X64-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; X64-AVX2-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; X64-AVX2-NEXT:    retq
;
; X64-AVX512DQ-LABEL: mul_v16i8_17:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vpsllw $4, %xmm0, %xmm1
; X64-AVX512DQ-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
; X64-AVX512DQ-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; X64-AVX512DQ-NEXT:    retq
  %1 = mul <16 x i8> %a0, <i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17>
  ret <16 x i8> %1
}

define <4 x i64> @mul_v4i64_17(<4 x i64> %a0) nounwind {
; SSE-LABEL: mul_v4i64_17:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    psllq $4, %xmm2
; SSE-NEXT:    paddq %xmm2, %xmm0
; SSE-NEXT:    movdqa %xmm1, %xmm2
; SSE-NEXT:    psllq $4, %xmm2
; SSE-NEXT:    paddq %xmm2, %xmm1
; SSE-NEXT:    ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v4i64_17:
; X64-XOP:       # %bb.0:
; X64-XOP-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-XOP-NEXT:    vpsllq $4, %xmm1, %xmm2
; X64-XOP-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
; X64-XOP-NEXT:    vpsllq $4, %xmm0, %xmm2
; X64-XOP-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
; X64-XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-XOP-NEXT:    retq
;
; X64-AVX2-LABEL: mul_v4i64_17:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vpsllq $4, %ymm0, %ymm1
; X64-AVX2-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT:    retq
;
; X64-AVX512DQ-LABEL: mul_v4i64_17:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vpsllq $4, %ymm0, %ymm1
; X64-AVX512DQ-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
; X64-AVX512DQ-NEXT:    retq
  %1 = mul <4 x i64> %a0, <i64 17, i64 17, i64 17, i64 17>
  ret <4 x i64> %1
}

define <8 x i32> @mul_v8i32_17(<8 x i32> %a0) nounwind {
; SSE-LABEL: mul_v8i32_17:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    pslld $4, %xmm2
; SSE-NEXT:    paddd %xmm2, %xmm0
; SSE-NEXT:    movdqa %xmm1, %xmm2
; SSE-NEXT:    pslld $4, %xmm2
; SSE-NEXT:    paddd %xmm2, %xmm1
; SSE-NEXT:    ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v8i32_17:
; X64-XOP:       # %bb.0:
; X64-XOP-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-XOP-NEXT:    vpslld $4, %xmm1, %xmm2
; X64-XOP-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
; X64-XOP-NEXT:    vpslld $4, %xmm0, %xmm2
; X64-XOP-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; X64-XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-XOP-NEXT:    retq
;
; X64-AVX2-LABEL: mul_v8i32_17:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vpslld $4, %ymm0, %ymm1
; X64-AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT:    retq
;
; X64-AVX512DQ-LABEL: mul_v8i32_17:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vpslld $4, %ymm0, %ymm1
; X64-AVX512DQ-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; X64-AVX512DQ-NEXT:    retq
  %1 = mul <8 x i32> %a0, <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
  ret <8 x i32> %1
}

define <16 x i16> @mul_v16i16_17(<16 x i16> %a0) nounwind {
; SSE-LABEL: mul_v16i16_17:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    psllw $4, %xmm2
; SSE-NEXT:    paddw %xmm2, %xmm0
; SSE-NEXT:    movdqa %xmm1, %xmm2
; SSE-NEXT:    psllw $4, %xmm2
; SSE-NEXT:    paddw %xmm2, %xmm1
; SSE-NEXT:    ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v16i16_17:
; X64-XOP:       # %bb.0:
; X64-XOP-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-XOP-NEXT:    vpsllw $4, %xmm1, %xmm2
; X64-XOP-NEXT:    vpaddw %xmm1, %xmm2, %xmm1
; X64-XOP-NEXT:    vpsllw $4, %xmm0, %xmm2
; X64-XOP-NEXT:    vpaddw %xmm0, %xmm2, %xmm0
; X64-XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-XOP-NEXT:    retq
;
; X64-AVX2-LABEL: mul_v16i16_17:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vpsllw $4, %ymm0, %ymm1
; X64-AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT:    retq
;
; X64-AVX512DQ-LABEL: mul_v16i16_17:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vpsllw $4, %ymm0, %ymm1
; X64-AVX512DQ-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; X64-AVX512DQ-NEXT:    retq
  %1 = mul <16 x i16> %a0, <i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17>
  ret <16 x i16> %1
}

define <32 x i8> @mul_v32i8_17(<32 x i8> %a0) nounwind {
; SSE-LABEL: mul_v32i8_17:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    psllw $4, %xmm2
; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; SSE-NEXT:    pand %xmm3, %xmm2
; SSE-NEXT:    paddb %xmm2, %xmm0
; SSE-NEXT:    movdqa %xmm1, %xmm2
; SSE-NEXT:    psllw $4, %xmm2
; SSE-NEXT:    pand %xmm3, %xmm2
; SSE-NEXT:    paddb %xmm2, %xmm1
; SSE-NEXT:    ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v32i8_17:
; X64-XOP:       # %bb.0:
; X64-XOP-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-XOP-NEXT:    vbroadcastss {{.*#+}} xmm2 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
; X64-XOP-NEXT:    vpshlb %xmm2, %xmm1, %xmm3
; X64-XOP-NEXT:    vpaddb %xmm1, %xmm3, %xmm1
; X64-XOP-NEXT:    vpshlb %xmm2, %xmm0, %xmm2
; X64-XOP-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
; X64-XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-XOP-NEXT:    retq
;
; X64-AVX2-LABEL: mul_v32i8_17:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vpsllw $4, %ymm0, %ymm1
; X64-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; X64-AVX2-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT:    retq
;
; X64-AVX512DQ-LABEL: mul_v32i8_17:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vpsllw $4, %ymm0, %ymm1
; X64-AVX512DQ-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1
; X64-AVX512DQ-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
; X64-AVX512DQ-NEXT:    retq
  %1 = mul <32 x i8> %a0, <i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17>
  ret <32 x i8> %1
}

;
; -(PowOf2 + 1) (uniform)
;
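; A negative -(2^n + 1) multiplier adds a final negation: compute
; x + (x << n), then subtract it from a zeroed register (pxor + psub).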

define <2 x i64> @mul_v2i64_neg1025(<2 x i64> %a0) nounwind {
; SSE-LABEL: mul_v2i64_neg1025:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psllq $10, %xmm1
; SSE-NEXT:    paddq %xmm0, %xmm1
; SSE-NEXT:    pxor %xmm0, %xmm0
; SSE-NEXT:    psubq %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; X64-AVX-LABEL: mul_v2i64_neg1025:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vpsllq $10, %xmm0, %xmm1
; X64-AVX-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT:    vpsubq %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT:    retq
  %1 = mul <2 x i64> %a0, <i64 -1025, i64 -1025>
  ret <2 x i64> %1
}

define <4 x i32> @mul_v4i32_neg33(<4 x i32> %a0) nounwind {
; SSE-LABEL: mul_v4i32_neg33:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    pslld $5, %xmm1
; SSE-NEXT:    paddd %xmm0, %xmm1
; SSE-NEXT:    pxor %xmm0, %xmm0
; SSE-NEXT:    psubd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; X64-AVX-LABEL: mul_v4i32_neg33:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vpslld $5, %xmm0, %xmm1
; X64-AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT:    retq
  %1 = mul <4 x i32> %a0, <i32 -33, i32 -33, i32 -33, i32 -33>
  ret <4 x i32> %1
}

define <8 x i16> @mul_v8i16_neg9(<8 x i16> %a0) nounwind {
; SSE-LABEL: mul_v8i16_neg9:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psllw $3, %xmm1
; SSE-NEXT:    paddw %xmm0, %xmm1
; SSE-NEXT:    pxor %xmm0, %xmm0
; SSE-NEXT:    psubw %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; X64-AVX-LABEL: mul_v8i16_neg9:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vpsllw $3, %xmm0, %xmm1
; X64-AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT:    vpsubw %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT:    retq
  %1 = mul <8 x i16> %a0, <i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9>
  ret <8 x i16> %1
}

define <16 x i8> @mul_v16i8_neg5(<16 x i8> %a0) nounwind {
; X86-SSE-LABEL: mul_v16i8_neg5:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movdqa %xmm0, %xmm1
; X86-SSE-NEXT:    psllw $2, %xmm1
; X86-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE-NEXT:    paddb %xmm0, %xmm1
; X86-SSE-NEXT:    pxor %xmm0, %xmm0
; X86-SSE-NEXT:    psubb %xmm1, %xmm0
; X86-SSE-NEXT:    retl
;
; X64-SSE-LABEL: mul_v16i8_neg5:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movdqa %xmm0, %xmm1
; X64-SSE-NEXT:    psllw $2, %xmm1
; X64-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-SSE-NEXT:    paddb %xmm0, %xmm1
; X64-SSE-NEXT:    pxor %xmm0, %xmm0
; X64-SSE-NEXT:    psubb %xmm1, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-XOP-LABEL: mul_v16i8_neg5:
; X64-XOP:       # %bb.0:
; X64-XOP-NEXT:    vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; X64-XOP-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; X64-XOP-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-XOP-NEXT:    vpsubb %xmm0, %xmm1, %xmm0
; X64-XOP-NEXT:    retq
;
; X64-AVX2-LABEL: mul_v16i8_neg5:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vpsllw $2, %xmm0, %xmm1
; X64-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; X64-AVX2-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; X64-AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-AVX2-NEXT:    vpsubb %xmm0, %xmm1, %xmm0
; X64-AVX2-NEXT:    retq
;
; X64-AVX512DQ-LABEL: mul_v16i8_neg5:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vpsllw $2, %xmm0, %xmm1
; X64-AVX512DQ-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
; X64-AVX512DQ-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; X64-AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-AVX512DQ-NEXT:    vpsubb %xmm0, %xmm1, %xmm0
; X64-AVX512DQ-NEXT:    retq
  %1 = mul <16 x i8> %a0, <i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5>
  ret <16 x i8> %1
}

define <4 x i64> @mul_v4i64_neg1025(<4 x i64> %a0) nounwind {
; SSE-LABEL: mul_v4i64_neg1025:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    psllq $10, %xmm3
; SSE-NEXT:    paddq %xmm0, %xmm3
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    pxor %xmm0, %xmm0
; SSE-NEXT:    psubq %xmm3, %xmm0
; SSE-NEXT:    movdqa %xmm1, %xmm3
; SSE-NEXT:    psllq $10, %xmm3
; SSE-NEXT:    paddq %xmm1, %xmm3
; SSE-NEXT:    psubq %xmm3, %xmm2
; SSE-NEXT:    movdqa %xmm2, %xmm1
; SSE-NEXT:    ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v4i64_neg1025:
; X64-XOP:       # %bb.0:
; X64-XOP-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-XOP-NEXT:    vpsllq $10, %xmm1, %xmm2
; X64-XOP-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
; X64-XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; X64-XOP-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
; X64-XOP-NEXT:    vpsllq $10, %xmm0, %xmm3
; X64-XOP-NEXT:    vpaddq %xmm0, %xmm3, %xmm0
; X64-XOP-NEXT:    vpsubq %xmm0, %xmm2, %xmm0
; X64-XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-XOP-NEXT:    retq
;
; X64-AVX2-LABEL: mul_v4i64_neg1025:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vpsllq $10, %ymm0, %ymm1
; X64-AVX2-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-AVX2-NEXT:    vpsubq %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT:    retq
;
; X64-AVX512DQ-LABEL: mul_v4i64_neg1025:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vpsllq $10, %ymm0, %ymm1
; X64-AVX512DQ-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
; X64-AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-AVX512DQ-NEXT:    vpsubq %ymm0, %ymm1, %ymm0
; X64-AVX512DQ-NEXT:    retq
  %1 = mul <4 x i64> %a0, <i64 -1025, i64 -1025, i64 -1025, i64 -1025>
  ret <4 x i64> %1
}

define <8 x i32> @mul_v8i32_neg33(<8 x i32> %a0) nounwind {
; SSE-LABEL: mul_v8i32_neg33:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    pslld $5, %xmm3
; SSE-NEXT:    paddd %xmm0, %xmm3
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    pxor %xmm0, %xmm0
; SSE-NEXT:    psubd %xmm3, %xmm0
; SSE-NEXT:    movdqa %xmm1, %xmm3
; SSE-NEXT:    pslld $5, %xmm3
; SSE-NEXT:    paddd %xmm1, %xmm3
; SSE-NEXT:    psubd %xmm3, %xmm2
; SSE-NEXT:    movdqa %xmm2, %xmm1
; SSE-NEXT:    ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v8i32_neg33:
; X64-XOP:       # %bb.0:
; X64-XOP-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-XOP-NEXT:    vpslld $5, %xmm1, %xmm2
; X64-XOP-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
; X64-XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; X64-XOP-NEXT:    vpsubd %xmm1, %xmm2, %xmm1
; X64-XOP-NEXT:    vpslld $5, %xmm0, %xmm3
; X64-XOP-NEXT:    vpaddd %xmm0, %xmm3, %xmm0
; X64-XOP-NEXT:    vpsubd %xmm0, %xmm2, %xmm0
; X64-XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-XOP-NEXT:    retq
;
; X64-AVX2-LABEL: mul_v8i32_neg33:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vpslld $5, %ymm0, %ymm1
; X64-AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-AVX2-NEXT:    vpsubd %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT:    retq
;
; X64-AVX512DQ-LABEL: mul_v8i32_neg33:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vpslld $5, %ymm0, %ymm1
; X64-AVX512DQ-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; X64-AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-AVX512DQ-NEXT:    vpsubd %ymm0, %ymm1, %ymm0
; X64-AVX512DQ-NEXT:    retq
  %1 = mul <8 x i32> %a0, <i32 -33, i32 -33, i32 -33, i32 -33, i32 -33, i32 -33, i32 -33, i32 -33>
  ret <8 x i32> %1
}

define <16 x i16> @mul_v16i16_neg9(<16 x i16> %a0) nounwind {
; SSE-LABEL: mul_v16i16_neg9:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    psllw $3, %xmm3
; SSE-NEXT:    paddw %xmm0, %xmm3
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    pxor %xmm0, %xmm0
; SSE-NEXT:    psubw %xmm3, %xmm0
; SSE-NEXT:    movdqa %xmm1, %xmm3
; SSE-NEXT:    psllw $3, %xmm3
; SSE-NEXT:    paddw %xmm1, %xmm3
; SSE-NEXT:    psubw %xmm3, %xmm2
; SSE-NEXT:    movdqa %xmm2, %xmm1
; SSE-NEXT:    ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v16i16_neg9:
; X64-XOP:       # %bb.0:
; X64-XOP-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-XOP-NEXT:    vpsllw $3, %xmm1, %xmm2
; X64-XOP-NEXT:    vpaddw %xmm1, %xmm2, %xmm1
; X64-XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; X64-XOP-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
; X64-XOP-NEXT:    vpsllw $3, %xmm0, %xmm3
; X64-XOP-NEXT:    vpaddw %xmm0, %xmm3, %xmm0
; X64-XOP-NEXT:    vpsubw %xmm0, %xmm2, %xmm0
; X64-XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-XOP-NEXT:    retq
;
; X64-AVX2-LABEL: mul_v16i16_neg9:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vpsllw $3, %ymm0, %ymm1
; X64-AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-AVX2-NEXT:    vpsubw %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT:    retq
;
; X64-AVX512DQ-LABEL: mul_v16i16_neg9:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vpsllw $3, %ymm0, %ymm1
; X64-AVX512DQ-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; X64-AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-AVX512DQ-NEXT:    vpsubw %ymm0, %ymm1, %ymm0
; X64-AVX512DQ-NEXT:    retq
  %1 = mul <16 x i16> %a0, <i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9>
  ret <16 x i16> %1
}

define <32 x i8> @mul_v32i8_neg5(<32 x i8> %a0) nounwind {
; SSE-LABEL: mul_v32i8_neg5:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    psllw $2, %xmm3
; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; SSE-NEXT:    pand %xmm4, %xmm3
; SSE-NEXT:    paddb %xmm0, %xmm3
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    pxor %xmm0, %xmm0
; SSE-NEXT:    psubb %xmm3, %xmm0
; SSE-NEXT:    movdqa %xmm1, %xmm3
; SSE-NEXT:    psllw $2, %xmm3
; SSE-NEXT:    pand %xmm4, %xmm3
; SSE-NEXT:    paddb %xmm1, %xmm3
; SSE-NEXT:    psubb %xmm3, %xmm2
; SSE-NEXT:    movdqa %xmm2, %xmm1
; SSE-NEXT:    ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v32i8_neg5:
; X64-XOP:       # %bb.0:
; X64-XOP-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-XOP-NEXT:    vbroadcastss {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; X64-XOP-NEXT:    vpshlb %xmm2, %xmm1, %xmm3
; X64-XOP-NEXT:    vpaddb %xmm1, %xmm3, %xmm1
; X64-XOP-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; X64-XOP-NEXT:    vpsubb %xmm1, %xmm3, %xmm1
; X64-XOP-NEXT:    vpshlb %xmm2, %xmm0, %xmm2
; X64-XOP-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
; X64-XOP-NEXT:    vpsubb %xmm0, %xmm3, %xmm0
; X64-XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-XOP-NEXT:    retq
;
; X64-AVX2-LABEL: mul_v32i8_neg5:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vpsllw $2, %ymm0, %ymm1
; X64-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; X64-AVX2-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-AVX2-NEXT:    vpsubb %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT:    retq
;
; X64-AVX512DQ-LABEL: mul_v32i8_neg5:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vpsllw $2, %ymm0, %ymm1
; X64-AVX512DQ-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1
; X64-AVX512DQ-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
; X64-AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-AVX512DQ-NEXT:    vpsubb %ymm0, %ymm1, %ymm0
; X64-AVX512DQ-NEXT:    retq
  %1 = mul <32 x i8> %a0, <i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5>
  ret <32 x i8> %1
}

;
; PowOf2 + 1 (non-uniform)
;
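; Constants that mix power-of-two-plus-one values need a real multiply:
; v2i64 expands into 32-bit pmuludq partial products (shift/mul/add) unless
; AVX512DQ provides vpmullq, v4i32 uses pmulld where available, v8i16 uses
; pmullw, and v16i8 is assembled from pmaddubsw on even/odd constant halves.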

define <2 x i64> @mul_v2i64_17_65(<2 x i64> %a0) nounwind {
; X86-SSE2-LABEL: mul_v2i64_17_65:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [17,0,65,0]
; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
; X86-SSE2-NEXT:    pmuludq %xmm1, %xmm2
; X86-SSE2-NEXT:    psrlq $32, %xmm0
; X86-SSE2-NEXT:    pmuludq %xmm1, %xmm0
; X86-SSE2-NEXT:    psllq $32, %xmm0
; X86-SSE2-NEXT:    paddq %xmm2, %xmm0
; X86-SSE2-NEXT:    retl
;
; SSE4-LABEL: mul_v2i64_17_65:
; SSE4:       # %bb.0:
; SSE4-NEXT:    pmovsxbq {{.*#+}} xmm1 = [17,65]
; SSE4-NEXT:    movdqa %xmm0, %xmm2
; SSE4-NEXT:    pmuludq %xmm1, %xmm2
; SSE4-NEXT:    psrlq $32, %xmm0
; SSE4-NEXT:    pmuludq %xmm1, %xmm0
; SSE4-NEXT:    psllq $32, %xmm0
; SSE4-NEXT:    paddq %xmm2, %xmm0
; SSE4-NEXT:    ret{{[l|q]}}
;
; X64-SSE2-LABEL: mul_v2i64_17_65:
; X64-SSE2:       # %bb.0:
; X64-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [17,65]
; X64-SSE2-NEXT:    movdqa %xmm0, %xmm2
; X64-SSE2-NEXT:    pmuludq %xmm1, %xmm2
; X64-SSE2-NEXT:    psrlq $32, %xmm0
; X64-SSE2-NEXT:    pmuludq %xmm1, %xmm0
; X64-SSE2-NEXT:    psllq $32, %xmm0
; X64-SSE2-NEXT:    paddq %xmm2, %xmm0
; X64-SSE2-NEXT:    retq
;
; X64-XOP-LABEL: mul_v2i64_17_65:
; X64-XOP:       # %bb.0:
; X64-XOP-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [17,65]
; X64-XOP-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
; X64-XOP-NEXT:    vpsrlq $32, %xmm0, %xmm0
; X64-XOP-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; X64-XOP-NEXT:    vpsllq $32, %xmm0, %xmm0
; X64-XOP-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
; X64-XOP-NEXT:    retq
;
; X64-AVX2-LABEL: mul_v2i64_17_65:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [17,65]
; X64-AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
; X64-AVX2-NEXT:    vpsrlq $32, %xmm0, %xmm0
; X64-AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT:    vpsllq $32, %xmm0, %xmm0
; X64-AVX2-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
; X64-AVX2-NEXT:    retq
;
; X64-AVX512DQ-LABEL: mul_v2i64_17_65:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX512DQ-NEXT:    retq
  %1 = mul <2 x i64> %a0, <i64 17, i64 65>
  ret <2 x i64> %1
}

define <4 x i32> @mul_v4i32_5_17_33_65(<4 x i32> %a0) nounwind {
; X86-SSE2-LABEL: mul_v4i32_5_17_33_65:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X86-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-SSE2-NEXT:    retl
;
; X86-SSE4-LABEL: mul_v4i32_5_17_33_65:
; X86-SSE4:       # %bb.0:
; X86-SSE4-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE4-NEXT:    retl
;
; X64-SSE2-LABEL: mul_v4i32_5_17_33_65:
; X64-SSE2:       # %bb.0:
; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X64-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X64-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-SSE2-NEXT:    retq
;
; X64-SSE4-LABEL: mul_v4i32_5_17_33_65:
; X64-SSE4:       # %bb.0:
; X64-SSE4-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE4-NEXT:    retq
;
; X64-AVX-LABEL: mul_v4i32_5_17_33_65:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT:    retq
  %1 = mul <4 x i32> %a0, <i32 5, i32 17, i32 33, i32 65>
  ret <4 x i32> %1
}

define <8 x i16> @mul_v8i16_2_3_9_17_33_65_129_257(<8 x i16> %a0) nounwind {
; X86-SSE-LABEL: mul_v8i16_2_3_9_17_33_65_129_257:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [2,3,9,17,33,65,129,257]
; X86-SSE-NEXT:    retl
;
; X64-SSE-LABEL: mul_v8i16_2_3_9_17_33_65_129_257:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2,3,9,17,33,65,129,257]
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: mul_v8i16_2_3_9_17_33_65_129_257:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2,3,9,17,33,65,129,257]
; X64-AVX-NEXT:    retq
  %1 = mul <8 x i16> %a0, <i16 2, i16 3, i16 9, i16 17, i16 33, i16 65, i16 129, i16 257>
  ret <8 x i16> %1
}

define <16 x i8> @mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3(<16 x i8> %a0) nounwind {
; X86-SSE2-LABEL: mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
; X86-SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; X86-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [3,9,17,33,65,129,2,3]
; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; X86-SSE2-NEXT:    pand %xmm2, %xmm1
; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [2,3,9,17,33,65,129,2]
; X86-SSE2-NEXT:    pand %xmm2, %xmm0
; X86-SSE2-NEXT:    packuswb %xmm1, %xmm0
; X86-SSE2-NEXT:    retl
;
; X86-SSE4-LABEL: mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3:
; X86-SSE4:       # %bb.0:
; X86-SSE4-NEXT:    movdqa %xmm0, %xmm1
; X86-SSE4-NEXT:    pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [0,3,0,17,0,65,0,2,0,9,0,33,0,129,0,3]
; X86-SSE4-NEXT:    psllw $8, %xmm1
; X86-SSE4-NEXT:    pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [2,0,9,0,33,0,129,0,3,0,17,0,65,0,2,0]
; X86-SSE4-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE4-NEXT:    por %xmm1, %xmm0
; X86-SSE4-NEXT:    retl
;
; X64-SSE2-LABEL: mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3:
; X64-SSE2:       # %bb.0:
; X64-SSE2-NEXT:    movdqa %xmm0, %xmm1
; X64-SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; X64-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3,9,17,33,65,129,2,3]
; X64-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; X64-SSE2-NEXT:    pand %xmm2, %xmm1
; X64-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2,3,9,17,33,65,129,2]
; X64-SSE2-NEXT:    pand %xmm2, %xmm0
; X64-SSE2-NEXT:    packuswb %xmm1, %xmm0
; X64-SSE2-NEXT:    retq
;
; X64-SSE4-LABEL: mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3:
; X64-SSE4:       # %bb.0:
; X64-SSE4-NEXT:    movdqa %xmm0, %xmm1
; X64-SSE4-NEXT:    pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,3,0,17,0,65,0,2,0,9,0,33,0,129,0,3]
; X64-SSE4-NEXT:    psllw $8, %xmm1
; X64-SSE4-NEXT:    pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2,0,9,0,33,0,129,0,3,0,17,0,65,0,2,0]
; X64-SSE4-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE4-NEXT:    por %xmm1, %xmm0
; X64-SSE4-NEXT:    retq
;
; X64-XOP-LABEL: mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3:
; X64-XOP:       # %bb.0:
; X64-XOP-NEXT:    vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [0,3,0,17,0,65,0,2,0,9,0,33,0,129,0,3]
; X64-XOP-NEXT:    vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2,0,9,0,33,0,129,0,3,0,17,0,65,0,2,0]
; X64-XOP-NEXT:    vpperm {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2],xmm1[2],xmm0[4],xmm1[4],xmm0[6],xmm1[6],xmm0[8],xmm1[8],xmm0[10],xmm1[10],xmm0[12],xmm1[12],xmm0[14],xmm1[14]
; X64-XOP-NEXT:    retq
;
; X64-AVX2-LABEL: mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; X64-AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [2,3,9,17,33,65,129,2,3,9,17,33,65,129,2,3]
; X64-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT:    vzeroupper
; X64-AVX2-NEXT:    retq
;
; X64-AVX512DQ-LABEL: mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; X64-AVX512DQ-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [2,3,9,17,33,65,129,2,3,9,17,33,65,129,2,3]
; X64-AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; X64-AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
; X64-AVX512DQ-NEXT:    vzeroupper
; X64-AVX512DQ-NEXT:    retq
  %1 = mul <16 x i8> %a0, <i8 2, i8 3, i8 9, i8 17, i8 33, i8 65, i8 129, i8 2, i8 3, i8 9, i8 17, i8 33, i8 65, i8 129, i8 2, i8 3>
  ret <16 x i8> %1
}

;
; PowOf2 - 1 (uniform)
;
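; For x * (2^n - 1) the expected lowering is shift-and-subtract:
; (x << n) - x, e.g. psllq $3 + psubq for the *7 cases.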

define <2 x i64> @mul_v2i64_7(<2 x i64> %a0) nounwind {
; SSE-LABEL: mul_v2i64_7:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psllq $3, %xmm1
; SSE-NEXT:    psubq %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; X64-AVX-LABEL: mul_v2i64_7:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vpsllq $3, %xmm0, %xmm1
; X64-AVX-NEXT:    vpsubq %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT:    retq
  %1 = mul <2 x i64> %a0, <i64 7, i64 7>
  ret <2 x i64> %1
}

define <4 x i32> @mul_v4i32_7(<4 x i32> %a0) nounwind {
; SSE-LABEL: mul_v4i32_7:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    pslld $3, %xmm1
; SSE-NEXT:    psubd %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; X64-AVX-LABEL: mul_v4i32_7:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vpslld $3, %xmm0, %xmm1
; X64-AVX-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT:    retq
  %1 = mul <4 x i32> %a0, <i32 7, i32 7, i32 7, i32 7>
  ret <4 x i32> %1
}

define <8 x i16> @mul_v8i16_7(<8 x i16> %a0) nounwind {
; SSE-LABEL: mul_v8i16_7:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psllw $3, %xmm1
; SSE-NEXT:    psubw %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; X64-AVX-LABEL: mul_v8i16_7:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vpsllw $3, %xmm0, %xmm1
; X64-AVX-NEXT:    vpsubw %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT:    retq
  %1 = mul <8 x i16> %a0, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ret <8 x i16> %1
}

define <16 x i8> @mul_v16i8_31(<16 x i8> %a0) nounwind {
; X86-SSE-LABEL: mul_v16i8_31:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movdqa %xmm0, %xmm1
; X86-SSE-NEXT:    psllw $5, %xmm1
; X86-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE-NEXT:    psubb %xmm0, %xmm1
; X86-SSE-NEXT:    movdqa %xmm1, %xmm0
; X86-SSE-NEXT:    retl
;
; X64-SSE-LABEL: mul_v16i8_31:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movdqa %xmm0, %xmm1
; X64-SSE-NEXT:    psllw $5, %xmm1
; X64-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-SSE-NEXT:    psubb %xmm0, %xmm1
; X64-SSE-NEXT:    movdqa %xmm1, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-XOP-LABEL: mul_v16i8_31:
; X64-XOP:       # %bb.0:
; X64-XOP-NEXT:    vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; X64-XOP-NEXT:    vpsubb %xmm0, %xmm1, %xmm0
; X64-XOP-NEXT:    retq
;
; X64-AVX2-LABEL: mul_v16i8_31:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vpsllw $5, %xmm0, %xmm1
; X64-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; X64-AVX2-NEXT:    vpsubb %xmm0, %xmm1, %xmm0
; X64-AVX2-NEXT:    retq
;
; X64-AVX512DQ-LABEL: mul_v16i8_31:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vpsllw $5, %xmm0, %xmm1
; X64-AVX512DQ-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
; X64-AVX512DQ-NEXT:    vpsubb %xmm0, %xmm1, %xmm0
; X64-AVX512DQ-NEXT:    retq
  %1 = mul <16 x i8> %a0, <i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31>
  ret <16 x i8> %1
}

;
; -(PowOf2 - 1) (uniform)
;
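; The negated form folds the negation into the subtraction: x * -(2^n - 1)
; is simply x - (x << n), so no zeroed register is required.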

define <2 x i64> @mul_v2i64_neg7(<2 x i64> %a0) nounwind {
; SSE-LABEL: mul_v2i64_neg7:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psllq $3, %xmm1
; SSE-NEXT:    psubq %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; X64-AVX-LABEL: mul_v2i64_neg7:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vpsllq $3, %xmm0, %xmm1
; X64-AVX-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT:    retq
  %1 = mul <2 x i64> %a0, <i64 -7, i64 -7>
  ret <2 x i64> %1
}

define <4 x i32> @mul_v4i32_neg63(<4 x i32> %a0) nounwind {
; SSE-LABEL: mul_v4i32_neg63:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    pslld $6, %xmm1
; SSE-NEXT:    psubd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; X64-AVX-LABEL: mul_v4i32_neg63:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vpslld $6, %xmm0, %xmm1
; X64-AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT:    retq
  %1 = mul <4 x i32> %a0, <i32 -63, i32 -63, i32 -63, i32 -63>
  ret <4 x i32> %1
}

define <8 x i16> @mul_v8i16_neg31(<8 x i16> %a0) nounwind {
; SSE-LABEL: mul_v8i16_neg31:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psllw $5, %xmm1
; SSE-NEXT:    psubw %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; X64-AVX-LABEL: mul_v8i16_neg31:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vpsllw $5, %xmm0, %xmm1
; X64-AVX-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT:    retq
  %1 = mul <8 x i16> %a0, <i16 -31, i16 -31, i16 -31, i16 -31, i16 -31, i16 -31, i16 -31, i16 -31>
  ret <8 x i16> %1
}

define <16 x i8> @mul_v16i8_neg15(<16 x i8> %a0) nounwind {
; X86-SSE-LABEL: mul_v16i8_neg15:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movdqa %xmm0, %xmm1
; X86-SSE-NEXT:    psllw $4, %xmm1
; X86-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE-NEXT:    psubb %xmm1, %xmm0
; X86-SSE-NEXT:    retl
;
; X64-SSE-LABEL: mul_v16i8_neg15:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movdqa %xmm0, %xmm1
; X64-SSE-NEXT:    psllw $4, %xmm1
; X64-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-SSE-NEXT:    psubb %xmm1, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-XOP-LABEL: mul_v16i8_neg15:
; X64-XOP:       # %bb.0:
; X64-XOP-NEXT:    vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; X64-XOP-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; X64-XOP-NEXT:    retq
;
; X64-AVX2-LABEL: mul_v16i8_neg15:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vpsllw $4, %xmm0, %xmm1
; X64-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; X64-AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT:    retq
;
; X64-AVX512DQ-LABEL: mul_v16i8_neg15:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vpsllw $4, %xmm0, %xmm1
; X64-AVX512DQ-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
; X64-AVX512DQ-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; X64-AVX512DQ-NEXT:    retq
  %1 = mul <16 x i8> %a0, <i8 -15, i8 -15, i8 -15, i8 -15, i8 -15, i8 -15, i8 -15, i8 -15, i8 -15, i8 -15, i8 -15, i8 -15, i8 -15, i8 -15, i8 -15, i8 -15>
  ret <16 x i8> %1
}

;
; PowOf2 - 1 (non-uniform)
;
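; Non-uniform (and negative) v2i64 constants use the generic 64-bit multiply
; expansion built from pmuludq partial products; for the negative cases the
; X86-SSE2 checks materialize the all-ones high halves with pcmpeqd.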
1267
1268define <2 x i64> @mul_v2i64_15_63(<2 x i64> %a0) nounwind {
1269; X86-SSE2-LABEL: mul_v2i64_15_63:
1270; X86-SSE2:       # %bb.0:
1271; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [15,0,63,0]
1272; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
1273; X86-SSE2-NEXT:    pmuludq %xmm1, %xmm2
1274; X86-SSE2-NEXT:    psrlq $32, %xmm0
1275; X86-SSE2-NEXT:    pmuludq %xmm1, %xmm0
1276; X86-SSE2-NEXT:    psllq $32, %xmm0
1277; X86-SSE2-NEXT:    paddq %xmm2, %xmm0
1278; X86-SSE2-NEXT:    retl
1279;
1280; SSE4-LABEL: mul_v2i64_15_63:
1281; SSE4:       # %bb.0:
1282; SSE4-NEXT:    pmovsxbq {{.*#+}} xmm1 = [15,63]
1283; SSE4-NEXT:    movdqa %xmm0, %xmm2
1284; SSE4-NEXT:    pmuludq %xmm1, %xmm2
1285; SSE4-NEXT:    psrlq $32, %xmm0
1286; SSE4-NEXT:    pmuludq %xmm1, %xmm0
1287; SSE4-NEXT:    psllq $32, %xmm0
1288; SSE4-NEXT:    paddq %xmm2, %xmm0
1289; SSE4-NEXT:    ret{{[l|q]}}
1290;
1291; X64-SSE2-LABEL: mul_v2i64_15_63:
1292; X64-SSE2:       # %bb.0:
1293; X64-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [15,63]
1294; X64-SSE2-NEXT:    movdqa %xmm0, %xmm2
1295; X64-SSE2-NEXT:    pmuludq %xmm1, %xmm2
1296; X64-SSE2-NEXT:    psrlq $32, %xmm0
1297; X64-SSE2-NEXT:    pmuludq %xmm1, %xmm0
1298; X64-SSE2-NEXT:    psllq $32, %xmm0
1299; X64-SSE2-NEXT:    paddq %xmm2, %xmm0
1300; X64-SSE2-NEXT:    retq
1301;
1302; X64-XOP-LABEL: mul_v2i64_15_63:
1303; X64-XOP:       # %bb.0:
1304; X64-XOP-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [15,63]
1305; X64-XOP-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
1306; X64-XOP-NEXT:    vpsrlq $32, %xmm0, %xmm0
1307; X64-XOP-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
1308; X64-XOP-NEXT:    vpsllq $32, %xmm0, %xmm0
1309; X64-XOP-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
1310; X64-XOP-NEXT:    retq
1311;
1312; X64-AVX2-LABEL: mul_v2i64_15_63:
1313; X64-AVX2:       # %bb.0:
1314; X64-AVX2-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [15,63]
1315; X64-AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
1316; X64-AVX2-NEXT:    vpsrlq $32, %xmm0, %xmm0
1317; X64-AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
1318; X64-AVX2-NEXT:    vpsllq $32, %xmm0, %xmm0
1319; X64-AVX2-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
1320; X64-AVX2-NEXT:    retq
1321;
1322; X64-AVX512DQ-LABEL: mul_v2i64_15_63:
1323; X64-AVX512DQ:       # %bb.0:
1324; X64-AVX512DQ-NEXT:    vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1325; X64-AVX512DQ-NEXT:    retq
1326  %1 = mul <2 x i64> %a0, <i64 15, i64 63>
1327  ret <2 x i64> %1
1328}
1329
1330define <2 x i64> @mul_v2i64_neg_15_63(<2 x i64> %a0) nounwind {
1331; X86-SSE2-LABEL: mul_v2i64_neg_15_63:
1332; X86-SSE2:       # %bb.0:
1333; X86-SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
1334; X86-SSE2-NEXT:    pmuludq %xmm0, %xmm1
1335; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
1336; X86-SSE2-NEXT:    psrlq $32, %xmm2
1337; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [4294967281,4294967295,4294967233,4294967295]
1338; X86-SSE2-NEXT:    pmuludq %xmm3, %xmm2
1339; X86-SSE2-NEXT:    paddq %xmm1, %xmm2
1340; X86-SSE2-NEXT:    psllq $32, %xmm2
1341; X86-SSE2-NEXT:    pmuludq %xmm3, %xmm0
1342; X86-SSE2-NEXT:    paddq %xmm2, %xmm0
1343; X86-SSE2-NEXT:    retl
1344;
1345; X86-SSE4-LABEL: mul_v2i64_neg_15_63:
1346; X86-SSE4:       # %bb.0:
1347; X86-SSE4-NEXT:    pcmpeqd %xmm1, %xmm1
1348; X86-SSE4-NEXT:    pmuludq %xmm0, %xmm1
1349; X86-SSE4-NEXT:    movdqa %xmm0, %xmm2
1350; X86-SSE4-NEXT:    psrlq $32, %xmm2
1351; X86-SSE4-NEXT:    pmovsxbq {{.*#+}} xmm3 = [18446744073709551601,18446744073709551553]
1352; X86-SSE4-NEXT:    pmuludq %xmm3, %xmm2
1353; X86-SSE4-NEXT:    paddq %xmm1, %xmm2
1354; X86-SSE4-NEXT:    psllq $32, %xmm2
1355; X86-SSE4-NEXT:    pmuludq %xmm3, %xmm0
1356; X86-SSE4-NEXT:    paddq %xmm2, %xmm0
1357; X86-SSE4-NEXT:    retl
1358;
1359; X64-SSE2-LABEL: mul_v2i64_neg_15_63:
1360; X64-SSE2:       # %bb.0:
1361; X64-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [18446744073709551601,18446744073709551553]
1362; X64-SSE2-NEXT:    movdqa %xmm0, %xmm2
1363; X64-SSE2-NEXT:    pmuludq %xmm1, %xmm2
1364; X64-SSE2-NEXT:    movdqa %xmm0, %xmm3
1365; X64-SSE2-NEXT:    psrlq $32, %xmm3
1366; X64-SSE2-NEXT:    pmuludq %xmm1, %xmm3
1367; X64-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1368; X64-SSE2-NEXT:    paddq %xmm3, %xmm0
1369; X64-SSE2-NEXT:    psllq $32, %xmm0
1370; X64-SSE2-NEXT:    paddq %xmm2, %xmm0
1371; X64-SSE2-NEXT:    retq
1372;
1373; X64-SSE4-LABEL: mul_v2i64_neg_15_63:
1374; X64-SSE4:       # %bb.0:
1375; X64-SSE4-NEXT:    pmovsxbq {{.*#+}} xmm1 = [18446744073709551601,18446744073709551553]
1376; X64-SSE4-NEXT:    movdqa %xmm0, %xmm2
1377; X64-SSE4-NEXT:    pmuludq %xmm1, %xmm2
1378; X64-SSE4-NEXT:    movdqa %xmm0, %xmm3
1379; X64-SSE4-NEXT:    psrlq $32, %xmm3
1380; X64-SSE4-NEXT:    pmuludq %xmm1, %xmm3
1381; X64-SSE4-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1382; X64-SSE4-NEXT:    paddq %xmm3, %xmm0
1383; X64-SSE4-NEXT:    psllq $32, %xmm0
1384; X64-SSE4-NEXT:    paddq %xmm2, %xmm0
1385; X64-SSE4-NEXT:    retq
1386;
1387; X64-XOP-LABEL: mul_v2i64_neg_15_63:
1388; X64-XOP:       # %bb.0:
1389; X64-XOP-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [18446744073709551601,18446744073709551553]
1390; X64-XOP-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
1391; X64-XOP-NEXT:    vpsrlq $32, %xmm0, %xmm3
1392; X64-XOP-NEXT:    vpmuludq %xmm1, %xmm3, %xmm1
1393; X64-XOP-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1394; X64-XOP-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
1395; X64-XOP-NEXT:    vpsllq $32, %xmm0, %xmm0
1396; X64-XOP-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
1397; X64-XOP-NEXT:    retq
1398;
1399; X64-AVX2-LABEL: mul_v2i64_neg_15_63:
1400; X64-AVX2:       # %bb.0:
1401; X64-AVX2-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [18446744073709551601,18446744073709551553]
1402; X64-AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
1403; X64-AVX2-NEXT:    vpsrlq $32, %xmm0, %xmm3
1404; X64-AVX2-NEXT:    vpmuludq %xmm1, %xmm3, %xmm1
1405; X64-AVX2-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1406; X64-AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
1407; X64-AVX2-NEXT:    vpsllq $32, %xmm0, %xmm0
1408; X64-AVX2-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
1409; X64-AVX2-NEXT:    retq
1410;
1411; X64-AVX512DQ-LABEL: mul_v2i64_neg_15_63:
1412; X64-AVX512DQ:       # %bb.0:
1413; X64-AVX512DQ-NEXT:    vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1414; X64-AVX512DQ-NEXT:    retq
1415  %1 = mul <2 x i64> %a0, <i64 -15, i64 -63>
1416  ret <2 x i64> %1
1417}
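; For the all-negative constants above, the high 32 bits of every lane are
; all-ones, so the X86-SSE2 lowering materializes hi(c) with a single pcmpeqd
; (the all-ones idiom) rather than loading a second constant-pool value for
; the lo(a)*hi(c) cross term.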
1418
1419define <2 x i64> @mul_v2i64_neg_17_65(<2 x i64> %a0) nounwind {
1420; X86-SSE2-LABEL: mul_v2i64_neg_17_65:
1421; X86-SSE2:       # %bb.0:
1422; X86-SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
1423; X86-SSE2-NEXT:    pmuludq %xmm0, %xmm1
1424; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
1425; X86-SSE2-NEXT:    psrlq $32, %xmm2
1426; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [4294967279,4294967295,4294967231,4294967295]
1427; X86-SSE2-NEXT:    pmuludq %xmm3, %xmm2
1428; X86-SSE2-NEXT:    paddq %xmm1, %xmm2
1429; X86-SSE2-NEXT:    psllq $32, %xmm2
1430; X86-SSE2-NEXT:    pmuludq %xmm3, %xmm0
1431; X86-SSE2-NEXT:    paddq %xmm2, %xmm0
1432; X86-SSE2-NEXT:    retl
1433;
1434; X86-SSE4-LABEL: mul_v2i64_neg_17_65:
1435; X86-SSE4:       # %bb.0:
1436; X86-SSE4-NEXT:    pcmpeqd %xmm1, %xmm1
1437; X86-SSE4-NEXT:    pmuludq %xmm0, %xmm1
1438; X86-SSE4-NEXT:    movdqa %xmm0, %xmm2
1439; X86-SSE4-NEXT:    psrlq $32, %xmm2
1440; X86-SSE4-NEXT:    pmovsxbq {{.*#+}} xmm3 = [18446744073709551599,18446744073709551551]
1441; X86-SSE4-NEXT:    pmuludq %xmm3, %xmm2
1442; X86-SSE4-NEXT:    paddq %xmm1, %xmm2
1443; X86-SSE4-NEXT:    psllq $32, %xmm2
1444; X86-SSE4-NEXT:    pmuludq %xmm3, %xmm0
1445; X86-SSE4-NEXT:    paddq %xmm2, %xmm0
1446; X86-SSE4-NEXT:    retl
1447;
1448; X64-SSE2-LABEL: mul_v2i64_neg_17_65:
1449; X64-SSE2:       # %bb.0:
1450; X64-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [18446744073709551599,18446744073709551551]
1451; X64-SSE2-NEXT:    movdqa %xmm0, %xmm2
1452; X64-SSE2-NEXT:    pmuludq %xmm1, %xmm2
1453; X64-SSE2-NEXT:    movdqa %xmm0, %xmm3
1454; X64-SSE2-NEXT:    psrlq $32, %xmm3
1455; X64-SSE2-NEXT:    pmuludq %xmm1, %xmm3
1456; X64-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1457; X64-SSE2-NEXT:    paddq %xmm3, %xmm0
1458; X64-SSE2-NEXT:    psllq $32, %xmm0
1459; X64-SSE2-NEXT:    paddq %xmm2, %xmm0
1460; X64-SSE2-NEXT:    retq
1461;
1462; X64-SSE4-LABEL: mul_v2i64_neg_17_65:
1463; X64-SSE4:       # %bb.0:
1464; X64-SSE4-NEXT:    pmovsxbq {{.*#+}} xmm1 = [18446744073709551599,18446744073709551551]
1465; X64-SSE4-NEXT:    movdqa %xmm0, %xmm2
1466; X64-SSE4-NEXT:    pmuludq %xmm1, %xmm2
1467; X64-SSE4-NEXT:    movdqa %xmm0, %xmm3
1468; X64-SSE4-NEXT:    psrlq $32, %xmm3
1469; X64-SSE4-NEXT:    pmuludq %xmm1, %xmm3
1470; X64-SSE4-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1471; X64-SSE4-NEXT:    paddq %xmm3, %xmm0
1472; X64-SSE4-NEXT:    psllq $32, %xmm0
1473; X64-SSE4-NEXT:    paddq %xmm2, %xmm0
1474; X64-SSE4-NEXT:    retq
1475;
1476; X64-XOP-LABEL: mul_v2i64_neg_17_65:
1477; X64-XOP:       # %bb.0:
1478; X64-XOP-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [18446744073709551599,18446744073709551551]
1479; X64-XOP-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
1480; X64-XOP-NEXT:    vpsrlq $32, %xmm0, %xmm3
1481; X64-XOP-NEXT:    vpmuludq %xmm1, %xmm3, %xmm1
1482; X64-XOP-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1483; X64-XOP-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
1484; X64-XOP-NEXT:    vpsllq $32, %xmm0, %xmm0
1485; X64-XOP-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
1486; X64-XOP-NEXT:    retq
1487;
1488; X64-AVX2-LABEL: mul_v2i64_neg_17_65:
1489; X64-AVX2:       # %bb.0:
1490; X64-AVX2-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [18446744073709551599,18446744073709551551]
1491; X64-AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
1492; X64-AVX2-NEXT:    vpsrlq $32, %xmm0, %xmm3
1493; X64-AVX2-NEXT:    vpmuludq %xmm1, %xmm3, %xmm1
1494; X64-AVX2-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1495; X64-AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
1496; X64-AVX2-NEXT:    vpsllq $32, %xmm0, %xmm0
1497; X64-AVX2-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
1498; X64-AVX2-NEXT:    retq
1499;
1500; X64-AVX512DQ-LABEL: mul_v2i64_neg_17_65:
1501; X64-AVX512DQ:       # %bb.0:
1502; X64-AVX512DQ-NEXT:    vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1503; X64-AVX512DQ-NEXT:    retq
1504  %1 = mul <2 x i64> %a0, <i64 -17, i64 -65>
1505  ret <2 x i64> %1
1506}
1507
1508define <2 x i64> @mul_v2i64_0_1(<2 x i64> %a0) nounwind {
1509; X86-SSE2-LABEL: mul_v2i64_0_1:
1510; X86-SSE2:       # %bb.0:
1511; X86-SSE2-NEXT:    xorpd %xmm1, %xmm1
1512; X86-SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1513; X86-SSE2-NEXT:    retl
1514;
1515; SSE4-LABEL: mul_v2i64_0_1:
1516; SSE4:       # %bb.0:
1517; SSE4-NEXT:    xorps %xmm1, %xmm1
1518; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
1519; SSE4-NEXT:    ret{{[l|q]}}
1520;
1521; X64-SSE2-LABEL: mul_v2i64_0_1:
1522; X64-SSE2:       # %bb.0:
1523; X64-SSE2-NEXT:    xorps %xmm1, %xmm1
1524; X64-SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1525; X64-SSE2-NEXT:    movaps %xmm1, %xmm0
1526; X64-SSE2-NEXT:    retq
1527;
1528; X64-AVX-LABEL: mul_v2i64_0_1:
1529; X64-AVX:       # %bb.0:
1530; X64-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1531; X64-AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
1532; X64-AVX-NEXT:    retq
1533  %1 = mul <2 x i64> %a0, <i64 0, i64 1>
1534  ret <2 x i64> %1
1535}
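; No multiply is needed for <0,1>: 0*a == 0 and 1*a == a, so the operation
; folds to blending a zero into lane 0 (the blendps/movsd/unpckhpd above).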
1536
1537define <2 x i64> @mul_v2i64_neg_0_1(<2 x i64> %a0) nounwind {
1538; X86-SSE2-LABEL: mul_v2i64_neg_0_1:
1539; X86-SSE2:       # %bb.0:
1540; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,0,4294967295,4294967295]
1541; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
1542; X86-SSE2-NEXT:    pmuludq %xmm1, %xmm2
1543; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
1544; X86-SSE2-NEXT:    psrlq $32, %xmm3
1545; X86-SSE2-NEXT:    pmuludq %xmm1, %xmm3
1546; X86-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
1547; X86-SSE2-NEXT:    paddq %xmm3, %xmm0
1548; X86-SSE2-NEXT:    psllq $32, %xmm0
1549; X86-SSE2-NEXT:    paddq %xmm2, %xmm0
1550; X86-SSE2-NEXT:    retl
1551;
1552; X86-SSE4-LABEL: mul_v2i64_neg_0_1:
1553; X86-SSE4:       # %bb.0:
1554; X86-SSE4-NEXT:    pmovsxbq {{.*#+}} xmm1 = [0,18446744073709551615]
1555; X86-SSE4-NEXT:    movdqa %xmm0, %xmm2
1556; X86-SSE4-NEXT:    pmuludq %xmm1, %xmm2
1557; X86-SSE4-NEXT:    movdqa %xmm0, %xmm3
1558; X86-SSE4-NEXT:    psrlq $32, %xmm3
1559; X86-SSE4-NEXT:    pmuludq %xmm1, %xmm3
1560; X86-SSE4-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
1561; X86-SSE4-NEXT:    paddq %xmm3, %xmm0
1562; X86-SSE4-NEXT:    psllq $32, %xmm0
1563; X86-SSE4-NEXT:    paddq %xmm2, %xmm0
1564; X86-SSE4-NEXT:    retl
1565;
1566; X64-SSE2-LABEL: mul_v2i64_neg_0_1:
1567; X64-SSE2:       # %bb.0:
1568; X64-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255]
1569; X64-SSE2-NEXT:    movdqa %xmm0, %xmm2
1570; X64-SSE2-NEXT:    pmuludq %xmm1, %xmm2
1571; X64-SSE2-NEXT:    movdqa %xmm0, %xmm3
1572; X64-SSE2-NEXT:    psrlq $32, %xmm3
1573; X64-SSE2-NEXT:    pmuludq %xmm1, %xmm3
1574; X64-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1575; X64-SSE2-NEXT:    paddq %xmm3, %xmm0
1576; X64-SSE2-NEXT:    psllq $32, %xmm0
1577; X64-SSE2-NEXT:    paddq %xmm2, %xmm0
1578; X64-SSE2-NEXT:    retq
1579;
1580; X64-SSE4-LABEL: mul_v2i64_neg_0_1:
1581; X64-SSE4:       # %bb.0:
1582; X64-SSE4-NEXT:    pmovsxbq {{.*#+}} xmm1 = [0,18446744073709551615]
1583; X64-SSE4-NEXT:    movdqa %xmm0, %xmm2
1584; X64-SSE4-NEXT:    pmuludq %xmm1, %xmm2
1585; X64-SSE4-NEXT:    movdqa %xmm0, %xmm3
1586; X64-SSE4-NEXT:    psrlq $32, %xmm3
1587; X64-SSE4-NEXT:    pmuludq %xmm1, %xmm3
1588; X64-SSE4-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1589; X64-SSE4-NEXT:    paddq %xmm3, %xmm0
1590; X64-SSE4-NEXT:    psllq $32, %xmm0
1591; X64-SSE4-NEXT:    paddq %xmm2, %xmm0
1592; X64-SSE4-NEXT:    retq
1593;
1594; X64-XOP-LABEL: mul_v2i64_neg_0_1:
1595; X64-XOP:       # %bb.0:
1596; X64-XOP-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [0,18446744073709551615]
1597; X64-XOP-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
1598; X64-XOP-NEXT:    vpsrlq $32, %xmm0, %xmm3
1599; X64-XOP-NEXT:    vpmuludq %xmm1, %xmm3, %xmm1
1600; X64-XOP-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1601; X64-XOP-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
1602; X64-XOP-NEXT:    vpsllq $32, %xmm0, %xmm0
1603; X64-XOP-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
1604; X64-XOP-NEXT:    retq
1605;
1606; X64-AVX2-LABEL: mul_v2i64_neg_0_1:
1607; X64-AVX2:       # %bb.0:
1608; X64-AVX2-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [0,18446744073709551615]
1609; X64-AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
1610; X64-AVX2-NEXT:    vpsrlq $32, %xmm0, %xmm3
1611; X64-AVX2-NEXT:    vpmuludq %xmm1, %xmm3, %xmm1
1612; X64-AVX2-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1613; X64-AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
1614; X64-AVX2-NEXT:    vpsllq $32, %xmm0, %xmm0
1615; X64-AVX2-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
1616; X64-AVX2-NEXT:    retq
1617;
1618; X64-AVX512DQ-LABEL: mul_v2i64_neg_0_1:
1619; X64-AVX512DQ:       # %bb.0:
1620; X64-AVX512DQ-NEXT:    vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1621; X64-AVX512DQ-NEXT:    retq
1622  %1 = mul <2 x i64> %a0, <i64 0, i64 -1>
1623  ret <2 x i64> %1
1624}
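; <0,-1> does not fold the same way: -1*a is just a negation (0 - a), but as
; the checks above show, everything except AVX512DQ (which has a real vpmullq)
; currently falls back to the generic pmuludq expansion.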
1625
1626define <2 x i64> @mul_v2i64_15_neg_63(<2 x i64> %a0) nounwind {
1627; X86-SSE2-LABEL: mul_v2i64_15_neg_63:
1628; X86-SSE2:       # %bb.0:
1629; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [15,0,4294967233,4294967295]
1630; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
1631; X86-SSE2-NEXT:    pmuludq %xmm1, %xmm2
1632; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
1633; X86-SSE2-NEXT:    psrlq $32, %xmm3
1634; X86-SSE2-NEXT:    pmuludq %xmm1, %xmm3
1635; X86-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
1636; X86-SSE2-NEXT:    paddq %xmm3, %xmm0
1637; X86-SSE2-NEXT:    psllq $32, %xmm0
1638; X86-SSE2-NEXT:    paddq %xmm2, %xmm0
1639; X86-SSE2-NEXT:    retl
1640;
1641; X86-SSE4-LABEL: mul_v2i64_15_neg_63:
1642; X86-SSE4:       # %bb.0:
1643; X86-SSE4-NEXT:    pmovsxbq {{.*#+}} xmm1 = [15,18446744073709551553]
1644; X86-SSE4-NEXT:    movdqa %xmm0, %xmm2
1645; X86-SSE4-NEXT:    pmuludq %xmm1, %xmm2
1646; X86-SSE4-NEXT:    movdqa %xmm0, %xmm3
1647; X86-SSE4-NEXT:    psrlq $32, %xmm3
1648; X86-SSE4-NEXT:    pmuludq %xmm1, %xmm3
1649; X86-SSE4-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
1650; X86-SSE4-NEXT:    paddq %xmm3, %xmm0
1651; X86-SSE4-NEXT:    psllq $32, %xmm0
1652; X86-SSE4-NEXT:    paddq %xmm2, %xmm0
1653; X86-SSE4-NEXT:    retl
1654;
1655; X64-SSE2-LABEL: mul_v2i64_15_neg_63:
1656; X64-SSE2:       # %bb.0:
1657; X64-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [15,18446744073709551553]
1658; X64-SSE2-NEXT:    movdqa %xmm0, %xmm2
1659; X64-SSE2-NEXT:    pmuludq %xmm1, %xmm2
1660; X64-SSE2-NEXT:    movdqa %xmm0, %xmm3
1661; X64-SSE2-NEXT:    psrlq $32, %xmm3
1662; X64-SSE2-NEXT:    pmuludq %xmm1, %xmm3
1663; X64-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1664; X64-SSE2-NEXT:    paddq %xmm3, %xmm0
1665; X64-SSE2-NEXT:    psllq $32, %xmm0
1666; X64-SSE2-NEXT:    paddq %xmm2, %xmm0
1667; X64-SSE2-NEXT:    retq
1668;
1669; X64-SSE4-LABEL: mul_v2i64_15_neg_63:
1670; X64-SSE4:       # %bb.0:
1671; X64-SSE4-NEXT:    pmovsxbq {{.*#+}} xmm1 = [15,18446744073709551553]
1672; X64-SSE4-NEXT:    movdqa %xmm0, %xmm2
1673; X64-SSE4-NEXT:    pmuludq %xmm1, %xmm2
1674; X64-SSE4-NEXT:    movdqa %xmm0, %xmm3
1675; X64-SSE4-NEXT:    psrlq $32, %xmm3
1676; X64-SSE4-NEXT:    pmuludq %xmm1, %xmm3
1677; X64-SSE4-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1678; X64-SSE4-NEXT:    paddq %xmm3, %xmm0
1679; X64-SSE4-NEXT:    psllq $32, %xmm0
1680; X64-SSE4-NEXT:    paddq %xmm2, %xmm0
1681; X64-SSE4-NEXT:    retq
1682;
1683; X64-XOP-LABEL: mul_v2i64_15_neg_63:
1684; X64-XOP:       # %bb.0:
1685; X64-XOP-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [15,18446744073709551553]
1686; X64-XOP-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
1687; X64-XOP-NEXT:    vpsrlq $32, %xmm0, %xmm3
1688; X64-XOP-NEXT:    vpmuludq %xmm1, %xmm3, %xmm1
1689; X64-XOP-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1690; X64-XOP-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
1691; X64-XOP-NEXT:    vpsllq $32, %xmm0, %xmm0
1692; X64-XOP-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
1693; X64-XOP-NEXT:    retq
1694;
1695; X64-AVX2-LABEL: mul_v2i64_15_neg_63:
1696; X64-AVX2:       # %bb.0:
1697; X64-AVX2-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [15,18446744073709551553]
1698; X64-AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
1699; X64-AVX2-NEXT:    vpsrlq $32, %xmm0, %xmm3
1700; X64-AVX2-NEXT:    vpmuludq %xmm1, %xmm3, %xmm1
1701; X64-AVX2-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1702; X64-AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
1703; X64-AVX2-NEXT:    vpsllq $32, %xmm0, %xmm0
1704; X64-AVX2-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
1705; X64-AVX2-NEXT:    retq
1706;
1707; X64-AVX512DQ-LABEL: mul_v2i64_15_neg_63:
1708; X64-AVX512DQ:       # %bb.0:
1709; X64-AVX512DQ-NEXT:    vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1710; X64-AVX512DQ-NEXT:    retq
1711  %1 = mul <2 x i64> %a0, <i64 15, i64 -63>
1712  ret <2 x i64> %1
1713}
1714
1715define <4 x i32> @mul_v4i32_0_15_31_7(<4 x i32> %a0) nounwind {
1716; X86-SSE2-LABEL: mul_v4i32_0_15_31_7:
1717; X86-SSE2:       # %bb.0:
1718; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
1719; X86-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
1720; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1721; X86-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
1722; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1723; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1724; X86-SSE2-NEXT:    retl
1725;
1726; X86-SSE4-LABEL: mul_v4i32_0_15_31_7:
1727; X86-SSE4:       # %bb.0:
1728; X86-SSE4-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
1729; X86-SSE4-NEXT:    retl
1730;
1731; X64-SSE2-LABEL: mul_v4i32_0_15_31_7:
1732; X64-SSE2:       # %bb.0:
1733; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
1734; X64-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1735; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1736; X64-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
1737; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1738; X64-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1739; X64-SSE2-NEXT:    retq
1740;
1741; X64-SSE4-LABEL: mul_v4i32_0_15_31_7:
1742; X64-SSE4:       # %bb.0:
1743; X64-SSE4-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1744; X64-SSE4-NEXT:    retq
1745;
1746; X64-AVX-LABEL: mul_v4i32_0_15_31_7:
1747; X64-AVX:       # %bb.0:
1748; X64-AVX-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1749; X64-AVX-NEXT:    retq
1750  %1 = mul <4 x i32> %a0, <i32 0, i32 15, i32 31, i32 7>
1751  ret <4 x i32> %1
1752}
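; SSE4.1 and AVX provide a true v4i32 multiply (pmulld), but SSE2 does not,
; so the SSE2 lowering above multiplies the even and odd dwords with two
; pmuludq ops and re-interleaves the low halves of the results.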
1753
1754define <8 x i16> @mul_v8i16_0_1_7_15_31_63_127_255(<8 x i16> %a0) nounwind {
1755; X86-SSE-LABEL: mul_v8i16_0_1_7_15_31_63_127_255:
1756; X86-SSE:       # %bb.0:
1757; X86-SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,1,7,15,31,63,127,255]
1758; X86-SSE-NEXT:    retl
1759;
1760; X64-SSE-LABEL: mul_v8i16_0_1_7_15_31_63_127_255:
1761; X64-SSE:       # %bb.0:
1762; X64-SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,1,7,15,31,63,127,255]
1763; X64-SSE-NEXT:    retq
1764;
1765; X64-AVX-LABEL: mul_v8i16_0_1_7_15_31_63_127_255:
1766; X64-AVX:       # %bb.0:
1767; X64-AVX-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,1,7,15,31,63,127,255]
1768; X64-AVX-NEXT:    retq
1769  %1 = mul <8 x i16> %a0, <i16 0, i16 1, i16 7, i16 15, i16 31, i16 63, i16 127, i16 255>
1770  ret <8 x i16> %1
1771}
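; i16 is the easy width: pmullw has been available since SSE2, so a single
; multiply against the constant pool suffices on every configuration.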
1772
1773define <16 x i8> @mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127(<16 x i8> %a0) nounwind {
1774; SSE2-LABEL: mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127:
1775; SSE2:       # %bb.0:
1776; SSE2-NEXT:    movdqa %xmm0, %xmm1
1777; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1778; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,1,3,7,15,31,63,127]
1779; SSE2-NEXT:    pmullw %xmm2, %xmm1
1780; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
1781; SSE2-NEXT:    pand %xmm3, %xmm1
1782; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1783; SSE2-NEXT:    pmullw %xmm2, %xmm0
1784; SSE2-NEXT:    pand %xmm3, %xmm0
1785; SSE2-NEXT:    packuswb %xmm1, %xmm0
1786; SSE2-NEXT:    ret{{[l|q]}}
1787;
1788; X86-SSE4-LABEL: mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127:
1789; X86-SSE4:       # %bb.0:
1790; X86-SSE4-NEXT:    movdqa %xmm0, %xmm1
1791; X86-SSE4-NEXT:    pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [0,1,0,7,0,31,0,127,0,1,0,7,0,31,0,127]
1792; X86-SSE4-NEXT:    psllw $8, %xmm1
1793; X86-SSE4-NEXT:    pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,0,3,0,15,0,63,0,0,0,3,0,15,0,63,0]
1794; X86-SSE4-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
1795; X86-SSE4-NEXT:    por %xmm1, %xmm0
1796; X86-SSE4-NEXT:    retl
1797;
1798; X64-SSE4-LABEL: mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127:
1799; X64-SSE4:       # %bb.0:
1800; X64-SSE4-NEXT:    movdqa %xmm0, %xmm1
1801; X64-SSE4-NEXT:    pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,1,0,7,0,31,0,127,0,1,0,7,0,31,0,127]
1802; X64-SSE4-NEXT:    psllw $8, %xmm1
1803; X64-SSE4-NEXT:    pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,3,0,15,0,63,0,0,0,3,0,15,0,63,0]
1804; X64-SSE4-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1805; X64-SSE4-NEXT:    por %xmm1, %xmm0
1806; X64-SSE4-NEXT:    retq
1807;
1808; X64-XOP-LABEL: mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127:
1809; X64-XOP:       # %bb.0:
1810; X64-XOP-NEXT:    vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [0,1,0,7,0,31,0,127,0,1,0,7,0,31,0,127]
1811; X64-XOP-NEXT:    vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,0,3,0,15,0,63,0,0,0,3,0,15,0,63,0]
1812; X64-XOP-NEXT:    vpperm {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2],xmm1[2],xmm0[4],xmm1[4],xmm0[6],xmm1[6],xmm0[8],xmm1[8],xmm0[10],xmm1[10],xmm0[12],xmm1[12],xmm0[14],xmm1[14]
1813; X64-XOP-NEXT:    retq
1814;
1815; X64-AVX2-LABEL: mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127:
1816; X64-AVX2:       # %bb.0:
1817; X64-AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1818; X64-AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [0,1,3,7,15,31,63,127,0,1,3,7,15,31,63,127]
1819; X64-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1820; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1821; X64-AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1822; X64-AVX2-NEXT:    vzeroupper
1823; X64-AVX2-NEXT:    retq
1824;
1825; X64-AVX512DQ-LABEL: mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127:
1826; X64-AVX512DQ:       # %bb.0:
1827; X64-AVX512DQ-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1828; X64-AVX512DQ-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [0,1,3,7,15,31,63,127,0,1,3,7,15,31,63,127]
1829; X64-AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1830; X64-AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
1831; X64-AVX512DQ-NEXT:    vzeroupper
1832; X64-AVX512DQ-NEXT:    retq
1833  %1 = mul <16 x i8> %a0, <i8 0, i8 1, i8 3, i8 7, i8 15, i8 31, i8 63, i8 127, i8 0, i8 1, i8 3, i8 7, i8 15, i8 31, i8 63, i8 127>
1834  ret <16 x i8> %1
1835}
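; There is no pmullb, so the v16i8 multiply is widened. SSE2 unpacks each half
; to i16, multiplies with pmullw, and packs back. SSE4.1 instead splits the
; constant into its even and odd bytes and uses pmaddubsw: with every other
; constant byte zeroed, each 16-bit lane receives a single 8x8->16 product
; (a*c + b*0). AVX2/AVX512DQ zero-extend the full vector to i16 and truncate
; the products back down.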
1836
1837define <2 x i64> @mul_v2i64_68_132(<2 x i64> %x) nounwind {
1838; X86-SSE2-LABEL: mul_v2i64_68_132:
1839; X86-SSE2:       # %bb.0:
1840; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [68,0,132,0]
1841; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
1842; X86-SSE2-NEXT:    pmuludq %xmm1, %xmm2
1843; X86-SSE2-NEXT:    psrlq $32, %xmm0
1844; X86-SSE2-NEXT:    pmuludq %xmm1, %xmm0
1845; X86-SSE2-NEXT:    psllq $32, %xmm0
1846; X86-SSE2-NEXT:    paddq %xmm2, %xmm0
1847; X86-SSE2-NEXT:    retl
1848;
1849; SSE4-LABEL: mul_v2i64_68_132:
1850; SSE4:       # %bb.0:
1851; SSE4-NEXT:    pmovzxbq {{.*#+}} xmm1 = [68,132]
1852; SSE4-NEXT:    movdqa %xmm0, %xmm2
1853; SSE4-NEXT:    pmuludq %xmm1, %xmm2
1854; SSE4-NEXT:    psrlq $32, %xmm0
1855; SSE4-NEXT:    pmuludq %xmm1, %xmm0
1856; SSE4-NEXT:    psllq $32, %xmm0
1857; SSE4-NEXT:    paddq %xmm2, %xmm0
1858; SSE4-NEXT:    ret{{[l|q]}}
1859;
1860; X64-SSE2-LABEL: mul_v2i64_68_132:
1861; X64-SSE2:       # %bb.0:
1862; X64-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [68,132]
1863; X64-SSE2-NEXT:    movdqa %xmm0, %xmm2
1864; X64-SSE2-NEXT:    pmuludq %xmm1, %xmm2
1865; X64-SSE2-NEXT:    psrlq $32, %xmm0
1866; X64-SSE2-NEXT:    pmuludq %xmm1, %xmm0
1867; X64-SSE2-NEXT:    psllq $32, %xmm0
1868; X64-SSE2-NEXT:    paddq %xmm2, %xmm0
1869; X64-SSE2-NEXT:    retq
1870;
1871; X64-XOP-LABEL: mul_v2i64_68_132:
1872; X64-XOP:       # %bb.0:
1873; X64-XOP-NEXT:    vpmovzxbq {{.*#+}} xmm1 = [68,132]
1874; X64-XOP-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
1875; X64-XOP-NEXT:    vpsrlq $32, %xmm0, %xmm0
1876; X64-XOP-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
1877; X64-XOP-NEXT:    vpsllq $32, %xmm0, %xmm0
1878; X64-XOP-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
1879; X64-XOP-NEXT:    retq
1880;
1881; X64-AVX2-LABEL: mul_v2i64_68_132:
1882; X64-AVX2:       # %bb.0:
1883; X64-AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm1 = [68,132]
1884; X64-AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
1885; X64-AVX2-NEXT:    vpsrlq $32, %xmm0, %xmm0
1886; X64-AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
1887; X64-AVX2-NEXT:    vpsllq $32, %xmm0, %xmm0
1888; X64-AVX2-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
1889; X64-AVX2-NEXT:    retq
1890;
1891; X64-AVX512DQ-LABEL: mul_v2i64_68_132:
1892; X64-AVX512DQ:       # %bb.0:
1893; X64-AVX512DQ-NEXT:    vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1894; X64-AVX512DQ-NEXT:    retq
1895  %mul = mul <2 x i64> %x, <i64 68, i64 132>
1896  ret <2 x i64> %mul
1897}
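; On SSE4.1/AVX the constants here are rematerialized from 8-bit storage:
; <68,132> needs pmovzxbq because 132 does not fit in a sign-extended i8,
; while <60,124> below fits and uses pmovsxbq. With the constants' high
; halves zero, only the lo(a)*lo(c) and hi(a)*lo(c) terms survive.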
1898
1899define <2 x i64> @mul_v2i64_60_124(<2 x i64> %x) nounwind {
1900; X86-SSE2-LABEL: mul_v2i64_60_124:
1901; X86-SSE2:       # %bb.0:
1902; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [60,0,124,0]
1903; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
1904; X86-SSE2-NEXT:    pmuludq %xmm1, %xmm2
1905; X86-SSE2-NEXT:    psrlq $32, %xmm0
1906; X86-SSE2-NEXT:    pmuludq %xmm1, %xmm0
1907; X86-SSE2-NEXT:    psllq $32, %xmm0
1908; X86-SSE2-NEXT:    paddq %xmm2, %xmm0
1909; X86-SSE2-NEXT:    retl
1910;
1911; SSE4-LABEL: mul_v2i64_60_124:
1912; SSE4:       # %bb.0:
1913; SSE4-NEXT:    pmovsxbq {{.*#+}} xmm1 = [60,124]
1914; SSE4-NEXT:    movdqa %xmm0, %xmm2
1915; SSE4-NEXT:    pmuludq %xmm1, %xmm2
1916; SSE4-NEXT:    psrlq $32, %xmm0
1917; SSE4-NEXT:    pmuludq %xmm1, %xmm0
1918; SSE4-NEXT:    psllq $32, %xmm0
1919; SSE4-NEXT:    paddq %xmm2, %xmm0
1920; SSE4-NEXT:    ret{{[l|q]}}
1921;
1922; X64-SSE2-LABEL: mul_v2i64_60_124:
1923; X64-SSE2:       # %bb.0:
1924; X64-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [60,124]
1925; X64-SSE2-NEXT:    movdqa %xmm0, %xmm2
1926; X64-SSE2-NEXT:    pmuludq %xmm1, %xmm2
1927; X64-SSE2-NEXT:    psrlq $32, %xmm0
1928; X64-SSE2-NEXT:    pmuludq %xmm1, %xmm0
1929; X64-SSE2-NEXT:    psllq $32, %xmm0
1930; X64-SSE2-NEXT:    paddq %xmm2, %xmm0
1931; X64-SSE2-NEXT:    retq
1932;
1933; X64-XOP-LABEL: mul_v2i64_60_124:
1934; X64-XOP:       # %bb.0:
1935; X64-XOP-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [60,124]
1936; X64-XOP-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
1937; X64-XOP-NEXT:    vpsrlq $32, %xmm0, %xmm0
1938; X64-XOP-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
1939; X64-XOP-NEXT:    vpsllq $32, %xmm0, %xmm0
1940; X64-XOP-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
1941; X64-XOP-NEXT:    retq
1942;
1943; X64-AVX2-LABEL: mul_v2i64_60_124:
1944; X64-AVX2:       # %bb.0:
1945; X64-AVX2-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [60,124]
1946; X64-AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
1947; X64-AVX2-NEXT:    vpsrlq $32, %xmm0, %xmm0
1948; X64-AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
1949; X64-AVX2-NEXT:    vpsllq $32, %xmm0, %xmm0
1950; X64-AVX2-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
1951; X64-AVX2-NEXT:    retq
1952;
1953; X64-AVX512DQ-LABEL: mul_v2i64_60_124:
1954; X64-AVX512DQ:       # %bb.0:
1955; X64-AVX512DQ-NEXT:    vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1956; X64-AVX512DQ-NEXT:    retq
1957  %mul = mul <2 x i64> %x, <i64 60, i64 124>
1958  ret <2 x i64> %mul
1959}
1960
1961; We unfortunately can't always see the zext that lives in the other basic
1962; block, so we may not know that only one pmuludq is needed to compute the
1963; full 64 bits. This sort of issue is more likely to occur when there is a
1964; loop and one of the multiply inputs is loop invariant.
1965define <2 x i64> @mul_v2i64_zext_cross_bb(ptr %in, ptr %y) {
1966; X86-SSE2-LABEL: mul_v2i64_zext_cross_bb:
1967; X86-SSE2:       # %bb.0:
1968; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
1969; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1970; X86-SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
1971; X86-SSE2-NEXT:    pxor %xmm1, %xmm1
1972; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1973; X86-SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
1974; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0,0,1,1]
1975; X86-SSE2-NEXT:    pmuludq %xmm1, %xmm0
1976; X86-SSE2-NEXT:    retl
1977;
1978; X86-SSE4-LABEL: mul_v2i64_zext_cross_bb:
1979; X86-SSE4:       # %bb.0:
1980; X86-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
1981; X86-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1982; X86-SSE4-NEXT:    pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
1983; X86-SSE4-NEXT:    pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
1984; X86-SSE4-NEXT:    pmuludq %xmm1, %xmm0
1985; X86-SSE4-NEXT:    retl
1986;
1987; X64-SSE2-LABEL: mul_v2i64_zext_cross_bb:
1988; X64-SSE2:       # %bb.0:
1989; X64-SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
1990; X64-SSE2-NEXT:    pxor %xmm1, %xmm1
1991; X64-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1992; X64-SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
1993; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
1994; X64-SSE2-NEXT:    pmuludq %xmm1, %xmm0
1995; X64-SSE2-NEXT:    retq
1996;
1997; X64-SSE4-LABEL: mul_v2i64_zext_cross_bb:
1998; X64-SSE4:       # %bb.0:
1999; X64-SSE4-NEXT:    pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
2000; X64-SSE4-NEXT:    pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
2001; X64-SSE4-NEXT:    pmuludq %xmm1, %xmm0
2002; X64-SSE4-NEXT:    retq
2003;
2004; X64-AVX-LABEL: mul_v2i64_zext_cross_bb:
2005; X64-AVX:       # %bb.0:
2006; X64-AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
2007; X64-AVX-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
2008; X64-AVX-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
2009; X64-AVX-NEXT:    retq
2010  %a = load <2 x i32>, ptr %in
2011  %b = zext <2 x i32> %a to <2 x i64>
2012  br label %foo
2013
2014foo:
2015  %c = load <2 x i32>, ptr %y
2016  %d = zext <2 x i32> %c to <2 x i64>
2017  %e = mul <2 x i64> %b, %d
2018  ret <2 x i64> %e
2019}
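; pmuludq multiplies the low 32 bits of each qword lane and produces the full
; 64-bit product, so once both operands are zero-extended in-register a single
; pmuludq per 128-bit vector implements the i64 multiply, as shown above.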
2020
2021define <4 x i64> @mul_v4i64_zext_cross_bb(ptr %in, ptr %y) {
2022; X86-SSE2-LABEL: mul_v4i64_zext_cross_bb:
2023; X86-SSE2:       # %bb.0:
2024; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
2025; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
2026; X86-SSE2-NEXT:    movdqa (%ecx), %xmm0
2027; X86-SSE2-NEXT:    pxor %xmm2, %xmm2
2028; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
2029; X86-SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
2030; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
2031; X86-SSE2-NEXT:    movdqa (%eax), %xmm2
2032; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,1,3,3]
2033; X86-SSE2-NEXT:    pmuludq %xmm3, %xmm1
2034; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3]
2035; X86-SSE2-NEXT:    pmuludq %xmm2, %xmm0
2036; X86-SSE2-NEXT:    retl
2037;
2038; X86-SSE4-LABEL: mul_v4i64_zext_cross_bb:
2039; X86-SSE4:       # %bb.0:
2040; X86-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
2041; X86-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
2042; X86-SSE4-NEXT:    pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
2043; X86-SSE4-NEXT:    pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
2044; X86-SSE4-NEXT:    pmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
2045; X86-SSE4-NEXT:    pmuludq %xmm2, %xmm1
2046; X86-SSE4-NEXT:    pmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
2047; X86-SSE4-NEXT:    pmuludq %xmm2, %xmm0
2048; X86-SSE4-NEXT:    retl
2049;
2050; X64-SSE2-LABEL: mul_v4i64_zext_cross_bb:
2051; X64-SSE2:       # %bb.0:
2052; X64-SSE2-NEXT:    movdqa (%rdi), %xmm0
2053; X64-SSE2-NEXT:    pxor %xmm2, %xmm2
2054; X64-SSE2-NEXT:    movdqa %xmm0, %xmm1
2055; X64-SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
2056; X64-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
2057; X64-SSE2-NEXT:    movdqa (%rsi), %xmm2
2058; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,1,3,3]
2059; X64-SSE2-NEXT:    pmuludq %xmm3, %xmm1
2060; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3]
2061; X64-SSE2-NEXT:    pmuludq %xmm2, %xmm0
2062; X64-SSE2-NEXT:    retq
2063;
2064; X64-SSE4-LABEL: mul_v4i64_zext_cross_bb:
2065; X64-SSE4:       # %bb.0:
2066; X64-SSE4-NEXT:    pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
2067; X64-SSE4-NEXT:    pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
2068; X64-SSE4-NEXT:    pmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
2069; X64-SSE4-NEXT:    pmuludq %xmm2, %xmm1
2070; X64-SSE4-NEXT:    pmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
2071; X64-SSE4-NEXT:    pmuludq %xmm2, %xmm0
2072; X64-SSE4-NEXT:    retq
2073;
2074; X64-XOP-LABEL: mul_v4i64_zext_cross_bb:
2075; X64-XOP:       # %bb.0:
2076; X64-XOP-NEXT:    vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
2077; X64-XOP-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
2078; X64-XOP-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2079; X64-XOP-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
2080; X64-XOP-NEXT:    vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
2081; X64-XOP-NEXT:    vextractf128 $1, %ymm0, %xmm3
2082; X64-XOP-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
2083; X64-XOP-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
2084; X64-XOP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2085; X64-XOP-NEXT:    retq
2086;
2087; X64-AVX2-LABEL: mul_v4i64_zext_cross_bb:
2088; X64-AVX2:       # %bb.0:
2089; X64-AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
2090; X64-AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
2091; X64-AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
2092; X64-AVX2-NEXT:    retq
2093;
2094; X64-AVX512DQ-LABEL: mul_v4i64_zext_cross_bb:
2095; X64-AVX512DQ:       # %bb.0:
2096; X64-AVX512DQ-NEXT:    vpmovzxdq {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
2097; X64-AVX512DQ-NEXT:    vpmovzxdq {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
2098; X64-AVX512DQ-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
2099; X64-AVX512DQ-NEXT:    retq
2100  %a = load <4 x i32>, ptr %in
2101  %b = zext <4 x i32> %a to <4 x i64>
2102  br label %foo
2103
2104foo:
2105  %c = load <4 x i32>, ptr %y
2106  %d = zext <4 x i32> %c to <4 x i64>
2107  %e = mul <4 x i64> %b, %d
2108  ret <4 x i64> %e
2109}
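; With 256-bit registers (AVX2/AVX512DQ) the same approach runs once over a
; ymm: vpmovzxdq both inputs and a single vpmuludq yields all four 64-bit
; products, while the XOP/SSE targets split the work across two xmm halves.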
2110;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
2111; X64-SSE4-FAST: {{.*}}
2112; X64-SSE4-SLOW: {{.*}}
2113