1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
7
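; x86 has no byte-wide vector multiply, so v16i8 multiplies are widened to i16
; lanes (punpck + pmullw + pack on SSE2) or assembled from pmaddubsw on SSE4.1
; and later targets.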
8define <16 x i8> @mul_v16i8c(<16 x i8> %i) nounwind  {
9; SSE2-LABEL: mul_v16i8c:
10; SSE2:       # %bb.0: # %entry
11; SSE2-NEXT:    movdqa %xmm0, %xmm1
12; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
13; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117]
14; SSE2-NEXT:    pmullw %xmm2, %xmm1
15; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
16; SSE2-NEXT:    pand %xmm3, %xmm1
17; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
18; SSE2-NEXT:    pmullw %xmm2, %xmm0
19; SSE2-NEXT:    pand %xmm3, %xmm0
20; SSE2-NEXT:    packuswb %xmm1, %xmm0
21; SSE2-NEXT:    retq
22;
23; SSE41-LABEL: mul_v16i8c:
24; SSE41:       # %bb.0: # %entry
25; SSE41-NEXT:    movdqa %xmm0, %xmm1
26; SSE41-NEXT:    pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117]
27; SSE41-NEXT:    psllw $8, %xmm1
28; SSE41-NEXT:    pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0]
29; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
30; SSE41-NEXT:    por %xmm1, %xmm0
31; SSE41-NEXT:    retq
32;
33; AVX2-LABEL: mul_v16i8c:
34; AVX2:       # %bb.0: # %entry
35; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
36; AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
37; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
38; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
39; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
40; AVX2-NEXT:    vzeroupper
41; AVX2-NEXT:    retq
42;
43; AVX512F-LABEL: mul_v16i8c:
44; AVX512F:       # %bb.0: # %entry
45; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
46; AVX512F-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
47; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
48; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
49; AVX512F-NEXT:    vzeroupper
50; AVX512F-NEXT:    retq
51;
52; AVX512BW-LABEL: mul_v16i8c:
53; AVX512BW:       # %bb.0: # %entry
54; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
55; AVX512BW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
56; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
57; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
58; AVX512BW-NEXT:    vzeroupper
59; AVX512BW-NEXT:    retq
60entry:
61  %A = mul <16 x i8> %i, < i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117 >
62  ret <16 x i8> %A
63}
64
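; i16 lanes have a native multiply (pmullw/vpmullw), so this folds to a single
; instruction on every target.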
65define <8 x i16> @mul_v8i16c(<8 x i16> %i) nounwind  {
66; SSE-LABEL: mul_v8i16c:
67; SSE:       # %bb.0: # %entry
68; SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [117,117,117,117,117,117,117,117]
69; SSE-NEXT:    retq
70;
71; AVX-LABEL: mul_v8i16c:
72; AVX:       # %bb.0: # %entry
73; AVX-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [117,117,117,117,117,117,117,117]
74; AVX-NEXT:    retq
75entry:
76  %A = mul <8 x i16> %i, < i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117 >
77  ret <8 x i16> %A
78}
79
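; SSE2 has no 32-bit element multiply, so v4i32 is synthesized from pmuludq and
; shuffles; SSE4.1 and later use pmulld.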
80define <4 x i32> @mul_v4i32c(<4 x i32> %i) nounwind  {
81; SSE2-LABEL: mul_v4i32c:
82; SSE2:       # %bb.0: # %entry
83; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [117,117,117,117]
84; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
85; SSE2-NEXT:    pmuludq %xmm1, %xmm0
86; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
87; SSE2-NEXT:    pmuludq %xmm1, %xmm2
88; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
89; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
90; SSE2-NEXT:    retq
91;
92; SSE41-LABEL: mul_v4i32c:
93; SSE41:       # %bb.0: # %entry
94; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
95; SSE41-NEXT:    retq
96;
97; AVX-LABEL: mul_v4i32c:
98; AVX:       # %bb.0: # %entry
99; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [117,117,117,117]
100; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
101; AVX-NEXT:    retq
102entry:
103  %A = mul <4 x i32> %i, < i32 117, i32 117, i32 117, i32 117 >
104  ret <4 x i32> %A
105}
106
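; There is no packed 64-bit multiply without AVX-512DQ, so v2i64 is expanded
; into 32-bit partial products (pmuludq) combined with shifts and adds.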
107define <2 x i64> @mul_v2i64c(<2 x i64> %i) nounwind  {
108; SSE2-LABEL: mul_v2i64c:
109; SSE2:       # %bb.0: # %entry
110; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [117,117]
111; SSE2-NEXT:    movdqa %xmm0, %xmm2
112; SSE2-NEXT:    pmuludq %xmm1, %xmm2
113; SSE2-NEXT:    psrlq $32, %xmm0
114; SSE2-NEXT:    pmuludq %xmm1, %xmm0
115; SSE2-NEXT:    psllq $32, %xmm0
116; SSE2-NEXT:    paddq %xmm2, %xmm0
117; SSE2-NEXT:    retq
118;
119; SSE41-LABEL: mul_v2i64c:
120; SSE41:       # %bb.0: # %entry
121; SSE41-NEXT:    pmovsxbq {{.*#+}} xmm1 = [117,117]
122; SSE41-NEXT:    movdqa %xmm0, %xmm2
123; SSE41-NEXT:    pmuludq %xmm1, %xmm2
124; SSE41-NEXT:    psrlq $32, %xmm0
125; SSE41-NEXT:    pmuludq %xmm1, %xmm0
126; SSE41-NEXT:    psllq $32, %xmm0
127; SSE41-NEXT:    paddq %xmm2, %xmm0
128; SSE41-NEXT:    retq
129;
130; AVX-LABEL: mul_v2i64c:
131; AVX:       # %bb.0: # %entry
132; AVX-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [117,117]
133; AVX-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
134; AVX-NEXT:    vpsrlq $32, %xmm0, %xmm0
135; AVX-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
136; AVX-NEXT:    vpsllq $32, %xmm0, %xmm0
137; AVX-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
138; AVX-NEXT:    retq
139entry:
140  %A = mul <2 x i64> %i, < i64 117, i64 117 >
141  ret <2 x i64> %A
142}
143
144define <16 x i8> @mul_v16i8(<16 x i8> %i, <16 x i8> %j) nounwind  {
145; SSE2-LABEL: mul_v16i8:
146; SSE2:       # %bb.0: # %entry
147; SSE2-NEXT:    movdqa %xmm1, %xmm2
148; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
149; SSE2-NEXT:    movdqa %xmm0, %xmm3
150; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
151; SSE2-NEXT:    pmullw %xmm2, %xmm3
152; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
153; SSE2-NEXT:    pand %xmm2, %xmm3
154; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
155; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
156; SSE2-NEXT:    pmullw %xmm1, %xmm0
157; SSE2-NEXT:    pand %xmm2, %xmm0
158; SSE2-NEXT:    packuswb %xmm3, %xmm0
159; SSE2-NEXT:    retq
160;
161; SSE41-LABEL: mul_v16i8:
162; SSE41:       # %bb.0: # %entry
163; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
164; SSE41-NEXT:    movdqa %xmm1, %xmm3
165; SSE41-NEXT:    pand %xmm2, %xmm3
166; SSE41-NEXT:    movdqa %xmm0, %xmm4
167; SSE41-NEXT:    pmaddubsw %xmm3, %xmm4
168; SSE41-NEXT:    pand %xmm2, %xmm4
169; SSE41-NEXT:    pandn %xmm1, %xmm2
170; SSE41-NEXT:    pmaddubsw %xmm2, %xmm0
171; SSE41-NEXT:    psllw $8, %xmm0
172; SSE41-NEXT:    por %xmm4, %xmm0
173; SSE41-NEXT:    retq
174;
175; AVX2-LABEL: mul_v16i8:
176; AVX2:       # %bb.0: # %entry
177; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
178; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
179; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
180; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
181; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
182; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
183; AVX2-NEXT:    vzeroupper
184; AVX2-NEXT:    retq
185;
186; AVX512F-LABEL: mul_v16i8:
187; AVX512F:       # %bb.0: # %entry
188; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
189; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
190; AVX512F-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
191; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
192; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
193; AVX512F-NEXT:    vzeroupper
194; AVX512F-NEXT:    retq
195;
196; AVX512BW-LABEL: mul_v16i8:
197; AVX512BW:       # %bb.0: # %entry
198; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
199; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
200; AVX512BW-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
201; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
202; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
203; AVX512BW-NEXT:    vzeroupper
204; AVX512BW-NEXT:    retq
205entry:
206  %A = mul <16 x i8> %i, %j
207  ret <16 x i8> %A
208}
209
210define <8 x i16> @mul_v8i16(<8 x i16> %i, <8 x i16> %j) nounwind  {
211; SSE-LABEL: mul_v8i16:
212; SSE:       # %bb.0: # %entry
213; SSE-NEXT:    pmullw %xmm1, %xmm0
214; SSE-NEXT:    retq
215;
216; AVX-LABEL: mul_v8i16:
217; AVX:       # %bb.0: # %entry
218; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
219; AVX-NEXT:    retq
220entry:
221  %A = mul <8 x i16> %i, %j
222  ret <8 x i16> %A
223}
224
225define <4 x i32> @mul_v4i32(<4 x i32> %i, <4 x i32> %j) nounwind  {
226; SSE2-LABEL: mul_v4i32:
227; SSE2:       # %bb.0: # %entry
228; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
229; SSE2-NEXT:    pmuludq %xmm1, %xmm0
230; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
231; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
232; SSE2-NEXT:    pmuludq %xmm2, %xmm1
233; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
234; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
235; SSE2-NEXT:    retq
236;
237; SSE41-LABEL: mul_v4i32:
238; SSE41:       # %bb.0: # %entry
239; SSE41-NEXT:    pmulld %xmm1, %xmm0
240; SSE41-NEXT:    retq
241;
242; AVX-LABEL: mul_v4i32:
243; AVX:       # %bb.0: # %entry
244; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
245; AVX-NEXT:    retq
246entry:
247  %A = mul <4 x i32> %i, %j
248  ret <4 x i32> %A
249}
250
251define <2 x i64> @mul_v2i64(<2 x i64> %i, <2 x i64> %j) nounwind  {
252; SSE-LABEL: mul_v2i64:
253; SSE:       # %bb.0: # %entry
254; SSE-NEXT:    movdqa %xmm0, %xmm2
255; SSE-NEXT:    psrlq $32, %xmm2
256; SSE-NEXT:    pmuludq %xmm1, %xmm2
257; SSE-NEXT:    movdqa %xmm1, %xmm3
258; SSE-NEXT:    psrlq $32, %xmm3
259; SSE-NEXT:    pmuludq %xmm0, %xmm3
260; SSE-NEXT:    paddq %xmm2, %xmm3
261; SSE-NEXT:    psllq $32, %xmm3
262; SSE-NEXT:    pmuludq %xmm1, %xmm0
263; SSE-NEXT:    paddq %xmm3, %xmm0
264; SSE-NEXT:    retq
265;
266; AVX-LABEL: mul_v2i64:
267; AVX:       # %bb.0: # %entry
268; AVX-NEXT:    vpsrlq $32, %xmm0, %xmm2
269; AVX-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
270; AVX-NEXT:    vpsrlq $32, %xmm1, %xmm3
271; AVX-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
272; AVX-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
273; AVX-NEXT:    vpsllq $32, %xmm2, %xmm2
274; AVX-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
275; AVX-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
276; AVX-NEXT:    retq
277entry:
278  %A = mul <2 x i64> %i, %j
279  ret <2 x i64> %A
280}
281
282declare void @foo()
283
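; The call below clobbers the vector registers, forcing the inputs to be
; spilled; the reload is expected to fold into the multiply where possible.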
284define <4 x i32> @mul_v4i32spill(<4 x i32> %i, <4 x i32> %j) nounwind  {
285; SSE2-LABEL: mul_v4i32spill:
286; SSE2:       # %bb.0: # %entry
287; SSE2-NEXT:    subq $40, %rsp
288; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
289; SSE2-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
290; SSE2-NEXT:    callq foo@PLT
291; SSE2-NEXT:    movdqa (%rsp), %xmm0 # 16-byte Reload
292; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
293; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
294; SSE2-NEXT:    pmuludq %xmm2, %xmm0
295; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
296; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
297; SSE2-NEXT:    pmuludq %xmm1, %xmm2
298; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
299; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
300; SSE2-NEXT:    addq $40, %rsp
301; SSE2-NEXT:    retq
302;
303; SSE41-LABEL: mul_v4i32spill:
304; SSE41:       # %bb.0: # %entry
305; SSE41-NEXT:    subq $40, %rsp
306; SSE41-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
307; SSE41-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
308; SSE41-NEXT:    callq foo@PLT
309; SSE41-NEXT:    movdqa (%rsp), %xmm0 # 16-byte Reload
310; SSE41-NEXT:    pmulld {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
311; SSE41-NEXT:    addq $40, %rsp
312; SSE41-NEXT:    retq
313;
314; AVX-LABEL: mul_v4i32spill:
315; AVX:       # %bb.0: # %entry
316; AVX-NEXT:    subq $40, %rsp
317; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
318; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
319; AVX-NEXT:    callq foo@PLT
320; AVX-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
321; AVX-NEXT:    vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
322; AVX-NEXT:    addq $40, %rsp
323; AVX-NEXT:    retq
324entry:
325  ; Use a call to force spills.
326  call void @foo()
327  %A = mul <4 x i32> %i, %j
328  ret <4 x i32> %A
329}
330
331define <2 x i64> @mul_v2i64spill(<2 x i64> %i, <2 x i64> %j) nounwind  {
332; SSE-LABEL: mul_v2i64spill:
333; SSE:       # %bb.0: # %entry
334; SSE-NEXT:    subq $40, %rsp
335; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
336; SSE-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
337; SSE-NEXT:    callq foo@PLT
338; SSE-NEXT:    movdqa (%rsp), %xmm0 # 16-byte Reload
339; SSE-NEXT:    movdqa %xmm0, %xmm2
340; SSE-NEXT:    psrlq $32, %xmm2
341; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
342; SSE-NEXT:    pmuludq %xmm3, %xmm2
343; SSE-NEXT:    movdqa %xmm3, %xmm1
344; SSE-NEXT:    psrlq $32, %xmm1
345; SSE-NEXT:    pmuludq %xmm0, %xmm1
346; SSE-NEXT:    paddq %xmm2, %xmm1
347; SSE-NEXT:    psllq $32, %xmm1
348; SSE-NEXT:    pmuludq %xmm3, %xmm0
349; SSE-NEXT:    paddq %xmm1, %xmm0
350; SSE-NEXT:    addq $40, %rsp
351; SSE-NEXT:    retq
352;
353; AVX-LABEL: mul_v2i64spill:
354; AVX:       # %bb.0: # %entry
355; AVX-NEXT:    subq $40, %rsp
356; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
357; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
358; AVX-NEXT:    callq foo@PLT
359; AVX-NEXT:    vmovdqa (%rsp), %xmm3 # 16-byte Reload
360; AVX-NEXT:    vpsrlq $32, %xmm3, %xmm0
361; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
362; AVX-NEXT:    vpmuludq %xmm2, %xmm0, %xmm0
363; AVX-NEXT:    vpsrlq $32, %xmm2, %xmm1
364; AVX-NEXT:    vpmuludq %xmm1, %xmm3, %xmm1
365; AVX-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
366; AVX-NEXT:    vpsllq $32, %xmm0, %xmm0
367; AVX-NEXT:    vpmuludq %xmm2, %xmm3, %xmm1
368; AVX-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
369; AVX-NEXT:    addq $40, %rsp
370; AVX-NEXT:    retq
371entry:
372  ; Use a call to force spills.
373  call void @foo()
374  %A = mul <2 x i64> %i, %j
375  ret <2 x i64> %A
376}
377
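; 256-bit cases: pre-AVX targets split the vectors into 128-bit halves, while
; AVX2 and AVX-512 operate on whole ymm/zmm registers.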
378define <32 x i8> @mul_v32i8c(<32 x i8> %i) nounwind  {
379; SSE2-LABEL: mul_v32i8c:
380; SSE2:       # %bb.0: # %entry
381; SSE2-NEXT:    movdqa %xmm0, %xmm2
382; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
383; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [117,117,117,117,117,117,117,117]
384; SSE2-NEXT:    pmullw %xmm3, %xmm2
385; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
386; SSE2-NEXT:    pand %xmm4, %xmm2
387; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
388; SSE2-NEXT:    pmullw %xmm3, %xmm0
389; SSE2-NEXT:    pand %xmm4, %xmm0
390; SSE2-NEXT:    packuswb %xmm2, %xmm0
391; SSE2-NEXT:    movdqa %xmm1, %xmm2
392; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
393; SSE2-NEXT:    pmullw %xmm3, %xmm2
394; SSE2-NEXT:    pand %xmm4, %xmm2
395; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
396; SSE2-NEXT:    pmullw %xmm3, %xmm1
397; SSE2-NEXT:    pand %xmm4, %xmm1
398; SSE2-NEXT:    packuswb %xmm2, %xmm1
399; SSE2-NEXT:    retq
400;
401; SSE41-LABEL: mul_v32i8c:
402; SSE41:       # %bb.0: # %entry
403; SSE41-NEXT:    pmovsxbw {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117]
404; SSE41-NEXT:    movdqa %xmm0, %xmm3
405; SSE41-NEXT:    pmaddubsw %xmm2, %xmm3
406; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
407; SSE41-NEXT:    pand %xmm4, %xmm3
408; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117]
409; SSE41-NEXT:    pmaddubsw %xmm5, %xmm0
410; SSE41-NEXT:    psllw $8, %xmm0
411; SSE41-NEXT:    por %xmm3, %xmm0
412; SSE41-NEXT:    movdqa %xmm1, %xmm3
413; SSE41-NEXT:    pmaddubsw %xmm2, %xmm3
414; SSE41-NEXT:    pand %xmm4, %xmm3
415; SSE41-NEXT:    pmaddubsw %xmm5, %xmm1
416; SSE41-NEXT:    psllw $8, %xmm1
417; SSE41-NEXT:    por %xmm3, %xmm1
418; SSE41-NEXT:    retq
419;
420; AVX2-LABEL: mul_v32i8c:
421; AVX2:       # %bb.0: # %entry
422; AVX2-NEXT:    vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117]
423; AVX2-NEXT:    vpsllw $8, %ymm1, %ymm1
424; AVX2-NEXT:    vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0]
425; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
426; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
427; AVX2-NEXT:    retq
428;
429; AVX512F-LABEL: mul_v32i8c:
430; AVX512F:       # %bb.0: # %entry
431; AVX512F-NEXT:    vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117]
432; AVX512F-NEXT:    vpsllw $8, %ymm1, %ymm1
433; AVX512F-NEXT:    vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0]
434; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
435; AVX512F-NEXT:    vpor %ymm1, %ymm0, %ymm0
436; AVX512F-NEXT:    retq
437;
438; AVX512BW-LABEL: mul_v32i8c:
439; AVX512BW:       # %bb.0: # %entry
440; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
441; AVX512BW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
442; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
443; AVX512BW-NEXT:    retq
444entry:
445  %A = mul <32 x i8> %i, < i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117 >
446  ret <32 x i8> %A
447}
448
449define <16 x i16> @mul_v16i16c(<16 x i16> %i) nounwind  {
450; SSE2-LABEL: mul_v16i16c:
451; SSE2:       # %bb.0: # %entry
452; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117]
453; SSE2-NEXT:    pmullw %xmm2, %xmm0
454; SSE2-NEXT:    pmullw %xmm2, %xmm1
455; SSE2-NEXT:    retq
456;
457; SSE41-LABEL: mul_v16i16c:
458; SSE41:       # %bb.0: # %entry
459; SSE41-NEXT:    pmovsxbw {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117]
460; SSE41-NEXT:    pmullw %xmm2, %xmm0
461; SSE41-NEXT:    pmullw %xmm2, %xmm1
462; SSE41-NEXT:    retq
463;
464; AVX-LABEL: mul_v16i16c:
465; AVX:       # %bb.0: # %entry
466; AVX-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
467; AVX-NEXT:    retq
468entry:
469  %A = mul <16 x i16> %i, < i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117 >
470  ret <16 x i16> %A
471}
472
473define <8 x i32> @mul_v8i32c(<8 x i32> %i) nounwind  {
474; SSE2-LABEL: mul_v8i32c:
475; SSE2:       # %bb.0: # %entry
476; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [117,117,117,117]
477; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
478; SSE2-NEXT:    pmuludq %xmm2, %xmm0
479; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
480; SSE2-NEXT:    pmuludq %xmm2, %xmm3
481; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
482; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
483; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
484; SSE2-NEXT:    pmuludq %xmm2, %xmm1
485; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
486; SSE2-NEXT:    pmuludq %xmm2, %xmm3
487; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
488; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
489; SSE2-NEXT:    retq
490;
491; SSE41-LABEL: mul_v8i32c:
492; SSE41:       # %bb.0: # %entry
493; SSE41-NEXT:    pmovsxbd {{.*#+}} xmm2 = [117,117,117,117]
494; SSE41-NEXT:    pmulld %xmm2, %xmm0
495; SSE41-NEXT:    pmulld %xmm2, %xmm1
496; SSE41-NEXT:    retq
497;
498; AVX-LABEL: mul_v8i32c:
499; AVX:       # %bb.0: # %entry
500; AVX-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [117,117,117,117,117,117,117,117]
501; AVX-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
502; AVX-NEXT:    retq
503entry:
504  %A = mul <8 x i32> %i, < i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117 >
505  ret <8 x i32> %A
506}
507
508define <4 x i64> @mul_v4i64c(<4 x i64> %i) nounwind  {
509; SSE2-LABEL: mul_v4i64c:
510; SSE2:       # %bb.0: # %entry
511; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [117,117]
512; SSE2-NEXT:    movdqa %xmm0, %xmm3
513; SSE2-NEXT:    pmuludq %xmm2, %xmm3
514; SSE2-NEXT:    psrlq $32, %xmm0
515; SSE2-NEXT:    pmuludq %xmm2, %xmm0
516; SSE2-NEXT:    psllq $32, %xmm0
517; SSE2-NEXT:    paddq %xmm3, %xmm0
518; SSE2-NEXT:    movdqa %xmm1, %xmm3
519; SSE2-NEXT:    pmuludq %xmm2, %xmm3
520; SSE2-NEXT:    psrlq $32, %xmm1
521; SSE2-NEXT:    pmuludq %xmm2, %xmm1
522; SSE2-NEXT:    psllq $32, %xmm1
523; SSE2-NEXT:    paddq %xmm3, %xmm1
524; SSE2-NEXT:    retq
525;
526; SSE41-LABEL: mul_v4i64c:
527; SSE41:       # %bb.0: # %entry
528; SSE41-NEXT:    pmovsxbq {{.*#+}} xmm2 = [117,117]
529; SSE41-NEXT:    movdqa %xmm0, %xmm3
530; SSE41-NEXT:    pmuludq %xmm2, %xmm3
531; SSE41-NEXT:    psrlq $32, %xmm0
532; SSE41-NEXT:    pmuludq %xmm2, %xmm0
533; SSE41-NEXT:    psllq $32, %xmm0
534; SSE41-NEXT:    paddq %xmm3, %xmm0
535; SSE41-NEXT:    movdqa %xmm1, %xmm3
536; SSE41-NEXT:    pmuludq %xmm2, %xmm3
537; SSE41-NEXT:    psrlq $32, %xmm1
538; SSE41-NEXT:    pmuludq %xmm2, %xmm1
539; SSE41-NEXT:    psllq $32, %xmm1
540; SSE41-NEXT:    paddq %xmm3, %xmm1
541; SSE41-NEXT:    retq
542;
543; AVX-LABEL: mul_v4i64c:
544; AVX:       # %bb.0: # %entry
545; AVX-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [117,117,117,117]
546; AVX-NEXT:    vpmuludq %ymm1, %ymm0, %ymm2
547; AVX-NEXT:    vpsrlq $32, %ymm0, %ymm0
548; AVX-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
549; AVX-NEXT:    vpsllq $32, %ymm0, %ymm0
550; AVX-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
551; AVX-NEXT:    retq
552entry:
553  %A = mul <4 x i64> %i, < i64 117, i64 117, i64 117, i64 117 >
554  ret <4 x i64> %A
555}
556
557define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind  {
558; SSE2-LABEL: mul_v32i8:
559; SSE2:       # %bb.0: # %entry
560; SSE2-NEXT:    movdqa %xmm2, %xmm4
561; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
562; SSE2-NEXT:    movdqa %xmm0, %xmm5
563; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
564; SSE2-NEXT:    pmullw %xmm4, %xmm5
565; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
566; SSE2-NEXT:    pand %xmm4, %xmm5
567; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
568; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
569; SSE2-NEXT:    pmullw %xmm2, %xmm0
570; SSE2-NEXT:    pand %xmm4, %xmm0
571; SSE2-NEXT:    packuswb %xmm5, %xmm0
572; SSE2-NEXT:    movdqa %xmm3, %xmm2
573; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
574; SSE2-NEXT:    movdqa %xmm1, %xmm5
575; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
576; SSE2-NEXT:    pmullw %xmm2, %xmm5
577; SSE2-NEXT:    pand %xmm4, %xmm5
578; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
579; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
580; SSE2-NEXT:    pmullw %xmm3, %xmm1
581; SSE2-NEXT:    pand %xmm4, %xmm1
582; SSE2-NEXT:    packuswb %xmm5, %xmm1
583; SSE2-NEXT:    retq
584;
585; SSE41-LABEL: mul_v32i8:
586; SSE41:       # %bb.0: # %entry
587; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
588; SSE41-NEXT:    movdqa %xmm4, %xmm5
589; SSE41-NEXT:    pandn %xmm2, %xmm5
590; SSE41-NEXT:    pand %xmm4, %xmm2
591; SSE41-NEXT:    movdqa %xmm0, %xmm6
592; SSE41-NEXT:    pmaddubsw %xmm2, %xmm6
593; SSE41-NEXT:    pand %xmm4, %xmm6
594; SSE41-NEXT:    pmaddubsw %xmm5, %xmm0
595; SSE41-NEXT:    psllw $8, %xmm0
596; SSE41-NEXT:    por %xmm6, %xmm0
597; SSE41-NEXT:    movdqa %xmm3, %xmm2
598; SSE41-NEXT:    pand %xmm4, %xmm2
599; SSE41-NEXT:    movdqa %xmm1, %xmm5
600; SSE41-NEXT:    pmaddubsw %xmm2, %xmm5
601; SSE41-NEXT:    pand %xmm4, %xmm5
602; SSE41-NEXT:    pandn %xmm3, %xmm4
603; SSE41-NEXT:    pmaddubsw %xmm4, %xmm1
604; SSE41-NEXT:    psllw $8, %xmm1
605; SSE41-NEXT:    por %xmm5, %xmm1
606; SSE41-NEXT:    retq
607;
608; AVX2-LABEL: mul_v32i8:
609; AVX2:       # %bb.0: # %entry
610; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
611; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm3
612; AVX2-NEXT:    vpmaddubsw %ymm3, %ymm0, %ymm3
613; AVX2-NEXT:    vpand %ymm2, %ymm3, %ymm3
614; AVX2-NEXT:    vpandn %ymm1, %ymm2, %ymm1
615; AVX2-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm0
616; AVX2-NEXT:    vpsllw $8, %ymm0, %ymm0
617; AVX2-NEXT:    vpor %ymm0, %ymm3, %ymm0
618; AVX2-NEXT:    retq
619;
620; AVX512F-LABEL: mul_v32i8:
621; AVX512F:       # %bb.0: # %entry
622; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
623; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
624; AVX512F-NEXT:    vpmaddubsw %ymm3, %ymm0, %ymm3
625; AVX512F-NEXT:    vpand %ymm2, %ymm3, %ymm3
626; AVX512F-NEXT:    vpandn %ymm1, %ymm2, %ymm1
627; AVX512F-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm0
628; AVX512F-NEXT:    vpsllw $8, %ymm0, %ymm0
629; AVX512F-NEXT:    vpor %ymm0, %ymm3, %ymm0
630; AVX512F-NEXT:    retq
631;
632; AVX512BW-LABEL: mul_v32i8:
633; AVX512BW:       # %bb.0: # %entry
634; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
635; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
636; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
637; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
638; AVX512BW-NEXT:    retq
639entry:
640  %A = mul <32 x i8> %i, %j
641  ret <32 x i8> %A
642}
643
644define <16 x i16> @mul_v16i16(<16 x i16> %i, <16 x i16> %j) nounwind  {
645; SSE-LABEL: mul_v16i16:
646; SSE:       # %bb.0: # %entry
647; SSE-NEXT:    pmullw %xmm2, %xmm0
648; SSE-NEXT:    pmullw %xmm3, %xmm1
649; SSE-NEXT:    retq
650;
651; AVX-LABEL: mul_v16i16:
652; AVX:       # %bb.0: # %entry
653; AVX-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
654; AVX-NEXT:    retq
655entry:
656  %A = mul <16 x i16> %i, %j
657  ret <16 x i16> %A
658}
659
660define <8 x i32> @mul_v8i32(<8 x i32> %i, <8 x i32> %j) nounwind  {
661; SSE2-LABEL: mul_v8i32:
662; SSE2:       # %bb.0: # %entry
663; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
664; SSE2-NEXT:    pmuludq %xmm2, %xmm0
665; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
666; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
667; SSE2-NEXT:    pmuludq %xmm4, %xmm2
668; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
669; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
670; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
671; SSE2-NEXT:    pmuludq %xmm3, %xmm1
672; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
673; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
674; SSE2-NEXT:    pmuludq %xmm2, %xmm3
675; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
676; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
677; SSE2-NEXT:    retq
678;
679; SSE41-LABEL: mul_v8i32:
680; SSE41:       # %bb.0: # %entry
681; SSE41-NEXT:    pmulld %xmm2, %xmm0
682; SSE41-NEXT:    pmulld %xmm3, %xmm1
683; SSE41-NEXT:    retq
684;
685; AVX-LABEL: mul_v8i32:
686; AVX:       # %bb.0: # %entry
687; AVX-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
688; AVX-NEXT:    retq
689entry:
690  %A = mul <8 x i32> %i, %j
691  ret <8 x i32> %A
692}
693
694define <4 x i64> @mul_v4i64(<4 x i64> %i, <4 x i64> %j) nounwind  {
695; SSE-LABEL: mul_v4i64:
696; SSE:       # %bb.0: # %entry
697; SSE-NEXT:    movdqa %xmm0, %xmm4
698; SSE-NEXT:    psrlq $32, %xmm4
699; SSE-NEXT:    pmuludq %xmm2, %xmm4
700; SSE-NEXT:    movdqa %xmm2, %xmm5
701; SSE-NEXT:    psrlq $32, %xmm5
702; SSE-NEXT:    pmuludq %xmm0, %xmm5
703; SSE-NEXT:    paddq %xmm4, %xmm5
704; SSE-NEXT:    psllq $32, %xmm5
705; SSE-NEXT:    pmuludq %xmm2, %xmm0
706; SSE-NEXT:    paddq %xmm5, %xmm0
707; SSE-NEXT:    movdqa %xmm1, %xmm2
708; SSE-NEXT:    psrlq $32, %xmm2
709; SSE-NEXT:    pmuludq %xmm3, %xmm2
710; SSE-NEXT:    movdqa %xmm3, %xmm4
711; SSE-NEXT:    psrlq $32, %xmm4
712; SSE-NEXT:    pmuludq %xmm1, %xmm4
713; SSE-NEXT:    paddq %xmm2, %xmm4
714; SSE-NEXT:    psllq $32, %xmm4
715; SSE-NEXT:    pmuludq %xmm3, %xmm1
716; SSE-NEXT:    paddq %xmm4, %xmm1
717; SSE-NEXT:    retq
718;
719; AVX-LABEL: mul_v4i64:
720; AVX:       # %bb.0: # %entry
721; AVX-NEXT:    vpsrlq $32, %ymm0, %ymm2
722; AVX-NEXT:    vpmuludq %ymm1, %ymm2, %ymm2
723; AVX-NEXT:    vpsrlq $32, %ymm1, %ymm3
724; AVX-NEXT:    vpmuludq %ymm3, %ymm0, %ymm3
725; AVX-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
726; AVX-NEXT:    vpsllq $32, %ymm2, %ymm2
727; AVX-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
728; AVX-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
729; AVX-NEXT:    retq
730entry:
731  %A = mul <4 x i64> %i, %j
732  ret <4 x i64> %A
733}
734
735define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind  {
736; SSE2-LABEL: mul_v64i8c:
737; SSE2:       # %bb.0: # %entry
738; SSE2-NEXT:    movdqa %xmm0, %xmm6
739; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
740; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [117,117,117,117,117,117,117,117]
741; SSE2-NEXT:    pmullw %xmm4, %xmm6
742; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
743; SSE2-NEXT:    pand %xmm5, %xmm6
744; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
745; SSE2-NEXT:    pmullw %xmm4, %xmm0
746; SSE2-NEXT:    pand %xmm5, %xmm0
747; SSE2-NEXT:    packuswb %xmm6, %xmm0
748; SSE2-NEXT:    movdqa %xmm1, %xmm6
749; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
750; SSE2-NEXT:    pmullw %xmm4, %xmm6
751; SSE2-NEXT:    pand %xmm5, %xmm6
752; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
753; SSE2-NEXT:    pmullw %xmm4, %xmm1
754; SSE2-NEXT:    pand %xmm5, %xmm1
755; SSE2-NEXT:    packuswb %xmm6, %xmm1
756; SSE2-NEXT:    movdqa %xmm2, %xmm6
757; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
758; SSE2-NEXT:    pmullw %xmm4, %xmm6
759; SSE2-NEXT:    pand %xmm5, %xmm6
760; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
761; SSE2-NEXT:    pmullw %xmm4, %xmm2
762; SSE2-NEXT:    pand %xmm5, %xmm2
763; SSE2-NEXT:    packuswb %xmm6, %xmm2
764; SSE2-NEXT:    movdqa %xmm3, %xmm6
765; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
766; SSE2-NEXT:    pmullw %xmm4, %xmm6
767; SSE2-NEXT:    pand %xmm5, %xmm6
768; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
769; SSE2-NEXT:    pmullw %xmm4, %xmm3
770; SSE2-NEXT:    pand %xmm5, %xmm3
771; SSE2-NEXT:    packuswb %xmm6, %xmm3
772; SSE2-NEXT:    retq
773;
774; SSE41-LABEL: mul_v64i8c:
775; SSE41:       # %bb.0: # %entry
776; SSE41-NEXT:    pmovsxbw {{.*#+}} xmm4 = [117,117,117,117,117,117,117,117]
777; SSE41-NEXT:    movdqa %xmm0, %xmm6
778; SSE41-NEXT:    pmaddubsw %xmm4, %xmm6
779; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
780; SSE41-NEXT:    pand %xmm5, %xmm6
781; SSE41-NEXT:    movdqa {{.*#+}} xmm7 = [0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117]
782; SSE41-NEXT:    pmaddubsw %xmm7, %xmm0
783; SSE41-NEXT:    psllw $8, %xmm0
784; SSE41-NEXT:    por %xmm6, %xmm0
785; SSE41-NEXT:    movdqa %xmm1, %xmm6
786; SSE41-NEXT:    pmaddubsw %xmm4, %xmm6
787; SSE41-NEXT:    pand %xmm5, %xmm6
788; SSE41-NEXT:    pmaddubsw %xmm7, %xmm1
789; SSE41-NEXT:    psllw $8, %xmm1
790; SSE41-NEXT:    por %xmm6, %xmm1
791; SSE41-NEXT:    movdqa %xmm2, %xmm6
792; SSE41-NEXT:    pmaddubsw %xmm4, %xmm6
793; SSE41-NEXT:    pand %xmm5, %xmm6
794; SSE41-NEXT:    pmaddubsw %xmm7, %xmm2
795; SSE41-NEXT:    psllw $8, %xmm2
796; SSE41-NEXT:    por %xmm6, %xmm2
797; SSE41-NEXT:    movdqa %xmm3, %xmm6
798; SSE41-NEXT:    pmaddubsw %xmm4, %xmm6
799; SSE41-NEXT:    pand %xmm5, %xmm6
800; SSE41-NEXT:    pmaddubsw %xmm7, %xmm3
801; SSE41-NEXT:    psllw $8, %xmm3
802; SSE41-NEXT:    por %xmm6, %xmm3
803; SSE41-NEXT:    retq
804;
805; AVX2-LABEL: mul_v64i8c:
806; AVX2:       # %bb.0: # %entry
807; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0]
808; AVX2-NEXT:    vpmaddubsw %ymm2, %ymm0, %ymm3
809; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
810; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
811; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm5 = [0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117]
812; AVX2-NEXT:    vpmaddubsw %ymm5, %ymm0, %ymm0
813; AVX2-NEXT:    vpsllw $8, %ymm0, %ymm0
814; AVX2-NEXT:    vpor %ymm0, %ymm3, %ymm0
815; AVX2-NEXT:    vpmaddubsw %ymm2, %ymm1, %ymm2
816; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
817; AVX2-NEXT:    vpmaddubsw %ymm5, %ymm1, %ymm1
818; AVX2-NEXT:    vpsllw $8, %ymm1, %ymm1
819; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
820; AVX2-NEXT:    retq
821;
822; AVX512F-LABEL: mul_v64i8c:
823; AVX512F:       # %bb.0: # %entry
824; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
825; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0]
826; AVX512F-NEXT:    vpmaddubsw %ymm2, %ymm1, %ymm3
827; AVX512F-NEXT:    vpmaddubsw %ymm2, %ymm0, %ymm2
828; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
829; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm3 = [0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117]
830; AVX512F-NEXT:    vpmaddubsw %ymm3, %ymm0, %ymm0
831; AVX512F-NEXT:    vpsllw $8, %ymm0, %ymm0
832; AVX512F-NEXT:    vpmaddubsw %ymm3, %ymm1, %ymm1
833; AVX512F-NEXT:    vpsllw $8, %ymm1, %ymm1
834; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
835; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 | (zmm2 & mem)
836; AVX512F-NEXT:    retq
837;
838; AVX512BW-LABEL: mul_v64i8c:
839; AVX512BW:       # %bb.0: # %entry
840; AVX512BW-NEXT:    vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 # [117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0]
841; AVX512BW-NEXT:    vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117]
842; AVX512BW-NEXT:    vpsllw $8, %zmm0, %zmm0
843; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 | (zmm1 & mem)
844; AVX512BW-NEXT:    retq
845entry:
846  %A = mul <64 x i8> %i, < i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117 >
847  ret <64 x i8> %A
848}
849
850define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind  {
851; SSE2-LABEL: mul_v64i8:
852; SSE2:       # %bb.0: # %entry
853; SSE2-NEXT:    movdqa %xmm4, %xmm8
854; SSE2-NEXT:    punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
855; SSE2-NEXT:    movdqa %xmm0, %xmm9
856; SSE2-NEXT:    punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
857; SSE2-NEXT:    pmullw %xmm8, %xmm9
858; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
859; SSE2-NEXT:    pand %xmm8, %xmm9
860; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
861; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
862; SSE2-NEXT:    pmullw %xmm4, %xmm0
863; SSE2-NEXT:    pand %xmm8, %xmm0
864; SSE2-NEXT:    packuswb %xmm9, %xmm0
865; SSE2-NEXT:    movdqa %xmm5, %xmm4
866; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
867; SSE2-NEXT:    movdqa %xmm1, %xmm9
868; SSE2-NEXT:    punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
869; SSE2-NEXT:    pmullw %xmm4, %xmm9
870; SSE2-NEXT:    pand %xmm8, %xmm9
871; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
872; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
873; SSE2-NEXT:    pmullw %xmm5, %xmm1
874; SSE2-NEXT:    pand %xmm8, %xmm1
875; SSE2-NEXT:    packuswb %xmm9, %xmm1
876; SSE2-NEXT:    movdqa %xmm6, %xmm4
877; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
878; SSE2-NEXT:    movdqa %xmm2, %xmm5
879; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
880; SSE2-NEXT:    pmullw %xmm4, %xmm5
881; SSE2-NEXT:    pand %xmm8, %xmm5
882; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
883; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
884; SSE2-NEXT:    pmullw %xmm6, %xmm2
885; SSE2-NEXT:    pand %xmm8, %xmm2
886; SSE2-NEXT:    packuswb %xmm5, %xmm2
887; SSE2-NEXT:    movdqa %xmm7, %xmm4
888; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
889; SSE2-NEXT:    movdqa %xmm3, %xmm5
890; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
891; SSE2-NEXT:    pmullw %xmm4, %xmm5
892; SSE2-NEXT:    pand %xmm8, %xmm5
893; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
894; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
895; SSE2-NEXT:    pmullw %xmm7, %xmm3
896; SSE2-NEXT:    pand %xmm8, %xmm3
897; SSE2-NEXT:    packuswb %xmm5, %xmm3
898; SSE2-NEXT:    retq
899;
900; SSE41-LABEL: mul_v64i8:
901; SSE41:       # %bb.0: # %entry
902; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
903; SSE41-NEXT:    movdqa %xmm8, %xmm9
904; SSE41-NEXT:    pandn %xmm4, %xmm9
905; SSE41-NEXT:    pand %xmm8, %xmm4
906; SSE41-NEXT:    movdqa %xmm0, %xmm10
907; SSE41-NEXT:    pmaddubsw %xmm4, %xmm10
908; SSE41-NEXT:    pand %xmm8, %xmm10
909; SSE41-NEXT:    pmaddubsw %xmm9, %xmm0
910; SSE41-NEXT:    psllw $8, %xmm0
911; SSE41-NEXT:    por %xmm10, %xmm0
912; SSE41-NEXT:    movdqa %xmm8, %xmm4
913; SSE41-NEXT:    pandn %xmm5, %xmm4
914; SSE41-NEXT:    pand %xmm8, %xmm5
915; SSE41-NEXT:    movdqa %xmm1, %xmm9
916; SSE41-NEXT:    pmaddubsw %xmm5, %xmm9
917; SSE41-NEXT:    pand %xmm8, %xmm9
918; SSE41-NEXT:    pmaddubsw %xmm4, %xmm1
919; SSE41-NEXT:    psllw $8, %xmm1
920; SSE41-NEXT:    por %xmm9, %xmm1
921; SSE41-NEXT:    movdqa %xmm8, %xmm4
922; SSE41-NEXT:    pandn %xmm6, %xmm4
923; SSE41-NEXT:    pand %xmm8, %xmm6
924; SSE41-NEXT:    movdqa %xmm2, %xmm5
925; SSE41-NEXT:    pmaddubsw %xmm6, %xmm5
926; SSE41-NEXT:    pand %xmm8, %xmm5
927; SSE41-NEXT:    pmaddubsw %xmm4, %xmm2
928; SSE41-NEXT:    psllw $8, %xmm2
929; SSE41-NEXT:    por %xmm5, %xmm2
930; SSE41-NEXT:    movdqa %xmm7, %xmm4
931; SSE41-NEXT:    pand %xmm8, %xmm4
932; SSE41-NEXT:    movdqa %xmm3, %xmm5
933; SSE41-NEXT:    pmaddubsw %xmm4, %xmm5
934; SSE41-NEXT:    pand %xmm8, %xmm5
935; SSE41-NEXT:    pandn %xmm7, %xmm8
936; SSE41-NEXT:    pmaddubsw %xmm8, %xmm3
937; SSE41-NEXT:    psllw $8, %xmm3
938; SSE41-NEXT:    por %xmm5, %xmm3
939; SSE41-NEXT:    retq
940;
941; AVX2-LABEL: mul_v64i8:
942; AVX2:       # %bb.0: # %entry
943; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
944; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm5
945; AVX2-NEXT:    vpmaddubsw %ymm5, %ymm0, %ymm5
946; AVX2-NEXT:    vpand %ymm4, %ymm5, %ymm5
947; AVX2-NEXT:    vpandn %ymm2, %ymm4, %ymm2
948; AVX2-NEXT:    vpmaddubsw %ymm2, %ymm0, %ymm0
949; AVX2-NEXT:    vpsllw $8, %ymm0, %ymm0
950; AVX2-NEXT:    vpor %ymm0, %ymm5, %ymm0
951; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm2
952; AVX2-NEXT:    vpmaddubsw %ymm2, %ymm1, %ymm2
953; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
954; AVX2-NEXT:    vpandn %ymm3, %ymm4, %ymm3
955; AVX2-NEXT:    vpmaddubsw %ymm3, %ymm1, %ymm1
956; AVX2-NEXT:    vpsllw $8, %ymm1, %ymm1
957; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
958; AVX2-NEXT:    retq
959;
960; AVX512F-LABEL: mul_v64i8:
961; AVX512F:       # %bb.0: # %entry
962; AVX512F-NEXT:    vpbroadcastd {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
963; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
964; AVX512F-NEXT:    vpand %ymm2, %ymm3, %ymm4
965; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm5
966; AVX512F-NEXT:    vpmaddubsw %ymm4, %ymm5, %ymm4
967; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm6
968; AVX512F-NEXT:    vpmaddubsw %ymm6, %ymm0, %ymm6
969; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm6, %zmm4
970; AVX512F-NEXT:    vpandn %ymm1, %ymm2, %ymm1
971; AVX512F-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm0
972; AVX512F-NEXT:    vpsllw $8, %ymm0, %ymm0
973; AVX512F-NEXT:    vpandn %ymm3, %ymm2, %ymm1
974; AVX512F-NEXT:    vpmaddubsw %ymm1, %ymm5, %ymm1
975; AVX512F-NEXT:    vpsllw $8, %ymm1, %ymm1
976; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
977; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm4 & zmm2)
978; AVX512F-NEXT:    retq
979;
980; AVX512BW-LABEL: mul_v64i8:
981; AVX512BW:       # %bb.0: # %entry
982; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
983; AVX512BW-NEXT:    vpandq %zmm2, %zmm1, %zmm3
984; AVX512BW-NEXT:    vpmaddubsw %zmm3, %zmm0, %zmm3
985; AVX512BW-NEXT:    vpandnq %zmm1, %zmm2, %zmm1
986; AVX512BW-NEXT:    vpmaddubsw %zmm1, %zmm0, %zmm0
987; AVX512BW-NEXT:    vpsllw $8, %zmm0, %zmm0
988; AVX512BW-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm3 & zmm2)
989; AVX512BW-NEXT:    retq
990entry:
991  %A = mul <64 x i8> %i, %j
992  ret <64 x i8> %A
993}
994
995; PR30845
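; Both operands are zero-extended from i32, so each 64-bit product is a single
; pmuludq per lane and only the high halves of the results are kept.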
996define <4 x i32> @mul_v4i64_zero_upper(<4 x i32> %val1, <4 x i32> %val2) {
997; SSE2-LABEL: mul_v4i64_zero_upper:
998; SSE2:       # %bb.0: # %entry
999; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
1000; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
1001; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
1002; SSE2-NEXT:    pmuludq %xmm2, %xmm0
1003; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
1004; SSE2-NEXT:    pmuludq %xmm3, %xmm1
1005; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
1006; SSE2-NEXT:    retq
1007;
1008; SSE41-LABEL: mul_v4i64_zero_upper:
1009; SSE41:       # %bb.0: # %entry
1010; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
1011; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
1012; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
1013; SSE41-NEXT:    pmuludq %xmm2, %xmm0
1014; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
1015; SSE41-NEXT:    pmuludq %xmm3, %xmm1
1016; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
1017; SSE41-NEXT:    retq
1018;
1019; AVX-LABEL: mul_v4i64_zero_upper:
1020; AVX:       # %bb.0: # %entry
1021; AVX-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1022; AVX-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
1023; AVX-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
1024; AVX-NEXT:    vextracti128 $1, %ymm0, %xmm1
1025; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
1026; AVX-NEXT:    vzeroupper
1027; AVX-NEXT:    retq
1028entry:
1029  %val1a = zext <4 x i32> %val1 to <4 x i64>
1030  %val2a = zext <4 x i32> %val2 to <4 x i64>
1031  %res64 = mul <4 x i64> %val1a, %val2a
1032  %rescast = bitcast <4 x i64> %res64 to <8 x i32>
1033  %res = shufflevector <8 x i32> %rescast, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
1034  ret <4 x i32> %res
1035}
1036
1037define <4 x i32> @mul_v4i64_zero_upper_left(<4 x i32> %val1, <4 x i64> %val2) {
1038; SSE2-LABEL: mul_v4i64_zero_upper_left:
1039; SSE2:       # %bb.0: # %entry
1040; SSE2-NEXT:    pxor %xmm4, %xmm4
1041; SSE2-NEXT:    movdqa %xmm0, %xmm3
1042; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
1043; SSE2-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
1044; SSE2-NEXT:    movdqa %xmm0, %xmm4
1045; SSE2-NEXT:    pmuludq %xmm2, %xmm4
1046; SSE2-NEXT:    psrlq $32, %xmm2
1047; SSE2-NEXT:    pmuludq %xmm0, %xmm2
1048; SSE2-NEXT:    psllq $32, %xmm2
1049; SSE2-NEXT:    paddq %xmm4, %xmm2
1050; SSE2-NEXT:    movdqa %xmm3, %xmm0
1051; SSE2-NEXT:    pmuludq %xmm1, %xmm0
1052; SSE2-NEXT:    psrlq $32, %xmm1
1053; SSE2-NEXT:    pmuludq %xmm1, %xmm3
1054; SSE2-NEXT:    psllq $32, %xmm3
1055; SSE2-NEXT:    paddq %xmm0, %xmm3
1056; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,3],xmm2[1,3]
1057; SSE2-NEXT:    movaps %xmm3, %xmm0
1058; SSE2-NEXT:    retq
1059;
1060; SSE41-LABEL: mul_v4i64_zero_upper_left:
1061; SSE41:       # %bb.0: # %entry
1062; SSE41-NEXT:    pxor %xmm4, %xmm4
1063; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
1064; SSE41-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
1065; SSE41-NEXT:    movdqa %xmm0, %xmm4
1066; SSE41-NEXT:    pmuludq %xmm2, %xmm4
1067; SSE41-NEXT:    psrlq $32, %xmm2
1068; SSE41-NEXT:    pmuludq %xmm0, %xmm2
1069; SSE41-NEXT:    psllq $32, %xmm2
1070; SSE41-NEXT:    paddq %xmm4, %xmm2
1071; SSE41-NEXT:    movdqa %xmm3, %xmm0
1072; SSE41-NEXT:    pmuludq %xmm1, %xmm0
1073; SSE41-NEXT:    psrlq $32, %xmm1
1074; SSE41-NEXT:    pmuludq %xmm1, %xmm3
1075; SSE41-NEXT:    psllq $32, %xmm3
1076; SSE41-NEXT:    paddq %xmm0, %xmm3
1077; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,3],xmm2[1,3]
1078; SSE41-NEXT:    movaps %xmm3, %xmm0
1079; SSE41-NEXT:    retq
1080;
1081; AVX-LABEL: mul_v4i64_zero_upper_left:
1082; AVX:       # %bb.0: # %entry
1083; AVX-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1084; AVX-NEXT:    vpmuludq %ymm1, %ymm0, %ymm2
1085; AVX-NEXT:    vpsrlq $32, %ymm1, %ymm1
1086; AVX-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
1087; AVX-NEXT:    vpsllq $32, %ymm0, %ymm0
1088; AVX-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
1089; AVX-NEXT:    vextracti128 $1, %ymm0, %xmm1
1090; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
1091; AVX-NEXT:    vzeroupper
1092; AVX-NEXT:    retq
1093entry:
1094  %val1a = zext <4 x i32> %val1 to <4 x i64>
1095  %res64 = mul <4 x i64> %val1a, %val2
1096  %rescast = bitcast <4 x i64> %res64 to <8 x i32>
1097  %res = shufflevector <8 x i32> %rescast, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
1098  ret <4 x i32> %res
1099}
1100
1101define <4 x i32> @mul_v4i64_zero_lower(<4 x i32> %val1, <4 x i64> %val2) {
1102; SSE2-LABEL: mul_v4i64_zero_lower:
1103; SSE2:       # %bb.0: # %entry
1104; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,1,1,3]
1105; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,3,3]
1106; SSE2-NEXT:    psrlq $32, %xmm2
1107; SSE2-NEXT:    pmuludq %xmm0, %xmm2
1108; SSE2-NEXT:    psrlq $32, %xmm1
1109; SSE2-NEXT:    pmuludq %xmm1, %xmm3
1110; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2],xmm2[0,2]
1111; SSE2-NEXT:    movaps %xmm3, %xmm0
1112; SSE2-NEXT:    retq
1113;
1114; SSE41-LABEL: mul_v4i64_zero_lower:
1115; SSE41:       # %bb.0: # %entry
1116; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
1117; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,3,3]
1118; SSE41-NEXT:    psrlq $32, %xmm2
1119; SSE41-NEXT:    pmuludq %xmm0, %xmm2
1120; SSE41-NEXT:    psrlq $32, %xmm1
1121; SSE41-NEXT:    pmuludq %xmm1, %xmm3
1122; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2],xmm2[0,2]
1123; SSE41-NEXT:    movaps %xmm3, %xmm0
1124; SSE41-NEXT:    retq
1125;
1126; AVX-LABEL: mul_v4i64_zero_lower:
1127; AVX:       # %bb.0: # %entry
1128; AVX-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1129; AVX-NEXT:    vpsrlq $32, %ymm1, %ymm1
1130; AVX-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
1131; AVX-NEXT:    vextracti128 $1, %ymm0, %xmm1
1132; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1133; AVX-NEXT:    vzeroupper
1134; AVX-NEXT:    retq
1135entry:
1136  %val1a = zext <4 x i32> %val1 to <4 x i64>
1137  %val2a = and <4 x i64> %val2, <i64 -4294967296, i64 -4294967296, i64 -4294967296, i64 -4294967296>
1138  %res64 = mul <4 x i64> %val1a, %val2a
1139  %rescast = bitcast <4 x i64> %res64 to <8 x i32>
1140  %res = shufflevector <8 x i32> %rescast, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
1141  ret <4 x i32> %res
1142}
1143
define <8 x i32> @mul_v8i64_zero_upper(<8 x i32> %val1, <8 x i32> %val2) {
; SSE2-LABEL: mul_v8i64_zero_upper:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[0,1,1,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[2,1,3,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[0,1,1,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[2,1,3,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,1,3,3]
; SSE2-NEXT:    pmuludq %xmm5, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[0,1,1,3]
; SSE2-NEXT:    pmuludq %xmm6, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,1,3,3]
; SSE2-NEXT:    pmuludq %xmm7, %xmm2
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: mul_v8i64_zero_upper:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero
; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[2,1,3,3]
; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm6 = xmm1[0],zero,xmm1[1],zero
; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[2,1,3,3]
; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero
; SSE41-NEXT:    pmuludq %xmm4, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,1,3,3]
; SSE41-NEXT:    pmuludq %xmm5, %xmm1
; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero
; SSE41-NEXT:    pmuludq %xmm6, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,1,3,3]
; SSE41-NEXT:    pmuludq %xmm7, %xmm2
; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3]
; SSE41-NEXT:    retq
;
; AVX2-LABEL: mul_v8i64_zero_upper:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX2-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm2[1,3],ymm0[1,3],ymm2[5,7],ymm0[5,7]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    retq
;
; AVX512-LABEL: mul_v8i64_zero_upper:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; AVX512-NEXT:    vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
; AVX512-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
; AVX512-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512-NEXT:    retq
entry:
  %val1a = zext <8 x i32> %val1 to <8 x i64>
  %val2a = zext <8 x i32> %val2 to <8 x i64>
  %res64 = mul <8 x i64> %val1a, %val2a
  %rescast = bitcast <8 x i64> %res64 to <16 x i32>
  %res = shufflevector <16 x i32> %rescast, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  ret <8 x i32> %res
}

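; Both operands are sign-extended from 32 bits or fewer, so the full 64-bit
; products can be formed with pmuldq on SSE4.1 and AVX targets; SSE2 has no
; pmuldq and expands the multiply with pmuludq, shifts and adds.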
define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) {
; SSE2-LABEL: mul_v8i64_sext:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm1, %xmm4
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
; SSE2-NEXT:    psrad $16, %xmm6
; SSE2-NEXT:    pxor %xmm12, %xmm12
; SSE2-NEXT:    pxor %xmm7, %xmm7
; SSE2-NEXT:    pcmpgtd %xmm6, %xmm7
; SSE2-NEXT:    movdqa %xmm6, %xmm5
; SSE2-NEXT:    punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm7[2],xmm5[3],xmm7[3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    psrad $16, %xmm0
; SSE2-NEXT:    pxor %xmm11, %xmm11
; SSE2-NEXT:    pcmpgtd %xmm0, %xmm11
; SSE2-NEXT:    movdqa %xmm0, %xmm9
; SSE2-NEXT:    punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm11[2],xmm9[3],xmm11[3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
; SSE2-NEXT:    pxor %xmm8, %xmm8
; SSE2-NEXT:    pcmpgtd %xmm3, %xmm8
; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1]
; SSE2-NEXT:    pxor %xmm10, %xmm10
; SSE2-NEXT:    pcmpgtd %xmm2, %xmm10
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT:    pxor %xmm13, %xmm13
; SSE2-NEXT:    pcmpgtd %xmm1, %xmm13
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1]
; SSE2-NEXT:    pcmpgtd %xmm4, %xmm12
; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm14 = xmm11[0,1,1,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm14
; SSE2-NEXT:    pshufd {{.*#+}} xmm12 = xmm12[0,1,1,3]
; SSE2-NEXT:    pmuludq %xmm0, %xmm12
; SSE2-NEXT:    paddq %xmm14, %xmm12
; SSE2-NEXT:    psllq $32, %xmm12
; SSE2-NEXT:    pmuludq %xmm4, %xmm0
; SSE2-NEXT:    paddq %xmm12, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm11[2,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm4
; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm13[0,1,1,3]
; SSE2-NEXT:    pmuludq %xmm9, %xmm11
; SSE2-NEXT:    paddq %xmm4, %xmm11
; SSE2-NEXT:    psllq $32, %xmm11
; SSE2-NEXT:    pmuludq %xmm9, %xmm1
; SSE2-NEXT:    paddq %xmm11, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm7[0,1,1,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm4
; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm10[0,1,1,3]
; SSE2-NEXT:    pmuludq %xmm6, %xmm9
; SSE2-NEXT:    paddq %xmm4, %xmm9
; SSE2-NEXT:    psllq $32, %xmm9
; SSE2-NEXT:    pmuludq %xmm6, %xmm2
; SSE2-NEXT:    paddq %xmm9, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm7[2,1,3,3]
; SSE2-NEXT:    pmuludq %xmm3, %xmm4
; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm8[0,1,1,3]
; SSE2-NEXT:    pmuludq %xmm5, %xmm6
; SSE2-NEXT:    paddq %xmm4, %xmm6
; SSE2-NEXT:    psllq $32, %xmm6
; SSE2-NEXT:    pmuludq %xmm5, %xmm3
; SSE2-NEXT:    paddq %xmm6, %xmm3
; SSE2-NEXT:    retq
;
; SSE41-LABEL: mul_v8i64_sext:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; SSE41-NEXT:    pmovsxwq %xmm3, %xmm4
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; SSE41-NEXT:    pmovsxwq %xmm3, %xmm5
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
; SSE41-NEXT:    pmovsxwq %xmm3, %xmm6
; SSE41-NEXT:    pmovsxwq %xmm0, %xmm7
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,1,3,3]
; SSE41-NEXT:    pmuldq %xmm4, %xmm3
; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
; SSE41-NEXT:    pmuldq %xmm5, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,1,3,3]
; SSE41-NEXT:    pmuldq %xmm6, %xmm4
; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
; SSE41-NEXT:    pmuldq %xmm7, %xmm0
; SSE41-NEXT:    movdqa %xmm4, %xmm1
; SSE41-NEXT:    retq
;
; AVX2-LABEL: mul_v8i64_sext:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpmovsxwq %xmm2, %ymm2
; AVX2-NEXT:    vpmovsxwq %xmm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
; AVX2-NEXT:    vpmuldq %ymm3, %ymm2, %ymm2
; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX2-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa %ymm2, %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: mul_v8i64_sext:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovsxwq %xmm0, %zmm0
; AVX512-NEXT:    vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
; AVX512-NEXT:    vpmuldq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %1 = sext <8 x i16> %val1 to <8 x i64>
  %2 = sext <8 x i32> %val2 to <8 x i64>
  %3 = mul <8 x i64> %1, %2
  ret <8 x i64> %3
}

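; The shl/ashr pair sign-extends the low 32 bits of each element in place,
; so squaring the value matches the pmuldq pattern and SSE4.1/AVX emit a
; single pmuldq of the register with itself.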
define <2 x i64> @pmuldq_square(<2 x i64> %x) {
; SSE2-LABEL: pmuldq_square:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
; SSE2-NEXT:    psllq $32, %xmm0
; SSE2-NEXT:    psrad $31, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT:    psrlq $32, %xmm0
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pmuludq %xmm1, %xmm1
; SSE2-NEXT:    psllq $33, %xmm0
; SSE2-NEXT:    paddq %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: pmuldq_square:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmuldq %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: pmuldq_square:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmuldq %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <2 x i64> %x, <i64 32, i64 32>
  %2 = ashr exact <2 x i64> %1, <i64 32, i64 32>
  %3 = mul nsw <2 x i64> %2, %2
  ret <2 x i64> %3
}

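; Masking each element to its low 32 bits before squaring matches the
; pmuludq pattern, so a single pmuludq of the register with itself suffices.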
define <2 x i64> @pmuludq_square(<2 x i64> %x) {
; SSE-LABEL: pmuludq_square:
; SSE:       # %bb.0:
; SSE-NEXT:    pmuludq %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: pmuludq_square:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmuludq %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = and <2 x i64> %x, <i64 4294967295, i64 4294967295>
  %2 = mul nuw <2 x i64> %1, %1
  ret <2 x i64> %2
}