xref: /llvm-project/llvm/test/CodeGen/X86/pmulh.ll (revision 520562c597a0a1d3056cf75b648c4192f77582ec)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=SSE,SSE2
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW
7
8define <4 x i16> @zext_mulhuw_v4i16(<4 x i16> %a, <4 x i16> %b) {
9; SSE-LABEL: zext_mulhuw_v4i16:
10; SSE:       # %bb.0:
11; SSE-NEXT:    pmulhuw %xmm1, %xmm0
12; SSE-NEXT:    retq
13;
14; AVX-LABEL: zext_mulhuw_v4i16:
15; AVX:       # %bb.0:
16; AVX-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm0
17; AVX-NEXT:    retq
18  %a1 = zext <4 x i16> %a to <4 x i32>
19  %b1 = zext <4 x i16> %b to <4 x i32>
20  %c = mul <4 x i32> %a1, %b1
21  %d = lshr <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16>
22  %e = trunc <4 x i32> %d to <4 x i16>
23  ret <4 x i16> %e
24}
25
26define <4 x i16> @and_mulhuw_v4i16(<4 x i64> %a, <4 x i64> %b) {
27; SSE2-LABEL: and_mulhuw_v4i16:
28; SSE2:       # %bb.0:
29; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
30; SSE2-NEXT:    pslld $16, %xmm2
31; SSE2-NEXT:    psrad $16, %xmm2
32; SSE2-NEXT:    xorps %xmm3, %xmm3
33; SSE2-NEXT:    packssdw %xmm3, %xmm2
34; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
35; SSE2-NEXT:    pslld $16, %xmm0
36; SSE2-NEXT:    psrad $16, %xmm0
37; SSE2-NEXT:    packssdw %xmm3, %xmm0
38; SSE2-NEXT:    pmulhuw %xmm2, %xmm0
39; SSE2-NEXT:    retq
40;
41; SSE41-LABEL: and_mulhuw_v4i16:
42; SSE41:       # %bb.0:
43; SSE41-NEXT:    pxor %xmm4, %xmm4
44; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
45; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
46; SSE41-NEXT:    packusdw %xmm3, %xmm2
47; SSE41-NEXT:    packusdw %xmm4, %xmm2
48; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
49; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
50; SSE41-NEXT:    packusdw %xmm1, %xmm0
51; SSE41-NEXT:    packusdw %xmm4, %xmm0
52; SSE41-NEXT:    pmulhuw %xmm2, %xmm0
53; SSE41-NEXT:    retq
54;
55; AVX2-LABEL: and_mulhuw_v4i16:
56; AVX2:       # %bb.0:
57; AVX2-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0
58; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
59; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
60; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
61; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
62; AVX2-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
63; AVX2-NEXT:    vzeroupper
64; AVX2-NEXT:    retq
65;
66; AVX512-LABEL: and_mulhuw_v4i16:
67; AVX512:       # %bb.0:
68; AVX512-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0
69; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
70; AVX512-NEXT:    vzeroupper
71; AVX512-NEXT:    retq
72  %a1 = and <4 x i64> %a, <i64 65535, i64 65535, i64 65535, i64 65535>
73  %b1 = and <4 x i64> %b, <i64 65535, i64 65535, i64 65535, i64 65535>
74  %c = mul <4 x i64> %a1, %b1
75  %d = lshr <4 x i64> %c, <i64 16, i64 16, i64 16, i64 16>
76  %e = trunc <4 x i64> %d to <4 x i16>
77  ret <4 x i16> %e
78}
79
80define <4 x i16> @sext_mulhw_v4i16(<4 x i16> %a, <4 x i16> %b) {
81; SSE-LABEL: sext_mulhw_v4i16:
82; SSE:       # %bb.0:
83; SSE-NEXT:    pmulhw %xmm1, %xmm0
84; SSE-NEXT:    retq
85;
86; AVX-LABEL: sext_mulhw_v4i16:
87; AVX:       # %bb.0:
88; AVX-NEXT:    vpmulhw %xmm1, %xmm0, %xmm0
89; AVX-NEXT:    retq
90  %a1 = sext <4 x i16> %a to <4 x i32>
91  %b1 = sext <4 x i16> %b to <4 x i32>
92  %c = mul <4 x i32> %a1, %b1
93  %d = lshr <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16>
94  %e = trunc <4 x i32> %d to <4 x i16>
95  ret <4 x i16> %e
96}
97
98define <4 x i16> @ashr_mulhw_v4i16(<4 x i32> %a, <4 x i32> %b) {
99; SSE2-LABEL: ashr_mulhw_v4i16:
100; SSE2:       # %bb.0:
101; SSE2-NEXT:    psrad $16, %xmm1
102; SSE2-NEXT:    packssdw %xmm1, %xmm1
103; SSE2-NEXT:    psrad $16, %xmm0
104; SSE2-NEXT:    packssdw %xmm0, %xmm0
105; SSE2-NEXT:    pmulhw %xmm1, %xmm0
106; SSE2-NEXT:    retq
107;
108; SSE41-LABEL: ashr_mulhw_v4i16:
109; SSE41:       # %bb.0:
110; SSE41-NEXT:    psrld $16, %xmm0
111; SSE41-NEXT:    psrld $16, %xmm1
112; SSE41-NEXT:    packusdw %xmm1, %xmm1
113; SSE41-NEXT:    packusdw %xmm0, %xmm0
114; SSE41-NEXT:    pmulhw %xmm1, %xmm0
115; SSE41-NEXT:    retq
116;
117; AVX-LABEL: ashr_mulhw_v4i16:
118; AVX:       # %bb.0:
119; AVX-NEXT:    vpsrld $16, %xmm0, %xmm0
120; AVX-NEXT:    vpsrld $16, %xmm1, %xmm1
121; AVX-NEXT:    vpackusdw %xmm1, %xmm1, %xmm1
122; AVX-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
123; AVX-NEXT:    vpmulhw %xmm1, %xmm0, %xmm0
124; AVX-NEXT:    retq
125  %a1 = ashr <4 x i32> %a, <i32 16, i32 16, i32 16, i32 16>
126  %b1 = ashr <4 x i32> %b, <i32 16, i32 16, i32 16, i32 16>
127  %c = mul <4 x i32> %a1, %b1
128  %d = lshr <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16>
129  %e = trunc <4 x i32> %d to <4 x i16>
130  ret <4 x i16> %e
131}
132
133define <8 x i16> @zext_mulhuw_v8i16(<8 x i16> %a, <8 x i16> %b) {
134; SSE-LABEL: zext_mulhuw_v8i16:
135; SSE:       # %bb.0:
136; SSE-NEXT:    pmulhuw %xmm1, %xmm0
137; SSE-NEXT:    retq
138;
139; AVX-LABEL: zext_mulhuw_v8i16:
140; AVX:       # %bb.0:
141; AVX-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm0
142; AVX-NEXT:    retq
143  %a1 = zext <8 x i16> %a to <8 x i32>
144  %b1 = zext <8 x i16> %b to <8 x i32>
145  %c = mul <8 x i32> %a1, %b1
146  %d = lshr <8 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
147  %e = trunc <8 x i32> %d to <8 x i16>
148  ret <8 x i16> %e
149}
150
151define <8 x i16> @lshr_mulhuw_v8i16(<8 x i32> %a, <8 x i32> %b) {
152; SSE2-LABEL: lshr_mulhuw_v8i16:
153; SSE2:       # %bb.0:
154; SSE2-NEXT:    psrad $16, %xmm3
155; SSE2-NEXT:    psrad $16, %xmm2
156; SSE2-NEXT:    packssdw %xmm3, %xmm2
157; SSE2-NEXT:    psrad $16, %xmm1
158; SSE2-NEXT:    psrad $16, %xmm0
159; SSE2-NEXT:    packssdw %xmm1, %xmm0
160; SSE2-NEXT:    pmulhuw %xmm2, %xmm0
161; SSE2-NEXT:    retq
162;
163; SSE41-LABEL: lshr_mulhuw_v8i16:
164; SSE41:       # %bb.0:
165; SSE41-NEXT:    psrld $16, %xmm1
166; SSE41-NEXT:    psrld $16, %xmm0
167; SSE41-NEXT:    packusdw %xmm1, %xmm0
168; SSE41-NEXT:    psrld $16, %xmm3
169; SSE41-NEXT:    psrld $16, %xmm2
170; SSE41-NEXT:    packusdw %xmm3, %xmm2
171; SSE41-NEXT:    pmulhuw %xmm2, %xmm0
172; SSE41-NEXT:    retq
173;
174; AVX2-LABEL: lshr_mulhuw_v8i16:
175; AVX2:       # %bb.0:
176; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
177; AVX2-NEXT:    vpsrld $16, %ymm1, %ymm1
178; AVX2-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0
179; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
180; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
181; AVX2-NEXT:    vzeroupper
182; AVX2-NEXT:    retq
183;
184; AVX512-LABEL: lshr_mulhuw_v8i16:
185; AVX512:       # %bb.0:
186; AVX512-NEXT:    vpsrld $16, %ymm0, %ymm0
187; AVX512-NEXT:    vpsrld $16, %ymm1, %ymm1
188; AVX512-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0
189; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
190; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
191; AVX512-NEXT:    vzeroupper
192; AVX512-NEXT:    retq
193  %a1 = lshr <8 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
194  %b1 = lshr <8 x i32> %b, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
195  %c = mul <8 x i32> %a1, %b1
196  %d = lshr <8 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
197  %e = trunc <8 x i32> %d to <8 x i16>
198  ret <8 x i16> %e
199}
200
201define <8 x i16> @sext_mulhw_v8i16(<8 x i16> %a, <8 x i16> %b) {
202; SSE-LABEL: sext_mulhw_v8i16:
203; SSE:       # %bb.0:
204; SSE-NEXT:    pmulhw %xmm1, %xmm0
205; SSE-NEXT:    retq
206;
207; AVX-LABEL: sext_mulhw_v8i16:
208; AVX:       # %bb.0:
209; AVX-NEXT:    vpmulhw %xmm1, %xmm0, %xmm0
210; AVX-NEXT:    retq
211  %a1 = sext <8 x i16> %a to <8 x i32>
212  %b1 = sext <8 x i16> %b to <8 x i32>
213  %c = mul <8 x i32> %a1, %b1
214  %d = lshr <8 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
215  %e = trunc <8 x i32> %d to <8 x i16>
216  ret <8 x i16> %e
217}
218
219define <8 x i16> @sextinreg_mulhw_v8i16(<8 x i32> %a, <8 x i32> %b) {
220; SSE-LABEL: sextinreg_mulhw_v8i16:
221; SSE:       # %bb.0:
222; SSE-NEXT:    pslld $24, %xmm1
223; SSE-NEXT:    psrad $24, %xmm1
224; SSE-NEXT:    pslld $24, %xmm0
225; SSE-NEXT:    psrad $24, %xmm0
226; SSE-NEXT:    packssdw %xmm1, %xmm0
227; SSE-NEXT:    pslld $25, %xmm3
228; SSE-NEXT:    psrad $25, %xmm3
229; SSE-NEXT:    pslld $25, %xmm2
230; SSE-NEXT:    psrad $25, %xmm2
231; SSE-NEXT:    packssdw %xmm3, %xmm2
232; SSE-NEXT:    pmulhw %xmm2, %xmm0
233; SSE-NEXT:    retq
234;
235; AVX2-LABEL: sextinreg_mulhw_v8i16:
236; AVX2:       # %bb.0:
237; AVX2-NEXT:    vpslld $24, %ymm0, %ymm0
238; AVX2-NEXT:    vpsrad $24, %ymm0, %ymm0
239; AVX2-NEXT:    vpslld $25, %ymm1, %ymm1
240; AVX2-NEXT:    vpsrad $25, %ymm1, %ymm1
241; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
242; AVX2-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
243; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
244; AVX2-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
245; AVX2-NEXT:    vpmulhw %xmm1, %xmm0, %xmm0
246; AVX2-NEXT:    vzeroupper
247; AVX2-NEXT:    retq
248;
249; AVX512-LABEL: sextinreg_mulhw_v8i16:
250; AVX512:       # %bb.0:
251; AVX512-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
252; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
253; AVX512-NEXT:    vpmovdw %zmm1, %ymm1
254; AVX512-NEXT:    vpsllw $9, %xmm1, %xmm1
255; AVX512-NEXT:    vpsraw $9, %xmm1, %xmm1
256; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
257; AVX512-NEXT:    vpsllw $8, %xmm0, %xmm0
258; AVX512-NEXT:    vpsraw $8, %xmm0, %xmm0
259; AVX512-NEXT:    vpmulhw %xmm1, %xmm0, %xmm0
260; AVX512-NEXT:    vzeroupper
261; AVX512-NEXT:    retq
262  %a1 = shl <8 x i32> %a, <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
263  %b1 = shl <8 x i32> %b, <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25>
264  %a2 = ashr <8 x i32> %a1, <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
265  %b2 = ashr <8 x i32> %b1, <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25>
266  %c = mul <8 x i32> %a2, %b2
267  %d = lshr <8 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
268  %e = trunc <8 x i32> %d to <8 x i16>
269  ret <8 x i16> %e
270}
271
272define <8 x i16> @zext_mulhuw_v8i16_v8i33(<8 x i16> %a, <8 x i16> %b) {
273; SSE-LABEL: zext_mulhuw_v8i16_v8i33:
274; SSE:       # %bb.0:
275; SSE-NEXT:    pmulhuw %xmm1, %xmm0
276; SSE-NEXT:    retq
277;
278; AVX-LABEL: zext_mulhuw_v8i16_v8i33:
279; AVX:       # %bb.0:
280; AVX-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm0
281; AVX-NEXT:    retq
282  %a1 = zext <8 x i16> %a to <8 x i33>
283  %b1 = zext <8 x i16> %b to <8 x i33>
284  %c = mul <8 x i33> %a1, %b1
285  %d = lshr <8 x i33> %c, <i33 16, i33 16, i33 16, i33 16, i33 16, i33 16, i33 16, i33 16>
286  %e = trunc <8 x i33> %d to <8 x i16>
287  ret <8 x i16> %e
288}
289
290define <16 x i16> @zext_mulhuw_v16i16(<16 x i16> %a, <16 x i16> %b) {
291; SSE-LABEL: zext_mulhuw_v16i16:
292; SSE:       # %bb.0:
293; SSE-NEXT:    pmulhuw %xmm2, %xmm0
294; SSE-NEXT:    pmulhuw %xmm3, %xmm1
295; SSE-NEXT:    retq
296;
297; AVX-LABEL: zext_mulhuw_v16i16:
298; AVX:       # %bb.0:
299; AVX-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0
300; AVX-NEXT:    retq
301  %a1 = zext <16 x i16> %a to <16 x i32>
302  %b1 = zext <16 x i16> %b to <16 x i32>
303  %c = mul <16 x i32> %a1, %b1
304  %d = lshr <16 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
305  %e = trunc <16 x i32> %d to <16 x i16>
306  ret <16 x i16> %e
307}
308
309define <16 x i16> @and_mulhuw_v16i16(<16 x i32> %a, <16 x i32> %b) {
310; SSE2-LABEL: and_mulhuw_v16i16:
311; SSE2:       # %bb.0:
312; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [32767,32767,32767,32767]
313; SSE2-NEXT:    pand %xmm8, %xmm3
314; SSE2-NEXT:    pand %xmm8, %xmm2
315; SSE2-NEXT:    packssdw %xmm3, %xmm2
316; SSE2-NEXT:    pand %xmm8, %xmm1
317; SSE2-NEXT:    pand %xmm8, %xmm0
318; SSE2-NEXT:    packssdw %xmm1, %xmm0
319; SSE2-NEXT:    pand %xmm8, %xmm7
320; SSE2-NEXT:    pand %xmm8, %xmm6
321; SSE2-NEXT:    packssdw %xmm7, %xmm6
322; SSE2-NEXT:    pmulhw %xmm2, %xmm6
323; SSE2-NEXT:    pand %xmm8, %xmm5
324; SSE2-NEXT:    pand %xmm4, %xmm8
325; SSE2-NEXT:    packssdw %xmm5, %xmm8
326; SSE2-NEXT:    pmulhw %xmm8, %xmm0
327; SSE2-NEXT:    movdqa %xmm6, %xmm1
328; SSE2-NEXT:    retq
329;
330; SSE41-LABEL: and_mulhuw_v16i16:
331; SSE41:       # %bb.0:
332; SSE41-NEXT:    pmovsxwd {{.*#+}} xmm8 = [32767,32767,32767,32767]
333; SSE41-NEXT:    pand %xmm8, %xmm3
334; SSE41-NEXT:    pand %xmm8, %xmm2
335; SSE41-NEXT:    packusdw %xmm3, %xmm2
336; SSE41-NEXT:    pand %xmm8, %xmm1
337; SSE41-NEXT:    pand %xmm8, %xmm0
338; SSE41-NEXT:    packusdw %xmm1, %xmm0
339; SSE41-NEXT:    pand %xmm8, %xmm7
340; SSE41-NEXT:    pand %xmm8, %xmm6
341; SSE41-NEXT:    packusdw %xmm7, %xmm6
342; SSE41-NEXT:    pmulhw %xmm2, %xmm6
343; SSE41-NEXT:    pand %xmm8, %xmm5
344; SSE41-NEXT:    pand %xmm4, %xmm8
345; SSE41-NEXT:    packusdw %xmm5, %xmm8
346; SSE41-NEXT:    pmulhw %xmm8, %xmm0
347; SSE41-NEXT:    movdqa %xmm6, %xmm1
348; SSE41-NEXT:    retq
349;
350; AVX2-LABEL: and_mulhuw_v16i16:
351; AVX2:       # %bb.0:
352; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [32767,32767,32767,32767,32767,32767,32767,32767]
353; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
354; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
355; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
356; AVX2-NEXT:    vpmulhuw %ymm2, %ymm0, %ymm0
357; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm2
358; AVX2-NEXT:    vpmulhuw %ymm2, %ymm1, %ymm1
359; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
360; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
361; AVX2-NEXT:    retq
362;
363; AVX512F-LABEL: and_mulhuw_v16i16:
364; AVX512F:       # %bb.0:
365; AVX512F-NEXT:    vpmovdw %zmm1, %ymm1
366; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767]
367; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
368; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
369; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm0
370; AVX512F-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0
371; AVX512F-NEXT:    retq
372;
373; AVX512BW-LABEL: and_mulhuw_v16i16:
374; AVX512BW:       # %bb.0:
375; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} zmm2 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767]
376; AVX512BW-NEXT:    vpandd %zmm2, %zmm0, %zmm0
377; AVX512BW-NEXT:    vpandd %zmm2, %zmm1, %zmm1
378; AVX512BW-NEXT:    vpmulhuw %zmm1, %zmm0, %zmm0
379; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
380; AVX512BW-NEXT:    retq
381  %a1 = and <16 x i32> %a, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
382  %b1 = and <16 x i32> %b, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
383  %c = mul <16 x i32> %a1, %b1
384  %d = lshr <16 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
385  %e = trunc <16 x i32> %d to <16 x i16>
386  ret <16 x i16> %e
387}
388
389define <16 x i16> @sext_mulhuw_v16i16(<16 x i16> %a, <16 x i16> %b) {
390; SSE-LABEL: sext_mulhuw_v16i16:
391; SSE:       # %bb.0:
392; SSE-NEXT:    pmulhw %xmm2, %xmm0
393; SSE-NEXT:    pmulhw %xmm3, %xmm1
394; SSE-NEXT:    retq
395;
396; AVX-LABEL: sext_mulhuw_v16i16:
397; AVX:       # %bb.0:
398; AVX-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0
399; AVX-NEXT:    retq
400  %a1 = sext <16 x i16> %a to <16 x i32>
401  %b1 = sext <16 x i16> %b to <16 x i32>
402  %c = mul <16 x i32> %a1, %b1
403  %d = lshr <16 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
404  %e = trunc <16 x i32> %d to <16 x i16>
405  ret <16 x i16> %e
406}
407
408define <16 x i16> @ashr_mulhuw_v16i16(<16 x i32> %a, <16 x i32> %b) {
409; SSE2-LABEL: ashr_mulhuw_v16i16:
410; SSE2:       # %bb.0:
411; SSE2-NEXT:    psrad $16, %xmm5
412; SSE2-NEXT:    psrad $16, %xmm4
413; SSE2-NEXT:    packssdw %xmm5, %xmm4
414; SSE2-NEXT:    psrad $16, %xmm1
415; SSE2-NEXT:    psrad $16, %xmm0
416; SSE2-NEXT:    packssdw %xmm1, %xmm0
417; SSE2-NEXT:    pmulhw %xmm4, %xmm0
418; SSE2-NEXT:    psrad $16, %xmm7
419; SSE2-NEXT:    psrad $16, %xmm6
420; SSE2-NEXT:    packssdw %xmm7, %xmm6
421; SSE2-NEXT:    psrad $16, %xmm3
422; SSE2-NEXT:    psrad $16, %xmm2
423; SSE2-NEXT:    packssdw %xmm3, %xmm2
424; SSE2-NEXT:    pmulhw %xmm6, %xmm2
425; SSE2-NEXT:    movdqa %xmm2, %xmm1
426; SSE2-NEXT:    retq
427;
428; SSE41-LABEL: ashr_mulhuw_v16i16:
429; SSE41:       # %bb.0:
430; SSE41-NEXT:    psrld $16, %xmm3
431; SSE41-NEXT:    psrld $16, %xmm2
432; SSE41-NEXT:    packusdw %xmm3, %xmm2
433; SSE41-NEXT:    psrld $16, %xmm1
434; SSE41-NEXT:    psrld $16, %xmm0
435; SSE41-NEXT:    packusdw %xmm1, %xmm0
436; SSE41-NEXT:    psrld $16, %xmm7
437; SSE41-NEXT:    psrld $16, %xmm6
438; SSE41-NEXT:    packusdw %xmm7, %xmm6
439; SSE41-NEXT:    pmulhw %xmm2, %xmm6
440; SSE41-NEXT:    psrld $16, %xmm5
441; SSE41-NEXT:    psrld $16, %xmm4
442; SSE41-NEXT:    packusdw %xmm5, %xmm4
443; SSE41-NEXT:    pmulhw %xmm4, %xmm0
444; SSE41-NEXT:    movdqa %xmm6, %xmm1
445; SSE41-NEXT:    retq
446;
447; AVX2-LABEL: ashr_mulhuw_v16i16:
448; AVX2:       # %bb.0:
449; AVX2-NEXT:    vpsrld $16, %ymm1, %ymm1
450; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
451; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
452; AVX2-NEXT:    vpsrld $16, %ymm3, %ymm1
453; AVX2-NEXT:    vpsrld $16, %ymm2, %ymm2
454; AVX2-NEXT:    vpackusdw %ymm1, %ymm2, %ymm1
455; AVX2-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0
456; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
457; AVX2-NEXT:    retq
458;
459; AVX512-LABEL: ashr_mulhuw_v16i16:
460; AVX512:       # %bb.0:
461; AVX512-NEXT:    vpsrld $16, %zmm0, %zmm0
462; AVX512-NEXT:    vpsrld $16, %zmm1, %zmm1
463; AVX512-NEXT:    vpmovdw %zmm1, %ymm1
464; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
465; AVX512-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0
466; AVX512-NEXT:    retq
467  %a1 = ashr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
468  %b1 = ashr <16 x i32> %b, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
469  %c = mul <16 x i32> %a1, %b1
470  %d = lshr <16 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
471  %e = trunc <16 x i32> %d to <16 x i16>
472  ret <16 x i16> %e
473}
474
475define <16 x i16> @zext_mulhuw_v16i16_v16i48(<16 x i16> %a, <16 x i16> %b) {
476; SSE-LABEL: zext_mulhuw_v16i16_v16i48:
477; SSE:       # %bb.0:
478; SSE-NEXT:    pmulhuw %xmm2, %xmm0
479; SSE-NEXT:    pmulhuw %xmm3, %xmm1
480; SSE-NEXT:    retq
481;
482; AVX-LABEL: zext_mulhuw_v16i16_v16i48:
483; AVX:       # %bb.0:
484; AVX-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0
485; AVX-NEXT:    retq
486  %a1 = zext <16 x i16> %a to <16 x i48>
487  %b1 = zext <16 x i16> %b to <16 x i48>
488  %c = mul <16 x i48> %a1, %b1
489  %d = lshr <16 x i48> %c, <i48 16, i48 16, i48 16, i48 16, i48 16, i48 16, i48 16, i48 16, i48 16, i48 16, i48 16, i48 16, i48 16, i48 16, i48 16, i48 16>
490  %e = trunc <16 x i48> %d to <16 x i16>
491  ret <16 x i16> %e
492}
493
494define <32 x i16> @zext_mulhuw_v32i16(<32 x i16> %a, <32 x i16> %b) {
495; SSE-LABEL: zext_mulhuw_v32i16:
496; SSE:       # %bb.0:
497; SSE-NEXT:    pmulhuw %xmm4, %xmm0
498; SSE-NEXT:    pmulhuw %xmm5, %xmm1
499; SSE-NEXT:    pmulhuw %xmm6, %xmm2
500; SSE-NEXT:    pmulhuw %xmm7, %xmm3
501; SSE-NEXT:    retq
502;
503; AVX2-LABEL: zext_mulhuw_v32i16:
504; AVX2:       # %bb.0:
505; AVX2-NEXT:    vpmulhuw %ymm2, %ymm0, %ymm0
506; AVX2-NEXT:    vpmulhuw %ymm3, %ymm1, %ymm1
507; AVX2-NEXT:    retq
508;
509; AVX512F-LABEL: zext_mulhuw_v32i16:
510; AVX512F:       # %bb.0:
511; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
512; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
513; AVX512F-NEXT:    vpmulhuw %ymm2, %ymm3, %ymm2
514; AVX512F-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0
515; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
516; AVX512F-NEXT:    retq
517;
518; AVX512BW-LABEL: zext_mulhuw_v32i16:
519; AVX512BW:       # %bb.0:
520; AVX512BW-NEXT:    vpmulhuw %zmm1, %zmm0, %zmm0
521; AVX512BW-NEXT:    retq
522  %a1 = zext <32 x i16> %a to <32 x i32>
523  %b1 = zext <32 x i16> %b to <32 x i32>
524  %c = mul <32 x i32> %a1, %b1
525  %d = lshr <32 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
526  %e = trunc <32 x i32> %d to <32 x i16>
527  ret <32 x i16> %e
528}
529
530define <32 x i16> @sext_mulhuw_v32i16(<32 x i16> %a, <32 x i16> %b) {
531; SSE-LABEL: sext_mulhuw_v32i16:
532; SSE:       # %bb.0:
533; SSE-NEXT:    pmulhw %xmm4, %xmm0
534; SSE-NEXT:    pmulhw %xmm5, %xmm1
535; SSE-NEXT:    pmulhw %xmm6, %xmm2
536; SSE-NEXT:    pmulhw %xmm7, %xmm3
537; SSE-NEXT:    retq
538;
539; AVX2-LABEL: sext_mulhuw_v32i16:
540; AVX2:       # %bb.0:
541; AVX2-NEXT:    vpmulhw %ymm2, %ymm0, %ymm0
542; AVX2-NEXT:    vpmulhw %ymm3, %ymm1, %ymm1
543; AVX2-NEXT:    retq
544;
545; AVX512F-LABEL: sext_mulhuw_v32i16:
546; AVX512F:       # %bb.0:
547; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
548; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
549; AVX512F-NEXT:    vpmulhw %ymm2, %ymm3, %ymm2
550; AVX512F-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0
551; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
552; AVX512F-NEXT:    retq
553;
554; AVX512BW-LABEL: sext_mulhuw_v32i16:
555; AVX512BW:       # %bb.0:
556; AVX512BW-NEXT:    vpmulhw %zmm1, %zmm0, %zmm0
557; AVX512BW-NEXT:    retq
558  %a1 = sext <32 x i16> %a to <32 x i32>
559  %b1 = sext <32 x i16> %b to <32 x i32>
560  %c = mul <32 x i32> %a1, %b1
561  %d = lshr <32 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
562  %e = trunc <32 x i32> %d to <32 x i16>
563  ret <32 x i16> %e
564}
565
566define <64 x i16> @zext_mulhuw_v64i16(<64 x i16> %a, <64 x i16> %b) {
567; SSE-LABEL: zext_mulhuw_v64i16:
568; SSE:       # %bb.0:
569; SSE-NEXT:    movq %rdi, %rax
570; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm0
571; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm1
572; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm2
573; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm3
574; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm4
575; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm5
576; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm6
577; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm7
578; SSE-NEXT:    movdqa %xmm7, 112(%rdi)
579; SSE-NEXT:    movdqa %xmm6, 96(%rdi)
580; SSE-NEXT:    movdqa %xmm5, 80(%rdi)
581; SSE-NEXT:    movdqa %xmm4, 64(%rdi)
582; SSE-NEXT:    movdqa %xmm3, 48(%rdi)
583; SSE-NEXT:    movdqa %xmm2, 32(%rdi)
584; SSE-NEXT:    movdqa %xmm1, 16(%rdi)
585; SSE-NEXT:    movdqa %xmm0, (%rdi)
586; SSE-NEXT:    retq
587;
588; AVX2-LABEL: zext_mulhuw_v64i16:
589; AVX2:       # %bb.0:
590; AVX2-NEXT:    vpmulhuw %ymm4, %ymm0, %ymm0
591; AVX2-NEXT:    vpmulhuw %ymm5, %ymm1, %ymm1
592; AVX2-NEXT:    vpmulhuw %ymm6, %ymm2, %ymm2
593; AVX2-NEXT:    vpmulhuw %ymm7, %ymm3, %ymm3
594; AVX2-NEXT:    retq
595;
596; AVX512F-LABEL: zext_mulhuw_v64i16:
597; AVX512F:       # %bb.0:
598; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm4
599; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm5
600; AVX512F-NEXT:    vpmulhuw %ymm4, %ymm5, %ymm4
601; AVX512F-NEXT:    vpmulhuw %ymm2, %ymm0, %ymm0
602; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm0
603; AVX512F-NEXT:    vextracti64x4 $1, %zmm3, %ymm2
604; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
605; AVX512F-NEXT:    vpmulhuw %ymm2, %ymm4, %ymm2
606; AVX512F-NEXT:    vpmulhuw %ymm3, %ymm1, %ymm1
607; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
608; AVX512F-NEXT:    retq
609;
610; AVX512BW-LABEL: zext_mulhuw_v64i16:
611; AVX512BW:       # %bb.0:
612; AVX512BW-NEXT:    vpmulhuw %zmm2, %zmm0, %zmm0
613; AVX512BW-NEXT:    vpmulhuw %zmm3, %zmm1, %zmm1
614; AVX512BW-NEXT:    retq
615  %a1 = zext <64 x i16> %a to <64 x i32>
616  %b1 = zext <64 x i16> %b to <64 x i32>
617  %c = mul <64 x i32> %a1, %b1
618  %d = lshr <64 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
619  %e = trunc <64 x i32> %d to <64 x i16>
620  ret <64 x i16> %e
621}
622
623define <64 x i16> @sext_mulhuw_v64i16(<64 x i16> %a, <64 x i16> %b) {
624; SSE-LABEL: sext_mulhuw_v64i16:
625; SSE:       # %bb.0:
626; SSE-NEXT:    movq %rdi, %rax
627; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm0
628; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm1
629; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm2
630; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm3
631; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm4
632; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm5
633; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm6
634; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm7
635; SSE-NEXT:    movdqa %xmm7, 112(%rdi)
636; SSE-NEXT:    movdqa %xmm6, 96(%rdi)
637; SSE-NEXT:    movdqa %xmm5, 80(%rdi)
638; SSE-NEXT:    movdqa %xmm4, 64(%rdi)
639; SSE-NEXT:    movdqa %xmm3, 48(%rdi)
640; SSE-NEXT:    movdqa %xmm2, 32(%rdi)
641; SSE-NEXT:    movdqa %xmm1, 16(%rdi)
642; SSE-NEXT:    movdqa %xmm0, (%rdi)
643; SSE-NEXT:    retq
644;
645; AVX2-LABEL: sext_mulhuw_v64i16:
646; AVX2:       # %bb.0:
647; AVX2-NEXT:    vpmulhw %ymm4, %ymm0, %ymm0
648; AVX2-NEXT:    vpmulhw %ymm5, %ymm1, %ymm1
649; AVX2-NEXT:    vpmulhw %ymm6, %ymm2, %ymm2
650; AVX2-NEXT:    vpmulhw %ymm7, %ymm3, %ymm3
651; AVX2-NEXT:    retq
652;
653; AVX512F-LABEL: sext_mulhuw_v64i16:
654; AVX512F:       # %bb.0:
655; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm4
656; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm5
657; AVX512F-NEXT:    vpmulhw %ymm4, %ymm5, %ymm4
658; AVX512F-NEXT:    vpmulhw %ymm2, %ymm0, %ymm0
659; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm0
660; AVX512F-NEXT:    vextracti64x4 $1, %zmm3, %ymm2
661; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
662; AVX512F-NEXT:    vpmulhw %ymm2, %ymm4, %ymm2
663; AVX512F-NEXT:    vpmulhw %ymm3, %ymm1, %ymm1
664; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
665; AVX512F-NEXT:    retq
666;
667; AVX512BW-LABEL: sext_mulhuw_v64i16:
668; AVX512BW:       # %bb.0:
669; AVX512BW-NEXT:    vpmulhw %zmm2, %zmm0, %zmm0
670; AVX512BW-NEXT:    vpmulhw %zmm3, %zmm1, %zmm1
671; AVX512BW-NEXT:    retq
672  %a1 = sext <64 x i16> %a to <64 x i32>
673  %b1 = sext <64 x i16> %b to <64 x i32>
674  %c = mul <64 x i32> %a1, %b1
675  %d = lshr <64 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
676  %e = trunc <64 x i32> %d to <64 x i16>
677  ret <64 x i16> %e
678}
679
680define <8 x i16> @zext_mulhuw_v8i16_i64(<8 x i16> %a, <8 x i16> %b) {
681; SSE-LABEL: zext_mulhuw_v8i16_i64:
682; SSE:       # %bb.0:
683; SSE-NEXT:    pmulhuw %xmm1, %xmm0
684; SSE-NEXT:    retq
685;
686; AVX-LABEL: zext_mulhuw_v8i16_i64:
687; AVX:       # %bb.0:
688; AVX-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm0
689; AVX-NEXT:    retq
690  %a1 = zext <8 x i16> %a to <8 x i64>
691  %b1 = zext <8 x i16> %b to <8 x i64>
692  %c = mul <8 x i64> %a1, %b1
693  %d = lshr <8 x i64> %c, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
694  %e = trunc <8 x i64> %d to <8 x i16>
695  ret <8 x i16> %e
696}
697
698define <8 x i16> @sext_mulhuw_v8i16_i64(<8 x i16> %a, <8 x i16> %b) {
699; SSE-LABEL: sext_mulhuw_v8i16_i64:
700; SSE:       # %bb.0:
701; SSE-NEXT:    pmulhw %xmm1, %xmm0
702; SSE-NEXT:    retq
703;
704; AVX-LABEL: sext_mulhuw_v8i16_i64:
705; AVX:       # %bb.0:
706; AVX-NEXT:    vpmulhw %xmm1, %xmm0, %xmm0
707; AVX-NEXT:    retq
708  %a1 = sext <8 x i16> %a to <8 x i64>
709  %b1 = sext <8 x i16> %b to <8 x i64>
710  %c = mul <8 x i64> %a1, %b1
711  %d = lshr <8 x i64> %c, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
712  %e = trunc <8 x i64> %d to <8 x i16>
713  ret <8 x i16> %e
714}
715
716define <4 x i32> @zext_mulhuw_v4i16_lshr(<4 x i16> %a, <4 x i16> %b) {
717; SSE2-LABEL: zext_mulhuw_v4i16_lshr:
718; SSE2:       # %bb.0:
719; SSE2-NEXT:    pmulhuw %xmm1, %xmm0
720; SSE2-NEXT:    pxor %xmm1, %xmm1
721; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
722; SSE2-NEXT:    retq
723;
724; SSE41-LABEL: zext_mulhuw_v4i16_lshr:
725; SSE41:       # %bb.0:
726; SSE41-NEXT:    pmulhuw %xmm1, %xmm0
727; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
728; SSE41-NEXT:    retq
729;
730; AVX-LABEL: zext_mulhuw_v4i16_lshr:
731; AVX:       # %bb.0:
732; AVX-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm0
733; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
734; AVX-NEXT:    retq
735  %a1 = zext <4 x i16> %a to <4 x i32>
736  %b1 = zext <4 x i16> %b to <4 x i32>
737  %c = mul <4 x i32> %a1, %b1
738  %d = lshr <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16>
739  ret <4 x i32> %d
740}
741
742define <4 x i32> @mulhsw_v4i16_lshr(<4 x i16> %a, <4 x i16> %b) {
743; SSE2-LABEL: mulhsw_v4i16_lshr:
744; SSE2:       # %bb.0:
745; SSE2-NEXT:    pmulhw %xmm1, %xmm0
746; SSE2-NEXT:    pxor %xmm1, %xmm1
747; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
748; SSE2-NEXT:    retq
749;
750; SSE41-LABEL: mulhsw_v4i16_lshr:
751; SSE41:       # %bb.0:
752; SSE41-NEXT:    pmulhw %xmm1, %xmm0
753; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
754; SSE41-NEXT:    retq
755;
756; AVX-LABEL: mulhsw_v4i16_lshr:
757; AVX:       # %bb.0:
758; AVX-NEXT:    vpmulhw %xmm1, %xmm0, %xmm0
759; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
760; AVX-NEXT:    retq
761  %a1 = sext <4 x i16> %a to <4 x i32>
762  %b1 = sext <4 x i16> %b to <4 x i32>
763  %c = mul <4 x i32> %a1, %b1
764  %d = lshr <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16>
765  ret <4 x i32> %d
766}
767
768define <4 x i32> @mulhsw_v4i16_ashr(<4 x i16> %a, <4 x i16> %b) {
769; SSE2-LABEL: mulhsw_v4i16_ashr:
770; SSE2:       # %bb.0:
771; SSE2-NEXT:    pmulhw %xmm1, %xmm0
772; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
773; SSE2-NEXT:    psrad $16, %xmm0
774; SSE2-NEXT:    retq
775;
776; SSE41-LABEL: mulhsw_v4i16_ashr:
777; SSE41:       # %bb.0:
778; SSE41-NEXT:    pmulhw %xmm1, %xmm0
779; SSE41-NEXT:    pmovsxwd %xmm0, %xmm0
780; SSE41-NEXT:    retq
781;
782; AVX-LABEL: mulhsw_v4i16_ashr:
783; AVX:       # %bb.0:
784; AVX-NEXT:    vpmulhw %xmm1, %xmm0, %xmm0
785; AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
786; AVX-NEXT:    retq
787  %a1 = sext <4 x i16> %a to <4 x i32>
788  %b1 = sext <4 x i16> %b to <4 x i32>
789  %c = mul <4 x i32> %a1, %b1
790  %d = ashr <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16>
791  ret <4 x i32> %d
792}
793
794define <8 x i32> @zext_mulhuw_v8i16_lshr(<8 x i16> %a, <8 x i16> %b) {
795; SSE2-LABEL: zext_mulhuw_v8i16_lshr:
796; SSE2:       # %bb.0:
797; SSE2-NEXT:    pmulhuw %xmm0, %xmm1
798; SSE2-NEXT:    pxor %xmm2, %xmm2
799; SSE2-NEXT:    movdqa %xmm1, %xmm0
800; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
801; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
802; SSE2-NEXT:    retq
803;
804; SSE41-LABEL: zext_mulhuw_v8i16_lshr:
805; SSE41:       # %bb.0:
806; SSE41-NEXT:    movdqa %xmm0, %xmm2
807; SSE41-NEXT:    pmulhuw %xmm1, %xmm2
808; SSE41-NEXT:    pxor %xmm1, %xmm1
809; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
810; SSE41-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
811; SSE41-NEXT:    movdqa %xmm2, %xmm1
812; SSE41-NEXT:    retq
813;
814; AVX-LABEL: zext_mulhuw_v8i16_lshr:
815; AVX:       # %bb.0:
816; AVX-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm0
817; AVX-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
818; AVX-NEXT:    retq
819  %a1 = zext <8 x i16> %a to <8 x i32>
820  %b1 = zext <8 x i16> %b to <8 x i32>
821  %c = mul <8 x i32> %a1, %b1
822  %d = lshr <8 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
823  ret <8 x i32> %d
824}
825
826define <8 x i32> @mulhsw_v8i16_lshr(<8 x i16> %a, <8 x i16> %b) {
827; SSE2-LABEL: mulhsw_v8i16_lshr:
828; SSE2:       # %bb.0:
829; SSE2-NEXT:    pmulhw %xmm0, %xmm1
830; SSE2-NEXT:    pxor %xmm2, %xmm2
831; SSE2-NEXT:    movdqa %xmm1, %xmm0
832; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
833; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
834; SSE2-NEXT:    retq
835;
836; SSE41-LABEL: mulhsw_v8i16_lshr:
837; SSE41:       # %bb.0:
838; SSE41-NEXT:    movdqa %xmm0, %xmm2
839; SSE41-NEXT:    pmulhw %xmm1, %xmm2
840; SSE41-NEXT:    pxor %xmm1, %xmm1
841; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
842; SSE41-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
843; SSE41-NEXT:    movdqa %xmm2, %xmm1
844; SSE41-NEXT:    retq
845;
846; AVX-LABEL: mulhsw_v8i16_lshr:
847; AVX:       # %bb.0:
848; AVX-NEXT:    vpmulhw %xmm1, %xmm0, %xmm0
849; AVX-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
850; AVX-NEXT:    retq
851  %a1 = sext <8 x i16> %a to <8 x i32>
852  %b1 = sext <8 x i16> %b to <8 x i32>
853  %c = mul <8 x i32> %a1, %b1
854  %d = lshr <8 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
855  ret <8 x i32> %d
856}
857
858define <8 x i32> @mulhsw_v8i16_ashr(<8 x i16> %a, <8 x i16> %b) {
859; SSE2-LABEL: mulhsw_v8i16_ashr:
860; SSE2:       # %bb.0:
861; SSE2-NEXT:    pmulhw %xmm1, %xmm0
862; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
863; SSE2-NEXT:    psrad $16, %xmm2
864; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
865; SSE2-NEXT:    psrad $16, %xmm1
866; SSE2-NEXT:    movdqa %xmm2, %xmm0
867; SSE2-NEXT:    retq
868;
869; SSE41-LABEL: mulhsw_v8i16_ashr:
870; SSE41:       # %bb.0:
871; SSE41-NEXT:    pmulhw %xmm1, %xmm0
872; SSE41-NEXT:    pmovsxwd %xmm0, %xmm2
873; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
874; SSE41-NEXT:    pmovsxwd %xmm0, %xmm1
875; SSE41-NEXT:    movdqa %xmm2, %xmm0
876; SSE41-NEXT:    retq
877;
878; AVX-LABEL: mulhsw_v8i16_ashr:
879; AVX:       # %bb.0:
880; AVX-NEXT:    vpmulhw %xmm1, %xmm0, %xmm0
881; AVX-NEXT:    vpmovsxwd %xmm0, %ymm0
882; AVX-NEXT:    retq
883  %a1 = sext <8 x i16> %a to <8 x i32>
884  %b1 = sext <8 x i16> %b to <8 x i32>
885  %c = mul <8 x i32> %a1, %b1
886  %d = ashr <8 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
887  ret <8 x i32> %d
888}
889
890define <16 x i32> @zext_mulhuw_v16i16_lshr(<16 x i16> %a, <16 x i16> %b) {
891; SSE2-LABEL: zext_mulhuw_v16i16_lshr:
892; SSE2:       # %bb.0:
893; SSE2-NEXT:    movdqa %xmm0, %xmm4
894; SSE2-NEXT:    pmulhuw %xmm2, %xmm4
895; SSE2-NEXT:    pxor %xmm5, %xmm5
896; SSE2-NEXT:    movdqa %xmm4, %xmm0
897; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
898; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
899; SSE2-NEXT:    pmulhuw %xmm1, %xmm3
900; SSE2-NEXT:    movdqa %xmm3, %xmm2
901; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
902; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
903; SSE2-NEXT:    movdqa %xmm4, %xmm1
904; SSE2-NEXT:    retq
905;
906; SSE41-LABEL: zext_mulhuw_v16i16_lshr:
907; SSE41:       # %bb.0:
908; SSE41-NEXT:    movdqa %xmm1, %xmm4
909; SSE41-NEXT:    movdqa %xmm0, %xmm1
910; SSE41-NEXT:    pmulhuw %xmm2, %xmm1
911; SSE41-NEXT:    pxor %xmm5, %xmm5
912; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
913; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
914; SSE41-NEXT:    pmulhuw %xmm3, %xmm4
915; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
916; SSE41-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
917; SSE41-NEXT:    movdqa %xmm4, %xmm3
918; SSE41-NEXT:    retq
919;
920; AVX2-LABEL: zext_mulhuw_v16i16_lshr:
921; AVX2:       # %bb.0:
922; AVX2-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm1
923; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
924; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
925; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
926; AVX2-NEXT:    retq
927;
928; AVX512-LABEL: zext_mulhuw_v16i16_lshr:
929; AVX512:       # %bb.0:
930; AVX512-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0
931; AVX512-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
932; AVX512-NEXT:    retq
933  %a1 = zext <16 x i16> %a to <16 x i32>
934  %b1 = zext <16 x i16> %b to <16 x i32>
935  %c = mul <16 x i32> %a1, %b1
936  %d = lshr <16 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
937  ret <16 x i32> %d
938}
939
940; PR109790
941define void @PR109790(ptr sret([32 x i8]) %ret, ptr %a) {
942; SSE-LABEL: PR109790:
943; SSE:       # %bb.0:
944; SSE-NEXT:    movq %rdi, %rax
945; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [32767,32767,32767,32767,32767,32767,32767,32767]
946; SSE-NEXT:    movdqa (%rsi), %xmm1
947; SSE-NEXT:    pand %xmm0, %xmm1
948; SSE-NEXT:    pand 16(%rsi), %xmm0
949; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [64536,64536,64536,64536,64536,64536,64536,64536]
950; SSE-NEXT:    pmulhw %xmm2, %xmm0
951; SSE-NEXT:    pmulhw %xmm2, %xmm1
952; SSE-NEXT:    movdqa %xmm1, (%rdi)
953; SSE-NEXT:    movdqa %xmm0, 16(%rdi)
954; SSE-NEXT:    retq
955;
956; AVX2-LABEL: PR109790:
957; AVX2:       # %bb.0:
958; AVX2-NEXT:    movq %rdi, %rax
959; AVX2-NEXT:    vmovdqa (%rsi), %ymm0
960; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
961; AVX2-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [64536,64536,64536,64536,64536,64536,64536,64536,64536,64536,64536,64536,64536,64536,64536,64536]
962; AVX2-NEXT:    vmovdqa %ymm0, (%rdi)
963; AVX2-NEXT:    vzeroupper
964; AVX2-NEXT:    retq
965;
966; AVX512F-LABEL: PR109790:
967; AVX512F:       # %bb.0:
968; AVX512F-NEXT:    movq %rdi, %rax
969; AVX512F-NEXT:    vmovdqa (%rsi), %ymm0
970; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
971; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
972; AVX512F-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
973; AVX512F-NEXT:    vpsrld $16, %zmm0, %zmm0
974; AVX512F-NEXT:    vpmovdw %zmm0, (%rdi)
975; AVX512F-NEXT:    vzeroupper
976; AVX512F-NEXT:    retq
977;
978; AVX512BW-LABEL: PR109790:
979; AVX512BW:       # %bb.0:
980; AVX512BW-NEXT:    movq %rdi, %rax
981; AVX512BW-NEXT:    vmovdqa (%rsi), %ymm0
982; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
983; AVX512BW-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
984; AVX512BW-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [64536,0,64536,0,64536,0,64536,0,64536,0,64536,0,64536,0,64536,0,64536,0,64536,0,64536,0,64536,0,64536,0,64536,0,64536,0,64536,0]
985; AVX512BW-NEXT:    vpsrld $16, %zmm0, %zmm0
986; AVX512BW-NEXT:    vpmovdw %zmm0, (%rdi)
987; AVX512BW-NEXT:    vzeroupper
988; AVX512BW-NEXT:    retq
989  %load = load <16 x i16>, ptr %a, align 32
990  %and = and <16 x i16> %load, <i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767>
991  %ext = zext nneg <16 x i16> %and to <16 x i32>
992  %mul = mul nsw <16 x i32> %ext, <i32 -1000, i32 -1000, i32 -1000, i32 -1000, i32 -1000, i32 -1000, i32 -1000, i32 -1000, i32 -1000, i32 -1000, i32 -1000, i32 -1000, i32 -1000, i32 -1000, i32 -1000, i32 -1000>
993  %srl = lshr <16 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
994  %res = trunc nuw <16 x i32> %srl to <16 x i16>
995  store <16 x i16> %res, ptr %ret, align 32
996  ret void
997}
998
999; PR109790
1000define <16 x i16> @zext_mulhuw_v16i16_negative_constant(<16 x i16> %a) {
1001; SSE-LABEL: zext_mulhuw_v16i16_negative_constant:
1002; SSE:       # %bb.0:
1003; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [32767,32767,32767,32767,32767,32767,32767,32767]
1004; SSE-NEXT:    pand %xmm2, %xmm1
1005; SSE-NEXT:    pand %xmm2, %xmm0
1006; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [64536,64536,64536,64536,64536,64536,64536,64536]
1007; SSE-NEXT:    pmulhw %xmm2, %xmm0
1008; SSE-NEXT:    pmulhw %xmm2, %xmm1
1009; SSE-NEXT:    retq
1010;
1011; AVX-LABEL: zext_mulhuw_v16i16_negative_constant:
1012; AVX:       # %bb.0:
1013; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1014; AVX-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [64536,64536,64536,64536,64536,64536,64536,64536,64536,64536,64536,64536,64536,64536,64536,64536]
1015; AVX-NEXT:    retq
1016  %k = and <16 x i16> %a, <i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767>
1017  %x = zext nneg <16 x i16> %k to <16 x i32>
1018  %m = mul nsw <16 x i32> %x, <i32 -1000, i32 -1000, i32 -1000, i32 -1000, i32 -1000, i32 -1000, i32 -1000, i32 -1000, i32 -1000, i32 -1000, i32 -1000, i32 -1000, i32 -1000, i32 -1000, i32 -1000, i32 -1000>
1019  %s = lshr <16 x i32> %m, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
1020  %t = trunc nuw <16 x i32> %s to <16 x i16>
1021  ret <16 x i16> %t
1022}
1023
1024; PR109790
1025define <16 x i16> @zext_mulhuw_v16i16_positive_constant(<16 x i16> %a) {
1026; SSE-LABEL: zext_mulhuw_v16i16_positive_constant:
1027; SSE:       # %bb.0:
1028; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [32767,32767,32767,32767,32767,32767,32767,32767]
1029; SSE-NEXT:    pand %xmm2, %xmm1
1030; SSE-NEXT:    pand %xmm2, %xmm0
1031; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [1000,1000,1000,1000,1000,1000,1000,1000]
1032; SSE-NEXT:    pmulhw %xmm2, %xmm0
1033; SSE-NEXT:    pmulhw %xmm2, %xmm1
1034; SSE-NEXT:    retq
1035;
1036; AVX-LABEL: zext_mulhuw_v16i16_positive_constant:
1037; AVX:       # %bb.0:
1038; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1039; AVX-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000]
1040; AVX-NEXT:    retq
1041  %k = and <16 x i16> %a, <i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767>
1042  %x = zext nneg <16 x i16> %k to <16 x i32>
1043  %m = mul nuw nsw <16 x i32> %x, <i32 1000, i32 1000, i32 1000, i32 1000, i32 1000, i32 1000, i32 1000, i32 1000, i32 1000, i32 1000, i32 1000, i32 1000, i32 1000, i32 1000, i32 1000, i32 1000>
1044  %s = lshr <16 x i32> %m, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
1045  %t = trunc nuw nsw <16 x i32> %s to <16 x i16>
1046  ret <16 x i16> %t
1047}
1048
1049define <16 x i32> @mulhsw_v16i16_lshr(<16 x i16> %a, <16 x i16> %b) {
1050; SSE2-LABEL: mulhsw_v16i16_lshr:
1051; SSE2:       # %bb.0:
1052; SSE2-NEXT:    movdqa %xmm0, %xmm4
1053; SSE2-NEXT:    pmulhw %xmm2, %xmm4
1054; SSE2-NEXT:    pxor %xmm5, %xmm5
1055; SSE2-NEXT:    movdqa %xmm4, %xmm0
1056; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
1057; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
1058; SSE2-NEXT:    pmulhw %xmm1, %xmm3
1059; SSE2-NEXT:    movdqa %xmm3, %xmm2
1060; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
1061; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
1062; SSE2-NEXT:    movdqa %xmm4, %xmm1
1063; SSE2-NEXT:    retq
1064;
1065; SSE41-LABEL: mulhsw_v16i16_lshr:
1066; SSE41:       # %bb.0:
1067; SSE41-NEXT:    movdqa %xmm1, %xmm4
1068; SSE41-NEXT:    movdqa %xmm0, %xmm1
1069; SSE41-NEXT:    pmulhw %xmm2, %xmm1
1070; SSE41-NEXT:    pxor %xmm5, %xmm5
1071; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
1072; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
1073; SSE41-NEXT:    pmulhw %xmm3, %xmm4
1074; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
1075; SSE41-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
1076; SSE41-NEXT:    movdqa %xmm4, %xmm3
1077; SSE41-NEXT:    retq
1078;
1079; AVX2-LABEL: mulhsw_v16i16_lshr:
1080; AVX2:       # %bb.0:
1081; AVX2-NEXT:    vpmulhw %ymm1, %ymm0, %ymm1
1082; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
1083; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
1084; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
1085; AVX2-NEXT:    retq
1086;
1087; AVX512-LABEL: mulhsw_v16i16_lshr:
1088; AVX512:       # %bb.0:
1089; AVX512-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0
1090; AVX512-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1091; AVX512-NEXT:    retq
1092  %a1 = sext <16 x i16> %a to <16 x i32>
1093  %b1 = sext <16 x i16> %b to <16 x i32>
1094  %c = mul <16 x i32> %a1, %b1
1095  %d = lshr <16 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
1096  ret <16 x i32> %d
1097}
1098
1099define <16 x i32> @mulhsw_v16i16_ashr(<16 x i16> %a, <16 x i16> %b) {
1100; SSE2-LABEL: mulhsw_v16i16_ashr:
1101; SSE2:       # %bb.0:
1102; SSE2-NEXT:    pmulhw %xmm2, %xmm0
1103; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
1104; SSE2-NEXT:    psrad $16, %xmm5
1105; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
1106; SSE2-NEXT:    psrad $16, %xmm4
1107; SSE2-NEXT:    pmulhw %xmm3, %xmm1
1108; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
1109; SSE2-NEXT:    psrad $16, %xmm2
1110; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
1111; SSE2-NEXT:    psrad $16, %xmm3
1112; SSE2-NEXT:    movdqa %xmm5, %xmm0
1113; SSE2-NEXT:    movdqa %xmm4, %xmm1
1114; SSE2-NEXT:    retq
1115;
1116; SSE41-LABEL: mulhsw_v16i16_ashr:
1117; SSE41:       # %bb.0:
1118; SSE41-NEXT:    pmulhw %xmm2, %xmm0
1119; SSE41-NEXT:    pmovsxwd %xmm0, %xmm4
1120; SSE41-NEXT:    pmulhw %xmm3, %xmm1
1121; SSE41-NEXT:    pmovsxwd %xmm1, %xmm2
1122; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1123; SSE41-NEXT:    pmovsxwd %xmm0, %xmm5
1124; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
1125; SSE41-NEXT:    pmovsxwd %xmm0, %xmm3
1126; SSE41-NEXT:    movdqa %xmm4, %xmm0
1127; SSE41-NEXT:    movdqa %xmm5, %xmm1
1128; SSE41-NEXT:    retq
1129;
1130; AVX2-LABEL: mulhsw_v16i16_ashr:
1131; AVX2:       # %bb.0:
1132; AVX2-NEXT:    vpmulhw %ymm1, %ymm0, %ymm1
1133; AVX2-NEXT:    vpmovsxwd %xmm1, %ymm0
1134; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
1135; AVX2-NEXT:    vpmovsxwd %xmm1, %ymm1
1136; AVX2-NEXT:    retq
1137;
1138; AVX512-LABEL: mulhsw_v16i16_ashr:
1139; AVX512:       # %bb.0:
1140; AVX512-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0
1141; AVX512-NEXT:    vpmovsxwd %ymm0, %zmm0
1142; AVX512-NEXT:    retq
1143  %a1 = sext <16 x i16> %a to <16 x i32>
1144  %b1 = sext <16 x i16> %b to <16 x i32>
1145  %c = mul <16 x i32> %a1, %b1
1146  %d = ashr <16 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
1147  ret <16 x i32> %d
1148}
1149
1150define <32 x i32> @zext_mulhuw_v32i16_lshr(<32 x i16> %a, <32 x i16> %b) {
1151; SSE2-LABEL: zext_mulhuw_v32i16_lshr:
1152; SSE2:       # %bb.0:
1153; SSE2-NEXT:    movq %rdi, %rax
1154; SSE2-NEXT:    pmulhuw %xmm4, %xmm0
1155; SSE2-NEXT:    pxor %xmm4, %xmm4
1156; SSE2-NEXT:    movdqa %xmm0, %xmm8
1157; SSE2-NEXT:    punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3]
1158; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
1159; SSE2-NEXT:    pmulhuw %xmm5, %xmm1
1160; SSE2-NEXT:    movdqa %xmm1, %xmm5
1161; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
1162; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
1163; SSE2-NEXT:    pmulhuw %xmm6, %xmm2
1164; SSE2-NEXT:    movdqa %xmm2, %xmm6
1165; SSE2-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
1166; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
1167; SSE2-NEXT:    pmulhuw %xmm7, %xmm3
1168; SSE2-NEXT:    movdqa %xmm3, %xmm7
1169; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
1170; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
1171; SSE2-NEXT:    movdqa %xmm3, 112(%rdi)
1172; SSE2-NEXT:    movdqa %xmm7, 96(%rdi)
1173; SSE2-NEXT:    movdqa %xmm2, 80(%rdi)
1174; SSE2-NEXT:    movdqa %xmm6, 64(%rdi)
1175; SSE2-NEXT:    movdqa %xmm1, 48(%rdi)
1176; SSE2-NEXT:    movdqa %xmm5, 32(%rdi)
1177; SSE2-NEXT:    movdqa %xmm0, 16(%rdi)
1178; SSE2-NEXT:    movdqa %xmm8, (%rdi)
1179; SSE2-NEXT:    retq
1180;
1181; SSE41-LABEL: zext_mulhuw_v32i16_lshr:
1182; SSE41:       # %bb.0:
1183; SSE41-NEXT:    movq %rdi, %rax
1184; SSE41-NEXT:    pmulhuw %xmm4, %xmm0
1185; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1186; SSE41-NEXT:    pxor %xmm8, %xmm8
1187; SSE41-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
1188; SSE41-NEXT:    pmulhuw %xmm5, %xmm1
1189; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
1190; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
1191; SSE41-NEXT:    pmulhuw %xmm6, %xmm2
1192; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
1193; SSE41-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
1194; SSE41-NEXT:    pmulhuw %xmm7, %xmm3
1195; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm7 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
1196; SSE41-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
1197; SSE41-NEXT:    movdqa %xmm3, 112(%rdi)
1198; SSE41-NEXT:    movdqa %xmm7, 96(%rdi)
1199; SSE41-NEXT:    movdqa %xmm2, 80(%rdi)
1200; SSE41-NEXT:    movdqa %xmm6, 64(%rdi)
1201; SSE41-NEXT:    movdqa %xmm1, 48(%rdi)
1202; SSE41-NEXT:    movdqa %xmm5, 32(%rdi)
1203; SSE41-NEXT:    movdqa %xmm0, 16(%rdi)
1204; SSE41-NEXT:    movdqa %xmm4, (%rdi)
1205; SSE41-NEXT:    retq
1206;
1207; AVX2-LABEL: zext_mulhuw_v32i16_lshr:
1208; AVX2:       # %bb.0:
1209; AVX2-NEXT:    vpmulhuw %ymm2, %ymm0, %ymm2
1210; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
1211; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm2
1212; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
1213; AVX2-NEXT:    vpmulhuw %ymm3, %ymm1, %ymm1
1214; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
1215; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
1216; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
1217; AVX2-NEXT:    vmovdqa %ymm4, %ymm1
1218; AVX2-NEXT:    retq
1219;
1220; AVX512F-LABEL: zext_mulhuw_v32i16_lshr:
1221; AVX512F:       # %bb.0:
1222; AVX512F-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm2
1223; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
1224; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
1225; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
1226; AVX512F-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0
1227; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1228; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
1229; AVX512F-NEXT:    retq
1230;
1231; AVX512BW-LABEL: zext_mulhuw_v32i16_lshr:
1232; AVX512BW:       # %bb.0:
1233; AVX512BW-NEXT:    vpmulhuw %zmm1, %zmm0, %zmm1
1234; AVX512BW-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
1235; AVX512BW-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
1236; AVX512BW-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
1237; AVX512BW-NEXT:    retq
1238  %a1 = zext <32 x i16> %a to <32 x i32>
1239  %b1 = zext <32 x i16> %b to <32 x i32>
1240  %c = mul <32 x i32> %a1, %b1
1241  %d = lshr <32 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
1242  ret <32 x i32> %d
1243}
1244
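; Sign-extended operands but a logical shift: the high halves should still come from PMULHW, with the i32 results zero-extended (punpck*/pmovzxwd).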
1245define <32 x i32> @mulhsw_v32i16_lshr(<32 x i16> %a, <32 x i16> %b) {
1246; SSE2-LABEL: mulhsw_v32i16_lshr:
1247; SSE2:       # %bb.0:
1248; SSE2-NEXT:    movq %rdi, %rax
1249; SSE2-NEXT:    pmulhw %xmm4, %xmm0
1250; SSE2-NEXT:    pxor %xmm4, %xmm4
1251; SSE2-NEXT:    movdqa %xmm0, %xmm8
1252; SSE2-NEXT:    punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3]
1253; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
1254; SSE2-NEXT:    pmulhw %xmm5, %xmm1
1255; SSE2-NEXT:    movdqa %xmm1, %xmm5
1256; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
1257; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
1258; SSE2-NEXT:    pmulhw %xmm6, %xmm2
1259; SSE2-NEXT:    movdqa %xmm2, %xmm6
1260; SSE2-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
1261; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
1262; SSE2-NEXT:    pmulhw %xmm7, %xmm3
1263; SSE2-NEXT:    movdqa %xmm3, %xmm7
1264; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
1265; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
1266; SSE2-NEXT:    movdqa %xmm3, 112(%rdi)
1267; SSE2-NEXT:    movdqa %xmm7, 96(%rdi)
1268; SSE2-NEXT:    movdqa %xmm2, 80(%rdi)
1269; SSE2-NEXT:    movdqa %xmm6, 64(%rdi)
1270; SSE2-NEXT:    movdqa %xmm1, 48(%rdi)
1271; SSE2-NEXT:    movdqa %xmm5, 32(%rdi)
1272; SSE2-NEXT:    movdqa %xmm0, 16(%rdi)
1273; SSE2-NEXT:    movdqa %xmm8, (%rdi)
1274; SSE2-NEXT:    retq
1275;
1276; SSE41-LABEL: mulhsw_v32i16_lshr:
1277; SSE41:       # %bb.0:
1278; SSE41-NEXT:    movq %rdi, %rax
1279; SSE41-NEXT:    pmulhw %xmm4, %xmm0
1280; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1281; SSE41-NEXT:    pxor %xmm8, %xmm8
1282; SSE41-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
1283; SSE41-NEXT:    pmulhw %xmm5, %xmm1
1284; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
1285; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
1286; SSE41-NEXT:    pmulhw %xmm6, %xmm2
1287; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
1288; SSE41-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
1289; SSE41-NEXT:    pmulhw %xmm7, %xmm3
1290; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm7 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
1291; SSE41-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
1292; SSE41-NEXT:    movdqa %xmm3, 112(%rdi)
1293; SSE41-NEXT:    movdqa %xmm7, 96(%rdi)
1294; SSE41-NEXT:    movdqa %xmm2, 80(%rdi)
1295; SSE41-NEXT:    movdqa %xmm6, 64(%rdi)
1296; SSE41-NEXT:    movdqa %xmm1, 48(%rdi)
1297; SSE41-NEXT:    movdqa %xmm5, 32(%rdi)
1298; SSE41-NEXT:    movdqa %xmm0, 16(%rdi)
1299; SSE41-NEXT:    movdqa %xmm4, (%rdi)
1300; SSE41-NEXT:    retq
1301;
1302; AVX2-LABEL: mulhsw_v32i16_lshr:
1303; AVX2:       # %bb.0:
1304; AVX2-NEXT:    vpmulhw %ymm2, %ymm0, %ymm2
1305; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
1306; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm2
1307; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
1308; AVX2-NEXT:    vpmulhw %ymm3, %ymm1, %ymm1
1309; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
1310; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
1311; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
1312; AVX2-NEXT:    vmovdqa %ymm4, %ymm1
1313; AVX2-NEXT:    retq
1314;
1315; AVX512F-LABEL: mulhsw_v32i16_lshr:
1316; AVX512F:       # %bb.0:
1317; AVX512F-NEXT:    vpmulhw %ymm1, %ymm0, %ymm2
1318; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
1319; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
1320; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
1321; AVX512F-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0
1322; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1323; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
1324; AVX512F-NEXT:    retq
1325;
1326; AVX512BW-LABEL: mulhsw_v32i16_lshr:
1327; AVX512BW:       # %bb.0:
1328; AVX512BW-NEXT:    vpmulhw %zmm1, %zmm0, %zmm1
1329; AVX512BW-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
1330; AVX512BW-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
1331; AVX512BW-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
1332; AVX512BW-NEXT:    retq
1333  %a1 = sext <32 x i16> %a to <32 x i32>
1334  %b1 = sext <32 x i16> %b to <32 x i32>
1335  %c = mul <32 x i32> %a1, %b1
1336  %d = lshr <32 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
1337  ret <32 x i32> %d
1338}
1339
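; With an arithmetic shift the high halves keep their sign, so PMULHW is expected to be followed by a sign extension (psrad / pmovsxwd) instead.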
1340define <32 x i32> @mulhsw_v32i16_ashr(<32 x i16> %a, <32 x i16> %b) {
1341; SSE2-LABEL: mulhsw_v32i16_ashr:
1342; SSE2:       # %bb.0:
1343; SSE2-NEXT:    movq %rdi, %rax
1344; SSE2-NEXT:    pmulhw %xmm4, %xmm0
1345; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
1346; SSE2-NEXT:    psrad $16, %xmm4
1347; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
1348; SSE2-NEXT:    psrad $16, %xmm0
1349; SSE2-NEXT:    pmulhw %xmm5, %xmm1
1350; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
1351; SSE2-NEXT:    psrad $16, %xmm5
1352; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
1353; SSE2-NEXT:    psrad $16, %xmm1
1354; SSE2-NEXT:    pmulhw %xmm6, %xmm2
1355; SSE2-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
1356; SSE2-NEXT:    psrad $16, %xmm6
1357; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
1358; SSE2-NEXT:    psrad $16, %xmm2
1359; SSE2-NEXT:    pmulhw %xmm7, %xmm3
1360; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
1361; SSE2-NEXT:    psrad $16, %xmm7
1362; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
1363; SSE2-NEXT:    psrad $16, %xmm3
1364; SSE2-NEXT:    movdqa %xmm3, 112(%rdi)
1365; SSE2-NEXT:    movdqa %xmm7, 96(%rdi)
1366; SSE2-NEXT:    movdqa %xmm2, 80(%rdi)
1367; SSE2-NEXT:    movdqa %xmm6, 64(%rdi)
1368; SSE2-NEXT:    movdqa %xmm1, 48(%rdi)
1369; SSE2-NEXT:    movdqa %xmm5, 32(%rdi)
1370; SSE2-NEXT:    movdqa %xmm0, 16(%rdi)
1371; SSE2-NEXT:    movdqa %xmm4, (%rdi)
1372; SSE2-NEXT:    retq
1373;
1374; SSE41-LABEL: mulhsw_v32i16_ashr:
1375; SSE41:       # %bb.0:
1376; SSE41-NEXT:    movq %rdi, %rax
1377; SSE41-NEXT:    pmulhw %xmm4, %xmm0
1378; SSE41-NEXT:    pmovsxwd %xmm0, %xmm4
1379; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1380; SSE41-NEXT:    pmovsxwd %xmm0, %xmm0
1381; SSE41-NEXT:    pmulhw %xmm5, %xmm1
1382; SSE41-NEXT:    pmovsxwd %xmm1, %xmm5
1383; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
1384; SSE41-NEXT:    pmovsxwd %xmm1, %xmm1
1385; SSE41-NEXT:    pmulhw %xmm6, %xmm2
1386; SSE41-NEXT:    pmovsxwd %xmm2, %xmm6
1387; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
1388; SSE41-NEXT:    pmovsxwd %xmm2, %xmm2
1389; SSE41-NEXT:    pmulhw %xmm7, %xmm3
1390; SSE41-NEXT:    pmovsxwd %xmm3, %xmm7
1391; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
1392; SSE41-NEXT:    pmovsxwd %xmm3, %xmm3
1393; SSE41-NEXT:    movdqa %xmm3, 112(%rdi)
1394; SSE41-NEXT:    movdqa %xmm7, 96(%rdi)
1395; SSE41-NEXT:    movdqa %xmm2, 80(%rdi)
1396; SSE41-NEXT:    movdqa %xmm6, 64(%rdi)
1397; SSE41-NEXT:    movdqa %xmm1, 48(%rdi)
1398; SSE41-NEXT:    movdqa %xmm5, 32(%rdi)
1399; SSE41-NEXT:    movdqa %xmm0, 16(%rdi)
1400; SSE41-NEXT:    movdqa %xmm4, (%rdi)
1401; SSE41-NEXT:    retq
1402;
1403; AVX2-LABEL: mulhsw_v32i16_ashr:
1404; AVX2:       # %bb.0:
1405; AVX2-NEXT:    vpmulhw %ymm2, %ymm0, %ymm2
1406; AVX2-NEXT:    vpmovsxwd %xmm2, %ymm0
1407; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm2
1408; AVX2-NEXT:    vpmovsxwd %xmm2, %ymm4
1409; AVX2-NEXT:    vpmulhw %ymm3, %ymm1, %ymm1
1410; AVX2-NEXT:    vpmovsxwd %xmm1, %ymm2
1411; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
1412; AVX2-NEXT:    vpmovsxwd %xmm1, %ymm3
1413; AVX2-NEXT:    vmovdqa %ymm4, %ymm1
1414; AVX2-NEXT:    retq
1415;
1416; AVX512F-LABEL: mulhsw_v32i16_ashr:
1417; AVX512F:       # %bb.0:
1418; AVX512F-NEXT:    vpmulhw %ymm1, %ymm0, %ymm2
1419; AVX512F-NEXT:    vpmovsxwd %ymm2, %zmm2
1420; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
1421; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
1422; AVX512F-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0
1423; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm1
1424; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
1425; AVX512F-NEXT:    retq
1426;
1427; AVX512BW-LABEL: mulhsw_v32i16_ashr:
1428; AVX512BW:       # %bb.0:
1429; AVX512BW-NEXT:    vpmulhw %zmm1, %zmm0, %zmm1
1430; AVX512BW-NEXT:    vpmovsxwd %ymm1, %zmm0
1431; AVX512BW-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
1432; AVX512BW-NEXT:    vpmovsxwd %ymm1, %zmm1
1433; AVX512BW-NEXT:    retq
1434  %a1 = sext <32 x i16> %a to <32 x i32>
1435  %b1 = sext <32 x i16> %b to <32 x i32>
1436  %c = mul <32 x i32> %a1, %b1
1437  %d = ashr <32 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
1438  ret <32 x i32> %d
1439}
1440
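; On SSE/AVX2 the <64 x i32> result is returned through a pointer in %rdi and the second operand is taken from the stack; each piece should still multiply with PMULHUW.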
1441define <64 x i32> @zext_mulhuw_v64i16_lshr(<64 x i16> %a, <64 x i16> %b) {
1442; SSE2-LABEL: zext_mulhuw_v64i16_lshr:
1443; SSE2:       # %bb.0:
1444; SSE2-NEXT:    movdqa %xmm7, %xmm8
1445; SSE2-NEXT:    movq %rdi, %rax
1446; SSE2-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm0
1447; SSE2-NEXT:    pxor %xmm10, %xmm10
1448; SSE2-NEXT:    movdqa %xmm0, %xmm7
1449; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3]
1450; SSE2-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1451; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
1452; SSE2-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm1
1453; SSE2-NEXT:    movdqa %xmm1, %xmm9
1454; SSE2-NEXT:    punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
1455; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7]
1456; SSE2-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm2
1457; SSE2-NEXT:    movdqa %xmm2, %xmm11
1458; SSE2-NEXT:    punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
1459; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7]
1460; SSE2-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm3
1461; SSE2-NEXT:    movdqa %xmm3, %xmm12
1462; SSE2-NEXT:    punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3]
1463; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7]
1464; SSE2-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm4
1465; SSE2-NEXT:    movdqa %xmm4, %xmm13
1466; SSE2-NEXT:    punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3]
1467; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7]
1468; SSE2-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm5
1469; SSE2-NEXT:    movdqa %xmm5, %xmm14
1470; SSE2-NEXT:    punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3]
1471; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7]
1472; SSE2-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm6
1473; SSE2-NEXT:    movdqa %xmm6, %xmm15
1474; SSE2-NEXT:    punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3]
1475; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7]
1476; SSE2-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm8
1477; SSE2-NEXT:    movdqa %xmm8, %xmm7
1478; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3]
1479; SSE2-NEXT:    punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7]
1480; SSE2-NEXT:    movdqa %xmm8, 240(%rdi)
1481; SSE2-NEXT:    movdqa %xmm7, 224(%rdi)
1482; SSE2-NEXT:    movdqa %xmm6, 208(%rdi)
1483; SSE2-NEXT:    movdqa %xmm15, 192(%rdi)
1484; SSE2-NEXT:    movdqa %xmm5, 176(%rdi)
1485; SSE2-NEXT:    movdqa %xmm14, 160(%rdi)
1486; SSE2-NEXT:    movdqa %xmm4, 144(%rdi)
1487; SSE2-NEXT:    movdqa %xmm13, 128(%rdi)
1488; SSE2-NEXT:    movdqa %xmm3, 112(%rdi)
1489; SSE2-NEXT:    movdqa %xmm12, 96(%rdi)
1490; SSE2-NEXT:    movdqa %xmm2, 80(%rdi)
1491; SSE2-NEXT:    movdqa %xmm11, 64(%rdi)
1492; SSE2-NEXT:    movdqa %xmm1, 48(%rdi)
1493; SSE2-NEXT:    movdqa %xmm9, 32(%rdi)
1494; SSE2-NEXT:    movdqa %xmm0, 16(%rdi)
1495; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1496; SSE2-NEXT:    movaps %xmm0, (%rdi)
1497; SSE2-NEXT:    retq
1498;
1499; SSE41-LABEL: zext_mulhuw_v64i16_lshr:
1500; SSE41:       # %bb.0:
1501; SSE41-NEXT:    movdqa %xmm0, %xmm8
1502; SSE41-NEXT:    movq %rdi, %rax
1503; SSE41-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm8
1504; SSE41-NEXT:    pxor %xmm11, %xmm11
1505; SSE41-NEXT:    movdqa %xmm8, %xmm0
1506; SSE41-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7]
1507; SSE41-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1508; SSE41-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm1
1509; SSE41-NEXT:    movdqa %xmm1, %xmm9
1510; SSE41-NEXT:    punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7]
1511; SSE41-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm2
1512; SSE41-NEXT:    movdqa %xmm2, %xmm10
1513; SSE41-NEXT:    punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7]
1514; SSE41-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm3
1515; SSE41-NEXT:    movdqa %xmm3, %xmm12
1516; SSE41-NEXT:    punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
1517; SSE41-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm4
1518; SSE41-NEXT:    movdqa %xmm4, %xmm13
1519; SSE41-NEXT:    punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7]
1520; SSE41-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm5
1521; SSE41-NEXT:    movdqa %xmm5, %xmm14
1522; SSE41-NEXT:    punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7]
1523; SSE41-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm6
1524; SSE41-NEXT:    movdqa %xmm6, %xmm15
1525; SSE41-NEXT:    punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm11[4],xmm15[5],xmm11[5],xmm15[6],xmm11[6],xmm15[7],xmm11[7]
1526; SSE41-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm7
1527; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero
1528; SSE41-NEXT:    punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7]
1529; SSE41-NEXT:    movdqa %xmm7, 240(%rdi)
1530; SSE41-NEXT:    movdqa %xmm0, 224(%rdi)
1531; SSE41-NEXT:    movdqa %xmm15, 208(%rdi)
1532; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
1533; SSE41-NEXT:    movdqa %xmm0, 192(%rdi)
1534; SSE41-NEXT:    movdqa %xmm14, 176(%rdi)
1535; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
1536; SSE41-NEXT:    movdqa %xmm0, 160(%rdi)
1537; SSE41-NEXT:    movdqa %xmm13, 144(%rdi)
1538; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
1539; SSE41-NEXT:    movdqa %xmm0, 128(%rdi)
1540; SSE41-NEXT:    movdqa %xmm12, 112(%rdi)
1541; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
1542; SSE41-NEXT:    movdqa %xmm0, 96(%rdi)
1543; SSE41-NEXT:    movdqa %xmm10, 80(%rdi)
1544; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
1545; SSE41-NEXT:    movdqa %xmm0, 64(%rdi)
1546; SSE41-NEXT:    movdqa %xmm9, 48(%rdi)
1547; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
1548; SSE41-NEXT:    movdqa %xmm0, 32(%rdi)
1549; SSE41-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1550; SSE41-NEXT:    movaps %xmm0, 16(%rdi)
1551; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero
1552; SSE41-NEXT:    movdqa %xmm0, (%rdi)
1553; SSE41-NEXT:    retq
1554;
1555; AVX2-LABEL: zext_mulhuw_v64i16_lshr:
1556; AVX2:       # %bb.0:
1557; AVX2-NEXT:    movq %rdi, %rax
1558; AVX2-NEXT:    vpmulhuw %ymm4, %ymm0, %ymm0
1559; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1560; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
1561; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1562; AVX2-NEXT:    vpmulhuw %ymm5, %ymm1, %ymm1
1563; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
1564; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
1565; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
1566; AVX2-NEXT:    vpmulhuw %ymm6, %ymm2, %ymm2
1567; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
1568; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm2
1569; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
1570; AVX2-NEXT:    vpmulhuw %ymm7, %ymm3, %ymm3
1571; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm7 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
1572; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm3
1573; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
1574; AVX2-NEXT:    vmovdqa %ymm3, 224(%rdi)
1575; AVX2-NEXT:    vmovdqa %ymm7, 192(%rdi)
1576; AVX2-NEXT:    vmovdqa %ymm2, 160(%rdi)
1577; AVX2-NEXT:    vmovdqa %ymm6, 128(%rdi)
1578; AVX2-NEXT:    vmovdqa %ymm1, 96(%rdi)
1579; AVX2-NEXT:    vmovdqa %ymm5, 64(%rdi)
1580; AVX2-NEXT:    vmovdqa %ymm0, 32(%rdi)
1581; AVX2-NEXT:    vmovdqa %ymm4, (%rdi)
1582; AVX2-NEXT:    vzeroupper
1583; AVX2-NEXT:    retq
1584;
1585; AVX512F-LABEL: zext_mulhuw_v64i16_lshr:
1586; AVX512F:       # %bb.0:
1587; AVX512F-NEXT:    vpmulhuw %ymm2, %ymm0, %ymm4
1588; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
1589; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm2
1590; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
1591; AVX512F-NEXT:    vpmulhuw %ymm2, %ymm0, %ymm0
1592; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm5 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1593; AVX512F-NEXT:    vpmulhuw %ymm3, %ymm1, %ymm0
1594; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1595; AVX512F-NEXT:    vextracti64x4 $1, %zmm3, %ymm0
1596; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
1597; AVX512F-NEXT:    vpmulhuw %ymm0, %ymm1, %ymm0
1598; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1599; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm0
1600; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm1
1601; AVX512F-NEXT:    retq
1602;
1603; AVX512BW-LABEL: zext_mulhuw_v64i16_lshr:
1604; AVX512BW:       # %bb.0:
1605; AVX512BW-NEXT:    vpmulhuw %zmm2, %zmm0, %zmm2
1606; AVX512BW-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
1607; AVX512BW-NEXT:    vextracti64x4 $1, %zmm2, %ymm2
1608; AVX512BW-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
1609; AVX512BW-NEXT:    vpmulhuw %zmm3, %zmm1, %zmm1
1610; AVX512BW-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
1611; AVX512BW-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
1612; AVX512BW-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
1613; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm1
1614; AVX512BW-NEXT:    retq
1615  %a1 = zext <64 x i16> %a to <64 x i32>
1616  %b1 = zext <64 x i16> %b to <64 x i32>
1617  %c = mul <64 x i32> %a1, %b1
1618  %d = lshr <64 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
1619  ret <64 x i32> %d
1620}
1621
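; Signed v64i16 variant: PMULHW per piece, with the results zero-extended because of the logical shift.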
1622define <64 x i32> @mulhsw_v64i16_lshr(<64 x i16> %a, <64 x i16> %b) {
1623; SSE2-LABEL: mulhsw_v64i16_lshr:
1624; SSE2:       # %bb.0:
1625; SSE2-NEXT:    movdqa %xmm7, %xmm8
1626; SSE2-NEXT:    movq %rdi, %rax
1627; SSE2-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm0
1628; SSE2-NEXT:    pxor %xmm10, %xmm10
1629; SSE2-NEXT:    movdqa %xmm0, %xmm7
1630; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3]
1631; SSE2-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1632; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
1633; SSE2-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm1
1634; SSE2-NEXT:    movdqa %xmm1, %xmm9
1635; SSE2-NEXT:    punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
1636; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7]
1637; SSE2-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm2
1638; SSE2-NEXT:    movdqa %xmm2, %xmm11
1639; SSE2-NEXT:    punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
1640; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7]
1641; SSE2-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm3
1642; SSE2-NEXT:    movdqa %xmm3, %xmm12
1643; SSE2-NEXT:    punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3]
1644; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7]
1645; SSE2-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm4
1646; SSE2-NEXT:    movdqa %xmm4, %xmm13
1647; SSE2-NEXT:    punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3]
1648; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7]
1649; SSE2-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm5
1650; SSE2-NEXT:    movdqa %xmm5, %xmm14
1651; SSE2-NEXT:    punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3]
1652; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7]
1653; SSE2-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm6
1654; SSE2-NEXT:    movdqa %xmm6, %xmm15
1655; SSE2-NEXT:    punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3]
1656; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7]
1657; SSE2-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm8
1658; SSE2-NEXT:    movdqa %xmm8, %xmm7
1659; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3]
1660; SSE2-NEXT:    punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7]
1661; SSE2-NEXT:    movdqa %xmm8, 240(%rdi)
1662; SSE2-NEXT:    movdqa %xmm7, 224(%rdi)
1663; SSE2-NEXT:    movdqa %xmm6, 208(%rdi)
1664; SSE2-NEXT:    movdqa %xmm15, 192(%rdi)
1665; SSE2-NEXT:    movdqa %xmm5, 176(%rdi)
1666; SSE2-NEXT:    movdqa %xmm14, 160(%rdi)
1667; SSE2-NEXT:    movdqa %xmm4, 144(%rdi)
1668; SSE2-NEXT:    movdqa %xmm13, 128(%rdi)
1669; SSE2-NEXT:    movdqa %xmm3, 112(%rdi)
1670; SSE2-NEXT:    movdqa %xmm12, 96(%rdi)
1671; SSE2-NEXT:    movdqa %xmm2, 80(%rdi)
1672; SSE2-NEXT:    movdqa %xmm11, 64(%rdi)
1673; SSE2-NEXT:    movdqa %xmm1, 48(%rdi)
1674; SSE2-NEXT:    movdqa %xmm9, 32(%rdi)
1675; SSE2-NEXT:    movdqa %xmm0, 16(%rdi)
1676; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1677; SSE2-NEXT:    movaps %xmm0, (%rdi)
1678; SSE2-NEXT:    retq
1679;
1680; SSE41-LABEL: mulhsw_v64i16_lshr:
1681; SSE41:       # %bb.0:
1682; SSE41-NEXT:    movdqa %xmm0, %xmm8
1683; SSE41-NEXT:    movq %rdi, %rax
1684; SSE41-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm8
1685; SSE41-NEXT:    pxor %xmm11, %xmm11
1686; SSE41-NEXT:    movdqa %xmm8, %xmm0
1687; SSE41-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7]
1688; SSE41-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1689; SSE41-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm1
1690; SSE41-NEXT:    movdqa %xmm1, %xmm9
1691; SSE41-NEXT:    punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7]
1692; SSE41-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm2
1693; SSE41-NEXT:    movdqa %xmm2, %xmm10
1694; SSE41-NEXT:    punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7]
1695; SSE41-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm3
1696; SSE41-NEXT:    movdqa %xmm3, %xmm12
1697; SSE41-NEXT:    punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
1698; SSE41-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm4
1699; SSE41-NEXT:    movdqa %xmm4, %xmm13
1700; SSE41-NEXT:    punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7]
1701; SSE41-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm5
1702; SSE41-NEXT:    movdqa %xmm5, %xmm14
1703; SSE41-NEXT:    punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7]
1704; SSE41-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm6
1705; SSE41-NEXT:    movdqa %xmm6, %xmm15
1706; SSE41-NEXT:    punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm11[4],xmm15[5],xmm11[5],xmm15[6],xmm11[6],xmm15[7],xmm11[7]
1707; SSE41-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm7
1708; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero
1709; SSE41-NEXT:    punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7]
1710; SSE41-NEXT:    movdqa %xmm7, 240(%rdi)
1711; SSE41-NEXT:    movdqa %xmm0, 224(%rdi)
1712; SSE41-NEXT:    movdqa %xmm15, 208(%rdi)
1713; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
1714; SSE41-NEXT:    movdqa %xmm0, 192(%rdi)
1715; SSE41-NEXT:    movdqa %xmm14, 176(%rdi)
1716; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
1717; SSE41-NEXT:    movdqa %xmm0, 160(%rdi)
1718; SSE41-NEXT:    movdqa %xmm13, 144(%rdi)
1719; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
1720; SSE41-NEXT:    movdqa %xmm0, 128(%rdi)
1721; SSE41-NEXT:    movdqa %xmm12, 112(%rdi)
1722; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
1723; SSE41-NEXT:    movdqa %xmm0, 96(%rdi)
1724; SSE41-NEXT:    movdqa %xmm10, 80(%rdi)
1725; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
1726; SSE41-NEXT:    movdqa %xmm0, 64(%rdi)
1727; SSE41-NEXT:    movdqa %xmm9, 48(%rdi)
1728; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
1729; SSE41-NEXT:    movdqa %xmm0, 32(%rdi)
1730; SSE41-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1731; SSE41-NEXT:    movaps %xmm0, 16(%rdi)
1732; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero
1733; SSE41-NEXT:    movdqa %xmm0, (%rdi)
1734; SSE41-NEXT:    retq
1735;
1736; AVX2-LABEL: mulhsw_v64i16_lshr:
1737; AVX2:       # %bb.0:
1738; AVX2-NEXT:    movq %rdi, %rax
1739; AVX2-NEXT:    vpmulhw %ymm4, %ymm0, %ymm0
1740; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1741; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
1742; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1743; AVX2-NEXT:    vpmulhw %ymm5, %ymm1, %ymm1
1744; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
1745; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
1746; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
1747; AVX2-NEXT:    vpmulhw %ymm6, %ymm2, %ymm2
1748; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
1749; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm2
1750; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
1751; AVX2-NEXT:    vpmulhw %ymm7, %ymm3, %ymm3
1752; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm7 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
1753; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm3
1754; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
1755; AVX2-NEXT:    vmovdqa %ymm3, 224(%rdi)
1756; AVX2-NEXT:    vmovdqa %ymm7, 192(%rdi)
1757; AVX2-NEXT:    vmovdqa %ymm2, 160(%rdi)
1758; AVX2-NEXT:    vmovdqa %ymm6, 128(%rdi)
1759; AVX2-NEXT:    vmovdqa %ymm1, 96(%rdi)
1760; AVX2-NEXT:    vmovdqa %ymm5, 64(%rdi)
1761; AVX2-NEXT:    vmovdqa %ymm0, 32(%rdi)
1762; AVX2-NEXT:    vmovdqa %ymm4, (%rdi)
1763; AVX2-NEXT:    vzeroupper
1764; AVX2-NEXT:    retq
1765;
1766; AVX512F-LABEL: mulhsw_v64i16_lshr:
1767; AVX512F:       # %bb.0:
1768; AVX512F-NEXT:    vpmulhw %ymm2, %ymm0, %ymm4
1769; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
1770; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm2
1771; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
1772; AVX512F-NEXT:    vpmulhw %ymm2, %ymm0, %ymm0
1773; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm5 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1774; AVX512F-NEXT:    vpmulhw %ymm3, %ymm1, %ymm0
1775; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1776; AVX512F-NEXT:    vextracti64x4 $1, %zmm3, %ymm0
1777; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
1778; AVX512F-NEXT:    vpmulhw %ymm0, %ymm1, %ymm0
1779; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1780; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm0
1781; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm1
1782; AVX512F-NEXT:    retq
1783;
1784; AVX512BW-LABEL: mulhsw_v64i16_lshr:
1785; AVX512BW:       # %bb.0:
1786; AVX512BW-NEXT:    vpmulhw %zmm2, %zmm0, %zmm2
1787; AVX512BW-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
1788; AVX512BW-NEXT:    vextracti64x4 $1, %zmm2, %ymm2
1789; AVX512BW-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
1790; AVX512BW-NEXT:    vpmulhw %zmm3, %zmm1, %zmm1
1791; AVX512BW-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
1792; AVX512BW-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
1793; AVX512BW-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
1794; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm1
1795; AVX512BW-NEXT:    retq
1796  %a1 = sext <64 x i16> %a to <64 x i32>
1797  %b1 = sext <64 x i16> %b to <64 x i32>
1798  %c = mul <64 x i32> %a1, %b1
1799  %d = lshr <64 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
1800  ret <64 x i32> %d
1801}
1802
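; As above, but the arithmetic shift means the PMULHW results are sign-extended instead.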
1803define <64 x i32> @mulhsw_v64i16_ashr(<64 x i16> %a, <64 x i16> %b) {
1804; SSE2-LABEL: mulhsw_v64i16_ashr:
1805; SSE2:       # %bb.0:
1806; SSE2-NEXT:    movq %rdi, %rax
1807; SSE2-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm0
1808; SSE2-NEXT:    punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
1809; SSE2-NEXT:    psrad $16, %xmm8
1810; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
1811; SSE2-NEXT:    psrad $16, %xmm0
1812; SSE2-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm1
1813; SSE2-NEXT:    punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3]
1814; SSE2-NEXT:    psrad $16, %xmm9
1815; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
1816; SSE2-NEXT:    psrad $16, %xmm1
1817; SSE2-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm2
1818; SSE2-NEXT:    punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3]
1819; SSE2-NEXT:    psrad $16, %xmm10
1820; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
1821; SSE2-NEXT:    psrad $16, %xmm2
1822; SSE2-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm3
1823; SSE2-NEXT:    punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3]
1824; SSE2-NEXT:    psrad $16, %xmm11
1825; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
1826; SSE2-NEXT:    psrad $16, %xmm3
1827; SSE2-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm4
1828; SSE2-NEXT:    punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3]
1829; SSE2-NEXT:    psrad $16, %xmm12
1830; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7]
1831; SSE2-NEXT:    psrad $16, %xmm4
1832; SSE2-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm5
1833; SSE2-NEXT:    punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3]
1834; SSE2-NEXT:    psrad $16, %xmm13
1835; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
1836; SSE2-NEXT:    psrad $16, %xmm5
1837; SSE2-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm6
1838; SSE2-NEXT:    punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
1839; SSE2-NEXT:    psrad $16, %xmm14
1840; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7]
1841; SSE2-NEXT:    psrad $16, %xmm6
1842; SSE2-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm7
1843; SSE2-NEXT:    punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm7[0],xmm15[1],xmm7[1],xmm15[2],xmm7[2],xmm15[3],xmm7[3]
1844; SSE2-NEXT:    psrad $16, %xmm15
1845; SSE2-NEXT:    punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7]
1846; SSE2-NEXT:    psrad $16, %xmm7
1847; SSE2-NEXT:    movdqa %xmm7, 240(%rdi)
1848; SSE2-NEXT:    movdqa %xmm15, 224(%rdi)
1849; SSE2-NEXT:    movdqa %xmm6, 208(%rdi)
1850; SSE2-NEXT:    movdqa %xmm14, 192(%rdi)
1851; SSE2-NEXT:    movdqa %xmm5, 176(%rdi)
1852; SSE2-NEXT:    movdqa %xmm13, 160(%rdi)
1853; SSE2-NEXT:    movdqa %xmm4, 144(%rdi)
1854; SSE2-NEXT:    movdqa %xmm12, 128(%rdi)
1855; SSE2-NEXT:    movdqa %xmm3, 112(%rdi)
1856; SSE2-NEXT:    movdqa %xmm11, 96(%rdi)
1857; SSE2-NEXT:    movdqa %xmm2, 80(%rdi)
1858; SSE2-NEXT:    movdqa %xmm10, 64(%rdi)
1859; SSE2-NEXT:    movdqa %xmm1, 48(%rdi)
1860; SSE2-NEXT:    movdqa %xmm9, 32(%rdi)
1861; SSE2-NEXT:    movdqa %xmm0, 16(%rdi)
1862; SSE2-NEXT:    movdqa %xmm8, (%rdi)
1863; SSE2-NEXT:    retq
1864;
1865; SSE41-LABEL: mulhsw_v64i16_ashr:
1866; SSE41:       # %bb.0:
1867; SSE41-NEXT:    movq %rdi, %rax
1868; SSE41-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm0
1869; SSE41-NEXT:    pmovsxwd %xmm0, %xmm8
1870; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1871; SSE41-NEXT:    pmovsxwd %xmm0, %xmm0
1872; SSE41-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm1
1873; SSE41-NEXT:    pmovsxwd %xmm1, %xmm9
1874; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
1875; SSE41-NEXT:    pmovsxwd %xmm1, %xmm1
1876; SSE41-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm2
1877; SSE41-NEXT:    pmovsxwd %xmm2, %xmm10
1878; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
1879; SSE41-NEXT:    pmovsxwd %xmm2, %xmm2
1880; SSE41-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm3
1881; SSE41-NEXT:    pmovsxwd %xmm3, %xmm11
1882; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
1883; SSE41-NEXT:    pmovsxwd %xmm3, %xmm3
1884; SSE41-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm4
1885; SSE41-NEXT:    pmovsxwd %xmm4, %xmm12
1886; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
1887; SSE41-NEXT:    pmovsxwd %xmm4, %xmm4
1888; SSE41-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm5
1889; SSE41-NEXT:    pmovsxwd %xmm5, %xmm13
1890; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
1891; SSE41-NEXT:    pmovsxwd %xmm5, %xmm5
1892; SSE41-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm6
1893; SSE41-NEXT:    pmovsxwd %xmm6, %xmm14
1894; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[2,3,2,3]
1895; SSE41-NEXT:    pmovsxwd %xmm6, %xmm6
1896; SSE41-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm7
1897; SSE41-NEXT:    pmovsxwd %xmm7, %xmm15
1898; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[2,3,2,3]
1899; SSE41-NEXT:    pmovsxwd %xmm7, %xmm7
1900; SSE41-NEXT:    movdqa %xmm7, 240(%rdi)
1901; SSE41-NEXT:    movdqa %xmm15, 224(%rdi)
1902; SSE41-NEXT:    movdqa %xmm6, 208(%rdi)
1903; SSE41-NEXT:    movdqa %xmm14, 192(%rdi)
1904; SSE41-NEXT:    movdqa %xmm5, 176(%rdi)
1905; SSE41-NEXT:    movdqa %xmm13, 160(%rdi)
1906; SSE41-NEXT:    movdqa %xmm4, 144(%rdi)
1907; SSE41-NEXT:    movdqa %xmm12, 128(%rdi)
1908; SSE41-NEXT:    movdqa %xmm3, 112(%rdi)
1909; SSE41-NEXT:    movdqa %xmm11, 96(%rdi)
1910; SSE41-NEXT:    movdqa %xmm2, 80(%rdi)
1911; SSE41-NEXT:    movdqa %xmm10, 64(%rdi)
1912; SSE41-NEXT:    movdqa %xmm1, 48(%rdi)
1913; SSE41-NEXT:    movdqa %xmm9, 32(%rdi)
1914; SSE41-NEXT:    movdqa %xmm0, 16(%rdi)
1915; SSE41-NEXT:    movdqa %xmm8, (%rdi)
1916; SSE41-NEXT:    retq
1917;
1918; AVX2-LABEL: mulhsw_v64i16_ashr:
1919; AVX2:       # %bb.0:
1920; AVX2-NEXT:    movq %rdi, %rax
1921; AVX2-NEXT:    vpmulhw %ymm4, %ymm0, %ymm0
1922; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm4
1923; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
1924; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
1925; AVX2-NEXT:    vpmulhw %ymm5, %ymm1, %ymm1
1926; AVX2-NEXT:    vpmovsxwd %xmm1, %ymm5
1927; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
1928; AVX2-NEXT:    vpmovsxwd %xmm1, %ymm1
1929; AVX2-NEXT:    vpmulhw %ymm6, %ymm2, %ymm2
1930; AVX2-NEXT:    vpmovsxwd %xmm2, %ymm6
1931; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm2
1932; AVX2-NEXT:    vpmovsxwd %xmm2, %ymm2
1933; AVX2-NEXT:    vpmulhw %ymm7, %ymm3, %ymm3
1934; AVX2-NEXT:    vpmovsxwd %xmm3, %ymm7
1935; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm3
1936; AVX2-NEXT:    vpmovsxwd %xmm3, %ymm3
1937; AVX2-NEXT:    vmovdqa %ymm3, 224(%rdi)
1938; AVX2-NEXT:    vmovdqa %ymm7, 192(%rdi)
1939; AVX2-NEXT:    vmovdqa %ymm2, 160(%rdi)
1940; AVX2-NEXT:    vmovdqa %ymm6, 128(%rdi)
1941; AVX2-NEXT:    vmovdqa %ymm1, 96(%rdi)
1942; AVX2-NEXT:    vmovdqa %ymm5, 64(%rdi)
1943; AVX2-NEXT:    vmovdqa %ymm0, 32(%rdi)
1944; AVX2-NEXT:    vmovdqa %ymm4, (%rdi)
1945; AVX2-NEXT:    vzeroupper
1946; AVX2-NEXT:    retq
1947;
1948; AVX512F-LABEL: mulhsw_v64i16_ashr:
1949; AVX512F:       # %bb.0:
1950; AVX512F-NEXT:    vpmulhw %ymm2, %ymm0, %ymm4
1951; AVX512F-NEXT:    vpmovsxwd %ymm4, %zmm4
1952; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm2
1953; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
1954; AVX512F-NEXT:    vpmulhw %ymm2, %ymm0, %ymm0
1955; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm5
1956; AVX512F-NEXT:    vpmulhw %ymm3, %ymm1, %ymm0
1957; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm2
1958; AVX512F-NEXT:    vextracti64x4 $1, %zmm3, %ymm0
1959; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
1960; AVX512F-NEXT:    vpmulhw %ymm0, %ymm1, %ymm0
1961; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm3
1962; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm0
1963; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm1
1964; AVX512F-NEXT:    retq
1965;
1966; AVX512BW-LABEL: mulhsw_v64i16_ashr:
1967; AVX512BW:       # %bb.0:
1968; AVX512BW-NEXT:    vpmulhw %zmm2, %zmm0, %zmm2
1969; AVX512BW-NEXT:    vpmovsxwd %ymm2, %zmm0
1970; AVX512BW-NEXT:    vextracti64x4 $1, %zmm2, %ymm2
1971; AVX512BW-NEXT:    vpmovsxwd %ymm2, %zmm4
1972; AVX512BW-NEXT:    vpmulhw %zmm3, %zmm1, %zmm1
1973; AVX512BW-NEXT:    vpmovsxwd %ymm1, %zmm2
1974; AVX512BW-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
1975; AVX512BW-NEXT:    vpmovsxwd %ymm1, %zmm3
1976; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm1
1977; AVX512BW-NEXT:    retq
1978  %a1 = sext <64 x i16> %a to <64 x i32>
1979  %b1 = sext <64 x i16> %b to <64 x i32>
1980  %c = mul <64 x i32> %a1, %b1
1981  %d = ashr <64 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
1982  ret <64 x i32> %d
1983}
1984
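; Widening all the way to i64 should still be matched: PMULHUW followed by zero extension of each 16-bit high half to 64 bits.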
1985define <8 x i64> @zext_mulhuw_v8i16_lshr_i64(<8 x i16> %a, <8 x i16> %b) {
1986; SSE2-LABEL: zext_mulhuw_v8i16_lshr_i64:
1987; SSE2:       # %bb.0:
1988; SSE2-NEXT:    movdqa %xmm0, %xmm3
1989; SSE2-NEXT:    pmulhuw %xmm1, %xmm3
1990; SSE2-NEXT:    pxor %xmm4, %xmm4
1991; SSE2-NEXT:    movdqa %xmm3, %xmm1
1992; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
1993; SSE2-NEXT:    movdqa %xmm1, %xmm0
1994; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
1995; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
1996; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
1997; SSE2-NEXT:    movdqa %xmm3, %xmm2
1998; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
1999; SSE2-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
2000; SSE2-NEXT:    retq
2001;
2002; SSE41-LABEL: zext_mulhuw_v8i16_lshr_i64:
2003; SSE41:       # %bb.0:
2004; SSE41-NEXT:    pmulhuw %xmm1, %xmm0
2005; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
2006; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
2007; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
2008; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
2009; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
2010; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
2011; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
2012; SSE41-NEXT:    movdqa %xmm4, %xmm0
2013; SSE41-NEXT:    retq
2014;
2015; AVX2-LABEL: zext_mulhuw_v8i16_lshr_i64:
2016; AVX2:       # %bb.0:
2017; AVX2-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm1
2018; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
2019; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
2020; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
2021; AVX2-NEXT:    retq
2022;
2023; AVX512-LABEL: zext_mulhuw_v8i16_lshr_i64:
2024; AVX512:       # %bb.0:
2025; AVX512-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm0
2026; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
2027; AVX512-NEXT:    retq
2028  %a1 = zext <8 x i16> %a to <8 x i64>
2029  %b1 = zext <8 x i16> %b to <8 x i64>
2030  %c = mul <8 x i64> %a1, %b1
2031  %d = lshr <8 x i64> %c, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
2032  ret <8 x i64> %d
2033}
2034
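; Same i64 pattern, with sign-extended operands and a logical shift on the product.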
2035define <8 x i64> @sext_mulhsw_v8i16_lshr_i64(<8 x i16> %a, <8 x i16> %b) {
2036; SSE2-LABEL: sext_mulhsw_v8i16_lshr_i64:
2037; SSE2:       # %bb.0:
2038; SSE2-NEXT:    movdqa %xmm0, %xmm3
2039; SSE2-NEXT:    pmulhw %xmm1, %xmm3
2040; SSE2-NEXT:    pxor %xmm4, %xmm4
2041; SSE2-NEXT:    movdqa %xmm3, %xmm1
2042; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
2043; SSE2-NEXT:    movdqa %xmm1, %xmm0
2044; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
2045; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
2046; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
2047; SSE2-NEXT:    movdqa %xmm3, %xmm2
2048; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
2049; SSE2-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
2050; SSE2-NEXT:    retq
2051;
2052; SSE41-LABEL: sext_mulhsw_v8i16_lshr_i64:
2053; SSE41:       # %bb.0:
2054; SSE41-NEXT:    pmulhw %xmm1, %xmm0
2055; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
2056; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
2057; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
2058; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
2059; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
2060; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
2061; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
2062; SSE41-NEXT:    movdqa %xmm4, %xmm0
2063; SSE41-NEXT:    retq
2064;
2065; AVX2-LABEL: sext_mulhsw_v8i16_lshr_i64:
2066; AVX2:       # %bb.0:
2067; AVX2-NEXT:    vpmulhw %xmm1, %xmm0, %xmm1
2068; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
2069; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
2070; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
2071; AVX2-NEXT:    retq
2072;
2073; AVX512-LABEL: sext_mulhsw_v8i16_lshr_i64:
2074; AVX512:       # %bb.0:
2075; AVX512-NEXT:    vpmulhw %xmm1, %xmm0, %xmm0
2076; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
2077; AVX512-NEXT:    retq
2078  %a1 = sext <8 x i16> %a to <8 x i64>
2079  %b1 = sext <8 x i16> %b to <8 x i64>
2080  %c = mul <8 x i64> %a1, %b1
2081  %d = lshr <8 x i64> %c, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
2082  ret <8 x i64> %d
2083}
2084
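; Arithmetic shift on the i64 product: PMULHW followed by sign extension (pmovsxwq) of the high halves.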
2085define <8 x i64> @sext_mulhsw_v8i16_ashr_i64(<8 x i16> %a, <8 x i16> %b) {
2086; SSE2-LABEL: sext_mulhsw_v8i16_ashr_i64:
2087; SSE2:       # %bb.0:
2088; SSE2-NEXT:    pmulhw %xmm1, %xmm0
2089; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2090; SSE2-NEXT:    psrad $16, %xmm1
2091; SSE2-NEXT:    pxor %xmm5, %xmm5
2092; SSE2-NEXT:    pxor %xmm2, %xmm2
2093; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
2094; SSE2-NEXT:    movdqa %xmm1, %xmm4
2095; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
2096; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
2097; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
2098; SSE2-NEXT:    psrad $16, %xmm3
2099; SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
2100; SSE2-NEXT:    movdqa %xmm3, %xmm2
2101; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
2102; SSE2-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
2103; SSE2-NEXT:    movdqa %xmm4, %xmm0
2104; SSE2-NEXT:    retq
2105;
2106; SSE41-LABEL: sext_mulhsw_v8i16_ashr_i64:
2107; SSE41:       # %bb.0:
2108; SSE41-NEXT:    pmulhw %xmm1, %xmm0
2109; SSE41-NEXT:    pmovsxwq %xmm0, %xmm4
2110; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
2111; SSE41-NEXT:    pmovsxwq %xmm1, %xmm1
2112; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
2113; SSE41-NEXT:    pmovsxwq %xmm2, %xmm2
2114; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
2115; SSE41-NEXT:    pmovsxwq %xmm0, %xmm3
2116; SSE41-NEXT:    movdqa %xmm4, %xmm0
2117; SSE41-NEXT:    retq
2118;
2119; AVX2-LABEL: sext_mulhsw_v8i16_ashr_i64:
2120; AVX2:       # %bb.0:
2121; AVX2-NEXT:    vpmulhw %xmm1, %xmm0, %xmm1
2122; AVX2-NEXT:    vpmovsxwq %xmm1, %ymm0
2123; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
2124; AVX2-NEXT:    vpmovsxwq %xmm1, %ymm1
2125; AVX2-NEXT:    retq
2126;
2127; AVX512-LABEL: sext_mulhsw_v8i16_ashr_i64:
2128; AVX512:       # %bb.0:
2129; AVX512-NEXT:    vpmulhw %xmm1, %xmm0, %xmm0
2130; AVX512-NEXT:    vpmovsxwq %xmm0, %zmm0
2131; AVX512-NEXT:    retq
2132  %a1 = sext <8 x i16> %a to <8 x i64>
2133  %b1 = sext <8 x i16> %b to <8 x i64>
2134  %c = mul <8 x i64> %a1, %b1
2135  %d = ashr <8 x i64> %c, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
2136  ret <8 x i64> %d
2137}
2138
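; Calls to the pmulh.w intrinsic with constant operands should be constant folded; only a load of the folded vector should remain.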
2139define <8 x i16> @sse2_pmulh_w_const(<8 x i16> %a0, <8 x i16> %a1) {
2140; SSE-LABEL: sse2_pmulh_w_const:
2141; SSE:       # %bb.0:
2142; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,65535,65535,65535,65535,65535,65535,0]
2143; SSE-NEXT:    retq
2144;
2145; AVX-LABEL: sse2_pmulh_w_const:
2146; AVX:       # %bb.0:
2147; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [0,65535,65535,65535,65535,65535,65535,0]
2148; AVX-NEXT:    retq
2149  %res = call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> <i16 -1, i16 -2, i16 -3, i16 -4, i16 -5, i16 -6, i16 -7, i16 0>, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
2150  ret <8 x i16> %res
2151}
2152declare <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16>, <8 x i16>)
2153
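; Unsigned variant: pmulhu.w with constant operands should also fold to a constant.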
2154define <8 x i16> @sse2_pmulhu_w_const(<8 x i16> %a0, <8 x i16> %a1) {
2155; SSE-LABEL: sse2_pmulhu_w_const:
2156; SSE:       # %bb.0:
2157; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,0,1,2,3,4,5,0]
2158; SSE-NEXT:    retq
2159;
2160; AVX-LABEL: sse2_pmulhu_w_const:
2161; AVX:       # %bb.0:
2162; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [0,0,1,2,3,4,5,0]
2163; AVX-NEXT:    retq
2164  %res = call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> <i16 -1, i16 -2, i16 -3, i16 -4, i16 -5, i16 -6, i16 -7, i16 0>, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
2165  ret <8 x i16> %res
2166}
2167declare <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16>, <8 x i16>)
2168