; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=silvermont | FileCheck %s --check-prefixes=SSE-32,SLM,SLM-32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=silvermont | FileCheck %s --check-prefixes=SSE-64,SLM,SLM-64
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse4.2,+slow-pmulld | FileCheck %s --check-prefixes=SSE-32,SLOW,SLOW-32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2,+slow-pmulld | FileCheck %s --check-prefixes=SSE-64,SLOW,SLOW-64
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE-32,SSE4,SSE4-32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE-64,SSE4,SSE4-64
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx2,+slow-pmulld | FileCheck %s --check-prefixes=AVX2,AVX2-SLOW,AVX2-SLOW32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+slow-pmulld | FileCheck %s --check-prefixes=AVX2,AVX2-SLOW,AVX2-SLOW64
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2,AVX-32,AVX2-32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2,AVX-64,AVX2-64
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX2,AVX-32,AVX512-32,AVX512DQ-32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX2,AVX-64,AVX512-64,AVX512DQ-64
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX2,AVX-32,AVX512-32,AVX512BW-32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX2,AVX-64,AVX512-64,AVX512BW-64
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=knl | FileCheck %s --check-prefixes=AVX2,AVX-32,AVX512-32,KNL-32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefixes=AVX2,AVX-64,AVX512-64,KNL-64

; Make sure that the slow-pmulld feature can be used without SSE4.1.
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=silvermont -mattr=-sse4.1

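; The multiplier 18778 fits in a signed 16-bit immediate and the zero-extended
; i8 inputs are at most 255, so each 32-bit lane holds the i16 pair [x, 0].
; pmaddwd against the interleaved constant [18778,0,...] therefore computes
; x*18778 + 0*0 exactly (255 * 18778 = 4788390, well below 2^31), which is why
; the i8 tests below can avoid pmulld entirely.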
define <4 x i32> @test_mul_v4i32_v4i8(<4 x i8> %A) {
; SSE-32-LABEL: test_mul_v4i32_v4i8:
; SSE-32:       # %bb.0:
; SSE-32-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE-32-NEXT:    pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [18778,0,18778,0,18778,0,18778,0]
; SSE-32-NEXT:    retl
;
; SSE-64-LABEL: test_mul_v4i32_v4i8:
; SSE-64:       # %bb.0:
; SSE-64-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE-64-NEXT:    pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [18778,0,18778,0,18778,0,18778,0]
; SSE-64-NEXT:    retq
;
; AVX2-SLOW32-LABEL: test_mul_v4i32_v4i8:
; AVX2-SLOW32:       # %bb.0:
; AVX2-SLOW32-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-SLOW32-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [18778,0,18778,0,18778,0,18778,0]
; AVX2-SLOW32-NEXT:    retl
;
; AVX2-SLOW64-LABEL: test_mul_v4i32_v4i8:
; AVX2-SLOW64:       # %bb.0:
; AVX2-SLOW64-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-SLOW64-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [18778,0,18778,0,18778,0,18778,0]
; AVX2-SLOW64-NEXT:    retq
;
; AVX2-32-LABEL: test_mul_v4i32_v4i8:
; AVX2-32:       # %bb.0:
; AVX2-32-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-32-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [18778,0,18778,0,18778,0,18778,0]
; AVX2-32-NEXT:    retl
;
; AVX2-64-LABEL: test_mul_v4i32_v4i8:
; AVX2-64:       # %bb.0:
; AVX2-64-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-64-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [18778,0,18778,0,18778,0,18778,0]
; AVX2-64-NEXT:    retq
;
; AVX512DQ-32-LABEL: test_mul_v4i32_v4i8:
; AVX512DQ-32:       # %bb.0:
; AVX512DQ-32-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX512DQ-32-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [18778,0,18778,0,18778,0,18778,0]
; AVX512DQ-32-NEXT:    retl
;
; AVX512DQ-64-LABEL: test_mul_v4i32_v4i8:
; AVX512DQ-64:       # %bb.0:
; AVX512DQ-64-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX512DQ-64-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [18778,0,18778,0,18778,0,18778,0]
; AVX512DQ-64-NEXT:    retq
;
; AVX512BW-32-LABEL: test_mul_v4i32_v4i8:
; AVX512BW-32:       # %bb.0:
; AVX512BW-32-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX512BW-32-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [18778,0,18778,0,18778,0,18778,0]
; AVX512BW-32-NEXT:    retl
;
; AVX512BW-64-LABEL: test_mul_v4i32_v4i8:
; AVX512BW-64:       # %bb.0:
; AVX512BW-64-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX512BW-64-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [18778,0,18778,0,18778,0,18778,0]
; AVX512BW-64-NEXT:    retq
;
; KNL-32-LABEL: test_mul_v4i32_v4i8:
; KNL-32:       # %bb.0:
; KNL-32-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; KNL-32-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; KNL-32-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; KNL-32-NEXT:    retl
;
; KNL-64-LABEL: test_mul_v4i32_v4i8:
; KNL-64:       # %bb.0:
; KNL-64-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; KNL-64-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; KNL-64-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; KNL-64-NEXT:    retq
  %z = zext <4 x i8> %A to <4 x i32>
  %m = mul nuw nsw <4 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778>
  ret <4 x i32> %m
}

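; For the wider i8 cases below, SSE targets split the zero-extended input into
; xmm halves with pshufd+pmovzxbd and reuse a single pmaddwd constant register,
; while AVX2 targets process a full ymm per vpmaddwd.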
define <8 x i32> @test_mul_v8i32_v8i8(<8 x i8> %A) {
; SLM-LABEL: test_mul_v8i32_v8i8:
; SLM:       # %bb.0:
; SLM-NEXT:    pmovsxwd {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SLM-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SLM-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SLM-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLM-NEXT:    pmaddwd %xmm2, %xmm0
; SLM-NEXT:    pmaddwd %xmm2, %xmm1
; SLM-NEXT:    ret{{[l|q]}}
;
; SLOW-LABEL: test_mul_v8i32_v8i8:
; SLOW:       # %bb.0:
; SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SLOW-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLOW-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SLOW-NEXT:    pmovsxwd {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SLOW-NEXT:    pmaddwd %xmm2, %xmm0
; SLOW-NEXT:    pmaddwd %xmm2, %xmm1
; SLOW-NEXT:    ret{{[l|q]}}
;
; SSE4-LABEL: test_mul_v8i32_v8i8:
; SSE4:       # %bb.0:
; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE4-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE4-NEXT:    pmovsxwd {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SSE4-NEXT:    pmaddwd %xmm2, %xmm0
; SSE4-NEXT:    pmaddwd %xmm2, %xmm1
; SSE4-NEXT:    ret{{[l|q]}}
;
; AVX2-SLOW32-LABEL: test_mul_v8i32_v8i8:
; AVX2-SLOW32:       # %bb.0:
; AVX2-SLOW32-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-SLOW32-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0]
; AVX2-SLOW32-NEXT:    retl
;
; AVX2-SLOW64-LABEL: test_mul_v8i32_v8i8:
; AVX2-SLOW64:       # %bb.0:
; AVX2-SLOW64-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-SLOW64-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0]
; AVX2-SLOW64-NEXT:    retq
;
; AVX2-32-LABEL: test_mul_v8i32_v8i8:
; AVX2-32:       # %bb.0:
; AVX2-32-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-32-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0]
; AVX2-32-NEXT:    retl
;
; AVX2-64-LABEL: test_mul_v8i32_v8i8:
; AVX2-64:       # %bb.0:
; AVX2-64-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-64-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0]
; AVX2-64-NEXT:    retq
;
; AVX512DQ-32-LABEL: test_mul_v8i32_v8i8:
; AVX512DQ-32:       # %bb.0:
; AVX512DQ-32-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512DQ-32-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0]
; AVX512DQ-32-NEXT:    retl
;
; AVX512DQ-64-LABEL: test_mul_v8i32_v8i8:
; AVX512DQ-64:       # %bb.0:
; AVX512DQ-64-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512DQ-64-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0]
; AVX512DQ-64-NEXT:    retq
;
; AVX512BW-32-LABEL: test_mul_v8i32_v8i8:
; AVX512BW-32:       # %bb.0:
; AVX512BW-32-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512BW-32-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0]
; AVX512BW-32-NEXT:    retl
;
; AVX512BW-64-LABEL: test_mul_v8i32_v8i8:
; AVX512BW-64:       # %bb.0:
; AVX512BW-64-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512BW-64-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0]
; AVX512BW-64-NEXT:    retq
;
; KNL-32-LABEL: test_mul_v8i32_v8i8:
; KNL-32:       # %bb.0:
; KNL-32-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; KNL-32-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; KNL-32-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; KNL-32-NEXT:    retl
;
; KNL-64-LABEL: test_mul_v8i32_v8i8:
; KNL-64:       # %bb.0:
; KNL-64-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; KNL-64-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; KNL-64-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; KNL-64-NEXT:    retq
  %z = zext <8 x i8> %A to <8 x i32>
  %m = mul nuw nsw <8 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
  ret <8 x i32> %m
}

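; At 512 bits, vpmaddwd requires AVX512BW; the AVX512DQ and KNL runs only have
; AVX512F-level integer ops at zmm width, so they fall back to vpmulld with a
; broadcast constant.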
define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) {
; SLM-LABEL: test_mul_v16i32_v16i8:
; SLM:       # %bb.0:
; SLM-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
; SLM-NEXT:    pmovsxwd {{.*#+}} xmm5 = [18778,18778,18778,18778]
; SLM-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
; SLM-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLM-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SLM-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SLM-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLM-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
; SLM-NEXT:    pmaddwd %xmm5, %xmm0
; SLM-NEXT:    pmaddwd %xmm5, %xmm1
; SLM-NEXT:    pmaddwd %xmm5, %xmm2
; SLM-NEXT:    pmaddwd %xmm5, %xmm3
; SLM-NEXT:    ret{{[l|q]}}
;
; SLOW-LABEL: test_mul_v16i32_v16i8:
; SLOW:       # %bb.0:
; SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
; SLOW-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SLOW-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SLOW-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLOW-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SLOW-NEXT:    pmovsxwd {{.*#+}} xmm4 = [18778,18778,18778,18778]
; SLOW-NEXT:    pmaddwd %xmm4, %xmm0
; SLOW-NEXT:    pmaddwd %xmm4, %xmm1
; SLOW-NEXT:    pmaddwd %xmm4, %xmm2
; SLOW-NEXT:    pmaddwd %xmm4, %xmm3
; SLOW-NEXT:    ret{{[l|q]}}
;
; SSE4-LABEL: test_mul_v16i32_v16i8:
; SSE4:       # %bb.0:
; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
; SSE4-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE4-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE4-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE4-NEXT:    pmovsxwd {{.*#+}} xmm4 = [18778,18778,18778,18778]
; SSE4-NEXT:    pmaddwd %xmm4, %xmm0
; SSE4-NEXT:    pmaddwd %xmm4, %xmm1
; SSE4-NEXT:    pmaddwd %xmm4, %xmm2
; SSE4-NEXT:    pmaddwd %xmm4, %xmm3
; SSE4-NEXT:    ret{{[l|q]}}
;
; AVX2-SLOW-LABEL: test_mul_v16i32_v16i8:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-SLOW-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
; AVX2-SLOW-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-SLOW-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0]
; AVX2-SLOW-NEXT:    vpmaddwd %ymm2, %ymm0, %ymm0
; AVX2-SLOW-NEXT:    vpmaddwd %ymm2, %ymm1, %ymm1
; AVX2-SLOW-NEXT:    ret{{[l|q]}}
;
; AVX2-32-LABEL: test_mul_v16i32_v16i8:
; AVX2-32:       # %bb.0:
; AVX2-32-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-32-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
; AVX2-32-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-32-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0]
; AVX2-32-NEXT:    vpmaddwd %ymm2, %ymm0, %ymm0
; AVX2-32-NEXT:    vpmaddwd %ymm2, %ymm1, %ymm1
; AVX2-32-NEXT:    retl
;
; AVX2-64-LABEL: test_mul_v16i32_v16i8:
; AVX2-64:       # %bb.0:
; AVX2-64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-64-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
; AVX2-64-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-64-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0]
; AVX2-64-NEXT:    vpmaddwd %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT:    vpmaddwd %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT:    retq
;
; AVX512DQ-32-LABEL: test_mul_v16i32_v16i8:
; AVX512DQ-32:       # %bb.0:
; AVX512DQ-32-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512DQ-32-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm0
; AVX512DQ-32-NEXT:    retl
;
; AVX512DQ-64-LABEL: test_mul_v16i32_v16i8:
; AVX512DQ-64:       # %bb.0:
; AVX512DQ-64-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512DQ-64-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; AVX512DQ-64-NEXT:    retq
;
; AVX512BW-32-LABEL: test_mul_v16i32_v16i8:
; AVX512BW-32:       # %bb.0:
; AVX512BW-32-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512BW-32-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0 # [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0]
; AVX512BW-32-NEXT:    retl
;
; AVX512BW-64-LABEL: test_mul_v16i32_v16i8:
; AVX512BW-64:       # %bb.0:
; AVX512BW-64-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512BW-64-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0]
; AVX512BW-64-NEXT:    retq
;
; KNL-32-LABEL: test_mul_v16i32_v16i8:
; KNL-32:       # %bb.0:
; KNL-32-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; KNL-32-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm0
; KNL-32-NEXT:    retl
;
; KNL-64-LABEL: test_mul_v16i32_v16i8:
; KNL-64:       # %bb.0:
; KNL-64-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; KNL-64-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; KNL-64-NEXT:    retq
  %z = zext <16 x i8> %A to <16 x i32>
  %m = mul nuw nsw <16 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
  ret <16 x i32> %m
}

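; pmaddwd treats its operands as signed 16-bit values, so it cannot be used
; once the inputs are zero-extended from i16 (values above 32767 would be
; misread as negative). Slow-pmulld targets instead build the 32-bit products
; from pmullw/pmulhuw halves and interleave them with punpcklwd/punpckhwd.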
define <4 x i32> @test_mul_v4i32_v4i16(<4 x i16> %A) {
; SLM-LABEL: test_mul_v4i32_v4i16:
; SLM:       # %bb.0:
; SLM-NEXT:    movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778,u,u,u,u]
; SLM-NEXT:    movdqa %xmm0, %xmm2
; SLM-NEXT:    pmulhuw %xmm1, %xmm2
; SLM-NEXT:    pmullw %xmm1, %xmm0
; SLM-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLM-NEXT:    ret{{[l|q]}}
;
; SLOW-LABEL: test_mul_v4i32_v4i16:
; SLOW:       # %bb.0:
; SLOW-NEXT:    movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778,u,u,u,u]
; SLOW-NEXT:    movdqa %xmm0, %xmm2
; SLOW-NEXT:    pmulhuw %xmm1, %xmm2
; SLOW-NEXT:    pmullw %xmm1, %xmm0
; SLOW-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLOW-NEXT:    ret{{[l|q]}}
;
; SSE4-32-LABEL: test_mul_v4i32_v4i16:
; SSE4-32:       # %bb.0:
; SSE4-32-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-32-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; SSE4-32-NEXT:    retl
;
; SSE4-64-LABEL: test_mul_v4i32_v4i16:
; SSE4-64:       # %bb.0:
; SSE4-64-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-64-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE4-64-NEXT:    retq
;
; AVX2-SLOW-LABEL: test_mul_v4i32_v4i16:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX2-SLOW-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm2
; AVX2-SLOW-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX2-SLOW-NEXT:    ret{{[l|q]}}
;
; AVX-32-LABEL: test_mul_v4i32_v4i16:
; AVX-32:       # %bb.0:
; AVX-32-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-32-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; AVX-32-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX-32-NEXT:    retl
;
; AVX-64-LABEL: test_mul_v4i32_v4i16:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-64-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; AVX-64-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX-64-NEXT:    retq
  %z = zext <4 x i16> %A to <4 x i32>
  %m = mul nuw nsw <4 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778>
  ret <4 x i32> %m
}

define <8 x i32> @test_mul_v8i32_v8i16(<8 x i16> %A) {
; SLM-LABEL: test_mul_v8i32_v8i16:
; SLM:       # %bb.0:
; SLM-NEXT:    movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLM-NEXT:    movdqa %xmm0, %xmm2
; SLM-NEXT:    pmulhuw %xmm1, %xmm2
; SLM-NEXT:    pmullw %xmm0, %xmm1
; SLM-NEXT:    movdqa %xmm1, %xmm0
; SLM-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SLM-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLM-NEXT:    ret{{[l|q]}}
;
; SLOW-LABEL: test_mul_v8i32_v8i16:
; SLOW:       # %bb.0:
; SLOW-NEXT:    movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLOW-NEXT:    movdqa %xmm0, %xmm2
; SLOW-NEXT:    pmulhuw %xmm1, %xmm2
; SLOW-NEXT:    pmullw %xmm0, %xmm1
; SLOW-NEXT:    movdqa %xmm1, %xmm0
; SLOW-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLOW-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SLOW-NEXT:    ret{{[l|q]}}
;
; SSE4-LABEL: test_mul_v8i32_v8i16:
; SSE4:       # %bb.0:
; SSE4-NEXT:    pxor %xmm1, %xmm1
; SSE4-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE4-NEXT:    pmovsxwd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; SSE4-NEXT:    pmulld %xmm1, %xmm2
; SSE4-NEXT:    pmulld %xmm0, %xmm1
; SSE4-NEXT:    movdqa %xmm2, %xmm0
; SSE4-NEXT:    ret{{[l|q]}}
;
; AVX2-SLOW-LABEL: test_mul_v8i32_v8i16:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX2-SLOW-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm2
; AVX2-SLOW-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT:    ret{{[l|q]}}
;
; AVX-32-LABEL: test_mul_v8i32_v8i16:
; AVX-32:       # %bb.0:
; AVX-32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-32-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX-32-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX-32-NEXT:    retl
;
; AVX-64-LABEL: test_mul_v8i32_v8i16:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-64-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX-64-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX-64-NEXT:    retq
  %z = zext <8 x i16> %A to <8 x i32>
  %m = mul nuw nsw <8 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
  ret <8 x i32> %m
}

define <16 x i32> @test_mul_v16i32_v16i16(<16 x i16> %A) {
; SLM-LABEL: test_mul_v16i32_v16i16:
; SLM:       # %bb.0:
; SLM-NEXT:    movdqa {{.*#+}} xmm3 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLM-NEXT:    movdqa %xmm0, %xmm4
; SLM-NEXT:    movdqa %xmm0, %xmm2
; SLM-NEXT:    movdqa %xmm1, %xmm5
; SLM-NEXT:    pmullw %xmm3, %xmm4
; SLM-NEXT:    pmulhuw %xmm3, %xmm2
; SLM-NEXT:    pmulhuw %xmm3, %xmm5
; SLM-NEXT:    pmullw %xmm1, %xmm3
; SLM-NEXT:    movdqa %xmm4, %xmm0
; SLM-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; SLM-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLM-NEXT:    movdqa %xmm3, %xmm2
; SLM-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
; SLM-NEXT:    movdqa %xmm4, %xmm1
; SLM-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
; SLM-NEXT:    ret{{[l|q]}}
;
; SLOW-LABEL: test_mul_v16i32_v16i16:
; SLOW:       # %bb.0:
; SLOW-NEXT:    movdqa %xmm0, %xmm4
; SLOW-NEXT:    movdqa {{.*#+}} xmm3 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLOW-NEXT:    movdqa %xmm0, %xmm2
; SLOW-NEXT:    pmulhuw %xmm3, %xmm2
; SLOW-NEXT:    pmullw %xmm3, %xmm4
; SLOW-NEXT:    movdqa %xmm4, %xmm0
; SLOW-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLOW-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; SLOW-NEXT:    movdqa %xmm1, %xmm5
; SLOW-NEXT:    pmulhuw %xmm3, %xmm5
; SLOW-NEXT:    pmullw %xmm1, %xmm3
; SLOW-NEXT:    movdqa %xmm3, %xmm2
; SLOW-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
; SLOW-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
; SLOW-NEXT:    movdqa %xmm4, %xmm1
; SLOW-NEXT:    ret{{[l|q]}}
;
; SSE4-LABEL: test_mul_v16i32_v16i16:
; SSE4:       # %bb.0:
; SSE4-NEXT:    movdqa %xmm0, %xmm4
; SSE4-NEXT:    pxor %xmm3, %xmm3
; SSE4-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE4-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SSE4-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; SSE4-NEXT:    pmovsxwd {{.*#+}} xmm3 = [18778,18778,18778,18778]
; SSE4-NEXT:    pmulld %xmm3, %xmm0
; SSE4-NEXT:    pmulld %xmm3, %xmm4
; SSE4-NEXT:    pmulld %xmm3, %xmm2
; SSE4-NEXT:    pmulld %xmm1, %xmm3
; SSE4-NEXT:    movdqa %xmm4, %xmm1
; SSE4-NEXT:    ret{{[l|q]}}
;
; AVX2-SLOW-LABEL: test_mul_v16i32_v16i16:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778]
; AVX2-SLOW-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm2
; AVX2-SLOW-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15]
; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11]
; AVX2-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm2[0,1],ymm1[0,1]
; AVX2-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
; AVX2-SLOW-NEXT:    ret{{[l|q]}}
;
; AVX2-32-LABEL: test_mul_v16i32_v16i16:
; AVX2-32:       # %bb.0:
; AVX2-32-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-32-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-32-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX2-32-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
; AVX2-32-NEXT:    vpmulld %ymm2, %ymm1, %ymm1
; AVX2-32-NEXT:    retl
;
; AVX2-64-LABEL: test_mul_v16i32_v16i16:
; AVX2-64:       # %bb.0:
; AVX2-64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-64-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-64-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX2-64-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT:    vpmulld %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT:    retq
;
; AVX512-32-LABEL: test_mul_v16i32_v16i16:
; AVX512-32:       # %bb.0:
; AVX512-32-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512-32-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm0
; AVX512-32-NEXT:    retl
;
; AVX512-64-LABEL: test_mul_v16i32_v16i16:
; AVX512-64:       # %bb.0:
; AVX512-64-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512-64-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; AVX512-64-NEXT:    retq
  %z = zext <16 x i16> %A to <16 x i32>
  %m = mul nuw nsw <16 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
  ret <16 x i32> %m
}

;
; MinSize Tests
;
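; Under minsize, the single pmulld-from-memory form is preferred for the i16
; cases even on slow-pmulld targets, since it is smaller than the
; pmullw/pmulhuw/punpck expansion; the i8 cases keep the one-instruction
; pmaddwd lowering.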
541
542define <4 x i32> @test_mul_v4i32_v4i8_minsize(<4 x i8> %A) minsize {
543; SSE-32-LABEL: test_mul_v4i32_v4i8_minsize:
544; SSE-32:       # %bb.0:
545; SSE-32-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
546; SSE-32-NEXT:    pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [18778,0,18778,0,18778,0,18778,0]
547; SSE-32-NEXT:    retl
548;
549; SSE-64-LABEL: test_mul_v4i32_v4i8_minsize:
550; SSE-64:       # %bb.0:
551; SSE-64-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
552; SSE-64-NEXT:    pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [18778,0,18778,0,18778,0,18778,0]
553; SSE-64-NEXT:    retq
554;
555; AVX2-SLOW32-LABEL: test_mul_v4i32_v4i8_minsize:
556; AVX2-SLOW32:       # %bb.0:
557; AVX2-SLOW32-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
558; AVX2-SLOW32-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [18778,0,18778,0,18778,0,18778,0]
559; AVX2-SLOW32-NEXT:    retl
560;
561; AVX2-SLOW64-LABEL: test_mul_v4i32_v4i8_minsize:
562; AVX2-SLOW64:       # %bb.0:
563; AVX2-SLOW64-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
564; AVX2-SLOW64-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [18778,0,18778,0,18778,0,18778,0]
565; AVX2-SLOW64-NEXT:    retq
566;
567; AVX2-32-LABEL: test_mul_v4i32_v4i8_minsize:
568; AVX2-32:       # %bb.0:
569; AVX2-32-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
570; AVX2-32-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [18778,0,18778,0,18778,0,18778,0]
571; AVX2-32-NEXT:    retl
572;
573; AVX2-64-LABEL: test_mul_v4i32_v4i8_minsize:
574; AVX2-64:       # %bb.0:
575; AVX2-64-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
576; AVX2-64-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [18778,0,18778,0,18778,0,18778,0]
577; AVX2-64-NEXT:    retq
578;
579; AVX512DQ-32-LABEL: test_mul_v4i32_v4i8_minsize:
580; AVX512DQ-32:       # %bb.0:
581; AVX512DQ-32-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
582; AVX512DQ-32-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [18778,0,18778,0,18778,0,18778,0]
583; AVX512DQ-32-NEXT:    retl
584;
585; AVX512DQ-64-LABEL: test_mul_v4i32_v4i8_minsize:
586; AVX512DQ-64:       # %bb.0:
587; AVX512DQ-64-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
588; AVX512DQ-64-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [18778,0,18778,0,18778,0,18778,0]
589; AVX512DQ-64-NEXT:    retq
590;
591; AVX512BW-32-LABEL: test_mul_v4i32_v4i8_minsize:
592; AVX512BW-32:       # %bb.0:
593; AVX512BW-32-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
594; AVX512BW-32-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [18778,0,18778,0,18778,0,18778,0]
595; AVX512BW-32-NEXT:    retl
596;
597; AVX512BW-64-LABEL: test_mul_v4i32_v4i8_minsize:
598; AVX512BW-64:       # %bb.0:
599; AVX512BW-64-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
600; AVX512BW-64-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [18778,0,18778,0,18778,0,18778,0]
601; AVX512BW-64-NEXT:    retq
602;
603; KNL-32-LABEL: test_mul_v4i32_v4i8_minsize:
604; KNL-32:       # %bb.0:
605; KNL-32-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
606; KNL-32-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
607; KNL-32-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
608; KNL-32-NEXT:    retl
609;
610; KNL-64-LABEL: test_mul_v4i32_v4i8_minsize:
611; KNL-64:       # %bb.0:
612; KNL-64-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
613; KNL-64-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
614; KNL-64-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
615; KNL-64-NEXT:    retq
616  %z = zext <4 x i8> %A to <4 x i32>
617  %m = mul nuw nsw <4 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778>
618  ret <4 x i32> %m
619}
620
621define <8 x i32> @test_mul_v8i32_v8i8_minsize(<8 x i8> %A) minsize {
622; SLM-LABEL: test_mul_v8i32_v8i8_minsize:
623; SLM:       # %bb.0:
624; SLM-NEXT:    pmovsxwd {{.*#+}} xmm2 = [18778,18778,18778,18778]
625; SLM-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
626; SLM-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
627; SLM-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
628; SLM-NEXT:    pmaddwd %xmm2, %xmm0
629; SLM-NEXT:    pmaddwd %xmm2, %xmm1
630; SLM-NEXT:    ret{{[l|q]}}
631;
632; SLOW-LABEL: test_mul_v8i32_v8i8_minsize:
633; SLOW:       # %bb.0:
634; SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
635; SLOW-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
636; SLOW-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
637; SLOW-NEXT:    pmovsxwd {{.*#+}} xmm2 = [18778,18778,18778,18778]
638; SLOW-NEXT:    pmaddwd %xmm2, %xmm0
639; SLOW-NEXT:    pmaddwd %xmm2, %xmm1
640; SLOW-NEXT:    ret{{[l|q]}}
641;
642; SSE4-LABEL: test_mul_v8i32_v8i8_minsize:
643; SSE4:       # %bb.0:
644; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
645; SSE4-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
646; SSE4-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
647; SSE4-NEXT:    pmovsxwd {{.*#+}} xmm2 = [18778,18778,18778,18778]
648; SSE4-NEXT:    pmaddwd %xmm2, %xmm0
649; SSE4-NEXT:    pmaddwd %xmm2, %xmm1
650; SSE4-NEXT:    ret{{[l|q]}}
651;
652; AVX2-SLOW32-LABEL: test_mul_v8i32_v8i8_minsize:
653; AVX2-SLOW32:       # %bb.0:
654; AVX2-SLOW32-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
655; AVX2-SLOW32-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0]
656; AVX2-SLOW32-NEXT:    retl
657;
658; AVX2-SLOW64-LABEL: test_mul_v8i32_v8i8_minsize:
659; AVX2-SLOW64:       # %bb.0:
660; AVX2-SLOW64-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
661; AVX2-SLOW64-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0]
662; AVX2-SLOW64-NEXT:    retq
663;
664; AVX2-32-LABEL: test_mul_v8i32_v8i8_minsize:
665; AVX2-32:       # %bb.0:
666; AVX2-32-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
667; AVX2-32-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0]
668; AVX2-32-NEXT:    retl
669;
670; AVX2-64-LABEL: test_mul_v8i32_v8i8_minsize:
671; AVX2-64:       # %bb.0:
672; AVX2-64-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
673; AVX2-64-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0]
674; AVX2-64-NEXT:    retq
675;
676; AVX512DQ-32-LABEL: test_mul_v8i32_v8i8_minsize:
677; AVX512DQ-32:       # %bb.0:
678; AVX512DQ-32-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
679; AVX512DQ-32-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0]
680; AVX512DQ-32-NEXT:    retl
681;
682; AVX512DQ-64-LABEL: test_mul_v8i32_v8i8_minsize:
683; AVX512DQ-64:       # %bb.0:
684; AVX512DQ-64-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
685; AVX512DQ-64-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0]
686; AVX512DQ-64-NEXT:    retq
687;
688; AVX512BW-32-LABEL: test_mul_v8i32_v8i8_minsize:
689; AVX512BW-32:       # %bb.0:
690; AVX512BW-32-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
691; AVX512BW-32-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0]
692; AVX512BW-32-NEXT:    retl
693;
694; AVX512BW-64-LABEL: test_mul_v8i32_v8i8_minsize:
695; AVX512BW-64:       # %bb.0:
696; AVX512BW-64-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
697; AVX512BW-64-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0]
698; AVX512BW-64-NEXT:    retq
699;
700; KNL-32-LABEL: test_mul_v8i32_v8i8_minsize:
701; KNL-32:       # %bb.0:
702; KNL-32-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
703; KNL-32-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
704; KNL-32-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
705; KNL-32-NEXT:    retl
706;
707; KNL-64-LABEL: test_mul_v8i32_v8i8_minsize:
708; KNL-64:       # %bb.0:
709; KNL-64-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
710; KNL-64-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
711; KNL-64-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
712; KNL-64-NEXT:    retq
713  %z = zext <8 x i8> %A to <8 x i32>
714  %m = mul nuw nsw <8 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
715  ret <8 x i32> %m
716}
717
718define <16 x i32> @test_mul_v16i32_v16i8_minsize(<16 x i8> %A) minsize {
719; SLM-LABEL: test_mul_v16i32_v16i8_minsize:
720; SLM:       # %bb.0:
721; SLM-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
722; SLM-NEXT:    pmovsxwd {{.*#+}} xmm5 = [18778,18778,18778,18778]
723; SLM-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
724; SLM-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
725; SLM-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
726; SLM-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
727; SLM-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
728; SLM-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
729; SLM-NEXT:    pmaddwd %xmm5, %xmm0
730; SLM-NEXT:    pmaddwd %xmm5, %xmm1
731; SLM-NEXT:    pmaddwd %xmm5, %xmm2
732; SLM-NEXT:    pmaddwd %xmm5, %xmm3
733; SLM-NEXT:    ret{{[l|q]}}
734;
735; SLOW-LABEL: test_mul_v16i32_v16i8_minsize:
736; SLOW:       # %bb.0:
737; SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
738; SLOW-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
739; SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
740; SLOW-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
741; SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
742; SLOW-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
743; SLOW-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
744; SLOW-NEXT:    pmovsxwd {{.*#+}} xmm4 = [18778,18778,18778,18778]
745; SLOW-NEXT:    pmaddwd %xmm4, %xmm0
746; SLOW-NEXT:    pmaddwd %xmm4, %xmm1
747; SLOW-NEXT:    pmaddwd %xmm4, %xmm2
748; SLOW-NEXT:    pmaddwd %xmm4, %xmm3
749; SLOW-NEXT:    ret{{[l|q]}}
750;
751; SSE4-LABEL: test_mul_v16i32_v16i8_minsize:
752; SSE4:       # %bb.0:
753; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
754; SSE4-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
755; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
756; SSE4-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
757; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
758; SSE4-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
759; SSE4-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
760; SSE4-NEXT:    pmovsxwd {{.*#+}} xmm4 = [18778,18778,18778,18778]
761; SSE4-NEXT:    pmaddwd %xmm4, %xmm0
762; SSE4-NEXT:    pmaddwd %xmm4, %xmm1
763; SSE4-NEXT:    pmaddwd %xmm4, %xmm2
764; SSE4-NEXT:    pmaddwd %xmm4, %xmm3
765; SSE4-NEXT:    ret{{[l|q]}}
766;
767; AVX2-SLOW-LABEL: test_mul_v16i32_v16i8_minsize:
768; AVX2-SLOW:       # %bb.0:
769; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
770; AVX2-SLOW-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
771; AVX2-SLOW-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
772; AVX2-SLOW-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0]
773; AVX2-SLOW-NEXT:    vpmaddwd %ymm2, %ymm0, %ymm0
774; AVX2-SLOW-NEXT:    vpmaddwd %ymm2, %ymm1, %ymm1
775; AVX2-SLOW-NEXT:    ret{{[l|q]}}
776;
777; AVX2-32-LABEL: test_mul_v16i32_v16i8_minsize:
778; AVX2-32:       # %bb.0:
779; AVX2-32-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
780; AVX2-32-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
781; AVX2-32-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
782; AVX2-32-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0]
783; AVX2-32-NEXT:    vpmaddwd %ymm2, %ymm0, %ymm0
784; AVX2-32-NEXT:    vpmaddwd %ymm2, %ymm1, %ymm1
785; AVX2-32-NEXT:    retl
786;
787; AVX2-64-LABEL: test_mul_v16i32_v16i8_minsize:
788; AVX2-64:       # %bb.0:
789; AVX2-64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
790; AVX2-64-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
791; AVX2-64-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
792; AVX2-64-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0]
793; AVX2-64-NEXT:    vpmaddwd %ymm2, %ymm0, %ymm0
794; AVX2-64-NEXT:    vpmaddwd %ymm2, %ymm1, %ymm1
795; AVX2-64-NEXT:    retq
796;
797; AVX512DQ-32-LABEL: test_mul_v16i32_v16i8_minsize:
798; AVX512DQ-32:       # %bb.0:
; AVX512DQ-32-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512DQ-32-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm0
; AVX512DQ-32-NEXT:    retl
;
; AVX512DQ-64-LABEL: test_mul_v16i32_v16i8_minsize:
; AVX512DQ-64:       # %bb.0:
; AVX512DQ-64-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512DQ-64-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; AVX512DQ-64-NEXT:    retq
;
; AVX512BW-32-LABEL: test_mul_v16i32_v16i8_minsize:
; AVX512BW-32:       # %bb.0:
; AVX512BW-32-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512BW-32-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0 # [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0]
; AVX512BW-32-NEXT:    retl
;
; AVX512BW-64-LABEL: test_mul_v16i32_v16i8_minsize:
; AVX512BW-64:       # %bb.0:
; AVX512BW-64-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512BW-64-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0]
; AVX512BW-64-NEXT:    retq
;
; KNL-32-LABEL: test_mul_v16i32_v16i8_minsize:
; KNL-32:       # %bb.0:
; KNL-32-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; KNL-32-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm0
; KNL-32-NEXT:    retl
;
; KNL-64-LABEL: test_mul_v16i32_v16i8_minsize:
; KNL-64:       # %bb.0:
; KNL-64-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; KNL-64-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; KNL-64-NEXT:    retq
  %z = zext <16 x i8> %A to <16 x i32>
  %m = mul nuw nsw <16 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
  ret <16 x i32> %m
}

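; With minsize, a single pmulld should be preferred over a multi-instruction
; multiply expansion, even on slow-pmulld targets.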
define <4 x i32> @test_mul_v4i32_v4i16_minsize(<4 x i16> %A) minsize {
; SSE-32-LABEL: test_mul_v4i32_v4i16_minsize:
; SSE-32:       # %bb.0:
; SSE-32-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE-32-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; SSE-32-NEXT:    retl
;
; SSE-64-LABEL: test_mul_v4i32_v4i16_minsize:
; SSE-64:       # %bb.0:
; SSE-64-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE-64-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-64-NEXT:    retq
;
; AVX2-LABEL: test_mul_v4i32_v4i16_minsize:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    ret{{[l|q]}}
  %z = zext <4 x i16> %A to <4 x i32>
  %m = mul nuw nsw <4 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778>
  ret <4 x i32> %m
}

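; The v8i32 result no longer fits in one xmm register, so SSE codegen splits
; into two pmulld ops that share a single splat constant; AVX2 keeps it to one
; ymm pmulld with a broadcast constant.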
define <8 x i32> @test_mul_v8i32_v8i16_minsize(<8 x i16> %A) minsize {
; SLM-LABEL: test_mul_v8i32_v8i16_minsize:
; SLM:       # %bb.0:
; SLM-NEXT:    pmovsxwd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; SLM-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLM-NEXT:    pxor %xmm3, %xmm3
; SLM-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
; SLM-NEXT:    pmulld %xmm1, %xmm2
; SLM-NEXT:    pmulld %xmm0, %xmm1
; SLM-NEXT:    movdqa %xmm2, %xmm0
; SLM-NEXT:    ret{{[l|q]}}
;
; SLOW-LABEL: test_mul_v8i32_v8i16_minsize:
; SLOW:       # %bb.0:
; SLOW-NEXT:    pxor %xmm1, %xmm1
; SLOW-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLOW-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SLOW-NEXT:    pmovsxwd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; SLOW-NEXT:    pmulld %xmm1, %xmm2
; SLOW-NEXT:    pmulld %xmm0, %xmm1
; SLOW-NEXT:    movdqa %xmm2, %xmm0
; SLOW-NEXT:    ret{{[l|q]}}
;
; SSE4-LABEL: test_mul_v8i32_v8i16_minsize:
; SSE4:       # %bb.0:
; SSE4-NEXT:    pxor %xmm1, %xmm1
; SSE4-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE4-NEXT:    pmovsxwd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; SSE4-NEXT:    pmulld %xmm1, %xmm2
; SSE4-NEXT:    pmulld %xmm0, %xmm1
; SSE4-NEXT:    movdqa %xmm2, %xmm0
; SSE4-NEXT:    ret{{[l|q]}}
;
; AVX2-LABEL: test_mul_v8i32_v8i16_minsize:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    ret{{[l|q]}}
  %z = zext <8 x i16> %A to <8 x i32>
  %m = mul nuw nsw <8 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
  ret <8 x i32> %m
}

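; The v16i32 case scales the same pattern: four xmm pmulld ops on SSE targets,
; two ymm pmulld ops on AVX2, and a single zmm vpmulld with an embedded
; broadcast ({1to16}) on AVX512.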
define <16 x i32> @test_mul_v16i32_v16i16_minsize(<16 x i16> %A) minsize {
; SLM-LABEL: test_mul_v16i32_v16i16_minsize:
; SLM:       # %bb.0:
; SLM-NEXT:    pmovsxwd {{.*#+}} xmm3 = [18778,18778,18778,18778]
; SLM-NEXT:    movdqa %xmm0, %xmm4
; SLM-NEXT:    pxor %xmm5, %xmm5
; SLM-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SLM-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLM-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
; SLM-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
; SLM-NEXT:    pmulld %xmm3, %xmm4
; SLM-NEXT:    pmulld %xmm3, %xmm0
; SLM-NEXT:    pmulld %xmm3, %xmm2
; SLM-NEXT:    pmulld %xmm1, %xmm3
; SLM-NEXT:    movdqa %xmm4, %xmm1
; SLM-NEXT:    ret{{[l|q]}}
;
; SLOW-LABEL: test_mul_v16i32_v16i16_minsize:
; SLOW:       # %bb.0:
; SLOW-NEXT:    movdqa %xmm0, %xmm4
; SLOW-NEXT:    pxor %xmm3, %xmm3
; SLOW-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SLOW-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SLOW-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLOW-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; SLOW-NEXT:    pmovsxwd {{.*#+}} xmm3 = [18778,18778,18778,18778]
; SLOW-NEXT:    pmulld %xmm3, %xmm0
; SLOW-NEXT:    pmulld %xmm3, %xmm4
; SLOW-NEXT:    pmulld %xmm3, %xmm2
; SLOW-NEXT:    pmulld %xmm1, %xmm3
; SLOW-NEXT:    movdqa %xmm4, %xmm1
; SLOW-NEXT:    ret{{[l|q]}}
;
; SSE4-LABEL: test_mul_v16i32_v16i16_minsize:
; SSE4:       # %bb.0:
; SSE4-NEXT:    movdqa %xmm0, %xmm4
; SSE4-NEXT:    pxor %xmm3, %xmm3
; SSE4-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE4-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SSE4-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; SSE4-NEXT:    pmovsxwd {{.*#+}} xmm3 = [18778,18778,18778,18778]
; SSE4-NEXT:    pmulld %xmm3, %xmm0
; SSE4-NEXT:    pmulld %xmm3, %xmm4
; SSE4-NEXT:    pmulld %xmm3, %xmm2
; SSE4-NEXT:    pmulld %xmm1, %xmm3
; SSE4-NEXT:    movdqa %xmm4, %xmm1
; SSE4-NEXT:    ret{{[l|q]}}
;
; AVX2-SLOW-LABEL: test_mul_v16i32_v16i16_minsize:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-SLOW-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-SLOW-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX2-SLOW-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
; AVX2-SLOW-NEXT:    vpmulld %ymm2, %ymm1, %ymm1
; AVX2-SLOW-NEXT:    ret{{[l|q]}}
;
; AVX2-32-LABEL: test_mul_v16i32_v16i16_minsize:
; AVX2-32:       # %bb.0:
; AVX2-32-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-32-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-32-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX2-32-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
; AVX2-32-NEXT:    vpmulld %ymm2, %ymm1, %ymm1
; AVX2-32-NEXT:    retl
;
; AVX2-64-LABEL: test_mul_v16i32_v16i16_minsize:
; AVX2-64:       # %bb.0:
; AVX2-64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-64-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-64-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX2-64-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT:    vpmulld %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT:    retq
;
; AVX512-32-LABEL: test_mul_v16i32_v16i16_minsize:
; AVX512-32:       # %bb.0:
; AVX512-32-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512-32-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm0
; AVX512-32-NEXT:    retl
;
; AVX512-64-LABEL: test_mul_v16i32_v16i16_minsize:
; AVX512-64:       # %bb.0:
; AVX512-64-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512-64-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; AVX512-64-NEXT:    retq
  %z = zext <16 x i16> %A to <16 x i32>
  %m = mul nuw nsw <16 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
  ret <16 x i32> %m
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; SLM-32: {{.*}}
; SLM-64: {{.*}}
; SLOW-32: {{.*}}
; SLOW-64: {{.*}}