; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86-SSE
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx  | FileCheck %s --check-prefixes=X86-AVX,X86-AVX1
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X86-AVX,X86-AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64-SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx  | FileCheck %s --check-prefixes=X64-AVX,X64-AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64-AVX,X64-AVX2

@c = external dso_local global ptr, align 8

; %val1 = load <2 x i8>
; %op1 = zext<2 x i32> %val1
; %val2 = load <2 x i8>
; %op2 = zext<2 x i32> %val2
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 %index) nounwind {
18; X86-SSE-LABEL: mul_2xi8:
19; X86-SSE:       # %bb.0: # %entry
20; X86-SSE-NEXT:    pushl %esi
21; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
22; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
23; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
24; X86-SSE-NEXT:    movl c, %esi
25; X86-SSE-NEXT:    movzwl (%edx,%eax), %edx
26; X86-SSE-NEXT:    movd %edx, %xmm0
27; X86-SSE-NEXT:    movzwl (%ecx,%eax), %ecx
28; X86-SSE-NEXT:    movd %ecx, %xmm1
29; X86-SSE-NEXT:    pxor %xmm2, %xmm2
30; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
31; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
32; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
33; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
34; X86-SSE-NEXT:    movq %xmm1, (%esi,%eax,4)
35; X86-SSE-NEXT:    popl %esi
36; X86-SSE-NEXT:    retl
37;
38; X86-AVX-LABEL: mul_2xi8:
39; X86-AVX:       # %bb.0: # %entry
40; X86-AVX-NEXT:    pushl %esi
41; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
42; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
43; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
44; X86-AVX-NEXT:    movl c, %esi
45; X86-AVX-NEXT:    movzwl (%edx,%ecx), %edx
46; X86-AVX-NEXT:    vmovd %edx, %xmm0
47; X86-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
48; X86-AVX-NEXT:    movzwl (%eax,%ecx), %eax
49; X86-AVX-NEXT:    vmovd %eax, %xmm1
50; X86-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
51; X86-AVX-NEXT:    vpmaddwd %xmm0, %xmm1, %xmm0
52; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
53; X86-AVX-NEXT:    popl %esi
54; X86-AVX-NEXT:    retl
55;
56; X64-SSE-LABEL: mul_2xi8:
57; X64-SSE:       # %bb.0: # %entry
58; X64-SSE-NEXT:    movq c(%rip), %rax
59; X64-SSE-NEXT:    movzwl (%rdi,%rdx), %ecx
60; X64-SSE-NEXT:    movd %ecx, %xmm0
61; X64-SSE-NEXT:    movzwl (%rsi,%rdx), %ecx
62; X64-SSE-NEXT:    movd %ecx, %xmm1
63; X64-SSE-NEXT:    pxor %xmm2, %xmm2
64; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
65; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
66; X64-SSE-NEXT:    pmullw %xmm0, %xmm1
67; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
68; X64-SSE-NEXT:    movq %xmm1, (%rax,%rdx,4)
69; X64-SSE-NEXT:    retq
70;
71; X64-AVX-LABEL: mul_2xi8:
72; X64-AVX:       # %bb.0: # %entry
73; X64-AVX-NEXT:    movq c(%rip), %rax
74; X64-AVX-NEXT:    movzwl (%rdi,%rdx), %ecx
75; X64-AVX-NEXT:    vmovd %ecx, %xmm0
76; X64-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
77; X64-AVX-NEXT:    movzwl (%rsi,%rdx), %ecx
78; X64-AVX-NEXT:    vmovd %ecx, %xmm1
79; X64-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
80; X64-AVX-NEXT:    vpmaddwd %xmm0, %xmm1, %xmm0
81; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rdx,4)
82; X64-AVX-NEXT:    retq
entry:
  %pre = load ptr, ptr @c
  %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index
  %wide.load = load <2 x i8>, ptr %tmp6, align 1
  %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
  %tmp10 = getelementptr inbounds i8, ptr %b, i64 %index
  %wide.load17 = load <2 x i8>, ptr %tmp10, align 1
  %tmp12 = zext <2 x i8> %wide.load17 to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index
  store <2 x i32> %tmp13, ptr %tmp14, align 4
  ret void
}

; %val1 = load <4 x i8>
; %op1 = zext<4 x i32> %val1
; %val2 = load <4 x i8>
; %op2 = zext<4 x i32> %val2
; %rst = mul <4 x i32> %op1, %op2
;
define void @mul_4xi8(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 %index) nounwind {
104; X86-SSE-LABEL: mul_4xi8:
105; X86-SSE:       # %bb.0: # %entry
106; X86-SSE-NEXT:    pushl %esi
107; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
108; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
109; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
110; X86-SSE-NEXT:    movl c, %esi
111; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
112; X86-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
113; X86-SSE-NEXT:    pxor %xmm2, %xmm2
114; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
115; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
116; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
117; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
118; X86-SSE-NEXT:    movdqu %xmm1, (%esi,%ecx,4)
119; X86-SSE-NEXT:    popl %esi
120; X86-SSE-NEXT:    retl
121;
122; X86-AVX-LABEL: mul_4xi8:
123; X86-AVX:       # %bb.0: # %entry
124; X86-AVX-NEXT:    pushl %esi
125; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
126; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
127; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
128; X86-AVX-NEXT:    movl c, %esi
129; X86-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
130; X86-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
131; X86-AVX-NEXT:    vpmaddwd %xmm0, %xmm1, %xmm0
132; X86-AVX-NEXT:    vmovdqu %xmm0, (%esi,%ecx,4)
133; X86-AVX-NEXT:    popl %esi
134; X86-AVX-NEXT:    retl
135;
136; X64-SSE-LABEL: mul_4xi8:
137; X64-SSE:       # %bb.0: # %entry
138; X64-SSE-NEXT:    movq c(%rip), %rax
139; X64-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
140; X64-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
141; X64-SSE-NEXT:    pxor %xmm2, %xmm2
142; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
143; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
144; X64-SSE-NEXT:    pmullw %xmm0, %xmm1
145; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
146; X64-SSE-NEXT:    movdqu %xmm1, (%rax,%rdx,4)
147; X64-SSE-NEXT:    retq
148;
149; X64-AVX-LABEL: mul_4xi8:
150; X64-AVX:       # %bb.0: # %entry
151; X64-AVX-NEXT:    movq c(%rip), %rax
152; X64-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
153; X64-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
154; X64-AVX-NEXT:    vpmaddwd %xmm0, %xmm1, %xmm0
155; X64-AVX-NEXT:    vmovdqu %xmm0, (%rax,%rdx,4)
156; X64-AVX-NEXT:    retq
entry:
  %pre = load ptr, ptr @c
  %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index
  %wide.load = load <4 x i8>, ptr %tmp6, align 1
  %tmp8 = zext <4 x i8> %wide.load to <4 x i32>
  %tmp10 = getelementptr inbounds i8, ptr %b, i64 %index
  %wide.load17 = load <4 x i8>, ptr %tmp10, align 1
  %tmp12 = zext <4 x i8> %wide.load17 to <4 x i32>
  %tmp13 = mul nuw nsw <4 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index
  store <4 x i32> %tmp13, ptr %tmp14, align 4
  ret void
}

; %val1 = load <8 x i8>
; %op1 = zext<8 x i32> %val1
; %val2 = load <8 x i8>
; %op2 = zext<8 x i32> %val2
; %rst = mul <8 x i32> %op1, %op2
;
define void @mul_8xi8(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 %index) nounwind {
178; X86-SSE-LABEL: mul_8xi8:
179; X86-SSE:       # %bb.0: # %entry
180; X86-SSE-NEXT:    pushl %esi
181; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
182; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
183; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
184; X86-SSE-NEXT:    movl c, %ecx
185; X86-SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
186; X86-SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
187; X86-SSE-NEXT:    pxor %xmm2, %xmm2
188; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
189; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
190; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
191; X86-SSE-NEXT:    movdqa %xmm1, %xmm0
192; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
193; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
194; X86-SSE-NEXT:    movdqu %xmm1, 16(%ecx,%eax,4)
195; X86-SSE-NEXT:    movdqu %xmm0, (%ecx,%eax,4)
196; X86-SSE-NEXT:    popl %esi
197; X86-SSE-NEXT:    retl
198;
199; X86-AVX1-LABEL: mul_8xi8:
200; X86-AVX1:       # %bb.0: # %entry
201; X86-AVX1-NEXT:    pushl %esi
202; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
203; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
204; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
205; X86-AVX1-NEXT:    movl c, %esi
206; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
207; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
208; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
209; X86-AVX1-NEXT:    vpmaddwd %xmm0, %xmm2, %xmm0
210; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
211; X86-AVX1-NEXT:    vpmaddwd %xmm1, %xmm2, %xmm1
212; X86-AVX1-NEXT:    vmovdqu %xmm0, 16(%esi,%ecx,4)
213; X86-AVX1-NEXT:    vmovdqu %xmm1, (%esi,%ecx,4)
214; X86-AVX1-NEXT:    popl %esi
215; X86-AVX1-NEXT:    retl
216;
217; X86-AVX2-LABEL: mul_8xi8:
218; X86-AVX2:       # %bb.0: # %entry
219; X86-AVX2-NEXT:    pushl %esi
220; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
221; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
222; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %edx
223; X86-AVX2-NEXT:    movl c, %esi
224; X86-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
225; X86-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
226; X86-AVX2-NEXT:    vpmaddwd %ymm0, %ymm1, %ymm0
227; X86-AVX2-NEXT:    vmovdqu %ymm0, (%esi,%ecx,4)
228; X86-AVX2-NEXT:    popl %esi
229; X86-AVX2-NEXT:    vzeroupper
230; X86-AVX2-NEXT:    retl
231;
232; X64-SSE-LABEL: mul_8xi8:
233; X64-SSE:       # %bb.0: # %entry
234; X64-SSE-NEXT:    movq c(%rip), %rax
235; X64-SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
236; X64-SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
237; X64-SSE-NEXT:    pxor %xmm2, %xmm2
238; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
239; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
240; X64-SSE-NEXT:    pmullw %xmm0, %xmm1
241; X64-SSE-NEXT:    movdqa %xmm1, %xmm0
242; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
243; X64-SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
244; X64-SSE-NEXT:    movdqu %xmm1, 16(%rax,%rdx,4)
245; X64-SSE-NEXT:    movdqu %xmm0, (%rax,%rdx,4)
246; X64-SSE-NEXT:    retq
247;
248; X64-AVX1-LABEL: mul_8xi8:
249; X64-AVX1:       # %bb.0: # %entry
250; X64-AVX1-NEXT:    movq c(%rip), %rax
251; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
252; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
253; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
254; X64-AVX1-NEXT:    vpmaddwd %xmm0, %xmm2, %xmm0
255; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
256; X64-AVX1-NEXT:    vpmaddwd %xmm1, %xmm2, %xmm1
257; X64-AVX1-NEXT:    vmovdqu %xmm0, 16(%rax,%rdx,4)
258; X64-AVX1-NEXT:    vmovdqu %xmm1, (%rax,%rdx,4)
259; X64-AVX1-NEXT:    retq
260;
261; X64-AVX2-LABEL: mul_8xi8:
262; X64-AVX2:       # %bb.0: # %entry
263; X64-AVX2-NEXT:    movq c(%rip), %rax
264; X64-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
265; X64-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
266; X64-AVX2-NEXT:    vpmaddwd %ymm0, %ymm1, %ymm0
267; X64-AVX2-NEXT:    vmovdqu %ymm0, (%rax,%rdx,4)
268; X64-AVX2-NEXT:    vzeroupper
269; X64-AVX2-NEXT:    retq
entry:
  %pre = load ptr, ptr @c
  %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index
  %wide.load = load <8 x i8>, ptr %tmp6, align 1
  %tmp8 = zext <8 x i8> %wide.load to <8 x i32>
  %tmp10 = getelementptr inbounds i8, ptr %b, i64 %index
  %wide.load17 = load <8 x i8>, ptr %tmp10, align 1
  %tmp12 = zext <8 x i8> %wide.load17 to <8 x i32>
  %tmp13 = mul nuw nsw <8 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index
  store <8 x i32> %tmp13, ptr %tmp14, align 4
  ret void
}

; %val1 = load <16 x i8>
; %op1 = zext<16 x i32> %val1
; %val2 = load <16 x i8>
; %op2 = zext<16 x i32> %val2
; %rst = mul <16 x i32> %op1, %op2
;
define void @mul_16xi8(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 %index) nounwind {
291; X86-SSE-LABEL: mul_16xi8:
292; X86-SSE:       # %bb.0: # %entry
293; X86-SSE-NEXT:    pushl %esi
294; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
295; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
296; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
297; X86-SSE-NEXT:    movl c, %ecx
298; X86-SSE-NEXT:    movdqu (%esi,%eax), %xmm3
299; X86-SSE-NEXT:    movdqu (%edx,%eax), %xmm0
300; X86-SSE-NEXT:    pxor %xmm1, %xmm1
301; X86-SSE-NEXT:    movdqa %xmm3, %xmm4
302; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
303; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
304; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
305; X86-SSE-NEXT:    pmullw %xmm4, %xmm2
306; X86-SSE-NEXT:    movdqa %xmm2, %xmm4
307; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
308; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
309; X86-SSE-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
310; X86-SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
311; X86-SSE-NEXT:    pmullw %xmm3, %xmm0
312; X86-SSE-NEXT:    movdqa %xmm0, %xmm3
313; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
314; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
315; X86-SSE-NEXT:    movdqu %xmm0, 48(%ecx,%eax,4)
316; X86-SSE-NEXT:    movdqu %xmm3, 32(%ecx,%eax,4)
317; X86-SSE-NEXT:    movdqu %xmm2, 16(%ecx,%eax,4)
318; X86-SSE-NEXT:    movdqu %xmm4, (%ecx,%eax,4)
319; X86-SSE-NEXT:    popl %esi
320; X86-SSE-NEXT:    retl
321;
322; X86-AVX1-LABEL: mul_16xi8:
323; X86-AVX1:       # %bb.0: # %entry
324; X86-AVX1-NEXT:    pushl %esi
325; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
326; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
327; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %esi
328; X86-AVX1-NEXT:    movl c, %ecx
329; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
330; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
331; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
332; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
333; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
334; X86-AVX1-NEXT:    vpmaddwd %xmm0, %xmm4, %xmm0
335; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
336; X86-AVX1-NEXT:    vpmaddwd %xmm1, %xmm4, %xmm1
337; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
338; X86-AVX1-NEXT:    vpmaddwd %xmm2, %xmm4, %xmm2
339; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
340; X86-AVX1-NEXT:    vpmaddwd %xmm3, %xmm4, %xmm3
341; X86-AVX1-NEXT:    vmovdqu %xmm0, 48(%ecx,%eax,4)
342; X86-AVX1-NEXT:    vmovdqu %xmm1, 32(%ecx,%eax,4)
343; X86-AVX1-NEXT:    vmovdqu %xmm2, 16(%ecx,%eax,4)
344; X86-AVX1-NEXT:    vmovdqu %xmm3, (%ecx,%eax,4)
345; X86-AVX1-NEXT:    popl %esi
346; X86-AVX1-NEXT:    retl
347;
348; X86-AVX2-LABEL: mul_16xi8:
349; X86-AVX2:       # %bb.0: # %entry
350; X86-AVX2-NEXT:    pushl %esi
351; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
352; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
353; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %edx
354; X86-AVX2-NEXT:    movl c, %esi
355; X86-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
356; X86-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
357; X86-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
358; X86-AVX2-NEXT:    vpmaddwd %ymm0, %ymm2, %ymm0
359; X86-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
360; X86-AVX2-NEXT:    vpmaddwd %ymm1, %ymm2, %ymm1
361; X86-AVX2-NEXT:    vmovdqu %ymm0, 32(%esi,%ecx,4)
362; X86-AVX2-NEXT:    vmovdqu %ymm1, (%esi,%ecx,4)
363; X86-AVX2-NEXT:    popl %esi
364; X86-AVX2-NEXT:    vzeroupper
365; X86-AVX2-NEXT:    retl
366;
367; X64-SSE-LABEL: mul_16xi8:
368; X64-SSE:       # %bb.0: # %entry
369; X64-SSE-NEXT:    movq c(%rip), %rax
370; X64-SSE-NEXT:    movdqu (%rdi,%rdx), %xmm0
371; X64-SSE-NEXT:    movdqu (%rsi,%rdx), %xmm1
372; X64-SSE-NEXT:    pxor %xmm2, %xmm2
373; X64-SSE-NEXT:    movdqa %xmm0, %xmm3
374; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
375; X64-SSE-NEXT:    movdqa %xmm1, %xmm4
376; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
377; X64-SSE-NEXT:    pmullw %xmm3, %xmm4
378; X64-SSE-NEXT:    movdqa %xmm4, %xmm3
379; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
380; X64-SSE-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
381; X64-SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
382; X64-SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
383; X64-SSE-NEXT:    pmullw %xmm0, %xmm1
384; X64-SSE-NEXT:    movdqa %xmm1, %xmm0
385; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
386; X64-SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
387; X64-SSE-NEXT:    movdqu %xmm1, 48(%rax,%rdx,4)
388; X64-SSE-NEXT:    movdqu %xmm0, 32(%rax,%rdx,4)
389; X64-SSE-NEXT:    movdqu %xmm4, 16(%rax,%rdx,4)
390; X64-SSE-NEXT:    movdqu %xmm3, (%rax,%rdx,4)
391; X64-SSE-NEXT:    retq
392;
393; X64-AVX1-LABEL: mul_16xi8:
394; X64-AVX1:       # %bb.0: # %entry
395; X64-AVX1-NEXT:    movq c(%rip), %rax
396; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
397; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
398; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
399; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
400; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
401; X64-AVX1-NEXT:    vpmaddwd %xmm0, %xmm4, %xmm0
402; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
403; X64-AVX1-NEXT:    vpmaddwd %xmm1, %xmm4, %xmm1
404; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
405; X64-AVX1-NEXT:    vpmaddwd %xmm2, %xmm4, %xmm2
406; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
407; X64-AVX1-NEXT:    vpmaddwd %xmm3, %xmm4, %xmm3
408; X64-AVX1-NEXT:    vmovdqu %xmm0, 48(%rax,%rdx,4)
409; X64-AVX1-NEXT:    vmovdqu %xmm1, 32(%rax,%rdx,4)
410; X64-AVX1-NEXT:    vmovdqu %xmm2, 16(%rax,%rdx,4)
411; X64-AVX1-NEXT:    vmovdqu %xmm3, (%rax,%rdx,4)
412; X64-AVX1-NEXT:    retq
413;
414; X64-AVX2-LABEL: mul_16xi8:
415; X64-AVX2:       # %bb.0: # %entry
416; X64-AVX2-NEXT:    movq c(%rip), %rax
417; X64-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
418; X64-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
419; X64-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
420; X64-AVX2-NEXT:    vpmaddwd %ymm0, %ymm2, %ymm0
421; X64-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
422; X64-AVX2-NEXT:    vpmaddwd %ymm1, %ymm2, %ymm1
423; X64-AVX2-NEXT:    vmovdqu %ymm0, 32(%rax,%rdx,4)
424; X64-AVX2-NEXT:    vmovdqu %ymm1, (%rax,%rdx,4)
425; X64-AVX2-NEXT:    vzeroupper
426; X64-AVX2-NEXT:    retq
entry:
  %pre = load ptr, ptr @c
  %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index
  %wide.load = load <16 x i8>, ptr %tmp6, align 1
  %tmp8 = zext <16 x i8> %wide.load to <16 x i32>
  %tmp10 = getelementptr inbounds i8, ptr %b, i64 %index
  %wide.load17 = load <16 x i8>, ptr %tmp10, align 1
  %tmp12 = zext <16 x i8> %wide.load17 to <16 x i32>
  %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index
  store <16 x i32> %tmp13, ptr %tmp14, align 4
  ret void
}

; %val1 = load <2 x i16>
; %op1 = zext<2 x i32> %val1
; %val2 = load <2 x i16>
; %op2 = zext<2 x i32> %val2
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi16(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 %index) nounwind {
448; X86-SSE-LABEL: mul_2xi16:
449; X86-SSE:       # %bb.0: # %entry
450; X86-SSE-NEXT:    pushl %esi
451; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
452; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
453; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
454; X86-SSE-NEXT:    movl c, %esi
455; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
456; X86-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
457; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
458; X86-SSE-NEXT:    pmulhuw %xmm0, %xmm2
459; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
460; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
461; X86-SSE-NEXT:    movq %xmm1, (%esi,%ecx,4)
462; X86-SSE-NEXT:    popl %esi
463; X86-SSE-NEXT:    retl
464;
465; X86-AVX-LABEL: mul_2xi16:
466; X86-AVX:       # %bb.0: # %entry
467; X86-AVX-NEXT:    pushl %esi
468; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
469; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
470; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
471; X86-AVX-NEXT:    movl c, %esi
472; X86-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
473; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
474; X86-AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
475; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
476; X86-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
477; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
478; X86-AVX-NEXT:    popl %esi
479; X86-AVX-NEXT:    retl
480;
481; X64-SSE-LABEL: mul_2xi16:
482; X64-SSE:       # %bb.0: # %entry
483; X64-SSE-NEXT:    movq c(%rip), %rax
484; X64-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
485; X64-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
486; X64-SSE-NEXT:    movdqa %xmm1, %xmm2
487; X64-SSE-NEXT:    pmulhuw %xmm0, %xmm2
488; X64-SSE-NEXT:    pmullw %xmm0, %xmm1
489; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
490; X64-SSE-NEXT:    movq %xmm1, (%rax,%rdx,4)
491; X64-SSE-NEXT:    retq
492;
493; X64-AVX-LABEL: mul_2xi16:
494; X64-AVX:       # %bb.0: # %entry
495; X64-AVX-NEXT:    movq c(%rip), %rax
496; X64-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
497; X64-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
498; X64-AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
499; X64-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
500; X64-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
501; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rdx,4)
502; X64-AVX-NEXT:    retq
entry:
  %pre = load ptr, ptr @c
  %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index
  %wide.load = load <2 x i16>, ptr %tmp6, align 1
  %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
  %tmp10 = getelementptr inbounds i8, ptr %b, i64 %index
  %wide.load17 = load <2 x i16>, ptr %tmp10, align 1
  %tmp12 = zext <2 x i16> %wide.load17 to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index
  store <2 x i32> %tmp13, ptr %tmp14, align 4
  ret void
}

; %val1 = load <4 x i16>
; %op1 = zext<4 x i32> %val1
; %val2 = load <4 x i16>
; %op2 = zext<4 x i32> %val2
; %rst = mul <4 x i32> %op1, %op2
;
define void @mul_4xi16(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 %index) nounwind {
524; X86-SSE-LABEL: mul_4xi16:
525; X86-SSE:       # %bb.0: # %entry
526; X86-SSE-NEXT:    pushl %esi
527; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
528; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
529; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
530; X86-SSE-NEXT:    movl c, %esi
531; X86-SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
532; X86-SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
533; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
534; X86-SSE-NEXT:    pmulhuw %xmm0, %xmm2
535; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
536; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
537; X86-SSE-NEXT:    movdqu %xmm1, (%esi,%ecx,4)
538; X86-SSE-NEXT:    popl %esi
539; X86-SSE-NEXT:    retl
540;
541; X86-AVX-LABEL: mul_4xi16:
542; X86-AVX:       # %bb.0: # %entry
543; X86-AVX-NEXT:    pushl %esi
544; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
545; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
546; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
547; X86-AVX-NEXT:    movl c, %esi
548; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
549; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
550; X86-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
551; X86-AVX-NEXT:    vmovdqu %xmm0, (%esi,%ecx,4)
552; X86-AVX-NEXT:    popl %esi
553; X86-AVX-NEXT:    retl
554;
555; X64-SSE-LABEL: mul_4xi16:
556; X64-SSE:       # %bb.0: # %entry
557; X64-SSE-NEXT:    movq c(%rip), %rax
558; X64-SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
559; X64-SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
560; X64-SSE-NEXT:    movdqa %xmm1, %xmm2
561; X64-SSE-NEXT:    pmulhuw %xmm0, %xmm2
562; X64-SSE-NEXT:    pmullw %xmm0, %xmm1
563; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
564; X64-SSE-NEXT:    movdqu %xmm1, (%rax,%rdx,4)
565; X64-SSE-NEXT:    retq
566;
567; X64-AVX-LABEL: mul_4xi16:
568; X64-AVX:       # %bb.0: # %entry
569; X64-AVX-NEXT:    movq c(%rip), %rax
570; X64-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
571; X64-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
572; X64-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
573; X64-AVX-NEXT:    vmovdqu %xmm0, (%rax,%rdx,4)
574; X64-AVX-NEXT:    retq
entry:
  %pre = load ptr, ptr @c
  %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index
  %wide.load = load <4 x i16>, ptr %tmp6, align 1
  %tmp8 = zext <4 x i16> %wide.load to <4 x i32>
  %tmp10 = getelementptr inbounds i8, ptr %b, i64 %index
  %wide.load17 = load <4 x i16>, ptr %tmp10, align 1
  %tmp12 = zext <4 x i16> %wide.load17 to <4 x i32>
  %tmp13 = mul nuw nsw <4 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index
  store <4 x i32> %tmp13, ptr %tmp14, align 4
  ret void
}

; %val1 = load <8 x i16>
; %op1 = zext<8 x i32> %val1
; %val2 = load <8 x i16>
; %op2 = zext<8 x i32> %val2
; %rst = mul <8 x i32> %op1, %op2
;
define void @mul_8xi16(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 %index) nounwind {
596; X86-SSE-LABEL: mul_8xi16:
597; X86-SSE:       # %bb.0: # %entry
598; X86-SSE-NEXT:    pushl %esi
599; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
600; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
601; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
602; X86-SSE-NEXT:    movl c, %esi
603; X86-SSE-NEXT:    movdqu (%edx,%ecx), %xmm0
604; X86-SSE-NEXT:    movdqu (%eax,%ecx), %xmm1
605; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
606; X86-SSE-NEXT:    pmulhuw %xmm0, %xmm2
607; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
608; X86-SSE-NEXT:    movdqa %xmm1, %xmm0
609; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
610; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
611; X86-SSE-NEXT:    movdqu %xmm1, 16(%esi,%ecx,4)
612; X86-SSE-NEXT:    movdqu %xmm0, (%esi,%ecx,4)
613; X86-SSE-NEXT:    popl %esi
614; X86-SSE-NEXT:    retl
615;
616; X86-AVX1-LABEL: mul_8xi16:
617; X86-AVX1:       # %bb.0: # %entry
618; X86-AVX1-NEXT:    pushl %esi
619; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
620; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
621; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
622; X86-AVX1-NEXT:    movl c, %esi
623; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
624; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
625; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
626; X86-AVX1-NEXT:    vpmulld %xmm0, %xmm2, %xmm0
627; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
628; X86-AVX1-NEXT:    vpmulld %xmm1, %xmm2, %xmm1
629; X86-AVX1-NEXT:    vmovdqu %xmm0, 16(%esi,%ecx,4)
630; X86-AVX1-NEXT:    vmovdqu %xmm1, (%esi,%ecx,4)
631; X86-AVX1-NEXT:    popl %esi
632; X86-AVX1-NEXT:    retl
633;
634; X86-AVX2-LABEL: mul_8xi16:
635; X86-AVX2:       # %bb.0: # %entry
636; X86-AVX2-NEXT:    pushl %esi
637; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
638; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
639; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %edx
640; X86-AVX2-NEXT:    movl c, %esi
641; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
642; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
643; X86-AVX2-NEXT:    vpmulld %ymm0, %ymm1, %ymm0
644; X86-AVX2-NEXT:    vmovdqu %ymm0, (%esi,%ecx,4)
645; X86-AVX2-NEXT:    popl %esi
646; X86-AVX2-NEXT:    vzeroupper
647; X86-AVX2-NEXT:    retl
648;
649; X64-SSE-LABEL: mul_8xi16:
650; X64-SSE:       # %bb.0: # %entry
651; X64-SSE-NEXT:    movq c(%rip), %rax
652; X64-SSE-NEXT:    movdqu (%rdi,%rdx), %xmm0
653; X64-SSE-NEXT:    movdqu (%rsi,%rdx), %xmm1
654; X64-SSE-NEXT:    movdqa %xmm1, %xmm2
655; X64-SSE-NEXT:    pmulhuw %xmm0, %xmm2
656; X64-SSE-NEXT:    pmullw %xmm0, %xmm1
657; X64-SSE-NEXT:    movdqa %xmm1, %xmm0
658; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
659; X64-SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
660; X64-SSE-NEXT:    movdqu %xmm1, 16(%rax,%rdx,4)
661; X64-SSE-NEXT:    movdqu %xmm0, (%rax,%rdx,4)
662; X64-SSE-NEXT:    retq
663;
664; X64-AVX1-LABEL: mul_8xi16:
665; X64-AVX1:       # %bb.0: # %entry
666; X64-AVX1-NEXT:    movq c(%rip), %rax
667; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
668; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
669; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
670; X64-AVX1-NEXT:    vpmulld %xmm0, %xmm2, %xmm0
671; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
672; X64-AVX1-NEXT:    vpmulld %xmm1, %xmm2, %xmm1
673; X64-AVX1-NEXT:    vmovdqu %xmm0, 16(%rax,%rdx,4)
674; X64-AVX1-NEXT:    vmovdqu %xmm1, (%rax,%rdx,4)
675; X64-AVX1-NEXT:    retq
676;
677; X64-AVX2-LABEL: mul_8xi16:
678; X64-AVX2:       # %bb.0: # %entry
679; X64-AVX2-NEXT:    movq c(%rip), %rax
680; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
681; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
682; X64-AVX2-NEXT:    vpmulld %ymm0, %ymm1, %ymm0
683; X64-AVX2-NEXT:    vmovdqu %ymm0, (%rax,%rdx,4)
684; X64-AVX2-NEXT:    vzeroupper
685; X64-AVX2-NEXT:    retq
entry:
  %pre = load ptr, ptr @c
  %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index
  %wide.load = load <8 x i16>, ptr %tmp6, align 1
  %tmp8 = zext <8 x i16> %wide.load to <8 x i32>
  %tmp10 = getelementptr inbounds i8, ptr %b, i64 %index
  %wide.load17 = load <8 x i16>, ptr %tmp10, align 1
  %tmp12 = zext <8 x i16> %wide.load17 to <8 x i32>
  %tmp13 = mul nuw nsw <8 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index
  store <8 x i32> %tmp13, ptr %tmp14, align 4
  ret void
}

; %val1 = load <16 x i16>
; %op1 = zext<16 x i32> %val1
; %val2 = load <16 x i16>
; %op2 = zext<16 x i32> %val2
; %rst = mul <16 x i32> %op1, %op2
;
define void @mul_16xi16(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 %index) nounwind {
707; X86-SSE-LABEL: mul_16xi16:
708; X86-SSE:       # %bb.0: # %entry
709; X86-SSE-NEXT:    pushl %esi
710; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
711; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
712; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
713; X86-SSE-NEXT:    movl c, %ecx
714; X86-SSE-NEXT:    movdqu (%esi,%eax), %xmm2
715; X86-SSE-NEXT:    movdqu 16(%esi,%eax), %xmm3
716; X86-SSE-NEXT:    movdqu (%edx,%eax), %xmm0
717; X86-SSE-NEXT:    movdqu 16(%edx,%eax), %xmm1
718; X86-SSE-NEXT:    movdqa %xmm0, %xmm4
719; X86-SSE-NEXT:    pmulhuw %xmm2, %xmm4
720; X86-SSE-NEXT:    pmullw %xmm2, %xmm0
721; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
722; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
723; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
724; X86-SSE-NEXT:    movdqa %xmm1, %xmm4
725; X86-SSE-NEXT:    pmulhuw %xmm3, %xmm4
726; X86-SSE-NEXT:    pmullw %xmm3, %xmm1
727; X86-SSE-NEXT:    movdqa %xmm1, %xmm3
728; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
729; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
730; X86-SSE-NEXT:    movdqu %xmm1, 32(%ecx,%eax,4)
731; X86-SSE-NEXT:    movdqu %xmm3, 48(%ecx,%eax,4)
732; X86-SSE-NEXT:    movdqu %xmm0, (%ecx,%eax,4)
733; X86-SSE-NEXT:    movdqu %xmm2, 16(%ecx,%eax,4)
734; X86-SSE-NEXT:    popl %esi
735; X86-SSE-NEXT:    retl
736;
737; X86-AVX1-LABEL: mul_16xi16:
738; X86-AVX1:       # %bb.0: # %entry
739; X86-AVX1-NEXT:    pushl %esi
740; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
741; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
742; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %esi
743; X86-AVX1-NEXT:    movl c, %ecx
744; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
745; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
746; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
747; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
748; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
749; X86-AVX1-NEXT:    vpmulld %xmm0, %xmm4, %xmm0
750; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
751; X86-AVX1-NEXT:    vpmulld %xmm1, %xmm4, %xmm1
752; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
753; X86-AVX1-NEXT:    vpmulld %xmm2, %xmm4, %xmm2
754; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
755; X86-AVX1-NEXT:    vpmulld %xmm3, %xmm4, %xmm3
756; X86-AVX1-NEXT:    vmovdqu %xmm0, 48(%ecx,%eax,4)
757; X86-AVX1-NEXT:    vmovdqu %xmm1, 32(%ecx,%eax,4)
758; X86-AVX1-NEXT:    vmovdqu %xmm2, 16(%ecx,%eax,4)
759; X86-AVX1-NEXT:    vmovdqu %xmm3, (%ecx,%eax,4)
760; X86-AVX1-NEXT:    popl %esi
761; X86-AVX1-NEXT:    retl
762;
763; X86-AVX2-LABEL: mul_16xi16:
764; X86-AVX2:       # %bb.0: # %entry
765; X86-AVX2-NEXT:    pushl %esi
766; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
767; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
768; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %edx
769; X86-AVX2-NEXT:    movl c, %esi
770; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
771; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
772; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
773; X86-AVX2-NEXT:    vpmulld %ymm0, %ymm2, %ymm0
774; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
775; X86-AVX2-NEXT:    vpmulld %ymm1, %ymm2, %ymm1
776; X86-AVX2-NEXT:    vmovdqu %ymm0, 32(%esi,%ecx,4)
777; X86-AVX2-NEXT:    vmovdqu %ymm1, (%esi,%ecx,4)
778; X86-AVX2-NEXT:    popl %esi
779; X86-AVX2-NEXT:    vzeroupper
780; X86-AVX2-NEXT:    retl
781;
782; X64-SSE-LABEL: mul_16xi16:
783; X64-SSE:       # %bb.0: # %entry
784; X64-SSE-NEXT:    movq c(%rip), %rax
785; X64-SSE-NEXT:    movdqu (%rdi,%rdx), %xmm0
786; X64-SSE-NEXT:    movdqu 16(%rdi,%rdx), %xmm1
787; X64-SSE-NEXT:    movdqu (%rsi,%rdx), %xmm2
788; X64-SSE-NEXT:    movdqu 16(%rsi,%rdx), %xmm3
789; X64-SSE-NEXT:    movdqa %xmm2, %xmm4
790; X64-SSE-NEXT:    pmulhuw %xmm0, %xmm4
791; X64-SSE-NEXT:    pmullw %xmm0, %xmm2
792; X64-SSE-NEXT:    movdqa %xmm2, %xmm0
793; X64-SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
794; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
795; X64-SSE-NEXT:    movdqa %xmm3, %xmm4
796; X64-SSE-NEXT:    pmulhuw %xmm1, %xmm4
797; X64-SSE-NEXT:    pmullw %xmm1, %xmm3
798; X64-SSE-NEXT:    movdqa %xmm3, %xmm1
799; X64-SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
800; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
801; X64-SSE-NEXT:    movdqu %xmm3, 32(%rax,%rdx,4)
802; X64-SSE-NEXT:    movdqu %xmm1, 48(%rax,%rdx,4)
803; X64-SSE-NEXT:    movdqu %xmm2, (%rax,%rdx,4)
804; X64-SSE-NEXT:    movdqu %xmm0, 16(%rax,%rdx,4)
805; X64-SSE-NEXT:    retq
806;
807; X64-AVX1-LABEL: mul_16xi16:
808; X64-AVX1:       # %bb.0: # %entry
809; X64-AVX1-NEXT:    movq c(%rip), %rax
810; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
811; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
812; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
813; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
814; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
815; X64-AVX1-NEXT:    vpmulld %xmm0, %xmm4, %xmm0
816; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
817; X64-AVX1-NEXT:    vpmulld %xmm1, %xmm4, %xmm1
818; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
819; X64-AVX1-NEXT:    vpmulld %xmm2, %xmm4, %xmm2
820; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
821; X64-AVX1-NEXT:    vpmulld %xmm3, %xmm4, %xmm3
822; X64-AVX1-NEXT:    vmovdqu %xmm0, 48(%rax,%rdx,4)
823; X64-AVX1-NEXT:    vmovdqu %xmm1, 32(%rax,%rdx,4)
824; X64-AVX1-NEXT:    vmovdqu %xmm2, 16(%rax,%rdx,4)
825; X64-AVX1-NEXT:    vmovdqu %xmm3, (%rax,%rdx,4)
826; X64-AVX1-NEXT:    retq
827;
828; X64-AVX2-LABEL: mul_16xi16:
829; X64-AVX2:       # %bb.0: # %entry
830; X64-AVX2-NEXT:    movq c(%rip), %rax
831; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
832; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
833; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
834; X64-AVX2-NEXT:    vpmulld %ymm0, %ymm2, %ymm0
835; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
836; X64-AVX2-NEXT:    vpmulld %ymm1, %ymm2, %ymm1
837; X64-AVX2-NEXT:    vmovdqu %ymm0, 32(%rax,%rdx,4)
838; X64-AVX2-NEXT:    vmovdqu %ymm1, (%rax,%rdx,4)
839; X64-AVX2-NEXT:    vzeroupper
840; X64-AVX2-NEXT:    retq
entry:
  %pre = load ptr, ptr @c
  %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index
  %wide.load = load <16 x i16>, ptr %tmp6, align 1
  %tmp8 = zext <16 x i16> %wide.load to <16 x i32>
  %tmp10 = getelementptr inbounds i8, ptr %b, i64 %index
  %wide.load17 = load <16 x i16>, ptr %tmp10, align 1
  %tmp12 = zext <16 x i16> %wide.load17 to <16 x i32>
  %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index
  store <16 x i32> %tmp13, ptr %tmp14, align 4
  ret void
}

; %val1 = load <2 x i8>
; %op1 = sext<2 x i32> %val1
; %val2 = load <2 x i8>
; %op2 = sext<2 x i32> %val2
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_sext(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 %index) nounwind {
862; X86-SSE-LABEL: mul_2xi8_sext:
863; X86-SSE:       # %bb.0: # %entry
864; X86-SSE-NEXT:    pushl %esi
865; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
866; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
867; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
868; X86-SSE-NEXT:    movl c, %ecx
869; X86-SSE-NEXT:    movzwl (%esi,%eax), %esi
870; X86-SSE-NEXT:    movd %esi, %xmm0
871; X86-SSE-NEXT:    movzwl (%edx,%eax), %edx
872; X86-SSE-NEXT:    movd %edx, %xmm1
873; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
874; X86-SSE-NEXT:    psraw $8, %xmm0
875; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
876; X86-SSE-NEXT:    psraw $8, %xmm1
877; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
878; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7]
879; X86-SSE-NEXT:    psrad $16, %xmm0
880; X86-SSE-NEXT:    movq %xmm0, (%ecx,%eax,4)
881; X86-SSE-NEXT:    popl %esi
882; X86-SSE-NEXT:    retl
883;
884; X86-AVX-LABEL: mul_2xi8_sext:
885; X86-AVX:       # %bb.0: # %entry
886; X86-AVX-NEXT:    pushl %esi
887; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
888; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
889; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
890; X86-AVX-NEXT:    movl c, %esi
891; X86-AVX-NEXT:    movzwl (%edx,%ecx), %edx
892; X86-AVX-NEXT:    vmovd %edx, %xmm0
893; X86-AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
894; X86-AVX-NEXT:    movzwl (%eax,%ecx), %eax
895; X86-AVX-NEXT:    vmovd %eax, %xmm1
896; X86-AVX-NEXT:    vpmovsxbd %xmm1, %xmm1
897; X86-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
898; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
899; X86-AVX-NEXT:    popl %esi
900; X86-AVX-NEXT:    retl
901;
902; X64-SSE-LABEL: mul_2xi8_sext:
903; X64-SSE:       # %bb.0: # %entry
904; X64-SSE-NEXT:    movq c(%rip), %rax
905; X64-SSE-NEXT:    movzwl (%rdi,%rdx), %ecx
906; X64-SSE-NEXT:    movd %ecx, %xmm0
907; X64-SSE-NEXT:    movzwl (%rsi,%rdx), %ecx
908; X64-SSE-NEXT:    movd %ecx, %xmm1
909; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
910; X64-SSE-NEXT:    psraw $8, %xmm0
911; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
912; X64-SSE-NEXT:    psraw $8, %xmm1
913; X64-SSE-NEXT:    pmullw %xmm0, %xmm1
914; X64-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7]
915; X64-SSE-NEXT:    psrad $16, %xmm0
916; X64-SSE-NEXT:    movq %xmm0, (%rax,%rdx,4)
917; X64-SSE-NEXT:    retq
918;
919; X64-AVX-LABEL: mul_2xi8_sext:
920; X64-AVX:       # %bb.0: # %entry
921; X64-AVX-NEXT:    movq c(%rip), %rax
922; X64-AVX-NEXT:    movzwl (%rdi,%rdx), %ecx
923; X64-AVX-NEXT:    vmovd %ecx, %xmm0
924; X64-AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
925; X64-AVX-NEXT:    movzwl (%rsi,%rdx), %ecx
926; X64-AVX-NEXT:    vmovd %ecx, %xmm1
927; X64-AVX-NEXT:    vpmovsxbd %xmm1, %xmm1
928; X64-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
929; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rdx,4)
930; X64-AVX-NEXT:    retq
entry:
  %pre = load ptr, ptr @c
  %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index
  %wide.load = load <2 x i8>, ptr %tmp6, align 1
  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
  %tmp10 = getelementptr inbounds i8, ptr %b, i64 %index
  %wide.load17 = load <2 x i8>, ptr %tmp10, align 1
  %tmp12 = sext <2 x i8> %wide.load17 to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index
  store <2 x i32> %tmp13, ptr %tmp14, align 4
  ret void
}

; %val1 = load <2 x i8>
; %op1 = sext<2 x i32> %val1
; %val2 = load <2 x i8>
; %op2 = zext<2 x i32> %val2
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_sext_zext(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 %index) nounwind {
952; X86-SSE-LABEL: mul_2xi8_sext_zext:
953; X86-SSE:       # %bb.0: # %entry
954; X86-SSE-NEXT:    pushl %esi
955; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
956; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
957; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
958; X86-SSE-NEXT:    movl c, %ecx
959; X86-SSE-NEXT:    movzwl (%esi,%eax), %esi
960; X86-SSE-NEXT:    movd %esi, %xmm0
961; X86-SSE-NEXT:    movzwl (%edx,%eax), %edx
962; X86-SSE-NEXT:    movd %edx, %xmm1
963; X86-SSE-NEXT:    pxor %xmm2, %xmm2
964; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
965; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
966; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
967; X86-SSE-NEXT:    psraw $8, %xmm0
968; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
969; X86-SSE-NEXT:    pmaddwd %xmm1, %xmm0
970; X86-SSE-NEXT:    movq %xmm0, (%ecx,%eax,4)
971; X86-SSE-NEXT:    popl %esi
972; X86-SSE-NEXT:    retl
973;
974; X86-AVX-LABEL: mul_2xi8_sext_zext:
975; X86-AVX:       # %bb.0: # %entry
976; X86-AVX-NEXT:    pushl %esi
977; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
978; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
979; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
980; X86-AVX-NEXT:    movl c, %esi
981; X86-AVX-NEXT:    movzwl (%edx,%ecx), %edx
982; X86-AVX-NEXT:    vmovd %edx, %xmm0
983; X86-AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
984; X86-AVX-NEXT:    movzwl (%eax,%ecx), %eax
985; X86-AVX-NEXT:    vmovd %eax, %xmm1
986; X86-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
987; X86-AVX-NEXT:    vpmaddwd %xmm0, %xmm1, %xmm0
988; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
989; X86-AVX-NEXT:    popl %esi
990; X86-AVX-NEXT:    retl
991;
992; X64-SSE-LABEL: mul_2xi8_sext_zext:
993; X64-SSE:       # %bb.0: # %entry
994; X64-SSE-NEXT:    movq c(%rip), %rax
995; X64-SSE-NEXT:    movzwl (%rdi,%rdx), %ecx
996; X64-SSE-NEXT:    movd %ecx, %xmm0
997; X64-SSE-NEXT:    movzwl (%rsi,%rdx), %ecx
998; X64-SSE-NEXT:    movd %ecx, %xmm1
999; X64-SSE-NEXT:    pxor %xmm2, %xmm2
1000; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
1001; X64-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
1002; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1003; X64-SSE-NEXT:    psraw $8, %xmm0
1004; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1005; X64-SSE-NEXT:    pmaddwd %xmm1, %xmm0
1006; X64-SSE-NEXT:    movq %xmm0, (%rax,%rdx,4)
1007; X64-SSE-NEXT:    retq
1008;
1009; X64-AVX-LABEL: mul_2xi8_sext_zext:
1010; X64-AVX:       # %bb.0: # %entry
1011; X64-AVX-NEXT:    movq c(%rip), %rax
1012; X64-AVX-NEXT:    movzwl (%rdi,%rdx), %ecx
1013; X64-AVX-NEXT:    vmovd %ecx, %xmm0
1014; X64-AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
1015; X64-AVX-NEXT:    movzwl (%rsi,%rdx), %ecx
1016; X64-AVX-NEXT:    vmovd %ecx, %xmm1
1017; X64-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
1018; X64-AVX-NEXT:    vpmaddwd %xmm0, %xmm1, %xmm0
1019; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rdx,4)
1020; X64-AVX-NEXT:    retq
entry:
  %pre = load ptr, ptr @c
  %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index
  %wide.load = load <2 x i8>, ptr %tmp6, align 1
  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
  %tmp10 = getelementptr inbounds i8, ptr %b, i64 %index
  %wide.load17 = load <2 x i8>, ptr %tmp10, align 1
  %tmp12 = zext <2 x i8> %wide.load17 to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index
  store <2 x i32> %tmp13, ptr %tmp14, align 4
  ret void
}

; %val1 = load <2 x i16>
; %op1 = sext<2 x i32> %val1
; %val2 = load <2 x i16>
; %op2 = sext<2 x i32> %val2
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi16_sext(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 %index) nounwind {
1042; X86-SSE-LABEL: mul_2xi16_sext:
1043; X86-SSE:       # %bb.0: # %entry
1044; X86-SSE-NEXT:    pushl %esi
1045; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
1046; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1047; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
1048; X86-SSE-NEXT:    movl c, %esi
1049; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1050; X86-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1051; X86-SSE-NEXT:    pxor %xmm2, %xmm2
1052; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1053; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
1054; X86-SSE-NEXT:    pmaddwd %xmm0, %xmm1
1055; X86-SSE-NEXT:    movq %xmm1, (%esi,%ecx,4)
1056; X86-SSE-NEXT:    popl %esi
1057; X86-SSE-NEXT:    retl
1058;
1059; X86-AVX-LABEL: mul_2xi16_sext:
1060; X86-AVX:       # %bb.0: # %entry
1061; X86-AVX-NEXT:    pushl %esi
1062; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
1063; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1064; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
1065; X86-AVX-NEXT:    movl c, %esi
1066; X86-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1067; X86-AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1068; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1069; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
1070; X86-AVX-NEXT:    vpmaddwd %xmm0, %xmm1, %xmm0
1071; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
1072; X86-AVX-NEXT:    popl %esi
1073; X86-AVX-NEXT:    retl
1074;
1075; X64-SSE-LABEL: mul_2xi16_sext:
1076; X64-SSE:       # %bb.0: # %entry
1077; X64-SSE-NEXT:    movq c(%rip), %rax
1078; X64-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1079; X64-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1080; X64-SSE-NEXT:    pxor %xmm2, %xmm2
1081; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1082; X64-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
1083; X64-SSE-NEXT:    pmaddwd %xmm0, %xmm1
1084; X64-SSE-NEXT:    movq %xmm1, (%rax,%rdx,4)
1085; X64-SSE-NEXT:    retq
1086;
1087; X64-AVX-LABEL: mul_2xi16_sext:
1088; X64-AVX:       # %bb.0: # %entry
1089; X64-AVX-NEXT:    movq c(%rip), %rax
1090; X64-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1091; X64-AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1092; X64-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1093; X64-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
1094; X64-AVX-NEXT:    vpmaddwd %xmm0, %xmm1, %xmm0
1095; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rdx,4)
1096; X64-AVX-NEXT:    retq
1097entry:
1098  %pre = load ptr, ptr @c
1099  %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index
1100  %wide.load = load <2 x i16>, ptr %tmp6, align 1
1101  %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
1102  %tmp10 = getelementptr inbounds i8, ptr %b, i64 %index
1103  %wide.load17 = load <2 x i16>, ptr %tmp10, align 1
1104  %tmp12 = sext <2 x i16> %wide.load17 to <2 x i32>
1105  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
1106  %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index
1107  store <2 x i32> %tmp13, ptr %tmp14, align 4
1108  ret void
1109}
1110
1111; %val1 = load <2 x i16>
1112; %op1 = sext<2 x i32> %val1
1113; %val2 = load <2 x i16>
1114; %op2 = zext<2 x i32> %val2
1115; %rst = mul <2 x i32> %op1, %op2
1116;
1117define void @mul_2xi16_sext_zext(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 %index) nounwind {
1118; X86-SSE-LABEL: mul_2xi16_sext_zext:
1119; X86-SSE:       # %bb.0: # %entry
1120; X86-SSE-NEXT:    pushl %esi
1121; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
1122; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
1123; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
1124; X86-SSE-NEXT:    movl c, %ecx
1125; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1126; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
1127; X86-SSE-NEXT:    psrad $16, %xmm0
1128; X86-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1129; X86-SSE-NEXT:    pxor %xmm2, %xmm2
1130; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1131; X86-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
1132; X86-SSE-NEXT:    pmuludq %xmm0, %xmm1
1133; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1134; X86-SSE-NEXT:    pmuludq %xmm2, %xmm0
1135; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1136; X86-SSE-NEXT:    movq %xmm1, (%ecx,%eax,4)
1137; X86-SSE-NEXT:    popl %esi
1138; X86-SSE-NEXT:    retl
1139;
1140; X86-AVX-LABEL: mul_2xi16_sext_zext:
1141; X86-AVX:       # %bb.0: # %entry
1142; X86-AVX-NEXT:    pushl %esi
1143; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
1144; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1145; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
1146; X86-AVX-NEXT:    movl c, %esi
1147; X86-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1148; X86-AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
1149; X86-AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1150; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
1151; X86-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
1152; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
1153; X86-AVX-NEXT:    popl %esi
1154; X86-AVX-NEXT:    retl
1155;
1156; X64-SSE-LABEL: mul_2xi16_sext_zext:
1157; X64-SSE:       # %bb.0: # %entry
1158; X64-SSE-NEXT:    movq c(%rip), %rax
1159; X64-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1160; X64-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
1161; X64-SSE-NEXT:    psrad $16, %xmm0
1162; X64-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1163; X64-SSE-NEXT:    pxor %xmm2, %xmm2
1164; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1165; X64-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
1166; X64-SSE-NEXT:    pmuludq %xmm0, %xmm1
1167; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1168; X64-SSE-NEXT:    pmuludq %xmm2, %xmm0
1169; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1170; X64-SSE-NEXT:    movq %xmm1, (%rax,%rdx,4)
1171; X64-SSE-NEXT:    retq
1172;
1173; X64-AVX-LABEL: mul_2xi16_sext_zext:
1174; X64-AVX:       # %bb.0: # %entry
1175; X64-AVX-NEXT:    movq c(%rip), %rax
1176; X64-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1177; X64-AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
1178; X64-AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1179; X64-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
1180; X64-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
1181; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rdx,4)
1182; X64-AVX-NEXT:    retq
1183entry:
1184  %pre = load ptr, ptr @c
1185  %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index
1186  %wide.load = load <2 x i16>, ptr %tmp6, align 1
1187  %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
1188  %tmp10 = getelementptr inbounds i8, ptr %b, i64 %index
1189  %wide.load17 = load <2 x i16>, ptr %tmp10, align 1
1190  %tmp12 = zext <2 x i16> %wide.load17 to <2 x i32>
1191  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
1192  %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index
1193  store <2 x i32> %tmp13, ptr %tmp14, align 4
1194  ret void
1195}
1196
1197; %val1 = load <16 x i16>
1198; %op1 = sext<16 x i32> %val1
1199; %val2 = load <16 x i16>
1200; %op2 = sext<16 x i32> %val2
1201; %rst = mul <16 x i32> %op1, %op2
1202;
1203define void @mul_16xi16_sext(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 %index) nounwind {
1204; X86-SSE-LABEL: mul_16xi16_sext:
1205; X86-SSE:       # %bb.0: # %entry
1206; X86-SSE-NEXT:    pushl %esi
1207; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
1208; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
1209; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
1210; X86-SSE-NEXT:    movl c, %ecx
1211; X86-SSE-NEXT:    movdqu (%esi,%eax), %xmm2
1212; X86-SSE-NEXT:    movdqu 16(%esi,%eax), %xmm3
1213; X86-SSE-NEXT:    movdqu (%edx,%eax), %xmm0
1214; X86-SSE-NEXT:    movdqu 16(%edx,%eax), %xmm1
1215; X86-SSE-NEXT:    movdqa %xmm0, %xmm4
1216; X86-SSE-NEXT:    pmulhw %xmm2, %xmm4
1217; X86-SSE-NEXT:    pmullw %xmm2, %xmm0
1218; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
1219; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
1220; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
1221; X86-SSE-NEXT:    movdqa %xmm1, %xmm4
1222; X86-SSE-NEXT:    pmulhw %xmm3, %xmm4
1223; X86-SSE-NEXT:    pmullw %xmm3, %xmm1
1224; X86-SSE-NEXT:    movdqa %xmm1, %xmm3
1225; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
1226; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
1227; X86-SSE-NEXT:    movdqu %xmm1, 32(%ecx,%eax,4)
1228; X86-SSE-NEXT:    movdqu %xmm3, 48(%ecx,%eax,4)
1229; X86-SSE-NEXT:    movdqu %xmm0, (%ecx,%eax,4)
1230; X86-SSE-NEXT:    movdqu %xmm2, 16(%ecx,%eax,4)
1231; X86-SSE-NEXT:    popl %esi
1232; X86-SSE-NEXT:    retl
1233;
1234; X86-AVX1-LABEL: mul_16xi16_sext:
1235; X86-AVX1:       # %bb.0: # %entry
1236; X86-AVX1-NEXT:    pushl %esi
1237; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
1238; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
1239; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %esi
1240; X86-AVX1-NEXT:    movl c, %ecx
1241; X86-AVX1-NEXT:    vpmovsxwd 24(%esi,%eax), %xmm0
1242; X86-AVX1-NEXT:    vpmovsxwd 16(%esi,%eax), %xmm1
1243; X86-AVX1-NEXT:    vpmovsxwd 8(%esi,%eax), %xmm2
1244; X86-AVX1-NEXT:    vpmovsxwd (%esi,%eax), %xmm3
1245; X86-AVX1-NEXT:    vpmovsxwd 24(%edx,%eax), %xmm4
1246; X86-AVX1-NEXT:    vpmulld %xmm0, %xmm4, %xmm0
1247; X86-AVX1-NEXT:    vpmovsxwd 16(%edx,%eax), %xmm4
1248; X86-AVX1-NEXT:    vpmulld %xmm1, %xmm4, %xmm1
1249; X86-AVX1-NEXT:    vpmovsxwd 8(%edx,%eax), %xmm4
1250; X86-AVX1-NEXT:    vpmulld %xmm2, %xmm4, %xmm2
1251; X86-AVX1-NEXT:    vpmovsxwd (%edx,%eax), %xmm4
1252; X86-AVX1-NEXT:    vpmulld %xmm3, %xmm4, %xmm3
1253; X86-AVX1-NEXT:    vmovdqu %xmm0, 48(%ecx,%eax,4)
1254; X86-AVX1-NEXT:    vmovdqu %xmm1, 32(%ecx,%eax,4)
1255; X86-AVX1-NEXT:    vmovdqu %xmm2, 16(%ecx,%eax,4)
1256; X86-AVX1-NEXT:    vmovdqu %xmm3, (%ecx,%eax,4)
1257; X86-AVX1-NEXT:    popl %esi
1258; X86-AVX1-NEXT:    retl
1259;
1260; X86-AVX2-LABEL: mul_16xi16_sext:
1261; X86-AVX2:       # %bb.0: # %entry
1262; X86-AVX2-NEXT:    pushl %esi
1263; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
1264; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1265; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %edx
1266; X86-AVX2-NEXT:    movl c, %esi
1267; X86-AVX2-NEXT:    vpmovsxwd 16(%edx,%ecx), %ymm0
1268; X86-AVX2-NEXT:    vpmovsxwd (%edx,%ecx), %ymm1
1269; X86-AVX2-NEXT:    vpmovsxwd 16(%eax,%ecx), %ymm2
1270; X86-AVX2-NEXT:    vpmulld %ymm0, %ymm2, %ymm0
1271; X86-AVX2-NEXT:    vpmovsxwd (%eax,%ecx), %ymm2
1272; X86-AVX2-NEXT:    vpmulld %ymm1, %ymm2, %ymm1
1273; X86-AVX2-NEXT:    vmovdqu %ymm0, 32(%esi,%ecx,4)
1274; X86-AVX2-NEXT:    vmovdqu %ymm1, (%esi,%ecx,4)
1275; X86-AVX2-NEXT:    popl %esi
1276; X86-AVX2-NEXT:    vzeroupper
1277; X86-AVX2-NEXT:    retl
1278;
1279; X64-SSE-LABEL: mul_16xi16_sext:
1280; X64-SSE:       # %bb.0: # %entry
1281; X64-SSE-NEXT:    movq c(%rip), %rax
1282; X64-SSE-NEXT:    movdqu (%rdi,%rdx), %xmm0
1283; X64-SSE-NEXT:    movdqu 16(%rdi,%rdx), %xmm1
1284; X64-SSE-NEXT:    movdqu (%rsi,%rdx), %xmm2
1285; X64-SSE-NEXT:    movdqu 16(%rsi,%rdx), %xmm3
1286; X64-SSE-NEXT:    movdqa %xmm2, %xmm4
1287; X64-SSE-NEXT:    pmulhw %xmm0, %xmm4
1288; X64-SSE-NEXT:    pmullw %xmm0, %xmm2
1289; X64-SSE-NEXT:    movdqa %xmm2, %xmm0
1290; X64-SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
1291; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
1292; X64-SSE-NEXT:    movdqa %xmm3, %xmm4
1293; X64-SSE-NEXT:    pmulhw %xmm1, %xmm4
1294; X64-SSE-NEXT:    pmullw %xmm1, %xmm3
1295; X64-SSE-NEXT:    movdqa %xmm3, %xmm1
1296; X64-SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
1297; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
1298; X64-SSE-NEXT:    movdqu %xmm3, 32(%rax,%rdx,4)
1299; X64-SSE-NEXT:    movdqu %xmm1, 48(%rax,%rdx,4)
1300; X64-SSE-NEXT:    movdqu %xmm2, (%rax,%rdx,4)
1301; X64-SSE-NEXT:    movdqu %xmm0, 16(%rax,%rdx,4)
1302; X64-SSE-NEXT:    retq
1303;
1304; X64-AVX1-LABEL: mul_16xi16_sext:
1305; X64-AVX1:       # %bb.0: # %entry
1306; X64-AVX1-NEXT:    movq c(%rip), %rax
1307; X64-AVX1-NEXT:    vpmovsxwd 24(%rdi,%rdx), %xmm0
1308; X64-AVX1-NEXT:    vpmovsxwd 16(%rdi,%rdx), %xmm1
1309; X64-AVX1-NEXT:    vpmovsxwd 8(%rdi,%rdx), %xmm2
1310; X64-AVX1-NEXT:    vpmovsxwd (%rdi,%rdx), %xmm3
1311; X64-AVX1-NEXT:    vpmovsxwd 24(%rsi,%rdx), %xmm4
1312; X64-AVX1-NEXT:    vpmulld %xmm0, %xmm4, %xmm0
1313; X64-AVX1-NEXT:    vpmovsxwd 16(%rsi,%rdx), %xmm4
1314; X64-AVX1-NEXT:    vpmulld %xmm1, %xmm4, %xmm1
1315; X64-AVX1-NEXT:    vpmovsxwd 8(%rsi,%rdx), %xmm4
1316; X64-AVX1-NEXT:    vpmulld %xmm2, %xmm4, %xmm2
1317; X64-AVX1-NEXT:    vpmovsxwd (%rsi,%rdx), %xmm4
1318; X64-AVX1-NEXT:    vpmulld %xmm3, %xmm4, %xmm3
1319; X64-AVX1-NEXT:    vmovdqu %xmm0, 48(%rax,%rdx,4)
1320; X64-AVX1-NEXT:    vmovdqu %xmm1, 32(%rax,%rdx,4)
1321; X64-AVX1-NEXT:    vmovdqu %xmm2, 16(%rax,%rdx,4)
1322; X64-AVX1-NEXT:    vmovdqu %xmm3, (%rax,%rdx,4)
1323; X64-AVX1-NEXT:    retq
1324;
1325; X64-AVX2-LABEL: mul_16xi16_sext:
1326; X64-AVX2:       # %bb.0: # %entry
1327; X64-AVX2-NEXT:    movq c(%rip), %rax
1328; X64-AVX2-NEXT:    vpmovsxwd 16(%rdi,%rdx), %ymm0
1329; X64-AVX2-NEXT:    vpmovsxwd (%rdi,%rdx), %ymm1
1330; X64-AVX2-NEXT:    vpmovsxwd 16(%rsi,%rdx), %ymm2
1331; X64-AVX2-NEXT:    vpmulld %ymm0, %ymm2, %ymm0
1332; X64-AVX2-NEXT:    vpmovsxwd (%rsi,%rdx), %ymm2
1333; X64-AVX2-NEXT:    vpmulld %ymm1, %ymm2, %ymm1
1334; X64-AVX2-NEXT:    vmovdqu %ymm0, 32(%rax,%rdx,4)
1335; X64-AVX2-NEXT:    vmovdqu %ymm1, (%rax,%rdx,4)
1336; X64-AVX2-NEXT:    vzeroupper
1337; X64-AVX2-NEXT:    retq
1338entry:
1339  %pre = load ptr, ptr @c
1340  %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index
1341  %wide.load = load <16 x i16>, ptr %tmp6, align 1
1342  %tmp8 = sext <16 x i16> %wide.load to <16 x i32>
1343  %tmp10 = getelementptr inbounds i8, ptr %b, i64 %index
1344  %wide.load17 = load <16 x i16>, ptr %tmp10, align 1
1345  %tmp12 = sext <16 x i16> %wide.load17 to <16 x i32>
1346  %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
1347  %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index
1348  store <16 x i32> %tmp13, ptr %tmp14, align 4
1349  ret void
1350}
1351
1352; %val = load <2 x i8>
1353; %op1 = zext<2 x i32> %val
1354; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are in the range [0, 255]
1355; %rst = mul <2 x i32> %op1, %op2
1356;
1357define void @mul_2xi8_varconst1(ptr nocapture readonly %a, i64 %index) {
1358; X86-SSE-LABEL: mul_2xi8_varconst1:
1359; X86-SSE:       # %bb.0: # %entry
1360; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
1361; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1362; X86-SSE-NEXT:    movl c, %edx
1363; X86-SSE-NEXT:    movzwl (%ecx,%eax), %ecx
1364; X86-SSE-NEXT:    movd %ecx, %xmm0
1365; X86-SSE-NEXT:    pxor %xmm1, %xmm1
1366; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1367; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1368; X86-SSE-NEXT:    pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,0,255,0,u,u,u,u]
1369; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
1370; X86-SSE-NEXT:    retl
1371;
1372; X86-AVX-LABEL: mul_2xi8_varconst1:
1373; X86-AVX:       # %bb.0: # %entry
1374; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
1375; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1376; X86-AVX-NEXT:    movl c, %edx
1377; X86-AVX-NEXT:    movzwl (%ecx,%eax), %ecx
1378; X86-AVX-NEXT:    vmovd %ecx, %xmm0
1379; X86-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1380; X86-AVX-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [0,0,255,0,u,u,u,u]
1381; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
1382; X86-AVX-NEXT:    retl
1383;
1384; X64-SSE-LABEL: mul_2xi8_varconst1:
1385; X64-SSE:       # %bb.0: # %entry
1386; X64-SSE-NEXT:    movq c(%rip), %rax
1387; X64-SSE-NEXT:    movzwl (%rdi,%rsi), %ecx
1388; X64-SSE-NEXT:    movd %ecx, %xmm0
1389; X64-SSE-NEXT:    pxor %xmm1, %xmm1
1390; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1391; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1392; X64-SSE-NEXT:    pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,255,0,u,u,u,u]
1393; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
1394; X64-SSE-NEXT:    retq
1395;
1396; X64-AVX-LABEL: mul_2xi8_varconst1:
1397; X64-AVX:       # %bb.0: # %entry
1398; X64-AVX-NEXT:    movq c(%rip), %rax
1399; X64-AVX-NEXT:    movzwl (%rdi,%rsi), %ecx
1400; X64-AVX-NEXT:    vmovd %ecx, %xmm0
1401; X64-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1402; X64-AVX-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,0,255,0,u,u,u,u]
1403; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
1404; X64-AVX-NEXT:    retq
1405entry:
1406  %pre = load ptr, ptr @c
1407  %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index
1408  %wide.load = load <2 x i8>, ptr %tmp6, align 1
1409  %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
1410  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 255>
1411  %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index
1412  store <2 x i32> %tmp13, ptr %tmp14, align 4
1413  ret void
1414}
1415
1416; %val = load <2 x i8>
1417; %op1 = sext<2 x i32> %val
1418; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are in the range [-128, 127]
1419; %rst = mul <2 x i32> %op1, %op2
1420;
1421define void @mul_2xi8_varconst2(ptr nocapture readonly %a, i64 %index) {
1422; X86-SSE-LABEL: mul_2xi8_varconst2:
1423; X86-SSE:       # %bb.0: # %entry
1424; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
1425; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1426; X86-SSE-NEXT:    movl c, %edx
1427; X86-SSE-NEXT:    movzwl (%ecx,%eax), %ecx
1428; X86-SSE-NEXT:    movd %ecx, %xmm0
1429; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1430; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1431; X86-SSE-NEXT:    psrad $24, %xmm0
1432; X86-SSE-NEXT:    pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [65408,0,127,0,u,u,u,u]
1433; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
1434; X86-SSE-NEXT:    retl
1435;
1436; X86-AVX-LABEL: mul_2xi8_varconst2:
1437; X86-AVX:       # %bb.0: # %entry
1438; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
1439; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1440; X86-AVX-NEXT:    movl c, %edx
1441; X86-AVX-NEXT:    movzwl (%ecx,%eax), %ecx
1442; X86-AVX-NEXT:    vmovd %ecx, %xmm0
1443; X86-AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
1444; X86-AVX-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [65408,0,127,0,u,u,u,u]
1445; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
1446; X86-AVX-NEXT:    retl
1447;
1448; X64-SSE-LABEL: mul_2xi8_varconst2:
1449; X64-SSE:       # %bb.0: # %entry
1450; X64-SSE-NEXT:    movq c(%rip), %rax
1451; X64-SSE-NEXT:    movzwl (%rdi,%rsi), %ecx
1452; X64-SSE-NEXT:    movd %ecx, %xmm0
1453; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1454; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1455; X64-SSE-NEXT:    psrad $24, %xmm0
1456; X64-SSE-NEXT:    pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [65408,0,127,0,u,u,u,u]
1457; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
1458; X64-SSE-NEXT:    retq
1459;
1460; X64-AVX-LABEL: mul_2xi8_varconst2:
1461; X64-AVX:       # %bb.0: # %entry
1462; X64-AVX-NEXT:    movq c(%rip), %rax
1463; X64-AVX-NEXT:    movzwl (%rdi,%rsi), %ecx
1464; X64-AVX-NEXT:    vmovd %ecx, %xmm0
1465; X64-AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
1466; X64-AVX-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [65408,0,127,0,u,u,u,u]
1467; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
1468; X64-AVX-NEXT:    retq
1469entry:
1470  %pre = load ptr, ptr @c
1471  %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index
1472  %wide.load = load <2 x i8>, ptr %tmp6, align 1
1473  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
1474  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -128, i32 127>
1475  %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index
1476  store <2 x i32> %tmp13, ptr %tmp14, align 4
1477  ret void
1478}
1479
1480; %val = load <2 x i8>
1481; %op1 = zext<2 x i32> %val
1482; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are in the range [0, 256]
1483; %rst = mul <2 x i32> %op1, %op2
1484;
1485define void @mul_2xi8_varconst3(ptr nocapture readonly %a, i64 %index) {
1486; X86-SSE-LABEL: mul_2xi8_varconst3:
1487; X86-SSE:       # %bb.0: # %entry
1488; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
1489; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1490; X86-SSE-NEXT:    movl c, %edx
1491; X86-SSE-NEXT:    movzwl (%ecx,%eax), %ecx
1492; X86-SSE-NEXT:    movd %ecx, %xmm0
1493; X86-SSE-NEXT:    pxor %xmm1, %xmm1
1494; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1495; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1496; X86-SSE-NEXT:    pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,0,256,0,u,u,u,u]
1497; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
1498; X86-SSE-NEXT:    retl
1499;
1500; X86-AVX-LABEL: mul_2xi8_varconst3:
1501; X86-AVX:       # %bb.0: # %entry
1502; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
1503; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1504; X86-AVX-NEXT:    movl c, %edx
1505; X86-AVX-NEXT:    movzwl (%ecx,%eax), %ecx
1506; X86-AVX-NEXT:    vmovd %ecx, %xmm0
1507; X86-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1508; X86-AVX-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [0,0,256,0,u,u,u,u]
1509; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
1510; X86-AVX-NEXT:    retl
1511;
1512; X64-SSE-LABEL: mul_2xi8_varconst3:
1513; X64-SSE:       # %bb.0: # %entry
1514; X64-SSE-NEXT:    movq c(%rip), %rax
1515; X64-SSE-NEXT:    movzwl (%rdi,%rsi), %ecx
1516; X64-SSE-NEXT:    movd %ecx, %xmm0
1517; X64-SSE-NEXT:    pxor %xmm1, %xmm1
1518; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1519; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1520; X64-SSE-NEXT:    pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,256,0,u,u,u,u]
1521; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
1522; X64-SSE-NEXT:    retq
1523;
1524; X64-AVX-LABEL: mul_2xi8_varconst3:
1525; X64-AVX:       # %bb.0: # %entry
1526; X64-AVX-NEXT:    movq c(%rip), %rax
1527; X64-AVX-NEXT:    movzwl (%rdi,%rsi), %ecx
1528; X64-AVX-NEXT:    vmovd %ecx, %xmm0
1529; X64-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1530; X64-AVX-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,0,256,0,u,u,u,u]
1531; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
1532; X64-AVX-NEXT:    retq
1533entry:
1534  %pre = load ptr, ptr @c
1535  %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index
1536  %wide.load = load <2 x i8>, ptr %tmp6, align 1
1537  %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
1538  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 256>
1539  %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index
1540  store <2 x i32> %tmp13, ptr %tmp14, align 4
1541  ret void
1542}
1543
1544; %val = load <2 x i8>
1545; %op1 = zext<2 x i32> %val
1546; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are in the range [-1, 255]
1547; %rst = mul <2 x i32> %op1, %op2
1548;
1549define void @mul_2xi8_varconst4(ptr nocapture readonly %a, i64 %index) {
1550; X86-SSE-LABEL: mul_2xi8_varconst4:
1551; X86-SSE:       # %bb.0: # %entry
1552; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
1553; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1554; X86-SSE-NEXT:    movl c, %edx
1555; X86-SSE-NEXT:    movzwl (%ecx,%eax), %ecx
1556; X86-SSE-NEXT:    movd %ecx, %xmm0
1557; X86-SSE-NEXT:    pxor %xmm1, %xmm1
1558; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1559; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1560; X86-SSE-NEXT:    pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [65535,0,255,0,u,u,u,u]
1561; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
1562; X86-SSE-NEXT:    retl
1563;
1564; X86-AVX-LABEL: mul_2xi8_varconst4:
1565; X86-AVX:       # %bb.0: # %entry
1566; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
1567; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1568; X86-AVX-NEXT:    movl c, %edx
1569; X86-AVX-NEXT:    movzwl (%ecx,%eax), %ecx
1570; X86-AVX-NEXT:    vmovd %ecx, %xmm0
1571; X86-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1572; X86-AVX-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [65535,0,255,0,u,u,u,u]
1573; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
1574; X86-AVX-NEXT:    retl
1575;
1576; X64-SSE-LABEL: mul_2xi8_varconst4:
1577; X64-SSE:       # %bb.0: # %entry
1578; X64-SSE-NEXT:    movq c(%rip), %rax
1579; X64-SSE-NEXT:    movzwl (%rdi,%rsi), %ecx
1580; X64-SSE-NEXT:    movd %ecx, %xmm0
1581; X64-SSE-NEXT:    pxor %xmm1, %xmm1
1582; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1583; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1584; X64-SSE-NEXT:    pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [65535,0,255,0,u,u,u,u]
1585; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
1586; X64-SSE-NEXT:    retq
1587;
1588; X64-AVX-LABEL: mul_2xi8_varconst4:
1589; X64-AVX:       # %bb.0: # %entry
1590; X64-AVX-NEXT:    movq c(%rip), %rax
1591; X64-AVX-NEXT:    movzwl (%rdi,%rsi), %ecx
1592; X64-AVX-NEXT:    vmovd %ecx, %xmm0
1593; X64-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1594; X64-AVX-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [65535,0,255,0,u,u,u,u]
1595; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
1596; X64-AVX-NEXT:    retq
1597entry:
1598  %pre = load ptr, ptr @c
1599  %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index
1600  %wide.load = load <2 x i8>, ptr %tmp6, align 1
1601  %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
1602  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -1, i32 255>
1603  %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index
1604  store <2 x i32> %tmp13, ptr %tmp14, align 4
1605  ret void
1606}
1607
1608; %val = load <2 x i8>
1609; %op1 = sext<2 x i32> %val
1610; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are in the range [-129, 127]
1611; %rst = mul <2 x i32> %op1, %op2
1612;
1613define void @mul_2xi8_varconst5(ptr nocapture readonly %a, i64 %index) {
1614; X86-SSE-LABEL: mul_2xi8_varconst5:
1615; X86-SSE:       # %bb.0: # %entry
1616; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
1617; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1618; X86-SSE-NEXT:    movl c, %edx
1619; X86-SSE-NEXT:    movzwl (%ecx,%eax), %ecx
1620; X86-SSE-NEXT:    movd %ecx, %xmm0
1621; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1622; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1623; X86-SSE-NEXT:    psrad $24, %xmm0
1624; X86-SSE-NEXT:    pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [65407,0,127,0,u,u,u,u]
1625; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
1626; X86-SSE-NEXT:    retl
1627;
1628; X86-AVX-LABEL: mul_2xi8_varconst5:
1629; X86-AVX:       # %bb.0: # %entry
1630; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
1631; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1632; X86-AVX-NEXT:    movl c, %edx
1633; X86-AVX-NEXT:    movzwl (%ecx,%eax), %ecx
1634; X86-AVX-NEXT:    vmovd %ecx, %xmm0
1635; X86-AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
1636; X86-AVX-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [65407,0,127,0,u,u,u,u]
1637; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
1638; X86-AVX-NEXT:    retl
1639;
1640; X64-SSE-LABEL: mul_2xi8_varconst5:
1641; X64-SSE:       # %bb.0: # %entry
1642; X64-SSE-NEXT:    movq c(%rip), %rax
1643; X64-SSE-NEXT:    movzwl (%rdi,%rsi), %ecx
1644; X64-SSE-NEXT:    movd %ecx, %xmm0
1645; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1646; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1647; X64-SSE-NEXT:    psrad $24, %xmm0
1648; X64-SSE-NEXT:    pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [65407,0,127,0,u,u,u,u]
1649; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
1650; X64-SSE-NEXT:    retq
1651;
1652; X64-AVX-LABEL: mul_2xi8_varconst5:
1653; X64-AVX:       # %bb.0: # %entry
1654; X64-AVX-NEXT:    movq c(%rip), %rax
1655; X64-AVX-NEXT:    movzwl (%rdi,%rsi), %ecx
1656; X64-AVX-NEXT:    vmovd %ecx, %xmm0
1657; X64-AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
1658; X64-AVX-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [65407,0,127,0,u,u,u,u]
1659; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
1660; X64-AVX-NEXT:    retq
1661entry:
1662  %pre = load ptr, ptr @c
1663  %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index
1664  %wide.load = load <2 x i8>, ptr %tmp6, align 1
1665  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
1666  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -129, i32 127>
1667  %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index
1668  store <2 x i32> %tmp13, ptr %tmp14, align 4
1669  ret void
1670}
1671
1672; %val = load <2 x i8>
1673; %op1 = sext<2 x i32> %val
1674; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are in the range [-128, 128]
1675; %rst = mul <2 x i32> %op1, %op2
1676;
1677define void @mul_2xi8_varconst6(ptr nocapture readonly %a, i64 %index) {
1678; X86-SSE-LABEL: mul_2xi8_varconst6:
1679; X86-SSE:       # %bb.0: # %entry
1680; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
1681; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1682; X86-SSE-NEXT:    movl c, %edx
1683; X86-SSE-NEXT:    movzwl (%ecx,%eax), %ecx
1684; X86-SSE-NEXT:    movd %ecx, %xmm0
1685; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1686; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1687; X86-SSE-NEXT:    psrad $24, %xmm0
1688; X86-SSE-NEXT:    pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [65408,0,128,0,u,u,u,u]
1689; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
1690; X86-SSE-NEXT:    retl
1691;
1692; X86-AVX-LABEL: mul_2xi8_varconst6:
1693; X86-AVX:       # %bb.0: # %entry
1694; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
1695; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1696; X86-AVX-NEXT:    movl c, %edx
1697; X86-AVX-NEXT:    movzwl (%ecx,%eax), %ecx
1698; X86-AVX-NEXT:    vmovd %ecx, %xmm0
1699; X86-AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
1700; X86-AVX-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [65408,0,128,0,u,u,u,u]
1701; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
1702; X86-AVX-NEXT:    retl
1703;
1704; X64-SSE-LABEL: mul_2xi8_varconst6:
1705; X64-SSE:       # %bb.0: # %entry
1706; X64-SSE-NEXT:    movq c(%rip), %rax
1707; X64-SSE-NEXT:    movzwl (%rdi,%rsi), %ecx
1708; X64-SSE-NEXT:    movd %ecx, %xmm0
1709; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1710; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1711; X64-SSE-NEXT:    psrad $24, %xmm0
1712; X64-SSE-NEXT:    pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [65408,0,128,0,u,u,u,u]
1713; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
1714; X64-SSE-NEXT:    retq
1715;
1716; X64-AVX-LABEL: mul_2xi8_varconst6:
1717; X64-AVX:       # %bb.0: # %entry
1718; X64-AVX-NEXT:    movq c(%rip), %rax
1719; X64-AVX-NEXT:    movzwl (%rdi,%rsi), %ecx
1720; X64-AVX-NEXT:    vmovd %ecx, %xmm0
1721; X64-AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
1722; X64-AVX-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [65408,0,128,0,u,u,u,u]
1723; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
1724; X64-AVX-NEXT:    retq
1725entry:
1726  %pre = load ptr, ptr @c
1727  %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index
1728  %wide.load = load <2 x i8>, ptr %tmp6, align 1
1729  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
1730  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -128, i32 128>
1731  %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index
1732  store <2 x i32> %tmp13, ptr %tmp14, align 4
1733  ret void
1734}
1735
1736; %val = load <2 x i16>
1737; %op1 = zext<2 x i32> %val
1738; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are in the range [0, 65535]
1739; %rst = mul <2 x i32> %op1, %op2
1740;
1741define void @mul_2xi16_varconst1(ptr nocapture readonly %a, i64 %index) {
1742; X86-SSE-LABEL: mul_2xi16_varconst1:
1743; X86-SSE:       # %bb.0: # %entry
1744; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
1745; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1746; X86-SSE-NEXT:    movl c, %edx
1747; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1748; X86-SSE-NEXT:    movd {{.*#+}} xmm1 = [0,65535,0,0,0,0,0,0]
1749; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
1750; X86-SSE-NEXT:    pmulhuw %xmm1, %xmm2
1751; X86-SSE-NEXT:    pmullw %xmm1, %xmm0
1752; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1753; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
1754; X86-SSE-NEXT:    retl
1755;
1756; X86-AVX-LABEL: mul_2xi16_varconst1:
1757; X86-AVX:       # %bb.0: # %entry
1758; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
1759; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1760; X86-AVX-NEXT:    movl c, %edx
1761; X86-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1762; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1763; X86-AVX-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
1764; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
1765; X86-AVX-NEXT:    retl
1766;
1767; X64-SSE-LABEL: mul_2xi16_varconst1:
1768; X64-SSE:       # %bb.0: # %entry
1769; X64-SSE-NEXT:    movq c(%rip), %rax
1770; X64-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1771; X64-SSE-NEXT:    movd {{.*#+}} xmm1 = [0,65535,0,0,0,0,0,0]
1772; X64-SSE-NEXT:    movdqa %xmm0, %xmm2
1773; X64-SSE-NEXT:    pmulhuw %xmm1, %xmm2
1774; X64-SSE-NEXT:    pmullw %xmm1, %xmm0
1775; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1776; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
1777; X64-SSE-NEXT:    retq
1778;
1779; X64-AVX-LABEL: mul_2xi16_varconst1:
1780; X64-AVX:       # %bb.0: # %entry
1781; X64-AVX-NEXT:    movq c(%rip), %rax
1782; X64-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1783; X64-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1784; X64-AVX-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1785; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
1786; X64-AVX-NEXT:    retq
1787entry:
1788  %pre = load ptr, ptr @c
1789  %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index
1790  %wide.load = load <2 x i16>, ptr %tmp6, align 1
1791  %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
1792  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 65535>
1793  %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index
1794  store <2 x i32> %tmp13, ptr %tmp14, align 4
1795  ret void
1796}
1797
1798; %val = load <2 x i16>
1799; %op1 = sext<2 x i32> %val
1800; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are in the range [-32768, 32767]
1801; %rst = mul <2 x i32> %op1, %op2
1802;
1803define void @mul_2xi16_varconst2(ptr nocapture readonly %a, i64 %index) {
1804; X86-SSE-LABEL: mul_2xi16_varconst2:
1805; X86-SSE:       # %bb.0: # %entry
1806; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
1807; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1808; X86-SSE-NEXT:    movl c, %edx
1809; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1810; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
1811; X86-SSE-NEXT:    pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [32768,0,32767,0,u,u,u,u]
1812; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
1813; X86-SSE-NEXT:    retl
1814;
1815; X86-AVX-LABEL: mul_2xi16_varconst2:
1816; X86-AVX:       # %bb.0: # %entry
1817; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
1818; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1819; X86-AVX-NEXT:    movl c, %edx
1820; X86-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1821; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1822; X86-AVX-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [32768,0,32767,0,u,u,u,u]
1823; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
1824; X86-AVX-NEXT:    retl
1825;
1826; X64-SSE-LABEL: mul_2xi16_varconst2:
1827; X64-SSE:       # %bb.0: # %entry
1828; X64-SSE-NEXT:    movq c(%rip), %rax
1829; X64-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1830; X64-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
1831; X64-SSE-NEXT:    pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [32768,0,32767,0,u,u,u,u]
1832; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
1833; X64-SSE-NEXT:    retq
1834;
1835; X64-AVX-LABEL: mul_2xi16_varconst2:
1836; X64-AVX:       # %bb.0: # %entry
1837; X64-AVX-NEXT:    movq c(%rip), %rax
1838; X64-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1839; X64-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1840; X64-AVX-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32768,0,32767,0,u,u,u,u]
1841; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
1842; X64-AVX-NEXT:    retq
1843entry:
1844  %pre = load ptr, ptr @c
1845  %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index
1846  %wide.load = load <2 x i16>, ptr %tmp6, align 1
1847  %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
1848  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -32768, i32 32767>
1849  %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index
1850  store <2 x i32> %tmp13, ptr %tmp14, align 4
1851  ret void
1852}
1853
1854; %val = load <2 x i16>
1855; %op1 = zext<2 x i32> %val
1856; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are in the range [0, 65536]
1857; %rst = mul <2 x i32> %op1, %op2
1858;
1859define void @mul_2xi16_varconst3(ptr nocapture readonly %a, i64 %index) {
1860; X86-SSE-LABEL: mul_2xi16_varconst3:
1861; X86-SSE:       # %bb.0: # %entry
1862; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
1863; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1864; X86-SSE-NEXT:    movl c, %edx
1865; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1866; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
1867; X86-SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
1868; X86-SSE-NEXT:    psllq $32, %xmm0
1869; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
1870; X86-SSE-NEXT:    retl
1871;
1872; X86-AVX-LABEL: mul_2xi16_varconst3:
1873; X86-AVX:       # %bb.0: # %entry
1874; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
1875; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1876; X86-AVX-NEXT:    movl c, %edx
1877; X86-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1878; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1879; X86-AVX-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
1880; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
1881; X86-AVX-NEXT:    retl
1882;
1883; X64-SSE-LABEL: mul_2xi16_varconst3:
1884; X64-SSE:       # %bb.0: # %entry
1885; X64-SSE-NEXT:    movq c(%rip), %rax
1886; X64-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1887; X64-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
1888; X64-SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1889; X64-SSE-NEXT:    psllq $32, %xmm0
1890; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
1891; X64-SSE-NEXT:    retq
1892;
1893; X64-AVX-LABEL: mul_2xi16_varconst3:
1894; X64-AVX:       # %bb.0: # %entry
1895; X64-AVX-NEXT:    movq c(%rip), %rax
1896; X64-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1897; X64-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1898; X64-AVX-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1899; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
1900; X64-AVX-NEXT:    retq
1901entry:
1902  %pre = load ptr, ptr @c
1903  %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index
1904  %wide.load = load <2 x i16>, ptr %tmp6, align 1
1905  %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
1906  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 65536>
1907  %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index
1908  store <2 x i32> %tmp13, ptr %tmp14, align 4
1909  ret void
1910}
1911
1912; %val = load <2 x i16>
1913; %op1 = sext<2 x i32> %val
1914; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are in the range [0, 32768]
1915; %rst = mul <2 x i32> %op1, %op2
1916;
1917define void @mul_2xi16_varconst4(ptr nocapture readonly %a, i64 %index) {
1918; X86-SSE-LABEL: mul_2xi16_varconst4:
1919; X86-SSE:       # %bb.0: # %entry
1920; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
1921; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1922; X86-SSE-NEXT:    movl c, %edx
1923; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1924; X86-SSE-NEXT:    psrad $16, %xmm0
1925; X86-SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
1926; X86-SSE-NEXT:    psllq $32, %xmm0
1927; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
1928; X86-SSE-NEXT:    retl
1929;
1930; X86-AVX-LABEL: mul_2xi16_varconst4:
1931; X86-AVX:       # %bb.0: # %entry
1932; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
1933; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1934; X86-AVX-NEXT:    movl c, %edx
1935; X86-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1936; X86-AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
1937; X86-AVX-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
1938; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
1939; X86-AVX-NEXT:    retl
1940;
1941; X64-SSE-LABEL: mul_2xi16_varconst4:
1942; X64-SSE:       # %bb.0: # %entry
1943; X64-SSE-NEXT:    movq c(%rip), %rax
1944; X64-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1945; X64-SSE-NEXT:    psrad $16, %xmm0
1946; X64-SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1947; X64-SSE-NEXT:    psllq $32, %xmm0
1948; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
1949; X64-SSE-NEXT:    retq
1950;
1951; X64-AVX-LABEL: mul_2xi16_varconst4:
1952; X64-AVX:       # %bb.0: # %entry
1953; X64-AVX-NEXT:    movq c(%rip), %rax
1954; X64-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1955; X64-AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
1956; X64-AVX-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1957; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
1958; X64-AVX-NEXT:    retq
1959entry:
1960  %pre = load ptr, ptr @c
1961  %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index
1962  %wide.load = load <2 x i16>, ptr %tmp6, align 1
1963  %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
1964  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 32768>
1965  %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index
1966  store <2 x i32> %tmp13, ptr %tmp14, align 4
1967  ret void
1968}
1969
1970;
1971; Illegal Types
1972;
1973
1974define void @PR34947(ptr %p0, ptr %p1) nounwind {
1975; X86-SSE-LABEL: PR34947:
1976; X86-SSE:       # %bb.0:
1977; X86-SSE-NEXT:    pushl %ebp
1978; X86-SSE-NEXT:    pushl %ebx
1979; X86-SSE-NEXT:    pushl %edi
1980; X86-SSE-NEXT:    pushl %esi
1981; X86-SSE-NEXT:    pushl %eax
1982; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1983; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
1984; X86-SSE-NEXT:    movzwl 16(%eax), %edx
1985; X86-SSE-NEXT:    movl %edx, (%esp) # 4-byte Spill
1986; X86-SSE-NEXT:    movdqa (%eax), %xmm2
1987; X86-SSE-NEXT:    pxor %xmm1, %xmm1
1988; X86-SSE-NEXT:    movdqa %xmm2, %xmm0
1989; X86-SSE-NEXT:    pextrw $7, %xmm2, %eax
1990; X86-SSE-NEXT:    pextrw $4, %xmm2, %esi
1991; X86-SSE-NEXT:    pextrw $1, %xmm2, %edi
1992; X86-SSE-NEXT:    pextrw $0, %xmm2, %ebx
1993; X86-SSE-NEXT:    pextrw $3, %xmm2, %ebp
1994; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
1995; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1996; X86-SSE-NEXT:    xorl %edx, %edx
1997; X86-SSE-NEXT:    divl 28(%ecx)
1998; X86-SSE-NEXT:    movd %edx, %xmm1
1999; X86-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
2000; X86-SSE-NEXT:    movd %xmm3, %eax
2001; X86-SSE-NEXT:    xorl %edx, %edx
2002; X86-SSE-NEXT:    divl 24(%ecx)
2003; X86-SSE-NEXT:    movd %edx, %xmm3
2004; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
2005; X86-SSE-NEXT:    movl %esi, %eax
2006; X86-SSE-NEXT:    xorl %edx, %edx
2007; X86-SSE-NEXT:    divl 16(%ecx)
2008; X86-SSE-NEXT:    movd %edx, %xmm1
2009; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
2010; X86-SSE-NEXT:    movd %xmm0, %eax
2011; X86-SSE-NEXT:    xorl %edx, %edx
2012; X86-SSE-NEXT:    divl 20(%ecx)
2013; X86-SSE-NEXT:    movd %edx, %xmm0
2014; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2015; X86-SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
2016; X86-SSE-NEXT:    movl %edi, %eax
2017; X86-SSE-NEXT:    xorl %edx, %edx
2018; X86-SSE-NEXT:    divl 4(%ecx)
2019; X86-SSE-NEXT:    movd %edx, %xmm3
2020; X86-SSE-NEXT:    movl %ebx, %eax
2021; X86-SSE-NEXT:    xorl %edx, %edx
2022; X86-SSE-NEXT:    divl (%ecx)
2023; X86-SSE-NEXT:    movd %edx, %xmm0
2024; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
2025; X86-SSE-NEXT:    movl %ebp, %eax
2026; X86-SSE-NEXT:    xorl %edx, %edx
2027; X86-SSE-NEXT:    divl 12(%ecx)
2028; X86-SSE-NEXT:    movd %edx, %xmm3
2029; X86-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
2030; X86-SSE-NEXT:    movd %xmm2, %eax
2031; X86-SSE-NEXT:    xorl %edx, %edx
2032; X86-SSE-NEXT:    divl 8(%ecx)
2033; X86-SSE-NEXT:    movd %edx, %xmm2
2034; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
2035; X86-SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
2036; X86-SSE-NEXT:    movl (%esp), %eax # 4-byte Reload
2037; X86-SSE-NEXT:    xorl %edx, %edx
2038; X86-SSE-NEXT:    divl 32(%ecx)
2039; X86-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [8199,8199,8199,8199]
2040; X86-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
2041; X86-SSE-NEXT:    pmuludq %xmm2, %xmm0
2042; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2043; X86-SSE-NEXT:    pmuludq %xmm2, %xmm3
2044; X86-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
2045; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
2046; X86-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
2047; X86-SSE-NEXT:    pmuludq %xmm2, %xmm1
2048; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2049; X86-SSE-NEXT:    pmuludq %xmm2, %xmm3
2050; X86-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
2051; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
2052; X86-SSE-NEXT:    imull $8199, %edx, %eax # imm = 0x2007
2053; X86-SSE-NEXT:    movl %eax, (%eax)
2054; X86-SSE-NEXT:    movdqa %xmm1, (%eax)
2055; X86-SSE-NEXT:    movdqa %xmm0, (%eax)
2056; X86-SSE-NEXT:    addl $4, %esp
2057; X86-SSE-NEXT:    popl %esi
2058; X86-SSE-NEXT:    popl %edi
2059; X86-SSE-NEXT:    popl %ebx
2060; X86-SSE-NEXT:    popl %ebp
2061; X86-SSE-NEXT:    retl
2062;
2063; X86-AVX1-LABEL: PR34947:
2064; X86-AVX1:       # %bb.0:
2065; X86-AVX1-NEXT:    pushl %ebp
2066; X86-AVX1-NEXT:    pushl %ebx
2067; X86-AVX1-NEXT:    pushl %edi
2068; X86-AVX1-NEXT:    pushl %esi
2069; X86-AVX1-NEXT:    subl $16, %esp
2070; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
2071; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
2072; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
2073; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
2074; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
2075; X86-AVX1-NEXT:    vmovd %xmm2, %eax
2076; X86-AVX1-NEXT:    xorl %edx, %edx
2077; X86-AVX1-NEXT:    divl 32(%ecx)
2078; X86-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
2079; X86-AVX1-NEXT:    vpextrd $3, %xmm1, %eax
2080; X86-AVX1-NEXT:    xorl %edx, %edx
2081; X86-AVX1-NEXT:    divl 28(%ecx)
2082; X86-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
2083; X86-AVX1-NEXT:    vpextrd $2, %xmm1, %eax
2084; X86-AVX1-NEXT:    xorl %edx, %edx
2085; X86-AVX1-NEXT:    divl 24(%ecx)
2086; X86-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
2087; X86-AVX1-NEXT:    vpextrd $1, %xmm1, %eax
2088; X86-AVX1-NEXT:    xorl %edx, %edx
2089; X86-AVX1-NEXT:    divl 20(%ecx)
2090; X86-AVX1-NEXT:    movl %edx, (%esp) # 4-byte Spill
2091; X86-AVX1-NEXT:    vmovd %xmm1, %eax
2092; X86-AVX1-NEXT:    xorl %edx, %edx
2093; X86-AVX1-NEXT:    divl 16(%ecx)
2094; X86-AVX1-NEXT:    movl %edx, %ebp
2095; X86-AVX1-NEXT:    vpextrd $3, %xmm0, %eax
2096; X86-AVX1-NEXT:    xorl %edx, %edx
2097; X86-AVX1-NEXT:    divl 12(%ecx)
2098; X86-AVX1-NEXT:    movl %edx, %ebx
2099; X86-AVX1-NEXT:    vpextrd $2, %xmm0, %eax
2100; X86-AVX1-NEXT:    xorl %edx, %edx
2101; X86-AVX1-NEXT:    divl 8(%ecx)
2102; X86-AVX1-NEXT:    movl %edx, %esi
2103; X86-AVX1-NEXT:    vpextrd $1, %xmm0, %eax
2104; X86-AVX1-NEXT:    xorl %edx, %edx
2105; X86-AVX1-NEXT:    divl 4(%ecx)
2106; X86-AVX1-NEXT:    movl %edx, %edi
2107; X86-AVX1-NEXT:    vmovd %xmm0, %eax
2108; X86-AVX1-NEXT:    xorl %edx, %edx
2109; X86-AVX1-NEXT:    divl (%ecx)
2110; X86-AVX1-NEXT:    vmovd %edx, %xmm0
2111; X86-AVX1-NEXT:    vpinsrd $1, %edi, %xmm0, %xmm0
2112; X86-AVX1-NEXT:    vpinsrd $2, %esi, %xmm0, %xmm0
2113; X86-AVX1-NEXT:    vpinsrd $3, %ebx, %xmm0, %xmm0
2114; X86-AVX1-NEXT:    vmovd %ebp, %xmm1
2115; X86-AVX1-NEXT:    vpinsrd $1, (%esp), %xmm1, %xmm1 # 4-byte Folded Reload
2116; X86-AVX1-NEXT:    vpinsrd $2, {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
2117; X86-AVX1-NEXT:    vpinsrd $3, {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
2118; X86-AVX1-NEXT:    imull $8199, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
2119; X86-AVX1-NEXT:    # imm = 0x2007
2120; X86-AVX1-NEXT:    movl %eax, (%eax)
2121; X86-AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [8199,8199,8199,8199]
2122; X86-AVX1-NEXT:    vpmulld %xmm2, %xmm0, %xmm0
2123; X86-AVX1-NEXT:    vpmulld %xmm2, %xmm1, %xmm1
2124; X86-AVX1-NEXT:    vmovdqa %xmm1, (%eax)
2125; X86-AVX1-NEXT:    vmovdqa %xmm0, (%eax)
2126; X86-AVX1-NEXT:    addl $16, %esp
2127; X86-AVX1-NEXT:    popl %esi
2128; X86-AVX1-NEXT:    popl %edi
2129; X86-AVX1-NEXT:    popl %ebx
2130; X86-AVX1-NEXT:    popl %ebp
2131; X86-AVX1-NEXT:    retl
2132;
2133; X86-AVX2-LABEL: PR34947:
2134; X86-AVX2:       # %bb.0:
2135; X86-AVX2-NEXT:    pushl %esi
2136; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %esi
2137; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
2138; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
2139; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
2140; X86-AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
2141; X86-AVX2-NEXT:    vpextrd $1, %xmm2, %eax
2142; X86-AVX2-NEXT:    xorl %edx, %edx
2143; X86-AVX2-NEXT:    divl 20(%esi)
2144; X86-AVX2-NEXT:    movl %edx, %ecx
2145; X86-AVX2-NEXT:    vmovd %xmm2, %eax
2146; X86-AVX2-NEXT:    xorl %edx, %edx
2147; X86-AVX2-NEXT:    divl 16(%esi)
2148; X86-AVX2-NEXT:    vmovd %edx, %xmm3
2149; X86-AVX2-NEXT:    vpinsrd $1, %ecx, %xmm3, %xmm3
2150; X86-AVX2-NEXT:    vpextrd $2, %xmm2, %eax
2151; X86-AVX2-NEXT:    xorl %edx, %edx
2152; X86-AVX2-NEXT:    divl 24(%esi)
2153; X86-AVX2-NEXT:    vpinsrd $2, %edx, %xmm3, %xmm3
2154; X86-AVX2-NEXT:    vpextrd $3, %xmm2, %eax
2155; X86-AVX2-NEXT:    xorl %edx, %edx
2156; X86-AVX2-NEXT:    divl 28(%esi)
2157; X86-AVX2-NEXT:    vpinsrd $3, %edx, %xmm3, %xmm2
2158; X86-AVX2-NEXT:    vpextrd $1, %xmm1, %eax
2159; X86-AVX2-NEXT:    xorl %edx, %edx
2160; X86-AVX2-NEXT:    divl 4(%esi)
2161; X86-AVX2-NEXT:    movl %edx, %ecx
2162; X86-AVX2-NEXT:    vmovd %xmm1, %eax
2163; X86-AVX2-NEXT:    xorl %edx, %edx
2164; X86-AVX2-NEXT:    divl (%esi)
2165; X86-AVX2-NEXT:    vmovd %edx, %xmm3
2166; X86-AVX2-NEXT:    vpinsrd $1, %ecx, %xmm3, %xmm3
2167; X86-AVX2-NEXT:    vpextrd $2, %xmm1, %eax
2168; X86-AVX2-NEXT:    xorl %edx, %edx
2169; X86-AVX2-NEXT:    divl 8(%esi)
2170; X86-AVX2-NEXT:    vpinsrd $2, %edx, %xmm3, %xmm3
2171; X86-AVX2-NEXT:    vpextrd $3, %xmm1, %eax
2172; X86-AVX2-NEXT:    xorl %edx, %edx
2173; X86-AVX2-NEXT:    divl 12(%esi)
2174; X86-AVX2-NEXT:    vpinsrd $3, %edx, %xmm3, %xmm1
2175; X86-AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
2176; X86-AVX2-NEXT:    vmovd %xmm0, %eax
2177; X86-AVX2-NEXT:    xorl %edx, %edx
2178; X86-AVX2-NEXT:    divl 32(%esi)
2179; X86-AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [8199,8199,8199,8199,8199,8199,8199,8199]
2180; X86-AVX2-NEXT:    vpmulld %ymm0, %ymm1, %ymm0
2181; X86-AVX2-NEXT:    imull $8199, %edx, %eax # imm = 0x2007
2182; X86-AVX2-NEXT:    movl %eax, (%eax)
2183; X86-AVX2-NEXT:    vmovdqa %ymm0, (%eax)
2184; X86-AVX2-NEXT:    popl %esi
2185; X86-AVX2-NEXT:    vzeroupper
2186; X86-AVX2-NEXT:    retl
2187;
2188; X64-SSE-LABEL: PR34947:
2189; X64-SSE:       # %bb.0:
2190; X64-SSE-NEXT:    movzwl 16(%rdi), %ecx
2191; X64-SSE-NEXT:    movdqa (%rdi), %xmm2
2192; X64-SSE-NEXT:    pxor %xmm1, %xmm1
2193; X64-SSE-NEXT:    movdqa %xmm2, %xmm0
2194; X64-SSE-NEXT:    pextrw $7, %xmm2, %eax
2195; X64-SSE-NEXT:    pextrw $4, %xmm2, %edi
2196; X64-SSE-NEXT:    pextrw $1, %xmm2, %r8d
2197; X64-SSE-NEXT:    pextrw $0, %xmm2, %r9d
2198; X64-SSE-NEXT:    pextrw $3, %xmm2, %r10d
2199; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
2200; X64-SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2201; X64-SSE-NEXT:    xorl %edx, %edx
2202; X64-SSE-NEXT:    divl 28(%rsi)
2203; X64-SSE-NEXT:    movd %edx, %xmm1
2204; X64-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
2205; X64-SSE-NEXT:    movd %xmm3, %eax
2206; X64-SSE-NEXT:    xorl %edx, %edx
2207; X64-SSE-NEXT:    divl 24(%rsi)
2208; X64-SSE-NEXT:    movd %edx, %xmm3
2209; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
2210; X64-SSE-NEXT:    movl %edi, %eax
2211; X64-SSE-NEXT:    xorl %edx, %edx
2212; X64-SSE-NEXT:    divl 16(%rsi)
2213; X64-SSE-NEXT:    movd %edx, %xmm1
2214; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
2215; X64-SSE-NEXT:    movd %xmm0, %eax
2216; X64-SSE-NEXT:    xorl %edx, %edx
2217; X64-SSE-NEXT:    divl 20(%rsi)
2218; X64-SSE-NEXT:    movd %edx, %xmm0
2219; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2220; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
2221; X64-SSE-NEXT:    movl %r8d, %eax
2222; X64-SSE-NEXT:    xorl %edx, %edx
2223; X64-SSE-NEXT:    divl 4(%rsi)
2224; X64-SSE-NEXT:    movd %edx, %xmm0
2225; X64-SSE-NEXT:    movl %r9d, %eax
2226; X64-SSE-NEXT:    xorl %edx, %edx
2227; X64-SSE-NEXT:    divl (%rsi)
2228; X64-SSE-NEXT:    movd %edx, %xmm3
2229; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
2230; X64-SSE-NEXT:    movl %r10d, %eax
2231; X64-SSE-NEXT:    xorl %edx, %edx
2232; X64-SSE-NEXT:    divl 12(%rsi)
2233; X64-SSE-NEXT:    movd %edx, %xmm0
2234; X64-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
2235; X64-SSE-NEXT:    movd %xmm2, %eax
2236; X64-SSE-NEXT:    xorl %edx, %edx
2237; X64-SSE-NEXT:    divl 8(%rsi)
2238; X64-SSE-NEXT:    movd %edx, %xmm2
2239; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
2240; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
2241; X64-SSE-NEXT:    movl %ecx, %eax
2242; X64-SSE-NEXT:    xorl %edx, %edx
2243; X64-SSE-NEXT:    divl 32(%rsi)
2244; X64-SSE-NEXT:    movdqa {{.*#+}} xmm0 = [8199,8199,8199,8199]
2245; X64-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
2246; X64-SSE-NEXT:    pmuludq %xmm0, %xmm3
2247; X64-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
2248; X64-SSE-NEXT:    pmuludq %xmm0, %xmm2
2249; X64-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
2250; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
2251; X64-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
2252; X64-SSE-NEXT:    pmuludq %xmm0, %xmm1
2253; X64-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2254; X64-SSE-NEXT:    pmuludq %xmm0, %xmm2
2255; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
2256; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2257; X64-SSE-NEXT:    imull $8199, %edx, %eax # imm = 0x2007
2258; X64-SSE-NEXT:    movl %eax, (%rax)
2259; X64-SSE-NEXT:    movdqa %xmm1, (%rax)
2260; X64-SSE-NEXT:    movdqa %xmm3, (%rax)
2261; X64-SSE-NEXT:    retq
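; Note: SSE has no packed integer divide, so every lane of the urem is
; extracted, divided with scalar divl, and the remainders are reassembled via
; movd/punpckldq; the multiply by the splat 8199 is expanded through
; pmuludq/pshufd because pmulld is not available before SSE4.1.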
2262;
2263; X64-AVX1-LABEL: PR34947:
2264; X64-AVX1:       # %bb.0:
2265; X64-AVX1-NEXT:    pushq %rbp
2266; X64-AVX1-NEXT:    pushq %rbx
2267; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
2268; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
2269; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
2270; X64-AVX1-NEXT:    vmovd %xmm2, %eax
2271; X64-AVX1-NEXT:    xorl %edx, %edx
2272; X64-AVX1-NEXT:    divl 32(%rsi)
2273; X64-AVX1-NEXT:    movl %edx, %ecx
2274; X64-AVX1-NEXT:    vpextrd $3, %xmm1, %eax
2275; X64-AVX1-NEXT:    xorl %edx, %edx
2276; X64-AVX1-NEXT:    divl 28(%rsi)
2277; X64-AVX1-NEXT:    movl %edx, %edi
2278; X64-AVX1-NEXT:    vpextrd $2, %xmm1, %eax
2279; X64-AVX1-NEXT:    xorl %edx, %edx
2280; X64-AVX1-NEXT:    divl 24(%rsi)
2281; X64-AVX1-NEXT:    movl %edx, %r8d
2282; X64-AVX1-NEXT:    vpextrd $1, %xmm1, %eax
2283; X64-AVX1-NEXT:    xorl %edx, %edx
2284; X64-AVX1-NEXT:    divl 20(%rsi)
2285; X64-AVX1-NEXT:    movl %edx, %r9d
2286; X64-AVX1-NEXT:    vmovd %xmm1, %eax
2287; X64-AVX1-NEXT:    xorl %edx, %edx
2288; X64-AVX1-NEXT:    divl 16(%rsi)
2289; X64-AVX1-NEXT:    movl %edx, %r10d
2290; X64-AVX1-NEXT:    vpextrd $3, %xmm0, %eax
2291; X64-AVX1-NEXT:    xorl %edx, %edx
2292; X64-AVX1-NEXT:    divl 12(%rsi)
2293; X64-AVX1-NEXT:    movl %edx, %r11d
2294; X64-AVX1-NEXT:    vpextrd $2, %xmm0, %eax
2295; X64-AVX1-NEXT:    xorl %edx, %edx
2296; X64-AVX1-NEXT:    divl 8(%rsi)
2297; X64-AVX1-NEXT:    movl %edx, %ebx
2298; X64-AVX1-NEXT:    vpextrd $1, %xmm0, %eax
2299; X64-AVX1-NEXT:    xorl %edx, %edx
2300; X64-AVX1-NEXT:    divl 4(%rsi)
2301; X64-AVX1-NEXT:    movl %edx, %ebp
2302; X64-AVX1-NEXT:    vmovd %xmm0, %eax
2303; X64-AVX1-NEXT:    xorl %edx, %edx
2304; X64-AVX1-NEXT:    divl (%rsi)
2305; X64-AVX1-NEXT:    vmovd %edx, %xmm0
2306; X64-AVX1-NEXT:    vpinsrd $1, %ebp, %xmm0, %xmm0
2307; X64-AVX1-NEXT:    vpinsrd $2, %ebx, %xmm0, %xmm0
2308; X64-AVX1-NEXT:    vpinsrd $3, %r11d, %xmm0, %xmm0
2309; X64-AVX1-NEXT:    vbroadcastss {{.*#+}} xmm1 = [8199,8199,8199,8199]
2310; X64-AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
2311; X64-AVX1-NEXT:    vmovd %r10d, %xmm2
2312; X64-AVX1-NEXT:    vpinsrd $1, %r9d, %xmm2, %xmm2
2313; X64-AVX1-NEXT:    vpinsrd $2, %r8d, %xmm2, %xmm2
2314; X64-AVX1-NEXT:    vpinsrd $3, %edi, %xmm2, %xmm2
2315; X64-AVX1-NEXT:    vpmulld %xmm1, %xmm2, %xmm1
2316; X64-AVX1-NEXT:    imull $8199, %ecx, %eax # imm = 0x2007
2317; X64-AVX1-NEXT:    movl %eax, (%rax)
2318; X64-AVX1-NEXT:    vmovdqa %xmm1, (%rax)
2319; X64-AVX1-NEXT:    vmovdqa %xmm0, (%rax)
2320; X64-AVX1-NEXT:    popq %rbx
2321; X64-AVX1-NEXT:    popq %rbp
2322; X64-AVX1-NEXT:    retq
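; Note: AVX1 provides no 256-bit integer multiply, so the eight remainders are
; gathered through GPRs and multiplied as two 128-bit vpmulld ops against the
; vbroadcastss splat of 8199, whereas the AVX2 lowering below uses a single
; ymm vpmulld.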
2323;
2324; X64-AVX2-LABEL: PR34947:
2325; X64-AVX2:       # %bb.0:
2326; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
2327; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
2328; X64-AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
2329; X64-AVX2-NEXT:    vpextrd $1, %xmm2, %eax
2330; X64-AVX2-NEXT:    xorl %edx, %edx
2331; X64-AVX2-NEXT:    divl 20(%rsi)
2332; X64-AVX2-NEXT:    movl %edx, %ecx
2333; X64-AVX2-NEXT:    vmovd %xmm2, %eax
2334; X64-AVX2-NEXT:    xorl %edx, %edx
2335; X64-AVX2-NEXT:    divl 16(%rsi)
2336; X64-AVX2-NEXT:    vmovd %edx, %xmm3
2337; X64-AVX2-NEXT:    vpinsrd $1, %ecx, %xmm3, %xmm3
2338; X64-AVX2-NEXT:    vpextrd $2, %xmm2, %eax
2339; X64-AVX2-NEXT:    xorl %edx, %edx
2340; X64-AVX2-NEXT:    divl 24(%rsi)
2341; X64-AVX2-NEXT:    vpinsrd $2, %edx, %xmm3, %xmm3
2342; X64-AVX2-NEXT:    vpextrd $3, %xmm2, %eax
2343; X64-AVX2-NEXT:    xorl %edx, %edx
2344; X64-AVX2-NEXT:    divl 28(%rsi)
2345; X64-AVX2-NEXT:    vpinsrd $3, %edx, %xmm3, %xmm2
2346; X64-AVX2-NEXT:    vpextrd $1, %xmm1, %eax
2347; X64-AVX2-NEXT:    xorl %edx, %edx
2348; X64-AVX2-NEXT:    divl 4(%rsi)
2349; X64-AVX2-NEXT:    movl %edx, %ecx
2350; X64-AVX2-NEXT:    vmovd %xmm1, %eax
2351; X64-AVX2-NEXT:    xorl %edx, %edx
2352; X64-AVX2-NEXT:    divl (%rsi)
2353; X64-AVX2-NEXT:    vmovd %edx, %xmm3
2354; X64-AVX2-NEXT:    vpinsrd $1, %ecx, %xmm3, %xmm3
2355; X64-AVX2-NEXT:    vpextrd $2, %xmm1, %eax
2356; X64-AVX2-NEXT:    xorl %edx, %edx
2357; X64-AVX2-NEXT:    divl 8(%rsi)
2358; X64-AVX2-NEXT:    vpinsrd $2, %edx, %xmm3, %xmm3
2359; X64-AVX2-NEXT:    vpextrd $3, %xmm1, %eax
2360; X64-AVX2-NEXT:    xorl %edx, %edx
2361; X64-AVX2-NEXT:    divl 12(%rsi)
2362; X64-AVX2-NEXT:    vpinsrd $3, %edx, %xmm3, %xmm1
2363; X64-AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
2364; X64-AVX2-NEXT:    vmovd %xmm0, %eax
2365; X64-AVX2-NEXT:    xorl %edx, %edx
2366; X64-AVX2-NEXT:    divl 32(%rsi)
2367; X64-AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [8199,8199,8199,8199,8199,8199,8199,8199]
2368; X64-AVX2-NEXT:    vpmulld %ymm0, %ymm1, %ymm0
2369; X64-AVX2-NEXT:    imull $8199, %edx, %eax # imm = 0x2007
2370; X64-AVX2-NEXT:    movl %eax, (%rax)
2371; X64-AVX2-NEXT:    vmovdqa %ymm0, (%rax)
2372; X64-AVX2-NEXT:    vzeroupper
2373; X64-AVX2-NEXT:    retq
2374  %a0 = load <9 x i16>, ptr %p0, align 64
2375  %a1 = load <9 x i32>, ptr %p1, align 64
2376  %ext0 = zext <9 x i16> %a0 to <9 x i32>
2377  %rem = urem <9 x i32> %ext0, %a1
2378  %mul = mul <9 x i32> <i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199>, %rem
2379  store <9 x i32> %mul, ptr undef, align 64
2380  ret void
2381}
2382
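; Summary: PR34947 checks the lowering of a <9 x i32> urem followed by a
; multiply with the splat constant 8199. Since x86 has no vector integer
; division, the first eight remainders are produced with scalar divl and
; re-vectorized before the multiply, while the ninth element is computed and
; stored entirely with scalar code (divl 32(...), imull $8199).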