; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=X86,X86-AVX,X86-AVX1
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X86,X86-AVX,X86-AVX2
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=X86,X86-AVX512
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=X86,X86-AVX512
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=X86,X86-AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=X64,X64-AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=X64,X64-AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=X64,X64-AVX512

;
; Subvector Load + Broadcast
;
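; Each test loads a narrow vector and splats it across a wider result with a
; shufflevector. The load is expected to fold into a single subvector
; broadcast: vbroadcastf128/vbroadcasti128 for 128-bit into 256-bit, and the
; vbroadcast{f,i}32x4 / vbroadcast{f,i}64x4 forms for 512-bit AVX512 results.
; Plain AVX/AVX2 lack 512-bit registers, so those cases broadcast into one
; ymm and copy it to the second result register.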

define <4 x double> @test_broadcast_2f64_4f64(ptr%p) nounwind {
; X86-LABEL: test_broadcast_2f64_4f64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-NEXT:    retl
;
; X64-LABEL: test_broadcast_2f64_4f64:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    retq
 %1 = load <2 x double>, ptr%p
 %2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 ret <4 x double> %2
}

define <8 x double> @test_broadcast_2f64_8f64(ptr%p) nounwind {
; X86-AVX-LABEL: test_broadcast_2f64_8f64:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_2f64_8f64:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_2f64_8f64:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_2f64_8f64:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
 %1 = load <2 x double>, ptr%p
 %2 = shufflevector <2 x double> %1, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
 ret <8 x double> %2
}

define <8 x double> @test_broadcast_4f64_8f64(ptr%p) nounwind {
; X86-AVX-LABEL: test_broadcast_4f64_8f64:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vmovaps (%eax), %ymm0
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_4f64_8f64:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_4f64_8f64:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rdi), %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_4f64_8f64:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
 %1 = load <4 x double>, ptr%p
 %2 = shufflevector <4 x double> %1, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <8 x double> %2
}

define <4 x i64> @test_broadcast_2i64_4i64(ptr%p) nounwind {
; X86-AVX-LABEL: test_broadcast_2i64_4i64:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_2i64_4i64:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_2i64_4i64:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_2i64_4i64:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT:    retq
 %1 = load <2 x i64>, ptr%p
 %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 ret <4 x i64> %2
}

define <8 x i64> @test_broadcast_2i64_8i64(ptr%p) nounwind {
; X86-AVX-LABEL: test_broadcast_2i64_8i64:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_2i64_8i64:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_2i64_8i64:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_2i64_8i64:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
 %1 = load <2 x i64>, ptr%p
 %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
 ret <8 x i64> %2
}

define <8 x i64> @test_broadcast_4i64_8i64(ptr%p) nounwind {
; X86-AVX-LABEL: test_broadcast_4i64_8i64:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vmovaps (%eax), %ymm0
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_4i64_8i64:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_4i64_8i64:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rdi), %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_4i64_8i64:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
 %1 = load <4 x i64>, ptr%p
 %2 = shufflevector <4 x i64> %1, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <8 x i64> %2
}

define <8 x float> @test_broadcast_4f32_8f32(ptr%p) nounwind {
; X86-LABEL: test_broadcast_4f32_8f32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-NEXT:    retl
;
; X64-LABEL: test_broadcast_4f32_8f32:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    retq
 %1 = load <4 x float>, ptr%p
 %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <8 x float> %2
}

define <16 x float> @test_broadcast_4f32_16f32(ptr%p) nounwind {
; X86-AVX-LABEL: test_broadcast_4f32_16f32:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_4f32_16f32:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_4f32_16f32:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_4f32_16f32:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
 %1 = load <4 x float>, ptr%p
 %2 = shufflevector <4 x float> %1, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <16 x float> %2
}

define <16 x float> @test_broadcast_8f32_16f32(ptr%p) nounwind {
; X86-AVX-LABEL: test_broadcast_8f32_16f32:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vmovaps (%eax), %ymm0
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_8f32_16f32:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_8f32_16f32:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rdi), %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_8f32_16f32:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
 %1 = load <8 x float>, ptr%p
 %2 = shufflevector <8 x float> %1, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ret <16 x float> %2
}

define <8 x i32> @test_broadcast_4i32_8i32(ptr%p) nounwind {
; X86-AVX-LABEL: test_broadcast_4i32_8i32:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_4i32_8i32:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_4i32_8i32:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_4i32_8i32:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT:    retq
 %1 = load <4 x i32>, ptr%p
 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <8 x i32> %2
}

define <16 x i32> @test_broadcast_4i32_16i32(ptr%p) nounwind {
; X86-AVX-LABEL: test_broadcast_4i32_16i32:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_4i32_16i32:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_4i32_16i32:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_4i32_16i32:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
 %1 = load <4 x i32>, ptr%p
 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <16 x i32> %2
}

define <16 x i32> @test_broadcast_8i32_16i32(ptr%p) nounwind {
; X86-AVX-LABEL: test_broadcast_8i32_16i32:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vmovaps (%eax), %ymm0
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_8i32_16i32:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_8i32_16i32:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rdi), %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_8i32_16i32:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
 %1 = load <8 x i32>, ptr%p
 %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ret <16 x i32> %2
}

define <16 x i16> @test_broadcast_8i16_16i16(ptr%p) nounwind {
; X86-AVX-LABEL: test_broadcast_8i16_16i16:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_8i16_16i16:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_8i16_16i16:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_8i16_16i16:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT:    retq
 %1 = load <8 x i16>, ptr%p
 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ret <16 x i16> %2
}

define <32 x i16> @test_broadcast_8i16_32i16(ptr%p) nounwind {
; X86-AVX-LABEL: test_broadcast_8i16_32i16:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_8i16_32i16:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_8i16_32i16:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_8i16_32i16:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
 %1 = load <8 x i16>, ptr%p
 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ret <32 x i16> %2
}

define <32 x i16> @test_broadcast_16i16_32i16(ptr%p) nounwind {
; X86-AVX-LABEL: test_broadcast_16i16_32i16:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vmovaps (%eax), %ymm0
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_16i16_32i16:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_16i16_32i16:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rdi), %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_16i16_32i16:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
 %1 = load <16 x i16>, ptr%p
 %2 = shufflevector <16 x i16> %1, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ret <32 x i16> %2
}

define <32 x i8> @test_broadcast_16i8_32i8(ptr%p) nounwind {
; X86-AVX-LABEL: test_broadcast_16i8_32i8:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_16i8_32i8:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_16i8_32i8:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_16i8_32i8:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT:    retq
 %1 = load <16 x i8>, ptr%p
 %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ret <32 x i8> %2
}

define <64 x i8> @test_broadcast_16i8_64i8(ptr%p) nounwind {
; X86-AVX-LABEL: test_broadcast_16i8_64i8:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_16i8_64i8:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_16i8_64i8:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_16i8_64i8:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
 %1 = load <16 x i8>, ptr%p
 %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ret <64 x i8> %2
}

define <64 x i8> @test_broadcast_32i8_64i8(ptr%p) nounwind {
; X86-AVX-LABEL: test_broadcast_32i8_64i8:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vmovaps (%eax), %ymm0
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_32i8_64i8:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_32i8_64i8:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rdi), %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_32i8_64i8:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
 %1 = load <32 x i8>, ptr%p
 %2 = shufflevector <32 x i8> %1, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ret <64 x i8> %2
}

;
; Subvector Load + Broadcast + Store
;
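; The loaded subvector is reused by a store as well as by the broadcast. The
; load should still fold into the broadcast, with the store taking the low
; 128 bits of the broadcast result instead of reloading from memory.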

define <4 x double> @test_broadcast_2f64_4f64_reuse(ptr %p0, ptr %p1) {
; X86-AVX-LABEL: test_broadcast_2f64_4f64_reuse:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    vmovaps %xmm0, (%eax)
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_2f64_4f64_reuse:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX512-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX512-NEXT:    vmovdqa %xmm0, (%eax)
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_2f64_4f64_reuse:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %xmm0, (%rsi)
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_2f64_4f64_reuse:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT:    vmovdqa %xmm0, (%rsi)
; X64-AVX512-NEXT:    retq
 %1 = load <2 x double>, ptr %p0
 store <2 x double> %1, ptr %p1
 %2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 ret <4 x double> %2
}

define <4 x i64> @test_broadcast_2i64_4i64_reuse(ptr %p0, ptr %p1) {
; X86-AVX-LABEL: test_broadcast_2i64_4i64_reuse:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    vmovaps %xmm0, (%eax)
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_2i64_4i64_reuse:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX512-NEXT:    vmovdqa %xmm0, (%eax)
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_2i64_4i64_reuse:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %xmm0, (%rsi)
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_2i64_4i64_reuse:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT:    vmovdqa %xmm0, (%rsi)
; X64-AVX512-NEXT:    retq
 %1 = load <2 x i64>, ptr %p0
 store <2 x i64> %1, ptr %p1
 %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 ret <4 x i64> %2
}

define <8 x float> @test_broadcast_4f32_8f32_reuse(ptr %p0, ptr %p1) {
; X86-AVX-LABEL: test_broadcast_4f32_8f32_reuse:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    vmovaps %xmm0, (%eax)
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_4f32_8f32_reuse:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX512-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX512-NEXT:    vmovdqa %xmm0, (%eax)
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_4f32_8f32_reuse:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %xmm0, (%rsi)
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_4f32_8f32_reuse:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT:    vmovdqa %xmm0, (%rsi)
; X64-AVX512-NEXT:    retq
 %1 = load <4 x float>, ptr %p0
 store <4 x float> %1, ptr %p1
 %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <8 x float> %2
}

define <8 x i32> @test_broadcast_4i32_8i32_reuse(ptr %p0, ptr %p1) {
; X86-AVX-LABEL: test_broadcast_4i32_8i32_reuse:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    vmovaps %xmm0, (%eax)
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_4i32_8i32_reuse:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX512-NEXT:    vmovdqa %xmm0, (%eax)
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_4i32_8i32_reuse:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %xmm0, (%rsi)
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_4i32_8i32_reuse:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT:    vmovdqa %xmm0, (%rsi)
; X64-AVX512-NEXT:    retq
 %1 = load <4 x i32>, ptr %p0
 store <4 x i32> %1, ptr %p1
 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <8 x i32> %2
}

define <16 x i16> @test_broadcast_8i16_16i16_reuse(ptr%p0, ptr%p1) nounwind {
; X86-AVX-LABEL: test_broadcast_8i16_16i16_reuse:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    vmovaps %xmm0, (%eax)
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_8i16_16i16_reuse:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX512-NEXT:    vmovdqa %xmm0, (%eax)
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_8i16_16i16_reuse:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %xmm0, (%rsi)
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_8i16_16i16_reuse:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT:    vmovdqa %xmm0, (%rsi)
; X64-AVX512-NEXT:    retq
 %1 = load <8 x i16>, ptr%p0
 store <8 x i16> %1, ptr %p1
 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ret <16 x i16> %2
}

define <32 x i8> @test_broadcast_16i8_32i8_reuse(ptr%p0, ptr%p1) nounwind {
; X86-AVX-LABEL: test_broadcast_16i8_32i8_reuse:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    vmovaps %xmm0, (%eax)
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_16i8_32i8_reuse:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX512-NEXT:    vmovdqa %xmm0, (%eax)
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_16i8_32i8_reuse:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %xmm0, (%rsi)
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_16i8_32i8_reuse:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT:    vmovdqa %xmm0, (%rsi)
; X64-AVX512-NEXT:    retq
 %1 = load <16 x i8>, ptr%p0
 store <16 x i8> %1, ptr %p1
 %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ret <32 x i8> %2
}

;
; Subvector Load + Broadcast with Separate Store
;
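; The store in these tests writes unrelated data (zeroinitializer) between
; the load and the shuffle, so the two memory operations only share a chain.
; That must not block folding the load into the broadcast.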

define <8 x i32> @test_broadcast_4i32_8i32_chain(ptr %p0, ptr %p1) {
; X86-AVX-LABEL: test_broadcast_4i32_8i32_chain:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    vmovaps %xmm1, (%eax)
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_4i32_8i32_chain:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX512-NEXT:    vmovaps %xmm1, (%eax)
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_4i32_8i32_chain:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %xmm1, (%rsi)
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_4i32_8i32_chain:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT:    vmovaps %xmm1, (%rsi)
; X64-AVX512-NEXT:    retq
  %1 = load <4 x i32>, ptr %p0
  store <4 x float> zeroinitializer, ptr %p1
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i32> %2
}

define <16 x i32> @test_broadcast_4i32_16i32_chain(ptr %p0, ptr %p1) {
; X86-AVX-LABEL: test_broadcast_4i32_16i32_chain:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    vmovaps %xmm1, (%eax)
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_4i32_16i32_chain:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    vmovaps %xmm1, (%eax)
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_4i32_16i32_chain:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %xmm1, (%rsi)
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_4i32_16i32_chain:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    vmovaps %xmm1, (%rsi)
; X64-AVX512-NEXT:    retq
  %1 = load <4 x i32>, ptr %p0
  store <4 x float> zeroinitializer, ptr %p1
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <16 x i32> %2
}

;
; Subvector load with multiple uses + broadcast
; Fallback to the broadcast should be performed
;
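; The same constant subvector is needed at several widths, so it is
; materialized once at the widest useful width and the narrower operations
; read its low subregisters (e.g. %ymm2 of %zmm2 in the AVX512 checks below).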

@ga4 = dso_local global <4 x i64> zeroinitializer, align 8
@gb4 = dso_local global <8 x i64> zeroinitializer, align 8

define dso_local void @fallback_broadcast_v4i64_to_v8i64(<4 x i64> %a, <8 x i64> %b) {
; X86-AVX1-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X86-AVX1:       # %bb.0: # %entry
; X86-AVX1-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,0,2,0,3,0,4,0]
; X86-AVX1-NEXT:    vpaddq %xmm3, %xmm0, %xmm4
; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X86-AVX1-NEXT:    vpmovsxbq {{.*#+}} xmm5 = [3,4]
; X86-AVX1-NEXT:    vpaddq %xmm5, %xmm0, %xmm0
; X86-AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm6
; X86-AVX1-NEXT:    vpaddq %xmm5, %xmm6, %xmm6
; X86-AVX1-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm2, %ymm2
; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
; X86-AVX1-NEXT:    vpaddq %xmm5, %xmm6, %xmm5
; X86-AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm1, %ymm1
; X86-AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm1
; X86-AVX1-NEXT:    vandps %ymm3, %ymm2, %ymm2
; X86-AVX1-NEXT:    vmovdqu %xmm0, ga4+16
; X86-AVX1-NEXT:    vmovdqu %xmm4, ga4
; X86-AVX1-NEXT:    vmovups %ymm2, gb4+32
; X86-AVX1-NEXT:    vmovups %ymm1, gb4
; X86-AVX1-NEXT:    vzeroupper
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X86-AVX2:       # %bb.0: # %entry
; X86-AVX2-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [1,2,3,4]
; X86-AVX2-NEXT:    vpaddq %ymm3, %ymm0, %ymm0
; X86-AVX2-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
; X86-AVX2-NEXT:    vpaddq %ymm3, %ymm1, %ymm1
; X86-AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
; X86-AVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
; X86-AVX2-NEXT:    vmovdqu %ymm0, ga4
; X86-AVX2-NEXT:    vmovdqu %ymm2, gb4+32
; X86-AVX2-NEXT:    vmovdqu %ymm1, gb4
; X86-AVX2-NEXT:    vzeroupper
; X86-AVX2-NEXT:    retl
;
; X86-AVX512-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X86-AVX512:       # %bb.0: # %entry
; X86-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [1,0,2,0,3,0,4,0,1,0,2,0,3,0,4,0]
; X86-AVX512-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; X86-AVX512-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
; X86-AVX512-NEXT:    vpandq %zmm2, %zmm1, %zmm1
; X86-AVX512-NEXT:    vmovdqu %ymm0, ga4
; X86-AVX512-NEXT:    vmovdqu64 %zmm1, gb4
; X86-AVX512-NEXT:    vzeroupper
; X86-AVX512-NEXT:    retl
;
; X64-AVX1-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X64-AVX1:       # %bb.0: # %entry
; X64-AVX1-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,2,3,4]
; X64-AVX1-NEXT:    vpaddq %xmm3, %xmm0, %xmm4
; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X64-AVX1-NEXT:    vpmovsxbq {{.*#+}} xmm5 = [3,4]
; X64-AVX1-NEXT:    vpaddq %xmm5, %xmm0, %xmm0
; X64-AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm6
; X64-AVX1-NEXT:    vpaddq %xmm5, %xmm6, %xmm6
; X64-AVX1-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
; X64-AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm2, %ymm2
; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
; X64-AVX1-NEXT:    vpaddq %xmm5, %xmm6, %xmm5
; X64-AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
; X64-AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm1, %ymm1
; X64-AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm1
; X64-AVX1-NEXT:    vandps %ymm3, %ymm2, %ymm2
; X64-AVX1-NEXT:    vmovdqu %xmm0, ga4+16(%rip)
; X64-AVX1-NEXT:    vmovdqu %xmm4, ga4(%rip)
; X64-AVX1-NEXT:    vmovups %ymm2, gb4+32(%rip)
; X64-AVX1-NEXT:    vmovups %ymm1, gb4(%rip)
; X64-AVX1-NEXT:    vzeroupper
; X64-AVX1-NEXT:    retq
;
; X64-AVX2-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X64-AVX2:       # %bb.0: # %entry
; X64-AVX2-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [1,2,3,4]
; X64-AVX2-NEXT:    vpaddq %ymm3, %ymm0, %ymm0
; X64-AVX2-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
; X64-AVX2-NEXT:    vpaddq %ymm3, %ymm1, %ymm1
; X64-AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
; X64-AVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
; X64-AVX2-NEXT:    vmovdqu %ymm0, ga4(%rip)
; X64-AVX2-NEXT:    vmovdqu %ymm2, gb4+32(%rip)
; X64-AVX2-NEXT:    vmovdqu %ymm1, gb4(%rip)
; X64-AVX2-NEXT:    vzeroupper
; X64-AVX2-NEXT:    retq
;
; X64-AVX512-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X64-AVX512:       # %bb.0: # %entry
; X64-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [1,2,3,4,1,2,3,4]
; X64-AVX512-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; X64-AVX512-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
; X64-AVX512-NEXT:    vpandq %zmm2, %zmm1, %zmm1
; X64-AVX512-NEXT:    vmovdqu %ymm0, ga4(%rip)
; X64-AVX512-NEXT:    vmovdqu64 %zmm1, gb4(%rip)
; X64-AVX512-NEXT:    vzeroupper
; X64-AVX512-NEXT:    retq
entry:
  %0 = add <4 x i64> %a, <i64 1, i64 2, i64 3, i64 4>
  %1 = add <8 x i64> %b, <i64 1, i64 2, i64 3, i64 4, i64 1, i64 2, i64 3, i64 4>
  %2 = and <8 x i64> %1, <i64 1, i64 2, i64 3, i64 4, i64 1, i64 2, i64 3, i64 4>
  store <4 x i64> %0, ptr @ga4, align 8
  store <8 x i64> %2, ptr @gb4, align 8
  ret void
}


@ga2 = dso_local global <4 x double> zeroinitializer, align 8
@gb2 = dso_local global <8 x double> zeroinitializer, align 8

define dso_local void @fallback_broadcast_v4f64_to_v8f64(<4 x double> %a, <8 x double> %b) {
; X86-AVX-LABEL: fallback_broadcast_v4f64_to_v8f64:
; X86-AVX:       # %bb.0: # %entry
; X86-AVX-NEXT:    vmovapd {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; X86-AVX-NEXT:    vaddpd %ymm3, %ymm0, %ymm0
; X86-AVX-NEXT:    vaddpd %ymm3, %ymm2, %ymm2
; X86-AVX-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
; X86-AVX-NEXT:    vdivpd %ymm3, %ymm1, %ymm1
; X86-AVX-NEXT:    vdivpd %ymm3, %ymm2, %ymm2
; X86-AVX-NEXT:    vmovupd %ymm0, ga2
; X86-AVX-NEXT:    vmovupd %ymm2, gb2+32
; X86-AVX-NEXT:    vmovupd %ymm1, gb2
; X86-AVX-NEXT:    vzeroupper
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: fallback_broadcast_v4f64_to_v8f64:
; X86-AVX512:       # %bb.0: # %entry
; X86-AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; X86-AVX512-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; X86-AVX512-NEXT:    vaddpd %zmm2, %zmm1, %zmm1
; X86-AVX512-NEXT:    vdivpd %zmm2, %zmm1, %zmm1
; X86-AVX512-NEXT:    vmovupd %ymm0, ga2
; X86-AVX512-NEXT:    vmovupd %zmm1, gb2
; X86-AVX512-NEXT:    vzeroupper
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: fallback_broadcast_v4f64_to_v8f64:
; X64-AVX:       # %bb.0: # %entry
; X64-AVX-NEXT:    vmovapd {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; X64-AVX-NEXT:    vaddpd %ymm3, %ymm0, %ymm0
; X64-AVX-NEXT:    vaddpd %ymm3, %ymm2, %ymm2
; X64-AVX-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
; X64-AVX-NEXT:    vdivpd %ymm3, %ymm1, %ymm1
; X64-AVX-NEXT:    vdivpd %ymm3, %ymm2, %ymm2
; X64-AVX-NEXT:    vmovupd %ymm0, ga2(%rip)
; X64-AVX-NEXT:    vmovupd %ymm2, gb2+32(%rip)
; X64-AVX-NEXT:    vmovupd %ymm1, gb2(%rip)
; X64-AVX-NEXT:    vzeroupper
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: fallback_broadcast_v4f64_to_v8f64:
; X64-AVX512:       # %bb.0: # %entry
; X64-AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; X64-AVX512-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; X64-AVX512-NEXT:    vaddpd %zmm2, %zmm1, %zmm1
; X64-AVX512-NEXT:    vdivpd %zmm2, %zmm1, %zmm1
; X64-AVX512-NEXT:    vmovupd %ymm0, ga2(%rip)
; X64-AVX512-NEXT:    vmovupd %zmm1, gb2(%rip)
; X64-AVX512-NEXT:    vzeroupper
; X64-AVX512-NEXT:    retq
entry:
  %0 = fadd <4 x double> %a, <double 1.0, double 2.0, double 3.0, double 4.0>
  %1 = fadd <8 x double> %b, <double 1.0, double 2.0, double 3.0, double 4.0, double 1.0, double 2.0, double 3.0, double 4.0>
  %2 = fdiv <8 x double> %1, <double 1.0, double 2.0, double 3.0, double 4.0, double 1.0, double 2.0, double 3.0, double 4.0>
  store <4 x double> %0, ptr @ga2, align 8
  store <8 x double> %2, ptr @gb2, align 8
  ret void
}

@ha4 = dso_local global <4 x i32> zeroinitializer, align 8
@hb4 = dso_local global <8 x i32> zeroinitializer, align 8
@hc4 = dso_local global <16 x i32> zeroinitializer, align 8

define dso_local void @fallback_broadcast_v4i32_v8i32_v16i32(<4 x i32> %a, <8 x i32> %b, <16 x i32> %c) nounwind {
; X86-AVX1-LABEL: fallback_broadcast_v4i32_v8i32_v16i32:
; X86-AVX1:       # %bb.0: # %entry
; X86-AVX1-NEXT:    pushl %ebp
; X86-AVX1-NEXT:    movl %esp, %ebp
; X86-AVX1-NEXT:    andl $-32, %esp
; X86-AVX1-NEXT:    subl $32, %esp
; X86-AVX1-NEXT:    vbroadcastf128 {{.*#+}} ymm3 = [1,2,3,4,1,2,3,4]
; X86-AVX1-NEXT:    # ymm3 = mem[0,1,0,1]
; X86-AVX1-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
; X86-AVX1-NEXT:    vpaddd %xmm3, %xmm4, %xmm4
; X86-AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
; X86-AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm1
; X86-AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
; X86-AVX1-NEXT:    vpaddd %xmm3, %xmm4, %xmm4
; X86-AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
; X86-AVX1-NEXT:    vpaddd 8(%ebp), %xmm3, %xmm4
; X86-AVX1-NEXT:    vpaddd 24(%ebp), %xmm3, %xmm5
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm4, %ymm4
; X86-AVX1-NEXT:    vandps %ymm3, %ymm2, %ymm2
; X86-AVX1-NEXT:    vandps %ymm3, %ymm4, %ymm3
; X86-AVX1-NEXT:    vmovdqu %xmm0, ha4
; X86-AVX1-NEXT:    vmovups %ymm1, hb4
; X86-AVX1-NEXT:    vmovups %ymm3, hc4+32
; X86-AVX1-NEXT:    vmovups %ymm2, hc4
; X86-AVX1-NEXT:    movl %ebp, %esp
; X86-AVX1-NEXT:    popl %ebp
; X86-AVX1-NEXT:    vzeroupper
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: fallback_broadcast_v4i32_v8i32_v16i32:
; X86-AVX2:       # %bb.0: # %entry
; X86-AVX2-NEXT:    pushl %ebp
; X86-AVX2-NEXT:    movl %esp, %ebp
; X86-AVX2-NEXT:    andl $-32, %esp
; X86-AVX2-NEXT:    subl $32, %esp
; X86-AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [1,2,3,4,1,2,3,4]
; X86-AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
; X86-AVX2-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
; X86-AVX2-NEXT:    vpaddd %ymm3, %ymm1, %ymm1
; X86-AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
; X86-AVX2-NEXT:    vpaddd %ymm3, %ymm2, %ymm2
; X86-AVX2-NEXT:    vpaddd 8(%ebp), %ymm3, %ymm4
; X86-AVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
; X86-AVX2-NEXT:    vpand %ymm3, %ymm4, %ymm3
; X86-AVX2-NEXT:    vmovdqu %xmm0, ha4
; X86-AVX2-NEXT:    vmovdqu %ymm1, hb4
; X86-AVX2-NEXT:    vmovdqu %ymm3, hc4+32
; X86-AVX2-NEXT:    vmovdqu %ymm2, hc4
; X86-AVX2-NEXT:    movl %ebp, %esp
; X86-AVX2-NEXT:    popl %ebp
; X86-AVX2-NEXT:    vzeroupper
; X86-AVX2-NEXT:    retl
;
; X86-AVX512-LABEL: fallback_broadcast_v4i32_v8i32_v16i32:
; X86-AVX512:       # %bb.0: # %entry
; X86-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4]
; X86-AVX512-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
; X86-AVX512-NEXT:    vpaddd %ymm3, %ymm1, %ymm1
; X86-AVX512-NEXT:    vpand %ymm3, %ymm1, %ymm1
; X86-AVX512-NEXT:    vpaddd %zmm3, %zmm2, %zmm2
; X86-AVX512-NEXT:    vpandd %zmm3, %zmm2, %zmm2
; X86-AVX512-NEXT:    vmovdqu %xmm0, ha4
; X86-AVX512-NEXT:    vmovdqu %ymm1, hb4
; X86-AVX512-NEXT:    vmovdqu64 %zmm2, hc4
; X86-AVX512-NEXT:    vzeroupper
; X86-AVX512-NEXT:    retl
;
; X64-AVX1-LABEL: fallback_broadcast_v4i32_v8i32_v16i32:
; X64-AVX1:       # %bb.0: # %entry
; X64-AVX1-NEXT:    vbroadcastf128 {{.*#+}} ymm4 = [1,2,3,4,1,2,3,4]
; X64-AVX1-NEXT:    # ymm4 = mem[0,1,0,1]
; X64-AVX1-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
; X64-AVX1-NEXT:    vpaddd %xmm4, %xmm5, %xmm5
; X64-AVX1-NEXT:    vpaddd %xmm4, %xmm1, %xmm1
; X64-AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm1, %ymm1
; X64-AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
; X64-AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
; X64-AVX1-NEXT:    vpaddd %xmm4, %xmm5, %xmm5
; X64-AVX1-NEXT:    vpaddd %xmm4, %xmm3, %xmm3
; X64-AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm3, %ymm3
; X64-AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
; X64-AVX1-NEXT:    vpaddd %xmm4, %xmm5, %xmm5
; X64-AVX1-NEXT:    vpaddd %xmm4, %xmm2, %xmm2
; X64-AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm2, %ymm2
; X64-AVX1-NEXT:    vandps %ymm4, %ymm2, %ymm2
; X64-AVX1-NEXT:    vandps %ymm4, %ymm3, %ymm3
; X64-AVX1-NEXT:    vmovdqu %xmm0, ha4(%rip)
; X64-AVX1-NEXT:    vmovups %ymm1, hb4(%rip)
; X64-AVX1-NEXT:    vmovups %ymm3, hc4+32(%rip)
; X64-AVX1-NEXT:    vmovups %ymm2, hc4(%rip)
; X64-AVX1-NEXT:    vzeroupper
; X64-AVX1-NEXT:    retq
;
; X64-AVX2-LABEL: fallback_broadcast_v4i32_v8i32_v16i32:
; X64-AVX2:       # %bb.0: # %entry
; X64-AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [1,2,3,4,1,2,3,4]
; X64-AVX2-NEXT:    # ymm4 = mem[0,1,0,1]
; X64-AVX2-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
; X64-AVX2-NEXT:    vpaddd %ymm4, %ymm1, %ymm1
; X64-AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
; X64-AVX2-NEXT:    vpaddd %ymm4, %ymm3, %ymm3
; X64-AVX2-NEXT:    vpaddd %ymm4, %ymm2, %ymm2
; X64-AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
; X64-AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
; X64-AVX2-NEXT:    vmovdqu %xmm0, ha4(%rip)
; X64-AVX2-NEXT:    vmovdqu %ymm1, hb4(%rip)
; X64-AVX2-NEXT:    vmovdqu %ymm3, hc4+32(%rip)
; X64-AVX2-NEXT:    vmovdqu %ymm2, hc4(%rip)
; X64-AVX2-NEXT:    vzeroupper
; X64-AVX2-NEXT:    retq
;
; X64-AVX512-LABEL: fallback_broadcast_v4i32_v8i32_v16i32:
; X64-AVX512:       # %bb.0: # %entry
; X64-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4]
; X64-AVX512-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
; X64-AVX512-NEXT:    vpaddd %ymm3, %ymm1, %ymm1
; X64-AVX512-NEXT:    vpand %ymm3, %ymm1, %ymm1
; X64-AVX512-NEXT:    vpaddd %zmm3, %zmm2, %zmm2
; X64-AVX512-NEXT:    vpandd %zmm3, %zmm2, %zmm2
; X64-AVX512-NEXT:    vmovdqu %xmm0, ha4(%rip)
; X64-AVX512-NEXT:    vmovdqu %ymm1, hb4(%rip)
; X64-AVX512-NEXT:    vmovdqu64 %zmm2, hc4(%rip)
; X64-AVX512-NEXT:    vzeroupper
; X64-AVX512-NEXT:    retq
entry:
  %0 = add <4 x i32> %a, <i32 1, i32 2, i32 3, i32 4>
  %1 = add <8 x i32> %b, <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
  %2 = and <8 x i32> %1, <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
  %3 = add <16 x i32> %c, <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
  %4 = and <16 x i32> %3, <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
  store <4 x i32> %0, ptr @ha4, align 8
  store <8 x i32> %2, ptr @hb4, align 8
  store <16 x i32> %4, ptr @hc4, align 8
  ret void
}

;
; Subvector Broadcast from register
;
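; With the source subvector already in a register there is no load to fold,
; so the splat lowers to subvector inserts and shuffles: vinsertf128 for
; 256-bit results, and vshuf{f,i}64x2 or vinsertf64x4 for 512-bit AVX512
; results. On plain AVX, splatting a ymm into a 512-bit result is just a
; register copy into the second result ymm.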
1127
1128define <4 x double> @reg_broadcast_2f64_4f64(<2 x double> %a0) nounwind {
1129; X86-LABEL: reg_broadcast_2f64_4f64:
1130; X86:       # %bb.0:
1131; X86-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1132; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1133; X86-NEXT:    retl
1134;
1135; X64-LABEL: reg_broadcast_2f64_4f64:
1136; X64:       # %bb.0:
1137; X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1138; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1139; X64-NEXT:    retq
1140 %1 = shufflevector <2 x double> %a0, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
1141 ret <4 x double> %1
1142}
1143
1144define <8 x double> @reg_broadcast_2f64_8f64(<2 x double> %a0) nounwind {
1145; X86-AVX-LABEL: reg_broadcast_2f64_8f64:
1146; X86-AVX:       # %bb.0:
1147; X86-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1148; X86-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1149; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
1150; X86-AVX-NEXT:    retl
1151;
1152; X86-AVX512-LABEL: reg_broadcast_2f64_8f64:
1153; X86-AVX512:       # %bb.0:
1154; X86-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1155; X86-AVX512-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
1156; X86-AVX512-NEXT:    retl
1157;
1158; X64-AVX-LABEL: reg_broadcast_2f64_8f64:
1159; X64-AVX:       # %bb.0:
1160; X64-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1161; X64-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1162; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
1163; X64-AVX-NEXT:    retq
1164;
1165; X64-AVX512-LABEL: reg_broadcast_2f64_8f64:
1166; X64-AVX512:       # %bb.0:
1167; X64-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1168; X64-AVX512-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
1169; X64-AVX512-NEXT:    retq
1170 %1 = shufflevector <2 x double> %a0, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
1171 ret <8 x double> %1
1172}
1173
1174define <8 x double> @reg_broadcast_4f64_8f64(<4 x double> %a0) nounwind {
1175; X86-AVX-LABEL: reg_broadcast_4f64_8f64:
1176; X86-AVX:       # %bb.0:
1177; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
1178; X86-AVX-NEXT:    retl
1179;
1180; X86-AVX512-LABEL: reg_broadcast_4f64_8f64:
1181; X86-AVX512:       # %bb.0:
1182; X86-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1183; X86-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1184; X86-AVX512-NEXT:    retl
1185;
1186; X64-AVX-LABEL: reg_broadcast_4f64_8f64:
1187; X64-AVX:       # %bb.0:
1188; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
1189; X64-AVX-NEXT:    retq
1190;
1191; X64-AVX512-LABEL: reg_broadcast_4f64_8f64:
1192; X64-AVX512:       # %bb.0:
1193; X64-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1194; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1195; X64-AVX512-NEXT:    retq
1196 %1 = shufflevector <4 x double> %a0, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
1197 ret <8 x double> %1
1198}
1199
1200define <4 x i64> @reg_broadcast_2i64_4i64(<2 x i64> %a0) nounwind {
1201; X86-LABEL: reg_broadcast_2i64_4i64:
1202; X86:       # %bb.0:
1203; X86-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1204; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1205; X86-NEXT:    retl
1206;
1207; X64-LABEL: reg_broadcast_2i64_4i64:
1208; X64:       # %bb.0:
1209; X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1210; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1211; X64-NEXT:    retq
1212 %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
1213 ret <4 x i64> %1
1214}
1215
1216define <8 x i64> @reg_broadcast_2i64_8i64(<2 x i64> %a0) nounwind {
1217; X86-AVX-LABEL: reg_broadcast_2i64_8i64:
1218; X86-AVX:       # %bb.0:
1219; X86-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1220; X86-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1221; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
1222; X86-AVX-NEXT:    retl
1223;
1224; X86-AVX512-LABEL: reg_broadcast_2i64_8i64:
1225; X86-AVX512:       # %bb.0:
1226; X86-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1227; X86-AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
1228; X86-AVX512-NEXT:    retl
1229;
1230; X64-AVX-LABEL: reg_broadcast_2i64_8i64:
1231; X64-AVX:       # %bb.0:
1232; X64-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1233; X64-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1234; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
1235; X64-AVX-NEXT:    retq
1236;
1237; X64-AVX512-LABEL: reg_broadcast_2i64_8i64:
1238; X64-AVX512:       # %bb.0:
1239; X64-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1240; X64-AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
1241; X64-AVX512-NEXT:    retq
1242 %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
1243 ret <8 x i64> %1
1244}
1245
1246define <8 x i64> @reg_broadcast_4i64_8i64(<4 x i64> %a0) nounwind {
1247; X86-AVX-LABEL: reg_broadcast_4i64_8i64:
1248; X86-AVX:       # %bb.0:
1249; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
1250; X86-AVX-NEXT:    retl
1251;
1252; X86-AVX512-LABEL: reg_broadcast_4i64_8i64:
1253; X86-AVX512:       # %bb.0:
1254; X86-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1255; X86-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1256; X86-AVX512-NEXT:    retl
1257;
1258; X64-AVX-LABEL: reg_broadcast_4i64_8i64:
1259; X64-AVX:       # %bb.0:
1260; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
1261; X64-AVX-NEXT:    retq
1262;
1263; X64-AVX512-LABEL: reg_broadcast_4i64_8i64:
1264; X64-AVX512:       # %bb.0:
1265; X64-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1266; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1267; X64-AVX512-NEXT:    retq
1268 %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
1269 ret <8 x i64> %1
1270}
1271
define <8 x float> @reg_broadcast_4f32_8f32(<4 x float> %a0) nounwind {
; X86-LABEL: reg_broadcast_4f32_8f32:
; X86:       # %bb.0:
; X86-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: reg_broadcast_4f32_8f32:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
 %1 = shufflevector <4 x float> %a0, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <8 x float> %1
}

define <16 x float> @reg_broadcast_4f32_16f32(<4 x float> %a0) nounwind {
; X86-AVX-LABEL: reg_broadcast_4f32_16f32:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: reg_broadcast_4f32_16f32:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; X86-AVX512-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_4f32_16f32:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: reg_broadcast_4f32_16f32:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; X64-AVX512-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
; X64-AVX512-NEXT:    retq
 %1 = shufflevector <4 x float> %a0, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <16 x float> %1
}

define <16 x float> @reg_broadcast_8f32_16f32(<8 x float> %a0) nounwind {
; X86-AVX-LABEL: reg_broadcast_8f32_16f32:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: reg_broadcast_8f32_16f32:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X86-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_8f32_16f32:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: reg_broadcast_8f32_16f32:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
 %1 = shufflevector <8 x float> %a0, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ret <16 x float> %1
}

define <8 x i32> @reg_broadcast_4i32_8i32(<4 x i32> %a0) nounwind {
; X86-LABEL: reg_broadcast_4i32_8i32:
; X86:       # %bb.0:
; X86-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: reg_broadcast_4i32_8i32:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
 %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <8 x i32> %1
}

define <16 x i32> @reg_broadcast_4i32_16i32(<4 x i32> %a0) nounwind {
; X86-AVX-LABEL: reg_broadcast_4i32_16i32:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: reg_broadcast_4i32_16i32:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; X86-AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_4i32_16i32:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: reg_broadcast_4i32_16i32:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; X64-AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
; X64-AVX512-NEXT:    retq
 %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <16 x i32> %1
}

define <16 x i32> @reg_broadcast_8i32_16i32(<8 x i32> %a0) nounwind {
; X86-AVX-LABEL: reg_broadcast_8i32_16i32:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: reg_broadcast_8i32_16i32:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X86-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_8i32_16i32:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: reg_broadcast_8i32_16i32:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
 %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ret <16 x i32> %1
}

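; Note: the only difference between the f32 and i32 zmm splats above is
; the shuffle domain (vshuff64x2 for floats, vshufi64x2 for integers);
; the 128-bit lane pattern [0,1,0,1,0,1,0,1] is identical.
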
define <16 x i16> @reg_broadcast_8i16_16i16(<8 x i16> %a0) nounwind {
; X86-LABEL: reg_broadcast_8i16_16i16:
; X86:       # %bb.0:
; X86-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: reg_broadcast_8i16_16i16:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
 %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ret <16 x i16> %1
}

define <32 x i16> @reg_broadcast_8i16_32i16(<8 x i16> %a0) nounwind {
; X86-AVX-LABEL: reg_broadcast_8i16_32i16:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: reg_broadcast_8i16_32i16:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; X86-AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_8i16_32i16:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: reg_broadcast_8i16_32i16:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; X64-AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
; X64-AVX512-NEXT:    retq
 %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ret <32 x i16> %1
}

define <32 x i16> @reg_broadcast_16i16_32i16(<16 x i16> %a0) nounwind {
; X86-AVX-LABEL: reg_broadcast_16i16_32i16:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: reg_broadcast_16i16_32i16:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X86-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_16i16_32i16:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: reg_broadcast_16i16_32i16:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
 %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ret <32 x i16> %1
}

define <32 x i8> @reg_broadcast_16i8_32i8(<16 x i8> %a0) nounwind {
; X86-LABEL: reg_broadcast_16i8_32i8:
; X86:       # %bb.0:
; X86-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: reg_broadcast_16i8_32i8:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
 %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ret <32 x i8> %1
}

define <64 x i8> @reg_broadcast_16i8_64i8(<16 x i8> %a0) nounwind {
; X86-AVX-LABEL: reg_broadcast_16i8_64i8:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: reg_broadcast_16i8_64i8:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; X86-AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_16i8_64i8:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: reg_broadcast_16i8_64i8:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; X64-AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
; X64-AVX512-NEXT:    retq
 %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ret <64 x i8> %1
}

define <64 x i8> @reg_broadcast_32i8_64i8(<32 x i8> %a0) nounwind {
; X86-AVX-LABEL: reg_broadcast_32i8_64i8:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: reg_broadcast_32i8_64i8:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X86-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_32i8_64i8:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: reg_broadcast_32i8_64i8:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
 %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ret <64 x i8> %1
}

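; Note: the i16 and i8 splats lower exactly like the wider element types
; above; only the 128-bit subvector granularity matters here, not the
; element width.
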
;
; PR34394
;

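; The <2 x i32> subvector loads below should be recognized as 64-bit
; element broadcasts (vmovddup / vbroadcastsd) rather than 32-bit
; shuffles.
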
define <4 x i32> @test_2xi32_to_4xi32_mem(ptr %vp) {
; X86-LABEL: test_2xi32_to_4xi32_mem:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X86-NEXT:    retl
;
; X64-LABEL: test_2xi32_to_4xi32_mem:
; X64:       # %bb.0:
; X64-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X64-NEXT:    retq
  %vec = load <2 x i32>, ptr %vp
  %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x i32> %res
}

define <8 x i32> @test_2xi32_to_8xi32_mem(ptr %vp) {
; X86-LABEL: test_2xi32_to_8xi32_mem:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastsd (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_2xi32_to_8xi32_mem:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastsd (%rdi), %ymm0
; X64-NEXT:    retq
  %vec = load <2 x i32>, ptr %vp
  %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  ret <8 x i32> %res
}

define <16 x i32> @test_2xi32_to_16xi32_mem(ptr %vp) {
; X86-AVX-LABEL: test_2xi32_to_16xi32_mem:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastsd (%eax), %ymm0
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_2xi32_to_16xi32_mem:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcastsd (%eax), %zmm0
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_2xi32_to_16xi32_mem:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastsd (%rdi), %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_2xi32_to_16xi32_mem:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastsd (%rdi), %zmm0
; X64-AVX512-NEXT:    retq
  %vec = load <2 x i32>, ptr %vp
  %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  ret <16 x i32> %res
}

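; Note: on AVX512VL the <16 x i32> case above broadcasts straight into
; zmm0; pre-AVX512 targets splat into ymm0 and copy it to ymm1 instead.
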
;
; PR34041
;

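; These shuffle masks mix undef and defined lanes; the defined lanes
; still repeat, so lowering should still manage with a single broadcast
; (plus a blend or unpack where another operand stays live).
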
define <4 x double> @broadcast_v4f64_f64_u000(ptr %p) {
; X86-LABEL: broadcast_v4f64_f64_u000:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastsd (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: broadcast_v4f64_f64_u000:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastsd (%rdi), %ymm0
; X64-NEXT:    retq
  %s = load double, ptr %p
  %vec = insertelement <2 x double> undef, double %s, i32 0
  %res = shufflevector <2 x double> %vec, <2 x double> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
  ret <4 x double> %res
}

define <4 x double> @broadcast_v4f64_v2f64_4u61(ptr %vp, <4 x double> %default) {
; X86-LABEL: broadcast_v4f64_v2f64_4u61:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vinsertf128 $1, (%eax), %ymm0, %ymm1
; X86-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; X86-NEXT:    retl
;
; X64-LABEL: broadcast_v4f64_v2f64_4u61:
; X64:       # %bb.0:
; X64-NEXT:    vinsertf128 $1, (%rdi), %ymm0, %ymm1
; X64-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; X64-NEXT:    retq
  %vec = load <2 x double>, ptr %vp
  %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <4 x i32> <i32 0, i32 3, i32 undef, i32 1>
  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x double> %shuf, <4 x double> %default
  ret <4 x double> %res
}

define <8 x float> @broadcast_v8f32_v2f32_u1uu0uEu(ptr %vp, <8 x float> %default) {
; X86-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastsd (%eax), %ymm1
; X86-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
; X86-NEXT:    retl
;
; X64-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastsd (%rdi), %ymm1
; X64-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
; X64-NEXT:    retq
  %vec = load <2 x float>, ptr %vp
  %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <8 x i32> <i32 undef, i32 1, i32 undef, i32 undef, i32 0, i32 2, i32 3, i32 undef>
  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1>, <8 x float> %shuf, <8 x float> %default
  ret <8 x float> %res
}

define <8 x double> @broadcast_v8f64_v2f64_u1u10101(ptr %vp) {
; X86-AVX-LABEL: broadcast_v8f64_v2f64_u1u10101:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: broadcast_v8f64_v2f64_u1u10101:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: broadcast_v8f64_v2f64_u1u10101:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: broadcast_v8f64_v2f64_u1u10101:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
  %vec = load <2 x double>, ptr %vp
  %res = shufflevector <2 x double> %vec, <2 x double> undef, <8 x i32> <i32 3, i32 1, i32 undef, i32 1, i32 0, i32 1, i32 0, i32 1>
  ret <8 x double> %res
}

define <8 x double> @broadcast_v8f64_v2f64_0uuu0101(ptr %vp) {
; X86-AVX-LABEL: broadcast_v8f64_v2f64_0uuu0101:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: broadcast_v8f64_v2f64_0uuu0101:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: broadcast_v8f64_v2f64_0uuu0101:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: broadcast_v8f64_v2f64_0uuu0101:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
  %vec = load <2 x double>, ptr %vp
  %res = shufflevector <2 x double> %vec, <2 x double> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 0, i32 1>
  ret <8 x double> %res
}

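; PR51226: the splatted zext+shl chain is materialized in an xmm
; register and widened with vinsertf128/vinserti128 ahead of the vminps,
; rather than being turned into a broadcast of the <4 x i16> load.
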
define void @PR51226() {
; X86-AVX1-LABEL: PR51226:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X86-AVX1-NEXT:    vpslld $16, %xmm0, %xmm0
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-AVX1-NEXT:    vminps %ymm1, %ymm0, %ymm0
; X86-AVX1-NEXT:    vmovups %ymm0, (%eax)
; X86-AVX1-NEXT:    vzeroupper
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: PR51226:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X86-AVX2-NEXT:    vpslld $16, %xmm0, %xmm0
; X86-AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; X86-AVX2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-AVX2-NEXT:    vminps %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT:    vmovups %ymm0, (%eax)
; X86-AVX2-NEXT:    vzeroupper
; X86-AVX2-NEXT:    retl
;
; X86-AVX512-LABEL: PR51226:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X86-AVX512-NEXT:    vpslld $16, %xmm0, %xmm0
; X86-AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; X86-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-AVX512-NEXT:    vminps %ymm1, %ymm0, %ymm0
; X86-AVX512-NEXT:    vmovups %ymm0, (%eax)
; X86-AVX512-NEXT:    vzeroupper
; X86-AVX512-NEXT:    retl
;
; X64-AVX1-LABEL: PR51226:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX1-NEXT:    vpslld $16, %xmm0, %xmm0
; X64-AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-AVX1-NEXT:    vminps %ymm1, %ymm0, %ymm0
; X64-AVX1-NEXT:    vmovups %ymm0, (%rax)
; X64-AVX1-NEXT:    vzeroupper
; X64-AVX1-NEXT:    retq
;
; X64-AVX2-LABEL: PR51226:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX2-NEXT:    vpslld $16, %xmm0, %xmm0
; X64-AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-AVX2-NEXT:    vminps %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT:    vmovups %ymm0, (%rax)
; X64-AVX2-NEXT:    vzeroupper
; X64-AVX2-NEXT:    retq
;
; X64-AVX512-LABEL: PR51226:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX512-NEXT:    vpslld $16, %xmm0, %xmm0
; X64-AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-AVX512-NEXT:    vminps %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT:    vmovups %ymm0, (%rax)
; X64-AVX512-NEXT:    vzeroupper
; X64-AVX512-NEXT:    retq
  %i = load <4 x i16>, ptr undef, align 8
  %i1 = zext <4 x i16> %i to <4 x i32>
  %i2 = shl nuw <4 x i32> %i1, <i32 16, i32 16, i32 16, i32 16>
  %i3 = bitcast <4 x i32> %i2 to <4 x float>
  %shuffle99 = shufflevector <4 x float> %i3, <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  %i4 = fcmp reassoc nsz contract ogt <8 x float> zeroinitializer, %shuffle99
  %i5 = select <8 x i1> %i4, <8 x float> %shuffle99, <8 x float> zeroinitializer
  store <8 x float> %i5, ptr undef, align 16
  ret void
}