; xref: /llvm-project/llvm/test/Transforms/InstCombine/X86/x86-vpermil.ll (revision d57c04647e6f0a6f0cd79e280c257f570e8f30f4)
1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2; RUN: opt < %s -passes=instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
3target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
4
5; Verify that instcombine is able to fold identity shuffles.
6
; Mask <0,1,2,3> selects each element from its own position, so the permute
; is an identity and instcombine folds the intrinsic away, returning %v.
7define <4 x float> @identity_test_vpermilvar_ps(<4 x float> %v) {
8; CHECK-LABEL: @identity_test_vpermilvar_ps(
9; CHECK-NEXT:    ret <4 x float> [[V:%.*]]
10;
11  %a = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %v, <4 x i32> <i32 0, i32 1, i32 2, i32 3>)
12  ret <4 x float> %a
13}
14
; Only the low 2 bits of each index are used (see the bits_test_* cases below),
; so <0..7> is per-128-bit-lane <0,1,2,3> - an identity; the call folds to %v.
15define <8 x float> @identity_test_vpermilvar_ps_256(<8 x float> %v) {
16; CHECK-LABEL: @identity_test_vpermilvar_ps_256(
17; CHECK-NEXT:    ret <8 x float> [[V:%.*]]
18;
19  %a = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %v, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>)
20  ret <8 x float> %a
21}
22
; <0,1,2,3> repeated per 128-bit lane is the identity for the 512-bit variant;
; instcombine removes the intrinsic entirely.
23define <16 x float> @identity_test_vpermilvar_ps_512(<16 x float> %v) {
24; CHECK-LABEL: @identity_test_vpermilvar_ps_512(
25; CHECK-NEXT:    ret <16 x float> [[V:%.*]]
26;
27  %a = tail call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %v, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>)
28  ret <16 x float> %a
29}
30
; vpermilvar.pd reads its index from bit 1 of each i64 (see the negative bits
; test below), so <0,2> encodes indices <0,1> - identity; folds to %v.
31define <2 x double> @identity_test_vpermilvar_pd(<2 x double> %v) {
32; CHECK-LABEL: @identity_test_vpermilvar_pd(
33; CHECK-NEXT:    ret <2 x double> [[V:%.*]]
34;
35  %a = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %v, <2 x i64> <i64 0, i64 2>)
36  ret <2 x double> %a
37}
38
; <0,2,0,2> (bit 1 holds the index) is per-128-bit-lane <0,1> - identity.
39define <4 x double> @identity_test_vpermilvar_pd_256(<4 x double> %v) {
40; CHECK-LABEL: @identity_test_vpermilvar_pd_256(
41; CHECK-NEXT:    ret <4 x double> [[V:%.*]]
42;
43  %a = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %v, <4 x i64> <i64 0, i64 2, i64 0, i64 2>)
44  ret <4 x double> %a
45}
46
; Same identity pattern (<0,2> per lane, index in bit 1) over four 128-bit lanes.
47define <8 x double> @identity_test_vpermilvar_pd_512(<8 x double> %v) {
48; CHECK-LABEL: @identity_test_vpermilvar_pd_512(
49; CHECK-NEXT:    ret <8 x double> [[V:%.*]]
50;
51  %a = tail call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %v, <8 x i64> <i64 0, i64 2, i64 0, i64 2, i64 0, i64 2, i64 0, i64 2>)
52  ret <8 x double> %a
53}
54
55; Instcombine should be able to fold the following byte shuffle to a builtin shufflevector
56; with a shuffle mask of all zeroes.
57
; An all-zero mask splats element 0; instcombine folds the intrinsic to a
; shufflevector with a zeroinitializer mask.
58define <4 x float> @zero_test_vpermilvar_ps_zero(<4 x float> %v) {
59; CHECK-LABEL: @zero_test_vpermilvar_ps_zero(
60; CHECK-NEXT:    [[A:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> poison, <4 x i32> zeroinitializer
61; CHECK-NEXT:    ret <4 x float> [[A]]
62;
63  %a = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %v, <4 x i32> zeroinitializer)
64  ret <4 x float> %a
65}
66
; The permute is per-128-bit lane, so a zero mask splats element 0 of EACH lane:
; result indices <0,0,0,0,4,4,4,4>, not a full-vector splat.
67define <8 x float> @zero_test_vpermilvar_ps_256_zero(<8 x float> %v) {
68; CHECK-LABEL: @zero_test_vpermilvar_ps_256_zero(
69; CHECK-NEXT:    [[A:%.*]] = shufflevector <8 x float> [[V:%.*]], <8 x float> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
70; CHECK-NEXT:    ret <8 x float> [[A]]
71;
72  %a = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %v, <8 x i32> zeroinitializer)
73  ret <8 x float> %a
74}
75
; Per-lane splat of element 0 across all four 128-bit lanes: <0*4,4*4,8*4,12*4>.
76define <16 x float> @zero_test_vpermilvar_ps_512_zero(<16 x float> %v) {
77; CHECK-LABEL: @zero_test_vpermilvar_ps_512_zero(
78; CHECK-NEXT:    [[A:%.*]] = shufflevector <16 x float> [[V:%.*]], <16 x float> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12>
79; CHECK-NEXT:    ret <16 x float> [[A]]
80;
81  %a = tail call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %v, <16 x i32> zeroinitializer)
82  ret <16 x float> %a
83}
84
; Zero mask (index bit 1 clear in both elements) splats element 0.
85define <2 x double> @zero_test_vpermilvar_pd_zero(<2 x double> %v) {
86; CHECK-LABEL: @zero_test_vpermilvar_pd_zero(
87; CHECK-NEXT:    [[A:%.*]] = shufflevector <2 x double> [[V:%.*]], <2 x double> poison, <2 x i32> zeroinitializer
88; CHECK-NEXT:    ret <2 x double> [[A]]
89;
90  %a = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %v, <2 x i64> zeroinitializer)
91  ret <2 x double> %a
92}
93
; Zero mask splats element 0 of each 128-bit lane: result indices <0,0,2,2>.
94define <4 x double> @zero_test_vpermilvar_pd_256_zero(<4 x double> %v) {
95; CHECK-LABEL: @zero_test_vpermilvar_pd_256_zero(
96; CHECK-NEXT:    [[A:%.*]] = shufflevector <4 x double> [[V:%.*]], <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
97; CHECK-NEXT:    ret <4 x double> [[A]]
98;
99  %a = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %v, <4 x i64> zeroinitializer)
100  ret <4 x double> %a
101}
102
; Same per-lane element-0 splat across four lanes: <0,0,2,2,4,4,6,6>.
103define <8 x double> @zero_test_vpermilvar_pd_512_zero(<8 x double> %v) {
104; CHECK-LABEL: @zero_test_vpermilvar_pd_512_zero(
105; CHECK-NEXT:    [[A:%.*]] = shufflevector <8 x double> [[V:%.*]], <8 x double> poison, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
106; CHECK-NEXT:    ret <8 x double> [[A]]
107;
108  %a = tail call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %v, <8 x i64> zeroinitializer)
109  ret <8 x double> %a
110}
111
112; Verify that instcombine is able to fold constant shuffles.
113
; A fully-constant mask folds the intrinsic to a plain shufflevector with the
; same reversal pattern.
114define <4 x float> @test_vpermilvar_ps(<4 x float> %v) {
115; CHECK-LABEL: @test_vpermilvar_ps(
116; CHECK-NEXT:    [[A:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
117; CHECK-NEXT:    ret <4 x float> [[A]]
118;
119  %a = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %v, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
120  ret <4 x float> %a
121}
122
; Indices apply within each 128-bit lane (low 2 bits), so mask <7..0> becomes
; the per-lane reversal <3,2,1,0,7,6,5,4> in the folded shuffle.
123define <8 x float> @test_vpermilvar_ps_256(<8 x float> %v) {
124; CHECK-LABEL: @test_vpermilvar_ps_256(
125; CHECK-NEXT:    [[A:%.*]] = shufflevector <8 x float> [[V:%.*]], <8 x float> poison, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
126; CHECK-NEXT:    ret <8 x float> [[A]]
127;
128  %a = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %v, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>)
129  ret <8 x float> %a
130}
131
; Mask <15..0>: only the low 2 bits count, giving a reversal within each of
; the four 128-bit lanes in the folded shuffle.
132define <16 x float> @test_vpermilvar_ps_512(<16 x float> %v) {
133; CHECK-LABEL: @test_vpermilvar_ps_512(
134; CHECK-NEXT:    [[A:%.*]] = shufflevector <16 x float> [[V:%.*]], <16 x float> poison, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
135; CHECK-NEXT:    ret <16 x float> [[A]]
136;
137  %a = tail call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %v, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>)
138  ret <16 x float> %a
139}
140
; pd mask <2,0>: the index lives in bit 1, so this decodes to indices <1,0>.
141define <2 x double> @test_vpermilvar_pd(<2 x double> %v) {
142; CHECK-LABEL: @test_vpermilvar_pd(
143; CHECK-NEXT:    [[A:%.*]] = shufflevector <2 x double> [[V:%.*]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
144; CHECK-NEXT:    ret <2 x double> [[A]]
145;
146  %a = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %v, <2 x i64> <i64 2, i64 0>)
147  ret <2 x double> %a
148}
149
; Mask <3,1,2,0>: bit 1 of each element gives per-lane indices <1,0,1,0>,
; i.e. a swap within each 128-bit lane -> shuffle <1,0,3,2>.
150define <4 x double> @test_vpermilvar_pd_256(<4 x double> %v) {
151; CHECK-LABEL: @test_vpermilvar_pd_256(
152; CHECK-NEXT:    [[A:%.*]] = shufflevector <4 x double> [[V:%.*]], <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
153; CHECK-NEXT:    ret <4 x double> [[A]]
154;
155  %a = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %v, <4 x i64> <i64 3, i64 1, i64 2, i64 0>)
156  ret <4 x double> %a
157}
158
; Same bit-1 decoding over four lanes: a swap inside every 128-bit lane.
159define <8 x double> @test_vpermilvar_pd_512(<8 x double> %v) {
160; CHECK-LABEL: @test_vpermilvar_pd_512(
161; CHECK-NEXT:    [[A:%.*]] = shufflevector <8 x double> [[V:%.*]], <8 x double> poison, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
162; CHECK-NEXT:    ret <8 x double> [[A]]
163;
164  %a = tail call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %v, <8 x i64> <i64 3, i64 1, i64 2, i64 0, i64 7, i64 5, i64 6, i64 4>)
165  ret <8 x double> %a
166}
167
168; Verify that instcombine is able to fold constant shuffles with undef mask elements.
169
; undef mask elements translate to poison lanes in the folded shufflevector;
; the defined elements fold as usual.
170define <4 x float> @undef_test_vpermilvar_ps(<4 x float> %v) {
171; CHECK-LABEL: @undef_test_vpermilvar_ps(
172; CHECK-NEXT:    [[A:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 2, i32 1, i32 poison>
173; CHECK-NEXT:    ret <4 x float> [[A]]
174;
175  %a = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %v, <4 x i32> <i32 undef, i32 2, i32 1, i32 undef>)
176  ret <4 x float> %a
177}
178
; undef mask elements become poison lanes; defined indices fold per-lane
; (low 2 bits within each 128-bit lane).
179define <8 x float> @undef_test_vpermilvar_ps_256(<8 x float> %v) {
180; CHECK-LABEL: @undef_test_vpermilvar_ps_256(
181; CHECK-NEXT:    [[A:%.*]] = shufflevector <8 x float> [[V:%.*]], <8 x float> poison, <8 x i32> <i32 poison, i32 2, i32 1, i32 poison, i32 7, i32 6, i32 5, i32 4>
182; CHECK-NEXT:    ret <8 x float> [[A]]
183;
184  %a = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %v, <8 x i32> <i32 undef, i32 6, i32 5, i32 undef, i32 3, i32 2, i32 1, i32 0>)
185  ret <8 x float> %a
186}
187
; Same as the 256-bit case, repeated over four 128-bit lanes: undef indices
; fold to poison, defined indices to per-lane positions.
188define <16 x float> @undef_test_vpermilvar_ps_512(<16 x float> %v) {
189; CHECK-LABEL: @undef_test_vpermilvar_ps_512(
190; CHECK-NEXT:    [[A:%.*]] = shufflevector <16 x float> [[V:%.*]], <16 x float> poison, <16 x i32> <i32 poison, i32 2, i32 1, i32 poison, i32 7, i32 6, i32 5, i32 4, i32 poison, i32 10, i32 9, i32 poison, i32 15, i32 14, i32 13, i32 12>
191; CHECK-NEXT:    ret <16 x float> [[A]]
192;
193  %a = tail call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %v, <16 x i32> <i32 undef, i32 6, i32 5, i32 undef, i32 3, i32 2, i32 1, i32 0, i32 undef, i32 6, i32 5, i32 undef, i32 3, i32 2, i32 1, i32 0>)
194  ret <16 x float> %a
195}
196
; undef pd index -> poison lane; <i64 0> (bit 1 clear) selects element 0.
197define <2 x double> @undef_test_vpermilvar_pd(<2 x double> %v) {
198; CHECK-LABEL: @undef_test_vpermilvar_pd(
199; CHECK-NEXT:    [[A:%.*]] = shufflevector <2 x double> [[V:%.*]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
200; CHECK-NEXT:    ret <2 x double> [[A]]
201;
202  %a = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %v, <2 x i64> <i64 undef, i64 0>)
203  ret <2 x double> %a
204}
205
; Mask <undef,1,2,undef>: bit 1 of 1 is 0 (lane 0 -> index 0) and of 2 is 1
; (lane 1 -> index 3); undef elements fold to poison.
206define <4 x double> @undef_test_vpermilvar_pd_256(<4 x double> %v) {
207; CHECK-LABEL: @undef_test_vpermilvar_pd_256(
208; CHECK-NEXT:    [[A:%.*]] = shufflevector <4 x double> [[V:%.*]], <4 x double> poison, <4 x i32> <i32 poison, i32 0, i32 3, i32 poison>
209; CHECK-NEXT:    ret <4 x double> [[A]]
210;
211  %a = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %v, <4 x i64> <i64 undef, i64 1, i64 2, i64 undef>)
212  ret <4 x double> %a
213}
214
; Same pattern as the 256-bit case, duplicated over all four 128-bit lanes.
215define <8 x double> @undef_test_vpermilvar_pd_512(<8 x double> %v) {
216; CHECK-LABEL: @undef_test_vpermilvar_pd_512(
217; CHECK-NEXT:    [[A:%.*]] = shufflevector <8 x double> [[V:%.*]], <8 x double> poison, <8 x i32> <i32 poison, i32 0, i32 3, i32 poison, i32 poison, i32 4, i32 7, i32 poison>
218; CHECK-NEXT:    ret <8 x double> [[A]]
219;
220  %a = tail call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %v, <8 x i64> <i64 undef, i64 1, i64 2, i64 undef, i64 undef, i64 1, i64 2, i64 undef>)
221  ret <8 x double> %a
222}
223
224; Simplify demanded bits (PR106413)
225
; The intrinsic only demands the low 2 bits of each i32 index; OR-ing in bits
; above bit 1 (12, 0xFFFFFFFC, -4 all clear bits 0-1) is dead, so the `or`
; is removed and the call uses %InMask directly.
226define <4 x float> @bits_test_vpermilvar_ps(<4 x float> %InVec, <4 x i32> %InMask) {
227; CHECK-LABEL: @bits_test_vpermilvar_ps(
228; CHECK-NEXT:    [[S:%.*]] = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> [[INVEC:%.*]], <4 x i32> [[INMASK:%.*]])
229; CHECK-NEXT:    ret <4 x float> [[S]]
230;
231  %m = or <4 x i32> %InMask, <i32 0, i32 12, i32 4294967292, i32 -4>
232  %s = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %InVec, <4 x i32> %m)
233  ret <4 x float> %s
234}
235
; Same demanded-bits simplification as the 128-bit case: OR constants touch
; only bits above the 2 demanded index bits, so the `or` folds away.
236define <8 x float> @bits_test_vpermilvar_ps_256(<8 x float> %InVec, <8 x i32> %InMask) {
237; CHECK-LABEL: @bits_test_vpermilvar_ps_256(
238; CHECK-NEXT:    [[S:%.*]] = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> [[INVEC:%.*]], <8 x i32> [[INMASK:%.*]])
239; CHECK-NEXT:    ret <8 x float> [[S]]
240;
241  %m = or <8 x i32> %InMask, <i32 0, i32 12, i32 4294967292, i32 -4, i32 0, i32 12, i32 4294967292, i32 -4>
242  %s = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %InVec, <8 x i32> %m)
243  ret <8 x float> %s
244}
245
; 512-bit variant of the demanded-bits test: bits 2+ of each index are not
; demanded, so the `or` disappears.
246define <16 x float> @bits_test_vpermilvar_ps_512(<16 x float> %InVec, <16 x i32> %InMask) {
247; CHECK-LABEL: @bits_test_vpermilvar_ps_512(
248; CHECK-NEXT:    [[S:%.*]] = tail call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[INVEC:%.*]], <16 x i32> [[INMASK:%.*]])
249; CHECK-NEXT:    ret <16 x float> [[S]]
250;
251  %m = or <16 x i32> %InMask, <i32 0, i32 12, i32 4294967292, i32 -4, i32 0, i32 12, i32 4294967292, i32 -4, i32 0, i32 12, i32 4294967292, i32 -4, i32 0, i32 12, i32 4294967292, i32 -4>
252  %s = tail call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %InVec, <16 x i32> %m)
253  ret <16 x float> %s
254}
255
; pd only demands bit 1 of each i64; 0xFFFFFFFD has bit 1 clear, so the `or`
; cannot change the selected index and is removed.
256define <2 x double> @bits_test_vpermilvar_pd(<2 x double> %InVec, <2 x i64> %InMask) {
257; CHECK-LABEL: @bits_test_vpermilvar_pd(
258; CHECK-NEXT:    [[S:%.*]] = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> [[INVEC:%.*]], <2 x i64> [[INMASK:%.*]])
259; CHECK-NEXT:    ret <2 x double> [[S]]
260;
261  %m = or <2 x i64> %InMask, <i64 0, i64 4294967293>
262  %s = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %InVec, <2 x i64> %m)
263  ret <2 x double> %s
264}
265
; All OR constants (0, 1, 0xFFFFFFFD, -3) leave bit 1 - the only demanded
; index bit - clear, so the `or` folds away.
266define <4 x double> @bits_test_vpermilvar_pd_256(<4 x double> %InVec, <4 x i64> %InMask) {
267; CHECK-LABEL: @bits_test_vpermilvar_pd_256(
268; CHECK-NEXT:    [[S:%.*]] = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> [[INVEC:%.*]], <4 x i64> [[INMASK:%.*]])
269; CHECK-NEXT:    ret <4 x double> [[S]]
270;
271  %m = or <4 x i64> %InMask, <i64 0, i64 1, i64 4294967293, i64 -3>
272  %s = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %InVec, <4 x i64> %m)
273  ret <4 x double> %s
274}
275
; 512-bit variant: same non-bit-1 OR constants, same demanded-bits removal.
276define <8 x double> @bits_test_vpermilvar_pd_512(<8 x double> %InVec, <8 x i64> %InMask) {
277; CHECK-LABEL: @bits_test_vpermilvar_pd_512(
278; CHECK-NEXT:    [[S:%.*]] = tail call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[INVEC:%.*]], <8 x i64> [[INMASK:%.*]])
279; CHECK-NEXT:    ret <8 x double> [[S]]
280;
281  %m = or <8 x i64> %InMask, <i64 0, i64 1, i64 4294967293, i64 -3, i64 0, i64 1, i64 4294967293, i64 -3>
282  %s = tail call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %InVec, <8 x i64> %m)
283  ret <8 x double> %s
284}
285
286; negative test - vpermilpd uses bit1 not bit0 for the index bit
; OR-ing <0,2> sets bit 1 - exactly the demanded pd index bit - so the `or`
; must NOT be eliminated; both instructions survive.
287define <2 x double> @bits_test_vpermilvar_pd_negative(<2 x double> %InVec, <2 x i64> %InMask) {
288; CHECK-LABEL: @bits_test_vpermilvar_pd_negative(
289; CHECK-NEXT:    [[M:%.*]] = or <2 x i64> [[INMASK:%.*]], <i64 0, i64 2>
290; CHECK-NEXT:    [[S:%.*]] = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> [[INVEC:%.*]], <2 x i64> [[M]])
291; CHECK-NEXT:    ret <2 x double> [[S]]
292;
293  %m = or <2 x i64> %InMask, <i64 0, i64 2>
294  %s = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %InVec, <2 x i64> %m)
295  ret <2 x double> %s
296}
297
298; Simplify demanded elts
299
; Result lane 3 is undef, so mask lane 3 (the variable %a1 insert) is not
; demanded; the remaining constant mask <0,1,2,_> lets the whole chain fold
; to a single shufflevector.
300define <4 x float> @elts_test_vpermilvar_ps(<4 x float> %a0, i32 %a1) {
301; CHECK-LABEL: @elts_test_vpermilvar_ps(
302; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A0:%.*]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
303; CHECK-NEXT:    ret <4 x float> [[TMP1]]
304;
305  %1 = insertelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %a1, i32 3
306  %2 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> %1)
307  %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
308  ret <4 x float> %3
309}
310
; Only odd result lanes are demanded, and the interleave places the constant
; mask half <0,1,2,3,3,2,1,0> in those lanes; the variable %a1 half is dead,
; so everything folds to one shufflevector.
311define <8 x float> @elts_test_vpermilvar_ps_256(<8 x float> %a0, <8 x i32> %a1) {
312; CHECK-LABEL: @elts_test_vpermilvar_ps_256(
313; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> <i32 poison, i32 0, i32 poison, i32 1, i32 poison, i32 6, i32 poison, i32 7>
314; CHECK-NEXT:    ret <8 x float> [[TMP1]]
315;
316  %1 = shufflevector <8 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 3, i32 2, i32 1, i32 0>, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
317  %2 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> %1)
318  %3 = shufflevector <8 x float> %2, <8 x float> undef, <8 x i32> <i32 undef, i32 1, i32 undef, i32 3, i32 undef, i32 5, i32 undef, i32 7>
319  ret <8 x float> %3
320}
321
; Only result lane 0 is undemanded, so just the insertelement of %a2 into mask
; lane 0 is dropped; the mask stays variable (%a1) and the call remains.
322define <16 x float> @elts_test_vpermilvar_ps_512(<16 x float> %a0, <16 x i32> %a1, i32 %a2) {
323; CHECK-LABEL: @elts_test_vpermilvar_ps_512(
324; CHECK-NEXT:    [[TMP1:%.*]] = tail call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[A0:%.*]], <16 x i32> [[A1:%.*]])
325; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x float> [[TMP1]], <16 x float> poison, <16 x i32> <i32 poison, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
326; CHECK-NEXT:    ret <16 x float> [[TMP2]]
327;
328  %1 = insertelement <16 x i32> %a1, i32 %a2, i32 0
329  %2 = tail call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %a0, <16 x i32> %1)
330  %3 = shufflevector <16 x float> %2, <16 x float> undef, <16 x i32> <i32 undef, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
331  ret <16 x float> %3
332}
333
; Result lane 1 is undef, so the %a1 insert into mask lane 1 is not demanded;
; the remaining constant index 0 folds to a shufflevector.
334define <2 x double> @elts_test_vpermilvar_pd(<2 x double> %a0, i64 %a1) {
335; CHECK-LABEL: @elts_test_vpermilvar_pd(
336; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[A0:%.*]], <2 x double> poison, <2 x i32> <i32 0, i32 poison>
337; CHECK-NEXT:    ret <2 x double> [[TMP1]]
338;
339  %1 = insertelement <2 x i64> <i64 0, i64 2>, i64 %a1, i32 1
340  %2 = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> %1)
341  %3 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
342  ret <2 x double> %3
343}
344
; Mask lanes 0-2 come from constants <2,0,2> (bit-1 indices <1,0,1>); only
; lane 3 is variable and it feeds an undef result lane, so the whole chain
; folds to shuffle <1,0,3,poison>.
345define <4 x double> @elts_test_vpermilvar_pd_256(<4 x double> %a0, <4 x i64> %a1) {
346; CHECK-LABEL: @elts_test_vpermilvar_pd_256(
347; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 poison>
348; CHECK-NEXT:    ret <4 x double> [[TMP1]]
349;
350  %1 = shufflevector <4 x i64> <i64 0, i64 2, i64 0, i64 2>, <4 x i64> %a1, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
351  %2 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> %1)
352  %3 = shufflevector <4 x double> %2, <4 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
353  ret <4 x double> %3
354}
355
; The splat shuffle demands only result lane 0, so only mask lane 0 (%a2) is
; demanded: %a1 is replaced by an insert of %a2 into poison, and the call and
; splat remain.
356define <8 x double> @elts_test_vpermilvar_pd_512(<8 x double> %a0, <8 x i64> %a1, i64 %a2) {
357; CHECK-LABEL: @elts_test_vpermilvar_pd_512(
358; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i64> poison, i64 [[A2:%.*]], i64 0
359; CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[A0:%.*]], <8 x i64> [[TMP1]])
360; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x double> [[TMP2]], <8 x double> poison, <8 x i32> zeroinitializer
361; CHECK-NEXT:    ret <8 x double> [[TMP3]]
362;
363  %1 = insertelement <8 x i64> %a1, i64 %a2, i32 0
364  %2 = tail call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %a0, <8 x i64> %1)
365  %3 = shufflevector <8 x double> %2, <8 x double> undef, <8 x i32> zeroinitializer
366  ret <8 x double> %3
367}
368
369declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>)
370declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i64>)
371declare <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double>, <8 x i64>)
372
373declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>)
374declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>)
375declare <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float>, <16 x i32>)
376