xref: /llvm-project/llvm/test/CodeGen/X86/perm.avx512-false-deps.ll (revision 2f448bf509432c1a19ec46ab8cbc7353c03c6280)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -verify-machineinstrs -mcpu=sapphirerapids -mattr=+false-deps-perm -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefixes=ENABLE
3; RUN: llc -verify-machineinstrs -mcpu=sapphirerapids -mattr=-false-deps-perm -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefixes=DISABLE
4
; Immediate-form 64-bit-element permute, 256-bit, register source: a
; constant-mask shufflevector <1,2,1,0> of %a0 whose result is added to %a0.
; Expected output: the vpermq writes ymm1; with +false-deps-perm a vxorps
; zeroing xmm1 is emitted immediately before it, with -false-deps-perm the
; zeroing instruction is absent.
; The "nop" inline asm has an unused "=x" result and lists xmm2-xmm31 and
; flags as clobbers (presumably to restrict which vector registers are
; free at that point -- confirm with the test author).
5define <4 x i64> @permq_ri_256(<4 x i64> %a0) {
6; ENABLE-LABEL: permq_ri_256:
7; ENABLE:       # %bb.0:
8; ENABLE-NEXT:    #APP
9; ENABLE-NEXT:    nop
10; ENABLE-NEXT:    #NO_APP
11; ENABLE-NEXT:    vxorps %xmm1, %xmm1, %xmm1
12; ENABLE-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[1,2,1,0]
13; ENABLE-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
14; ENABLE-NEXT:    retq
15;
16; DISABLE-LABEL: permq_ri_256:
17; DISABLE:       # %bb.0:
18; DISABLE-NEXT:    #APP
19; DISABLE-NEXT:    nop
20; DISABLE-NEXT:    #NO_APP
21; DISABLE-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[1,2,1,0]
22; DISABLE-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
23; DISABLE-NEXT:    retq
24  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
25  %2 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 1, i32 2, i32 1, i32 0>
26  %res = add <4 x i64> %2, %a0
27  ret <4 x i64> %res
28}
29
; Variable-index 64-bit-element permute, 256-bit, register operands: calls
; llvm.x86.avx512.permvar.di.256(%a0, %idx) and adds the result to
; (%a0 + %idx), so both inputs stay live past the permute.
; Expected output: ymm1 (%idx) is spilled before the asm and reloaded into
; ymm2 afterwards; the vpermq writes ymm1. With +false-deps-perm a vxorps
; zeroing xmm1 precedes the vpermq; with -false-deps-perm it is absent.
; The "nop" inline asm has an unused "=x" result and lists xmm2-xmm31 and
; flags as clobbers.
30define <4 x i64> @permq_rr_256(<4 x i64> %a0, <4 x i64> %idx) {
31; ENABLE-LABEL: permq_rr_256:
32; ENABLE:       # %bb.0:
33; ENABLE-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
34; ENABLE-NEXT:    #APP
35; ENABLE-NEXT:    nop
36; ENABLE-NEXT:    #NO_APP
37; ENABLE-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
38; ENABLE-NEXT:    vxorps %xmm1, %xmm1, %xmm1
39; ENABLE-NEXT:    vpermq %ymm0, %ymm2, %ymm1
40; ENABLE-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
41; ENABLE-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
42; ENABLE-NEXT:    retq
43;
44; DISABLE-LABEL: permq_rr_256:
45; DISABLE:       # %bb.0:
46; DISABLE-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
47; DISABLE-NEXT:    #APP
48; DISABLE-NEXT:    nop
49; DISABLE-NEXT:    #NO_APP
50; DISABLE-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
51; DISABLE-NEXT:    vpermq %ymm0, %ymm2, %ymm1
52; DISABLE-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
53; DISABLE-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
54; DISABLE-NEXT:    retq
55  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
56  %2 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> %idx)
57  %t = add <4 x i64> %a0, %idx
58  %res = add <4 x i64> %t, %2
59  ret <4 x i64> %res
60}
61
; Variable-index 64-bit-element permute, 256-bit, memory data operand: the
; data vector is loaded from %p0 and passed to
; llvm.x86.avx512.permvar.di.256 with %idx; the result is added to %idx.
; Expected output: the load is folded into the permute as
; "vpermq (%rdi), %ymm0, %ymm1". With +false-deps-perm a vxorps zeroing
; xmm1 precedes it; with -false-deps-perm it is absent.
; The "nop" inline asm has an unused "=x" result and lists xmm2-xmm31 and
; flags as clobbers.
62define <4 x i64> @permq_rm_256(ptr %p0, <4 x i64> %idx) {
63; ENABLE-LABEL: permq_rm_256:
64; ENABLE:       # %bb.0:
65; ENABLE-NEXT:    #APP
66; ENABLE-NEXT:    nop
67; ENABLE-NEXT:    #NO_APP
68; ENABLE-NEXT:    vxorps %xmm1, %xmm1, %xmm1
69; ENABLE-NEXT:    vpermq (%rdi), %ymm0, %ymm1
70; ENABLE-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
71; ENABLE-NEXT:    retq
72;
73; DISABLE-LABEL: permq_rm_256:
74; DISABLE:       # %bb.0:
75; DISABLE-NEXT:    #APP
76; DISABLE-NEXT:    nop
77; DISABLE-NEXT:    #NO_APP
78; DISABLE-NEXT:    vpermq (%rdi), %ymm0, %ymm1
79; DISABLE-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
80; DISABLE-NEXT:    retq
81  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
82  %a0 = load <4 x i64>, ptr %p0, align 64
83  %2 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> %idx)
84  %res = add <4 x i64> %idx, %2
85  ret <4 x i64> %res
86}
87
; Immediate-form permute with a memory source, 256-bit: loads <4 x i64>
; from %p0 and returns a constant-mask shufflevector <3,2,2,0> of it.
; Expected output: the load is folded into the shuffle; note that the
; checks expect the vpermpd mnemonic for this integer shuffle. With
; +false-deps-perm a vxorps zeroing xmm0 precedes it; with
; -false-deps-perm it is absent.
; The "nop" inline asm has an unused "=x" result and lists xmm1-xmm31 and
; flags as clobbers.
88define <4 x i64> @permq_mi_256(ptr %p0) {
89; ENABLE-LABEL: permq_mi_256:
90; ENABLE:       # %bb.0:
91; ENABLE-NEXT:    #APP
92; ENABLE-NEXT:    nop
93; ENABLE-NEXT:    #NO_APP
94; ENABLE-NEXT:    vxorps %xmm0, %xmm0, %xmm0
95; ENABLE-NEXT:    vpermpd {{.*#+}} ymm0 = mem[3,2,2,0]
96; ENABLE-NEXT:    retq
97;
98; DISABLE-LABEL: permq_mi_256:
99; DISABLE:       # %bb.0:
100; DISABLE-NEXT:    #APP
101; DISABLE-NEXT:    nop
102; DISABLE-NEXT:    #NO_APP
103; DISABLE-NEXT:    vpermpd {{.*#+}} ymm0 = mem[3,2,2,0]
104; DISABLE-NEXT:    retq
105  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
106  %a0 = load <4 x i64>, ptr %p0, align 64
107  %2 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 0>
108  ret <4 x i64> %2
109}
110
; Variable-index 64-bit-element permute, 256-bit, broadcast memory operand:
; a scalar i64 loaded from %p0 is splatted (insertelement + all-zero
; shuffle mask) and passed to llvm.x86.avx512.permvar.di.256 with %idx; the
; result is added to %idx.
; Expected output: ymm0 (%idx) is spilled before the asm and reloaded into
; ymm1; the splat is folded as "vpermq (%rdi){1to4}, %ymm1, %ymm0". With
; +false-deps-perm a vxorps zeroing xmm0 precedes the vpermq; with
; -false-deps-perm it is absent.
; The "nop" inline asm has an unused "=x" result and lists xmm1-xmm31 and
; flags as clobbers.
111define <4 x i64> @permq_broadcast_256(ptr %p0, <4 x i64> %idx) {
112; ENABLE-LABEL: permq_broadcast_256:
113; ENABLE:       # %bb.0:
114; ENABLE-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
115; ENABLE-NEXT:    #APP
116; ENABLE-NEXT:    nop
117; ENABLE-NEXT:    #NO_APP
118; ENABLE-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
119; ENABLE-NEXT:    vxorps %xmm0, %xmm0, %xmm0
120; ENABLE-NEXT:    vpermq (%rdi){1to4}, %ymm1, %ymm0
121; ENABLE-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
122; ENABLE-NEXT:    retq
123;
124; DISABLE-LABEL: permq_broadcast_256:
125; DISABLE:       # %bb.0:
126; DISABLE-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
127; DISABLE-NEXT:    #APP
128; DISABLE-NEXT:    nop
129; DISABLE-NEXT:    #NO_APP
130; DISABLE-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
131; DISABLE-NEXT:    vpermq (%rdi){1to4}, %ymm1, %ymm0
132; DISABLE-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
133; DISABLE-NEXT:    retq
134  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
135  %v0 = load i64, ptr %p0, align 4
136  %t0 = insertelement <4 x i64> undef, i64 %v0, i64 0
137  %a0 = shufflevector <4 x i64> %t0, <4 x i64> undef, <4 x i32> zeroinitializer
138  %2 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> %idx)
139  %res = add <4 x i64> %2, %idx
140  ret <4 x i64> %res
141}
142
; Masked 64-bit-element permute, 256-bit: calls
; llvm.x86.avx512.mask.permvar.di.256 with a zeroinitializer pass-through
; and an i8 mask loaded from %mask, then adds the result to (%a0 + %idx).
; Expected output: an unmasked vpermq into ymm2, the mask loaded with kmovb
; into k1, and the final vpaddq predicated on k1. With +false-deps-perm a
; vxorps zeroing xmm2 precedes the vpermq; with -false-deps-perm it is
; absent.
; The "nop" inline asm has an unused "=x" result and lists xmm3-xmm31 and
; flags as clobbers.
143define <4 x i64> @permq_maskz_256(<4 x i64> %a0, <4 x i64> %idx, ptr %mask) {
144; ENABLE-LABEL: permq_maskz_256:
145; ENABLE:       # %bb.0:
146; ENABLE-NEXT:    #APP
147; ENABLE-NEXT:    nop
148; ENABLE-NEXT:    #NO_APP
149; ENABLE-NEXT:    vxorps %xmm2, %xmm2, %xmm2
150; ENABLE-NEXT:    vpermq %ymm0, %ymm1, %ymm2
151; ENABLE-NEXT:    kmovb (%rdi), %k1
152; ENABLE-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
153; ENABLE-NEXT:    vpaddq %ymm2, %ymm0, %ymm0 {%k1}
154; ENABLE-NEXT:    retq
155;
156; DISABLE-LABEL: permq_maskz_256:
157; DISABLE:       # %bb.0:
158; DISABLE-NEXT:    #APP
159; DISABLE-NEXT:    nop
160; DISABLE-NEXT:    #NO_APP
161; DISABLE-NEXT:    vpermq %ymm0, %ymm1, %ymm2
162; DISABLE-NEXT:    kmovb (%rdi), %k1
163; DISABLE-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
164; DISABLE-NEXT:    vpaddq %ymm2, %ymm0, %ymm0 {%k1}
165; DISABLE-NEXT:    retq
166  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
167  %2 = load i8, ptr %mask
168  %3 = call <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64> %a0, <4 x i64> %idx, <4 x i64> zeroinitializer, i8 %2)
169  %t = add <4 x i64> %a0, %idx
170  %res = add <4 x i64> %3, %t
171  ret <4 x i64> %res
172}
173
174declare <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64>, <4 x i64>)
175declare <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
176
; Variable-index 64-bit-element permute, 512-bit, register operands: calls
; llvm.x86.avx512.permvar.di.512(%a0, %idx) and adds the result to
; (%a0 + %idx), so both inputs stay live past the permute.
; Expected output: zmm1 (%idx) is spilled before the asm and reloaded into
; zmm2 afterwards; the vpermq writes zmm1. With +false-deps-perm a vpxor
; zeroing xmm1 precedes the vpermq; with -false-deps-perm it is absent.
; The "nop" inline asm has an unused "=x" result (typed <8 x i64> here) and
; lists xmm2-xmm31 and flags as clobbers.
177define <8 x i64> @permq_rr_512(<8 x i64> %a0, <8 x i64> %idx) {
178; ENABLE-LABEL: permq_rr_512:
179; ENABLE:       # %bb.0:
180; ENABLE-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
181; ENABLE-NEXT:    #APP
182; ENABLE-NEXT:    nop
183; ENABLE-NEXT:    #NO_APP
184; ENABLE-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
185; ENABLE-NEXT:    vpxor %xmm1, %xmm1, %xmm1
186; ENABLE-NEXT:    vpermq %zmm0, %zmm2, %zmm1
187; ENABLE-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
188; ENABLE-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
189; ENABLE-NEXT:    retq
190;
191; DISABLE-LABEL: permq_rr_512:
192; DISABLE:       # %bb.0:
193; DISABLE-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
194; DISABLE-NEXT:    #APP
195; DISABLE-NEXT:    nop
196; DISABLE-NEXT:    #NO_APP
197; DISABLE-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
198; DISABLE-NEXT:    vpermq %zmm0, %zmm2, %zmm1
199; DISABLE-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
200; DISABLE-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
201; DISABLE-NEXT:    retq
202  %1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
203  %2 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> %idx)
204  %t = add <8 x i64> %a0, %idx
205  %res = add <8 x i64> %t, %2
206  ret <8 x i64> %res
207}
208
; Variable-index 64-bit-element permute, 512-bit, memory data operand: the
; data vector is loaded from %p0 and passed to
; llvm.x86.avx512.permvar.di.512 with %idx; the result is added to %idx.
; Expected output: the load is folded into the permute as
; "vpermq (%rdi), %zmm0, %zmm1". With +false-deps-perm a vpxor zeroing
; xmm1 precedes it; with -false-deps-perm it is absent.
; The "nop" inline asm has an unused "=x" result (typed <8 x i64> here) and
; lists xmm2-xmm31 and flags as clobbers.
209define <8 x i64> @permq_rm_512(ptr %p0, <8 x i64> %idx) {
210; ENABLE-LABEL: permq_rm_512:
211; ENABLE:       # %bb.0:
212; ENABLE-NEXT:    #APP
213; ENABLE-NEXT:    nop
214; ENABLE-NEXT:    #NO_APP
215; ENABLE-NEXT:    vpxor %xmm1, %xmm1, %xmm1
216; ENABLE-NEXT:    vpermq (%rdi), %zmm0, %zmm1
217; ENABLE-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
218; ENABLE-NEXT:    retq
219;
220; DISABLE-LABEL: permq_rm_512:
221; DISABLE:       # %bb.0:
222; DISABLE-NEXT:    #APP
223; DISABLE-NEXT:    nop
224; DISABLE-NEXT:    #NO_APP
225; DISABLE-NEXT:    vpermq (%rdi), %zmm0, %zmm1
226; DISABLE-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
227; DISABLE-NEXT:    retq
228  %1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
229  %a0 = load <8 x i64>, ptr %p0, align 64
230  %2 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> %idx)
231  %res = add <8 x i64> %idx, %2
232  ret <8 x i64> %res
233}
234
; Variable-index 64-bit-element permute, 512-bit, broadcast memory operand:
; a scalar i64 loaded from %p0 is splatted (insertelement + all-zero
; shuffle mask) and passed to llvm.x86.avx512.permvar.di.512 with %idx; the
; result is added to %idx.
; Expected output: zmm0 (%idx) is spilled before the asm and reloaded into
; zmm1; the splat is folded as "vpermq (%rdi){1to8}, %zmm1, %zmm0". With
; +false-deps-perm a vpxor zeroing xmm0 precedes the vpermq; with
; -false-deps-perm it is absent.
; The "nop" inline asm has an unused "=x" result and lists xmm1-xmm31 and
; flags as clobbers.
235define <8 x i64> @permq_broadcast_512(ptr %p0, <8 x i64> %idx) {
236; ENABLE-LABEL: permq_broadcast_512:
237; ENABLE:       # %bb.0:
238; ENABLE-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
239; ENABLE-NEXT:    #APP
240; ENABLE-NEXT:    nop
241; ENABLE-NEXT:    #NO_APP
242; ENABLE-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
243; ENABLE-NEXT:    vpxor %xmm0, %xmm0, %xmm0
244; ENABLE-NEXT:    vpermq (%rdi){1to8}, %zmm1, %zmm0
245; ENABLE-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
246; ENABLE-NEXT:    retq
247;
248; DISABLE-LABEL: permq_broadcast_512:
249; DISABLE:       # %bb.0:
250; DISABLE-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
251; DISABLE-NEXT:    #APP
252; DISABLE-NEXT:    nop
253; DISABLE-NEXT:    #NO_APP
254; DISABLE-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
255; DISABLE-NEXT:    vpermq (%rdi){1to8}, %zmm1, %zmm0
256; DISABLE-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
257; DISABLE-NEXT:    retq
258  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
259  %v0 = load i64, ptr %p0, align 4
260  %t0 = insertelement <8 x i64> undef, i64 %v0, i64 0
261  %a0 = shufflevector <8 x i64> %t0, <8 x i64> undef, <8 x i32> zeroinitializer
262  %2 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> %idx)
263  %res = add <8 x i64> %2, %idx
264  ret <8 x i64> %res
265}
266
; Masked 64-bit-element permute, 512-bit: calls
; llvm.x86.avx512.mask.permvar.di.512 with a zeroinitializer pass-through
; and an i8 mask loaded from %mask, then adds the result to (%a0 + %idx).
; Expected output: an unmasked vpermq into zmm2, the mask loaded with kmovb
; into k1, and the final vpaddq predicated on k1. With +false-deps-perm a
; vpxor zeroing xmm2 precedes the vpermq; with -false-deps-perm it is
; absent.
; The "nop" inline asm has an unused "=x" result and lists xmm3-xmm31 and
; flags as clobbers.
267define <8 x i64> @permq_maskz_512(<8 x i64> %a0, <8 x i64> %idx, ptr %mask) {
268; ENABLE-LABEL: permq_maskz_512:
269; ENABLE:       # %bb.0:
270; ENABLE-NEXT:    #APP
271; ENABLE-NEXT:    nop
272; ENABLE-NEXT:    #NO_APP
273; ENABLE-NEXT:    vpxor %xmm2, %xmm2, %xmm2
274; ENABLE-NEXT:    vpermq %zmm0, %zmm1, %zmm2
275; ENABLE-NEXT:    kmovb (%rdi), %k1
276; ENABLE-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
277; ENABLE-NEXT:    vpaddq %zmm2, %zmm0, %zmm0 {%k1}
278; ENABLE-NEXT:    retq
279;
280; DISABLE-LABEL: permq_maskz_512:
281; DISABLE:       # %bb.0:
282; DISABLE-NEXT:    #APP
283; DISABLE-NEXT:    nop
284; DISABLE-NEXT:    #NO_APP
285; DISABLE-NEXT:    vpermq %zmm0, %zmm1, %zmm2
286; DISABLE-NEXT:    kmovb (%rdi), %k1
287; DISABLE-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
288; DISABLE-NEXT:    vpaddq %zmm2, %zmm0, %zmm0 {%k1}
289; DISABLE-NEXT:    retq
290  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
291  %2 = load i8, ptr %mask
292  %3 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %a0, <8 x i64> %idx, <8 x i64> zeroinitializer, i8 %2)
293  %t = add <8 x i64> %a0, %idx
294  %res = add <8 x i64> %3, %t
295  ret <8 x i64> %res
296}
297
298declare <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64>, <8 x i64>)
299declare <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
300
; Variable-index 32-bit-element permute, 256-bit, register operands: calls
; llvm.x86.avx512.mask.permvar.si.256(%a0, %idx) with an undef pass-through
; and an all-ones mask (i8 -1), then adds the result to (%a0 + %idx).
; Expected output: ymm1 (%idx) is spilled before the asm and reloaded into
; ymm2 afterwards; the vpermd writes ymm1. With +false-deps-perm a vxorps
; zeroing xmm1 precedes the vpermd; with -false-deps-perm it is absent.
; The "nop" inline asm has an unused "=x" result (typed <8 x i64> here) and
; lists xmm2-xmm31 and flags as clobbers.
301define <8 x i32> @permd_rr_256(<8 x i32> %a0, <8 x i32> %idx) {
302; ENABLE-LABEL: permd_rr_256:
303; ENABLE:       # %bb.0:
304; ENABLE-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
305; ENABLE-NEXT:    #APP
306; ENABLE-NEXT:    nop
307; ENABLE-NEXT:    #NO_APP
308; ENABLE-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
309; ENABLE-NEXT:    vxorps %xmm1, %xmm1, %xmm1
310; ENABLE-NEXT:    vpermd %ymm0, %ymm2, %ymm1
311; ENABLE-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
312; ENABLE-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
313; ENABLE-NEXT:    retq
314;
315; DISABLE-LABEL: permd_rr_256:
316; DISABLE:       # %bb.0:
317; DISABLE-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
318; DISABLE-NEXT:    #APP
319; DISABLE-NEXT:    nop
320; DISABLE-NEXT:    #NO_APP
321; DISABLE-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
322; DISABLE-NEXT:    vpermd %ymm0, %ymm2, %ymm1
323; DISABLE-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
324; DISABLE-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
325; DISABLE-NEXT:    retq
326  %1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
327  %2 = call <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32> %a0, <8 x i32> %idx, <8 x i32> undef, i8 -1)
328  %t = add <8 x i32> %a0, %idx
329  %res = add <8 x i32> %t, %2
330  ret <8 x i32> %res
331}
332
; Variable-index 32-bit-element permute, 256-bit, memory data operand: the
; data vector is loaded from %p0 and passed to
; llvm.x86.avx512.mask.permvar.si.256 with %idx, an undef pass-through and
; an all-ones mask (i8 -1); the result is added to %idx.
; Expected output: the load is folded into the permute as
; "vpermd (%rdi), %ymm0, %ymm1". With +false-deps-perm a vxorps zeroing
; xmm1 precedes it; with -false-deps-perm it is absent.
; The "nop" inline asm has an unused "=x" result (typed <8 x i64> here) and
; lists xmm2-xmm31 and flags as clobbers.
333define <8 x i32> @permd_rm_256(ptr %p0, <8 x i32> %idx) {
334; ENABLE-LABEL: permd_rm_256:
335; ENABLE:       # %bb.0:
336; ENABLE-NEXT:    #APP
337; ENABLE-NEXT:    nop
338; ENABLE-NEXT:    #NO_APP
339; ENABLE-NEXT:    vxorps %xmm1, %xmm1, %xmm1
340; ENABLE-NEXT:    vpermd (%rdi), %ymm0, %ymm1
341; ENABLE-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
342; ENABLE-NEXT:    retq
343;
344; DISABLE-LABEL: permd_rm_256:
345; DISABLE:       # %bb.0:
346; DISABLE-NEXT:    #APP
347; DISABLE-NEXT:    nop
348; DISABLE-NEXT:    #NO_APP
349; DISABLE-NEXT:    vpermd (%rdi), %ymm0, %ymm1
350; DISABLE-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
351; DISABLE-NEXT:    retq
352  %1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
353  %a0 = load <8 x i32>, ptr %p0, align 64
354  %2 = call <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32> %a0, <8 x i32> %idx, <8 x i32> undef, i8 -1)
355  %res = add <8 x i32> %idx, %2
356  ret <8 x i32> %res
357}
358
; Variable-index 32-bit-element permute, 256-bit, broadcast memory operand:
; a scalar i32 loaded from %p0 is splatted (insertelement + all-zero
; shuffle mask) and passed to llvm.x86.avx512.mask.permvar.si.256 with
; %idx, a zeroinitializer pass-through and an all-ones mask (i8 -1); the
; result is added to %idx.
; Expected output: ymm0 (%idx) is spilled before the asm and reloaded into
; ymm1; the splat is folded as "vpermd (%rdi){1to8}, %ymm1, %ymm0". With
; +false-deps-perm a vxorps zeroing xmm0 precedes the vpermd; with
; -false-deps-perm it is absent.
; The "nop" inline asm has an unused "=x" result and lists xmm1-xmm31 and
; flags as clobbers.
359define <8 x i32> @permd_broadcast_256(ptr %p0, <8 x i32> %idx) {
360; ENABLE-LABEL: permd_broadcast_256:
361; ENABLE:       # %bb.0:
362; ENABLE-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
363; ENABLE-NEXT:    #APP
364; ENABLE-NEXT:    nop
365; ENABLE-NEXT:    #NO_APP
366; ENABLE-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
367; ENABLE-NEXT:    vxorps %xmm0, %xmm0, %xmm0
368; ENABLE-NEXT:    vpermd (%rdi){1to8}, %ymm1, %ymm0
369; ENABLE-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
370; ENABLE-NEXT:    retq
371;
372; DISABLE-LABEL: permd_broadcast_256:
373; DISABLE:       # %bb.0:
374; DISABLE-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
375; DISABLE-NEXT:    #APP
376; DISABLE-NEXT:    nop
377; DISABLE-NEXT:    #NO_APP
378; DISABLE-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
379; DISABLE-NEXT:    vpermd (%rdi){1to8}, %ymm1, %ymm0
380; DISABLE-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
381; DISABLE-NEXT:    retq
382  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
383  %v0 = load i32, ptr %p0, align 4
384  %t0 = insertelement <8 x i32> undef, i32 %v0, i32 0
385  %a0 = shufflevector <8 x i32> %t0, <8 x i32> undef, <8 x i32> zeroinitializer
386  %2 = call <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32> %a0, <8 x i32> %idx, <8 x i32> zeroinitializer, i8 -1)
387  %res = add <8 x i32> %2, %idx
388  ret <8 x i32> %res
389}
390
; Masked 32-bit-element permute, 256-bit: calls
; llvm.x86.avx512.mask.permvar.si.256 with a zeroinitializer pass-through
; and an i8 mask loaded from %mask, then adds the result to (%a0 + %idx).
; Expected output: an unmasked vpermd into ymm2, the mask loaded with kmovb
; into k1, and the final vpaddd predicated on k1. With +false-deps-perm a
; vxorps zeroing xmm2 precedes the vpermd; with -false-deps-perm it is
; absent.
; The "nop" inline asm has an unused "=x" result and lists xmm3-xmm31 and
; flags as clobbers.
391define <8 x i32> @permd_maskz_256(<8 x i32> %a0, <8 x i32> %idx, ptr %mask) {
392; ENABLE-LABEL: permd_maskz_256:
393; ENABLE:       # %bb.0:
394; ENABLE-NEXT:    #APP
395; ENABLE-NEXT:    nop
396; ENABLE-NEXT:    #NO_APP
397; ENABLE-NEXT:    vxorps %xmm2, %xmm2, %xmm2
398; ENABLE-NEXT:    vpermd %ymm0, %ymm1, %ymm2
399; ENABLE-NEXT:    kmovb (%rdi), %k1
400; ENABLE-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
401; ENABLE-NEXT:    vpaddd %ymm2, %ymm0, %ymm0 {%k1}
402; ENABLE-NEXT:    retq
403;
404; DISABLE-LABEL: permd_maskz_256:
405; DISABLE:       # %bb.0:
406; DISABLE-NEXT:    #APP
407; DISABLE-NEXT:    nop
408; DISABLE-NEXT:    #NO_APP
409; DISABLE-NEXT:    vpermd %ymm0, %ymm1, %ymm2
410; DISABLE-NEXT:    kmovb (%rdi), %k1
411; DISABLE-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
412; DISABLE-NEXT:    vpaddd %ymm2, %ymm0, %ymm0 {%k1}
413; DISABLE-NEXT:    retq
414  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
415  %2 = load i8, ptr %mask
416  %3 = call <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32> %a0, <8 x i32> %idx, <8 x i32> zeroinitializer, i8 %2)
417  %t = add <8 x i32> %a0, %idx
418  %res = add <8 x i32> %3, %t
419  ret <8 x i32> %res
420}
421
422declare <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
423
; Variable-index 32-bit-element permute, 512-bit, register operands: calls
; llvm.x86.avx512.mask.permvar.si.512(%a0, %idx) with an undef pass-through
; and an all-ones mask (i16 -1), then adds the result to (%a0 + %idx).
; Expected output: zmm1 (%idx) is spilled before the asm and reloaded into
; zmm2 afterwards; the vpermd writes zmm1. With +false-deps-perm a vpxor
; zeroing xmm1 precedes the vpermd; with -false-deps-perm it is absent.
; The "nop" inline asm has an unused "=x" result (typed <8 x i64> here) and
; lists xmm2-xmm31 and flags as clobbers.
424define <16 x i32> @permd_rr_512(<16 x i32> %a0, <16 x i32> %idx) {
425; ENABLE-LABEL: permd_rr_512:
426; ENABLE:       # %bb.0:
427; ENABLE-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
428; ENABLE-NEXT:    #APP
429; ENABLE-NEXT:    nop
430; ENABLE-NEXT:    #NO_APP
431; ENABLE-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
432; ENABLE-NEXT:    vpxor %xmm1, %xmm1, %xmm1
433; ENABLE-NEXT:    vpermd %zmm0, %zmm2, %zmm1
434; ENABLE-NEXT:    vpaddd %zmm2, %zmm0, %zmm0
435; ENABLE-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
436; ENABLE-NEXT:    retq
437;
438; DISABLE-LABEL: permd_rr_512:
439; DISABLE:       # %bb.0:
440; DISABLE-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
441; DISABLE-NEXT:    #APP
442; DISABLE-NEXT:    nop
443; DISABLE-NEXT:    #NO_APP
444; DISABLE-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
445; DISABLE-NEXT:    vpermd %zmm0, %zmm2, %zmm1
446; DISABLE-NEXT:    vpaddd %zmm2, %zmm0, %zmm0
447; DISABLE-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
448; DISABLE-NEXT:    retq
449  %1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
450  %2 = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %a0, <16 x i32> %idx, <16 x i32> undef, i16 -1)
451  %t = add <16 x i32> %a0, %idx
452  %res = add <16 x i32> %t, %2
453  ret <16 x i32> %res
454}
455
; Variable-index 32-bit-element permute, 512-bit, memory data operand: the
; data vector is loaded from %p0 and passed to
; llvm.x86.avx512.mask.permvar.si.512 with %idx, an undef pass-through and
; an all-ones mask (i16 -1); the result is added to %idx.
; Expected output: the load is folded into the permute as
; "vpermd (%rdi), %zmm0, %zmm1". With +false-deps-perm a vpxor zeroing
; xmm1 precedes it; with -false-deps-perm it is absent.
; The "nop" inline asm has an unused "=x" result (typed <8 x i64> here) and
; lists xmm2-xmm31 and flags as clobbers.
456define <16 x i32> @permd_rm_512(ptr %p0, <16 x i32> %idx) {
457; ENABLE-LABEL: permd_rm_512:
458; ENABLE:       # %bb.0:
459; ENABLE-NEXT:    #APP
460; ENABLE-NEXT:    nop
461; ENABLE-NEXT:    #NO_APP
462; ENABLE-NEXT:    vpxor %xmm1, %xmm1, %xmm1
463; ENABLE-NEXT:    vpermd (%rdi), %zmm0, %zmm1
464; ENABLE-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
465; ENABLE-NEXT:    retq
466;
467; DISABLE-LABEL: permd_rm_512:
468; DISABLE:       # %bb.0:
469; DISABLE-NEXT:    #APP
470; DISABLE-NEXT:    nop
471; DISABLE-NEXT:    #NO_APP
472; DISABLE-NEXT:    vpermd (%rdi), %zmm0, %zmm1
473; DISABLE-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
474; DISABLE-NEXT:    retq
475  %1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
476  %a0 = load <16 x i32>, ptr %p0, align 64
477  %2 = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %a0, <16 x i32> %idx, <16 x i32> undef, i16 -1)
478  %res = add <16 x i32> %idx, %2
479  ret <16 x i32> %res
480}
481
; Variable-index 32-bit-element permute, 512-bit, broadcast memory operand:
; a scalar i32 loaded from %p0 is splatted (insertelement + all-zero
; shuffle mask) and passed to llvm.x86.avx512.mask.permvar.si.512 with
; %idx, an undef pass-through and an all-ones mask (i16 -1); the result is
; added to %idx.
; Expected output: zmm0 (%idx) is spilled before the asm and reloaded into
; zmm1; the splat is folded as "vpermd (%rdi){1to16}, %zmm1, %zmm0". With
; +false-deps-perm a vpxor zeroing xmm0 precedes the vpermd; with
; -false-deps-perm it is absent.
; The "nop" inline asm has an unused "=x" result and lists xmm1-xmm31 and
; flags as clobbers.
482define <16 x i32> @permd_broadcast_512(ptr %p0, <16 x i32> %idx) {
483; ENABLE-LABEL: permd_broadcast_512:
484; ENABLE:       # %bb.0:
485; ENABLE-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
486; ENABLE-NEXT:    #APP
487; ENABLE-NEXT:    nop
488; ENABLE-NEXT:    #NO_APP
489; ENABLE-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
490; ENABLE-NEXT:    vpxor %xmm0, %xmm0, %xmm0
491; ENABLE-NEXT:    vpermd (%rdi){1to16}, %zmm1, %zmm0
492; ENABLE-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
493; ENABLE-NEXT:    retq
494;
495; DISABLE-LABEL: permd_broadcast_512:
496; DISABLE:       # %bb.0:
497; DISABLE-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
498; DISABLE-NEXT:    #APP
499; DISABLE-NEXT:    nop
500; DISABLE-NEXT:    #NO_APP
501; DISABLE-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
502; DISABLE-NEXT:    vpermd (%rdi){1to16}, %zmm1, %zmm0
503; DISABLE-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
504; DISABLE-NEXT:    retq
505  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
506  %v0 = load i32, ptr %p0, align 4
507  %t0 = insertelement <16 x i32> undef, i32 %v0, i32 0
508  %a0 = shufflevector <16 x i32> %t0, <16 x i32> undef, <16 x i32> zeroinitializer
509  %2 = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %a0, <16 x i32> %idx, <16 x i32> undef, i16 -1)
510  %res = add <16 x i32> %2, %idx
511  ret <16 x i32> %res
512}
513
; Masked 32-bit-element permute, 512-bit: calls
; llvm.x86.avx512.mask.permvar.si.512 with a zeroinitializer pass-through
; and an i16 mask loaded from %mask, then adds the result to (%a0 + %idx).
; Expected output: an unmasked vpermd into zmm2, the mask loaded with kmovw
; into k1, and the final vpaddd predicated on k1. With +false-deps-perm a
; vpxor zeroing xmm2 precedes the vpermd; with -false-deps-perm it is
; absent.
; The "nop" inline asm has an unused "=x" result and lists xmm3-xmm31 and
; flags as clobbers.
514define <16 x i32> @permd_maskz_512(<16 x i32> %a0, <16 x i32> %idx, ptr %mask) {
515; ENABLE-LABEL: permd_maskz_512:
516; ENABLE:       # %bb.0:
517; ENABLE-NEXT:    #APP
518; ENABLE-NEXT:    nop
519; ENABLE-NEXT:    #NO_APP
520; ENABLE-NEXT:    vpxor %xmm2, %xmm2, %xmm2
521; ENABLE-NEXT:    vpermd %zmm0, %zmm1, %zmm2
522; ENABLE-NEXT:    kmovw (%rdi), %k1
523; ENABLE-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
524; ENABLE-NEXT:    vpaddd %zmm2, %zmm0, %zmm0 {%k1}
525; ENABLE-NEXT:    retq
526;
527; DISABLE-LABEL: permd_maskz_512:
528; DISABLE:       # %bb.0:
529; DISABLE-NEXT:    #APP
530; DISABLE-NEXT:    nop
531; DISABLE-NEXT:    #NO_APP
532; DISABLE-NEXT:    vpermd %zmm0, %zmm1, %zmm2
533; DISABLE-NEXT:    kmovw (%rdi), %k1
534; DISABLE-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
535; DISABLE-NEXT:    vpaddd %zmm2, %zmm0, %zmm0 {%k1}
536; DISABLE-NEXT:    retq
537  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
538  %2 = load i16, ptr %mask
539  %3 = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %a0, <16 x i32> %idx, <16 x i32> zeroinitializer, i16 %2)
540  %t = add <16 x i32> %a0, %idx
541  %res = add <16 x i32> %3, %t
542  ret <16 x i32> %res
543}
544
545declare <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
546
; Immediate-form double-precision permute, 256-bit, register source: a
; constant-mask shufflevector <1,2,1,0> of %a0 whose result is fadd-ed to
; %a0.
; Expected output: the vpermpd writes ymm1; with +false-deps-perm a vxorps
; zeroing xmm1 is emitted immediately before it, with -false-deps-perm the
; zeroing instruction is absent.
; The "nop" inline asm has an unused "=x" result and lists xmm2-xmm31 and
; flags as clobbers.
547define <4 x double> @permpd_ri_256(<4 x double> %a0) {
548; ENABLE-LABEL: permpd_ri_256:
549; ENABLE:       # %bb.0:
550; ENABLE-NEXT:    #APP
551; ENABLE-NEXT:    nop
552; ENABLE-NEXT:    #NO_APP
553; ENABLE-NEXT:    vxorps %xmm1, %xmm1, %xmm1
554; ENABLE-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[1,2,1,0]
555; ENABLE-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
556; ENABLE-NEXT:    retq
557;
558; DISABLE-LABEL: permpd_ri_256:
559; DISABLE:       # %bb.0:
560; DISABLE-NEXT:    #APP
561; DISABLE-NEXT:    nop
562; DISABLE-NEXT:    #NO_APP
563; DISABLE-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[1,2,1,0]
564; DISABLE-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
565; DISABLE-NEXT:    retq
566  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
567  %2 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 1, i32 2, i32 1, i32 0>
568  %res = fadd <4 x double> %2, %a0
569  ret <4 x double> %res
570}
571
; Variable-index double-precision permute, 256-bit, register operands:
; calls llvm.x86.avx512.permvar.df.256 with %idx, then fadd-s the result to
; (data + sitofp(%idx)).
; NOTE(review): the data operand of both the permute and the fadd is %1,
; the <4 x double> "=x" result of the inline asm; the %a0 parameter is not
; referenced in the body. The expected output below matches this, but
; confirm it is intentional.
; Expected output: ymm1 (%idx) is spilled before the asm; afterwards ymm0
; is copied to ymm2 and %idx is reloaded into ymm0; the vpermpd writes
; ymm1 and vcvtqq2pd converts the indices. With +false-deps-perm a vxorps
; zeroing xmm1 precedes the vpermpd; with -false-deps-perm it is absent.
; The "nop" inline asm lists xmm1-xmm31 and flags as clobbers.
572define <4 x double> @permpd_rr_256(<4 x double> %a0, <4 x i64> %idx) {
573; ENABLE-LABEL: permpd_rr_256:
574; ENABLE:       # %bb.0:
575; ENABLE-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
576; ENABLE-NEXT:    #APP
577; ENABLE-NEXT:    nop
578; ENABLE-NEXT:    #NO_APP
579; ENABLE-NEXT:    vmovapd %ymm0, %ymm2
580; ENABLE-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
581; ENABLE-NEXT:    vxorps %xmm1, %xmm1, %xmm1
582; ENABLE-NEXT:    vpermpd %ymm2, %ymm0, %ymm1
583; ENABLE-NEXT:    vcvtqq2pd %ymm0, %ymm0
584; ENABLE-NEXT:    vaddpd %ymm0, %ymm2, %ymm0
585; ENABLE-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
586; ENABLE-NEXT:    retq
587;
588; DISABLE-LABEL: permpd_rr_256:
589; DISABLE:       # %bb.0:
590; DISABLE-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
591; DISABLE-NEXT:    #APP
592; DISABLE-NEXT:    nop
593; DISABLE-NEXT:    #NO_APP
594; DISABLE-NEXT:    vmovapd %ymm0, %ymm2
595; DISABLE-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
596; DISABLE-NEXT:    vpermpd %ymm2, %ymm0, %ymm1
597; DISABLE-NEXT:    vcvtqq2pd %ymm0, %ymm0
598; DISABLE-NEXT:    vaddpd %ymm0, %ymm2, %ymm0
599; DISABLE-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
600; DISABLE-NEXT:    retq
601  %1 = tail call <4 x double> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
602  %2 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %1, <4 x i64> %idx)
603  %a1 = sitofp <4 x i64> %idx to <4 x double>
604  %t = fadd <4 x double> %1, %a1
605  %res = fadd <4 x double> %2, %t
606  ret <4 x double> %res
607}
608
; Variable-index double-precision permute, 256-bit, memory data operand:
; the data vector is loaded from %p0 and passed to
; llvm.x86.avx512.permvar.df.256 with %idx; the result is fadd-ed to
; sitofp(%idx).
; Expected output: ymm0 (%idx) is spilled before the asm and reloaded into
; ymm1; the load is folded as "vpermpd (%rdi), %ymm1, %ymm0" and vcvtqq2pd
; converts the indices. With +false-deps-perm a vxorps zeroing xmm0
; precedes the vpermpd; with -false-deps-perm it is absent.
; The "nop" inline asm has an unused "=x" result and lists xmm1-xmm31 and
; flags as clobbers.
609define <4 x double> @permpd_rm_256(ptr %p0, <4 x i64> %idx) {
610; ENABLE-LABEL: permpd_rm_256:
611; ENABLE:       # %bb.0:
612; ENABLE-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
613; ENABLE-NEXT:    #APP
614; ENABLE-NEXT:    nop
615; ENABLE-NEXT:    #NO_APP
616; ENABLE-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
617; ENABLE-NEXT:    vxorps %xmm0, %xmm0, %xmm0
618; ENABLE-NEXT:    vpermpd (%rdi), %ymm1, %ymm0
619; ENABLE-NEXT:    vcvtqq2pd %ymm1, %ymm1
620; ENABLE-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
621; ENABLE-NEXT:    retq
622;
623; DISABLE-LABEL: permpd_rm_256:
624; DISABLE:       # %bb.0:
625; DISABLE-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
626; DISABLE-NEXT:    #APP
627; DISABLE-NEXT:    nop
628; DISABLE-NEXT:    #NO_APP
629; DISABLE-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
630; DISABLE-NEXT:    vpermpd (%rdi), %ymm1, %ymm0
631; DISABLE-NEXT:    vcvtqq2pd %ymm1, %ymm1
632; DISABLE-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
633; DISABLE-NEXT:    retq
634  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
635  %a0 = load <4 x double>, ptr %p0, align 64
636  %2 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> %idx)
637  %a1 = sitofp <4 x i64> %idx to <4 x double>
638  %res = fadd <4 x double> %2, %a1
639  ret <4 x double> %res
640}
641
; Immediate-form double-precision permute with a memory source, 256-bit:
; loads <4 x double> from %p0 and returns a constant-mask shufflevector
; <3,2,2,0> of it.
; Expected output: the load is folded into a vpermpd that writes ymm0. With
; +false-deps-perm a vxorps zeroing xmm0 precedes it; with
; -false-deps-perm it is absent.
; The "nop" inline asm has an unused "=x" result and lists xmm1-xmm31 and
; flags as clobbers.
642define <4 x double> @permpd_mi_256(ptr %p0) {
643; ENABLE-LABEL: permpd_mi_256:
644; ENABLE:       # %bb.0:
645; ENABLE-NEXT:    #APP
646; ENABLE-NEXT:    nop
647; ENABLE-NEXT:    #NO_APP
648; ENABLE-NEXT:    vxorps %xmm0, %xmm0, %xmm0
649; ENABLE-NEXT:    vpermpd {{.*#+}} ymm0 = mem[3,2,2,0]
650; ENABLE-NEXT:    retq
651;
652; DISABLE-LABEL: permpd_mi_256:
653; DISABLE:       # %bb.0:
654; DISABLE-NEXT:    #APP
655; DISABLE-NEXT:    nop
656; DISABLE-NEXT:    #NO_APP
657; DISABLE-NEXT:    vpermpd {{.*#+}} ymm0 = mem[3,2,2,0]
658; DISABLE-NEXT:    retq
659  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
660  %a0 = load <4 x double>, ptr %p0, align 64
661  %2 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 0>
662  ret <4 x double> %2
663}
664
; permpd_broadcast_256: 256-bit variable-index vpermpd whose data operand is a
; scalar double loaded from %p0 and splatted to all four lanes. The expected
; assembly uses the embedded-broadcast memory form, (%rdi){1to4}.
; %idx is also converted with sitofp and added to the permute result.
; With +false-deps-perm (prefix ENABLE) a vxorps zeroing of the destination
; register precedes the permute; with -false-deps-perm (prefix DISABLE) it is absent.
665define <4 x double> @permpd_broadcast_256(ptr %p0, <4 x i64> %idx) {
666; ENABLE-LABEL: permpd_broadcast_256:
667; ENABLE:       # %bb.0:
668; ENABLE-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
669; ENABLE-NEXT:    #APP
670; ENABLE-NEXT:    nop
671; ENABLE-NEXT:    #NO_APP
672; ENABLE-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
673; ENABLE-NEXT:    vxorps %xmm0, %xmm0, %xmm0
674; ENABLE-NEXT:    vpermpd (%rdi){1to4}, %ymm1, %ymm0
675; ENABLE-NEXT:    vcvtqq2pd %ymm1, %ymm1
676; ENABLE-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
677; ENABLE-NEXT:    retq
678;
679; DISABLE-LABEL: permpd_broadcast_256:
680; DISABLE:       # %bb.0:
681; DISABLE-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
682; DISABLE-NEXT:    #APP
683; DISABLE-NEXT:    nop
684; DISABLE-NEXT:    #NO_APP
685; DISABLE-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
686; DISABLE-NEXT:    vpermpd (%rdi){1to4}, %ymm1, %ymm0
687; DISABLE-NEXT:    vcvtqq2pd %ymm1, %ymm1
688; DISABLE-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
689; DISABLE-NEXT:    retq
; Inline-asm "nop" whose constraint string lists xmm1-xmm31 and flags as
; clobbered; its "=x" result %1 is not used by the rest of the function.
690  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
; Scalar load, insert into lane 0, then splat with an all-zero shuffle mask.
691  %v0 = load double, ptr %p0, align 4
692  %t0 = insertelement <4 x double> undef, double %v0, i64 0
693  %a0 = shufflevector <4 x double> %t0, <4 x double> undef, <4 x i32> zeroinitializer
694  %2 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> %idx)
695  %a1 = sitofp <4 x i64> %idx to <4 x double>
696  %res = fadd <4 x double> %2, %a1
697  ret <4 x double> %res
698}
699
; permpd_maskz_256: 256-bit variable-index vpermpd with zero-masking. The mask
; byte is loaded from %mask and passed to the masked intrinsic together with a
; zeroinitializer pass-through; the expected assembly loads it with kmovb and
; emits the permute with {%k1} {z}.
; With +false-deps-perm (prefix ENABLE) a vxorps zeroing of the destination
; register precedes the permute; with -false-deps-perm (prefix DISABLE) it is absent.
700define <4 x double> @permpd_maskz_256(<4 x double> %a0, <4 x i64> %idx, ptr %mask) {
701; ENABLE-LABEL: permpd_maskz_256:
702; ENABLE:       # %bb.0:
703; ENABLE-NEXT:    #APP
704; ENABLE-NEXT:    nop
705; ENABLE-NEXT:    #NO_APP
706; ENABLE-NEXT:    kmovb (%rdi), %k1
707; ENABLE-NEXT:    vxorps %xmm2, %xmm2, %xmm2
708; ENABLE-NEXT:    vpermpd %ymm0, %ymm1, %ymm2 {%k1} {z}
709; ENABLE-NEXT:    vcvtqq2pd %ymm1, %ymm1
710; ENABLE-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
711; ENABLE-NEXT:    vaddpd %ymm0, %ymm2, %ymm0
712; ENABLE-NEXT:    retq
713;
714; DISABLE-LABEL: permpd_maskz_256:
715; DISABLE:       # %bb.0:
716; DISABLE-NEXT:    #APP
717; DISABLE-NEXT:    nop
718; DISABLE-NEXT:    #NO_APP
719; DISABLE-NEXT:    kmovb (%rdi), %k1
720; DISABLE-NEXT:    vpermpd %ymm0, %ymm1, %ymm2 {%k1} {z}
721; DISABLE-NEXT:    vcvtqq2pd %ymm1, %ymm1
722; DISABLE-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
723; DISABLE-NEXT:    vaddpd %ymm0, %ymm2, %ymm0
724; DISABLE-NEXT:    retq
; Inline-asm "nop" whose constraint string lists xmm3-xmm31 and flags as
; clobbered; its "=x" result %1 is not used by the rest of the function.
725  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
726  %2 = load i8, ptr %mask
727  %3 = call <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double> %a0, <4 x i64> %idx, <4 x double> zeroinitializer, i8 %2)
; Both permute inputs (%a0 and %idx) are used again after the permute.
728  %a1 = sitofp <4 x i64> %idx to <4 x double>
729  %t = fadd <4 x double> %a0, %a1
730  %res = fadd <4 x double> %3, %t
731  ret <4 x double> %res
732}
733
; Intrinsics called by the 256-bit double-precision permute tests above:
; the unmasked form takes (data, index); the masked form additionally takes a
; pass-through vector and an i8 mask.
734declare <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double>, <4 x i64>)
735declare <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double>, <4 x i64>, <4 x double>, i8)
736
; permpd_rr_512: 512-bit variable-index vpermpd with both operands in registers.
; The permute's data operand is the inline-asm result %1; the %a0 argument is
; not referenced in the body. The expected assembly spills %idx (zmm1) before
; the asm and reloads it afterwards.
; With +false-deps-perm (prefix ENABLE) a vpxor zeroing of the destination
; register precedes the permute; with -false-deps-perm (prefix DISABLE) it is absent.
737define <8 x double> @permpd_rr_512(<8 x double> %a0, <8 x i64> %idx) {
738; ENABLE-LABEL: permpd_rr_512:
739; ENABLE:       # %bb.0:
740; ENABLE-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
741; ENABLE-NEXT:    #APP
742; ENABLE-NEXT:    nop
743; ENABLE-NEXT:    #NO_APP
744; ENABLE-NEXT:    vmovapd %zmm0, %zmm2
745; ENABLE-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
746; ENABLE-NEXT:    vpxor %xmm1, %xmm1, %xmm1
747; ENABLE-NEXT:    vpermpd %zmm2, %zmm0, %zmm1
748; ENABLE-NEXT:    vcvtqq2pd %zmm0, %zmm0
749; ENABLE-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
750; ENABLE-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
751; ENABLE-NEXT:    retq
752;
753; DISABLE-LABEL: permpd_rr_512:
754; DISABLE:       # %bb.0:
755; DISABLE-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
756; DISABLE-NEXT:    #APP
757; DISABLE-NEXT:    nop
758; DISABLE-NEXT:    #NO_APP
759; DISABLE-NEXT:    vmovapd %zmm0, %zmm2
760; DISABLE-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
761; DISABLE-NEXT:    vpermpd %zmm2, %zmm0, %zmm1
762; DISABLE-NEXT:    vcvtqq2pd %zmm0, %zmm0
763; DISABLE-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
764; DISABLE-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
765; DISABLE-NEXT:    retq
; Inline-asm "nop" whose constraint string lists xmm1-xmm31 and flags as
; clobbered; its "=x" result %1 is the vector that gets permuted below.
766  %1 = tail call <8 x double> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
767  %2 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %1, <8 x i64> %idx)
; Both permute inputs (%1 and %idx) are used again after the permute.
768  %a1 = sitofp <8 x i64> %idx to <8 x double>
769  %t = fadd <8 x double> %1, %a1
770  %res = fadd <8 x double> %2, %t
771  ret <8 x double> %res
772}
773
; permpd_rm_512: 512-bit variable-index vpermpd whose data operand is a full
; <8 x double> vector loaded from %p0. The expected assembly folds the load
; into the permute as the (%rdi) memory operand.
; %idx is also converted with sitofp and added to the permute result.
; With +false-deps-perm (prefix ENABLE) a vpxor zeroing of the destination
; register precedes the permute; with -false-deps-perm (prefix DISABLE) it is absent.
774define <8 x double> @permpd_rm_512(ptr %p0, <8 x i64> %idx) {
775; ENABLE-LABEL: permpd_rm_512:
776; ENABLE:       # %bb.0:
777; ENABLE-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
778; ENABLE-NEXT:    #APP
779; ENABLE-NEXT:    nop
780; ENABLE-NEXT:    #NO_APP
781; ENABLE-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
782; ENABLE-NEXT:    vpxor %xmm0, %xmm0, %xmm0
783; ENABLE-NEXT:    vpermpd (%rdi), %zmm1, %zmm0
784; ENABLE-NEXT:    vcvtqq2pd %zmm1, %zmm1
785; ENABLE-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
786; ENABLE-NEXT:    retq
787;
788; DISABLE-LABEL: permpd_rm_512:
789; DISABLE:       # %bb.0:
790; DISABLE-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
791; DISABLE-NEXT:    #APP
792; DISABLE-NEXT:    nop
793; DISABLE-NEXT:    #NO_APP
794; DISABLE-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
795; DISABLE-NEXT:    vpermpd (%rdi), %zmm1, %zmm0
796; DISABLE-NEXT:    vcvtqq2pd %zmm1, %zmm1
797; DISABLE-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
798; DISABLE-NEXT:    retq
; Inline-asm "nop" whose constraint string lists xmm1-xmm31 and flags as
; clobbered; its "=x" result %1 is not used by the rest of the function.
799  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
800  %a0 = load <8 x double>, ptr %p0, align 64
801  %2 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> %idx)
802  %a1 = sitofp <8 x i64> %idx to <8 x double>
803  %res = fadd <8 x double> %2, %a1
804  ret <8 x double> %res
805}
806
; permpd_broadcast_512: 512-bit variable-index vpermpd whose data operand is a
; scalar double loaded from %p0 and splatted to all eight lanes. The expected
; assembly uses the embedded-broadcast memory form, (%rdi){1to8}.
; %idx is also converted with sitofp and added to the permute result.
; With +false-deps-perm (prefix ENABLE) a vpxor zeroing of the destination
; register precedes the permute; with -false-deps-perm (prefix DISABLE) it is absent.
807define <8 x double> @permpd_broadcast_512(ptr %p0, <8 x i64> %idx) {
808; ENABLE-LABEL: permpd_broadcast_512:
809; ENABLE:       # %bb.0:
810; ENABLE-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
811; ENABLE-NEXT:    #APP
812; ENABLE-NEXT:    nop
813; ENABLE-NEXT:    #NO_APP
814; ENABLE-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
815; ENABLE-NEXT:    vpxor %xmm0, %xmm0, %xmm0
816; ENABLE-NEXT:    vpermpd (%rdi){1to8}, %zmm1, %zmm0
817; ENABLE-NEXT:    vcvtqq2pd %zmm1, %zmm1
818; ENABLE-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
819; ENABLE-NEXT:    retq
820;
821; DISABLE-LABEL: permpd_broadcast_512:
822; DISABLE:       # %bb.0:
823; DISABLE-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
824; DISABLE-NEXT:    #APP
825; DISABLE-NEXT:    nop
826; DISABLE-NEXT:    #NO_APP
827; DISABLE-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
828; DISABLE-NEXT:    vpermpd (%rdi){1to8}, %zmm1, %zmm0
829; DISABLE-NEXT:    vcvtqq2pd %zmm1, %zmm1
830; DISABLE-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
831; DISABLE-NEXT:    retq
; Inline-asm "nop" whose constraint string lists xmm1-xmm31 and flags as
; clobbered; its "=x" result %1 is not used by the rest of the function.
832  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
; Scalar load, insert into lane 0, then splat with an all-zero shuffle mask.
833  %v0 = load double, ptr %p0, align 4
834  %t0 = insertelement <8 x double> undef, double %v0, i64 0
835  %a0 = shufflevector <8 x double> %t0, <8 x double> undef, <8 x i32> zeroinitializer
836  %2 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> %idx)
837  %a1 = sitofp <8 x i64> %idx to <8 x double>
838  %res = fadd <8 x double> %2, %a1
839  ret <8 x double> %res
840}
841
; permpd_maskz_512: 512-bit variable-index vpermpd with zero-masking. The mask
; byte is loaded from %mask and passed to the masked intrinsic together with a
; zeroinitializer pass-through; the expected assembly loads it with kmovb and
; emits the permute with {%k1} {z}.
; With +false-deps-perm (prefix ENABLE) a vpxor zeroing of the destination
; register precedes the permute; with -false-deps-perm (prefix DISABLE) it is absent.
842define <8 x double> @permpd_maskz_512(<8 x double> %a0, <8 x i64> %idx, ptr %mask) {
843; ENABLE-LABEL: permpd_maskz_512:
844; ENABLE:       # %bb.0:
845; ENABLE-NEXT:    #APP
846; ENABLE-NEXT:    nop
847; ENABLE-NEXT:    #NO_APP
848; ENABLE-NEXT:    kmovb (%rdi), %k1
849; ENABLE-NEXT:    vpxor %xmm2, %xmm2, %xmm2
850; ENABLE-NEXT:    vpermpd %zmm0, %zmm1, %zmm2 {%k1} {z}
851; ENABLE-NEXT:    vcvtqq2pd %zmm1, %zmm1
852; ENABLE-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
853; ENABLE-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
854; ENABLE-NEXT:    retq
855;
856; DISABLE-LABEL: permpd_maskz_512:
857; DISABLE:       # %bb.0:
858; DISABLE-NEXT:    #APP
859; DISABLE-NEXT:    nop
860; DISABLE-NEXT:    #NO_APP
861; DISABLE-NEXT:    kmovb (%rdi), %k1
862; DISABLE-NEXT:    vpermpd %zmm0, %zmm1, %zmm2 {%k1} {z}
863; DISABLE-NEXT:    vcvtqq2pd %zmm1, %zmm1
864; DISABLE-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
865; DISABLE-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
866; DISABLE-NEXT:    retq
; Inline-asm "nop" whose constraint string lists xmm3-xmm31 and flags as
; clobbered; its "=x" result %1 is not used by the rest of the function.
867  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
868  %2 = load i8, ptr %mask
869  %3 = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %a0, <8 x i64> %idx, <8 x double> zeroinitializer, i8 %2)
; Both permute inputs (%a0 and %idx) are used again after the permute.
870  %a1 = sitofp <8 x i64> %idx to <8 x double>
871  %t = fadd <8 x double> %a0, %a1
872  %res = fadd <8 x double> %3, %t
873  ret <8 x double> %res
874}
875
; Intrinsics called by the 512-bit double-precision permute tests above:
; the unmasked form takes (data, index); the masked form additionally takes a
; pass-through vector and an i8 mask.
876declare <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double>, <8 x i64>)
877declare <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double>, <8 x i64>, <8 x double>, i8)
878
879
; permps_rr_256: 256-bit variable-index vpermps with both operands in registers.
; The permute is expressed through the masked intrinsic with an all-ones mask
; (i8 -1) and a zeroinitializer pass-through; the expected assembly shows an
; unmasked vpermps. The data operand is the inline-asm result %1; the %a0
; argument is not referenced in the body.
; With +false-deps-perm (prefix ENABLE) a vxorps zeroing of the destination
; register precedes the permute; with -false-deps-perm (prefix DISABLE) it is absent.
880define <8 x float> @permps_rr_256(<8 x float> %a0, <8 x i32> %idx) {
881; ENABLE-LABEL: permps_rr_256:
882; ENABLE:       # %bb.0:
883; ENABLE-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
884; ENABLE-NEXT:    #APP
885; ENABLE-NEXT:    nop
886; ENABLE-NEXT:    #NO_APP
887; ENABLE-NEXT:    vmovaps %ymm0, %ymm2
888; ENABLE-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
889; ENABLE-NEXT:    vxorps %xmm1, %xmm1, %xmm1
890; ENABLE-NEXT:    vpermps %ymm2, %ymm0, %ymm1
891; ENABLE-NEXT:    vcvtdq2ps %ymm0, %ymm0
892; ENABLE-NEXT:    vaddps %ymm0, %ymm2, %ymm0
893; ENABLE-NEXT:    vaddps %ymm0, %ymm1, %ymm0
894; ENABLE-NEXT:    retq
895;
896; DISABLE-LABEL: permps_rr_256:
897; DISABLE:       # %bb.0:
898; DISABLE-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
899; DISABLE-NEXT:    #APP
900; DISABLE-NEXT:    nop
901; DISABLE-NEXT:    #NO_APP
902; DISABLE-NEXT:    vmovaps %ymm0, %ymm2
903; DISABLE-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
904; DISABLE-NEXT:    vpermps %ymm2, %ymm0, %ymm1
905; DISABLE-NEXT:    vcvtdq2ps %ymm0, %ymm0
906; DISABLE-NEXT:    vaddps %ymm0, %ymm2, %ymm0
907; DISABLE-NEXT:    vaddps %ymm0, %ymm1, %ymm0
908; DISABLE-NEXT:    retq
; Inline-asm "nop" whose constraint string lists xmm1-xmm31 and flags as
; clobbered; its "=x" result %1 is the vector that gets permuted below.
909  %1 = tail call <8 x float> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
910  %2 = call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %1, <8 x i32> %idx, <8 x float> zeroinitializer, i8 -1)
; Both permute inputs (%1 and %idx) are used again after the permute.
911  %a1 = sitofp <8 x i32> %idx to <8 x float>
912  %t = fadd <8 x float> %1, %a1
913  %res = fadd <8 x float> %2, %t
914  ret <8 x float> %res
915}
916
; permps_rm_256: 256-bit variable-index vpermps whose data operand is a full
; <8 x float> vector loaded from %p0. The permute is expressed through the
; masked intrinsic with an all-ones mask (i8 -1); the expected assembly folds
; the load into an unmasked vpermps as the (%rdi) memory operand.
; %idx is also converted with sitofp and added to the permute result.
; With +false-deps-perm (prefix ENABLE) a vxorps zeroing of the destination
; register precedes the permute; with -false-deps-perm (prefix DISABLE) it is absent.
917define <8 x float> @permps_rm_256(ptr %p0, <8 x i32> %idx) {
918; ENABLE-LABEL: permps_rm_256:
919; ENABLE:       # %bb.0:
920; ENABLE-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
921; ENABLE-NEXT:    #APP
922; ENABLE-NEXT:    nop
923; ENABLE-NEXT:    #NO_APP
924; ENABLE-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
925; ENABLE-NEXT:    vxorps %xmm0, %xmm0, %xmm0
926; ENABLE-NEXT:    vpermps (%rdi), %ymm1, %ymm0
927; ENABLE-NEXT:    vcvtdq2ps %ymm1, %ymm1
928; ENABLE-NEXT:    vaddps %ymm1, %ymm0, %ymm0
929; ENABLE-NEXT:    retq
930;
931; DISABLE-LABEL: permps_rm_256:
932; DISABLE:       # %bb.0:
933; DISABLE-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
934; DISABLE-NEXT:    #APP
935; DISABLE-NEXT:    nop
936; DISABLE-NEXT:    #NO_APP
937; DISABLE-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
938; DISABLE-NEXT:    vpermps (%rdi), %ymm1, %ymm0
939; DISABLE-NEXT:    vcvtdq2ps %ymm1, %ymm1
940; DISABLE-NEXT:    vaddps %ymm1, %ymm0, %ymm0
941; DISABLE-NEXT:    retq
; Inline-asm "nop" whose constraint string lists xmm1-xmm31 and flags as
; clobbered; its "=x" result %1 is not used by the rest of the function.
942  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
943  %a0 = load <8 x float>, ptr %p0, align 64
944  %2 = call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %a0, <8 x i32> %idx, <8 x float> zeroinitializer, i8 -1)
945  %a1 = sitofp <8 x i32> %idx to <8 x float>
946  %res = fadd <8 x float> %2, %a1
947  ret <8 x float> %res
948}
949
; permps_broadcast_256: 256-bit variable-index vpermps whose data operand is a
; scalar float loaded from %p0 and splatted to all eight lanes. The permute is
; expressed through the masked intrinsic with an all-ones mask (i8 -1); the
; expected assembly uses the embedded-broadcast memory form, (%rdi){1to8}.
; %idx is also converted with sitofp and added to the permute result.
; With +false-deps-perm (prefix ENABLE) a vxorps zeroing of the destination
; register precedes the permute; with -false-deps-perm (prefix DISABLE) it is absent.
950define <8 x float> @permps_broadcast_256(ptr %p0, <8 x i32> %idx) {
951; ENABLE-LABEL: permps_broadcast_256:
952; ENABLE:       # %bb.0:
953; ENABLE-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
954; ENABLE-NEXT:    #APP
955; ENABLE-NEXT:    nop
956; ENABLE-NEXT:    #NO_APP
957; ENABLE-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
958; ENABLE-NEXT:    vxorps %xmm0, %xmm0, %xmm0
959; ENABLE-NEXT:    vpermps (%rdi){1to8}, %ymm1, %ymm0
960; ENABLE-NEXT:    vcvtdq2ps %ymm1, %ymm1
961; ENABLE-NEXT:    vaddps %ymm1, %ymm0, %ymm0
962; ENABLE-NEXT:    retq
963;
964; DISABLE-LABEL: permps_broadcast_256:
965; DISABLE:       # %bb.0:
966; DISABLE-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
967; DISABLE-NEXT:    #APP
968; DISABLE-NEXT:    nop
969; DISABLE-NEXT:    #NO_APP
970; DISABLE-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
971; DISABLE-NEXT:    vpermps (%rdi){1to8}, %ymm1, %ymm0
972; DISABLE-NEXT:    vcvtdq2ps %ymm1, %ymm1
973; DISABLE-NEXT:    vaddps %ymm1, %ymm0, %ymm0
974; DISABLE-NEXT:    retq
; Inline-asm "nop" whose constraint string lists xmm1-xmm31 and flags as
; clobbered; its "=x" result %1 is not used by the rest of the function.
975  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
; Scalar load, insert into lane 0, then splat with an all-zero shuffle mask.
976  %v0 = load float, ptr %p0, align 4
977  %t0 = insertelement <8 x float> undef, float %v0, i32 0
978  %a0 = shufflevector <8 x float> %t0, <8 x float> undef, <8 x i32> zeroinitializer
979  %2 = call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %a0, <8 x i32> %idx, <8 x float> zeroinitializer, i8 -1)
980  %a1 = sitofp <8 x i32> %idx to <8 x float>
981  %res = fadd <8 x float> %2, %a1
982  ret <8 x float> %res
983}
984
; permps_maskz_256: 256-bit variable-index vpermps with zero-masking. The mask
; byte is loaded from %mask and passed to the masked intrinsic together with a
; zeroinitializer pass-through; the expected assembly loads it with kmovb and
; emits the permute with {%k1} {z}.
; With +false-deps-perm (prefix ENABLE) a vxorps zeroing of the destination
; register precedes the permute; with -false-deps-perm (prefix DISABLE) it is absent.
985define <8 x float> @permps_maskz_256(<8 x float> %a0, <8 x i32> %idx, ptr %mask) {
986; ENABLE-LABEL: permps_maskz_256:
987; ENABLE:       # %bb.0:
988; ENABLE-NEXT:    #APP
989; ENABLE-NEXT:    nop
990; ENABLE-NEXT:    #NO_APP
991; ENABLE-NEXT:    kmovb (%rdi), %k1
992; ENABLE-NEXT:    vxorps %xmm2, %xmm2, %xmm2
993; ENABLE-NEXT:    vpermps %ymm0, %ymm1, %ymm2 {%k1} {z}
994; ENABLE-NEXT:    vcvtdq2ps %ymm1, %ymm1
995; ENABLE-NEXT:    vaddps %ymm1, %ymm0, %ymm0
996; ENABLE-NEXT:    vaddps %ymm0, %ymm2, %ymm0
997; ENABLE-NEXT:    retq
998;
999; DISABLE-LABEL: permps_maskz_256:
1000; DISABLE:       # %bb.0:
1001; DISABLE-NEXT:    #APP
1002; DISABLE-NEXT:    nop
1003; DISABLE-NEXT:    #NO_APP
1004; DISABLE-NEXT:    kmovb (%rdi), %k1
1005; DISABLE-NEXT:    vpermps %ymm0, %ymm1, %ymm2 {%k1} {z}
1006; DISABLE-NEXT:    vcvtdq2ps %ymm1, %ymm1
1007; DISABLE-NEXT:    vaddps %ymm1, %ymm0, %ymm0
1008; DISABLE-NEXT:    vaddps %ymm0, %ymm2, %ymm0
1009; DISABLE-NEXT:    retq
; Inline-asm "nop" whose constraint string lists xmm3-xmm31 and flags as
; clobbered; its "=x" result %1 is not used by the rest of the function.
1010  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1011  %2 = load i8, ptr %mask
1012  %3 = call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %a0, <8 x i32> %idx, <8 x float> zeroinitializer, i8 %2)
; Both permute inputs (%a0 and %idx) are used again after the permute.
1013  %a1 = sitofp <8 x i32> %idx to <8 x float>
1014  %t = fadd <8 x float> %a0, %a1
1015  %res = fadd <8 x float> %3, %t
1016  ret <8 x float> %res
1017}
1018
; Masked intrinsic called by all of the 256-bit single-precision permute tests
; above; operands are (data, index, pass-through, i8 mask).
1019declare <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float>, <8 x i32>, <8 x float>, i8)
1020
; permps_rr_512: 512-bit variable-index vpermps with both operands in registers.
; The permute's data operand is the inline-asm result %1; the %a0 argument is
; not referenced in the body. The expected assembly spills %idx (zmm1) before
; the asm and reloads it afterwards.
; With +false-deps-perm (prefix ENABLE) a vpxor zeroing of the destination
; register precedes the permute; with -false-deps-perm (prefix DISABLE) it is absent.
1021define <16 x float> @permps_rr_512(<16 x float> %a0, <16 x i32> %idx) {
1022; ENABLE-LABEL: permps_rr_512:
1023; ENABLE:       # %bb.0:
1024; ENABLE-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1025; ENABLE-NEXT:    #APP
1026; ENABLE-NEXT:    nop
1027; ENABLE-NEXT:    #NO_APP
1028; ENABLE-NEXT:    vmovaps %zmm0, %zmm2
1029; ENABLE-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
1030; ENABLE-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1031; ENABLE-NEXT:    vpermps %zmm2, %zmm0, %zmm1
1032; ENABLE-NEXT:    vcvtdq2ps %zmm0, %zmm0
1033; ENABLE-NEXT:    vaddps %zmm0, %zmm2, %zmm0
1034; ENABLE-NEXT:    vaddps %zmm0, %zmm1, %zmm0
1035; ENABLE-NEXT:    retq
1036;
1037; DISABLE-LABEL: permps_rr_512:
1038; DISABLE:       # %bb.0:
1039; DISABLE-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1040; DISABLE-NEXT:    #APP
1041; DISABLE-NEXT:    nop
1042; DISABLE-NEXT:    #NO_APP
1043; DISABLE-NEXT:    vmovaps %zmm0, %zmm2
1044; DISABLE-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
1045; DISABLE-NEXT:    vpermps %zmm2, %zmm0, %zmm1
1046; DISABLE-NEXT:    vcvtdq2ps %zmm0, %zmm0
1047; DISABLE-NEXT:    vaddps %zmm0, %zmm2, %zmm0
1048; DISABLE-NEXT:    vaddps %zmm0, %zmm1, %zmm0
1049; DISABLE-NEXT:    retq
; Inline-asm "nop" whose constraint string lists xmm1-xmm31 and flags as
; clobbered; its "=x" result %1 is the vector that gets permuted below.
1050  %1 = tail call <16 x float> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1051  %2 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %1, <16 x i32> %idx)
; Both permute inputs (%1 and %idx) are used again after the permute.
1052  %a1 = sitofp <16 x i32> %idx to <16 x float>
1053  %t = fadd <16 x float> %1, %a1
1054  %res = fadd <16 x float> %2, %t
1055  ret <16 x float> %res
1056}
1057
; permps_rm_512: 512-bit variable-index vpermps whose data operand is a full
; <16 x float> vector loaded from %p0. The expected assembly folds the load
; into the permute as the (%rdi) memory operand.
; %idx is also converted with sitofp and added to the permute result.
; With +false-deps-perm (prefix ENABLE) a vpxor zeroing of the destination
; register precedes the permute; with -false-deps-perm (prefix DISABLE) it is absent.
1058define <16 x float> @permps_rm_512(ptr %p0, <16 x i32> %idx) {
1059; ENABLE-LABEL: permps_rm_512:
1060; ENABLE:       # %bb.0:
1061; ENABLE-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1062; ENABLE-NEXT:    #APP
1063; ENABLE-NEXT:    nop
1064; ENABLE-NEXT:    #NO_APP
1065; ENABLE-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
1066; ENABLE-NEXT:    vpxor %xmm0, %xmm0, %xmm0
1067; ENABLE-NEXT:    vpermps (%rdi), %zmm1, %zmm0
1068; ENABLE-NEXT:    vcvtdq2ps %zmm1, %zmm1
1069; ENABLE-NEXT:    vaddps %zmm1, %zmm0, %zmm0
1070; ENABLE-NEXT:    retq
1071;
1072; DISABLE-LABEL: permps_rm_512:
1073; DISABLE:       # %bb.0:
1074; DISABLE-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1075; DISABLE-NEXT:    #APP
1076; DISABLE-NEXT:    nop
1077; DISABLE-NEXT:    #NO_APP
1078; DISABLE-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
1079; DISABLE-NEXT:    vpermps (%rdi), %zmm1, %zmm0
1080; DISABLE-NEXT:    vcvtdq2ps %zmm1, %zmm1
1081; DISABLE-NEXT:    vaddps %zmm1, %zmm0, %zmm0
1082; DISABLE-NEXT:    retq
; Inline-asm "nop" whose constraint string lists xmm1-xmm31 and flags as
; clobbered; its "=x" result %1 is not used by the rest of the function.
1083  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1084  %a0 = load <16 x float>, ptr %p0, align 64
1085  %2 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> %idx)
1086  %a1 = sitofp <16 x i32> %idx to <16 x float>
1087  %res = fadd <16 x float> %2, %a1
1088  ret <16 x float> %res
1089}
1090
; permps_broadcast_512: 512-bit variable-index vpermps whose data operand is a
; scalar float loaded from %p0 and splatted to all sixteen lanes. The expected
; assembly uses the embedded-broadcast memory form, (%rdi){1to16}.
; %idx is also converted with sitofp and added to the permute result.
; With +false-deps-perm (prefix ENABLE) a vpxor zeroing of the destination
; register precedes the permute; with -false-deps-perm (prefix DISABLE) it is absent.
1091define <16 x float> @permps_broadcast_512(ptr %p0, <16 x i32> %idx) {
1092; ENABLE-LABEL: permps_broadcast_512:
1093; ENABLE:       # %bb.0:
1094; ENABLE-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1095; ENABLE-NEXT:    #APP
1096; ENABLE-NEXT:    nop
1097; ENABLE-NEXT:    #NO_APP
1098; ENABLE-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
1099; ENABLE-NEXT:    vpxor %xmm0, %xmm0, %xmm0
1100; ENABLE-NEXT:    vpermps (%rdi){1to16}, %zmm1, %zmm0
1101; ENABLE-NEXT:    vcvtdq2ps %zmm1, %zmm1
1102; ENABLE-NEXT:    vaddps %zmm1, %zmm0, %zmm0
1103; ENABLE-NEXT:    retq
1104;
1105; DISABLE-LABEL: permps_broadcast_512:
1106; DISABLE:       # %bb.0:
1107; DISABLE-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1108; DISABLE-NEXT:    #APP
1109; DISABLE-NEXT:    nop
1110; DISABLE-NEXT:    #NO_APP
1111; DISABLE-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
1112; DISABLE-NEXT:    vpermps (%rdi){1to16}, %zmm1, %zmm0
1113; DISABLE-NEXT:    vcvtdq2ps %zmm1, %zmm1
1114; DISABLE-NEXT:    vaddps %zmm1, %zmm0, %zmm0
1115; DISABLE-NEXT:    retq
; Inline-asm "nop" whose constraint string lists xmm1-xmm31 and flags as
; clobbered; its "=x" result %1 is not used by the rest of the function.
1116  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
; Scalar load, insert into lane 0, then splat with an all-zero shuffle mask.
1117  %v0 = load float, ptr %p0, align 4
1118  %t0 = insertelement <16 x float> undef, float %v0, i32 0
1119  %a0 = shufflevector <16 x float> %t0, <16 x float> undef, <16 x i32> zeroinitializer
1120  %2 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> %idx)
1121  %a1 = sitofp <16 x i32> %idx to <16 x float>
1122  %res = fadd <16 x float> %2, %a1
1123  ret <16 x float> %res
1124}
1125
; permps_maskz_512: 512-bit variable-index vpermps with zero-masking. The
; 16-bit mask is loaded from %mask and passed to the masked intrinsic together
; with a zeroinitializer pass-through; the expected assembly loads it with
; kmovw and emits the permute with {%k1} {z}.
; With +false-deps-perm (prefix ENABLE) a vpxor zeroing of the destination
; register precedes the permute; with -false-deps-perm (prefix DISABLE) it is absent.
1126define <16 x float> @permps_maskz_512(<16 x float> %a0, <16 x i32> %idx, ptr %mask) {
1127; ENABLE-LABEL: permps_maskz_512:
1128; ENABLE:       # %bb.0:
1129; ENABLE-NEXT:    #APP
1130; ENABLE-NEXT:    nop
1131; ENABLE-NEXT:    #NO_APP
1132; ENABLE-NEXT:    kmovw (%rdi), %k1
1133; ENABLE-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1134; ENABLE-NEXT:    vpermps %zmm0, %zmm1, %zmm2 {%k1} {z}
1135; ENABLE-NEXT:    vcvtdq2ps %zmm1, %zmm1
1136; ENABLE-NEXT:    vaddps %zmm1, %zmm0, %zmm0
1137; ENABLE-NEXT:    vaddps %zmm0, %zmm2, %zmm0
1138; ENABLE-NEXT:    retq
1139;
1140; DISABLE-LABEL: permps_maskz_512:
1141; DISABLE:       # %bb.0:
1142; DISABLE-NEXT:    #APP
1143; DISABLE-NEXT:    nop
1144; DISABLE-NEXT:    #NO_APP
1145; DISABLE-NEXT:    kmovw (%rdi), %k1
1146; DISABLE-NEXT:    vpermps %zmm0, %zmm1, %zmm2 {%k1} {z}
1147; DISABLE-NEXT:    vcvtdq2ps %zmm1, %zmm1
1148; DISABLE-NEXT:    vaddps %zmm1, %zmm0, %zmm0
1149; DISABLE-NEXT:    vaddps %zmm0, %zmm2, %zmm0
1150; DISABLE-NEXT:    retq
; Inline-asm "nop" whose constraint string lists xmm3-xmm31 and flags as
; clobbered; its "=x" result %1 is not used by the rest of the function.
1151  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1152  %2 = load i16, ptr %mask
1153  %3 = call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %a0, <16 x i32> %idx, <16 x float> zeroinitializer, i16 %2)
; Both permute inputs (%a0 and %idx) are used again after the permute.
1154  %a1 = sitofp <16 x i32> %idx to <16 x float>
1155  %t = fadd <16 x float> %a0, %a1
1156  %res = fadd <16 x float> %3, %t
1157  ret <16 x float> %res
1158}
1159
; Intrinsics called by the 512-bit single-precision permute tests above:
; the unmasked form takes (data, index); the masked form additionally takes a
; pass-through vector and an i16 mask.
1160declare <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float>, <16 x i32>)
1161declare <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float>, <16 x i32>, <16 x float>, i16)
1162