; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq | FileCheck %s

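;;
;; Floating-Point Gather/Scatter
;;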
define dso_local void @gather_mask_dps(<16 x i32> %ind, <16 x float> %src, i16 %mask, ptr %base, ptr %stbuf) {
; CHECK-LABEL: gather_mask_dps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vgatherdps (%rsi,%zmm0,4), %zmm1 {%k2}
; CHECK-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    vscatterdps %zmm1, (%rdx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i16 %mask to <16 x i1>
  %x = call <16 x float> @llvm.x86.avx512.mask.gather.dps.512(<16 x float> %src, ptr %base, <16 x i32> %ind, <16 x i1> %1, i32 4)
  %ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.mask.scatter.dps.512(ptr %stbuf, <16 x i1> %1, <16 x i32> %ind2, <16 x float> %x, i32 4)
  ret void
}

define dso_local void @gather_mask_dpd(<8 x i32> %ind, <8 x double> %src, i8 %mask, ptr %base, ptr %stbuf) {
; CHECK-LABEL: gather_mask_dpd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vgatherdpd (%rsi,%ymm0,4), %zmm1 {%k2}
; CHECK-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    vscatterdpd %zmm1, (%rdx,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %mask to <8 x i1>
  %x = call <8 x double> @llvm.x86.avx512.mask.gather.dpd.512(<8 x double> %src, ptr %base, <8 x i32> %ind, <8 x i1> %1, i32 4)
  %ind2 = add <8 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.mask.scatter.dpd.512(ptr %stbuf, <8 x i1> %1, <8 x i32> %ind2, <8 x double> %x, i32 4)
  ret void
}

define dso_local void @gather_mask_qps(<8 x i64> %ind, <8 x float> %src, i8 %mask, ptr %base, ptr %stbuf) {
; CHECK-LABEL: gather_mask_qps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vgatherqps (%rsi,%zmm0,4), %ymm1 {%k2}
; CHECK-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %mask to <8 x i1>
  %x = call <8 x float> @llvm.x86.avx512.mask.gather.qps.512(<8 x float> %src, ptr %base, <8 x i64> %ind, <8 x i1> %1, i32 4)
  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
  call void @llvm.x86.avx512.mask.scatter.qps.512(ptr %stbuf, <8 x i1> %1, <8 x i64> %ind2, <8 x float> %x, i32 4)
  ret void
}

define dso_local void @gather_mask_qpd(<8 x i64> %ind, <8 x double> %src, i8 %mask, ptr %base, ptr %stbuf) {
; CHECK-LABEL: gather_mask_qpd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vgatherqpd (%rsi,%zmm0,4), %zmm1 {%k2}
; CHECK-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    vscatterqpd %zmm1, (%rdx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %mask to <8 x i1>
  %x = call <8 x double> @llvm.x86.avx512.mask.gather.qpd.512(<8 x double> %src, ptr %base, <8 x i64> %ind, <8 x i1> %1, i32 4)
  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
  call void @llvm.x86.avx512.mask.scatter.qpd.512(ptr %stbuf, <8 x i1> %1, <8 x i64> %ind2, <8 x double> %x, i32 4)
  ret void
}
;;
;; Integer Gather/Scatter
;;

define dso_local void @gather_mask_dd(<16 x i32> %ind, <16 x i32> %src, i16 %mask, ptr %base, ptr %stbuf) {
; CHECK-LABEL: gather_mask_dd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vpgatherdd (%rsi,%zmm0,4), %zmm1 {%k2}
; CHECK-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    vpscatterdd %zmm1, (%rdx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i16 %mask to <16 x i1>
  %x = call <16 x i32> @llvm.x86.avx512.mask.gather.dpi.512(<16 x i32> %src, ptr %base, <16 x i32> %ind, <16 x i1> %1, i32 4)
  %ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.mask.scatter.dpi.512(ptr %stbuf, <16 x i1> %1, <16 x i32> %ind2, <16 x i32> %x, i32 4)
  ret void
}

define dso_local void @gather_mask_qd(<8 x i64> %ind, <8 x i32> %src, i8 %mask, ptr %base, ptr %stbuf) {
; CHECK-LABEL: gather_mask_qd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vpgatherqd (%rsi,%zmm0,4), %ymm1 {%k2}
; CHECK-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %mask to <8 x i1>
  %x = call <8 x i32> @llvm.x86.avx512.mask.gather.qpi.512(<8 x i32> %src, ptr %base, <8 x i64> %ind, <8 x i1> %1, i32 4)
  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
  call void @llvm.x86.avx512.mask.scatter.qpi.512(ptr %stbuf, <8 x i1> %1, <8 x i64> %ind2, <8 x i32> %x, i32 4)
  ret void
}

define dso_local void @gather_mask_qq(<8 x i64> %ind, <8 x i64> %src, i8 %mask, ptr %base, ptr %stbuf) {
; CHECK-LABEL: gather_mask_qq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vpgatherqq (%rsi,%zmm0,4), %zmm1 {%k2}
; CHECK-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    vpscatterqq %zmm1, (%rdx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %mask to <8 x i1>
  %x = call <8 x i64> @llvm.x86.avx512.mask.gather.qpq.512(<8 x i64> %src, ptr %base, <8 x i64> %ind, <8 x i1> %1, i32 4)
  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
  call void @llvm.x86.avx512.mask.scatter.qpq.512(ptr %stbuf, <8 x i1> %1, <8 x i64> %ind2, <8 x i64> %x, i32 4)
  ret void
}

define dso_local void @gather_mask_dq(<8 x i32> %ind, <8 x i64> %src, i8 %mask, ptr %base, ptr %stbuf) {
; CHECK-LABEL: gather_mask_dq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vpgatherdq (%rsi,%ymm0,4), %zmm1 {%k2}
; CHECK-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    vpscatterdq %zmm1, (%rdx,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %mask to <8 x i1>
  %x = call <8 x i64> @llvm.x86.avx512.mask.gather.dpq.512(<8 x i64> %src, ptr %base, <8 x i32> %ind, <8 x i1> %1, i32 4)
  %ind2 = add <8 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.mask.scatter.dpq.512(ptr %stbuf, <8 x i1> %1, <8 x i32> %ind2, <8 x i64> %x, i32 4)
  ret void
}

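;;
;; Execution-Domain Tests
;;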
define dso_local void @gather_mask_dpd_execdomain(<8 x i32> %ind, <8 x double> %src, i8 %mask, ptr %base, ptr %stbuf) {
; CHECK-LABEL: gather_mask_dpd_execdomain:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vgatherdpd (%rsi,%ymm0,4), %zmm1 {%k1}
; CHECK-NEXT:    vmovapd %zmm1, (%rdx)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %mask to <8 x i1>
  %x = call <8 x double> @llvm.x86.avx512.mask.gather.dpd.512(<8 x double> %src, ptr %base, <8 x i32> %ind, <8 x i1> %1, i32 4)
  store <8 x double> %x, ptr %stbuf
  ret void
}

define dso_local void @gather_mask_qpd_execdomain(<8 x i64> %ind, <8 x double> %src, i8 %mask, ptr %base, ptr %stbuf) {
; CHECK-LABEL: gather_mask_qpd_execdomain:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vgatherqpd (%rsi,%zmm0,4), %zmm1 {%k1}
; CHECK-NEXT:    vmovapd %zmm1, (%rdx)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %mask to <8 x i1>
  %x = call <8 x double> @llvm.x86.avx512.mask.gather.qpd.512(<8 x double> %src, ptr %base, <8 x i64> %ind, <8 x i1> %1, i32 4)
  store <8 x double> %x, ptr %stbuf
  ret void
}

define <16 x float> @gather_mask_dps_execdomain(<16 x i32> %ind, <16 x float> %src, i16 %mask, ptr %base) {
; CHECK-LABEL: gather_mask_dps_execdomain:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vgatherdps (%rsi,%zmm0,4), %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %1 = bitcast i16 %mask to <16 x i1>
  %res = call <16 x float> @llvm.x86.avx512.mask.gather.dps.512(<16 x float> %src, ptr %base, <16 x i32> %ind, <16 x i1> %1, i32 4)
  ret <16 x float> %res
}

define <8 x float> @gather_mask_qps_execdomain(<8 x i64> %ind, <8 x float> %src, i8 %mask, ptr %base) {
; CHECK-LABEL: gather_mask_qps_execdomain:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vgatherqps (%rsi,%zmm0,4), %ymm1 {%k1}
; CHECK-NEXT:    vmovaps %ymm1, %ymm0
; CHECK-NEXT:    retq
  %1 = bitcast i8 %mask to <8 x i1>
  %res = call <8 x float> @llvm.x86.avx512.mask.gather.qps.512(<8 x float> %src, ptr %base, <8 x i64> %ind, <8 x i1> %1, i32 4)
  ret <8 x float> %res
}

define dso_local void @scatter_mask_dpd_execdomain(<8 x i32> %ind, ptr %src, i8 %mask, ptr %base, ptr %stbuf) {
; CHECK-LABEL: scatter_mask_dpd_execdomain:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovapd (%rdi), %zmm1
; CHECK-NEXT:    vscatterdpd %zmm1, (%rcx,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %mask to <8 x i1>
  %x = load <8 x double>, ptr %src, align 64
  call void @llvm.x86.avx512.mask.scatter.dpd.512(ptr %stbuf, <8 x i1> %1, <8 x i32> %ind, <8 x double> %x, i32 4)
  ret void
}

define dso_local void @scatter_mask_qpd_execdomain(<8 x i64> %ind, ptr %src, i8 %mask, ptr %base, ptr %stbuf) {
; CHECK-LABEL: scatter_mask_qpd_execdomain:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovapd (%rdi), %zmm1
; CHECK-NEXT:    vscatterqpd %zmm1, (%rcx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %mask to <8 x i1>
  %x = load <8 x double>, ptr %src, align 64
  call void @llvm.x86.avx512.mask.scatter.qpd.512(ptr %stbuf, <8 x i1> %1, <8 x i64> %ind, <8 x double> %x, i32 4)
  ret void
}

define dso_local void @scatter_mask_dps_execdomain(<16 x i32> %ind, ptr %src, i16 %mask, ptr %base, ptr %stbuf) {
; CHECK-LABEL: scatter_mask_dps_execdomain:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovaps (%rdi), %zmm1
; CHECK-NEXT:    vscatterdps %zmm1, (%rcx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i16 %mask to <16 x i1>
  %x = load <16 x float>, ptr %src, align 64
  call void @llvm.x86.avx512.mask.scatter.dps.512(ptr %stbuf, <16 x i1> %1, <16 x i32> %ind, <16 x float> %x, i32 4)
  ret void
}

define dso_local void @scatter_mask_qps_execdomain(<8 x i64> %ind, ptr %src, i8 %mask, ptr %base, ptr %stbuf) {
; CHECK-LABEL: scatter_mask_qps_execdomain:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovaps (%rdi), %ymm1
; CHECK-NEXT:    vscatterqps %ymm1, (%rcx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %mask to <8 x i1>
  %x = load <8 x float>, ptr %src, align 32
  call void @llvm.x86.avx512.mask.scatter.qps.512(ptr %stbuf, <8 x i1> %1, <8 x i64> %ind, <8 x float> %x, i32 4)
  ret void
}

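; Gather and scatter with an all-ones mask: the mask is materialized with
; kxnorw instead of being moved from a GPR.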
define dso_local void @gather_qps(<8 x i64> %ind, <8 x float> %src, ptr %base, ptr %stbuf) {
; CHECK-LABEL: gather_qps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    kxnorw %k0, %k0, %k2
; CHECK-NEXT:    vgatherqps (%rdi,%zmm0,4), %ymm1 {%k2}
; CHECK-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    vscatterqps %ymm1, (%rsi,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = call <8 x float> @llvm.x86.avx512.mask.gather.qps.512(<8 x float> %src, ptr %base, <8 x i64> %ind, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
  call void @llvm.x86.avx512.mask.scatter.qps.512(ptr %stbuf, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i64> %ind2, <8 x float> %x, i32 4)
  ret void
}

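;;
;; 128-bit and 256-bit (AVX512VL) Gather Intrinsics
;;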
define <2 x double> @test_int_x86_avx512_mask_gather3div2_df(<2 x double> %x0, ptr %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3div2_df:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vgatherqpd (%rdi,%xmm1,4), %xmm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherqpd (%rdi,%xmm1,2), %xmm2 {%k1}
; CHECK-NEXT:    vaddpd %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x3 to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> <i32 0, i32 1>
  %res = call <2 x double> @llvm.x86.avx512.mask.gather3div2.df(<2 x double> %x0, ptr %x1, <2 x i64> %x2, <2 x i1> %extract, i32 4)
  %res1 = call <2 x double> @llvm.x86.avx512.mask.gather3div2.df(<2 x double> %x0, ptr %x1, <2 x i64> %x2, <2 x i1> <i1 true, i1 true>, i32 2)
  %res2 = fadd <2 x double> %res, %res1
  ret <2 x double> %res2
}

define <2 x i64> @test_int_x86_avx512_mask_gather3div2_di(<2 x i64> %x0, ptr %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3div2_di:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpgatherqq (%rdi,%xmm1,8), %xmm0 {%k1}
; CHECK-NEXT:    vpaddq %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x3 to <8 x i1>
  %extract1 = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> <i32 0, i32 1>
  %res = call <2 x i64> @llvm.x86.avx512.mask.gather3div2.di(<2 x i64> %x0, ptr %x1, <2 x i64> %x2, <2 x i1> %extract1, i32 8)
  %2 = bitcast i8 %x3 to <8 x i1>
  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1>
  %res1 = call <2 x i64> @llvm.x86.avx512.mask.gather3div2.di(<2 x i64> %x0, ptr %x1, <2 x i64> %x2, <2 x i1> %extract, i32 8)
  %res2 = add <2 x i64> %res, %res1
  ret <2 x i64> %res2
}

define <4 x double> @test_int_x86_avx512_mask_gather3div4_df(<4 x double> %x0, ptr %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3div4_df:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vgatherqpd (%rdi,%ymm1,4), %ymm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherqpd (%rdi,%ymm1,2), %ymm2 {%k1}
; CHECK-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x3 to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res = call <4 x double> @llvm.x86.avx512.mask.gather3div4.df(<4 x double> %x0, ptr %x1, <4 x i64> %x2, <4 x i1> %extract, i32 4)
  %res1 = call <4 x double> @llvm.x86.avx512.mask.gather3div4.df(<4 x double> %x0, ptr %x1, <4 x i64> %x2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 2)
  %res2 = fadd <4 x double> %res, %res1
  ret <4 x double> %res2
}

define <4 x i64> @test_int_x86_avx512_mask_gather3div4_di(<4 x i64> %x0, ptr %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3div4_di:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpgatherqq (%rdi,%ymm1,8), %ymm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vpgatherqq (%rdi,%ymm1,8), %ymm2 {%k1}
; CHECK-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x3 to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res = call <4 x i64> @llvm.x86.avx512.mask.gather3div4.di(<4 x i64> %x0, ptr %x1, <4 x i64> %x2, <4 x i1> %extract, i32 8)
  %res1 = call <4 x i64> @llvm.x86.avx512.mask.gather3div4.di(<4 x i64> %x0, ptr %x1, <4 x i64> %x2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 8)
  %res2 = add <4 x i64> %res, %res1
  ret <4 x i64> %res2
}

define <4 x float> @test_int_x86_avx512_mask_gather3div4_sf(<4 x float> %x0, ptr %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3div4_sf:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vgatherqps (%rdi,%xmm1,4), %xmm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherqps (%rdi,%xmm1,2), %xmm2 {%k1}
; CHECK-NEXT:    vaddps %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x3 to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> <i32 0, i32 1>
  %res = call <4 x float> @llvm.x86.avx512.mask.gather3div4.sf(<4 x float> %x0, ptr %x1, <2 x i64> %x2, <2 x i1> %extract, i32 4)
  %res1 = call <4 x float> @llvm.x86.avx512.mask.gather3div4.sf(<4 x float> %x0, ptr %x1, <2 x i64> %x2, <2 x i1> <i1 true, i1 true>, i32 2)
  %res2 = fadd <4 x float> %res, %res1
  ret <4 x float> %res2
}

define <4 x i32> @test_int_x86_avx512_mask_gather3div4_si(<4 x i32> %x0, ptr %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3div4_si:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vpgatherqd (%rdi,%xmm1,4), %xmm2 {%k1}
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpgatherqd (%rdi,%xmm1,4), %xmm0 {%k1}
; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x i32> @llvm.x86.avx512.mask.gather3div4.si(<4 x i32> %x0, ptr %x1, <2 x i64> %x2, <2 x i1> <i1 true, i1 true>, i32 4)
  %1 = bitcast i8 %x3 to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> <i32 0, i32 1>
  %res1 = call <4 x i32> @llvm.x86.avx512.mask.gather3div4.si(<4 x i32> %x0, ptr %x1, <2 x i64> %x2, <2 x i1> %extract, i32 4)
  %res2 = add <4 x i32> %res, %res1
  ret <4 x i32> %res2
}

define <4 x float> @test_int_x86_avx512_mask_gather3div8_sf(<4 x float> %x0, ptr %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3div8_sf:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vgatherqps (%rdi,%ymm1,4), %xmm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherqps (%rdi,%ymm1,2), %xmm2 {%k1}
; CHECK-NEXT:    vaddps %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x3 to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res = call <4 x float> @llvm.x86.avx512.mask.gather3div8.sf(<4 x float> %x0, ptr %x1, <4 x i64> %x2, <4 x i1> %extract, i32 4)
  %res1 = call <4 x float> @llvm.x86.avx512.mask.gather3div8.sf(<4 x float> %x0, ptr %x1, <4 x i64> %x2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 2)
  %res2 = fadd <4 x float> %res, %res1
  ret <4 x float> %res2
}

define <4 x i32> @test_int_x86_avx512_mask_gather3div8_si(<4 x i32> %x0, ptr %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3div8_si:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovdqa %xmm0, %xmm2
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vpgatherqd (%rdi,%ymm1,4), %xmm2 {%k2}
; CHECK-NEXT:    vpgatherqd (%rdi,%ymm1,2), %xmm0 {%k1}
; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x3 to <8 x i1>
  %extract1 = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res = call <4 x i32> @llvm.x86.avx512.mask.gather3div8.si(<4 x i32> %x0, ptr %x1, <4 x i64> %x2, <4 x i1> %extract1, i32 4)
  %2 = bitcast i8 %x3 to <8 x i1>
  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res1 = call <4 x i32> @llvm.x86.avx512.mask.gather3div8.si(<4 x i32> %x0, ptr %x1, <4 x i64> %x2, <4 x i1> %extract, i32 2)
  %res2 = add <4 x i32> %res, %res1
  ret <4 x i32> %res2
}

define <2 x double> @test_int_x86_avx512_mask_gather3siv2_df(<2 x double> %x0, ptr %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3siv2_df:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vgatherdpd (%rdi,%xmm1,4), %xmm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherdpd (%rdi,%xmm1,2), %xmm2 {%k1}
; CHECK-NEXT:    vaddpd %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x3 to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> <i32 0, i32 1>
  %res = call <2 x double> @llvm.x86.avx512.mask.gather3siv2.df(<2 x double> %x0, ptr %x1, <4 x i32> %x2, <2 x i1> %extract, i32 4)
  %res1 = call <2 x double> @llvm.x86.avx512.mask.gather3siv2.df(<2 x double> %x0, ptr %x1, <4 x i32> %x2, <2 x i1> <i1 true, i1 true>, i32 2)
  %res2 = fadd <2 x double> %res, %res1
  ret <2 x double> %res2
}

define <2 x i64> @test_int_x86_avx512_mask_gather3siv2_di(<2 x i64> %x0, ptr %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3siv2_di:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpgatherdq (%rdi,%xmm1,8), %xmm0 {%k1}
; CHECK-NEXT:    vpaddq %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x3 to <8 x i1>
  %extract1 = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> <i32 0, i32 1>
  %res = call <2 x i64> @llvm.x86.avx512.mask.gather3siv2.di(<2 x i64> %x0, ptr %x1, <4 x i32> %x2, <2 x i1> %extract1, i32 8)
  %2 = bitcast i8 %x3 to <8 x i1>
  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1>
  %res1 = call <2 x i64> @llvm.x86.avx512.mask.gather3siv2.di(<2 x i64> %x0, ptr %x1, <4 x i32> %x2, <2 x i1> %extract, i32 8)
  %res2 = add <2 x i64> %res, %res1
  ret <2 x i64> %res2
}

define <4 x double> @test_int_x86_avx512_mask_gather3siv4_df(<4 x double> %x0, ptr %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3siv4_df:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vgatherdpd (%rdi,%xmm1,4), %ymm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherdpd (%rdi,%xmm1,2), %ymm2 {%k1}
; CHECK-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x3 to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res = call <4 x double> @llvm.x86.avx512.mask.gather3siv4.df(<4 x double> %x0, ptr %x1, <4 x i32> %x2, <4 x i1> %extract, i32 4)
  %res1 = call <4 x double> @llvm.x86.avx512.mask.gather3siv4.df(<4 x double> %x0, ptr %x1, <4 x i32> %x2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 2)
  %res2 = fadd <4 x double> %res, %res1
  ret <4 x double> %res2
}

define <4 x i64> @test_int_x86_avx512_mask_gather3siv4_di(<4 x i64> %x0, ptr %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3siv4_di:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpgatherdq (%rdi,%xmm1,8), %ymm0 {%k1}
; CHECK-NEXT:    vpaddq %ymm0, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x3 to <8 x i1>
  %extract1 = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res = call <4 x i64> @llvm.x86.avx512.mask.gather3siv4.di(<4 x i64> %x0, ptr %x1, <4 x i32> %x2, <4 x i1> %extract1, i32 8)
  %2 = bitcast i8 %x3 to <8 x i1>
  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res1 = call <4 x i64> @llvm.x86.avx512.mask.gather3siv4.di(<4 x i64> %x0, ptr %x1, <4 x i32> %x2, <4 x i1> %extract, i32 8)
  %res2 = add <4 x i64> %res, %res1
  ret <4 x i64> %res2
}

define <4 x float> @test_int_x86_avx512_mask_gather3siv4_sf(<4 x float> %x0, ptr %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3siv4_sf:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vgatherdps (%rdi,%xmm1,4), %xmm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherdps (%rdi,%xmm1,2), %xmm2 {%k1}
; CHECK-NEXT:    vaddps %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x3 to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res = call <4 x float> @llvm.x86.avx512.mask.gather3siv4.sf(<4 x float> %x0, ptr %x1, <4 x i32> %x2, <4 x i1> %extract, i32 4)
  %res1 = call <4 x float> @llvm.x86.avx512.mask.gather3siv4.sf(<4 x float> %x0, ptr %x1, <4 x i32> %x2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 2)
  %res2 = fadd <4 x float> %res, %res1
  ret <4 x float> %res2
}

define <4 x i32> @test_int_x86_avx512_mask_gather3siv4_si(<4 x i32> %x0, ptr %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3siv4_si:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vpgatherdd (%rdi,%xmm1,4), %xmm2 {%k1}
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpgatherdd (%rdi,%xmm1,2), %xmm0 {%k1}
; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x i32> @llvm.x86.avx512.mask.gather3siv4.si(<4 x i32> %x0, ptr %x1, <4 x i32> %x2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 4)
  %1 = bitcast i8 %x3 to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res1 = call <4 x i32> @llvm.x86.avx512.mask.gather3siv4.si(<4 x i32> %x0, ptr %x1, <4 x i32> %x2, <4 x i1> %extract, i32 2)
  %res2 = add <4 x i32> %res, %res1
  ret <4 x i32> %res2
}

define <8 x float> @test_int_x86_avx512_mask_gather3siv8_sf(<8 x float> %x0, ptr %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3siv8_sf:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherdps (%rdi,%ymm1,2), %ymm2 {%k1}
; CHECK-NEXT:    vaddps %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x3 to <8 x i1>
  %res = call <8 x float> @llvm.x86.avx512.mask.gather3siv8.sf(<8 x float> %x0, ptr %x1, <8 x i32> %x2, <8 x i1> %1, i32 4)
  %res1 = call <8 x float> @llvm.x86.avx512.mask.gather3siv8.sf(<8 x float> %x0, ptr %x1, <8 x i32> %x2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 2)
  %res2 = fadd <8 x float> %res, %res1
  ret <8 x float> %res2
}

define <8 x i32> @test_int_x86_avx512_mask_gather3siv8_si(<8 x i32> %x0, ptr %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3siv8_si:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovdqa %ymm0, %ymm2
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vpgatherdd (%rdi,%ymm1,4), %ymm2 {%k2}
; CHECK-NEXT:    vpgatherdd (%rdi,%ymm1,2), %ymm0 {%k1}
; CHECK-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x3 to <8 x i1>
  %res = call <8 x i32> @llvm.x86.avx512.mask.gather3siv8.si(<8 x i32> %x0, ptr %x1, <8 x i32> %x2, <8 x i1> %1, i32 4)
  %2 = bitcast i8 %x3 to <8 x i1>
  %res1 = call <8 x i32> @llvm.x86.avx512.mask.gather3siv8.si(<8 x i32> %x0, ptr %x1, <8 x i32> %x2, <8 x i1> %2, i32 2)
  %res2 = add <8 x i32> %res, %res1
  ret <8 x i32> %res2
}

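;;
;; 128-bit and 256-bit (AVX512VL) Scatter Intrinsics
;;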
define dso_local void @test_int_x86_avx512_scatterdiv2_df(ptr %x0, i8 %x1, <2 x i64> %x2, <2 x double> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv2_df:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    kxnorw %k0, %k0, %k2
; CHECK-NEXT:    vscatterqpd %xmm1, (%rdi,%xmm0,2) {%k2}
; CHECK-NEXT:    vscatterqpd %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  call void @llvm.x86.avx512.mask.scatterdiv2.df(ptr %x0, <2 x i1> <i1 true, i1 true>, <2 x i64> %x2, <2 x double> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scatterdiv2.df(ptr %x0, <2 x i1> %2, <2 x i64> %x2, <2 x double> %x3, i32 4)
  ret void
}

define dso_local void @test_int_x86_avx512_scatterdiv2_di(ptr %x0, i8 %x1, <2 x i64> %x2, <2 x i64> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv2_di:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpscatterqq %xmm1, (%rdi,%xmm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpscatterqq %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  call void @llvm.x86.avx512.mask.scatterdiv2.di(ptr %x0, <2 x i1> %2, <2 x i64> %x2, <2 x i64> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scatterdiv2.di(ptr %x0, <2 x i1> <i1 true, i1 true>, <2 x i64> %x2, <2 x i64> %x3, i32 4)
  ret void
}

define dso_local void @test_int_x86_avx512_scatterdiv4_df(ptr %x0, i8 %x1, <4 x i64> %x2, <4 x double> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_df:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vscatterqpd %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vscatterqpd %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.mask.scatterdiv4.df(ptr %x0, <4 x i1> %2, <4 x i64> %x2, <4 x double> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scatterdiv4.df(ptr %x0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i64> %x2, <4 x double> %x3, i32 4)
  ret void
}

define dso_local void @test_int_x86_avx512_scatterdiv4_di(ptr %x0, i8 %x1, <4 x i64> %x2, <4 x i64> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_di:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpscatterqq %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpscatterqq %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.mask.scatterdiv4.di(ptr %x0, <4 x i1> %2, <4 x i64> %x2, <4 x i64> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scatterdiv4.di(ptr %x0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i64> %x2, <4 x i64> %x3, i32 4)
  ret void
}

define dso_local void @test_int_x86_avx512_scatterdiv4_sf(ptr %x0, i8 %x1, <2 x i64> %x2, <4 x float> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_sf:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vscatterqps %xmm1, (%rdi,%xmm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vscatterqps %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  call void @llvm.x86.avx512.mask.scatterdiv4.sf(ptr %x0, <2 x i1> %2, <2 x i64> %x2, <4 x float> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scatterdiv4.sf(ptr %x0, <2 x i1> <i1 true, i1 true>, <2 x i64> %x2, <4 x float> %x3, i32 4)
  ret void
}

define dso_local void @test_int_x86_avx512_scatterdiv4_si(ptr %x0, i8 %x1, <2 x i64> %x2, <4 x i32> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_si:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    kxnorw %k0, %k0, %k2
; CHECK-NEXT:    vpscatterqd %xmm1, (%rdi,%xmm0,2) {%k2}
; CHECK-NEXT:    vpscatterqd %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  call void @llvm.x86.avx512.mask.scatterdiv4.si(ptr %x0, <2 x i1> <i1 true, i1 true>, <2 x i64> %x2, <4 x i32> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scatterdiv4.si(ptr %x0, <2 x i1> %2, <2 x i64> %x2, <4 x i32> %x3, i32 4)
  ret void
}

define dso_local void @test_int_x86_avx512_scatterdiv8_sf(ptr %x0, i8 %x1, <4 x i64> %x2, <4 x float> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv8_sf:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vscatterqps %xmm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vscatterqps %xmm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.mask.scatterdiv8.sf(ptr %x0, <4 x i1> %2, <4 x i64> %x2, <4 x float> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scatterdiv8.sf(ptr %x0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i64> %x2, <4 x float> %x3, i32 4)
  ret void
}

define dso_local void @test_int_x86_avx512_scatterdiv8_si(ptr %x0, i8 %x1, <4 x i64> %x2, <4 x i32> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv8_si:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpscatterqd %xmm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpscatterqd %xmm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.mask.scatterdiv8.si(ptr %x0, <4 x i1> %2, <4 x i64> %x2, <4 x i32> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scatterdiv8.si(ptr %x0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i64> %x2, <4 x i32> %x3, i32 4)
  ret void
}

define dso_local void @test_int_x86_avx512_scattersiv2_df(ptr %x0, i8 %x1, <4 x i32> %x2, <2 x double> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv2_df:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    kxnorw %k0, %k0, %k2
; CHECK-NEXT:    vscatterdpd %xmm1, (%rdi,%xmm0,2) {%k2}
; CHECK-NEXT:    vscatterdpd %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  call void @llvm.x86.avx512.mask.scattersiv2.df(ptr %x0, <2 x i1> <i1 true, i1 true>, <4 x i32> %x2, <2 x double> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scattersiv2.df(ptr %x0, <2 x i1> %2, <4 x i32> %x2, <2 x double> %x3, i32 4)
  ret void
}

define dso_local void @test_int_x86_avx512_scattersiv2_di(ptr %x0, i8 %x1, <4 x i32> %x2, <2 x i64> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv2_di:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    kxnorw %k0, %k0, %k2
; CHECK-NEXT:    vpscatterdq %xmm1, (%rdi,%xmm0,2) {%k2}
; CHECK-NEXT:    vpscatterdq %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  call void @llvm.x86.avx512.mask.scattersiv2.di(ptr %x0, <2 x i1> <i1 true, i1 true>, <4 x i32> %x2, <2 x i64> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scattersiv2.di(ptr %x0, <2 x i1> %2, <4 x i32> %x2, <2 x i64> %x3, i32 4)
  ret void
}

define dso_local void @test_int_x86_avx512_scattersiv4_df(ptr %x0, i8 %x1, <4 x i32> %x2, <4 x double> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv4_df:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vscatterdpd %ymm1, (%rdi,%xmm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vscatterdpd %ymm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.mask.scattersiv4.df(ptr %x0, <4 x i1> %2, <4 x i32> %x2, <4 x double> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scattersiv4.df(ptr %x0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %x2, <4 x double> %x3, i32 4)
  ret void
}

define dso_local void @test_int_x86_avx512_scattersiv4_di(ptr %x0, i8 %x1, <4 x i32> %x2, <4 x i64> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv4_di:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    kxnorw %k0, %k0, %k2
; CHECK-NEXT:    vpscatterdq %ymm1, (%rdi,%xmm0,2) {%k2}
; CHECK-NEXT:    vpscatterdq %ymm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.mask.scattersiv4.di(ptr %x0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %x2, <4 x i64> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scattersiv4.di(ptr %x0, <4 x i1> %2, <4 x i32> %x2, <4 x i64> %x3, i32 4)
  ret void
}

define dso_local void @test_int_x86_avx512_scattersiv4_sf(ptr %x0, i8 %x1, <4 x i32> %x2, <4 x float> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv4_sf:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vscatterdps %xmm1, (%rdi,%xmm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vscatterdps %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.mask.scattersiv4.sf(ptr %x0, <4 x i1> %2, <4 x i32> %x2, <4 x float> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scattersiv4.sf(ptr %x0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %x2, <4 x float> %x3, i32 4)
  ret void
}

define dso_local void @test_int_x86_avx512_scattersiv4_si(ptr %x0, i8 %x1, <4 x i32> %x2, <4 x i32> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv4_si:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpscatterdd %xmm1, (%rdi,%xmm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpscatterdd %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.mask.scattersiv4.si(ptr %x0, <4 x i1> %2, <4 x i32> %x2, <4 x i32> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scattersiv4.si(ptr %x0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %x2, <4 x i32> %x3, i32 4)
  ret void
}

define dso_local void @test_int_x86_avx512_scattersiv8_sf(ptr %x0, i8 %x1, <8 x i32> %x2, <8 x float> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv8_sf:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vscatterdps %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vscatterdps %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  call void @llvm.x86.avx512.mask.scattersiv8.sf(ptr %x0, <8 x i1> %1, <8 x i32> %x2, <8 x float> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scattersiv8.sf(ptr %x0, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> %x2, <8 x float> %x3, i32 4)
  ret void
}

define dso_local void @test_int_x86_avx512_scattersiv8_si(ptr %x0, i8 %x1, <8 x i32> %x2, <8 x i32> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv8_si:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  call void @llvm.x86.avx512.mask.scattersiv8.si(ptr %x0, <8 x i1> %1, <8 x i32> %x2, <8 x i32> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scattersiv8.si(ptr %x0, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> %x2, <8 x i32> %x3, i32 4)
  ret void
}

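;;
;; Constant-Mask Gather/Scatter
;;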
define dso_local void @scatter_mask_test(ptr %x0, <8 x i32> %x2, <8 x i32> %x3) {
; CHECK-LABEL: scatter_mask_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    kxorw %k0, %k0, %k1
; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    movb $1, %al
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    movb $96, %al
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.mask.scattersiv8.si(ptr %x0, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> %x2, <8 x i32> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scattersiv8.si(ptr %x0, <8 x i1> zeroinitializer, <8 x i32> %x2, <8 x i32> %x3, i32 4)
  call void @llvm.x86.avx512.mask.scattersiv8.si(ptr %x0, <8 x i1> bitcast (<1 x i8> <i8 1> to <8 x i1>), <8 x i32> %x2, <8 x i32> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scattersiv8.si(ptr %x0, <8 x i1> bitcast (<1 x i8> <i8 96> to <8 x i1>), <8 x i32> %x2, <8 x i32> %x3, i32 4)
  ret void
}

define <16 x float> @gather_mask_test(<16 x i32> %ind, <16 x float> %src, ptr %base) {
; CHECK-LABEL: gather_mask_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm2 {%k1}
; CHECK-NEXT:    kxorw %k0, %k0, %k1
; CHECK-NEXT:    vmovaps %zmm1, %zmm3
; CHECK-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm3 {%k1}
; CHECK-NEXT:    vaddps %zmm3, %zmm2, %zmm2
; CHECK-NEXT:    movw $1, %ax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vmovaps %zmm1, %zmm3
; CHECK-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm3 {%k1}
; CHECK-NEXT:    movw $220, %ax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; CHECK-NEXT:    vaddps %zmm3, %zmm1, %zmm0
; CHECK-NEXT:    vaddps %zmm2, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.gather.dps.512(<16 x float> %src, ptr %base, <16 x i32> %ind, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.gather.dps.512(<16 x float> %src, ptr %base, <16 x i32> %ind, <16 x i1> zeroinitializer, i32 4)
  %res2 = call <16 x float> @llvm.x86.avx512.mask.gather.dps.512(<16 x float> %src, ptr %base, <16 x i32> %ind, <16 x i1> bitcast (<1 x i16> <i16 1> to <16 x i1>), i32 4)
  %res3 = call <16 x float> @llvm.x86.avx512.mask.gather.dps.512(<16 x float> %src, ptr %base, <16 x i32> %ind, <16 x i1> bitcast (<1 x i16> <i16 220> to <16 x i1>), i32 4)
  %res4 = fadd <16 x float> %res, %res1
  %res5 = fadd <16 x float> %res3, %res2
  %res6 = fadd <16 x float> %res5, %res4
  ret <16 x float> %res6
}

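; Gather from a global array: the base address folds into the symbol operand.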
@x = dso_local global [1024 x float] zeroinitializer, align 16

define <8 x float> @gather_global(<8 x i64>, ptr nocapture readnone) {
; CHECK-LABEL: gather_global:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vgatherqps x(,%zmm0,4), %ymm1 {%k1}
; CHECK-NEXT:    vmovaps %ymm1, %ymm0
; CHECK-NEXT:    retq
  %3 = tail call <8 x float> @llvm.x86.avx512.mask.gather.qps.512(<8 x float> zeroinitializer, ptr @x, <8 x i64> %0, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
  ret <8 x float> %3
}

declare <16 x float> @llvm.x86.avx512.mask.gather.dps.512(<16 x float>, ptr, <16 x i32>, <16 x i1>, i32)
declare <8 x double> @llvm.x86.avx512.mask.gather.dpd.512(<8 x double>, ptr, <8 x i32>, <8 x i1>, i32)
declare <8 x float> @llvm.x86.avx512.mask.gather.qps.512(<8 x float>, ptr, <8 x i64>, <8 x i1>, i32)
declare <8 x double> @llvm.x86.avx512.mask.gather.qpd.512(<8 x double>, ptr, <8 x i64>, <8 x i1>, i32)
declare <16 x i32> @llvm.x86.avx512.mask.gather.dpi.512(<16 x i32>, ptr, <16 x i32>, <16 x i1>, i32)
declare <8 x i64> @llvm.x86.avx512.mask.gather.dpq.512(<8 x i64>, ptr, <8 x i32>, <8 x i1>, i32)
declare <8 x i32> @llvm.x86.avx512.mask.gather.qpi.512(<8 x i32>, ptr, <8 x i64>, <8 x i1>, i32)
declare <8 x i64> @llvm.x86.avx512.mask.gather.qpq.512(<8 x i64>, ptr, <8 x i64>, <8 x i1>, i32)
declare <2 x double> @llvm.x86.avx512.mask.gather3div2.df(<2 x double>, ptr, <2 x i64>, <2 x i1>, i32)
declare <2 x i64> @llvm.x86.avx512.mask.gather3div2.di(<2 x i64>, ptr, <2 x i64>, <2 x i1>, i32)
declare <4 x double> @llvm.x86.avx512.mask.gather3div4.df(<4 x double>, ptr, <4 x i64>, <4 x i1>, i32)
declare <4 x i64> @llvm.x86.avx512.mask.gather3div4.di(<4 x i64>, ptr, <4 x i64>, <4 x i1>, i32)
declare <4 x float> @llvm.x86.avx512.mask.gather3div4.sf(<4 x float>, ptr, <2 x i64>, <2 x i1>, i32)
declare <4 x i32> @llvm.x86.avx512.mask.gather3div4.si(<4 x i32>, ptr, <2 x i64>, <2 x i1>, i32)
declare <4 x float> @llvm.x86.avx512.mask.gather3div8.sf(<4 x float>, ptr, <4 x i64>, <4 x i1>, i32)
declare <4 x i32> @llvm.x86.avx512.mask.gather3div8.si(<4 x i32>, ptr, <4 x i64>, <4 x i1>, i32)
declare <2 x double> @llvm.x86.avx512.mask.gather3siv2.df(<2 x double>, ptr, <4 x i32>, <2 x i1>, i32)
declare <2 x i64> @llvm.x86.avx512.mask.gather3siv2.di(<2 x i64>, ptr, <4 x i32>, <2 x i1>, i32)
declare <4 x double> @llvm.x86.avx512.mask.gather3siv4.df(<4 x double>, ptr, <4 x i32>, <4 x i1>, i32)
declare <4 x i64> @llvm.x86.avx512.mask.gather3siv4.di(<4 x i64>, ptr, <4 x i32>, <4 x i1>, i32)
declare <4 x float> @llvm.x86.avx512.mask.gather3siv4.sf(<4 x float>, ptr, <4 x i32>, <4 x i1>, i32)
declare <4 x i32> @llvm.x86.avx512.mask.gather3siv4.si(<4 x i32>, ptr, <4 x i32>, <4 x i1>, i32)
declare <8 x float> @llvm.x86.avx512.mask.gather3siv8.sf(<8 x float>, ptr, <8 x i32>, <8 x i1>, i32)
declare <8 x i32> @llvm.x86.avx512.mask.gather3siv8.si(<8 x i32>, ptr, <8 x i32>, <8 x i1>, i32)
declare void @llvm.x86.avx512.mask.scatter.dps.512(ptr, <16 x i1>, <16 x i32>, <16 x float>, i32)
declare void @llvm.x86.avx512.mask.scatter.dpd.512(ptr, <8 x i1>, <8 x i32>, <8 x double>, i32)
declare void @llvm.x86.avx512.mask.scatter.qps.512(ptr, <8 x i1>, <8 x i64>, <8 x float>, i32)
declare void @llvm.x86.avx512.mask.scatter.qpd.512(ptr, <8 x i1>, <8 x i64>, <8 x double>, i32)
declare void @llvm.x86.avx512.mask.scatter.dpi.512(ptr, <16 x i1>, <16 x i32>, <16 x i32>, i32)
declare void @llvm.x86.avx512.mask.scatter.dpq.512(ptr, <8 x i1>, <8 x i32>, <8 x i64>, i32)
declare void @llvm.x86.avx512.mask.scatter.qpi.512(ptr, <8 x i1>, <8 x i64>, <8 x i32>, i32)
declare void @llvm.x86.avx512.mask.scatter.qpq.512(ptr, <8 x i1>, <8 x i64>, <8 x i64>, i32)
declare void @llvm.x86.avx512.mask.scatterdiv2.df(ptr, <2 x i1>, <2 x i64>, <2 x double>, i32)
declare void @llvm.x86.avx512.mask.scatterdiv2.di(ptr, <2 x i1>, <2 x i64>, <2 x i64>, i32)
declare void @llvm.x86.avx512.mask.scatterdiv4.df(ptr, <4 x i1>, <4 x i64>, <4 x double>, i32)
declare void @llvm.x86.avx512.mask.scatterdiv4.di(ptr, <4 x i1>, <4 x i64>, <4 x i64>, i32)
declare void @llvm.x86.avx512.mask.scatterdiv4.sf(ptr, <2 x i1>, <2 x i64>, <4 x float>, i32)
declare void @llvm.x86.avx512.mask.scatterdiv4.si(ptr, <2 x i1>, <2 x i64>, <4 x i32>, i32)
declare void @llvm.x86.avx512.mask.scatterdiv8.sf(ptr, <4 x i1>, <4 x i64>, <4 x float>, i32)
declare void @llvm.x86.avx512.mask.scatterdiv8.si(ptr, <4 x i1>, <4 x i64>, <4 x i32>, i32)
declare void @llvm.x86.avx512.mask.scattersiv2.df(ptr, <2 x i1>, <4 x i32>, <2 x double>, i32)
declare void @llvm.x86.avx512.mask.scattersiv2.di(ptr, <2 x i1>, <4 x i32>, <2 x i64>, i32)
declare void @llvm.x86.avx512.mask.scattersiv4.df(ptr, <4 x i1>, <4 x i32>, <4 x double>, i32)
declare void @llvm.x86.avx512.mask.scattersiv4.di(ptr, <4 x i1>, <4 x i32>, <4 x i64>, i32)
declare void @llvm.x86.avx512.mask.scattersiv4.sf(ptr, <4 x i1>, <4 x i32>, <4 x float>, i32)
declare void @llvm.x86.avx512.mask.scattersiv4.si(ptr, <4 x i1>, <4 x i32>, <4 x i32>, i32)
declare void @llvm.x86.avx512.mask.scattersiv8.sf(ptr, <8 x i1>, <8 x i32>, <8 x float>, i32)
declare void @llvm.x86.avx512.mask.scattersiv8.si(ptr, <8 x i1>, <8 x i32>, <8 x i32>, i32)