; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq | FileCheck %s
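;
; These are the deprecated forms of the AVX-512 gather/scatter intrinsics,
; which take the write-mask as a scalar i8/i16 bitmask rather than a vector
; of i1. The final i32 operand is the element scale (2, 4 or 8 below).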

declare <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float>, ptr, <16 x i32>, i16, i32)
declare void @llvm.x86.avx512.scatter.dps.512 (ptr, i16, <16 x i32>, <16 x float>, i32)
declare <8 x double> @llvm.x86.avx512.gather.dpd.512 (<8 x double>, ptr, <8 x i32>, i8, i32)
declare void @llvm.x86.avx512.scatter.dpd.512 (ptr, i8, <8 x i32>, <8 x double>, i32)

declare <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float>, ptr, <8 x i64>, i8, i32)
declare void @llvm.x86.avx512.scatter.qps.512 (ptr, i8, <8 x i64>, <8 x float>, i32)
declare <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double>, ptr, <8 x i64>, i8, i32)
declare void @llvm.x86.avx512.scatter.qpd.512 (ptr, i8, <8 x i64>, <8 x double>, i32)

define void @gather_mask_dps(<16 x i32> %ind, <16 x float> %src, i16 %mask, ptr %base, ptr %stbuf)  {
; CHECK-LABEL: gather_mask_dps:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vgatherdps (%rsi,%zmm0,4), %zmm1 {%k2}
; CHECK-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    vscatterdps %zmm1, (%rdx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, ptr %base, <16 x i32>%ind, i16 %mask, i32 4)
  %ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.scatter.dps.512 (ptr %stbuf, i16 %mask, <16 x i32>%ind2, <16 x float> %x, i32 4)
  ret void
}

define void @gather_mask_dpd(<8 x i32> %ind, <8 x double> %src, i8 %mask, ptr %base, ptr %stbuf)  {
; CHECK-LABEL: gather_mask_dpd:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vgatherdpd (%rsi,%ymm0,4), %zmm1 {%k2}
; CHECK-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    vscatterdpd %zmm1, (%rdx,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = call <8 x double> @llvm.x86.avx512.gather.dpd.512 (<8 x double> %src, ptr %base, <8 x i32>%ind, i8 %mask, i32 4)
  %ind2 = add <8 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.scatter.dpd.512 (ptr %stbuf, i8 %mask, <8 x i32>%ind2, <8 x double> %x, i32 4)
  ret void
}

define void @gather_mask_qps(<8 x i64> %ind, <8 x float> %src, i8 %mask, ptr %base, ptr %stbuf)  {
; CHECK-LABEL: gather_mask_qps:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vgatherqps (%rsi,%zmm0,4), %ymm1 {%k2}
; CHECK-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, ptr %base, <8 x i64>%ind, i8 %mask, i32 4)
  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
  call void @llvm.x86.avx512.scatter.qps.512 (ptr %stbuf, i8 %mask, <8 x i64>%ind2, <8 x float> %x, i32 4)
  ret void
}

define void @gather_mask_qpd(<8 x i64> %ind, <8 x double> %src, i8 %mask, ptr %base, ptr %stbuf)  {
; CHECK-LABEL: gather_mask_qpd:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vgatherqpd (%rsi,%zmm0,4), %zmm1 {%k2}
; CHECK-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    vscatterqpd %zmm1, (%rdx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = call <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double> %src, ptr %base, <8 x i64>%ind, i8 %mask, i32 4)
  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
  call void @llvm.x86.avx512.scatter.qpd.512 (ptr %stbuf, i8 %mask, <8 x i64>%ind2, <8 x double> %x, i32 4)
  ret void
}
;;
;; Integer Gather/Scatter
;;
declare <16 x i32> @llvm.x86.avx512.gather.dpi.512 (<16 x i32>, ptr, <16 x i32>, i16, i32)
declare void @llvm.x86.avx512.scatter.dpi.512 (ptr, i16, <16 x i32>, <16 x i32>, i32)
declare <8 x i64> @llvm.x86.avx512.gather.dpq.512 (<8 x i64>, ptr, <8 x i32>, i8, i32)
declare void @llvm.x86.avx512.scatter.dpq.512 (ptr, i8, <8 x i32>, <8 x i64>, i32)

declare <8 x i32> @llvm.x86.avx512.gather.qpi.512 (<8 x i32>, ptr, <8 x i64>, i8, i32)
declare void @llvm.x86.avx512.scatter.qpi.512 (ptr, i8, <8 x i64>, <8 x i32>, i32)
declare <8 x i64> @llvm.x86.avx512.gather.qpq.512 (<8 x i64>, ptr, <8 x i64>, i8, i32)
declare void @llvm.x86.avx512.scatter.qpq.512 (ptr, i8, <8 x i64>, <8 x i64>, i32)

define void @gather_mask_dd(<16 x i32> %ind, <16 x i32> %src, i16 %mask, ptr %base, ptr %stbuf)  {
; CHECK-LABEL: gather_mask_dd:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vpgatherdd (%rsi,%zmm0,4), %zmm1 {%k2}
; CHECK-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    vpscatterdd %zmm1, (%rdx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = call <16 x i32> @llvm.x86.avx512.gather.dpi.512 (<16 x i32> %src, ptr %base, <16 x i32>%ind, i16 %mask, i32 4)
  %ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.scatter.dpi.512 (ptr %stbuf, i16 %mask, <16 x i32>%ind2, <16 x i32> %x, i32 4)
  ret void
}

define void @gather_mask_qd(<8 x i64> %ind, <8 x i32> %src, i8 %mask, ptr %base, ptr %stbuf)  {
; CHECK-LABEL: gather_mask_qd:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vpgatherqd (%rsi,%zmm0,4), %ymm1 {%k2}
; CHECK-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = call <8 x i32> @llvm.x86.avx512.gather.qpi.512 (<8 x i32> %src, ptr %base, <8 x i64>%ind, i8 %mask, i32 4)
  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
  call void @llvm.x86.avx512.scatter.qpi.512 (ptr %stbuf, i8 %mask, <8 x i64>%ind2, <8 x i32> %x, i32 4)
  ret void
}

define void @gather_mask_qq(<8 x i64> %ind, <8 x i64> %src, i8 %mask, ptr %base, ptr %stbuf)  {
; CHECK-LABEL: gather_mask_qq:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vpgatherqq (%rsi,%zmm0,4), %zmm1 {%k2}
; CHECK-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    vpscatterqq %zmm1, (%rdx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = call <8 x i64> @llvm.x86.avx512.gather.qpq.512 (<8 x i64> %src, ptr %base, <8 x i64>%ind, i8 %mask, i32 4)
  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
  call void @llvm.x86.avx512.scatter.qpq.512 (ptr %stbuf, i8 %mask, <8 x i64>%ind2, <8 x i64> %x, i32 4)
  ret void
}

define void @gather_mask_dq(<8 x i32> %ind, <8 x i64> %src, i8 %mask, ptr %base, ptr %stbuf)  {
; CHECK-LABEL: gather_mask_dq:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vpgatherdq (%rsi,%ymm0,4), %zmm1 {%k2}
; CHECK-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    vpscatterdq %zmm1, (%rdx,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = call <8 x i64> @llvm.x86.avx512.gather.dpq.512 (<8 x i64> %src, ptr %base, <8 x i32>%ind, i8 %mask, i32 4)
  %ind2 = add <8 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.scatter.dpq.512 (ptr %stbuf, i8 %mask, <8 x i32>%ind2, <8 x i64> %x, i32 4)
  ret void
}
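; The *_execdomain tests check that a floating-point gather/scatter keeps the
; surrounding moves in the FP execution domain (vmovaps/vmovapd rather than
; the integer-domain vmovdqa forms).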
define void @gather_mask_dpd_execdomain(<8 x i32> %ind, <8 x double> %src, i8 %mask, ptr %base, ptr %stbuf)  {
; CHECK-LABEL: gather_mask_dpd_execdomain:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vgatherdpd (%rsi,%ymm0,4), %zmm1 {%k1}
; CHECK-NEXT:    vmovapd %zmm1, (%rdx)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = call <8 x double> @llvm.x86.avx512.gather.dpd.512 (<8 x double> %src, ptr %base, <8 x i32>%ind, i8 %mask, i32 4)
  store <8 x double> %x, ptr %stbuf
  ret void
}

define void @gather_mask_qpd_execdomain(<8 x i64> %ind, <8 x double> %src, i8 %mask, ptr %base, ptr %stbuf)  {
; CHECK-LABEL: gather_mask_qpd_execdomain:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vgatherqpd (%rsi,%zmm0,4), %zmm1 {%k1}
; CHECK-NEXT:    vmovapd %zmm1, (%rdx)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = call <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double> %src, ptr %base, <8 x i64>%ind, i8 %mask, i32 4)
  store <8 x double> %x, ptr %stbuf
  ret void
}

define <16 x float> @gather_mask_dps_execdomain(<16 x i32> %ind, <16 x float> %src, i16 %mask, ptr %base)  {
; CHECK-LABEL: gather_mask_dps_execdomain:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vgatherdps (%rsi,%zmm0,4), %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, ptr %base, <16 x i32>%ind, i16 %mask, i32 4)
  ret <16 x float> %res
}

define <8 x float> @gather_mask_qps_execdomain(<8 x i64> %ind, <8 x float> %src, i8 %mask, ptr %base)  {
; CHECK-LABEL: gather_mask_qps_execdomain:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vgatherqps (%rsi,%zmm0,4), %ymm1 {%k1}
; CHECK-NEXT:    vmovaps %ymm1, %ymm0
; CHECK-NEXT:    retq
  %res = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, ptr %base, <8 x i64>%ind, i8 %mask, i32 4)
  ret <8 x float> %res
}

define void @scatter_mask_dpd_execdomain(<8 x i32> %ind, ptr %src, i8 %mask, ptr %base, ptr %stbuf)  {
; CHECK-LABEL: scatter_mask_dpd_execdomain:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovapd (%rdi), %zmm1
; CHECK-NEXT:    vscatterdpd %zmm1, (%rcx,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = load <8 x double>, ptr %src, align 64
  call void @llvm.x86.avx512.scatter.dpd.512 (ptr %stbuf, i8 %mask, <8 x i32>%ind, <8 x double> %x, i32 4)
  ret void
}

define void @scatter_mask_qpd_execdomain(<8 x i64> %ind, ptr %src, i8 %mask, ptr %base, ptr %stbuf)  {
; CHECK-LABEL: scatter_mask_qpd_execdomain:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovapd (%rdi), %zmm1
; CHECK-NEXT:    vscatterqpd %zmm1, (%rcx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = load <8 x double>, ptr %src, align 64
  call void @llvm.x86.avx512.scatter.qpd.512 (ptr %stbuf, i8 %mask, <8 x i64>%ind, <8 x double> %x, i32 4)
  ret void
}

define void @scatter_mask_dps_execdomain(<16 x i32> %ind, ptr %src, i16 %mask, ptr %base, ptr %stbuf)  {
; CHECK-LABEL: scatter_mask_dps_execdomain:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovaps (%rdi), %zmm1
; CHECK-NEXT:    vscatterdps %zmm1, (%rcx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = load <16 x float>, ptr %src, align 64
  call void @llvm.x86.avx512.scatter.dps.512 (ptr %stbuf, i16 %mask, <16 x i32>%ind, <16 x float> %x, i32 4)
  ret void
}

define void @scatter_mask_qps_execdomain(<8 x i64> %ind, ptr %src, i8 %mask, ptr %base, ptr %stbuf)  {
; CHECK-LABEL: scatter_mask_qps_execdomain:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovaps (%rdi), %ymm1
; CHECK-NEXT:    vscatterqps %ymm1, (%rcx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = load <8 x float>, ptr %src, align 32
  call void @llvm.x86.avx512.scatter.qps.512 (ptr %stbuf, i8 %mask, <8 x i64>%ind, <8 x float> %x, i32 4)
  ret void
}
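; With a constant all-ones mask (i8 -1) no GPR transfer is needed; the mask
; register is set with kxnorw instead of kmovd.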
define void @gather_qps(<8 x i64> %ind, <8 x float> %src, ptr %base, ptr %stbuf)  {
; CHECK-LABEL: gather_qps:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    kxnorw %k0, %k0, %k2
; CHECK-NEXT:    vgatherqps (%rdi,%zmm0,4), %ymm1 {%k2}
; CHECK-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    vscatterqps %ymm1, (%rsi,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, ptr %base, <8 x i64>%ind, i8 -1, i32 4)
  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
  call void @llvm.x86.avx512.scatter.qps.512 (ptr %stbuf, i8 -1, <8 x i64>%ind2, <8 x float> %x, i32 4)
  ret void
}
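;;
;; 128-bit and 256-bit Gather (AVX512VL forms)
;;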
declare <2 x double> @llvm.x86.avx512.gather3div2.df(<2 x double>, ptr, <2 x i64>, i8, i32)

define <2 x double>@test_int_x86_avx512_gather3div2_df(<2 x double> %x0, ptr %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3div2_df:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vgatherqpd (%rdi,%xmm1,4), %xmm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherqpd (%rdi,%xmm1,2), %xmm2 {%k1}
; CHECK-NEXT:    vaddpd %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.gather3div2.df(<2 x double> %x0, ptr %x1, <2 x i64> %x2, i8 %x3, i32 4)
  %res1 = call <2 x double> @llvm.x86.avx512.gather3div2.df(<2 x double> %x0, ptr %x1, <2 x i64> %x2, i8 -1, i32 2)
  %res2 = fadd <2 x double> %res, %res1
  ret <2 x double> %res2
}

declare <2 x i64> @llvm.x86.avx512.gather3div2.di(<2 x i64>, ptr, <2 x i64>, i8, i32)

define <2 x i64>@test_int_x86_avx512_gather3div2_di(<2 x i64> %x0, ptr %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3div2_di:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpgatherqq (%rdi,%xmm1,8), %xmm0 {%k1}
; CHECK-NEXT:    vpaddq %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <2 x i64> @llvm.x86.avx512.gather3div2.di(<2 x i64> %x0, ptr %x1, <2 x i64> %x2, i8 %x3, i32 8)
  %res1 = call <2 x i64> @llvm.x86.avx512.gather3div2.di(<2 x i64> %x0, ptr %x1, <2 x i64> %x2, i8 %x3, i32 8)
  %res2 = add <2 x i64> %res, %res1
  ret <2 x i64> %res2
}

declare <4 x double> @llvm.x86.avx512.gather3div4.df(<4 x double>, ptr, <4 x i64>, i8, i32)

define <4 x double>@test_int_x86_avx512_gather3div4_df(<4 x double> %x0, ptr %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3div4_df:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vgatherqpd (%rdi,%ymm1,4), %ymm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherqpd (%rdi,%ymm1,2), %ymm2 {%k1}
; CHECK-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %res = call <4 x double> @llvm.x86.avx512.gather3div4.df(<4 x double> %x0, ptr %x1, <4 x i64> %x2, i8 %x3, i32 4)
  %res1 = call <4 x double> @llvm.x86.avx512.gather3div4.df(<4 x double> %x0, ptr %x1, <4 x i64> %x2, i8 -1, i32 2)
  %res2 = fadd <4 x double> %res, %res1
  ret <4 x double> %res2
}

declare <4 x i64> @llvm.x86.avx512.gather3div4.di(<4 x i64>, ptr, <4 x i64>, i8, i32)

define <4 x i64>@test_int_x86_avx512_gather3div4_di(<4 x i64> %x0, ptr %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3div4_di:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpgatherqq (%rdi,%ymm1,8), %ymm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vpgatherqq (%rdi,%ymm1,8), %ymm2 {%k1}
; CHECK-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %res = call <4 x i64> @llvm.x86.avx512.gather3div4.di(<4 x i64> %x0, ptr %x1, <4 x i64> %x2, i8 %x3, i32 8)
  %res1 = call <4 x i64> @llvm.x86.avx512.gather3div4.di(<4 x i64> %x0, ptr %x1, <4 x i64> %x2, i8 -1, i32 8)
  %res2 = add <4 x i64> %res, %res1
  ret <4 x i64> %res2
}

declare <4 x float> @llvm.x86.avx512.gather3div4.sf(<4 x float>, ptr, <2 x i64>, i8, i32)

define <4 x float>@test_int_x86_avx512_gather3div4_sf(<4 x float> %x0, ptr %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3div4_sf:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vgatherqps (%rdi,%xmm1,4), %xmm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherqps (%rdi,%xmm1,2), %xmm2 {%k1}
; CHECK-NEXT:    vaddps %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.gather3div4.sf(<4 x float> %x0, ptr %x1, <2 x i64> %x2, i8 %x3, i32 4)
  %res1 = call <4 x float> @llvm.x86.avx512.gather3div4.sf(<4 x float> %x0, ptr %x1, <2 x i64> %x2, i8 -1, i32 2)
  %res2 = fadd <4 x float> %res, %res1
  ret <4 x float> %res2
}

declare <4 x i32> @llvm.x86.avx512.gather3div4.si(<4 x i32>, ptr, <2 x i64>, i8, i32)

define <4 x i32>@test_int_x86_avx512_gather3div4_si(<4 x i32> %x0, ptr %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3div4_si:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    kxnorw %k0, %k0, %k2
; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vpgatherqd (%rdi,%xmm1,4), %xmm2 {%k2}
; CHECK-NEXT:    vpgatherqd (%rdi,%xmm1,4), %xmm0 {%k1}
; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x i32> @llvm.x86.avx512.gather3div4.si(<4 x i32> %x0, ptr %x1, <2 x i64> %x2, i8 -1, i32 4)
  %res1 = call <4 x i32> @llvm.x86.avx512.gather3div4.si(<4 x i32> %x0, ptr %x1, <2 x i64> %x2, i8 %x3, i32 4)
  %res2 = add <4 x i32> %res, %res1
  ret <4 x i32> %res2
}

declare <4 x float> @llvm.x86.avx512.gather3div8.sf(<4 x float>, ptr, <4 x i64>, i8, i32)

define <4 x float>@test_int_x86_avx512_gather3div8_sf(<4 x float> %x0, ptr %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3div8_sf:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vgatherqps (%rdi,%ymm1,4), %xmm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherqps (%rdi,%ymm1,2), %xmm2 {%k1}
; CHECK-NEXT:    vaddps %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.gather3div8.sf(<4 x float> %x0, ptr %x1, <4 x i64> %x2, i8 %x3, i32 4)
  %res1 = call <4 x float> @llvm.x86.avx512.gather3div8.sf(<4 x float> %x0, ptr %x1, <4 x i64> %x2, i8 -1, i32 2)
  %res2 = fadd <4 x float> %res, %res1
  ret <4 x float> %res2
}

declare <4 x i32> @llvm.x86.avx512.gather3div8.si(<4 x i32>, ptr, <4 x i64>, i8, i32)

define <4 x i32>@test_int_x86_avx512_gather3div8_si(<4 x i32> %x0, ptr %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3div8_si:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovdqa %xmm0, %xmm2
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vpgatherqd (%rdi,%ymm1,4), %xmm2 {%k2}
; CHECK-NEXT:    vpgatherqd (%rdi,%ymm1,2), %xmm0 {%k1}
; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %res = call <4 x i32> @llvm.x86.avx512.gather3div8.si(<4 x i32> %x0, ptr %x1, <4 x i64> %x2, i8 %x3, i32 4)
  %res1 = call <4 x i32> @llvm.x86.avx512.gather3div8.si(<4 x i32> %x0, ptr %x1, <4 x i64> %x2, i8 %x3, i32 2)
  %res2 = add <4 x i32> %res, %res1
  ret <4 x i32> %res2
}

declare <2 x double> @llvm.x86.avx512.gather3siv2.df(<2 x double>, ptr, <4 x i32>, i8, i32)

define <2 x double>@test_int_x86_avx512_gather3siv2_df(<2 x double> %x0, ptr %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3siv2_df:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vgatherdpd (%rdi,%xmm1,4), %xmm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherdpd (%rdi,%xmm1,2), %xmm2 {%k1}
; CHECK-NEXT:    vaddpd %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.gather3siv2.df(<2 x double> %x0, ptr %x1, <4 x i32> %x2, i8 %x3, i32 4)
  %res1 = call <2 x double> @llvm.x86.avx512.gather3siv2.df(<2 x double> %x0, ptr %x1, <4 x i32> %x2, i8 -1, i32 2)
  %res2 = fadd <2 x double> %res, %res1
  ret <2 x double> %res2
}

declare <2 x i64> @llvm.x86.avx512.gather3siv2.di(<2 x i64>, ptr, <4 x i32>, i8, i32)

define <2 x i64>@test_int_x86_avx512_gather3siv2_di(<2 x i64> %x0, ptr %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3siv2_di:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpgatherdq (%rdi,%xmm1,8), %xmm0 {%k1}
; CHECK-NEXT:    vpaddq %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <2 x i64> @llvm.x86.avx512.gather3siv2.di(<2 x i64> %x0, ptr %x1, <4 x i32> %x2, i8 %x3, i32 8)
  %res1 = call <2 x i64> @llvm.x86.avx512.gather3siv2.di(<2 x i64> %x0, ptr %x1, <4 x i32> %x2, i8 %x3, i32 8)
  %res2 = add <2 x i64> %res, %res1
  ret <2 x i64> %res2
}

declare <4 x double> @llvm.x86.avx512.gather3siv4.df(<4 x double>, ptr, <4 x i32>, i8, i32)

define <4 x double>@test_int_x86_avx512_gather3siv4_df(<4 x double> %x0, ptr %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3siv4_df:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vgatherdpd (%rdi,%xmm1,4), %ymm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherdpd (%rdi,%xmm1,2), %ymm2 {%k1}
; CHECK-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %res = call <4 x double> @llvm.x86.avx512.gather3siv4.df(<4 x double> %x0, ptr %x1, <4 x i32> %x2, i8 %x3, i32 4)
  %res1 = call <4 x double> @llvm.x86.avx512.gather3siv4.df(<4 x double> %x0, ptr %x1, <4 x i32> %x2, i8 -1, i32 2)
  %res2 = fadd <4 x double> %res, %res1
  ret <4 x double> %res2
}

declare <4 x i64> @llvm.x86.avx512.gather3siv4.di(<4 x i64>, ptr, <4 x i32>, i8, i32)

define <4 x i64>@test_int_x86_avx512_gather3siv4_di(<4 x i64> %x0, ptr %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3siv4_di:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpgatherdq (%rdi,%xmm1,8), %ymm0 {%k1}
; CHECK-NEXT:    vpaddq %ymm0, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %res = call <4 x i64> @llvm.x86.avx512.gather3siv4.di(<4 x i64> %x0, ptr %x1, <4 x i32> %x2, i8 %x3, i32 8)
  %res1 = call <4 x i64> @llvm.x86.avx512.gather3siv4.di(<4 x i64> %x0, ptr %x1, <4 x i32> %x2, i8 %x3, i32 8)
  %res2 = add <4 x i64> %res, %res1
  ret <4 x i64> %res2
}

declare <4 x float> @llvm.x86.avx512.gather3siv4.sf(<4 x float>, ptr, <4 x i32>, i8, i32)

define <4 x float>@test_int_x86_avx512_gather3siv4_sf(<4 x float> %x0, ptr %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3siv4_sf:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vgatherdps (%rdi,%xmm1,4), %xmm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherdps (%rdi,%xmm1,2), %xmm2 {%k1}
; CHECK-NEXT:    vaddps %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.gather3siv4.sf(<4 x float> %x0, ptr %x1, <4 x i32> %x2, i8 %x3, i32 4)
  %res1 = call <4 x float> @llvm.x86.avx512.gather3siv4.sf(<4 x float> %x0, ptr %x1, <4 x i32> %x2, i8 -1, i32 2)
  %res2 = fadd <4 x float> %res, %res1
  ret <4 x float> %res2
}

declare <4 x i32> @llvm.x86.avx512.gather3siv4.si(<4 x i32>, ptr, <4 x i32>, i8, i32)

define <4 x i32>@test_int_x86_avx512_gather3siv4_si(<4 x i32> %x0, ptr %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3siv4_si:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    kxnorw %k0, %k0, %k2
; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vpgatherdd (%rdi,%xmm1,4), %xmm2 {%k2}
; CHECK-NEXT:    vpgatherdd (%rdi,%xmm1,2), %xmm0 {%k1}
; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x i32> @llvm.x86.avx512.gather3siv4.si(<4 x i32> %x0, ptr %x1, <4 x i32> %x2, i8 -1, i32 4)
  %res1 = call <4 x i32> @llvm.x86.avx512.gather3siv4.si(<4 x i32> %x0, ptr %x1, <4 x i32> %x2, i8 %x3, i32 2)
  %res2 = add <4 x i32> %res, %res1
  ret <4 x i32> %res2
}

declare <8 x float> @llvm.x86.avx512.gather3siv8.sf(<8 x float>, ptr, <8 x i32>, i8, i32)

define <8 x float>@test_int_x86_avx512_gather3siv8_sf(<8 x float> %x0, ptr %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3siv8_sf:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherdps (%rdi,%ymm1,2), %ymm2 {%k1}
; CHECK-NEXT:    vaddps %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %res = call <8 x float> @llvm.x86.avx512.gather3siv8.sf(<8 x float> %x0, ptr %x1, <8 x i32> %x2, i8 %x3, i32 4)
  %res1 = call <8 x float> @llvm.x86.avx512.gather3siv8.sf(<8 x float> %x0, ptr %x1, <8 x i32> %x2, i8 -1, i32 2)
  %res2 = fadd <8 x float> %res, %res1
  ret <8 x float> %res2
}

declare <8 x i32> @llvm.x86.avx512.gather3siv8.si(<8 x i32>, ptr, <8 x i32>, i8, i32)

define <8 x i32>@test_int_x86_avx512_gather3siv8_si(<8 x i32> %x0, ptr %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3siv8_si:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovdqa %ymm0, %ymm2
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vpgatherdd (%rdi,%ymm1,4), %ymm2 {%k2}
; CHECK-NEXT:    vpgatherdd (%rdi,%ymm1,2), %ymm0 {%k1}
; CHECK-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
; CHECK-NEXT:    retq
  %res = call <8 x i32> @llvm.x86.avx512.gather3siv8.si(<8 x i32> %x0, ptr %x1, <8 x i32> %x2, i8 %x3, i32 4)
  %res1 = call <8 x i32> @llvm.x86.avx512.gather3siv8.si(<8 x i32> %x0, ptr %x1, <8 x i32> %x2, i8 %x3, i32 2)
  %res2 = add <8 x i32> %res, %res1
  ret <8 x i32> %res2
}
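;;
;; 128-bit and 256-bit Scatter (AVX512VL forms)
;;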
declare void @llvm.x86.avx512.scatterdiv2.df(ptr, i8, <2 x i64>, <2 x double>, i32)

define void@test_int_x86_avx512_scatterdiv2_df(ptr %x0, i8 %x1, <2 x i64> %x2, <2 x double> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv2_df:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    kxnorw %k0, %k0, %k2
; CHECK-NEXT:    vscatterqpd %xmm1, (%rdi,%xmm0,2) {%k2}
; CHECK-NEXT:    vscatterqpd %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scatterdiv2.df(ptr %x0, i8 -1, <2 x i64> %x2, <2 x double> %x3, i32 2)
  call void @llvm.x86.avx512.scatterdiv2.df(ptr %x0, i8 %x1, <2 x i64> %x2, <2 x double> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scatterdiv2.di(ptr, i8, <2 x i64>, <2 x i64>, i32)

define void@test_int_x86_avx512_scatterdiv2_di(ptr %x0, i8 %x1, <2 x i64> %x2, <2 x i64> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv2_di:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpscatterqq %xmm1, (%rdi,%xmm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpscatterqq %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scatterdiv2.di(ptr %x0, i8 %x1, <2 x i64> %x2, <2 x i64> %x3, i32 2)
  call void @llvm.x86.avx512.scatterdiv2.di(ptr %x0, i8 -1, <2 x i64> %x2, <2 x i64> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scatterdiv4.df(ptr, i8, <4 x i64>, <4 x double>, i32)

define void@test_int_x86_avx512_scatterdiv4_df(ptr %x0, i8 %x1, <4 x i64> %x2, <4 x double> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_df:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vscatterqpd %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vscatterqpd %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scatterdiv4.df(ptr %x0, i8 %x1, <4 x i64> %x2, <4 x double> %x3, i32 2)
  call void @llvm.x86.avx512.scatterdiv4.df(ptr %x0, i8 -1, <4 x i64> %x2, <4 x double> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scatterdiv4.di(ptr, i8, <4 x i64>, <4 x i64>, i32)

define void@test_int_x86_avx512_scatterdiv4_di(ptr %x0, i8 %x1, <4 x i64> %x2, <4 x i64> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_di:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpscatterqq %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpscatterqq %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scatterdiv4.di(ptr %x0, i8 %x1, <4 x i64> %x2, <4 x i64> %x3, i32 2)
  call void @llvm.x86.avx512.scatterdiv4.di(ptr %x0, i8 -1, <4 x i64> %x2, <4 x i64> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scatterdiv4.sf(ptr, i8, <2 x i64>, <4 x float>, i32)

define void@test_int_x86_avx512_scatterdiv4_sf(ptr %x0, i8 %x1, <2 x i64> %x2, <4 x float> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_sf:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vscatterqps %xmm1, (%rdi,%xmm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vscatterqps %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scatterdiv4.sf(ptr %x0, i8 %x1, <2 x i64> %x2, <4 x float> %x3, i32 2)
  call void @llvm.x86.avx512.scatterdiv4.sf(ptr %x0, i8 -1, <2 x i64> %x2, <4 x float> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scatterdiv4.si(ptr, i8, <2 x i64>, <4 x i32>, i32)

define void@test_int_x86_avx512_scatterdiv4_si(ptr %x0, i8 %x1, <2 x i64> %x2, <4 x i32> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_si:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    kxnorw %k0, %k0, %k2
; CHECK-NEXT:    vpscatterqd %xmm1, (%rdi,%xmm0,2) {%k2}
; CHECK-NEXT:    vpscatterqd %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scatterdiv4.si(ptr %x0, i8 -1, <2 x i64> %x2, <4 x i32> %x3, i32 2)
  call void @llvm.x86.avx512.scatterdiv4.si(ptr %x0, i8 %x1, <2 x i64> %x2, <4 x i32> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scatterdiv8.sf(ptr, i8, <4 x i64>, <4 x float>, i32)

define void@test_int_x86_avx512_scatterdiv8_sf(ptr %x0, i8 %x1, <4 x i64> %x2, <4 x float> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv8_sf:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vscatterqps %xmm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vscatterqps %xmm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scatterdiv8.sf(ptr %x0, i8 %x1, <4 x i64> %x2, <4 x float> %x3, i32 2)
  call void @llvm.x86.avx512.scatterdiv8.sf(ptr %x0, i8 -1, <4 x i64> %x2, <4 x float> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scatterdiv8.si(ptr, i8, <4 x i64>, <4 x i32>, i32)

define void@test_int_x86_avx512_scatterdiv8_si(ptr %x0, i8 %x1, <4 x i64> %x2, <4 x i32> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv8_si:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpscatterqd %xmm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpscatterqd %xmm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scatterdiv8.si(ptr %x0, i8 %x1, <4 x i64> %x2, <4 x i32> %x3, i32 2)
  call void @llvm.x86.avx512.scatterdiv8.si(ptr %x0, i8 -1, <4 x i64> %x2, <4 x i32> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scattersiv2.df(ptr, i8, <4 x i32>, <2 x double>, i32)

define void@test_int_x86_avx512_scattersiv2_df(ptr %x0, i8 %x1, <4 x i32> %x2, <2 x double> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv2_df:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    kxnorw %k0, %k0, %k2
; CHECK-NEXT:    vscatterdpd %xmm1, (%rdi,%xmm0,2) {%k2}
; CHECK-NEXT:    vscatterdpd %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scattersiv2.df(ptr %x0, i8 -1, <4 x i32> %x2, <2 x double> %x3, i32 2)
  call void @llvm.x86.avx512.scattersiv2.df(ptr %x0, i8 %x1, <4 x i32> %x2, <2 x double> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scattersiv2.di(ptr, i8, <4 x i32>, <2 x i64>, i32)

define void@test_int_x86_avx512_scattersiv2_di(ptr %x0, i8 %x1, <4 x i32> %x2, <2 x i64> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv2_di:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    kxnorw %k0, %k0, %k2
; CHECK-NEXT:    vpscatterdq %xmm1, (%rdi,%xmm0,2) {%k2}
; CHECK-NEXT:    vpscatterdq %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scattersiv2.di(ptr %x0, i8 -1, <4 x i32> %x2, <2 x i64> %x3, i32 2)
  call void @llvm.x86.avx512.scattersiv2.di(ptr %x0, i8 %x1, <4 x i32> %x2, <2 x i64> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scattersiv4.df(ptr, i8, <4 x i32>, <4 x double>, i32)

define void@test_int_x86_avx512_scattersiv4_df(ptr %x0, i8 %x1, <4 x i32> %x2, <4 x double> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv4_df:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vscatterdpd %ymm1, (%rdi,%xmm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vscatterdpd %ymm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scattersiv4.df(ptr %x0, i8 %x1, <4 x i32> %x2, <4 x double> %x3, i32 2)
  call void @llvm.x86.avx512.scattersiv4.df(ptr %x0, i8 -1, <4 x i32> %x2, <4 x double> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scattersiv4.di(ptr, i8, <4 x i32>, <4 x i64>, i32)

define void@test_int_x86_avx512_scattersiv4_di(ptr %x0, i8 %x1, <4 x i32> %x2, <4 x i64> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv4_di:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    kxnorw %k0, %k0, %k2
; CHECK-NEXT:    vpscatterdq %ymm1, (%rdi,%xmm0,2) {%k2}
; CHECK-NEXT:    vpscatterdq %ymm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scattersiv4.di(ptr %x0, i8 -1, <4 x i32> %x2, <4 x i64> %x3, i32 2)
  call void @llvm.x86.avx512.scattersiv4.di(ptr %x0, i8 %x1, <4 x i32> %x2, <4 x i64> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scattersiv4.sf(ptr, i8, <4 x i32>, <4 x float>, i32)

define void@test_int_x86_avx512_scattersiv4_sf(ptr %x0, i8 %x1, <4 x i32> %x2, <4 x float> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv4_sf:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vscatterdps %xmm1, (%rdi,%xmm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vscatterdps %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scattersiv4.sf(ptr %x0, i8 %x1, <4 x i32> %x2, <4 x float> %x3, i32 2)
  call void @llvm.x86.avx512.scattersiv4.sf(ptr %x0, i8 -1, <4 x i32> %x2, <4 x float> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scattersiv4.si(ptr, i8, <4 x i32>, <4 x i32>, i32)

define void@test_int_x86_avx512_scattersiv4_si(ptr %x0, i8 %x1, <4 x i32> %x2, <4 x i32> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv4_si:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpscatterdd %xmm1, (%rdi,%xmm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpscatterdd %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scattersiv4.si(ptr %x0, i8 %x1, <4 x i32> %x2, <4 x i32> %x3, i32 2)
  call void @llvm.x86.avx512.scattersiv4.si(ptr %x0, i8 -1, <4 x i32> %x2, <4 x i32> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scattersiv8.sf(ptr, i8, <8 x i32>, <8 x float>, i32)

define void@test_int_x86_avx512_scattersiv8_sf(ptr %x0, i8 %x1, <8 x i32> %x2, <8 x float> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv8_sf:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vscatterdps %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vscatterdps %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scattersiv8.sf(ptr %x0, i8 %x1, <8 x i32> %x2, <8 x float> %x3, i32 2)
  call void @llvm.x86.avx512.scattersiv8.sf(ptr %x0, i8 -1, <8 x i32> %x2, <8 x float> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scattersiv8.si(ptr, i8, <8 x i32>, <8 x i32>, i32)

define void@test_int_x86_avx512_scattersiv8_si(ptr %x0, i8 %x1, <8 x i32> %x2, <8 x i32> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv8_si:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scattersiv8.si(ptr %x0, i8 %x1, <8 x i32> %x2, <8 x i32> %x3, i32 2)
  call void @llvm.x86.avx512.scattersiv8.si(ptr %x0, i8 -1, <8 x i32> %x2, <8 x i32> %x3, i32 4)
  ret void
}
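; The remaining tests use compile-time-constant masks: all-ones folds to
; kxnorw, zero to kxorw, and other immediate values are materialized with a
; movb/movw followed by kmovd.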
define void @scatter_mask_test(ptr %x0, <8 x i32> %x2, <8 x i32> %x3) {
; CHECK-LABEL: scatter_mask_test:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    kxorw %k0, %k0, %k1
; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    movb $1, %al
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    movb $96, %al
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scattersiv8.si(ptr %x0, i8 -1, <8 x i32> %x2, <8 x i32> %x3, i32 2)
  call void @llvm.x86.avx512.scattersiv8.si(ptr %x0, i8 0, <8 x i32> %x2, <8 x i32> %x3, i32 4)
  call void @llvm.x86.avx512.scattersiv8.si(ptr %x0, i8 1, <8 x i32> %x2, <8 x i32> %x3, i32 2)
  call void @llvm.x86.avx512.scattersiv8.si(ptr %x0, i8 96, <8 x i32> %x2, <8 x i32> %x3, i32 4)
  ret void
}

define <16 x float> @gather_mask_test(<16 x i32> %ind, <16 x float> %src, ptr %base)  {
; CHECK-LABEL: gather_mask_test:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm2 {%k1}
; CHECK-NEXT:    kxorw %k0, %k0, %k1
; CHECK-NEXT:    vmovaps %zmm1, %zmm3
; CHECK-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm3 {%k1}
; CHECK-NEXT:    vaddps %zmm3, %zmm2, %zmm2
; CHECK-NEXT:    movw $1, %ax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vmovaps %zmm1, %zmm3
; CHECK-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm3 {%k1}
; CHECK-NEXT:    movw $220, %ax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; CHECK-NEXT:    vaddps %zmm3, %zmm1, %zmm0
; CHECK-NEXT:    vaddps %zmm2, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, ptr %base, <16 x i32>%ind, i16 -1, i32 4)
  %res1 = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, ptr %base, <16 x i32>%ind, i16 0, i32 4)
  %res2 = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, ptr %base, <16 x i32>%ind, i16 1, i32 4)
  %res3 = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, ptr %base, <16 x i32>%ind, i16 220, i32 4)

  %res4 = fadd <16 x float> %res, %res1
  %res5 = fadd <16 x float> %res3, %res2
  %res6 = fadd <16 x float> %res5, %res4
  ret <16 x float> %res6
}