; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=WIDEN_SKX
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=WIDEN_KNL
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake < %s | FileCheck %s --check-prefix=WIDEN_AVX2

; Gather of <2 x double> through a <2 x i32>-indexed GEP. The narrow index
; vector is widened: sign-extended to i64 (vpmovsxdq) and scaled by 8 before
; address computation. AVX2 selects vgatherdpd directly; the AVX-512 configs
; here scalarize into per-lane conditional loads.
define <2 x double> @test_gather_v2i32_index(ptr %base, <2 x i32> %ind, <2 x i1> %mask, <2 x double> %src0) {
; WIDEN_SKX-LABEL: test_gather_v2i32_index:
; WIDEN_SKX:       # %bb.0:
; WIDEN_SKX-NEXT:    vpsllq $63, %xmm1, %xmm1
; WIDEN_SKX-NEXT:    vpmovq2m %xmm1, %k0
; WIDEN_SKX-NEXT:    vpbroadcastq %rdi, %xmm1
; WIDEN_SKX-NEXT:    vpmovsxdq %xmm0, %xmm0
; WIDEN_SKX-NEXT:    vpsllq $3, %xmm0, %xmm0
; WIDEN_SKX-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; WIDEN_SKX-NEXT:    kmovw %k0, %eax
; WIDEN_SKX-NEXT:    testb $1, %al
; WIDEN_SKX-NEXT:    jne .LBB0_1
; WIDEN_SKX-NEXT:  # %bb.2: # %else
; WIDEN_SKX-NEXT:    testb $2, %al
; WIDEN_SKX-NEXT:    jne .LBB0_3
; WIDEN_SKX-NEXT:  .LBB0_4: # %else2
; WIDEN_SKX-NEXT:    vmovaps %xmm2, %xmm0
; WIDEN_SKX-NEXT:    retq
; WIDEN_SKX-NEXT:  .LBB0_1: # %cond.load
; WIDEN_SKX-NEXT:    vmovq %xmm0, %rcx
; WIDEN_SKX-NEXT:    vmovlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
; WIDEN_SKX-NEXT:    testb $2, %al
; WIDEN_SKX-NEXT:    je .LBB0_4
; WIDEN_SKX-NEXT:  .LBB0_3: # %cond.load1
; WIDEN_SKX-NEXT:    vpextrq $1, %xmm0, %rax
; WIDEN_SKX-NEXT:    vmovhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
; WIDEN_SKX-NEXT:    vmovaps %xmm2, %xmm0
; WIDEN_SKX-NEXT:    retq
;
; WIDEN_KNL-LABEL: test_gather_v2i32_index:
; WIDEN_KNL:       # %bb.0:
; WIDEN_KNL-NEXT:    vpsllq $63, %xmm1, %xmm1
; WIDEN_KNL-NEXT:    vptestmq %zmm1, %zmm1, %k0
; WIDEN_KNL-NEXT:    vpmovsxdq %xmm0, %xmm0
; WIDEN_KNL-NEXT:    vpsllq $3, %xmm0, %xmm0
; WIDEN_KNL-NEXT:    vmovq %rdi, %xmm1
; WIDEN_KNL-NEXT:    vpbroadcastq %xmm1, %xmm1
; WIDEN_KNL-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; WIDEN_KNL-NEXT:    kmovw %k0, %eax
; WIDEN_KNL-NEXT:    testb $1, %al
; WIDEN_KNL-NEXT:    jne .LBB0_1
; WIDEN_KNL-NEXT:  # %bb.2: # %else
; WIDEN_KNL-NEXT:    testb $2, %al
; WIDEN_KNL-NEXT:    jne .LBB0_3
; WIDEN_KNL-NEXT:  .LBB0_4: # %else2
; WIDEN_KNL-NEXT:    vmovaps %xmm2, %xmm0
; WIDEN_KNL-NEXT:    vzeroupper
; WIDEN_KNL-NEXT:    retq
; WIDEN_KNL-NEXT:  .LBB0_1: # %cond.load
; WIDEN_KNL-NEXT:    vmovq %xmm0, %rcx
; WIDEN_KNL-NEXT:    vmovlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
; WIDEN_KNL-NEXT:    testb $2, %al
; WIDEN_KNL-NEXT:    je .LBB0_4
; WIDEN_KNL-NEXT:  .LBB0_3: # %cond.load1
; WIDEN_KNL-NEXT:    vpextrq $1, %xmm0, %rax
; WIDEN_KNL-NEXT:    vmovhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
; WIDEN_KNL-NEXT:    vmovaps %xmm2, %xmm0
; WIDEN_KNL-NEXT:    vzeroupper
; WIDEN_KNL-NEXT:    retq
;
; WIDEN_AVX2-LABEL: test_gather_v2i32_index:
; WIDEN_AVX2:       # %bb.0:
; WIDEN_AVX2-NEXT:    vpsllq $63, %xmm1, %xmm1
; WIDEN_AVX2-NEXT:    vgatherdpd %xmm1, (%rdi,%xmm0,8), %xmm2
; WIDEN_AVX2-NEXT:    vmovapd %xmm2, %xmm0
; WIDEN_AVX2-NEXT:    retq
  %gep.random = getelementptr double, ptr %base, <2 x i32> %ind
  %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %gep.random, i32 4, <2 x i1> %mask, <2 x double> %src0)
  ret <2 x double> %res
}

; Scatter counterpart of the test above: <2 x double> stored through a
; <2 x i32>-indexed GEP. Addresses are formed by widening the index
; (vpmovsxdq + shift by 3) and all three configs scalarize into per-lane
; conditional stores.
define void @test_scatter_v2i32_index(<2 x double> %a1, ptr %base, <2 x i32> %ind, <2 x i1> %mask) {
; WIDEN_SKX-LABEL: test_scatter_v2i32_index:
; WIDEN_SKX:       # %bb.0:
; WIDEN_SKX-NEXT:    vpsllq $63, %xmm2, %xmm2
; WIDEN_SKX-NEXT:    vpmovq2m %xmm2, %k0
; WIDEN_SKX-NEXT:    vpbroadcastq %rdi, %xmm2
; WIDEN_SKX-NEXT:    vpmovsxdq %xmm1, %xmm1
; WIDEN_SKX-NEXT:    vpsllq $3, %xmm1, %xmm1
; WIDEN_SKX-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
; WIDEN_SKX-NEXT:    kmovw %k0, %eax
; WIDEN_SKX-NEXT:    testb $1, %al
; WIDEN_SKX-NEXT:    jne .LBB1_1
; WIDEN_SKX-NEXT:  # %bb.2: # %else
; WIDEN_SKX-NEXT:    testb $2, %al
; WIDEN_SKX-NEXT:    jne .LBB1_3
; WIDEN_SKX-NEXT:  .LBB1_4: # %else2
; WIDEN_SKX-NEXT:    retq
; WIDEN_SKX-NEXT:  .LBB1_1: # %cond.store
; WIDEN_SKX-NEXT:    vmovq %xmm1, %rcx
; WIDEN_SKX-NEXT:    vmovlps %xmm0, (%rcx)
; WIDEN_SKX-NEXT:    testb $2, %al
; WIDEN_SKX-NEXT:    je .LBB1_4
; WIDEN_SKX-NEXT:  .LBB1_3: # %cond.store1
; WIDEN_SKX-NEXT:    vpextrq $1, %xmm1, %rax
; WIDEN_SKX-NEXT:    vmovhps %xmm0, (%rax)
; WIDEN_SKX-NEXT:    retq
;
; WIDEN_KNL-LABEL: test_scatter_v2i32_index:
; WIDEN_KNL:       # %bb.0:
; WIDEN_KNL-NEXT:    vpsllq $63, %xmm2, %xmm2
; WIDEN_KNL-NEXT:    vptestmq %zmm2, %zmm2, %k0
; WIDEN_KNL-NEXT:    vpmovsxdq %xmm1, %xmm1
; WIDEN_KNL-NEXT:    vpsllq $3, %xmm1, %xmm1
; WIDEN_KNL-NEXT:    vmovq %rdi, %xmm2
; WIDEN_KNL-NEXT:    vpbroadcastq %xmm2, %xmm2
; WIDEN_KNL-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
; WIDEN_KNL-NEXT:    kmovw %k0, %eax
; WIDEN_KNL-NEXT:    testb $1, %al
; WIDEN_KNL-NEXT:    jne .LBB1_1
; WIDEN_KNL-NEXT:  # %bb.2: # %else
; WIDEN_KNL-NEXT:    testb $2, %al
; WIDEN_KNL-NEXT:    jne .LBB1_3
; WIDEN_KNL-NEXT:  .LBB1_4: # %else2
; WIDEN_KNL-NEXT:    vzeroupper
; WIDEN_KNL-NEXT:    retq
; WIDEN_KNL-NEXT:  .LBB1_1: # %cond.store
; WIDEN_KNL-NEXT:    vmovq %xmm1, %rcx
; WIDEN_KNL-NEXT:    vmovlps %xmm0, (%rcx)
; WIDEN_KNL-NEXT:    testb $2, %al
; WIDEN_KNL-NEXT:    je .LBB1_4
; WIDEN_KNL-NEXT:  .LBB1_3: # %cond.store1
; WIDEN_KNL-NEXT:    vpextrq $1, %xmm1, %rax
; WIDEN_KNL-NEXT:    vmovhps %xmm0, (%rax)
; WIDEN_KNL-NEXT:    vzeroupper
; WIDEN_KNL-NEXT:    retq
;
; WIDEN_AVX2-LABEL: test_scatter_v2i32_index:
; WIDEN_AVX2:       # %bb.0:
; WIDEN_AVX2-NEXT:    vpmovsxdq %xmm1, %xmm1
; WIDEN_AVX2-NEXT:    vpsllq $2, %xmm1, %xmm1
; WIDEN_AVX2-NEXT:    vmovq %rdi, %xmm3
; WIDEN_AVX2-NEXT:    vpbroadcastq %xmm3, %xmm3
; WIDEN_AVX2-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
; WIDEN_AVX2-NEXT:    vpsllq $63, %xmm2, %xmm2
; WIDEN_AVX2-NEXT:    vmovmskpd %xmm2, %eax
; WIDEN_AVX2-NEXT:    testb $1, %al
; WIDEN_AVX2-NEXT:    jne .LBB1_1
; WIDEN_AVX2-NEXT:  # %bb.2: # %else
; WIDEN_AVX2-NEXT:    testb $2, %al
; WIDEN_AVX2-NEXT:    jne .LBB1_3
; WIDEN_AVX2-NEXT:  .LBB1_4: # %else2
; WIDEN_AVX2-NEXT:    retq
; WIDEN_AVX2-NEXT:  .LBB1_1: # %cond.store
; WIDEN_AVX2-NEXT:    vmovq %xmm1, %rcx
; WIDEN_AVX2-NEXT:    vmovlps %xmm0, (%rcx)
; WIDEN_AVX2-NEXT:    testb $2, %al
; WIDEN_AVX2-NEXT:    je .LBB1_4
; WIDEN_AVX2-NEXT:  .LBB1_3: # %cond.store1
; WIDEN_AVX2-NEXT:    vpextrq $1, %xmm1, %rax
; WIDEN_AVX2-NEXT:    vmovhps %xmm0, (%rax)
; WIDEN_AVX2-NEXT:    retq
  %gep = getelementptr double, ptr%base, <2 x i32> %ind
  call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> %a1, <2 x ptr> %gep, i32 4, <2 x i1> %mask)
  ret void
}

; Gather where the <2 x i32> *data* type (not the index) must be widened.
; AVX2 can still emit vpgatherqd off the 64-bit pointer vector; the AVX-512
; configs scalarize into conditional vpinsrd loads.
define <2 x i32> @test_gather_v2i32_data(<2 x ptr> %ptr, <2 x i1> %mask, <2 x i32> %src0) {
; WIDEN_SKX-LABEL: test_gather_v2i32_data:
; WIDEN_SKX:       # %bb.0:
; WIDEN_SKX-NEXT:    vpsllq $63, %xmm1, %xmm1
; WIDEN_SKX-NEXT:    vpmovq2m %xmm1, %k0
; WIDEN_SKX-NEXT:    kmovw %k0, %eax
; WIDEN_SKX-NEXT:    testb $1, %al
; WIDEN_SKX-NEXT:    jne .LBB2_1
; WIDEN_SKX-NEXT:  # %bb.2: # %else
; WIDEN_SKX-NEXT:    testb $2, %al
; WIDEN_SKX-NEXT:    jne .LBB2_3
; WIDEN_SKX-NEXT:  .LBB2_4: # %else2
; WIDEN_SKX-NEXT:    vmovdqa %xmm2, %xmm0
; WIDEN_SKX-NEXT:    retq
; WIDEN_SKX-NEXT:  .LBB2_1: # %cond.load
; WIDEN_SKX-NEXT:    vmovq %xmm0, %rcx
; WIDEN_SKX-NEXT:    vpinsrd $0, (%rcx), %xmm2, %xmm2
; WIDEN_SKX-NEXT:    testb $2, %al
; WIDEN_SKX-NEXT:    je .LBB2_4
; WIDEN_SKX-NEXT:  .LBB2_3: # %cond.load1
; WIDEN_SKX-NEXT:    vpextrq $1, %xmm0, %rax
; WIDEN_SKX-NEXT:    vpinsrd $1, (%rax), %xmm2, %xmm2
; WIDEN_SKX-NEXT:    vmovdqa %xmm2, %xmm0
; WIDEN_SKX-NEXT:    retq
;
; WIDEN_KNL-LABEL: test_gather_v2i32_data:
; WIDEN_KNL:       # %bb.0:
; WIDEN_KNL-NEXT:    vpsllq $63, %xmm1, %xmm1
; WIDEN_KNL-NEXT:    vptestmq %zmm1, %zmm1, %k0
; WIDEN_KNL-NEXT:    kmovw %k0, %eax
; WIDEN_KNL-NEXT:    testb $1, %al
; WIDEN_KNL-NEXT:    jne .LBB2_1
; WIDEN_KNL-NEXT:  # %bb.2: # %else
; WIDEN_KNL-NEXT:    testb $2, %al
; WIDEN_KNL-NEXT:    jne .LBB2_3
; WIDEN_KNL-NEXT:  .LBB2_4: # %else2
; WIDEN_KNL-NEXT:    vmovdqa %xmm2, %xmm0
; WIDEN_KNL-NEXT:    vzeroupper
; WIDEN_KNL-NEXT:    retq
; WIDEN_KNL-NEXT:  .LBB2_1: # %cond.load
; WIDEN_KNL-NEXT:    vmovq %xmm0, %rcx
; WIDEN_KNL-NEXT:    vpinsrd $0, (%rcx), %xmm2, %xmm2
; WIDEN_KNL-NEXT:    testb $2, %al
; WIDEN_KNL-NEXT:    je .LBB2_4
; WIDEN_KNL-NEXT:  .LBB2_3: # %cond.load1
; WIDEN_KNL-NEXT:    vpextrq $1, %xmm0, %rax
; WIDEN_KNL-NEXT:    vpinsrd $1, (%rax), %xmm2, %xmm2
; WIDEN_KNL-NEXT:    vmovdqa %xmm2, %xmm0
; WIDEN_KNL-NEXT:    vzeroupper
; WIDEN_KNL-NEXT:    retq
;
; WIDEN_AVX2-LABEL: test_gather_v2i32_data:
; WIDEN_AVX2:       # %bb.0:
; WIDEN_AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; WIDEN_AVX2-NEXT:    vpslld $31, %xmm1, %xmm1
; WIDEN_AVX2-NEXT:    vpgatherqd %xmm1, (,%xmm0), %xmm2
; WIDEN_AVX2-NEXT:    vmovdqa %xmm2, %xmm0
; WIDEN_AVX2-NEXT:    retq
  %res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> %ptr, i32 4, <2 x i1> %mask, <2 x i32> %src0)
  ret <2 x i32>%res
}

; Scatter where the <2 x i32> data type must be widened. All three configs
; scalarize: lane 0 via vmovss, lane 1 via vextractps, each guarded by a
; mask-bit test.
define void @test_scatter_v2i32_data(<2 x i32>%a1, <2 x ptr> %ptr, <2 x i1>%mask) {
; WIDEN_SKX-LABEL: test_scatter_v2i32_data:
; WIDEN_SKX:       # %bb.0:
; WIDEN_SKX-NEXT:    vpsllq $63, %xmm2, %xmm2
; WIDEN_SKX-NEXT:    vpmovq2m %xmm2, %k0
; WIDEN_SKX-NEXT:    kmovw %k0, %eax
; WIDEN_SKX-NEXT:    testb $1, %al
; WIDEN_SKX-NEXT:    jne .LBB3_1
; WIDEN_SKX-NEXT:  # %bb.2: # %else
; WIDEN_SKX-NEXT:    testb $2, %al
; WIDEN_SKX-NEXT:    jne .LBB3_3
; WIDEN_SKX-NEXT:  .LBB3_4: # %else2
; WIDEN_SKX-NEXT:    retq
; WIDEN_SKX-NEXT:  .LBB3_1: # %cond.store
; WIDEN_SKX-NEXT:    vmovq %xmm1, %rcx
; WIDEN_SKX-NEXT:    vmovss %xmm0, (%rcx)
; WIDEN_SKX-NEXT:    testb $2, %al
; WIDEN_SKX-NEXT:    je .LBB3_4
; WIDEN_SKX-NEXT:  .LBB3_3: # %cond.store1
; WIDEN_SKX-NEXT:    vpextrq $1, %xmm1, %rax
; WIDEN_SKX-NEXT:    vextractps $1, %xmm0, (%rax)
; WIDEN_SKX-NEXT:    retq
;
; WIDEN_KNL-LABEL: test_scatter_v2i32_data:
; WIDEN_KNL:       # %bb.0:
; WIDEN_KNL-NEXT:    vpsllq $63, %xmm2, %xmm2
; WIDEN_KNL-NEXT:    vptestmq %zmm2, %zmm2, %k0
; WIDEN_KNL-NEXT:    kmovw %k0, %eax
; WIDEN_KNL-NEXT:    testb $1, %al
; WIDEN_KNL-NEXT:    jne .LBB3_1
; WIDEN_KNL-NEXT:  # %bb.2: # %else
; WIDEN_KNL-NEXT:    testb $2, %al
; WIDEN_KNL-NEXT:    jne .LBB3_3
; WIDEN_KNL-NEXT:  .LBB3_4: # %else2
; WIDEN_KNL-NEXT:    vzeroupper
; WIDEN_KNL-NEXT:    retq
; WIDEN_KNL-NEXT:  .LBB3_1: # %cond.store
; WIDEN_KNL-NEXT:    vmovq %xmm1, %rcx
; WIDEN_KNL-NEXT:    vmovss %xmm0, (%rcx)
; WIDEN_KNL-NEXT:    testb $2, %al
; WIDEN_KNL-NEXT:    je .LBB3_4
; WIDEN_KNL-NEXT:  .LBB3_3: # %cond.store1
; WIDEN_KNL-NEXT:    vpextrq $1, %xmm1, %rax
; WIDEN_KNL-NEXT:    vextractps $1, %xmm0, (%rax)
; WIDEN_KNL-NEXT:    vzeroupper
; WIDEN_KNL-NEXT:    retq
;
; WIDEN_AVX2-LABEL: test_scatter_v2i32_data:
; WIDEN_AVX2:       # %bb.0:
; WIDEN_AVX2-NEXT:    vpsllq $63, %xmm2, %xmm2
; WIDEN_AVX2-NEXT:    vmovmskpd %xmm2, %eax
; WIDEN_AVX2-NEXT:    testb $1, %al
; WIDEN_AVX2-NEXT:    jne .LBB3_1
; WIDEN_AVX2-NEXT:  # %bb.2: # %else
; WIDEN_AVX2-NEXT:    testb $2, %al
; WIDEN_AVX2-NEXT:    jne .LBB3_3
; WIDEN_AVX2-NEXT:  .LBB3_4: # %else2
; WIDEN_AVX2-NEXT:    retq
; WIDEN_AVX2-NEXT:  .LBB3_1: # %cond.store
; WIDEN_AVX2-NEXT:    vmovq %xmm1, %rcx
; WIDEN_AVX2-NEXT:    vmovss %xmm0, (%rcx)
; WIDEN_AVX2-NEXT:    testb $2, %al
; WIDEN_AVX2-NEXT:    je .LBB3_4
; WIDEN_AVX2-NEXT:  .LBB3_3: # %cond.store1
; WIDEN_AVX2-NEXT:    vpextrq $1, %xmm1, %rax
; WIDEN_AVX2-NEXT:    vextractps $1, %xmm0, (%rax)
; WIDEN_AVX2-NEXT:    retq
  call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> %a1, <2 x ptr> %ptr, i32 4, <2 x i1> %mask)
  ret void
}

; Gather where both the data (<2 x i32>) and the index (<2 x i32>) need
; widening. AVX2 emits a single vpgatherdd with scale 4; the AVX-512 configs
; scalarize into conditional vpinsrd loads.
define <2 x i32> @test_gather_v2i32_data_index(ptr %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %src0) {
; WIDEN_SKX-LABEL: test_gather_v2i32_data_index:
; WIDEN_SKX:       # %bb.0:
; WIDEN_SKX-NEXT:    vpsllq $63, %xmm1, %xmm1
; WIDEN_SKX-NEXT:    vpmovq2m %xmm1, %k0
; WIDEN_SKX-NEXT:    vpbroadcastq %rdi, %xmm1
; WIDEN_SKX-NEXT:    vpmovsxdq %xmm0, %xmm0
; WIDEN_SKX-NEXT:    vpsllq $2, %xmm0, %xmm0
; WIDEN_SKX-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; WIDEN_SKX-NEXT:    kmovw %k0, %eax
; WIDEN_SKX-NEXT:    testb $1, %al
; WIDEN_SKX-NEXT:    jne .LBB4_1
; WIDEN_SKX-NEXT:  # %bb.2: # %else
; WIDEN_SKX-NEXT:    testb $2, %al
; WIDEN_SKX-NEXT:    jne .LBB4_3
; WIDEN_SKX-NEXT:  .LBB4_4: # %else2
; WIDEN_SKX-NEXT:    vmovdqa %xmm2, %xmm0
; WIDEN_SKX-NEXT:    retq
; WIDEN_SKX-NEXT:  .LBB4_1: # %cond.load
; WIDEN_SKX-NEXT:    vmovq %xmm0, %rcx
; WIDEN_SKX-NEXT:    vpinsrd $0, (%rcx), %xmm2, %xmm2
; WIDEN_SKX-NEXT:    testb $2, %al
; WIDEN_SKX-NEXT:    je .LBB4_4
; WIDEN_SKX-NEXT:  .LBB4_3: # %cond.load1
; WIDEN_SKX-NEXT:    vpextrq $1, %xmm0, %rax
; WIDEN_SKX-NEXT:    vpinsrd $1, (%rax), %xmm2, %xmm2
; WIDEN_SKX-NEXT:    vmovdqa %xmm2, %xmm0
; WIDEN_SKX-NEXT:    retq
;
; WIDEN_KNL-LABEL: test_gather_v2i32_data_index:
; WIDEN_KNL:       # %bb.0:
; WIDEN_KNL-NEXT:    vpsllq $63, %xmm1, %xmm1
; WIDEN_KNL-NEXT:    vptestmq %zmm1, %zmm1, %k0
; WIDEN_KNL-NEXT:    vpmovsxdq %xmm0, %xmm0
; WIDEN_KNL-NEXT:    vpsllq $2, %xmm0, %xmm0
; WIDEN_KNL-NEXT:    vmovq %rdi, %xmm1
; WIDEN_KNL-NEXT:    vpbroadcastq %xmm1, %xmm1
; WIDEN_KNL-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; WIDEN_KNL-NEXT:    kmovw %k0, %eax
; WIDEN_KNL-NEXT:    testb $1, %al
; WIDEN_KNL-NEXT:    jne .LBB4_1
; WIDEN_KNL-NEXT:  # %bb.2: # %else
; WIDEN_KNL-NEXT:    testb $2, %al
; WIDEN_KNL-NEXT:    jne .LBB4_3
; WIDEN_KNL-NEXT:  .LBB4_4: # %else2
; WIDEN_KNL-NEXT:    vmovdqa %xmm2, %xmm0
; WIDEN_KNL-NEXT:    vzeroupper
; WIDEN_KNL-NEXT:    retq
; WIDEN_KNL-NEXT:  .LBB4_1: # %cond.load
; WIDEN_KNL-NEXT:    vmovq %xmm0, %rcx
; WIDEN_KNL-NEXT:    vpinsrd $0, (%rcx), %xmm2, %xmm2
; WIDEN_KNL-NEXT:    testb $2, %al
; WIDEN_KNL-NEXT:    je .LBB4_4
; WIDEN_KNL-NEXT:  .LBB4_3: # %cond.load1
; WIDEN_KNL-NEXT:    vpextrq $1, %xmm0, %rax
; WIDEN_KNL-NEXT:    vpinsrd $1, (%rax), %xmm2, %xmm2
; WIDEN_KNL-NEXT:    vmovdqa %xmm2, %xmm0
; WIDEN_KNL-NEXT:    vzeroupper
; WIDEN_KNL-NEXT:    retq
;
; WIDEN_AVX2-LABEL: test_gather_v2i32_data_index:
; WIDEN_AVX2:       # %bb.0:
; WIDEN_AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
; WIDEN_AVX2-NEXT:    vpslld $31, %xmm1, %xmm1
; WIDEN_AVX2-NEXT:    vpgatherdd %xmm1, (%rdi,%xmm0,4), %xmm2
; WIDEN_AVX2-NEXT:    vmovdqa %xmm2, %xmm0
; WIDEN_AVX2-NEXT:    retq
  %gep.random = getelementptr i32, ptr %base, <2 x i32> %ind
  %res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0)
  ret <2 x i32> %res
}

; Scatter where both the data (<2 x i32>) and the index (<2 x i32>) need
; widening. All three configs compute addresses with a widened index
; (vpmovsxdq + shift by 2) and scalarize into conditional stores.
define void @test_scatter_v2i32_data_index(<2 x i32> %a1, ptr %base, <2 x i32> %ind, <2 x i1> %mask) {
; WIDEN_SKX-LABEL: test_scatter_v2i32_data_index:
; WIDEN_SKX:       # %bb.0:
; WIDEN_SKX-NEXT:    vpsllq $63, %xmm2, %xmm2
; WIDEN_SKX-NEXT:    vpmovq2m %xmm2, %k0
; WIDEN_SKX-NEXT:    vpbroadcastq %rdi, %xmm2
; WIDEN_SKX-NEXT:    vpmovsxdq %xmm1, %xmm1
; WIDEN_SKX-NEXT:    vpsllq $2, %xmm1, %xmm1
; WIDEN_SKX-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
; WIDEN_SKX-NEXT:    kmovw %k0, %eax
; WIDEN_SKX-NEXT:    testb $1, %al
; WIDEN_SKX-NEXT:    jne .LBB5_1
; WIDEN_SKX-NEXT:  # %bb.2: # %else
; WIDEN_SKX-NEXT:    testb $2, %al
; WIDEN_SKX-NEXT:    jne .LBB5_3
; WIDEN_SKX-NEXT:  .LBB5_4: # %else2
; WIDEN_SKX-NEXT:    retq
; WIDEN_SKX-NEXT:  .LBB5_1: # %cond.store
; WIDEN_SKX-NEXT:    vmovq %xmm1, %rcx
; WIDEN_SKX-NEXT:    vmovss %xmm0, (%rcx)
; WIDEN_SKX-NEXT:    testb $2, %al
; WIDEN_SKX-NEXT:    je .LBB5_4
; WIDEN_SKX-NEXT:  .LBB5_3: # %cond.store1
; WIDEN_SKX-NEXT:    vpextrq $1, %xmm1, %rax
; WIDEN_SKX-NEXT:    vextractps $1, %xmm0, (%rax)
; WIDEN_SKX-NEXT:    retq
;
; WIDEN_KNL-LABEL: test_scatter_v2i32_data_index:
; WIDEN_KNL:       # %bb.0:
; WIDEN_KNL-NEXT:    vpsllq $63, %xmm2, %xmm2
; WIDEN_KNL-NEXT:    vptestmq %zmm2, %zmm2, %k0
; WIDEN_KNL-NEXT:    vpmovsxdq %xmm1, %xmm1
; WIDEN_KNL-NEXT:    vpsllq $2, %xmm1, %xmm1
; WIDEN_KNL-NEXT:    vmovq %rdi, %xmm2
; WIDEN_KNL-NEXT:    vpbroadcastq %xmm2, %xmm2
; WIDEN_KNL-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
; WIDEN_KNL-NEXT:    kmovw %k0, %eax
; WIDEN_KNL-NEXT:    testb $1, %al
; WIDEN_KNL-NEXT:    jne .LBB5_1
; WIDEN_KNL-NEXT:  # %bb.2: # %else
; WIDEN_KNL-NEXT:    testb $2, %al
; WIDEN_KNL-NEXT:    jne .LBB5_3
; WIDEN_KNL-NEXT:  .LBB5_4: # %else2
; WIDEN_KNL-NEXT:    vzeroupper
; WIDEN_KNL-NEXT:    retq
; WIDEN_KNL-NEXT:  .LBB5_1: # %cond.store
; WIDEN_KNL-NEXT:    vmovq %xmm1, %rcx
; WIDEN_KNL-NEXT:    vmovss %xmm0, (%rcx)
; WIDEN_KNL-NEXT:    testb $2, %al
; WIDEN_KNL-NEXT:    je .LBB5_4
; WIDEN_KNL-NEXT:  .LBB5_3: # %cond.store1
; WIDEN_KNL-NEXT:    vpextrq $1, %xmm1, %rax
; WIDEN_KNL-NEXT:    vextractps $1, %xmm0, (%rax)
; WIDEN_KNL-NEXT:    vzeroupper
; WIDEN_KNL-NEXT:    retq
;
; WIDEN_AVX2-LABEL: test_scatter_v2i32_data_index:
; WIDEN_AVX2:       # %bb.0:
; WIDEN_AVX2-NEXT:    vpmovsxdq %xmm1, %xmm1
; WIDEN_AVX2-NEXT:    vpsllq $2, %xmm1, %xmm1
; WIDEN_AVX2-NEXT:    vmovq %rdi, %xmm3
; WIDEN_AVX2-NEXT:    vpbroadcastq %xmm3, %xmm3
; WIDEN_AVX2-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
; WIDEN_AVX2-NEXT:    vpsllq $63, %xmm2, %xmm2
; WIDEN_AVX2-NEXT:    vmovmskpd %xmm2, %eax
; WIDEN_AVX2-NEXT:    testb $1, %al
; WIDEN_AVX2-NEXT:    jne .LBB5_1
; WIDEN_AVX2-NEXT:  # %bb.2: # %else
; WIDEN_AVX2-NEXT:    testb $2, %al
; WIDEN_AVX2-NEXT:    jne .LBB5_3
; WIDEN_AVX2-NEXT:  .LBB5_4: # %else2
; WIDEN_AVX2-NEXT:    retq
; WIDEN_AVX2-NEXT:  .LBB5_1: # %cond.store
; WIDEN_AVX2-NEXT:    vmovq %xmm1, %rcx
; WIDEN_AVX2-NEXT:    vmovss %xmm0, (%rcx)
; WIDEN_AVX2-NEXT:    testb $2, %al
; WIDEN_AVX2-NEXT:    je .LBB5_4
; WIDEN_AVX2-NEXT:  .LBB5_3: # %cond.store1
; WIDEN_AVX2-NEXT:    vpextrq $1, %xmm1, %rax
; WIDEN_AVX2-NEXT:    vextractps $1, %xmm0, (%rax)
; WIDEN_AVX2-NEXT:    retq
  %gep = getelementptr i32, ptr%base, <2 x i32> %ind
  call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> %a1, <2 x ptr> %gep, i32 4, <2 x i1> %mask)
  ret void
}

; Non-power-of-two scatter: <17 x float> with an all-true mask. The AVX-512
; configs split this into one 16-wide vscatterdps (kxnorw all-ones mask) plus
; a second vscatterdps with a single-bit mask (movw $1) for the 17th element;
; AVX2 fully scalarizes into 17 vmovss stores.
define void @test_mscatter_v17f32(ptr %base, <17 x i32> %index, <17 x float> %val)
; WIDEN_SKX-LABEL: test_mscatter_v17f32:
; WIDEN_SKX:       # %bb.0:
; WIDEN_SKX-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3]
; WIDEN_SKX-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0],xmm4[3]
; WIDEN_SKX-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[0]
; WIDEN_SKX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; WIDEN_SKX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
; WIDEN_SKX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
; WIDEN_SKX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
; WIDEN_SKX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; WIDEN_SKX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
; WIDEN_SKX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
; WIDEN_SKX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
; WIDEN_SKX-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; WIDEN_SKX-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
; WIDEN_SKX-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
; WIDEN_SKX-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
; WIDEN_SKX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; WIDEN_SKX-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; WIDEN_SKX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; WIDEN_SKX-NEXT:    vpinsrd $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1
; WIDEN_SKX-NEXT:    vpinsrd $2, {{[0-9]+}}(%rsp), %xmm1, %xmm1
; WIDEN_SKX-NEXT:    vpinsrd $3, {{[0-9]+}}(%rsp), %xmm1, %xmm1
; WIDEN_SKX-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; WIDEN_SKX-NEXT:    vpinsrd $1, {{[0-9]+}}(%rsp), %xmm2, %xmm2
; WIDEN_SKX-NEXT:    vpinsrd $2, {{[0-9]+}}(%rsp), %xmm2, %xmm2
; WIDEN_SKX-NEXT:    vpinsrd $3, {{[0-9]+}}(%rsp), %xmm2, %xmm2
; WIDEN_SKX-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; WIDEN_SKX-NEXT:    vmovd %esi, %xmm2
; WIDEN_SKX-NEXT:    vpinsrd $1, %edx, %xmm2, %xmm2
; WIDEN_SKX-NEXT:    vpinsrd $2, %ecx, %xmm2, %xmm2
; WIDEN_SKX-NEXT:    vpinsrd $3, %r8d, %xmm2, %xmm2
; WIDEN_SKX-NEXT:    vmovd %r9d, %xmm3
; WIDEN_SKX-NEXT:    vpinsrd $1, {{[0-9]+}}(%rsp), %xmm3, %xmm3
; WIDEN_SKX-NEXT:    vpinsrd $2, {{[0-9]+}}(%rsp), %xmm3, %xmm3
; WIDEN_SKX-NEXT:    vpinsrd $3, {{[0-9]+}}(%rsp), %xmm3, %xmm3
; WIDEN_SKX-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
; WIDEN_SKX-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
; WIDEN_SKX-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; WIDEN_SKX-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; WIDEN_SKX-NEXT:    kxnorw %k0, %k0, %k1
; WIDEN_SKX-NEXT:    vscatterdps %zmm0, (%rdi,%zmm1,4) {%k1}
; WIDEN_SKX-NEXT:    movw $1, %ax
; WIDEN_SKX-NEXT:    kmovw %eax, %k1
; WIDEN_SKX-NEXT:    vscatterdps %zmm2, (%rdi,%zmm3,4) {%k1}
; WIDEN_SKX-NEXT:    vzeroupper
; WIDEN_SKX-NEXT:    retq
;
; WIDEN_KNL-LABEL: test_mscatter_v17f32:
; WIDEN_KNL:       # %bb.0:
; WIDEN_KNL-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3]
; WIDEN_KNL-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0],xmm4[3]
; WIDEN_KNL-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[0]
; WIDEN_KNL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; WIDEN_KNL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
; WIDEN_KNL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
; WIDEN_KNL-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
; WIDEN_KNL-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; WIDEN_KNL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
; WIDEN_KNL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
; WIDEN_KNL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
; WIDEN_KNL-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; WIDEN_KNL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
; WIDEN_KNL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
; WIDEN_KNL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
; WIDEN_KNL-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; WIDEN_KNL-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; WIDEN_KNL-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; WIDEN_KNL-NEXT:    vpinsrd $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1
; WIDEN_KNL-NEXT:    vpinsrd $2, {{[0-9]+}}(%rsp), %xmm1, %xmm1
; WIDEN_KNL-NEXT:    vpinsrd $3, {{[0-9]+}}(%rsp), %xmm1, %xmm1
; WIDEN_KNL-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; WIDEN_KNL-NEXT:    vpinsrd $1, {{[0-9]+}}(%rsp), %xmm2, %xmm2
; WIDEN_KNL-NEXT:    vpinsrd $2, {{[0-9]+}}(%rsp), %xmm2, %xmm2
; WIDEN_KNL-NEXT:    vpinsrd $3, {{[0-9]+}}(%rsp), %xmm2, %xmm2
; WIDEN_KNL-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; WIDEN_KNL-NEXT:    vmovd %esi, %xmm2
; WIDEN_KNL-NEXT:    vpinsrd $1, %edx, %xmm2, %xmm2
; WIDEN_KNL-NEXT:    vpinsrd $2, %ecx, %xmm2, %xmm2
; WIDEN_KNL-NEXT:    vpinsrd $3, %r8d, %xmm2, %xmm2
; WIDEN_KNL-NEXT:    vmovd %r9d, %xmm3
; WIDEN_KNL-NEXT:    vpinsrd $1, {{[0-9]+}}(%rsp), %xmm3, %xmm3
; WIDEN_KNL-NEXT:    vpinsrd $2, {{[0-9]+}}(%rsp), %xmm3, %xmm3
; WIDEN_KNL-NEXT:    vpinsrd $3, {{[0-9]+}}(%rsp), %xmm3, %xmm3
; WIDEN_KNL-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
; WIDEN_KNL-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
; WIDEN_KNL-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; WIDEN_KNL-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; WIDEN_KNL-NEXT:    kxnorw %k0, %k0, %k1
; WIDEN_KNL-NEXT:    vscatterdps %zmm0, (%rdi,%zmm1,4) {%k1}
; WIDEN_KNL-NEXT:    movw $1, %ax
; WIDEN_KNL-NEXT:    kmovw %eax, %k1
; WIDEN_KNL-NEXT:    vscatterdps %zmm2, (%rdi,%zmm3,4) {%k1}
; WIDEN_KNL-NEXT:    vzeroupper
; WIDEN_KNL-NEXT:    retq
;
; WIDEN_AVX2-LABEL: test_mscatter_v17f32:
; WIDEN_AVX2:       # %bb.0:
; WIDEN_AVX2-NEXT:    vmovq %rdi, %xmm8
; WIDEN_AVX2-NEXT:    vpbroadcastq %xmm8, %ymm8
; WIDEN_AVX2-NEXT:    vmovd %esi, %xmm9
; WIDEN_AVX2-NEXT:    vpinsrd $1, %edx, %xmm9, %xmm9
; WIDEN_AVX2-NEXT:    vpinsrd $2, %ecx, %xmm9, %xmm9
; WIDEN_AVX2-NEXT:    vpinsrd $3, %r8d, %xmm9, %xmm9
; WIDEN_AVX2-NEXT:    vpmovsxdq %xmm9, %ymm9
; WIDEN_AVX2-NEXT:    vpsllq $2, %ymm9, %ymm9
; WIDEN_AVX2-NEXT:    vpaddq %ymm9, %ymm8, %ymm9
; WIDEN_AVX2-NEXT:    vmovq %xmm9, %rax
; WIDEN_AVX2-NEXT:    vmovss %xmm0, (%rax)
; WIDEN_AVX2-NEXT:    vpextrq $1, %xmm9, %rax
; WIDEN_AVX2-NEXT:    vmovss %xmm1, (%rax)
; WIDEN_AVX2-NEXT:    vextracti128 $1, %ymm9, %xmm0
; WIDEN_AVX2-NEXT:    vmovq %xmm0, %rax
; WIDEN_AVX2-NEXT:    vmovss %xmm2, (%rax)
; WIDEN_AVX2-NEXT:    vpextrq $1, %xmm0, %rax
; WIDEN_AVX2-NEXT:    vmovss %xmm3, (%rax)
; WIDEN_AVX2-NEXT:    vmovd %r9d, %xmm0
; WIDEN_AVX2-NEXT:    vpinsrd $1, {{[0-9]+}}(%rsp), %xmm0, %xmm0
; WIDEN_AVX2-NEXT:    vpinsrd $2, {{[0-9]+}}(%rsp), %xmm0, %xmm0
; WIDEN_AVX2-NEXT:    vpinsrd $3, {{[0-9]+}}(%rsp), %xmm0, %xmm0
; WIDEN_AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
; WIDEN_AVX2-NEXT:    vpsllq $2, %ymm0, %ymm0
; WIDEN_AVX2-NEXT:    vpaddq %ymm0, %ymm8, %ymm0
; WIDEN_AVX2-NEXT:    vmovq %xmm0, %rax
; WIDEN_AVX2-NEXT:    vmovss %xmm4, (%rax)
; WIDEN_AVX2-NEXT:    vpextrq $1, %xmm0, %rax
; WIDEN_AVX2-NEXT:    vmovss %xmm5, (%rax)
; WIDEN_AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; WIDEN_AVX2-NEXT:    vmovq %xmm0, %rax
; WIDEN_AVX2-NEXT:    vmovss %xmm6, (%rax)
; WIDEN_AVX2-NEXT:    vpextrq $1, %xmm0, %rax
; WIDEN_AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; WIDEN_AVX2-NEXT:    vmovss %xmm7, (%rax)
; WIDEN_AVX2-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; WIDEN_AVX2-NEXT:    vpinsrd $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1
; WIDEN_AVX2-NEXT:    vpinsrd $2, {{[0-9]+}}(%rsp), %xmm1, %xmm1
; WIDEN_AVX2-NEXT:    vpinsrd $3, {{[0-9]+}}(%rsp), %xmm1, %xmm1
; WIDEN_AVX2-NEXT:    vpmovsxdq %xmm1, %ymm1
; WIDEN_AVX2-NEXT:    vpsllq $2, %ymm1, %ymm1
; WIDEN_AVX2-NEXT:    vpaddq %ymm1, %ymm8, %ymm1
; WIDEN_AVX2-NEXT:    vmovq %xmm1, %rax
; WIDEN_AVX2-NEXT:    vmovss %xmm0, (%rax)
; WIDEN_AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; WIDEN_AVX2-NEXT:    vpextrq $1, %xmm1, %rax
; WIDEN_AVX2-NEXT:    vmovss %xmm0, (%rax)
; WIDEN_AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; WIDEN_AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
; WIDEN_AVX2-NEXT:    vmovq %xmm1, %rax
; WIDEN_AVX2-NEXT:    vmovss %xmm0, (%rax)
; WIDEN_AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; WIDEN_AVX2-NEXT:    vpextrq $1, %xmm1, %rax
; WIDEN_AVX2-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; WIDEN_AVX2-NEXT:    vmovss %xmm1, (%rax)
; WIDEN_AVX2-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; WIDEN_AVX2-NEXT:    vpinsrd $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1
; WIDEN_AVX2-NEXT:    vpinsrd $2, {{[0-9]+}}(%rsp), %xmm1, %xmm1
; WIDEN_AVX2-NEXT:    vpinsrd $3, {{[0-9]+}}(%rsp), %xmm1, %xmm1
; WIDEN_AVX2-NEXT:    vpmovsxdq %xmm1, %ymm1
; WIDEN_AVX2-NEXT:    vpsllq $2, %ymm1, %ymm1
; WIDEN_AVX2-NEXT:    vpaddq %ymm1, %ymm8, %ymm1
; WIDEN_AVX2-NEXT:    vmovq %xmm1, %rax
; WIDEN_AVX2-NEXT:    vmovss %xmm0, (%rax)
; WIDEN_AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; WIDEN_AVX2-NEXT:    vpextrq $1, %xmm1, %rax
; WIDEN_AVX2-NEXT:    vmovss %xmm0, (%rax)
; WIDEN_AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; WIDEN_AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
; WIDEN_AVX2-NEXT:    vmovq %xmm1, %rax
; WIDEN_AVX2-NEXT:    vmovss %xmm0, (%rax)
; WIDEN_AVX2-NEXT:    vpextrq $1, %xmm1, %rax
; WIDEN_AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; WIDEN_AVX2-NEXT:    vmovss %xmm0, (%rax)
; WIDEN_AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; WIDEN_AVX2-NEXT:    vpmovsxdq %xmm0, %xmm0
; WIDEN_AVX2-NEXT:    vpsllq $2, %xmm0, %xmm0
; WIDEN_AVX2-NEXT:    vpaddq %xmm0, %xmm8, %xmm0
; WIDEN_AVX2-NEXT:    vmovq %xmm0, %rax
; WIDEN_AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; WIDEN_AVX2-NEXT:    vmovss %xmm0, (%rax)
; WIDEN_AVX2-NEXT:    vzeroupper
; WIDEN_AVX2-NEXT:    retq
{
  %gep = getelementptr float, ptr %base, <17 x i32> %index
  call void @llvm.masked.scatter.v17f32.v17p0(<17 x float> %val, <17 x ptr> %gep, i32 4, <17 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

642define <17 x float> @test_mgather_v17f32(ptr %base, <17 x i32> %index)
643; WIDEN_SKX-LABEL: test_mgather_v17f32:
644; WIDEN_SKX:       # %bb.0:
645; WIDEN_SKX-NEXT:    movq %rdi, %rax
646; WIDEN_SKX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
647; WIDEN_SKX-NEXT:    vpinsrd $1, {{[0-9]+}}(%rsp), %xmm0, %xmm0
648; WIDEN_SKX-NEXT:    vpinsrd $2, {{[0-9]+}}(%rsp), %xmm0, %xmm0
649; WIDEN_SKX-NEXT:    vpinsrd $3, {{[0-9]+}}(%rsp), %xmm0, %xmm0
650; WIDEN_SKX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
651; WIDEN_SKX-NEXT:    vpinsrd $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1
652; WIDEN_SKX-NEXT:    vpinsrd $2, {{[0-9]+}}(%rsp), %xmm1, %xmm1
653; WIDEN_SKX-NEXT:    vpinsrd $3, {{[0-9]+}}(%rsp), %xmm1, %xmm1
654; WIDEN_SKX-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
655; WIDEN_SKX-NEXT:    vmovd %edx, %xmm1
656; WIDEN_SKX-NEXT:    vpinsrd $1, %ecx, %xmm1, %xmm1
657; WIDEN_SKX-NEXT:    vpinsrd $2, %r8d, %xmm1, %xmm1
658; WIDEN_SKX-NEXT:    vpinsrd $3, %r9d, %xmm1, %xmm1
659; WIDEN_SKX-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
660; WIDEN_SKX-NEXT:    vpinsrd $1, {{[0-9]+}}(%rsp), %xmm2, %xmm2
661; WIDEN_SKX-NEXT:    vpinsrd $2, {{[0-9]+}}(%rsp), %xmm2, %xmm2
662; WIDEN_SKX-NEXT:    vpinsrd $3, {{[0-9]+}}(%rsp), %xmm2, %xmm2
663; WIDEN_SKX-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
664; WIDEN_SKX-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
665; WIDEN_SKX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
666; WIDEN_SKX-NEXT:    kxnorw %k0, %k0, %k1
667; WIDEN_SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
668; WIDEN_SKX-NEXT:    vxorps %xmm3, %xmm3, %xmm3
669; WIDEN_SKX-NEXT:    vgatherdps (%rsi,%zmm0,4), %zmm3 {%k1}
670; WIDEN_SKX-NEXT:    movw $1, %cx
671; WIDEN_SKX-NEXT:    kmovw %ecx, %k1
672; WIDEN_SKX-NEXT:    vgatherdps (%rsi,%zmm1,4), %zmm2 {%k1}
673; WIDEN_SKX-NEXT:    vmovss %xmm2, 64(%rdi)
674; WIDEN_SKX-NEXT:    vmovaps %zmm3, (%rdi)
675; WIDEN_SKX-NEXT:    vzeroupper
676; WIDEN_SKX-NEXT:    retq
677;
678; WIDEN_KNL-LABEL: test_mgather_v17f32:
679; WIDEN_KNL:       # %bb.0:
680; WIDEN_KNL-NEXT:    movq %rdi, %rax
681; WIDEN_KNL-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
682; WIDEN_KNL-NEXT:    vpinsrd $1, {{[0-9]+}}(%rsp), %xmm0, %xmm0
683; WIDEN_KNL-NEXT:    vpinsrd $2, {{[0-9]+}}(%rsp), %xmm0, %xmm0
684; WIDEN_KNL-NEXT:    vpinsrd $3, {{[0-9]+}}(%rsp), %xmm0, %xmm0
685; WIDEN_KNL-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
686; WIDEN_KNL-NEXT:    vpinsrd $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1
687; WIDEN_KNL-NEXT:    vpinsrd $2, {{[0-9]+}}(%rsp), %xmm1, %xmm1
688; WIDEN_KNL-NEXT:    vpinsrd $3, {{[0-9]+}}(%rsp), %xmm1, %xmm1
689; WIDEN_KNL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
690; WIDEN_KNL-NEXT:    vmovd %edx, %xmm1
691; WIDEN_KNL-NEXT:    vpinsrd $1, %ecx, %xmm1, %xmm1
692; WIDEN_KNL-NEXT:    vpinsrd $2, %r8d, %xmm1, %xmm1
693; WIDEN_KNL-NEXT:    vpinsrd $3, %r9d, %xmm1, %xmm1
694; WIDEN_KNL-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
695; WIDEN_KNL-NEXT:    vpinsrd $1, {{[0-9]+}}(%rsp), %xmm2, %xmm2
696; WIDEN_KNL-NEXT:    vpinsrd $2, {{[0-9]+}}(%rsp), %xmm2, %xmm2
697; WIDEN_KNL-NEXT:    vpinsrd $3, {{[0-9]+}}(%rsp), %xmm2, %xmm2
698; WIDEN_KNL-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
699; WIDEN_KNL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
700; WIDEN_KNL-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
701; WIDEN_KNL-NEXT:    kxnorw %k0, %k0, %k1
702; WIDEN_KNL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
703; WIDEN_KNL-NEXT:    vxorps %xmm3, %xmm3, %xmm3
704; WIDEN_KNL-NEXT:    vgatherdps (%rsi,%zmm0,4), %zmm3 {%k1}
705; WIDEN_KNL-NEXT:    movw $1, %cx
706; WIDEN_KNL-NEXT:    kmovw %ecx, %k1
707; WIDEN_KNL-NEXT:    vgatherdps (%rsi,%zmm1,4), %zmm2 {%k1}
708; WIDEN_KNL-NEXT:    vmovss %xmm2, 64(%rdi)
709; WIDEN_KNL-NEXT:    vmovaps %zmm3, (%rdi)
710; WIDEN_KNL-NEXT:    vzeroupper
711; WIDEN_KNL-NEXT:    retq
712;
713; WIDEN_AVX2-LABEL: test_mgather_v17f32:
714; WIDEN_AVX2:       # %bb.0:
715; WIDEN_AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
716; WIDEN_AVX2-NEXT:    vpinsrd $1, {{[0-9]+}}(%rsp), %xmm0, %xmm0
717; WIDEN_AVX2-NEXT:    vpinsrd $2, {{[0-9]+}}(%rsp), %xmm0, %xmm0
718; WIDEN_AVX2-NEXT:    movq %rdi, %rax
719; WIDEN_AVX2-NEXT:    vpinsrd $3, {{[0-9]+}}(%rsp), %xmm0, %xmm0
720; WIDEN_AVX2-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
721; WIDEN_AVX2-NEXT:    vpinsrd $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1
722; WIDEN_AVX2-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
723; WIDEN_AVX2-NEXT:    vpinsrd $2, {{[0-9]+}}(%rsp), %xmm1, %xmm1
724; WIDEN_AVX2-NEXT:    vpinsrd $3, {{[0-9]+}}(%rsp), %xmm1, %xmm1
725; WIDEN_AVX2-NEXT:    vmovd %edx, %xmm3
726; WIDEN_AVX2-NEXT:    vpinsrd $1, %ecx, %xmm3, %xmm3
727; WIDEN_AVX2-NEXT:    vpinsrd $2, %r8d, %xmm3, %xmm3
728; WIDEN_AVX2-NEXT:    vpinsrd $3, %r9d, %xmm3, %xmm3
729; WIDEN_AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
730; WIDEN_AVX2-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
731; WIDEN_AVX2-NEXT:    vpinsrd $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1
732; WIDEN_AVX2-NEXT:    vpinsrd $2, {{[0-9]+}}(%rsp), %xmm1, %xmm1
733; WIDEN_AVX2-NEXT:    vpinsrd $3, {{[0-9]+}}(%rsp), %xmm1, %xmm1
734; WIDEN_AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm3, %ymm1
735; WIDEN_AVX2-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
736; WIDEN_AVX2-NEXT:    vxorps %xmm4, %xmm4, %xmm4
737; WIDEN_AVX2-NEXT:    vpcmpeqd %ymm5, %ymm5, %ymm5
738; WIDEN_AVX2-NEXT:    vxorps %xmm6, %xmm6, %xmm6
739; WIDEN_AVX2-NEXT:    vgatherdps %ymm5, (%rsi,%ymm1,4), %ymm6
740; WIDEN_AVX2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
741; WIDEN_AVX2-NEXT:    vgatherdps %ymm3, (%rsi,%ymm0,4), %ymm1
742; WIDEN_AVX2-NEXT:    vmovss {{.*#+}} xmm0 = [4294967295,0,0,0]
743; WIDEN_AVX2-NEXT:    vgatherdps %ymm0, (%rsi,%ymm2,4), %ymm4
744; WIDEN_AVX2-NEXT:    vmovss %xmm4, 64(%rdi)
745; WIDEN_AVX2-NEXT:    vmovaps %ymm1, 32(%rdi)
746; WIDEN_AVX2-NEXT:    vmovaps %ymm6, (%rdi)
747; WIDEN_AVX2-NEXT:    vzeroupper
748; WIDEN_AVX2-NEXT:    retq
749{
750  %gep = getelementptr float, ptr %base, <17 x i32> %index
751  %res = call <17 x float> @llvm.masked.gather.v17f32.v17p0(<17 x ptr> %gep, i32 4, <17 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <17 x float> undef)
752  ret <17 x float> %res
753}
754
755declare <17 x float> @llvm.masked.gather.v17f32.v17p0(<17 x ptr>, i32 immarg, <17 x i1>, <17 x float>)
756declare void @llvm.masked.scatter.v17f32.v17p0(<17 x float> , <17 x ptr> , i32 , <17 x i1>)
757
758declare <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x double>)
759declare void @llvm.masked.scatter.v2f64.v2p0(<2 x double>, <2 x ptr>, i32, <2 x i1>)
760declare <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i32>)
761declare void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> , <2 x ptr> , i32 , <2 x i1>)
762