; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VL

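; Each test loads a vector, extracts every lane, reduces the lanes with a tree
; of scalar 'and's, and compares the result against -1: a scalarized all-of
; reduction. The checks show the backend matching this back to whole-vector
; all-ones tests (movmsk/ptest/kortest) instead of per-lane extracts.
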
;
; vXi64
;

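; For a single 128-bit vector, SSE2 compares against all-ones and inspects the
; sign mask (movmskps + xorl $15); a 64-bit lane is all-ones exactly when both
; of its 32-bit halves are, so the dword compare suffices. SSE4.1/AVX instead
; ptest against an all-ones mask: CF is set only when the tested value has
; every bit set, so setb yields the all-of result directly.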
define i1 @test_v2i64(ptr %ptr) nounwind {
; SSE2-LABEL: test_v2i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE2-NEXT:    pcmpeqd (%rdi), %xmm0
; SSE2-NEXT:    movmskps %xmm0, %eax
; SSE2-NEXT:    xorl $15, %eax
; SSE2-NEXT:    sete %al
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2i64:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa (%rdi), %xmm0
; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE41-NEXT:    ptest %xmm1, %xmm0
; SSE41-NEXT:    setb %al
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v2i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vptest %xmm1, %xmm0
; AVX-NEXT:    setb %al
; AVX-NEXT:    retq
  %vload = load <2 x i64>, ptr %ptr
  %v0 = extractelement <2 x i64> %vload, i32 0
  %v1 = extractelement <2 x i64> %vload, i32 1
  %vreduce = and i64 %v0, %v1
  %vcheck = icmp eq i64 %vreduce, -1
  ret i1 %vcheck
}

define i1 @test_v4i64(ptr %ptr) nounwind {
; SSE2-LABEL: test_v4i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pand 16(%rdi), %xmm0
; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE2-NEXT:    pcmpeqd %xmm0, %xmm1
; SSE2-NEXT:    movmskps %xmm1, %eax
; SSE2-NEXT:    xorl $15, %eax
; SSE2-NEXT:    sete %al
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4i64:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa (%rdi), %xmm0
; SSE41-NEXT:    pand 16(%rdi), %xmm0
; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE41-NEXT:    ptest %xmm1, %xmm0
; SSE41-NEXT:    setb %al
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vcmptrueps %ymm1, %ymm1, %ymm1
; AVX1-NEXT:    vptest %ymm1, %ymm0
; AVX1-NEXT:    setb %al
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v4i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vptest %ymm1, %ymm0
; AVX2-NEXT:    setb %al
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v4i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512-NEXT:    vptest %ymm1, %ymm0
; AVX512-NEXT:    setb %al
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vload = load <4 x i64>, ptr %ptr
  %v0 = extractelement <4 x i64> %vload, i32 0
  %v1 = extractelement <4 x i64> %vload, i32 1
  %v2 = extractelement <4 x i64> %vload, i32 2
  %v3 = extractelement <4 x i64> %vload, i32 3
  %vreduce01 = and i64 %v0, %v1
  %vreduce23 = and i64 %v2, %v3
  %vreduce = and i64 %vreduce01, %vreduce23
  %vcheck = icmp eq i64 %vreduce, -1
  ret i1 %vcheck
}

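; From 512 bits up, AVX512 materializes all-ones with vpternlogd, compares the
; input for inequality into a mask register, and tests it with kortestw: the
; mask is zero (ZF set, sete) exactly when no lane differed from -1.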
define i1 @test_v8i64(ptr %ptr) nounwind {
; SSE2-LABEL: test_v8i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
; SSE2-NEXT:    pand 48(%rdi), %xmm1
; SSE2-NEXT:    pand 32(%rdi), %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE2-NEXT:    pcmpeqd %xmm0, %xmm1
; SSE2-NEXT:    movmskps %xmm1, %eax
; SSE2-NEXT:    xorl $15, %eax
; SSE2-NEXT:    sete %al
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8i64:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa (%rdi), %xmm0
; SSE41-NEXT:    movdqa 16(%rdi), %xmm1
; SSE41-NEXT:    pand 48(%rdi), %xmm1
; SSE41-NEXT:    pand 32(%rdi), %xmm0
; SSE41-NEXT:    pand %xmm1, %xmm0
; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE41-NEXT:    ptest %xmm1, %xmm0
; SSE41-NEXT:    setb %al
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v8i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovaps (%rdi), %ymm0
; AVX1-NEXT:    vandps 32(%rdi), %ymm0, %ymm0
; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vcmptrueps %ymm1, %ymm1, %ymm1
; AVX1-NEXT:    vptest %ymm1, %ymm0
; AVX1-NEXT:    setb %al
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v8i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpand 32(%rdi), %ymm0, %ymm0
; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vptest %ymm1, %ymm0
; AVX2-NEXT:    setb %al
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v8i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpternlogd {{.*#+}} zmm0 = -1
; AVX512-NEXT:    vpcmpneqd (%rdi), %zmm0, %k0
; AVX512-NEXT:    kortestw %k0, %k0
; AVX512-NEXT:    sete %al
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vload = load <8 x i64>, ptr %ptr
  %v0 = extractelement <8 x i64> %vload, i32 0
  %v1 = extractelement <8 x i64> %vload, i32 1
  %v2 = extractelement <8 x i64> %vload, i32 2
  %v3 = extractelement <8 x i64> %vload, i32 3
  %v4 = extractelement <8 x i64> %vload, i32 4
  %v5 = extractelement <8 x i64> %vload, i32 5
  %v6 = extractelement <8 x i64> %vload, i32 6
  %v7 = extractelement <8 x i64> %vload, i32 7
  %vreduce01 = and i64 %v0, %v1
  %vreduce23 = and i64 %v2, %v3
  %vreduce45 = and i64 %v4, %v5
  %vreduce67 = and i64 %v6, %v7
  %vreduce0123 = and i64 %vreduce01, %vreduce23
  %vreduce4567 = and i64 %vreduce45, %vreduce67
  %vreduce = and i64 %vreduce0123, %vreduce4567
  %vcheck = icmp eq i64 %vreduce, -1
  ret i1 %vcheck
}

define i1 @test_v16i64(ptr %ptr) nounwind {
; SSE2-LABEL: test_v16i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
; SSE2-NEXT:    movdqa 32(%rdi), %xmm2
; SSE2-NEXT:    movdqa 48(%rdi), %xmm3
; SSE2-NEXT:    pand 112(%rdi), %xmm3
; SSE2-NEXT:    pand 80(%rdi), %xmm1
; SSE2-NEXT:    pand %xmm3, %xmm1
; SSE2-NEXT:    pand 96(%rdi), %xmm2
; SSE2-NEXT:    pand 64(%rdi), %xmm0
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE2-NEXT:    pcmpeqd %xmm0, %xmm1
; SSE2-NEXT:    movmskps %xmm1, %eax
; SSE2-NEXT:    xorl $15, %eax
; SSE2-NEXT:    sete %al
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16i64:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa (%rdi), %xmm0
; SSE41-NEXT:    movdqa 16(%rdi), %xmm1
; SSE41-NEXT:    movdqa 32(%rdi), %xmm2
; SSE41-NEXT:    movdqa 48(%rdi), %xmm3
; SSE41-NEXT:    pand 112(%rdi), %xmm3
; SSE41-NEXT:    pand 80(%rdi), %xmm1
; SSE41-NEXT:    pand %xmm3, %xmm1
; SSE41-NEXT:    pand 96(%rdi), %xmm2
; SSE41-NEXT:    pand 64(%rdi), %xmm0
; SSE41-NEXT:    pand %xmm2, %xmm0
; SSE41-NEXT:    pand %xmm1, %xmm0
; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE41-NEXT:    ptest %xmm1, %xmm0
; SSE41-NEXT:    setb %al
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v16i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovaps (%rdi), %ymm0
; AVX1-NEXT:    vmovaps 32(%rdi), %ymm1
; AVX1-NEXT:    vandps 96(%rdi), %ymm1, %ymm1
; AVX1-NEXT:    vandps 64(%rdi), %ymm0, %ymm0
; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vcmptrueps %ymm1, %ymm1, %ymm1
; AVX1-NEXT:    vptest %ymm1, %ymm0
; AVX1-NEXT:    setb %al
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v16i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX2-NEXT:    vpand 96(%rdi), %ymm1, %ymm1
; AVX2-NEXT:    vpand 64(%rdi), %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vptest %ymm1, %ymm0
; AVX2-NEXT:    setb %al
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT:    vpandq 64(%rdi), %zmm0, %zmm0
; AVX512-NEXT:    vpternlogd {{.*#+}} zmm1 = -1
; AVX512-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
; AVX512-NEXT:    kortestw %k0, %k0
; AVX512-NEXT:    sete %al
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vload = load <16 x i64>, ptr %ptr
  %v0  = extractelement <16 x i64> %vload, i32 0
  %v1  = extractelement <16 x i64> %vload, i32 1
  %v2  = extractelement <16 x i64> %vload, i32 2
  %v3  = extractelement <16 x i64> %vload, i32 3
  %v4  = extractelement <16 x i64> %vload, i32 4
  %v5  = extractelement <16 x i64> %vload, i32 5
  %v6  = extractelement <16 x i64> %vload, i32 6
  %v7  = extractelement <16 x i64> %vload, i32 7
  %v8  = extractelement <16 x i64> %vload, i32 8
  %v9  = extractelement <16 x i64> %vload, i32 9
  %v10 = extractelement <16 x i64> %vload, i32 10
  %v11 = extractelement <16 x i64> %vload, i32 11
  %v12 = extractelement <16 x i64> %vload, i32 12
  %v13 = extractelement <16 x i64> %vload, i32 13
  %v14 = extractelement <16 x i64> %vload, i32 14
  %v15 = extractelement <16 x i64> %vload, i32 15
  %vreduce01 = and i64 %v0, %v1
  %vreduce23 = and i64 %v2, %v3
  %vreduce45 = and i64 %v4, %v5
  %vreduce67 = and i64 %v6, %v7
  %vreduce89 = and i64 %v8, %v9
  %vreduce1011 = and i64 %v10, %v11
  %vreduce1213 = and i64 %v12, %v13
  %vreduce1415 = and i64 %v14, %v15
  %vreduce0123 = and i64 %vreduce01, %vreduce23
  %vreduce4567 = and i64 %vreduce45, %vreduce67
  %vreduce891011 = and i64 %vreduce89, %vreduce1011
  %vreduce12131415 = and i64 %vreduce1213, %vreduce1415
  %vreduce01234567 = and i64 %vreduce0123, %vreduce4567
  %vreduce89101112131415 = and i64 %vreduce891011, %vreduce12131415
  %vreduce = and i64 %vreduce01234567, %vreduce89101112131415
  %vcheck = icmp eq i64 %vreduce, -1
  ret i1 %vcheck
}

;
; vXi32
;

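; Reductions narrower than 128 bits collapse to a single scalar compare on the
; loaded memory: and-reducing all lanes and testing for -1 is equivalent to
; testing whether the whole load is all-ones.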
define i1 @test_v2i32(ptr %ptr) nounwind {
; SSE-LABEL: test_v2i32:
; SSE:       # %bb.0:
; SSE-NEXT:    cmpq $-1, (%rdi)
; SSE-NEXT:    sete %al
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2i32:
; AVX:       # %bb.0:
; AVX-NEXT:    cmpq $-1, (%rdi)
; AVX-NEXT:    sete %al
; AVX-NEXT:    retq
  %vload = load <2 x i32>, ptr %ptr
  %v0 = extractelement <2 x i32> %vload, i32 0
  %v1 = extractelement <2 x i32> %vload, i32 1
  %vreduce = and i32 %v0, %v1
  %vcheck = icmp eq i32 %vreduce, -1
  ret i1 %vcheck
}

define i1 @test_v4i32(ptr %ptr) nounwind {
; SSE2-LABEL: test_v4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE2-NEXT:    pcmpeqd (%rdi), %xmm0
; SSE2-NEXT:    movmskps %xmm0, %eax
; SSE2-NEXT:    xorl $15, %eax
; SSE2-NEXT:    sete %al
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa (%rdi), %xmm0
; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE41-NEXT:    ptest %xmm1, %xmm0
; SSE41-NEXT:    setb %al
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vptest %xmm1, %xmm0
; AVX-NEXT:    setb %al
; AVX-NEXT:    retq
  %vload = load <4 x i32>, ptr %ptr
  %v0 = extractelement <4 x i32> %vload, i32 0
  %v1 = extractelement <4 x i32> %vload, i32 1
  %v2 = extractelement <4 x i32> %vload, i32 2
  %v3 = extractelement <4 x i32> %vload, i32 3
  %vreduce01 = and i32 %v0, %v1
  %vreduce23 = and i32 %v2, %v3
  %vreduce = and i32 %vreduce01, %vreduce23
  %vcheck = icmp eq i32 %vreduce, -1
  ret i1 %vcheck
}

define i1 @test_v8i32(ptr %ptr) nounwind {
; SSE2-LABEL: test_v8i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pand 16(%rdi), %xmm0
; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE2-NEXT:    pcmpeqd %xmm0, %xmm1
; SSE2-NEXT:    movmskps %xmm1, %eax
; SSE2-NEXT:    xorl $15, %eax
; SSE2-NEXT:    sete %al
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa (%rdi), %xmm0
; SSE41-NEXT:    pand 16(%rdi), %xmm0
; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE41-NEXT:    ptest %xmm1, %xmm0
; SSE41-NEXT:    setb %al
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vcmptrueps %ymm1, %ymm1, %ymm1
; AVX1-NEXT:    vptest %ymm1, %ymm0
; AVX1-NEXT:    setb %al
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vptest %ymm1, %ymm0
; AVX2-NEXT:    setb %al
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v8i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512-NEXT:    vptest %ymm1, %ymm0
; AVX512-NEXT:    setb %al
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vload = load <8 x i32>, ptr %ptr
  %v0 = extractelement <8 x i32> %vload, i32 0
  %v1 = extractelement <8 x i32> %vload, i32 1
  %v2 = extractelement <8 x i32> %vload, i32 2
  %v3 = extractelement <8 x i32> %vload, i32 3
  %v4 = extractelement <8 x i32> %vload, i32 4
  %v5 = extractelement <8 x i32> %vload, i32 5
  %v6 = extractelement <8 x i32> %vload, i32 6
  %v7 = extractelement <8 x i32> %vload, i32 7
  %vreduce01 = and i32 %v0, %v1
  %vreduce23 = and i32 %v2, %v3
  %vreduce45 = and i32 %v4, %v5
  %vreduce67 = and i32 %v6, %v7
  %vreduce0123 = and i32 %vreduce01, %vreduce23
  %vreduce4567 = and i32 %vreduce45, %vreduce67
  %vreduce = and i32 %vreduce0123, %vreduce4567
  %vcheck = icmp eq i32 %vreduce, -1
  ret i1 %vcheck
}

define i1 @test_v16i32(ptr %ptr) nounwind {
; SSE2-LABEL: test_v16i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
; SSE2-NEXT:    pand 48(%rdi), %xmm1
; SSE2-NEXT:    pand 32(%rdi), %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE2-NEXT:    pcmpeqd %xmm0, %xmm1
; SSE2-NEXT:    movmskps %xmm1, %eax
; SSE2-NEXT:    xorl $15, %eax
; SSE2-NEXT:    sete %al
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa (%rdi), %xmm0
; SSE41-NEXT:    movdqa 16(%rdi), %xmm1
; SSE41-NEXT:    pand 48(%rdi), %xmm1
; SSE41-NEXT:    pand 32(%rdi), %xmm0
; SSE41-NEXT:    pand %xmm1, %xmm0
; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE41-NEXT:    ptest %xmm1, %xmm0
; SSE41-NEXT:    setb %al
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v16i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovaps (%rdi), %ymm0
; AVX1-NEXT:    vandps 32(%rdi), %ymm0, %ymm0
; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vcmptrueps %ymm1, %ymm1, %ymm1
; AVX1-NEXT:    vptest %ymm1, %ymm0
; AVX1-NEXT:    setb %al
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v16i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpand 32(%rdi), %ymm0, %ymm0
; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vptest %ymm1, %ymm0
; AVX2-NEXT:    setb %al
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpternlogd {{.*#+}} zmm0 = -1
; AVX512-NEXT:    vpcmpneqd (%rdi), %zmm0, %k0
; AVX512-NEXT:    kortestw %k0, %k0
; AVX512-NEXT:    sete %al
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vload = load <16 x i32>, ptr %ptr
  %v0  = extractelement <16 x i32> %vload, i32 0
  %v1  = extractelement <16 x i32> %vload, i32 1
  %v2  = extractelement <16 x i32> %vload, i32 2
  %v3  = extractelement <16 x i32> %vload, i32 3
  %v4  = extractelement <16 x i32> %vload, i32 4
  %v5  = extractelement <16 x i32> %vload, i32 5
  %v6  = extractelement <16 x i32> %vload, i32 6
  %v7  = extractelement <16 x i32> %vload, i32 7
  %v8  = extractelement <16 x i32> %vload, i32 8
  %v9  = extractelement <16 x i32> %vload, i32 9
  %v10 = extractelement <16 x i32> %vload, i32 10
  %v11 = extractelement <16 x i32> %vload, i32 11
  %v12 = extractelement <16 x i32> %vload, i32 12
  %v13 = extractelement <16 x i32> %vload, i32 13
  %v14 = extractelement <16 x i32> %vload, i32 14
  %v15 = extractelement <16 x i32> %vload, i32 15
  %vreduce01 = and i32 %v0, %v1
  %vreduce23 = and i32 %v2, %v3
  %vreduce45 = and i32 %v4, %v5
  %vreduce67 = and i32 %v6, %v7
  %vreduce89 = and i32 %v8, %v9
  %vreduce1011 = and i32 %v10, %v11
  %vreduce1213 = and i32 %v12, %v13
  %vreduce1415 = and i32 %v14, %v15
  %vreduce0123 = and i32 %vreduce01, %vreduce23
  %vreduce4567 = and i32 %vreduce45, %vreduce67
  %vreduce891011 = and i32 %vreduce89, %vreduce1011
  %vreduce12131415 = and i32 %vreduce1213, %vreduce1415
  %vreduce01234567 = and i32 %vreduce0123, %vreduce4567
  %vreduce89101112131415 = and i32 %vreduce891011, %vreduce12131415
  %vreduce = and i32 %vreduce01234567, %vreduce89101112131415
  %vcheck = icmp eq i32 %vreduce, -1
  ret i1 %vcheck
}

;
; vXi16
;

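; i16 follows the same pattern: the 32- and 64-bit loads fold to cmpl/cmpq
; against -1, and at 128 bits SSE2 switches to a byte compare
; (pcmpeqb + pmovmskb), which is sound because a vector is all-ones exactly
; when every byte is all-ones.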
define i1 @test_v2i16(ptr %ptr) nounwind {
; SSE-LABEL: test_v2i16:
; SSE:       # %bb.0:
; SSE-NEXT:    cmpl $-1, (%rdi)
; SSE-NEXT:    sete %al
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2i16:
; AVX:       # %bb.0:
; AVX-NEXT:    cmpl $-1, (%rdi)
; AVX-NEXT:    sete %al
; AVX-NEXT:    retq
  %vload = load <2 x i16>, ptr %ptr
  %v0 = extractelement <2 x i16> %vload, i32 0
  %v1 = extractelement <2 x i16> %vload, i32 1
  %vreduce = and i16 %v0, %v1
  %vcheck = icmp eq i16 %vreduce, -1
  ret i1 %vcheck
}

define i1 @test_v4i16(ptr %ptr) nounwind {
; SSE-LABEL: test_v4i16:
; SSE:       # %bb.0:
; SSE-NEXT:    cmpq $-1, (%rdi)
; SSE-NEXT:    sete %al
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4i16:
; AVX:       # %bb.0:
; AVX-NEXT:    cmpq $-1, (%rdi)
; AVX-NEXT:    sete %al
; AVX-NEXT:    retq
  %vload = load <4 x i16>, ptr %ptr
  %v0 = extractelement <4 x i16> %vload, i32 0
  %v1 = extractelement <4 x i16> %vload, i32 1
  %v2 = extractelement <4 x i16> %vload, i32 2
  %v3 = extractelement <4 x i16> %vload, i32 3
  %vreduce01 = and i16 %v0, %v1
  %vreduce23 = and i16 %v2, %v3
  %vreduce = and i16 %vreduce01, %vreduce23
  %vcheck = icmp eq i16 %vreduce, -1
  ret i1 %vcheck
}

define i1 @test_v8i16(ptr %ptr) nounwind {
; SSE2-LABEL: test_v8i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE2-NEXT:    pcmpeqb (%rdi), %xmm0
; SSE2-NEXT:    pmovmskb %xmm0, %eax
; SSE2-NEXT:    xorl $65535, %eax # imm = 0xFFFF
; SSE2-NEXT:    sete %al
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa (%rdi), %xmm0
; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE41-NEXT:    ptest %xmm1, %xmm0
; SSE41-NEXT:    setb %al
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vptest %xmm1, %xmm0
; AVX-NEXT:    setb %al
; AVX-NEXT:    retq
  %vload = load <8 x i16>, ptr %ptr
  %v0 = extractelement <8 x i16> %vload, i32 0
  %v1 = extractelement <8 x i16> %vload, i32 1
  %v2 = extractelement <8 x i16> %vload, i32 2
  %v3 = extractelement <8 x i16> %vload, i32 3
  %v4 = extractelement <8 x i16> %vload, i32 4
  %v5 = extractelement <8 x i16> %vload, i32 5
  %v6 = extractelement <8 x i16> %vload, i32 6
  %v7 = extractelement <8 x i16> %vload, i32 7
  %vreduce01 = and i16 %v0, %v1
  %vreduce23 = and i16 %v2, %v3
  %vreduce45 = and i16 %v4, %v5
  %vreduce67 = and i16 %v6, %v7
  %vreduce0123 = and i16 %vreduce01, %vreduce23
  %vreduce4567 = and i16 %vreduce45, %vreduce67
  %vreduce = and i16 %vreduce0123, %vreduce4567
  %vcheck = icmp eq i16 %vreduce, -1
  ret i1 %vcheck
}

define i1 @test_v16i16(ptr %ptr) nounwind {
; SSE2-LABEL: test_v16i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pand 16(%rdi), %xmm0
; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
; SSE2-NEXT:    pmovmskb %xmm1, %eax
; SSE2-NEXT:    xorl $65535, %eax # imm = 0xFFFF
; SSE2-NEXT:    sete %al
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa (%rdi), %xmm0
; SSE41-NEXT:    pand 16(%rdi), %xmm0
; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE41-NEXT:    ptest %xmm1, %xmm0
; SSE41-NEXT:    setb %al
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vcmptrueps %ymm1, %ymm1, %ymm1
; AVX1-NEXT:    vptest %ymm1, %ymm0
; AVX1-NEXT:    setb %al
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vptest %ymm1, %ymm0
; AVX2-NEXT:    setb %al
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512-NEXT:    vptest %ymm1, %ymm0
; AVX512-NEXT:    setb %al
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vload = load <16 x i16>, ptr %ptr
  %v0  = extractelement <16 x i16> %vload, i32 0
  %v1  = extractelement <16 x i16> %vload, i32 1
  %v2  = extractelement <16 x i16> %vload, i32 2
  %v3  = extractelement <16 x i16> %vload, i32 3
  %v4  = extractelement <16 x i16> %vload, i32 4
  %v5  = extractelement <16 x i16> %vload, i32 5
  %v6  = extractelement <16 x i16> %vload, i32 6
  %v7  = extractelement <16 x i16> %vload, i32 7
  %v8  = extractelement <16 x i16> %vload, i32 8
  %v9  = extractelement <16 x i16> %vload, i32 9
  %v10 = extractelement <16 x i16> %vload, i32 10
  %v11 = extractelement <16 x i16> %vload, i32 11
  %v12 = extractelement <16 x i16> %vload, i32 12
  %v13 = extractelement <16 x i16> %vload, i32 13
  %v14 = extractelement <16 x i16> %vload, i32 14
  %v15 = extractelement <16 x i16> %vload, i32 15
  %vreduce01 = and i16 %v0, %v1
  %vreduce23 = and i16 %v2, %v3
  %vreduce45 = and i16 %v4, %v5
  %vreduce67 = and i16 %v6, %v7
  %vreduce89 = and i16 %v8, %v9
  %vreduce1011 = and i16 %v10, %v11
  %vreduce1213 = and i16 %v12, %v13
  %vreduce1415 = and i16 %v14, %v15
  %vreduce0123 = and i16 %vreduce01, %vreduce23
  %vreduce4567 = and i16 %vreduce45, %vreduce67
  %vreduce891011 = and i16 %vreduce89, %vreduce1011
  %vreduce12131415 = and i16 %vreduce1213, %vreduce1415
  %vreduce01234567 = and i16 %vreduce0123, %vreduce4567
  %vreduce89101112131415 = and i16 %vreduce891011, %vreduce12131415
  %vreduce = and i16 %vreduce01234567, %vreduce89101112131415
  %vcheck = icmp eq i16 %vreduce, -1
  ret i1 %vcheck
}

;
; vXi8
;

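; i8 behaves the same way: v2i8/v4i8/v8i8 fold to cmpw/cmpl/cmpq on the loaded
; memory, and v16i8 uses the same 128-bit all-ones idioms as the wider element
; types.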
define i1 @test_v2i8(ptr %ptr) nounwind {
; SSE-LABEL: test_v2i8:
; SSE:       # %bb.0:
; SSE-NEXT:    cmpw $-1, (%rdi)
; SSE-NEXT:    sete %al
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2i8:
; AVX:       # %bb.0:
; AVX-NEXT:    cmpw $-1, (%rdi)
; AVX-NEXT:    sete %al
; AVX-NEXT:    retq
  %vload = load <2 x i8>, ptr %ptr
  %v0 = extractelement <2 x i8> %vload, i32 0
  %v1 = extractelement <2 x i8> %vload, i32 1
  %vreduce = and i8 %v0, %v1
  %vcheck = icmp eq i8 %vreduce, -1
  ret i1 %vcheck
}

define i1 @test_v4i8(ptr %ptr) nounwind {
; SSE-LABEL: test_v4i8:
; SSE:       # %bb.0:
; SSE-NEXT:    cmpl $-1, (%rdi)
; SSE-NEXT:    sete %al
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4i8:
; AVX:       # %bb.0:
; AVX-NEXT:    cmpl $-1, (%rdi)
; AVX-NEXT:    sete %al
; AVX-NEXT:    retq
  %vload = load <4 x i8>, ptr %ptr
  %v0 = extractelement <4 x i8> %vload, i32 0
  %v1 = extractelement <4 x i8> %vload, i32 1
  %v2 = extractelement <4 x i8> %vload, i32 2
  %v3 = extractelement <4 x i8> %vload, i32 3
  %vreduce01 = and i8 %v0, %v1
  %vreduce23 = and i8 %v2, %v3
  %vreduce = and i8 %vreduce01, %vreduce23
  %vcheck = icmp eq i8 %vreduce, -1
  ret i1 %vcheck
}

define i1 @test_v8i8(ptr %ptr) nounwind {
; SSE-LABEL: test_v8i8:
; SSE:       # %bb.0:
; SSE-NEXT:    cmpq $-1, (%rdi)
; SSE-NEXT:    sete %al
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8i8:
; AVX:       # %bb.0:
; AVX-NEXT:    cmpq $-1, (%rdi)
; AVX-NEXT:    sete %al
; AVX-NEXT:    retq
  %vload = load <8 x i8>, ptr %ptr
  %v0 = extractelement <8 x i8> %vload, i32 0
  %v1 = extractelement <8 x i8> %vload, i32 1
  %v2 = extractelement <8 x i8> %vload, i32 2
  %v3 = extractelement <8 x i8> %vload, i32 3
  %v4 = extractelement <8 x i8> %vload, i32 4
  %v5 = extractelement <8 x i8> %vload, i32 5
  %v6 = extractelement <8 x i8> %vload, i32 6
  %v7 = extractelement <8 x i8> %vload, i32 7
  %vreduce01 = and i8 %v0, %v1
  %vreduce23 = and i8 %v2, %v3
  %vreduce45 = and i8 %v4, %v5
  %vreduce67 = and i8 %v6, %v7
  %vreduce0123 = and i8 %vreduce01, %vreduce23
  %vreduce4567 = and i8 %vreduce45, %vreduce67
  %vreduce = and i8 %vreduce0123, %vreduce4567
  %vcheck = icmp eq i8 %vreduce, -1
  ret i1 %vcheck
}

define i1 @test_v16i8(ptr %ptr) nounwind {
; SSE2-LABEL: test_v16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE2-NEXT:    pcmpeqb (%rdi), %xmm0
; SSE2-NEXT:    pmovmskb %xmm0, %eax
; SSE2-NEXT:    xorl $65535, %eax # imm = 0xFFFF
; SSE2-NEXT:    sete %al
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa (%rdi), %xmm0
; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE41-NEXT:    ptest %xmm1, %xmm0
; SSE41-NEXT:    setb %al
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vptest %xmm1, %xmm0
; AVX-NEXT:    setb %al
; AVX-NEXT:    retq
  %vload = load <16 x i8>, ptr %ptr
  %v0  = extractelement <16 x i8> %vload, i32 0
  %v1  = extractelement <16 x i8> %vload, i32 1
  %v2  = extractelement <16 x i8> %vload, i32 2
  %v3  = extractelement <16 x i8> %vload, i32 3
  %v4  = extractelement <16 x i8> %vload, i32 4
  %v5  = extractelement <16 x i8> %vload, i32 5
  %v6  = extractelement <16 x i8> %vload, i32 6
  %v7  = extractelement <16 x i8> %vload, i32 7
  %v8  = extractelement <16 x i8> %vload, i32 8
  %v9  = extractelement <16 x i8> %vload, i32 9
  %v10 = extractelement <16 x i8> %vload, i32 10
  %v11 = extractelement <16 x i8> %vload, i32 11
  %v12 = extractelement <16 x i8> %vload, i32 12
  %v13 = extractelement <16 x i8> %vload, i32 13
  %v14 = extractelement <16 x i8> %vload, i32 14
  %v15 = extractelement <16 x i8> %vload, i32 15
  %vreduce01 = and i8 %v0, %v1
  %vreduce23 = and i8 %v2, %v3
  %vreduce45 = and i8 %v4, %v5
  %vreduce67 = and i8 %v6, %v7
  %vreduce89 = and i8 %v8, %v9
  %vreduce1011 = and i8 %v10, %v11
  %vreduce1213 = and i8 %v12, %v13
  %vreduce1415 = and i8 %v14, %v15
  %vreduce0123 = and i8 %vreduce01, %vreduce23
  %vreduce4567 = and i8 %vreduce45, %vreduce67
  %vreduce891011 = and i8 %vreduce89, %vreduce1011
  %vreduce12131415 = and i8 %vreduce1213, %vreduce1415
  %vreduce01234567 = and i8 %vreduce0123, %vreduce4567
  %vreduce89101112131415 = and i8 %vreduce891011, %vreduce12131415
  %vreduce = and i8 %vreduce01234567, %vreduce89101112131415
  %vcheck = icmp eq i8 %vreduce, -1
  ret i1 %vcheck
}

;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; AVX1OR2: {{.*}}
; AVX512BW: {{.*}}
; AVX512F: {{.*}}
; AVX512VL: {{.*}}