; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 < %s | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx    < %s | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx2   < %s | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+avx512bw < %s | FileCheck %s --check-prefixes=CHECK,AVX,AVX512

define i32 @and_self(i32 %x) {
; CHECK-LABEL: and_self:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    retq
  %and = and i32 %x, %x
  ret i32 %and
}

define <4 x i32> @and_self_vec(<4 x i32> %x) {
; CHECK-LABEL: and_self_vec:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %and = and <4 x i32> %x, %x
  ret <4 x i32> %and
}

;
; Verify that the DAGCombiner is able to fold a vector AND into a blend
; if one of the operands to the AND is a vector of all constants, and each
; constant element is either zero or all-ones.
;

define <4 x i32> @test1(<4 x i32> %A) {
; SSE-LABEL: test1:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    retq
  %1 = and <4 x i32> %A, <i32 -1, i32 0, i32 0, i32 0>
  ret <4 x i32> %1
}

define <4 x i32> @test2(<4 x i32> %A) {
; SSE-LABEL: test2:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX-NEXT:    retq
  %1 = and <4 x i32> %A, <i32 0, i32 -1, i32 0, i32 0>
  ret <4 x i32> %1
}

define <4 x i32> @test3(<4 x i32> %A) {
; SSE-LABEL: test3:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
; SSE-NEXT:    retq
;
; AVX-LABEL: test3:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
; AVX-NEXT:    retq
  %1 = and <4 x i32> %A, <i32 0, i32 0, i32 -1, i32 0>
  ret <4 x i32> %1
}

define <4 x i32> @test4(<4 x i32> %A) {
; SSE-LABEL: test4:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
; SSE-NEXT:    retq
;
; AVX-LABEL: test4:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
; AVX-NEXT:    retq
  %1 = and <4 x i32> %A, <i32 0, i32 0, i32 0, i32 -1>
  ret <4 x i32> %1
}

define <4 x i32> @test5(<4 x i32> %A) {
; SSE-LABEL: test5:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; SSE-NEXT:    retq
;
; AVX-LABEL: test5:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX-NEXT:    retq
  %1 = and <4 x i32> %A, <i32 -1, i32 0, i32 -1, i32 0>
  ret <4 x i32> %1
}

define <4 x i32> @test6(<4 x i32> %A) {
; SSE-LABEL: test6:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; SSE-NEXT:    retq
;
; AVX-LABEL: test6:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; AVX-NEXT:    retq
  %1 = and <4 x i32> %A, <i32 0, i32 -1, i32 0, i32 -1>
  ret <4 x i32> %1
}

define <4 x i32> @test7(<4 x i32> %A) {
; SSE-LABEL: test7:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: test7:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX-NEXT:    retq
  %1 = and <4 x i32> %A, <i32 0, i32 0, i32 -1, i32 -1>
  ret <4 x i32> %1
}

define <4 x i32> @test8(<4 x i32> %A) {
; SSE-LABEL: test8:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
; SSE-NEXT:    retq
;
; AVX-LABEL: test8:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
; AVX-NEXT:    retq
  %1 = and <4 x i32> %A, <i32 -1, i32 0, i32 0, i32 -1>
  ret <4 x i32> %1
}

define <4 x i32> @test9(<4 x i32> %A) {
; SSE-LABEL: test9:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
; SSE-NEXT:    retq
;
; AVX-LABEL: test9:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX-NEXT:    retq
  %1 = and <4 x i32> %A, <i32 -1, i32 -1, i32 0, i32 0>
  ret <4 x i32> %1
}

define <4 x i32> @test10(<4 x i32> %A) {
; SSE-LABEL: test10:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
; SSE-NEXT:    retq
;
; AVX-LABEL: test10:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
; AVX-NEXT:    retq
  %1 = and <4 x i32> %A, <i32 0, i32 -1, i32 -1, i32 0>
  ret <4 x i32> %1
}

define <4 x i32> @test11(<4 x i32> %A) {
; SSE-LABEL: test11:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: test11:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT:    retq
  %1 = and <4 x i32> %A, <i32 0, i32 -1, i32 -1, i32 -1>
  ret <4 x i32> %1
}

define <4 x i32> @test12(<4 x i32> %A) {
; SSE-LABEL: test12:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; SSE-NEXT:    retq
;
; AVX-LABEL: test12:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX-NEXT:    retq
  %1 = and <4 x i32> %A, <i32 -1, i32 -1, i32 -1, i32 0>
  ret <4 x i32> %1
}

define <4 x i32> @test13(<4 x i32> %A) {
; SSE-LABEL: test13:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
; SSE-NEXT:    retq
;
; AVX-LABEL: test13:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
; AVX-NEXT:    retq
  %1 = and <4 x i32> %A, <i32 -1, i32 -1, i32 0, i32 -1>
  ret <4 x i32> %1
}

define <4 x i32> @test14(<4 x i32> %A) {
; SSE-LABEL: test14:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: test14:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX-NEXT:    retq
  %1 = and <4 x i32> %A, <i32 -1, i32 0, i32 -1, i32 -1>
  ret <4 x i32> %1
}

; X & undef must fold to 0. So lane 0 must choose from the zero vector.

define <4 x i32> @undef_lane(<4 x i32> %x) {
; SSE-LABEL: undef_lane:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; SSE-NEXT:    retq
;
; AVX-LABEL: undef_lane:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; AVX-NEXT:    retq
  %r = and <4 x i32> %x, <i32 undef, i32 4294967295, i32 0, i32 4294967295>
  ret <4 x i32> %r
}
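;
; Verify that an OR of two ANDs whose constant masks are complementary
; (each lane is kept from exactly one operand) is folded to a single blend.
;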
define <4 x i32> @test15(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: test15:
; SSE:       # %bb.0:
; SSE-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: test15:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX-NEXT:    retq
  %1 = and <4 x i32> %A, <i32 -1, i32 0, i32 -1, i32 -1>
  %2 = and <4 x i32> %B, <i32 0, i32 -1, i32 0, i32 0>
  %3 = or <4 x i32> %1, %2
  ret <4 x i32> %3
}

define <4 x i32> @test16(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: test16:
; SSE:       # %bb.0:
; SSE-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; SSE-NEXT:    retq
;
; AVX-LABEL: test16:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX-NEXT:    retq
  %1 = and <4 x i32> %A, <i32 -1, i32 0, i32 -1, i32 0>
  %2 = and <4 x i32> %B, <i32 0, i32 -1, i32 0, i32 -1>
  %3 = or <4 x i32> %1, %2
  ret <4 x i32> %3
}

define <4 x i32> @test17(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: test17:
; SSE:       # %bb.0:
; SSE-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; SSE-NEXT:    retq
;
; AVX-LABEL: test17:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; AVX-NEXT:    retq
  %1 = and <4 x i32> %A, <i32 0, i32 -1, i32 0, i32 -1>
  %2 = and <4 x i32> %B, <i32 -1, i32 0, i32 -1, i32 0>
  %3 = or <4 x i32> %1, %2
  ret <4 x i32> %3
}

;
; fold (and (or x, C), D) -> D if (C & D) == D
;

define <2 x i64> @and_or_v2i64(<2 x i64> %a0) {
; SSE-LABEL: and_or_v2i64:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,8]
; SSE-NEXT:    retq
;
; AVX-LABEL: and_or_v2i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = [8,8]
; AVX-NEXT:    # xmm0 = mem[0,0]
; AVX-NEXT:    retq
  %1 = or <2 x i64> %a0, <i64 255, i64 255>
  %2 = and <2 x i64> %1, <i64 8, i64 8>
  ret <2 x i64> %2
}

define <4 x i32> @and_or_v4i32(<4 x i32> %a0) {
; SSE-LABEL: and_or_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [3,3,3,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: and_or_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastss {{.*#+}} xmm0 = [3,3,3,3]
; AVX-NEXT:    retq
  %1 = or <4 x i32> %a0, <i32 15, i32 15, i32 15, i32 15>
  %2 = and <4 x i32> %1, <i32 3, i32 3, i32 3, i32 3>
  ret <4 x i32> %2
}

define <8 x i16> @and_or_v8i16(<8 x i16> %a0) {
; SSE-LABEL: and_or_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [15,7,3,1,14,10,2,32767]
; SSE-NEXT:    retq
;
; AVX-LABEL: and_or_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [15,7,3,1,14,10,2,32767]
; AVX-NEXT:    retq
  %1 = or <8 x i16> %a0, <i16 255, i16 127, i16 63, i16 31, i16 15, i16 31, i16 63, i16 -1>
  %2 = and <8 x i16> %1, <i16 15, i16 7, i16 3, i16 1, i16 14, i16 10, i16 2, i16 32767>
  ret <8 x i16> %2
}

;
; Check we merge and(ext(and(x,c1)),c2) before an and gets folded to a shuffle clear mask
;

define <8 x i32> @clear_sext_and(<8 x i16> %x) {
; SSE-LABEL: clear_sext_and:
; SSE:       # %bb.0:
; SSE-NEXT:    pmovsxwd %xmm0, %xmm2
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT:    pmovsxwd %xmm0, %xmm1
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    movdqa %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: clear_sext_and:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: clear_sext_and:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: clear_sext_and:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512-NEXT:    retq
  %1 = and <8 x i16> %x, <i16 -1, i16 3, i16 7, i16 15, i16 31, i16 63, i16 127, i16 -1>
  %2 = sext <8 x i16> %1 to <8 x i32>
  %3 = and <8 x i32> %2, <i32 -1, i32 0, i32 -1, i32 0, i32 0, i32 -1, i32 -1, i32 -1>
  ret <8 x i32> %3
}

define <8 x i32> @clear_zext_and(<8 x i16> %x) {
; SSE-LABEL: clear_zext_and:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: clear_zext_and:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4,4,5,5,6,6,7,7]
; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: clear_zext_and:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: clear_zext_and:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512-NEXT:    retq
  %1 = and <8 x i16> %x, <i16 -1, i16 3, i16 7, i16 15, i16 31, i16 63, i16 127, i16 -1>
  %2 = zext <8 x i16> %1 to <8 x i32>
  %3 = and <8 x i32> %2, <i32 -1, i32 0, i32 -1, i32 0, i32 0, i32 -1, i32 -1, i32 -1>
  ret <8 x i32> %3
}
;
; known bits folding
;

define <2 x i64> @and_or_zext_v2i32(<2 x i32> %a0) {
; SSE-LABEL: and_or_zext_v2i32:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: and_or_zext_v2i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = zext <2 x i32> %a0 to <2 x i64>
  %2 = or <2 x i64> %1, <i64 1, i64 1>
  %3 = and <2 x i64> %2, <i64 4294967296, i64 4294967296>
  ret <2 x i64> %3
}

define <4 x i32> @and_or_zext_v4i16(<4 x i16> %a0) {
; SSE-LABEL: and_or_zext_v4i16:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: and_or_zext_v4i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = zext <4 x i16> %a0 to <4 x i32>
  %2 = or <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
  %3 = and <4 x i32> %2, <i32 65536, i32 65536, i32 65536, i32 65536>
  ret <4 x i32> %3
}

;
; known sign bits folding
;

define <8 x i16> @ashr_mask1_v8i16(<8 x i16> %a0) {
; SSE-LABEL: ashr_mask1_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    psrlw $15, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: ashr_mask1_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlw $15, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr <8 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %2 = and <8 x i16> %1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  ret <8 x i16> %2
}

define <4 x i32> @ashr_mask7_v4i32(<4 x i32> %a0) {
; SSE-LABEL: ashr_mask7_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    psrad $31, %xmm0
; SSE-NEXT:    psrld $29, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: ashr_mask7_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrad $31, %xmm0, %xmm0
; AVX-NEXT:    vpsrld $29, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr <4 x i32> %a0, <i32 31, i32 31, i32 31, i32 31>
  %2 = and <4 x i32> %1, <i32 7, i32 7, i32 7, i32 7>
  ret <4 x i32> %2
}

;
; SimplifyDemandedBits
;

; PR34620 - redundant PAND after vector shift of a byte vector (PSRLW)
define <16 x i8> @PR34620(<16 x i8> %a0, <16 x i8> %a1) {
; SSE-LABEL: PR34620:
; SSE:       # %bb.0:
; SSE-NEXT:    psrlw $1, %xmm0
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    paddb %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: PR34620:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: PR34620:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlw $1, %xmm0, %xmm0
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: PR34620:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlw $1, %xmm0, %xmm0
; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = lshr <16 x i8> %a0, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %2 = and <16 x i8> %1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %3 = add <16 x i8> %2, %a1
  ret <16 x i8> %3
}
;
; Simplify and with a broadcasted negated scalar
;

define <8 x i64> @neg_scalar_broadcast_v8i64_arg(i64 %a0, <8 x i64> %a1) {
; SSE-LABEL: neg_scalar_broadcast_v8i64_arg:
; SSE:       # %bb.0:
; SSE-NEXT:    notq %rdi
; SSE-NEXT:    movq %rdi, %xmm4
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
; SSE-NEXT:    pand %xmm4, %xmm0
; SSE-NEXT:    pand %xmm4, %xmm1
; SSE-NEXT:    pand %xmm4, %xmm2
; SSE-NEXT:    pand %xmm4, %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: neg_scalar_broadcast_v8i64_arg:
; AVX1:       # %bb.0:
; AVX1-NEXT:    notq %rdi
; AVX1-NEXT:    vmovq %rdi, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
; AVX1-NEXT:    vandps %ymm0, %ymm2, %ymm0
; AVX1-NEXT:    vandps %ymm1, %ymm2, %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: neg_scalar_broadcast_v8i64_arg:
; AVX2:       # %bb.0:
; AVX2-NEXT:    notq %rdi
; AVX2-NEXT:    vmovq %rdi, %xmm2
; AVX2-NEXT:    vpbroadcastq %xmm2, %ymm2
; AVX2-NEXT:    vpand %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vpand %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: neg_scalar_broadcast_v8i64_arg:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpbroadcastq %rdi, %zmm1
; AVX512-NEXT:    vpandnq %zmm0, %zmm1, %zmm0
; AVX512-NEXT:    retq
  %1 = xor i64 %a0, -1
  %2 = insertelement <8 x i64> undef, i64 %1, i64 0
  %3 = shufflevector <8 x i64> %2, <8 x i64> poison, <8 x i32> zeroinitializer
  %4 = and <8 x i64> %3, %a1
  ret <8 x i64> %4
}

define <8 x i64> @neg_scalar_broadcast_v8i64(i64 %a0, <2 x i64> %a1) {
; SSE-LABEL: neg_scalar_broadcast_v8i64:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    notq %rdi
; SSE-NEXT:    movq %rdi, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[0,1,0,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,1,0,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE-NEXT:    pand %xmm4, %xmm0
; SSE-NEXT:    pand %xmm4, %xmm1
; SSE-NEXT:    pand %xmm4, %xmm2
; SSE-NEXT:    pand %xmm4, %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: neg_scalar_broadcast_v8i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[0,1,0,1]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,3]
; AVX1-NEXT:    vmovq %rdi, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
; AVX1-NEXT:    vandnpd %ymm0, %ymm2, %ymm0
; AVX1-NEXT:    vandnpd %ymm1, %ymm2, %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: neg_scalar_broadcast_v8i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,1,0,0]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,0,1,1]
; AVX2-NEXT:    vmovq %rdi, %xmm2
; AVX2-NEXT:    vpbroadcastq %xmm2, %ymm2
; AVX2-NEXT:    vpandn %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vpandn %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: neg_scalar_broadcast_v8i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512-NEXT:    vpbroadcastq %rdi, %zmm1
; AVX512-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [1,0,1,1,0,1,0,0]
; AVX512-NEXT:    vpermq %zmm0, %zmm2, %zmm0
; AVX512-NEXT:    vpandnq %zmm0, %zmm1, %zmm0
; AVX512-NEXT:    retq
  %1 = xor i64 %a0, -1
  %2 = insertelement <8 x i64> undef, i64 %1, i64 0
  %3 = shufflevector <8 x i64> %2, <8 x i64> poison, <8 x i32> zeroinitializer
  %4 = shufflevector <2 x i64> %a1, <2 x i64> poison, <8 x i32> <i32 1, i32 0, i32 1, i32 1, i32 0, i32 1, i32 0, i32 0>
  %5 = and <8 x i64> %4, %3
  ret <8 x i64> %5
}

define <4 x i64> @neg_scalar_broadcast_v4i64_arg(i64 %a0, <4 x i64> %a1) {
; SSE-LABEL: neg_scalar_broadcast_v4i64_arg:
; SSE:       # %bb.0:
; SSE-NEXT:    notq %rdi
; SSE-NEXT:    movq %rdi, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    pand %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: neg_scalar_broadcast_v4i64_arg:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovq %rdi, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
; AVX1-NEXT:    vandnps %ymm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: neg_scalar_broadcast_v4i64_arg:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovq %rdi, %xmm1
; AVX2-NEXT:    vpbroadcastq %xmm1, %ymm1
; AVX2-NEXT:    vpandn %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: neg_scalar_broadcast_v4i64_arg:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpbroadcastq %rdi, %ymm1
; AVX512-NEXT:    vpandn %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    retq
  %1 = xor i64 %a0, -1
  %2 = insertelement <4 x i64> undef, i64 %1, i64 0
  %3 = shufflevector <4 x i64> %2, <4 x i64> poison, <4 x i32> zeroinitializer
  %4 = and <4 x i64> %3, %a1
  ret <4 x i64> %4
}

define <4 x i64> @neg_scalar_broadcast_v4i64(i64 %a0, <2 x i64> %a1) {
; SSE-LABEL: neg_scalar_broadcast_v4i64:
; SSE:       # %bb.0:
; SSE-NEXT:    notq %rdi
; SSE-NEXT:    movq %rdi, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,1,0,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    pand %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: neg_scalar_broadcast_v4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX1-NEXT:    vmovq %rdi, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,3]
; AVX1-NEXT:    vandnpd %ymm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: neg_scalar_broadcast_v4i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX2-NEXT:    vmovq %rdi, %xmm1
; AVX2-NEXT:    vpbroadcastq %xmm1, %ymm1
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,0,1,1]
; AVX2-NEXT:    vpandn %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: neg_scalar_broadcast_v4i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512-NEXT:    vpbroadcastq %rdi, %ymm1
; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,0,1,1]
; AVX512-NEXT:    vpandn %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    retq
  %1 = xor i64 %a0, -1
  %2 = insertelement <4 x i64> undef, i64 %1, i64 0
  %3 = shufflevector <4 x i64> %2, <4 x i64> poison, <4 x i32> zeroinitializer
  %4 = shufflevector <2 x i64> %a1, <2 x i64> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 1>
  %5 = and <4 x i64> %4, %3
  ret <4 x i64> %5
}
define <2 x i64> @neg_scalar_broadcast_v2i64(i64 %a0, <2 x i64> %a1) {
; SSE-LABEL: neg_scalar_broadcast_v2i64:
; SSE:       # %bb.0:
; SSE-NEXT:    movq %rdi, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE-NEXT:    pandn %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: neg_scalar_broadcast_v2i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovq %rdi, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-NEXT:    vpandn %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: neg_scalar_broadcast_v2i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovq %rdi, %xmm1
; AVX2-NEXT:    vpbroadcastq %xmm1, %xmm1
; AVX2-NEXT:    vpandn %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: neg_scalar_broadcast_v2i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpbroadcastq %rdi, %xmm1
; AVX512-NEXT:    vpandn %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    retq
  %1 = xor i64 %a0, -1
  %2 = insertelement <2 x i64> undef, i64 %1, i64 0
  %3 = shufflevector <2 x i64> %2, <2 x i64> poison, <2 x i32> zeroinitializer
  %4 = and <2 x i64> %3, %a1
  ret <2 x i64> %4
}

define <2 x i64> @casted_neg_scalar_broadcast_v2i64(<2 x i32> %a0, <2 x i64> %a1) {
; SSE-LABEL: casted_neg_scalar_broadcast_v2i64:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT:    pandn %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: casted_neg_scalar_broadcast_v2i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT:    vandnps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: casted_neg_scalar_broadcast_v2i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX2-NEXT:    vandnps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: casted_neg_scalar_broadcast_v2i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX512-NEXT:    vandnps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = xor <2 x i32> %a0, <i32 -1, i32 -1>
  %2 = bitcast <2 x i32> %1 to i64
  %3 = insertelement <2 x i64> undef, i64 %2, i64 0
  %4 = shufflevector <2 x i64> %3, <2 x i64> poison, <2 x i32> zeroinitializer
  %5 = and <2 x i64> %4, %a1
  ret <2 x i64> %5
}

define <8 x i32> @neg_scalar_broadcast_v8i32(i32 %a0, <8 x i32> %a1) {
; SSE-LABEL: neg_scalar_broadcast_v8i32:
; SSE:       # %bb.0:
; SSE-NEXT:    notl %edi
; SSE-NEXT:    movd %edi, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    pand %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: neg_scalar_broadcast_v8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovd %edi, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
; AVX1-NEXT:    vandnps %ymm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: neg_scalar_broadcast_v8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovd %edi, %xmm1
; AVX2-NEXT:    vpbroadcastd %xmm1, %ymm1
; AVX2-NEXT:    vpandn %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: neg_scalar_broadcast_v8i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpbroadcastd %edi, %ymm1
; AVX512-NEXT:    vpandn %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    retq
  %1 = xor i32 %a0, -1
  %2 = insertelement <8 x i32> undef, i32 %1, i64 0
  %3 = shufflevector <8 x i32> %2, <8 x i32> poison, <8 x i32> zeroinitializer
  %4 = and <8 x i32> %3, %a1
  ret <8 x i32> %4
}

define <8 x i16> @neg_scalar_broadcast_v8i16(i16 %a0, <8 x i16> %a1) {
; SSE-LABEL: neg_scalar_broadcast_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movd %edi, %xmm1
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; SSE-NEXT:    pandn %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: neg_scalar_broadcast_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovd %edi, %xmm1
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX1-NEXT:    vpandn %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: neg_scalar_broadcast_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovd %edi, %xmm1
; AVX2-NEXT:    vpbroadcastw %xmm1, %xmm1
; AVX2-NEXT:    vpandn %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: neg_scalar_broadcast_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpbroadcastw %edi, %xmm1
; AVX512-NEXT:    vpandn %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    retq
  %1 = xor i16 %a0, -1
  %2 = insertelement <8 x i16> undef, i16 %1, i64 0
  %3 = shufflevector <8 x i16> %2, <8 x i16> poison, <8 x i32> zeroinitializer
  %4 = and <8 x i16> %3, %a1
  ret <8 x i16> %4
}

define <16 x i8> @neg_scalar_broadcast_v16i8(i8 %a0, <16 x i8> %a1) {
; SSE-LABEL: neg_scalar_broadcast_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    movd %edi, %xmm1
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    pshufb %xmm2, %xmm1
; SSE-NEXT:    pandn %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: neg_scalar_broadcast_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovd %edi, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpandn %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: neg_scalar_broadcast_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovd %edi, %xmm1
; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
; AVX2-NEXT:    vpandn %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: neg_scalar_broadcast_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpbroadcastb %edi, %xmm1
; AVX512-NEXT:    vpandn %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    retq
  %1 = xor i8 %a0, -1
  %2 = insertelement <16 x i8> undef, i8 %1, i64 0
  %3 = shufflevector <16 x i8> %2, <16 x i8> poison, <16 x i32> zeroinitializer
  %4 = and <16 x i8> %3, %a1
  ret <16 x i8> %4
}

define <64 x i8> @neg_scalar_broadcast_v64i8(i8 %a0, <64 x i8> %a1) {
; SSE-LABEL: neg_scalar_broadcast_v64i8:
; SSE:       # %bb.0:
; SSE-NEXT:    notb %dil
; SSE-NEXT:    movzbl %dil, %eax
; SSE-NEXT:    movd %eax, %xmm4
; SSE-NEXT:    pxor %xmm5, %xmm5
; SSE-NEXT:    pshufb %xmm5, %xmm4
; SSE-NEXT:    pand %xmm4, %xmm0
; SSE-NEXT:    pand %xmm4, %xmm1
; SSE-NEXT:    pand %xmm4, %xmm2
; SSE-NEXT:    pand %xmm4, %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: neg_scalar_broadcast_v64i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    notb %dil
; AVX1-NEXT:    vmovd %edi, %xmm2
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
; AVX1-NEXT:    vandps %ymm0, %ymm2, %ymm0
; AVX1-NEXT:    vandps %ymm1, %ymm2, %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: neg_scalar_broadcast_v64i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    notb %dil
; AVX2-NEXT:    vmovd %edi, %xmm2
; AVX2-NEXT:    vpbroadcastb %xmm2, %ymm2
; AVX2-NEXT:    vpand %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vpand %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: neg_scalar_broadcast_v64i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpbroadcastb %edi, %zmm1
; AVX512-NEXT:    vpandnq %zmm0, %zmm1, %zmm0
; AVX512-NEXT:    retq
  %1 = xor i8 %a0, -1
  %2 = insertelement <64 x i8> undef, i8 %1, i64 0
  %3 = shufflevector <64 x i8> %2, <64 x i8> poison, <64 x i32> zeroinitializer
  %4 = and <64 x i8> %3, %a1
  ret <64 x i8> %4
}
define <8 x i64> @neg_scalar_broadcast_v64i8_v8i64(i8 %a0, <8 x i64> %a1) {
; SSE-LABEL: neg_scalar_broadcast_v64i8_v8i64:
; SSE:       # %bb.0:
; SSE-NEXT:    notb %dil
; SSE-NEXT:    movzbl %dil, %eax
; SSE-NEXT:    movd %eax, %xmm4
; SSE-NEXT:    pxor %xmm5, %xmm5
; SSE-NEXT:    pshufb %xmm5, %xmm4
; SSE-NEXT:    pand %xmm4, %xmm0
; SSE-NEXT:    pand %xmm4, %xmm1
; SSE-NEXT:    pand %xmm4, %xmm2
; SSE-NEXT:    pand %xmm4, %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: neg_scalar_broadcast_v64i8_v8i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    notb %dil
; AVX1-NEXT:    vmovd %edi, %xmm2
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
; AVX1-NEXT:    vandps %ymm0, %ymm2, %ymm0
; AVX1-NEXT:    vandps %ymm1, %ymm2, %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: neg_scalar_broadcast_v64i8_v8i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    notb %dil
; AVX2-NEXT:    vmovd %edi, %xmm2
; AVX2-NEXT:    vpbroadcastb %xmm2, %ymm2
; AVX2-NEXT:    vpand %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vpand %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: neg_scalar_broadcast_v64i8_v8i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpbroadcastb %edi, %zmm1
; AVX512-NEXT:    vpandnq %zmm0, %zmm1, %zmm0
; AVX512-NEXT:    retq
  %1 = xor i8 %a0, -1
  %2 = insertelement <64 x i8> undef, i8 %1, i64 0
  %3 = shufflevector <64 x i8> %2, <64 x i8> poison, <64 x i32> zeroinitializer
  %4 = bitcast <64 x i8> %3 to <8 x i64>
  %5 = and <8 x i64> %4, %a1
  ret <8 x i64> %5
}

define <4 x i64> @neg_scalar_broadcast_v32i8_v4i64(i8 %a0, <4 x i64> %a1) {
; SSE-LABEL: neg_scalar_broadcast_v32i8_v4i64:
; SSE:       # %bb.0:
; SSE-NEXT:    notb %dil
; SSE-NEXT:    movzbl %dil, %eax
; SSE-NEXT:    movd %eax, %xmm2
; SSE-NEXT:    pxor %xmm3, %xmm3
; SSE-NEXT:    pshufb %xmm3, %xmm2
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    pand %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: neg_scalar_broadcast_v32i8_v4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovd %edi, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
; AVX1-NEXT:    vandnps %ymm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: neg_scalar_broadcast_v32i8_v4i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovd %edi, %xmm1
; AVX2-NEXT:    vpbroadcastb %xmm1, %ymm1
; AVX2-NEXT:    vpandn %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: neg_scalar_broadcast_v32i8_v4i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpbroadcastb %edi, %ymm1
; AVX512-NEXT:    vpandn %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    retq
  %1 = xor i8 %a0, -1
  %2 = insertelement <32 x i8> undef, i8 %1, i64 0
  %3 = shufflevector <32 x i8> %2, <32 x i8> poison, <32 x i32> zeroinitializer
  %4 = bitcast <32 x i8> %3 to <4 x i64>
  %5 = and <4 x i64> %4, %a1
  ret <4 x i64> %5
}

define <2 x i64> @neg_scalar_broadcast_v16i8_v2i64(i8 %a0, <2 x i64> %a1) {
; SSE-LABEL: neg_scalar_broadcast_v16i8_v2i64:
; SSE:       # %bb.0:
; SSE-NEXT:    movd %edi, %xmm1
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    pshufb %xmm2, %xmm1
; SSE-NEXT:    pandn %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: neg_scalar_broadcast_v16i8_v2i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovd %edi, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpandn %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: neg_scalar_broadcast_v16i8_v2i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovd %edi, %xmm1
; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
; AVX2-NEXT:    vpandn %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: neg_scalar_broadcast_v16i8_v2i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpbroadcastb %edi, %xmm1
; AVX512-NEXT:    vpandn %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    retq
  %1 = xor i8 %a0, -1
  %2 = insertelement <16 x i8> undef, i8 %1, i64 0
  %3 = shufflevector <16 x i8> %2, <16 x i8> poison, <16 x i32> zeroinitializer
  %4 = bitcast <16 x i8> %3 to <2 x i64>
  %5 = and <2 x i64> %4, %a1
  ret <2 x i64> %5
}

define <4 x i64> @neg_scalar_broadcast_v8i32_v4i64(i32 %a0, <4 x i64> %a1) {
; SSE-LABEL: neg_scalar_broadcast_v8i32_v4i64:
; SSE:       # %bb.0:
; SSE-NEXT:    notl %edi
; SSE-NEXT:    movd %edi, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    pand %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: neg_scalar_broadcast_v8i32_v4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovd %edi, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
; AVX1-NEXT:    vandnps %ymm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: neg_scalar_broadcast_v8i32_v4i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovd %edi, %xmm1
; AVX2-NEXT:    vpbroadcastd %xmm1, %ymm1
; AVX2-NEXT:    vpandn %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: neg_scalar_broadcast_v8i32_v4i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpbroadcastd %edi, %ymm1
; AVX512-NEXT:    vpandn %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    retq
  %1 = xor i32 %a0, -1
  %2 = insertelement <8 x i32> undef, i32 %1, i64 0
  %3 = shufflevector <8 x i32> %2, <8 x i32> poison, <8 x i32> zeroinitializer
  %4 = bitcast <8 x i32> %3 to <4 x i64>
  %5 = and <4 x i64> %4, %a1
  ret <4 x i64> %5
}

define <4 x i32> @neg_scalar_broadcast_two_uses(i32 %a0, <4 x i32> %a1, ptr %a2) {
; SSE-LABEL: neg_scalar_broadcast_two_uses:
; SSE:       # %bb.0:
; SSE-NEXT:    notl %edi
; SSE-NEXT:    movd %edi, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; SSE-NEXT:    movdqa %xmm1, (%rsi)
; SSE-NEXT:    pand %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: neg_scalar_broadcast_two_uses:
; AVX1:       # %bb.0:
; AVX1-NEXT:    notl %edi
; AVX1-NEXT:    vmovd %edi, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX1-NEXT:    vmovdqa %xmm1, (%rsi)
; AVX1-NEXT:    vpand %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: neg_scalar_broadcast_two_uses:
; AVX2:       # %bb.0:
; AVX2-NEXT:    notl %edi
; AVX2-NEXT:    vmovd %edi, %xmm1
; AVX2-NEXT:    vpbroadcastd %xmm1, %xmm1
; AVX2-NEXT:    vmovdqa %xmm1, (%rsi)
; AVX2-NEXT:    vpand %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: neg_scalar_broadcast_two_uses:
; AVX512:       # %bb.0:
; AVX512-NEXT:    notl %edi
; AVX512-NEXT:    vpbroadcastd %edi, %xmm1
; AVX512-NEXT:    vmovdqa %xmm1, (%rsi)
; AVX512-NEXT:    vpand %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    retq
  %1 = xor i32 %a0, -1
  %2 = insertelement <4 x i32> undef, i32 %1, i64 0
  %3 = shufflevector <4 x i32> %2, <4 x i32> poison, <4 x i32> zeroinitializer
  store <4 x i32> %3, ptr %a2, align 16
  %4 = and <4 x i32> %3, %a1
  ret <4 x i32> %4
}

; PR84660 - check for illegal types
define <2 x i128> @neg_scalar_broadcast_illegaltype(i128 %arg) {
; CHECK-LABEL: neg_scalar_broadcast_illegaltype:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movq %rdi, %rax
; CHECK-NEXT:    notl %esi
; CHECK-NEXT:    andl $1, %esi
; CHECK-NEXT:    movq %rsi, 16(%rdi)
; CHECK-NEXT:    movq %rsi, (%rdi)
; CHECK-NEXT:    movq $0, 24(%rdi)
; CHECK-NEXT:    movq $0, 8(%rdi)
; CHECK-NEXT:    retq
  %i = xor i128 %arg, 1
  %i1 = insertelement <2 x i128> zeroinitializer, i128 %i, i64 0
  %i2 = shufflevector <2 x i128> %i1, <2 x i128> zeroinitializer, <2 x i32> zeroinitializer
  %i3 = and <2 x i128> <i128 1, i128 1>, %i2
  ret <2 x i128> %i3
}
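;
; Fold and(x, not(x)) -> 0 for both scalars and vectors.
;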
define <2 x i64> @andnp_xx(<2 x i64> %v0) nounwind {
; SSE-LABEL: andnp_xx:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: andnp_xx:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %x = xor <2 x i64> %v0, <i64 -1, i64 -1>
  %y = and <2 x i64> %v0, %x
  ret <2 x i64> %y
}

define <2 x i64> @andnp_xx_2(<2 x i64> %v0) nounwind {
; SSE-LABEL: andnp_xx_2:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: andnp_xx_2:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %x = xor <2 x i64> %v0, <i64 -1, i64 -1>
  %y = and <2 x i64> %x, %v0
  ret <2 x i64> %y
}

define i64 @andn_xx(i64 %v0) nounwind {
; CHECK-LABEL: andn_xx:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    retq
  %x = xor i64 %v0, -1
  %y = and i64 %v0, %x
  ret i64 %y
}

define i64 @andn_xx_2(i64 %v0) nounwind {
; CHECK-LABEL: andn_xx_2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    retq
  %x = xor i64 %v0, -1
  %y = and i64 %x, %v0
  ret i64 %y
}