; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx     | FileCheck %s --check-prefixes=ANY,AVX1
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2    | FileCheck %s --check-prefixes=ANY,INT256
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f | FileCheck %s --check-prefixes=ANY,INT256

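; Overview: the tests below check that 256-bit bitwise logic (and/or/xor/andn) on
; <4 x double> and <8 x float> selects the FP-domain vandpd/vandps/vxorpd/vxorps/
; vorpd/vorps/vandnpd/vandnps forms, that basic <2 x i64> logic stays in the
; integer domain (vpand/vpandn), and that the PR37749 cases exercise how 256-bit
; integer logic mixed with adds is split on AVX1 versus kept as single ymm ops on
; AVX2/AVX512 (the INT256 prefix).
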
define <4 x double> @andpd256(<4 x double> %y, <4 x double> %x) nounwind uwtable readnone ssp {
; ANY-LABEL: andpd256:
; ANY:       # %bb.0: # %entry
; ANY-NEXT:    vandpd %ymm0, %ymm1, %ymm0
; ANY-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; ANY-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; ANY-NEXT:    retq
entry:
  %0 = bitcast <4 x double> %x to <4 x i64>
  %1 = bitcast <4 x double> %y to <4 x i64>
  %and.i = and <4 x i64> %0, %1
  %2 = bitcast <4 x i64> %and.i to <4 x double>
  ; add forces execution domain
  %3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
  ret <4 x double> %3
}

define <4 x double> @andpd256fold(<4 x double> %y) nounwind uwtable readnone ssp {
; ANY-LABEL: andpd256fold:
; ANY:       # %bb.0: # %entry
; ANY-NEXT:    vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; ANY-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; ANY-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; ANY-NEXT:    retq
entry:
  %0 = bitcast <4 x double> %y to <4 x i64>
  %and.i = and <4 x i64> %0, <i64 4616752568008179712, i64 4614838538166547251, i64 4612361558371493478, i64 4608083138725491507>
  %1 = bitcast <4 x i64> %and.i to <4 x double>
  ; add forces execution domain
  %2 = fadd <4 x double> %1, <double 0x0, double 0x0, double 0x0, double 0x0>
  ret <4 x double> %2
}

define <8 x float> @andps256(<8 x float> %y, <8 x float> %x) nounwind uwtable readnone ssp {
; ANY-LABEL: andps256:
; ANY:       # %bb.0: # %entry
; ANY-NEXT:    vandps %ymm0, %ymm1, %ymm0
; ANY-NEXT:    retq
entry:
  %0 = bitcast <8 x float> %x to <8 x i32>
  %1 = bitcast <8 x float> %y to <8 x i32>
  %and.i = and <8 x i32> %0, %1
  %2 = bitcast <8 x i32> %and.i to <8 x float>
  ret <8 x float> %2
}

define <8 x float> @andps256fold(<8 x float> %y) nounwind uwtable readnone ssp {
; ANY-LABEL: andps256fold:
; ANY:       # %bb.0: # %entry
; ANY-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; ANY-NEXT:    retq
entry:
  %0 = bitcast <8 x float> %y to <8 x i32>
  %and.i = and <8 x i32> %0, <i32 1083179008, i32 1079613850, i32 1075000115, i32 1067030938, i32 1083179008, i32 1079613850, i32 1075000115, i32 1067030938>
  %1 = bitcast <8 x i32> %and.i to <8 x float>
  ret <8 x float> %1
}

define <4 x double> @xorpd256(<4 x double> %y, <4 x double> %x) nounwind uwtable readnone ssp {
; ANY-LABEL: xorpd256:
; ANY:       # %bb.0: # %entry
; ANY-NEXT:    vxorpd %ymm0, %ymm1, %ymm0
; ANY-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; ANY-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; ANY-NEXT:    retq
entry:
  %0 = bitcast <4 x double> %x to <4 x i64>
  %1 = bitcast <4 x double> %y to <4 x i64>
  %xor.i = xor <4 x i64> %0, %1
  %2 = bitcast <4 x i64> %xor.i to <4 x double>
  ; add forces execution domain
  %3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
  ret <4 x double> %3
}

define <4 x double> @xorpd256fold(<4 x double> %y) nounwind uwtable readnone ssp {
; ANY-LABEL: xorpd256fold:
; ANY:       # %bb.0: # %entry
; ANY-NEXT:    vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; ANY-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; ANY-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; ANY-NEXT:    retq
entry:
  %0 = bitcast <4 x double> %y to <4 x i64>
  %xor.i = xor <4 x i64> %0, <i64 4616752568008179712, i64 4614838538166547251, i64 4612361558371493478, i64 4608083138725491507>
  %1 = bitcast <4 x i64> %xor.i to <4 x double>
  ; add forces execution domain
  %2 = fadd <4 x double> %1, <double 0x0, double 0x0, double 0x0, double 0x0>
  ret <4 x double> %2
}

define <8 x float> @xorps256(<8 x float> %y, <8 x float> %x) nounwind uwtable readnone ssp {
; ANY-LABEL: xorps256:
; ANY:       # %bb.0: # %entry
; ANY-NEXT:    vxorps %ymm0, %ymm1, %ymm0
; ANY-NEXT:    retq
entry:
  %0 = bitcast <8 x float> %x to <8 x i32>
  %1 = bitcast <8 x float> %y to <8 x i32>
  %xor.i = xor <8 x i32> %0, %1
  %2 = bitcast <8 x i32> %xor.i to <8 x float>
  ret <8 x float> %2
}

define <8 x float> @xorps256fold(<8 x float> %y) nounwind uwtable readnone ssp {
; ANY-LABEL: xorps256fold:
; ANY:       # %bb.0: # %entry
; ANY-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; ANY-NEXT:    retq
entry:
  %0 = bitcast <8 x float> %y to <8 x i32>
  %xor.i = xor <8 x i32> %0, <i32 1083179008, i32 1079613850, i32 1075000115, i32 1067030938, i32 1083179008, i32 1079613850, i32 1075000115, i32 1067030938>
  %1 = bitcast <8 x i32> %xor.i to <8 x float>
  ret <8 x float> %1
}

define <4 x double> @orpd256(<4 x double> %y, <4 x double> %x) nounwind uwtable readnone ssp {
; ANY-LABEL: orpd256:
; ANY:       # %bb.0: # %entry
; ANY-NEXT:    vorpd %ymm0, %ymm1, %ymm0
; ANY-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; ANY-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; ANY-NEXT:    retq
entry:
  %0 = bitcast <4 x double> %x to <4 x i64>
  %1 = bitcast <4 x double> %y to <4 x i64>
  %or.i = or <4 x i64> %0, %1
  %2 = bitcast <4 x i64> %or.i to <4 x double>
  ; add forces execution domain
  %3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
  ret <4 x double> %3
}

define <4 x double> @orpd256fold(<4 x double> %y) nounwind uwtable readnone ssp {
; ANY-LABEL: orpd256fold:
; ANY:       # %bb.0: # %entry
; ANY-NEXT:    vorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; ANY-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; ANY-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; ANY-NEXT:    retq
entry:
  %0 = bitcast <4 x double> %y to <4 x i64>
  %or.i = or <4 x i64> %0, <i64 4616752568008179712, i64 4614838538166547251, i64 4612361558371493478, i64 4608083138725491507>
  %1 = bitcast <4 x i64> %or.i to <4 x double>
  ; add forces execution domain
  %2 = fadd <4 x double> %1, <double 0x0, double 0x0, double 0x0, double 0x0>
  ret <4 x double> %2
}

define <8 x float> @orps256(<8 x float> %y, <8 x float> %x) nounwind uwtable readnone ssp {
; ANY-LABEL: orps256:
; ANY:       # %bb.0: # %entry
; ANY-NEXT:    vorps %ymm0, %ymm1, %ymm0
; ANY-NEXT:    retq
entry:
  %0 = bitcast <8 x float> %x to <8 x i32>
  %1 = bitcast <8 x float> %y to <8 x i32>
  %or.i = or <8 x i32> %0, %1
  %2 = bitcast <8 x i32> %or.i to <8 x float>
  ret <8 x float> %2
}

define <8 x float> @orps256fold(<8 x float> %y) nounwind uwtable readnone ssp {
; ANY-LABEL: orps256fold:
; ANY:       # %bb.0: # %entry
; ANY-NEXT:    vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; ANY-NEXT:    retq
entry:
  %0 = bitcast <8 x float> %y to <8 x i32>
  %or.i = or <8 x i32> %0, <i32 1083179008, i32 1079613850, i32 1075000115, i32 1067030938, i32 1083179008, i32 1079613850, i32 1075000115, i32 1067030938>
  %1 = bitcast <8 x i32> %or.i to <8 x float>
  ret <8 x float> %1
}

define <4 x double> @andnotpd256(<4 x double> %y, <4 x double> %x) nounwind uwtable readnone ssp {
; ANY-LABEL: andnotpd256:
; ANY:       # %bb.0: # %entry
; ANY-NEXT:    vandnpd %ymm0, %ymm1, %ymm0
; ANY-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; ANY-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; ANY-NEXT:    retq
entry:
  %0 = bitcast <4 x double> %x to <4 x i64>
  %neg.i = xor <4 x i64> %0, <i64 -1, i64 -1, i64 -1, i64 -1>
  %1 = bitcast <4 x double> %y to <4 x i64>
  %and.i = and <4 x i64> %1, %neg.i
  %2 = bitcast <4 x i64> %and.i to <4 x double>
  ; add forces execution domain
  %3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
  ret <4 x double> %3
}

define <4 x double> @andnotpd256fold(<4 x double> %y, ptr nocapture %x) nounwind uwtable readonly ssp {
; ANY-LABEL: andnotpd256fold:
; ANY:       # %bb.0: # %entry
; ANY-NEXT:    vandnpd (%rdi), %ymm0, %ymm0
; ANY-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; ANY-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; ANY-NEXT:    retq
entry:
  %tmp2 = load <4 x double>, ptr %x, align 32
  %0 = bitcast <4 x double> %y to <4 x i64>
  %neg.i = xor <4 x i64> %0, <i64 -1, i64 -1, i64 -1, i64 -1>
  %1 = bitcast <4 x double> %tmp2 to <4 x i64>
  %and.i = and <4 x i64> %1, %neg.i
  %2 = bitcast <4 x i64> %and.i to <4 x double>
  ; add forces execution domain
  %3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
  ret <4 x double> %3
}

define <8 x float> @andnotps256(<8 x float> %y, <8 x float> %x) nounwind uwtable readnone ssp {
; ANY-LABEL: andnotps256:
; ANY:       # %bb.0: # %entry
; ANY-NEXT:    vandnps %ymm0, %ymm1, %ymm0
; ANY-NEXT:    retq
entry:
  %0 = bitcast <8 x float> %x to <8 x i32>
  %neg.i = xor <8 x i32> %0, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %1 = bitcast <8 x float> %y to <8 x i32>
  %and.i = and <8 x i32> %1, %neg.i
  %2 = bitcast <8 x i32> %and.i to <8 x float>
  ret <8 x float> %2
}

define <8 x float> @andnotps256fold(<8 x float> %y, ptr nocapture %x) nounwind uwtable readonly ssp {
; ANY-LABEL: andnotps256fold:
; ANY:       # %bb.0: # %entry
; ANY-NEXT:    vandnps (%rdi), %ymm0, %ymm0
; ANY-NEXT:    retq
entry:
  %tmp2 = load <8 x float>, ptr %x, align 32
  %0 = bitcast <8 x float> %y to <8 x i32>
  %neg.i = xor <8 x i32> %0, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %1 = bitcast <8 x float> %tmp2 to <8 x i32>
  %and.i = and <8 x i32> %1, %neg.i
  %2 = bitcast <8 x i32> %and.i to <8 x float>
  ret <8 x float> %2
}

;;; Test that basic 2 x i64 logic uses the integer version on AVX

define <2 x i64> @vpandn(<2 x i64> %a, <2 x i64> %b) nounwind uwtable readnone ssp {
  ; Force the execution domain with an add.
; ANY-LABEL: vpandn:
; ANY:       # %bb.0:
; ANY-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; ANY-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
; ANY-NEXT:    vpandn %xmm0, %xmm1, %xmm0
; ANY-NEXT:    retq
  %a2 = add <2 x i64> %a, <i64 1, i64 1>
  %y = xor <2 x i64> %a2, <i64 -1, i64 -1>
  %x = and <2 x i64> %a, %y
  ret <2 x i64> %x
}

define <2 x i64> @vpand(<2 x i64> %a, <2 x i64> %b) nounwind uwtable readnone ssp {
  ; Force the execution domain with an add.
; ANY-LABEL: vpand:
; ANY:       # %bb.0:
; ANY-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; ANY-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
; ANY-NEXT:    vpand %xmm1, %xmm0, %xmm0
; ANY-NEXT:    retq
  %a2 = add <2 x i64> %a, <i64 1, i64 1>
  %x = and <2 x i64> %a2, %b
  ret <2 x i64> %x
}

define <4 x i32> @and_xor_splat1_v4i32(<4 x i32> %x) nounwind {
; AVX1-LABEL: and_xor_splat1_v4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vandnps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; INT256-LABEL: and_xor_splat1_v4i32:
; INT256:       # %bb.0:
; INT256-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1,1,1,1]
; INT256-NEXT:    vandnps %xmm1, %xmm0, %xmm0
; INT256-NEXT:    retq
  %xor = xor <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
  %and = and <4 x i32> %xor, <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %and
}

define <4 x i64> @and_xor_splat1_v4i64(<4 x i64> %x) nounwind {
; AVX1-LABEL: and_xor_splat1_v4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vandnps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; INT256-LABEL: and_xor_splat1_v4i64:
; INT256:       # %bb.0:
; INT256-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [1,1,1,1]
; INT256-NEXT:    vandnps %ymm1, %ymm0, %ymm0
; INT256-NEXT:    retq
  %xor = xor <4 x i64> %x, <i64 1, i64 1, i64 1, i64 1>
  %and = and <4 x i64> %xor, <i64 1, i64 1, i64 1, i64 1>
  ret <4 x i64> %and
}

; PR37749 - https://bugs.llvm.org/show_bug.cgi?id=37749
; For AVX1, we don't want a 256-bit logic op that needs insert/extract to connect to the surrounding 128-bit ops.

define <8 x i32> @and_disguised_i8_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z) {
; AVX1-LABEL: and_disguised_i8_elts:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm1 = [255,255,255,255]
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; INT256-LABEL: and_disguised_i8_elts:
; INT256:       # %bb.0:
; INT256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; INT256-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; INT256-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
; INT256-NEXT:    retq
  %a = add <8 x i32> %x, %y
  %l = and <8 x i32> %a, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
  %t = add <8 x i32> %l, %z
  ret <8 x i32> %t
}

define <8 x i32> @andn_disguised_i8_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z) {
; AVX1-LABEL: andn_disguised_i8_elts:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm1 = [255,255,255,255]
; AVX1-NEXT:    vpandn %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpandn %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; INT256-LABEL: andn_disguised_i8_elts:
; INT256:       # %bb.0:
; INT256-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; INT256-NEXT:    vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; INT256-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
; INT256-NEXT:    retq
  %add = add <8 x i32> %y, %x
  %neg = and <8 x i32> %add, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
  %and = xor <8 x i32> %neg, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
  %add1 = add <8 x i32> %and, %z
  ret <8 x i32> %add1
}

; Negative test - if we don't have a leading concat_vectors, the transform won't be profitable.

define <8 x i32> @andn_variable_mask_operand_no_concat(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z) {
; AVX1-LABEL: andn_variable_mask_operand_no_concat:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vandnps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; INT256-LABEL: andn_variable_mask_operand_no_concat:
; INT256:       # %bb.0:
; INT256-NEXT:    vpandn %ymm2, %ymm0, %ymm0
; INT256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; INT256-NEXT:    retq
  %and = and <8 x i32> %x, %z
  %xor = xor <8 x i32> %and, %z ; demanded bits will make this a 'not'
  %add = add <8 x i32> %xor, %y
  ret <8 x i32> %add
}

; Negative test - if we don't have a leading concat_vectors, the transform won't be profitable (even if the mask is a constant).

define <8 x i32> @andn_constant_mask_operand_no_concat(<8 x i32> %x, <8 x i32> %y) {
; AVX1-LABEL: andn_constant_mask_operand_no_concat:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vandnps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; INT256-LABEL: andn_constant_mask_operand_no_concat:
; INT256:       # %bb.0:
; INT256-NEXT:    vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; INT256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; INT256-NEXT:    retq
  %xor = xor <8 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %and = and <8 x i32> %xor, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
  %r = add <8 x i32> %and, %y
  ret <8 x i32> %r
}

; This is a close call, but we split the 'andn' to reduce the insert/extract.

define <8 x i32> @andn_variable_mask_operand_concat(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z, <8 x i32> %w) {
; AVX1-LABEL: andn_variable_mask_operand_concat:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm1
; AVX1-NEXT:    vpandn %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm1
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpandn %xmm2, %xmm4, %xmm1
; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; INT256-LABEL: andn_variable_mask_operand_concat:
; INT256:       # %bb.0:
; INT256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; INT256-NEXT:    vpandn %ymm2, %ymm0, %ymm0
; INT256-NEXT:    vpaddd %ymm3, %ymm0, %ymm0
; INT256-NEXT:    retq
  %add = add <8 x i32> %x, %y
  %xor = xor <8 x i32> %add, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %and = and <8 x i32> %xor, %z
  %r = add <8 x i32> %and, %w
  ret <8 x i32> %r
}

define <8 x i32> @or_disguised_i8_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z) {
; AVX1-LABEL: or_disguised_i8_elts:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm1 = [255,255,255,255]
; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; INT256-LABEL: or_disguised_i8_elts:
; INT256:       # %bb.0:
; INT256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; INT256-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
; INT256-NEXT:    vpor %ymm1, %ymm0, %ymm0
; INT256-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
; INT256-NEXT:    retq
  %a = add <8 x i32> %x, %y
  %l = or <8 x i32> %a, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
  %t = add <8 x i32> %l, %z
  ret <8 x i32> %t
}

define <8 x i32> @xor_disguised_i8_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z) {
; AVX1-LABEL: xor_disguised_i8_elts:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm1 = [255,255,255,255]
; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; INT256-LABEL: xor_disguised_i8_elts:
; INT256:       # %bb.0:
; INT256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; INT256-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
; INT256-NEXT:    vpxor %ymm1, %ymm0, %ymm0
; INT256-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
; INT256-NEXT:    retq
  %a = add <8 x i32> %x, %y
  %l = xor <8 x i32> %a, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
  %t = add <8 x i32> %l, %z
  ret <8 x i32> %t
}

define <8 x i32> @and_disguised_i16_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z) {
; AVX1-LABEL: and_disguised_i16_elts:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2],xmm1[3],xmm3[4],xmm1[5],xmm3[6],xmm1[7]
; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; INT256-LABEL: and_disguised_i16_elts:
; INT256:       # %bb.0:
; INT256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; INT256-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; INT256-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; INT256-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
; INT256-NEXT:    retq
  %a = add <8 x i32> %x, %y
  %l = and <8 x i32> %a, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
  %t = add <8 x i32> %l, %z
  ret <8 x i32> %t
}

define <8 x i32> @or_disguised_i16_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z) {
; AVX1-LABEL: or_disguised_i16_elts:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm1 = [65535,65535,65535,65535]
; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; INT256-LABEL: or_disguised_i16_elts:
; INT256:       # %bb.0:
; INT256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; INT256-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535]
; INT256-NEXT:    vpor %ymm1, %ymm0, %ymm0
; INT256-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
; INT256-NEXT:    retq
  %a = add <8 x i32> %x, %y
  %l = or <8 x i32> %a, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
  %t = add <8 x i32> %l, %z
  ret <8 x i32> %t
}

define <8 x i32> @xor_disguised_i16_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z) {
; AVX1-LABEL: xor_disguised_i16_elts:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm1 = [65535,65535,65535,65535]
; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; INT256-LABEL: xor_disguised_i16_elts:
; INT256:       # %bb.0:
; INT256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; INT256-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535]
; INT256-NEXT:    vpxor %ymm1, %ymm0, %ymm0
; INT256-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
; INT256-NEXT:    retq
  %a = add <8 x i32> %x, %y
  %l = xor <8 x i32> %a, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
  %t = add <8 x i32> %l, %z
  ret <8 x i32> %t
}