; xref: /llvm-project/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll (revision c660a2f0ab1297b178fd06853c4991d0f07d8fa0)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+sse,-sse2 < %s | FileCheck %s --check-prefix=CHECK-SSE1
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+sse,+sse2 < %s | FileCheck %s --check-prefix=CHECK-SSE2
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+xop < %s | FileCheck %s --check-prefix=CHECK-XOP

; ============================================================================ ;
; Various cases with %x and/or %y being a constant
; ============================================================================ ;

define <4 x i32> @out_constant_varx_mone(ptr%px, ptr%py, ptr%pmask) {
; CHECK-SSE1-LABEL: out_constant_varx_mone:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    movq %rdi, %rax
; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
; CHECK-SSE1-NEXT:    movaps {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN]
; CHECK-SSE1-NEXT:    xorps %xmm0, %xmm1
; CHECK-SSE1-NEXT:    andps (%rsi), %xmm0
; CHECK-SSE1-NEXT:    orps %xmm1, %xmm0
; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: out_constant_varx_mone:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    movdqa (%rdx), %xmm0
; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
; CHECK-SSE2-NEXT:    pxor %xmm0, %xmm1
; CHECK-SSE2-NEXT:    pand (%rdi), %xmm0
; CHECK-SSE2-NEXT:    por %xmm1, %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: out_constant_varx_mone:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vmovdqa (%rdx), %xmm0
; CHECK-XOP-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; CHECK-XOP-NEXT:    vpxor %xmm1, %xmm0, %xmm1
; CHECK-XOP-NEXT:    vpand (%rdi), %xmm0, %xmm0
; CHECK-XOP-NEXT:    vpor %xmm1, %xmm0, %xmm0
; CHECK-XOP-NEXT:    retq
  %x = load <4 x i32>, ptr%px, align 16
  %y = load <4 x i32>, ptr%py, align 16
  %mask = load <4 x i32>, ptr%pmask, align 16
  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
  %mx = and <4 x i32> %mask, %x
  %my = and <4 x i32> %notmask, <i32 -1, i32 -1, i32 -1, i32 -1>
  %r = or <4 x i32> %mx, %my
  ret <4 x i32> %r
}

define <4 x i32> @in_constant_varx_mone(ptr%px, ptr%py, ptr%pmask) {
; CHECK-SSE1-LABEL: in_constant_varx_mone:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    movq %rdi, %rax
; CHECK-SSE1-NEXT:    movaps (%rsi), %xmm0
; CHECK-SSE1-NEXT:    andnps (%rcx), %xmm0
; CHECK-SSE1-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: in_constant_varx_mone:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    movdqa (%rdi), %xmm0
; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
; CHECK-SSE2-NEXT:    pandn (%rdx), %xmm0
; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: in_constant_varx_mone:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vmovdqa (%rdi), %xmm0
; CHECK-XOP-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; CHECK-XOP-NEXT:    vpandn (%rdx), %xmm0, %xmm0
; CHECK-XOP-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; CHECK-XOP-NEXT:    retq
  %x = load <4 x i32>, ptr%px, align 16
  %y = load <4 x i32>, ptr%py, align 16
  %mask = load <4 x i32>, ptr%pmask, align 16
  %n0 = xor <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1> ; %x
  %n1 = and <4 x i32> %n0, %mask
  %r = xor <4 x i32> %n1, <i32 -1, i32 -1, i32 -1, i32 -1>
  ret <4 x i32> %r
}

; This is not a canonical form. Testing for completeness only.
define <4 x i32> @out_constant_varx_mone_invmask(ptr%px, ptr%py, ptr%pmask) {
; CHECK-SSE1-LABEL: out_constant_varx_mone_invmask:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    movq %rdi, %rax
; CHECK-SSE1-NEXT:    movaps (%rsi), %xmm0
; CHECK-SSE1-NEXT:    orps (%rcx), %xmm0
; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: out_constant_varx_mone_invmask:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    movaps (%rdi), %xmm0
; CHECK-SSE2-NEXT:    orps (%rdx), %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: out_constant_varx_mone_invmask:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vmovaps (%rdi), %xmm0
; CHECK-XOP-NEXT:    vorps (%rdx), %xmm0, %xmm0
; CHECK-XOP-NEXT:    retq
  %x = load <4 x i32>, ptr%px, align 16
  %y = load <4 x i32>, ptr%py, align 16
  %mask = load <4 x i32>, ptr%pmask, align 16
  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
  %mx = and <4 x i32> %notmask, %x
  %my = and <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
  %r = or <4 x i32> %mx, %my
  ret <4 x i32> %r
}

; This is not a canonical form. Testing for completeness only.
define <4 x i32> @in_constant_varx_mone_invmask(ptr%px, ptr%py, ptr%pmask) {
; CHECK-SSE1-LABEL: in_constant_varx_mone_invmask:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    movq %rdi, %rax
; CHECK-SSE1-NEXT:    movaps (%rsi), %xmm0
; CHECK-SSE1-NEXT:    movaps {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN]
; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm2
; CHECK-SSE1-NEXT:    xorps %xmm1, %xmm2
; CHECK-SSE1-NEXT:    andnps %xmm2, %xmm0
; CHECK-SSE1-NEXT:    xorps %xmm1, %xmm0
; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: in_constant_varx_mone_invmask:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    movaps (%rdi), %xmm0
; CHECK-SSE2-NEXT:    orps (%rdx), %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: in_constant_varx_mone_invmask:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vmovaps (%rdi), %xmm0
; CHECK-XOP-NEXT:    vorps (%rdx), %xmm0, %xmm0
; CHECK-XOP-NEXT:    retq
  %x = load <4 x i32>, ptr%px, align 16
  %y = load <4 x i32>, ptr%py, align 16
  %mask = load <4 x i32>, ptr%pmask, align 16
  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
  %n0 = xor <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1> ; %x
  %n1 = and <4 x i32> %n0, %notmask
  %r = xor <4 x i32> %n1, <i32 -1, i32 -1, i32 -1, i32 -1>
  ret <4 x i32> %r
}

define <4 x i32> @out_constant_varx_42(ptr%px, ptr%py, ptr%pmask) {
; CHECK-SSE1-LABEL: out_constant_varx_42:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    movq %rdi, %rax
; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
; CHECK-SSE1-NEXT:    movaps (%rsi), %xmm1
; CHECK-SSE1-NEXT:    andps %xmm0, %xmm1
; CHECK-SSE1-NEXT:    andnps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE1-NEXT:    orps %xmm1, %xmm0
; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: out_constant_varx_42:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
; CHECK-SSE2-NEXT:    movaps (%rdi), %xmm1
; CHECK-SSE2-NEXT:    andps %xmm0, %xmm1
; CHECK-SSE2-NEXT:    andnps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT:    orps %xmm1, %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: out_constant_varx_42:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vmovdqa (%rdi), %xmm0
; CHECK-XOP-NEXT:    vmovdqa (%rdx), %xmm1
; CHECK-XOP-NEXT:    vpcmov %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-XOP-NEXT:    retq
  %x = load <4 x i32>, ptr%px, align 16
  %y = load <4 x i32>, ptr%py, align 16
  %mask = load <4 x i32>, ptr%pmask, align 16
  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
  %mx = and <4 x i32> %mask, %x
  %my = and <4 x i32> %notmask, <i32 42, i32 42, i32 42, i32 42>
  %r = or <4 x i32> %mx, %my
  ret <4 x i32> %r
}

define <4 x i32> @in_constant_varx_42(ptr%px, ptr%py, ptr%pmask) {
; CHECK-SSE1-LABEL: in_constant_varx_42:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    movq %rdi, %rax
; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
; CHECK-SSE1-NEXT:    movaps (%rsi), %xmm1
; CHECK-SSE1-NEXT:    andps %xmm0, %xmm1
; CHECK-SSE1-NEXT:    andnps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE1-NEXT:    orps %xmm1, %xmm0
; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: in_constant_varx_42:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
; CHECK-SSE2-NEXT:    movaps (%rdi), %xmm1
; CHECK-SSE2-NEXT:    andps %xmm0, %xmm1
; CHECK-SSE2-NEXT:    andnps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT:    orps %xmm1, %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: in_constant_varx_42:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vmovdqa (%rdi), %xmm0
; CHECK-XOP-NEXT:    vmovdqa (%rdx), %xmm1
; CHECK-XOP-NEXT:    vpcmov %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-XOP-NEXT:    retq
  %x = load <4 x i32>, ptr%px, align 16
  %y = load <4 x i32>, ptr%py, align 16
  %mask = load <4 x i32>, ptr%pmask, align 16
  %n0 = xor <4 x i32> %x, <i32 42, i32 42, i32 42, i32 42> ; %x
  %n1 = and <4 x i32> %n0, %mask
  %r = xor <4 x i32> %n1, <i32 42, i32 42, i32 42, i32 42>
  ret <4 x i32> %r
}

; This is not a canonical form. Testing for completeness only.
define <4 x i32> @out_constant_varx_42_invmask(ptr%px, ptr%py, ptr%pmask) {
; CHECK-SSE1-LABEL: out_constant_varx_42_invmask:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    movq %rdi, %rax
; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
; CHECK-SSE1-NEXT:    movaps %xmm0, %xmm1
; CHECK-SSE1-NEXT:    andnps (%rsi), %xmm1
; CHECK-SSE1-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE1-NEXT:    orps %xmm1, %xmm0
; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: out_constant_varx_42_invmask:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
; CHECK-SSE2-NEXT:    movaps %xmm0, %xmm1
; CHECK-SSE2-NEXT:    andnps (%rdi), %xmm1
; CHECK-SSE2-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT:    orps %xmm1, %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: out_constant_varx_42_invmask:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vmovdqa (%rdx), %xmm0
; CHECK-XOP-NEXT:    vbroadcastss {{.*#+}} xmm1 = [42,42,42,42]
; CHECK-XOP-NEXT:    vpcmov %xmm0, (%rdi), %xmm1, %xmm0
; CHECK-XOP-NEXT:    retq
  %x = load <4 x i32>, ptr%px, align 16
  %y = load <4 x i32>, ptr%py, align 16
  %mask = load <4 x i32>, ptr%pmask, align 16
  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
  %mx = and <4 x i32> %notmask, %x
  %my = and <4 x i32> %mask, <i32 42, i32 42, i32 42, i32 42>
  %r = or <4 x i32> %mx, %my
  ret <4 x i32> %r
}

; This is not a canonical form. Testing for completeness only.
define <4 x i32> @in_constant_varx_42_invmask(ptr%px, ptr%py, ptr%pmask) {
; CHECK-SSE1-LABEL: in_constant_varx_42_invmask:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    movq %rdi, %rax
; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
; CHECK-SSE1-NEXT:    movaps %xmm0, %xmm1
; CHECK-SSE1-NEXT:    andnps (%rsi), %xmm1
; CHECK-SSE1-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE1-NEXT:    orps %xmm1, %xmm0
; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: in_constant_varx_42_invmask:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
; CHECK-SSE2-NEXT:    movaps %xmm0, %xmm1
; CHECK-SSE2-NEXT:    andnps (%rdi), %xmm1
; CHECK-SSE2-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT:    orps %xmm1, %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: in_constant_varx_42_invmask:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vmovdqa (%rdx), %xmm0
; CHECK-XOP-NEXT:    vbroadcastss {{.*#+}} xmm1 = [42,42,42,42]
; CHECK-XOP-NEXT:    vpcmov %xmm0, (%rdi), %xmm1, %xmm0
; CHECK-XOP-NEXT:    retq
  %x = load <4 x i32>, ptr%px, align 16
  %y = load <4 x i32>, ptr%py, align 16
  %mask = load <4 x i32>, ptr%pmask, align 16
  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
  %n0 = xor <4 x i32> %x, <i32 42, i32 42, i32 42, i32 42> ; %x
  %n1 = and <4 x i32> %n0, %notmask
  %r = xor <4 x i32> %n1, <i32 42, i32 42, i32 42, i32 42>
  ret <4 x i32> %r
}

define <4 x i32> @out_constant_mone_vary(ptr%px, ptr%py, ptr%pmask) {
; CHECK-SSE1-LABEL: out_constant_mone_vary:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    movq %rdi, %rax
; CHECK-SSE1-NEXT:    movaps (%rdx), %xmm0
; CHECK-SSE1-NEXT:    orps (%rcx), %xmm0
; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: out_constant_mone_vary:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    movaps (%rsi), %xmm0
; CHECK-SSE2-NEXT:    orps (%rdx), %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: out_constant_mone_vary:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vmovaps (%rsi), %xmm0
; CHECK-XOP-NEXT:    vorps (%rdx), %xmm0, %xmm0
; CHECK-XOP-NEXT:    retq
  %x = load <4 x i32>, ptr%px, align 16
  %y = load <4 x i32>, ptr%py, align 16
  %mask = load <4 x i32>, ptr%pmask, align 16
  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
  %mx = and <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
  %my = and <4 x i32> %notmask, %y
  %r = or <4 x i32> %mx, %my
  ret <4 x i32> %r
}

define <4 x i32> @in_constant_mone_vary(ptr%px, ptr%py, ptr%pmask) {
; CHECK-SSE1-LABEL: in_constant_mone_vary:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    movq %rdi, %rax
; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
; CHECK-SSE1-NEXT:    orps (%rdx), %xmm0
; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: in_constant_mone_vary:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
; CHECK-SSE2-NEXT:    orps (%rsi), %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: in_constant_mone_vary:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vmovaps (%rdx), %xmm0
; CHECK-XOP-NEXT:    vorps (%rsi), %xmm0, %xmm0
; CHECK-XOP-NEXT:    retq
  %x = load <4 x i32>, ptr%px, align 16
  %y = load <4 x i32>, ptr%py, align 16
  %mask = load <4 x i32>, ptr%pmask, align 16
  %n0 = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, %y ; %x
  %n1 = and <4 x i32> %n0, %mask
  %r = xor <4 x i32> %n1, %y
  ret <4 x i32> %r
}

; This is not a canonical form. Testing for completeness only.
define <4 x i32> @out_constant_mone_vary_invmask(ptr%px, ptr%py, ptr%pmask) {
; CHECK-SSE1-LABEL: out_constant_mone_vary_invmask:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    movq %rdi, %rax
; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
; CHECK-SSE1-NEXT:    movaps {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN]
; CHECK-SSE1-NEXT:    xorps %xmm0, %xmm1
; CHECK-SSE1-NEXT:    andps (%rdx), %xmm0
; CHECK-SSE1-NEXT:    orps %xmm1, %xmm0
; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: out_constant_mone_vary_invmask:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    movdqa (%rdx), %xmm0
; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
; CHECK-SSE2-NEXT:    pxor %xmm0, %xmm1
; CHECK-SSE2-NEXT:    pand (%rsi), %xmm0
; CHECK-SSE2-NEXT:    por %xmm1, %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: out_constant_mone_vary_invmask:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vmovdqa (%rdx), %xmm0
; CHECK-XOP-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; CHECK-XOP-NEXT:    vpxor %xmm1, %xmm0, %xmm1
; CHECK-XOP-NEXT:    vpand (%rsi), %xmm0, %xmm0
; CHECK-XOP-NEXT:    vpor %xmm0, %xmm1, %xmm0
; CHECK-XOP-NEXT:    retq
  %x = load <4 x i32>, ptr%px, align 16
  %y = load <4 x i32>, ptr%py, align 16
  %mask = load <4 x i32>, ptr%pmask, align 16
  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
  %mx = and <4 x i32> %notmask, <i32 -1, i32 -1, i32 -1, i32 -1>
  %my = and <4 x i32> %mask, %y
  %r = or <4 x i32> %mx, %my
  ret <4 x i32> %r
}

; This is not a canonical form. Testing for completeness only.
define <4 x i32> @in_constant_mone_vary_invmask(ptr%px, ptr%py, ptr%pmask) {
; CHECK-SSE1-LABEL: in_constant_mone_vary_invmask:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    movq %rdi, %rax
; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
; CHECK-SSE1-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE1-NEXT:    orps (%rdx), %xmm0
; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: in_constant_mone_vary_invmask:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
; CHECK-SSE2-NEXT:    pxor (%rdx), %xmm0
; CHECK-SSE2-NEXT:    por (%rsi), %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: in_constant_mone_vary_invmask:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; CHECK-XOP-NEXT:    vpxor (%rdx), %xmm0, %xmm0
; CHECK-XOP-NEXT:    vpor (%rsi), %xmm0, %xmm0
; CHECK-XOP-NEXT:    retq
  %x = load <4 x i32>, ptr%px, align 16
  %y = load <4 x i32>, ptr%py, align 16
  %mask = load <4 x i32>, ptr%pmask, align 16
  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
  %n0 = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, %y ; %x
  %n1 = and <4 x i32> %n0, %notmask
  %r = xor <4 x i32> %n1, %y
  ret <4 x i32> %r
}

define <4 x i32> @out_constant_42_vary(ptr%px, ptr%py, ptr%pmask) {
; CHECK-SSE1-LABEL: out_constant_42_vary:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    movq %rdi, %rax
; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
; CHECK-SSE1-NEXT:    movaps {{.*#+}} xmm1 = [5.88545355E-44,5.88545355E-44,5.88545355E-44,5.88545355E-44]
; CHECK-SSE1-NEXT:    andps %xmm0, %xmm1
; CHECK-SSE1-NEXT:    andnps (%rdx), %xmm0
; CHECK-SSE1-NEXT:    orps %xmm1, %xmm0
; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: out_constant_42_vary:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
; CHECK-SSE2-NEXT:    movaps {{.*#+}} xmm1 = [42,42,42,42]
; CHECK-SSE2-NEXT:    andps %xmm0, %xmm1
; CHECK-SSE2-NEXT:    andnps (%rsi), %xmm0
; CHECK-SSE2-NEXT:    orps %xmm1, %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: out_constant_42_vary:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vmovdqa (%rdx), %xmm0
; CHECK-XOP-NEXT:    vbroadcastss {{.*#+}} xmm1 = [42,42,42,42]
; CHECK-XOP-NEXT:    vpcmov %xmm0, (%rsi), %xmm1, %xmm0
; CHECK-XOP-NEXT:    retq
  %x = load <4 x i32>, ptr%px, align 16
  %y = load <4 x i32>, ptr%py, align 16
  %mask = load <4 x i32>, ptr%pmask, align 16
  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
  %mx = and <4 x i32> %mask, <i32 42, i32 42, i32 42, i32 42>
  %my = and <4 x i32> %notmask, %y
  %r = or <4 x i32> %mx, %my
  ret <4 x i32> %r
}

define <4 x i32> @in_constant_42_vary(ptr%px, ptr%py, ptr%pmask) {
; CHECK-SSE1-LABEL: in_constant_42_vary:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    movq %rdi, %rax
; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
; CHECK-SSE1-NEXT:    movaps %xmm0, %xmm1
; CHECK-SSE1-NEXT:    andnps (%rdx), %xmm1
; CHECK-SSE1-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE1-NEXT:    orps %xmm1, %xmm0
; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: in_constant_42_vary:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
; CHECK-SSE2-NEXT:    movaps %xmm0, %xmm1
; CHECK-SSE2-NEXT:    andnps (%rsi), %xmm1
; CHECK-SSE2-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT:    orps %xmm1, %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: in_constant_42_vary:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vmovdqa (%rdx), %xmm0
; CHECK-XOP-NEXT:    vbroadcastss {{.*#+}} xmm1 = [42,42,42,42]
; CHECK-XOP-NEXT:    vpcmov %xmm0, (%rsi), %xmm1, %xmm0
; CHECK-XOP-NEXT:    retq
  %x = load <4 x i32>, ptr%px, align 16
  %y = load <4 x i32>, ptr%py, align 16
  %mask = load <4 x i32>, ptr%pmask, align 16
  %n0 = xor <4 x i32> <i32 42, i32 42, i32 42, i32 42>, %y ; %x
  %n1 = and <4 x i32> %n0, %mask
  %r = xor <4 x i32> %n1, %y
  ret <4 x i32> %r
}

; This is not a canonical form. Testing for completeness only.
define <4 x i32> @out_constant_42_vary_invmask(ptr%px, ptr%py, ptr%pmask) {
; CHECK-SSE1-LABEL: out_constant_42_vary_invmask:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    movq %rdi, %rax
; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
; CHECK-SSE1-NEXT:    movaps %xmm0, %xmm1
; CHECK-SSE1-NEXT:    andnps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; CHECK-SSE1-NEXT:    andps (%rdx), %xmm0
; CHECK-SSE1-NEXT:    orps %xmm1, %xmm0
; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: out_constant_42_vary_invmask:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
; CHECK-SSE2-NEXT:    movaps %xmm0, %xmm1
; CHECK-SSE2-NEXT:    andnps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; CHECK-SSE2-NEXT:    andps (%rsi), %xmm0
; CHECK-SSE2-NEXT:    orps %xmm1, %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: out_constant_42_vary_invmask:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vmovdqa (%rsi), %xmm0
; CHECK-XOP-NEXT:    vmovdqa (%rdx), %xmm1
; CHECK-XOP-NEXT:    vpcmov %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-XOP-NEXT:    retq
  %x = load <4 x i32>, ptr%px, align 16
  %y = load <4 x i32>, ptr%py, align 16
  %mask = load <4 x i32>, ptr%pmask, align 16
  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
  %mx = and <4 x i32> %notmask, <i32 42, i32 42, i32 42, i32 42>
  %my = and <4 x i32> %mask, %y
  %r = or <4 x i32> %mx, %my
  ret <4 x i32> %r
}

; This is not a canonical form. Testing for completeness only.
define <4 x i32> @in_constant_42_vary_invmask(ptr%px, ptr%py, ptr%pmask) {
; CHECK-SSE1-LABEL: in_constant_42_vary_invmask:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    movq %rdi, %rax
; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
; CHECK-SSE1-NEXT:    movaps (%rdx), %xmm1
; CHECK-SSE1-NEXT:    andps %xmm0, %xmm1
; CHECK-SSE1-NEXT:    andnps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE1-NEXT:    orps %xmm1, %xmm0
; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: in_constant_42_vary_invmask:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
; CHECK-SSE2-NEXT:    movaps (%rsi), %xmm1
; CHECK-SSE2-NEXT:    andps %xmm0, %xmm1
; CHECK-SSE2-NEXT:    andnps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT:    orps %xmm1, %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: in_constant_42_vary_invmask:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vmovdqa (%rsi), %xmm0
; CHECK-XOP-NEXT:    vmovdqa (%rdx), %xmm1
; CHECK-XOP-NEXT:    vpcmov %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-XOP-NEXT:    retq
  %x = load <4 x i32>, ptr%px, align 16
  %y = load <4 x i32>, ptr%py, align 16
  %mask = load <4 x i32>, ptr%pmask, align 16
  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
  %n0 = xor <4 x i32> <i32 42, i32 42, i32 42, i32 42>, %y ; %x
  %n1 = and <4 x i32> %n0, %notmask
  %r = xor <4 x i32> %n1, %y
  ret <4 x i32> %r
}
