; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx    | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2   | FileCheck %s --check-prefixes=CHECK,AVX,AVX2

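; A constant blend mask of zero selects every element from the first source
; operand, so the blend should fold away completely.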
define <2 x double> @test_x86_sse41_blend_pd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_sse41_blend_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i32 0)
  ret <2 x double> %1
}

define <4 x float> @test_x86_sse41_blend_ps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: test_x86_sse41_blend_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i32 0)
  ret <4 x float> %1
}

define <8 x i16> @test_x86_sse41_pblend_w(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: test_x86_sse41_pblend_w:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i32 0)
  ret <8 x i16> %1
}

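; An all-ones constant blend mask selects every element from the second source
; operand, so the blend should fold to a plain move of that operand.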
define <2 x double> @test2_x86_sse41_blend_pd(<2 x double> %a0, <2 x double> %a1) {
; SSE-LABEL: test2_x86_sse41_blend_pd:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test2_x86_sse41_blend_pd:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i32 -1)
  ret <2 x double> %1
}

define <4 x float> @test2_x86_sse41_blend_ps(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test2_x86_sse41_blend_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test2_x86_sse41_blend_ps:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i32 -1)
  ret <4 x float> %1
}

define <8 x i16> @test2_x86_sse41_pblend_w(<8 x i16> %a0, <8 x i16> %a1) {
; SSE-LABEL: test2_x86_sse41_pblend_w:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test2_x86_sse41_pblend_w:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i32 -1)
  ret <8 x i16> %1
}

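; If both source operands are the same value, the blend is a no-op regardless
; of the mask.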
define <2 x double> @test3_x86_sse41_blend_pd(<2 x double> %a0) {
; CHECK-LABEL: test3_x86_sse41_blend_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a0, i32 7)
  ret <2 x double> %1
}

define <4 x float> @test3_x86_sse41_blend_ps(<4 x float> %a0) {
; CHECK-LABEL: test3_x86_sse41_blend_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a0, i32 7)
  ret <4 x float> %1
}

define <8 x i16> @test3_x86_sse41_pblend_w(<8 x i16> %a0) {
; CHECK-LABEL: test3_x86_sse41_pblend_w:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a0, i32 7)
  ret <8 x i16> %1
}

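; Only element 0 of the result is extracted, so the splat shuffles feeding the
; variable blend are not demanded and should be removed.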
define double @demandedelts_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
; SSE-LABEL: demandedelts_blendvpd:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm0, %xmm3
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
; SSE-NEXT:    movapd %xmm3, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: demandedelts_blendvpd:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer
  %2 = shufflevector <2 x double> %a1, <2 x double> undef, <2 x i32> zeroinitializer
  %3 = shufflevector <2 x double> %a2, <2 x double> undef, <2 x i32> zeroinitializer
  %4 = tail call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %1, <2 x double> %2, <2 x double> %3)
  %5 = extractelement <2 x double> %4, i32 0
  ret double %5
}

define float @demandedelts_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
; SSE-LABEL: demandedelts_blendvps:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm3
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    blendvps %xmm0, %xmm1, %xmm3
; SSE-NEXT:    movaps %xmm3, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: demandedelts_blendvps:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer
  %2 = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> zeroinitializer
  %3 = shufflevector <4 x float> %a2, <4 x float> undef, <4 x i32> zeroinitializer
  %4 = tail call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %1, <4 x float> %2, <4 x float> %3)
  %5 = extractelement <4 x float> %4, i32 0
  ret float %5
}

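; As above, but the splatted blend result is returned whole: the input splats
; can still be dropped, only the splat of the result must remain.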
define <16 x i8> @demandedelts_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) {
; SSE-LABEL: demandedelts_pblendvb:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    movdqa %xmm2, %xmm0
; SSE-NEXT:    pblendvb %xmm0, %xmm1, %xmm3
; SSE-NEXT:    pxor %xmm0, %xmm0
; SSE-NEXT:    pshufb %xmm0, %xmm3
; SSE-NEXT:    movdqa %xmm3, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: demandedelts_pblendvb:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: demandedelts_pblendvb:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
; AVX2-NEXT:    retq
  %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> zeroinitializer
  %2 = shufflevector <16 x i8> %a1, <16 x i8> undef, <16 x i32> zeroinitializer
  %3 = shufflevector <16 x i8> %a2, <16 x i8> undef, <16 x i32> zeroinitializer
  %4 = tail call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %1, <16 x i8> %2, <16 x i8> %3)
  %5 = shufflevector <16 x i8> %4, <16 x i8> undef, <16 x i32> zeroinitializer
  ret <16 x i8> %5
}

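; blendvps only demands the sign bit of each mask element; check how that
; interacts with a mask produced by sitofp (the conversion is still emitted).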
define <4 x float> @demandedbits_sitofp_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x i32> %a2) {
; SSE-LABEL: demandedbits_sitofp_blendvps:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm3
; SSE-NEXT:    cvtdq2ps %xmm2, %xmm0
; SSE-NEXT:    blendvps %xmm0, %xmm1, %xmm3
; SSE-NEXT:    movaps %xmm3, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: demandedbits_sitofp_blendvps:
; AVX:       # %bb.0:
; AVX-NEXT:    vcvtdq2ps %xmm2, %xmm2
; AVX-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %cvt = sitofp <4 x i32> %a2 to <4 x float>
  %sel = tail call noundef <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %cvt)
  ret <4 x float> %sel
}

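; uitofp never produces a negative result, so every mask sign bit is known
; zero; the full conversion sequence and blend are currently still emitted.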
define <4 x float> @demandedbits_uitofp_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x i32> %a2) {
; SSE-LABEL: demandedbits_uitofp_blendvps:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm3
; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [1258291200,1258291200,1258291200,1258291200]
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4],xmm0[5],xmm2[6],xmm0[7]
; SSE-NEXT:    psrld $16, %xmm2
; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0],mem[1],xmm2[2],mem[3],xmm2[4],mem[5],xmm2[6],mem[7]
; SSE-NEXT:    subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    addps %xmm2, %xmm0
; SSE-NEXT:    blendvps %xmm0, %xmm1, %xmm3
; SSE-NEXT:    movaps %xmm3, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: demandedbits_uitofp_blendvps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm2[0],mem[1],xmm2[2],mem[3],xmm2[4],mem[5],xmm2[6],mem[7]
; AVX1-NEXT:    vpsrld $16, %xmm2, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],mem[1],xmm2[2],mem[3],xmm2[4],mem[5],xmm2[6],mem[7]
; AVX1-NEXT:    vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vaddps %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: demandedbits_uitofp_blendvps:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [1258291200,1258291200,1258291200,1258291200]
; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4],xmm3[5],xmm2[6],xmm3[7]
; AVX2-NEXT:    vpsrld $16, %xmm2, %xmm2
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [1392508928,1392508928,1392508928,1392508928]
; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7]
; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm4 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11]
; AVX2-NEXT:    vsubps %xmm4, %xmm2, %xmm2
; AVX2-NEXT:    vaddps %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %cvt = uitofp <4 x i32> %a2 to <4 x float>
  %sel = tail call noundef <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %cvt)
  ret <4 x float> %sel
}

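; The final lshr discards the low 11 bits of each lane, so the ORs of 1 and 4
; only touch undemanded bits; they are currently still emitted.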
define <2 x i64> @demandedbits_blendvpd(i64 %a0, i64 %a2, <2 x double> %a3) {
; SSE-LABEL: demandedbits_blendvpd:
; SSE:       # %bb.0:
; SSE-NEXT:    movq %rdi, %rax
; SSE-NEXT:    orq $1, %rax
; SSE-NEXT:    orq $4, %rdi
; SSE-NEXT:    movq %rax, %xmm1
; SSE-NEXT:    movq %rdi, %xmm2
; SSE-NEXT:    movq {{.*#+}} xmm1 = xmm1[0],zero
; SSE-NEXT:    movq {{.*#+}} xmm2 = xmm2[0],zero
; SSE-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
; SSE-NEXT:    psrlq $11, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: demandedbits_blendvpd:
; AVX:       # %bb.0:
; AVX-NEXT:    movq %rdi, %rax
; AVX-NEXT:    orq $1, %rax
; AVX-NEXT:    orq $4, %rdi
; AVX-NEXT:    vmovq %rax, %xmm1
; AVX-NEXT:    vmovq %rdi, %xmm2
; AVX-NEXT:    vmovq {{.*#+}} xmm1 = xmm1[0],zero
; AVX-NEXT:    vmovq {{.*#+}} xmm2 = xmm2[0],zero
; AVX-NEXT:    vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
; AVX-NEXT:    vpsrlq $11, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1  = or i64 %a0, 1
  %2  = or i64 %a0, 4
  %3  = bitcast i64 %1 to double
  %4  = bitcast i64 %2 to double
  %5  = insertelement <2 x double> zeroinitializer, double %3, i32 0
  %6  = insertelement <2 x double> zeroinitializer, double %4, i32 0
  %7  = tail call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %5, <2 x double> %6, <2 x double> %a3)
  %8  = bitcast <2 x double> %7 to <2 x i64>
  %9  = lshr <2 x i64> %8, <i64 11, i64 11>
  ret <2 x i64> %9
}

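; blendv(x, y, not(m)) can be performed as blendv(y, x, m): commute the
; source operands instead of inverting the mask.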
define <16 x i8> @xor_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) {
; SSE-LABEL: xor_pblendvb:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    pblendvb %xmm0, %xmm3, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: xor_pblendvb:
; AVX:       # %bb.0:
; AVX-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = xor <16 x i8> %a2, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  %2 = tail call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %1)
  ret <16 x i8> %2
}

define <4 x float> @xor_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
; SSE-LABEL: xor_blendvps:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm3
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    blendvps %xmm0, %xmm3, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: xor_blendvps:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = bitcast <4 x float> %a2 to <4 x i32>
  %2 = xor <4 x i32> %1, <i32 -1, i32 -1, i32 -1, i32 -1>
  %3 = bitcast <4 x i32> %2 to <4 x float>
  %4 = tail call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %3)
  ret <4 x float> %4
}

define <2 x double> @xor_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
; SSE-LABEL: xor_blendvpd:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm0, %xmm3
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: xor_blendvpd:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = bitcast <2 x double> %a2 to <4 x i32>
  %2 = xor <4 x i32> %1, <i32 -1, i32 -1, i32 -1, i32 -1>
  %3 = bitcast <4 x i32> %2 to <2 x double>
  %4 = tail call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %3)
  ret <2 x double> %4
}

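; PR47404: a vector select on a sign-bit comparison should lower directly to
; a variable blend without materializing the compare.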
define <16 x i8> @PR47404(<16 x i8> %0, <16 x i8> %1, <16 x i8> %2) {
; SSE-LABEL: PR47404:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    pblendvb %xmm0, %xmm1, %xmm3
; SSE-NEXT:    movdqa %xmm3, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: PR47404:
; AVX:       # %bb.0:
; AVX-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %4 = icmp sgt <16 x i8> %2, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  %5 = select <16 x i1> %4, <16 x i8> %0, <16 x i8> %1
  ret <16 x i8> %5
}

declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i32)
declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i32)
declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i32)

declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x double>)
declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>)
declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>)