xref: /llvm-project/llvm/test/CodeGen/X86/avx.ll (revision da1eb886c4a6434c76d27ef0f6d5139dda930d72)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_mem_shuffle
2; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=corei7-avx | FileCheck %s --check-prefixes=CHECK,X86
3; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s --check-prefixes=CHECK,X64
4
;; Per-lane select between two <4 x i32> vectors, controlled by a <4 x i1> mask.
;; Expected code (identical for the i686 and x86_64 runs): each 32-bit mask
;; lane is shifted left by 31, which moves bit 0 to bit 31, and the result is
;; used as the selector of a single 128-bit vblendvps (which picks each lane
;; from the top bit of the corresponding selector lane).
;; The final ret pattern accepts retl or retq so both runs can share one set
;; of assertions.
5define <4 x i32> @blendvb_fallback_v4i32(<4 x i1> %mask, <4 x i32> %x, <4 x i32> %y) {
6; CHECK-LABEL: blendvb_fallback_v4i32:
7; CHECK:       ## %bb.0:
8; CHECK-NEXT:    vpslld $31, %xmm0, %xmm0
9; CHECK-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
10; CHECK-NEXT:    ret{{[l|q]}}
11  %ret = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %y
12  ret <4 x i32> %ret
13}
14
;; Per-lane select between two <8 x i32> vectors, controlled by an <8 x i1> mask.
;; Expected code (identical for the i686 and x86_64 runs): the mask is read
;; from xmm0 as eight 16-bit lanes. The low four are zero-extended to 32 bits
;; (vpmovzxwd); the high four are widened by duplicating each word
;; (vpunpckhwd). Both halves are shifted left by 31 in 128-bit registers,
;; joined into one ymm register with vinsertf128, and that register is the
;; selector of a single 256-bit vblendvps.
;; The split into 128-bit halves is presumably because the corei7-avx target
;; has no 256-bit integer shift -- confirm against the target feature list.
15define <8 x i32> @blendvb_fallback_v8i32(<8 x i1> %mask, <8 x i32> %x, <8 x i32> %y) {
16; CHECK-LABEL: blendvb_fallback_v8i32:
17; CHECK:       ## %bb.0:
18; CHECK-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
19; CHECK-NEXT:    vpslld $31, %xmm3, %xmm3
20; CHECK-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
21; CHECK-NEXT:    vpslld $31, %xmm0, %xmm0
22; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
23; CHECK-NEXT:    vblendvps %ymm0, %ymm1, %ymm2, %ymm0
24; CHECK-NEXT:    ret{{[l|q]}}
25  %ret = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
26  ret <8 x i32> %ret
27}
28
;; Per-lane select between two <8 x float> vectors, controlled by an
;; <8 x i1> mask.
;; Expected code (identical for the i686 and x86_64 runs): the eight 16-bit
;; mask lanes in xmm0 are widened to 32 bits in two 128-bit halves (vpmovzxwd
;; for the low four, vpunpckhwd for the high four), each half is shifted left
;; by 31 so bit 0 reaches bit 31, the halves are joined with vinsertf128, and
;; the resulting ymm register selects lanes in a single 256-bit vblendvps.
29define <8 x float> @blendvb_fallback_v8f32(<8 x i1> %mask, <8 x float> %x, <8 x float> %y) {
30; CHECK-LABEL: blendvb_fallback_v8f32:
31; CHECK:       ## %bb.0:
32; CHECK-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
33; CHECK-NEXT:    vpslld $31, %xmm3, %xmm3
34; CHECK-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
35; CHECK-NEXT:    vpslld $31, %xmm0, %xmm0
36; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
37; CHECK-NEXT:    vblendvps %ymm0, %ymm1, %ymm2, %ymm0
38; CHECK-NEXT:    ret{{[l|q]}}
39  %ret = select <8 x i1> %mask, <8 x float> %x, <8 x float> %y
40  ret <8 x float> %ret
41}
42
43declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
44
;; insertps intrinsic whose second operand is a 16-byte-aligned <4 x float>
;; load.
;; Immediate 48 = 0b0011_0000: source element 0 (bits 7:6), destination
;; element 3 (bits 5:4), empty zero mask (bits 3:0).
;; Expected code: the load is folded into vinsertps as a memory operand and
;; the immediate stays $48. On i686 the pointer is first loaded from the stack
;; into eax; on x86_64 it is used directly from rdi.
45define <4 x float> @insertps_from_vector_load(<4 x float> %a, ptr nocapture readonly %pb) {
46; X86-LABEL: insertps_from_vector_load:
47; X86:       ## %bb.0:
48; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
49; X86-NEXT:    vinsertps $48, (%eax), %xmm0, %xmm0 ## xmm0 = xmm0[0,1,2],mem[0]
50; X86-NEXT:    retl
51;
52; X64-LABEL: insertps_from_vector_load:
53; X64:       ## %bb.0:
54; X64-NEXT:    vinsertps $48, (%rdi), %xmm0, %xmm0 ## xmm0 = xmm0[0,1,2],mem[0]
55; X64-NEXT:    retq
56  %1 = load <4 x float>, ptr %pb, align 16
57  %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
58  ret <4 x float> %2
59}
60
61;; Use a non-zero CountS for insertps
;; Immediate 96 = 0b0110_0000: source element 1 (CountS, bits 7:6),
;; destination element 2 (bits 5:4), empty zero mask (bits 3:0).
;; Expected code: the <4 x float> load is folded into vinsertps and the
;; source-element selection moves into the address. The memory operand gets
;; displacement 4 (one float) and the immediate becomes $32, i.e. the source
;; field is cleared while the destination element is kept.
62define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, ptr nocapture readonly %pb) {
63; X86-LABEL: insertps_from_vector_load_offset:
64; X86:       ## %bb.0:
65; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
66; X86-NEXT:    vinsertps $32, 4(%eax), %xmm0, %xmm0 ## xmm0 = xmm0[0,1],mem[0],xmm0[3]
67; X86-NEXT:    retl
68;
69; X64-LABEL: insertps_from_vector_load_offset:
70; X64:       ## %bb.0:
71; X64-NEXT:    vinsertps $32, 4(%rdi), %xmm0, %xmm0 ## xmm0 = xmm0[0,1],mem[0],xmm0[3]
72; X64-NEXT:    retq
73  %1 = load <4 x float>, ptr %pb, align 16
74  %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
75  ret <4 x float> %2
76}
77
;; insertps intrinsic fed by a 16-byte-aligned <4 x float> load at a variable
;; vector index.
;; Immediate 192 = 0b1100_0000: source element 3 (bits 7:6), destination
;; element 0 (bits 5:4), empty zero mask (bits 3:0).
;; The getelementptr steps in whole <4 x float> values, so the expected code
;; scales the index by 16 (shift left by 4). The load is folded into vinsertps
;; with the source element expressed as displacement 12 (three floats) and the
;; immediate reduced to $0.
78define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, ptr nocapture readonly %pb, i64 %index) {
79; X86-LABEL: insertps_from_vector_load_offset_2:
80; X86:       ## %bb.0:
81; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
82; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
83; X86-NEXT:    shll $4, %ecx
84; X86-NEXT:    vinsertps $0, 12(%eax,%ecx), %xmm0, %xmm0 ## xmm0 = mem[0],xmm0[1,2,3]
85; X86-NEXT:    retl
86;
87; X64-LABEL: insertps_from_vector_load_offset_2:
88; X64:       ## %bb.0:
89; X64-NEXT:    shlq $4, %rsi
90; X64-NEXT:    vinsertps $0, 12(%rdi,%rsi), %xmm0, %xmm0 ## xmm0 = mem[0],xmm0[1,2,3]
91; X64-NEXT:    retq
92  %1 = getelementptr inbounds <4 x float>, ptr %pb, i64 %index
93  %2 = load <4 x float>, ptr %1, align 16
94  %3 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %2, i32 192)
95  ret <4 x float> %3
96}
97
;; insertps intrinsic whose second operand is a splat of one loaded float.
;; The float at %fb[%index] is loaded once and written into all four lanes
;; with insertelement; insertps with immediate 48 (source element 0,
;; destination element 3, empty zero mask) then copies one lane into %a.
;; Expected code: no splat is materialised. The scalar load is folded straight
;; into vinsertps $48 using a base + index*4 memory operand.
98define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, ptr nocapture readonly %fb, i64 %index) {
99; X86-LABEL: insertps_from_broadcast_loadf32:
100; X86:       ## %bb.0:
101; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
102; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
103; X86-NEXT:    vinsertps $48, (%ecx,%eax,4), %xmm0, %xmm0 ## xmm0 = xmm0[0,1,2],mem[0]
104; X86-NEXT:    retl
105;
106; X64-LABEL: insertps_from_broadcast_loadf32:
107; X64:       ## %bb.0:
108; X64-NEXT:    vinsertps $48, (%rdi,%rsi,4), %xmm0, %xmm0 ## xmm0 = xmm0[0,1,2],mem[0]
109; X64-NEXT:    retq
110  %1 = getelementptr inbounds float, ptr %fb, i64 %index
111  %2 = load float, ptr %1, align 4
112  %3 = insertelement <4 x float> undef, float %2, i32 0
113  %4 = insertelement <4 x float> %3, float %2, i32 1
114  %5 = insertelement <4 x float> %4, float %2, i32 2
115  %6 = insertelement <4 x float> %5, float %2, i32 3
116  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
117  ret <4 x float> %7
118}
119
;; insertps intrinsic whose second operand is a splat of element 0 of a
;; <4 x float> load (align 4).
;; Element 0 is extracted and written into all four lanes with insertelement;
;; insertps with immediate 48 (source element 0, destination element 3, empty
;; zero mask) then copies one lane into %a.
;; Expected code: vinsertps $48 reads directly from the pointer with no
;; displacement, and no separate vector load or splat is emitted.
120define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, ptr nocapture readonly %b) {
121; X86-LABEL: insertps_from_broadcast_loadv4f32:
122; X86:       ## %bb.0:
123; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
124; X86-NEXT:    vinsertps $48, (%eax), %xmm0, %xmm0 ## xmm0 = xmm0[0,1,2],mem[0]
125; X86-NEXT:    retl
126;
127; X64-LABEL: insertps_from_broadcast_loadv4f32:
128; X64:       ## %bb.0:
129; X64-NEXT:    vinsertps $48, (%rdi), %xmm0, %xmm0 ## xmm0 = xmm0[0,1,2],mem[0]
130; X64-NEXT:    retq
131  %1 = load <4 x float>, ptr %b, align 4
132  %2 = extractelement <4 x float> %1, i32 0
133  %3 = insertelement <4 x float> undef, float %2, i32 0
134  %4 = insertelement <4 x float> %3, float %2, i32 1
135  %5 = insertelement <4 x float> %4, float %2, i32 2
136  %6 = insertelement <4 x float> %5, float %2, i32 3
137  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
138  ret <4 x float> %7
139}
140
;; One loaded float, splatted into a <4 x float>, feeds four insertps calls
;; (immediate 48 each: source element 0, destination element 3), one per
;; vector argument %a..%d. The four results are summed as (a+b) + (c+d).
;; Expected code: the load is not folded into vinsertps here. It is broadcast
;; once into xmm4 with vbroadcastss, and each insert becomes a vblendps that
;; takes lane 3 from xmm4, followed by three vaddps.
;; Presumably the fold is skipped because the splat has several users, as the
;; function name suggests -- confirm against the X86 lowering code.
141define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, ptr nocapture readonly %fb, i64 %index) {
142; X86-LABEL: insertps_from_broadcast_multiple_use:
143; X86:       ## %bb.0:
144; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
145; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
146; X86-NEXT:    vbroadcastss (%ecx,%eax,4), %xmm4
147; X86-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3]
148; X86-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
149; X86-NEXT:    vaddps %xmm1, %xmm0, %xmm0
150; X86-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[3]
151; X86-NEXT:    vblendps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[3]
152; X86-NEXT:    vaddps %xmm2, %xmm1, %xmm1
153; X86-NEXT:    vaddps %xmm1, %xmm0, %xmm0
154; X86-NEXT:    retl
155;
156; X64-LABEL: insertps_from_broadcast_multiple_use:
157; X64:       ## %bb.0:
158; X64-NEXT:    vbroadcastss (%rdi,%rsi,4), %xmm4
159; X64-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3]
160; X64-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
161; X64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
162; X64-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[3]
163; X64-NEXT:    vblendps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[3]
164; X64-NEXT:    vaddps %xmm2, %xmm1, %xmm1
165; X64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
166; X64-NEXT:    retq
167  %1 = getelementptr inbounds float, ptr %fb, i64 %index
168  %2 = load float, ptr %1, align 4
169  %3 = insertelement <4 x float> undef, float %2, i32 0
170  %4 = insertelement <4 x float> %3, float %2, i32 1
171  %5 = insertelement <4 x float> %4, float %2, i32 2
172  %6 = insertelement <4 x float> %5, float %2, i32 3
173  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
174  %8 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %b, <4 x float> %6, i32 48)
175  %9 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %c, <4 x float> %6, i32 48)
176  %10 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %d, <4 x float> %6, i32 48)
177  %11 = fadd <4 x float> %7, %8
178  %12 = fadd <4 x float> %9, %10
179  %13 = fadd <4 x float> %11, %12
180  ret <4 x float> %13
181}
182
;; Generic shufflevector (no intrinsic) that keeps lanes 0-2 of %b and puts
;; lane 2 of an unaligned (align 1) <4 x float> load into lane 3; mask index 6
;; means element 6 - 4 = 2 of the second operand.
;; Despite the function name, the assertions expect the load to be folded:
;; a single vinsertps $48 whose memory operand addresses the wanted float at
;; displacement 8 (two floats past the pointer).
183define <4 x float> @nofold_insertps(ptr %a, <4 x float> %b) {
184; X86-LABEL: nofold_insertps:
185; X86:       ## %bb.0:
186; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
187; X86-NEXT:    vinsertps $48, 8(%eax), %xmm0, %xmm0 ## xmm0 = xmm0[0,1,2],mem[0]
188; X86-NEXT:    retl
189;
190; X64-LABEL: nofold_insertps:
191; X64:       ## %bb.0:
192; X64-NEXT:    vinsertps $48, 8(%rdi), %xmm0, %xmm0 ## xmm0 = xmm0[0,1,2],mem[0]
193; X64-NEXT:    retq
194  %1 = load <4 x float>, ptr %a, align 1
195  %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
196  ret <4 x float> %2
197}
198