xref: /llvm-project/llvm/test/CodeGen/X86/avx-vbroadcastf128.ll (revision 8fd6fc78aec88d1662236b75b5ecc9a62d50a837)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X86
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X64
4
; Load a 128-bit <2 x double> and splat it into both halves of a 256-bit
; result (shuffle mask <0,1,0,1>); the load + shuffle should fold into a
; single vbroadcastf128 from memory.
5define <4 x double> @test_broadcast_2f64_4f64(ptr%p) nounwind {
6; X86-LABEL: test_broadcast_2f64_4f64:
7; X86:       # %bb.0:
8; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
9; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
10; X86-NEXT:    retl
11;
12; X64-LABEL: test_broadcast_2f64_4f64:
13; X64:       # %bb.0:
14; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
15; X64-NEXT:    retq
16 %1 = load <2 x double>, ptr%p
17 %2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
18 ret <4 x double> %2
19}
20
; Same subvector-splat pattern for <2 x i64> -> <4 x i64>. With only AVX1
; (no AVX2) the integer broadcast is still emitted as the FP-domain
; vbroadcastf128.
21define <4 x i64> @test_broadcast_2i64_4i64(ptr%p) nounwind {
22; X86-LABEL: test_broadcast_2i64_4i64:
23; X86:       # %bb.0:
24; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
25; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
26; X86-NEXT:    retl
27;
28; X64-LABEL: test_broadcast_2i64_4i64:
29; X64:       # %bb.0:
30; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
31; X64-NEXT:    retq
32 %1 = load <2 x i64>, ptr%p
33 %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
34 ret <4 x i64> %2
35}
36
; Subvector splat of a loaded <4 x float> into a <8 x float> (mask repeats
; elements 0-3); expects a single vbroadcastf128 from memory.
37define <8 x float> @test_broadcast_4f32_8f32(ptr%p) nounwind {
38; X86-LABEL: test_broadcast_4f32_8f32:
39; X86:       # %bb.0:
40; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
41; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
42; X86-NEXT:    retl
43;
44; X64-LABEL: test_broadcast_4f32_8f32:
45; X64:       # %bb.0:
46; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
47; X64-NEXT:    retq
48 %1 = load <4 x float>, ptr%p
49 %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
50 ret <8 x float> %2
51}
52
; Subvector splat of a loaded <4 x i32> into a <8 x i32>; AVX1-only codegen
; uses the FP-domain vbroadcastf128 for the integer type as well.
53define <8 x i32> @test_broadcast_4i32_8i32(ptr%p) nounwind {
54; X86-LABEL: test_broadcast_4i32_8i32:
55; X86:       # %bb.0:
56; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
57; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
58; X86-NEXT:    retl
59;
60; X64-LABEL: test_broadcast_4i32_8i32:
61; X64:       # %bb.0:
62; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
63; X64-NEXT:    retq
64 %1 = load <4 x i32>, ptr%p
65 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
66 ret <8 x i32> %2
67}
68
; Subvector splat of a loaded <8 x i16> into a <16 x i16> (mask repeats
; elements 0-7); expects a single vbroadcastf128 from memory.
69define <16 x i16> @test_broadcast_8i16_16i16(ptr%p) nounwind {
70; X86-LABEL: test_broadcast_8i16_16i16:
71; X86:       # %bb.0:
72; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
73; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
74; X86-NEXT:    retl
75;
76; X64-LABEL: test_broadcast_8i16_16i16:
77; X64:       # %bb.0:
78; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
79; X64-NEXT:    retq
80 %1 = load <8 x i16>, ptr%p
81 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
82 ret <16 x i16> %2
83}
84
; Subvector splat of a loaded <16 x i8> into a <32 x i8> (mask repeats
; elements 0-15); expects a single vbroadcastf128 from memory.
85define <32 x i8> @test_broadcast_16i8_32i8(ptr%p) nounwind {
86; X86-LABEL: test_broadcast_16i8_32i8:
87; X86:       # %bb.0:
88; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
89; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
90; X86-NEXT:    retl
91;
92; X64-LABEL: test_broadcast_16i8_32i8:
93; X64:       # %bb.0:
94; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
95; X64-NEXT:    retq
96 %1 = load <16 x i8>, ptr%p
97 %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
98 ret <32 x i8> %2
99}
100
101; PR38949 - https://bugs.llvm.org/show_bug.cgi?id=38949
102; Don't limit the transform based on extra uses of the load itself (the store is a user of the load's chain value).
103
; PR38949 regression: the store only reuses the load's chain value, not the
; loaded data, so the load+shuffle must still fold to vbroadcastf128. Note
; the unaligned (align 1) load is folded and the splat is stored with an
; unaligned vmovups.
104define void @subv_reuse_is_ok(ptr %a, ptr %b) {
105; X86-LABEL: subv_reuse_is_ok:
106; X86:       # %bb.0:
107; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
108; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
109; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
110; X86-NEXT:    vmovups %ymm0, (%eax)
111; X86-NEXT:    vzeroupper
112; X86-NEXT:    retl
113;
114; X64-LABEL: subv_reuse_is_ok:
115; X64:       # %bb.0:
116; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
117; X64-NEXT:    vmovups %ymm0, (%rsi)
118; X64-NEXT:    vzeroupper
119; X64-NEXT:    retq
120  %ld = load <4 x float>, ptr %a, align 1
121  %splat128 = shufflevector <4 x float> %ld, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
122  store <8 x float> %splat128, ptr %b, align 16
123  ret void
124}
125
; The loaded <2 x double> has a second use (stored to %p1), but the broadcast
; should still form; the 128-bit reuse value is taken from the low half of
; the broadcast result (vmovaps %xmm0) instead of a second load.
126define <4 x double> @test_broadcast_2f64_4f64_reuse(ptr %p0, ptr %p1) {
127; X86-LABEL: test_broadcast_2f64_4f64_reuse:
128; X86:       # %bb.0:
129; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
130; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
131; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
132; X86-NEXT:    vmovaps %xmm0, (%eax)
133; X86-NEXT:    retl
134;
135; X64-LABEL: test_broadcast_2f64_4f64_reuse:
136; X64:       # %bb.0:
137; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
138; X64-NEXT:    vmovaps %xmm0, (%rsi)
139; X64-NEXT:    retq
140 %1 = load <2 x double>, ptr %p0
141 %2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
142 store <2 x double> %1, ptr %p1
143 ret <4 x double> %2
144}
145
; Same load-reuse pattern for <2 x i64> -> <4 x i64>: broadcast plus a
; 128-bit store of the low half; the integer type still selects FP-domain
; instructions under AVX1.
146define <4 x i64> @test_broadcast_2i64_4i64_reuse(ptr %p0, ptr %p1) {
147; X86-LABEL: test_broadcast_2i64_4i64_reuse:
148; X86:       # %bb.0:
149; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
150; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
151; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
152; X86-NEXT:    vmovaps %xmm0, (%eax)
153; X86-NEXT:    retl
154;
155; X64-LABEL: test_broadcast_2i64_4i64_reuse:
156; X64:       # %bb.0:
157; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
158; X64-NEXT:    vmovaps %xmm0, (%rsi)
159; X64-NEXT:    retq
160 %1 = load <2 x i64>, ptr %p0
161 %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
162 store <2 x i64> %1, ptr %p1
163 ret <4 x i64> %2
164}
165
; Load-reuse pattern for <4 x float> -> <8 x float>: broadcast forms and the
; reused 128-bit value is stored from the broadcast's low half.
166define <8 x float> @test_broadcast_4f32_8f32_reuse(ptr %p0, ptr %p1) {
167; X86-LABEL: test_broadcast_4f32_8f32_reuse:
168; X86:       # %bb.0:
169; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
170; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
171; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
172; X86-NEXT:    vmovaps %xmm0, (%eax)
173; X86-NEXT:    retl
174;
175; X64-LABEL: test_broadcast_4f32_8f32_reuse:
176; X64:       # %bb.0:
177; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
178; X64-NEXT:    vmovaps %xmm0, (%rsi)
179; X64-NEXT:    retq
180 %1 = load <4 x float>, ptr %p0
181 %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
182 store <4 x float> %1, ptr %p1
183 ret <8 x float> %2
184}
185
; Load-reuse pattern for <4 x i32> -> <8 x i32>: broadcast forms and the
; reused 128-bit value is stored from the broadcast's low half.
186define <8 x i32> @test_broadcast_4i32_8i32_reuse(ptr %p0, ptr %p1) {
187; X86-LABEL: test_broadcast_4i32_8i32_reuse:
188; X86:       # %bb.0:
189; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
190; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
191; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
192; X86-NEXT:    vmovaps %xmm0, (%eax)
193; X86-NEXT:    retl
194;
195; X64-LABEL: test_broadcast_4i32_8i32_reuse:
196; X64:       # %bb.0:
197; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
198; X64-NEXT:    vmovaps %xmm0, (%rsi)
199; X64-NEXT:    retq
200 %1 = load <4 x i32>, ptr %p0
201 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
202 store <4 x i32> %1, ptr %p1
203 ret <8 x i32> %2
204}
205
; Load-reuse pattern for <8 x i16> -> <16 x i16>: broadcast forms and the
; reused 128-bit value is stored from the broadcast's low half.
206define <16 x i16> @test_broadcast_8i16_16i16_reuse(ptr%p0, ptr%p1) nounwind {
207; X86-LABEL: test_broadcast_8i16_16i16_reuse:
208; X86:       # %bb.0:
209; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
210; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
211; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
212; X86-NEXT:    vmovaps %xmm0, (%eax)
213; X86-NEXT:    retl
214;
215; X64-LABEL: test_broadcast_8i16_16i16_reuse:
216; X64:       # %bb.0:
217; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
218; X64-NEXT:    vmovaps %xmm0, (%rsi)
219; X64-NEXT:    retq
220 %1 = load <8 x i16>, ptr%p0
221 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
222 store <8 x i16> %1, ptr %p1
223 ret <16 x i16> %2
224}
225
; Load-reuse pattern for <16 x i8> -> <32 x i8>: broadcast forms and the
; reused 128-bit value is stored from the broadcast's low half.
226define <32 x i8> @test_broadcast_16i8_32i8_reuse(ptr%p0, ptr%p1) nounwind {
227; X86-LABEL: test_broadcast_16i8_32i8_reuse:
228; X86:       # %bb.0:
229; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
230; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
231; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
232; X86-NEXT:    vmovaps %xmm0, (%eax)
233; X86-NEXT:    retl
234;
235; X64-LABEL: test_broadcast_16i8_32i8_reuse:
236; X64:       # %bb.0:
237; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
238; X64-NEXT:    vmovaps %xmm0, (%rsi)
239; X64-NEXT:    retq
240 %1 = load <16 x i8>, ptr%p0
241 %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
242 store <16 x i8> %1, ptr %p1
243 ret <32 x i8> %2
244}
245
; PR29088: an unrelated 32-byte zero store between the load and the shuffle
; must not prevent folding the load into vbroadcastf128; the zero vector is
; materialized separately (vxorps) and stored after the broadcast.
246define <8 x i32> @PR29088(ptr %p0, ptr %p1) {
247; X86-LABEL: PR29088:
248; X86:       # %bb.0:
249; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
250; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
251; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
252; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
253; X86-NEXT:    vmovaps %ymm1, (%eax)
254; X86-NEXT:    retl
255;
256; X64-LABEL: PR29088:
257; X64:       # %bb.0:
258; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
259; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
260; X64-NEXT:    vmovaps %ymm1, (%rsi)
261; X64-NEXT:    retq
262  %ld = load <4 x i32>, ptr %p0
263  store <8 x float> zeroinitializer, ptr %p1
264  %shuf = shufflevector <4 x i32> %ld, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
265  ret <8 x i32> %shuf
266}
267