xref: /llvm-project/llvm/test/CodeGen/X86/horizontal-shuffle-demanded.ll (revision 2f448bf509432c1a19ec46ab8cbc7353c03c6280)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X86
3; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X64
4
5;
6; 128-bit Vectors
7;
8
; Only element 0 of the haddps result is demanded; that lane is computed from
; %a0 alone, so the splat of %a1 is simplified away and codegen reuses %xmm0
; for both hadd operands (see the autogenerated CHECK lines below).
9define void @test_demanded_haddps_128(<4 x float> %a0, <4 x float> %a1, ptr%a2) nounwind {
10; X86-LABEL: test_demanded_haddps_128:
11; X86:       ## %bb.0:
12; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
13; X86-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
14; X86-NEXT:    vmovss %xmm0, (%eax)
15; X86-NEXT:    retl
16;
17; X64-LABEL: test_demanded_haddps_128:
18; X64:       ## %bb.0:
19; X64-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
20; X64-NEXT:    vmovss %xmm0, (%rdi)
21; X64-NEXT:    retq
22  %1 = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> zeroinitializer
23  %2 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a0, <4 x float> %1)
24  %3 = extractelement <4 x float> %2, i32 0
25  store float %3, ptr%a2
26  ret void
27}
28
; Only element 2 of the hsubps result is demanded; the demanded-elements
; simplification drops the splat of %a0, so codegen feeds the raw operands
; straight into vhsubps (see the autogenerated CHECK lines below).
29define void @test_demanded_hsubps_128(<4 x float> %a0, <4 x float> %a1, ptr%a2) nounwind {
30; X86-LABEL: test_demanded_hsubps_128:
31; X86:       ## %bb.0:
32; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
33; X86-NEXT:    vhsubps %xmm1, %xmm0, %xmm0
34; X86-NEXT:    vextractps $2, %xmm0, (%eax)
35; X86-NEXT:    retl
36;
37; X64-LABEL: test_demanded_hsubps_128:
38; X64:       ## %bb.0:
39; X64-NEXT:    vhsubps %xmm1, %xmm0, %xmm0
40; X64-NEXT:    vextractps $2, %xmm0, (%rdi)
41; X64-NEXT:    retq
42  %1 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer
43  %2 = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %1, <4 x float> %a1)
44  %3 = extractelement <4 x float> %2, i32 2
45  store float %3, ptr%a2
46  ret void
47}
48
; Only element 0 of the haddpd result is demanded; that lane depends only on
; %a0, so the splat of %a1 is removed and %xmm0 is reused for both operands
; (see the autogenerated CHECK lines below).
49define void @test_demanded_haddpd_128(<2 x double> %a0, <2 x double> %a1, ptr%a2) nounwind {
50; X86-LABEL: test_demanded_haddpd_128:
51; X86:       ## %bb.0:
52; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
53; X86-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
54; X86-NEXT:    vmovlpd %xmm0, (%eax)
55; X86-NEXT:    retl
56;
57; X64-LABEL: test_demanded_haddpd_128:
58; X64:       ## %bb.0:
59; X64-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
60; X64-NEXT:    vmovlpd %xmm0, (%rdi)
61; X64-NEXT:    retq
62  %1 = shufflevector <2 x double> %a1, <2 x double> undef, <2 x i32> zeroinitializer
63  %2 = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %a0, <2 x double> %1)
64  %3 = extractelement <2 x double> %2, i32 0
65  store double %3, ptr%a2
66  ret void
67}
68
; Only element 0 of the hsubpd result is demanded; that lane depends only on
; %a0, so the splat of %a1 is removed and %xmm0 is reused for both operands
; (see the autogenerated CHECK lines below).
69define void @test_demanded_hsubpd_128(<2 x double> %a0, <2 x double> %a1, ptr%a2) nounwind {
70; X86-LABEL: test_demanded_hsubpd_128:
71; X86:       ## %bb.0:
72; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
73; X86-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
74; X86-NEXT:    vmovlpd %xmm0, (%eax)
75; X86-NEXT:    retl
76;
77; X64-LABEL: test_demanded_hsubpd_128:
78; X64:       ## %bb.0:
79; X64-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
80; X64-NEXT:    vmovlpd %xmm0, (%rdi)
81; X64-NEXT:    retq
82  %1 = shufflevector <2 x double> %a1, <2 x double> undef, <2 x i32> zeroinitializer
83  %2 = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %a0, <2 x double> %1)
84  %3 = extractelement <2 x double> %2, i32 0
85  store double %3, ptr%a2
86  ret void
87}
88
; Integer variant: only element 0 of the phaddd result is demanded, so the
; splat of %a1 is simplified away and %xmm0 is reused for both operands
; (see the autogenerated CHECK lines below).
89define void @test_demanded_phaddd_128(<4 x i32> %a0, <4 x i32> %a1, ptr%a2) nounwind {
90; X86-LABEL: test_demanded_phaddd_128:
91; X86:       ## %bb.0:
92; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
93; X86-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
94; X86-NEXT:    vmovd %xmm0, (%eax)
95; X86-NEXT:    retl
96;
97; X64-LABEL: test_demanded_phaddd_128:
98; X64:       ## %bb.0:
99; X64-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
100; X64-NEXT:    vmovd %xmm0, (%rdi)
101; X64-NEXT:    retq
102  %1 = shufflevector <4 x i32> %a1, <4 x i32> undef, <4 x i32> zeroinitializer
103  %2 = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %a0, <4 x i32> %1)
104  %3 = extractelement <4 x i32> %2, i32 0
105  store i32 %3, ptr%a2
106  ret void
107}
108
; Only element 1 of the phsubd result is demanded, so the splat of %a1 is
; simplified away and %xmm0 is reused for both operands (see the
; autogenerated CHECK lines below).
109define void @test_demanded_phsubd_128(<4 x i32> %a0, <4 x i32> %a1, ptr%a2) nounwind {
110; X86-LABEL: test_demanded_phsubd_128:
111; X86:       ## %bb.0:
112; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
113; X86-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
114; X86-NEXT:    vpextrd $1, %xmm0, (%eax)
115; X86-NEXT:    retl
116;
117; X64-LABEL: test_demanded_phsubd_128:
118; X64:       ## %bb.0:
119; X64-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
120; X64-NEXT:    vpextrd $1, %xmm0, (%rdi)
121; X64-NEXT:    retq
122  %1 = shufflevector <4 x i32> %a1, <4 x i32> undef, <4 x i32> zeroinitializer
123  %2 = call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %a0, <4 x i32> %1)
124  %3 = extractelement <4 x i32> %2, i32 1
125  store i32 %3, ptr%a2
126  ret void
127}
128
; Only element 0 of the phaddw result is demanded, so the splat of %a1 is
; simplified away and %xmm0 is reused for both operands (see the
; autogenerated CHECK lines below).
; NOTE(review): the extractelement index type was i16 here, unlike every
; other test in this file; LangRef allows any integer index type, so
; normalizing to i32 is behavior-identical and keeps the file consistent.
129define void @test_demanded_phaddw_128(<8 x i16> %a0, <8 x i16> %a1, ptr%a2) nounwind {
130; X86-LABEL: test_demanded_phaddw_128:
131; X86:       ## %bb.0:
132; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
133; X86-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
134; X86-NEXT:    vpextrw $0, %xmm0, (%eax)
135; X86-NEXT:    retl
136;
137; X64-LABEL: test_demanded_phaddw_128:
138; X64:       ## %bb.0:
139; X64-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
140; X64-NEXT:    vpextrw $0, %xmm0, (%rdi)
141; X64-NEXT:    retq
142  %1 = shufflevector <8 x i16> %a1, <8 x i16> undef, <8 x i32> zeroinitializer
143  %2 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a0, <8 x i16> %1)
144  %3 = extractelement <8 x i16> %2, i32 0
145  store i16 %3, ptr%a2
146  ret void
147}
148
; Only element 2 of the phsubw result is demanded, so the splat of %a1 is
; simplified away and %xmm0 is reused for both operands (see the
; autogenerated CHECK lines below).
; NOTE(review): the extractelement index type was i16 here, unlike every
; other test in this file; LangRef allows any integer index type, so
; normalizing to i32 is behavior-identical and keeps the file consistent.
149define void @test_demanded_phsubw_128(<8 x i16> %a0, <8 x i16> %a1, ptr%a2) nounwind {
150; X86-LABEL: test_demanded_phsubw_128:
151; X86:       ## %bb.0:
152; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
153; X86-NEXT:    vphsubw %xmm0, %xmm0, %xmm0
154; X86-NEXT:    vpextrw $2, %xmm0, (%eax)
155; X86-NEXT:    retl
156;
157; X64-LABEL: test_demanded_phsubw_128:
158; X64:       ## %bb.0:
159; X64-NEXT:    vphsubw %xmm0, %xmm0, %xmm0
160; X64-NEXT:    vpextrw $2, %xmm0, (%rdi)
161; X64-NEXT:    retq
162  %1 = shufflevector <8 x i16> %a1, <8 x i16> undef, <8 x i32> zeroinitializer
163  %2 = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %a0, <8 x i16> %1)
164  %3 = extractelement <8 x i16> %2, i32 2
165  store i16 %3, ptr%a2
166  ret void
167}
168
169;
170; 256-bit Vectors
171;
172
; 256-bit: only element 4 (upper 128-bit lane) of the haddps result is
; demanded; that lane depends only on %a0, so the splat of %a1 is removed and
; %ymm0 is reused for both operands (see the autogenerated CHECK lines below).
173define void @test_demanded_haddps_256(<8 x float> %a0, <8 x float> %a1, ptr%a2) nounwind {
174; X86-LABEL: test_demanded_haddps_256:
175; X86:       ## %bb.0:
176; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
177; X86-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
178; X86-NEXT:    vextractf128 $1, %ymm0, %xmm0
179; X86-NEXT:    vmovss %xmm0, (%eax)
180; X86-NEXT:    vzeroupper
181; X86-NEXT:    retl
182;
183; X64-LABEL: test_demanded_haddps_256:
184; X64:       ## %bb.0:
185; X64-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
186; X64-NEXT:    vextractf128 $1, %ymm0, %xmm0
187; X64-NEXT:    vmovss %xmm0, (%rdi)
188; X64-NEXT:    vzeroupper
189; X64-NEXT:    retq
190  %1 = shufflevector <8 x float> %a1, <8 x float> undef, <8 x i32> zeroinitializer
191  %2 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a0, <8 x float> %1)
192  %3 = extractelement <8 x float> %2, i32 4
193  store float %3, ptr%a2
194  ret void
195}
196
; Only element 7 (upper 128-bit lane) of the hsubps result is demanded; the
; splat of %a0 is simplified away, leaving the raw operands feeding vhsubps
; (see the autogenerated CHECK lines below).
197define void @test_demanded_hsubps_256(<8 x float> %a0, <8 x float> %a1, ptr%a2) nounwind {
198; X86-LABEL: test_demanded_hsubps_256:
199; X86:       ## %bb.0:
200; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
201; X86-NEXT:    vhsubps %ymm1, %ymm0, %ymm0
202; X86-NEXT:    vextractf128 $1, %ymm0, %xmm0
203; X86-NEXT:    vextractps $3, %xmm0, (%eax)
204; X86-NEXT:    vzeroupper
205; X86-NEXT:    retl
206;
207; X64-LABEL: test_demanded_hsubps_256:
208; X64:       ## %bb.0:
209; X64-NEXT:    vhsubps %ymm1, %ymm0, %ymm0
210; X64-NEXT:    vextractf128 $1, %ymm0, %xmm0
211; X64-NEXT:    vextractps $3, %xmm0, (%rdi)
212; X64-NEXT:    vzeroupper
213; X64-NEXT:    retq
214  %1 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> zeroinitializer
215  %2 = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %1, <8 x float> %a1)
216  %3 = extractelement <8 x float> %2, i32 7
217  store float %3, ptr%a2
218  ret void
219}
220
; Only element 2 (upper 128-bit lane) of the haddpd result is demanded; that
; lane depends only on %a0, so the splat of %a1 is removed and %ymm0 is
; reused for both operands (see the autogenerated CHECK lines below).
221define void @test_demanded_haddpd_256(<4 x double> %a0, <4 x double> %a1, ptr%a2) nounwind {
222; X86-LABEL: test_demanded_haddpd_256:
223; X86:       ## %bb.0:
224; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
225; X86-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
226; X86-NEXT:    vextractf128 $1, %ymm0, %xmm0
227; X86-NEXT:    vmovlpd %xmm0, (%eax)
228; X86-NEXT:    vzeroupper
229; X86-NEXT:    retl
230;
231; X64-LABEL: test_demanded_haddpd_256:
232; X64:       ## %bb.0:
233; X64-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
234; X64-NEXT:    vextractf128 $1, %ymm0, %xmm0
235; X64-NEXT:    vmovlpd %xmm0, (%rdi)
236; X64-NEXT:    vzeroupper
237; X64-NEXT:    retq
238  %1 = shufflevector <4 x double> %a1, <4 x double> undef, <4 x i32> zeroinitializer
239  %2 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %a0, <4 x double> %1)
240  %3 = extractelement <4 x double> %2, i32 2
241  store double %3, ptr%a2
242  ret void
243}
244
; Only element 2 (upper 128-bit lane) of the hsubpd result is demanded; that
; lane depends only on %a0, so the splat of %a1 is removed and %ymm0 is
; reused for both operands (see the autogenerated CHECK lines below).
245define void @test_demanded_hsubpd_256(<4 x double> %a0, <4 x double> %a1, ptr%a2) nounwind {
246; X86-LABEL: test_demanded_hsubpd_256:
247; X86:       ## %bb.0:
248; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
249; X86-NEXT:    vhsubpd %ymm0, %ymm0, %ymm0
250; X86-NEXT:    vextractf128 $1, %ymm0, %xmm0
251; X86-NEXT:    vmovlpd %xmm0, (%eax)
252; X86-NEXT:    vzeroupper
253; X86-NEXT:    retl
254;
255; X64-LABEL: test_demanded_hsubpd_256:
256; X64:       ## %bb.0:
257; X64-NEXT:    vhsubpd %ymm0, %ymm0, %ymm0
258; X64-NEXT:    vextractf128 $1, %ymm0, %xmm0
259; X64-NEXT:    vmovlpd %xmm0, (%rdi)
260; X64-NEXT:    vzeroupper
261; X64-NEXT:    retq
262  %1 = shufflevector <4 x double> %a1, <4 x double> undef, <4 x i32> zeroinitializer
263  %2 = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %a0, <4 x double> %1)
264  %3 = extractelement <4 x double> %2, i32 2
265  store double %3, ptr%a2
266  ret void
267}
268
; Only element 7 (upper 128-bit lane) of the phaddd result is demanded; the
; splat of %a0 is simplified away, leaving the raw operands feeding vphaddd
; (see the autogenerated CHECK lines below).
269define void @test_demanded_phaddd_256(<8 x i32> %a0, <8 x i32> %a1, ptr%a2) nounwind {
270; X86-LABEL: test_demanded_phaddd_256:
271; X86:       ## %bb.0:
272; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
273; X86-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
274; X86-NEXT:    vextracti128 $1, %ymm0, %xmm0
275; X86-NEXT:    vpextrd $3, %xmm0, (%eax)
276; X86-NEXT:    vzeroupper
277; X86-NEXT:    retl
278;
279; X64-LABEL: test_demanded_phaddd_256:
280; X64:       ## %bb.0:
281; X64-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
282; X64-NEXT:    vextracti128 $1, %ymm0, %xmm0
283; X64-NEXT:    vpextrd $3, %xmm0, (%rdi)
284; X64-NEXT:    vzeroupper
285; X64-NEXT:    retq
286  %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> zeroinitializer
287  %2 = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %1, <8 x i32> %a1)
288  %3 = extractelement <8 x i32> %2, i32 7
289  store i32 %3, ptr%a2
290  ret void
291}
292
; Only element 5 (upper 128-bit lane) of the phsubd result is demanded; that
; lane depends only on %a0, so the splat of %a1 is removed and %ymm0 is
; reused for both operands (see the autogenerated CHECK lines below).
293define void @test_demanded_phsubd_256(<8 x i32> %a0, <8 x i32> %a1, ptr%a2) nounwind {
294; X86-LABEL: test_demanded_phsubd_256:
295; X86:       ## %bb.0:
296; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
297; X86-NEXT:    vphsubd %ymm0, %ymm0, %ymm0
298; X86-NEXT:    vextracti128 $1, %ymm0, %xmm0
299; X86-NEXT:    vpextrd $1, %xmm0, (%eax)
300; X86-NEXT:    vzeroupper
301; X86-NEXT:    retl
302;
303; X64-LABEL: test_demanded_phsubd_256:
304; X64:       ## %bb.0:
305; X64-NEXT:    vphsubd %ymm0, %ymm0, %ymm0
306; X64-NEXT:    vextracti128 $1, %ymm0, %xmm0
307; X64-NEXT:    vpextrd $1, %xmm0, (%rdi)
308; X64-NEXT:    vzeroupper
309; X64-NEXT:    retq
310  %1 = shufflevector <8 x i32> %a1, <8 x i32> undef, <8 x i32> zeroinitializer
311  %2 = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %a0, <8 x i32> %1)
312  %3 = extractelement <8 x i32> %2, i32 5
313  store i32 %3, ptr%a2
314  ret void
315}
316
; Only element 4 (low 128-bit lane) of the 256-bit phaddw result is demanded;
; %a0 drops out entirely and the whole operation is narrowed to a 128-bit
; vpbroadcastw + vphaddw (see the autogenerated CHECK lines below).
317define void @test_demanded_phaddw_256(<16 x i16> %a0, <16 x i16> %a1, ptr%a2) nounwind {
318; X86-LABEL: test_demanded_phaddw_256:
319; X86:       ## %bb.0:
320; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
321; X86-NEXT:    vpbroadcastw %xmm1, %xmm0
322; X86-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
323; X86-NEXT:    vpextrw $4, %xmm0, (%eax)
324; X86-NEXT:    vzeroupper
325; X86-NEXT:    retl
326;
327; X64-LABEL: test_demanded_phaddw_256:
328; X64:       ## %bb.0:
329; X64-NEXT:    vpbroadcastw %xmm1, %xmm0
330; X64-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
331; X64-NEXT:    vpextrw $4, %xmm0, (%rdi)
332; X64-NEXT:    vzeroupper
333; X64-NEXT:    retq
334  %1 = shufflevector <16 x i16> %a1, <16 x i16> undef, <16 x i32> zeroinitializer
335  %2 = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %a0, <16 x i16> %1)
336  %3 = extractelement <16 x i16> %2, i32 4
337  store i16 %3, ptr%a2
338  ret void
339}
340
; Only element 6 (low 128-bit lane) of the 256-bit phsubw result is demanded;
; the splat of %a0 is simplified away and the operation is narrowed to a
; 128-bit vphsubw on the raw operands (see the autogenerated CHECK lines).
341define void @test_demanded_phsubw_256(<16 x i16> %a0, <16 x i16> %a1, ptr%a2) nounwind {
342; X86-LABEL: test_demanded_phsubw_256:
343; X86:       ## %bb.0:
344; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
345; X86-NEXT:    vphsubw %xmm1, %xmm0, %xmm0
346; X86-NEXT:    vpextrw $6, %xmm0, (%eax)
347; X86-NEXT:    vzeroupper
348; X86-NEXT:    retl
349;
350; X64-LABEL: test_demanded_phsubw_256:
351; X64:       ## %bb.0:
352; X64-NEXT:    vphsubw %xmm1, %xmm0, %xmm0
353; X64-NEXT:    vpextrw $6, %xmm0, (%rdi)
354; X64-NEXT:    vzeroupper
355; X64-NEXT:    retq
356  %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> zeroinitializer
357  %2 = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %1, <16 x i16> %a1)
358  %3 = extractelement <16 x i16> %2, i32 6
359  store i16 %3, ptr%a2
360  ret void
361}
362
; SSE3 floating-point horizontal add/sub intrinsics (128-bit).
363declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>)
364declare <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float>, <4 x float>)
365declare <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double>, <2 x double>)
366declare <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double>, <2 x double>)
367
; SSSE3 integer horizontal add/sub intrinsics (128-bit).
368declare <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16>, <8 x i16>)
369declare <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32>, <4 x i32>)
370declare <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16>, <8 x i16>)
371declare <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32>, <4 x i32>)
372
; AVX floating-point horizontal add/sub intrinsics (256-bit).
373declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>)
374declare <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float>, <8 x float>)
375declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>)
376declare <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double>, <4 x double>)
377
; AVX2 integer horizontal add/sub intrinsics (256-bit).
378declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>)
379declare <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32>, <8 x i32>)
380declare <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16>, <16 x i16>)
381declare <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32>, <8 x i32>)
382