xref: /llvm-project/llvm/test/CodeGen/PowerPC/vector-reduce-mul.ll (revision e9d12c248013b2d2b9880436727857e0ec8a7085)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
3; RUN:   -mcpu=pwr9 -mtriple=powerpc64le < %s | FileCheck %s --check-prefix=PWR9LE
4; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
5; RUN:   -mcpu=pwr9 -mtriple=powerpc64 < %s | FileCheck %s --check-prefix=PWR9BE
6; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
7; RUN:   -mcpu=pwr10 -mtriple=powerpc64le < %s | FileCheck %s --check-prefix=PWR10LE
8; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
9; RUN:   -mcpu=pwr10 -mtriple=powerpc64 < %s | FileCheck %s --check-prefix=PWR10BE
10
; v2i32: multiply-reduce a <2 x i32> argument to one i32 by calling
; @llvm.vector.reduce.mul.v2i32 and returning its result.
; The assertions below expect the same five-instruction shape from every run:
; one xxspltw, an li of 0 into r3, one vmuluwm, a word extract into r3, blr.
; The little-endian runs expect splat index 2 and vextuwrx; the big-endian
; runs expect splat index 1 and vextuwlx.
11define dso_local i32 @v2i32(<2 x i32> %a) local_unnamed_addr #0 {
12; PWR9LE-LABEL: v2i32:
13; PWR9LE:       # %bb.0: # %entry
14; PWR9LE-NEXT:    xxspltw v3, v2, 2
15; PWR9LE-NEXT:    li r3, 0
16; PWR9LE-NEXT:    vmuluwm v2, v2, v3
17; PWR9LE-NEXT:    vextuwrx r3, r3, v2
18; PWR9LE-NEXT:    blr
19;
20; PWR9BE-LABEL: v2i32:
21; PWR9BE:       # %bb.0: # %entry
22; PWR9BE-NEXT:    xxspltw v3, v2, 1
23; PWR9BE-NEXT:    li r3, 0
24; PWR9BE-NEXT:    vmuluwm v2, v2, v3
25; PWR9BE-NEXT:    vextuwlx r3, r3, v2
26; PWR9BE-NEXT:    blr
27;
28; PWR10LE-LABEL: v2i32:
29; PWR10LE:       # %bb.0: # %entry
30; PWR10LE-NEXT:    xxspltw v3, v2, 2
31; PWR10LE-NEXT:    li r3, 0
32; PWR10LE-NEXT:    vmuluwm v2, v2, v3
33; PWR10LE-NEXT:    vextuwrx r3, r3, v2
34; PWR10LE-NEXT:    blr
35;
36; PWR10BE-LABEL: v2i32:
37; PWR10BE:       # %bb.0: # %entry
38; PWR10BE-NEXT:    xxspltw v3, v2, 1
39; PWR10BE-NEXT:    li r3, 0
40; PWR10BE-NEXT:    vmuluwm v2, v2, v3
41; PWR10BE-NEXT:    vextuwlx r3, r3, v2
42; PWR10BE-NEXT:    blr
43entry:
44  %0 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> %a)
45  ret i32 %0
46}
47
; v4i32: multiply-reduce a <4 x i32> argument to one i32 by calling
; @llvm.vector.reduce.mul.v4i32 and returning its result.
; The assertions below expect two multiply steps from every run: xxswapd
; followed by vmuluwm, then xxspltw followed by a second vmuluwm, and finally
; a word extract into r3 (which was loaded with 0 by li) and blr.
; The little-endian runs expect splat index 2 and vextuwrx; the big-endian
; runs expect splat index 1 and vextuwlx.
48define dso_local i32 @v4i32(<4 x i32> %a) local_unnamed_addr #0 {
49; PWR9LE-LABEL: v4i32:
50; PWR9LE:       # %bb.0: # %entry
51; PWR9LE-NEXT:    xxswapd v3, v2
52; PWR9LE-NEXT:    li r3, 0
53; PWR9LE-NEXT:    vmuluwm v2, v2, v3
54; PWR9LE-NEXT:    xxspltw v3, v2, 2
55; PWR9LE-NEXT:    vmuluwm v2, v2, v3
56; PWR9LE-NEXT:    vextuwrx r3, r3, v2
57; PWR9LE-NEXT:    blr
58;
59; PWR9BE-LABEL: v4i32:
60; PWR9BE:       # %bb.0: # %entry
61; PWR9BE-NEXT:    xxswapd v3, v2
62; PWR9BE-NEXT:    li r3, 0
63; PWR9BE-NEXT:    vmuluwm v2, v2, v3
64; PWR9BE-NEXT:    xxspltw v3, v2, 1
65; PWR9BE-NEXT:    vmuluwm v2, v2, v3
66; PWR9BE-NEXT:    vextuwlx r3, r3, v2
67; PWR9BE-NEXT:    blr
68;
69; PWR10LE-LABEL: v4i32:
70; PWR10LE:       # %bb.0: # %entry
71; PWR10LE-NEXT:    xxswapd v3, v2
72; PWR10LE-NEXT:    li r3, 0
73; PWR10LE-NEXT:    vmuluwm v2, v2, v3
74; PWR10LE-NEXT:    xxspltw v3, v2, 2
75; PWR10LE-NEXT:    vmuluwm v2, v2, v3
76; PWR10LE-NEXT:    vextuwrx r3, r3, v2
77; PWR10LE-NEXT:    blr
78;
79; PWR10BE-LABEL: v4i32:
80; PWR10BE:       # %bb.0: # %entry
81; PWR10BE-NEXT:    xxswapd v3, v2
82; PWR10BE-NEXT:    li r3, 0
83; PWR10BE-NEXT:    vmuluwm v2, v2, v3
84; PWR10BE-NEXT:    xxspltw v3, v2, 1
85; PWR10BE-NEXT:    vmuluwm v2, v2, v3
86; PWR10BE-NEXT:    vextuwlx r3, r3, v2
87; PWR10BE-NEXT:    blr
88entry:
89  %0 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %a)
90  ret i32 %0
91}
92
; v8i32: multiply-reduce an <8 x i32> argument to one i32 by calling
; @llvm.vector.reduce.mul.v8i32 and returning its result.
; The assertions below expect three vmuluwm instructions from every run: the
; first multiplies v2 by v3 (presumably the two registers carrying the
; argument - confirm against the calling convention), the second follows an
; xxswapd, and the third follows an xxspltw. A word extract into r3 and blr
; end the sequence.
; The little-endian runs expect splat index 2 and vextuwrx; the big-endian
; runs expect splat index 1 and vextuwlx.
93define dso_local i32 @v8i32(<8 x i32> %a) local_unnamed_addr #0 {
94; PWR9LE-LABEL: v8i32:
95; PWR9LE:       # %bb.0: # %entry
96; PWR9LE-NEXT:    vmuluwm v2, v2, v3
97; PWR9LE-NEXT:    li r3, 0
98; PWR9LE-NEXT:    xxswapd v3, v2
99; PWR9LE-NEXT:    vmuluwm v2, v2, v3
100; PWR9LE-NEXT:    xxspltw v3, v2, 2
101; PWR9LE-NEXT:    vmuluwm v2, v2, v3
102; PWR9LE-NEXT:    vextuwrx r3, r3, v2
103; PWR9LE-NEXT:    blr
104;
105; PWR9BE-LABEL: v8i32:
106; PWR9BE:       # %bb.0: # %entry
107; PWR9BE-NEXT:    vmuluwm v2, v2, v3
108; PWR9BE-NEXT:    li r3, 0
109; PWR9BE-NEXT:    xxswapd v3, v2
110; PWR9BE-NEXT:    vmuluwm v2, v2, v3
111; PWR9BE-NEXT:    xxspltw v3, v2, 1
112; PWR9BE-NEXT:    vmuluwm v2, v2, v3
113; PWR9BE-NEXT:    vextuwlx r3, r3, v2
114; PWR9BE-NEXT:    blr
115;
116; PWR10LE-LABEL: v8i32:
117; PWR10LE:       # %bb.0: # %entry
118; PWR10LE-NEXT:    vmuluwm v2, v2, v3
119; PWR10LE-NEXT:    li r3, 0
120; PWR10LE-NEXT:    xxswapd v3, v2
121; PWR10LE-NEXT:    vmuluwm v2, v2, v3
122; PWR10LE-NEXT:    xxspltw v3, v2, 2
123; PWR10LE-NEXT:    vmuluwm v2, v2, v3
124; PWR10LE-NEXT:    vextuwrx r3, r3, v2
125; PWR10LE-NEXT:    blr
126;
127; PWR10BE-LABEL: v8i32:
128; PWR10BE:       # %bb.0: # %entry
129; PWR10BE-NEXT:    vmuluwm v2, v2, v3
130; PWR10BE-NEXT:    li r3, 0
131; PWR10BE-NEXT:    xxswapd v3, v2
132; PWR10BE-NEXT:    vmuluwm v2, v2, v3
133; PWR10BE-NEXT:    xxspltw v3, v2, 1
134; PWR10BE-NEXT:    vmuluwm v2, v2, v3
135; PWR10BE-NEXT:    vextuwlx r3, r3, v2
136; PWR10BE-NEXT:    blr
137entry:
138  %0 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> %a)
139  ret i32 %0
140}
141
; v16i32: multiply-reduce a <16 x i32> argument to one i32 by calling
; @llvm.vector.reduce.mul.v16i32 and returning its result.
; The assertions below expect five vmuluwm instructions from every run: v3*v5
; and v2*v4 first, then v2*v3 to combine those two products (presumably v2-v5
; carry the four quarters of the argument - confirm against the calling
; convention), then one after an xxswapd and one after an xxspltw. A word
; extract into r3 and blr end the sequence.
; The little-endian runs expect splat index 2 and vextuwrx; the big-endian
; runs expect splat index 1 and vextuwlx.
142define dso_local i32 @v16i32(<16 x i32> %a) local_unnamed_addr #0 {
143; PWR9LE-LABEL: v16i32:
144; PWR9LE:       # %bb.0: # %entry
145; PWR9LE-NEXT:    vmuluwm v3, v3, v5
146; PWR9LE-NEXT:    vmuluwm v2, v2, v4
147; PWR9LE-NEXT:    li r3, 0
148; PWR9LE-NEXT:    vmuluwm v2, v2, v3
149; PWR9LE-NEXT:    xxswapd v3, v2
150; PWR9LE-NEXT:    vmuluwm v2, v2, v3
151; PWR9LE-NEXT:    xxspltw v3, v2, 2
152; PWR9LE-NEXT:    vmuluwm v2, v2, v3
153; PWR9LE-NEXT:    vextuwrx r3, r3, v2
154; PWR9LE-NEXT:    blr
155;
156; PWR9BE-LABEL: v16i32:
157; PWR9BE:       # %bb.0: # %entry
158; PWR9BE-NEXT:    vmuluwm v3, v3, v5
159; PWR9BE-NEXT:    vmuluwm v2, v2, v4
160; PWR9BE-NEXT:    li r3, 0
161; PWR9BE-NEXT:    vmuluwm v2, v2, v3
162; PWR9BE-NEXT:    xxswapd v3, v2
163; PWR9BE-NEXT:    vmuluwm v2, v2, v3
164; PWR9BE-NEXT:    xxspltw v3, v2, 1
165; PWR9BE-NEXT:    vmuluwm v2, v2, v3
166; PWR9BE-NEXT:    vextuwlx r3, r3, v2
167; PWR9BE-NEXT:    blr
168;
169; PWR10LE-LABEL: v16i32:
170; PWR10LE:       # %bb.0: # %entry
171; PWR10LE-NEXT:    vmuluwm v3, v3, v5
172; PWR10LE-NEXT:    vmuluwm v2, v2, v4
173; PWR10LE-NEXT:    li r3, 0
174; PWR10LE-NEXT:    vmuluwm v2, v2, v3
175; PWR10LE-NEXT:    xxswapd v3, v2
176; PWR10LE-NEXT:    vmuluwm v2, v2, v3
177; PWR10LE-NEXT:    xxspltw v3, v2, 2
178; PWR10LE-NEXT:    vmuluwm v2, v2, v3
179; PWR10LE-NEXT:    vextuwrx r3, r3, v2
180; PWR10LE-NEXT:    blr
181;
182; PWR10BE-LABEL: v16i32:
183; PWR10BE:       # %bb.0: # %entry
184; PWR10BE-NEXT:    vmuluwm v3, v3, v5
185; PWR10BE-NEXT:    vmuluwm v2, v2, v4
186; PWR10BE-NEXT:    li r3, 0
187; PWR10BE-NEXT:    vmuluwm v2, v2, v3
188; PWR10BE-NEXT:    xxswapd v3, v2
189; PWR10BE-NEXT:    vmuluwm v2, v2, v3
190; PWR10BE-NEXT:    xxspltw v3, v2, 1
191; PWR10BE-NEXT:    vmuluwm v2, v2, v3
192; PWR10BE-NEXT:    vextuwlx r3, r3, v2
193; PWR10BE-NEXT:    blr
194entry:
195  %0 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> %a)
196  ret i32 %0
197}
198
; Declarations of the multiply-reduction intrinsics called by the test
; functions, one per tested vector width (2, 4, 8 and 16 lanes of i32).
; Each takes a single vector operand and yields an i32.
199declare i32 @llvm.vector.reduce.mul.v2i32(<2 x i32>) #0
200declare i32 @llvm.vector.reduce.mul.v4i32(<4 x i32>) #0
201declare i32 @llvm.vector.reduce.mul.v8i32(<8 x i32>) #0
202declare i32 @llvm.vector.reduce.mul.v16i32(<16 x i32>) #0
203
; Attribute group #0 is attached to the declarations above and to each test
; function's define line; it contains only nounwind.
204attributes #0 = { nounwind }
205