xref: /llvm-project/llvm/test/Analysis/CostModel/ARM/mve-vecreduce-add.ll (revision 4178e33470763b406f614b646c8b01d24309e20b)
1; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
2; RUN: opt < %s -S -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s
3
4target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
5
; add_i8: cost-model coverage for llvm.vector.reduce.add on i8 vectors of
; 1, 2, 4, 8 and 16 lanes. Every operand is undef, so only the reduction itself
; is costed. The expectation lines below record costs of 4, 54, 2, 2 and 2 for
; the five widths, followed by 0 for the return.
6define void @add_i8() {
7; CHECK-LABEL: 'add_i8'
8; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a0 = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> undef)
9; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %a1 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
10; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
11; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
12; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a4 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
13; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
14;
; Instructions appear in the same order as the expectation lines above, one
; reduction per lane count.
15  %a0 = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> undef)
16
17  %a1 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
18
19  %a2 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
20
21  %a3 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
22
23  %a4 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
24
25  ret void
26}
27
; add_i16: cost-model coverage for llvm.vector.reduce.add producing i16.
; For each lane count (1, 2, 4, 8, 16) the body reduces:
;   - a zext of an undef i8 vector (%aNza -> %aNz),
;   - a sext of an undef i8 vector (%aNsa -> %aNs),
; and then, as %a5..%a9, an undef i16 vector with no extension.
; In the expectation lines below, the cost recorded for each reduce call is
; the same for a given lane count whether its operand is extended or undef
; (4, 54, 2, 2, 4); the extension instructions are costed on their own lines.
28define void @add_i16() {
29; CHECK-LABEL: 'add_i16'
30; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0za = zext <1 x i8> undef to <1 x i16>
31; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a0z = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a0za)
32; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0sa = sext <1 x i8> undef to <1 x i16>
33; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a0s = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a0sa)
34; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a1za = zext <2 x i8> undef to <2 x i16>
35; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %a1z = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a1za)
36; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a1sa = sext <2 x i8> undef to <2 x i16>
37; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %a1s = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a1sa)
38; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2za = zext <4 x i8> undef to <4 x i16>
39; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2z = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a2za)
40; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2sa = sext <4 x i8> undef to <4 x i16>
41; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2s = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a2sa)
42; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3za = zext <8 x i8> undef to <8 x i16>
43; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a3za)
44; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3sa = sext <8 x i8> undef to <8 x i16>
45; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3s = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a3sa)
46; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a4za = zext <16 x i8> undef to <16 x i16>
47; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a4z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a4za)
48; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a4sa = sext <16 x i8> undef to <16 x i16>
49; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a4s = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a4sa)
50; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a5 = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> undef)
51; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %a6 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
52; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
53; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
54; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a9 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
55; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
56;
; i8 -> i16 extensions feeding the reduction, zext then sext per lane count.
57  %a0za = zext <1 x i8> undef to <1 x i16>
58  %a0z = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a0za)
59
60  %a0sa = sext <1 x i8> undef to <1 x i16>
61  %a0s = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a0sa)
62
63  %a1za = zext <2 x i8> undef to <2 x i16>
64  %a1z = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a1za)
65
66  %a1sa = sext <2 x i8> undef to <2 x i16>
67  %a1s = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a1sa)
68
69  %a2za = zext <4 x i8> undef to <4 x i16>
70  %a2z = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a2za)
71
72  %a2sa = sext <4 x i8> undef to <4 x i16>
73  %a2s = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a2sa)
74
75  %a3za = zext <8 x i8> undef to <8 x i16>
76  %a3z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a3za)
77
78  %a3sa = sext <8 x i8> undef to <8 x i16>
79  %a3s = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a3sa)
80
81  %a4za = zext <16 x i8> undef to <16 x i16>
82  %a4z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a4za)
83
84  %a4sa = sext <16 x i8> undef to <16 x i16>
85  %a4s = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a4sa)
86
; Plain i16 reductions of undef operands, no extension.
87  %a5 = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> undef)
88
89  %a6 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
90
91  %a7 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
92
93  %a8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
94
95  %a9 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
96
97  ret void
98}
99
; add_i32: cost-model coverage for llvm.vector.reduce.add producing i32.
; For each lane count (1, 2, 4, 8, 16) the body reduces:
;   - %a0..%a4:   zext and sext of an undef i8 vector to i32,
;   - %a5..%a9:   zext and sext of an undef i16 vector to i32,
;   - %a10..%a14: an undef i32 vector with no extension.
; In the expectation lines below, the reduce call for a given lane count has the
; same recorded cost in all three groups (4, 54, 2, 4, 8); the extension
; instructions are costed on their own lines.
100define void @add_i32() {
101; CHECK-LABEL: 'add_i32'
102; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0za = zext <1 x i8> undef to <1 x i32>
103; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a0z = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a0za)
104; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0sa = sext <1 x i8> undef to <1 x i32>
105; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a0s = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a0sa)
106; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a1za = zext <2 x i8> undef to <2 x i32>
107; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %a1z = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a1za)
108; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a1sa = sext <2 x i8> undef to <2 x i32>
109; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %a1s = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a1sa)
110; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a2za = zext <4 x i8> undef to <4 x i32>
111; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a2za)
112; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a2sa = sext <4 x i8> undef to <4 x i32>
113; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2s = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a2sa)
114; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a3za = zext <8 x i8> undef to <8 x i32>
115; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a3z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a3za)
116; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a3sa = sext <8 x i8> undef to <8 x i32>
117; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a3s = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a3sa)
118; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %a4za = zext <16 x i8> undef to <16 x i32>
119; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a4z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a4za)
120; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %a4sa = sext <16 x i8> undef to <16 x i32>
121; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a4s = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a4sa)
122; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a5za = zext <1 x i16> undef to <1 x i32>
123; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a5z = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a5za)
124; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a5sa = sext <1 x i16> undef to <1 x i32>
125; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a5s = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a5sa)
126; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a6za = zext <2 x i16> undef to <2 x i32>
127; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %a6z = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a6za)
128; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a6sa = sext <2 x i16> undef to <2 x i32>
129; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %a6s = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a6sa)
130; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7za = zext <4 x i16> undef to <4 x i32>
131; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a7za)
132; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7sa = sext <4 x i16> undef to <4 x i32>
133; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7s = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a7sa)
134; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a8za = zext <8 x i16> undef to <8 x i32>
135; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a8z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a8za)
136; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a8sa = sext <8 x i16> undef to <8 x i32>
137; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a8s = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a8sa)
138; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %a9za = zext <16 x i16> undef to <16 x i32>
139; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a9z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a9za)
140; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %a9sa = sext <16 x i16> undef to <16 x i32>
141; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a9s = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a9sa)
142; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a10 = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> undef)
143; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %a11 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef)
144; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a12 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
145; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a13 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
146; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a14 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef)
147; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
148;
; i8 -> i32 extensions feeding the reduction, zext then sext per lane count.
149  %a0za = zext <1 x i8> undef to <1 x i32>
150  %a0z = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a0za)
151
152  %a0sa = sext <1 x i8> undef to <1 x i32>
153  %a0s = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a0sa)
154
155  %a1za = zext <2 x i8> undef to <2 x i32>
156  %a1z = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a1za)
157
158  %a1sa = sext <2 x i8> undef to <2 x i32>
159  %a1s = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a1sa)
160
161  %a2za = zext <4 x i8> undef to <4 x i32>
162  %a2z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a2za)
163
164  %a2sa = sext <4 x i8> undef to <4 x i32>
165  %a2s = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a2sa)
166
167  %a3za = zext <8 x i8> undef to <8 x i32>
168  %a3z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a3za)
169
170  %a3sa = sext <8 x i8> undef to <8 x i32>
171  %a3s = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a3sa)
172
173  %a4za = zext <16 x i8> undef to <16 x i32>
174  %a4z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a4za)
175
176  %a4sa = sext <16 x i8> undef to <16 x i32>
177  %a4s = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a4sa)
178
; i16 -> i32 extensions feeding the reduction, zext then sext per lane count.
179  %a5za = zext <1 x i16> undef to <1 x i32>
180  %a5z = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a5za)
181
182  %a5sa = sext <1 x i16> undef to <1 x i32>
183  %a5s = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a5sa)
184
185  %a6za = zext <2 x i16> undef to <2 x i32>
186  %a6z = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a6za)
187
188  %a6sa = sext <2 x i16> undef to <2 x i32>
189  %a6s = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a6sa)
190
191  %a7za = zext <4 x i16> undef to <4 x i32>
192  %a7z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a7za)
193
194  %a7sa = sext <4 x i16> undef to <4 x i32>
195  %a7s = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a7sa)
196
197  %a8za = zext <8 x i16> undef to <8 x i32>
198  %a8z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a8za)
199
200  %a8sa = sext <8 x i16> undef to <8 x i32>
201  %a8s = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a8sa)
202
203  %a9za = zext <16 x i16> undef to <16 x i32>
204  %a9z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a9za)
205
206  %a9sa = sext <16 x i16> undef to <16 x i32>
207  %a9s = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a9sa)
208
; Plain i32 reductions of undef operands, no extension.
209  %a10 = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> undef)
210
211  %a11 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef)
212
213  %a12 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
214
215  %a13 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
216
217  %a14 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef)
218
219  ret void
220}
221
; add_i64: cost-model coverage for llvm.vector.reduce.add producing i64.
; For each lane count (1, 2, 4, 8, 16) the body reduces:
;   - %a0..%a4:   zext and sext of an undef i8 vector to i64,
;   - %a5..%a9:   zext and sext of an undef i16 vector to i64,
;   - %a10..%a14: zext and sext of an undef i32 vector to i64,
;   - %a15..%a19: an undef i64 vector with no extension.
; In the expectation lines below, the reduce call for a given lane count has the
; same recorded cost in all four groups (8, 108, 208, 408, 808); the extension
; instructions are costed on their own lines, and the recorded zext and sext
; costs differ for most source types and lane counts.
222define void @add_i64() {
223; CHECK-LABEL: 'add_i64'
224; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %a0za = zext <1 x i8> undef to <1 x i64>
225; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a0z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a0za)
226; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %a0sa = sext <1 x i8> undef to <1 x i64>
227; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a0s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a0sa)
228; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a1za = zext <2 x i8> undef to <2 x i64>
229; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %a1z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a1za)
230; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %a1sa = sext <2 x i8> undef to <2 x i64>
231; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %a1s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a1sa)
232; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a2za = zext <4 x i8> undef to <4 x i64>
233; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a2z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a2za)
234; CHECK-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %a2sa = sext <4 x i8> undef to <4 x i64>
235; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a2s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a2sa)
236; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %a3za = zext <8 x i8> undef to <8 x i64>
237; CHECK-NEXT:  Cost Model: Found an estimated cost of 408 for instruction: %a3z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a3za)
238; CHECK-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %a3sa = sext <8 x i8> undef to <8 x i64>
239; CHECK-NEXT:  Cost Model: Found an estimated cost of 408 for instruction: %a3s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a3sa)
240; CHECK-NEXT:  Cost Model: Found an estimated cost of 298 for instruction: %a4za = zext <16 x i8> undef to <16 x i64>
241; CHECK-NEXT:  Cost Model: Found an estimated cost of 808 for instruction: %a4z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a4za)
242; CHECK-NEXT:  Cost Model: Found an estimated cost of 1322 for instruction: %a4sa = sext <16 x i8> undef to <16 x i64>
243; CHECK-NEXT:  Cost Model: Found an estimated cost of 808 for instruction: %a4s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a4sa)
244; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %a5za = zext <1 x i16> undef to <1 x i64>
245; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a5z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a5za)
246; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %a5sa = sext <1 x i16> undef to <1 x i64>
247; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a5s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a5sa)
248; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a6za = zext <2 x i16> undef to <2 x i64>
249; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %a6z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a6za)
250; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %a6sa = sext <2 x i16> undef to <2 x i64>
251; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %a6s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a6sa)
252; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a7za = zext <4 x i16> undef to <4 x i64>
253; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a7z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a7za)
254; CHECK-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %a7sa = sext <4 x i16> undef to <4 x i64>
255; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a7s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a7sa)
256; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %a8za = zext <8 x i16> undef to <8 x i64>
257; CHECK-NEXT:  Cost Model: Found an estimated cost of 408 for instruction: %a8z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a8za)
258; CHECK-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %a8sa = sext <8 x i16> undef to <8 x i64>
259; CHECK-NEXT:  Cost Model: Found an estimated cost of 408 for instruction: %a8s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a8sa)
260; CHECK-NEXT:  Cost Model: Found an estimated cost of 296 for instruction: %a9za = zext <16 x i16> undef to <16 x i64>
261; CHECK-NEXT:  Cost Model: Found an estimated cost of 808 for instruction: %a9z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a9za)
262; CHECK-NEXT:  Cost Model: Found an estimated cost of 1320 for instruction: %a9sa = sext <16 x i16> undef to <16 x i64>
263; CHECK-NEXT:  Cost Model: Found an estimated cost of 808 for instruction: %a9s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a9sa)
264; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %a10za = zext <1 x i32> undef to <1 x i64>
265; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a10z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a10za)
266; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %a10sa = sext <1 x i32> undef to <1 x i64>
267; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a10s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a10sa)
268; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a11za = zext <2 x i32> undef to <2 x i64>
269; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %a11z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a11za)
270; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %a11sa = sext <2 x i32> undef to <2 x i64>
271; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %a11s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a11sa)
272; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a12za = zext <4 x i32> undef to <4 x i64>
273; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a12z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a12za)
274; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %a12sa = sext <4 x i32> undef to <4 x i64>
275; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a12s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a12sa)
276; CHECK-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %a13za = zext <8 x i32> undef to <8 x i64>
277; CHECK-NEXT:  Cost Model: Found an estimated cost of 408 for instruction: %a13z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a13za)
278; CHECK-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %a13sa = sext <8 x i32> undef to <8 x i64>
279; CHECK-NEXT:  Cost Model: Found an estimated cost of 408 for instruction: %a13s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a13sa)
280; CHECK-NEXT:  Cost Model: Found an estimated cost of 288 for instruction: %a14za = zext <16 x i32> undef to <16 x i64>
281; CHECK-NEXT:  Cost Model: Found an estimated cost of 808 for instruction: %a14z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a14za)
282; CHECK-NEXT:  Cost Model: Found an estimated cost of 1056 for instruction: %a14sa = sext <16 x i32> undef to <16 x i64>
283; CHECK-NEXT:  Cost Model: Found an estimated cost of 808 for instruction: %a14s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a14sa)
284; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a15 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef)
285; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %a16 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
286; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a17 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
287; CHECK-NEXT:  Cost Model: Found an estimated cost of 408 for instruction: %a18 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
288; CHECK-NEXT:  Cost Model: Found an estimated cost of 808 for instruction: %a19 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
289; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
290;
; i8 -> i64 extensions feeding the reduction, zext then sext per lane count.
291  %a0za = zext <1 x i8> undef to <1 x i64>
292  %a0z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a0za)
293
294  %a0sa = sext <1 x i8> undef to <1 x i64>
295  %a0s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a0sa)
296
297  %a1za = zext <2 x i8> undef to <2 x i64>
298  %a1z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a1za)
299
300  %a1sa = sext <2 x i8> undef to <2 x i64>
301  %a1s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a1sa)
302
303  %a2za = zext <4 x i8> undef to <4 x i64>
304  %a2z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a2za)
305
306  %a2sa = sext <4 x i8> undef to <4 x i64>
307  %a2s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a2sa)
308
309  %a3za = zext <8 x i8> undef to <8 x i64>
310  %a3z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a3za)
311
312  %a3sa = sext <8 x i8> undef to <8 x i64>
313  %a3s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a3sa)
314
315  %a4za = zext <16 x i8> undef to <16 x i64>
316  %a4z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a4za)
317
318  %a4sa = sext <16 x i8> undef to <16 x i64>
319  %a4s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a4sa)
320
; i16 -> i64 extensions feeding the reduction, zext then sext per lane count.
321  %a5za = zext <1 x i16> undef to <1 x i64>
322  %a5z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a5za)
323
324  %a5sa = sext <1 x i16> undef to <1 x i64>
325  %a5s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a5sa)
326
327  %a6za = zext <2 x i16> undef to <2 x i64>
328  %a6z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a6za)
329
330  %a6sa = sext <2 x i16> undef to <2 x i64>
331  %a6s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a6sa)
332
333  %a7za = zext <4 x i16> undef to <4 x i64>
334  %a7z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a7za)
335
336  %a7sa = sext <4 x i16> undef to <4 x i64>
337  %a7s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a7sa)
338
339  %a8za = zext <8 x i16> undef to <8 x i64>
340  %a8z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a8za)
341
342  %a8sa = sext <8 x i16> undef to <8 x i64>
343  %a8s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a8sa)
344
345  %a9za = zext <16 x i16> undef to <16 x i64>
346  %a9z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a9za)
347
348  %a9sa = sext <16 x i16> undef to <16 x i64>
349  %a9s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a9sa)
350
; i32 -> i64 extensions feeding the reduction, zext then sext per lane count.
351  %a10za = zext <1 x i32> undef to <1 x i64>
352  %a10z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a10za)
353
354  %a10sa = sext <1 x i32> undef to <1 x i64>
355  %a10s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a10sa)
356
357  %a11za = zext <2 x i32> undef to <2 x i64>
358  %a11z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a11za)
359
360  %a11sa = sext <2 x i32> undef to <2 x i64>
361  %a11s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a11sa)
362
363  %a12za = zext <4 x i32> undef to <4 x i64>
364  %a12z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a12za)
365
366  %a12sa = sext <4 x i32> undef to <4 x i64>
367  %a12s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a12sa)
368
369  %a13za = zext <8 x i32> undef to <8 x i64>
370  %a13z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a13za)
371
372  %a13sa = sext <8 x i32> undef to <8 x i64>
373  %a13s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a13sa)
374
375  %a14za = zext <16 x i32> undef to <16 x i64>
376  %a14z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a14za)
377
378  %a14sa = sext <16 x i32> undef to <16 x i64>
379  %a14s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a14sa)
380
; Plain i64 reductions of undef operands, no extension.
381  %a15 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef)
382
383  %a16 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
384
385  %a17 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
386
387  %a18 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
388
389  %a19 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
390
391  ret void
392}
393
; mla_i8: cost-model coverage for a multiply feeding llvm.vector.reduce.add on
; i8 vectors of 1, 2, 4, 8 and 16 lanes. Each %aNm multiplies two undef vectors
; and %aN reduces the product. The expectation lines below cost the mul and the
; reduce as separate instructions: 2, 10, 2, 2, 2 for the muls and
; 4, 54, 2, 2, 2 for the reductions.
394define void @mla_i8() {
395; CHECK-LABEL: 'mla_i8'
396; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0m = mul <1 x i8> undef, undef
397; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a0 = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> %a0m)
398; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a1m = mul <2 x i8> undef, undef
399; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %a1 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> %a1m)
400; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2m = mul <4 x i8> undef, undef
401; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %a2m)
402; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3m = mul <8 x i8> undef, undef
403; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a3m)
404; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a4m = mul <16 x i8> undef, undef
405; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a4 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %a4m)
406; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
407;
; One mul + reduce pair per lane count, in the same order as the expectation
; lines above.
408  %a0m = mul <1 x i8> undef, undef
409  %a0 = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> %a0m)
410
411  %a1m = mul <2 x i8> undef, undef
412  %a1 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> %a1m)
413
414  %a2m = mul <4 x i8> undef, undef
415  %a2 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %a2m)
416
417  %a3m = mul <8 x i8> undef, undef
418  %a3 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a3m)
419
420  %a4m = mul <16 x i8> undef, undef
421  %a4 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %a4m)
422
423  ret void
424}
425
426define void @mla_i16() {
427; CHECK-LABEL: 'mla_i16'
428; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0za = zext <1 x i8> undef to <1 x i16>
429; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0zb = zext <1 x i8> undef to <1 x i16>
430; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0zm = mul <1 x i16> %a0za, %a0zb
431; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a0z = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a0zm)
432; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0sa = sext <1 x i8> undef to <1 x i16>
433; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0sb = sext <1 x i8> undef to <1 x i16>
434; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0sm = mul <1 x i16> %a0sa, %a0sb
435; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a0s = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a0sm)
436; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a1za = zext <2 x i8> undef to <2 x i16>
437; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a1zb = zext <2 x i8> undef to <2 x i16>
438; CHECK-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %a1zm = mul <2 x i16> %a1za, %a1zb
439; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %a1z = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a1zm)
440; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a1sa = sext <2 x i8> undef to <2 x i16>
441; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a1sb = sext <2 x i8> undef to <2 x i16>
442; CHECK-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %a1sm = mul <2 x i16> %a1sa, %a1sb
443; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %a1s = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a1sm)
444; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2za = zext <4 x i8> undef to <4 x i16>
445; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2zb = zext <4 x i8> undef to <4 x i16>
446; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2zm = mul <4 x i16> %a2za, %a2zb
447; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2z = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a2zm)
448; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2sa = sext <4 x i8> undef to <4 x i16>
449; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2sb = sext <4 x i8> undef to <4 x i16>
450; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2sm = mul <4 x i16> %a2sa, %a2sb
451; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2s = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a2sm)
452; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3za = zext <8 x i8> undef to <8 x i16>
453; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3zb = zext <8 x i8> undef to <8 x i16>
454; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3zm = mul <8 x i16> %a3za, %a3zb
455; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a3zm)
456; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3sa = sext <8 x i8> undef to <8 x i16>
457; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3sb = sext <8 x i8> undef to <8 x i16>
458; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3sm = mul <8 x i16> %a3sa, %a3sb
459; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3s = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a3sm)
460; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a4za = zext <16 x i8> undef to <16 x i16>
461; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a4zb = zext <16 x i8> undef to <16 x i16>
462; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a4zm = mul <16 x i16> %a4za, %a4zb
463; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a4z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a4zm)
464; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a4sa = sext <16 x i8> undef to <16 x i16>
465; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a4sb = sext <16 x i8> undef to <16 x i16>
466; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a4sm = mul <16 x i16> %a4sa, %a4sb
467; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a4s = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a4sm)
468; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a5m = mul <1 x i16> undef, undef
469; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a5 = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a5m)
470; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a6m = mul <2 x i16> undef, undef
471; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %a6 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a6m)
472; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7m = mul <4 x i16> undef, undef
473; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a7m)
474; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a8m = mul <8 x i16> undef, undef
475; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a8m)
476; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a9m = mul <16 x i16> undef, undef
477; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a9 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a9m)
478; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
479;
480  %a0za = zext <1 x i8> undef to <1 x i16>
481  %a0zb = zext <1 x i8> undef to <1 x i16>
482  %a0zm = mul <1 x i16> %a0za, %a0zb
483  %a0z = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a0zm)
484
485  %a0sa = sext <1 x i8> undef to <1 x i16>
486  %a0sb = sext <1 x i8> undef to <1 x i16>
487  %a0sm = mul <1 x i16> %a0sa, %a0sb
488  %a0s = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a0sm)
489
490  %a1za = zext <2 x i8> undef to <2 x i16>
491  %a1zb = zext <2 x i8> undef to <2 x i16>
492  %a1zm = mul <2 x i16> %a1za, %a1zb
493  %a1z = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a1zm)
494
495  %a1sa = sext <2 x i8> undef to <2 x i16>
496  %a1sb = sext <2 x i8> undef to <2 x i16>
497  %a1sm = mul <2 x i16> %a1sa, %a1sb
498  %a1s = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a1sm)
499
500  %a2za = zext <4 x i8> undef to <4 x i16>
501  %a2zb = zext <4 x i8> undef to <4 x i16>
502  %a2zm = mul <4 x i16> %a2za, %a2zb
503  %a2z = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a2zm)
504
505  %a2sa = sext <4 x i8> undef to <4 x i16>
506  %a2sb = sext <4 x i8> undef to <4 x i16>
507  %a2sm = mul <4 x i16> %a2sa, %a2sb
508  %a2s = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a2sm)
509
510  %a3za = zext <8 x i8> undef to <8 x i16>
511  %a3zb = zext <8 x i8> undef to <8 x i16>
512  %a3zm = mul <8 x i16> %a3za, %a3zb
513  %a3z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a3zm)
514
515  %a3sa = sext <8 x i8> undef to <8 x i16>
516  %a3sb = sext <8 x i8> undef to <8 x i16>
517  %a3sm = mul <8 x i16> %a3sa, %a3sb
518  %a3s = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a3sm)
519
520  %a4za = zext <16 x i8> undef to <16 x i16>
521  %a4zb = zext <16 x i8> undef to <16 x i16>
522  %a4zm = mul <16 x i16> %a4za, %a4zb
523  %a4z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a4zm)
524
525  %a4sa = sext <16 x i8> undef to <16 x i16>
526  %a4sb = sext <16 x i8> undef to <16 x i16>
527  %a4sm = mul <16 x i16> %a4sa, %a4sb
528  %a4s = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a4sm)
529
530  %a5m = mul <1 x i16> undef, undef
531  %a5 = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a5m)
532
533  %a6m = mul <2 x i16> undef, undef
534  %a6 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a6m)
535
536  %a7m = mul <4 x i16> undef, undef
537  %a7 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a7m)
538
539  %a8m = mul <8 x i16> undef, undef
540  %a8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a8m)
541
542  %a9m = mul <16 x i16> undef, undef
543  %a9 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a9m)
544
545  ret void
546}
547
; Function mla_i32 - cost-model coverage for a vector multiply whose result
; feeds llvm.vector.reduce.add with an i32 result. Three operand families are
; exercised, each at 1, 2, 4, 8 and 16 lanes:
;   a0-a4   - both multiply operands are extended from i8 vectors
;   a5-a9   - both multiply operands are extended from i16 vectors
;   a10-a14 - the multiply takes undef i32 vectors directly, no extension
; Every extended family has a zext variant (suffix z) and a sext variant
; (suffix s). All inputs are undef, so only the instruction shapes matter.
; The assertions that follow pin the cost printed for each instruction by the
; RUN line at the top of the file. They are autogenerated, so regenerate them
; with utils/update_analyze_test_checks.py instead of editing them by hand.
548define void @mla_i32() {
549; CHECK-LABEL: 'mla_i32'
550; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0za = zext <1 x i8> undef to <1 x i32>
551; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0zb = zext <1 x i8> undef to <1 x i32>
552; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0zm = mul <1 x i32> %a0za, %a0zb
553; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a0z = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a0zm)
554; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0sa = sext <1 x i8> undef to <1 x i32>
555; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0sb = sext <1 x i8> undef to <1 x i32>
556; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0sm = mul <1 x i32> %a0sa, %a0sb
557; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a0s = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a0sm)
558; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a1za = zext <2 x i8> undef to <2 x i32>
559; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a1zb = zext <2 x i8> undef to <2 x i32>
560; CHECK-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %a1zm = mul <2 x i32> %a1za, %a1zb
561; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %a1z = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a1zm)
562; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a1sa = sext <2 x i8> undef to <2 x i32>
563; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a1sb = sext <2 x i8> undef to <2 x i32>
564; CHECK-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %a1sm = mul <2 x i32> %a1sa, %a1sb
565; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %a1s = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a1sm)
566; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a2za = zext <4 x i8> undef to <4 x i32>
567; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a2zb = zext <4 x i8> undef to <4 x i32>
568; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2zm = mul <4 x i32> %a2za, %a2zb
569; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a2zm)
570; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a2sa = sext <4 x i8> undef to <4 x i32>
571; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a2sb = sext <4 x i8> undef to <4 x i32>
572; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2sm = mul <4 x i32> %a2sa, %a2sb
573; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2s = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a2sm)
574; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a3za = zext <8 x i8> undef to <8 x i32>
575; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a3zb = zext <8 x i8> undef to <8 x i32>
576; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a3zm = mul <8 x i32> %a3za, %a3zb
577; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a3z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a3zm)
578; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a3sa = sext <8 x i8> undef to <8 x i32>
579; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a3sb = sext <8 x i8> undef to <8 x i32>
580; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a3sm = mul <8 x i32> %a3sa, %a3sb
581; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a3s = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a3sm)
582; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %a4za = zext <16 x i8> undef to <16 x i32>
583; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %a4zb = zext <16 x i8> undef to <16 x i32>
584; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a4zm = mul <16 x i32> %a4za, %a4zb
585; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a4z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a4zm)
586; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %a4sa = sext <16 x i8> undef to <16 x i32>
587; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %a4sb = sext <16 x i8> undef to <16 x i32>
588; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a4sm = mul <16 x i32> %a4sa, %a4sb
589; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a4s = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a4sm)
590; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a5za = zext <1 x i16> undef to <1 x i32>
591; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a5zb = zext <1 x i16> undef to <1 x i32>
592; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a5zm = mul <1 x i32> %a5za, %a5zb
593; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a5z = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a5zm)
594; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a5sa = sext <1 x i16> undef to <1 x i32>
595; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a5sb = sext <1 x i16> undef to <1 x i32>
596; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a5sm = mul <1 x i32> %a5sa, %a5sb
597; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a5s = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a5sm)
598; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a6za = zext <2 x i16> undef to <2 x i32>
599; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a6zb = zext <2 x i16> undef to <2 x i32>
600; CHECK-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %a6zm = mul <2 x i32> %a6za, %a6zb
601; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %a6z = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a6zm)
602; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a6sa = sext <2 x i16> undef to <2 x i32>
603; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a6sb = sext <2 x i16> undef to <2 x i32>
604; CHECK-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %a6sm = mul <2 x i32> %a6sa, %a6sb
605; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %a6s = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a6sm)
606; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7za = zext <4 x i16> undef to <4 x i32>
607; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7zb = zext <4 x i16> undef to <4 x i32>
608; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7zm = mul <4 x i32> %a7za, %a7zb
609; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a7zm)
610; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7sa = sext <4 x i16> undef to <4 x i32>
611; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7sb = sext <4 x i16> undef to <4 x i32>
612; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7sm = mul <4 x i32> %a7sa, %a7sb
613; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7s = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a7sm)
614; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a8za = zext <8 x i16> undef to <8 x i32>
615; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a8zb = zext <8 x i16> undef to <8 x i32>
616; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a8zm = mul <8 x i32> %a8za, %a8zb
617; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a8z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a8zm)
618; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a8sa = sext <8 x i16> undef to <8 x i32>
619; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a8sb = sext <8 x i16> undef to <8 x i32>
620; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a8sm = mul <8 x i32> %a8sa, %a8sb
621; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a8s = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a8sm)
622; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %a9za = zext <16 x i16> undef to <16 x i32>
623; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %a9zb = zext <16 x i16> undef to <16 x i32>
624; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a9zm = mul <16 x i32> %a9za, %a9zb
625; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a9z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a9zm)
626; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %a9sa = sext <16 x i16> undef to <16 x i32>
627; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %a9sb = sext <16 x i16> undef to <16 x i32>
628; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a9sm = mul <16 x i32> %a9sa, %a9sb
629; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a9s = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a9sm)
630; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a10m = mul <1 x i32> undef, undef
631; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a10 = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a10m)
632; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a11m = mul <2 x i32> undef, undef
633; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %a11 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a11m)
634; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a12m = mul <4 x i32> undef, undef
635; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a12 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a12m)
636; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a13m = mul <8 x i32> undef, undef
637; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a13 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a13m)
638; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a14m = mul <16 x i32> undef, undef
639; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a14 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a14m)
640; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
641;
  ; Family 1 (a0-a4) - i8 lanes widened to i32 before the multiply, from 1 up
  ; to 16 lanes, zero-extended group first and sign-extended group second.
642  %a0za = zext <1 x i8> undef to <1 x i32>
643  %a0zb = zext <1 x i8> undef to <1 x i32>
644  %a0zm = mul <1 x i32> %a0za, %a0zb
645  %a0z = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a0zm)
646
647  %a0sa = sext <1 x i8> undef to <1 x i32>
648  %a0sb = sext <1 x i8> undef to <1 x i32>
649  %a0sm = mul <1 x i32> %a0sa, %a0sb
650  %a0s = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a0sm)
651
652  %a1za = zext <2 x i8> undef to <2 x i32>
653  %a1zb = zext <2 x i8> undef to <2 x i32>
654  %a1zm = mul <2 x i32> %a1za, %a1zb
655  %a1z = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a1zm)
656
657  %a1sa = sext <2 x i8> undef to <2 x i32>
658  %a1sb = sext <2 x i8> undef to <2 x i32>
659  %a1sm = mul <2 x i32> %a1sa, %a1sb
660  %a1s = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a1sm)
661
662  %a2za = zext <4 x i8> undef to <4 x i32>
663  %a2zb = zext <4 x i8> undef to <4 x i32>
664  %a2zm = mul <4 x i32> %a2za, %a2zb
665  %a2z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a2zm)
666
667  %a2sa = sext <4 x i8> undef to <4 x i32>
668  %a2sb = sext <4 x i8> undef to <4 x i32>
669  %a2sm = mul <4 x i32> %a2sa, %a2sb
670  %a2s = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a2sm)
671
672  %a3za = zext <8 x i8> undef to <8 x i32>
673  %a3zb = zext <8 x i8> undef to <8 x i32>
674  %a3zm = mul <8 x i32> %a3za, %a3zb
675  %a3z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a3zm)
676
677  %a3sa = sext <8 x i8> undef to <8 x i32>
678  %a3sb = sext <8 x i8> undef to <8 x i32>
679  %a3sm = mul <8 x i32> %a3sa, %a3sb
680  %a3s = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a3sm)
681
682  %a4za = zext <16 x i8> undef to <16 x i32>
683  %a4zb = zext <16 x i8> undef to <16 x i32>
684  %a4zm = mul <16 x i32> %a4za, %a4zb
685  %a4z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a4zm)
686
687  %a4sa = sext <16 x i8> undef to <16 x i32>
688  %a4sb = sext <16 x i8> undef to <16 x i32>
689  %a4sm = mul <16 x i32> %a4sa, %a4sb
690  %a4s = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a4sm)
691
  ; Family 2 (a5-a9) - same shapes as family 1, but the sources are i16 lanes.
692  %a5za = zext <1 x i16> undef to <1 x i32>
693  %a5zb = zext <1 x i16> undef to <1 x i32>
694  %a5zm = mul <1 x i32> %a5za, %a5zb
695  %a5z = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a5zm)
696
697  %a5sa = sext <1 x i16> undef to <1 x i32>
698  %a5sb = sext <1 x i16> undef to <1 x i32>
699  %a5sm = mul <1 x i32> %a5sa, %a5sb
700  %a5s = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a5sm)
701
702  %a6za = zext <2 x i16> undef to <2 x i32>
703  %a6zb = zext <2 x i16> undef to <2 x i32>
704  %a6zm = mul <2 x i32> %a6za, %a6zb
705  %a6z = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a6zm)
706
707  %a6sa = sext <2 x i16> undef to <2 x i32>
708  %a6sb = sext <2 x i16> undef to <2 x i32>
709  %a6sm = mul <2 x i32> %a6sa, %a6sb
710  %a6s = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a6sm)
711
712  %a7za = zext <4 x i16> undef to <4 x i32>
713  %a7zb = zext <4 x i16> undef to <4 x i32>
714  %a7zm = mul <4 x i32> %a7za, %a7zb
715  %a7z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a7zm)
716
717  %a7sa = sext <4 x i16> undef to <4 x i32>
718  %a7sb = sext <4 x i16> undef to <4 x i32>
719  %a7sm = mul <4 x i32> %a7sa, %a7sb
720  %a7s = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a7sm)
721
722  %a8za = zext <8 x i16> undef to <8 x i32>
723  %a8zb = zext <8 x i16> undef to <8 x i32>
724  %a8zm = mul <8 x i32> %a8za, %a8zb
725  %a8z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a8zm)
726
727  %a8sa = sext <8 x i16> undef to <8 x i32>
728  %a8sb = sext <8 x i16> undef to <8 x i32>
729  %a8sm = mul <8 x i32> %a8sa, %a8sb
730  %a8s = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a8sm)
731
732  %a9za = zext <16 x i16> undef to <16 x i32>
733  %a9zb = zext <16 x i16> undef to <16 x i32>
734  %a9zm = mul <16 x i32> %a9za, %a9zb
735  %a9z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a9zm)
736
737  %a9sa = sext <16 x i16> undef to <16 x i32>
738  %a9sb = sext <16 x i16> undef to <16 x i32>
739  %a9sm = mul <16 x i32> %a9sa, %a9sb
740  %a9s = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a9sm)
741
  ; Family 3 (a10-a14) - no extension; the multiply already works on i32
  ; vectors, so there is a single variant per lane count.
742  %a10m = mul <1 x i32> undef, undef
743  %a10 = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a10m)
744
745  %a11m = mul <2 x i32> undef, undef
746  %a11 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a11m)
747
748  %a12m = mul <4 x i32> undef, undef
749  %a12 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a12m)
750
751  %a13m = mul <8 x i32> undef, undef
752  %a13 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a13m)
753
754  %a14m = mul <16 x i32> undef, undef
755  %a14 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a14m)
756
757  ret void
758}
759
760define void @mla_i64() {
761; CHECK-LABEL: 'mla_i64'
762; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %a0za = zext <1 x i8> undef to <1 x i64>
763; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %a0zb = zext <1 x i8> undef to <1 x i64>
764; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a0zm = mul <1 x i64> %a0za, %a0zb
765; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a0z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a0zm)
766; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %a0sa = sext <1 x i8> undef to <1 x i64>
767; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %a0sb = sext <1 x i8> undef to <1 x i64>
768; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a0sm = mul <1 x i64> %a0sa, %a0sb
769; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a0s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a0sm)
770; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a1za = zext <2 x i8> undef to <2 x i64>
771; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a1zb = zext <2 x i8> undef to <2 x i64>
772; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %a1zm = mul <2 x i64> %a1za, %a1zb
773; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %a1z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a1zm)
774; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %a1sa = sext <2 x i8> undef to <2 x i64>
775; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %a1sb = sext <2 x i8> undef to <2 x i64>
776; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %a1sm = mul <2 x i64> %a1sa, %a1sb
777; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %a1s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a1sm)
778; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a2za = zext <4 x i8> undef to <4 x i64>
779; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a2zb = zext <4 x i8> undef to <4 x i64>
780; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %a2zm = mul <4 x i64> %a2za, %a2zb
781; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a2z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a2zm)
782; CHECK-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %a2sa = sext <4 x i8> undef to <4 x i64>
783; CHECK-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %a2sb = sext <4 x i8> undef to <4 x i64>
784; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %a2sm = mul <4 x i64> %a2sa, %a2sb
785; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a2s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a2sm)
786; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %a3za = zext <8 x i8> undef to <8 x i64>
787; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %a3zb = zext <8 x i8> undef to <8 x i64>
788; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a3zm = mul <8 x i64> %a3za, %a3zb
789; CHECK-NEXT:  Cost Model: Found an estimated cost of 408 for instruction: %a3z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a3zm)
790; CHECK-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %a3sa = sext <8 x i8> undef to <8 x i64>
791; CHECK-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %a3sb = sext <8 x i8> undef to <8 x i64>
792; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a3sm = mul <8 x i64> %a3sa, %a3sb
793; CHECK-NEXT:  Cost Model: Found an estimated cost of 408 for instruction: %a3s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a3sm)
794; CHECK-NEXT:  Cost Model: Found an estimated cost of 298 for instruction: %a4za = zext <16 x i8> undef to <16 x i64>
795; CHECK-NEXT:  Cost Model: Found an estimated cost of 298 for instruction: %a4zb = zext <16 x i8> undef to <16 x i64>
796; CHECK-NEXT:  Cost Model: Found an estimated cost of 416 for instruction: %a4zm = mul <16 x i64> %a4za, %a4zb
797; CHECK-NEXT:  Cost Model: Found an estimated cost of 808 for instruction: %a4z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a4zm)
798; CHECK-NEXT:  Cost Model: Found an estimated cost of 1322 for instruction: %a4sa = sext <16 x i8> undef to <16 x i64>
799; CHECK-NEXT:  Cost Model: Found an estimated cost of 1322 for instruction: %a4sb = sext <16 x i8> undef to <16 x i64>
800; CHECK-NEXT:  Cost Model: Found an estimated cost of 416 for instruction: %a4sm = mul <16 x i64> %a4sa, %a4sb
801; CHECK-NEXT:  Cost Model: Found an estimated cost of 808 for instruction: %a4s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a4sm)
802; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %a5za = zext <1 x i16> undef to <1 x i64>
803; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %a5zb = zext <1 x i16> undef to <1 x i64>
804; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a5zm = mul <1 x i64> %a5za, %a5zb
805; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a5z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a5zm)
806; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %a5sa = sext <1 x i16> undef to <1 x i64>
807; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %a5sb = sext <1 x i16> undef to <1 x i64>
808; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a5sm = mul <1 x i64> %a5sa, %a5sb
809; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a5s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a5sm)
810; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a6za = zext <2 x i16> undef to <2 x i64>
811; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a6zb = zext <2 x i16> undef to <2 x i64>
812; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %a6zm = mul <2 x i64> %a6za, %a6zb
813; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %a6z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a6zm)
814; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %a6sa = sext <2 x i16> undef to <2 x i64>
815; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %a6sb = sext <2 x i16> undef to <2 x i64>
816; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %a6sm = mul <2 x i64> %a6sa, %a6sb
817; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %a6s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a6sm)
818; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a7za = zext <4 x i16> undef to <4 x i64>
819; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a7zb = zext <4 x i16> undef to <4 x i64>
820; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %a7zm = mul <4 x i64> %a7za, %a7zb
821; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a7z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a7zm)
822; CHECK-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %a7sa = sext <4 x i16> undef to <4 x i64>
823; CHECK-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %a7sb = sext <4 x i16> undef to <4 x i64>
824; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %a7sm = mul <4 x i64> %a7sa, %a7sb
825; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a7s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a7sm)
826; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %a8za = zext <8 x i16> undef to <8 x i64>
827; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %a8zb = zext <8 x i16> undef to <8 x i64>
828; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a8zm = mul <8 x i64> %a8za, %a8zb
829; CHECK-NEXT:  Cost Model: Found an estimated cost of 408 for instruction: %a8z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a8zm)
830; CHECK-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %a8sa = sext <8 x i16> undef to <8 x i64>
831; CHECK-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %a8sb = sext <8 x i16> undef to <8 x i64>
832; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a8sm = mul <8 x i64> %a8sa, %a8sb
833; CHECK-NEXT:  Cost Model: Found an estimated cost of 408 for instruction: %a8s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a8sm)
834; CHECK-NEXT:  Cost Model: Found an estimated cost of 296 for instruction: %a9za = zext <16 x i16> undef to <16 x i64>
835; CHECK-NEXT:  Cost Model: Found an estimated cost of 296 for instruction: %a9zb = zext <16 x i16> undef to <16 x i64>
836; CHECK-NEXT:  Cost Model: Found an estimated cost of 416 for instruction: %a9zm = mul <16 x i64> %a9za, %a9zb
837; CHECK-NEXT:  Cost Model: Found an estimated cost of 808 for instruction: %a9z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a9zm)
838; CHECK-NEXT:  Cost Model: Found an estimated cost of 1320 for instruction: %a9sa = sext <16 x i16> undef to <16 x i64>
839; CHECK-NEXT:  Cost Model: Found an estimated cost of 1320 for instruction: %a9sb = sext <16 x i16> undef to <16 x i64>
840; CHECK-NEXT:  Cost Model: Found an estimated cost of 416 for instruction: %a9sm = mul <16 x i64> %a9sa, %a9sb
841; CHECK-NEXT:  Cost Model: Found an estimated cost of 808 for instruction: %a9s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a9sm)
842; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %a10za = zext <1 x i32> undef to <1 x i64>
843; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %a10zb = zext <1 x i32> undef to <1 x i64>
844; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a10zm = mul <1 x i64> %a10za, %a10zb
845; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a10z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a10zm)
846; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %a10sa = sext <1 x i32> undef to <1 x i64>
847; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %a10sb = sext <1 x i32> undef to <1 x i64>
848; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a10sm = mul <1 x i64> %a10sa, %a10sb
849; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a10s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a10sm)
850; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a11za = zext <2 x i32> undef to <2 x i64>
851; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a11zb = zext <2 x i32> undef to <2 x i64>
852; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %a11zm = mul <2 x i64> %a11za, %a11zb
853; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %a11z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a11zm)
854; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %a11sa = sext <2 x i32> undef to <2 x i64>
855; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %a11sb = sext <2 x i32> undef to <2 x i64>
856; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %a11sm = mul <2 x i64> %a11sa, %a11sb
857; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %a11s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a11sm)
858; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a12za = zext <4 x i32> undef to <4 x i64>
859; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a12zb = zext <4 x i32> undef to <4 x i64>
860; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %a12zm = mul <4 x i64> %a12za, %a12zb
861; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a12z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a12zm)
862; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %a12sa = sext <4 x i32> undef to <4 x i64>
863; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %a12sb = sext <4 x i32> undef to <4 x i64>
864; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %a12sm = mul <4 x i64> %a12sa, %a12sb
865; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a12s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a12sm)
866; CHECK-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %a13za = zext <8 x i32> undef to <8 x i64>
867; CHECK-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %a13zb = zext <8 x i32> undef to <8 x i64>
868; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a13zm = mul <8 x i64> %a13za, %a13zb
869; CHECK-NEXT:  Cost Model: Found an estimated cost of 408 for instruction: %a13z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a13zm)
870; CHECK-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %a13sa = sext <8 x i32> undef to <8 x i64>
871; CHECK-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %a13sb = sext <8 x i32> undef to <8 x i64>
872; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a13sm = mul <8 x i64> %a13sa, %a13sb
873; CHECK-NEXT:  Cost Model: Found an estimated cost of 408 for instruction: %a13s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a13sm)
874; CHECK-NEXT:  Cost Model: Found an estimated cost of 288 for instruction: %a14za = zext <16 x i32> undef to <16 x i64>
875; CHECK-NEXT:  Cost Model: Found an estimated cost of 288 for instruction: %a14zb = zext <16 x i32> undef to <16 x i64>
876; CHECK-NEXT:  Cost Model: Found an estimated cost of 416 for instruction: %a14zm = mul <16 x i64> %a14za, %a14zb
877; CHECK-NEXT:  Cost Model: Found an estimated cost of 808 for instruction: %a14z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a14zm)
878; CHECK-NEXT:  Cost Model: Found an estimated cost of 1056 for instruction: %a14sa = sext <16 x i32> undef to <16 x i64>
879; CHECK-NEXT:  Cost Model: Found an estimated cost of 1056 for instruction: %a14sb = sext <16 x i32> undef to <16 x i64>
880; CHECK-NEXT:  Cost Model: Found an estimated cost of 416 for instruction: %a14sm = mul <16 x i64> %a14sa, %a14sb
881; CHECK-NEXT:  Cost Model: Found an estimated cost of 808 for instruction: %a14s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a14sm)
882; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a15m = mul <1 x i64> undef, undef
883; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a15 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a15m)
884; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %a16m = mul <2 x i64> undef, undef
885; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %a16 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a16m)
886; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %a17m = mul <4 x i64> undef, undef
887; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a17 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a17m)
888; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %a18m = mul <8 x i64> undef, undef
889; CHECK-NEXT:  Cost Model: Found an estimated cost of 408 for instruction: %a18 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a18m)
890; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %a19m = mul <16 x i64> undef, undef
891; CHECK-NEXT:  Cost Model: Found an estimated cost of 808 for instruction: %a19 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a19m)
892; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
893;
894  %a0za = zext <1 x i8> undef to <1 x i64>
895  %a0zb = zext <1 x i8> undef to <1 x i64>
896  %a0zm = mul <1 x i64> %a0za, %a0zb
897  %a0z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a0zm)
898
899  %a0sa = sext <1 x i8> undef to <1 x i64>
900  %a0sb = sext <1 x i8> undef to <1 x i64>
901  %a0sm = mul <1 x i64> %a0sa, %a0sb
902  %a0s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a0sm)
903
904  %a1za = zext <2 x i8> undef to <2 x i64>
905  %a1zb = zext <2 x i8> undef to <2 x i64>
906  %a1zm = mul <2 x i64> %a1za, %a1zb
907  %a1z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a1zm)
908
909  %a1sa = sext <2 x i8> undef to <2 x i64>
910  %a1sb = sext <2 x i8> undef to <2 x i64>
911  %a1sm = mul <2 x i64> %a1sa, %a1sb
912  %a1s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a1sm)
913
914  %a2za = zext <4 x i8> undef to <4 x i64>
915  %a2zb = zext <4 x i8> undef to <4 x i64>
916  %a2zm = mul <4 x i64> %a2za, %a2zb
917  %a2z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a2zm)
918
919  %a2sa = sext <4 x i8> undef to <4 x i64>
920  %a2sb = sext <4 x i8> undef to <4 x i64>
921  %a2sm = mul <4 x i64> %a2sa, %a2sb
922  %a2s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a2sm)
923
924  %a3za = zext <8 x i8> undef to <8 x i64>
925  %a3zb = zext <8 x i8> undef to <8 x i64>
926  %a3zm = mul <8 x i64> %a3za, %a3zb
927  %a3z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a3zm)
928
929  %a3sa = sext <8 x i8> undef to <8 x i64>
930  %a3sb = sext <8 x i8> undef to <8 x i64>
931  %a3sm = mul <8 x i64> %a3sa, %a3sb
932  %a3s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a3sm)
933
934  %a4za = zext <16 x i8> undef to <16 x i64>
935  %a4zb = zext <16 x i8> undef to <16 x i64>
936  %a4zm = mul <16 x i64> %a4za, %a4zb
937  %a4z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a4zm)
938
939  %a4sa = sext <16 x i8> undef to <16 x i64>
940  %a4sb = sext <16 x i8> undef to <16 x i64>
941  %a4sm = mul <16 x i64> %a4sa, %a4sb
942  %a4s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a4sm)
943
944  %a5za = zext <1 x i16> undef to <1 x i64>
945  %a5zb = zext <1 x i16> undef to <1 x i64>
946  %a5zm = mul <1 x i64> %a5za, %a5zb
947  %a5z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a5zm)
948
949  %a5sa = sext <1 x i16> undef to <1 x i64>
950  %a5sb = sext <1 x i16> undef to <1 x i64>
951  %a5sm = mul <1 x i64> %a5sa, %a5sb
952  %a5s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a5sm)
953
954  %a6za = zext <2 x i16> undef to <2 x i64>
955  %a6zb = zext <2 x i16> undef to <2 x i64>
956  %a6zm = mul <2 x i64> %a6za, %a6zb
957  %a6z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a6zm)
958
959  %a6sa = sext <2 x i16> undef to <2 x i64>
960  %a6sb = sext <2 x i16> undef to <2 x i64>
961  %a6sm = mul <2 x i64> %a6sa, %a6sb
962  %a6s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a6sm)
963
964  %a7za = zext <4 x i16> undef to <4 x i64>
965  %a7zb = zext <4 x i16> undef to <4 x i64>
966  %a7zm = mul <4 x i64> %a7za, %a7zb
967  %a7z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a7zm)
968
969  %a7sa = sext <4 x i16> undef to <4 x i64>
970  %a7sb = sext <4 x i16> undef to <4 x i64>
971  %a7sm = mul <4 x i64> %a7sa, %a7sb
972  %a7s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a7sm)
973
974  %a8za = zext <8 x i16> undef to <8 x i64>
975  %a8zb = zext <8 x i16> undef to <8 x i64>
976  %a8zm = mul <8 x i64> %a8za, %a8zb
977  %a8z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a8zm)
978
979  %a8sa = sext <8 x i16> undef to <8 x i64>
980  %a8sb = sext <8 x i16> undef to <8 x i64>
981  %a8sm = mul <8 x i64> %a8sa, %a8sb
982  %a8s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a8sm)
983
984  %a9za = zext <16 x i16> undef to <16 x i64>
985  %a9zb = zext <16 x i16> undef to <16 x i64>
986  %a9zm = mul <16 x i64> %a9za, %a9zb
987  %a9z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a9zm)
988
989  %a9sa = sext <16 x i16> undef to <16 x i64>
990  %a9sb = sext <16 x i16> undef to <16 x i64>
991  %a9sm = mul <16 x i64> %a9sa, %a9sb
992  %a9s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a9sm)
993
994  %a10za = zext <1 x i32> undef to <1 x i64>
995  %a10zb = zext <1 x i32> undef to <1 x i64>
996  %a10zm = mul <1 x i64> %a10za, %a10zb
997  %a10z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a10zm)
998
999  %a10sa = sext <1 x i32> undef to <1 x i64>
1000  %a10sb = sext <1 x i32> undef to <1 x i64>
1001  %a10sm = mul <1 x i64> %a10sa, %a10sb
1002  %a10s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a10sm)
1003
1004  %a11za = zext <2 x i32> undef to <2 x i64>
1005  %a11zb = zext <2 x i32> undef to <2 x i64>
1006  %a11zm = mul <2 x i64> %a11za, %a11zb
1007  %a11z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a11zm)
1008
1009  %a11sa = sext <2 x i32> undef to <2 x i64>
1010  %a11sb = sext <2 x i32> undef to <2 x i64>
1011  %a11sm = mul <2 x i64> %a11sa, %a11sb
1012  %a11s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a11sm)
1013
1014  %a12za = zext <4 x i32> undef to <4 x i64>
1015  %a12zb = zext <4 x i32> undef to <4 x i64>
1016  %a12zm = mul <4 x i64> %a12za, %a12zb
1017  %a12z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a12zm)
1018
1019  %a12sa = sext <4 x i32> undef to <4 x i64>
1020  %a12sb = sext <4 x i32> undef to <4 x i64>
1021  %a12sm = mul <4 x i64> %a12sa, %a12sb
1022  %a12s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a12sm)
1023
1024  %a13za = zext <8 x i32> undef to <8 x i64>
1025  %a13zb = zext <8 x i32> undef to <8 x i64>
1026  %a13zm = mul <8 x i64> %a13za, %a13zb
1027  %a13z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a13zm)
1028
1029  %a13sa = sext <8 x i32> undef to <8 x i64>
1030  %a13sb = sext <8 x i32> undef to <8 x i64>
1031  %a13sm = mul <8 x i64> %a13sa, %a13sb
1032  %a13s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a13sm)
1033
1034  %a14za = zext <16 x i32> undef to <16 x i64>
1035  %a14zb = zext <16 x i32> undef to <16 x i64>
1036  %a14zm = mul <16 x i64> %a14za, %a14zb
1037  %a14z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a14zm)
1038
1039  %a14sa = sext <16 x i32> undef to <16 x i64>
1040  %a14sb = sext <16 x i32> undef to <16 x i64>
1041  %a14sm = mul <16 x i64> %a14sa, %a14sb
1042  %a14s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a14sm)
1043
1044  %a15m = mul <1 x i64> undef, undef
1045  %a15 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a15m)
1046
1047  %a16m = mul <2 x i64> undef, undef
1048  %a16 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a16m)
1049
1050  %a17m = mul <4 x i64> undef, undef
1051  %a17 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a17m)
1052
1053  %a18m = mul <8 x i64> undef, undef
1054  %a18 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a18m)
1055
1056  %a19m = mul <16 x i64> undef, undef
1057  %a19 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a19m)
1058
1059  ret void
1060}
1061
; Declarations of the llvm.vector.reduce.add intrinsic overloads: one per
; element type (i8, i16, i32, i64) and lane count (1, 2, 4, 8, 16).
; The list is in lexical order of the declaration text, so within each element
; type the v16 overload comes before v1, and the i8 group comes last.
1062declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
1063declare i16 @llvm.vector.reduce.add.v1i16(<1 x i16>)
1064declare i16 @llvm.vector.reduce.add.v2i16(<2 x i16>)
1065declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)
1066declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
1067declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
1068declare i32 @llvm.vector.reduce.add.v1i32(<1 x i32>)
1069declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>)
1070declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
1071declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
1072declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)
1073declare i64 @llvm.vector.reduce.add.v1i64(<1 x i64>)
1074declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
1075declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
1076declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
1077declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
1078declare i8 @llvm.vector.reduce.add.v1i8(<1 x i8>)
1079declare i8 @llvm.vector.reduce.add.v2i8(<2 x i8>)
1080declare i8 @llvm.vector.reduce.add.v4i8(<4 x i8>)
1081declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)
1082