; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s

; Various reductions generated from SLP vectorizing unrolled loops. Generated
; from https://godbolt.org/z/ebxdPh1Kz with some less interesting cases removed.
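;
; A minimal sketch of the kind of C source these tests are derived from (an
; assumption, since the godbolt paste is not reproduced here; the element
; types and trip counts vary per test):
;
;   int addv4i32i32(int *x) {
;     int s = 0;
;     for (int i = 0; i < 4; i++)
;       s += x[i];
;     return s;
;   }
;
; Once such a loop is unrolled, the SLP vectorizer turns the chain of adds
; into llvm.vector.reduce.add calls. On MVE, VADDV sums a vector across its
; lanes into a scalar and VADDVA additionally accumulates into its scalar
; operand, so the larger reductions below are expected to split into one
; VADDV followed by a sequence of VADDVAs.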

define i32 @addv2i32i32(ptr %x) {
; CHECK-LABEL: addv2i32i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrd r0, r1, [r0]
; CHECK-NEXT:    add r0, r1
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, ptr %x, align 4
  %arrayidx.1 = getelementptr inbounds i32, ptr %x, i32 1
  %1 = load i32, ptr %arrayidx.1, align 4
  %add.1 = add nsw i32 %1, %0
  ret i32 %add.1
}

define i32 @addv4i32i32(ptr %x) {
; CHECK-LABEL: addv4i32i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vaddv.u32 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <4 x i32>, ptr %x, align 4
  %1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %0)
  ret i32 %1
}

define i32 @addv8i32i32(ptr %x) {
; CHECK-LABEL: addv8i32i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vaddv.u32 r0, q1
; CHECK-NEXT:    vaddva.u32 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <8 x i32>, ptr %x, align 4
  %1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %0)
  ret i32 %1
}

define i32 @addv16i32i32(ptr %x) {
; CHECK-LABEL: addv16i32i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vaddv.u32 r2, q1
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    bx lr
entry:
  %0 = load <16 x i32>, ptr %x, align 4
  %1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %0)
  ret i32 %1
}

define i32 @addv24i32i32(ptr %x) {
; CHECK-LABEL: addv24i32i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vaddv.u32 r2, q1
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #80]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    bx lr
entry:
  %0 = load <8 x i32>, ptr %x, align 4
  %arrayidx.8 = getelementptr inbounds i32, ptr %x, i32 8
  %1 = load <16 x i32>, ptr %arrayidx.8, align 4
  %2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1)
  %3 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %0)
  %op.rdx = add nsw i32 %2, %3
  ret i32 %op.rdx
}

define i32 @addv32i32i32(ptr %x) {
; CHECK-LABEL: addv32i32i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    mov r1, r0
; CHECK-NEXT:    vaddv.u32 r0, q1
; CHECK-NEXT:    vaddva.u32 r0, q0
; CHECK-NEXT:    vldrw.u32 q0, [r1, #32]
; CHECK-NEXT:    vaddva.u32 r0, q0
; CHECK-NEXT:    vldrw.u32 q0, [r1, #48]
; CHECK-NEXT:    vaddva.u32 r0, q0
; CHECK-NEXT:    vldrw.u32 q0, [r1, #64]
; CHECK-NEXT:    vaddva.u32 r0, q0
; CHECK-NEXT:    vldrw.u32 q0, [r1, #80]
; CHECK-NEXT:    vaddva.u32 r0, q0
; CHECK-NEXT:    vldrw.u32 q0, [r1, #96]
; CHECK-NEXT:    vaddva.u32 r0, q0
; CHECK-NEXT:    vldrw.u32 q0, [r1, #112]
; CHECK-NEXT:    vaddva.u32 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <32 x i32>, ptr %x, align 4
  %1 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %0)
  ret i32 %1
}

define i32 @addv64i32i32(ptr %x) {
; CHECK-LABEL: addv64i32i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vaddv.u32 r2, q1
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #80]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #96]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #112]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #128]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #144]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #160]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #176]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #192]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #208]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #224]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #240]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    bx lr
entry:
  %0 = load <64 x i32>, ptr %x, align 4
  %1 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %0)
  ret i32 %1
}

define i32 @addv128i32i32(ptr %x) {
; CHECK-LABEL: addv128i32i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vaddv.u32 r2, q1
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #80]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #96]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #112]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #128]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #144]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #160]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #176]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #192]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #208]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #224]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #240]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #256]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #272]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #288]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #304]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #320]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #336]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #352]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #368]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #384]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #400]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #416]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #432]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #448]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #464]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #480]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #496]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <4 x i32>, ptr %x, align 4
  %0 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load)
  %1 = getelementptr inbounds i32, ptr %x, i32 4
  %wide.load.1 = load <4 x i32>, ptr %1, align 4
  %2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.1)
  %3 = add i32 %2, %0
  %4 = getelementptr inbounds i32, ptr %x, i32 8
  %wide.load.2 = load <4 x i32>, ptr %4, align 4
  %5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.2)
  %6 = add i32 %5, %3
  %7 = getelementptr inbounds i32, ptr %x, i32 12
  %wide.load.3 = load <4 x i32>, ptr %7, align 4
  %8 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.3)
  %9 = add i32 %8, %6
  %10 = getelementptr inbounds i32, ptr %x, i32 16
  %wide.load.4 = load <4 x i32>, ptr %10, align 4
  %11 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.4)
  %12 = add i32 %11, %9
  %13 = getelementptr inbounds i32, ptr %x, i32 20
  %wide.load.5 = load <4 x i32>, ptr %13, align 4
  %14 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.5)
  %15 = add i32 %14, %12
  %16 = getelementptr inbounds i32, ptr %x, i32 24
  %wide.load.6 = load <4 x i32>, ptr %16, align 4
  %17 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.6)
  %18 = add i32 %17, %15
  %19 = getelementptr inbounds i32, ptr %x, i32 28
  %wide.load.7 = load <4 x i32>, ptr %19, align 4
  %20 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.7)
  %21 = add i32 %20, %18
  %22 = getelementptr inbounds i32, ptr %x, i32 32
  %wide.load.8 = load <4 x i32>, ptr %22, align 4
  %23 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.8)
  %24 = add i32 %23, %21
  %25 = getelementptr inbounds i32, ptr %x, i32 36
  %wide.load.9 = load <4 x i32>, ptr %25, align 4
  %26 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.9)
  %27 = add i32 %26, %24
  %28 = getelementptr inbounds i32, ptr %x, i32 40
  %wide.load.10 = load <4 x i32>, ptr %28, align 4
  %29 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.10)
  %30 = add i32 %29, %27
  %31 = getelementptr inbounds i32, ptr %x, i32 44
  %wide.load.11 = load <4 x i32>, ptr %31, align 4
  %32 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.11)
  %33 = add i32 %32, %30
  %34 = getelementptr inbounds i32, ptr %x, i32 48
  %wide.load.12 = load <4 x i32>, ptr %34, align 4
  %35 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.12)
  %36 = add i32 %35, %33
  %37 = getelementptr inbounds i32, ptr %x, i32 52
  %wide.load.13 = load <4 x i32>, ptr %37, align 4
  %38 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.13)
  %39 = add i32 %38, %36
  %40 = getelementptr inbounds i32, ptr %x, i32 56
  %wide.load.14 = load <4 x i32>, ptr %40, align 4
  %41 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.14)
  %42 = add i32 %41, %39
  %43 = getelementptr inbounds i32, ptr %x, i32 60
  %wide.load.15 = load <4 x i32>, ptr %43, align 4
  %44 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.15)
  %45 = add i32 %44, %42
  %46 = getelementptr inbounds i32, ptr %x, i32 64
  %wide.load.16 = load <4 x i32>, ptr %46, align 4
  %47 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.16)
  %48 = add i32 %47, %45
  %49 = getelementptr inbounds i32, ptr %x, i32 68
  %wide.load.17 = load <4 x i32>, ptr %49, align 4
  %50 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.17)
  %51 = add i32 %50, %48
  %52 = getelementptr inbounds i32, ptr %x, i32 72
  %wide.load.18 = load <4 x i32>, ptr %52, align 4
  %53 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.18)
  %54 = add i32 %53, %51
  %55 = getelementptr inbounds i32, ptr %x, i32 76
  %wide.load.19 = load <4 x i32>, ptr %55, align 4
  %56 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.19)
  %57 = add i32 %56, %54
  %58 = getelementptr inbounds i32, ptr %x, i32 80
  %wide.load.20 = load <4 x i32>, ptr %58, align 4
  %59 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.20)
  %60 = add i32 %59, %57
  %61 = getelementptr inbounds i32, ptr %x, i32 84
  %wide.load.21 = load <4 x i32>, ptr %61, align 4
  %62 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.21)
  %63 = add i32 %62, %60
  %64 = getelementptr inbounds i32, ptr %x, i32 88
  %wide.load.22 = load <4 x i32>, ptr %64, align 4
  %65 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.22)
  %66 = add i32 %65, %63
  %67 = getelementptr inbounds i32, ptr %x, i32 92
  %wide.load.23 = load <4 x i32>, ptr %67, align 4
  %68 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.23)
  %69 = add i32 %68, %66
  %70 = getelementptr inbounds i32, ptr %x, i32 96
  %wide.load.24 = load <4 x i32>, ptr %70, align 4
  %71 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.24)
  %72 = add i32 %71, %69
  %73 = getelementptr inbounds i32, ptr %x, i32 100
  %wide.load.25 = load <4 x i32>, ptr %73, align 4
  %74 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.25)
  %75 = add i32 %74, %72
  %76 = getelementptr inbounds i32, ptr %x, i32 104
  %wide.load.26 = load <4 x i32>, ptr %76, align 4
  %77 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.26)
  %78 = add i32 %77, %75
  %79 = getelementptr inbounds i32, ptr %x, i32 108
  %wide.load.27 = load <4 x i32>, ptr %79, align 4
  %80 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.27)
  %81 = add i32 %80, %78
  %82 = getelementptr inbounds i32, ptr %x, i32 112
  %wide.load.28 = load <4 x i32>, ptr %82, align 4
  %83 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.28)
  %84 = add i32 %83, %81
  %85 = getelementptr inbounds i32, ptr %x, i32 116
  %wide.load.29 = load <4 x i32>, ptr %85, align 4
  %86 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.29)
  %87 = add i32 %86, %84
  %88 = getelementptr inbounds i32, ptr %x, i32 120
  %wide.load.30 = load <4 x i32>, ptr %88, align 4
  %89 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.30)
  %90 = add i32 %89, %87
  %91 = getelementptr inbounds i32, ptr %x, i32 124
  %wide.load.31 = load <4 x i32>, ptr %91, align 4
  %92 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.31)
  %93 = add i32 %92, %90
  ret i32 %93
}

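; The same i32 add reductions, but from i16 loads sign-extended to i32.
; Narrow vectors can use an extending vldrh.s32 load; a full <8 x i16>
; vector uses vaddv.s16 directly.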
define i32 @addv2i32i16(ptr %x) {
; CHECK-LABEL: addv2i32i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrsh.w r1, [r0]
; CHECK-NEXT:    ldrsh.w r0, [r0, #2]
; CHECK-NEXT:    add r0, r1
; CHECK-NEXT:    bx lr
entry:
  %0 = load i16, ptr %x, align 2
  %conv = sext i16 %0 to i32
  %arrayidx.1 = getelementptr inbounds i16, ptr %x, i32 1
  %1 = load i16, ptr %arrayidx.1, align 2
  %conv.1 = sext i16 %1 to i32
  %add.1 = add nsw i32 %conv, %conv.1
  ret i32 %add.1
}

define i32 @addv4i32i16(ptr %x) {
; CHECK-LABEL: addv4i32i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q0, [r0]
; CHECK-NEXT:    vaddv.u32 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <4 x i16>, ptr %x, align 2
  %1 = sext <4 x i16> %0 to <4 x i32>
  %2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %1)
  ret i32 %2
}

define i32 @addv8i32i16(ptr %x) {
; CHECK-LABEL: addv8i32i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r0]
; CHECK-NEXT:    vaddv.s16 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <8 x i16>, ptr %x, align 2
  %1 = sext <8 x i16> %0 to <8 x i32>
  %2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %1)
  ret i32 %2
}

define i32 @addv16i32i16(ptr %x) {
; CHECK-LABEL: addv16i32i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q1, [r0]
; CHECK-NEXT:    vldrh.s32 q0, [r0, #8]
; CHECK-NEXT:    vaddv.u32 r2, q1
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #16]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #24]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    bx lr
entry:
  %0 = load <16 x i16>, ptr %x, align 2
  %1 = sext <16 x i16> %0 to <16 x i32>
  %2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1)
  ret i32 %2
}

define i32 @addv24i32i16(ptr %x) {
; CHECK-LABEL: addv24i32i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q1, [r0]
; CHECK-NEXT:    vldrh.s32 q0, [r0, #8]
; CHECK-NEXT:    vaddv.u32 r2, q1
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #16]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #24]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #32]
; CHECK-NEXT:    vaddva.s16 r2, q0
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    bx lr
entry:
  %0 = load <16 x i16>, ptr %x, align 2
  %1 = sext <16 x i16> %0 to <16 x i32>
  %arrayidx.16 = getelementptr inbounds i16, ptr %x, i32 16
  %2 = load <8 x i16>, ptr %arrayidx.16, align 2
  %3 = sext <8 x i16> %2 to <8 x i32>
  %4 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1)
  %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %3)
  %op.rdx = add nsw i32 %4, %5
  ret i32 %op.rdx
}

define i32 @addv32i32i16(ptr %x) {
; CHECK-LABEL: addv32i32i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q1, [r0]
; CHECK-NEXT:    vldrh.s32 q0, [r0, #8]
; CHECK-NEXT:    vaddv.u32 r2, q1
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #16]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #24]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #32]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #40]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #48]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #56]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    bx lr
entry:
  %0 = load <32 x i16>, ptr %x, align 2
  %1 = sext <32 x i16> %0 to <32 x i32>
  %2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1)
  ret i32 %2
}

define i32 @addv64i32i16(ptr %x) {
; CHECK-LABEL: addv64i32i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q1, [r0]
; CHECK-NEXT:    vldrh.s32 q0, [r0, #8]
; CHECK-NEXT:    ldrsh.w r1, [r0, #120]
; CHECK-NEXT:    vaddv.u32 r2, q1
; CHECK-NEXT:    ldrsh.w r3, [r0, #122]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #16]
; CHECK-NEXT:    ldrsh.w r12, [r0, #124]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #24]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #32]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #40]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #48]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #56]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #64]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #72]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #80]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #88]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #96]
; CHECK-NEXT:    vaddva.s16 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #112]
; CHECK-NEXT:    ldrsh.w r0, [r0, #126]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    add r1, r2
; CHECK-NEXT:    add r1, r3
; CHECK-NEXT:    add r1, r12
; CHECK-NEXT:    add r0, r1
; CHECK-NEXT:    bx lr
entry:
  %0 = load <32 x i16>, ptr %x, align 2
  %1 = sext <32 x i16> %0 to <32 x i32>
  %arrayidx.32 = getelementptr inbounds i16, ptr %x, i32 32
  %2 = load <16 x i16>, ptr %arrayidx.32, align 2
  %3 = sext <16 x i16> %2 to <16 x i32>
  %arrayidx.48 = getelementptr inbounds i16, ptr %x, i32 48
  %4 = load <8 x i16>, ptr %arrayidx.48, align 2
  %5 = sext <8 x i16> %4 to <8 x i32>
  %arrayidx.56 = getelementptr inbounds i16, ptr %x, i32 56
  %6 = load <4 x i16>, ptr %arrayidx.56, align 2
  %7 = sext <4 x i16> %6 to <4 x i32>
  %arrayidx.60 = getelementptr inbounds i16, ptr %x, i32 60
  %8 = load i16, ptr %arrayidx.60, align 2
  %conv.60 = sext i16 %8 to i32
  %arrayidx.61 = getelementptr inbounds i16, ptr %x, i32 61
  %9 = load i16, ptr %arrayidx.61, align 2
  %conv.61 = sext i16 %9 to i32
  %arrayidx.62 = getelementptr inbounds i16, ptr %x, i32 62
  %10 = load i16, ptr %arrayidx.62, align 2
  %conv.62 = sext i16 %10 to i32
  %11 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1)
  %12 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %3)
  %op.rdx = add nsw i32 %11, %12
  %13 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %5)
  %op.rdx8 = add nsw i32 %op.rdx, %13
  %14 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %7)
  %op.rdx9 = add nsw i32 %op.rdx8, %14
  %15 = add nsw i32 %op.rdx9, %conv.60
  %16 = add nsw i32 %15, %conv.61
  %17 = add nsw i32 %16, %conv.62
  %arrayidx.63 = getelementptr inbounds i16, ptr %x, i32 63
  %18 = load i16, ptr %arrayidx.63, align 2
  %conv.63 = sext i16 %18 to i32
  %add.63 = add nsw i32 %17, %conv.63
  ret i32 %add.63
}

define i32 @addv128i32i16(ptr %x) {
; CHECK-LABEL: addv128i32i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q1, [r0]
; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
; CHECK-NEXT:    vaddv.s16 r2, q1
; CHECK-NEXT:    vaddva.s16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #32]
; CHECK-NEXT:    vaddva.s16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #48]
; CHECK-NEXT:    vaddva.s16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #64]
; CHECK-NEXT:    vaddva.s16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #80]
; CHECK-NEXT:    vaddva.s16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #96]
; CHECK-NEXT:    vaddva.s16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #112]
; CHECK-NEXT:    vaddva.s16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #128]
; CHECK-NEXT:    vaddva.s16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #144]
; CHECK-NEXT:    vaddva.s16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #160]
; CHECK-NEXT:    vaddva.s16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #176]
; CHECK-NEXT:    vaddva.s16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #192]
; CHECK-NEXT:    vaddva.s16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #208]
; CHECK-NEXT:    vaddva.s16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #224]
; CHECK-NEXT:    vaddva.s16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #240]
; CHECK-NEXT:    vaddva.s16 r2, q0
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <8 x i16>, ptr %x, align 2
  %0 = sext <8 x i16> %wide.load to <8 x i32>
  %1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %0)
  %2 = getelementptr inbounds i16, ptr %x, i32 8
  %wide.load.1 = load <8 x i16>, ptr %2, align 2
  %3 = sext <8 x i16> %wide.load.1 to <8 x i32>
  %4 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %3)
  %5 = add i32 %4, %1
  %6 = getelementptr inbounds i16, ptr %x, i32 16
  %wide.load.2 = load <8 x i16>, ptr %6, align 2
  %7 = sext <8 x i16> %wide.load.2 to <8 x i32>
  %8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %7)
  %9 = add i32 %8, %5
  %10 = getelementptr inbounds i16, ptr %x, i32 24
  %wide.load.3 = load <8 x i16>, ptr %10, align 2
  %11 = sext <8 x i16> %wide.load.3 to <8 x i32>
  %12 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %11)
  %13 = add i32 %12, %9
  %14 = getelementptr inbounds i16, ptr %x, i32 32
  %wide.load.4 = load <8 x i16>, ptr %14, align 2
  %15 = sext <8 x i16> %wide.load.4 to <8 x i32>
  %16 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %15)
  %17 = add i32 %16, %13
  %18 = getelementptr inbounds i16, ptr %x, i32 40
  %wide.load.5 = load <8 x i16>, ptr %18, align 2
  %19 = sext <8 x i16> %wide.load.5 to <8 x i32>
  %20 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %19)
  %21 = add i32 %20, %17
  %22 = getelementptr inbounds i16, ptr %x, i32 48
  %wide.load.6 = load <8 x i16>, ptr %22, align 2
  %23 = sext <8 x i16> %wide.load.6 to <8 x i32>
  %24 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %23)
  %25 = add i32 %24, %21
  %26 = getelementptr inbounds i16, ptr %x, i32 56
  %wide.load.7 = load <8 x i16>, ptr %26, align 2
  %27 = sext <8 x i16> %wide.load.7 to <8 x i32>
  %28 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %27)
  %29 = add i32 %28, %25
  %30 = getelementptr inbounds i16, ptr %x, i32 64
  %wide.load.8 = load <8 x i16>, ptr %30, align 2
  %31 = sext <8 x i16> %wide.load.8 to <8 x i32>
  %32 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %31)
  %33 = add i32 %32, %29
  %34 = getelementptr inbounds i16, ptr %x, i32 72
  %wide.load.9 = load <8 x i16>, ptr %34, align 2
  %35 = sext <8 x i16> %wide.load.9 to <8 x i32>
  %36 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %35)
  %37 = add i32 %36, %33
  %38 = getelementptr inbounds i16, ptr %x, i32 80
  %wide.load.10 = load <8 x i16>, ptr %38, align 2
  %39 = sext <8 x i16> %wide.load.10 to <8 x i32>
  %40 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %39)
  %41 = add i32 %40, %37
  %42 = getelementptr inbounds i16, ptr %x, i32 88
  %wide.load.11 = load <8 x i16>, ptr %42, align 2
  %43 = sext <8 x i16> %wide.load.11 to <8 x i32>
  %44 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %43)
  %45 = add i32 %44, %41
  %46 = getelementptr inbounds i16, ptr %x, i32 96
  %wide.load.12 = load <8 x i16>, ptr %46, align 2
  %47 = sext <8 x i16> %wide.load.12 to <8 x i32>
  %48 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %47)
  %49 = add i32 %48, %45
  %50 = getelementptr inbounds i16, ptr %x, i32 104
  %wide.load.13 = load <8 x i16>, ptr %50, align 2
  %51 = sext <8 x i16> %wide.load.13 to <8 x i32>
  %52 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %51)
  %53 = add i32 %52, %49
  %54 = getelementptr inbounds i16, ptr %x, i32 112
  %wide.load.14 = load <8 x i16>, ptr %54, align 2
  %55 = sext <8 x i16> %wide.load.14 to <8 x i32>
  %56 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %55)
  %57 = add i32 %56, %53
  %58 = getelementptr inbounds i16, ptr %x, i32 120
  %wide.load.15 = load <8 x i16>, ptr %58, align 2
  %59 = sext <8 x i16> %wide.load.15 to <8 x i32>
  %60 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %59)
  %61 = add i32 %60, %57
  ret i32 %61
}

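; The same i32 add reductions, but from i8 loads zero-extended to i32, using
; extending vldrb loads and the unsigned vaddv/vaddva forms.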
define i32 @addv2i32i8(ptr %x) {
; CHECK-LABEL: addv2i32i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrb r1, [r0]
; CHECK-NEXT:    ldrb r0, [r0, #1]
; CHECK-NEXT:    add r0, r1
; CHECK-NEXT:    bx lr
entry:
  %0 = load i8, ptr %x, align 1
  %conv = zext i8 %0 to i32
  %arrayidx.1 = getelementptr inbounds i8, ptr %x, i32 1
  %1 = load i8, ptr %arrayidx.1, align 1
  %conv.1 = zext i8 %1 to i32
  %add.1 = add nuw nsw i32 %conv, %conv.1
  ret i32 %add.1
}

define i32 @addv4i32i8(ptr %x) {
; CHECK-LABEL: addv4i32i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u32 q0, [r0]
; CHECK-NEXT:    vaddv.u32 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <4 x i8>, ptr %x, align 1
  %1 = zext <4 x i8> %0 to <4 x i32>
  %2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %1)
  ret i32 %2
}

define i32 @addv8i32i8(ptr %x) {
; CHECK-LABEL: addv8i32i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u16 q0, [r0]
; CHECK-NEXT:    vaddv.u16 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <8 x i8>, ptr %x, align 1
  %1 = zext <8 x i8> %0 to <8 x i32>
  %2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %1)
  ret i32 %2
}

define i32 @addv16i32i8(ptr %x) {
; CHECK-LABEL: addv16i32i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u8 q0, [r0]
; CHECK-NEXT:    vaddv.u8 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <16 x i8>, ptr %x, align 1
  %1 = zext <16 x i8> %0 to <16 x i32>
  %2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1)
  ret i32 %2
}

define i32 @addv24i32i8(ptr %x) {
; CHECK-LABEL: addv24i32i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u8 q1, [r0]
; CHECK-NEXT:    vldrb.u16 q0, [r0, #16]
; CHECK-NEXT:    vaddv.u8 r0, q1
; CHECK-NEXT:    vaddva.u16 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <16 x i8>, ptr %x, align 1
  %1 = zext <16 x i8> %0 to <16 x i32>
  %arrayidx.16 = getelementptr inbounds i8, ptr %x, i32 16
  %2 = load <8 x i8>, ptr %arrayidx.16, align 1
  %3 = zext <8 x i8> %2 to <8 x i32>
  %4 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1)
  %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %3)
  %op.rdx = add nuw nsw i32 %4, %5
  ret i32 %op.rdx
}

define i32 @addv32i32i8(ptr %x) {
; CHECK-LABEL: addv32i32i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u32 q1, [r0]
; CHECK-NEXT:    vldrb.u32 q0, [r0, #4]
; CHECK-NEXT:    vaddv.u32 r2, q1
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrb.u32 q0, [r0, #8]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrb.u32 q0, [r0, #12]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrb.u32 q0, [r0, #16]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrb.u32 q0, [r0, #20]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrb.u32 q0, [r0, #24]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrb.u32 q0, [r0, #28]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    bx lr
entry:
  %0 = load <32 x i8>, ptr %x, align 1
  %1 = zext <32 x i8> %0 to <32 x i32>
  %2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1)
  ret i32 %2
}

define i32 @addv64i32i8(ptr %x) {
; CHECK-LABEL: addv64i32i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u32 q1, [r0]
; CHECK-NEXT:    vldrb.u32 q0, [r0, #4]
; CHECK-NEXT:    ldrb.w r1, [r0, #60]
; CHECK-NEXT:    vaddv.u32 r2, q1
; CHECK-NEXT:    ldrb.w r3, [r0, #61]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrb.u32 q0, [r0, #8]
; CHECK-NEXT:    ldrb.w r12, [r0, #62]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrb.u32 q0, [r0, #12]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrb.u32 q0, [r0, #16]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrb.u32 q0, [r0, #20]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrb.u32 q0, [r0, #24]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrb.u32 q0, [r0, #28]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrb.u8 q0, [r0, #32]
; CHECK-NEXT:    vaddva.u8 r2, q0
; CHECK-NEXT:    vldrb.u16 q0, [r0, #48]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrb.u32 q0, [r0, #56]
; CHECK-NEXT:    ldrb.w r0, [r0, #63]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    add r1, r2
; CHECK-NEXT:    add r1, r3
; CHECK-NEXT:    add r1, r12
; CHECK-NEXT:    add r0, r1
; CHECK-NEXT:    bx lr
entry:
  %0 = load <32 x i8>, ptr %x, align 1
  %1 = zext <32 x i8> %0 to <32 x i32>
  %arrayidx.32 = getelementptr inbounds i8, ptr %x, i32 32
  %2 = load <16 x i8>, ptr %arrayidx.32, align 1
  %3 = zext <16 x i8> %2 to <16 x i32>
  %arrayidx.48 = getelementptr inbounds i8, ptr %x, i32 48
  %4 = load <8 x i8>, ptr %arrayidx.48, align 1
  %5 = zext <8 x i8> %4 to <8 x i32>
  %arrayidx.56 = getelementptr inbounds i8, ptr %x, i32 56
  %6 = load <4 x i8>, ptr %arrayidx.56, align 1
  %7 = zext <4 x i8> %6 to <4 x i32>
  %arrayidx.60 = getelementptr inbounds i8, ptr %x, i32 60
  %8 = load i8, ptr %arrayidx.60, align 1
  %conv.60 = zext i8 %8 to i32
  %arrayidx.61 = getelementptr inbounds i8, ptr %x, i32 61
  %9 = load i8, ptr %arrayidx.61, align 1
  %conv.61 = zext i8 %9 to i32
  %arrayidx.62 = getelementptr inbounds i8, ptr %x, i32 62
  %10 = load i8, ptr %arrayidx.62, align 1
  %conv.62 = zext i8 %10 to i32
  %11 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1)
  %12 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %3)
  %op.rdx = add nuw nsw i32 %11, %12
  %13 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %5)
  %op.rdx8 = add nuw nsw i32 %op.rdx, %13
  %14 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %7)
  %op.rdx9 = add nuw nsw i32 %op.rdx8, %14
  %15 = add nuw nsw i32 %op.rdx9, %conv.60
  %16 = add nuw nsw i32 %15, %conv.61
  %17 = add nuw nsw i32 %16, %conv.62
  %arrayidx.63 = getelementptr inbounds i8, ptr %x, i32 63
  %18 = load i8, ptr %arrayidx.63, align 1
  %conv.63 = zext i8 %18 to i32
  %add.63 = add nuw nsw i32 %17, %conv.63
  ret i32 %add.63
}

define i32 @addv128i32i8(ptr %x) {
; CHECK-LABEL: addv128i32i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u8 q1, [r0]
; CHECK-NEXT:    vldrb.u8 q0, [r0, #16]
; CHECK-NEXT:    mov r1, r0
; CHECK-NEXT:    vaddv.u8 r0, q1
; CHECK-NEXT:    vaddva.u8 r0, q0
; CHECK-NEXT:    vldrb.u8 q0, [r1, #32]
; CHECK-NEXT:    vaddva.u8 r0, q0
; CHECK-NEXT:    vldrb.u8 q0, [r1, #48]
; CHECK-NEXT:    vaddva.u8 r0, q0
; CHECK-NEXT:    vldrb.u8 q0, [r1, #64]
; CHECK-NEXT:    vaddva.u8 r0, q0
; CHECK-NEXT:    vldrb.u8 q0, [r1, #80]
; CHECK-NEXT:    vaddva.u8 r0, q0
; CHECK-NEXT:    vldrb.u8 q0, [r1, #96]
; CHECK-NEXT:    vaddva.u8 r0, q0
; CHECK-NEXT:    vldrb.u8 q0, [r1, #112]
; CHECK-NEXT:    vaddva.u8 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <16 x i8>, ptr %x, align 1
  %0 = zext <16 x i8> %wide.load to <16 x i32>
  %1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %0)
  %2 = getelementptr inbounds i8, ptr %x, i32 16
  %wide.load.1 = load <16 x i8>, ptr %2, align 1
  %3 = zext <16 x i8> %wide.load.1 to <16 x i32>
  %4 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %3)
  %5 = add i32 %4, %1
  %6 = getelementptr inbounds i8, ptr %x, i32 32
  %wide.load.2 = load <16 x i8>, ptr %6, align 1
  %7 = zext <16 x i8> %wide.load.2 to <16 x i32>
  %8 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %7)
  %9 = add i32 %8, %5
  %10 = getelementptr inbounds i8, ptr %x, i32 48
  %wide.load.3 = load <16 x i8>, ptr %10, align 1
  %11 = zext <16 x i8> %wide.load.3 to <16 x i32>
  %12 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %11)
  %13 = add i32 %12, %9
  %14 = getelementptr inbounds i8, ptr %x, i32 64
  %wide.load.4 = load <16 x i8>, ptr %14, align 1
  %15 = zext <16 x i8> %wide.load.4 to <16 x i32>
  %16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %15)
  %17 = add i32 %16, %13
  %18 = getelementptr inbounds i8, ptr %x, i32 80
  %wide.load.5 = load <16 x i8>, ptr %18, align 1
  %19 = zext <16 x i8> %wide.load.5 to <16 x i32>
  %20 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %19)
  %21 = add i32 %20, %17
  %22 = getelementptr inbounds i8, ptr %x, i32 96
  %wide.load.6 = load <16 x i8>, ptr %22, align 1
  %23 = zext <16 x i8> %wide.load.6 to <16 x i32>
  %24 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %23)
  %25 = add i32 %24, %21
  %26 = getelementptr inbounds i8, ptr %x, i32 112
  %wide.load.7 = load <16 x i8>, ptr %26, align 1
  %27 = zext <16 x i8> %wide.load.7 to <16 x i32>
  %28 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %27)
  %29 = add i32 %28, %25
  ret i32 %29
}

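; Reductions that both load and accumulate as i16, with the result
; sign-extended for the return.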
define signext i16 @addv2i16i16(ptr %x) {
; CHECK-LABEL: addv2i16i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrh r1, [r0]
; CHECK-NEXT:    ldrh r0, [r0, #2]
; CHECK-NEXT:    add r0, r1
; CHECK-NEXT:    sxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = load i16, ptr %x, align 2
  %arrayidx.1 = getelementptr inbounds i16, ptr %x, i32 1
  %1 = load i16, ptr %arrayidx.1, align 2
  %add.1 = add i16 %1, %0
  ret i16 %add.1
}

define signext i16 @addv4i16i16(ptr %x) {
; CHECK-LABEL: addv4i16i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u32 q0, [r0]
; CHECK-NEXT:    vaddv.u32 r0, q0
; CHECK-NEXT:    sxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <4 x i16>, ptr %x, align 2
  %1 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %0)
  ret i16 %1
}

define signext i16 @addv8i16i16(ptr %x) {
; CHECK-LABEL: addv8i16i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r0]
; CHECK-NEXT:    vaddv.u16 r0, q0
; CHECK-NEXT:    sxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <8 x i16>, ptr %x, align 2
  %1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %0)
  ret i16 %1
}

define signext i16 @addv16i16i16(ptr %x) {
; CHECK-LABEL: addv16i16i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q1, [r0]
; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
; CHECK-NEXT:    vaddv.u16 r0, q1
; CHECK-NEXT:    vaddva.u16 r0, q0
; CHECK-NEXT:    sxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <16 x i16>, ptr %x, align 2
  %1 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %0)
  ret i16 %1
}

define signext i16 @addv24i16i16(ptr %x) {
; CHECK-LABEL: addv24i16i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q1, [r0]
; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
; CHECK-NEXT:    vaddv.u16 r2, q1
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #32]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    sxth r0, r2
; CHECK-NEXT:    bx lr
entry:
  %0 = load <8 x i16>, ptr %x, align 2
  %arrayidx.8 = getelementptr inbounds i16, ptr %x, i32 8
  %1 = load <16 x i16>, ptr %arrayidx.8, align 2
  %2 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %1)
  %3 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %0)
  %op.rdx = add i16 %2, %3
  ret i16 %op.rdx
}

define signext i16 @addv32i16i16(ptr %x) {
; CHECK-LABEL: addv32i16i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q1, [r0]
; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
; CHECK-NEXT:    vaddv.u16 r2, q1
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #32]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #48]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    sxth r0, r2
; CHECK-NEXT:    bx lr
entry:
  %0 = load <32 x i16>, ptr %x, align 2
  %1 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %0)
  ret i16 %1
}

define signext i16 @addv64i16i16(ptr %x) {
; CHECK-LABEL: addv64i16i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q1, [r0]
; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
; CHECK-NEXT:    vaddv.u16 r2, q1
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #32]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #48]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #64]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #80]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #96]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #112]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    sxth r0, r2
; CHECK-NEXT:    bx lr
entry:
  %0 = load <64 x i16>, ptr %x, align 2
  %1 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> %0)
  ret i16 %1
}

define signext i16 @addv128i16i16(ptr %x) {
; CHECK-LABEL: addv128i16i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q1, [r0]
; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
; CHECK-NEXT:    vaddv.u16 r2, q1
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #32]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #48]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #64]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #80]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #96]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #112]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #128]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #144]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #160]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #176]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #192]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #208]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #224]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #240]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    sxth r0, r2
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <8 x i16>, ptr %x, align 2
  %0 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load)
  %1 = getelementptr inbounds i16, ptr %x, i32 8
  %wide.load.1 = load <8 x i16>, ptr %1, align 2
  %2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.1)
  %3 = add i16 %2, %0
  %4 = getelementptr inbounds i16, ptr %x, i32 16
  %wide.load.2 = load <8 x i16>, ptr %4, align 2
  %5 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.2)
  %6 = add i16 %5, %3
  %7 = getelementptr inbounds i16, ptr %x, i32 24
  %wide.load.3 = load <8 x i16>, ptr %7, align 2
  %8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.3)
  %9 = add i16 %8, %6
  %10 = getelementptr inbounds i16, ptr %x, i32 32
  %wide.load.4 = load <8 x i16>, ptr %10, align 2
  %11 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.4)
  %12 = add i16 %11, %9
  %13 = getelementptr inbounds i16, ptr %x, i32 40
  %wide.load.5 = load <8 x i16>, ptr %13, align 2
  %14 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.5)
  %15 = add i16 %14, %12
  %16 = getelementptr inbounds i16, ptr %x, i32 48
  %wide.load.6 = load <8 x i16>, ptr %16, align 2
  %17 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.6)
  %18 = add i16 %17, %15
  %19 = getelementptr inbounds i16, ptr %x, i32 56
  %wide.load.7 = load <8 x i16>, ptr %19, align 2
  %20 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.7)
  %21 = add i16 %20, %18
  %22 = getelementptr inbounds i16, ptr %x, i32 64
  %wide.load.8 = load <8 x i16>, ptr %22, align 2
  %23 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.8)
  %24 = add i16 %23, %21
  %25 = getelementptr inbounds i16, ptr %x, i32 72
  %wide.load.9 = load <8 x i16>, ptr %25, align 2
  %26 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.9)
  %27 = add i16 %26, %24
  %28 = getelementptr inbounds i16, ptr %x, i32 80
  %wide.load.10 = load <8 x i16>, ptr %28, align 2
  %29 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.10)
  %30 = add i16 %29, %27
  %31 = getelementptr inbounds i16, ptr %x, i32 88
  %wide.load.11 = load <8 x i16>, ptr %31, align 2
  %32 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.11)
  %33 = add i16 %32, %30
  %34 = getelementptr inbounds i16, ptr %x, i32 96
  %wide.load.12 = load <8 x i16>, ptr %34, align 2
  %35 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.12)
  %36 = add i16 %35, %33
  %37 = getelementptr inbounds i16, ptr %x, i32 104
  %wide.load.13 = load <8 x i16>, ptr %37, align 2
  %38 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.13)
  %39 = add i16 %38, %36
  %40 = getelementptr inbounds i16, ptr %x, i32 112
  %wide.load.14 = load <8 x i16>, ptr %40, align 2
  %41 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.14)
  %42 = add i16 %41, %39
  %43 = getelementptr inbounds i16, ptr %x, i32 120
  %wide.load.15 = load <8 x i16>, ptr %43, align 2
  %44 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.15)
  %45 = add i16 %44, %42
  ret i16 %45
}

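; Reductions that both load and accumulate as i8, with the result
; zero-extended for the return.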
define zeroext i8 @addv2i8i8(ptr %x) {
; CHECK-LABEL: addv2i8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrb r1, [r0]
; CHECK-NEXT:    ldrb r0, [r0, #1]
; CHECK-NEXT:    add r0, r1
; CHECK-NEXT:    uxtb r0, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = load i8, ptr %x, align 1
  %arrayidx.1 = getelementptr inbounds i8, ptr %x, i32 1
  %1 = load i8, ptr %arrayidx.1, align 1
  %add.1 = add i8 %1, %0
  ret i8 %add.1
}

define zeroext i8 @addv4i8i8(ptr %x) {
; CHECK-LABEL: addv4i8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u32 q0, [r0]
; CHECK-NEXT:    vaddv.u32 r0, q0
; CHECK-NEXT:    uxtb r0, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <4 x i8>, ptr %x, align 1
  %1 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %0)
  ret i8 %1
}

define zeroext i8 @addv8i8i8(ptr %x) {
; CHECK-LABEL: addv8i8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u16 q0, [r0]
; CHECK-NEXT:    vaddv.u16 r0, q0
; CHECK-NEXT:    uxtb r0, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <8 x i8>, ptr %x, align 1
  %1 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %0)
  ret i8 %1
}

define zeroext i8 @addv16i8i8(ptr %x) {
; CHECK-LABEL: addv16i8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u8 q0, [r0]
; CHECK-NEXT:    vaddv.u8 r0, q0
; CHECK-NEXT:    uxtb r0, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <16 x i8>, ptr %x, align 1
  %1 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %0)
  ret i8 %1
}

define zeroext i8 @addv24i8i8(ptr %x) {
; CHECK-LABEL: addv24i8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u16 q1, [r0]
; CHECK-NEXT:    vldrb.u8 q0, [r0, #8]
; CHECK-NEXT:    vaddv.u16 r0, q1
; CHECK-NEXT:    vaddva.u8 r0, q0
; CHECK-NEXT:    uxtb r0, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <8 x i8>, ptr %x, align 1
  %arrayidx.8 = getelementptr inbounds i8, ptr %x, i32 8
  %1 = load <16 x i8>, ptr %arrayidx.8, align 1
  %2 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %1)
  %3 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %0)
  %op.rdx = add i8 %2, %3
  ret i8 %op.rdx
}

define zeroext i8 @addv32i8i8(ptr %x) {
; CHECK-LABEL: addv32i8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u8 q1, [r0]
; CHECK-NEXT:    vldrb.u8 q0, [r0, #16]
; CHECK-NEXT:    vaddv.u8 r0, q1
; CHECK-NEXT:    vaddva.u8 r0, q0
; CHECK-NEXT:    uxtb r0, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <32 x i8>, ptr %x, align 1
  %1 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %0)
  ret i8 %1
}

define zeroext i8 @addv64i8i8(ptr %x) {
; CHECK-LABEL: addv64i8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u8 q1, [r0]
; CHECK-NEXT:    vldrb.u8 q0, [r0, #16]
; CHECK-NEXT:    vaddv.u8 r2, q1
; CHECK-NEXT:    vaddva.u8 r2, q0
; CHECK-NEXT:    vldrb.u8 q0, [r0, #32]
; CHECK-NEXT:    vaddva.u8 r2, q0
; CHECK-NEXT:    vldrb.u8 q0, [r0, #48]
; CHECK-NEXT:    vaddva.u8 r2, q0
; CHECK-NEXT:    uxtb r0, r2
; CHECK-NEXT:    bx lr
entry:
  %0 = load <64 x i8>, ptr %x, align 1
  %1 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> %0)
  ret i8 %1
}

define zeroext i8 @addv128i8i8(ptr %x) {
; CHECK-LABEL: addv128i8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u8 q1, [r0]
; CHECK-NEXT:    vldrb.u8 q0, [r0, #16]
; CHECK-NEXT:    vaddv.u8 r2, q1
; CHECK-NEXT:    vaddva.u8 r2, q0
; CHECK-NEXT:    vldrb.u8 q0, [r0, #32]
; CHECK-NEXT:    vaddva.u8 r2, q0
; CHECK-NEXT:    vldrb.u8 q0, [r0, #48]
; CHECK-NEXT:    vaddva.u8 r2, q0
; CHECK-NEXT:    vldrb.u8 q0, [r0, #64]
; CHECK-NEXT:    vaddva.u8 r2, q0
; CHECK-NEXT:    vldrb.u8 q0, [r0, #80]
; CHECK-NEXT:    vaddva.u8 r2, q0
; CHECK-NEXT:    vldrb.u8 q0, [r0, #96]
; CHECK-NEXT:    vaddva.u8 r2, q0
; CHECK-NEXT:    vldrb.u8 q0, [r0, #112]
; CHECK-NEXT:    vaddva.u8 r2, q0
; CHECK-NEXT:    uxtb r0, r2
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <16 x i8>, ptr %x, align 1
  %0 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load)
  %1 = getelementptr inbounds i8, ptr %x, i32 16
  %wide.load.1 = load <16 x i8>, ptr %1, align 1
  %2 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.1)
  %3 = add i8 %2, %0
  %4 = getelementptr inbounds i8, ptr %x, i32 32
  %wide.load.2 = load <16 x i8>, ptr %4, align 1
  %5 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.2)
  %6 = add i8 %5, %3
  %7 = getelementptr inbounds i8, ptr %x, i32 48
  %wide.load.3 = load <16 x i8>, ptr %7, align 1
  %8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.3)
  %9 = add i8 %8, %6
  %10 = getelementptr inbounds i8, ptr %x, i32 64
  %wide.load.4 = load <16 x i8>, ptr %10, align 1
  %11 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.4)
  %12 = add i8 %11, %9
  %13 = getelementptr inbounds i8, ptr %x, i32 80
  %wide.load.5 = load <16 x i8>, ptr %13, align 1
  %14 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.5)
  %15 = add i8 %14, %12
  %16 = getelementptr inbounds i8, ptr %x, i32 96
  %wide.load.6 = load <16 x i8>, ptr %16, align 1
  %17 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.6)
  %18 = add i8 %17, %15
  %19 = getelementptr inbounds i8, ptr %x, i32 112
  %wide.load.7 = load <16 x i8>, ptr %19, align 1
  %20 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.7)
  %21 = add i8 %20, %18
  ret i8 %21
}
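; Multiply-accumulate reductions: two loaded vectors are multiplied lane-wise
; and then add-reduced, which the checks below show selecting VMLAV/VMLAVA.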
1308define i32 @mlav2i32i32(ptr %x, ptr %y) {
1309; CHECK-LABEL: mlav2i32i32:
1310; CHECK:       @ %bb.0: @ %entry
1311; CHECK-NEXT:    ldrd r0, r2, [r0]
1312; CHECK-NEXT:    ldrd r1, r3, [r1]
1313; CHECK-NEXT:    muls r0, r1, r0
1314; CHECK-NEXT:    mla r0, r3, r2, r0
1315; CHECK-NEXT:    bx lr
1316entry:
1317  %0 = load i32, ptr %x, align 4
1318  %1 = load i32, ptr %y, align 4
1319  %mul = mul nsw i32 %1, %0
1320  %arrayidx.1 = getelementptr inbounds i32, ptr %x, i32 1
1321  %2 = load i32, ptr %arrayidx.1, align 4
1322  %arrayidx1.1 = getelementptr inbounds i32, ptr %y, i32 1
1323  %3 = load i32, ptr %arrayidx1.1, align 4
1324  %mul.1 = mul nsw i32 %3, %2
1325  %add.1 = add nsw i32 %mul.1, %mul
1326  ret i32 %add.1
1327}
1328
1329define i32 @mlav4i32i32(ptr %x, ptr %y) {
1330; CHECK-LABEL: mlav4i32i32:
1331; CHECK:       @ %bb.0: @ %entry
1332; CHECK-NEXT:    vldrw.u32 q0, [r0]
1333; CHECK-NEXT:    vldrw.u32 q1, [r1]
1334; CHECK-NEXT:    vmlav.u32 r0, q1, q0
1335; CHECK-NEXT:    bx lr
1336entry:
1337  %0 = load <4 x i32>, ptr %x, align 4
1338  %1 = load <4 x i32>, ptr %y, align 4
1339  %2 = mul nsw <4 x i32> %1, %0
1340  %3 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %2)
1341  ret i32 %3
1342}
1343
1344define i32 @mlav8i32i32(ptr %x, ptr %y) {
1345; CHECK-LABEL: mlav8i32i32:
1346; CHECK:       @ %bb.0: @ %entry
1347; CHECK-NEXT:    vldrw.u32 q0, [r0]
1348; CHECK-NEXT:    vldrw.u32 q1, [r1]
1349; CHECK-NEXT:    vmlav.u32 r2, q1, q0
1350; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
1351; CHECK-NEXT:    vldrw.u32 q1, [r1, #16]
1352; CHECK-NEXT:    vmlava.u32 r2, q1, q0
1353; CHECK-NEXT:    mov r0, r2
1354; CHECK-NEXT:    bx lr
1355entry:
1356  %0 = load <8 x i32>, ptr %x, align 4
1357  %1 = load <8 x i32>, ptr %y, align 4
1358  %2 = mul nsw <8 x i32> %1, %0
1359  %3 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %2)
1360  ret i32 %3
1361}
1362
1363define i32 @mlav16i32i32(ptr %x, ptr %y) {
1364; CHECK-LABEL: mlav16i32i32:
1365; CHECK:       @ %bb.0: @ %entry
1366; CHECK-NEXT:    vldrw.u32 q0, [r0]
1367; CHECK-NEXT:    vldrw.u32 q1, [r1]
1368; CHECK-NEXT:    vmlav.u32 r2, q1, q0
1369; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
1370; CHECK-NEXT:    vldrw.u32 q1, [r1, #16]
1371; CHECK-NEXT:    vmlava.u32 r2, q1, q0
1372; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
1373; CHECK-NEXT:    vldrw.u32 q1, [r1, #32]
1374; CHECK-NEXT:    vmlava.u32 r2, q1, q0
1375; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
1376; CHECK-NEXT:    vldrw.u32 q1, [r1, #48]
1377; CHECK-NEXT:    vmlava.u32 r2, q1, q0
1378; CHECK-NEXT:    mov r0, r2
1379; CHECK-NEXT:    bx lr
1380entry:
1381  %0 = load <16 x i32>, ptr %x, align 4
1382  %1 = load <16 x i32>, ptr %y, align 4
1383  %2 = mul nsw <16 x i32> %1, %0
1384  %3 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %2)
1385  ret i32 %3
1386}
1387
1388define i32 @mlav24i32i32(ptr %x, ptr %y) {
1389; CHECK-LABEL: mlav24i32i32:
1390; CHECK:       @ %bb.0: @ %entry
1391; CHECK-NEXT:    vldrw.u32 q0, [r0]
1392; CHECK-NEXT:    vldrw.u32 q1, [r1]
1393; CHECK-NEXT:    mov r2, r0
1394; CHECK-NEXT:    vmlav.u32 r0, q1, q0
1395; CHECK-NEXT:    vldrw.u32 q0, [r2, #16]
1396; CHECK-NEXT:    vldrw.u32 q1, [r1, #16]
1397; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1398; CHECK-NEXT:    vldrw.u32 q0, [r2, #32]
1399; CHECK-NEXT:    vldrw.u32 q1, [r1, #32]
1400; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1401; CHECK-NEXT:    vldrw.u32 q0, [r2, #48]
1402; CHECK-NEXT:    vldrw.u32 q1, [r1, #48]
1403; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1404; CHECK-NEXT:    vldrw.u32 q0, [r2, #64]
1405; CHECK-NEXT:    vldrw.u32 q1, [r1, #64]
1406; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1407; CHECK-NEXT:    vldrw.u32 q0, [r2, #80]
1408; CHECK-NEXT:    vldrw.u32 q1, [r1, #80]
1409; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1410; CHECK-NEXT:    bx lr
1411entry:
1412  %0 = load <8 x i32>, ptr %x, align 4
1413  %1 = load <8 x i32>, ptr %y, align 4
1414  %2 = mul nsw <8 x i32> %1, %0
1415  %arrayidx.8 = getelementptr inbounds i32, ptr %x, i32 8
1416  %arrayidx1.8 = getelementptr inbounds i32, ptr %y, i32 8
1417  %3 = load <16 x i32>, ptr %arrayidx.8, align 4
1418  %4 = load <16 x i32>, ptr %arrayidx1.8, align 4
1419  %5 = mul nsw <16 x i32> %4, %3
1420  %6 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %5)
1421  %7 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %2)
1422  %op.rdx = add nsw i32 %6, %7
1423  ret i32 %op.rdx
1424}
1425
define i32 @mlav32i32i32(ptr %x, ptr %y) {
; CHECK-LABEL: mlav32i32i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    mov r2, r0
; CHECK-NEXT:    vmlav.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #16]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #16]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #32]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #32]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #48]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #48]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #64]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #64]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #80]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #80]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #96]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #96]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #112]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #112]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <32 x i32>, ptr %x, align 4
  %1 = load <32 x i32>, ptr %y, align 4
  %2 = mul nsw <32 x i32> %1, %0
  %3 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %2)
  ret i32 %3
}

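; At 64 elements the IR is already unrolled into sixteen v4i32
; load/mul/reduce/add steps. Each partial reduction still merges into the
; accumulating vmlava.u32 chain, with offsets up to #240.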
define i32 @mlav64i32i32(ptr %x, ptr %y) {
; CHECK-LABEL: mlav64i32i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    mov r2, r0
; CHECK-NEXT:    vmlav.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #16]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #16]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #32]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #32]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #48]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #48]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #64]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #64]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #80]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #80]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #96]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #96]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #112]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #112]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #128]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #128]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #144]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #144]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #160]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #160]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #176]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #176]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #192]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #192]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #208]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #208]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #224]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #224]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #240]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #240]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <4 x i32>, ptr %x, align 4
  %wide.load10 = load <4 x i32>, ptr %y, align 4
  %0 = mul nsw <4 x i32> %wide.load10, %wide.load
  %1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %0)
  %2 = getelementptr inbounds i32, ptr %x, i32 4
  %wide.load.1 = load <4 x i32>, ptr %2, align 4
  %3 = getelementptr inbounds i32, ptr %y, i32 4
  %wide.load10.1 = load <4 x i32>, ptr %3, align 4
  %4 = mul nsw <4 x i32> %wide.load10.1, %wide.load.1
  %5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %4)
  %6 = add i32 %5, %1
  %7 = getelementptr inbounds i32, ptr %x, i32 8
  %wide.load.2 = load <4 x i32>, ptr %7, align 4
  %8 = getelementptr inbounds i32, ptr %y, i32 8
  %wide.load10.2 = load <4 x i32>, ptr %8, align 4
  %9 = mul nsw <4 x i32> %wide.load10.2, %wide.load.2
  %10 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %9)
  %11 = add i32 %10, %6
  %12 = getelementptr inbounds i32, ptr %x, i32 12
  %wide.load.3 = load <4 x i32>, ptr %12, align 4
  %13 = getelementptr inbounds i32, ptr %y, i32 12
  %wide.load10.3 = load <4 x i32>, ptr %13, align 4
  %14 = mul nsw <4 x i32> %wide.load10.3, %wide.load.3
  %15 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %14)
  %16 = add i32 %15, %11
  %17 = getelementptr inbounds i32, ptr %x, i32 16
  %wide.load.4 = load <4 x i32>, ptr %17, align 4
  %18 = getelementptr inbounds i32, ptr %y, i32 16
  %wide.load10.4 = load <4 x i32>, ptr %18, align 4
  %19 = mul nsw <4 x i32> %wide.load10.4, %wide.load.4
  %20 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %19)
  %21 = add i32 %20, %16
  %22 = getelementptr inbounds i32, ptr %x, i32 20
  %wide.load.5 = load <4 x i32>, ptr %22, align 4
  %23 = getelementptr inbounds i32, ptr %y, i32 20
  %wide.load10.5 = load <4 x i32>, ptr %23, align 4
  %24 = mul nsw <4 x i32> %wide.load10.5, %wide.load.5
  %25 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %24)
  %26 = add i32 %25, %21
  %27 = getelementptr inbounds i32, ptr %x, i32 24
  %wide.load.6 = load <4 x i32>, ptr %27, align 4
  %28 = getelementptr inbounds i32, ptr %y, i32 24
  %wide.load10.6 = load <4 x i32>, ptr %28, align 4
  %29 = mul nsw <4 x i32> %wide.load10.6, %wide.load.6
  %30 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %29)
  %31 = add i32 %30, %26
  %32 = getelementptr inbounds i32, ptr %x, i32 28
  %wide.load.7 = load <4 x i32>, ptr %32, align 4
  %33 = getelementptr inbounds i32, ptr %y, i32 28
  %wide.load10.7 = load <4 x i32>, ptr %33, align 4
  %34 = mul nsw <4 x i32> %wide.load10.7, %wide.load.7
  %35 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %34)
  %36 = add i32 %35, %31
  %37 = getelementptr inbounds i32, ptr %x, i32 32
  %wide.load.8 = load <4 x i32>, ptr %37, align 4
  %38 = getelementptr inbounds i32, ptr %y, i32 32
  %wide.load10.8 = load <4 x i32>, ptr %38, align 4
  %39 = mul nsw <4 x i32> %wide.load10.8, %wide.load.8
  %40 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %39)
  %41 = add i32 %40, %36
  %42 = getelementptr inbounds i32, ptr %x, i32 36
  %wide.load.9 = load <4 x i32>, ptr %42, align 4
  %43 = getelementptr inbounds i32, ptr %y, i32 36
  %wide.load10.9 = load <4 x i32>, ptr %43, align 4
  %44 = mul nsw <4 x i32> %wide.load10.9, %wide.load.9
  %45 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %44)
  %46 = add i32 %45, %41
  %47 = getelementptr inbounds i32, ptr %x, i32 40
  %wide.load.10 = load <4 x i32>, ptr %47, align 4
  %48 = getelementptr inbounds i32, ptr %y, i32 40
  %wide.load10.10 = load <4 x i32>, ptr %48, align 4
  %49 = mul nsw <4 x i32> %wide.load10.10, %wide.load.10
  %50 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %49)
  %51 = add i32 %50, %46
  %52 = getelementptr inbounds i32, ptr %x, i32 44
  %wide.load.11 = load <4 x i32>, ptr %52, align 4
  %53 = getelementptr inbounds i32, ptr %y, i32 44
  %wide.load10.11 = load <4 x i32>, ptr %53, align 4
  %54 = mul nsw <4 x i32> %wide.load10.11, %wide.load.11
  %55 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %54)
  %56 = add i32 %55, %51
  %57 = getelementptr inbounds i32, ptr %x, i32 48
  %wide.load.12 = load <4 x i32>, ptr %57, align 4
  %58 = getelementptr inbounds i32, ptr %y, i32 48
  %wide.load10.12 = load <4 x i32>, ptr %58, align 4
  %59 = mul nsw <4 x i32> %wide.load10.12, %wide.load.12
  %60 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %59)
  %61 = add i32 %60, %56
  %62 = getelementptr inbounds i32, ptr %x, i32 52
  %wide.load.13 = load <4 x i32>, ptr %62, align 4
  %63 = getelementptr inbounds i32, ptr %y, i32 52
  %wide.load10.13 = load <4 x i32>, ptr %63, align 4
  %64 = mul nsw <4 x i32> %wide.load10.13, %wide.load.13
  %65 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %64)
  %66 = add i32 %65, %61
  %67 = getelementptr inbounds i32, ptr %x, i32 56
  %wide.load.14 = load <4 x i32>, ptr %67, align 4
  %68 = getelementptr inbounds i32, ptr %y, i32 56
  %wide.load10.14 = load <4 x i32>, ptr %68, align 4
  %69 = mul nsw <4 x i32> %wide.load10.14, %wide.load.14
  %70 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %69)
  %71 = add i32 %70, %66
  %72 = getelementptr inbounds i32, ptr %x, i32 60
  %wide.load.15 = load <4 x i32>, ptr %72, align 4
  %73 = getelementptr inbounds i32, ptr %y, i32 60
  %wide.load10.15 = load <4 x i32>, ptr %73, align 4
  %74 = mul nsw <4 x i32> %wide.load10.15, %wide.load.15
  %75 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %74)
  %76 = add i32 %75, %71
  ret i32 %76
}

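; The same at 128 elements: thirty-two chunks, the last loads at offset #496,
; which still fits the vldrw immediate offset range.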
define i32 @mlav128i32i32(ptr %x, ptr %y) {
; CHECK-LABEL: mlav128i32i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    mov r2, r0
; CHECK-NEXT:    vmlav.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #16]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #16]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #32]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #32]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #48]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #48]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #64]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #64]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #80]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #80]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #96]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #96]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #112]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #112]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #128]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #128]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #144]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #144]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #160]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #160]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #176]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #176]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #192]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #192]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #208]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #208]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #224]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #224]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #240]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #240]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #256]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #256]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #272]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #272]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #288]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #288]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #304]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #304]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #320]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #320]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #336]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #336]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #352]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #352]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #368]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #368]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #384]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #384]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #400]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #400]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #416]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #416]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #432]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #432]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #448]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #448]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #464]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #464]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #480]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #480]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #496]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #496]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <4 x i32>, ptr %x, align 4
  %wide.load10 = load <4 x i32>, ptr %y, align 4
  %0 = mul nsw <4 x i32> %wide.load10, %wide.load
  %1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %0)
  %2 = getelementptr inbounds i32, ptr %x, i32 4
  %wide.load.1 = load <4 x i32>, ptr %2, align 4
  %3 = getelementptr inbounds i32, ptr %y, i32 4
  %wide.load10.1 = load <4 x i32>, ptr %3, align 4
  %4 = mul nsw <4 x i32> %wide.load10.1, %wide.load.1
  %5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %4)
  %6 = add i32 %5, %1
  %7 = getelementptr inbounds i32, ptr %x, i32 8
  %wide.load.2 = load <4 x i32>, ptr %7, align 4
  %8 = getelementptr inbounds i32, ptr %y, i32 8
  %wide.load10.2 = load <4 x i32>, ptr %8, align 4
  %9 = mul nsw <4 x i32> %wide.load10.2, %wide.load.2
  %10 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %9)
  %11 = add i32 %10, %6
  %12 = getelementptr inbounds i32, ptr %x, i32 12
  %wide.load.3 = load <4 x i32>, ptr %12, align 4
  %13 = getelementptr inbounds i32, ptr %y, i32 12
  %wide.load10.3 = load <4 x i32>, ptr %13, align 4
  %14 = mul nsw <4 x i32> %wide.load10.3, %wide.load.3
  %15 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %14)
  %16 = add i32 %15, %11
  %17 = getelementptr inbounds i32, ptr %x, i32 16
  %wide.load.4 = load <4 x i32>, ptr %17, align 4
  %18 = getelementptr inbounds i32, ptr %y, i32 16
  %wide.load10.4 = load <4 x i32>, ptr %18, align 4
  %19 = mul nsw <4 x i32> %wide.load10.4, %wide.load.4
  %20 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %19)
  %21 = add i32 %20, %16
  %22 = getelementptr inbounds i32, ptr %x, i32 20
  %wide.load.5 = load <4 x i32>, ptr %22, align 4
  %23 = getelementptr inbounds i32, ptr %y, i32 20
  %wide.load10.5 = load <4 x i32>, ptr %23, align 4
  %24 = mul nsw <4 x i32> %wide.load10.5, %wide.load.5
  %25 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %24)
  %26 = add i32 %25, %21
  %27 = getelementptr inbounds i32, ptr %x, i32 24
  %wide.load.6 = load <4 x i32>, ptr %27, align 4
  %28 = getelementptr inbounds i32, ptr %y, i32 24
  %wide.load10.6 = load <4 x i32>, ptr %28, align 4
  %29 = mul nsw <4 x i32> %wide.load10.6, %wide.load.6
  %30 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %29)
  %31 = add i32 %30, %26
  %32 = getelementptr inbounds i32, ptr %x, i32 28
  %wide.load.7 = load <4 x i32>, ptr %32, align 4
  %33 = getelementptr inbounds i32, ptr %y, i32 28
  %wide.load10.7 = load <4 x i32>, ptr %33, align 4
  %34 = mul nsw <4 x i32> %wide.load10.7, %wide.load.7
  %35 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %34)
  %36 = add i32 %35, %31
  %37 = getelementptr inbounds i32, ptr %x, i32 32
  %wide.load.8 = load <4 x i32>, ptr %37, align 4
  %38 = getelementptr inbounds i32, ptr %y, i32 32
  %wide.load10.8 = load <4 x i32>, ptr %38, align 4
  %39 = mul nsw <4 x i32> %wide.load10.8, %wide.load.8
  %40 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %39)
  %41 = add i32 %40, %36
  %42 = getelementptr inbounds i32, ptr %x, i32 36
  %wide.load.9 = load <4 x i32>, ptr %42, align 4
  %43 = getelementptr inbounds i32, ptr %y, i32 36
  %wide.load10.9 = load <4 x i32>, ptr %43, align 4
  %44 = mul nsw <4 x i32> %wide.load10.9, %wide.load.9
  %45 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %44)
  %46 = add i32 %45, %41
  %47 = getelementptr inbounds i32, ptr %x, i32 40
  %wide.load.10 = load <4 x i32>, ptr %47, align 4
  %48 = getelementptr inbounds i32, ptr %y, i32 40
  %wide.load10.10 = load <4 x i32>, ptr %48, align 4
  %49 = mul nsw <4 x i32> %wide.load10.10, %wide.load.10
  %50 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %49)
  %51 = add i32 %50, %46
  %52 = getelementptr inbounds i32, ptr %x, i32 44
  %wide.load.11 = load <4 x i32>, ptr %52, align 4
  %53 = getelementptr inbounds i32, ptr %y, i32 44
  %wide.load10.11 = load <4 x i32>, ptr %53, align 4
  %54 = mul nsw <4 x i32> %wide.load10.11, %wide.load.11
  %55 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %54)
  %56 = add i32 %55, %51
  %57 = getelementptr inbounds i32, ptr %x, i32 48
  %wide.load.12 = load <4 x i32>, ptr %57, align 4
  %58 = getelementptr inbounds i32, ptr %y, i32 48
  %wide.load10.12 = load <4 x i32>, ptr %58, align 4
  %59 = mul nsw <4 x i32> %wide.load10.12, %wide.load.12
  %60 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %59)
  %61 = add i32 %60, %56
  %62 = getelementptr inbounds i32, ptr %x, i32 52
  %wide.load.13 = load <4 x i32>, ptr %62, align 4
  %63 = getelementptr inbounds i32, ptr %y, i32 52
  %wide.load10.13 = load <4 x i32>, ptr %63, align 4
  %64 = mul nsw <4 x i32> %wide.load10.13, %wide.load.13
  %65 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %64)
  %66 = add i32 %65, %61
  %67 = getelementptr inbounds i32, ptr %x, i32 56
  %wide.load.14 = load <4 x i32>, ptr %67, align 4
  %68 = getelementptr inbounds i32, ptr %y, i32 56
  %wide.load10.14 = load <4 x i32>, ptr %68, align 4
  %69 = mul nsw <4 x i32> %wide.load10.14, %wide.load.14
  %70 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %69)
  %71 = add i32 %70, %66
  %72 = getelementptr inbounds i32, ptr %x, i32 60
  %wide.load.15 = load <4 x i32>, ptr %72, align 4
  %73 = getelementptr inbounds i32, ptr %y, i32 60
  %wide.load10.15 = load <4 x i32>, ptr %73, align 4
  %74 = mul nsw <4 x i32> %wide.load10.15, %wide.load.15
  %75 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %74)
  %76 = add i32 %75, %71
  %77 = getelementptr inbounds i32, ptr %x, i32 64
  %wide.load.16 = load <4 x i32>, ptr %77, align 4
  %78 = getelementptr inbounds i32, ptr %y, i32 64
  %wide.load10.16 = load <4 x i32>, ptr %78, align 4
  %79 = mul nsw <4 x i32> %wide.load10.16, %wide.load.16
  %80 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %79)
  %81 = add i32 %80, %76
  %82 = getelementptr inbounds i32, ptr %x, i32 68
  %wide.load.17 = load <4 x i32>, ptr %82, align 4
  %83 = getelementptr inbounds i32, ptr %y, i32 68
  %wide.load10.17 = load <4 x i32>, ptr %83, align 4
  %84 = mul nsw <4 x i32> %wide.load10.17, %wide.load.17
  %85 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %84)
  %86 = add i32 %85, %81
  %87 = getelementptr inbounds i32, ptr %x, i32 72
  %wide.load.18 = load <4 x i32>, ptr %87, align 4
  %88 = getelementptr inbounds i32, ptr %y, i32 72
  %wide.load10.18 = load <4 x i32>, ptr %88, align 4
  %89 = mul nsw <4 x i32> %wide.load10.18, %wide.load.18
  %90 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %89)
  %91 = add i32 %90, %86
  %92 = getelementptr inbounds i32, ptr %x, i32 76
  %wide.load.19 = load <4 x i32>, ptr %92, align 4
  %93 = getelementptr inbounds i32, ptr %y, i32 76
  %wide.load10.19 = load <4 x i32>, ptr %93, align 4
  %94 = mul nsw <4 x i32> %wide.load10.19, %wide.load.19
  %95 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %94)
  %96 = add i32 %95, %91
  %97 = getelementptr inbounds i32, ptr %x, i32 80
  %wide.load.20 = load <4 x i32>, ptr %97, align 4
  %98 = getelementptr inbounds i32, ptr %y, i32 80
  %wide.load10.20 = load <4 x i32>, ptr %98, align 4
  %99 = mul nsw <4 x i32> %wide.load10.20, %wide.load.20
  %100 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %99)
  %101 = add i32 %100, %96
  %102 = getelementptr inbounds i32, ptr %x, i32 84
  %wide.load.21 = load <4 x i32>, ptr %102, align 4
  %103 = getelementptr inbounds i32, ptr %y, i32 84
  %wide.load10.21 = load <4 x i32>, ptr %103, align 4
  %104 = mul nsw <4 x i32> %wide.load10.21, %wide.load.21
  %105 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %104)
  %106 = add i32 %105, %101
  %107 = getelementptr inbounds i32, ptr %x, i32 88
  %wide.load.22 = load <4 x i32>, ptr %107, align 4
  %108 = getelementptr inbounds i32, ptr %y, i32 88
  %wide.load10.22 = load <4 x i32>, ptr %108, align 4
  %109 = mul nsw <4 x i32> %wide.load10.22, %wide.load.22
  %110 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %109)
  %111 = add i32 %110, %106
  %112 = getelementptr inbounds i32, ptr %x, i32 92
  %wide.load.23 = load <4 x i32>, ptr %112, align 4
  %113 = getelementptr inbounds i32, ptr %y, i32 92
  %wide.load10.23 = load <4 x i32>, ptr %113, align 4
  %114 = mul nsw <4 x i32> %wide.load10.23, %wide.load.23
  %115 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %114)
  %116 = add i32 %115, %111
  %117 = getelementptr inbounds i32, ptr %x, i32 96
  %wide.load.24 = load <4 x i32>, ptr %117, align 4
  %118 = getelementptr inbounds i32, ptr %y, i32 96
  %wide.load10.24 = load <4 x i32>, ptr %118, align 4
  %119 = mul nsw <4 x i32> %wide.load10.24, %wide.load.24
  %120 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %119)
  %121 = add i32 %120, %116
  %122 = getelementptr inbounds i32, ptr %x, i32 100
  %wide.load.25 = load <4 x i32>, ptr %122, align 4
  %123 = getelementptr inbounds i32, ptr %y, i32 100
  %wide.load10.25 = load <4 x i32>, ptr %123, align 4
  %124 = mul nsw <4 x i32> %wide.load10.25, %wide.load.25
  %125 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %124)
  %126 = add i32 %125, %121
  %127 = getelementptr inbounds i32, ptr %x, i32 104
  %wide.load.26 = load <4 x i32>, ptr %127, align 4
  %128 = getelementptr inbounds i32, ptr %y, i32 104
  %wide.load10.26 = load <4 x i32>, ptr %128, align 4
  %129 = mul nsw <4 x i32> %wide.load10.26, %wide.load.26
  %130 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %129)
  %131 = add i32 %130, %126
  %132 = getelementptr inbounds i32, ptr %x, i32 108
  %wide.load.27 = load <4 x i32>, ptr %132, align 4
  %133 = getelementptr inbounds i32, ptr %y, i32 108
  %wide.load10.27 = load <4 x i32>, ptr %133, align 4
  %134 = mul nsw <4 x i32> %wide.load10.27, %wide.load.27
  %135 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %134)
  %136 = add i32 %135, %131
  %137 = getelementptr inbounds i32, ptr %x, i32 112
  %wide.load.28 = load <4 x i32>, ptr %137, align 4
  %138 = getelementptr inbounds i32, ptr %y, i32 112
  %wide.load10.28 = load <4 x i32>, ptr %138, align 4
  %139 = mul nsw <4 x i32> %wide.load10.28, %wide.load.28
  %140 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %139)
  %141 = add i32 %140, %136
  %142 = getelementptr inbounds i32, ptr %x, i32 116
  %wide.load.29 = load <4 x i32>, ptr %142, align 4
  %143 = getelementptr inbounds i32, ptr %y, i32 116
  %wide.load10.29 = load <4 x i32>, ptr %143, align 4
  %144 = mul nsw <4 x i32> %wide.load10.29, %wide.load.29
  %145 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %144)
  %146 = add i32 %145, %141
  %147 = getelementptr inbounds i32, ptr %x, i32 120
  %wide.load.30 = load <4 x i32>, ptr %147, align 4
  %148 = getelementptr inbounds i32, ptr %y, i32 120
  %wide.load10.30 = load <4 x i32>, ptr %148, align 4
  %149 = mul nsw <4 x i32> %wide.load10.30, %wide.load.30
  %150 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %149)
  %151 = add i32 %150, %146
  %152 = getelementptr inbounds i32, ptr %x, i32 124
  %wide.load.31 = load <4 x i32>, ptr %152, align 4
  %153 = getelementptr inbounds i32, ptr %y, i32 124
  %wide.load10.31 = load <4 x i32>, ptr %153, align 4
  %154 = mul nsw <4 x i32> %wide.load10.31, %wide.load.31
  %155 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %154)
  %156 = add i32 %155, %151
  ret i32 %156
}

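; Two sign-extended i16 products are cheaper as scalars: ldrsh loads, a muls
; and an smlabb.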
define i32 @mlav2i32i16(ptr %x, ptr %y) {
; CHECK-LABEL: mlav2i32i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrsh.w r2, [r0]
; CHECK-NEXT:    ldrsh.w r3, [r1]
; CHECK-NEXT:    ldrsh.w r0, [r0, #2]
; CHECK-NEXT:    ldrsh.w r1, [r1, #2]
; CHECK-NEXT:    muls r0, r1, r0
; CHECK-NEXT:    smlabb r0, r3, r2, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = load i16, ptr %x, align 2
  %conv = sext i16 %0 to i32
  %1 = load i16, ptr %y, align 2
  %conv2 = sext i16 %1 to i32
  %mul = mul nsw i32 %conv2, %conv
  %arrayidx.1 = getelementptr inbounds i16, ptr %x, i32 1
  %2 = load i16, ptr %arrayidx.1, align 2
  %conv.1 = sext i16 %2 to i32
  %arrayidx1.1 = getelementptr inbounds i16, ptr %y, i32 1
  %3 = load i16, ptr %arrayidx1.1, align 2
  %conv2.1 = sext i16 %3 to i32
  %mul.1 = mul nsw i32 %conv2.1, %conv.1
  %add.1 = add nsw i32 %mul.1, %mul
  ret i32 %add.1
}

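; Four i16s use widening vldrh.s32 loads, so the sext is folded into the load
; and a 32-bit vmlav forms the dot product.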
define i32 @mlav4i32i16(ptr %x, ptr %y) {
; CHECK-LABEL: mlav4i32i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q0, [r0]
; CHECK-NEXT:    vldrh.s32 q1, [r1]
; CHECK-NEXT:    vmlav.u32 r0, q1, q0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <4 x i16>, ptr %x, align 2
  %1 = sext <4 x i16> %0 to <4 x i32>
  %2 = load <4 x i16>, ptr %y, align 2
  %3 = sext <4 x i16> %2 to <4 x i32>
  %4 = mul nsw <4 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %4)
  ret i32 %5
}

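; Eight i16s fill a q register, letting a single vmlav.s16 do the extend,
; multiply and reduce in one op.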
define i32 @mlav8i32i16(ptr %x, ptr %y) {
; CHECK-LABEL: mlav8i32i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r0]
; CHECK-NEXT:    vldrh.u16 q1, [r1]
; CHECK-NEXT:    vmlav.s16 r0, q1, q0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <8 x i16>, ptr %x, align 2
  %1 = sext <8 x i16> %0 to <8 x i32>
  %2 = load <8 x i16>, ptr %y, align 2
  %3 = sext <8 x i16> %2 to <8 x i32>
  %4 = mul nsw <8 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
  ret i32 %5
}

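; Sixteen i16s are currently lowered as four sext-to-i32 chunks
; (vldrh.s32 + vmlava.u32) rather than two full-width vmlav.s16 ops.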
define i32 @mlav16i32i16(ptr %x, ptr %y) {
; CHECK-LABEL: mlav16i32i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q0, [r0]
; CHECK-NEXT:    vldrh.s32 q1, [r1]
; CHECK-NEXT:    vmlav.u32 r2, q1, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #8]
; CHECK-NEXT:    vldrh.s32 q1, [r1, #8]
; CHECK-NEXT:    vmlava.u32 r2, q1, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #16]
; CHECK-NEXT:    vldrh.s32 q1, [r1, #16]
; CHECK-NEXT:    vmlava.u32 r2, q1, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #24]
; CHECK-NEXT:    vldrh.s32 q1, [r1, #24]
; CHECK-NEXT:    vmlava.u32 r2, q1, q0
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    bx lr
entry:
  %0 = load <16 x i16>, ptr %x, align 2
  %1 = sext <16 x i16> %0 to <16 x i32>
  %2 = load <16 x i16>, ptr %y, align 2
  %3 = sext <16 x i16> %2 to <16 x i32>
  %4 = mul nsw <16 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %4)
  ret i32 %5
}

define i32 @mlav24i32i16(ptr %x, ptr %y) {
; CHECK-LABEL: mlav24i32i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r0]
; CHECK-NEXT:    vldrh.u16 q1, [r1]
; CHECK-NEXT:    mov r2, r0
; CHECK-NEXT:    vmlav.s16 r0, q1, q0
; CHECK-NEXT:    vldrh.s32 q0, [r2, #16]
; CHECK-NEXT:    vldrh.s32 q1, [r1, #16]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrh.s32 q0, [r2, #24]
; CHECK-NEXT:    vldrh.s32 q1, [r1, #24]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrh.s32 q0, [r2, #32]
; CHECK-NEXT:    vldrh.s32 q1, [r1, #32]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrh.s32 q0, [r2, #40]
; CHECK-NEXT:    vldrh.s32 q1, [r1, #40]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <8 x i16>, ptr %x, align 2
  %1 = sext <8 x i16> %0 to <8 x i32>
  %2 = load <8 x i16>, ptr %y, align 2
  %3 = sext <8 x i16> %2 to <8 x i32>
  %4 = mul nsw <8 x i32> %3, %1
  %arrayidx.8 = getelementptr inbounds i16, ptr %x, i32 8
  %arrayidx1.8 = getelementptr inbounds i16, ptr %y, i32 8
  %5 = load <16 x i16>, ptr %arrayidx.8, align 2
  %6 = sext <16 x i16> %5 to <16 x i32>
  %7 = load <16 x i16>, ptr %arrayidx1.8, align 2
  %8 = sext <16 x i16> %7 to <16 x i32>
  %9 = mul nsw <16 x i32> %8, %6
  %10 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %9)
  %11 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
  %op.rdx = add nsw i32 %10, %11
  ret i32 %op.rdx
}

define i32 @mlav32i32i16(ptr %x, ptr %y) {
; CHECK-LABEL: mlav32i32i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q0, [r0]
; CHECK-NEXT:    vldrh.s32 q1, [r1]
; CHECK-NEXT:    mov r2, r0
; CHECK-NEXT:    vmlav.u32 r0, q1, q0
; CHECK-NEXT:    vldrh.s32 q0, [r2, #8]
; CHECK-NEXT:    vldrh.s32 q1, [r1, #8]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrh.s32 q0, [r2, #16]
; CHECK-NEXT:    vldrh.s32 q1, [r1, #16]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrh.s32 q0, [r2, #24]
; CHECK-NEXT:    vldrh.s32 q1, [r1, #24]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrh.s32 q0, [r2, #32]
; CHECK-NEXT:    vldrh.s32 q1, [r1, #32]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrh.s32 q0, [r2, #40]
; CHECK-NEXT:    vldrh.s32 q1, [r1, #40]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrh.s32 q0, [r2, #48]
; CHECK-NEXT:    vldrh.s32 q1, [r1, #48]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrh.s32 q0, [r2, #56]
; CHECK-NEXT:    vldrh.s32 q1, [r1, #56]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <32 x i16>, ptr %x, align 2
  %1 = sext <32 x i16> %0 to <32 x i32>
  %2 = load <32 x i16>, ptr %y, align 2
  %3 = sext <32 x i16> %2 to <32 x i32>
  %4 = mul nsw <32 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %4)
  ret i32 %5
}

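; At 64 elements the unrolled v8i32 partials map directly onto whole-register
; vmlava.s16 ops, one per 16-byte chunk, and the same holds at 128.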
define i32 @mlav64i32i16(ptr %x, ptr %y) {
; CHECK-LABEL: mlav64i32i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r0]
; CHECK-NEXT:    vldrh.u16 q1, [r1]
; CHECK-NEXT:    mov r2, r0
; CHECK-NEXT:    vmlav.s16 r0, q1, q0
; CHECK-NEXT:    vldrh.u16 q0, [r2, #16]
; CHECK-NEXT:    vldrh.u16 q1, [r1, #16]
; CHECK-NEXT:    vmlava.s16 r0, q1, q0
; CHECK-NEXT:    vldrh.u16 q0, [r2, #32]
; CHECK-NEXT:    vldrh.u16 q1, [r1, #32]
; CHECK-NEXT:    vmlava.s16 r0, q1, q0
; CHECK-NEXT:    vldrh.u16 q0, [r2, #48]
; CHECK-NEXT:    vldrh.u16 q1, [r1, #48]
; CHECK-NEXT:    vmlava.s16 r0, q1, q0
; CHECK-NEXT:    vldrh.u16 q0, [r2, #64]
; CHECK-NEXT:    vldrh.u16 q1, [r1, #64]
; CHECK-NEXT:    vmlava.s16 r0, q1, q0
; CHECK-NEXT:    vldrh.u16 q0, [r2, #80]
; CHECK-NEXT:    vldrh.u16 q1, [r1, #80]
; CHECK-NEXT:    vmlava.s16 r0, q1, q0
; CHECK-NEXT:    vldrh.u16 q0, [r2, #96]
; CHECK-NEXT:    vldrh.u16 q1, [r1, #96]
; CHECK-NEXT:    vmlava.s16 r0, q1, q0
; CHECK-NEXT:    vldrh.u16 q0, [r2, #112]
; CHECK-NEXT:    vldrh.u16 q1, [r1, #112]
; CHECK-NEXT:    vmlava.s16 r0, q1, q0
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <8 x i16>, ptr %x, align 2
  %0 = sext <8 x i16> %wide.load to <8 x i32>
  %wide.load11 = load <8 x i16>, ptr %y, align 2
  %1 = sext <8 x i16> %wide.load11 to <8 x i32>
  %2 = mul nsw <8 x i32> %1, %0
  %3 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %2)
  %4 = getelementptr inbounds i16, ptr %x, i32 8
  %wide.load.1 = load <8 x i16>, ptr %4, align 2
  %5 = sext <8 x i16> %wide.load.1 to <8 x i32>
  %6 = getelementptr inbounds i16, ptr %y, i32 8
  %wide.load11.1 = load <8 x i16>, ptr %6, align 2
  %7 = sext <8 x i16> %wide.load11.1 to <8 x i32>
  %8 = mul nsw <8 x i32> %7, %5
  %9 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %8)
  %10 = add i32 %9, %3
  %11 = getelementptr inbounds i16, ptr %x, i32 16
  %wide.load.2 = load <8 x i16>, ptr %11, align 2
  %12 = sext <8 x i16> %wide.load.2 to <8 x i32>
  %13 = getelementptr inbounds i16, ptr %y, i32 16
  %wide.load11.2 = load <8 x i16>, ptr %13, align 2
  %14 = sext <8 x i16> %wide.load11.2 to <8 x i32>
  %15 = mul nsw <8 x i32> %14, %12
  %16 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %15)
  %17 = add i32 %16, %10
  %18 = getelementptr inbounds i16, ptr %x, i32 24
  %wide.load.3 = load <8 x i16>, ptr %18, align 2
  %19 = sext <8 x i16> %wide.load.3 to <8 x i32>
  %20 = getelementptr inbounds i16, ptr %y, i32 24
  %wide.load11.3 = load <8 x i16>, ptr %20, align 2
  %21 = sext <8 x i16> %wide.load11.3 to <8 x i32>
  %22 = mul nsw <8 x i32> %21, %19
  %23 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %22)
  %24 = add i32 %23, %17
  %25 = getelementptr inbounds i16, ptr %x, i32 32
  %wide.load.4 = load <8 x i16>, ptr %25, align 2
  %26 = sext <8 x i16> %wide.load.4 to <8 x i32>
  %27 = getelementptr inbounds i16, ptr %y, i32 32
  %wide.load11.4 = load <8 x i16>, ptr %27, align 2
  %28 = sext <8 x i16> %wide.load11.4 to <8 x i32>
  %29 = mul nsw <8 x i32> %28, %26
  %30 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %29)
  %31 = add i32 %30, %24
  %32 = getelementptr inbounds i16, ptr %x, i32 40
  %wide.load.5 = load <8 x i16>, ptr %32, align 2
  %33 = sext <8 x i16> %wide.load.5 to <8 x i32>
  %34 = getelementptr inbounds i16, ptr %y, i32 40
  %wide.load11.5 = load <8 x i16>, ptr %34, align 2
  %35 = sext <8 x i16> %wide.load11.5 to <8 x i32>
  %36 = mul nsw <8 x i32> %35, %33
  %37 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %36)
  %38 = add i32 %37, %31
  %39 = getelementptr inbounds i16, ptr %x, i32 48
  %wide.load.6 = load <8 x i16>, ptr %39, align 2
  %40 = sext <8 x i16> %wide.load.6 to <8 x i32>
  %41 = getelementptr inbounds i16, ptr %y, i32 48
  %wide.load11.6 = load <8 x i16>, ptr %41, align 2
  %42 = sext <8 x i16> %wide.load11.6 to <8 x i32>
  %43 = mul nsw <8 x i32> %42, %40
  %44 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %43)
  %45 = add i32 %44, %38
  %46 = getelementptr inbounds i16, ptr %x, i32 56
  %wide.load.7 = load <8 x i16>, ptr %46, align 2
  %47 = sext <8 x i16> %wide.load.7 to <8 x i32>
  %48 = getelementptr inbounds i16, ptr %y, i32 56
  %wide.load11.7 = load <8 x i16>, ptr %48, align 2
  %49 = sext <8 x i16> %wide.load11.7 to <8 x i32>
  %50 = mul nsw <8 x i32> %49, %47
  %51 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %50)
  %52 = add i32 %51, %45
  ret i32 %52
}

define i32 @mlav128i32i16(ptr %x, ptr %y) {
; CHECK-LABEL: mlav128i32i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r0]
; CHECK-NEXT:    vldrh.u16 q1, [r1]
; CHECK-NEXT:    mov r2, r0
; CHECK-NEXT:    vmlav.s16 r0, q1, q0
; CHECK-NEXT:    vldrh.u16 q0, [r2, #16]
; CHECK-NEXT:    vldrh.u16 q1, [r1, #16]
; CHECK-NEXT:    vmlava.s16 r0, q1, q0
; CHECK-NEXT:    vldrh.u16 q0, [r2, #32]
; CHECK-NEXT:    vldrh.u16 q1, [r1, #32]
; CHECK-NEXT:    vmlava.s16 r0, q1, q0
; CHECK-NEXT:    vldrh.u16 q0, [r2, #48]
; CHECK-NEXT:    vldrh.u16 q1, [r1, #48]
; CHECK-NEXT:    vmlava.s16 r0, q1, q0
; CHECK-NEXT:    vldrh.u16 q0, [r2, #64]
; CHECK-NEXT:    vldrh.u16 q1, [r1, #64]
; CHECK-NEXT:    vmlava.s16 r0, q1, q0
; CHECK-NEXT:    vldrh.u16 q0, [r2, #80]
; CHECK-NEXT:    vldrh.u16 q1, [r1, #80]
; CHECK-NEXT:    vmlava.s16 r0, q1, q0
; CHECK-NEXT:    vldrh.u16 q0, [r2, #96]
; CHECK-NEXT:    vldrh.u16 q1, [r1, #96]
; CHECK-NEXT:    vmlava.s16 r0, q1, q0
; CHECK-NEXT:    vldrh.u16 q0, [r2, #112]
; CHECK-NEXT:    vldrh.u16 q1, [r1, #112]
; CHECK-NEXT:    vmlava.s16 r0, q1, q0
; CHECK-NEXT:    vldrh.u16 q0, [r2, #128]
; CHECK-NEXT:    vldrh.u16 q1, [r1, #128]
; CHECK-NEXT:    vmlava.s16 r0, q1, q0
; CHECK-NEXT:    vldrh.u16 q0, [r2, #144]
; CHECK-NEXT:    vldrh.u16 q1, [r1, #144]
; CHECK-NEXT:    vmlava.s16 r0, q1, q0
; CHECK-NEXT:    vldrh.u16 q0, [r2, #160]
; CHECK-NEXT:    vldrh.u16 q1, [r1, #160]
; CHECK-NEXT:    vmlava.s16 r0, q1, q0
; CHECK-NEXT:    vldrh.u16 q0, [r2, #176]
; CHECK-NEXT:    vldrh.u16 q1, [r1, #176]
; CHECK-NEXT:    vmlava.s16 r0, q1, q0
; CHECK-NEXT:    vldrh.u16 q0, [r2, #192]
; CHECK-NEXT:    vldrh.u16 q1, [r1, #192]
; CHECK-NEXT:    vmlava.s16 r0, q1, q0
; CHECK-NEXT:    vldrh.u16 q0, [r2, #208]
; CHECK-NEXT:    vldrh.u16 q1, [r1, #208]
; CHECK-NEXT:    vmlava.s16 r0, q1, q0
; CHECK-NEXT:    vldrh.u16 q0, [r2, #224]
; CHECK-NEXT:    vldrh.u16 q1, [r1, #224]
; CHECK-NEXT:    vmlava.s16 r0, q1, q0
; CHECK-NEXT:    vldrh.u16 q0, [r2, #240]
; CHECK-NEXT:    vldrh.u16 q1, [r1, #240]
; CHECK-NEXT:    vmlava.s16 r0, q1, q0
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <8 x i16>, ptr %x, align 2
  %0 = sext <8 x i16> %wide.load to <8 x i32>
  %wide.load11 = load <8 x i16>, ptr %y, align 2
  %1 = sext <8 x i16> %wide.load11 to <8 x i32>
  %2 = mul nsw <8 x i32> %1, %0
  %3 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %2)
  %4 = getelementptr inbounds i16, ptr %x, i32 8
  %wide.load.1 = load <8 x i16>, ptr %4, align 2
  %5 = sext <8 x i16> %wide.load.1 to <8 x i32>
  %6 = getelementptr inbounds i16, ptr %y, i32 8
  %wide.load11.1 = load <8 x i16>, ptr %6, align 2
  %7 = sext <8 x i16> %wide.load11.1 to <8 x i32>
  %8 = mul nsw <8 x i32> %7, %5
  %9 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %8)
  %10 = add i32 %9, %3
  %11 = getelementptr inbounds i16, ptr %x, i32 16
  %wide.load.2 = load <8 x i16>, ptr %11, align 2
  %12 = sext <8 x i16> %wide.load.2 to <8 x i32>
  %13 = getelementptr inbounds i16, ptr %y, i32 16
  %wide.load11.2 = load <8 x i16>, ptr %13, align 2
  %14 = sext <8 x i16> %wide.load11.2 to <8 x i32>
  %15 = mul nsw <8 x i32> %14, %12
  %16 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %15)
  %17 = add i32 %16, %10
  %18 = getelementptr inbounds i16, ptr %x, i32 24
  %wide.load.3 = load <8 x i16>, ptr %18, align 2
  %19 = sext <8 x i16> %wide.load.3 to <8 x i32>
  %20 = getelementptr inbounds i16, ptr %y, i32 24
  %wide.load11.3 = load <8 x i16>, ptr %20, align 2
  %21 = sext <8 x i16> %wide.load11.3 to <8 x i32>
  %22 = mul nsw <8 x i32> %21, %19
  %23 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %22)
  %24 = add i32 %23, %17
  %25 = getelementptr inbounds i16, ptr %x, i32 32
  %wide.load.4 = load <8 x i16>, ptr %25, align 2
  %26 = sext <8 x i16> %wide.load.4 to <8 x i32>
  %27 = getelementptr inbounds i16, ptr %y, i32 32
  %wide.load11.4 = load <8 x i16>, ptr %27, align 2
  %28 = sext <8 x i16> %wide.load11.4 to <8 x i32>
  %29 = mul nsw <8 x i32> %28, %26
  %30 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %29)
  %31 = add i32 %30, %24
  %32 = getelementptr inbounds i16, ptr %x, i32 40
  %wide.load.5 = load <8 x i16>, ptr %32, align 2
  %33 = sext <8 x i16> %wide.load.5 to <8 x i32>
  %34 = getelementptr inbounds i16, ptr %y, i32 40
  %wide.load11.5 = load <8 x i16>, ptr %34, align 2
  %35 = sext <8 x i16> %wide.load11.5 to <8 x i32>
  %36 = mul nsw <8 x i32> %35, %33
  %37 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %36)
  %38 = add i32 %37, %31
  %39 = getelementptr inbounds i16, ptr %x, i32 48
  %wide.load.6 = load <8 x i16>, ptr %39, align 2
  %40 = sext <8 x i16> %wide.load.6 to <8 x i32>
  %41 = getelementptr inbounds i16, ptr %y, i32 48
  %wide.load11.6 = load <8 x i16>, ptr %41, align 2
  %42 = sext <8 x i16> %wide.load11.6 to <8 x i32>
  %43 = mul nsw <8 x i32> %42, %40
  %44 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %43)
  %45 = add i32 %44, %38
  %46 = getelementptr inbounds i16, ptr %x, i32 56
  %wide.load.7 = load <8 x i16>, ptr %46, align 2
  %47 = sext <8 x i16> %wide.load.7 to <8 x i32>
  %48 = getelementptr inbounds i16, ptr %y, i32 56
  %wide.load11.7 = load <8 x i16>, ptr %48, align 2
  %49 = sext <8 x i16> %wide.load11.7 to <8 x i32>
  %50 = mul nsw <8 x i32> %49, %47
  %51 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %50)
  %52 = add i32 %51, %45
  %53 = getelementptr inbounds i16, ptr %x, i32 64
  %wide.load.8 = load <8 x i16>, ptr %53, align 2
  %54 = sext <8 x i16> %wide.load.8 to <8 x i32>
  %55 = getelementptr inbounds i16, ptr %y, i32 64
  %wide.load11.8 = load <8 x i16>, ptr %55, align 2
  %56 = sext <8 x i16> %wide.load11.8 to <8 x i32>
  %57 = mul nsw <8 x i32> %56, %54
  %58 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %57)
  %59 = add i32 %58, %52
  %60 = getelementptr inbounds i16, ptr %x, i32 72
  %wide.load.9 = load <8 x i16>, ptr %60, align 2
  %61 = sext <8 x i16> %wide.load.9 to <8 x i32>
  %62 = getelementptr inbounds i16, ptr %y, i32 72
  %wide.load11.9 = load <8 x i16>, ptr %62, align 2
  %63 = sext <8 x i16> %wide.load11.9 to <8 x i32>
  %64 = mul nsw <8 x i32> %63, %61
  %65 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %64)
  %66 = add i32 %65, %59
  %67 = getelementptr inbounds i16, ptr %x, i32 80
  %wide.load.10 = load <8 x i16>, ptr %67, align 2
  %68 = sext <8 x i16> %wide.load.10 to <8 x i32>
  %69 = getelementptr inbounds i16, ptr %y, i32 80
  %wide.load11.10 = load <8 x i16>, ptr %69, align 2
  %70 = sext <8 x i16> %wide.load11.10 to <8 x i32>
  %71 = mul nsw <8 x i32> %70, %68
  %72 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %71)
  %73 = add i32 %72, %66
  %74 = getelementptr inbounds i16, ptr %x, i32 88
  %wide.load.11 = load <8 x i16>, ptr %74, align 2
  %75 = sext <8 x i16> %wide.load.11 to <8 x i32>
  %76 = getelementptr inbounds i16, ptr %y, i32 88
  %wide.load11.11 = load <8 x i16>, ptr %76, align 2
  %77 = sext <8 x i16> %wide.load11.11 to <8 x i32>
  %78 = mul nsw <8 x i32> %77, %75
  %79 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %78)
  %80 = add i32 %79, %73
  %81 = getelementptr inbounds i16, ptr %x, i32 96
  %wide.load.12 = load <8 x i16>, ptr %81, align 2
  %82 = sext <8 x i16> %wide.load.12 to <8 x i32>
  %83 = getelementptr inbounds i16, ptr %y, i32 96
  %wide.load11.12 = load <8 x i16>, ptr %83, align 2
  %84 = sext <8 x i16> %wide.load11.12 to <8 x i32>
  %85 = mul nsw <8 x i32> %84, %82
  %86 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %85)
  %87 = add i32 %86, %80
  %88 = getelementptr inbounds i16, ptr %x, i32 104
  %wide.load.13 = load <8 x i16>, ptr %88, align 2
  %89 = sext <8 x i16> %wide.load.13 to <8 x i32>
  %90 = getelementptr inbounds i16, ptr %y, i32 104
  %wide.load11.13 = load <8 x i16>, ptr %90, align 2
  %91 = sext <8 x i16> %wide.load11.13 to <8 x i32>
  %92 = mul nsw <8 x i32> %91, %89
  %93 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %92)
  %94 = add i32 %93, %87
  %95 = getelementptr inbounds i16, ptr %x, i32 112
  %wide.load.14 = load <8 x i16>, ptr %95, align 2
  %96 = sext <8 x i16> %wide.load.14 to <8 x i32>
  %97 = getelementptr inbounds i16, ptr %y, i32 112
  %wide.load11.14 = load <8 x i16>, ptr %97, align 2
  %98 = sext <8 x i16> %wide.load11.14 to <8 x i32>
  %99 = mul nsw <8 x i32> %98, %96
  %100 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %99)
  %101 = add i32 %100, %94
  %102 = getelementptr inbounds i16, ptr %x, i32 120
  %wide.load.15 = load <8 x i16>, ptr %102, align 2
  %103 = sext <8 x i16> %wide.load.15 to <8 x i32>
  %104 = getelementptr inbounds i16, ptr %y, i32 120
  %wide.load11.15 = load <8 x i16>, ptr %104, align 2
  %105 = sext <8 x i16> %wide.load11.15 to <8 x i32>
  %106 = mul nsw <8 x i32> %105, %103
  %107 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %106)
  %108 = add i32 %107, %101
  ret i32 %108
}

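; Two i8 products stay scalar: ldrb loads combined with muls and smlabb
; (the zero-extended bytes are valid as signed halfword operands).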
define i32 @mlav2i32i8(ptr %x, ptr %y) {
; CHECK-LABEL: mlav2i32i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrb r2, [r0]
; CHECK-NEXT:    ldrb r3, [r1]
; CHECK-NEXT:    ldrb r0, [r0, #1]
; CHECK-NEXT:    ldrb r1, [r1, #1]
; CHECK-NEXT:    muls r0, r1, r0
; CHECK-NEXT:    smlabb r0, r3, r2, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = load i8, ptr %x, align 1
  %conv = zext i8 %0 to i32
  %1 = load i8, ptr %y, align 1
  %conv2 = zext i8 %1 to i32
  %mul = mul nuw nsw i32 %conv2, %conv
  %arrayidx.1 = getelementptr inbounds i8, ptr %x, i32 1
  %2 = load i8, ptr %arrayidx.1, align 1
  %conv.1 = zext i8 %2 to i32
  %arrayidx1.1 = getelementptr inbounds i8, ptr %y, i32 1
  %3 = load i8, ptr %arrayidx1.1, align 1
  %conv2.1 = zext i8 %3 to i32
  %mul.1 = mul nuw nsw i32 %conv2.1, %conv.1
  %add.1 = add nuw nsw i32 %mul.1, %mul
  ret i32 %add.1
}

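; Four and eight i8s use widening vldrb.u32/vldrb.u16 loads so a single vmlav
; covers each reduction; sixteen fill a whole q register for vmlav.u8.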
define i32 @mlav4i32i8(ptr %x, ptr %y) {
; CHECK-LABEL: mlav4i32i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u32 q0, [r0]
; CHECK-NEXT:    vldrb.u32 q1, [r1]
; CHECK-NEXT:    vmlav.u32 r0, q1, q0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <4 x i8>, ptr %x, align 1
  %1 = zext <4 x i8> %0 to <4 x i32>
  %2 = load <4 x i8>, ptr %y, align 1
  %3 = zext <4 x i8> %2 to <4 x i32>
  %4 = mul nuw nsw <4 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %4)
  ret i32 %5
}

define i32 @mlav8i32i8(ptr %x, ptr %y) {
; CHECK-LABEL: mlav8i32i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u16 q0, [r0]
; CHECK-NEXT:    vldrb.u16 q1, [r1]
; CHECK-NEXT:    vmlav.u16 r0, q1, q0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <8 x i8>, ptr %x, align 1
  %1 = zext <8 x i8> %0 to <8 x i32>
  %2 = load <8 x i8>, ptr %y, align 1
  %3 = zext <8 x i8> %2 to <8 x i32>
  %4 = mul nuw nsw <8 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
  ret i32 %5
}

define i32 @mlav16i32i8(ptr %x, ptr %y) {
; CHECK-LABEL: mlav16i32i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u8 q0, [r0]
; CHECK-NEXT:    vldrb.u8 q1, [r1]
; CHECK-NEXT:    vmlav.u8 r0, q1, q0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <16 x i8>, ptr %x, align 1
  %1 = zext <16 x i8> %0 to <16 x i32>
  %2 = load <16 x i8>, ptr %y, align 1
  %3 = zext <16 x i8> %2 to <16 x i32>
  %4 = mul nuw nsw <16 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %4)
  ret i32 %5
}

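; The 24 element case mixes widths: vmlav.u16 over the first eight bytes,
; then an accumulating vmlava.u8 over the remaining sixteen.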
define i32 @mlav24i32i8(ptr %x, ptr %y) {
; CHECK-LABEL: mlav24i32i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u16 q0, [r0]
; CHECK-NEXT:    vldrb.u16 q1, [r1]
; CHECK-NEXT:    vmlav.u16 r2, q1, q0
; CHECK-NEXT:    vldrb.u8 q0, [r0, #8]
; CHECK-NEXT:    vldrb.u8 q1, [r1, #8]
; CHECK-NEXT:    vmlava.u8 r2, q1, q0
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    bx lr
entry:
  %0 = load <8 x i8>, ptr %x, align 1
  %1 = zext <8 x i8> %0 to <8 x i32>
  %2 = load <8 x i8>, ptr %y, align 1
  %3 = zext <8 x i8> %2 to <8 x i32>
  %4 = mul nuw nsw <8 x i32> %3, %1
  %arrayidx.8 = getelementptr inbounds i8, ptr %x, i32 8
  %arrayidx1.8 = getelementptr inbounds i8, ptr %y, i32 8
  %5 = load <16 x i8>, ptr %arrayidx.8, align 1
  %6 = zext <16 x i8> %5 to <16 x i32>
  %7 = load <16 x i8>, ptr %arrayidx1.8, align 1
  %8 = zext <16 x i8> %7 to <16 x i32>
  %9 = mul nuw nsw <16 x i32> %8, %6
  %10 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %9)
  %11 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
  %op.rdx = add nuw nsw i32 %10, %11
  ret i32 %op.rdx
}

define i32 @mlav32i32i8(ptr %x, ptr %y) {
; CHECK-LABEL: mlav32i32i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u32 q0, [r0]
; CHECK-NEXT:    vldrb.u32 q1, [r1]
; CHECK-NEXT:    mov r2, r0
; CHECK-NEXT:    vmlav.u32 r0, q1, q0
; CHECK-NEXT:    vldrb.u32 q0, [r2, #4]
; CHECK-NEXT:    vldrb.u32 q1, [r1, #4]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrb.u32 q0, [r2, #8]
; CHECK-NEXT:    vldrb.u32 q1, [r1, #8]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrb.u32 q0, [r2, #12]
; CHECK-NEXT:    vldrb.u32 q1, [r1, #12]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrb.u32 q0, [r2, #16]
; CHECK-NEXT:    vldrb.u32 q1, [r1, #16]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrb.u32 q0, [r2, #20]
; CHECK-NEXT:    vldrb.u32 q1, [r1, #20]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrb.u32 q0, [r2, #24]
; CHECK-NEXT:    vldrb.u32 q1, [r1, #24]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrb.u32 q0, [r2, #28]
; CHECK-NEXT:    vldrb.u32 q1, [r1, #28]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <32 x i8>, ptr %x, align 1
  %1 = zext <32 x i8> %0 to <32 x i32>
  %2 = load <32 x i8>, ptr %y, align 1
  %3 = zext <32 x i8> %2 to <32 x i32>
  %4 = mul nuw nsw <32 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %4)
  ret i32 %5
}

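; 64 and 128 i8 elements reduce with vmlav.u8/vmlava.u8 over full 16-byte
; chunks, each accumulating into the 32-bit scalar result.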
2568define i32 @mlav64i32i8(ptr %x, ptr %y) {
2569; CHECK-LABEL: mlav64i32i8:
2570; CHECK:       @ %bb.0: @ %entry
2571; CHECK-NEXT:    vldrb.u8 q0, [r0]
2572; CHECK-NEXT:    vldrb.u8 q1, [r1]
2573; CHECK-NEXT:    vmlav.u8 r2, q1, q0
2574; CHECK-NEXT:    vldrb.u8 q0, [r0, #16]
2575; CHECK-NEXT:    vldrb.u8 q1, [r1, #16]
2576; CHECK-NEXT:    vmlava.u8 r2, q1, q0
2577; CHECK-NEXT:    vldrb.u8 q0, [r0, #32]
2578; CHECK-NEXT:    vldrb.u8 q1, [r1, #32]
2579; CHECK-NEXT:    vmlava.u8 r2, q1, q0
2580; CHECK-NEXT:    vldrb.u8 q0, [r0, #48]
2581; CHECK-NEXT:    vldrb.u8 q1, [r1, #48]
2582; CHECK-NEXT:    vmlava.u8 r2, q1, q0
2583; CHECK-NEXT:    mov r0, r2
2584; CHECK-NEXT:    bx lr
2585entry:
2586  %wide.load = load <16 x i8>, ptr %x, align 1
2587  %0 = zext <16 x i8> %wide.load to <16 x i32>
2588  %wide.load11 = load <16 x i8>, ptr %y, align 1
2589  %1 = zext <16 x i8> %wide.load11 to <16 x i32>
2590  %2 = mul nuw nsw <16 x i32> %1, %0
2591  %3 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %2)
2592  %4 = getelementptr inbounds i8, ptr %x, i32 16
2593  %wide.load.1 = load <16 x i8>, ptr %4, align 1
2594  %5 = zext <16 x i8> %wide.load.1 to <16 x i32>
2595  %6 = getelementptr inbounds i8, ptr %y, i32 16
2596  %wide.load11.1 = load <16 x i8>, ptr %6, align 1
2597  %7 = zext <16 x i8> %wide.load11.1 to <16 x i32>
2598  %8 = mul nuw nsw <16 x i32> %7, %5
2599  %9 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %8)
2600  %10 = add i32 %9, %3
  %11 = getelementptr inbounds i8, ptr %x, i32 32
  %wide.load.2 = load <16 x i8>, ptr %11, align 1
  %12 = zext <16 x i8> %wide.load.2 to <16 x i32>
  %13 = getelementptr inbounds i8, ptr %y, i32 32
  %wide.load11.2 = load <16 x i8>, ptr %13, align 1
  %14 = zext <16 x i8> %wide.load11.2 to <16 x i32>
  %15 = mul nuw nsw <16 x i32> %14, %12
  %16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %15)
  %17 = add i32 %16, %10
  %18 = getelementptr inbounds i8, ptr %x, i32 48
  %wide.load.3 = load <16 x i8>, ptr %18, align 1
  %19 = zext <16 x i8> %wide.load.3 to <16 x i32>
  %20 = getelementptr inbounds i8, ptr %y, i32 48
  %wide.load11.3 = load <16 x i8>, ptr %20, align 1
  %21 = zext <16 x i8> %wide.load11.3 to <16 x i32>
  %22 = mul nuw nsw <16 x i32> %21, %19
  %23 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %22)
  %24 = add i32 %23, %17
  ret i32 %24
}

define i32 @mlav128i32i8(ptr %x, ptr %y) {
; CHECK-LABEL: mlav128i32i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u8 q0, [r0]
; CHECK-NEXT:    vldrb.u8 q1, [r1]
; CHECK-NEXT:    mov r2, r0
; CHECK-NEXT:    vmlav.u8 r0, q1, q0
; CHECK-NEXT:    vldrb.u8 q0, [r2, #16]
; CHECK-NEXT:    vldrb.u8 q1, [r1, #16]
; CHECK-NEXT:    vmlava.u8 r0, q1, q0
; CHECK-NEXT:    vldrb.u8 q0, [r2, #32]
; CHECK-NEXT:    vldrb.u8 q1, [r1, #32]
; CHECK-NEXT:    vmlava.u8 r0, q1, q0
; CHECK-NEXT:    vldrb.u8 q0, [r2, #48]
; CHECK-NEXT:    vldrb.u8 q1, [r1, #48]
; CHECK-NEXT:    vmlava.u8 r0, q1, q0
; CHECK-NEXT:    vldrb.u8 q0, [r2, #64]
; CHECK-NEXT:    vldrb.u8 q1, [r1, #64]
; CHECK-NEXT:    vmlava.u8 r0, q1, q0
; CHECK-NEXT:    vldrb.u8 q0, [r2, #80]
; CHECK-NEXT:    vldrb.u8 q1, [r1, #80]
; CHECK-NEXT:    vmlava.u8 r0, q1, q0
; CHECK-NEXT:    vldrb.u8 q0, [r2, #96]
; CHECK-NEXT:    vldrb.u8 q1, [r1, #96]
; CHECK-NEXT:    vmlava.u8 r0, q1, q0
; CHECK-NEXT:    vldrb.u8 q0, [r2, #112]
; CHECK-NEXT:    vldrb.u8 q1, [r1, #112]
; CHECK-NEXT:    vmlava.u8 r0, q1, q0
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <16 x i8>, ptr %x, align 1
  %0 = zext <16 x i8> %wide.load to <16 x i32>
  %wide.load11 = load <16 x i8>, ptr %y, align 1
  %1 = zext <16 x i8> %wide.load11 to <16 x i32>
  %2 = mul nuw nsw <16 x i32> %1, %0
  %3 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %2)
  %4 = getelementptr inbounds i8, ptr %x, i32 16
  %wide.load.1 = load <16 x i8>, ptr %4, align 1
  %5 = zext <16 x i8> %wide.load.1 to <16 x i32>
  %6 = getelementptr inbounds i8, ptr %y, i32 16
  %wide.load11.1 = load <16 x i8>, ptr %6, align 1
  %7 = zext <16 x i8> %wide.load11.1 to <16 x i32>
  %8 = mul nuw nsw <16 x i32> %7, %5
  %9 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %8)
  %10 = add i32 %9, %3
  %11 = getelementptr inbounds i8, ptr %x, i32 32
  %wide.load.2 = load <16 x i8>, ptr %11, align 1
  %12 = zext <16 x i8> %wide.load.2 to <16 x i32>
  %13 = getelementptr inbounds i8, ptr %y, i32 32
  %wide.load11.2 = load <16 x i8>, ptr %13, align 1
  %14 = zext <16 x i8> %wide.load11.2 to <16 x i32>
  %15 = mul nuw nsw <16 x i32> %14, %12
  %16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %15)
  %17 = add i32 %16, %10
  %18 = getelementptr inbounds i8, ptr %x, i32 48
  %wide.load.3 = load <16 x i8>, ptr %18, align 1
  %19 = zext <16 x i8> %wide.load.3 to <16 x i32>
  %20 = getelementptr inbounds i8, ptr %y, i32 48
  %wide.load11.3 = load <16 x i8>, ptr %20, align 1
  %21 = zext <16 x i8> %wide.load11.3 to <16 x i32>
  %22 = mul nuw nsw <16 x i32> %21, %19
  %23 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %22)
  %24 = add i32 %23, %17
  %25 = getelementptr inbounds i8, ptr %x, i32 64
  %wide.load.4 = load <16 x i8>, ptr %25, align 1
  %26 = zext <16 x i8> %wide.load.4 to <16 x i32>
  %27 = getelementptr inbounds i8, ptr %y, i32 64
  %wide.load11.4 = load <16 x i8>, ptr %27, align 1
  %28 = zext <16 x i8> %wide.load11.4 to <16 x i32>
  %29 = mul nuw nsw <16 x i32> %28, %26
  %30 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %29)
  %31 = add i32 %30, %24
  %32 = getelementptr inbounds i8, ptr %x, i32 80
  %wide.load.5 = load <16 x i8>, ptr %32, align 1
  %33 = zext <16 x i8> %wide.load.5 to <16 x i32>
  %34 = getelementptr inbounds i8, ptr %y, i32 80
  %wide.load11.5 = load <16 x i8>, ptr %34, align 1
  %35 = zext <16 x i8> %wide.load11.5 to <16 x i32>
  %36 = mul nuw nsw <16 x i32> %35, %33
  %37 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %36)
  %38 = add i32 %37, %31
  %39 = getelementptr inbounds i8, ptr %x, i32 96
  %wide.load.6 = load <16 x i8>, ptr %39, align 1
  %40 = zext <16 x i8> %wide.load.6 to <16 x i32>
  %41 = getelementptr inbounds i8, ptr %y, i32 96
  %wide.load11.6 = load <16 x i8>, ptr %41, align 1
  %42 = zext <16 x i8> %wide.load11.6 to <16 x i32>
  %43 = mul nuw nsw <16 x i32> %42, %40
  %44 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %43)
  %45 = add i32 %44, %38
  %46 = getelementptr inbounds i8, ptr %x, i32 112
  %wide.load.7 = load <16 x i8>, ptr %46, align 1
  %47 = zext <16 x i8> %wide.load.7 to <16 x i32>
  %48 = getelementptr inbounds i8, ptr %y, i32 112
  %wide.load11.7 = load <16 x i8>, ptr %48, align 1
  %49 = zext <16 x i8> %wide.load11.7 to <16 x i32>
  %50 = mul nuw nsw <16 x i32> %49, %47
  %51 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %50)
  %52 = add i32 %51, %45
  ret i32 %52
}

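; The mlav*i16i16 tests below repeat the multiply-accumulate pattern on i16
; data: the 2-element case stays scalar (muls + mla), while the vector cases
; should select MVE vmlav/vmlava forms with a final sxth for the signext
; return. As a rough sketch (reconstructed from the IR, not copied from the
; godbolt source above), each test corresponds to an unrolled and
; SLP-vectorized C loop of this shape:
;
;   short mlav16i16i16(short *x, short *y) {
;     short s = 0;
;     for (int i = 0; i < 16; i++)
;       s += x[i] * y[i];  /* i16 mul + i16 add reduction */
;     return s;
;   }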
define signext i16 @mlav2i16i16(ptr %x, ptr %y) {
; CHECK-LABEL: mlav2i16i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrh r2, [r0]
; CHECK-NEXT:    ldrh r3, [r1]
; CHECK-NEXT:    ldrh r0, [r0, #2]
; CHECK-NEXT:    ldrh r1, [r1, #2]
; CHECK-NEXT:    muls r2, r3, r2
; CHECK-NEXT:    mla r0, r1, r0, r2
; CHECK-NEXT:    sxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = load i16, ptr %x, align 2
  %1 = load i16, ptr %y, align 2
  %mul = mul i16 %1, %0
  %arrayidx.1 = getelementptr inbounds i16, ptr %x, i32 1
  %2 = load i16, ptr %arrayidx.1, align 2
  %arrayidx1.1 = getelementptr inbounds i16, ptr %y, i32 1
  %3 = load i16, ptr %arrayidx1.1, align 2
  %mul.1 = mul i16 %3, %2
  %add.1 = add i16 %mul.1, %mul
  ret i16 %add.1
}

define signext i16 @mlav4i16i16(ptr %x, ptr %y) {
; CHECK-LABEL: mlav4i16i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u32 q0, [r0]
; CHECK-NEXT:    vldrh.u32 q1, [r1]
; CHECK-NEXT:    vmlav.u32 r0, q1, q0
; CHECK-NEXT:    sxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <4 x i16>, ptr %x, align 2
  %1 = load <4 x i16>, ptr %y, align 2
  %2 = mul <4 x i16> %1, %0
  %3 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %2)
  ret i16 %3
}

define signext i16 @mlav8i16i16(ptr %x, ptr %y) {
; CHECK-LABEL: mlav8i16i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r0]
; CHECK-NEXT:    vldrh.u16 q1, [r1]
; CHECK-NEXT:    vmlav.u16 r0, q1, q0
; CHECK-NEXT:    sxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <8 x i16>, ptr %x, align 2
  %1 = load <8 x i16>, ptr %y, align 2
  %2 = mul <8 x i16> %1, %0
  %3 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %2)
  ret i16 %3
}

define signext i16 @mlav16i16i16(ptr %x, ptr %y) {
; CHECK-LABEL: mlav16i16i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r0]
; CHECK-NEXT:    vldrh.u16 q1, [r1]
; CHECK-NEXT:    vmlav.u16 r2, q1, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
; CHECK-NEXT:    vldrh.u16 q1, [r1, #16]
; CHECK-NEXT:    vmlava.u16 r2, q1, q0
; CHECK-NEXT:    sxth r0, r2
; CHECK-NEXT:    bx lr
entry:
  %0 = load <16 x i16>, ptr %x, align 2
  %1 = load <16 x i16>, ptr %y, align 2
  %2 = mul <16 x i16> %1, %0
  %3 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %2)
  ret i16 %3
}

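; 24 is not a power of two, so SLP splits mlav24i16i16 into an <8 x i16> and
; a <16 x i16> reduction and adds the two scalar results; the backend should
; still fold the whole thing into one vmlav.u16 plus chained vmlava.u16.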
define signext i16 @mlav24i16i16(ptr %x, ptr %y) {
; CHECK-LABEL: mlav24i16i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r0]
; CHECK-NEXT:    vldrh.u16 q1, [r1]
; CHECK-NEXT:    vmlav.u16 r2, q1, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
; CHECK-NEXT:    vldrh.u16 q1, [r1, #16]
; CHECK-NEXT:    vmlava.u16 r2, q1, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #32]
; CHECK-NEXT:    vldrh.u16 q1, [r1, #32]
; CHECK-NEXT:    vmlava.u16 r2, q1, q0
; CHECK-NEXT:    sxth r0, r2
; CHECK-NEXT:    bx lr
entry:
  %0 = load <8 x i16>, ptr %x, align 2
  %1 = load <8 x i16>, ptr %y, align 2
  %2 = mul <8 x i16> %1, %0
  %arrayidx.8 = getelementptr inbounds i16, ptr %x, i32 8
  %arrayidx1.8 = getelementptr inbounds i16, ptr %y, i32 8
  %3 = load <16 x i16>, ptr %arrayidx.8, align 2
  %4 = load <16 x i16>, ptr %arrayidx1.8, align 2
  %5 = mul <16 x i16> %4, %3
  %6 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %5)
  %7 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %2)
  %op.rdx = add i16 %6, %7
  ret i16 %op.rdx
}

define signext i16 @mlav32i16i16(ptr %x, ptr %y) {
; CHECK-LABEL: mlav32i16i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r0]
; CHECK-NEXT:    vldrh.u16 q1, [r1]
; CHECK-NEXT:    vmlav.u16 r2, q1, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
; CHECK-NEXT:    vldrh.u16 q1, [r1, #16]
; CHECK-NEXT:    vmlava.u16 r2, q1, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #32]
; CHECK-NEXT:    vldrh.u16 q1, [r1, #32]
; CHECK-NEXT:    vmlava.u16 r2, q1, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #48]
; CHECK-NEXT:    vldrh.u16 q1, [r1, #48]
; CHECK-NEXT:    vmlava.u16 r2, q1, q0
; CHECK-NEXT:    sxth r0, r2
; CHECK-NEXT:    bx lr
entry:
  %0 = load <32 x i16>, ptr %x, align 2
  %1 = load <32 x i16>, ptr %y, align 2
  %2 = mul <32 x i16> %1, %0
  %3 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %2)
  ret i16 %3
}

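; From 64 elements upwards the input IR arrives already split into 8-wide
; chunks (one per unrolled iteration) joined by a scalar add chain, rather
; than a single wide vector.reduce.add; codegen should fold the chain into
; successive vmlava.u16 accumulations all the same.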
define signext i16 @mlav64i16i16(ptr %x, ptr %y) {
; CHECK-LABEL: mlav64i16i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r0]
; CHECK-NEXT:    vldrh.u16 q1, [r1]
; CHECK-NEXT:    vmlav.u16 r2, q1, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
; CHECK-NEXT:    vldrh.u16 q1, [r1, #16]
; CHECK-NEXT:    vmlava.u16 r2, q1, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #32]
; CHECK-NEXT:    vldrh.u16 q1, [r1, #32]
; CHECK-NEXT:    vmlava.u16 r2, q1, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #48]
; CHECK-NEXT:    vldrh.u16 q1, [r1, #48]
; CHECK-NEXT:    vmlava.u16 r2, q1, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #64]
; CHECK-NEXT:    vldrh.u16 q1, [r1, #64]
; CHECK-NEXT:    vmlava.u16 r2, q1, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #80]
; CHECK-NEXT:    vldrh.u16 q1, [r1, #80]
; CHECK-NEXT:    vmlava.u16 r2, q1, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #96]
; CHECK-NEXT:    vldrh.u16 q1, [r1, #96]
; CHECK-NEXT:    vmlava.u16 r2, q1, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #112]
; CHECK-NEXT:    vldrh.u16 q1, [r1, #112]
; CHECK-NEXT:    vmlava.u16 r2, q1, q0
; CHECK-NEXT:    sxth r0, r2
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <8 x i16>, ptr %x, align 2
  %wide.load13 = load <8 x i16>, ptr %y, align 2
  %0 = mul <8 x i16> %wide.load13, %wide.load
  %1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %0)
  %2 = getelementptr inbounds i16, ptr %x, i32 8
  %wide.load.1 = load <8 x i16>, ptr %2, align 2
  %3 = getelementptr inbounds i16, ptr %y, i32 8
  %wide.load13.1 = load <8 x i16>, ptr %3, align 2
  %4 = mul <8 x i16> %wide.load13.1, %wide.load.1
  %5 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %4)
  %6 = add i16 %5, %1
  %7 = getelementptr inbounds i16, ptr %x, i32 16
  %wide.load.2 = load <8 x i16>, ptr %7, align 2
  %8 = getelementptr inbounds i16, ptr %y, i32 16
  %wide.load13.2 = load <8 x i16>, ptr %8, align 2
  %9 = mul <8 x i16> %wide.load13.2, %wide.load.2
  %10 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %9)
  %11 = add i16 %10, %6
  %12 = getelementptr inbounds i16, ptr %x, i32 24
  %wide.load.3 = load <8 x i16>, ptr %12, align 2
  %13 = getelementptr inbounds i16, ptr %y, i32 24
  %wide.load13.3 = load <8 x i16>, ptr %13, align 2
  %14 = mul <8 x i16> %wide.load13.3, %wide.load.3
  %15 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %14)
  %16 = add i16 %15, %11
  %17 = getelementptr inbounds i16, ptr %x, i32 32
  %wide.load.4 = load <8 x i16>, ptr %17, align 2
  %18 = getelementptr inbounds i16, ptr %y, i32 32
  %wide.load13.4 = load <8 x i16>, ptr %18, align 2
  %19 = mul <8 x i16> %wide.load13.4, %wide.load.4
  %20 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %19)
  %21 = add i16 %20, %16
  %22 = getelementptr inbounds i16, ptr %x, i32 40
  %wide.load.5 = load <8 x i16>, ptr %22, align 2
  %23 = getelementptr inbounds i16, ptr %y, i32 40
  %wide.load13.5 = load <8 x i16>, ptr %23, align 2
  %24 = mul <8 x i16> %wide.load13.5, %wide.load.5
  %25 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %24)
  %26 = add i16 %25, %21
  %27 = getelementptr inbounds i16, ptr %x, i32 48
  %wide.load.6 = load <8 x i16>, ptr %27, align 2
  %28 = getelementptr inbounds i16, ptr %y, i32 48
  %wide.load13.6 = load <8 x i16>, ptr %28, align 2
  %29 = mul <8 x i16> %wide.load13.6, %wide.load.6
  %30 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %29)
  %31 = add i16 %30, %26
  %32 = getelementptr inbounds i16, ptr %x, i32 56
  %wide.load.7 = load <8 x i16>, ptr %32, align 2
  %33 = getelementptr inbounds i16, ptr %y, i32 56
  %wide.load13.7 = load <8 x i16>, ptr %33, align 2
  %34 = mul <8 x i16> %wide.load13.7, %wide.load.7
  %35 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %34)
  %36 = add i16 %35, %31
  ret i16 %36
}

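; 128 i16 elements: sixteen chained 8-wide reductions, exercising vldrh.u16
; immediate offsets all the way up to #240.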
define signext i16 @mlav128i16i16(ptr %x, ptr %y) {
; CHECK-LABEL: mlav128i16i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r0]
; CHECK-NEXT:    vldrh.u16 q1, [r1]
; CHECK-NEXT:    vmlav.u16 r2, q1, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
; CHECK-NEXT:    vldrh.u16 q1, [r1, #16]
; CHECK-NEXT:    vmlava.u16 r2, q1, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #32]
; CHECK-NEXT:    vldrh.u16 q1, [r1, #32]
; CHECK-NEXT:    vmlava.u16 r2, q1, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #48]
; CHECK-NEXT:    vldrh.u16 q1, [r1, #48]
; CHECK-NEXT:    vmlava.u16 r2, q1, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #64]
; CHECK-NEXT:    vldrh.u16 q1, [r1, #64]
; CHECK-NEXT:    vmlava.u16 r2, q1, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #80]
; CHECK-NEXT:    vldrh.u16 q1, [r1, #80]
; CHECK-NEXT:    vmlava.u16 r2, q1, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #96]
; CHECK-NEXT:    vldrh.u16 q1, [r1, #96]
; CHECK-NEXT:    vmlava.u16 r2, q1, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #112]
; CHECK-NEXT:    vldrh.u16 q1, [r1, #112]
; CHECK-NEXT:    vmlava.u16 r2, q1, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #128]
; CHECK-NEXT:    vldrh.u16 q1, [r1, #128]
; CHECK-NEXT:    vmlava.u16 r2, q1, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #144]
; CHECK-NEXT:    vldrh.u16 q1, [r1, #144]
; CHECK-NEXT:    vmlava.u16 r2, q1, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #160]
; CHECK-NEXT:    vldrh.u16 q1, [r1, #160]
; CHECK-NEXT:    vmlava.u16 r2, q1, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #176]
; CHECK-NEXT:    vldrh.u16 q1, [r1, #176]
; CHECK-NEXT:    vmlava.u16 r2, q1, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #192]
; CHECK-NEXT:    vldrh.u16 q1, [r1, #192]
; CHECK-NEXT:    vmlava.u16 r2, q1, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #208]
; CHECK-NEXT:    vldrh.u16 q1, [r1, #208]
; CHECK-NEXT:    vmlava.u16 r2, q1, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #224]
; CHECK-NEXT:    vldrh.u16 q1, [r1, #224]
; CHECK-NEXT:    vmlava.u16 r2, q1, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #240]
; CHECK-NEXT:    vldrh.u16 q1, [r1, #240]
; CHECK-NEXT:    vmlava.u16 r2, q1, q0
; CHECK-NEXT:    sxth r0, r2
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <8 x i16>, ptr %x, align 2
  %wide.load13 = load <8 x i16>, ptr %y, align 2
  %0 = mul <8 x i16> %wide.load13, %wide.load
  %1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %0)
  %2 = getelementptr inbounds i16, ptr %x, i32 8
  %wide.load.1 = load <8 x i16>, ptr %2, align 2
  %3 = getelementptr inbounds i16, ptr %y, i32 8
  %wide.load13.1 = load <8 x i16>, ptr %3, align 2
  %4 = mul <8 x i16> %wide.load13.1, %wide.load.1
  %5 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %4)
  %6 = add i16 %5, %1
  %7 = getelementptr inbounds i16, ptr %x, i32 16
  %wide.load.2 = load <8 x i16>, ptr %7, align 2
  %8 = getelementptr inbounds i16, ptr %y, i32 16
  %wide.load13.2 = load <8 x i16>, ptr %8, align 2
  %9 = mul <8 x i16> %wide.load13.2, %wide.load.2
  %10 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %9)
  %11 = add i16 %10, %6
  %12 = getelementptr inbounds i16, ptr %x, i32 24
  %wide.load.3 = load <8 x i16>, ptr %12, align 2
  %13 = getelementptr inbounds i16, ptr %y, i32 24
  %wide.load13.3 = load <8 x i16>, ptr %13, align 2
  %14 = mul <8 x i16> %wide.load13.3, %wide.load.3
  %15 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %14)
  %16 = add i16 %15, %11
  %17 = getelementptr inbounds i16, ptr %x, i32 32
  %wide.load.4 = load <8 x i16>, ptr %17, align 2
  %18 = getelementptr inbounds i16, ptr %y, i32 32
  %wide.load13.4 = load <8 x i16>, ptr %18, align 2
  %19 = mul <8 x i16> %wide.load13.4, %wide.load.4
  %20 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %19)
  %21 = add i16 %20, %16
  %22 = getelementptr inbounds i16, ptr %x, i32 40
  %wide.load.5 = load <8 x i16>, ptr %22, align 2
  %23 = getelementptr inbounds i16, ptr %y, i32 40
  %wide.load13.5 = load <8 x i16>, ptr %23, align 2
  %24 = mul <8 x i16> %wide.load13.5, %wide.load.5
  %25 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %24)
  %26 = add i16 %25, %21
  %27 = getelementptr inbounds i16, ptr %x, i32 48
  %wide.load.6 = load <8 x i16>, ptr %27, align 2
  %28 = getelementptr inbounds i16, ptr %y, i32 48
  %wide.load13.6 = load <8 x i16>, ptr %28, align 2
  %29 = mul <8 x i16> %wide.load13.6, %wide.load.6
  %30 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %29)
  %31 = add i16 %30, %26
  %32 = getelementptr inbounds i16, ptr %x, i32 56
  %wide.load.7 = load <8 x i16>, ptr %32, align 2
  %33 = getelementptr inbounds i16, ptr %y, i32 56
  %wide.load13.7 = load <8 x i16>, ptr %33, align 2
  %34 = mul <8 x i16> %wide.load13.7, %wide.load.7
  %35 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %34)
  %36 = add i16 %35, %31
  %37 = getelementptr inbounds i16, ptr %x, i32 64
  %wide.load.8 = load <8 x i16>, ptr %37, align 2
  %38 = getelementptr inbounds i16, ptr %y, i32 64
  %wide.load13.8 = load <8 x i16>, ptr %38, align 2
  %39 = mul <8 x i16> %wide.load13.8, %wide.load.8
  %40 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %39)
  %41 = add i16 %40, %36
  %42 = getelementptr inbounds i16, ptr %x, i32 72
  %wide.load.9 = load <8 x i16>, ptr %42, align 2
  %43 = getelementptr inbounds i16, ptr %y, i32 72
  %wide.load13.9 = load <8 x i16>, ptr %43, align 2
  %44 = mul <8 x i16> %wide.load13.9, %wide.load.9
  %45 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %44)
  %46 = add i16 %45, %41
  %47 = getelementptr inbounds i16, ptr %x, i32 80
  %wide.load.10 = load <8 x i16>, ptr %47, align 2
  %48 = getelementptr inbounds i16, ptr %y, i32 80
  %wide.load13.10 = load <8 x i16>, ptr %48, align 2
  %49 = mul <8 x i16> %wide.load13.10, %wide.load.10
  %50 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %49)
  %51 = add i16 %50, %46
  %52 = getelementptr inbounds i16, ptr %x, i32 88
  %wide.load.11 = load <8 x i16>, ptr %52, align 2
  %53 = getelementptr inbounds i16, ptr %y, i32 88
  %wide.load13.11 = load <8 x i16>, ptr %53, align 2
  %54 = mul <8 x i16> %wide.load13.11, %wide.load.11
  %55 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %54)
  %56 = add i16 %55, %51
  %57 = getelementptr inbounds i16, ptr %x, i32 96
  %wide.load.12 = load <8 x i16>, ptr %57, align 2
  %58 = getelementptr inbounds i16, ptr %y, i32 96
  %wide.load13.12 = load <8 x i16>, ptr %58, align 2
  %59 = mul <8 x i16> %wide.load13.12, %wide.load.12
  %60 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %59)
  %61 = add i16 %60, %56
  %62 = getelementptr inbounds i16, ptr %x, i32 104
  %wide.load.13 = load <8 x i16>, ptr %62, align 2
  %63 = getelementptr inbounds i16, ptr %y, i32 104
  %wide.load13.13 = load <8 x i16>, ptr %63, align 2
  %64 = mul <8 x i16> %wide.load13.13, %wide.load.13
  %65 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %64)
  %66 = add i16 %65, %61
  %67 = getelementptr inbounds i16, ptr %x, i32 112
  %wide.load.14 = load <8 x i16>, ptr %67, align 2
  %68 = getelementptr inbounds i16, ptr %y, i32 112
  %wide.load13.14 = load <8 x i16>, ptr %68, align 2
  %69 = mul <8 x i16> %wide.load13.14, %wide.load.14
  %70 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %69)
  %71 = add i16 %70, %66
  %72 = getelementptr inbounds i16, ptr %x, i32 120
  %wide.load.15 = load <8 x i16>, ptr %72, align 2
  %73 = getelementptr inbounds i16, ptr %y, i32 120
  %wide.load13.15 = load <8 x i16>, ptr %73, align 2
  %74 = mul <8 x i16> %wide.load13.15, %wide.load.15
  %75 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %74)
  %76 = add i16 %75, %71
  ret i16 %76
}

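; The same set of patterns again for i8 data: vmlav.u8/vmlava.u8 for the
; 16-wide cases, with the zeroext i8 return lowered to a final uxtb. A rough
; source-level equivalent (a sketch, assuming unsigned char data):
;
;   unsigned char mlav16i8i8(unsigned char *x, unsigned char *y) {
;     unsigned char s = 0;
;     for (int i = 0; i < 16; i++)
;       s += x[i] * y[i];  /* i8 mul + i8 add reduction */
;     return s;
;   }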
define zeroext i8 @mlav2i8i8(ptr %x, ptr %y) {
; CHECK-LABEL: mlav2i8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrb r2, [r0]
; CHECK-NEXT:    ldrb r3, [r1]
; CHECK-NEXT:    ldrb r0, [r0, #1]
; CHECK-NEXT:    ldrb r1, [r1, #1]
; CHECK-NEXT:    muls r2, r3, r2
; CHECK-NEXT:    mla r0, r1, r0, r2
; CHECK-NEXT:    uxtb r0, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = load i8, ptr %x, align 1
  %1 = load i8, ptr %y, align 1
  %mul = mul i8 %1, %0
  %arrayidx.1 = getelementptr inbounds i8, ptr %x, i32 1
  %2 = load i8, ptr %arrayidx.1, align 1
  %arrayidx1.1 = getelementptr inbounds i8, ptr %y, i32 1
  %3 = load i8, ptr %arrayidx1.1, align 1
  %mul.1 = mul i8 %3, %2
  %add.1 = add i8 %mul.1, %mul
  ret i8 %add.1
}

define zeroext i8 @mlav4i8i8(ptr %x, ptr %y) {
; CHECK-LABEL: mlav4i8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u32 q0, [r0]
; CHECK-NEXT:    vldrb.u32 q1, [r1]
; CHECK-NEXT:    vmlav.u32 r0, q1, q0
; CHECK-NEXT:    uxtb r0, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <4 x i8>, ptr %x, align 1
  %1 = load <4 x i8>, ptr %y, align 1
  %2 = mul <4 x i8> %1, %0
  %3 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %2)
  ret i8 %3
}

define zeroext i8 @mlav8i8i8(ptr %x, ptr %y) {
; CHECK-LABEL: mlav8i8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u16 q0, [r0]
; CHECK-NEXT:    vldrb.u16 q1, [r1]
; CHECK-NEXT:    vmlav.u16 r0, q1, q0
; CHECK-NEXT:    uxtb r0, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <8 x i8>, ptr %x, align 1
  %1 = load <8 x i8>, ptr %y, align 1
  %2 = mul <8 x i8> %1, %0
  %3 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %2)
  ret i8 %3
}

define zeroext i8 @mlav16i8i8(ptr %x, ptr %y) {
; CHECK-LABEL: mlav16i8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u8 q0, [r0]
; CHECK-NEXT:    vldrb.u8 q1, [r1]
; CHECK-NEXT:    vmlav.u8 r0, q1, q0
; CHECK-NEXT:    uxtb r0, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <16 x i8>, ptr %x, align 1
  %1 = load <16 x i8>, ptr %y, align 1
  %2 = mul <16 x i8> %1, %0
  %3 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %2)
  ret i8 %3
}

define zeroext i8 @mlav24i8i8(ptr %x, ptr %y) {
; CHECK-LABEL: mlav24i8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u16 q0, [r0]
; CHECK-NEXT:    vldrb.u16 q1, [r1]
; CHECK-NEXT:    vmlav.u16 r2, q1, q0
; CHECK-NEXT:    vldrb.u8 q0, [r0, #8]
; CHECK-NEXT:    vldrb.u8 q1, [r1, #8]
; CHECK-NEXT:    vmlava.u8 r2, q1, q0
; CHECK-NEXT:    uxtb r0, r2
; CHECK-NEXT:    bx lr
entry:
  %0 = load <8 x i8>, ptr %x, align 1
  %1 = load <8 x i8>, ptr %y, align 1
  %2 = mul <8 x i8> %1, %0
  %arrayidx.8 = getelementptr inbounds i8, ptr %x, i32 8
  %arrayidx1.8 = getelementptr inbounds i8, ptr %y, i32 8
  %3 = load <16 x i8>, ptr %arrayidx.8, align 1
  %4 = load <16 x i8>, ptr %arrayidx1.8, align 1
  %5 = mul <16 x i8> %4, %3
  %6 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %5)
  %7 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %2)
  %op.rdx = add i8 %6, %7
  ret i8 %op.rdx
}

define zeroext i8 @mlav32i8i8(ptr %x, ptr %y) {
; CHECK-LABEL: mlav32i8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u8 q0, [r0]
; CHECK-NEXT:    vldrb.u8 q1, [r1]
; CHECK-NEXT:    vmlav.u8 r2, q1, q0
; CHECK-NEXT:    vldrb.u8 q0, [r0, #16]
; CHECK-NEXT:    vldrb.u8 q1, [r1, #16]
; CHECK-NEXT:    vmlava.u8 r2, q1, q0
; CHECK-NEXT:    uxtb r0, r2
; CHECK-NEXT:    bx lr
entry:
  %0 = load <32 x i8>, ptr %x, align 1
  %1 = load <32 x i8>, ptr %y, align 1
  %2 = mul <32 x i8> %1, %0
  %3 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %2)
  ret i8 %3
}

define zeroext i8 @mlav64i8i8(ptr %x, ptr %y) {
; CHECK-LABEL: mlav64i8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u8 q0, [r0]
; CHECK-NEXT:    vldrb.u8 q1, [r1]
; CHECK-NEXT:    vmlav.u8 r2, q1, q0
; CHECK-NEXT:    vldrb.u8 q0, [r0, #16]
; CHECK-NEXT:    vldrb.u8 q1, [r1, #16]
; CHECK-NEXT:    vmlava.u8 r2, q1, q0
; CHECK-NEXT:    vldrb.u8 q0, [r0, #32]
; CHECK-NEXT:    vldrb.u8 q1, [r1, #32]
; CHECK-NEXT:    vmlava.u8 r2, q1, q0
; CHECK-NEXT:    vldrb.u8 q0, [r0, #48]
; CHECK-NEXT:    vldrb.u8 q1, [r1, #48]
; CHECK-NEXT:    vmlava.u8 r2, q1, q0
; CHECK-NEXT:    uxtb r0, r2
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <16 x i8>, ptr %x, align 1
  %wide.load12 = load <16 x i8>, ptr %y, align 1
  %0 = mul <16 x i8> %wide.load12, %wide.load
  %1 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %0)
  %2 = getelementptr inbounds i8, ptr %x, i32 16
  %wide.load.1 = load <16 x i8>, ptr %2, align 1
  %3 = getelementptr inbounds i8, ptr %y, i32 16
  %wide.load12.1 = load <16 x i8>, ptr %3, align 1
  %4 = mul <16 x i8> %wide.load12.1, %wide.load.1
  %5 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %4)
  %6 = add i8 %5, %1
  %7 = getelementptr inbounds i8, ptr %x, i32 32
  %wide.load.2 = load <16 x i8>, ptr %7, align 1
  %8 = getelementptr inbounds i8, ptr %y, i32 32
  %wide.load12.2 = load <16 x i8>, ptr %8, align 1
  %9 = mul <16 x i8> %wide.load12.2, %wide.load.2
  %10 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %9)
  %11 = add i8 %10, %6
  %12 = getelementptr inbounds i8, ptr %x, i32 48
  %wide.load.3 = load <16 x i8>, ptr %12, align 1
  %13 = getelementptr inbounds i8, ptr %y, i32 48
  %wide.load12.3 = load <16 x i8>, ptr %13, align 1
  %14 = mul <16 x i8> %wide.load12.3, %wide.load.3
  %15 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %14)
  %16 = add i8 %15, %11
  ret i8 %16
}

define zeroext i8 @mlav128i8i8(ptr %x, ptr %y) {
; CHECK-LABEL: mlav128i8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u8 q0, [r0]
; CHECK-NEXT:    vldrb.u8 q1, [r1]
; CHECK-NEXT:    vmlav.u8 r2, q1, q0
; CHECK-NEXT:    vldrb.u8 q0, [r0, #16]
; CHECK-NEXT:    vldrb.u8 q1, [r1, #16]
; CHECK-NEXT:    vmlava.u8 r2, q1, q0
; CHECK-NEXT:    vldrb.u8 q0, [r0, #32]
; CHECK-NEXT:    vldrb.u8 q1, [r1, #32]
; CHECK-NEXT:    vmlava.u8 r2, q1, q0
; CHECK-NEXT:    vldrb.u8 q0, [r0, #48]
; CHECK-NEXT:    vldrb.u8 q1, [r1, #48]
; CHECK-NEXT:    vmlava.u8 r2, q1, q0
; CHECK-NEXT:    vldrb.u8 q0, [r0, #64]
; CHECK-NEXT:    vldrb.u8 q1, [r1, #64]
; CHECK-NEXT:    vmlava.u8 r2, q1, q0
; CHECK-NEXT:    vldrb.u8 q0, [r0, #80]
; CHECK-NEXT:    vldrb.u8 q1, [r1, #80]
; CHECK-NEXT:    vmlava.u8 r2, q1, q0
; CHECK-NEXT:    vldrb.u8 q0, [r0, #96]
; CHECK-NEXT:    vldrb.u8 q1, [r1, #96]
; CHECK-NEXT:    vmlava.u8 r2, q1, q0
; CHECK-NEXT:    vldrb.u8 q0, [r0, #112]
; CHECK-NEXT:    vldrb.u8 q1, [r1, #112]
; CHECK-NEXT:    vmlava.u8 r2, q1, q0
; CHECK-NEXT:    uxtb r0, r2
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <16 x i8>, ptr %x, align 1
  %wide.load12 = load <16 x i8>, ptr %y, align 1
  %0 = mul <16 x i8> %wide.load12, %wide.load
  %1 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %0)
  %2 = getelementptr inbounds i8, ptr %x, i32 16
  %wide.load.1 = load <16 x i8>, ptr %2, align 1
  %3 = getelementptr inbounds i8, ptr %y, i32 16
  %wide.load12.1 = load <16 x i8>, ptr %3, align 1
  %4 = mul <16 x i8> %wide.load12.1, %wide.load.1
  %5 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %4)
  %6 = add i8 %5, %1
  %7 = getelementptr inbounds i8, ptr %x, i32 32
  %wide.load.2 = load <16 x i8>, ptr %7, align 1
  %8 = getelementptr inbounds i8, ptr %y, i32 32
  %wide.load12.2 = load <16 x i8>, ptr %8, align 1
  %9 = mul <16 x i8> %wide.load12.2, %wide.load.2
  %10 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %9)
  %11 = add i8 %10, %6
  %12 = getelementptr inbounds i8, ptr %x, i32 48
  %wide.load.3 = load <16 x i8>, ptr %12, align 1
  %13 = getelementptr inbounds i8, ptr %y, i32 48
  %wide.load12.3 = load <16 x i8>, ptr %13, align 1
  %14 = mul <16 x i8> %wide.load12.3, %wide.load.3
  %15 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %14)
  %16 = add i8 %15, %11
  %17 = getelementptr inbounds i8, ptr %x, i32 64
  %wide.load.4 = load <16 x i8>, ptr %17, align 1
  %18 = getelementptr inbounds i8, ptr %y, i32 64
  %wide.load12.4 = load <16 x i8>, ptr %18, align 1
  %19 = mul <16 x i8> %wide.load12.4, %wide.load.4
  %20 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %19)
  %21 = add i8 %20, %16
  %22 = getelementptr inbounds i8, ptr %x, i32 80
  %wide.load.5 = load <16 x i8>, ptr %22, align 1
  %23 = getelementptr inbounds i8, ptr %y, i32 80
  %wide.load12.5 = load <16 x i8>, ptr %23, align 1
  %24 = mul <16 x i8> %wide.load12.5, %wide.load.5
  %25 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %24)
  %26 = add i8 %25, %21
  %27 = getelementptr inbounds i8, ptr %x, i32 96
  %wide.load.6 = load <16 x i8>, ptr %27, align 1
  %28 = getelementptr inbounds i8, ptr %y, i32 96
  %wide.load12.6 = load <16 x i8>, ptr %28, align 1
  %29 = mul <16 x i8> %wide.load12.6, %wide.load.6
  %30 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %29)
  %31 = add i8 %30, %26
  %32 = getelementptr inbounds i8, ptr %x, i32 112
  %wide.load.7 = load <16 x i8>, ptr %32, align 1
  %33 = getelementptr inbounds i8, ptr %y, i32 112
  %wide.load12.7 = load <16 x i8>, ptr %33, align 1
  %34 = mul <16 x i8> %wide.load12.7, %wide.load.7
  %35 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %34)
  %36 = add i8 %35, %31
  ret i8 %36
}

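; The add_two_const tests check that the sum of two v4i32 reductions and
; scalar constants becomes a vaddv/vaddva pair followed by a single immediate
; add, with the two separate +10s in add_two_const3 combining into #20.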
define arm_aapcs_vfpcc i32 @add_two_const(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: add_two_const:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddv.u32 r0, q1
; CHECK-NEXT:    vaddva.u32 r0, q0
; CHECK-NEXT:    adds r0, #10
; CHECK-NEXT:    bx lr
entry:
  %a = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
  %b = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %y)
  %c = add i32 %a, %b
  %d = add i32 %c, 10
  ret i32 %d
}

define arm_aapcs_vfpcc i32 @add_two_const2(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: add_two_const2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddv.u32 r0, q1
; CHECK-NEXT:    vaddva.u32 r0, q0
; CHECK-NEXT:    adds r0, #10
; CHECK-NEXT:    bx lr
entry:
  %a = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
  %b = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %y)
  %c = add i32 %a, 10
  %d = add i32 %c, %b
  ret i32 %d
}

define arm_aapcs_vfpcc i32 @add_two_const3(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: add_two_const3:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddv.u32 r0, q0
; CHECK-NEXT:    vaddva.u32 r0, q1
; CHECK-NEXT:    adds r0, #20
; CHECK-NEXT:    bx lr
entry:
  %a = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
  %b = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %y)
  %c = add i32 %a, 10
  %d = add i32 %b, 10
  %e = add i32 %c, %d
  ret i32 %e
}

declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>)
declare i32 @llvm.vector.reduce.add.v64i32(<64 x i32>)
declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.add.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.add.v64i16(<64 x i16>)
declare i8 @llvm.vector.reduce.add.v4i8(<4 x i8>)
declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.add.v32i8(<32 x i8>)
declare i8 @llvm.vector.reduce.add.v64i8(<64 x i8>)