xref: /llvm-project/llvm/test/CodeGen/Thumb2/mve-vhadd.ll (revision ee27e5df9e67bffbc629ea8638524ee7725d12ab)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s
3
; Signed halving add on <4 x i32>: both inputs are sign-extended to i64,
; added, shifted right by 1 (lshr) and truncated back to i32.
; The expected output is a single vhadd.s32.
4define arm_aapcs_vfpcc <4 x i32> @vhadds_v4i32(<4 x i32> %s0, <4 x i32> %s1) {
5; CHECK-LABEL: vhadds_v4i32:
6; CHECK:       @ %bb.0: @ %entry
7; CHECK-NEXT:    vhadd.s32 q0, q0, q1
8; CHECK-NEXT:    bx lr
9entry:
10  %s0s = sext <4 x i32> %s0 to <4 x i64>
11  %s1s = sext <4 x i32> %s1 to <4 x i64>
12  %m = add nsw <4 x i64> %s0s, %s1s
13  %s = lshr <4 x i64> %m, <i64 1, i64 1, i64 1, i64 1>
14  %s2 = trunc <4 x i64> %s to <4 x i32>
15  ret <4 x i32> %s2
16}
17
; Unsigned halving add on <4 x i32>: both inputs are zero-extended to i64,
; added, shifted right by 1 and truncated back to i32.
; The expected output is a single vhadd.u32.
18define arm_aapcs_vfpcc <4 x i32> @vhaddu_v4i32(<4 x i32> %s0, <4 x i32> %s1) {
19; CHECK-LABEL: vhaddu_v4i32:
20; CHECK:       @ %bb.0: @ %entry
21; CHECK-NEXT:    vhadd.u32 q0, q0, q1
22; CHECK-NEXT:    bx lr
23entry:
24  %s0s = zext <4 x i32> %s0 to <4 x i64>
25  %s1s = zext <4 x i32> %s1 to <4 x i64>
26  %m = add nuw nsw <4 x i64> %s0s, %s1s
27  %s = lshr <4 x i64> %m, <i64 1, i64 1, i64 1, i64 1>
28  %s2 = trunc <4 x i64> %s to <4 x i32>
29  ret <4 x i32> %s2
30}
31
; Signed halving-add pattern on <4 x i16>: sext to i32, add, lshr by 1, trunc.
; The expected output does not use vhadd here: both inputs are widened with
; vmovlb.s16, added with vadd.i32 and shifted with an unsigned vshr.u32.
32define arm_aapcs_vfpcc <4 x i16> @vhadds_v4i16(<4 x i16> %s0, <4 x i16> %s1) {
33; CHECK-LABEL: vhadds_v4i16:
34; CHECK:       @ %bb.0: @ %entry
35; CHECK-NEXT:    vmovlb.s16 q1, q1
36; CHECK-NEXT:    vmovlb.s16 q0, q0
37; CHECK-NEXT:    vadd.i32 q0, q0, q1
38; CHECK-NEXT:    vshr.u32 q0, q0, #1
39; CHECK-NEXT:    bx lr
40entry:
41  %s0s = sext <4 x i16> %s0 to <4 x i32>
42  %s1s = sext <4 x i16> %s1 to <4 x i32>
43  %m = add nsw <4 x i32> %s0s, %s1s
44  %s = lshr <4 x i32> %m, <i32 1, i32 1, i32 1, i32 1>
45  %s2 = trunc <4 x i32> %s to <4 x i16>
46  ret <4 x i16> %s2
47}
48
; Unsigned halving-add pattern on <4 x i16>: zext to i32, add, lshr by 1, trunc.
; The expected output widens both inputs with vmovlb.u16 and then uses a
; 32-bit vhadd.u32.
49define arm_aapcs_vfpcc <4 x i16> @vhaddu_v4i16(<4 x i16> %s0, <4 x i16> %s1) {
50; CHECK-LABEL: vhaddu_v4i16:
51; CHECK:       @ %bb.0: @ %entry
52; CHECK-NEXT:    vmovlb.u16 q1, q1
53; CHECK-NEXT:    vmovlb.u16 q0, q0
54; CHECK-NEXT:    vhadd.u32 q0, q0, q1
55; CHECK-NEXT:    bx lr
56entry:
57  %s0s = zext <4 x i16> %s0 to <4 x i32>
58  %s1s = zext <4 x i16> %s1 to <4 x i32>
59  %m = add nuw nsw <4 x i32> %s0s, %s1s
60  %s = lshr <4 x i32> %m, <i32 1, i32 1, i32 1, i32 1>
61  %s2 = trunc <4 x i32> %s to <4 x i16>
62  ret <4 x i16> %s2
63}
64
; Signed halving add on <8 x i16>: sext to i32, add, lshr by 1, trunc to i16.
; The expected output is a single vhadd.s16.
65define arm_aapcs_vfpcc <8 x i16> @vhadds_v8i16(<8 x i16> %s0, <8 x i16> %s1) {
66; CHECK-LABEL: vhadds_v8i16:
67; CHECK:       @ %bb.0: @ %entry
68; CHECK-NEXT:    vhadd.s16 q0, q0, q1
69; CHECK-NEXT:    bx lr
70entry:
71  %s0s = sext <8 x i16> %s0 to <8 x i32>
72  %s1s = sext <8 x i16> %s1 to <8 x i32>
73  %m = add nsw <8 x i32> %s0s, %s1s
74  %s = lshr <8 x i32> %m, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
75  %s2 = trunc <8 x i32> %s to <8 x i16>
76  ret <8 x i16> %s2
77}
78
; Unsigned halving add on <8 x i16>: zext to i32, add, lshr by 1, trunc to i16.
; The expected output is a single vhadd.u16.
79define arm_aapcs_vfpcc <8 x i16> @vhaddu_v8i16(<8 x i16> %s0, <8 x i16> %s1) {
80; CHECK-LABEL: vhaddu_v8i16:
81; CHECK:       @ %bb.0: @ %entry
82; CHECK-NEXT:    vhadd.u16 q0, q0, q1
83; CHECK-NEXT:    bx lr
84entry:
85  %s0s = zext <8 x i16> %s0 to <8 x i32>
86  %s1s = zext <8 x i16> %s1 to <8 x i32>
87  %m = add nuw nsw <8 x i32> %s0s, %s1s
88  %s = lshr <8 x i32> %m, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
89  %s2 = trunc <8 x i32> %s to <8 x i16>
90  ret <8 x i16> %s2
91}
92
; Signed halving-add pattern on <4 x i8>: sext to i16, add, lshr by 1, trunc.
; The expected output does not use vhadd: inputs are sign-extended in two
; steps (vmovlb.s8 then vmovlb.s16), added with vadd.i32, and the sum is
; zero-extended from 16 bits (vmovlb.u16) before the vshr.u32 by 1.
93define arm_aapcs_vfpcc <4 x i8> @vhadds_v4i8(<4 x i8> %s0, <4 x i8> %s1) {
94; CHECK-LABEL: vhadds_v4i8:
95; CHECK:       @ %bb.0: @ %entry
96; CHECK-NEXT:    vmovlb.s8 q1, q1
97; CHECK-NEXT:    vmovlb.s8 q0, q0
98; CHECK-NEXT:    vmovlb.s16 q1, q1
99; CHECK-NEXT:    vmovlb.s16 q0, q0
100; CHECK-NEXT:    vadd.i32 q0, q0, q1
101; CHECK-NEXT:    vmovlb.u16 q0, q0
102; CHECK-NEXT:    vshr.u32 q0, q0, #1
103; CHECK-NEXT:    bx lr
104entry:
105  %s0s = sext <4 x i8> %s0 to <4 x i16>
106  %s1s = sext <4 x i8> %s1 to <4 x i16>
107  %m = add nsw <4 x i16> %s0s, %s1s
108  %s = lshr <4 x i16> %m, <i16 1, i16 1, i16 1, i16 1>
109  %s2 = trunc <4 x i16> %s to <4 x i8>
110  ret <4 x i8> %s2
111}
112
; Unsigned halving-add pattern on <4 x i8>: zext to i16, add, lshr by 1, trunc.
; The expected output masks both inputs with 0xff per 32-bit lane (vmov.i32 +
; vand) and then uses a 32-bit vhadd.u32.
113define arm_aapcs_vfpcc <4 x i8> @vhaddu_v4i8(<4 x i8> %s0, <4 x i8> %s1) {
114; CHECK-LABEL: vhaddu_v4i8:
115; CHECK:       @ %bb.0: @ %entry
116; CHECK-NEXT:    vmov.i32 q2, #0xff
117; CHECK-NEXT:    vand q1, q1, q2
118; CHECK-NEXT:    vand q0, q0, q2
119; CHECK-NEXT:    vhadd.u32 q0, q0, q1
120; CHECK-NEXT:    bx lr
121entry:
122  %s0s = zext <4 x i8> %s0 to <4 x i16>
123  %s1s = zext <4 x i8> %s1 to <4 x i16>
124  %m = add nuw nsw <4 x i16> %s0s, %s1s
125  %s = lshr <4 x i16> %m, <i16 1, i16 1, i16 1, i16 1>
126  %s2 = trunc <4 x i16> %s to <4 x i8>
127  ret <4 x i8> %s2
128}
129
; Signed halving-add pattern on <8 x i8>: sext to i16, add, lshr by 1, trunc.
; The expected output does not use vhadd: inputs are widened with vmovlb.s8,
; added with vadd.i16 and shifted with an unsigned vshr.u16.
130define arm_aapcs_vfpcc <8 x i8> @vhadds_v8i8(<8 x i8> %s0, <8 x i8> %s1) {
131; CHECK-LABEL: vhadds_v8i8:
132; CHECK:       @ %bb.0: @ %entry
133; CHECK-NEXT:    vmovlb.s8 q1, q1
134; CHECK-NEXT:    vmovlb.s8 q0, q0
135; CHECK-NEXT:    vadd.i16 q0, q0, q1
136; CHECK-NEXT:    vshr.u16 q0, q0, #1
137; CHECK-NEXT:    bx lr
138entry:
139  %s0s = sext <8 x i8> %s0 to <8 x i16>
140  %s1s = sext <8 x i8> %s1 to <8 x i16>
141  %m = add nsw <8 x i16> %s0s, %s1s
142  %s = lshr <8 x i16> %m, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
143  %s2 = trunc <8 x i16> %s to <8 x i8>
144  ret <8 x i8> %s2
145}
146
; Unsigned halving-add pattern on <8 x i8>: zext to i16, add, lshr by 1, trunc.
; The expected output widens both inputs with vmovlb.u8 and then uses a
; 16-bit vhadd.u16.
147define arm_aapcs_vfpcc <8 x i8> @vhaddu_v8i8(<8 x i8> %s0, <8 x i8> %s1) {
148; CHECK-LABEL: vhaddu_v8i8:
149; CHECK:       @ %bb.0: @ %entry
150; CHECK-NEXT:    vmovlb.u8 q1, q1
151; CHECK-NEXT:    vmovlb.u8 q0, q0
152; CHECK-NEXT:    vhadd.u16 q0, q0, q1
153; CHECK-NEXT:    bx lr
154entry:
155  %s0s = zext <8 x i8> %s0 to <8 x i16>
156  %s1s = zext <8 x i8> %s1 to <8 x i16>
157  %m = add nuw nsw <8 x i16> %s0s, %s1s
158  %s = lshr <8 x i16> %m, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
159  %s2 = trunc <8 x i16> %s to <8 x i8>
160  ret <8 x i8> %s2
161}
162
; Signed halving add on <16 x i8>: sext to i16, add, lshr by 1, trunc to i8.
; The expected output is a single vhadd.s8.
163define arm_aapcs_vfpcc <16 x i8> @vhadds_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
164; CHECK-LABEL: vhadds_v16i8:
165; CHECK:       @ %bb.0: @ %entry
166; CHECK-NEXT:    vhadd.s8 q0, q0, q1
167; CHECK-NEXT:    bx lr
168entry:
169  %s0s = sext <16 x i8> %s0 to <16 x i16>
170  %s1s = sext <16 x i8> %s1 to <16 x i16>
171  %m = add nsw <16 x i16> %s0s, %s1s
172  %s = lshr <16 x i16> %m, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
173  %s2 = trunc <16 x i16> %s to <16 x i8>
174  ret <16 x i8> %s2
175}
176
; Unsigned halving add on <16 x i8>: zext to i16, add, lshr by 1, trunc to i8.
; The expected output is a single vhadd.u8.
177define arm_aapcs_vfpcc <16 x i8> @vhaddu_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
178; CHECK-LABEL: vhaddu_v16i8:
179; CHECK:       @ %bb.0: @ %entry
180; CHECK-NEXT:    vhadd.u8 q0, q0, q1
181; CHECK-NEXT:    bx lr
182entry:
183  %s0s = zext <16 x i8> %s0 to <16 x i16>
184  %s1s = zext <16 x i8> %s1 to <16 x i16>
185  %m = add nuw nsw <16 x i16> %s0s, %s1s
186  %s = lshr <16 x i16> %m, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
187  %s2 = trunc <16 x i16> %s to <16 x i8>
188  ret <16 x i8> %s2
189}
190
; Signed rounding halving add on <4 x i32>: sext to i64, add 1, add the other
; operand, lshr by 1, trunc to i32.
; The expected output is a single vrhadd.s32.
191define arm_aapcs_vfpcc <4 x i32> @vrhadds_v4i32(<4 x i32> %s0, <4 x i32> %s1) {
192; CHECK-LABEL: vrhadds_v4i32:
193; CHECK:       @ %bb.0: @ %entry
194; CHECK-NEXT:    vrhadd.s32 q0, q0, q1
195; CHECK-NEXT:    bx lr
196entry:
197  %s0s = sext <4 x i32> %s0 to <4 x i64>
198  %s1s = sext <4 x i32> %s1 to <4 x i64>
199  %add = add nsw <4 x i64> %s0s, <i64 1, i64 1, i64 1, i64 1>
200  %add2 = add nsw <4 x i64> %add, %s1s
201  %s = lshr <4 x i64> %add2, <i64 1, i64 1, i64 1, i64 1>
202  %result = trunc <4 x i64> %s to <4 x i32>
203  ret <4 x i32> %result
204}
205
; Unsigned rounding halving add on <4 x i32>: zext to i64, add 1, add the
; other operand, lshr by 1, trunc to i32.
; The expected output is a single vrhadd.u32.
206define arm_aapcs_vfpcc <4 x i32> @vrhaddu_v4i32(<4 x i32> %s0, <4 x i32> %s1) {
207; CHECK-LABEL: vrhaddu_v4i32:
208; CHECK:       @ %bb.0: @ %entry
209; CHECK-NEXT:    vrhadd.u32 q0, q0, q1
210; CHECK-NEXT:    bx lr
211entry:
212  %s0s = zext <4 x i32> %s0 to <4 x i64>
213  %s1s = zext <4 x i32> %s1 to <4 x i64>
214  %add = add nuw nsw <4 x i64> %s0s, <i64 1, i64 1, i64 1, i64 1>
215  %add2 = add nuw nsw <4 x i64> %add, %s1s
216  %s = lshr <4 x i64> %add2, <i64 1, i64 1, i64 1, i64 1>
217  %result = trunc <4 x i64> %s to <4 x i32>
218  ret <4 x i32> %result
219}
220
; Signed rounding halving-add pattern on <4 x i16>: sext to i32, add 1, add the
; other operand, lshr by 1, trunc.
; The expected output does not use vrhadd: inputs are widened with vmovlb.s16,
; summed with vadd.i32, the constant 1 is added from r0, and the result is
; shifted with an unsigned vshr.u32.
221define arm_aapcs_vfpcc <4 x i16> @vrhadds_v4i16(<4 x i16> %s0, <4 x i16> %s1) {
222; CHECK-LABEL: vrhadds_v4i16:
223; CHECK:       @ %bb.0: @ %entry
224; CHECK-NEXT:    vmovlb.s16 q1, q1
225; CHECK-NEXT:    vmovlb.s16 q0, q0
226; CHECK-NEXT:    vadd.i32 q0, q0, q1
227; CHECK-NEXT:    movs r0, #1
228; CHECK-NEXT:    vadd.i32 q0, q0, r0
229; CHECK-NEXT:    vshr.u32 q0, q0, #1
230; CHECK-NEXT:    bx lr
231entry:
232  %s0s = sext <4 x i16> %s0 to <4 x i32>
233  %s1s = sext <4 x i16> %s1 to <4 x i32>
234  %add = add nsw <4 x i32> %s0s, <i32 1, i32 1, i32 1, i32 1>
235  %add2 = add nsw <4 x i32> %add, %s1s
236  %s = lshr <4 x i32> %add2, <i32 1, i32 1, i32 1, i32 1>
237  %result = trunc <4 x i32> %s to <4 x i16>
238  ret <4 x i16> %result
239}
240
; Unsigned rounding halving-add pattern on <4 x i16>: zext to i32, add 1, add
; the other operand, lshr by 1, trunc.
; The expected output widens both inputs with vmovlb.u16 and then uses a
; 32-bit vrhadd.u32.
241define arm_aapcs_vfpcc <4 x i16> @vrhaddu_v4i16(<4 x i16> %s0, <4 x i16> %s1) {
242; CHECK-LABEL: vrhaddu_v4i16:
243; CHECK:       @ %bb.0: @ %entry
244; CHECK-NEXT:    vmovlb.u16 q1, q1
245; CHECK-NEXT:    vmovlb.u16 q0, q0
246; CHECK-NEXT:    vrhadd.u32 q0, q0, q1
247; CHECK-NEXT:    bx lr
248entry:
249  %s0s = zext <4 x i16> %s0 to <4 x i32>
250  %s1s = zext <4 x i16> %s1 to <4 x i32>
251  %add = add nuw nsw <4 x i32> %s0s, <i32 1, i32 1, i32 1, i32 1>
252  %add2 = add nuw nsw <4 x i32> %add, %s1s
253  %s = lshr <4 x i32> %add2, <i32 1, i32 1, i32 1, i32 1>
254  %result = trunc <4 x i32> %s to <4 x i16>
255  ret <4 x i16> %result
256}
257
; Signed rounding halving add on <8 x i16>: sext to i32, add 1, add the other
; operand, lshr by 1, trunc to i16.
; The expected output is a single vrhadd.s16.
258define arm_aapcs_vfpcc <8 x i16> @vrhadds_v8i16(<8 x i16> %s0, <8 x i16> %s1) {
259; CHECK-LABEL: vrhadds_v8i16:
260; CHECK:       @ %bb.0: @ %entry
261; CHECK-NEXT:    vrhadd.s16 q0, q0, q1
262; CHECK-NEXT:    bx lr
263entry:
264  %s0s = sext <8 x i16> %s0 to <8 x i32>
265  %s1s = sext <8 x i16> %s1 to <8 x i32>
266  %add = add nsw <8 x i32> %s0s, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
267  %add2 = add nsw <8 x i32> %add, %s1s
268  %s = lshr <8 x i32> %add2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
269  %result = trunc <8 x i32> %s to <8 x i16>
270  ret <8 x i16> %result
271}
272
; Unsigned rounding halving add on <8 x i16>: zext to i32, add 1, add the
; other operand, lshr by 1, trunc to i16.
; The expected output is a single vrhadd.u16.
273define arm_aapcs_vfpcc <8 x i16> @vrhaddu_v8i16(<8 x i16> %s0, <8 x i16> %s1) {
274; CHECK-LABEL: vrhaddu_v8i16:
275; CHECK:       @ %bb.0: @ %entry
276; CHECK-NEXT:    vrhadd.u16 q0, q0, q1
277; CHECK-NEXT:    bx lr
278entry:
279  %s0s = zext <8 x i16> %s0 to <8 x i32>
280  %s1s = zext <8 x i16> %s1 to <8 x i32>
281  %add = add nuw nsw <8 x i32> %s0s, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
282  %add2 = add nuw nsw <8 x i32> %add, %s1s
283  %s = lshr <8 x i32> %add2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
284  %result = trunc <8 x i32> %s to <8 x i16>
285  ret <8 x i16> %result
286}
287
; Signed rounding halving-add pattern on <4 x i8>: sext to i16, add 1, add the
; other operand, lshr by 1, trunc.
; The expected output does not use vrhadd: inputs are sign-extended in two
; steps (vmovlb.s8 then vmovlb.s16), summed with vadd.i32, the constant 1 is
; added from r0, and the sum is zero-extended from 16 bits (vmovlb.u16) before
; the vshr.u32 by 1.
288define arm_aapcs_vfpcc <4 x i8> @vrhadds_v4i8(<4 x i8> %s0, <4 x i8> %s1) {
289; CHECK-LABEL: vrhadds_v4i8:
290; CHECK:       @ %bb.0: @ %entry
291; CHECK-NEXT:    vmovlb.s8 q1, q1
292; CHECK-NEXT:    vmovlb.s8 q0, q0
293; CHECK-NEXT:    vmovlb.s16 q1, q1
294; CHECK-NEXT:    vmovlb.s16 q0, q0
295; CHECK-NEXT:    vadd.i32 q0, q0, q1
296; CHECK-NEXT:    movs r0, #1
297; CHECK-NEXT:    vadd.i32 q0, q0, r0
298; CHECK-NEXT:    vmovlb.u16 q0, q0
299; CHECK-NEXT:    vshr.u32 q0, q0, #1
300; CHECK-NEXT:    bx lr
301entry:
302  %s0s = sext <4 x i8> %s0 to <4 x i16>
303  %s1s = sext <4 x i8> %s1 to <4 x i16>
304  %add = add nsw <4 x i16> %s0s, <i16 1, i16 1, i16 1, i16 1>
305  %add2 = add nsw <4 x i16> %add, %s1s
306  %s = lshr <4 x i16> %add2, <i16 1, i16 1, i16 1, i16 1>
307  %result = trunc <4 x i16> %s to <4 x i8>
308  ret <4 x i8> %result
309}
310
; Unsigned rounding halving-add pattern on <4 x i8>: zext to i16, add 1, add
; the other operand, lshr by 1, trunc.
; The expected output masks both inputs with 0xff per 32-bit lane (vmov.i32 +
; vand) and then uses a 32-bit vrhadd.u32.
311define arm_aapcs_vfpcc <4 x i8> @vrhaddu_v4i8(<4 x i8> %s0, <4 x i8> %s1) {
312; CHECK-LABEL: vrhaddu_v4i8:
313; CHECK:       @ %bb.0: @ %entry
314; CHECK-NEXT:    vmov.i32 q2, #0xff
315; CHECK-NEXT:    vand q1, q1, q2
316; CHECK-NEXT:    vand q0, q0, q2
317; CHECK-NEXT:    vrhadd.u32 q0, q0, q1
318; CHECK-NEXT:    bx lr
319entry:
320  %s0s = zext <4 x i8> %s0 to <4 x i16>
321  %s1s = zext <4 x i8> %s1 to <4 x i16>
322  %add = add nuw nsw <4 x i16> %s0s, <i16 1, i16 1, i16 1, i16 1>
323  %add2 = add nuw nsw <4 x i16> %add, %s1s
324  %s = lshr <4 x i16> %add2, <i16 1, i16 1, i16 1, i16 1>
325  %result = trunc <4 x i16> %s to <4 x i8>
326  ret <4 x i8> %result
327}
328
; Signed rounding halving-add pattern on <8 x i8>: sext to i16, add 1, add the
; other operand, lshr by 1, trunc.
; The expected output does not use vrhadd: inputs are widened with vmovlb.s8,
; summed with vadd.i16, the constant 1 is added from r0, and the result is
; shifted with an unsigned vshr.u16.
329define arm_aapcs_vfpcc <8 x i8> @vrhadds_v8i8(<8 x i8> %s0, <8 x i8> %s1) {
330; CHECK-LABEL: vrhadds_v8i8:
331; CHECK:       @ %bb.0: @ %entry
332; CHECK-NEXT:    vmovlb.s8 q1, q1
333; CHECK-NEXT:    vmovlb.s8 q0, q0
334; CHECK-NEXT:    vadd.i16 q0, q0, q1
335; CHECK-NEXT:    movs r0, #1
336; CHECK-NEXT:    vadd.i16 q0, q0, r0
337; CHECK-NEXT:    vshr.u16 q0, q0, #1
338; CHECK-NEXT:    bx lr
339entry:
340  %s0s = sext <8 x i8> %s0 to <8 x i16>
341  %s1s = sext <8 x i8> %s1 to <8 x i16>
342  %add = add nsw <8 x i16> %s0s, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
343  %add2 = add nsw <8 x i16> %add, %s1s
344  %s = lshr <8 x i16> %add2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
345  %result = trunc <8 x i16> %s to <8 x i8>
346  ret <8 x i8> %result
347}
348
; Unsigned rounding halving-add pattern on <8 x i8>: zext to i16, add 1, add
; the other operand, lshr by 1, trunc.
; The expected output widens both inputs with vmovlb.u8 and then uses a
; 16-bit vrhadd.u16.
349define arm_aapcs_vfpcc <8 x i8> @vrhaddu_v8i8(<8 x i8> %s0, <8 x i8> %s1) {
350; CHECK-LABEL: vrhaddu_v8i8:
351; CHECK:       @ %bb.0: @ %entry
352; CHECK-NEXT:    vmovlb.u8 q1, q1
353; CHECK-NEXT:    vmovlb.u8 q0, q0
354; CHECK-NEXT:    vrhadd.u16 q0, q0, q1
355; CHECK-NEXT:    bx lr
356entry:
357  %s0s = zext <8 x i8> %s0 to <8 x i16>
358  %s1s = zext <8 x i8> %s1 to <8 x i16>
359  %add = add nuw nsw <8 x i16> %s0s, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
360  %add2 = add nuw nsw <8 x i16> %add, %s1s
361  %s = lshr <8 x i16> %add2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
362  %result = trunc <8 x i16> %s to <8 x i8>
363  ret <8 x i8> %result
364}
365
; Signed rounding halving add on <16 x i8>: sext to i16, add 1, add the other
; operand, lshr by 1, trunc to i8.
; The expected output is a single vrhadd.s8.
366define arm_aapcs_vfpcc <16 x i8> @vrhadds_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
367; CHECK-LABEL: vrhadds_v16i8:
368; CHECK:       @ %bb.0: @ %entry
369; CHECK-NEXT:    vrhadd.s8 q0, q0, q1
370; CHECK-NEXT:    bx lr
371entry:
372  %s0s = sext <16 x i8> %s0 to <16 x i16>
373  %s1s = sext <16 x i8> %s1 to <16 x i16>
374  %add = add nsw <16 x i16> %s0s, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
375  %add2 = add nsw <16 x i16> %add, %s1s
376  %s = lshr <16 x i16> %add2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
377  %result = trunc <16 x i16> %s to <16 x i8>
378  ret <16 x i8> %result
379}
380
; Unsigned rounding halving add on <16 x i8>: zext to i16, add 1, add the
; other operand, lshr by 1, trunc to i8.
; The expected output is a single vrhadd.u8.
381define arm_aapcs_vfpcc <16 x i8> @vrhaddu_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
382; CHECK-LABEL: vrhaddu_v16i8:
383; CHECK:       @ %bb.0: @ %entry
384; CHECK-NEXT:    vrhadd.u8 q0, q0, q1
385; CHECK-NEXT:    bx lr
386entry:
387  %s0s = zext <16 x i8> %s0 to <16 x i16>
388  %s1s = zext <16 x i8> %s1 to <16 x i16>
389  %add = add nuw nsw <16 x i16> %s0s, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
390  %add2 = add nuw nsw <16 x i16> %add, %s1s
391  %s = lshr <16 x i16> %add2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
392  %result = trunc <16 x i16> %s to <16 x i8>
393  ret <16 x i8> %result
394}
395
; Loop form of the signed i8 halving add. Each iteration loads <16 x i8> from
; %x and %y at %index, sign-extends both to i16, adds, shifts right by 1
; (lshr), truncates to i8 and stores to %z.
; The index advances by 16 until it equals the constant 1024 (%n is not used),
; which matches the loop count of 64 in the expected output. The expected loop
; body is two post-incremented loads, one vhadd.s8 and a post-incremented store.
396define void @vhadd_loop_s8(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture writeonly %z, i32 %n) {
397; CHECK-LABEL: vhadd_loop_s8:
398; CHECK:       @ %bb.0: @ %entry
399; CHECK-NEXT:    .save {r7, lr}
400; CHECK-NEXT:    push {r7, lr}
401; CHECK-NEXT:    mov.w lr, #64
402; CHECK-NEXT:  .LBB24_1: @ %vector.body
403; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
404; CHECK-NEXT:    vldrb.u8 q0, [r0], #16
405; CHECK-NEXT:    vldrb.u8 q1, [r1], #16
406; CHECK-NEXT:    vhadd.s8 q0, q1, q0
407; CHECK-NEXT:    vstrb.8 q0, [r2], #16
408; CHECK-NEXT:    le lr, .LBB24_1
409; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
410; CHECK-NEXT:    pop {r7, pc}
411entry:
412  br label %vector.body
413
414vector.body:                                      ; preds = %vector.body, %entry
415  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
416  %0 = getelementptr inbounds i8, ptr %x, i32 %index
417  %wide.load = load <16 x i8>, ptr %0, align 1
418  %1 = sext <16 x i8> %wide.load to <16 x i16>
419  %2 = getelementptr inbounds i8, ptr %y, i32 %index
420  %wide.load16 = load <16 x i8>, ptr %2, align 1
421  %3 = sext <16 x i8> %wide.load16 to <16 x i16>
422  %4 = add nsw <16 x i16> %3, %1
423  %5 = lshr <16 x i16> %4, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
424  %6 = trunc <16 x i16> %5 to <16 x i8>
425  %7 = getelementptr inbounds i8, ptr %z, i32 %index
426  store <16 x i8> %6, ptr %7, align 1
427  %index.next = add i32 %index, 16
428  %8 = icmp eq i32 %index.next, 1024
429  br i1 %8, label %for.cond.cleanup, label %vector.body
430
431for.cond.cleanup:                                 ; preds = %vector.body
432  ret void
433}
434
; Loop form of the signed i16 halving add. Each iteration loads <8 x i16> from
; %x and %y at %index, sign-extends both to i32, adds, shifts right by 1
; (lshr), truncates to i16 and stores to %z.
; The index advances by 8 until it equals the constant 1024 (%n is not used),
; which matches the loop count of 128 in the expected output. The expected loop
; body is two post-incremented loads, one vhadd.s16 and a post-incremented store.
435define void @vhadd_loop_s16(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture writeonly %z, i32 %n) {
436; CHECK-LABEL: vhadd_loop_s16:
437; CHECK:       @ %bb.0: @ %entry
438; CHECK-NEXT:    .save {r7, lr}
439; CHECK-NEXT:    push {r7, lr}
440; CHECK-NEXT:    mov.w lr, #128
441; CHECK-NEXT:  .LBB25_1: @ %vector.body
442; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
443; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
444; CHECK-NEXT:    vldrh.u16 q1, [r1], #16
445; CHECK-NEXT:    vhadd.s16 q0, q1, q0
446; CHECK-NEXT:    vstrb.8 q0, [r2], #16
447; CHECK-NEXT:    le lr, .LBB25_1
448; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
449; CHECK-NEXT:    pop {r7, pc}
450entry:
451  br label %vector.body
452
453vector.body:                                      ; preds = %vector.body, %entry
454  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
455  %0 = getelementptr inbounds i16, ptr %x, i32 %index
456  %wide.load = load <8 x i16>, ptr %0, align 2
457  %1 = sext <8 x i16> %wide.load to <8 x i32>
458  %2 = getelementptr inbounds i16, ptr %y, i32 %index
459  %wide.load16 = load <8 x i16>, ptr %2, align 2
460  %3 = sext <8 x i16> %wide.load16 to <8 x i32>
461  %4 = add nsw <8 x i32> %3, %1
462  %5 = lshr <8 x i32> %4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
463  %6 = trunc <8 x i32> %5 to <8 x i16>
464  %7 = getelementptr inbounds i16, ptr %z, i32 %index
465  store <8 x i16> %6, ptr %7, align 2
466  %index.next = add i32 %index, 8
467  %8 = icmp eq i32 %index.next, 1024
468  br i1 %8, label %for.cond.cleanup, label %vector.body
469
470for.cond.cleanup:                                 ; preds = %vector.body
471  ret void
472}
473
; Loop form of the signed i32 halving add. Each iteration loads <4 x i32> from
; %x and %y at %index, sign-extends both to i64, adds, shifts right by 1
; (lshr), truncates to i32 and stores to %z.
; The index advances by 4 until it equals the constant 1024 (%n is not used),
; which matches the loop count of 256 in the expected output. The expected loop
; body is two post-incremented loads, one vhadd.s32 and a post-incremented store.
474define void @vhadd_loop_s32(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture writeonly %z, i32 %n) {
475; CHECK-LABEL: vhadd_loop_s32:
476; CHECK:       @ %bb.0: @ %entry
477; CHECK-NEXT:    .save {r7, lr}
478; CHECK-NEXT:    push {r7, lr}
479; CHECK-NEXT:    mov.w lr, #256
480; CHECK-NEXT:  .LBB26_1: @ %vector.body
481; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
482; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
483; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
484; CHECK-NEXT:    vhadd.s32 q0, q1, q0
485; CHECK-NEXT:    vstrb.8 q0, [r2], #16
486; CHECK-NEXT:    le lr, .LBB26_1
487; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
488; CHECK-NEXT:    pop {r7, pc}
489entry:
490  br label %vector.body
491
492vector.body:                                      ; preds = %vector.body, %entry
493  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
494  %0 = getelementptr inbounds i32, ptr %x, i32 %index
495  %wide.load = load <4 x i32>, ptr %0, align 4
496  %1 = sext <4 x i32> %wide.load to <4 x i64>
497  %2 = getelementptr inbounds i32, ptr %y, i32 %index
498  %wide.load16 = load <4 x i32>, ptr %2, align 4
499  %3 = sext <4 x i32> %wide.load16 to <4 x i64>
500  %4 = add nsw <4 x i64> %3, %1
501  %5 = lshr <4 x i64> %4, <i64 1, i64 1, i64 1, i64 1>
502  %6 = trunc <4 x i64> %5 to <4 x i32>
503  %7 = getelementptr inbounds i32, ptr %z, i32 %index
504  store <4 x i32> %6, ptr %7, align 4
505  %index.next = add i32 %index, 4
506  %8 = icmp eq i32 %index.next, 1024
507  br i1 %8, label %for.cond.cleanup, label %vector.body
508
509for.cond.cleanup:                                 ; preds = %vector.body
510  ret void
511}
512
; Loop form of the unsigned i8 halving add. Each iteration loads <16 x i8> from
; %x and %y at %index, zero-extends both to i16, adds, shifts right by 1,
; truncates to i8 and stores to %z.
; The index advances by 16 until it equals the constant 1024 (%n is not used),
; which matches the loop count of 64 in the expected output. The expected loop
; body is two post-incremented loads, one vhadd.u8 and a post-incremented store.
513define void @vhadd_loop_u8(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture writeonly %z, i32 %n) {
514; CHECK-LABEL: vhadd_loop_u8:
515; CHECK:       @ %bb.0: @ %entry
516; CHECK-NEXT:    .save {r7, lr}
517; CHECK-NEXT:    push {r7, lr}
518; CHECK-NEXT:    mov.w lr, #64
519; CHECK-NEXT:  .LBB27_1: @ %vector.body
520; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
521; CHECK-NEXT:    vldrb.u8 q0, [r0], #16
522; CHECK-NEXT:    vldrb.u8 q1, [r1], #16
523; CHECK-NEXT:    vhadd.u8 q0, q1, q0
524; CHECK-NEXT:    vstrb.8 q0, [r2], #16
525; CHECK-NEXT:    le lr, .LBB27_1
526; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
527; CHECK-NEXT:    pop {r7, pc}
528entry:
529  br label %vector.body
530
531vector.body:                                      ; preds = %vector.body, %entry
532  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
533  %0 = getelementptr inbounds i8, ptr %x, i32 %index
534  %wide.load = load <16 x i8>, ptr %0, align 1
535  %1 = zext <16 x i8> %wide.load to <16 x i16>
536  %2 = getelementptr inbounds i8, ptr %y, i32 %index
537  %wide.load16 = load <16 x i8>, ptr %2, align 1
538  %3 = zext <16 x i8> %wide.load16 to <16 x i16>
539  %4 = add nuw nsw <16 x i16> %3, %1
540  %5 = lshr <16 x i16> %4, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
541  %6 = trunc <16 x i16> %5 to <16 x i8>
542  %7 = getelementptr inbounds i8, ptr %z, i32 %index
543  store <16 x i8> %6, ptr %7, align 1
544  %index.next = add i32 %index, 16
545  %8 = icmp eq i32 %index.next, 1024
546  br i1 %8, label %for.cond.cleanup, label %vector.body
547
548for.cond.cleanup:                                 ; preds = %vector.body
549  ret void
550}
551
; Loop form of the unsigned i16 halving add. Each iteration loads <8 x i16>
; from %x and %y at %index, zero-extends both to i32, adds, shifts right by 1,
; truncates to i16 and stores to %z.
; The index advances by 8 until it equals the constant 1024 (%n is not used),
; which matches the loop count of 128 in the expected output. The expected loop
; body is two post-incremented loads, one vhadd.u16 and a post-incremented store.
552define void @vhadd_loop_u16(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture writeonly %z, i32 %n) {
553; CHECK-LABEL: vhadd_loop_u16:
554; CHECK:       @ %bb.0: @ %entry
555; CHECK-NEXT:    .save {r7, lr}
556; CHECK-NEXT:    push {r7, lr}
557; CHECK-NEXT:    mov.w lr, #128
558; CHECK-NEXT:  .LBB28_1: @ %vector.body
559; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
560; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
561; CHECK-NEXT:    vldrh.u16 q1, [r1], #16
562; CHECK-NEXT:    vhadd.u16 q0, q1, q0
563; CHECK-NEXT:    vstrb.8 q0, [r2], #16
564; CHECK-NEXT:    le lr, .LBB28_1
565; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
566; CHECK-NEXT:    pop {r7, pc}
567entry:
568  br label %vector.body
569
570vector.body:                                      ; preds = %vector.body, %entry
571  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
572  %0 = getelementptr inbounds i16, ptr %x, i32 %index
573  %wide.load = load <8 x i16>, ptr %0, align 2
574  %1 = zext <8 x i16> %wide.load to <8 x i32>
575  %2 = getelementptr inbounds i16, ptr %y, i32 %index
576  %wide.load16 = load <8 x i16>, ptr %2, align 2
577  %3 = zext <8 x i16> %wide.load16 to <8 x i32>
578  %4 = add nuw nsw <8 x i32> %3, %1
579  %5 = lshr <8 x i32> %4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
580  %6 = trunc <8 x i32> %5 to <8 x i16>
581  %7 = getelementptr inbounds i16, ptr %z, i32 %index
582  store <8 x i16> %6, ptr %7, align 2
583  %index.next = add i32 %index, 8
584  %8 = icmp eq i32 %index.next, 1024
585  br i1 %8, label %for.cond.cleanup, label %vector.body
586
587for.cond.cleanup:                                 ; preds = %vector.body
588  ret void
589}
590
; Loop form of the unsigned i32 halving add. Each iteration loads <4 x i32>
; from %x and %y at %index, zero-extends both to i64, adds, shifts right by 1,
; truncates to i32 and stores to %z.
; The index advances by 4 until it equals the constant 1024 (%n is not used),
; which matches the loop count of 256 in the expected output. The expected loop
; body is two post-incremented loads, one vhadd.u32 and a post-incremented store.
591define void @vhadd_loop_u32(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture writeonly %z, i32 %n) {
592; CHECK-LABEL: vhadd_loop_u32:
593; CHECK:       @ %bb.0: @ %entry
594; CHECK-NEXT:    .save {r7, lr}
595; CHECK-NEXT:    push {r7, lr}
596; CHECK-NEXT:    mov.w lr, #256
597; CHECK-NEXT:  .LBB29_1: @ %vector.body
598; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
599; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
600; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
601; CHECK-NEXT:    vhadd.u32 q0, q1, q0
602; CHECK-NEXT:    vstrb.8 q0, [r2], #16
603; CHECK-NEXT:    le lr, .LBB29_1
604; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
605; CHECK-NEXT:    pop {r7, pc}
606entry:
607  br label %vector.body
608
609vector.body:                                      ; preds = %vector.body, %entry
610  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
611  %0 = getelementptr inbounds i32, ptr %x, i32 %index
612  %wide.load = load <4 x i32>, ptr %0, align 4
613  %1 = zext <4 x i32> %wide.load to <4 x i64>
614  %2 = getelementptr inbounds i32, ptr %y, i32 %index
615  %wide.load16 = load <4 x i32>, ptr %2, align 4
616  %3 = zext <4 x i32> %wide.load16 to <4 x i64>
617  %4 = add nuw nsw <4 x i64> %3, %1
618  %5 = lshr <4 x i64> %4, <i64 1, i64 1, i64 1, i64 1>
619  %6 = trunc <4 x i64> %5 to <4 x i32>
620  %7 = getelementptr inbounds i32, ptr %z, i32 %index
621  store <4 x i32> %6, ptr %7, align 4
622  %index.next = add i32 %index, 4
623  %8 = icmp eq i32 %index.next, 1024
624  br i1 %8, label %for.cond.cleanup, label %vector.body
625
626for.cond.cleanup:                                 ; preds = %vector.body
627  ret void
628}
629
; Loop form of the signed i8 rounding halving add. Each iteration loads
; <16 x i8> from %x and %y at %index, sign-extends both to i16, computes
; (x + 1 + y) >> 1 with lshr, truncates to i8 and stores to %z.
; The index advances by 16 until it equals the constant 1024 (%n is not used),
; which matches the loop count of 64 in the expected output.
; The extensions are sext with nsw-only adds so that this function exercises
; the signed form; the zero-extended form is covered by vrhadd_loop_u8.
; NOTE(review): the expected vrhadd.s8 follows the signed pattern already
; asserted in vrhadds_v16i8 and vhadd_loop_s8; re-run
; utils/update_llc_test_checks.py to confirm the exact output.
630define void @vrhadd_loop_s8(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture writeonly %z, i32 %n) {
631; CHECK-LABEL: vrhadd_loop_s8:
632; CHECK:       @ %bb.0: @ %entry
633; CHECK-NEXT:    .save {r7, lr}
634; CHECK-NEXT:    push {r7, lr}
635; CHECK-NEXT:    mov.w lr, #64
636; CHECK-NEXT:  .LBB30_1: @ %vector.body
637; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
638; CHECK-NEXT:    vldrb.u8 q0, [r1], #16
639; CHECK-NEXT:    vldrb.u8 q1, [r0], #16
640; CHECK-NEXT:    vrhadd.s8 q0, q1, q0
641; CHECK-NEXT:    vstrb.8 q0, [r2], #16
642; CHECK-NEXT:    le lr, .LBB30_1
643; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
644; CHECK-NEXT:    pop {r7, pc}
645entry:
646  br label %vector.body
647
648vector.body:                                      ; preds = %vector.body, %entry
649  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
650  %0 = getelementptr inbounds i8, ptr %x, i32 %index
651  %wide.load = load <16 x i8>, ptr %0, align 1
652  %1 = sext <16 x i8> %wide.load to <16 x i16>
653  %2 = getelementptr inbounds i8, ptr %y, i32 %index
654  %wide.load16 = load <16 x i8>, ptr %2, align 1
655  %3 = sext <16 x i8> %wide.load16 to <16 x i16>
656  %4 = add nsw <16 x i16> %1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
657  %5 = add nsw <16 x i16> %4, %3
658  %6 = lshr <16 x i16> %5, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
659  %7 = trunc <16 x i16> %6 to <16 x i8>
660  %8 = getelementptr inbounds i8, ptr %z, i32 %index
661  store <16 x i8> %7, ptr %8, align 1
662  %index.next = add i32 %index, 16
663  %9 = icmp eq i32 %index.next, 1024
664  br i1 %9, label %for.cond.cleanup, label %vector.body
665
666for.cond.cleanup:                                 ; preds = %vector.body
667  ret void
668}
669
; Loop form of the signed i16 rounding halving add. Each iteration loads
; <8 x i16> from %x and %y at %index, sign-extends both to i32, computes
; (x + 1 + y) >> 1 with lshr, truncates to i16 and stores to %z.
; The index advances by 8 until it equals the constant 1024 (%n is not used),
; which matches the loop count of 128 in the expected output.
; The extensions are sext with nsw-only adds so that this function exercises
; the signed form rather than repeating the zero-extended pattern.
; NOTE(review): the expected vrhadd.s16 follows the signed pattern already
; asserted in vrhadds_v8i16 and vhadd_loop_s16; re-run
; utils/update_llc_test_checks.py to confirm the exact output.
670define void @vrhadd_loop_s16(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture writeonly %z, i32 %n) {
671; CHECK-LABEL: vrhadd_loop_s16:
672; CHECK:       @ %bb.0: @ %entry
673; CHECK-NEXT:    .save {r7, lr}
674; CHECK-NEXT:    push {r7, lr}
675; CHECK-NEXT:    mov.w lr, #128
676; CHECK-NEXT:  .LBB31_1: @ %vector.body
677; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
678; CHECK-NEXT:    vldrh.u16 q0, [r1], #16
679; CHECK-NEXT:    vldrh.u16 q1, [r0], #16
680; CHECK-NEXT:    vrhadd.s16 q0, q1, q0
681; CHECK-NEXT:    vstrb.8 q0, [r2], #16
682; CHECK-NEXT:    le lr, .LBB31_1
683; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
684; CHECK-NEXT:    pop {r7, pc}
685entry:
686  br label %vector.body
687
688vector.body:                                      ; preds = %vector.body, %entry
689  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
690  %0 = getelementptr inbounds i16, ptr %x, i32 %index
691  %wide.load = load <8 x i16>, ptr %0, align 2
692  %1 = sext <8 x i16> %wide.load to <8 x i32>
693  %2 = getelementptr inbounds i16, ptr %y, i32 %index
694  %wide.load16 = load <8 x i16>, ptr %2, align 2
695  %3 = sext <8 x i16> %wide.load16 to <8 x i32>
696  %4 = add nsw <8 x i32> %1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
697  %5 = add nsw <8 x i32> %4, %3
698  %6 = lshr <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
699  %7 = trunc <8 x i32> %6 to <8 x i16>
700  %8 = getelementptr inbounds i16, ptr %z, i32 %index
701  store <8 x i16> %7, ptr %8, align 2
702  %index.next = add i32 %index, 8
703  %9 = icmp eq i32 %index.next, 1024
704  br i1 %9, label %for.cond.cleanup, label %vector.body
705
706for.cond.cleanup:                                 ; preds = %vector.body
707  ret void
708}
709
; Loop form of the signed i32 rounding halving add. Each iteration loads
; <4 x i32> from %x and %y at %index, sign-extends both to i64, computes
; (x + 1 + y) >> 1 with lshr, truncates to i32 and stores to %z.
; The index advances by 4 until it equals the constant 1024 (%n is not used),
; which matches the loop count of 256 in the expected output.
; The extensions are sext with nsw-only adds so that this function exercises
; the signed form rather than repeating the zero-extended pattern.
; NOTE(review): the expected vrhadd.s32 follows the signed pattern already
; asserted in vrhadds_v4i32 and vhadd_loop_s32; re-run
; utils/update_llc_test_checks.py to confirm the exact output.
710define void @vrhadd_loop_s32(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture writeonly %z, i32 %n) {
711; CHECK-LABEL: vrhadd_loop_s32:
712; CHECK:       @ %bb.0: @ %entry
713; CHECK-NEXT:    .save {r7, lr}
714; CHECK-NEXT:    push {r7, lr}
715; CHECK-NEXT:    mov.w lr, #256
716; CHECK-NEXT:  .LBB32_1: @ %vector.body
717; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
718; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
719; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
720; CHECK-NEXT:    vrhadd.s32 q0, q1, q0
721; CHECK-NEXT:    vstrb.8 q0, [r2], #16
722; CHECK-NEXT:    le lr, .LBB32_1
723; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
724; CHECK-NEXT:    pop {r7, pc}
725entry:
726  br label %vector.body
727
728vector.body:                                      ; preds = %vector.body, %entry
729  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
730  %0 = getelementptr inbounds i32, ptr %x, i32 %index
731  %wide.load = load <4 x i32>, ptr %0, align 4
732  %1 = sext <4 x i32> %wide.load to <4 x i64>
733  %2 = getelementptr inbounds i32, ptr %y, i32 %index
734  %wide.load16 = load <4 x i32>, ptr %2, align 4
735  %3 = sext <4 x i32> %wide.load16 to <4 x i64>
736  %4 = add nsw <4 x i64> %1, <i64 1, i64 1, i64 1, i64 1>
737  %5 = add nsw <4 x i64> %4, %3
738  %6 = lshr <4 x i64> %5, <i64 1, i64 1, i64 1, i64 1>
739  %7 = trunc <4 x i64> %6 to <4 x i32>
740  %8 = getelementptr inbounds i32, ptr %z, i32 %index
741  store <4 x i32> %7, ptr %8, align 4
742  %index.next = add i32 %index, 4
743  %9 = icmp eq i32 %index.next, 1024
744  br i1 %9, label %for.cond.cleanup, label %vector.body
745
746for.cond.cleanup:                                 ; preds = %vector.body
747  ret void
748}
749
; Loop form of the unsigned i8 rounding halving add. Each iteration loads
; <16 x i8> from %x and %y at %index, zero-extends both to i16, computes
; (x + 1 + y) >> 1, truncates to i8 and stores to %z.
; The index advances by 16 until it equals the constant 1024 (%n is not used),
; which matches the loop count of 64 in the expected output. The expected loop
; body is two post-incremented loads, one vrhadd.u8 and a post-incremented store.
750define void @vrhadd_loop_u8(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture writeonly %z, i32 %n) {
751; CHECK-LABEL: vrhadd_loop_u8:
752; CHECK:       @ %bb.0: @ %entry
753; CHECK-NEXT:    .save {r7, lr}
754; CHECK-NEXT:    push {r7, lr}
755; CHECK-NEXT:    mov.w lr, #64
756; CHECK-NEXT:  .LBB33_1: @ %vector.body
757; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
758; CHECK-NEXT:    vldrb.u8 q0, [r1], #16
759; CHECK-NEXT:    vldrb.u8 q1, [r0], #16
760; CHECK-NEXT:    vrhadd.u8 q0, q1, q0
761; CHECK-NEXT:    vstrb.8 q0, [r2], #16
762; CHECK-NEXT:    le lr, .LBB33_1
763; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
764; CHECK-NEXT:    pop {r7, pc}
765entry:
766  br label %vector.body
767
768vector.body:                                      ; preds = %vector.body, %entry
769  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
770  %0 = getelementptr inbounds i8, ptr %x, i32 %index
771  %wide.load = load <16 x i8>, ptr %0, align 1
772  %1 = zext <16 x i8> %wide.load to <16 x i16>
773  %2 = getelementptr inbounds i8, ptr %y, i32 %index
774  %wide.load16 = load <16 x i8>, ptr %2, align 1
775  %3 = zext <16 x i8> %wide.load16 to <16 x i16>
776  %4 = add nuw nsw <16 x i16> %1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
777  %5 = add nuw nsw <16 x i16> %4, %3
778  %6 = lshr <16 x i16> %5, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
779  %7 = trunc <16 x i16> %6 to <16 x i8>
780  %8 = getelementptr inbounds i8, ptr %z, i32 %index
781  store <16 x i8> %7, ptr %8, align 1
782  %index.next = add i32 %index, 16
783  %9 = icmp eq i32 %index.next, 1024
784  br i1 %9, label %for.cond.cleanup, label %vector.body
785
786for.cond.cleanup:                                 ; preds = %vector.body
787  ret void
788}
789
; Unsigned rounding halving add over i16 data. Each iteration loads <8 x i16>
; from %x and from %y, zero-extends both to <8 x i32>, computes
; (x + 1 + y) >> 1 with a logical shift, truncates back to <8 x i16> and stores
; the result to %z. The index advances by 8 until it equals 1024; %n is not
; referenced by the body. The autogenerated assertions below expect one
; vrhadd.u16 per iteration inside a loop closed by `le`, with lr set to 128.
790define void @vrhadd_loop_u16(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture writeonly %z, i32 %n) {
791; CHECK-LABEL: vrhadd_loop_u16:
792; CHECK:       @ %bb.0: @ %entry
793; CHECK-NEXT:    .save {r7, lr}
794; CHECK-NEXT:    push {r7, lr}
795; CHECK-NEXT:    mov.w lr, #128
796; CHECK-NEXT:  .LBB34_1: @ %vector.body
797; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
798; CHECK-NEXT:    vldrh.u16 q0, [r1], #16
799; CHECK-NEXT:    vldrh.u16 q1, [r0], #16
800; CHECK-NEXT:    vrhadd.u16 q0, q1, q0
801; CHECK-NEXT:    vstrb.8 q0, [r2], #16
802; CHECK-NEXT:    le lr, .LBB34_1
803; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
804; CHECK-NEXT:    pop {r7, pc}
805entry:
806  br label %vector.body
807
808vector.body:                                      ; preds = %vector.body, %entry
809  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
810  %0 = getelementptr inbounds i16, ptr %x, i32 %index
811  %wide.load = load <8 x i16>, ptr %0, align 2
812  %1 = zext <8 x i16> %wide.load to <8 x i32>
813  %2 = getelementptr inbounds i16, ptr %y, i32 %index
814  %wide.load16 = load <8 x i16>, ptr %2, align 2
815  %3 = zext <8 x i16> %wide.load16 to <8 x i32>
  ; Add the rounding bias of 1 in the widened type, then the second operand.
816  %4 = add nuw nsw <8 x i32> %1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
817  %5 = add nuw nsw <8 x i32> %4, %3
  ; Halve with a logical shift and narrow back to the element type.
818  %6 = lshr <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
819  %7 = trunc <8 x i32> %6 to <8 x i16>
820  %8 = getelementptr inbounds i16, ptr %z, i32 %index
821  store <8 x i16> %7, ptr %8, align 2
822  %index.next = add i32 %index, 8
823  %9 = icmp eq i32 %index.next, 1024
824  br i1 %9, label %for.cond.cleanup, label %vector.body
825
826for.cond.cleanup:                                 ; preds = %vector.body
827  ret void
828}
829
; Unsigned rounding halving add over i32 data. Each iteration loads <4 x i32>
; from %x and from %y, zero-extends both to <4 x i64>, computes
; (x + 1 + y) >> 1 with a logical shift, truncates back to <4 x i32> and stores
; the result to %z. The index advances by 4 until it equals 1024; %n is not
; referenced by the body. The autogenerated assertions below expect one
; vrhadd.u32 per iteration inside a loop closed by `le`, with lr set to 256.
830define void @vrhadd_loop_u32(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture writeonly %z, i32 %n) {
831; CHECK-LABEL: vrhadd_loop_u32:
832; CHECK:       @ %bb.0: @ %entry
833; CHECK-NEXT:    .save {r7, lr}
834; CHECK-NEXT:    push {r7, lr}
835; CHECK-NEXT:    mov.w lr, #256
836; CHECK-NEXT:  .LBB35_1: @ %vector.body
837; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
838; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
839; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
840; CHECK-NEXT:    vrhadd.u32 q0, q1, q0
841; CHECK-NEXT:    vstrb.8 q0, [r2], #16
842; CHECK-NEXT:    le lr, .LBB35_1
843; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
844; CHECK-NEXT:    pop {r7, pc}
845entry:
846  br label %vector.body
847
848vector.body:                                      ; preds = %vector.body, %entry
849  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
850  %0 = getelementptr inbounds i32, ptr %x, i32 %index
851  %wide.load = load <4 x i32>, ptr %0, align 4
852  %1 = zext <4 x i32> %wide.load to <4 x i64>
853  %2 = getelementptr inbounds i32, ptr %y, i32 %index
854  %wide.load16 = load <4 x i32>, ptr %2, align 4
855  %3 = zext <4 x i32> %wide.load16 to <4 x i64>
  ; Add the rounding bias of 1 in the widened type, then the second operand.
856  %4 = add nuw nsw <4 x i64> %1, <i64 1, i64 1, i64 1, i64 1>
857  %5 = add nuw nsw <4 x i64> %4, %3
  ; Halve with a logical shift and narrow back to the element type.
858  %6 = lshr <4 x i64> %5, <i64 1, i64 1, i64 1, i64 1>
859  %7 = trunc <4 x i64> %6 to <4 x i32>
860  %8 = getelementptr inbounds i32, ptr %z, i32 %index
861  store <4 x i32> %7, ptr %8, align 4
862  %index.next = add i32 %index, 4
863  %9 = icmp eq i32 %index.next, 1024
864  br i1 %9, label %for.cond.cleanup, label %vector.body
865
866for.cond.cleanup:                                 ; preds = %vector.body
867  ret void
868}
869
870
; Signed halving add feeding an add reduction. Both <16 x i8> inputs are
; sign-extended to <16 x i16>, added, arithmetically shifted right by 1, and
; the sixteen i16 lanes are summed by llvm.vector.reduce.add into one i16.
; The autogenerated assertions below expect vhadd.s8 followed by vaddv.s8
; into r0, with no separate widening instructions.
871define arm_aapcs_vfpcc i16 @vhadds_reduce_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
872; CHECK-LABEL: vhadds_reduce_v16i8:
873; CHECK:       @ %bb.0: @ %entry
874; CHECK-NEXT:    vhadd.s8 q0, q0, q1
875; CHECK-NEXT:    vaddv.s8 r0, q0
876; CHECK-NEXT:    bx lr
877entry:
878  %s0s = sext <16 x i8> %s0 to <16 x i16>
879  %s1s = sext <16 x i8> %s1 to <16 x i16>
880  %add = add <16 x i16> %s0s, %s1s
881  %s = ashr <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
882  %result = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
883  ret i16 %result
884}
885
; Unsigned halving add feeding an add reduction. Both <16 x i8> inputs are
; zero-extended to <16 x i16>, added, logically shifted right by 1, and the
; sixteen i16 lanes are summed by llvm.vector.reduce.add into one i16.
; The autogenerated assertions below expect vhadd.u8 followed by vaddv.u8
; into r0, with no separate widening instructions.
886define arm_aapcs_vfpcc i16 @vhaddu_reduce_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
887; CHECK-LABEL: vhaddu_reduce_v16i8:
888; CHECK:       @ %bb.0: @ %entry
889; CHECK-NEXT:    vhadd.u8 q0, q0, q1
890; CHECK-NEXT:    vaddv.u8 r0, q0
891; CHECK-NEXT:    bx lr
892entry:
893  %s0s = zext <16 x i8> %s0 to <16 x i16>
894  %s1s = zext <16 x i8> %s1 to <16 x i16>
895  %add = add <16 x i16> %s0s, %s1s
896  %s = lshr <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
897  %result = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
898  ret i16 %result
899}
900
; Signed rounding halving add feeding an add reduction. Both <16 x i8> inputs
; are sign-extended to <16 x i16> and added, a bias of 1 is added, the sum is
; arithmetically shifted right by 1, and the sixteen i16 lanes are summed by
; llvm.vector.reduce.add into one i16. The autogenerated assertions below
; expect vrhadd.s8 followed by vaddv.s8 into r0.
901define arm_aapcs_vfpcc i16 @vrhadds_reduce_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
902; CHECK-LABEL: vrhadds_reduce_v16i8:
903; CHECK:       @ %bb.0: @ %entry
904; CHECK-NEXT:    vrhadd.s8 q0, q0, q1
905; CHECK-NEXT:    vaddv.s8 r0, q0
906; CHECK-NEXT:    bx lr
907entry:
908  %s0s = sext <16 x i8> %s0 to <16 x i16>
909  %s1s = sext <16 x i8> %s1 to <16 x i16>
910  %add = add <16 x i16> %s0s, %s1s
  ; Rounding bias: unlike the vhadds_reduce test, 1 is added before the shift.
911  %add2 = add <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
912  %s = ashr <16 x i16> %add2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
913  %result = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
914  ret i16 %result
915}
916
; Unsigned rounding halving add feeding an add reduction. Both <16 x i8>
; inputs are zero-extended to <16 x i16> and added, a bias of 1 is added, the
; sum is logically shifted right by 1, and the sixteen i16 lanes are summed by
; llvm.vector.reduce.add into one i16. The autogenerated assertions below
; expect vrhadd.u8 followed by vaddv.u8 into r0.
917define arm_aapcs_vfpcc i16 @vrhaddu_reduce_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
918; CHECK-LABEL: vrhaddu_reduce_v16i8:
919; CHECK:       @ %bb.0: @ %entry
920; CHECK-NEXT:    vrhadd.u8 q0, q0, q1
921; CHECK-NEXT:    vaddv.u8 r0, q0
922; CHECK-NEXT:    bx lr
923entry:
924  %s0s = zext <16 x i8> %s0 to <16 x i16>
925  %s1s = zext <16 x i8> %s1 to <16 x i16>
926  %add = add <16 x i16> %s0s, %s1s
  ; Rounding bias: unlike the vhaddu_reduce test, 1 is added before the shift.
927  %add2 = add <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
928  %s = lshr <16 x i16> %add2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
929  %result = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
930  ret i16 %result
931}
932
; Add-reduction intrinsic called by the four *_reduce_v16i8 tests in this file;
; it takes a <16 x i16> vector and returns a single i16.
933declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
934