; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK

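; The two v4i32 add reductions and the add that joins them lower to a VADDV
; followed by an accumulating VADDVA.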
define arm_aapcs_vfpcc i32 @test1(ptr %ptr, i32 %arg1, <4 x i32> %arg2, <4 x i32> %arg3) {
; CHECK-LABEL: test1:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddv.u32 r2, q1
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    adds r0, r2, r1
; CHECK-NEXT:    bx lr
entry:
  %reduce1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %arg2)
  %reduce2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %arg3)
  %add1 = add i32 %reduce1, %reduce2
  store i32 %add1, ptr %ptr, align 4
  %add2 = add i32 %add1, %arg1
  ret i32 %add2
}

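; Same as test1, but with the operands of the final scalar add swapped.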
define arm_aapcs_vfpcc i32 @test2(ptr %ptr, i32 %arg1, <4 x i32> %arg2, <4 x i32> %arg3) {
; CHECK-LABEL: test2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddv.u32 r2, q1
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    adds r0, r1, r2
; CHECK-NEXT:    bx lr
entry:
  %reduce1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %arg2)
  %reduce2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %arg3)
  %add1 = add i32 %reduce1, %reduce2
  store i32 %add1, ptr %ptr, align 4
  %add2 = add i32 %arg1, %add1
  ret i32 %add2
}

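; The scalar operands %arg1 and %arg2 are folded into accumulating VADDVA
; reductions.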
define arm_aapcs_vfpcc i32 @test3(ptr %ptr, i32 %arg1, i32 %arg2, <4 x i32> %arg3, <4 x i32> %arg4) {
; CHECK-LABEL: test3:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    mov r12, r1
; CHECK-NEXT:    vaddva.u32 r2, q1
; CHECK-NEXT:    vaddva.u32 r12, q0
; CHECK-NEXT:    str.w r12, [r0]
; CHECK-NEXT:    add.w r0, r12, r2
; CHECK-NEXT:    bx lr
entry:
  %reduce1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %arg3)
  %reduce2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %arg4)
  %add1 = add i32 %arg1, %reduce1
  store i32 %add1, ptr %ptr, align 4
  %add2 = add i32 %arg2, %reduce2
  %add3 = add i32 %add1, %add2
  ret i32 %add3
}

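; Both loaded vectors are reduced into the same accumulator with VADDVA; the
; intermediate sum is stored between the two reductions.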
define arm_aapcs_vfpcc i32 @test4(ptr %ptr, i32 %arg1, ptr %arg2) {
; CHECK-LABEL: test4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    mov r12, r1
; CHECK-NEXT:    vaddva.u32 r12, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #4]
; CHECK-NEXT:    str.w r12, [r0]
; CHECK-NEXT:    vaddva.u32 r12, q0
; CHECK-NEXT:    mov r0, r12
; CHECK-NEXT:    bx lr
entry:
  %load1 = load <4 x i32>, ptr %arg2, align 4
  %gep = getelementptr inbounds i32, ptr %arg2, i32 1
  %load2 = load <4 x i32>, ptr %gep, align 4
  %reduce1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %load1)
  %reduce2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %load2)
  %add1 = add i32 %arg1, %reduce1
  store i32 %add1, ptr %ptr, align 4
  %add2 = add i32 %add1, %reduce2
  ret i32 %add2
}

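; Same as test4, but the vector loaded through the gep is reduced first.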
define arm_aapcs_vfpcc i32 @test5(ptr %ptr, i32 %arg1, ptr %arg2) {
; CHECK-LABEL: test5:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2, #4]
; CHECK-NEXT:    mov r12, r1
; CHECK-NEXT:    vaddva.u32 r12, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    str.w r12, [r0]
; CHECK-NEXT:    vaddva.u32 r12, q0
; CHECK-NEXT:    mov r0, r12
; CHECK-NEXT:    bx lr
entry:
  %load1 = load <4 x i32>, ptr %arg2, align 4
  %gep = getelementptr inbounds i32, ptr %arg2, i32 1
  %load2 = load <4 x i32>, ptr %gep, align 4
  %reduce1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %load1)
  %reduce2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %load2)
  %add1 = add i32 %arg1, %reduce2
  store i32 %add1, ptr %ptr, align 4
  %add2 = add i32 %add1, %reduce1
  ret i32 %add2
}

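; A shuffle that merely permutes the lanes of its single input does not change
; the sum, so it is dropped and a plain VADDV is emitted.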
define arm_aapcs_vfpcc i16 @vaddv_shuffle_v16i8(<16 x i8> %s0) {
; CHECK-LABEL: vaddv_shuffle_v16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddv.u8 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %s2 = shufflevector <16 x i8> %s0, <16 x i8> %s0, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %s1 = zext <16 x i8> %s2 to <16 x i16>
  %result = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s1)
  ret i16 %result
}

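; The mask uses lane 1 twice (and never lane 0), so it is not a permutation and
; the shuffle has to be materialised.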
define arm_aapcs_vfpcc i16 @vaddv_shuffle_v16i8_duplicate(<16 x i8> %s0) {
; CHECK-LABEL: vaddv_shuffle_v16i8_duplicate:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.u8 r0, q0[1]
; CHECK-NEXT:    vmov.u8 r1, q0[2]
; CHECK-NEXT:    vmov.8 q1[0], r0
; CHECK-NEXT:    vmov.8 q1[1], r1
; CHECK-NEXT:    vmov.u8 r1, q0[4]
; CHECK-NEXT:    vmov.8 q1[2], r1
; CHECK-NEXT:    vmov.u8 r1, q0[6]
; CHECK-NEXT:    vmov.8 q1[3], r1
; CHECK-NEXT:    vmov.u8 r1, q0[8]
; CHECK-NEXT:    vmov.8 q1[4], r1
; CHECK-NEXT:    vmov.u8 r1, q0[10]
; CHECK-NEXT:    vmov.8 q1[5], r1
; CHECK-NEXT:    vmov.u8 r1, q0[12]
; CHECK-NEXT:    vmov.8 q1[6], r1
; CHECK-NEXT:    vmov.u8 r1, q0[14]
; CHECK-NEXT:    vmov.8 q1[7], r1
; CHECK-NEXT:    vmov.8 q1[8], r0
; CHECK-NEXT:    vmov.u8 r0, q0[3]
; CHECK-NEXT:    vmov.8 q1[9], r0
; CHECK-NEXT:    vmov.u8 r0, q0[5]
; CHECK-NEXT:    vmov.8 q1[10], r0
; CHECK-NEXT:    vmov.u8 r0, q0[7]
; CHECK-NEXT:    vmov.8 q1[11], r0
; CHECK-NEXT:    vmov.u8 r0, q0[9]
; CHECK-NEXT:    vmov.8 q1[12], r0
; CHECK-NEXT:    vmov.u8 r0, q0[11]
; CHECK-NEXT:    vmov.8 q1[13], r0
; CHECK-NEXT:    vmov.u8 r0, q0[13]
; CHECK-NEXT:    vmov.8 q1[14], r0
; CHECK-NEXT:    vmov.u8 r0, q0[15]
; CHECK-NEXT:    vmov.8 q1[15], r0
; CHECK-NEXT:    vaddv.u8 r0, q1
; CHECK-NEXT:    bx lr
entry:
  %s2 = shufflevector <16 x i8> %s0, <16 x i8> %s0, <16 x i32> <i32 1, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %s1 = zext <16 x i8> %s2 to <16 x i16>
  %result = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s1)
  ret i16 %result
}

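; An undef lane in the mask stops it being recognised as a permutation, so the
; shuffle is still materialised.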
define arm_aapcs_vfpcc i16 @vaddv_shuffle_v16i8_undef(<16 x i8> %s0) {
; CHECK-LABEL: vaddv_shuffle_v16i8_undef:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.u8 r0, q0[2]
; CHECK-NEXT:    vmov.8 q1[1], r0
; CHECK-NEXT:    vmov.u8 r0, q0[4]
; CHECK-NEXT:    vmov.8 q1[2], r0
; CHECK-NEXT:    vmov.u8 r0, q0[6]
; CHECK-NEXT:    vmov.8 q1[3], r0
; CHECK-NEXT:    vmov.u8 r0, q0[8]
; CHECK-NEXT:    vmov.8 q1[4], r0
; CHECK-NEXT:    vmov.u8 r0, q0[10]
; CHECK-NEXT:    vmov.8 q1[5], r0
; CHECK-NEXT:    vmov.u8 r0, q0[12]
; CHECK-NEXT:    vmov.8 q1[6], r0
; CHECK-NEXT:    vmov.u8 r0, q0[14]
; CHECK-NEXT:    vmov.8 q1[7], r0
; CHECK-NEXT:    vmov.u8 r0, q0[1]
; CHECK-NEXT:    vmov.8 q1[8], r0
; CHECK-NEXT:    vmov.u8 r0, q0[3]
; CHECK-NEXT:    vmov.8 q1[9], r0
; CHECK-NEXT:    vmov.u8 r0, q0[5]
; CHECK-NEXT:    vmov.8 q1[10], r0
; CHECK-NEXT:    vmov.u8 r0, q0[7]
; CHECK-NEXT:    vmov.8 q1[11], r0
; CHECK-NEXT:    vmov.u8 r0, q0[9]
; CHECK-NEXT:    vmov.8 q1[12], r0
; CHECK-NEXT:    vmov.u8 r0, q0[11]
; CHECK-NEXT:    vmov.8 q1[13], r0
; CHECK-NEXT:    vmov.u8 r0, q0[13]
; CHECK-NEXT:    vmov.8 q1[14], r0
; CHECK-NEXT:    vmov.u8 r0, q0[15]
; CHECK-NEXT:    vmov.8 q1[15], r0
; CHECK-NEXT:    vaddv.u8 r0, q1
; CHECK-NEXT:    bx lr
entry:
  %s2 = shufflevector <16 x i8> %s0, <16 x i8> %s0, <16 x i32> <i32 undef, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %s1 = zext <16 x i8> %s2 to <16 x i16>
  %result = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s1)
  ret i16 %result
}

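; The lane permutation is also dropped for the widening (zext to i64)
; reduction, giving VADDLV.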
define arm_aapcs_vfpcc i64 @vaddv_shuffle_v4i32_long(<4 x i32> %s0) {
; CHECK-LABEL: vaddv_shuffle_v4i32_long:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddlv.u32 r0, r1, q0
; CHECK-NEXT:    bx lr
entry:
  %s2 = shufflevector <4 x i32> %s0, <4 x i32> %s0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  %s1 = zext <4 x i32> %s2 to <4 x i64>
  %r = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s1)
  ret i64 %r
}

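; As above, but with a scalar accumulator added to the result: VADDLVA.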
define arm_aapcs_vfpcc i64 @vaddv_shuffle_v4i32_long_a(<4 x i32> %s0, i64 %a) {
; CHECK-LABEL: vaddv_shuffle_v4i32_long_a:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddlva.u32 r0, r1, q0
; CHECK-NEXT:    bx lr
entry:
  %s2 = shufflevector <4 x i32> %s0, <4 x i32> %s0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  %s1 = zext <4 x i32> %s2 to <4 x i64>
  %r = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s1)
  %r2 = add i64 %r, %a
  ret i64 %r2
}

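; The same permutation applied to both multiply operands does not change the
; reduced sum, so both shuffles are dropped and VMLAV is used.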
define arm_aapcs_vfpcc i16 @vmla_shuffle_v16i8(<16 x i8> %s0, <16 x i8> %s0b) {
; CHECK-LABEL: vmla_shuffle_v16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlav.s8 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %s2a = shufflevector <16 x i8> %s0, <16 x i8> %s0, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %s1a = sext <16 x i8> %s2a to <16 x i16>
  %s2b = shufflevector <16 x i8> %s0b, <16 x i8> %s0, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %s1b = sext <16 x i8> %s2b to <16 x i16>
  %s1 = mul <16 x i16> %s1a, %s1b
  %result = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s1)
  ret i16 %result
}

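; The two masks differ (lanes 14 and 15 are swapped on one side), so the
; shuffles cannot be removed.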
define arm_aapcs_vfpcc i16 @vmla_shuffle_v16i8_unequal(<16 x i8> %s0, <16 x i8> %s0b) {
; CHECK-LABEL: vmla_shuffle_v16i8_unequal:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.u8 r0, q1[0]
; CHECK-NEXT:    vmov.8 q2[0], r0
; CHECK-NEXT:    vmov.u8 r0, q1[2]
; CHECK-NEXT:    vmov.8 q2[1], r0
; CHECK-NEXT:    vmov.u8 r0, q1[4]
; CHECK-NEXT:    vmov.8 q2[2], r0
; CHECK-NEXT:    vmov.u8 r0, q1[6]
; CHECK-NEXT:    vmov.8 q2[3], r0
; CHECK-NEXT:    vmov.u8 r0, q1[8]
; CHECK-NEXT:    vmov.8 q2[4], r0
; CHECK-NEXT:    vmov.u8 r0, q1[10]
; CHECK-NEXT:    vmov.8 q2[5], r0
; CHECK-NEXT:    vmov.u8 r0, q1[12]
; CHECK-NEXT:    vmov.8 q2[6], r0
; CHECK-NEXT:    vmov.u8 r0, q1[15]
; CHECK-NEXT:    vmov.8 q2[7], r0
; CHECK-NEXT:    vmov.u8 r0, q1[1]
; CHECK-NEXT:    vmov.8 q2[8], r0
; CHECK-NEXT:    vmov.u8 r0, q1[3]
; CHECK-NEXT:    vmov.8 q2[9], r0
; CHECK-NEXT:    vmov.u8 r0, q1[5]
; CHECK-NEXT:    vmov.8 q2[10], r0
; CHECK-NEXT:    vmov.u8 r0, q1[7]
; CHECK-NEXT:    vmov.8 q2[11], r0
; CHECK-NEXT:    vmov.u8 r0, q1[9]
; CHECK-NEXT:    vmov.8 q2[12], r0
; CHECK-NEXT:    vmov.u8 r0, q1[11]
; CHECK-NEXT:    vmov.8 q2[13], r0
; CHECK-NEXT:    vmov.u8 r0, q1[13]
; CHECK-NEXT:    vmov.8 q2[14], r0
; CHECK-NEXT:    vmov.u8 r0, q1[14]
; CHECK-NEXT:    vmov.8 q2[15], r0
; CHECK-NEXT:    vmov.u8 r0, q0[0]
; CHECK-NEXT:    vmov.8 q1[0], r0
; CHECK-NEXT:    vmov.u8 r0, q0[2]
; CHECK-NEXT:    vmov.8 q1[1], r0
; CHECK-NEXT:    vmov.u8 r0, q0[4]
; CHECK-NEXT:    vmov.8 q1[2], r0
; CHECK-NEXT:    vmov.u8 r0, q0[6]
; CHECK-NEXT:    vmov.8 q1[3], r0
; CHECK-NEXT:    vmov.u8 r0, q0[8]
; CHECK-NEXT:    vmov.8 q1[4], r0
; CHECK-NEXT:    vmov.u8 r0, q0[10]
; CHECK-NEXT:    vmov.8 q1[5], r0
; CHECK-NEXT:    vmov.u8 r0, q0[12]
; CHECK-NEXT:    vmov.8 q1[6], r0
; CHECK-NEXT:    vmov.u8 r0, q0[14]
; CHECK-NEXT:    vmov.8 q1[7], r0
; CHECK-NEXT:    vmov.u8 r0, q0[1]
; CHECK-NEXT:    vmov.8 q1[8], r0
; CHECK-NEXT:    vmov.u8 r0, q0[3]
; CHECK-NEXT:    vmov.8 q1[9], r0
; CHECK-NEXT:    vmov.u8 r0, q0[5]
; CHECK-NEXT:    vmov.8 q1[10], r0
; CHECK-NEXT:    vmov.u8 r0, q0[7]
; CHECK-NEXT:    vmov.8 q1[11], r0
; CHECK-NEXT:    vmov.u8 r0, q0[9]
; CHECK-NEXT:    vmov.8 q1[12], r0
; CHECK-NEXT:    vmov.u8 r0, q0[11]
; CHECK-NEXT:    vmov.8 q1[13], r0
; CHECK-NEXT:    vmov.u8 r0, q0[13]
; CHECK-NEXT:    vmov.8 q1[14], r0
; CHECK-NEXT:    vmov.u8 r0, q0[15]
; CHECK-NEXT:    vmov.8 q1[15], r0
; CHECK-NEXT:    vmlav.s8 r0, q1, q2
; CHECK-NEXT:    bx lr
entry:
  %s2a = shufflevector <16 x i8> %s0, <16 x i8> %s0, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %s1a = sext <16 x i8> %s2a to <16 x i16>
  %s2b = shufflevector <16 x i8> %s0b, <16 x i8> %s0, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 15, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 14>
  %s1b = sext <16 x i8> %s2b to <16 x i16>
  %s1 = mul <16 x i16> %s1a, %s1b
  %result = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s1)
  ret i16 %result
}

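; Matching permutations on both operands of the widening multiply-add
; reduction are dropped, giving VMLALV.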
define arm_aapcs_vfpcc i64 @vmla_shuffle_v4i32_long(<4 x i32> %s0, <4 x i32> %s0b) {
; CHECK-LABEL: vmla_shuffle_v4i32_long:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlalv.u32 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %s2a = shufflevector <4 x i32> %s0, <4 x i32> %s0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  %s1a = zext <4 x i32> %s2a to <4 x i64>
  %s2b = shufflevector <4 x i32> %s0b, <4 x i32> %s0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  %s1b = zext <4 x i32> %s2b to <4 x i64>
  %s1 = mul <4 x i64> %s1a, %s1b
  %r = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s1)
  ret i64 %r
}

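; As above, but with a scalar accumulator: VMLALVA.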
define arm_aapcs_vfpcc i64 @vmla_shuffle_v4i32_long_a(<4 x i32> %s0, <4 x i32> %s0b, i64 %a) {
; CHECK-LABEL: vmla_shuffle_v4i32_long_a:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlalva.u32 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %s2a = shufflevector <4 x i32> %s0, <4 x i32> %s0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  %s1a = zext <4 x i32> %s2a to <4 x i64>
  %s2b = shufflevector <4 x i32> %s0b, <4 x i32> %s0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  %s1b = zext <4 x i32> %s2b to <4 x i64>
  %s1 = mul <4 x i64> %s1a, %s1b
  %r = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s1)
  %r2 = add i64 %r, %a
  ret i64 %r2
}

declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)