; xref: /llvm-project/llvm/test/CodeGen/Thumb2/mve-vmla.ll (revision b5b663aac17415625340eb29c8010832bfc4c21c)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s

; vmlau32: A + (B * splat(X)) on <4 x i32>.
; %X is broadcast to every lane (insertelement into lane 0 followed by a
; shufflevector with an all-zero mask), multiplied into %B and the product is
; added to %A. The expected output is a single vmla.i32 that takes the scalar
; straight from r0, with no separate vector duplicate of %X.
define arm_aapcs_vfpcc <4 x i32> @vmlau32(<4 x i32> %A, <4 x i32> %B, i32 %X) nounwind {
; CHECK-LABEL: vmlau32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmla.i32 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = insertelement <4 x i32> undef, i32 %X, i32 0
  %1 = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer
  %2 = mul nsw <4 x i32> %B, %1
  %3 = add nsw <4 x i32> %A, %2
  ret <4 x i32> %3
}

; vmlau32b: (splat(X) * B) + A on <4 x i32>.
; Same computation as the non-"b" i32 variant but with the operands of both
; the mul and the add commuted (splat first in the mul, product first in the
; add). The expected output is still a single vmla.i32 with the scalar in r0.
define arm_aapcs_vfpcc <4 x i32> @vmlau32b(<4 x i32> %A, <4 x i32> %B, i32 %X) nounwind {
; CHECK-LABEL: vmlau32b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmla.i32 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = insertelement <4 x i32> undef, i32 %X, i32 0
  %1 = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer
  %2 = mul nsw <4 x i32> %1, %B
  %3 = add nsw <4 x i32> %2, %A
  ret <4 x i32> %3
}

; vmlau16: A + (B * splat(X)) on <8 x i16>.
; %X is broadcast to all eight lanes (insertelement into lane 0 followed by a
; shufflevector with an all-zero mask), multiplied into %B and added to %A.
; The expected output is a single vmla.i16 with the scalar taken from r0.
define arm_aapcs_vfpcc <8 x i16> @vmlau16(<8 x i16> %A, <8 x i16> %B, i16 %X) nounwind {
; CHECK-LABEL: vmlau16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmla.i16 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = insertelement <8 x i16> undef, i16 %X, i32 0
  %1 = shufflevector <8 x i16> %0, <8 x i16> undef, <8 x i32> zeroinitializer
  %2 = mul nsw <8 x i16> %B, %1
  %3 = add nsw <8 x i16> %A, %2
  ret <8 x i16> %3
}

; vmlau16b: (splat(X) * B) + A on <8 x i16>.
; Commuted form of the i16 multiply-accumulate: the splat is the first mul
; operand and the product is the first add operand. The expected output is
; still a single vmla.i16 with the scalar in r0.
define arm_aapcs_vfpcc <8 x i16> @vmlau16b(<8 x i16> %A, <8 x i16> %B, i16 %X) nounwind {
; CHECK-LABEL: vmlau16b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmla.i16 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = insertelement <8 x i16> undef, i16 %X, i32 0
  %1 = shufflevector <8 x i16> %0, <8 x i16> undef, <8 x i32> zeroinitializer
  %2 = mul nsw <8 x i16> %1, %B
  %3 = add nsw <8 x i16> %2, %A
  ret <8 x i16> %3
}

; vmlau8: A + (B * splat(X)) on <16 x i8>.
; %X is broadcast to all sixteen lanes (insertelement into lane 0 followed by
; a shufflevector with an all-zero mask), multiplied into %B and added to %A.
; The expected output is a single vmla.i8 with the scalar taken from r0.
define arm_aapcs_vfpcc <16 x i8> @vmlau8(<16 x i8> %A, <16 x i8> %B, i8 %X) nounwind {
; CHECK-LABEL: vmlau8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmla.i8 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = insertelement <16 x i8> undef, i8 %X, i32 0
  %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
  %2 = mul nsw <16 x i8> %B, %1
  %3 = add nsw <16 x i8> %A, %2
  ret <16 x i8> %3
}

; vmlau8b: (splat(X) * B) + A on <16 x i8>.
; Commuted form of the i8 multiply-accumulate: the splat is the first mul
; operand and the product is the first add operand. The expected output is
; still a single vmla.i8 with the scalar in r0.
define arm_aapcs_vfpcc <16 x i8> @vmlau8b(<16 x i8> %A, <16 x i8> %B, i8 %X) nounwind {
; CHECK-LABEL: vmlau8b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmla.i8 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = insertelement <16 x i8> undef, i8 %X, i32 0
  %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
  %2 = mul nsw <16 x i8> %1, %B
  %3 = add nsw <16 x i8> %2, %A
  ret <16 x i8> %3
}

; vmla32_in_loop: d[i..i+3] += s1[i..i+3] * x, four i32 lanes per iteration.
; The splat of %x is built once in the entry block, outside the loop; the
; expected loop body nevertheless uses vmla.i32 with the scalar operand in r1
; and contains no vector duplicate of %x.
; The entry block branches unconditionally into the loop, and the loop exits
; only when %index.next compares equal to %n.
define void @vmla32_in_loop(ptr %s1, i32 %x, ptr %d, i32 %n) {
; CHECK-LABEL: vmla32_in_loop:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:  .LBB6_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vldrw.u32 q1, [r2]
; CHECK-NEXT:    subs r3, #4
; CHECK-NEXT:    vmla.i32 q1, q0, r1
; CHECK-NEXT:    vstrb.8 q1, [r2], #16
; CHECK-NEXT:    bne .LBB6_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %x, i32 0
  %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i32, ptr %s1, i32 %index
  %wide.load = load <4 x i32>, ptr %0, align 4
  ; product of the %s1 vector and the loop-invariant splat
  %1 = mul nsw <4 x i32> %wide.load, %broadcast.splat9
  %2 = getelementptr inbounds i32, ptr %d, i32 %index
  %wide.load10 = load <4 x i32>, ptr %2, align 4
  ; accumulate into the %d vector and write it back to the same address
  %3 = add nsw <4 x i32> %wide.load10, %1
  store <4 x i32> %3, ptr %2, align 4
  %index.next = add i32 %index, 4
  %4 = icmp eq i32 %index.next, %n
  br i1 %4, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:
  ret void
}

; vmla16_in_loop: d[i..i+7] = s1[i..i+7] * x + d[i..i+7], eight i16 lanes per
; iteration.
; The splat of %x is built once in the entry block, outside the loop; the
; expected loop body uses vmla.i16 with the scalar operand in r1. Here the
; product is the first add operand and neither the mul nor the add carries nsw.
; The entry block branches unconditionally into the loop, and the loop exits
; only when %index.next compares equal to %n.
define void @vmla16_in_loop(ptr %s1, i16 %x, ptr %d, i32 %n) {
; CHECK-LABEL: vmla16_in_loop:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:  .LBB7_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
; CHECK-NEXT:    vldrh.u16 q1, [r2]
; CHECK-NEXT:    subs r3, #8
; CHECK-NEXT:    vmla.i16 q1, q0, r1
; CHECK-NEXT:    vstrb.8 q1, [r2], #16
; CHECK-NEXT:    bne .LBB7_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %broadcast.splatinsert11 = insertelement <8 x i16> undef, i16 %x, i32 0
  %broadcast.splat12 = shufflevector <8 x i16> %broadcast.splatinsert11, <8 x i16> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i16, ptr %s1, i32 %index
  %wide.load = load <8 x i16>, ptr %0, align 2
  ; product of the %s1 vector and the loop-invariant splat
  %1 = mul <8 x i16> %wide.load, %broadcast.splat12
  %2 = getelementptr inbounds i16, ptr %d, i32 %index
  %wide.load13 = load <8 x i16>, ptr %2, align 2
  ; accumulate with the %d vector and write it back to the same address
  %3 = add <8 x i16> %1, %wide.load13
  store <8 x i16> %3, ptr %2, align 2
  %index.next = add i32 %index, 8
  %4 = icmp eq i32 %index.next, %n
  br i1 %4, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:
  ret void
}

; vmla8_in_loop: d[i..i+15] = s1[i..i+15] * x + d[i..i+15], sixteen i8 lanes
; per iteration.
; The splat of %x is built once in the entry block, outside the loop; the
; expected loop body uses vmla.i8 with the scalar operand in r1.
; The vector loads and the store are declared align 2, and the expected loads
; are vldrh.u16 (presumably a consequence of that alignment - not verified
; here).
; The entry block branches unconditionally into the loop, and the loop exits
; only when %index.next compares equal to %n.
define void @vmla8_in_loop(ptr %s1, i8 %x, ptr %d, i32 %n) {
; CHECK-LABEL: vmla8_in_loop:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:  .LBB8_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
; CHECK-NEXT:    vldrh.u16 q1, [r2]
; CHECK-NEXT:    subs r3, #16
; CHECK-NEXT:    vmla.i8 q1, q0, r1
; CHECK-NEXT:    vstrb.8 q1, [r2], #16
; CHECK-NEXT:    bne .LBB8_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %broadcast.splatinsert11 = insertelement <16 x i8> undef, i8 %x, i32 0
  %broadcast.splat12 = shufflevector <16 x i8> %broadcast.splatinsert11, <16 x i8> undef, <16 x i32> zeroinitializer
  br label %vector.body

vector.body:
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i8, ptr %s1, i32 %index
  %wide.load = load <16 x i8>, ptr %0, align 2
  ; product of the %s1 vector and the loop-invariant splat
  %1 = mul <16 x i8> %wide.load, %broadcast.splat12
  %2 = getelementptr inbounds i8, ptr %d, i32 %index
  %wide.load13 = load <16 x i8>, ptr %2, align 2
  ; accumulate with the %d vector and write it back to the same address
  %3 = add <16 x i8> %1, %wide.load13
  store <16 x i8> %3, ptr %2, align 2
  %index.next = add i32 %index, 16
  %4 = icmp eq i32 %index.next, %n
  br i1 %4, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:
  ret void
}


; vmlasu32: splat(X) + (A * B) on <4 x i32>.
; Here the broadcast scalar is the addend rather than a multiplicand: %A and
; %B are multiplied lane-wise and the splat of %X is added to the product.
; The expected output is a single vmlas.i32 with the scalar taken from r0.
define arm_aapcs_vfpcc <4 x i32> @vmlasu32(<4 x i32> %A, <4 x i32> %B, i32 %X) nounwind {
; CHECK-LABEL: vmlasu32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlas.i32 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = insertelement <4 x i32> undef, i32 %X, i32 0
  %1 = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer
  %2 = mul nsw <4 x i32> %A, %B
  %3 = add nsw <4 x i32> %1, %2
  ret <4 x i32> %3
}

; vmlasu32b: (A * B) + splat(X) on <4 x i32>.
; Same computation as the non-"b" i32 vmlas variant with the add operands
; commuted (product first, splat second). The expected output is still a
; single vmlas.i32 with the scalar in r0.
define arm_aapcs_vfpcc <4 x i32> @vmlasu32b(<4 x i32> %A, <4 x i32> %B, i32 %X) nounwind {
; CHECK-LABEL: vmlasu32b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlas.i32 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = insertelement <4 x i32> undef, i32 %X, i32 0
  %1 = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer
  %2 = mul nsw <4 x i32> %A, %B
  %3 = add nsw <4 x i32> %2, %1
  ret <4 x i32> %3
}

; vmlasu16: splat(X) + (A * B) on <8 x i16>.
; %A and %B are multiplied lane-wise and the splat of %X (insertelement into
; lane 0 plus an all-zero-mask shufflevector) is added to the product.
; The expected output is a single vmlas.i16 with the scalar taken from r0.
define arm_aapcs_vfpcc <8 x i16> @vmlasu16(<8 x i16> %A, <8 x i16> %B, i16 %X) nounwind {
; CHECK-LABEL: vmlasu16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlas.i16 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = insertelement <8 x i16> undef, i16 %X, i32 0
  %1 = shufflevector <8 x i16> %0, <8 x i16> undef, <8 x i32> zeroinitializer
  %2 = mul nsw <8 x i16> %A, %B
  %3 = add nsw <8 x i16> %1, %2
  ret <8 x i16> %3
}

; vmlasu16b: (A * B) + splat(X) on <8 x i16>.
; Commuted-add form of the i16 vmlas test (product first, splat second).
; The expected output is still a single vmlas.i16 with the scalar in r0.
define arm_aapcs_vfpcc <8 x i16> @vmlasu16b(<8 x i16> %A, <8 x i16> %B, i16 %X) nounwind {
; CHECK-LABEL: vmlasu16b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlas.i16 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = insertelement <8 x i16> undef, i16 %X, i32 0
  %1 = shufflevector <8 x i16> %0, <8 x i16> undef, <8 x i32> zeroinitializer
  %2 = mul nsw <8 x i16> %A, %B
  %3 = add nsw <8 x i16> %2, %1
  ret <8 x i16> %3
}

; vmlasu8: splat(X) + (A * B) on <16 x i8>.
; %A and %B are multiplied lane-wise and the splat of %X (insertelement into
; lane 0 plus an all-zero-mask shufflevector) is added to the product.
; The expected output is a single vmlas.i8 with the scalar taken from r0.
define arm_aapcs_vfpcc <16 x i8> @vmlasu8(<16 x i8> %A, <16 x i8> %B, i8 %X) nounwind {
; CHECK-LABEL: vmlasu8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlas.i8 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = insertelement <16 x i8> undef, i8 %X, i32 0
  %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
  %2 = mul nsw <16 x i8> %A, %B
  %3 = add nsw <16 x i8> %1, %2
  ret <16 x i8> %3
}

; vmlasu8b: (A * B) + splat(X) on <16 x i8>.
; Commuted-add form of the i8 vmlas test (product first, splat second).
; The expected output is still a single vmlas.i8 with the scalar in r0.
define arm_aapcs_vfpcc <16 x i8> @vmlasu8b(<16 x i8> %A, <16 x i8> %B, i8 %X) nounwind {
; CHECK-LABEL: vmlasu8b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlas.i8 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = insertelement <16 x i8> undef, i8 %X, i32 0
  %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
  %2 = mul nsw <16 x i8> %A, %B
  %3 = add nsw <16 x i8> %2, %1
  ret <16 x i8> %3
}

; vmlas32_in_loop: d[i..i+3] = x + s1[i..i+3] * d[i..i+3], four i32 lanes per
; iteration.
; The splat of %x is built once in the entry block, outside the loop, and is
; used as the addend; the expected loop body uses vmlas.i32 with the scalar
; operand in r1 and contains no vector duplicate of %x.
; The entry block branches unconditionally into the loop, and the loop exits
; only when %index.next compares equal to %n.
define void @vmlas32_in_loop(ptr %s1, i32 %x, ptr %d, i32 %n) {
; CHECK-LABEL: vmlas32_in_loop:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:  .LBB15_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    subs r3, #4
; CHECK-NEXT:    vmlas.i32 q1, q0, r1
; CHECK-NEXT:    vstrb.8 q1, [r2], #16
; CHECK-NEXT:    bne .LBB15_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %x, i32 0
  %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i32, ptr %s1, i32 %index
  %wide.load = load <4 x i32>, ptr %0, align 4
  %1 = getelementptr inbounds i32, ptr %d, i32 %index
  %wide.load10 = load <4 x i32>, ptr %1, align 4
  ; vector * vector product; the splat only enters as the addend below
  %2 = mul nsw <4 x i32> %wide.load, %wide.load10
  %3 = add nsw <4 x i32> %broadcast.splat9, %2
  store <4 x i32> %3, ptr %1, align 4
  %index.next = add i32 %index, 4
  %4 = icmp eq i32 %index.next, %n
  br i1 %4, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:
  ret void
}

; vmlas16_in_loop: d[i..i+7] = s1[i..i+7] * d[i..i+7] + x, eight i16 lanes per
; iteration.
; The splat of %x is built once in the entry block, outside the loop, and is
; used as the addend (second add operand here); the expected loop body uses
; vmlas.i16 with the scalar operand in r1.
; The entry block branches unconditionally into the loop, and the loop exits
; only when %index.next compares equal to %n.
define void @vmlas16_in_loop(ptr %s1, i16 %x, ptr %d, i32 %n) {
; CHECK-LABEL: vmlas16_in_loop:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:  .LBB16_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r2]
; CHECK-NEXT:    vldrh.u16 q1, [r0], #16
; CHECK-NEXT:    subs r3, #8
; CHECK-NEXT:    vmlas.i16 q1, q0, r1
; CHECK-NEXT:    vstrb.8 q1, [r2], #16
; CHECK-NEXT:    bne .LBB16_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %broadcast.splatinsert11 = insertelement <8 x i16> undef, i16 %x, i32 0
  %broadcast.splat12 = shufflevector <8 x i16> %broadcast.splatinsert11, <8 x i16> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i16, ptr %s1, i32 %index
  %wide.load = load <8 x i16>, ptr %0, align 2
  %1 = getelementptr inbounds i16, ptr %d, i32 %index
  %wide.load13 = load <8 x i16>, ptr %1, align 2
  ; vector * vector product; the splat only enters as the addend below
  %2 = mul <8 x i16> %wide.load, %wide.load13
  %3 = add <8 x i16> %2, %broadcast.splat12
  store <8 x i16> %3, ptr %1, align 2
  %index.next = add i32 %index, 8
  %4 = icmp eq i32 %index.next, %n
  br i1 %4, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:
  ret void
}

; vmlas8_in_loop: d[i..i+15] = s1[i..i+15] * d[i..i+15] + x, sixteen i8 lanes
; per iteration.
; The splat of %x is built once in the entry block, outside the loop, and is
; used as the addend; the expected loop body uses vmlas.i8 with the scalar
; operand in r1.
; The vector loads and the store are declared align 2, and the expected loads
; are vldrh.u16 (presumably a consequence of that alignment - not verified
; here).
; The entry block branches unconditionally into the loop, and the loop exits
; only when %index.next compares equal to %n.
define void @vmlas8_in_loop(ptr %s1, i8 %x, ptr %d, i32 %n) {
; CHECK-LABEL: vmlas8_in_loop:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:  .LBB17_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r2]
; CHECK-NEXT:    vldrh.u16 q1, [r0], #16
; CHECK-NEXT:    subs r3, #16
; CHECK-NEXT:    vmlas.i8 q1, q0, r1
; CHECK-NEXT:    vstrb.8 q1, [r2], #16
; CHECK-NEXT:    bne .LBB17_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %broadcast.splatinsert11 = insertelement <16 x i8> undef, i8 %x, i32 0
  %broadcast.splat12 = shufflevector <16 x i8> %broadcast.splatinsert11, <16 x i8> undef, <16 x i32> zeroinitializer
  br label %vector.body

vector.body:
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i8, ptr %s1, i32 %index
  %wide.load = load <16 x i8>, ptr %0, align 2
  %1 = getelementptr inbounds i8, ptr %d, i32 %index
  %wide.load13 = load <16 x i8>, ptr %1, align 2
  ; vector * vector product; the splat only enters as the addend below
  %2 = mul <16 x i8> %wide.load, %wide.load13
  %3 = add <16 x i8> %2, %broadcast.splat12
  store <16 x i8> %3, ptr %1, align 2
  %index.next = add i32 %index, 16
  %4 = icmp eq i32 %index.next, %n
  br i1 %4, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:
  ret void
}
