xref: /llvm-project/llvm/test/CodeGen/Thumb2/mve-qrintrsplat.ll (revision b31fffbc7f1e0491bf599e82b7195e320d26e140)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s
3
; vaddq: loop that processes four i32 lanes per iteration under a mask produced
; by @llvm.arm.mve.vctp32 from the remaining count %n. Each iteration does a
; masked load from %x, a plain IR `add` with a constant splat of 10, and a
; masked store to %y. The expected assembly is a dlstp.32/letp loop in which the
; constant lives in r3 and is consumed by the vector-by-scalar form
; `vadd.i32 q0, q0, r3`.
4define void @vaddq(ptr %x, ptr %y, i32 %n) {
5; CHECK-LABEL: vaddq:
6; CHECK:       @ %bb.0: @ %entry
7; CHECK-NEXT:    .save {r7, lr}
8; CHECK-NEXT:    push {r7, lr}
9; CHECK-NEXT:    cmp r2, #1
10; CHECK-NEXT:    it lt
11; CHECK-NEXT:    poplt {r7, pc}
12; CHECK-NEXT:  .LBB0_1: @ %for.body.preheader
13; CHECK-NEXT:    movs r3, #10
14; CHECK-NEXT:    dlstp.32 lr, r2
15; CHECK-NEXT:  .LBB0_2: @ %for.body
16; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
17; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
18; CHECK-NEXT:    vadd.i32 q0, q0, r3
19; CHECK-NEXT:    vstrw.32 q0, [r1], #16
20; CHECK-NEXT:    letp lr, .LBB0_2
21; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
22; CHECK-NEXT:    pop {r7, pc}
23entry:
  ; Skip the loop entirely when %n <= 0.
24  %cmp11 = icmp sgt i32 %n, 0
25  br i1 %cmp11, label %for.body, label %for.cond.cleanup
26
27for.cond.cleanup:                                 ; preds = %for.body, %entry
28  ret void
29
30for.body:                                         ; preds = %entry, %for.body
31  %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ]
32  %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ]
33  %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
  ; Lane mask for this iteration, derived from the remaining count.
34  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012)
  ; ptr-to-ptr bitcast: source and destination types are the same.
35  %1 = bitcast ptr %x.addr.014 to ptr
36  %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
37  %add.ptr = getelementptr inbounds i32, ptr %x.addr.014, i32 4
  ; Operation under test: unpredicated IR add with a constant splat operand.
38  %3 = add <4 x i32> %2, <i32 10, i32 10, i32 10, i32 10>
39  %4 = bitcast ptr %y.addr.013 to ptr
40  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %3, ptr %4, i32 4, <4 x i1> %0)
41  %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.013, i32 4
  ; Four lanes consumed per iteration; continue while more than four remained.
42  %sub = add nsw i32 %i.012, -4
43  %cmp = icmp sgt i32 %i.012, 4
44  br i1 %cmp, label %for.body, label %for.cond.cleanup
45}
46
; vadd: in-place loop over %s1 that adds a splat of the scalar argument %c0 to
; four i32 lanes per iteration using @llvm.arm.mve.add.predicated, with the lane
; mask taken from @llvm.arm.mve.vctp32 of the remaining count %N. The splat is
; built once in %while.body.lr.ph, outside the loop. The expected assembly is a
; dlstp.32/letp loop that uses the incoming scalar register directly in
; `vadd.i32 q0, q0, r1`, with no separate vector splat instruction.
47define void @vadd(ptr %s1, i32 %c0, i32 %N) {
48; CHECK-LABEL: vadd:
49; CHECK:       @ %bb.0: @ %entry
50; CHECK-NEXT:    .save {r7, lr}
51; CHECK-NEXT:    push {r7, lr}
52; CHECK-NEXT:    cmp r2, #1
53; CHECK-NEXT:    it lt
54; CHECK-NEXT:    poplt {r7, pc}
55; CHECK-NEXT:  .LBB1_1: @ %while.body.lr.ph
56; CHECK-NEXT:    dlstp.32 lr, r2
57; CHECK-NEXT:  .LBB1_2: @ %while.body
58; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
59; CHECK-NEXT:    vldrw.u32 q0, [r0]
60; CHECK-NEXT:    vadd.i32 q0, q0, r1
61; CHECK-NEXT:    vstrw.32 q0, [r0], #16
62; CHECK-NEXT:    letp lr, .LBB1_2
63; CHECK-NEXT:  @ %bb.3: @ %while.end
64; CHECK-NEXT:    pop {r7, pc}
65entry:
  ; Skip the loop entirely when %N <= 0.
66  %cmp11 = icmp sgt i32 %N, 0
67  br i1 %cmp11, label %while.body.lr.ph, label %while.end
68
69while.body.lr.ph:                                 ; preds = %entry
  ; Splat of %c0 across all four lanes. The don't-care placeholder operands use
  ; poison rather than undef; only lane 0 of %.splatinsert is ever read.
70  %.splatinsert = insertelement <4 x i32> poison, i32 %c0, i32 0
71  %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer
72  br label %while.body
73
74while.body:                                       ; preds = %while.body.lr.ph, %while.body
75  %s1.addr.013 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
76  %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
  ; Lane mask for this iteration, derived from the remaining count.
77  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
  ; ptr-to-ptr bitcast: source and destination types are the same.
78  %1 = bitcast ptr %s1.addr.013 to ptr
79  %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
  ; Operation under test: predicated add; the loaded vector %2 is also passed as
  ; the last (fallback) operand.
80  %3 = tail call <4 x i32> @llvm.arm.mve.add.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %.splat, <4 x i1> %0, <4 x i32> %2)
81  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %3, ptr %1, i32 4, <4 x i1> %0)
82  %add.ptr = getelementptr inbounds i32, ptr %s1.addr.013, i32 4
  ; Four lanes consumed per iteration; continue while more than four remained.
83  %sub = add nsw i32 %N.addr.012, -4
84  %cmp = icmp sgt i32 %N.addr.012, 4
85  br i1 %cmp, label %while.body, label %while.end
86
87while.end:                                        ; preds = %while.body, %entry
88  ret void
89}
90
; vsubq: same loop shape as a masked load from %x / masked store to %y with a
; @llvm.arm.mve.vctp32 lane mask, but the operation is a plain IR `sub` of a
; constant splat of 10. The expected assembly keeps 10 in r3 and emits
; `vsub.i32 q0, q0, r3` inside a dlstp.32/letp loop.
91define void @vsubq(ptr %x, ptr %y, i32 %n) {
92; CHECK-LABEL: vsubq:
93; CHECK:       @ %bb.0: @ %entry
94; CHECK-NEXT:    .save {r7, lr}
95; CHECK-NEXT:    push {r7, lr}
96; CHECK-NEXT:    cmp r2, #1
97; CHECK-NEXT:    it lt
98; CHECK-NEXT:    poplt {r7, pc}
99; CHECK-NEXT:  .LBB2_1: @ %for.body.preheader
100; CHECK-NEXT:    movs r3, #10
101; CHECK-NEXT:    dlstp.32 lr, r2
102; CHECK-NEXT:  .LBB2_2: @ %for.body
103; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
104; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
105; CHECK-NEXT:    vsub.i32 q0, q0, r3
106; CHECK-NEXT:    vstrw.32 q0, [r1], #16
107; CHECK-NEXT:    letp lr, .LBB2_2
108; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
109; CHECK-NEXT:    pop {r7, pc}
110entry:
  ; Skip the loop entirely when %n <= 0.
111  %cmp11 = icmp sgt i32 %n, 0
112  br i1 %cmp11, label %for.body, label %for.cond.cleanup
113
114for.cond.cleanup:                                 ; preds = %for.body, %entry
115  ret void
116
117for.body:                                         ; preds = %entry, %for.body
118  %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ]
119  %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ]
120  %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
  ; Lane mask for this iteration, derived from the remaining count.
121  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012)
122  %1 = bitcast ptr %x.addr.014 to ptr
123  %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
124  %add.ptr = getelementptr inbounds i32, ptr %x.addr.014, i32 4
  ; Operation under test: unpredicated IR sub with a constant splat operand.
125  %3 = sub <4 x i32> %2, <i32 10, i32 10, i32 10, i32 10>
126  %4 = bitcast ptr %y.addr.013 to ptr
127  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %3, ptr %4, i32 4, <4 x i1> %0)
128  %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.013, i32 4
  ; Four lanes consumed per iteration; continue while more than four remained.
129  %sub = add nsw i32 %i.012, -4
130  %cmp = icmp sgt i32 %i.012, 4
131  br i1 %cmp, label %for.body, label %for.cond.cleanup
132}
133
; vsub: in-place loop over %s1 that subtracts a constant splat of 10 from four
; i32 lanes per iteration via @llvm.arm.mve.sub.predicated, masked by
; @llvm.arm.mve.vctp32 of the remaining count %N. The expected assembly keeps 10
; in r2 and emits `vsub.i32 q0, q0, r2` inside a dlstp.32/letp loop.
134define void @vsub(ptr %s1, i32 %N) {
135; CHECK-LABEL: vsub:
136; CHECK:       @ %bb.0: @ %entry
137; CHECK-NEXT:    .save {r7, lr}
138; CHECK-NEXT:    push {r7, lr}
139; CHECK-NEXT:    cmp r1, #1
140; CHECK-NEXT:    it lt
141; CHECK-NEXT:    poplt {r7, pc}
142; CHECK-NEXT:  .LBB3_1: @ %while.body.preheader
143; CHECK-NEXT:    movs r2, #10
144; CHECK-NEXT:    dlstp.32 lr, r1
145; CHECK-NEXT:  .LBB3_2: @ %while.body
146; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
147; CHECK-NEXT:    vldrw.u32 q0, [r0]
148; CHECK-NEXT:    vsub.i32 q0, q0, r2
149; CHECK-NEXT:    vstrw.32 q0, [r0], #16
150; CHECK-NEXT:    letp lr, .LBB3_2
151; CHECK-NEXT:  @ %bb.3: @ %while.end
152; CHECK-NEXT:    pop {r7, pc}
153entry:
  ; Skip the loop entirely when %N <= 0.
154  %cmp11 = icmp sgt i32 %N, 0
155  br i1 %cmp11, label %while.body.lr.ph, label %while.end
156
157while.body.lr.ph:                                 ; preds = %entry
158  br label %while.body
159
160while.body:                                       ; preds = %while.body.lr.ph, %while.body
161  %s1.addr.013 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
162  %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
  ; Lane mask for this iteration, derived from the remaining count.
163  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
164  %1 = bitcast ptr %s1.addr.013 to ptr
165  %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
  ; Operation under test: predicated sub with a constant splat operand; the
  ; loaded vector %2 is also passed as the last (fallback) operand.
166  %3 = tail call <4 x i32> @llvm.arm.mve.sub.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> <i32 10, i32 10, i32 10, i32 10>, <4 x i1> %0, <4 x i32> %2)
167  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %3, ptr %1, i32 4, <4 x i1> %0)
168  %add.ptr = getelementptr inbounds i32, ptr %s1.addr.013, i32 4
  ; Four lanes consumed per iteration; continue while more than four remained.
169  %sub = add nsw i32 %N.addr.012, -4
170  %cmp = icmp sgt i32 %N.addr.012, 4
171  br i1 %cmp, label %while.body, label %while.end
172
173while.end:                                        ; preds = %while.body, %entry
174  ret void
175}
176
; vmulq: loop with a @llvm.arm.mve.vctp32 lane mask that does a masked load from
; %x, a plain IR `mul` by a constant splat of 10, and a masked store to %y. The
; expected assembly keeps 10 in r3 and emits `vmul.i32 q0, q0, r3` inside a
; dlstp.32/letp loop.
177define void @vmulq(ptr %x, ptr %y, i32 %n) {
178; CHECK-LABEL: vmulq:
179; CHECK:       @ %bb.0: @ %entry
180; CHECK-NEXT:    .save {r7, lr}
181; CHECK-NEXT:    push {r7, lr}
182; CHECK-NEXT:    cmp r2, #1
183; CHECK-NEXT:    it lt
184; CHECK-NEXT:    poplt {r7, pc}
185; CHECK-NEXT:  .LBB4_1: @ %for.body.preheader
186; CHECK-NEXT:    movs r3, #10
187; CHECK-NEXT:    dlstp.32 lr, r2
188; CHECK-NEXT:  .LBB4_2: @ %for.body
189; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
190; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
191; CHECK-NEXT:    vmul.i32 q0, q0, r3
192; CHECK-NEXT:    vstrw.32 q0, [r1], #16
193; CHECK-NEXT:    letp lr, .LBB4_2
194; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
195; CHECK-NEXT:    pop {r7, pc}
196entry:
  ; Skip the loop entirely when %n <= 0.
197  %cmp11 = icmp sgt i32 %n, 0
198  br i1 %cmp11, label %for.body, label %for.cond.cleanup
199
200for.cond.cleanup:                                 ; preds = %for.body, %entry
201  ret void
202
203for.body:                                         ; preds = %entry, %for.body
204  %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ]
205  %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ]
206  %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
  ; Lane mask for this iteration, derived from the remaining count.
207  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012)
208  %1 = bitcast ptr %x.addr.014 to ptr
209  %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
210  %add.ptr = getelementptr inbounds i32, ptr %x.addr.014, i32 4
  ; Operation under test: unpredicated IR mul with a constant splat operand.
211  %3 = mul <4 x i32> %2, <i32 10, i32 10, i32 10, i32 10>
212  %4 = bitcast ptr %y.addr.013 to ptr
213  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %3, ptr %4, i32 4, <4 x i1> %0)
214  %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.013, i32 4
  ; Four lanes consumed per iteration; continue while more than four remained.
215  %sub = add nsw i32 %i.012, -4
216  %cmp = icmp sgt i32 %i.012, 4
217  br i1 %cmp, label %for.body, label %for.cond.cleanup
218}
219
; vmul: in-place loop over %s1 that multiplies four i32 lanes per iteration by a
; constant splat of 10 via @llvm.arm.mve.mul.predicated, masked by
; @llvm.arm.mve.vctp32 of the remaining count %N. The expected assembly keeps 10
; in r2 and emits `vmul.i32 q0, q0, r2` inside a dlstp.32/letp loop.
220define void @vmul(ptr %s1, i32 %N) {
221; CHECK-LABEL: vmul:
222; CHECK:       @ %bb.0: @ %entry
223; CHECK-NEXT:    .save {r7, lr}
224; CHECK-NEXT:    push {r7, lr}
225; CHECK-NEXT:    cmp r1, #1
226; CHECK-NEXT:    it lt
227; CHECK-NEXT:    poplt {r7, pc}
228; CHECK-NEXT:  .LBB5_1: @ %while.body.preheader
229; CHECK-NEXT:    movs r2, #10
230; CHECK-NEXT:    dlstp.32 lr, r1
231; CHECK-NEXT:  .LBB5_2: @ %while.body
232; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
233; CHECK-NEXT:    vldrw.u32 q0, [r0]
234; CHECK-NEXT:    vmul.i32 q0, q0, r2
235; CHECK-NEXT:    vstrw.32 q0, [r0], #16
236; CHECK-NEXT:    letp lr, .LBB5_2
237; CHECK-NEXT:  @ %bb.3: @ %while.end
238; CHECK-NEXT:    pop {r7, pc}
239entry:
  ; Skip the loop entirely when %N <= 0.
240  %cmp11 = icmp sgt i32 %N, 0
241  br i1 %cmp11, label %while.body.lr.ph, label %while.end
242
243while.body.lr.ph:                                 ; preds = %entry
244  br label %while.body
245
246while.body:                                       ; preds = %while.body.lr.ph, %while.body
247  %s1.addr.013 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
248  %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
  ; Lane mask for this iteration, derived from the remaining count.
249  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
250  %1 = bitcast ptr %s1.addr.013 to ptr
251  %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
  ; Operation under test: predicated mul with a constant splat operand; the
  ; loaded vector %2 is also passed as the last (fallback) operand.
252  %3 = tail call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> <i32 10, i32 10, i32 10, i32 10>, <4 x i1> %0, <4 x i32> %2)
253  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %3, ptr %1, i32 4, <4 x i1> %0)
254  %add.ptr = getelementptr inbounds i32, ptr %s1.addr.013, i32 4
  ; Four lanes consumed per iteration; continue while more than four remained.
255  %sub = add nsw i32 %N.addr.012, -4
256  %cmp = icmp sgt i32 %N.addr.012, 4
257  br i1 %cmp, label %while.body, label %while.end
258
259while.end:                                        ; preds = %while.body, %entry
260  ret void
261}
262
; vqaddq: loop with a @llvm.arm.mve.vctp32 lane mask that does a masked load
; from %x, a signed saturating add (@llvm.sadd.sat.v4i32) with a constant splat
; of 10, and a masked store to %y. The expected assembly keeps 10 in r3 and
; emits `vqadd.s32 q0, q0, r3` inside a dlstp.32/letp loop.
263define void @vqaddq(ptr %x, ptr %y, i32 %n) {
264; CHECK-LABEL: vqaddq:
265; CHECK:       @ %bb.0: @ %entry
266; CHECK-NEXT:    .save {r7, lr}
267; CHECK-NEXT:    push {r7, lr}
268; CHECK-NEXT:    cmp r2, #1
269; CHECK-NEXT:    it lt
270; CHECK-NEXT:    poplt {r7, pc}
271; CHECK-NEXT:  .LBB6_1: @ %for.body.preheader
272; CHECK-NEXT:    movs r3, #10
273; CHECK-NEXT:    dlstp.32 lr, r2
274; CHECK-NEXT:  .LBB6_2: @ %for.body
275; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
276; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
277; CHECK-NEXT:    vqadd.s32 q0, q0, r3
278; CHECK-NEXT:    vstrw.32 q0, [r1], #16
279; CHECK-NEXT:    letp lr, .LBB6_2
280; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
281; CHECK-NEXT:    pop {r7, pc}
282entry:
  ; Skip the loop entirely when %n <= 0.
283  %cmp11 = icmp sgt i32 %n, 0
284  br i1 %cmp11, label %for.body, label %for.cond.cleanup
285
286for.cond.cleanup:                                 ; preds = %for.body, %entry
287  ret void
288
289for.body:                                         ; preds = %entry, %for.body
290  %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ]
291  %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ]
292  %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
  ; Lane mask for this iteration, derived from the remaining count.
293  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012)
294  %1 = bitcast ptr %x.addr.014 to ptr
295  %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
296  %add.ptr = getelementptr inbounds i32, ptr %x.addr.014, i32 4
  ; Operation under test: generic signed saturating add with a constant splat.
297  %3 = tail call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %2, <4 x i32> <i32 10, i32 10, i32 10, i32 10>)
298  %4 = bitcast ptr %y.addr.013 to ptr
299  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %3, ptr %4, i32 4, <4 x i1> %0)
300  %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.013, i32 4
  ; Four lanes consumed per iteration; continue while more than four remained.
301  %sub = add nsw i32 %i.012, -4
302  %cmp = icmp sgt i32 %i.012, 4
303  br i1 %cmp, label %for.body, label %for.cond.cleanup
304}
305
; vqaddqu: unsigned counterpart of the saturating-add loop. It does a masked
; load from %x, @llvm.uadd.sat.v4i32 with a constant splat of 10, and a masked
; store to %y, all under a @llvm.arm.mve.vctp32 lane mask. The expected assembly
; keeps 10 in r3 and emits `vqadd.u32 q0, q0, r3` inside a dlstp.32/letp loop.
306define void @vqaddqu(ptr %x, ptr %y, i32 %n) {
307; CHECK-LABEL: vqaddqu:
308; CHECK:       @ %bb.0: @ %entry
309; CHECK-NEXT:    .save {r7, lr}
310; CHECK-NEXT:    push {r7, lr}
311; CHECK-NEXT:    cmp r2, #1
312; CHECK-NEXT:    it lt
313; CHECK-NEXT:    poplt {r7, pc}
314; CHECK-NEXT:  .LBB7_1: @ %for.body.preheader
315; CHECK-NEXT:    movs r3, #10
316; CHECK-NEXT:    dlstp.32 lr, r2
317; CHECK-NEXT:  .LBB7_2: @ %for.body
318; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
319; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
320; CHECK-NEXT:    vqadd.u32 q0, q0, r3
321; CHECK-NEXT:    vstrw.32 q0, [r1], #16
322; CHECK-NEXT:    letp lr, .LBB7_2
323; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
324; CHECK-NEXT:    pop {r7, pc}
325entry:
  ; Skip the loop entirely when %n <= 0.
326  %cmp11 = icmp sgt i32 %n, 0
327  br i1 %cmp11, label %for.body, label %for.cond.cleanup
328
329for.cond.cleanup:                                 ; preds = %for.body, %entry
330  ret void
331
332for.body:                                         ; preds = %entry, %for.body
333  %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ]
334  %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ]
335  %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
  ; Lane mask for this iteration, derived from the remaining count.
336  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012)
337  %1 = bitcast ptr %x.addr.014 to ptr
338  %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
339  %add.ptr = getelementptr inbounds i32, ptr %x.addr.014, i32 4
  ; Operation under test: generic unsigned saturating add with a constant splat.
340  %3 = tail call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %2, <4 x i32> <i32 10, i32 10, i32 10, i32 10>)
341  %4 = bitcast ptr %y.addr.013 to ptr
342  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %3, ptr %4, i32 4, <4 x i1> %0)
343  %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.013, i32 4
  ; Four lanes consumed per iteration; continue while more than four remained.
344  %sub = add nsw i32 %i.012, -4
345  %cmp = icmp sgt i32 %i.012, 4
346  br i1 %cmp, label %for.body, label %for.cond.cleanup
347}
348
; vqadd: in-place loop over %s1 that applies @llvm.arm.mve.qadd.predicated with
; a constant splat of 10 to four i32 lanes per iteration, masked by
; @llvm.arm.mve.vctp32 of the remaining count %N. The expected assembly keeps 10
; in r2 and emits `vqadd.s32 q0, q0, r2` inside a dlstp.32/letp loop.
349define void @vqadd(ptr %s1, i32 %N) {
350; CHECK-LABEL: vqadd:
351; CHECK:       @ %bb.0: @ %entry
352; CHECK-NEXT:    .save {r7, lr}
353; CHECK-NEXT:    push {r7, lr}
354; CHECK-NEXT:    cmp r1, #1
355; CHECK-NEXT:    it lt
356; CHECK-NEXT:    poplt {r7, pc}
357; CHECK-NEXT:  .LBB8_1: @ %while.body.preheader
358; CHECK-NEXT:    movs r2, #10
359; CHECK-NEXT:    dlstp.32 lr, r1
360; CHECK-NEXT:  .LBB8_2: @ %while.body
361; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
362; CHECK-NEXT:    vldrw.u32 q0, [r0]
363; CHECK-NEXT:    vqadd.s32 q0, q0, r2
364; CHECK-NEXT:    vstrw.32 q0, [r0], #16
365; CHECK-NEXT:    letp lr, .LBB8_2
366; CHECK-NEXT:  @ %bb.3: @ %while.end
367; CHECK-NEXT:    pop {r7, pc}
368entry:
  ; Skip the loop entirely when %N <= 0.
369  %cmp11 = icmp sgt i32 %N, 0
370  br i1 %cmp11, label %while.body.lr.ph, label %while.end
371
372while.body.lr.ph:                                 ; preds = %entry
373  br label %while.body
374
375while.body:                                       ; preds = %while.body.lr.ph, %while.body
376  %s1.addr.013 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
377  %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
  ; Lane mask for this iteration, derived from the remaining count.
378  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
379  %1 = bitcast ptr %s1.addr.013 to ptr
380  %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
  ; Operation under test: predicated saturating add with a constant splat.
  ; NOTE(review): the `i32 0` operand presumably selects the signed variant,
  ; matching the expected vqadd.s32 - confirm against the intrinsic definition.
381  %3 = tail call <4 x i32> @llvm.arm.mve.qadd.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> <i32 10, i32 10, i32 10, i32 10>, i32 0, <4 x i1> %0, <4 x i32> %2)
382  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %3, ptr %1, i32 4, <4 x i1> %0)
383  %add.ptr = getelementptr inbounds i32, ptr %s1.addr.013, i32 4
  ; Four lanes consumed per iteration; continue while more than four remained.
384  %sub = add nsw i32 %N.addr.012, -4
385  %cmp = icmp sgt i32 %N.addr.012, 4
386  br i1 %cmp, label %while.body, label %while.end
387
388while.end:                                        ; preds = %while.body, %entry
389  ret void
390}
391
; vqsubq: loop with a @llvm.arm.mve.vctp32 lane mask that does a masked load
; from %x, a signed saturating subtract (@llvm.ssub.sat.v4i32) of a constant
; splat of 10, and a masked store to %y. The expected assembly keeps 10 in r3
; and emits `vqsub.s32 q0, q0, r3` inside a dlstp.32/letp loop.
392define void @vqsubq(ptr %x, ptr %y, i32 %n) {
393; CHECK-LABEL: vqsubq:
394; CHECK:       @ %bb.0: @ %entry
395; CHECK-NEXT:    .save {r7, lr}
396; CHECK-NEXT:    push {r7, lr}
397; CHECK-NEXT:    cmp r2, #1
398; CHECK-NEXT:    it lt
399; CHECK-NEXT:    poplt {r7, pc}
400; CHECK-NEXT:  .LBB9_1: @ %for.body.preheader
401; CHECK-NEXT:    movs r3, #10
402; CHECK-NEXT:    dlstp.32 lr, r2
403; CHECK-NEXT:  .LBB9_2: @ %for.body
404; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
405; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
406; CHECK-NEXT:    vqsub.s32 q0, q0, r3
407; CHECK-NEXT:    vstrw.32 q0, [r1], #16
408; CHECK-NEXT:    letp lr, .LBB9_2
409; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
410; CHECK-NEXT:    pop {r7, pc}
411entry:
  ; Skip the loop entirely when %n <= 0.
412  %cmp11 = icmp sgt i32 %n, 0
413  br i1 %cmp11, label %for.body, label %for.cond.cleanup
414
415for.cond.cleanup:                                 ; preds = %for.body, %entry
416  ret void
417
418for.body:                                         ; preds = %entry, %for.body
419  %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ]
420  %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ]
421  %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
  ; Lane mask for this iteration, derived from the remaining count.
422  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012)
423  %1 = bitcast ptr %x.addr.014 to ptr
424  %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
425  %add.ptr = getelementptr inbounds i32, ptr %x.addr.014, i32 4
  ; Operation under test: generic signed saturating sub with a constant splat.
426  %3 = tail call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %2, <4 x i32> <i32 10, i32 10, i32 10, i32 10>)
427  %4 = bitcast ptr %y.addr.013 to ptr
428  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %3, ptr %4, i32 4, <4 x i1> %0)
429  %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.013, i32 4
  ; Four lanes consumed per iteration; continue while more than four remained.
430  %sub = add nsw i32 %i.012, -4
431  %cmp = icmp sgt i32 %i.012, 4
432  br i1 %cmp, label %for.body, label %for.cond.cleanup
433}
434
; vqsubqu: unsigned counterpart of the saturating-subtract loop. It does a
; masked load from %x, @llvm.usub.sat.v4i32 of a constant splat of 10, and a
; masked store to %y, all under a @llvm.arm.mve.vctp32 lane mask. The expected
; assembly keeps 10 in r3 and emits `vqsub.u32 q0, q0, r3` inside a
; dlstp.32/letp loop.
435define void @vqsubqu(ptr %x, ptr %y, i32 %n) {
436; CHECK-LABEL: vqsubqu:
437; CHECK:       @ %bb.0: @ %entry
438; CHECK-NEXT:    .save {r7, lr}
439; CHECK-NEXT:    push {r7, lr}
440; CHECK-NEXT:    cmp r2, #1
441; CHECK-NEXT:    it lt
442; CHECK-NEXT:    poplt {r7, pc}
443; CHECK-NEXT:  .LBB10_1: @ %for.body.preheader
444; CHECK-NEXT:    movs r3, #10
445; CHECK-NEXT:    dlstp.32 lr, r2
446; CHECK-NEXT:  .LBB10_2: @ %for.body
447; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
448; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
449; CHECK-NEXT:    vqsub.u32 q0, q0, r3
450; CHECK-NEXT:    vstrw.32 q0, [r1], #16
451; CHECK-NEXT:    letp lr, .LBB10_2
452; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
453; CHECK-NEXT:    pop {r7, pc}
454entry:
  ; Skip the loop entirely when %n <= 0.
455  %cmp11 = icmp sgt i32 %n, 0
456  br i1 %cmp11, label %for.body, label %for.cond.cleanup
457
458for.cond.cleanup:                                 ; preds = %for.body, %entry
459  ret void
460
461for.body:                                         ; preds = %entry, %for.body
462  %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ]
463  %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ]
464  %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
  ; Lane mask for this iteration, derived from the remaining count.
465  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012)
466  %1 = bitcast ptr %x.addr.014 to ptr
467  %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
468  %add.ptr = getelementptr inbounds i32, ptr %x.addr.014, i32 4
  ; Operation under test: generic unsigned saturating sub with a constant splat.
469  %3 = tail call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %2, <4 x i32> <i32 10, i32 10, i32 10, i32 10>)
470  %4 = bitcast ptr %y.addr.013 to ptr
471  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %3, ptr %4, i32 4, <4 x i1> %0)
472  %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.013, i32 4
  ; Four lanes consumed per iteration; continue while more than four remained.
473  %sub = add nsw i32 %i.012, -4
474  %cmp = icmp sgt i32 %i.012, 4
475  br i1 %cmp, label %for.body, label %for.cond.cleanup
476}
477
; vqsub: in-place loop over %s1 that applies @llvm.arm.mve.qsub.predicated with
; a constant splat of 10 to four i32 lanes per iteration, masked by
; @llvm.arm.mve.vctp32 of the remaining count %N. The expected assembly keeps 10
; in r2 and emits `vqsub.s32 q0, q0, r2` inside a dlstp.32/letp loop.
478define void @vqsub(ptr %s1, i32 %N) {
479; CHECK-LABEL: vqsub:
480; CHECK:       @ %bb.0: @ %entry
481; CHECK-NEXT:    .save {r7, lr}
482; CHECK-NEXT:    push {r7, lr}
483; CHECK-NEXT:    cmp r1, #1
484; CHECK-NEXT:    it lt
485; CHECK-NEXT:    poplt {r7, pc}
486; CHECK-NEXT:  .LBB11_1: @ %while.body.preheader
487; CHECK-NEXT:    movs r2, #10
488; CHECK-NEXT:    dlstp.32 lr, r1
489; CHECK-NEXT:  .LBB11_2: @ %while.body
490; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
491; CHECK-NEXT:    vldrw.u32 q0, [r0]
492; CHECK-NEXT:    vqsub.s32 q0, q0, r2
493; CHECK-NEXT:    vstrw.32 q0, [r0], #16
494; CHECK-NEXT:    letp lr, .LBB11_2
495; CHECK-NEXT:  @ %bb.3: @ %while.end
496; CHECK-NEXT:    pop {r7, pc}
497entry:
  ; Skip the loop entirely when %N <= 0.
498  %cmp11 = icmp sgt i32 %N, 0
499  br i1 %cmp11, label %while.body.lr.ph, label %while.end
500
501while.body.lr.ph:                                 ; preds = %entry
502  br label %while.body
503
504while.body:                                       ; preds = %while.body.lr.ph, %while.body
505  %s1.addr.013 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
506  %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
  ; Lane mask for this iteration, derived from the remaining count.
507  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
508  %1 = bitcast ptr %s1.addr.013 to ptr
509  %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
  ; Operation under test: predicated saturating sub with a constant splat.
  ; NOTE(review): the `i32 0` operand presumably selects the signed variant,
  ; matching the expected vqsub.s32 - confirm against the intrinsic definition.
510  %3 = tail call <4 x i32> @llvm.arm.mve.qsub.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> <i32 10, i32 10, i32 10, i32 10>, i32 0, <4 x i1> %0, <4 x i32> %2)
511  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %3, ptr %1, i32 4, <4 x i1> %0)
512  %add.ptr = getelementptr inbounds i32, ptr %s1.addr.013, i32 4
  ; Four lanes consumed per iteration; continue while more than four remained.
513  %sub = add nsw i32 %N.addr.012, -4
514  %cmp = icmp sgt i32 %N.addr.012, 4
515  br i1 %cmp, label %while.body, label %while.end
516
517while.end:                                        ; preds = %while.body, %entry
518  ret void
519}
520
; vhaddq: loop with a @llvm.arm.mve.vctp32 lane mask that does a masked load
; from %x, the unpredicated @llvm.arm.mve.vhadd.v4i32 intrinsic with a constant
; splat of 10, and a masked store to %y. The expected assembly keeps 10 in r3
; and emits `vhadd.s32 q0, q0, r3` inside a dlstp.32/letp loop.
521define void @vhaddq(ptr %x, ptr %y, i32 %n) {
522; CHECK-LABEL: vhaddq:
523; CHECK:       @ %bb.0: @ %entry
524; CHECK-NEXT:    .save {r7, lr}
525; CHECK-NEXT:    push {r7, lr}
526; CHECK-NEXT:    cmp r2, #1
527; CHECK-NEXT:    it lt
528; CHECK-NEXT:    poplt {r7, pc}
529; CHECK-NEXT:  .LBB12_1: @ %for.body.preheader
530; CHECK-NEXT:    movs r3, #10
531; CHECK-NEXT:    dlstp.32 lr, r2
532; CHECK-NEXT:  .LBB12_2: @ %for.body
533; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
534; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
535; CHECK-NEXT:    vhadd.s32 q0, q0, r3
536; CHECK-NEXT:    vstrw.32 q0, [r1], #16
537; CHECK-NEXT:    letp lr, .LBB12_2
538; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
539; CHECK-NEXT:    pop {r7, pc}
540entry:
  ; Skip the loop entirely when %n <= 0.
541  %cmp11 = icmp sgt i32 %n, 0
542  br i1 %cmp11, label %for.body, label %for.cond.cleanup
543
544for.cond.cleanup:                                 ; preds = %for.body, %entry
545  ret void
546
547for.body:                                         ; preds = %entry, %for.body
548  %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ]
549  %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ]
550  %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
  ; Lane mask for this iteration, derived from the remaining count.
551  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012)
552  %1 = bitcast ptr %x.addr.014 to ptr
553  %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
554  %add.ptr = getelementptr inbounds i32, ptr %x.addr.014, i32 4
  ; Operation under test: unpredicated MVE vhadd intrinsic with a constant splat.
  ; NOTE(review): the `i32 0` operand presumably selects the signed variant,
  ; matching the expected vhadd.s32 - confirm against the intrinsic definition.
555  %3 = tail call <4 x i32> @llvm.arm.mve.vhadd.v4i32(<4 x i32> %2, <4 x i32> <i32 10, i32 10, i32 10, i32 10>, i32 0)
556  %4 = bitcast ptr %y.addr.013 to ptr
557  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %3, ptr %4, i32 4, <4 x i1> %0)
558  %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.013, i32 4
  ; Four lanes consumed per iteration; continue while more than four remained.
559  %sub = add nsw i32 %i.012, -4
560  %cmp = icmp sgt i32 %i.012, 4
561  br i1 %cmp, label %for.body, label %for.cond.cleanup
562}
563
; vhadd: in-place loop over %s1 that applies @llvm.arm.mve.hadd.predicated with
; a constant splat of 10 to four i32 lanes per iteration, masked by
; @llvm.arm.mve.vctp32 of the remaining count %N. The expected assembly keeps 10
; in r2 and emits `vhadd.s32 q0, q0, r2` inside a dlstp.32/letp loop.
564define void @vhadd(ptr %s1, i32 %N) {
565; CHECK-LABEL: vhadd:
566; CHECK:       @ %bb.0: @ %entry
567; CHECK-NEXT:    .save {r7, lr}
568; CHECK-NEXT:    push {r7, lr}
569; CHECK-NEXT:    cmp r1, #1
570; CHECK-NEXT:    it lt
571; CHECK-NEXT:    poplt {r7, pc}
572; CHECK-NEXT:  .LBB13_1: @ %while.body.preheader
573; CHECK-NEXT:    movs r2, #10
574; CHECK-NEXT:    dlstp.32 lr, r1
575; CHECK-NEXT:  .LBB13_2: @ %while.body
576; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
577; CHECK-NEXT:    vldrw.u32 q0, [r0]
578; CHECK-NEXT:    vhadd.s32 q0, q0, r2
579; CHECK-NEXT:    vstrw.32 q0, [r0], #16
580; CHECK-NEXT:    letp lr, .LBB13_2
581; CHECK-NEXT:  @ %bb.3: @ %while.end
582; CHECK-NEXT:    pop {r7, pc}
583entry:
  ; Skip the loop entirely when %N <= 0.
584  %cmp11 = icmp sgt i32 %N, 0
585  br i1 %cmp11, label %while.body.lr.ph, label %while.end
586
587while.body.lr.ph:                                 ; preds = %entry
588  br label %while.body
589
590while.body:                                       ; preds = %while.body.lr.ph, %while.body
591  %s1.addr.013 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
592  %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
  ; Lane mask for this iteration, derived from the remaining count.
593  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
594  %1 = bitcast ptr %s1.addr.013 to ptr
595  %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
  ; Operation under test: predicated halving add with a constant splat.
  ; NOTE(review): the `i32 0` operand presumably selects the signed variant,
  ; matching the expected vhadd.s32 - confirm against the intrinsic definition.
596  %3 = tail call <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> <i32 10, i32 10, i32 10, i32 10>, i32 0, <4 x i1> %0, <4 x i32> %2)
597  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %3, ptr %1, i32 4, <4 x i1> %0)
598  %add.ptr = getelementptr inbounds i32, ptr %s1.addr.013, i32 4
  ; Four lanes consumed per iteration; continue while more than four remained.
599  %sub = add nsw i32 %N.addr.012, -4
600  %cmp = icmp sgt i32 %N.addr.012, 4
601  br i1 %cmp, label %while.body, label %while.end
602
603while.end:                                        ; preds = %while.body, %entry
604  ret void
605}
606
; vhsubq: loop with a @llvm.arm.mve.vctp32 lane mask that does a masked load
; from %x, the unpredicated @llvm.arm.mve.vhsub.v4i32 intrinsic with a constant
; splat of 10, and a masked store to %y. The expected assembly keeps 10 in r3
; and emits `vhsub.s32 q0, q0, r3` inside a dlstp.32/letp loop.
607define void @vhsubq(ptr %x, ptr %y, i32 %n) {
608; CHECK-LABEL: vhsubq:
609; CHECK:       @ %bb.0: @ %entry
610; CHECK-NEXT:    .save {r7, lr}
611; CHECK-NEXT:    push {r7, lr}
612; CHECK-NEXT:    cmp r2, #1
613; CHECK-NEXT:    it lt
614; CHECK-NEXT:    poplt {r7, pc}
615; CHECK-NEXT:  .LBB14_1: @ %for.body.preheader
616; CHECK-NEXT:    movs r3, #10
617; CHECK-NEXT:    dlstp.32 lr, r2
618; CHECK-NEXT:  .LBB14_2: @ %for.body
619; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
620; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
621; CHECK-NEXT:    vhsub.s32 q0, q0, r3
622; CHECK-NEXT:    vstrw.32 q0, [r1], #16
623; CHECK-NEXT:    letp lr, .LBB14_2
624; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
625; CHECK-NEXT:    pop {r7, pc}
626entry:
  ; Skip the loop entirely when %n <= 0.
627  %cmp11 = icmp sgt i32 %n, 0
628  br i1 %cmp11, label %for.body, label %for.cond.cleanup
629
630for.cond.cleanup:                                 ; preds = %for.body, %entry
631  ret void
632
633for.body:                                         ; preds = %entry, %for.body
634  %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ]
635  %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ]
636  %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
  ; Lane mask for this iteration, derived from the remaining count.
637  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012)
638  %1 = bitcast ptr %x.addr.014 to ptr
639  %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
640  %add.ptr = getelementptr inbounds i32, ptr %x.addr.014, i32 4
  ; Operation under test: unpredicated MVE vhsub intrinsic with a constant splat.
  ; NOTE(review): the `i32 0` operand presumably selects the signed variant,
  ; matching the expected vhsub.s32 - confirm against the intrinsic definition.
641  %3 = tail call <4 x i32> @llvm.arm.mve.vhsub.v4i32(<4 x i32> %2, <4 x i32> <i32 10, i32 10, i32 10, i32 10>, i32 0)
642  %4 = bitcast ptr %y.addr.013 to ptr
643  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %3, ptr %4, i32 4, <4 x i1> %0)
644  %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.013, i32 4
  ; Four lanes consumed per iteration; continue while more than four remained.
645  %sub = add nsw i32 %i.012, -4
646  %cmp = icmp sgt i32 %i.012, 4
647  br i1 %cmp, label %for.body, label %for.cond.cleanup
648}
649
; vhsub: in-place loop over %s1 that applies @llvm.arm.mve.hsub.predicated with
; a constant splat of 10 to four i32 lanes per iteration, masked by
; @llvm.arm.mve.vctp32 of the remaining count %N. The expected assembly keeps 10
; in r2 and emits `vhsub.s32 q0, q0, r2` inside a dlstp.32/letp loop.
650define void @vhsub(ptr %s1, i32 %N) {
651; CHECK-LABEL: vhsub:
652; CHECK:       @ %bb.0: @ %entry
653; CHECK-NEXT:    .save {r7, lr}
654; CHECK-NEXT:    push {r7, lr}
655; CHECK-NEXT:    cmp r1, #1
656; CHECK-NEXT:    it lt
657; CHECK-NEXT:    poplt {r7, pc}
658; CHECK-NEXT:  .LBB15_1: @ %while.body.preheader
659; CHECK-NEXT:    movs r2, #10
660; CHECK-NEXT:    dlstp.32 lr, r1
661; CHECK-NEXT:  .LBB15_2: @ %while.body
662; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
663; CHECK-NEXT:    vldrw.u32 q0, [r0]
664; CHECK-NEXT:    vhsub.s32 q0, q0, r2
665; CHECK-NEXT:    vstrw.32 q0, [r0], #16
666; CHECK-NEXT:    letp lr, .LBB15_2
667; CHECK-NEXT:  @ %bb.3: @ %while.end
668; CHECK-NEXT:    pop {r7, pc}
669entry:
  ; Skip the loop entirely when %N <= 0.
670  %cmp11 = icmp sgt i32 %N, 0
671  br i1 %cmp11, label %while.body.lr.ph, label %while.end
672
673while.body.lr.ph:                                 ; preds = %entry
674  br label %while.body
675
676while.body:                                       ; preds = %while.body.lr.ph, %while.body
677  %s1.addr.013 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
678  %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
  ; Lane mask for this iteration, derived from the remaining count.
679  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
680  %1 = bitcast ptr %s1.addr.013 to ptr
681  %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
  ; Operation under test: predicated halving sub with a constant splat.
  ; NOTE(review): the `i32 0` operand presumably selects the signed variant,
  ; matching the expected vhsub.s32 - confirm against the intrinsic definition.
682  %3 = tail call <4 x i32> @llvm.arm.mve.hsub.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> <i32 10, i32 10, i32 10, i32 10>, i32 0, <4 x i1> %0, <4 x i32> %2)
683  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %3, ptr %1, i32 4, <4 x i1> %0)
684  %add.ptr = getelementptr inbounds i32, ptr %s1.addr.013, i32 4
  ; Four lanes consumed per iteration; continue while more than four remained.
685  %sub = add nsw i32 %N.addr.012, -4
686  %cmp = icmp sgt i32 %N.addr.012, 4
687  br i1 %cmp, label %while.body, label %while.end
688
689while.end:                                        ; preds = %while.body, %entry
690  ret void
691}
692
; vqdmullbq: loop with a @llvm.arm.mve.vctp32 lane mask that does a masked load
; of <4 x i32> from %x, applies @llvm.arm.mve.vqdmull.v2i64.v4i32 with a
; constant splat of 10 (producing <2 x i64>), bitcasts the result back to
; <4 x i32>, and does a masked store to %y. The expected assembly keeps 10 in r3
; and emits `vqdmullb.s32 q1, q0, r3` inside a dlstp.32/letp loop; note that the
; result is written to q1, a different register from the q0 source.
693define void @vqdmullbq(ptr %x, ptr %y, i32 %n) {
694; CHECK-LABEL: vqdmullbq:
695; CHECK:       @ %bb.0: @ %entry
696; CHECK-NEXT:    .save {r7, lr}
697; CHECK-NEXT:    push {r7, lr}
698; CHECK-NEXT:    cmp r2, #1
699; CHECK-NEXT:    it lt
700; CHECK-NEXT:    poplt {r7, pc}
701; CHECK-NEXT:  .LBB16_1: @ %for.body.preheader
702; CHECK-NEXT:    movs r3, #10
703; CHECK-NEXT:    dlstp.32 lr, r2
704; CHECK-NEXT:  .LBB16_2: @ %for.body
705; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
706; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
707; CHECK-NEXT:    vqdmullb.s32 q1, q0, r3
708; CHECK-NEXT:    vstrw.32 q1, [r1], #16
709; CHECK-NEXT:    letp lr, .LBB16_2
710; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
711; CHECK-NEXT:    pop {r7, pc}
712entry:
  ; Skip the loop entirely when %n <= 0.
713  %cmp11 = icmp sgt i32 %n, 0
714  br i1 %cmp11, label %for.body, label %for.cond.cleanup
715
716for.cond.cleanup:                                 ; preds = %for.body, %entry
717  ret void
718
719for.body:                                         ; preds = %entry, %for.body
720  %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ]
721  %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ]
722  %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
  ; Lane mask for this iteration, derived from the remaining count.
723  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012)
724  %1 = bitcast ptr %x.addr.014 to ptr
725  %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
726  %add.ptr = getelementptr inbounds i32, ptr %x.addr.014, i32 4
  ; Operation under test: unpredicated MVE vqdmull intrinsic with a constant splat.
  ; NOTE(review): the `i32 0` operand presumably selects the bottom-lane form,
  ; matching the expected vqdmullb - confirm against the intrinsic definition.
727  %3 = tail call <2 x i64> @llvm.arm.mve.vqdmull.v2i64.v4i32(<4 x i32> %2, <4 x i32> <i32 10, i32 10, i32 10, i32 10>, i32 0)
  ; Reinterpret the two 64-bit results as four 32-bit lanes for the masked store.
728  %4 = bitcast <2 x i64> %3 to <4 x i32>
729  %5 = bitcast ptr %y.addr.013 to ptr
730  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %4, ptr %5, i32 4, <4 x i1> %0)
731  %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.013, i32 4
  ; Four lanes consumed per iteration; continue while more than four remained.
732  %sub = add nsw i32 %i.012, -4
733  %cmp = icmp sgt i32 %i.012, 4
734  br i1 %cmp, label %for.body, label %for.cond.cleanup
735}
736
736
737
; vqdmull: in-place loop over %s1, four lanes per iteration, masked by
; llvm.arm.mve.vctp32. Each iteration masked-loads <4 x i16>, sign-extends to
; <4 x i32>, reinterprets that as <8 x i16> and passes it to
; llvm.arm.mve.vqdmull.predicated together with a constant splat of i16 10.
; The <4 x i32> result is masked-stored back to the same address.
; Expected assembly: a vldrh.s32 load, 10 held in r2, and
; vqdmullb.s16 q0, q0, r2 inside a dlstp.32/letp loop.
738define void @vqdmull(ptr %s1, i32 %N) {
739; CHECK-LABEL: vqdmull:
740; CHECK:       @ %bb.0: @ %entry
741; CHECK-NEXT:    .save {r7, lr}
742; CHECK-NEXT:    push {r7, lr}
743; CHECK-NEXT:    cmp r1, #1
744; CHECK-NEXT:    it lt
745; CHECK-NEXT:    poplt {r7, pc}
746; CHECK-NEXT:  .LBB17_1: @ %while.body.preheader
747; CHECK-NEXT:    movs r2, #10
748; CHECK-NEXT:    dlstp.32 lr, r1
749; CHECK-NEXT:  .LBB17_2: @ %while.body
750; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
751; CHECK-NEXT:    vldrh.s32 q0, [r0]
752; CHECK-NEXT:    vqdmullb.s16 q0, q0, r2
753; CHECK-NEXT:    vstrw.32 q0, [r0], #16
754; CHECK-NEXT:    letp lr, .LBB17_2
755; CHECK-NEXT:  @ %bb.3: @ %while.end
756; CHECK-NEXT:    pop {r7, pc}
757entry:
758  %cmp11 = icmp sgt i32 %N, 0
759  br i1 %cmp11, label %while.body.lr.ph, label %while.end
760
761while.body.lr.ph:                                 ; preds = %entry
762  br label %while.body
763
764while.body:                                       ; preds = %while.body.lr.ph, %while.body
765  %s1.addr.013 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
766  %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
767  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
768  %1 = bitcast ptr %s1.addr.013 to ptr
769  %2 = tail call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %1, i32 2, <4 x i1> %0, <4 x i16> zeroinitializer)
770  %3 = sext <4 x i16> %2 to <4 x i32>
771  %4 = bitcast <4 x i32> %3 to <8 x i16>
  ; %0 is the vctp32 predicate; the trailing %3 operand is presumably the
  ; value used for inactive lanes - TODO confirm against the intrinsic definition.
772  %5 = tail call <4 x i32> @llvm.arm.mve.vqdmull.predicated.v4i32.v8i16.v4i1(<8 x i16> %4, <8 x i16> <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>, i32 0, <4 x i1> %0, <4 x i32> %3)
773  %6 = bitcast ptr %s1.addr.013 to ptr
774  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %5, ptr %6, i32 4, <4 x i1> %0)
775  %add.ptr = getelementptr inbounds i32, ptr %s1.addr.013, i32 4
776  %sub = add nsw i32 %N.addr.012, -4
777  %cmp = icmp sgt i32 %N.addr.012, 4
778  br i1 %cmp, label %while.body, label %while.end
779
780while.end:                                        ; preds = %while.body, %entry
781  ret void
782}
783
; vqdmulhq: loop over %n i32 elements, four per iteration, masked by
; llvm.arm.mve.vctp32. Each iteration masked-loads from %x, applies the
; unpredicated llvm.arm.mve.vqdmulh intrinsic against a constant splat of 10
; and masked-stores the result to %y.
; Expected assembly: 10 is placed in r3 before the loop and used as the scalar
; operand of vqdmulh.s32 inside a dlstp.32/letp loop.
784define void @vqdmulhq(ptr %x, ptr %y, i32 %n) {
785; CHECK-LABEL: vqdmulhq:
786; CHECK:       @ %bb.0: @ %entry
787; CHECK-NEXT:    .save {r7, lr}
788; CHECK-NEXT:    push {r7, lr}
789; CHECK-NEXT:    cmp r2, #1
790; CHECK-NEXT:    it lt
791; CHECK-NEXT:    poplt {r7, pc}
792; CHECK-NEXT:  .LBB18_1: @ %for.body.preheader
793; CHECK-NEXT:    movs r3, #10
794; CHECK-NEXT:    dlstp.32 lr, r2
795; CHECK-NEXT:  .LBB18_2: @ %for.body
796; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
797; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
798; CHECK-NEXT:    vqdmulh.s32 q0, q0, r3
799; CHECK-NEXT:    vstrw.32 q0, [r1], #16
800; CHECK-NEXT:    letp lr, .LBB18_2
801; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
802; CHECK-NEXT:    pop {r7, pc}
803entry:
804  %cmp11 = icmp sgt i32 %n, 0
805  br i1 %cmp11, label %for.body, label %for.cond.cleanup
806
807for.cond.cleanup:                                 ; preds = %for.body, %entry
808  ret void
809
810for.body:                                         ; preds = %entry, %for.body
811  %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ]
812  %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ]
813  %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
814  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012)
815  %1 = bitcast ptr %x.addr.014 to ptr
816  %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
817  %add.ptr = getelementptr inbounds i32, ptr %x.addr.014, i32 4
  ; The second operand is a constant splat, which the expected assembly
  ; passes as a general-purpose register instead of a vector register.
818  %3 = tail call <4 x i32> @llvm.arm.mve.vqdmulh.v4i32(<4 x i32> %2, <4 x i32> <i32 10, i32 10, i32 10, i32 10>)
819  %4 = bitcast ptr %y.addr.013 to ptr
820  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %3, ptr %4, i32 4, <4 x i1> %0)
821  %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.013, i32 4
822  %sub = add nsw i32 %i.012, -4
823  %cmp = icmp sgt i32 %i.012, 4
824  br i1 %cmp, label %for.body, label %for.cond.cleanup
825}
826
; vqdmulh: in-place loop over %s1, four i32 lanes per iteration, masked by
; llvm.arm.mve.vctp32. Each iteration masked-loads a vector, applies
; llvm.arm.mve.qdmulh.predicated against a constant splat of 10 and
; masked-stores the result back to the same address.
; Expected assembly: 10 held in r2 and vqdmulh.s32 q0, q0, r2 inside a
; dlstp.32/letp loop.
827define void @vqdmulh(ptr %s1, i32 %N) {
828; CHECK-LABEL: vqdmulh:
829; CHECK:       @ %bb.0: @ %entry
830; CHECK-NEXT:    .save {r7, lr}
831; CHECK-NEXT:    push {r7, lr}
832; CHECK-NEXT:    cmp r1, #1
833; CHECK-NEXT:    it lt
834; CHECK-NEXT:    poplt {r7, pc}
835; CHECK-NEXT:  .LBB19_1: @ %while.body.preheader
836; CHECK-NEXT:    movs r2, #10
837; CHECK-NEXT:    dlstp.32 lr, r1
838; CHECK-NEXT:  .LBB19_2: @ %while.body
839; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
840; CHECK-NEXT:    vldrw.u32 q0, [r0]
841; CHECK-NEXT:    vqdmulh.s32 q0, q0, r2
842; CHECK-NEXT:    vstrw.32 q0, [r0], #16
843; CHECK-NEXT:    letp lr, .LBB19_2
844; CHECK-NEXT:  @ %bb.3: @ %while.end
845; CHECK-NEXT:    pop {r7, pc}
846entry:
847  %cmp11 = icmp sgt i32 %N, 0
848  br i1 %cmp11, label %while.body.lr.ph, label %while.end
849
850while.body.lr.ph:                                 ; preds = %entry
851  br label %while.body
852
853while.body:                                       ; preds = %while.body.lr.ph, %while.body
854  %s1.addr.013 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
855  %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
856  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
857  %1 = bitcast ptr %s1.addr.013 to ptr
858  %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
  ; %0 is the vctp32 predicate; the trailing %2 operand is presumably the
  ; value used for inactive lanes - TODO confirm against the intrinsic definition.
859  %3 = tail call <4 x i32> @llvm.arm.mve.qdmulh.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> <i32 10, i32 10, i32 10, i32 10>, <4 x i1> %0, <4 x i32> %2)
860  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %3, ptr %1, i32 4, <4 x i1> %0)
861  %add.ptr = getelementptr inbounds i32, ptr %s1.addr.013, i32 4
862  %sub = add nsw i32 %N.addr.012, -4
863  %cmp = icmp sgt i32 %N.addr.012, 4
864  br i1 %cmp, label %while.body, label %while.end
865
866while.end:                                        ; preds = %while.body, %entry
867  ret void
868}
869
; vqrdmulhq: loop over %n i32 elements, four per iteration, masked by
; llvm.arm.mve.vctp32. Each iteration masked-loads from %x, applies the
; unpredicated llvm.arm.mve.vqrdmulh intrinsic against a constant splat of 10
; and masked-stores the result to %y.
; Expected assembly: 10 is placed in r3 before the loop and used as the scalar
; operand of vqrdmulh.s32 inside a dlstp.32/letp loop.
870define void @vqrdmulhq(ptr %x, ptr %y, i32 %n) {
871; CHECK-LABEL: vqrdmulhq:
872; CHECK:       @ %bb.0: @ %entry
873; CHECK-NEXT:    .save {r7, lr}
874; CHECK-NEXT:    push {r7, lr}
875; CHECK-NEXT:    cmp r2, #1
876; CHECK-NEXT:    it lt
877; CHECK-NEXT:    poplt {r7, pc}
878; CHECK-NEXT:  .LBB20_1: @ %for.body.preheader
879; CHECK-NEXT:    movs r3, #10
880; CHECK-NEXT:    dlstp.32 lr, r2
881; CHECK-NEXT:  .LBB20_2: @ %for.body
882; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
883; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
884; CHECK-NEXT:    vqrdmulh.s32 q0, q0, r3
885; CHECK-NEXT:    vstrw.32 q0, [r1], #16
886; CHECK-NEXT:    letp lr, .LBB20_2
887; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
888; CHECK-NEXT:    pop {r7, pc}
889entry:
890  %cmp11 = icmp sgt i32 %n, 0
891  br i1 %cmp11, label %for.body, label %for.cond.cleanup
892
893for.cond.cleanup:                                 ; preds = %for.body, %entry
894  ret void
895
896for.body:                                         ; preds = %entry, %for.body
897  %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ]
898  %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ]
899  %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
900  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012)
901  %1 = bitcast ptr %x.addr.014 to ptr
902  %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
903  %add.ptr = getelementptr inbounds i32, ptr %x.addr.014, i32 4
  ; The second operand is a constant splat, which the expected assembly
  ; passes as a general-purpose register instead of a vector register.
904  %3 = tail call <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32> %2, <4 x i32> <i32 10, i32 10, i32 10, i32 10>)
905  %4 = bitcast ptr %y.addr.013 to ptr
906  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %3, ptr %4, i32 4, <4 x i1> %0)
907  %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.013, i32 4
908  %sub = add nsw i32 %i.012, -4
909  %cmp = icmp sgt i32 %i.012, 4
910  br i1 %cmp, label %for.body, label %for.cond.cleanup
911}
912
; vqrdmulh: in-place loop over %s1, four i32 lanes per iteration, masked by
; llvm.arm.mve.vctp32. Each iteration masked-loads a vector, applies
; llvm.arm.mve.qrdmulh.predicated against a constant splat of 10 and
; masked-stores the result back to the same address.
; Expected assembly: 10 held in r2 and vqrdmulh.s32 q0, q0, r2 inside a
; dlstp.32/letp loop.
913define void @vqrdmulh(ptr %s1, i32 %N) {
914; CHECK-LABEL: vqrdmulh:
915; CHECK:       @ %bb.0: @ %entry
916; CHECK-NEXT:    .save {r7, lr}
917; CHECK-NEXT:    push {r7, lr}
918; CHECK-NEXT:    cmp r1, #1
919; CHECK-NEXT:    it lt
920; CHECK-NEXT:    poplt {r7, pc}
921; CHECK-NEXT:  .LBB21_1: @ %while.body.preheader
922; CHECK-NEXT:    movs r2, #10
923; CHECK-NEXT:    dlstp.32 lr, r1
924; CHECK-NEXT:  .LBB21_2: @ %while.body
925; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
926; CHECK-NEXT:    vldrw.u32 q0, [r0]
927; CHECK-NEXT:    vqrdmulh.s32 q0, q0, r2
928; CHECK-NEXT:    vstrw.32 q0, [r0], #16
929; CHECK-NEXT:    letp lr, .LBB21_2
930; CHECK-NEXT:  @ %bb.3: @ %while.end
931; CHECK-NEXT:    pop {r7, pc}
932entry:
933  %cmp11 = icmp sgt i32 %N, 0
934  br i1 %cmp11, label %while.body.lr.ph, label %while.end
935
936while.body.lr.ph:                                 ; preds = %entry
937  br label %while.body
938
939while.body:                                       ; preds = %while.body.lr.ph, %while.body
940  %s1.addr.013 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
941  %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
942  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
943  %1 = bitcast ptr %s1.addr.013 to ptr
944  %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
  ; %0 is the vctp32 predicate; the trailing %2 operand is presumably the
  ; value used for inactive lanes - TODO confirm against the intrinsic definition.
945  %3 = tail call <4 x i32> @llvm.arm.mve.qrdmulh.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> <i32 10, i32 10, i32 10, i32 10>, <4 x i1> %0, <4 x i32> %2)
946  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %3, ptr %1, i32 4, <4 x i1> %0)
947  %add.ptr = getelementptr inbounds i32, ptr %s1.addr.013, i32 4
948  %sub = add nsw i32 %N.addr.012, -4
949  %cmp = icmp sgt i32 %N.addr.012, 4
950  br i1 %cmp, label %while.body, label %while.end
951
952while.end:                                        ; preds = %while.body, %entry
953  ret void
954}
955
; vmlaq: loop over %n i32 elements, four per iteration, masked by
; llvm.arm.mve.vctp32. Each iteration masked-loads one vector from %x and one
; from %y, computes (y * splat(10)) + x with plain mul/add instructions and
; masked-stores the result back to %y.
; Expected assembly: the mul/add pair is emitted as a single
; vmla.i32 q1, q0, r3 with 10 held in r3, inside a dlstp.32/letp loop.
956define void @vmlaq(ptr %x, ptr %y, i32 %n) {
957; CHECK-LABEL: vmlaq:
958; CHECK:       @ %bb.0: @ %entry
959; CHECK-NEXT:    .save {r7, lr}
960; CHECK-NEXT:    push {r7, lr}
961; CHECK-NEXT:    cmp r2, #1
962; CHECK-NEXT:    it lt
963; CHECK-NEXT:    poplt {r7, pc}
964; CHECK-NEXT:  .LBB22_1: @ %for.body.preheader
965; CHECK-NEXT:    movs r3, #10
966; CHECK-NEXT:    dlstp.32 lr, r2
967; CHECK-NEXT:  .LBB22_2: @ %for.body
968; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
969; CHECK-NEXT:    vldrw.u32 q0, [r1]
970; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
971; CHECK-NEXT:    vmla.i32 q1, q0, r3
972; CHECK-NEXT:    vstrw.32 q1, [r1], #16
973; CHECK-NEXT:    letp lr, .LBB22_2
974; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
975; CHECK-NEXT:    pop {r7, pc}
976entry:
977  %cmp14 = icmp sgt i32 %n, 0
978  br i1 %cmp14, label %for.body, label %for.cond.cleanup
979
980for.cond.cleanup:                                 ; preds = %for.body, %entry
981  ret void
982
983for.body:                                         ; preds = %entry, %for.body
984  %x.addr.017 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ]
985  %y.addr.016 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ]
986  %i.015 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
987  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.015)
988  %1 = bitcast ptr %x.addr.017 to ptr
989  %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
990  %add.ptr = getelementptr inbounds i32, ptr %x.addr.017, i32 4
991  %3 = bitcast ptr %y.addr.016 to ptr
992  %4 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %3, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
  ; Multiply by the splat constant, then accumulate the %x vector: the
  ; multiplier is the splat, so it can live in a scalar register.
993  %5 = mul <4 x i32> %4, <i32 10, i32 10, i32 10, i32 10>
994  %6 = add <4 x i32> %5, %2
995  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %6, ptr %3, i32 4, <4 x i1> %0)
996  %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.016, i32 4
997  %sub = add nsw i32 %i.015, -4
998  %cmp = icmp sgt i32 %i.015, 4
999  br i1 %cmp, label %for.body, label %for.cond.cleanup
1000}
1001
; vmlaqp: same loop shape as a mul/add multiply-accumulate, but written with
; the llvm.arm.mve.vmla.n.predicated intrinsic. Each iteration masked-loads one
; vector from %x and one from %y, calls the intrinsic with the scalar i32 10
; and the vctp32 predicate, and masked-stores the result to %y.
; Expected assembly: vmla.i32 q1, q0, r3 with 10 held in r3, inside a
; dlstp.32/letp loop.
1002define void @vmlaqp(ptr %x, ptr %y, i32 %n) {
1003; CHECK-LABEL: vmlaqp:
1004; CHECK:       @ %bb.0: @ %entry
1005; CHECK-NEXT:    .save {r7, lr}
1006; CHECK-NEXT:    push {r7, lr}
1007; CHECK-NEXT:    cmp r2, #1
1008; CHECK-NEXT:    it lt
1009; CHECK-NEXT:    poplt {r7, pc}
1010; CHECK-NEXT:  .LBB23_1: @ %for.body.preheader
1011; CHECK-NEXT:    movs r3, #10
1012; CHECK-NEXT:    dlstp.32 lr, r2
1013; CHECK-NEXT:  .LBB23_2: @ %for.body
1014; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1015; CHECK-NEXT:    vldrw.u32 q0, [r1]
1016; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
1017; CHECK-NEXT:    vmla.i32 q1, q0, r3
1018; CHECK-NEXT:    vstrw.32 q1, [r1], #16
1019; CHECK-NEXT:    letp lr, .LBB23_2
1020; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
1021; CHECK-NEXT:    pop {r7, pc}
1022entry:
1023  %cmp15 = icmp sgt i32 %n, 0
1024  br i1 %cmp15, label %for.body, label %for.cond.cleanup
1025
1026for.cond.cleanup:                                 ; preds = %for.body, %entry
1027  ret void
1028
1029for.body:                                         ; preds = %entry, %for.body
1030  %x.addr.018 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ]
1031  %y.addr.017 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ]
1032  %i.016 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
1033  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.016)
1034  %1 = bitcast ptr %x.addr.018 to ptr
1035  %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
1036  %add.ptr = getelementptr inbounds i32, ptr %x.addr.018, i32 4
1037  %3 = bitcast ptr %y.addr.017 to ptr
1038  %4 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %3, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
  ; Here the scalar is already an i32 operand of the intrinsic (no splat
  ; vector in the IR); %0 is the vctp32 predicate.
1039  %5 = tail call <4 x i32> @llvm.arm.mve.vmla.n.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %4, i32 10, <4 x i1> %0)
1040  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %5, ptr %3, i32 4, <4 x i1> %0)
1041  %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.017, i32 4
1042  %sub = add nsw i32 %i.016, -4
1043  %cmp = icmp sgt i32 %i.016, 4
1044  br i1 %cmp, label %for.body, label %for.cond.cleanup
1045}
1046
; vmlasq: loop over %n i32 elements, four per iteration, masked by
; llvm.arm.mve.vctp32. Each iteration masked-loads one vector from %x and one
; from %y, computes (y * x) + splat(10) with plain mul/add instructions and
; masked-stores the result back to %y.
; Expected assembly: the mul/add pair is emitted as a single
; vmlas.i32 q1, q0, r3 with 10 held in r3, inside a dlstp.32/letp loop.
1047define void @vmlasq(ptr %x, ptr %y, i32 %n) {
1048; CHECK-LABEL: vmlasq:
1049; CHECK:       @ %bb.0: @ %entry
1050; CHECK-NEXT:    .save {r7, lr}
1051; CHECK-NEXT:    push {r7, lr}
1052; CHECK-NEXT:    cmp r2, #1
1053; CHECK-NEXT:    it lt
1054; CHECK-NEXT:    poplt {r7, pc}
1055; CHECK-NEXT:  .LBB24_1: @ %for.body.preheader
1056; CHECK-NEXT:    movs r3, #10
1057; CHECK-NEXT:    dlstp.32 lr, r2
1058; CHECK-NEXT:  .LBB24_2: @ %for.body
1059; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1060; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
1061; CHECK-NEXT:    vldrw.u32 q1, [r1]
1062; CHECK-NEXT:    vmlas.i32 q1, q0, r3
1063; CHECK-NEXT:    vstrw.32 q1, [r1], #16
1064; CHECK-NEXT:    letp lr, .LBB24_2
1065; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
1066; CHECK-NEXT:    pop {r7, pc}
1067entry:
1068  %cmp14 = icmp sgt i32 %n, 0
1069  br i1 %cmp14, label %for.body, label %for.cond.cleanup
1070
1071for.cond.cleanup:                                 ; preds = %for.body, %entry
1072  ret void
1073
1074for.body:                                         ; preds = %entry, %for.body
1075  %x.addr.017 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ]
1076  %y.addr.016 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ]
1077  %i.015 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
1078  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.015)
1079  %1 = bitcast ptr %x.addr.017 to ptr
1080  %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
1081  %add.ptr = getelementptr inbounds i32, ptr %x.addr.017, i32 4
1082  %3 = bitcast ptr %y.addr.016 to ptr
1083  %4 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %3, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
  ; Vector-by-vector multiply, then add the splat constant: here the addend
  ; is the splat, so it can live in a scalar register.
1084  %5 = mul <4 x i32> %4, %2
1085  %6 = add <4 x i32> %5, <i32 10, i32 10, i32 10, i32 10>
1086  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %6, ptr %3, i32 4, <4 x i1> %0)
1087  %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.016, i32 4
1088  %sub = add nsw i32 %i.015, -4
1089  %cmp = icmp sgt i32 %i.015, 4
1090  br i1 %cmp, label %for.body, label %for.cond.cleanup
1091}
1092
; vmlasqp: multiply-accumulate-scalar loop written with the
; llvm.arm.mve.vmlas.n.predicated intrinsic. Each iteration masked-loads one
; vector from %x and one from %y, calls the intrinsic with the scalar i32 10
; and the vctp32 predicate, and masked-stores the result to %y.
; Expected assembly: vmlas.i32 q1, q0, r3 with 10 held in r3, inside a
; dlstp.32/letp loop.
1093define void @vmlasqp(ptr %x, ptr %y, i32 %n) {
1094; CHECK-LABEL: vmlasqp:
1095; CHECK:       @ %bb.0: @ %entry
1096; CHECK-NEXT:    .save {r7, lr}
1097; CHECK-NEXT:    push {r7, lr}
1098; CHECK-NEXT:    cmp r2, #1
1099; CHECK-NEXT:    it lt
1100; CHECK-NEXT:    poplt {r7, pc}
1101; CHECK-NEXT:  .LBB25_1: @ %for.body.preheader
1102; CHECK-NEXT:    movs r3, #10
1103; CHECK-NEXT:    dlstp.32 lr, r2
1104; CHECK-NEXT:  .LBB25_2: @ %for.body
1105; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1106; CHECK-NEXT:    vldrw.u32 q0, [r1]
1107; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
1108; CHECK-NEXT:    vmlas.i32 q1, q0, r3
1109; CHECK-NEXT:    vstrw.32 q1, [r1], #16
1110; CHECK-NEXT:    letp lr, .LBB25_2
1111; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
1112; CHECK-NEXT:    pop {r7, pc}
1113entry:
1114  %cmp15 = icmp sgt i32 %n, 0
1115  br i1 %cmp15, label %for.body, label %for.cond.cleanup
1116
1117for.cond.cleanup:                                 ; preds = %for.body, %entry
1118  ret void
1119
1120for.body:                                         ; preds = %entry, %for.body
1121  %x.addr.018 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ]
1122  %y.addr.017 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ]
1123  %i.016 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
1124  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.016)
1125  %1 = bitcast ptr %x.addr.018 to ptr
1126  %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
1127  %add.ptr = getelementptr inbounds i32, ptr %x.addr.018, i32 4
1128  %3 = bitcast ptr %y.addr.017 to ptr
1129  %4 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %3, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
  ; Here the scalar is already an i32 operand of the intrinsic (no splat
  ; vector in the IR); %0 is the vctp32 predicate.
1130  %5 = tail call <4 x i32> @llvm.arm.mve.vmlas.n.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %4, i32 10, <4 x i1> %0)
1131  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %5, ptr %3, i32 4, <4 x i1> %0)
1132  %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.017, i32 4
1133  %sub = add nsw i32 %i.016, -4
1134  %cmp = icmp sgt i32 %i.016, 4
1135  br i1 %cmp, label %for.body, label %for.cond.cleanup
1136}
1137
; vaddqf: loop over %n float elements, four per iteration, masked by
; llvm.arm.mve.vctp32. Each iteration masked-loads from %x, does a plain
; `fadd fast` with a constant splat of 10.0 and masked-stores the result to %y.
; Expected assembly: unlike the integer cases, the constant is materialised
; into a vector register (vmov.f32 q0, #1.000000e+01) before the loop and the
; vector-by-vector form vadd.f32 q1, q1, q0 is used.
1138define void @vaddqf(ptr %x, ptr %y, i32 %n) {
1139; CHECK-LABEL: vaddqf:
1140; CHECK:       @ %bb.0: @ %entry
1141; CHECK-NEXT:    .save {r7, lr}
1142; CHECK-NEXT:    push {r7, lr}
1143; CHECK-NEXT:    cmp r2, #1
1144; CHECK-NEXT:    it lt
1145; CHECK-NEXT:    poplt {r7, pc}
1146; CHECK-NEXT:  .LBB26_1: @ %for.body.preheader
1147; CHECK-NEXT:    vmov.f32 q0, #1.000000e+01
1148; CHECK-NEXT:    dlstp.32 lr, r2
1149; CHECK-NEXT:  .LBB26_2: @ %for.body
1150; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1151; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
1152; CHECK-NEXT:    vadd.f32 q1, q1, q0
1153; CHECK-NEXT:    vstrw.32 q1, [r1], #16
1154; CHECK-NEXT:    letp lr, .LBB26_2
1155; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
1156; CHECK-NEXT:    pop {r7, pc}
1157entry:
1158  %cmp11 = icmp sgt i32 %n, 0
1159  br i1 %cmp11, label %for.body, label %for.cond.cleanup
1160
1161for.cond.cleanup:                                 ; preds = %for.body, %entry
1162  ret void
1163
1164for.body:                                         ; preds = %entry, %for.body
1165  %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ]
1166  %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ]
1167  %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
1168  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012)
1169  %1 = bitcast ptr %x.addr.014 to ptr
1170  %2 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x float> zeroinitializer)
1171  %add.ptr = getelementptr inbounds float, ptr %x.addr.014, i32 4
  ; Plain IR fadd against a constant splat (no MVE intrinsic involved).
1172  %3 = fadd fast <4 x float> %2, <float 10.0, float 10.0, float 10.0, float 10.0>
1173  %4 = bitcast ptr %y.addr.013 to ptr
1174  tail call void @llvm.masked.store.v4f32.p0(<4 x float> %3, ptr %4, i32 4, <4 x i1> %0)
1175  %add.ptr1 = getelementptr inbounds float, ptr %y.addr.013, i32 4
1176  %sub = add nsw i32 %i.012, -4
1177  %cmp = icmp sgt i32 %i.012, 4
1178  br i1 %cmp, label %for.body, label %for.cond.cleanup
1179}
1180
; vaddf: in-place loop over %s1, four float lanes per iteration, masked by
; llvm.arm.mve.vctp32. Each iteration masked-loads a vector, applies
; llvm.arm.mve.add.predicated against a constant splat of 10.0 and
; masked-stores the result back to the same address.
; Expected assembly: the float constant is built in r2 with movs/movt
; (high half 16672 = 0x4120, i.e. bit pattern 0x41200000 = 10.0f) and used as
; the scalar operand of vadd.f32 q0, q0, r2 inside a dlstp.32/letp loop.
1181define void @vaddf(ptr %s1, i32 %N) {
1182; CHECK-LABEL: vaddf:
1183; CHECK:       @ %bb.0: @ %entry
1184; CHECK-NEXT:    .save {r7, lr}
1185; CHECK-NEXT:    push {r7, lr}
1186; CHECK-NEXT:    cmp r1, #1
1187; CHECK-NEXT:    it lt
1188; CHECK-NEXT:    poplt {r7, pc}
1189; CHECK-NEXT:  .LBB27_1: @ %while.body.preheader
1190; CHECK-NEXT:    movs r2, #0
1191; CHECK-NEXT:    movt r2, #16672
1192; CHECK-NEXT:    dlstp.32 lr, r1
1193; CHECK-NEXT:  .LBB27_2: @ %while.body
1194; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1195; CHECK-NEXT:    vldrw.u32 q0, [r0]
1196; CHECK-NEXT:    vadd.f32 q0, q0, r2
1197; CHECK-NEXT:    vstrw.32 q0, [r0], #16
1198; CHECK-NEXT:    letp lr, .LBB27_2
1199; CHECK-NEXT:  @ %bb.3: @ %while.end
1200; CHECK-NEXT:    pop {r7, pc}
1201entry:
1202  %cmp11 = icmp sgt i32 %N, 0
1203  br i1 %cmp11, label %while.body.lr.ph, label %while.end
1204
1205while.body.lr.ph:                                 ; preds = %entry
1206  br label %while.body
1207
1208while.body:                                       ; preds = %while.body.lr.ph, %while.body
1209  %s1.addr.013 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
1210  %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
1211  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
1212  %1 = bitcast ptr %s1.addr.013 to ptr
1213  %2 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x float> zeroinitializer)
  ; %0 is the vctp32 predicate; the trailing %2 operand is presumably the
  ; value used for inactive lanes - TODO confirm against the intrinsic definition.
1214  %3 = tail call fast <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %2, <4 x float> <float 10.0, float 10.0, float 10.0, float 10.0>, <4 x i1> %0, <4 x float> %2)
1215  tail call void @llvm.masked.store.v4f32.p0(<4 x float> %3, ptr %1, i32 4, <4 x i1> %0)
1216  %add.ptr = getelementptr inbounds float, ptr %s1.addr.013, i32 4
1217  %sub = add nsw i32 %N.addr.012, -4
1218  %cmp = icmp sgt i32 %N.addr.012, 4
1219  br i1 %cmp, label %while.body, label %while.end
1220
1221while.end:                                        ; preds = %while.body, %entry
1222  ret void
1223}
1224
; vsubqf: loop over %n float elements, four per iteration, masked by
; llvm.arm.mve.vctp32. Each iteration masked-loads from %x, does a plain
; `fsub fast` of a constant splat of 10.0 and masked-stores the result to %y.
; Expected assembly: the subtraction is emitted as an addition of the negated
; constant - vmov.f32 q0, #-1.000000e+01 before the loop, then the
; vector-by-vector form vadd.f32 q1, q1, q0.
1225define void @vsubqf(ptr %x, ptr %y, i32 %n) {
1226; CHECK-LABEL: vsubqf:
1227; CHECK:       @ %bb.0: @ %entry
1228; CHECK-NEXT:    .save {r7, lr}
1229; CHECK-NEXT:    push {r7, lr}
1230; CHECK-NEXT:    cmp r2, #1
1231; CHECK-NEXT:    it lt
1232; CHECK-NEXT:    poplt {r7, pc}
1233; CHECK-NEXT:  .LBB28_1: @ %for.body.preheader
1234; CHECK-NEXT:    vmov.f32 q0, #-1.000000e+01
1235; CHECK-NEXT:    dlstp.32 lr, r2
1236; CHECK-NEXT:  .LBB28_2: @ %for.body
1237; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1238; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
1239; CHECK-NEXT:    vadd.f32 q1, q1, q0
1240; CHECK-NEXT:    vstrw.32 q1, [r1], #16
1241; CHECK-NEXT:    letp lr, .LBB28_2
1242; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
1243; CHECK-NEXT:    pop {r7, pc}
1244entry:
1245  %cmp11 = icmp sgt i32 %n, 0
1246  br i1 %cmp11, label %for.body, label %for.cond.cleanup
1247
1248for.cond.cleanup:                                 ; preds = %for.body, %entry
1249  ret void
1250
1251for.body:                                         ; preds = %entry, %for.body
1252  %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ]
1253  %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ]
1254  %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
1255  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012)
1256  %1 = bitcast ptr %x.addr.014 to ptr
1257  %2 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x float> zeroinitializer)
1258  %add.ptr = getelementptr inbounds float, ptr %x.addr.014, i32 4
  ; Plain IR fsub of a constant splat (no MVE intrinsic involved).
1259  %3 = fsub fast <4 x float> %2, <float 10.0, float 10.0, float 10.0, float 10.0>
1260  %4 = bitcast ptr %y.addr.013 to ptr
1261  tail call void @llvm.masked.store.v4f32.p0(<4 x float> %3, ptr %4, i32 4, <4 x i1> %0)
1262  %add.ptr1 = getelementptr inbounds float, ptr %y.addr.013, i32 4
1263  %sub = add nsw i32 %i.012, -4
1264  %cmp = icmp sgt i32 %i.012, 4
1265  br i1 %cmp, label %for.body, label %for.cond.cleanup
1266}
1267
; vsubf: in-place loop over %s1, four float lanes per iteration, masked by
; llvm.arm.mve.vctp32. Each iteration masked-loads a vector, applies
; llvm.arm.mve.sub.predicated with a constant splat of 10.0 as the second
; operand and masked-stores the result back to the same address.
; Expected assembly: the float constant is built in r2 with movs/movt
; (high half 16672 = 0x4120, i.e. bit pattern 0x41200000 = 10.0f) and used as
; the scalar operand of vsub.f32 q0, q0, r2 inside a dlstp.32/letp loop.
1268define void @vsubf(ptr %s1, i32 %N) {
1269; CHECK-LABEL: vsubf:
1270; CHECK:       @ %bb.0: @ %entry
1271; CHECK-NEXT:    .save {r7, lr}
1272; CHECK-NEXT:    push {r7, lr}
1273; CHECK-NEXT:    cmp r1, #1
1274; CHECK-NEXT:    it lt
1275; CHECK-NEXT:    poplt {r7, pc}
1276; CHECK-NEXT:  .LBB29_1: @ %while.body.preheader
1277; CHECK-NEXT:    movs r2, #0
1278; CHECK-NEXT:    movt r2, #16672
1279; CHECK-NEXT:    dlstp.32 lr, r1
1280; CHECK-NEXT:  .LBB29_2: @ %while.body
1281; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1282; CHECK-NEXT:    vldrw.u32 q0, [r0]
1283; CHECK-NEXT:    vsub.f32 q0, q0, r2
1284; CHECK-NEXT:    vstrw.32 q0, [r0], #16
1285; CHECK-NEXT:    letp lr, .LBB29_2
1286; CHECK-NEXT:  @ %bb.3: @ %while.end
1287; CHECK-NEXT:    pop {r7, pc}
1288entry:
1289  %cmp11 = icmp sgt i32 %N, 0
1290  br i1 %cmp11, label %while.body.lr.ph, label %while.end
1291
1292while.body.lr.ph:                                 ; preds = %entry
1293  br label %while.body
1294
1295while.body:                                       ; preds = %while.body.lr.ph, %while.body
1296  %s1.addr.013 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
1297  %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
1298  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
1299  %1 = bitcast ptr %s1.addr.013 to ptr
1300  %2 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x float> zeroinitializer)
  ; %0 is the vctp32 predicate; the trailing %2 operand is presumably the
  ; value used for inactive lanes - TODO confirm against the intrinsic definition.
1301  %3 = tail call fast <4 x float> @llvm.arm.mve.sub.predicated.v4f32.v4i1(<4 x float> %2, <4 x float> <float 10.0, float 10.0, float 10.0, float 10.0>, <4 x i1> %0, <4 x float> %2)
1302  tail call void @llvm.masked.store.v4f32.p0(<4 x float> %3, ptr %1, i32 4, <4 x i1> %0)
1303  %add.ptr = getelementptr inbounds float, ptr %s1.addr.013, i32 4
1304  %sub = add nsw i32 %N.addr.012, -4
1305  %cmp = icmp sgt i32 %N.addr.012, 4
1306  br i1 %cmp, label %while.body, label %while.end
1307
1308while.end:                                        ; preds = %while.body, %entry
1309  ret void
1310}
1311
; vmulqf: loop over %n float elements, four per iteration, masked by
; llvm.arm.mve.vctp32. Each iteration masked-loads from %x, does a plain
; `fmul fast` with a constant splat of 10.0 and masked-stores the result to %y.
; Expected assembly: the constant is materialised into a vector register
; (vmov.f32 q0, #1.000000e+01) before the loop and the vector-by-vector form
; vmul.f32 q1, q1, q0 is used.
1312define void @vmulqf(ptr %x, ptr %y, i32 %n) {
1313; CHECK-LABEL: vmulqf:
1314; CHECK:       @ %bb.0: @ %entry
1315; CHECK-NEXT:    .save {r7, lr}
1316; CHECK-NEXT:    push {r7, lr}
1317; CHECK-NEXT:    cmp r2, #1
1318; CHECK-NEXT:    it lt
1319; CHECK-NEXT:    poplt {r7, pc}
1320; CHECK-NEXT:  .LBB30_1: @ %for.body.preheader
1321; CHECK-NEXT:    vmov.f32 q0, #1.000000e+01
1322; CHECK-NEXT:    dlstp.32 lr, r2
1323; CHECK-NEXT:  .LBB30_2: @ %for.body
1324; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1325; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
1326; CHECK-NEXT:    vmul.f32 q1, q1, q0
1327; CHECK-NEXT:    vstrw.32 q1, [r1], #16
1328; CHECK-NEXT:    letp lr, .LBB30_2
1329; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
1330; CHECK-NEXT:    pop {r7, pc}
1331entry:
1332  %cmp11 = icmp sgt i32 %n, 0
1333  br i1 %cmp11, label %for.body, label %for.cond.cleanup
1334
1335for.cond.cleanup:                                 ; preds = %for.body, %entry
1336  ret void
1337
1338for.body:                                         ; preds = %entry, %for.body
1339  %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ]
1340  %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ]
1341  %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
1342  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012)
1343  %1 = bitcast ptr %x.addr.014 to ptr
1344  %2 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x float> zeroinitializer)
1345  %add.ptr = getelementptr inbounds float, ptr %x.addr.014, i32 4
  ; Plain IR fmul against a constant splat (no MVE intrinsic involved).
1346  %3 = fmul fast <4 x float> %2, <float 10.0, float 10.0, float 10.0, float 10.0>
1347  %4 = bitcast ptr %y.addr.013 to ptr
1348  tail call void @llvm.masked.store.v4f32.p0(<4 x float> %3, ptr %4, i32 4, <4 x i1> %0)
1349  %add.ptr1 = getelementptr inbounds float, ptr %y.addr.013, i32 4
1350  %sub = add nsw i32 %i.012, -4
1351  %cmp = icmp sgt i32 %i.012, 4
1352  br i1 %cmp, label %for.body, label %for.cond.cleanup
1353}
1354
; vmulf: in-place loop over %s1, four float lanes per iteration, masked by
; llvm.arm.mve.vctp32. Each iteration masked-loads a vector, applies
; llvm.arm.mve.mul.predicated against a constant splat of 10.0 and
; masked-stores the result back to the same address.
; Expected assembly: the float constant is built in r2 with movs/movt
; (high half 16672 = 0x4120, i.e. bit pattern 0x41200000 = 10.0f) and used as
; the scalar operand of vmul.f32 q0, q0, r2 inside a dlstp.32/letp loop.
1355define void @vmulf(ptr %s1, i32 %N) {
1356; CHECK-LABEL: vmulf:
1357; CHECK:       @ %bb.0: @ %entry
1358; CHECK-NEXT:    .save {r7, lr}
1359; CHECK-NEXT:    push {r7, lr}
1360; CHECK-NEXT:    cmp r1, #1
1361; CHECK-NEXT:    it lt
1362; CHECK-NEXT:    poplt {r7, pc}
1363; CHECK-NEXT:  .LBB31_1: @ %while.body.preheader
1364; CHECK-NEXT:    movs r2, #0
1365; CHECK-NEXT:    movt r2, #16672
1366; CHECK-NEXT:    dlstp.32 lr, r1
1367; CHECK-NEXT:  .LBB31_2: @ %while.body
1368; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1369; CHECK-NEXT:    vldrw.u32 q0, [r0]
1370; CHECK-NEXT:    vmul.f32 q0, q0, r2
1371; CHECK-NEXT:    vstrw.32 q0, [r0], #16
1372; CHECK-NEXT:    letp lr, .LBB31_2
1373; CHECK-NEXT:  @ %bb.3: @ %while.end
1374; CHECK-NEXT:    pop {r7, pc}
1375entry:
1376  %cmp11 = icmp sgt i32 %N, 0
1377  br i1 %cmp11, label %while.body.lr.ph, label %while.end
1378
1379while.body.lr.ph:                                 ; preds = %entry
1380  br label %while.body
1381
1382while.body:                                       ; preds = %while.body.lr.ph, %while.body
1383  %s1.addr.013 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
1384  %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
1385  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
1386  %1 = bitcast ptr %s1.addr.013 to ptr
1387  %2 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x float> zeroinitializer)
  ; %0 is the vctp32 predicate; the trailing %2 operand is presumably the
  ; value used for inactive lanes - TODO confirm against the intrinsic definition.
1388  %3 = tail call fast <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float> %2, <4 x float> <float 10.0, float 10.0, float 10.0, float 10.0>, <4 x i1> %0, <4 x float> %2)
1389  tail call void @llvm.masked.store.v4f32.p0(<4 x float> %3, ptr %1, i32 4, <4 x i1> %0)
1390  %add.ptr = getelementptr inbounds float, ptr %s1.addr.013, i32 4
1391  %sub = add nsw i32 %N.addr.012, -4
1392  %cmp = icmp sgt i32 %N.addr.012, 4
1393  br i1 %cmp, label %while.body, label %while.end
1394
1395while.end:                                        ; preds = %while.body, %entry
1396  ret void
1397}
1398
; vfmaq: y[i] = fma(y[i], 10.0, x[i]) for %n floats, four lanes per iteration.
; The <10.0 x 4> splat is the multiplier operand of a plain llvm.fma call.
; Lanes are predicated by llvm.arm.mve.vctp32 on the remaining element count.
; The autogenerated assertions expect a tail-predicated loop (dlstp.32 / letp)
; in which the splat is materialised once in q0 ahead of the loop and consumed
; by a vector-operand vfma.f32.
1399define void @vfmaq(ptr %x, ptr %y, i32 %n) {
1400; CHECK-LABEL: vfmaq:
1401; CHECK:       @ %bb.0: @ %entry
1402; CHECK-NEXT:    .save {r7, lr}
1403; CHECK-NEXT:    push {r7, lr}
1404; CHECK-NEXT:    cmp r2, #1
1405; CHECK-NEXT:    it lt
1406; CHECK-NEXT:    poplt {r7, pc}
1407; CHECK-NEXT:  .LBB32_1: @ %for.body.preheader
1408; CHECK-NEXT:    vmov.f32 q0, #1.000000e+01
1409; CHECK-NEXT:    dlstp.32 lr, r2
1410; CHECK-NEXT:  .LBB32_2: @ %for.body
1411; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1412; CHECK-NEXT:    vldrw.u32 q1, [r1]
1413; CHECK-NEXT:    vldrw.u32 q2, [r0], #16
1414; CHECK-NEXT:    vfma.f32 q2, q1, q0
1415; CHECK-NEXT:    vstrw.32 q2, [r1], #16
1416; CHECK-NEXT:    letp lr, .LBB32_2
1417; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
1418; CHECK-NEXT:    pop {r7, pc}
1419entry:
  ; Skip the loop entirely when %n is zero or negative.
1420  %cmp14 = icmp sgt i32 %n, 0
1421  br i1 %cmp14, label %for.body, label %for.cond.cleanup
1422
1423for.cond.cleanup:                                 ; preds = %for.body, %entry
1424  ret void
1425
1426for.body:                                         ; preds = %entry, %for.body
  ; Loop-carried state: both pointers and the count of elements still pending.
1427  %x.addr.017 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ]
1428  %y.addr.016 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ]
1429  %i.015 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
  ; Lane predicate for this iteration, computed from the pending count.
1430  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.015)
  ; ptr -> ptr cast: source and destination types are identical, so %1 is
  ; simply another name for %x.addr.017.
1431  %1 = bitcast ptr %x.addr.017 to ptr
1432  %2 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x float> zeroinitializer)
1433  %add.ptr = getelementptr inbounds float, ptr %x.addr.017, i32 4
  ; Same-type cast again; %3 names the current y pointer.
1434  %3 = bitcast ptr %y.addr.016 to ptr
1435  %4 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %3, i32 4, <4 x i1> %0, <4 x float> zeroinitializer)
  ; y-vector * splat(10.0) + x-vector; the splat is the second (multiplier)
  ; operand.
1436  %5 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %4, <4 x float> <float 10.0, float 10.0, float 10.0, float 10.0>, <4 x float> %2)
  ; The result goes back to y under the same lane predicate.
1437  tail call void @llvm.masked.store.v4f32.p0(<4 x float> %5, ptr %3, i32 4, <4 x i1> %0)
1438  %add.ptr1 = getelementptr inbounds float, ptr %y.addr.016, i32 4
  ; Four elements are consumed per trip; iterate again only if more than four
  ; were pending at the start of this one.
1439  %sub = add nsw i32 %i.015, -4
1440  %cmp = icmp sgt i32 %i.015, 4
1441  br i1 %cmp, label %for.body, label %for.cond.cleanup
1442}
1443
; vfma: multiply-by-splat FMA written with the predicated MVE intrinsic
; llvm.arm.mve.fma.predicated instead of the generic llvm.fma.  Each iteration
; loads four floats from %s1 (advancing) and four from %s2 (the same address
; every time, since %s2 is never incremented), passes the <10.0 x 4> splat as
; the second intrinsic operand and the %s1 vector as the third, and stores the
; result back to %s1.  The autogenerated assertions expect a tail-predicated
; loop (dlstp.32 / letp) with the splat materialised once in q0 and a
; vector-operand vfma.f32 accumulating into the register loaded from [r0].
1444define void @vfma(ptr %s1, ptr %s2, i32 %N) {
1445; CHECK-LABEL: vfma:
1446; CHECK:       @ %bb.0: @ %entry
1447; CHECK-NEXT:    .save {r7, lr}
1448; CHECK-NEXT:    push {r7, lr}
1449; CHECK-NEXT:    cmp r2, #1
1450; CHECK-NEXT:    it lt
1451; CHECK-NEXT:    poplt {r7, pc}
1452; CHECK-NEXT:  .LBB33_1: @ %while.body.lr.ph
1453; CHECK-NEXT:    vmov.f32 q0, #1.000000e+01
1454; CHECK-NEXT:    dlstp.32 lr, r2
1455; CHECK-NEXT:  .LBB33_2: @ %while.body
1456; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1457; CHECK-NEXT:    vldrw.u32 q1, [r1]
1458; CHECK-NEXT:    vldrw.u32 q2, [r0]
1459; CHECK-NEXT:    vfma.f32 q2, q1, q0
1460; CHECK-NEXT:    vstrw.32 q2, [r0], #16
1461; CHECK-NEXT:    letp lr, .LBB33_2
1462; CHECK-NEXT:  @ %bb.3: @ %while.end
1463; CHECK-NEXT:    pop {r7, pc}
1464entry:
  ; Skip the loop entirely when %N is zero or negative.
1465  %cmp12 = icmp sgt i32 %N, 0
1466  br i1 %cmp12, label %while.body.lr.ph, label %while.end
1467
1468while.body.lr.ph:                                 ; preds = %entry
  ; ptr -> ptr cast of the loop-invariant %s2; source and destination types are
  ; identical, so %0 is simply another name for %s2.
1469  %0 = bitcast ptr %s2 to ptr
1470  br label %while.body
1471
1472while.body:                                       ; preds = %while.body.lr.ph, %while.body
  ; Loop-carried state: the %s1 cursor and the count of elements still pending.
1473  %s1.addr.014 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
1474  %N.addr.013 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
  ; Lane predicate for this iteration, computed from the pending count.
1475  %1 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.013)
  ; Same-type cast; %2 names the current %s1 cursor.
1476  %2 = bitcast ptr %s1.addr.014 to ptr
1477  %3 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %2, i32 4, <4 x i1> %1, <4 x float> zeroinitializer)
  ; This load always reads from %s2 itself; there is no phi advancing it.
1478  %4 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 4, <4 x i1> %1, <4 x float> zeroinitializer)
  ; Operands: %s2 vector, splat(10.0), %s1 vector, lane predicate.
1479  %5 = tail call fast <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float> %4, <4 x float> <float 10.0, float 10.0, float 10.0, float 10.0>, <4 x float> %3, <4 x i1> %1)
  ; The result overwrites the %s1 elements that were just loaded.
1480  tail call void @llvm.masked.store.v4f32.p0(<4 x float> %5, ptr %2, i32 4, <4 x i1> %1)
1481  %add.ptr = getelementptr inbounds float, ptr %s1.addr.014, i32 4
  ; Four elements are consumed per trip; iterate again only if more than four
  ; were pending at the start of this one.
1482  %sub = add nsw i32 %N.addr.013, -4
1483  %cmp = icmp sgt i32 %N.addr.013, 4
1484  br i1 %cmp, label %while.body, label %while.end
1485
1486while.end:                                        ; preds = %while.body, %entry
1487  ret void
1488}
1489
; vfmasq: y[i] = fma(x[i], y[i], 10.0) for %n floats, four lanes per iteration.
; Here the <10.0 x 4> splat is the addend (third operand) of a plain llvm.fma
; call.  Lanes are predicated by llvm.arm.mve.vctp32 on the remaining element
; count.  The autogenerated assertions expect a tail-predicated loop
; (dlstp.32 / letp) where the splat is materialised once in q0, copied into q3
; at the top of every iteration, and q3 is then the accumulator of a
; vector-operand vfma.f32.
1490define void @vfmasq(ptr %x, ptr %y, i32 %n) {
1491; CHECK-LABEL: vfmasq:
1492; CHECK:       @ %bb.0: @ %entry
1493; CHECK-NEXT:    .save {r7, lr}
1494; CHECK-NEXT:    push {r7, lr}
1495; CHECK-NEXT:    cmp r2, #1
1496; CHECK-NEXT:    it lt
1497; CHECK-NEXT:    poplt {r7, pc}
1498; CHECK-NEXT:  .LBB34_1: @ %for.body.preheader
1499; CHECK-NEXT:    vmov.f32 q0, #1.000000e+01
1500; CHECK-NEXT:    dlstp.32 lr, r2
1501; CHECK-NEXT:  .LBB34_2: @ %for.body
1502; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1503; CHECK-NEXT:    vmov q3, q0
1504; CHECK-NEXT:    vldrw.u32 q1, [r1]
1505; CHECK-NEXT:    vldrw.u32 q2, [r0], #16
1506; CHECK-NEXT:    vfma.f32 q3, q2, q1
1507; CHECK-NEXT:    vstrw.32 q3, [r1], #16
1508; CHECK-NEXT:    letp lr, .LBB34_2
1509; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
1510; CHECK-NEXT:    pop {r7, pc}
1511entry:
  ; Skip the loop entirely when %n is zero or negative.
1512  %cmp14 = icmp sgt i32 %n, 0
1513  br i1 %cmp14, label %for.body, label %for.cond.cleanup
1514
1515for.cond.cleanup:                                 ; preds = %for.body, %entry
1516  ret void
1517
1518for.body:                                         ; preds = %entry, %for.body
  ; Loop-carried state: both pointers and the count of elements still pending.
1519  %x.addr.017 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ]
1520  %y.addr.016 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ]
1521  %i.015 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
  ; Lane predicate for this iteration, computed from the pending count.
1522  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.015)
  ; ptr -> ptr cast: source and destination types are identical, so %1 is
  ; simply another name for %x.addr.017.
1523  %1 = bitcast ptr %x.addr.017 to ptr
1524  %2 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x float> zeroinitializer)
1525  %add.ptr = getelementptr inbounds float, ptr %x.addr.017, i32 4
  ; Same-type cast again; %3 names the current y pointer.
1526  %3 = bitcast ptr %y.addr.016 to ptr
1527  %4 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %3, i32 4, <4 x i1> %0, <4 x float> zeroinitializer)
  ; x-vector * y-vector + splat(10.0); the splat is the third (addend) operand.
1528  %5 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %2, <4 x float> %4, <4 x float> <float 10.0, float 10.0, float 10.0, float 10.0>)
  ; The result goes back to y under the same lane predicate.
1529  tail call void @llvm.masked.store.v4f32.p0(<4 x float> %5, ptr %3, i32 4, <4 x i1> %0)
1530  %add.ptr1 = getelementptr inbounds float, ptr %y.addr.016, i32 4
  ; Four elements are consumed per trip; iterate again only if more than four
  ; were pending at the start of this one.
1531  %sub = add nsw i32 %i.015, -4
1532  %cmp = icmp sgt i32 %i.015, 4
1533  br i1 %cmp, label %for.body, label %for.cond.cleanup
1534}
1535
; vfmas: splat-as-addend FMA written with the predicated MVE intrinsic
; llvm.arm.mve.fma.predicated.  Each iteration loads four floats from %s1
; (advancing) and four from %s2 (the same address every time, since %s2 is never
; incremented), passes them as the first two intrinsic operands with the
; <10.0 x 4> splat as the third, and stores the result back to %s1.  The
; autogenerated assertions expect a tail-predicated loop (dlstp.32 / letp)
; where the splat is materialised once in q0, copied into q3 at the top of every
; iteration, and q3 is then the accumulator of a vector-operand vfma.f32.
1536define void @vfmas(ptr %s1, ptr %s2, i32 %N) {
1537; CHECK-LABEL: vfmas:
1538; CHECK:       @ %bb.0: @ %entry
1539; CHECK-NEXT:    .save {r7, lr}
1540; CHECK-NEXT:    push {r7, lr}
1541; CHECK-NEXT:    cmp r2, #1
1542; CHECK-NEXT:    it lt
1543; CHECK-NEXT:    poplt {r7, pc}
1544; CHECK-NEXT:  .LBB35_1: @ %while.body.lr.ph
1545; CHECK-NEXT:    vmov.f32 q0, #1.000000e+01
1546; CHECK-NEXT:    dlstp.32 lr, r2
1547; CHECK-NEXT:  .LBB35_2: @ %while.body
1548; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1549; CHECK-NEXT:    vmov q3, q0
1550; CHECK-NEXT:    vldrw.u32 q1, [r1]
1551; CHECK-NEXT:    vldrw.u32 q2, [r0]
1552; CHECK-NEXT:    vfma.f32 q3, q2, q1
1553; CHECK-NEXT:    vstrw.32 q3, [r0], #16
1554; CHECK-NEXT:    letp lr, .LBB35_2
1555; CHECK-NEXT:  @ %bb.3: @ %while.end
1556; CHECK-NEXT:    pop {r7, pc}
1557entry:
  ; Skip the loop entirely when %N is zero or negative.
1558  %cmp12 = icmp sgt i32 %N, 0
1559  br i1 %cmp12, label %while.body.lr.ph, label %while.end
1560
1561while.body.lr.ph:                                 ; preds = %entry
  ; ptr -> ptr cast of the loop-invariant %s2; source and destination types are
  ; identical, so %0 is simply another name for %s2.
1562  %0 = bitcast ptr %s2 to ptr
1563  br label %while.body
1564
1565while.body:                                       ; preds = %while.body.lr.ph, %while.body
  ; Loop-carried state: the %s1 cursor and the count of elements still pending.
1566  %s1.addr.014 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
1567  %N.addr.013 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
  ; Lane predicate for this iteration, computed from the pending count.
1568  %1 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.013)
  ; Same-type cast; %2 names the current %s1 cursor.
1569  %2 = bitcast ptr %s1.addr.014 to ptr
1570  %3 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %2, i32 4, <4 x i1> %1, <4 x float> zeroinitializer)
  ; This load always reads from %s2 itself; there is no phi advancing it.
1571  %4 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 4, <4 x i1> %1, <4 x float> zeroinitializer)
  ; Operands: %s1 vector, %s2 vector, splat(10.0), lane predicate.
1572  %5 = tail call fast <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float> %3, <4 x float> %4, <4 x float> <float 10.0, float 10.0, float 10.0, float 10.0>, <4 x i1> %1)
  ; The result overwrites the %s1 elements that were just loaded.
1573  tail call void @llvm.masked.store.v4f32.p0(<4 x float> %5, ptr %2, i32 4, <4 x i1> %1)
1574  %add.ptr = getelementptr inbounds float, ptr %s1.addr.014, i32 4
  ; Four elements are consumed per trip; iterate again only if more than four
  ; were pending at the start of this one.
1575  %sub = add nsw i32 %N.addr.013, -4
1576  %cmp = icmp sgt i32 %N.addr.013, 4
1577  br i1 %cmp, label %while.body, label %while.end
1578
1579while.end:                                        ; preds = %while.body, %entry
1580  ret void
1581}
1582
; rgbconvert: two-level loop with three splat-constant multiplies in the inner
; body.  The outer loop runs sext(%iHeight) times; after each pass the source
; pointer moves on by %iSourceStride i32 elements and the target pointer by
; %iTargetStride i16 elements.  The inner loop walks sext(%iWidth) elements four
; lanes at a time under an llvm.arm.mve.vctp32 predicate: it loads <4 x i32>,
; applies llvm.arm.mve.qdmulh.predicated three times against the constant splats
; 268435456, 67108864 and 8388608, masks the results with 31 (bits 0-4),
; 2016 (bits 5-10) and 63488 (bits 11-15) respectively, ORs them together,
; truncates to <4 x i16> and does a masked store.
; NOTE(review): the name and the 5/6/5-bit masks suggest packing three colour
; channels into a 16-bit pixel, but nothing in the IR states that - confirm
; against the original source this test was reduced from.
; The autogenerated assertions expect the three multiplier splats to live in
; general-purpose registers (r4, r7, r11) and feed the scalar-operand form of
; vqdmulh.s32, the three AND masks to live in q0-q2, the inner loop to be
; tail-predicated (dlstp.32 / letp), and the truncating store to be vstrh.32.
1583define void @rgbconvert(ptr noalias %pwSourceBase, i16 signext %iSourceStride, ptr noalias %phwTargetBase, i16 signext %iTargetStride, i16 %iHeight, i16 %iWidth) {
1584; CHECK-LABEL: rgbconvert:
1585; CHECK:       @ %bb.0: @ %entry
1586; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
1587; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
1588; CHECK-NEXT:    .pad #4
1589; CHECK-NEXT:    sub sp, #4
1590; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
1591; CHECK-NEXT:    vpush {d8, d9, d10, d11}
1592; CHECK-NEXT:    .pad #8
1593; CHECK-NEXT:    sub sp, #8
1594; CHECK-NEXT:    str r3, [sp, #4] @ 4-byte Spill
1595; CHECK-NEXT:    ldrsh.w r3, [sp, #80]
1596; CHECK-NEXT:    cmp r3, #1
1597; CHECK-NEXT:    blt .LBB36_5
1598; CHECK-NEXT:  @ %bb.1: @ %for.body.lr.ph
1599; CHECK-NEXT:    mov r9, r2
1600; CHECK-NEXT:    ldr r2, [sp, #84]
1601; CHECK-NEXT:    mov.w r10, #0
1602; CHECK-NEXT:    mov.w r11, #8388608
1603; CHECK-NEXT:    mov.w r4, #67108864
1604; CHECK-NEXT:    sxth.w r12, r2
1605; CHECK-NEXT:    vmov.i32 q0, #0xf800
1606; CHECK-NEXT:    vmov.i32 q1, #0x1f
1607; CHECK-NEXT:    mov.w r2, #2016
1608; CHECK-NEXT:    mov.w r7, #268435456
1609; CHECK-NEXT:    vdup.32 q2, r2
1610; CHECK-NEXT:  .LBB36_2: @ %for.body
1611; CHECK-NEXT:    @ =>This Loop Header: Depth=1
1612; CHECK-NEXT:    @ Child Loop BB36_3 Depth 2
1613; CHECK-NEXT:    mov r2, r9
1614; CHECK-NEXT:    mov r5, r0
1615; CHECK-NEXT:    dlstp.32 lr, r12
1616; CHECK-NEXT:  .LBB36_3: @ %do.body
1617; CHECK-NEXT:    @ Parent Loop BB36_2 Depth=1
1618; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
1619; CHECK-NEXT:    vldrw.u32 q3, [r5], #16
1620; CHECK-NEXT:    vqdmulh.s32 q4, q3, r4
1621; CHECK-NEXT:    vqdmulh.s32 q5, q3, r7
1622; CHECK-NEXT:    vqdmulh.s32 q3, q3, r11
1623; CHECK-NEXT:    vand q4, q4, q2
1624; CHECK-NEXT:    vand q5, q5, q1
1625; CHECK-NEXT:    vand q3, q3, q0
1626; CHECK-NEXT:    vorr q4, q4, q5
1627; CHECK-NEXT:    vorr q3, q4, q3
1628; CHECK-NEXT:    vstrh.32 q3, [r2], #8
1629; CHECK-NEXT:    letp lr, .LBB36_3
1630; CHECK-NEXT:  @ %bb.4: @ %do.end
1631; CHECK-NEXT:    @ in Loop: Header=BB36_2 Depth=1
1632; CHECK-NEXT:    ldr r2, [sp, #4] @ 4-byte Reload
1633; CHECK-NEXT:    add.w r10, r10, #1
1634; CHECK-NEXT:    add.w r0, r0, r1, lsl #2
1635; CHECK-NEXT:    cmp r10, r3
1636; CHECK-NEXT:    add.w r9, r9, r2, lsl #1
1637; CHECK-NEXT:    bne .LBB36_2
1638; CHECK-NEXT:  .LBB36_5: @ %for.cond.cleanup
1639; CHECK-NEXT:    add sp, #8
1640; CHECK-NEXT:    vpop {d8, d9, d10, d11}
1641; CHECK-NEXT:    add sp, #4
1642; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
1643entry:
  ; Widen the 16-bit height and strides once, then skip all work when
  ; %iHeight is zero or negative.
1644  %conv = sext i16 %iHeight to i32
1645  %conv9 = sext i16 %iSourceStride to i32
1646  %conv11 = sext i16 %iTargetStride to i32
1647  %cmp37 = icmp sgt i16 %iHeight, 0
1648  br i1 %cmp37, label %for.body.lr.ph, label %for.cond.cleanup
1649
1650for.body.lr.ph:                                   ; preds = %entry
  ; The per-pass element count is loop-invariant, so it is widened here.
1651  %conv2 = sext i16 %iWidth to i32
1652  br label %for.body
1653
1654for.cond.cleanup:                                 ; preds = %do.end, %entry
1655  ret void
1656
1657for.body:                                         ; preds = %for.body.lr.ph, %do.end
  ; Outer-loop state: base pointers for this pass and the pass counter.
1658  %pwSourceBase.addr.040 = phi ptr [ %pwSourceBase, %for.body.lr.ph ], [ %add.ptr10, %do.end ]
1659  %phwTargetBase.addr.039 = phi ptr [ %phwTargetBase, %for.body.lr.ph ], [ %add.ptr12, %do.end ]
1660  %y.038 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %do.end ]
1661  br label %do.body
1662
1663do.body:                                          ; preds = %do.body, %for.body
  ; Inner-loop state: cursors within the pass and the count still pending.
1664  %pTarget.0 = phi ptr [ %phwTargetBase.addr.039, %for.body ], [ %add.ptr6, %do.body ]
1665  %pSource.0 = phi ptr [ %pwSourceBase.addr.040, %for.body ], [ %add.ptr, %do.body ]
1666  %blkCnt.0 = phi i32 [ %conv2, %for.body ], [ %sub, %do.body ]
  ; Lane predicate for this iteration, computed from the pending count.
1667  %l2 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %blkCnt.0)
  ; ptr -> ptr cast: identical source and destination types, so %l3 is simply
  ; another name for %pSource.0.
1668  %l3 = bitcast ptr %pSource.0 to ptr
1669  %l4 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %l3, i32 4, <4 x i1> %l2, <4 x i32> zeroinitializer)
  ; Three predicated qdmulh calls on the same input, each against a different
  ; constant splat and with an undef value for inactive lanes.  Each result is
  ; then ANDed with a mask covering a distinct bit range, so the ORs below
  ; combine non-overlapping fields.
1670  %l5 = tail call <4 x i32> @llvm.arm.mve.qdmulh.predicated.v4i32.v4i1(<4 x i32> %l4, <4 x i32> <i32 268435456, i32 268435456, i32 268435456, i32 268435456>, <4 x i1> %l2, <4 x i32> undef)
1671  %and = and <4 x i32> %l5, <i32 31, i32 31, i32 31, i32 31>
1672  %l6 = tail call <4 x i32> @llvm.arm.mve.qdmulh.predicated.v4i32.v4i1(<4 x i32> %l4, <4 x i32> <i32 67108864, i32 67108864, i32 67108864, i32 67108864>, <4 x i1> %l2, <4 x i32> undef)
1673  %and3 = and <4 x i32> %l6, <i32 2016, i32 2016, i32 2016, i32 2016>
1674  %l7 = tail call <4 x i32> @llvm.arm.mve.qdmulh.predicated.v4i32.v4i1(<4 x i32> %l4, <4 x i32> <i32 8388608, i32 8388608, i32 8388608, i32 8388608>, <4 x i1> %l2, <4 x i32> undef)
1675  %and4 = and <4 x i32> %l7, <i32 63488, i32 63488, i32 63488, i32 63488>
1676  %or = or <4 x i32> %and3, %and
1677  %or5 = or <4 x i32> %or, %and4
  ; Narrow each lane to 16 bits and store under the same lane predicate.
1678  %l8 = trunc <4 x i32> %or5 to <4 x i16>
1679  %l9 = bitcast ptr %pTarget.0 to ptr
1680  tail call void @llvm.masked.store.v4i16.p0(<4 x i16> %l8, ptr %l9, i32 2, <4 x i1> %l2)
  ; Advance by four elements on both sides: i32 on the source, i16 on the
  ; target.
1681  %add.ptr = getelementptr inbounds i32, ptr %pSource.0, i32 4
1682  %add.ptr6 = getelementptr inbounds i16, ptr %pTarget.0, i32 4
  ; Iterate again only if more than four elements were pending at the start of
  ; this trip.
1683  %sub = add nsw i32 %blkCnt.0, -4
1684  %cmp7 = icmp sgt i32 %blkCnt.0, 4
1685  br i1 %cmp7, label %do.body, label %do.end
1686
1687do.end:                                           ; preds = %do.body
  ; Step both base pointers by their strides and stop once the pass counter
  ; reaches the widened height.
1688  %add.ptr10 = getelementptr inbounds i32, ptr %pwSourceBase.addr.040, i32 %conv9
1689  %add.ptr12 = getelementptr inbounds i16, ptr %phwTargetBase.addr.039, i32 %conv11
1690  %inc = add nuw nsw i32 %y.038, 1
1691  %exitcond.not = icmp eq i32 %inc, %conv
1692  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
1693}
1694
; Intrinsic declarations for this test file.
;
; Lane-predicate generation and masked loads/stores.
; NOTE(review): the v4i16 masked store below carries a trailing #3, and the
; vqdmull.v2i64 declaration further down carries #1; no definition of either
; attribute group is visible in this part of the file - confirm they are
; defined elsewhere or drop the references.
1695declare <4 x i1> @llvm.arm.mve.vctp32(i32)
1696declare <4 x i16> @llvm.masked.load.v4i16.p0(ptr, i32 immarg, <4 x i1>, <4 x i16>)
1697declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32 immarg, <4 x i1>, <4 x i32>)
1698declare <4 x float> @llvm.masked.load.v4f32.p0(ptr, i32 immarg, <4 x i1>, <4 x float>)
1699declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32 immarg, <4 x i1>)
1700declare void @llvm.masked.store.v4f32.p0(<4 x float>, ptr, i32 immarg, <4 x i1>)
1701declare void @llvm.masked.store.v4i16.p0(<4 x i16>, ptr, i32 immarg, <4 x i1>) #3
1702
; Integer <4 x i32> arithmetic: MVE predicated and unpredicated intrinsics,
; together with the generic saturating add/sub intrinsics.
1703declare <4 x i32> @llvm.arm.mve.add.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>)
1704declare <4 x i32> @llvm.arm.mve.sub.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>)
1705declare <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>)
1706declare <4 x i32> @llvm.arm.mve.qadd.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>)
1707declare <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32>, <4 x i32>)
1708declare <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32>, <4 x i32>)
1709declare <4 x i32> @llvm.arm.mve.qsub.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>)
1710declare <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32>, <4 x i32>)
1711declare <4 x i32> @llvm.usub.sat.v4i32(<4 x i32>, <4 x i32>)
1712declare <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>)
1713declare <4 x i32> @llvm.arm.mve.vhadd.v4i32(<4 x i32>, <4 x i32>, i32)
1714declare <4 x i32> @llvm.arm.mve.hsub.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>)
1715declare <4 x i32> @llvm.arm.mve.vhsub.v4i32(<4 x i32>, <4 x i32>, i32)
1716declare <2 x i64> @llvm.arm.mve.vqdmull.v2i64.v4i32(<4 x i32>, <4 x i32>, i32) #1
1717declare <4 x i32> @llvm.arm.mve.vqdmull.predicated.v4i32.v8i16.v4i1(<8 x i16>, <8 x i16>, i32, <4 x i1>, <4 x i32>)
1718declare <4 x i32> @llvm.arm.mve.qdmulh.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>)
1719declare <4 x i32> @llvm.arm.mve.vqdmulh.v4i32(<4 x i32>, <4 x i32>)
1720declare <4 x i32> @llvm.arm.mve.qrdmulh.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>)
1721declare <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32>, <4 x i32>)
1722declare <4 x i32> @llvm.arm.mve.vmla.n.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>)
1723declare <4 x i32> @llvm.arm.mve.vmlas.n.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>)
; Floating-point <4 x float> arithmetic: MVE predicated intrinsics and the
; generic llvm.fma.
1724declare <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>)
1725declare <4 x float> @llvm.arm.mve.sub.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>)
1726declare <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>)
1727declare <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x float>, <4 x i1>)
1728declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
1729